# whyrating-engine-legacy/scrapers/base.py

"""
Base Scraper Interface

This module defines the abstract base class that all scrapers must implement.
It ensures a consistent interface across different scraper implementations.
"""

from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional


class BaseScraper(ABC):
    """
    Common contract shared by every scraper in the ReviewIQ system.

    This class cannot be instantiated directly. A concrete scraper must
    subclass it and override every abstract method and property declared
    here. The abstract bodies contain no logic of their own.
    """

    # ------------------------------------------------------------------
    # Identity properties
    # ------------------------------------------------------------------

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable name of this scraper."""

    @property
    @abstractmethod
    def version(self) -> str:
        """Version string of this scraper."""

    @property
    @abstractmethod
    def supported_domains(self) -> List[str]:
        """List of domains this scraper supports."""

    # ------------------------------------------------------------------
    # Scraping operations
    # ------------------------------------------------------------------

    @abstractmethod
    def validate_url(self, url: str) -> bool:
        """
        Report whether the given URL is supported by this scraper.

        Args:
            url: The URL to check

        Returns:
            True when the URL is valid for this scraper, False otherwise
        """

    @abstractmethod
    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
        """
        Collect business information for the URL without scraping reviews.

        Args:
            driver: WebDriver instance
            url: The URL to extract info from

        Returns:
            Dictionary of business metadata (name, rating, address, etc.)
        """

    @abstractmethod
    def scrape(
        self,
        driver: Any,
        url: str,
        max_reviews: int = 5000,
        timeout_no_new: int = 15,
        flush_callback: Optional[Callable[[List[Dict]], None]] = None,
        flush_batch_size: int = 500,
        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
        validation_only: bool = False,
    ) -> Dict[str, Any]:
        """
        Collect reviews from the given URL.

        Args:
            driver: WebDriver instance (e.g., Selenium WebDriver)
            url: The URL to scrape reviews from
            max_reviews: Upper bound on the number of reviews to collect
            timeout_no_new: Seconds to wait with no new reviews before stopping
            flush_callback: Optional callback that receives batches of reviews
                for streaming
            flush_batch_size: Number of reviews accumulated before
                flush_callback is triggered
            progress_callback: Optional callback(current_count, total_count)
                used to report progress
            validation_only: If True, return early after extracting metadata
                only

        Returns:
            Dictionary containing:
                - reviews: List of review dictionaries
                - total: Total number of reviews collected
                - error: Error message if any, None otherwise
                - Additional scraper-specific metadata
        """