Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions
--- a/scrapers/__init__.py
+++ b/scrapers/__init__.py
@@ -0,0 +1,10 @@
+"""
+Scrapers Package
+
+This package contains all scraper implementations for the ReviewIQ system.
+"""
+
+from scrapers.base import BaseScraper
+from scrapers.registry import ScraperRegistry, registry
+
+# Public API of the package: the names exposed by ``from scrapers import *``.
+__all__ = ["BaseScraper", "ScraperRegistry", "registry"]
--- a/scrapers/base.py
+++ b/scrapers/base.py
@@ -0,0 +1,97 @@
+"""
+Base Scraper Interface
+
+This module defines the abstract base class that all scrapers must implement.
+It ensures consistent interface across different scraper implementations.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, List, Optional
+
+
+class BaseScraper(ABC):
+    """
+    Abstract base class for all scrapers in the ReviewIQ system.
+
+    All concrete scraper implementations must inherit from this class
+    and implement the required abstract methods.
+    """
+
+    @abstractmethod
+    def scrape(
+        self,
+        driver: Any,
+        url: str,
+        max_reviews: int = 5000,
+        timeout_no_new: int = 15,
+        flush_callback: Optional[Callable[[List[Dict]], None]] = None,
+        flush_batch_size: int = 500,
+        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
+        validation_only: bool = False
+    ) -> Dict[str, Any]:
+        """
+        Scrape reviews from the given URL.
+
+        Args:
+            driver: WebDriver instance (e.g., Selenium WebDriver)
+            url: The URL to scrape reviews from
+            max_reviews: Maximum number of reviews to collect
+            timeout_no_new: Seconds to wait with no new reviews before stopping
+            flush_callback: Optional callback called with reviews batches for streaming
+            flush_batch_size: Number of reviews before triggering flush_callback
+            progress_callback: Optional callback(current_count, total_count) for progress
+            validation_only: If True, return early after extracting metadata only
+
+        Returns:
+            Dictionary containing:
+                - reviews: List of review dictionaries
+                - total: Total number of reviews collected
+                - error: Error message if any, None otherwise
+                - Additional scraper-specific metadata
+        """
+        pass
+
+    @abstractmethod
+    def validate_url(self, url: str) -> bool:
+        """
+        Validate if the given URL is supported by this scraper.
+
+        Args:
+            url: The URL to validate
+
+        Returns:
+            True if the URL is valid for this scraper, False otherwise
+        """
+        pass
+
+    @abstractmethod
+    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
+        """
+        Extract business information from the URL without scraping reviews.
+
+        Args:
+            driver: WebDriver instance
+            url: The URL to extract info from
+
+        Returns:
+            Dictionary containing business metadata (name, rating, address, etc.)
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Return the human-readable name of this scraper."""
+        pass
+
+    @property
+    @abstractmethod
+    def version(self) -> str:
+        """Return the version string of this scraper."""
+        pass
+
+    @property
+    @abstractmethod
+    def supported_domains(self) -> List[str]:
+        """Return list of domains this scraper supports."""
+        pass
--- a/scrapers/google_reviews/__init__.py
+++ b/scrapers/google_reviews/__init__.py
@@ -0,0 +1,21 @@
+"""
+Google Reviews Scraper Package
+
+This package contains the Google Reviews scraper implementations.
+"""
+
+from scrapers.google_reviews.v1_0_0 import (
+    scrape_reviews,
+    fast_scrape_reviews,
+    get_business_card_info,
+    extract_about_info,
+    LogCapture,
+)
+
+# Public API of this package: the names imported above from the v1_0_0
+# implementation module, re-exported at package level.
+__all__ = [
+    "scrape_reviews",
+    "fast_scrape_reviews",
+    "get_business_card_info",
+    "extract_about_info",
+    "LogCapture",
+]
--- a/scrapers/google_reviews/v1_0_0.py
+++ b/scrapers/google_reviews/v1_0_0.py
--- a/scrapers/registry.py
+++ b/scrapers/registry.py
@@ -0,0 +1,138 @@
+"""
+Scraper Registry
+
+This module provides a registry for managing and discovering scrapers.
+It allows dynamic registration and lookup of scraper implementations.
+"""
+
+from typing import Dict, List, Optional, Type
+
+from scrapers.base import BaseScraper
+
+
+class ScraperRegistry:
+    """
+    Registry for managing scraper implementations.
+
+    The registry allows:
+    - Registering scrapers by name and version
+    - Looking up scrapers by domain or name
+    - Listing all available scrapers
+
+    Usage:
+        registry = ScraperRegistry()
+        registry.register(GoogleReviewsScraper)
+        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
+    """
+
+    _instance: Optional["ScraperRegistry"] = None
+    _scrapers: Dict[str, Type[BaseScraper]]
+
+    def __new__(cls) -> "ScraperRegistry":
+        """Singleton pattern to ensure one global registry."""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._scrapers = {}
+            cls._instance._domain_map = {}
+        return cls._instance
+
+    def register(self, scraper_class: Type[BaseScraper], name: Optional[str] = None) -> None:
+        """
+        Register a scraper class with the registry.
+
+        Args:
+            scraper_class: The scraper class to register (must inherit from BaseScraper)
+            name: Optional name override, defaults to scraper_class.name property
+        """
+        # Create a temporary instance to get properties
+        # Note: In production, we might want scraper_class to have class-level properties
+        instance = scraper_class.__new__(scraper_class)
+
+        scraper_name = name or instance.name
+        scraper_version = instance.version
+        key = f"{scraper_name}:{scraper_version}"
+
+        self._scrapers[key] = scraper_class
+
+        # Map domains to this scraper
+        for domain in instance.supported_domains:
+            if domain not in self._domain_map:
+                self._domain_map[domain] = []
+            self._domain_map[domain].append(key)
+
+    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type[BaseScraper]]:
+        """
+        Get a scraper class by name and optional version.
+
+        Args:
+            name: The scraper name
+            version: Optional version string. If not provided, returns the latest.
+
+        Returns:
+            The scraper class, or None if not found
+        """
+        if version:
+            key = f"{name}:{version}"
+            return self._scrapers.get(key)
+
+        # Find latest version for this name
+        matching = [k for k in self._scrapers.keys() if k.startswith(f"{name}:")]
+        if not matching:
+            return None
+
+        # Sort by version and return latest
+        matching.sort(reverse=True)
+        return self._scrapers.get(matching[0])
+
+    def get_scraper_for_url(self, url: str) -> Optional[Type[BaseScraper]]:
+        """
+        Find a suitable scraper for the given URL.
+
+        Args:
+            url: The URL to find a scraper for
+
+        Returns:
+            The scraper class that can handle this URL, or None if no match
+        """
+        from urllib.parse import urlparse
+
+        parsed = urlparse(url)
+        domain = parsed.netloc.lower()
+
+        # Remove www. prefix for matching
+        if domain.startswith("www."):
+            domain = domain[4:]
+
+        scraper_keys = self._domain_map.get(domain, [])
+        if not scraper_keys:
+            return None
+
+        # Return the latest version
+        scraper_keys.sort(reverse=True)
+        return self._scrapers.get(scraper_keys[0])
+
+    def list_scrapers(self) -> List[Dict[str, str]]:
+        """
+        List all registered scrapers.
+
+        Returns:
+            List of dictionaries with scraper info (name, version, domains)
+        """
+        result = []
+        for key, scraper_class in self._scrapers.items():
+            instance = scraper_class.__new__(scraper_class)
+            result.append({
+                "name": instance.name,
+                "version": instance.version,
+                "domains": instance.supported_domains
+            })
+        return result
+
+    def clear(self) -> None:
+        """Clear all registered scrapers. Useful for testing."""
+        self._scrapers.clear()
+        self._domain_map.clear()
+
+
+# Global registry instance. ScraperRegistry is a singleton, so calling
+# ScraperRegistry() elsewhere returns this same object.
+registry = ScraperRegistry()