Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

10
scrapers/__init__.py Normal file
View File

@@ -0,0 +1,10 @@
"""
Scrapers Package
This package contains all scraper implementations for the ReviewIQ system.
"""
from scrapers.base import BaseScraper
from scrapers.registry import ScraperRegistry, registry
# Public API of the package: the names imported above from scrapers.base and
# scrapers.registry are re-exported here.
__all__ = ["BaseScraper", "ScraperRegistry", "registry"]

97
scrapers/base.py Normal file
View File

@@ -0,0 +1,97 @@
"""
Base Scraper Interface
This module defines the abstract base class that all scrapers must implement.
It ensures consistent interface across different scraper implementations.
"""
from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, List, Optional
class BaseScraper(ABC):
    """
    Contract shared by every scraper in the ReviewIQ system.

    The class carries no behaviour of its own. A concrete scraper subclasses
    it and supplies three read-only properties (``name``, ``version``,
    ``supported_domains``) and three methods (``validate_url``,
    ``get_business_info``, ``scrape``). Until all six are overridden the
    subclass cannot be instantiated.
    """

    # ------------------------------------------------------------------
    # Identity
    # ------------------------------------------------------------------

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable name identifying this scraper."""

    @property
    @abstractmethod
    def version(self) -> str:
        """Version string of this scraper implementation."""

    @property
    @abstractmethod
    def supported_domains(self) -> List[str]:
        """Domains whose URLs this scraper is able to handle."""

    # ------------------------------------------------------------------
    # Operations
    # ------------------------------------------------------------------

    @abstractmethod
    def validate_url(self, url: str) -> bool:
        """
        Tell whether this scraper supports ``url``.

        Args:
            url: The URL to check.

        Returns:
            True when the URL is valid for this scraper, False otherwise.
        """

    @abstractmethod
    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
        """
        Collect business metadata for ``url`` without scraping any reviews.

        Args:
            driver: WebDriver instance.
            url: The URL to extract the information from.

        Returns:
            Dictionary of business metadata (name, rating, address, etc.).
        """

    @abstractmethod
    def scrape(
        self,
        driver: Any,
        url: str,
        max_reviews: int = 5000,
        timeout_no_new: int = 15,
        flush_callback: Optional[Callable[[List[Dict]], None]] = None,
        flush_batch_size: int = 500,
        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
        validation_only: bool = False
    ) -> Dict[str, Any]:
        """
        Collect reviews from ``url``.

        Args:
            driver: WebDriver instance (e.g., Selenium WebDriver).
            url: The URL to scrape reviews from.
            max_reviews: Upper bound on the number of reviews to collect.
            timeout_no_new: Seconds to wait without new reviews before
                stopping.
            flush_callback: Optional callback that receives batches of
                reviews, for streaming results out while scraping.
            flush_batch_size: Number of reviews accumulated before
                ``flush_callback`` is triggered.
            progress_callback: Optional ``callback(current_count,
                total_count)`` used to report progress.
            validation_only: When True, return early once metadata has been
                extracted.

        Returns:
            Dictionary containing:
                - reviews: list of review dictionaries
                - total: total number of reviews collected
                - error: error message if any, None otherwise
                - any additional scraper-specific metadata
        """

View File

@@ -0,0 +1,21 @@
"""
Google Reviews Scraper Package
This package contains the Google Reviews scraper implementations.
"""
from scrapers.google_reviews.v1_0_0 import (
scrape_reviews,
fast_scrape_reviews,
get_business_card_info,
extract_about_info,
LogCapture,
)
# Public API of the package: exactly the names imported above from
# scrapers.google_reviews.v1_0_0.
__all__ = [
"scrape_reviews",
"fast_scrape_reviews",
"get_business_card_info",
"extract_about_info",
"LogCapture",
]

File diff suppressed because it is too large Load Diff

138
scrapers/registry.py Normal file
View File

@@ -0,0 +1,138 @@
"""
Scraper Registry
This module provides a registry for managing and discovering scrapers.
It allows dynamic registration and lookup of scraper implementations.
"""
from typing import Dict, List, Optional, Type
from scrapers.base import BaseScraper
class ScraperRegistry:
    """
    Registry for managing scraper implementations.

    The registry allows:
        - Registering scrapers by name and version
        - Looking up scrapers by domain or name
        - Listing all available scrapers

    The class is a singleton: every ``ScraperRegistry()`` call returns the
    same object, so registrations are shared by all callers.

    Scrapers are stored under the key ``"<name>:<version>"``. When several
    versions are candidates, the "latest" one is chosen by comparing the
    dot-separated version components numerically, so ``1.10.0`` ranks above
    ``1.9.0``.

    Usage:
        registry = ScraperRegistry()
        registry.register(GoogleReviewsScraper)
        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
    """

    # Annotations are quoted forward references, so they are never evaluated
    # at class-creation time.
    _instance: Optional["ScraperRegistry"] = None
    # "<name>:<version>" -> scraper class
    _scrapers: Dict[str, Type["BaseScraper"]]
    # normalised domain -> keys of the scrapers that declared it
    _domain_map: Dict[str, List[str]]

    def __new__(cls) -> "ScraperRegistry":
        """Singleton pattern to ensure one global registry."""
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._scrapers = {}
            instance._domain_map = {}
            cls._instance = instance
        return cls._instance

    @staticmethod
    def _normalize_domain(domain: str) -> str:
        """Lower-case ``domain`` and drop a leading ``www.``."""
        domain = domain.strip().lower()
        if domain.startswith("www."):
            domain = domain[4:]
        return domain

    @staticmethod
    def _sort_key(key: str):
        """
        Build an ordering key for a ``"<name>:<version>"`` registry key.

        Keys order by name first, then by version. The version is split on
        ``"."``; all-decimal components compare as integers and rank above
        non-numeric components (so ``1.0.0`` ranks above ``1.0.0rc1``).
        The version is taken to be the text after the last ``":"``.
        """
        name, _, version = key.rpartition(":")
        # Each component is a 3-tuple whose first field separates numeric
        # from textual parts, so ints and strs are never compared directly.
        parts = tuple(
            (1, int(piece), "") if piece.isdecimal() else (0, 0, piece)
            for piece in version.split(".")
        )
        return (name, parts)

    def register(self, scraper_class: Type["BaseScraper"], name: Optional[str] = None) -> None:
        """
        Register a scraper class with the registry.

        Registering the same name and version again replaces the stored
        class and does not duplicate its domain entries.

        Args:
            scraper_class: The scraper class to register (must inherit from BaseScraper)
            name: Optional name override, defaults to scraper_class.name property
        """
        # name/version/supported_domains are instance properties, so a bare
        # instance is created (bypassing __init__) just to read them.
        instance = scraper_class.__new__(scraper_class)
        scraper_name = name or instance.name
        key = f"{scraper_name}:{instance.version}"
        self._scrapers[key] = scraper_class

        # Domains are stored in the same normalised form that
        # get_scraper_for_url() looks up.
        for domain in instance.supported_domains:
            keys = self._domain_map.setdefault(self._normalize_domain(domain), [])
            if key not in keys:
                keys.append(key)

    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type["BaseScraper"]]:
        """
        Get a scraper class by name and optional version.

        Args:
            name: The scraper name
            version: Optional version string. If not provided, returns the latest.

        Returns:
            The scraper class, or None if not found
        """
        if version:
            return self._scrapers.get(f"{name}:{version}")

        prefix = f"{name}:"
        matching = [key for key in self._scrapers if key.startswith(prefix)]
        if not matching:
            return None
        # Numeric-aware comparison: a plain string sort would rank
        # "1.9.0" above "1.10.0".
        return self._scrapers.get(max(matching, key=self._sort_key))

    def get_scraper_for_url(self, url: str) -> Optional[Type["BaseScraper"]]:
        """
        Find a suitable scraper for the given URL.

        Matching uses the URL's host name only; port and credentials are
        ignored, the host is lower-cased and a leading ``www.`` is dropped.

        Args:
            url: The URL to find a scraper for

        Returns:
            The scraper class that can handle this URL, or None if no match
            (including when the URL has no host or cannot be parsed)
        """
        from urllib.parse import urlparse

        try:
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6 host).
            return None
        if not host:
            return None

        candidates = self._domain_map.get(self._normalize_domain(host), [])
        if not candidates:
            return None
        # max() leaves the stored list untouched, unlike an in-place sort.
        return self._scrapers.get(max(candidates, key=self._sort_key))

    def list_scrapers(self) -> List[Dict[str, object]]:
        """
        List all registered scrapers.

        Returns:
            List of dictionaries with scraper info: ``name`` (the name the
            scraper was registered under, i.e. the one get_scraper()
            accepts), ``version`` and ``domains``.
        """
        result = []
        for key, scraper_class in self._scrapers.items():
            instance = scraper_class.__new__(scraper_class)
            version = instance.version
            # key is "<registered name>:<version>"; cut off ":<version>" to
            # recover the registered name, honouring any name= override.
            registered_name = key[: -(len(f"{version}") + 1)]
            result.append({
                "name": registered_name,
                "version": version,
                "domains": list(instance.supported_domains),
            })
        return result

    def clear(self) -> None:
        """Clear all registered scrapers. Useful for testing."""
        self._scrapers.clear()
        self._domain_map.clear()
# Global registry instance. ScraperRegistry.__new__ hands back the same object
# on every call, so this name and any later ScraperRegistry() refer to one
# shared registry.
registry = ScraperRegistry()