Phase 0: Project restructure to ReviewIQ platform architecture
New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
10
scrapers/__init__.py
Normal file
10
scrapers/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""
|
||||
Scrapers Package
|
||||
|
||||
This package contains all scraper implementations for the ReviewIQ system.
|
||||
"""
|
||||
|
||||
from scrapers.base import BaseScraper
|
||||
from scrapers.registry import ScraperRegistry, registry
|
||||
|
||||
__all__ = ["BaseScraper", "ScraperRegistry", "registry"]
|
||||
97
scrapers/base.py
Normal file
97
scrapers/base.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""
|
||||
Base Scraper Interface
|
||||
|
||||
This module defines the abstract base class that all scrapers must implement.
|
||||
It ensures consistent interface across different scraper implementations.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """
    Contract shared by every scraper in the ReviewIQ system.

    A concrete scraper subclasses this class and supplies all three
    descriptive properties (``name``, ``version``, ``supported_domains``)
    and all three operations (``scrape``, ``validate_url``,
    ``get_business_info``). Until every one of them is overridden the
    subclass cannot be instantiated.
    """

    # ------------------------------------------------------------------
    # Descriptive properties
    # ------------------------------------------------------------------

    @property
    @abstractmethod
    def name(self) -> str:
        """Human-readable name identifying this scraper."""

    @property
    @abstractmethod
    def version(self) -> str:
        """Version string of this scraper implementation."""

    @property
    @abstractmethod
    def supported_domains(self) -> List[str]:
        """Domains whose URLs this scraper is able to handle."""

    # ------------------------------------------------------------------
    # Operations
    # ------------------------------------------------------------------

    @abstractmethod
    def validate_url(self, url: str) -> bool:
        """
        Tell whether this scraper supports the given URL.

        Args:
            url: The URL to check.

        Returns:
            True when the URL is valid for this scraper, otherwise False.
        """

    @abstractmethod
    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
        """
        Collect business information for a URL without scraping its reviews.

        Args:
            driver: WebDriver instance.
            url: The URL to read the information from.

        Returns:
            Dictionary of business metadata (name, rating, address, etc.).
        """

    @abstractmethod
    def scrape(
        self,
        driver: Any,
        url: str,
        max_reviews: int = 5000,
        timeout_no_new: int = 15,
        flush_callback: Optional[Callable[[List[Dict]], None]] = None,
        flush_batch_size: int = 500,
        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
        validation_only: bool = False
    ) -> Dict[str, Any]:
        """
        Collect reviews from the given URL.

        Args:
            driver: WebDriver instance (e.g., Selenium WebDriver).
            url: The URL to collect reviews from.
            max_reviews: Upper bound on the number of reviews to collect.
            timeout_no_new: Seconds to wait without any new review before
                stopping.
            flush_callback: Optional callback that receives batches of
                reviews, for streaming results out while scraping.
            flush_batch_size: Number of reviews accumulated before
                ``flush_callback`` is triggered.
            progress_callback: Optional ``callback(current_count, total_count)``
                used to report progress.
            validation_only: When True, return early once only the metadata
                has been extracted.

        Returns:
            Dictionary containing:
                - reviews: List of review dictionaries
                - total: Total number of reviews collected
                - error: Error message if any, None otherwise
                - Additional scraper-specific metadata
        """
|
||||
21
scrapers/google_reviews/__init__.py
Normal file
21
scrapers/google_reviews/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
Google Reviews Scraper Package
|
||||
|
||||
This package contains the Google Reviews scraper implementations.
|
||||
"""
|
||||
|
||||
from scrapers.google_reviews.v1_0_0 import (
|
||||
scrape_reviews,
|
||||
fast_scrape_reviews,
|
||||
get_business_card_info,
|
||||
extract_about_info,
|
||||
LogCapture,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"scrape_reviews",
|
||||
"fast_scrape_reviews",
|
||||
"get_business_card_info",
|
||||
"extract_about_info",
|
||||
"LogCapture",
|
||||
]
|
||||
2284
scrapers/google_reviews/v1_0_0.py
Normal file
2284
scrapers/google_reviews/v1_0_0.py
Normal file
File diff suppressed because it is too large
Load Diff
138
scrapers/registry.py
Normal file
138
scrapers/registry.py
Normal file
@@ -0,0 +1,138 @@
|
||||
"""
|
||||
Scraper Registry
|
||||
|
||||
This module provides a registry for managing and discovering scrapers.
|
||||
It allows dynamic registration and lookup of scraper implementations.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Type
|
||||
|
||||
from scrapers.base import BaseScraper
|
||||
|
||||
|
||||
class ScraperRegistry:
    """
    Process-wide registry of scraper implementations.

    The registry allows:
    - Registering scrapers by name and version
    - Looking up scrapers by domain or name
    - Listing all available scrapers

    The class is a singleton: every ``ScraperRegistry()`` call returns the
    same object, so registrations are shared by all callers.

    "Latest version" is decided by comparing dot-separated version segments
    numerically, so ``1.10.0`` is newer than ``1.9.0``.

    Usage:
        registry = ScraperRegistry()
        registry.register(GoogleReviewsScraper)
        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
    """

    _instance: Optional["ScraperRegistry"] = None
    # "name:version" key -> scraper class
    _scrapers: Dict[str, Type["BaseScraper"]]
    # normalized domain -> keys of the scrapers that declared that domain
    _domain_map: Dict[str, List[str]]
    # key -> (registered name, version, declared domains)
    _key_info: Dict[str, tuple]

    def __new__(cls) -> "ScraperRegistry":
        """Create the single shared registry on first use, then reuse it."""
        if cls._instance is None:
            instance = super().__new__(cls)
            instance._scrapers = {}
            instance._domain_map = {}
            instance._key_info = {}
            cls._instance = instance
        return cls._instance

    @staticmethod
    def _version_key(version: str) -> tuple:
        """
        Build a sort key that orders version strings segment by segment.

        Numeric segments compare as integers; a non-numeric segment (for
        example ``0-beta``) sorts below any numeric segment in the same
        position and is otherwise compared as text.
        """
        key = []
        for segment in str(version).lstrip("vV").split("."):
            if segment.isascii() and segment.isdigit():
                key.append((1, int(segment), ""))
            else:
                key.append((0, 0, segment))
        return tuple(key)

    @staticmethod
    def _normalize_domain(domain: str) -> str:
        """Lower-case a domain and drop a leading ``www.`` so lookups agree."""
        domain = domain.strip().lower()
        if domain.startswith("www."):
            domain = domain[4:]
        return domain

    def register(self, scraper_class: Type["BaseScraper"], name: Optional[str] = None) -> None:
        """
        Register a scraper class with the registry.

        Registering the same name/version again replaces the stored class
        and does not duplicate its domain entries.

        Args:
            scraper_class: The scraper class to register (must inherit from BaseScraper)
            name: Optional name override, defaults to scraper_class.name property
        """
        # The metadata lives in instance properties, so build a bare instance
        # (no __init__, hence no driver/network setup) just to read them.
        instance = scraper_class.__new__(scraper_class)

        scraper_name = name or instance.name
        scraper_version = instance.version
        declared_domains = list(instance.supported_domains)
        key = f"{scraper_name}:{scraper_version}"

        self._scrapers[key] = scraper_class
        self._key_info[key] = (scraper_name, scraper_version, declared_domains)

        # Map domains to this scraper
        for domain in declared_domains:
            keys = self._domain_map.setdefault(self._normalize_domain(domain), [])
            if key not in keys:
                keys.append(key)

    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type["BaseScraper"]]:
        """
        Get a scraper class by name and optional version.

        Args:
            name: The scraper name
            version: Optional version string. If not provided, returns the latest.

        Returns:
            The scraper class, or None if not found
        """
        if version:
            return self._scrapers.get(f"{name}:{version}")

        # Match on the registered name itself rather than a key prefix, so a
        # name containing ":" cannot be confused with another scraper.
        versions = [info[1] for info in self._key_info.values() if info[0] == name]
        if not versions:
            return None

        latest = max(versions, key=self._version_key)
        return self._scrapers.get(f"{name}:{latest}")

    def get_scraper_for_url(self, url: str) -> Optional[Type["BaseScraper"]]:
        """
        Find a suitable scraper for the given URL.

        The host is matched exactly (after lower-casing and removing a
        leading ``www.``); port and credentials in the URL are ignored.
        A URL without a scheme is treated as starting with its host.

        Args:
            url: The URL to find a scraper for

        Returns:
            The scraper class that can handle this URL, or None if no match
        """
        from urllib.parse import urlparse

        if not url or not isinstance(url, str):
            return None

        has_netloc = "://" in url or url.startswith("//")
        try:
            host = urlparse(url if has_netloc else f"//{url}").hostname or ""
        except ValueError:
            # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).
            return None

        scraper_keys = self._domain_map.get(self._normalize_domain(host))
        if not scraper_keys:
            return None

        def rank(key: str) -> tuple:
            registered_name, registered_version, _ = self._key_info.get(key, (key, "", ()))
            return (registered_name, self._version_key(registered_version))

        # Pick the winner without reordering the stored list.
        return self._scrapers.get(max(scraper_keys, key=rank))

    def list_scrapers(self) -> List[Dict[str, object]]:
        """
        List all registered scrapers.

        Returns:
            List of dictionaries with scraper info: ``name`` (the name the
            scraper was registered under), ``version`` and ``domains``
            (the list of domains the scraper declared).
        """
        return [
            {"name": scraper_name, "version": scraper_version, "domains": list(domains)}
            for scraper_name, scraper_version, domains in self._key_info.values()
        ]

    def clear(self) -> None:
        """Clear all registered scrapers. Useful for testing."""
        self._scrapers.clear()
        self._domain_map.clear()
        self._key_info.clear()


# Global registry instance
registry = ScraperRegistry()
|
||||
Reference in New Issue
Block a user