New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
98 lines
3.0 KiB
Python
98 lines
3.0 KiB
Python
"""
|
|
Base Scraper Interface
|
|
|
|
This module defines the abstract base class that all scrapers must implement.
|
|
It ensures consistent interface across different scraper implementations.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
|
|
class BaseScraper(ABC):
    """
    Abstract base class for all scrapers in the ReviewIQ system.

    Concrete scrapers inherit from this class and must implement every
    abstract method and property declared here; a subclass that leaves any
    of them undefined cannot be instantiated.
    """

    @abstractmethod
    def scrape(
        self,
        driver: Any,
        url: str,
        max_reviews: int = 5000,
        timeout_no_new: int = 15,
        flush_callback: Optional[Callable[[List[Dict[str, Any]]], None]] = None,
        flush_batch_size: int = 500,
        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
        validation_only: bool = False,
    ) -> Dict[str, Any]:
        """
        Scrape reviews from the given URL.

        Args:
            driver: WebDriver instance (e.g., Selenium WebDriver)
            url: The URL to scrape reviews from
            max_reviews: Maximum number of reviews to collect
            timeout_no_new: Seconds to wait with no new reviews before stopping
            flush_callback: Optional callback called with batches of reviews,
                for streaming results while the scrape is still running
            flush_batch_size: Number of reviews before triggering flush_callback
            progress_callback: Optional callback(current_count, total_count)
                for progress reporting; total_count may be None
            validation_only: If True, return early after extracting metadata only

        Returns:
            Dictionary containing:
                - reviews: List of review dictionaries
                - total: Total number of reviews collected
                - error: Error message if any, None otherwise
                - Additional scraper-specific metadata
        """

    @abstractmethod
    def validate_url(self, url: str) -> bool:
        """
        Validate if the given URL is supported by this scraper.

        Args:
            url: The URL to validate

        Returns:
            True if the URL is valid for this scraper, False otherwise
        """

    @abstractmethod
    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
        """
        Extract business information from the URL without scraping reviews.

        Args:
            driver: WebDriver instance
            url: The URL to extract info from

        Returns:
            Dictionary containing business metadata (name, rating, address, etc.)
        """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the human-readable name of this scraper."""

    @property
    @abstractmethod
    def version(self) -> str:
        """Return the version string of this scraper."""

    @property
    @abstractmethod
    def supported_domains(self) -> List[str]:
        """Return list of domains this scraper supports."""
|