Phase 0: Project restructure to ReviewIQ platform architecture
New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
97
scrapers/base.py
Normal file
97
scrapers/base.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""
Base Scraper Interface

This module defines the abstract base class that all scrapers must implement.
It ensures a consistent interface across different scraper implementations.
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Dict, List, Optional
|
||||
|
||||
|
||||
class BaseScraper(ABC):
    """
    Abstract base class for all scrapers in the ReviewIQ system.

    All concrete scraper implementations must inherit from this class
    and implement the required abstract methods.

    A subclass can only be instantiated once it overrides every abstract
    member declared here: the methods ``scrape``, ``validate_url`` and
    ``get_business_info``, and the read-only properties ``name``,
    ``version`` and ``supported_domains``.

    The abstract members carry no implementation of their own; calling one
    through ``super()`` simply returns ``None``.
    """

    @abstractmethod
    def scrape(
        self,
        driver: Any,
        url: str,
        max_reviews: int = 5000,
        timeout_no_new: int = 15,
        flush_callback: Optional[Callable[[List[Dict]], None]] = None,
        flush_batch_size: int = 500,
        progress_callback: Optional[Callable[[int, Optional[int]], None]] = None,
        validation_only: bool = False,
    ) -> Dict[str, Any]:
        """
        Scrape reviews from the given URL.

        Args:
            driver: WebDriver instance (e.g., Selenium WebDriver)
            url: The URL to scrape reviews from
            max_reviews: Maximum number of reviews to collect
            timeout_no_new: Seconds to wait with no new reviews before stopping
            flush_callback: Optional callback called with reviews batches for streaming
            flush_batch_size: Number of reviews before triggering flush_callback
            progress_callback: Optional callback(current_count, total_count) for progress
            validation_only: If True, return early after extracting metadata only

        Returns:
            Dictionary containing:
                - reviews: List of review dictionaries
                - total: Total number of reviews collected
                - error: Error message if any, None otherwise
                - Additional scraper-specific metadata
        """

    @abstractmethod
    def validate_url(self, url: str) -> bool:
        """
        Validate if the given URL is supported by this scraper.

        Args:
            url: The URL to validate

        Returns:
            True if the URL is valid for this scraper, False otherwise
        """

    @abstractmethod
    def get_business_info(self, driver: Any, url: str) -> Dict[str, Any]:
        """
        Extract business information from the URL without scraping reviews.

        Args:
            driver: WebDriver instance
            url: The URL to extract info from

        Returns:
            Dictionary containing business metadata (name, rating, address, etc.)
        """

    # ``@property`` must be the outermost decorator so that the ABC machinery
    # sees the abstract flag set by ``@abstractmethod`` on the getter.
    @property
    @abstractmethod
    def name(self) -> str:
        """Return the human-readable name of this scraper."""

    @property
    @abstractmethod
    def version(self) -> str:
        """Return the version string of this scraper."""

    @property
    @abstractmethod
    def supported_domains(self) -> List[str]:
        """Return list of domains this scraper supports."""
|
||||
Reference in New Issue
Block a user