Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

138
scrapers/registry.py Normal file
View File

@@ -0,0 +1,138 @@
"""
Scraper Registry
This module provides a registry for managing and discovering scrapers.
It allows dynamic registration and lookup of scraper implementations.
"""
from typing import Dict, List, Optional, Type
from scrapers.base import BaseScraper
class ScraperRegistry:
    """
    Registry for managing scraper implementations.

    The registry allows:
    - Registering scrapers by name and version
    - Looking up scrapers by domain or name
    - Listing all available scrapers

    Scrapers are stored under a ``"<name>:<version>"`` key. The version is
    taken to be everything after the last ``":"``, so version strings must
    not contain a colon.

    Usage:
        registry = ScraperRegistry()
        registry.register(GoogleReviewsScraper)
        scraper = registry.get_scraper_for_url("https://google.com/maps/place/...")
    """

    _instance: Optional["ScraperRegistry"] = None
    # "<name>:<version>" -> scraper class
    _scrapers: Dict[str, Type["BaseScraper"]]
    # normalized domain -> registry keys of the scrapers that support it
    _domain_map: Dict[str, List[str]]

    def __new__(cls) -> "ScraperRegistry":
        """Singleton pattern to ensure one global registry."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._scrapers = {}
            cls._instance._domain_map = {}
        return cls._instance

    @staticmethod
    def _normalize_domain(domain: str) -> str:
        """Lower-case a host name and drop a leading ``www.``.

        Used for both registration and lookup so the two always agree.
        """
        domain = domain.strip().lower()
        if domain.startswith("www."):
            domain = domain[4:]
        return domain

    @staticmethod
    def _sort_key(key: str) -> tuple:
        """Return an ordering key for a ``"<name>:<version>"`` registry key.

        Versions are compared by their numeric components, so ``1.10.0``
        ranks above ``1.9.0`` (a plain string sort gets this wrong). The raw
        version string is only a deterministic tie-breaker; pre-release
        suffixes are not interpreted. The name is compared first, which keeps
        the name-major ordering a plain sort of the keys would give.
        """
        import re

        name, _, version = key.rpartition(":")
        numbers = [int(part) for part in re.findall(r"\d+", version)]
        return (name, numbers, version)

    def register(self, scraper_class: Type["BaseScraper"], name: Optional[str] = None) -> None:
        """
        Register a scraper class with the registry.

        Registering the same name/version again replaces the earlier class
        and its domain routes instead of accumulating duplicates.

        Args:
            scraper_class: The scraper class to register (must inherit from BaseScraper)
            name: Optional name override, defaults to scraper_class.name property
        """
        # Create a bare instance (no __init__) to read the descriptive properties.
        # Note: In production, we might want scraper_class to have class-level properties
        instance = scraper_class.__new__(scraper_class)
        scraper_name = name or instance.name
        scraper_version = instance.version
        key = f"{scraper_name}:{scraper_version}"

        if key in self._scrapers:
            # Re-registration: drop routes left behind by the previous class.
            for keys in self._domain_map.values():
                if key in keys:
                    keys.remove(key)
        self._scrapers[key] = scraper_class

        # Map domains to this scraper, normalized the same way lookups are.
        for domain in instance.supported_domains:
            keys = self._domain_map.setdefault(self._normalize_domain(domain), [])
            if key not in keys:
                keys.append(key)

    def get_scraper(self, name: str, version: Optional[str] = None) -> Optional[Type["BaseScraper"]]:
        """
        Get a scraper class by name and optional version.

        Args:
            name: The scraper name
            version: Optional version string. If not provided, returns the latest.

        Returns:
            The scraper class, or None if not found
        """
        if version:
            return self._scrapers.get(f"{name}:{version}")

        # Collect every registered version of this name and pick the highest.
        matching = [k for k in self._scrapers if k.rpartition(":")[0] == name]
        if not matching:
            return None
        return self._scrapers[max(matching, key=self._sort_key)]

    def get_scraper_for_url(self, url: str) -> Optional[Type["BaseScraper"]]:
        """
        Find a suitable scraper for the given URL.

        Only the exact host is matched (ignoring case, port, credentials and
        a leading ``www.``).

        Args:
            url: The URL to find a scraper for

        Returns:
            The scraper class that can handle this URL, or None if no match
        """
        from urllib.parse import urlparse

        # hostname (unlike netloc) excludes any port and userinfo; it is None
        # when the URL has no network location.
        host = urlparse(url).hostname or ""
        scraper_keys = self._domain_map.get(self._normalize_domain(host))
        if not scraper_keys:
            return None
        # max() picks the latest version without reordering the stored list.
        return self._scrapers.get(max(scraper_keys, key=self._sort_key))

    def list_scrapers(self) -> List[Dict[str, object]]:
        """
        List all registered scrapers.

        The reported name is the one the scraper was registered under (which
        is the override passed to register(), if any), i.e. the name that
        get_scraper() accepts.

        Returns:
            List of dictionaries with scraper info (name, version, domains)
        """
        result = []
        for key, scraper_class in self._scrapers.items():
            instance = scraper_class.__new__(scraper_class)
            result.append({
                "name": key.rpartition(":")[0],
                "version": instance.version,
                "domains": instance.supported_domains,
            })
        return result

    def clear(self) -> None:
        """Clear all registered scrapers. Useful for testing."""
        self._scrapers.clear()
        self._domain_map.clear()
# Global registry instance. ScraperRegistry.__new__ always hands back the same
# object, so this name and any later ScraperRegistry() call share one registry.
registry: ScraperRegistry = ScraperRegistry()