feat: Add scraper version routing with v1.1.0 as default

- Import both v1.0.0 and v1.1.0 scraper versions
- Add SCRAPER_VERSIONS registry mapping version strings to functions
- Add get_scraper_for_version() to route based on job metadata
- Default to v1.1.0 (multi-sort) for new jobs
- Frontend can select specific version via scraper_version parameter
- Validation endpoint continues using v1.0.0 for speed

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 19:04:06 +00:00
parent 7771c734c6
commit d64f06ba9e

View File

@@ -28,7 +28,23 @@ from fastapi.responses import JSONResponse, StreamingResponse
from core.database import DatabaseManager, JobStatus from core.database import DatabaseManager, JobStatus
from services.webhook_service import WebhookDispatcher, WebhookManager from services.webhook_service import WebhookDispatcher, WebhookManager
from utils.health_checks import HealthCheckSystem from utils.health_checks import HealthCheckSystem
from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews, LogCapture, get_business_card_info # Clean scraper from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews as fast_scrape_v1_0_0, LogCapture, get_business_card_info
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews as fast_scrape_v1_1_0
# Scraper version registry - maps version strings to scraper functions
SCRAPER_VERSIONS = {
"1.0.0": fast_scrape_v1_0_0,
"v1.0.0": fast_scrape_v1_0_0,
"1.1.0": fast_scrape_v1_1_0,
"v1.1.0": fast_scrape_v1_1_0,
}
DEFAULT_SCRAPER_VERSION = "1.1.0" # Latest version as default
def get_scraper_for_version(version: str = None):
"""Get the appropriate scraper function for a version string."""
if version and version in SCRAPER_VERSIONS:
return SCRAPER_VERSIONS[version], version
return SCRAPER_VERSIONS[DEFAULT_SCRAPER_VERSION], DEFAULT_SCRAPER_VERSION
from utils.crash_analyzer import analyze_crash, summarize_crash_patterns, apply_auto_fix from utils.crash_analyzer import analyze_crash, summarize_crash_patterns, apply_auto_fix
from utils.logger import StructuredLogger, LogEntry from utils.logger import StructuredLogger, LogEntry
from workers.chrome_pool import ( from workers.chrome_pool import (
@@ -44,6 +60,7 @@ from api.routes import (
batches_router, set_batches_db, batches_router, set_batches_db,
dashboard_router, set_dashboard_db, dashboard_router, set_dashboard_db,
admin_router, set_admin_db, admin_router, set_admin_db,
pipelines_router, set_pipelines_db,
) )
# Configure logging # Configure logging
@@ -92,6 +109,7 @@ async def lifespan(app: FastAPI):
set_batches_db(db) set_batches_db(db)
set_dashboard_db(db) set_dashboard_db(db)
set_admin_db(db) set_admin_db(db)
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
# Initialize health check system with canary monitoring # Initialize health check system with canary monitoring
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting # DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
@@ -153,6 +171,7 @@ app.add_middleware(
app.include_router(batches_router) app.include_router(batches_router)
app.include_router(dashboard_router) app.include_router(dashboard_router)
app.include_router(admin_router) app.include_router(admin_router)
app.include_router(pipelines_router)
# ==================== Request/Response Models ==================== # ==================== Request/Response Models ====================
@@ -1122,7 +1141,7 @@ async def check_reviews(request: ScrapeRequest):
log.info(f"Creating Chrome with default settings") log.info(f"Creating Chrome with default settings")
result = await asyncio.to_thread( result = await asyncio.to_thread(
fast_scrape_reviews, fast_scrape_v1_0_0, # Use v1.0.0 for validation (fastest)
url=url, url=url,
headless=False, # Use Xvfb display headless=False, # Use Xvfb display
validation_only=True, # Return early after getting total_reviews validation_only=True, # Return early after getting total_reviews
@@ -1704,10 +1723,15 @@ async def run_scraping_job(job_id: UUID):
# Schedule the coroutine on the event loop # Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(save(), loop) asyncio.run_coroutine_threadsafe(save(), loop)
# Get scraper version from metadata (frontend can specify version)
requested_version = metadata.get('scraper_version') if metadata else None
scraper_func, actual_version = get_scraper_for_version(requested_version)
log.info(f"Using scraper version {actual_version} for job {job_id}")
# Run scraping with progress callback and shared log capture # Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display # headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread( result = await asyncio.to_thread(
fast_scrape_reviews, scraper_func,
url=url, url=url,
headless=False, headless=False,
progress_callback=progress_callback, progress_callback=progress_callback,