feat: Add scraper version routing with v1.1.0 as default
- Import both v1.0.0 and v1.1.0 scraper versions - Add SCRAPER_VERSIONS registry mapping version strings to functions - Add get_scraper_for_version() to route based on job metadata - Default to v1.1.0 (multi-sort) for new jobs - Frontend can select specific version via scraper_version parameter - Validation endpoint continues using v1.0.0 for speed Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -28,7 +28,23 @@ from fastapi.responses import JSONResponse, StreamingResponse
|
|||||||
from core.database import DatabaseManager, JobStatus
|
from core.database import DatabaseManager, JobStatus
|
||||||
from services.webhook_service import WebhookDispatcher, WebhookManager
|
from services.webhook_service import WebhookDispatcher, WebhookManager
|
||||||
from utils.health_checks import HealthCheckSystem
|
from utils.health_checks import HealthCheckSystem
|
||||||
from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews, LogCapture, get_business_card_info # Clean scraper
|
from scrapers.google_reviews.v1_0_0 import fast_scrape_reviews as fast_scrape_v1_0_0, LogCapture, get_business_card_info
|
||||||
|
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews as fast_scrape_v1_1_0
|
||||||
|
|
||||||
|
# Scraper version registry - maps version strings to scraper functions
|
||||||
|
SCRAPER_VERSIONS = {
|
||||||
|
"1.0.0": fast_scrape_v1_0_0,
|
||||||
|
"v1.0.0": fast_scrape_v1_0_0,
|
||||||
|
"1.1.0": fast_scrape_v1_1_0,
|
||||||
|
"v1.1.0": fast_scrape_v1_1_0,
|
||||||
|
}
|
||||||
|
DEFAULT_SCRAPER_VERSION = "1.1.0" # Latest version as default
|
||||||
|
|
||||||
|
def get_scraper_for_version(version: str = None):
|
||||||
|
"""Get the appropriate scraper function for a version string."""
|
||||||
|
if version and version in SCRAPER_VERSIONS:
|
||||||
|
return SCRAPER_VERSIONS[version], version
|
||||||
|
return SCRAPER_VERSIONS[DEFAULT_SCRAPER_VERSION], DEFAULT_SCRAPER_VERSION
|
||||||
from utils.crash_analyzer import analyze_crash, summarize_crash_patterns, apply_auto_fix
|
from utils.crash_analyzer import analyze_crash, summarize_crash_patterns, apply_auto_fix
|
||||||
from utils.logger import StructuredLogger, LogEntry
|
from utils.logger import StructuredLogger, LogEntry
|
||||||
from workers.chrome_pool import (
|
from workers.chrome_pool import (
|
||||||
@@ -44,6 +60,7 @@ from api.routes import (
|
|||||||
batches_router, set_batches_db,
|
batches_router, set_batches_db,
|
||||||
dashboard_router, set_dashboard_db,
|
dashboard_router, set_dashboard_db,
|
||||||
admin_router, set_admin_db,
|
admin_router, set_admin_db,
|
||||||
|
pipelines_router, set_pipelines_db,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
@@ -92,6 +109,7 @@ async def lifespan(app: FastAPI):
|
|||||||
set_batches_db(db)
|
set_batches_db(db)
|
||||||
set_dashboard_db(db)
|
set_dashboard_db(db)
|
||||||
set_admin_db(db)
|
set_admin_db(db)
|
||||||
|
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
|
||||||
|
|
||||||
# Initialize health check system with canary monitoring
|
# Initialize health check system with canary monitoring
|
||||||
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
||||||
@@ -153,6 +171,7 @@ app.add_middleware(
|
|||||||
app.include_router(batches_router)
|
app.include_router(batches_router)
|
||||||
app.include_router(dashboard_router)
|
app.include_router(dashboard_router)
|
||||||
app.include_router(admin_router)
|
app.include_router(admin_router)
|
||||||
|
app.include_router(pipelines_router)
|
||||||
|
|
||||||
|
|
||||||
# ==================== Request/Response Models ====================
|
# ==================== Request/Response Models ====================
|
||||||
@@ -1122,7 +1141,7 @@ async def check_reviews(request: ScrapeRequest):
|
|||||||
log.info(f"Creating Chrome with default settings")
|
log.info(f"Creating Chrome with default settings")
|
||||||
|
|
||||||
result = await asyncio.to_thread(
|
result = await asyncio.to_thread(
|
||||||
fast_scrape_reviews,
|
fast_scrape_v1_0_0, # Use v1.0.0 for validation (fastest)
|
||||||
url=url,
|
url=url,
|
||||||
headless=False, # Use Xvfb display
|
headless=False, # Use Xvfb display
|
||||||
validation_only=True, # Return early after getting total_reviews
|
validation_only=True, # Return early after getting total_reviews
|
||||||
@@ -1704,10 +1723,15 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
# Schedule the coroutine on the event loop
|
# Schedule the coroutine on the event loop
|
||||||
asyncio.run_coroutine_threadsafe(save(), loop)
|
asyncio.run_coroutine_threadsafe(save(), loop)
|
||||||
|
|
||||||
|
# Get scraper version from metadata (frontend can specify version)
|
||||||
|
requested_version = metadata.get('scraper_version') if metadata else None
|
||||||
|
scraper_func, actual_version = get_scraper_for_version(requested_version)
|
||||||
|
log.info(f"Using scraper version {actual_version} for job {job_id}")
|
||||||
|
|
||||||
# Run scraping with progress callback and shared log capture
|
# Run scraping with progress callback and shared log capture
|
||||||
# headless=False because Docker uses Xvfb virtual display
|
# headless=False because Docker uses Xvfb virtual display
|
||||||
result = await asyncio.to_thread(
|
result = await asyncio.to_thread(
|
||||||
fast_scrape_reviews,
|
scraper_func,
|
||||||
url=url,
|
url=url,
|
||||||
headless=False,
|
headless=False,
|
||||||
progress_callback=progress_callback,
|
progress_callback=progress_callback,
|
||||||
|
|||||||
Reference in New Issue
Block a user