Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/api/routes/init.py
+++ b/api/routes/init.py
@@ -7,6 +7,7 @@ from api.routes.batches import router as batches_router, set_database as set_bat
 from api.routes.dashboard import router as dashboard_router, set_database as set_dashboard_db
 from api.routes.admin import router as admin_router, set_database as set_admin_db
 from api.routes.pipelines import router as pipelines_router, set_database as set_pipelines_db
+from api.routes.reviewiq_analytics import router as reviewiq_analytics_router, set_database as set_reviewiq_analytics_db

 __all__ = [
    'batches_router',
@@ -17,4 +18,6 @@ __all__ = [
    'set_admin_db',
    'pipelines_router',
    'set_pipelines_db',
+    'reviewiq_analytics_router',
+    'set_reviewiq_analytics_db',
 ]
--- a/api/routes/pipelines.py
+++ b/api/routes/pipelines.py
@@ -277,15 +277,17 @@ async def execute_pipeline(

    pipeline = await _get_pipeline_instance(pipeline_id)

+    # Create execution record
+    execution_id = str(uuid.uuid4())
+
    # Prepare input data
    input_data = request.input_data or {}
    if request.job_id:
        input_data["job_id"] = request.job_id
    if request.business_id:
        input_data["business_id"] = request.business_id
-
-    # Create execution record
-    execution_id = str(uuid.uuid4())
+    # Pass execution_id so Stage 5 synthesis can store results
+    input_data["execution_id"] = execution_id
    stages = request.stages or pipeline.get_stage_names()

    # Prepare input summary for storage
@@ -604,6 +606,7 @@ async def get_widget_data(
    pipeline_id: str,
    widget_id: str,
    business_id: str | None = Query(None, description="Filter by business"),
+    job_id: str | None = Query(None, description="Filter by job ID"),
    time_range: str = Query("30d", description="Time range (e.g., 7d, 30d, 90d)"),
    page: int = Query(1, ge=1, description="Page number for paginated widgets"),
    page_size: int = Query(10, ge=1, le=100, description="Items per page"),
@@ -621,6 +624,7 @@ async def get_widget_data(
    try:
        params = {
            "business_id": business_id,
+            "job_id": job_id,
            "time_range": time_range,
            "page": page,
            "page_size": page_size,
--- a/api/routes/sessions.py
+++ b/api/routes/sessions.py
@@ -0,0 +1,300 @@
+"""
+Session Routes for Google Reviews Scraper API
+
+Provides session handoff endpoints for efficient validation → scraping workflow.
+Uses scraper v1.2.0 with session support.
+
+Endpoints:
+  POST /sessions/validate - Validate URL, keep browser alive, return session_id
+  POST /sessions/scrape   - Scrape using existing session (skips navigation)
+  GET  /sessions          - List active sessions
+  GET  /sessions/{id}     - Get session status
+  DELETE /sessions/{id}   - Release session manually
+
+Usage:
+  1. POST /sessions/validate with URL → returns session_id
+  2. Frontend shows business info to user for confirmation
+  3. POST /sessions/scrape with session_id → scrapes using existing browser
+"""
+
+import asyncio
+import logging
+from typing import Optional, Dict, Any
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, HttpUrl, Field
+
+# Import v1.2.0 scraper with session support
+from scrapers.google_reviews.v1_2_0 import (
+    validate_with_session,
+    scrape_with_session,
+    LogCapture
+)
+from scrapers.google_reviews.session_manager import get_session_manager
+
+log = logging.getLogger("api_sessions")
+
+# Create router
+router = APIRouter(prefix="/sessions", tags=["sessions"])
+
+
+# ============================================================================
+# Request/Response Models
+# ============================================================================
+
+# Latitude/longitude pair. Both coordinates are required floats; no range
+# constraints are declared here.
+class GeoLocation(BaseModel):
+    lat: float
+    lng: float
+
+# Browser viewport dimensions (presumably in pixels — TODO confirm).
+# Both values are required integers.
+class Viewport(BaseModel):
+    width: int
+    height: int
+
+# Optional browser characteristics supplied by the client. Every field
+# defaults to None, so a caller may send any subset.
+# Field names are camelCase and are used verbatim as the keys of the
+# fingerprint dict handed to the scraper by the /sessions/validate endpoint.
+class BrowserFingerprint(BaseModel):
+    userAgent: Optional[str] = None
+    timezone: Optional[str] = None
+    language: Optional[str] = None
+    platform: Optional[str] = None
+    # Nested models; omitted from the fingerprint dict when not provided.
+    viewport: Optional[Viewport] = None
+    geolocation: Optional[GeoLocation] = None
+
+
+class ValidateRequest(BaseModel):
+    """Request body for session validation."""
+    # Validated as an HttpUrl by pydantic; nothing in this model checks that
+    # the URL actually points at Google Maps.
+    url: HttpUrl = Field(..., description="Google Maps URL to validate")
+    # Optional client browser characteristics (see BrowserFingerprint).
+    browser_fingerprint: Optional[BrowserFingerprint] = None
+    # Top-level geolocation, a separate field from
+    # browser_fingerprint.geolocation.
+    geolocation: Optional[GeoLocation] = None
+    # Defaults to 300 and is constrained to 60..900 by ge/le.
+    session_ttl: int = Field(300, description="Session TTL in seconds (default: 5 min)", ge=60, le=900)
+
+
+class ValidateResponse(BaseModel):
+    """Response from session validation."""
+    # `success` is the only required field; everything else has a default so
+    # an error response can be built from just success/error.
+    session_id: Optional[str] = Field(None, description="Session ID for scraping (None if validation failed)")
+    # default_factory gives each response its own empty dict.
+    business_info: Dict[str, Any] = Field(default_factory=dict)
+    total_reviews: Optional[int] = None
+    success: bool
+    # Error message; the validate endpoint fills it with str(exception) when
+    # validation raises.
+    error: Optional[str] = None
+    expires_in: Optional[int] = Field(None, description="Seconds until session expires")
+
+
+class ScrapeWithSessionRequest(BaseModel):
+    """Request body for scraping with an existing session."""
+    # Required; the value returned by POST /sessions/validate.
+    session_id: str = Field(..., description="Session ID from validation")
+    # When given, must be within 1..50000 (ge/le); None means no limit.
+    max_reviews: Optional[int] = Field(None, description="Max reviews to collect (None = unlimited)", ge=1, le=50000)
+    # Plain string: the values listed in the description are not enforced by
+    # this model.
+    sort_strategy: str = Field("auto", description="Sort strategy: auto, multi, newest, lowest, highest, relevant")
+    initial_sort: Optional[str] = Field(None, description="Initial sort order for first pass")
+
+
+class ScrapeWithSessionResponse(BaseModel):
+    """Response from session-based scraping."""
+    # `success` is the only required field, so an error response can be built
+    # from just success/error and the remaining fields fall back to defaults.
+    # Untyped list: the shape of each review is not validated here.
+    reviews: list = Field(default_factory=list)
+    count: int = 0
+    total_reviews: int = 0
+    success: bool
+    error: Optional[str] = None
+    # Presumably elapsed scraping time in seconds — TODO confirm with scraper.
+    time: float = 0
+    # Defaults to True, including on error responses that do not set it.
+    session_reused: bool = Field(True, description="Indicates session was reused from validation")
+    business_info: Dict[str, Any] = Field(default_factory=dict)
+
+
+class SessionInfo(BaseModel):
+    """Information about an active session."""
+    # NOTE(review): no endpoint in the visible code uses this model as a
+    # response_model; GET /sessions/{id} returns a plain dict with the same
+    # keys plus "url". Confirm whether this model is meant to be wired in.
+    session_id: str
+    business: str
+    state: str
+    total_reviews: int
+    age_seconds: int
+    ttl_remaining: int
+
+
+class SessionListResponse(BaseModel):
+    """Response listing all active sessions."""
+    total_sessions: int
+    # Untyped list: entries are passed through as returned by the session
+    # manager's stats, without per-item validation.
+    sessions: list
+
+
+# ============================================================================
+# Endpoints
+# ============================================================================
+
+@router.post("/validate", response_model=ValidateResponse, summary="Validate URL and Create Session")
+async def validate_and_create_session(request: ValidateRequest):
+    """
+    Validate a Google Maps URL and keep the browser session alive for scraping.
+
+    This endpoint:
+    1. Creates a Chrome browser
+    2. Navigates to the Google Maps URL
+    3. Extracts business information
+    4. Keeps the browser ALIVE and returns a session_id
+
+    The session can then be used with /sessions/scrape to continue scraping
+    without re-navigating (saves 4-16 seconds per job).
+
+    Session expires after TTL (default: 5 minutes).
+
+    Geolocation precedence: `browser_fingerprint.geolocation` is used when
+    present; otherwise the top-level `geolocation` is used.
+
+    Errors are not raised to the client: any exception is logged and reported
+    as `success=false` with the exception message in `error`.
+    """
+    try:
+        url = str(request.url)
+        log.info(f"Validating URL with session: {url[:80]}...")
+
+        # Build the fingerprint dict handed to the scraper (None = no overrides).
+        fingerprint = None
+        fp = request.browser_fingerprint
+        if fp is not None:
+            fingerprint = {
+                "userAgent": fp.userAgent,
+                "timezone": fp.timezone,
+                "language": fp.language,
+                "platform": fp.platform,
+            }
+            if fp.viewport is not None:
+                fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
+
+        # The fingerprint's own geolocation wins. Fall back to the top-level
+        # field so it is no longer dropped when a fingerprint without a
+        # geolocation is sent alongside it.
+        geo = fp.geolocation if fp is not None else None
+        if geo is None:
+            geo = request.geolocation
+        if geo is not None:
+            if fingerprint is None:
+                fingerprint = {}
+            fingerprint["geolocation"] = {"lat": geo.lat, "lng": geo.lng}
+
+        # validate_with_session is a blocking call (it drives Chrome), so run
+        # it in a worker thread instead of on the event loop.
+        result = await asyncio.to_thread(
+            validate_with_session,
+            url=url,
+            headless=False,  # Headed Chrome with Xvfb
+            browser_fingerprint=fingerprint,
+            session_ttl=request.session_ttl
+        )
+
+        return ValidateResponse(
+            session_id=result.get("session_id"),
+            business_info=result.get("business_info", {}),
+            total_reviews=result.get("total_reviews"),
+            success=result.get("success", False),
+            error=result.get("error"),
+            expires_in=result.get("expires_in")
+        )
+
+    except Exception as e:
+        # Deliberate catch-all at the API boundary: the failure is returned in
+        # the response body. log.exception keeps the traceback in the logs.
+        log.exception(f"Session validation error: {e}")
+        return ValidateResponse(
+            session_id=None,
+            success=False,
+            error=str(e)
+        )
+
+
+@router.post("/scrape", response_model=ScrapeWithSessionResponse, summary="Scrape Using Existing Session")
+async def scrape_using_session(request: ScrapeWithSessionRequest):
+    """
+    Scrape reviews using an existing validated session.
+
+    This endpoint:
+    1. Retrieves the browser from the session (already on Google Maps page)
+    2. Skips navigation and consent handling (already done)
+    3. Clicks Reviews tab and starts scraping
+    4. Releases the session when done
+
+    Saves 4-16 seconds compared to starting fresh.
+    """
+    try:
+        log.info(f"Scraping with session {request.session_id}...")
+
+        # scrape_with_session is a blocking call, so run it in a worker
+        # thread instead of on the event loop.
+        result = await asyncio.to_thread(
+            scrape_with_session,
+            session_id=request.session_id,
+            max_reviews=request.max_reviews,
+            sort_strategy=request.sort_strategy,
+            initial_sort=request.initial_sort
+        )
+
+        # Missing keys fall back to the same defaults the response model uses.
+        return ScrapeWithSessionResponse(
+            reviews=result.get("reviews", []),
+            count=result.get("count", 0),
+            total_reviews=result.get("total_reviews", 0),
+            success=result.get("success", False),
+            error=result.get("error"),
+            time=result.get("time", 0),
+            session_reused=result.get("session_reused", True),
+            business_info=result.get("business_info", {})
+        )
+
+    except Exception as e:
+        # Deliberate catch-all at the API boundary: the failure is returned in
+        # the response body. log.exception keeps the traceback in the logs.
+        log.exception(f"Session scraping error: {e}")
+        return ScrapeWithSessionResponse(
+            success=False,
+            error=str(e)
+        )
+
+
+@router.get("", response_model=SessionListResponse, summary="List Active Sessions")
+async def list_sessions():
+    """
+    List all active browser sessions.
+
+    Returns information about each session including:
+    - Business name
+    - State (validated, scraping)
+    - Time until expiration
+    """
+    # The docstring above is published as the endpoint description, so it is
+    # kept verbatim.
+    manager_stats = get_session_manager().get_stats()
+
+    # Missing keys degrade to "no sessions" rather than raising.
+    session_count = manager_stats.get("total_sessions", 0)
+    session_entries = manager_stats.get("sessions", [])
+
+    return SessionListResponse(total_sessions=session_count, sessions=session_entries)
+
+
+@router.get("/{session_id}", summary="Get Session Status")
+async def get_session_status(session_id: str):
+    """
+    Get the status of a specific session.
+    """
+    import time
+
+    # A falsy lookup result means the manager has no usable session for this
+    # id; report that as 404.
+    found = get_session_manager().get_session(session_id)
+    if not found:
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found or expired")
+
+    current_time = time.time()
+
+    # Age and remaining TTL are truncated to whole seconds.
+    status = {
+        "session_id": found.session_id,
+        "business": found.business_info.get("name", "unknown"),
+        "state": found.state,
+        "total_reviews": found.total_reviews,
+        "url": found.url,
+        "age_seconds": int(current_time - found.created_at),
+        "ttl_remaining": int(found.expires_at - current_time)
+    }
+    return status
+
+
+@router.delete("/{session_id}", summary="Release Session")
+async def release_session(session_id: str):
+    """
+    Manually release a session and close its browser.
+
+    Use this if the user cancels before scraping.
+    """
+    manager = get_session_manager()
+
+    # Only release sessions the manager still knows about; otherwise 404.
+    if not manager.get_session(session_id):
+        raise HTTPException(status_code=404, detail=f"Session {session_id} not found or expired")
+
+    manager.release_session(session_id, reason="manual_release")
+
+    confirmation = f"Session {session_id} released"
+    return {"success": True, "message": confirmation}
+
+
+# ============================================================================
+# Helper to register router with main app
+# ============================================================================
+
+def register_session_routes(app):
+    """Attach this module's `router` to `app` and log the registration.
+
+    `app` is expected to expose `include_router` (e.g. a FastAPI application).
+    """
+    # The router already carries the "/sessions" prefix, so no prefix is
+    # passed here.
+    app.include_router(router)
+    log.info("Session routes registered at /sessions")