Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -7,6 +7,7 @@ from api.routes.batches import router as batches_router, set_database as set_bat
from api.routes.dashboard import router as dashboard_router, set_database as set_dashboard_db
from api.routes.admin import router as admin_router, set_database as set_admin_db
from api.routes.pipelines import router as pipelines_router, set_database as set_pipelines_db
from api.routes.reviewiq_analytics import router as reviewiq_analytics_router, set_database as set_reviewiq_analytics_db
__all__ = [
'batches_router',
@@ -17,4 +18,6 @@ __all__ = [
'set_admin_db',
'pipelines_router',
'set_pipelines_db',
'reviewiq_analytics_router',
'set_reviewiq_analytics_db',
]

View File

@@ -277,15 +277,17 @@ async def execute_pipeline(
pipeline = await _get_pipeline_instance(pipeline_id)
# Create execution record
execution_id = str(uuid.uuid4())
# Prepare input data
input_data = request.input_data or {}
if request.job_id:
input_data["job_id"] = request.job_id
if request.business_id:
input_data["business_id"] = request.business_id
# Create execution record
execution_id = str(uuid.uuid4())
# Pass execution_id so Stage 5 synthesis can store results
input_data["execution_id"] = execution_id
stages = request.stages or pipeline.get_stage_names()
# Prepare input summary for storage
@@ -604,6 +606,7 @@ async def get_widget_data(
pipeline_id: str,
widget_id: str,
business_id: str | None = Query(None, description="Filter by business"),
job_id: str | None = Query(None, description="Filter by job ID"),
time_range: str = Query("30d", description="Time range (e.g., 7d, 30d, 90d)"),
page: int = Query(1, ge=1, description="Page number for paginated widgets"),
page_size: int = Query(10, ge=1, le=100, description="Items per page"),
@@ -621,6 +624,7 @@ async def get_widget_data(
try:
params = {
"business_id": business_id,
"job_id": job_id,
"time_range": time_range,
"page": page,
"page_size": page_size,

300
api/routes/sessions.py Normal file
View File

@@ -0,0 +1,300 @@
"""
Session Routes for Google Reviews Scraper API
Provides session handoff endpoints for efficient validation → scraping workflow.
Uses scraper v1.2.0 with session support.
Endpoints:
POST /sessions/validate - Validate URL, keep browser alive, return session_id
POST /sessions/scrape - Scrape using existing session (skips navigation)
GET /sessions - List active sessions
GET /sessions/{id} - Get session status
DELETE /sessions/{id} - Release session manually
Usage:
1. POST /sessions/validate with URL → returns session_id
2. Frontend shows business info to user for confirmation
3. POST /sessions/scrape with session_id → scrapes using existing browser
"""
import asyncio
import logging
from typing import Optional, Dict, Any
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, HttpUrl, Field
# Import v1.2.0 scraper with session support
from scrapers.google_reviews.v1_2_0 import (
validate_with_session,
scrape_with_session,
LogCapture
)
from scrapers.google_reviews.session_manager import get_session_manager
# Module-level logger. The name is the fixed string "api_sessions" (not
# __name__), so logging configuration must target that name.
log = logging.getLogger("api_sessions")
# Create router
# Every endpoint declared on this router is served under the /sessions prefix
# and grouped under the "sessions" tag.
router = APIRouter(prefix="/sessions", tags=["sessions"])
# ============================================================================
# Request/Response Models
# ============================================================================
class GeoLocation(BaseModel):
    # Coordinate pair; both fields are required floats.
    # NOTE(review): presumably latitude/longitude in decimal degrees — confirm
    # against what the scraper expects.
    lat: float
    lng: float
class Viewport(BaseModel):
    # Browser viewport size; both fields are required ints.
    # NOTE(review): presumably pixels — confirm with the scraper.
    width: int
    height: int
class BrowserFingerprint(BaseModel):
    # Optional browser attributes a client may send with a validation request.
    # Every field defaults to None, so an empty object is valid.
    # `userAgent` is camelCase on purpose: the /validate handler copies it
    # into the fingerprint dict under the same key.
    userAgent: Optional[str] = None
    timezone: Optional[str] = None
    language: Optional[str] = None
    platform: Optional[str] = None
    # Nested models; the /validate handler flattens them to plain dicts.
    viewport: Optional[Viewport] = None
    geolocation: Optional[GeoLocation] = None
class ValidateRequest(BaseModel):
    """Request body for session validation."""
    # Required; validated by pydantic as an HTTP(S) URL before the handler runs.
    url: HttpUrl = Field(..., description="Google Maps URL to validate")
    # Optional full fingerprint of the client's browser.
    browser_fingerprint: Optional[BrowserFingerprint] = None
    # Optional standalone geolocation, for callers that do not send a full
    # browser_fingerprint.
    geolocation: Optional[GeoLocation] = None
    # Defaults to 300 and is constrained to the inclusive range 60..900.
    session_ttl: int = Field(300, description="Session TTL in seconds (default: 5 min)", ge=60, le=900)
class ValidateResponse(BaseModel):
    """Response from session validation."""
    # None when validation failed (see the field description).
    session_id: Optional[str] = Field(None, description="Session ID for scraping (None if validation failed)")
    # Free-form mapping; defaults to a fresh empty dict per instance.
    business_info: Dict[str, Any] = Field(default_factory=dict)
    total_reviews: Optional[int] = None
    # The only field without a default: callers must always state the outcome.
    success: bool
    # Error text; the /validate handler sets it to str(exception) on failure.
    error: Optional[str] = None
    expires_in: Optional[int] = Field(None, description="Seconds until session expires")
class ScrapeWithSessionRequest(BaseModel):
    """Request body for scraping with an existing session."""
    # Required; the ID previously returned by POST /sessions/validate.
    session_id: str = Field(..., description="Session ID from validation")
    # None means no cap; when given it must lie in the inclusive range 1..50000.
    max_reviews: Optional[int] = Field(None, description="Max reviews to collect (None = unlimited)", ge=1, le=50000)
    # Plain str: the values listed in the description are NOT enforced by this
    # model, so any string is passed through to the scraper.
    sort_strategy: str = Field("auto", description="Sort strategy: auto, multi, newest, lowest, highest, relevant")
    initial_sort: Optional[str] = Field(None, description="Initial sort order for first pass")
class ScrapeWithSessionResponse(BaseModel):
    """Response from session-based scraping."""
    # Untyped list; elements are whatever the scraper returns under "reviews".
    reviews: list = Field(default_factory=list)
    # NOTE(review): presumably len(reviews), with total_reviews being the
    # count reported by the listing — confirm against the scraper's result.
    count: int = 0
    total_reviews: int = 0
    # The only field without a default.
    success: bool
    error: Optional[str] = None
    # NOTE(review): presumably elapsed scrape time in seconds — confirm.
    time: float = 0
    # Defaults to True, so it is also True on the handler's error response.
    session_reused: bool = Field(True, description="Indicates session was reused from validation")
    business_info: Dict[str, Any] = Field(default_factory=dict)
class SessionInfo(BaseModel):
    """Information about an active session."""
    # All fields are required.
    # NOTE(review): no endpoint in the visible part of this file uses this
    # model; GET /sessions/{id} returns a plain dict with these keys plus
    # "url". Confirm whether it is consumed elsewhere.
    session_id: str
    business: str
    state: str
    total_reviews: int
    age_seconds: int
    ttl_remaining: int
class SessionListResponse(BaseModel):
    """Response listing all active sessions."""
    total_sessions: int
    # Untyped list: GET /sessions fills it with the "sessions" entry from the
    # session manager's get_stats() result, with no per-item validation.
    sessions: list
# ============================================================================
# Endpoints
# ============================================================================
def _build_fingerprint(request: ValidateRequest) -> Optional[Dict[str, Any]]:
    """Convert the request's fingerprint models into the dict passed to the scraper.

    Returns None when the request carries neither a browser fingerprint nor a
    geolocation. A geolocation inside ``browser_fingerprint`` takes precedence;
    the request-level ``geolocation`` is used as a fallback so it is not
    silently dropped when a fingerprint without coordinates is supplied.
    """
    fp = request.browser_fingerprint
    geo = request.geolocation

    if fp is None:
        if geo is None:
            return None
        return {"geolocation": {"lat": geo.lat, "lng": geo.lng}}

    # Scalar attributes are forwarded as-is, including None for unset ones.
    fingerprint: Dict[str, Any] = {
        "userAgent": fp.userAgent,
        "timezone": fp.timezone,
        "language": fp.language,
        "platform": fp.platform,
    }
    if fp.viewport is not None:
        fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}

    if fp.geolocation is not None:
        geo = fp.geolocation
    if geo is not None:
        fingerprint["geolocation"] = {"lat": geo.lat, "lng": geo.lng}
    return fingerprint


@router.post("/validate", response_model=ValidateResponse, summary="Validate URL and Create Session")
async def validate_and_create_session(request: ValidateRequest):
    """
    Validate a Google Maps URL and keep the browser session alive for scraping.

    This endpoint:
    1. Creates a Chrome browser
    2. Navigates to the Google Maps URL
    3. Extracts business information
    4. Keeps the browser ALIVE and returns a session_id

    The session can then be used with /sessions/scrape to continue scraping
    without re-navigating (saves 4-16 seconds per job).

    Session expires after TTL (default: 5 minutes).
    """
    # request: validated ValidateRequest body (URL, optional fingerprint /
    #          geolocation, session TTL).
    # Returns a ValidateResponse built from the dict returned by
    # validate_with_session. Any exception is logged and reported as
    # success=False with the exception text in `error`; it is never re-raised,
    # so this handler does not itself produce an error status.
    try:
        url = str(request.url)
        log.info(f"Validating URL with session: {url[:80]}...")

        # validate_with_session is a blocking call, so it runs in a worker
        # thread to keep the event loop responsive.
        result = await asyncio.to_thread(
            validate_with_session,
            url=url,
            headless=False,  # Headed Chrome with Xvfb
            browser_fingerprint=_build_fingerprint(request),
            session_ttl=request.session_ttl,
        )

        return ValidateResponse(
            session_id=result.get("session_id"),
            business_info=result.get("business_info", {}),
            total_reviews=result.get("total_reviews"),
            success=result.get("success", False),
            error=result.get("error"),
            expires_in=result.get("expires_in"),
        )
    except Exception as e:
        # log.exception keeps the traceback, which log.error discarded.
        log.exception(f"Session validation error: {e}")
        return ValidateResponse(
            session_id=None,
            success=False,
            error=str(e),
        )
@router.post("/scrape", response_model=ScrapeWithSessionResponse, summary="Scrape Using Existing Session")
async def scrape_using_session(request: ScrapeWithSessionRequest):
    """
    Scrape reviews using an existing validated session.

    This endpoint:
    1. Retrieves the browser from the session (already on Google Maps page)
    2. Skips navigation and consent handling (already done)
    3. Clicks Reviews tab and starts scraping
    4. Releases the session when done

    Saves 4-16 seconds compared to starting fresh.
    """
    # request: validated ScrapeWithSessionRequest (session ID plus scrape
    #          options, forwarded unchanged to scrape_with_session).
    # Returns a ScrapeWithSessionResponse built from the scraper's result
    # dict, with defaults for missing keys. Any exception is logged and
    # reported as success=False with the exception text in `error`; it is
    # never re-raised.
    try:
        log.info(f"Scraping with session {request.session_id}...")

        # scrape_with_session is a blocking call, so it runs in a worker
        # thread to keep the event loop responsive.
        result = await asyncio.to_thread(
            scrape_with_session,
            session_id=request.session_id,
            max_reviews=request.max_reviews,
            sort_strategy=request.sort_strategy,
            initial_sort=request.initial_sort,
        )

        return ScrapeWithSessionResponse(
            reviews=result.get("reviews", []),
            count=result.get("count", 0),
            total_reviews=result.get("total_reviews", 0),
            success=result.get("success", False),
            error=result.get("error"),
            time=result.get("time", 0),
            session_reused=result.get("session_reused", True),
            business_info=result.get("business_info", {}),
        )
    except Exception as e:
        # log.exception keeps the traceback, which log.error discarded.
        log.exception(f"Session scraping error: {e}")
        return ScrapeWithSessionResponse(
            success=False,
            error=str(e),
        )
@router.get(
    "",
    response_model=SessionListResponse,
    summary="List Active Sessions",
)
async def list_sessions():
    """
    List all active browser sessions.

    Returns information about each session including:
    - Business name
    - State (validated, scraping)
    - Time until expiration
    """
    # The session manager's stats dict is the single source for both fields;
    # missing keys fall back to 0 sessions / an empty list.
    manager_stats = get_session_manager().get_stats()
    session_count = manager_stats.get("total_sessions", 0)
    session_rows = manager_stats.get("sessions", [])
    return SessionListResponse(total_sessions=session_count, sessions=session_rows)
@router.get("/{session_id}", summary="Get Session Status")
async def get_session_status(session_id: str):
    """
    Get the status of a specific session.
    """
    # Responds 404 when the manager has no session for this ID; otherwise
    # returns a plain dict snapshot of the session with age and remaining TTL
    # measured against the current clock and truncated to whole seconds.
    import time

    found = get_session_manager().get_session(session_id)
    if not found:
        raise HTTPException(
            status_code=404,
            detail=f"Session {session_id} not found or expired",
        )

    current = time.time()
    age = int(current - found.created_at)
    remaining = int(found.expires_at - current)
    business_name = found.business_info.get("name", "unknown")

    return {
        "session_id": found.session_id,
        "business": business_name,
        "state": found.state,
        "total_reviews": found.total_reviews,
        "url": found.url,
        "age_seconds": age,
        "ttl_remaining": remaining,
    }
@router.delete("/{session_id}", summary="Release Session")
async def release_session(session_id: str):
    """
    Manually release a session and close its browser.

    Use this if the user cancels before scraping.
    """
    # Guard first: an unknown ID yields 404 and nothing is released.
    manager = get_session_manager()
    if not manager.get_session(session_id):
        raise HTTPException(
            status_code=404,
            detail=f"Session {session_id} not found or expired",
        )

    manager.release_session(session_id, reason="manual_release")

    confirmation = f"Session {session_id} released"
    return {"success": True, "message": confirmation}
# ============================================================================
# Helper to register router with main app
# ============================================================================
def register_session_routes(app):
    """Attach this module's router to ``app`` and log that it was mounted.

    ``app`` is any object exposing ``include_router`` (a FastAPI application);
    the endpoints become available under the router's ``/sessions`` prefix.
    """
    app.include_router(router)
    log.info("Session routes registered at /sessions")