Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -61,7 +61,9 @@ from api.routes import (
dashboard_router, set_dashboard_db,
admin_router, set_admin_db,
pipelines_router, set_pipelines_db,
reviewiq_analytics_router, set_reviewiq_analytics_db,
)
from api.routes.sessions import router as sessions_router
# Configure logging
logging.basicConfig(
@@ -110,6 +112,7 @@ async def lifespan(app: FastAPI):
set_dashboard_db(db)
set_admin_db(db)
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
set_reviewiq_analytics_db(db.pool) # ReviewIQ analytics uses raw asyncpg pool
# Initialize health check system with canary monitoring
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
@@ -124,12 +127,15 @@ async def lifespan(app: FastAPI):
# Start Chrome worker pools (1 for validation, 2 for scraping)
# These pre-warm Chrome instances for instant availability
# headless=False because Docker uses Xvfb virtual display for better compatibility
# In Docker: headless=False with Xvfb virtual display for better compatibility
# Locally: use CHROME_HEADLESS env var to control (default: headed for scraping)
is_docker = os.path.exists("/.dockerenv") or os.environ.get("DOCKER_CONTAINER", "false").lower() == "true"
chrome_headless = os.environ.get("CHROME_HEADLESS", "false").lower() == "true"
await asyncio.to_thread(
start_worker_pools,
validation_size=1,
scraping_size=2,
headless=False
headless=chrome_headless if not is_docker else False
)
log.info("Chrome worker pools started (1 validation + 2 scraping)")
@@ -172,6 +178,8 @@ app.include_router(batches_router)
app.include_router(dashboard_router)
app.include_router(admin_router)
app.include_router(pipelines_router)
app.include_router(reviewiq_analytics_router)
app.include_router(sessions_router) # Session handoff for validation → scraping
# ==================== Request/Response Models ====================
@@ -220,6 +228,10 @@ class ScrapeRequest(BaseModel):
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
# Testing options
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
# Session handoff (v1.2.0) - reuse browser from validation
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
class GoogleReviewsScrapeRequest(BaseModel):
@@ -236,6 +248,10 @@ class GoogleReviewsScrapeRequest(BaseModel):
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
# Testing options
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
# Session handoff (v1.2.0) - reuse browser from validation
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
class JobResponse(BaseModel):
@@ -548,16 +564,21 @@ async def get_job(job_id: UUID):
except:
review_topics = None
# Extract business info from metadata if available
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
# Read business info from dedicated columns (with fallback to metadata for older jobs)
business_name = job.get('business_name')
business_category = job.get('business_category')
business_name = metadata.get('business_name') if metadata else None
business_category = metadata.get('business_category') if metadata else None
# Fallback to metadata for jobs created before migration
if not business_name or not business_category:
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
if metadata:
business_name = business_name or metadata.get('business_name')
# Note: business_category was not previously stored in metadata
return JobResponse(
job_id=str(job['job_id']),
@@ -1051,17 +1072,22 @@ async def list_jobs(
result = []
for job in jobs:
# Extract business info from metadata if available
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
# Read business info from dedicated columns (with fallback to metadata for older jobs)
business_name = job.get('business_name')
business_address = job.get('business_address')
business_category = job.get('business_category')
business_name = metadata.get('business_name') if metadata else None
business_address = metadata.get('business_address') if metadata else None
business_category = metadata.get('business_category') if metadata else None
# Fallback to metadata for jobs created before migration
if not business_name:
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
if metadata:
business_name = business_name or metadata.get('business_name')
business_address = business_address or metadata.get('business_address')
# Parse review_topics if it's a string
review_topics = job.get('review_topics')
@@ -1191,6 +1217,193 @@ async def get_stats():
return StatsResponse(**stats)
# ==================== GBP Categories Endpoints ====================
@app.get("/categories", summary="Get GBP Categories")
async def get_categories(
    search: Optional[str] = Query(None, description="Search term for category name"),
    parent: Optional[str] = Query(None, description="Parent path (ltree) to filter children"),
    level: Optional[int] = Query(None, description="Category level (1-4)", ge=1, le=4),
    limit: int = Query(5000, description="Maximum number of results", ge=1, le=10000),
    offset: int = Query(0, description="Offset for pagination", ge=0),
):
    """
    List Google Business Profile categories, optionally filtered and paginated.

    Filters (all optional, combined with AND):
    - search: case-insensitive substring match on the category name
    - parent: only descendants of this ltree path (the path itself is excluded)
    - level: exact hierarchy level (1=Sector, 2=Business Type, 3=Sub-category, 4=Category)

    Returns a dict with the page of ``categories``, the ``total`` number of rows
    matching the filters, and the ``limit``/``offset`` that were applied.

    Raises:
        HTTPException: 500 when the database (or its pool) is not initialized.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    bound_values = []

    def bind(value):
        # Register a query argument and hand back its positional placeholder ($1, $2, ...).
        bound_values.append(value)
        return f"${len(bound_values)}"

    filters = []
    if search:
        name_ref = bind(f"%{search}%")
        filters.append(f"name ILIKE {name_ref}")
    if parent:
        # The same placeholder is used twice: descendants of parent, minus parent itself.
        parent_ref = bind(parent)
        filters.append(f"path <@ {parent_ref}::ltree AND path != {parent_ref}::ltree")
    if level:
        level_ref = bind(level)
        filters.append(f"level = {level_ref}")

    where_clause = " AND ".join(filters) or "TRUE"

    # The count query takes only the filter arguments; snapshot them before
    # the pagination arguments are appended.
    count_args = tuple(bound_values)
    limit_ref = bind(limit)
    offset_ref = bind(offset)

    count_query = f"SELECT COUNT(*) FROM gbp_categories WHERE {where_clause}"
    page_query = f"""
        SELECT id, name, slug, path::text as path, level, parent_id, category_count
        FROM gbp_categories
        WHERE {where_clause}
        ORDER BY path
        LIMIT {limit_ref} OFFSET {offset_ref}
    """

    async with db.pool.acquire() as conn:
        total = await conn.fetchval(count_query, *count_args)
        rows = await conn.fetch(page_query, *bound_values)

    return {
        "categories": list(map(dict, rows)),
        "total": total,
        "limit": limit,
        "offset": offset,
    }
@app.get("/categories/tree", summary="Get GBP Categories Tree")
async def get_categories_tree(
    root: Optional[str] = Query(None, description="Root path to start the tree from"),
    max_depth: int = Query(4, description="Maximum depth of the tree", ge=1, le=4),
):
    """
    Get categories as a hierarchical tree structure.

    Returns nested categories starting from root (or all roots if not specified).
    When ``root`` is given, the tree has the root category itself as its single
    top-level node; otherwise the top level is every single-label path.
    ``max_depth`` counts levels from the top of the returned tree (1 = top-level
    nodes only). Each node is the category row plus a ``children`` key, which is
    ``None`` when the node has no children within the depth limit.

    ``total`` is the number of rows fetched, regardless of ``max_depth``.

    Raises:
        HTTPException: 500 when the database (or its pool) is not initialized.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        if root:
            # Get subtree starting from root (root itself included)
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                WHERE path <@ $1::ltree
                ORDER BY path
            """
            rows = await conn.fetch(query, root)
        else:
            # Get all categories
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                ORDER BY path
            """
            rows = await conn.fetch(query)

    categories = [dict(row) for row in rows]

    # Index children by parent path in a single pass so assembling the tree
    # does not rescan the full category list for every node. Single-label
    # paths have no parent and are grouped under the None key. Insertion
    # order follows the query's ORDER BY path, so sibling order is preserved.
    children_by_parent = {}
    for cat in categories:
        parent_path, dot, _ = cat['path'].rpartition('.')
        key = parent_path if dot else None
        children_by_parent.setdefault(key, []).append(cat)

    def build_node(cat, depth):
        # Only descend while the next level is still within max_depth.
        if depth < max_depth:
            children = [
                build_node(child, depth + 1)
                for child in children_by_parent.get(cat['path'], [])
            ]
        else:
            children = []
        return {**cat, 'children': children if children else None}

    if root:
        # Start from the requested root node itself, whatever its depth in
        # the hierarchy (a multi-label root has no single-label rows in its
        # subtree, so seeding from single-label paths would yield nothing).
        top_level = [cat for cat in categories if cat['path'] == root]
    else:
        top_level = children_by_parent.get(None, [])

    tree = [build_node(cat, 1) for cat in top_level]

    return {
        "tree": tree,
        "total": len(categories),
    }
@app.get("/categories/{path:path}", summary="Get Category by Path")
async def get_category_by_path(path: str):
    """
    Look up a single category by its ltree path.

    The response contains the matching ``category``, its ``ancestors``
    (ordered by path, the category itself excluded) and its direct
    ``children`` (exactly one label below, ordered by name).

    Raises:
        HTTPException: 500 when the database (or its pool) is not initialized,
            404 when no category has the given path.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        found = await conn.fetchrow(
            """
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path = $1::ltree
            """,
            path,
        )
        if not found:
            raise HTTPException(status_code=404, detail="Category not found")

        # Rows whose path contains the requested path, minus the row itself.
        ancestor_rows = await conn.fetch(
            """
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path @> $1::ltree AND path != $1::ltree
            ORDER BY path
            """,
            path,
        )

        # lquery "<path>.*{1}" matches paths exactly one label deeper.
        child_rows = await conn.fetch(
            """
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE path ~ ($1 || '.*{1}')::lquery
            ORDER BY name
            """,
            path,
        )

    return {
        "category": dict(found),
        "ancestors": list(map(dict, ancestor_rows)),
        "children": list(map(dict, child_rows)),
    }
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
async def pool_stats():
"""Get Chrome worker pool statistics"""
@@ -1331,10 +1544,82 @@ async def get_crash_report(job_id: UUID):
)
# Available sort orders for retry strategy
SORT_ORDERS = ["newest", "lowest", "highest", "relevant"]
# Fingerprint rotation for retry - realistic browser profiles to avoid bot detection
import random
# Browser profiles that get_rotated_fingerprint() rotates through on retry.
# Each entry carries a platform, timezone, language, user agent string and
# viewport size.
FINGERPRINT_PROFILES = [
    {
        "platform": "MacIntel",
        "timezone": "Europe/Madrid",
        "language": "es-ES",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1440, "height": 900},
    },
    {
        "platform": "Win32",
        "timezone": "Europe/London",
        "language": "en-GB",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "viewport": {"width": 1920, "height": 1080},
    },
    {
        "platform": "MacIntel",
        "timezone": "America/New_York",
        "language": "en-US",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "viewport": {"width": 1680, "height": 1050},
    },
    {
        "platform": "Win32",
        "timezone": "Europe/Paris",
        "language": "fr-FR",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1366, "height": 768},
    },
    {
        "platform": "MacIntel",
        "timezone": "Europe/Berlin",
        "language": "de-DE",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "viewport": {"width": 1512, "height": 982},
    },
]


def get_rotated_fingerprint(retry_attempt: int = 0, previous_fingerprints: Optional[list] = None) -> dict:
    """
    Get a fingerprint profile for retry, avoiding previously used ones.

    Profiles are excluded by their ``platform`` value, so one used platform
    rules out every profile that shares it. When that leaves nothing, the
    full profile list is used again.

    Args:
        retry_attempt: Which retry attempt this is (0-indexed). Used as an
            index (modulo the number of candidates), so selection is
            deterministic for a given input.
        previous_fingerprints: Platforms of previously used fingerprints;
            ``None`` is treated as an empty list.

    Returns:
        An independent deep copy of the selected profile. Mutating it,
        including the nested ``viewport`` dict, never alters
        FINGERPRINT_PROFILES.
    """
    import copy  # local import: keeps this block self-contained

    used_platforms = previous_fingerprints or []

    # Filter out profiles whose platform was already used
    candidates = [
        profile for profile in FINGERPRINT_PROFILES
        if profile["platform"] not in used_platforms
    ]

    # If every profile is excluded, cycle back to the full list
    if not candidates:
        candidates = FINGERPRINT_PROFILES

    # Select based on retry attempt (deterministic but varied)
    selected = candidates[retry_attempt % len(candidates)]

    # A shallow copy would share the nested "viewport" dict with the
    # module-level profile, letting callers corrupt it by accident.
    return copy.deepcopy(selected)
@app.post("/jobs/{job_id}/retry", response_model=RetryJobResponse, summary="Retry Failed Job")
async def retry_job(
job_id: UUID,
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis")
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis"),
next_sort: bool = Query(False, description="Use a different sort order than the original job (for partial jobs)")
):
"""
Retry a failed or partial job, optionally applying auto-fix parameters.
@@ -1344,6 +1629,11 @@ async def retry_job(
- Applies recommended parameter adjustments (e.g., reduced batch size for memory issues)
- Creates a new job with the adjusted parameters
When next_sort=true:
- Uses a different sort order than previously attempted
- Helps get different reviews when stuck at ~1000 limit
- Tracks sort_orders_attempted for review merging
Returns the new job ID for tracking.
"""
if not db:
@@ -1418,6 +1708,72 @@ async def retry_job(
applied_fixes = analysis.auto_fix_params
log.info(f"Applying auto-fix for pattern '{analysis.pattern}': {applied_fixes}")
# Handle next_sort: use a different sort order than previously attempted
selected_sort = None
if next_sort:
# Get previously attempted sort orders
sort_orders_attempted = original_metadata.get('sort_orders_attempted', [])
# If no sort was tracked, assume "newest" was used (default)
if not sort_orders_attempted:
initial_sort_used = original_metadata.get('initial_sort_used', 'newest')
sort_orders_attempted = [initial_sort_used]
# Find next unused sort order
for sort_order in SORT_ORDERS:
if sort_order not in sort_orders_attempted:
selected_sort = sort_order
break
if selected_sort:
# Set the new sort strategy
original_metadata['initial_sort'] = selected_sort
original_metadata['sort_strategy'] = 'single' # Don't auto-trigger multi-sort
# Track all attempted sorts (including this one)
original_metadata['sort_orders_attempted'] = sort_orders_attempted + [selected_sort]
# Track retry chain for review merging
if 'retry_chain' not in original_metadata:
original_metadata['retry_chain'] = [str(job_id)]
else:
original_metadata['retry_chain'].append(str(job_id))
original_metadata['retry_info'] = original_metadata.get('retry_info', {})
original_metadata['retry_info']['original_job_id'] = str(job_id)
original_metadata['retry_info']['retry_reason'] = 'next_sort'
original_metadata['retry_info']['selected_sort'] = selected_sort
log.info(f"Retry with next_sort: using '{selected_sort}' (previously tried: {sort_orders_attempted})")
else:
log.warn(f"All sort orders already attempted: {sort_orders_attempted}")
# Fingerprint rotation: if bot was detected, use a different fingerprint
selected_fingerprint = None
if next_sort and original_metadata.get('bot_detected', False):
# Get previously used fingerprints
previous_fingerprints = original_metadata.get('fingerprints_used', [])
retry_count = len(original_metadata.get('retry_chain', []))
# Get a rotated fingerprint
selected_fingerprint = get_rotated_fingerprint(retry_count, previous_fingerprints)
# Store the fingerprint in metadata
original_metadata['browser_fingerprint'] = selected_fingerprint
# Track used fingerprints
if 'fingerprints_used' not in original_metadata:
original_metadata['fingerprints_used'] = []
original_metadata['fingerprints_used'].append(selected_fingerprint['platform'])
original_metadata['retry_info']['fingerprint_rotated'] = True
original_metadata['retry_info']['new_fingerprint'] = {
'platform': selected_fingerprint['platform'],
'timezone': selected_fingerprint['timezone']
}
log.info(f"Fingerprint rotated for retry: {selected_fingerprint['platform']}, {selected_fingerprint['timezone']}")
# Create new job with same URL and (possibly modified) metadata
new_job_id = await db.create_job(
url=original_job['url'],
@@ -1431,11 +1787,28 @@ async def retry_job(
log.info(f"Created retry job {new_job_id} for original job {job_id}")
# Build response message
message = f"Retry job created from original job {job_id}"
if selected_sort:
message += f" (using sort: {selected_sort})"
if selected_fingerprint:
message += f" (fingerprint: {selected_fingerprint['platform']}/{selected_fingerprint['timezone']})"
# Build applied_fixes response
retry_fixes = {}
if selected_sort:
retry_fixes["selected_sort"] = selected_sort
if selected_fingerprint:
retry_fixes["fingerprint"] = {
"platform": selected_fingerprint["platform"],
"timezone": selected_fingerprint["timezone"]
}
return RetryJobResponse(
job_id=str(new_job_id),
status="started",
message=f"Retry job created from original job {job_id}",
applied_fixes=applied_fixes
message=message,
applied_fixes=applied_fixes if applied_fixes else (retry_fixes if retry_fixes else None)
)
@@ -1529,8 +1902,9 @@ async def liveness():
Use this for Kubernetes liveness probe - restart container if fails.
"""
# If health system is disabled, just return healthy (server is alive)
if not health_system:
raise HTTPException(status_code=503, detail="Health system not initialized")
return {"status": "healthy", "message": "Server is alive (health system disabled)"}
return await health_system.check_liveness()
@@ -1542,8 +1916,12 @@ async def readiness():
Use this for Kubernetes readiness probe - remove from load balancer if fails.
"""
# If health system is disabled, check if DB is connected
if not health_system:
raise HTTPException(status_code=503, detail="Health system not initialized")
if db and db.pool:
return {"status": "ready", "message": "Server is ready (health system disabled)"}
else:
raise HTTPException(status_code=503, detail="Database not connected")
result = await health_system.check_readiness()
@@ -1728,17 +2106,67 @@ async def run_scraping_job(job_id: UUID):
scraper_func, actual_version = get_scraper_for_version(requested_version)
log.info(f"Using scraper version {actual_version} for job {job_id}")
# Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread(
scraper_func,
url=url,
headless=False,
progress_callback=progress_callback,
log_capture=log_capture,
flush_callback=flush_callback,
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
)
# Get sort strategy parameters from metadata (for retry with different sort)
initial_sort = metadata.get('initial_sort') if metadata else None
sort_strategy = metadata.get('sort_strategy', 'auto') if metadata else 'auto'
max_reviews = metadata.get('max_reviews') if metadata else None
session_id = metadata.get('session_id') if metadata else None
if initial_sort:
log.info(f"Using initial_sort={initial_sort}, sort_strategy={sort_strategy} for job {job_id}")
if max_reviews:
log.info(f"Using max_reviews={max_reviews} limit for job {job_id} (testing mode)")
# Check if we have a session_id for browser reuse (session handoff from validation)
if session_id:
log.info(f"Using session handoff (session_id={session_id}) for job {job_id} - skipping navigation")
from scrapers.google_reviews.v1_2_0 import scrape_with_session
result = await asyncio.to_thread(
scrape_with_session,
session_id=session_id,
max_reviews=max_reviews,
progress_callback=progress_callback,
flush_callback=flush_callback,
sort_strategy=sort_strategy,
initial_sort=initial_sort
)
# Add logs from session scraping
if 'logs' in result:
for log_entry in result.get('logs', []):
log_capture.entries.append(log_entry)
else:
# Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread(
scraper_func,
url=url,
headless=False,
progress_callback=progress_callback,
log_capture=log_capture,
flush_callback=flush_callback,
browser_fingerprint=browser_fingerprint, # Pass user's browser fingerprint
initial_sort=initial_sort, # Sort order for retry strategy
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
max_reviews=max_reviews # Optional limit for testing
)
# Update job metadata with tracking info from scraper result
tracking_metadata = {
'bot_detected': result.get('bot_detected', False),
'initial_sort_used': result.get('initial_sort_used', 'newest'),
'multi_sort': result.get('multi_sort', {}),
}
# Preserve existing sort_orders_attempted and add current sort
existing_sorts = metadata.get('sort_orders_attempted', []) if metadata else []
current_sort = result.get('initial_sort_used', 'newest')
if current_sort not in existing_sorts:
tracking_metadata['sort_orders_attempted'] = existing_sorts + [current_sort]
else:
tracking_metadata['sort_orders_attempted'] = existing_sorts
# Update metadata in database
await db.update_job_metadata(job_id, tracking_metadata)
if result.get('bot_detected'):
log.warn(f"Bot detection flagged for job {job_id} - sort button was hidden")
if result['success']:
# Save session fingerprint if captured
@@ -1746,6 +2174,18 @@ async def run_scraping_job(job_id: UUID):
await db.update_session_fingerprint(job_id, result['session_fingerprint'])
log.info(f"Saved session fingerprint for job {job_id}")
# Save business info to dedicated columns (queryable/indexable)
business_info = result.get('business_info', {})
if business_info:
await db.update_business_info(
job_id=job_id,
business_name=business_info.get('name'),
business_category=business_info.get('category'),
business_address=business_info.get('address'),
business_rating=business_info.get('rating')
)
log.info(f"Saved business info for job {job_id}: {business_info.get('name')} ({business_info.get('category')})")
# Save results to database (including scraper logs and review topics)
await db.save_job_result(
job_id=job_id,