Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
@@ -61,7 +61,9 @@ from api.routes import (
|
||||
dashboard_router, set_dashboard_db,
|
||||
admin_router, set_admin_db,
|
||||
pipelines_router, set_pipelines_db,
|
||||
reviewiq_analytics_router, set_reviewiq_analytics_db,
|
||||
)
|
||||
from api.routes.sessions import router as sessions_router
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
@@ -110,6 +112,7 @@ async def lifespan(app: FastAPI):
|
||||
set_dashboard_db(db)
|
||||
set_admin_db(db)
|
||||
set_pipelines_db(db.pool) # Pipeline router uses raw asyncpg pool
|
||||
set_reviewiq_analytics_db(db.pool) # ReviewIQ analytics uses raw asyncpg pool
|
||||
|
||||
# Initialize health check system with canary monitoring
|
||||
# DISABLED: Canary tests consume Google Maps requests and trigger rate limiting
|
||||
@@ -124,12 +127,15 @@ async def lifespan(app: FastAPI):
|
||||
|
||||
# Start Chrome worker pools (1 for validation, 2 for scraping)
|
||||
# These pre-warm Chrome instances for instant availability
|
||||
# headless=False because Docker uses Xvfb virtual display for better compatibility
|
||||
# In Docker: headless=False with Xvfb virtual display for better compatibility
|
||||
# Locally: use CHROME_HEADLESS env var to control (default: headed for scraping)
|
||||
is_docker = os.path.exists("/.dockerenv") or os.environ.get("DOCKER_CONTAINER", "false").lower() == "true"
|
||||
chrome_headless = os.environ.get("CHROME_HEADLESS", "false").lower() == "true"
|
||||
await asyncio.to_thread(
|
||||
start_worker_pools,
|
||||
validation_size=1,
|
||||
scraping_size=2,
|
||||
headless=False
|
||||
headless=chrome_headless if not is_docker else False
|
||||
)
|
||||
log.info("Chrome worker pools started (1 validation + 2 scraping)")
|
||||
|
||||
@@ -172,6 +178,8 @@ app.include_router(batches_router)
|
||||
app.include_router(dashboard_router)
|
||||
app.include_router(admin_router)
|
||||
app.include_router(pipelines_router)
|
||||
app.include_router(reviewiq_analytics_router)
|
||||
app.include_router(sessions_router) # Session handoff for validation → scraping
|
||||
|
||||
|
||||
# ==================== Request/Response Models ====================
|
||||
@@ -220,6 +228,10 @@ class ScrapeRequest(BaseModel):
|
||||
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
||||
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
||||
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
||||
# Testing options
|
||||
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
|
||||
# Session handoff (v1.2.0) - reuse browser from validation
|
||||
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
|
||||
|
||||
|
||||
class GoogleReviewsScrapeRequest(BaseModel):
|
||||
@@ -236,6 +248,10 @@ class GoogleReviewsScrapeRequest(BaseModel):
|
||||
callback_url: Optional[HttpUrl] = Field(None, description="URL to call when job completes (alternative to webhook)")
|
||||
scraper_version: Optional[str] = Field(None, description="Specific scraper version to use")
|
||||
scraper_variant: Optional[str] = Field(None, description="Scraper variant (e.g., 'fast', 'thorough', 'stealth')")
|
||||
# Testing options
|
||||
max_reviews: Optional[int] = Field(None, description="Maximum reviews to collect (for testing, default: unlimited)", ge=1, le=10000)
|
||||
# Session handoff (v1.2.0) - reuse browser from validation
|
||||
session_id: Optional[str] = Field(None, description="Session ID from /sessions/validate for browser reuse")
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
@@ -548,16 +564,21 @@ async def get_job(job_id: UUID):
|
||||
except:
|
||||
review_topics = None
|
||||
|
||||
# Extract business info from metadata if available
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
# Read business info from dedicated columns (with fallback to metadata for older jobs)
|
||||
business_name = job.get('business_name')
|
||||
business_category = job.get('business_category')
|
||||
|
||||
business_name = metadata.get('business_name') if metadata else None
|
||||
business_category = metadata.get('business_category') if metadata else None
|
||||
# Fallback to metadata for jobs created before migration
|
||||
if not business_name or not business_category:
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
if metadata:
|
||||
business_name = business_name or metadata.get('business_name')
|
||||
# Note: business_category was not previously stored in metadata
|
||||
|
||||
return JobResponse(
|
||||
job_id=str(job['job_id']),
|
||||
@@ -1051,17 +1072,22 @@ async def list_jobs(
|
||||
|
||||
result = []
|
||||
for job in jobs:
|
||||
# Extract business info from metadata if available
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
# Read business info from dedicated columns (with fallback to metadata for older jobs)
|
||||
business_name = job.get('business_name')
|
||||
business_address = job.get('business_address')
|
||||
business_category = job.get('business_category')
|
||||
|
||||
business_name = metadata.get('business_name') if metadata else None
|
||||
business_address = metadata.get('business_address') if metadata else None
|
||||
business_category = metadata.get('business_category') if metadata else None
|
||||
# Fallback to metadata for jobs created before migration
|
||||
if not business_name:
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
if metadata:
|
||||
business_name = business_name or metadata.get('business_name')
|
||||
business_address = business_address or metadata.get('business_address')
|
||||
|
||||
# Parse review_topics if it's a string
|
||||
review_topics = job.get('review_topics')
|
||||
@@ -1191,6 +1217,193 @@ async def get_stats():
|
||||
return StatsResponse(**stats)
|
||||
|
||||
|
||||
# ==================== GBP Categories Endpoints ====================
|
||||
|
||||
@app.get("/categories", summary="Get GBP Categories")
async def get_categories(
    search: Optional[str] = Query(None, description="Search term for category name"),
    parent: Optional[str] = Query(None, description="Parent path (ltree) to filter children"),
    level: Optional[int] = Query(None, description="Category level (1-4)", ge=1, le=4),
    limit: int = Query(5000, description="Maximum number of results", ge=1, le=10000),
    offset: int = Query(0, description="Offset for pagination", ge=0),
):
    """
    Get Google Business Profile categories.

    Supports filtering by:
    - search: Text search in category name
    - parent: Get children of a specific path
    - level: Filter by hierarchy level (1=Sector, 2=Business Type, 3=Sub-category, 4=Category)
    """
    if not (db and db.pool):
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        # Collect SQL predicates and their bound values side by side; the
        # positional placeholder number is always the current length of
        # `bound`, so no separate counter has to be kept in sync.
        predicates = []
        bound = []

        if search:
            bound.append(f"%{search}%")
            predicates.append(f"name ILIKE ${len(bound)}")

        if parent:
            bound.append(parent)
            slot = len(bound)
            # Descendants of `parent`, excluding the parent row itself.
            predicates.append(f"path <@ ${slot}::ltree AND path != ${slot}::ltree")

        if level:
            bound.append(level)
            predicates.append(f"level = ${len(bound)}")

        where_clause = " AND ".join(predicates) if predicates else "TRUE"

        # Total number of matching rows, ignoring pagination.
        total = await conn.fetchval(
            f"SELECT COUNT(*) FROM gbp_categories WHERE {where_clause}",
            *bound,
        )

        # The page itself: LIMIT/OFFSET take the next two placeholder slots.
        limit_slot = len(bound) + 1
        page_query = f"""
            SELECT id, name, slug, path::text as path, level, parent_id, category_count
            FROM gbp_categories
            WHERE {where_clause}
            ORDER BY path
            LIMIT ${limit_slot} OFFSET ${limit_slot + 1}
        """
        bound.extend([limit, offset])

        records = await conn.fetch(page_query, *bound)
        categories = [dict(record) for record in records]

    return {
        "categories": categories,
        "total": total,
        "limit": limit,
        "offset": offset,
    }
|
||||
|
||||
|
||||
@app.get("/categories/tree", summary="Get GBP Categories Tree")
async def get_categories_tree(
    root: Optional[str] = Query(None, description="Root path to start the tree from"),
    max_depth: int = Query(4, description="Maximum depth of the tree", ge=1, le=4),
):
    """
    Get categories as a hierarchical tree structure.

    Returns nested categories starting from root (or all roots if not specified).
    Depth is counted from the top-level nodes of the returned tree (depth 1);
    nodes deeper than max_depth are omitted. "total" is the number of rows
    fetched, not the number of nodes kept in the tree.
    """
    if not db or not db.pool:
        raise HTTPException(status_code=500, detail="Database not initialized")

    async with db.pool.acquire() as conn:
        if root:
            # Subtree: the root row itself plus all of its descendants.
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                WHERE path <@ $1::ltree
                ORDER BY path
            """
            rows = await conn.fetch(query, root)
        else:
            # Full hierarchy.
            query = """
                SELECT id, name, slug, path::text as path, level, parent_id, category_count
                FROM gbp_categories
                ORDER BY path
            """
            rows = await conn.fetch(query)

    categories = [dict(row) for row in rows]

    # Index rows by their parent path once, so each node finds its direct
    # children with a dict lookup instead of rescanning every category.
    # Single-segment paths have the empty string as their parent path.
    children_by_parent = {}
    for cat in categories:
        parent_path = cat['path'].rpartition('.')[0]
        children_by_parent.setdefault(parent_path, []).append(cat)

    def build_tree(nodes, current_depth=1):
        """Nest `nodes` and their descendants, stopping below max_depth."""
        if current_depth > max_depth:
            return []

        result = []
        for cat in nodes:
            children = build_tree(
                children_by_parent.get(cat['path'], []),
                current_depth + 1,
            )
            result.append({
                **cat,
                'children': children if children else None
            })
        return result

    if root:
        # The subtree's top-level nodes are the rows whose parent was not
        # fetched. Selecting only single-segment paths here would yield an
        # empty tree whenever `root` is a nested path such as "a.b".
        known_paths = {cat['path'] for cat in categories}
        top_level = [
            cat for cat in categories
            if cat['path'].rpartition('.')[0] not in known_paths
        ]
    else:
        # Whole hierarchy: top-level nodes are the single-segment paths.
        top_level = children_by_parent.get('', [])

    tree = build_tree(top_level)

    return {
        "tree": tree,
        "total": len(categories),
    }
|
||||
|
||||
|
||||
@app.get("/categories/{path:path}", summary="Get Category by Path")
async def get_category_by_path(path: str):
    """
    Get a specific category by its ltree path.

    Also returns ancestors and direct children.
    """
    if not (db and db.pool):
        raise HTTPException(status_code=500, detail="Database not initialized")

    # All three lookups project the same columns from the same table.
    select_from = (
        "SELECT id, name, slug, path::text as path, level, parent_id, category_count "
        "FROM gbp_categories "
    )

    def as_dicts(records):
        """Convert fetched records into plain dicts for the JSON response."""
        return [dict(record) for record in records]

    async with db.pool.acquire() as conn:
        # The category itself; a missing row is a 404 before any further queries.
        record = await conn.fetchrow(
            select_from + "WHERE path = $1::ltree",
            path,
        )
        if not record:
            raise HTTPException(status_code=404, detail="Category not found")

        # Strict ancestors: rows whose path contains this one, minus the row itself.
        ancestor_rows = await conn.fetch(
            select_from + "WHERE path @> $1::ltree AND path != $1::ltree ORDER BY path",
            path,
        )

        # Direct children: exactly one more label below this path.
        child_rows = await conn.fetch(
            select_from + "WHERE path ~ ($1 || '.*{1}')::lquery ORDER BY name",
            path,
        )

    return {
        "category": dict(record),
        "ancestors": as_dicts(ancestor_rows),
        "children": as_dicts(child_rows),
    }
|
||||
|
||||
|
||||
@app.get("/pool-stats", summary="Get Worker Pool Statistics")
|
||||
async def pool_stats():
|
||||
"""Get Chrome worker pool statistics"""
|
||||
@@ -1331,10 +1544,82 @@ async def get_crash_report(job_id: UUID):
|
||||
)
|
||||
|
||||
|
||||
# Available sort orders for retry strategy
|
||||
SORT_ORDERS = ["newest", "lowest", "highest", "relevant"]
|
||||
|
||||
# Fingerprint rotation for retry - realistic browser profiles to avoid bot detection
|
||||
import random
|
||||
|
||||
FINGERPRINT_PROFILES = [
    {
        "platform": "MacIntel",
        "timezone": "Europe/Madrid",
        "language": "es-ES",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1440, "height": 900}
    },
    {
        "platform": "Win32",
        "timezone": "Europe/London",
        "language": "en-GB",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        "viewport": {"width": 1920, "height": 1080}
    },
    {
        "platform": "MacIntel",
        "timezone": "America/New_York",
        "language": "en-US",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "viewport": {"width": 1680, "height": 1050}
    },
    {
        "platform": "Win32",
        "timezone": "Europe/Paris",
        "language": "fr-FR",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "viewport": {"width": 1366, "height": 768}
    },
    {
        "platform": "MacIntel",
        "timezone": "Europe/Berlin",
        "language": "de-DE",
        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "viewport": {"width": 1512, "height": 982}
    },
]


def get_rotated_fingerprint(retry_attempt: int = 0, previous_fingerprints: Optional[list] = None) -> dict:
    """
    Get a fingerprint profile for retry, avoiding previously used ones.

    Profiles are excluded by their "platform" value, so listing one platform
    in previous_fingerprints removes every profile that shares it. When that
    leaves nothing, the full profile list is used again.

    Args:
        retry_attempt: Which retry attempt this is (0-indexed); selects the
            profile by index modulo the number of remaining candidates.
        previous_fingerprints: List of previously used fingerprint platforms

    Returns:
        An independent copy of the chosen profile dict. Mutating it,
        including its nested "viewport" dict, leaves FINGERPRINT_PROFILES
        untouched.
    """
    from copy import deepcopy

    used_platforms = previous_fingerprints or []

    # Filter out profiles whose platform was already used
    candidates = [
        profile for profile in FINGERPRINT_PROFILES
        if profile["platform"] not in used_platforms
    ]

    # If all used, cycle back to the complete list
    if not candidates:
        candidates = FINGERPRINT_PROFILES

    # Select based on retry attempt (deterministic but varied)
    chosen = candidates[retry_attempt % len(candidates)]

    # Deep copy: a shallow copy would share the nested viewport dict with the
    # module-level constant, letting callers corrupt it for later retries.
    return deepcopy(chosen)
|
||||
|
||||
|
||||
@app.post("/jobs/{job_id}/retry", response_model=RetryJobResponse, summary="Retry Failed Job")
|
||||
async def retry_job(
|
||||
job_id: UUID,
|
||||
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis")
|
||||
apply_fix: bool = Query(False, description="Apply auto-fix parameters based on crash analysis"),
|
||||
next_sort: bool = Query(False, description="Use a different sort order than the original job (for partial jobs)")
|
||||
):
|
||||
"""
|
||||
Retry a failed or partial job, optionally applying auto-fix parameters.
|
||||
@@ -1344,6 +1629,11 @@ async def retry_job(
|
||||
- Applies recommended parameter adjustments (e.g., reduced batch size for memory issues)
|
||||
- Creates a new job with the adjusted parameters
|
||||
|
||||
When next_sort=true:
|
||||
- Uses a different sort order than previously attempted
|
||||
- Helps get different reviews when stuck at ~1000 limit
|
||||
- Tracks sort_orders_attempted for review merging
|
||||
|
||||
Returns the new job ID for tracking.
|
||||
"""
|
||||
if not db:
|
||||
@@ -1418,6 +1708,72 @@ async def retry_job(
|
||||
applied_fixes = analysis.auto_fix_params
|
||||
log.info(f"Applying auto-fix for pattern '{analysis.pattern}': {applied_fixes}")
|
||||
|
||||
# Handle next_sort: use a different sort order than previously attempted
|
||||
selected_sort = None
|
||||
if next_sort:
|
||||
# Get previously attempted sort orders
|
||||
sort_orders_attempted = original_metadata.get('sort_orders_attempted', [])
|
||||
|
||||
# If no sort was tracked, assume "newest" was used (default)
|
||||
if not sort_orders_attempted:
|
||||
initial_sort_used = original_metadata.get('initial_sort_used', 'newest')
|
||||
sort_orders_attempted = [initial_sort_used]
|
||||
|
||||
# Find next unused sort order
|
||||
for sort_order in SORT_ORDERS:
|
||||
if sort_order not in sort_orders_attempted:
|
||||
selected_sort = sort_order
|
||||
break
|
||||
|
||||
if selected_sort:
|
||||
# Set the new sort strategy
|
||||
original_metadata['initial_sort'] = selected_sort
|
||||
original_metadata['sort_strategy'] = 'single' # Don't auto-trigger multi-sort
|
||||
|
||||
# Track all attempted sorts (including this one)
|
||||
original_metadata['sort_orders_attempted'] = sort_orders_attempted + [selected_sort]
|
||||
|
||||
# Track retry chain for review merging
|
||||
if 'retry_chain' not in original_metadata:
|
||||
original_metadata['retry_chain'] = [str(job_id)]
|
||||
else:
|
||||
original_metadata['retry_chain'].append(str(job_id))
|
||||
|
||||
original_metadata['retry_info'] = original_metadata.get('retry_info', {})
|
||||
original_metadata['retry_info']['original_job_id'] = str(job_id)
|
||||
original_metadata['retry_info']['retry_reason'] = 'next_sort'
|
||||
original_metadata['retry_info']['selected_sort'] = selected_sort
|
||||
|
||||
log.info(f"Retry with next_sort: using '{selected_sort}' (previously tried: {sort_orders_attempted})")
|
||||
else:
|
||||
log.warn(f"All sort orders already attempted: {sort_orders_attempted}")
|
||||
|
||||
# Fingerprint rotation: if bot was detected, use a different fingerprint
|
||||
selected_fingerprint = None
|
||||
if next_sort and original_metadata.get('bot_detected', False):
|
||||
# Get previously used fingerprints
|
||||
previous_fingerprints = original_metadata.get('fingerprints_used', [])
|
||||
retry_count = len(original_metadata.get('retry_chain', []))
|
||||
|
||||
# Get a rotated fingerprint
|
||||
selected_fingerprint = get_rotated_fingerprint(retry_count, previous_fingerprints)
|
||||
|
||||
# Store the fingerprint in metadata
|
||||
original_metadata['browser_fingerprint'] = selected_fingerprint
|
||||
|
||||
# Track used fingerprints
|
||||
if 'fingerprints_used' not in original_metadata:
|
||||
original_metadata['fingerprints_used'] = []
|
||||
original_metadata['fingerprints_used'].append(selected_fingerprint['platform'])
|
||||
|
||||
original_metadata['retry_info']['fingerprint_rotated'] = True
|
||||
original_metadata['retry_info']['new_fingerprint'] = {
|
||||
'platform': selected_fingerprint['platform'],
|
||||
'timezone': selected_fingerprint['timezone']
|
||||
}
|
||||
|
||||
log.info(f"Fingerprint rotated for retry: {selected_fingerprint['platform']}, {selected_fingerprint['timezone']}")
|
||||
|
||||
# Create new job with same URL and (possibly modified) metadata
|
||||
new_job_id = await db.create_job(
|
||||
url=original_job['url'],
|
||||
@@ -1431,11 +1787,28 @@ async def retry_job(
|
||||
|
||||
log.info(f"Created retry job {new_job_id} for original job {job_id}")
|
||||
|
||||
# Build response message
|
||||
message = f"Retry job created from original job {job_id}"
|
||||
if selected_sort:
|
||||
message += f" (using sort: {selected_sort})"
|
||||
if selected_fingerprint:
|
||||
message += f" (fingerprint: {selected_fingerprint['platform']}/{selected_fingerprint['timezone']})"
|
||||
|
||||
# Build applied_fixes response
|
||||
retry_fixes = {}
|
||||
if selected_sort:
|
||||
retry_fixes["selected_sort"] = selected_sort
|
||||
if selected_fingerprint:
|
||||
retry_fixes["fingerprint"] = {
|
||||
"platform": selected_fingerprint["platform"],
|
||||
"timezone": selected_fingerprint["timezone"]
|
||||
}
|
||||
|
||||
return RetryJobResponse(
|
||||
job_id=str(new_job_id),
|
||||
status="started",
|
||||
message=f"Retry job created from original job {job_id}",
|
||||
applied_fixes=applied_fixes
|
||||
message=message,
|
||||
applied_fixes=applied_fixes if applied_fixes else (retry_fixes if retry_fixes else None)
|
||||
)
|
||||
|
||||
|
||||
@@ -1529,8 +1902,9 @@ async def liveness():
|
||||
|
||||
Use this for Kubernetes liveness probe - restart container if fails.
|
||||
"""
|
||||
# If health system is disabled, just return healthy (server is alive)
|
||||
if not health_system:
|
||||
raise HTTPException(status_code=503, detail="Health system not initialized")
|
||||
return {"status": "healthy", "message": "Server is alive (health system disabled)"}
|
||||
|
||||
return await health_system.check_liveness()
|
||||
|
||||
@@ -1542,8 +1916,12 @@ async def readiness():
|
||||
|
||||
Use this for Kubernetes readiness probe - remove from load balancer if fails.
|
||||
"""
|
||||
# If health system is disabled, check if DB is connected
|
||||
if not health_system:
|
||||
raise HTTPException(status_code=503, detail="Health system not initialized")
|
||||
if db and db.pool:
|
||||
return {"status": "ready", "message": "Server is ready (health system disabled)"}
|
||||
else:
|
||||
raise HTTPException(status_code=503, detail="Database not connected")
|
||||
|
||||
result = await health_system.check_readiness()
|
||||
|
||||
@@ -1728,17 +2106,67 @@ async def run_scraping_job(job_id: UUID):
|
||||
scraper_func, actual_version = get_scraper_for_version(requested_version)
|
||||
log.info(f"Using scraper version {actual_version} for job {job_id}")
|
||||
|
||||
# Run scraping with progress callback and shared log capture
|
||||
# headless=False because Docker uses Xvfb virtual display
|
||||
result = await asyncio.to_thread(
|
||||
scraper_func,
|
||||
url=url,
|
||||
headless=False,
|
||||
progress_callback=progress_callback,
|
||||
log_capture=log_capture,
|
||||
flush_callback=flush_callback,
|
||||
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
|
||||
)
|
||||
# Get sort strategy parameters from metadata (for retry with different sort)
|
||||
initial_sort = metadata.get('initial_sort') if metadata else None
|
||||
sort_strategy = metadata.get('sort_strategy', 'auto') if metadata else 'auto'
|
||||
max_reviews = metadata.get('max_reviews') if metadata else None
|
||||
session_id = metadata.get('session_id') if metadata else None
|
||||
if initial_sort:
|
||||
log.info(f"Using initial_sort={initial_sort}, sort_strategy={sort_strategy} for job {job_id}")
|
||||
if max_reviews:
|
||||
log.info(f"Using max_reviews={max_reviews} limit for job {job_id} (testing mode)")
|
||||
|
||||
# Check if we have a session_id for browser reuse (session handoff from validation)
|
||||
if session_id:
|
||||
log.info(f"Using session handoff (session_id={session_id}) for job {job_id} - skipping navigation")
|
||||
from scrapers.google_reviews.v1_2_0 import scrape_with_session
|
||||
result = await asyncio.to_thread(
|
||||
scrape_with_session,
|
||||
session_id=session_id,
|
||||
max_reviews=max_reviews,
|
||||
progress_callback=progress_callback,
|
||||
flush_callback=flush_callback,
|
||||
sort_strategy=sort_strategy,
|
||||
initial_sort=initial_sort
|
||||
)
|
||||
# Add logs from session scraping
|
||||
if 'logs' in result:
|
||||
for log_entry in result.get('logs', []):
|
||||
log_capture.entries.append(log_entry)
|
||||
else:
|
||||
# Run scraping with progress callback and shared log capture
|
||||
# headless=False because Docker uses Xvfb virtual display
|
||||
result = await asyncio.to_thread(
|
||||
scraper_func,
|
||||
url=url,
|
||||
headless=False,
|
||||
progress_callback=progress_callback,
|
||||
log_capture=log_capture,
|
||||
flush_callback=flush_callback,
|
||||
browser_fingerprint=browser_fingerprint, # Pass user's browser fingerprint
|
||||
initial_sort=initial_sort, # Sort order for retry strategy
|
||||
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
|
||||
max_reviews=max_reviews # Optional limit for testing
|
||||
)
|
||||
|
||||
# Update job metadata with tracking info from scraper result
|
||||
tracking_metadata = {
|
||||
'bot_detected': result.get('bot_detected', False),
|
||||
'initial_sort_used': result.get('initial_sort_used', 'newest'),
|
||||
'multi_sort': result.get('multi_sort', {}),
|
||||
}
|
||||
# Preserve existing sort_orders_attempted and add current sort
|
||||
existing_sorts = metadata.get('sort_orders_attempted', []) if metadata else []
|
||||
current_sort = result.get('initial_sort_used', 'newest')
|
||||
if current_sort not in existing_sorts:
|
||||
tracking_metadata['sort_orders_attempted'] = existing_sorts + [current_sort]
|
||||
else:
|
||||
tracking_metadata['sort_orders_attempted'] = existing_sorts
|
||||
|
||||
# Update metadata in database
|
||||
await db.update_job_metadata(job_id, tracking_metadata)
|
||||
if result.get('bot_detected'):
|
||||
log.warn(f"Bot detection flagged for job {job_id} - sort button was hidden")
|
||||
|
||||
if result['success']:
|
||||
# Save session fingerprint if captured
|
||||
@@ -1746,6 +2174,18 @@ async def run_scraping_job(job_id: UUID):
|
||||
await db.update_session_fingerprint(job_id, result['session_fingerprint'])
|
||||
log.info(f"Saved session fingerprint for job {job_id}")
|
||||
|
||||
# Save business info to dedicated columns (queryable/indexable)
|
||||
business_info = result.get('business_info', {})
|
||||
if business_info:
|
||||
await db.update_business_info(
|
||||
job_id=job_id,
|
||||
business_name=business_info.get('name'),
|
||||
business_category=business_info.get('category'),
|
||||
business_address=business_info.get('address'),
|
||||
business_rating=business_info.get('rating')
|
||||
)
|
||||
log.info(f"Saved business info for job {job_id}: {business_info.get('name')} ({business_info.get('category')})")
|
||||
|
||||
# Save results to database (including scraper logs and review topics)
|
||||
await db.save_job_result(
|
||||
job_id=job_id,
|
||||
|
||||
Reference in New Issue
Block a user