Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone,
  language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions

View File

@@ -39,6 +39,13 @@ RUN apt-get update \
&& apt-get install -y chromium chromium-driver \ && apt-get install -y chromium chromium-driver \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Install VNC server and noVNC (browser-based VNC viewer)
RUN apt-get update && apt-get install -y \
x11vnc \
novnc \
websockify \
&& rm -rf /var/lib/apt/lists/*
# Set working directory # Set working directory
WORKDIR /app WORKDIR /app
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
COPY api_server_production.py . COPY api_server_production.py .
COPY config.yaml . COPY config.yaml .
# Create startup script for Xvfb + API server # Create startup script for Xvfb + VNC + API server
RUN echo '#!/bin/bash\n\ RUN echo '#!/bin/bash\n\
# Start Xvfb (virtual display) in background\n\ # Start Xvfb (virtual display) in background\n\
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\ Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
# Wait for Xvfb to start\n\ # Wait for Xvfb to start\n\
sleep 2\n\ sleep 2\n\
\n\ \n\
# Start VNC server (no password for local dev, binds to all interfaces)\n\
x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
\n\
# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
\n\
echo "VNC server running on port 5900"\n\
echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
\n\
# Start API server\n\ # Start API server\n\
exec python api_server_production.py\n\ exec python api_server_production.py\n\
' > /app/start.sh && chmod +x /app/start.sh ' > /app/start.sh && chmod +x /app/start.sh
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
USER scraper USER scraper
# Expose port # Expose ports: API (8000), VNC (5900), noVNC web (6080)
EXPOSE 8000 EXPOSE 8000 5900 6080
# Environment variables for Chromium in container # Environment variables for Chromium in container
ENV DISPLAY=:99 ENV DISPLAY=:99

View File

@@ -133,12 +133,36 @@ app.add_middleware(
# ==================== Request/Response Models ==================== # ==================== Request/Response Models ====================
class GeolocationModel(BaseModel):
    """Geolocation coordinates sent by the client.

    Both fields are required. Values outside the valid coordinate ranges
    fail request validation instead of being accepted silently.
    """
    # Latitude, constrained to the inclusive range [-90, 90].
    lat: float = Field(..., ge=-90, le=90, description="Latitude")
    # Longitude, constrained to the inclusive range [-180, 180].
    lng: float = Field(..., ge=-180, le=180, description="Longitude")
class ViewportModel(BaseModel):
    """Browser viewport size.

    Both dimensions are required and must be strictly positive; a zero or
    negative size fails request validation.
    """
    # Viewport width; must be greater than zero.
    width: int = Field(..., gt=0, description="Viewport width")
    # Viewport height; must be greater than zero.
    height: int = Field(..., gt=0, description="Viewport height")
class BrowserFingerprintModel(BaseModel):
    """Browser fingerprint to replicate user's browser.

    Every field is optional, so a client may send any subset. The field
    names (including the camelCase ``userAgent``) are part of the request
    payload contract and must not be renamed.
    """
    # Declared through Field(...) like its siblings so it also carries a
    # description in the generated schema.
    geolocation: Optional[GeolocationModel] = Field(None, description="Geolocation coordinates")
    userAgent: Optional[str] = Field(None, description="User agent string")
    viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
    timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
    language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
    platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
class ScrapeRequest(BaseModel): class ScrapeRequest(BaseModel):
"""Request model for starting a scrape job""" """Request model for starting a scrape job"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape") url: HttpUrl = Field(..., description="Google Maps URL to scrape")
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications") webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature") webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata") metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
class JobResponse(BaseModel): class JobResponse(BaseModel):
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
created_at: str created_at: str
started_at: Optional[str] = None started_at: Optional[str] = None
completed_at: Optional[str] = None completed_at: Optional[str] = None
updated_at: Optional[str] = None # Last update time for progress tracking
reviews_count: Optional[int] = None reviews_count: Optional[int] = None
total_reviews: Optional[int] = None # Total reviews available for this place total_reviews: Optional[int] = None # Total reviews available for this place
scrape_time: Optional[float] = None scrape_time: Optional[float] = None
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
# Business metadata # Business metadata
business_name: Optional[str] = None business_name: Optional[str] = None
business_address: Optional[str] = None business_address: Optional[str] = None
business_category: Optional[str] = None # Category (e.g., "Barber shop")
review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts
class ReviewsResponse(BaseModel): class ReviewsResponse(BaseModel):
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
raise HTTPException(status_code=500, detail="Database not initialized") raise HTTPException(status_code=500, detail="Database not initialized")
try: try:
# Merge browser fingerprint into metadata if provided
metadata = request.metadata or {}
if request.browser_fingerprint:
fp = request.browser_fingerprint
metadata['browser_fingerprint'] = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
elif request.geolocation:
metadata['geolocation'] = {
'lat': request.geolocation.lat,
'lng': request.geolocation.lng
}
# Create job in database # Create job in database
job_id = await db.create_job( job_id = await db.create_job(
url=str(request.url), url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None, webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret, webhook_secret=request.webhook_secret,
metadata=request.metadata metadata=metadata
) )
# Start scraping job in background # Start scraping job in background
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
if not job: if not job:
raise HTTPException(status_code=404, detail="Job not found") raise HTTPException(status_code=404, detail="Job not found")
# Parse review_topics if it's a string (JSONB might be returned as string)
review_topics = job.get('review_topics')
if isinstance(review_topics, str):
try:
review_topics = json.loads(review_topics)
except:
review_topics = None
# Extract business info from metadata if available
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
business_name = metadata.get('business_name') if metadata else None
business_category = metadata.get('business_category') if metadata else None
return JobResponse( return JobResponse(
job_id=str(job['job_id']), job_id=str(job['job_id']),
status=job['status'], status=job['status'],
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
created_at=job['created_at'].isoformat(), created_at=job['created_at'].isoformat(),
started_at=job['started_at'].isoformat() if job['started_at'] else None, started_at=job['started_at'].isoformat() if job['started_at'] else None,
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None, completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
reviews_count=job['reviews_count'], reviews_count=job['reviews_count'],
total_reviews=job.get('total_reviews'), total_reviews=job.get('total_reviews'),
scrape_time=job['scrape_time'], scrape_time=job['scrape_time'],
error_message=job['error_message'], error_message=job['error_message'],
webhook_url=job.get('webhook_url') webhook_url=job.get('webhook_url'),
business_name=business_name,
business_category=business_category,
review_topics=review_topics
) )
@@ -541,25 +611,32 @@ async def stream_all_jobs():
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews") @app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID): async def get_job_reviews(job_id: UUID):
""" """
Get the actual reviews data for a completed job. Get reviews data for a job.
Returns 404 if job not found or not completed yet. Returns reviews for completed, partial, or running jobs (if reviews have been collected).
Returns 404 if job not found or no reviews available yet.
""" """
if not db: if not db:
raise HTTPException(status_code=500, detail="Database not initialized") raise HTTPException(status_code=500, detail="Database not initialized")
reviews = await db.get_job_reviews(job_id) # Get reviews (includes completed, running, and partial jobs)
reviews = await db.get_job_reviews(job_id, include_partial=True)
if reviews is None: if reviews is None:
job = await db.get_job(job_id) job = await db.get_job(job_id)
if not job: if not job:
raise HTTPException(status_code=404, detail="Job not found") raise HTTPException(status_code=404, detail="Job not found")
elif job['status'] != 'completed': elif job['status'] == 'pending':
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
detail=f"Job not completed yet (current status: {job['status']})" detail="Job has not started yet"
)
elif job['status'] == 'failed':
raise HTTPException(
status_code=400,
detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
) )
else: else:
raise HTTPException(status_code=404, detail="Reviews data not available") raise HTTPException(status_code=404, detail="No reviews data available yet")
return ReviewsResponse( return ReviewsResponse(
job_id=str(job_id), job_id=str(job_id),
@@ -603,6 +680,15 @@ async def list_jobs(
business_name = metadata.get('business_name') if metadata else None business_name = metadata.get('business_name') if metadata else None
business_address = metadata.get('business_address') if metadata else None business_address = metadata.get('business_address') if metadata else None
business_category = metadata.get('business_category') if metadata else None
# Parse review_topics if it's a string
review_topics = job.get('review_topics')
if isinstance(review_topics, str):
try:
review_topics = json.loads(review_topics)
except:
review_topics = None
result.append(JobResponse( result.append(JobResponse(
job_id=str(job['job_id']), job_id=str(job['job_id']),
@@ -615,7 +701,9 @@ async def list_jobs(
scrape_time=job.get('scrape_time'), scrape_time=job.get('scrape_time'),
error_message=job.get('error_message'), error_message=job.get('error_message'),
business_name=business_name, business_name=business_name,
business_address=business_address business_address=business_address,
business_category=business_category,
review_topics=review_topics
)) ))
return result return result
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
Get business card information from Google Maps. Get business card information from Google Maps.
Returns business name, address, rating, and review count. Returns business name, address, rating, and review count.
Uses pre-warmed Chrome worker from pool for instant response. Creates a fresh Chrome instance for reliable results (same as full scraper).
This is used to show the business confirmation card in the UI. This is used to show the business confirmation card in the UI.
""" """
worker = None
recycle_worker = False
try: try:
url = str(request.url) url = str(request.url)
# Get pre-warmed worker from validation pool # Use the SAME scraper algorithm with validation_only=True for early return
worker = await asyncio.to_thread(get_validation_worker, timeout=10) # Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
# Pooled browsers can have cookies/state that cause Google to render pages differently
if worker: # Build fingerprint dict from request
log.info(f"Using worker {worker.worker_id} for business card extraction") fingerprint = None
# Use the pooled worker (don't close it) if request.browser_fingerprint:
result = await asyncio.to_thread( fp = request.browser_fingerprint
get_business_card_info, fingerprint = {
url=url, "userAgent": fp.userAgent,
driver=worker.driver, "timezone": fp.timezone,
return_driver=True "language": fp.language,
) "platform": fp.platform,
}
# Check if the result indicates a session error if fp.viewport:
if not result['success'] and result.get('error'): fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
error_msg = result.get('error', '').lower() if fp.geolocation:
if 'invalid session' in error_msg or 'session' in error_msg: fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
log.warning(f"Worker {worker.worker_id} has invalid session, will recycle") log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
recycle_worker = True elif request.geolocation:
fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
log.info(f"Creating Chrome with geolocation only")
else: else:
# Fallback: create temporary worker log.info(f"Creating Chrome with default settings")
log.warning("No pooled worker available, creating temporary instance")
result = await asyncio.to_thread(
get_business_card_info,
url=url
)
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews result = await asyncio.to_thread(
# Let the actual scraper determine if reviews exist fast_scrape_reviews,
has_business = bool(result.get('name') and result.get('rating')) url=url,
headless=False, # Use Xvfb display
validation_only=True, # Return early after getting total_reviews
browser_fingerprint=fingerprint # Pass user's browser fingerprint
)
# Extract validation info from the result
validation_info = result.get('validation_info', {})
total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
name = validation_info.get('name')
rating = validation_info.get('rating')
category = validation_info.get('category')
address = validation_info.get('address')
# Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
has_reviews = bool(name and total_reviews > 0)
return { return {
"has_reviews": has_business, # Boolean: true if business exists "has_reviews": has_reviews, # True if business has reviews
"total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown "total_reviews": total_reviews,
"name": result.get('name'), "name": name,
"address": result.get('address'), "address": address,
"rating": result.get('rating'), "rating": rating,
"success": result['success'], "category": category,
"success": result.get('success', True),
"error": result.get('error') "error": result.get('error')
} }
except Exception as e: except Exception as e:
log.error(f"Error checking reviews: {e}") log.error(f"Error checking reviews: {e}")
# If it's a session error, recycle the worker
if worker:
error_msg = str(e).lower()
if 'invalid session' in error_msg or 'session' in error_msg:
recycle_worker = True
return { return {
"has_reviews": False, "has_reviews": False,
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
"success": False, "success": False,
"error": str(e) "error": str(e)
} }
finally:
# Release worker back to pool (or recycle if broken)
if worker:
await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics") @app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
job = await db.get_job(job_id) job = await db.get_job(job_id)
url = job['url'] url = job['url']
# Extract browser fingerprint from metadata if available
browser_fingerprint = None
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
if metadata and 'browser_fingerprint' in metadata:
browser_fingerprint = metadata['browser_fingerprint']
log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
elif metadata and 'geolocation' in metadata:
browser_fingerprint = {'geolocation': metadata['geolocation']}
log.info(f"Using user geolocation only")
# Broadcast job started via SSE # Broadcast job started via SSE
await broadcast_job_update(job_id_str, "job_started", { await broadcast_job_update(job_id_str, "job_started", {
"job_id": job_id_str, "job_id": job_id_str,
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
# Create log capture instance that we can access for real-time logs # Create log capture instance that we can access for real-time logs
log_capture = LogCapture() log_capture = LogCapture()
# Track total reviews for incremental saves
total_reviews_seen = [None]
# Accumulate all reviews for incremental saves (flush_callback receives batches)
all_reviews_collected = []
# Progress callback to update job status with current/total counts AND logs # Progress callback to update job status with current/total counts AND logs
def progress_callback(current_count: int, total_count: int): def progress_callback(current_count: int, total_count: int):
"""Update job progress and logs from worker thread""" """Update job progress and logs from worker thread"""
if total_count:
total_reviews_seen[0] = total_count
async def update(): async def update():
# Get current logs from the shared log_capture # Get current logs from the shared log_capture
current_logs = log_capture.get_logs() current_logs = log_capture.get_logs()
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
# Schedule the coroutine on the event loop # Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(update(), loop) asyncio.run_coroutine_threadsafe(update(), loop)
# Flush callback to save reviews incrementally (crash recovery).
# The scraper hands over one batch per call, so batches are accumulated
# and the full collection is written on every flush.
def flush_callback(reviews_batch: list):
    """Accumulate a batch and persist everything collected so far.

    Invoked from the scraper's worker thread; the actual database write is
    scheduled on the event loop.

    Args:
        reviews_batch: Newly scraped reviews to append to the collection.
    """
    all_reviews_collected.extend(reviews_batch)

    # Snapshot in the calling thread. The coroutine below runs later on the
    # event-loop thread, and by then this thread may already be extending
    # the shared list again; saving the live list could store a count that
    # does not match the serialized data.
    reviews_snapshot = list(all_reviews_collected)
    total_snapshot = total_reviews_seen[0]

    async def save():
        await db.save_reviews_incremental(
            job_id=job_id,
            reviews=reviews_snapshot,  # ALL reviews collected up to this flush
            total_reviews=total_snapshot
        )

    def report_failure(done):
        # Without this, an exception raised inside save() would stay stored
        # on the discarded future and never be reported.
        if done.cancelled():
            return
        error = done.exception()
        if error is not None:
            log.error(f"Incremental save failed for job {job_id}: {error}")

    # Schedule the coroutine on the event loop and surface any failure
    future = asyncio.run_coroutine_threadsafe(save(), loop)
    future.add_done_callback(report_failure)
# Run scraping with progress callback and shared log capture # Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display # headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread( result = await asyncio.to_thread(
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
url=url, url=url,
headless=False, headless=False,
progress_callback=progress_callback, progress_callback=progress_callback,
log_capture=log_capture log_capture=log_capture,
flush_callback=flush_callback,
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
) )
if result['success']: if result['success']:
# Save results to database (including scraper logs) # Save results to database (including scraper logs and review topics)
await db.save_job_result( await db.save_job_result(
job_id=job_id, job_id=job_id,
reviews=result['reviews'], reviews=result['reviews'],
scrape_time=result['time'], scrape_time=result['time'],
total_reviews=result.get('total_reviews'), total_reviews=result.get('total_reviews'),
scrape_logs=result.get('logs') scrape_logs=result.get('logs'),
review_topics=result.get('review_topics')
) )
log.info( log.info(
@@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID):
) )
else: else:
# Job failed - save logs for debugging # Job failed - check if we have partial reviews saved
await db.update_job_status( current_job = await db.get_job(job_id)
job_id, partial_count = current_job.get('reviews_count', 0) if current_job else 0
JobStatus.FAILED,
error_message=result.get('error', 'Unknown error'),
scrape_logs=result.get('logs')
)
log.error(f"Failed job {job_id}: {result.get('error')}") if partial_count > 0:
# Mark as partial - we have some reviews saved
# Broadcast job failed via SSE await db.mark_job_partial(
await broadcast_job_update(job_id_str, "job_failed", { job_id,
"job_id": job_id_str, error_message=result.get('error', 'Unknown error'),
"status": "failed", scrape_logs=result.get('logs')
"error_message": result.get('error'),
"logs": result.get('logs', [])
})
# Send failure webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
job_id=job_id,
status='failed',
error_message=result.get('error'),
secret=job.get('webhook_secret'),
db=db
) )
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
# Broadcast job partial via SSE
await broadcast_job_update(job_id_str, "job_partial", {
"job_id": job_id_str,
"status": "partial",
"reviews_count": partial_count,
"total_reviews": current_job.get('total_reviews'),
"error_message": result.get('error'),
"logs": result.get('logs', [])
})
# Send partial webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
job_id=job_id,
status='partial',
reviews_count=partial_count,
error_message=result.get('error'),
secret=job.get('webhook_secret'),
db=db
)
else:
# No reviews saved - mark as failed
await db.update_job_status(
job_id,
JobStatus.FAILED,
error_message=result.get('error', 'Unknown error'),
scrape_logs=result.get('logs')
)
log.error(f"Failed job {job_id}: {result.get('error')}")
# Broadcast job failed via SSE
await broadcast_job_update(job_id_str, "job_failed", {
"job_id": job_id_str,
"status": "failed",
"error_message": result.get('error'),
"logs": result.get('logs', [])
})
# Send failure webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
job_id=job_id,
status='failed',
error_message=result.get('error'),
secret=job.get('webhook_secret'),
db=db
)
except Exception as e: except Exception as e:
log.error(f"Error in scraping job {job_id}: {e}") log.error(f"Error in scraping job {job_id}: {e}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
await db.update_job_status( # Check if we have partial reviews saved
job_id, current_job = await db.get_job(job_id)
JobStatus.FAILED, partial_count = current_job.get('reviews_count', 0) if current_job else 0
error_message=str(e)
)
# Broadcast job failed via SSE if partial_count > 0:
await broadcast_job_update(job_id_str, "job_failed", { # Mark as partial - we have some reviews saved
"job_id": job_id_str, await db.mark_job_partial(
"status": "failed", job_id,
"error_message": str(e),
"logs": []
})
# Send failure webhook
job = await db.get_job(job_id)
if job and job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
job_id=job_id,
status='failed',
error_message=str(e), error_message=str(e),
secret=job.get('webhook_secret'), scrape_logs=log_capture.get_logs() if log_capture else None
db=db
) )
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
# Broadcast job partial via SSE
await broadcast_job_update(job_id_str, "job_partial", {
"job_id": job_id_str,
"status": "partial",
"reviews_count": partial_count,
"total_reviews": current_job.get('total_reviews'),
"error_message": str(e),
"logs": log_capture.get_logs() if log_capture else []
})
# Send partial webhook
if current_job and current_job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=current_job['webhook_url'],
job_id=job_id,
status='partial',
reviews_count=partial_count,
error_message=str(e),
secret=current_job.get('webhook_secret'),
db=db
)
else:
# No reviews saved - mark as failed
await db.update_job_status(
job_id,
JobStatus.FAILED,
error_message=str(e)
)
# Broadcast job failed via SSE
await broadcast_job_update(job_id_str, "job_failed", {
"job_id": job_id_str,
"status": "failed",
"error_message": str(e),
"logs": []
})
# Send failure webhook
if current_job and current_job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=current_job['webhook_url'],
job_id=job_id,
status='failed',
error_message=str(e),
secret=current_job.get('webhook_secret'),
db=db
)
if __name__ == "__main__": if __name__ == "__main__":
import uvicorn import uvicorn

View File

@@ -39,6 +39,8 @@ services:
- CHROME_BIN=/usr/bin/chromium - CHROME_BIN=/usr/bin/chromium
ports: ports:
- "8000:8000" - "8000:8000"
- "5900:5900" # VNC port (for VNC client)
- "6080:6080" # noVNC web interface (browser access)
depends_on: depends_on:
db: db:
condition: service_healthy condition: service_healthy

View File

@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
COMPLETED = "completed" COMPLETED = "completed"
FAILED = "failed" FAILED = "failed"
CANCELLED = "cancelled" CANCELLED = "cancelled"
PARTIAL = "partial" # Job crashed but has partial reviews saved
class DatabaseManager: class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
created_at TIMESTAMP NOT NULL DEFAULT NOW(), created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP, started_at TIMESTAMP,
completed_at TIMESTAMP, completed_at TIMESTAMP,
updated_at TIMESTAMP,
reviews_count INTEGER, reviews_count INTEGER,
total_reviews INTEGER, total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
metadata JSONB, metadata JSONB,
scrape_logs JSONB, scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')) CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
); );
""") """)
@@ -88,6 +90,24 @@ class DatabaseManager:
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB; ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""") """)
# Add updated_at column if it doesn't exist (for incremental progress tracking)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
""")
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
""")
# Update constraint to include 'partial' status (for existing databases)
await conn.execute("""
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
""")
await conn.execute("""
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
""")
# Create indexes # Create indexes
await conn.execute(""" await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status); CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
created_at, created_at,
started_at, started_at,
completed_at, completed_at,
updated_at,
reviews_count, reviews_count,
total_reviews, total_reviews,
reviews_data, reviews_data,
scrape_time, scrape_time,
error_message, error_message,
metadata, metadata,
scrape_logs scrape_logs,
review_topics
FROM jobs FROM jobs
WHERE job_id = $1 WHERE job_id = $1
""", job_id) """, job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:
return dict(row) return dict(row)
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]: async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
""" """
Get reviews for a specific job. Get reviews for a specific job.
Args: Args:
job_id: Job UUID job_id: Job UUID
include_partial: If True, also return reviews for running and partial jobs
Returns: Returns:
List of reviews or None if not found/not completed List of reviews or None if not found/no reviews
""" """
async with self.pool.acquire() as conn: async with self.pool.acquire() as conn:
reviews_data = await conn.fetchval(""" if include_partial:
SELECT reviews_data # Return reviews for completed, running, or partial jobs
FROM jobs reviews_data = await conn.fetchval("""
WHERE job_id = $1 AND status = 'completed' SELECT reviews_data
""", job_id) FROM jobs
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
""", job_id)
else:
# Only return reviews for completed jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if not reviews_data: if not reviews_data:
return None return None
@@ -278,7 +310,8 @@ class DatabaseManager:
reviews: List[Dict[str, Any]], reviews: List[Dict[str, Any]],
scrape_time: float, scrape_time: float,
total_reviews: Optional[int] = None, total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None scrape_logs: Optional[List[Dict[str, Any]]] = None,
review_topics: Optional[List[Dict[str, Any]]] = None
): ):
""" """
Save scraping results to database. Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
scrape_time: Time taken to scrape in seconds scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter) total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper scrape_logs: List of log entries from the scraper
review_topics: List of topic filter dictionaries with topic and count
""" """
async with self.pool.acquire() as conn: async with self.pool.acquire() as conn:
# If reviews list is empty, check if job already has reviews from incremental saves
# This happens when flush_callback was used during scraping
if not reviews:
existing = await conn.fetchval(
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
)
if existing and existing > 0:
# Job has reviews from incremental saves, don't overwrite reviews_data
await conn.execute("""
UPDATE jobs
SET
status = 'completed',
completed_at = NOW(),
total_reviews = COALESCE($2, total_reviews),
scrape_time = $3,
scrape_logs = $4::jsonb,
review_topics = $5::jsonb
WHERE job_id = $1
""", job_id, total_reviews, scrape_time,
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
return
await conn.execute(""" await conn.execute("""
UPDATE jobs UPDATE jobs
SET SET
@@ -300,13 +358,70 @@ class DatabaseManager:
total_reviews = $3, total_reviews = $3,
reviews_data = $4::jsonb, reviews_data = $4::jsonb,
scrape_time = $5, scrape_time = $5,
scrape_logs = $6::jsonb scrape_logs = $6::jsonb,
review_topics = $7::jsonb
WHERE job_id = $1 WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time, """, job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None) json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}") log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def save_reviews_incremental(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    total_reviews: Optional[int] = None
):
    """
    Save reviews incrementally during scraping.

    Called on each flush to preserve progress in case of crash. The row is
    only touched while the job's status is 'running'; if no such row
    matches, nothing is written and a warning is logged instead of the usual
    debug message.

    Args:
        job_id: Job UUID
        reviews: ALL reviews collected so far (not just new ones)
        total_reviews: Total reviews available (from page counter); when None
            the stored total_reviews value is kept (COALESCE).
    """
    async with self.pool.acquire() as conn:
        command_tag = await conn.execute("""
            UPDATE jobs
            SET
                reviews_count = $2,
                total_reviews = COALESCE($3, total_reviews),
                reviews_data = $4::jsonb,
                updated_at = NOW()
            WHERE job_id = $1 AND status = 'running'
        """, job_id, len(reviews), total_reviews, json.dumps(reviews))

    # NOTE(review): assumes an asyncpg-style command tag ("UPDATE <rowcount>")
    # is returned by execute(); anything else is treated as a successful write.
    no_row_matched = (
        isinstance(command_tag, str) and command_tag.split()[-1:] == ["0"]
    )
    if no_row_matched:
        log.warning(
            f"Incremental save skipped for job {job_id}: no running job matched"
        )
    else:
        log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
async def mark_job_partial(
    self,
    job_id: UUID,
    error_message: str,
    scrape_logs: Optional[List[Dict[str, Any]]] = None
):
    """
    Mark a job as partial (crashed but has some reviews saved).

    Sets status to 'partial', stamps completed_at, and stores the error
    message and scraper logs. If no row matches job_id, a warning is logged
    instead of the usual confirmation.

    Args:
        job_id: Job UUID
        error_message: Error that caused the crash
        scrape_logs: Log entries from the scraper; a missing or empty list is
            stored as NULL.
    """
    logs_payload = json.dumps(scrape_logs) if scrape_logs else None

    async with self.pool.acquire() as conn:
        command_tag = await conn.execute("""
            UPDATE jobs
            SET
                status = 'partial',
                completed_at = NOW(),
                error_message = $2,
                scrape_logs = $3::jsonb
            WHERE job_id = $1
        """, job_id, error_message, logs_payload)

    # NOTE(review): assumes an asyncpg-style command tag ("UPDATE <rowcount>")
    # is returned by execute(); anything else is treated as a successful write.
    no_row_matched = (
        isinstance(command_tag, str) and command_tag.split()[-1:] == ["0"]
    )
    if no_row_matched:
        log.warning(f"Could not mark job {job_id} as partial: no matching job row")
    else:
        log.info(f"Marked job {job_id} as partial due to: {error_message}")
async def list_jobs( async def list_jobs(
self, self,
status: Optional[JobStatus] = None, status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
total_reviews, total_reviews,
scrape_time, scrape_time,
error_message, error_message,
metadata metadata,
review_topics
FROM jobs FROM jobs
WHERE status = $1 WHERE status = $1
ORDER BY created_at DESC ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
total_reviews, total_reviews,
scrape_time, scrape_time,
error_message, error_message,
metadata metadata,
review_topics
FROM jobs FROM jobs
ORDER BY created_at DESC ORDER BY created_at DESC
LIMIT $1 OFFSET $2 LIMIT $1 OFFSET $2

View File

@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None, flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None) -> dict: progress_callback=None, validation_only: bool = False) -> dict:
""" """
Scrape Google Maps reviews. Scrape Google Maps reviews.
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Track total reviews (persists across refreshes) # Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation total_reviews = [None] # Use list for closure mutation
# Store business info extracted from overview (before clicking reviews tab)
business_info_cache = [None]
# Hard refresh counter # Hard refresh counter
hard_refresh_count = [0] hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up max_hard_refreshes = 3 # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass pass
return None return None
def setup_reviews_page(is_refresh=False): def setup_reviews_page(is_refresh=False, validation_only_mode=False):
""" """
Setup the reviews page for scraping. Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure. Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh. Can be called after initial load or after a hard refresh.
If validation_only_mode=True, returns early after extracting business info
without clicking reviews tab or finding scroll container.
""" """
nonlocal total_reviews nonlocal total_reviews
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh) # Navigate to URL (only on initial load or refresh)
if not is_refresh: if not is_refresh:
# Reset browser state by navigating to blank page first
# This clears any stale state from pooled browser sessions
try:
driver.get("about:blank")
time.sleep(0.1)
except:
pass
log.info(f"🌐 Loading: {url[:80]}...") log.info(f"🌐 Loading: {url[:80]}...")
else: else:
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...") log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Reload original URL after consent # Reload original URL after consent
log.info(" Reloading after consent...") log.info(" Reloading after consent...")
driver.get(url) driver.get(url)
# Wait for page to settle after consent reload
time.sleep(1)
break break
except: except:
pass pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
time.sleep(0.01) # 10ms - responsive but low CPU time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview) # Extract business info and total review count BEFORE clicking reviews tab (on Overview)
# This captures name, rating, category, address while they're visible
# Only on first load (don't overwrite if we already have it) # Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None: if total_reviews[0] is None or business_info_cache[0] is None:
start = time.time() start = time.time()
while time.time() - start < 5: while time.time() - start < 5:
try: try:
count = driver.execute_script(""" info = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"]'); var result = {
for (var i = 0; i < reviewSpans.length; i++) { total_reviews: null,
var label = reviewSpans[i].getAttribute('aria-label') || ''; name: null,
var match = label.match(/^([\\d,\\.]+)\\s*review/i); rating: null,
if (match) { category: null,
return parseInt(match[1].replace(/[,\\.]/g, '')); address: null
};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Category - use jsaction attribute (robust selector)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Rating and review count from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars"
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: "79 reviews"
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
} }
} }
return null;
// Address from button
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""") """)
if count:
total_reviews[0] = count if info:
log.info(f"📊 Total reviews on page: {count}") if info.get('total_reviews') and total_reviews[0] is None:
break total_reviews[0] = info['total_reviews']
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
if info.get('name') and business_info_cache[0] is None:
business_info_cache[0] = info
log.info(f"📍 Business: {info.get('name')}")
if total_reviews[0] and business_info_cache[0]:
break
except: except:
pass pass
time.sleep(0.1) time.sleep(0.1)
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
if validation_only_mode:
log.info("📋 Validation mode: returning early (skipping reviews tab)")
return ("validation_done", None)
# Click reviews tab - poll until found # Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time() start = time.time()
tab_clicked = False tab_clicked = False
tabs_logged = False
while time.time() - start < 5: # Max 5s for tabs while time.time() - start < 5: # Max 5s for tabs
try: try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']") tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
# Log available tabs once for debugging
if not tabs_logged and tabs:
tabs_logged = True
tab_texts = [t.text for t in tabs]
log.info(f" Available tabs: {tab_texts}")
for tab in tabs: for tab in tabs:
tab_text = tab.text.lower() tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords): if any(kw in tab_text for kw in review_keywords):
if not is_refresh: if not is_refresh:
log.info(f" Clicking reviews tab: '{tab.text}'") log.info(f" Clicking reviews tab: '{tab.text}'")
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
if total_reviews[0] is None:
import re
# Try pattern with parentheses: "Reviews (79)"
match = re.search(r'\((\d+)\)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
else:
# Try pattern with newline: "Reviews\n79"
match = re.search(r'(\d+)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
tab.click() tab.click()
tab_clicked = True tab_clicked = True
break break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return scroll_container, stop_scrolling return scroll_container, stop_scrolling
# Initial page setup # Helper to extract review topics from the reviews tab
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False) def extract_review_topics():
"""Extract review topic filters from radiogroup (robust selectors)."""
try:
topics = driver.execute_script("""
var topics = [];
// Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
if (!container) {
// Fallback: any radiogroup in the reviews area
container = document.querySelector('div[role="radiogroup"]');
}
if (container) {
var buttons = container.querySelectorAll('button[role="radio"]');
for (var btn of buttons) {
var label = btn.getAttribute('aria-label') || '';
// Parse "hair salon, mentioned in 4 reviews" format
var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
if (match) {
topics.push({
topic: match[1].trim(),
count: parseInt(match[2])
});
} else if (label && !label.toLowerCase().includes('all review')) {
// Fallback: try to extract from child spans
var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
var nameSpan = btn.querySelector('.uEubGf, span:first-child');
if (nameSpan) {
var name = nameSpan.textContent.trim();
var count = countSpan ? parseInt(countSpan.textContent) : 0;
if (name && name.toLowerCase() !== 'all') {
topics.push({topic: name, count: count || 0});
}
}
}
}
}
return topics;
""")
return topics or []
except:
return []
# Initial page setup (pass validation_only to skip unnecessary steps)
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
# setup_reviews_page returns ("validation_done", None) in this case
if validation_only or scroll_container == "validation_done":
# Use the business info captured from Overview (before clicking reviews tab)
business_info = business_info_cache[0] or {}
return {
"reviews": [],
"total": total_reviews[0] or 0,
"scrolls": 0,
"error": None,
"validation_info": {
"name": business_info.get("name"),
"rating": business_info.get("rating"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"total_reviews": total_reviews[0]
}
}
if not scroll_container: if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"} return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
# Extract review topics after reviews tab is loaded (before scrolling begins)
time.sleep(0.5) # Brief wait for topic filters to render
review_topics = extract_review_topics()
if review_topics:
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
def get_api_reviews(): def get_api_reviews():
"""Get reviews from intercepted API responses.""" """Get reviews from intercepted API responses."""
api_revs = [] api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"total_flushed": total_flushed[0], "total_flushed": total_flushed[0],
"checks": check_num, "checks": check_num,
"url": url, "url": url,
"logs": log.get_logs() "logs": log.get_logs(),
"review_topics": review_topics # Topic filters with mention counts
} }
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False, progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None): log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
browser_fingerprint: dict = None):
""" """
Production-compatible wrapper for scrape_reviews. Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py. Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver: Existing driver instance to reuse driver: Existing driver instance to reuse
return_driver: If True, return driver in result return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access log_capture: Optional LogCapture instance for real-time log access
browser_fingerprint: Optional dict with user's browser fingerprint:
- geolocation: {lat, lng}
- userAgent: string
- viewport: {width, height}
- timezone: string (e.g., "Europe/Madrid")
- language: string (e.g., "en-US")
- platform: string (e.g., "MacIntel")
Returns: Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
log_capture = log_capture or LogCapture() log_capture = log_capture or LogCapture()
try: try:
# Extract fingerprint settings
fp = browser_fingerprint or {}
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
geolocation = fp.get('geolocation')
timezone = fp.get('timezone')
language = fp.get('language', 'en-US')
# Create driver if not provided # Create driver if not provided
if not driver: if not driver:
driver = Driver( driver = Driver(
uc=True, uc=True,
headless=headless, headless=headless,
page_load_strategy="normal", page_load_strategy="normal",
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" agent=user_agent # Use user's actual user agent
) )
driver.set_window_size(1200, 900) # Proper viewport for Google Maps # Set viewport to match user's screen
driver.set_window_size(viewport['width'], viewport['height'])
# Set Chrome geolocation to US (Boston, MA) using CDP # Apply browser fingerprint settings via CDP
# This ensures Google Maps shows US results regardless of server location
try: try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { # Set timezone if provided
'latitude': 42.3601, if timezone:
'longitude': -71.0589, driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
'accuracy': 100 log_capture.info(f"Set timezone to {timezone}")
})
log_capture.info("Set geolocation to US (Boston, MA)") # Set locale/language
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
# Set geolocation
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': geolocation['lat'],
'longitude': geolocation['lng'],
'accuracy': 1000 # ~1km accuracy for IP-based location
})
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
else:
# Default to US (Boston, MA) if no geolocation provided
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA) [default]")
if fp:
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
except Exception as e: except Exception as e:
log_capture.warning(f"Could not set geolocation: {e}") log_capture.warning(f"Could not apply fingerprint settings: {e}")
# Add URL parameters for consistent results # Add URL parameters for consistent results
if 'hl=' not in url: if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
if 'gl=' not in url: if 'gl=' not in url:
url = f"{url}&gl=us" url = f"{url}&gl=us"
# Create progress wrapper if callback provided # Create combined flush callback for progress + external handler
flush_callback = None external_flush = flush_callback # Save external callback
if progress_callback: internal_flush = None
if progress_callback or external_flush:
collected = [0] collected = [0]
def flush_with_progress(reviews_batch): def combined_flush(reviews_batch):
collected[0] += len(reviews_batch) collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
progress_callback(collected[0], None) if progress_callback:
flush_callback = flush_with_progress progress_callback(collected[0], None)
if external_flush:
external_flush(reviews_batch) # Pass reviews to external handler
internal_flush = combined_flush
# Run the scraper with progress callback for real-time updates # Run the scraper with progress callback for real-time updates
result = scrape_reviews( result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
url=url, url=url,
max_reviews=999999, # Effectively unlimited max_reviews=999999, # Effectively unlimited
timeout_no_new=15, timeout_no_new=15,
flush_callback=flush_callback, flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture, log_capture=log_capture,
progress_callback=progress_callback # Pass through for real-time log updates progress_callback=progress_callback, # Pass through for real-time log updates
validation_only=validation_only # Return early if just validating
) )
elapsed = time.time() - start_time elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed, "time": elapsed,
"success": True, "success": True,
"error": None, "error": None,
"logs": result.get("logs", []) "logs": result.get("logs", []),
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
} }
# Include validation_info if in validation_only mode
if validation_only and "validation_info" in result:
response["validation_info"] = result["validation_info"]
if return_driver: if return_driver:
response["driver"] = driver response["driver"] = driver
elif should_close_driver: elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
} }
def _force_english_locale(url: str) -> str:
    """Return url with hl=en and gl=us added to its query string when absent.

    Parameters are detected by parsing the query string rather than by
    substring, so unrelated keys such as "xhl" do not count as "hl", and the
    additions are placed before any "#fragment". Existing query text is kept
    verbatim (no re-encoding).
    """
    from urllib.parse import parse_qs, urlsplit, urlunsplit

    parts = urlsplit(url)
    present = parse_qs(parts.query, keep_blank_values=True)
    query = parts.query
    for key, value in (("hl", "en"), ("gl", "us")):
        if key not in present:
            query = f"{query}&{key}={value}" if query else f"{key}={value}"
    return urlunsplit(parts._replace(query=query))


def extract_about_info(driver, url: str = None) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).

    This function should be called AFTER reviews are scraped if about info is needed,
    as it navigates to a different tab.

    Args:
        driver: Selenium WebDriver instance (already on the business page)
        url: Optional URL to navigate to first (if not already on the page).
            English locale parameters (hl=en, gl=us) are added when missing.

    Returns:
        dict with section names as keys, each containing a list of
        {"feature": str, "available": bool} entries. Returns {} when the
        About tab cannot be clicked or nothing is extracted, and
        {"error": "<message>"} if any step raises.
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English so the text-based selectors below match
            driver.get(_force_english_locale(url))
            time.sleep(1)

        # Click About tab using robust selectors
        clicked = driver.execute_script("""
            // Try multiple selectors for about tab
            var selectors = [
                'button[aria-label*="About"]',
                'button[data-tab-index="2"]',
                'div[role="tablist"] button:nth-child(3)',
                'button[jsaction*="about"]'
            ];

            for (var sel of selectors) {
                var btn = document.querySelector(sel);
                if (btn && btn.textContent.toLowerCase().includes('about')) {
                    btn.click();
                    return true;
                }
            }

            // Fallback: find by text content
            var buttons = document.querySelectorAll('button');
            for (var btn of buttons) {
                if (btn.textContent.trim().toLowerCase() === 'about') {
                    btn.click();
                    return true;
                }
            }
            return false;
        """)

        if not clicked:
            return {}

        time.sleep(1.5)  # Wait for about tab to load

        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
            var about = {};

            // Find the about region by aria-label or role
            var container = document.querySelector('div[role="region"][aria-label*="About"]');
            if (!container) {
                // Fallback: look for the scrollable area with sections
                container = document.querySelector('.m6QErb[aria-label*="About"]');
            }
            if (!container) {
                // Last resort: find sections by h2 headers
                container = document;
            }

            // Find all section headers (h2 elements)
            var sections = container.querySelectorAll('h2');
            for (var h2 of sections) {
                var sectionName = h2.textContent.trim();
                var items = [];

                // Find the ul list following this h2
                var parent = h2.closest('.iP2t7d, div');
                if (parent) {
                    var listItems = parent.querySelectorAll('li span[aria-label]');
                    for (var li of listItems) {
                        var label = li.getAttribute('aria-label');
                        if (label) {
                            // Parse "Has toilet" or "No wheelchair-accessible car park"
                            var hasFeature = !label.toLowerCase().startsWith('no ');
                            var featureName = label.replace(/^(Has |No )/i, '');
                            items.push({
                                feature: featureName,
                                available: hasFeature
                            });
                        }
                    }
                }

                if (sectionName && items.length > 0) {
                    about[sectionName] = items;
                }
            }

            return about;
        """)

        return about or {}

    except Exception as e:
        # Best-effort extraction: report the failure to the caller instead of raising
        return {"error": str(e)}
# Test function # Test function
if __name__ == "__main__": if __name__ == "__main__":
from seleniumbase import Driver from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
dict with: name, address, rating, total_reviews, success, error, time dict with: name, address, rating, total_reviews, success, error, time
""" """
from seleniumbase import Driver from seleniumbase import Driver
import logging
log = logging.getLogger(__name__)
start_time = time.time() start_time = time.time()
driver_provided = driver is not None driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
except: except:
pass pass
# Clear state if reusing a pooled driver (ensures clean page load) # Don't clear state - Google may serve different content based on session history
if driver_provided: # The scraper doesn't reset state, so validation shouldn't either
try:
driver.delete_all_cookies() # Force English interface for consistent parsing
driver.get("about:blank") if 'hl=' not in url:
except: separator = '&' if '?' in url else '?'
pass url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Navigate to URL # Navigate to URL
driver.get(url) driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
while time.time() - start < 5: while time.time() - start < 5:
if "consent.google" in driver.current_url: if "consent.google" in driver.current_url:
try: try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"): # Try multiple approaches to find and click accept button
txt = btn.text.lower() clicked = False
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click() # Method 1: Find by aria-label (most reliable for Google consent)
driver.get(url) for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
break btn.click()
except: clicked = True
break
# Method 2: Find by text content
if not clicked:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
clicked = True
break
if clicked:
time.sleep(0.5) # Brief wait for consent to process
driver.get(url) # Reload the target URL
time.sleep(0.5) # Wait for reload
except Exception as e:
pass pass
break break
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url): if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
break break
time.sleep(0.01) # 10ms - responsive but low CPU time.sleep(0.01) # 10ms - responsive but low CPU
# Log current URL after consent handling
try:
current_url = driver.current_url
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
except:
pass
# Wait for page to fully render before polling (tabs may load dynamically)
time.sleep(2)
# Poll for business info (same pattern as total_reviews extraction) # Poll for business info (same pattern as total_reviews extraction)
info = {"name": None, "rating": None, "total_reviews": None, "address": None} # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
start = time.time() start = time.time()
while time.time() - start < 5: debug_logged = False
while time.time() - start < 10:
try: try:
info = driver.execute_script(""" info = driver.execute_script("""
var result = {name: null, rating: null, total_reviews: null, address: null}; var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
// Business name from h1 // Business name from h1
var h1 = document.querySelector('h1'); var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim(); if (h1) result.name = h1.textContent.trim();
// Rating and reviews from span[role="img"] aria-labels // Category - use jsaction attribute (robust, survives class changes)
// Same pattern as scrape_reviews for consistency var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Fallback: look for button after rating that's not a link
if (!result.category) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent.trim();
// Categories are short words, no numbers, not navigation
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
!text.match(/review|star|direction|save|share|photo/i)) {
// Check if it's near the rating area
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
if (parent) {
result.category = text;
break;
}
}
}
}
// Rating from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]'); var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) { for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || ''; var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match) // Collect debug info for all aria-labels
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i); if (label) {
result.debug.push('img-aria: ' + label);
}
// Rating: "4.8 stars" (English forced via hl=en)
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) { if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.')); result.rating = parseFloat(rMatch[1].replace(',', '.'));
} }
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
// Plus Spanish "reseña" which doesn't contain "review" // Try direct format first: "79 reviews"
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i); var revMatch = label.match(/^([\\d,]+)\\s*review/i);
if (revMatch && !result.total_reviews) { if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, '')); result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
}
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
if (!result.total_reviews) {
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
if (combinedMatch) {
var countStr = combinedMatch[1].replace(/,/g, '');
if (countStr.includes('k')) {
// Handle "9k+" format
result.total_reviews = parseInt(countStr) * 1000;
} else {
result.total_reviews = parseInt(countStr);
}
}
}
}
// Also collect tab button texts for debugging (include full text including numbers)
var tabs = document.querySelectorAll('button[role="tab"]');
for (var j = 0; j < tabs.length; j++) {
var tabText = tabs[j].textContent.trim();
result.debug.push('tab: ' + tabText);
// Also try to extract review count from tab text like "Reviews (79)"
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
var tabMatch = tabText.match(/\\((\\d+)\\)/);
if (tabMatch) {
result.total_reviews = parseInt(tabMatch[1]);
result.debug.push('Found reviews in tab: ' + tabText);
}
}
}
// Also check ALL buttons for reviews count
var allButtons = document.querySelectorAll('button');
for (var b = 0; b < allButtons.length; b++) {
var btnText = allButtons[b].textContent || '';
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
var numMatch = btnText.match(/\\((\\d+)\\)/);
if (numMatch && !result.total_reviews) {
result.total_reviews = parseInt(numMatch[1]);
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
}
}
}
// Check if we're on search results vs place page
result.debug.push('title: ' + document.title);
result.debug.push('url: ' + window.location.href.substring(0, 80));
// Check for search results list
var searchResults = document.querySelectorAll('div[role="feed"] > div');
result.debug.push('search_results_count: ' + searchResults.length);
// Fallback: Get review count from Reviews tab button "Reviews (79)"
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
if (!result.total_reviews) {
var tabs = document.querySelectorAll('button[role="tab"]');
for (var tab of tabs) {
var text = tab.textContent.toLowerCase();
if (text.includes('review')) {
var match = tab.textContent.match(/\\((\\d+)\\)/);
if (match) {
result.total_reviews = parseInt(match[1]);
break;
}
}
}
}
// Fallback 2: Look for any button with "Reviews" and a number
if (!result.total_reviews) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent;
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
var numMatch = text.match(/\\((\\d+)\\)/);
if (numMatch) {
result.total_reviews = parseInt(numMatch[1]);
break;
}
}
} }
} }
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
var addrBtn = document.querySelector('button[data-item-id="address"]'); var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) { if (addrBtn) {
var label = addrBtn.getAttribute('aria-label'); var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, ''); if (label) result.address = label.replace(/^Address:\\s*/i, '');
} }
return result; return result;
""") """)
# Exit early if we have the essentials # Exit early if we have the essentials (name found AND reviews count > 0)
if info.get("name") and info.get("total_reviews") is not None: if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
break break
# Log debug info once after 3 seconds
if not debug_logged and time.time() - start > 3:
debug_logged = True
debug_info = info.get("debug", [])
if debug_info:
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
for d in debug_info[:10]: # First 10 debug items
log.info(f" {d}")
except: except:
pass pass
time.sleep(0.1) # 100ms between polls time.sleep(0.1) # 100ms between polls
# Final debug log if still no reviews
if not info.get("total_reviews"):
debug_info = info.get("debug", [])
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
if debug_info:
log.warning(f" Debug items: {debug_info[:10]}")
return { return {
"name": info.get("name"), "name": info.get("name"),
"address": info.get("address"), "address": info.get("address"),
"rating": info.get("rating"), "rating": info.get("rating"),
"total_reviews": info.get("total_reviews"), "total_reviews": info.get("total_reviews"),
"category": info.get("category"),
"success": bool(info.get("name")), "success": bool(info.get("name")),
"error": None, "error": None,
"time": time.time() - start_time "time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
"address": None, "address": None,
"rating": None, "rating": None,
"total_reviews": None, "total_reviews": None,
"category": None,
"success": False, "success": False,
"error": str(e), "error": str(e),
"time": time.time() - start_time "time": time.time() - start_time

View File

@@ -27,6 +27,8 @@ interface SelectedJob {
jobId: string; jobId: string;
newCount?: number; newCount?: number;
previousJobId?: string; previousJobId?: string;
businessCategory?: string;
reviewTopics?: { topic: string; count: number }[];
} }
type ViewType = 'newScrape' | 'jobs' | 'reports'; type ViewType = 'newScrape' | 'jobs' | 'reports';
@@ -106,6 +108,8 @@ export default function Home() {
jobId: job.job_id, jobId: job.job_id,
newCount: data.new_count, newCount: data.new_count,
previousJobId: previousJob?.job_id, previousJobId: previousJob?.job_id,
businessCategory: job.business_category || undefined,
reviewTopics: job.review_topics || undefined,
}); });
setActiveView('reports'); setActiveView('reports');
} }
@@ -155,7 +159,7 @@ export default function Home() {
Back to Reports Back to Reports
</button> </button>
</div> </div>
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} /> <ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} businessCategory={selectedJob.businessCategory} reviewTopics={selectedJob.reviewTopics} />
</div> </div>
) : ( ) : (
<div className="h-full overflow-y-auto p-6"> <div className="h-full overflow-y-auto p-6">

View File

@@ -22,14 +22,21 @@ interface ReviewWithNew extends Review {
photo_urls?: string[] | null; photo_urls?: string[] | null;
} }
/**
 * One entry of the `reviewTopics` prop, rendered as a chip in the
 * "What People Talk About" panel of the analytics dashboard.
 */
interface ReviewTopic {
/** Topic label shown as the chip text. */
topic: string;
/** Number shown in the chip's badge. NOTE(review): presumably the number of reviews mentioning the topic — confirm against the API. */
count: number;
}
interface ReviewAnalyticsProps { interface ReviewAnalyticsProps {
reviews: ReviewWithNew[]; reviews: ReviewWithNew[];
businessName?: string; businessName?: string;
businessUrl?: string; businessUrl?: string;
newCount?: number; newCount?: number;
businessCategory?: string;
reviewTopics?: ReviewTopic[];
} }
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) { export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) {
const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]); const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
const [globalFilter, setGlobalFilter] = useState(''); const [globalFilter, setGlobalFilter] = useState('');
@@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
{/* Header */} {/* Header */}
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
<div> <div>
<h2 className="text-3xl font-bold text-gray-900"> <div className="flex items-center gap-3">
{businessName || 'Review Analytics'} <h2 className="text-3xl font-bold text-gray-900">
</h2> {businessName || 'Review Analytics'}
</h2>
{businessCategory && (
<span className="px-3 py-1 bg-purple-100 text-purple-800 text-sm font-medium rounded-full border border-purple-300">
{businessCategory}
</span>
)}
</div>
{businessUrl && ( {businessUrl && (
<a <a
href={businessUrl} href={businessUrl}
@@ -821,6 +835,33 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
</div> </div>
</div> </div>
{/* Review Topics - from Google Maps */}
{reviewTopics && reviewTopics.length > 0 && (
<div className="bg-white border-2 border-gray-300 rounded-xl p-5 shadow-md">
<div className="flex items-center gap-2 mb-4">
<MessageSquare className="w-6 h-6 text-indigo-600" />
<h3 className="text-lg font-bold text-gray-900">What People Talk About</h3>
<span className="text-sm text-gray-500">({reviewTopics.length} topics from Google)</span>
</div>
<div className="flex flex-wrap gap-2">
{reviewTopics.slice(0, 15).map((topic, idx) => (
<div
key={idx}
className="px-3 py-1.5 bg-gradient-to-r from-indigo-50 to-purple-50 border border-indigo-200 rounded-full flex items-center gap-2"
>
<span className="text-sm font-medium text-indigo-800">{topic.topic}</span>
<span className="text-xs bg-indigo-200 text-indigo-900 px-1.5 py-0.5 rounded-full font-bold">
{topic.count}
</span>
</div>
))}
</div>
{reviewTopics.length > 15 && (
<p className="text-sm text-gray-500 mt-3">+{reviewTopics.length - 15} more topics</p>
)}
</div>
)}
{/* Rating & Volume Timeline */} {/* Rating & Volume Timeline */}
{timelineData.length > 0 && ( {timelineData.length > 0 && (
<div className={`bg-white rounded-xl p-6 shadow-md transition-all ${ <div className={`bg-white rounded-xl p-6 shadow-md transition-all ${

View File

@@ -15,7 +15,7 @@ interface Review {
export interface JobStatus { export interface JobStatus {
job_id: string; job_id: string;
status: 'pending' | 'running' | 'completed' | 'failed'; status: 'pending' | 'running' | 'completed' | 'failed' | 'partial';
url: string; url: string;
created_at: string; created_at: string;
started_at: string | null; started_at: string | null;
@@ -28,8 +28,11 @@ export interface JobStatus {
// Business metadata for tracking and comparison // Business metadata for tracking and comparison
business_name: string | null; business_name: string | null;
business_address: string | null; business_address: string | null;
business_category: string | null;
rating_snapshot: number | null; rating_snapshot: number | null;
total_reviews_snapshot: number | null; total_reviews_snapshot: number | null;
// Review topics extracted from Google Maps
review_topics: { topic: string; count: number }[] | null;
} }
interface ScraperTestProps { interface ScraperTestProps {
@@ -56,7 +59,64 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
const [businessRating, setBusinessRating] = useState<number | null>(null); const [businessRating, setBusinessRating] = useState<number | null>(null);
const [businessImage, setBusinessImage] = useState<string | null>(null); const [businessImage, setBusinessImage] = useState<string | null>(null);
const [businessCategory, setBusinessCategory] = useState<string | null>(null); const [businessCategory, setBusinessCategory] = useState<string | null>(null);
// Browser fingerprint of the current user. It starts as `{}` and is filled in
// after mount, which is why every field is optional. The whole object is sent
// as `browser_fingerprint` (and `geolocation` additionally on its own) in the
// bodies of the /api/check-reviews and /api/scrape requests.
const [userFingerprint, setUserFingerprint] = useState<{
geolocation?: {lat: number, lng: number}, // approximate position from the IP lookup
userAgent?: string, // navigator.userAgent
viewport?: {width: number, height: number}, // window.screen width/height
timezone?: string, // Intl.DateTimeFormat().resolvedOptions().timeZone
language?: string, // navigator.language
platform?: string // navigator.platform
}>({});
const debounceRef = useRef<NodeJS.Timeout | null>(null); const debounceRef = useRef<NodeJS.Timeout | null>(null);
// Collect the browser fingerprint once on mount.
//
// No browser permission prompt is involved, but note that the approximate
// location is obtained by calling the third-party service ipapi.co, which
// therefore sees the user's IP address.
useEffect(() => {
  // Everything except the location is available synchronously. Publish it
  // immediately so that requests issued while the IP lookup is still pending
  // do not carry an empty fingerprint.
  const baseFingerprint: typeof userFingerprint = {
    userAgent: navigator.userAgent,
    // This is the physical screen size, not the window's inner viewport.
    viewport: {
      width: window.screen.width,
      height: window.screen.height,
    },
    timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
    language: navigator.language,
    platform: navigator.platform,
  };
  setUserFingerprint(baseFingerprint);

  // A single controller serves both the 3 s timeout and unmount cancellation.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 3000);
  let cancelled = false;

  // Best-effort lookup of an approximate location from the client IP.
  const lookupIpLocation = async () => {
    let geolocation: { lat: number; lng: number } | undefined;
    try {
      const response = await fetch('https://ipapi.co/json/', {
        signal: controller.signal,
      });
      if (response.ok) {
        const data = await response.json();
        // Check the type instead of truthiness: 0 is a valid coordinate.
        if (typeof data?.latitude === 'number' && typeof data?.longitude === 'number') {
          geolocation = { lat: data.latitude, lng: data.longitude };
          console.log('IP location:', data.city, data.country_name);
        }
      }
    } catch {
      // Network error, timeout or malformed JSON: location stays unset.
      if (!cancelled) {
        console.log('IP geolocation not available');
      }
    } finally {
      clearTimeout(timeoutId);
    }

    // Never touch state after the component has unmounted.
    if (cancelled) return;

    const fingerprint: typeof userFingerprint = { ...baseFingerprint, geolocation };
    if (geolocation) {
      setUserFingerprint(fingerprint);
    }
    console.log('Browser fingerprint:', fingerprint);
  };

  void lookupIpLocation();

  return () => {
    cancelled = true;
    clearTimeout(timeoutId);
    controller.abort();
  };
}, []);
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map()); const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
const abortControllerRef = useRef<AbortController | null>(null); const abortControllerRef = useRef<AbortController | null>(null);
@@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
setBusinessCategory(null); setBusinessCategory(null);
setError(''); setError('');
// Create new abort controller with 30 second timeout // Create new abort controller with 60 second timeout (validation can be slow)
const controller = new AbortController(); const controller = new AbortController();
abortControllerRef.current = controller; abortControllerRef.current = controller;
const timeoutId = setTimeout(() => controller.abort(), 30000); const timeoutId = setTimeout(() => controller.abort(), 60000);
try { try {
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`; // Force English with hl=en parameter
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`;
const response = await fetch('/api/check-reviews', { const response = await fetch('/api/check-reviews', {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url }), body: JSON.stringify({
url,
geolocation: userFingerprint.geolocation,
browser_fingerprint: userFingerprint // Pass full fingerprint
}),
signal: controller.signal, signal: controller.signal,
}); });
@@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
} catch (err) { } catch (err) {
clearTimeout(timeoutId); clearTimeout(timeoutId);
// Ignore AbortError (happens when user starts a new validation) // Check if this is a timeout abort vs user-initiated abort
if (err instanceof Error && err.name === 'AbortError') { if (err instanceof Error && err.name === 'AbortError') {
console.log('Validation cancelled (new validation started)'); // Check if it was a timeout (controller still matches) or user started new search
return; if (abortControllerRef.current === controller) {
// Timeout - show error
console.error('Validation timed out');
setError('Validation timed out. Please try again.');
setHasReviews(false);
setAvailableReviewCount(0);
} else {
// User started a new search - just return silently
console.log('Validation cancelled (new validation started)');
return;
}
} else {
console.error('Error getting business info:', err);
// Error occurred
setHasReviews(false);
setAvailableReviewCount(0);
} }
console.error('Error getting business info:', err);
// Error occurred
setHasReviews(false);
setAvailableReviewCount(0);
} finally { } finally {
// Only clear loading state if this controller wasn't aborted clearTimeout(timeoutId);
if (!controller.signal.aborted) { // Always clear loading state (even on timeout)
setIsCheckingReviews(false); setIsCheckingReviews(false);
}
} }
}; };
@@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
return newMap; return newMap;
}); });
// Stop polling if job is done // Stop polling if job is done (completed, failed, or partial)
if (data.status === 'completed' || data.status === 'failed') { if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') {
const interval = pollingIntervals.current.get(jobId); const interval = pollingIntervals.current.get(jobId);
if (interval) { if (interval) {
clearInterval(interval); clearInterval(interval);
@@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
setIsSubmitting(true); setIsSubmitting(true);
setShowConfirmModal(false); setShowConfirmModal(false);
// Use the search query to create a Google Maps search URL // Use the search query to create a Google Maps search URL (force English)
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`; const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`;
try { try {
const response = await fetch('/api/scrape', { const response = await fetch('/api/scrape', {
@@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
business_address: businessAddress, business_address: businessAddress,
rating_snapshot: businessRating, rating_snapshot: businessRating,
total_reviews_snapshot: availableReviewCount, total_reviews_snapshot: availableReviewCount,
geolocation: userFingerprint.geolocation,
browser_fingerprint: userFingerprint, // Pass full fingerprint
}), }),
}); });
@@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
error_message: null, error_message: null,
business_name: businessName, business_name: businessName,
business_address: businessAddress, business_address: businessAddress,
business_category: businessCategory,
rating_snapshot: businessRating, rating_snapshot: businessRating,
total_reviews_snapshot: availableReviewCount, total_reviews_snapshot: availableReviewCount,
review_topics: null, // Will be populated when job completes
}); });
return newMap; return newMap;
}); });
@@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
case 'completed': return 'text-green-700'; case 'completed': return 'text-green-700';
case 'running': return 'text-blue-700'; case 'running': return 'text-blue-700';
case 'failed': return 'text-red-700'; case 'failed': return 'text-red-700';
case 'partial': return 'text-orange-700';
default: return 'text-gray-800'; default: return 'text-gray-800';
} }
}; };
@@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" /> <path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
</svg> </svg>
); );
case 'partial':
return (
<svg className="w-5 h-5 text-orange-500" fill="currentColor" viewBox="0 0 20 20">
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
</svg>
);
default: default:
return ( return (
<svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20"> <svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
@@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
)} )}
</div> </div>
{/* Action Buttons - Show when completed and has reviews */} {/* Action Buttons - Show when completed, partial, or running with reviews */}
{job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && ( {(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
<div className="flex gap-3"> <div className="flex gap-3">
<button <button
onClick={async () => { onClick={async () => {
@@ -818,7 +903,13 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
} }
}} }}
disabled={isLoadingReviews} disabled={isLoadingReviews}
className="flex-1 py-4 bg-gradient-to-r from-blue-600 to-indigo-700 text-white rounded-xl font-bold hover:from-blue-700 hover:to-indigo-800 transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 border-blue-500" className={`flex-1 py-4 text-white rounded-xl font-bold transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 ${
job.status === 'partial'
? 'bg-gradient-to-r from-orange-500 to-amber-600 hover:from-orange-600 hover:to-amber-700 border-orange-400'
: job.status === 'running'
? 'bg-gradient-to-r from-blue-500 to-cyan-600 hover:from-blue-600 hover:to-cyan-700 border-blue-400'
: 'bg-gradient-to-r from-blue-600 to-indigo-700 hover:from-blue-700 hover:to-indigo-800 border-blue-500'
}`}
> >
{isLoadingReviews ? ( {isLoadingReviews ? (
<> <>
@@ -830,7 +921,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24"> <svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" /> <path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
</svg> </svg>
📊 Open Analytics Dashboard 📊 {job.status === 'running' ? 'Preview Analytics' : job.status === 'partial' ? 'View Partial Data' : 'Open Analytics Dashboard'}
</> </>
)} )}
</button> </button>
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
const url = URL.createObjectURL(blob); const url = URL.createObjectURL(blob);
const a = document.createElement('a'); const a = document.createElement('a');
a.href = url; a.href = url;
a.download = `reviews-${job.job_id}.json`; a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`;
a.click(); a.click();
} }
} catch (err) { } catch (err) {
@@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
</div> </div>
)} )}
{/* Partial Job Warning */}
{job.status === 'partial' && (
<div className="mt-4 p-4 bg-orange-100 border-2 border-orange-300 rounded-lg">
<div className="flex items-start gap-2">
<svg className="w-5 h-5 text-orange-700 flex-shrink-0 mt-0.5" fill="currentColor" viewBox="0 0 20 20">
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
</svg>
<div>
<p className="font-bold text-orange-900">Partial Results</p>
<p className="text-sm text-orange-800 mt-1">
This job was interrupted but {job.reviews_count} reviews were saved.
{job.error_message && <span className="block mt-1 text-orange-700">Reason: {job.error_message}</span>}
</p>
</div>
</div>
</div>
)}
{/* Error Message */} {/* Error Message */}
{job.status === 'failed' && job.error_message && ( {job.status === 'failed' && job.error_message && (
<div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg"> <div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">

View File

@@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
// Populate minDate/maxDate/centerDate on reviews for display // Populate minDate/maxDate/centerDate on reviews for display
reviews.forEach(r => { reviews.forEach(r => {
if (!r.minDate || !r.maxDate || !r.centerDate) { if (!r.minDate || !r.maxDate || !r.centerDate) {
const range = parseDateTextToRange(r.date_text); // Handle both date_text and timestamp field names
const dateText = r.date_text || (r as any).timestamp || '';
const range = parseDateTextToRange(dateText);
r.minDate = range.minDate; r.minDate = range.minDate;
r.maxDate = range.maxDate; r.maxDate = range.maxDate;
// Calculate centerDate as midpoint // Calculate centerDate as midpoint
@@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
// Recent reviews (last 30 days - simplified check) // Recent reviews (last 30 days - simplified check)
const recentReviews = reviews.filter(r => { const recentReviews = reviews.filter(r => {
const text = r.date_text.toLowerCase(); const text = (r.date_text || (r as any).timestamp || '').toLowerCase();
return text.includes('day') || text.includes('week') || text.includes('hour'); return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second');
}).length; }).length;
// Rating distribution // Rating distribution
@@ -278,6 +280,14 @@ function extractNumber(text: string): number {
*/ */
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } { export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
const now = new Date(); const now = new Date();
// Handle undefined/null dateText
if (!dateText) {
// Return a default range (assume recent - within last month)
const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000);
return { minDate: daysAgo(30), maxDate: now };
}
const text = dateText.toLowerCase(); const text = dateText.toLowerCase();
// Remove "Edited " prefix if present // Remove "Edited " prefix if present
@@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R
// Filter range: [filterStart, filterEnd] // Filter range: [filterStart, filterEnd]
// Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart // Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
return reviews.filter(r => { return reviews.filter(r => {
const { minDate, maxDate } = parseDateTextToRange(r.date_text); const dateText = r.date_text || (r as any).timestamp || '';
const { minDate, maxDate } = parseDateTextToRange(dateText);
return minDate <= filterEnd && maxDate >= filterStart; return minDate <= filterEnd && maxDate >= filterStart;
}); });
} }
@@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
if (!fromDate && !toDate) return reviews; if (!fromDate && !toDate) return reviews;
return reviews.filter(r => { return reviews.filter(r => {
const reviewDate = parseDateText(r.date_text); const dateText = r.date_text || (r as any).timestamp || '';
const reviewDate = parseDateText(dateText);
// If only fromDate is set, filter reviews >= fromDate // If only fromDate is set, filter reviews >= fromDate
if (fromDate && !toDate) { if (fromDate && !toDate) {
@@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] { export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
// Sort reviews by date (newest first) // Sort reviews by date (newest first)
const sortedReviews = [...reviews] const sortedReviews = [...reviews]
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text) })) .map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') }))
.sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime()); .sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
// Group by month // Group by month