diff --git a/Dockerfile b/Dockerfile
index c3bbf89..ee8b8fc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,6 +39,13 @@ RUN apt-get update \
&& apt-get install -y chromium chromium-driver \
&& rm -rf /var/lib/apt/lists/*
+# Install VNC server and noVNC (browser-based VNC viewer)
+RUN apt-get update && apt-get install -y \
+ x11vnc \
+ novnc \
+ websockify \
+ && rm -rf /var/lib/apt/lists/*
+
# Set working directory
WORKDIR /app
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
COPY api_server_production.py .
COPY config.yaml .
-# Create startup script for Xvfb + API server
+# Create startup script for Xvfb + VNC + API server
RUN echo '#!/bin/bash\n\
# Start Xvfb (virtual display) in background\n\
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
# Wait for Xvfb to start\n\
sleep 2\n\
\n\
+# Start VNC server (no password for local dev, binds to all interfaces)\n\
+x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
+\n\
+# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
+websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
+\n\
+echo "VNC server running on port 5900"\n\
+echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
+\n\
# Start API server\n\
exec python api_server_production.py\n\
' > /app/start.sh && chmod +x /app/start.sh
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
USER scraper
-# Expose port
-EXPOSE 8000
+# Expose ports: API (8000), VNC (5900), noVNC web (6080)
+EXPOSE 8000 5900 6080
# Environment variables for Chromium in container
ENV DISPLAY=:99
diff --git a/api_server_production.py b/api_server_production.py
index 133ae8b..62b96e8 100644
--- a/api_server_production.py
+++ b/api_server_production.py
@@ -133,12 +133,36 @@ app.add_middleware(
# ==================== Request/Response Models ====================
+class GeolocationModel(BaseModel):
+ """Geolocation coordinates"""
+ lat: float = Field(..., description="Latitude")
+ lng: float = Field(..., description="Longitude")
+
+
+class ViewportModel(BaseModel):
+ """Browser viewport size"""
+ width: int = Field(..., description="Viewport width")
+ height: int = Field(..., description="Viewport height")
+
+
+class BrowserFingerprintModel(BaseModel):
+ """Browser fingerprint to replicate user's browser"""
+ geolocation: Optional[GeolocationModel] = None
+ userAgent: Optional[str] = Field(None, description="User agent string")
+ viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
+ timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
+ language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
+ platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
+
+
class ScrapeRequest(BaseModel):
"""Request model for starting a scrape job"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
+ geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
+ browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
class JobResponse(BaseModel):
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
created_at: str
started_at: Optional[str] = None
completed_at: Optional[str] = None
+ updated_at: Optional[str] = None # Last update time for progress tracking
reviews_count: Optional[int] = None
total_reviews: Optional[int] = None # Total reviews available for this place
scrape_time: Optional[float] = None
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
# Business metadata
business_name: Optional[str] = None
business_address: Optional[str] = None
+ business_category: Optional[str] = None # Category (e.g., "Barber shop")
+ review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts
class ReviewsResponse(BaseModel):
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
raise HTTPException(status_code=500, detail="Database not initialized")
try:
+ # Merge browser fingerprint into metadata if provided
+ metadata = request.metadata or {}
+ if request.browser_fingerprint:
+ fp = request.browser_fingerprint
+ metadata['browser_fingerprint'] = {
+ "userAgent": fp.userAgent,
+ "timezone": fp.timezone,
+ "language": fp.language,
+ "platform": fp.platform,
+ }
+ if fp.viewport:
+ metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
+ if fp.geolocation:
+ metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
+ elif request.geolocation:
+ metadata['geolocation'] = {
+ 'lat': request.geolocation.lat,
+ 'lng': request.geolocation.lng
+ }
+
# Create job in database
job_id = await db.create_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
- metadata=request.metadata
+ metadata=metadata
)
# Start scraping job in background
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
if not job:
raise HTTPException(status_code=404, detail="Job not found")
+ # Parse review_topics if it's a string (JSONB might be returned as string)
+ review_topics = job.get('review_topics')
+ if isinstance(review_topics, str):
+ try:
+ review_topics = json.loads(review_topics)
+ except:
+ review_topics = None
+
+ # Extract business info from metadata if available
+ metadata = job.get('metadata')
+ if isinstance(metadata, str):
+ try:
+ metadata = json.loads(metadata)
+ except:
+ metadata = None
+
+ business_name = metadata.get('business_name') if metadata else None
+ business_category = metadata.get('business_category') if metadata else None
+
return JobResponse(
job_id=str(job['job_id']),
status=job['status'],
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
created_at=job['created_at'].isoformat(),
started_at=job['started_at'].isoformat() if job['started_at'] else None,
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
+ updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
reviews_count=job['reviews_count'],
total_reviews=job.get('total_reviews'),
scrape_time=job['scrape_time'],
error_message=job['error_message'],
- webhook_url=job.get('webhook_url')
+ webhook_url=job.get('webhook_url'),
+ business_name=business_name,
+ business_category=business_category,
+ review_topics=review_topics
)
@@ -541,25 +611,32 @@ async def stream_all_jobs():
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID):
"""
- Get the actual reviews data for a completed job.
+ Get reviews data for a job.
- Returns 404 if job not found or not completed yet.
+ Returns reviews for completed, partial, or running jobs (if reviews have been collected).
+ Returns 404 if job not found or no reviews available yet.
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
- reviews = await db.get_job_reviews(job_id)
+ # Get reviews (includes completed, running, and partial jobs)
+ reviews = await db.get_job_reviews(job_id, include_partial=True)
if reviews is None:
job = await db.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
- elif job['status'] != 'completed':
+ elif job['status'] == 'pending':
raise HTTPException(
status_code=400,
- detail=f"Job not completed yet (current status: {job['status']})"
+ detail="Job has not started yet"
+ )
+ elif job['status'] == 'failed':
+ raise HTTPException(
+ status_code=400,
+ detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
)
else:
- raise HTTPException(status_code=404, detail="Reviews data not available")
+ raise HTTPException(status_code=404, detail="No reviews data available yet")
return ReviewsResponse(
job_id=str(job_id),
@@ -603,6 +680,15 @@ async def list_jobs(
business_name = metadata.get('business_name') if metadata else None
business_address = metadata.get('business_address') if metadata else None
+ business_category = metadata.get('business_category') if metadata else None
+
+ # Parse review_topics if it's a string
+ review_topics = job.get('review_topics')
+ if isinstance(review_topics, str):
+ try:
+ review_topics = json.loads(review_topics)
+ except:
+ review_topics = None
result.append(JobResponse(
job_id=str(job['job_id']),
@@ -615,7 +701,9 @@ async def list_jobs(
scrape_time=job.get('scrape_time'),
error_message=job.get('error_message'),
business_name=business_name,
- business_address=business_address
+ business_address=business_address,
+ business_category=business_category,
+ review_topics=review_topics
))
return result
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
Get business card information from Google Maps.
Returns business name, address, rating, and review count.
- Uses pre-warmed Chrome worker from pool for instant response.
+ Creates a fresh Chrome instance for reliable results (same as full scraper).
This is used to show the business confirmation card in the UI.
"""
- worker = None
- recycle_worker = False
-
try:
url = str(request.url)
- # Get pre-warmed worker from validation pool
- worker = await asyncio.to_thread(get_validation_worker, timeout=10)
+ # Use the SAME scraper algorithm with validation_only=True for early return
+ # Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
+ # Pooled browsers can have cookies/state that cause Google to render pages differently
- if worker:
- log.info(f"Using worker {worker.worker_id} for business card extraction")
- # Use the pooled worker (don't close it)
- result = await asyncio.to_thread(
- get_business_card_info,
- url=url,
- driver=worker.driver,
- return_driver=True
- )
-
- # Check if the result indicates a session error
- if not result['success'] and result.get('error'):
- error_msg = result.get('error', '').lower()
- if 'invalid session' in error_msg or 'session' in error_msg:
- log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
- recycle_worker = True
+ # Build fingerprint dict from request
+ fingerprint = None
+ if request.browser_fingerprint:
+ fp = request.browser_fingerprint
+ fingerprint = {
+ "userAgent": fp.userAgent,
+ "timezone": fp.timezone,
+ "language": fp.language,
+ "platform": fp.platform,
+ }
+ if fp.viewport:
+ fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
+ if fp.geolocation:
+ fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
+ log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
+ elif request.geolocation:
+ fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
+ log.info(f"Creating Chrome with geolocation only")
else:
- # Fallback: create temporary worker
- log.warning("No pooled worker available, creating temporary instance")
- result = await asyncio.to_thread(
- get_business_card_info,
- url=url
- )
+ log.info(f"Creating Chrome with default settings")
- # SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
- # Let the actual scraper determine if reviews exist
- has_business = bool(result.get('name') and result.get('rating'))
+ result = await asyncio.to_thread(
+ fast_scrape_reviews,
+ url=url,
+ headless=False, # Use Xvfb display
+ validation_only=True, # Return early after getting total_reviews
+ browser_fingerprint=fingerprint # Pass user's browser fingerprint
+ )
+
+ # Extract validation info from the result
+ validation_info = result.get('validation_info', {})
+ total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
+ name = validation_info.get('name')
+ rating = validation_info.get('rating')
+ category = validation_info.get('category')
+ address = validation_info.get('address')
+
+ # Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
+ has_reviews = bool(name and total_reviews > 0)
return {
- "has_reviews": has_business, # Boolean: true if business exists
- "total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
- "name": result.get('name'),
- "address": result.get('address'),
- "rating": result.get('rating'),
- "success": result['success'],
+ "has_reviews": has_reviews, # True if business has reviews
+ "total_reviews": total_reviews,
+ "name": name,
+ "address": address,
+ "rating": rating,
+ "category": category,
+ "success": result.get('success', True),
"error": result.get('error')
}
except Exception as e:
log.error(f"Error checking reviews: {e}")
- # If it's a session error, recycle the worker
- if worker:
- error_msg = str(e).lower()
- if 'invalid session' in error_msg or 'session' in error_msg:
- recycle_worker = True
return {
"has_reviews": False,
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
"success": False,
"error": str(e)
}
- finally:
- # Release worker back to pool (or recycle if broken)
- if worker:
- await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
job = await db.get_job(job_id)
url = job['url']
+ # Extract browser fingerprint from metadata if available
+ browser_fingerprint = None
+ metadata = job.get('metadata')
+ if isinstance(metadata, str):
+ try:
+ metadata = json.loads(metadata)
+ except:
+ metadata = None
+ if metadata and 'browser_fingerprint' in metadata:
+ browser_fingerprint = metadata['browser_fingerprint']
+ log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
+ elif metadata and 'geolocation' in metadata:
+ browser_fingerprint = {'geolocation': metadata['geolocation']}
+ log.info(f"Using user geolocation only")
+
# Broadcast job started via SSE
await broadcast_job_update(job_id_str, "job_started", {
"job_id": job_id_str,
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
# Create log capture instance that we can access for real-time logs
log_capture = LogCapture()
+ # Track total reviews for incremental saves
+ total_reviews_seen = [None]
+ # Accumulate all reviews for incremental saves (flush_callback receives batches)
+ all_reviews_collected = []
+
# Progress callback to update job status with current/total counts AND logs
def progress_callback(current_count: int, total_count: int):
"""Update job progress and logs from worker thread"""
+ if total_count:
+ total_reviews_seen[0] = total_count
+
async def update():
# Get current logs from the shared log_capture
current_logs = log_capture.get_logs()
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
# Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(update(), loop)
+ # Flush callback to save reviews incrementally (crash recovery)
+ # Note: flush_callback receives batches, so we accumulate them
+ def flush_callback(reviews_batch: list):
+ """Accumulate and save reviews to DB incrementally from worker thread"""
+ # Extend our collection with the new batch
+ all_reviews_collected.extend(reviews_batch)
+
+ async def save():
+ await db.save_reviews_incremental(
+ job_id=job_id,
+ reviews=all_reviews_collected, # Save ALL reviews so far
+ total_reviews=total_reviews_seen[0]
+ )
+ # Schedule the coroutine on the event loop
+ asyncio.run_coroutine_threadsafe(save(), loop)
+
# Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread(
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
url=url,
headless=False,
progress_callback=progress_callback,
- log_capture=log_capture
+ log_capture=log_capture,
+ flush_callback=flush_callback,
+ browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
)
if result['success']:
- # Save results to database (including scraper logs)
+ # Save results to database (including scraper logs and review topics)
await db.save_job_result(
job_id=job_id,
reviews=result['reviews'],
scrape_time=result['time'],
total_reviews=result.get('total_reviews'),
- scrape_logs=result.get('logs')
+ scrape_logs=result.get('logs'),
+ review_topics=result.get('review_topics')
)
log.info(
@@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID):
)
else:
- # Job failed - save logs for debugging
- await db.update_job_status(
- job_id,
- JobStatus.FAILED,
- error_message=result.get('error', 'Unknown error'),
- scrape_logs=result.get('logs')
- )
+ # Job failed - check if we have partial reviews saved
+ current_job = await db.get_job(job_id)
+ partial_count = current_job.get('reviews_count', 0) if current_job else 0
- log.error(f"Failed job {job_id}: {result.get('error')}")
-
- # Broadcast job failed via SSE
- await broadcast_job_update(job_id_str, "job_failed", {
- "job_id": job_id_str,
- "status": "failed",
- "error_message": result.get('error'),
- "logs": result.get('logs', [])
- })
-
- # Send failure webhook if configured
- if job.get('webhook_url'):
- webhook_manager = WebhookManager()
- await webhook_manager.send_job_completed_webhook(
- webhook_url=job['webhook_url'],
- job_id=job_id,
- status='failed',
- error_message=result.get('error'),
- secret=job.get('webhook_secret'),
- db=db
+ if partial_count > 0:
+ # Mark as partial - we have some reviews saved
+ await db.mark_job_partial(
+ job_id,
+ error_message=result.get('error', 'Unknown error'),
+ scrape_logs=result.get('logs')
)
+ log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
+
+ # Broadcast job partial via SSE
+ await broadcast_job_update(job_id_str, "job_partial", {
+ "job_id": job_id_str,
+ "status": "partial",
+ "reviews_count": partial_count,
+ "total_reviews": current_job.get('total_reviews'),
+ "error_message": result.get('error'),
+ "logs": result.get('logs', [])
+ })
+
+ # Send partial webhook if configured
+ if job.get('webhook_url'):
+ webhook_manager = WebhookManager()
+ await webhook_manager.send_job_completed_webhook(
+ webhook_url=job['webhook_url'],
+ job_id=job_id,
+ status='partial',
+ reviews_count=partial_count,
+ error_message=result.get('error'),
+ secret=job.get('webhook_secret'),
+ db=db
+ )
+ else:
+ # No reviews saved - mark as failed
+ await db.update_job_status(
+ job_id,
+ JobStatus.FAILED,
+ error_message=result.get('error', 'Unknown error'),
+ scrape_logs=result.get('logs')
+ )
+
+ log.error(f"Failed job {job_id}: {result.get('error')}")
+
+ # Broadcast job failed via SSE
+ await broadcast_job_update(job_id_str, "job_failed", {
+ "job_id": job_id_str,
+ "status": "failed",
+ "error_message": result.get('error'),
+ "logs": result.get('logs', [])
+ })
+
+ # Send failure webhook if configured
+ if job.get('webhook_url'):
+ webhook_manager = WebhookManager()
+ await webhook_manager.send_job_completed_webhook(
+ webhook_url=job['webhook_url'],
+ job_id=job_id,
+ status='failed',
+ error_message=result.get('error'),
+ secret=job.get('webhook_secret'),
+ db=db
+ )
+
except Exception as e:
log.error(f"Error in scraping job {job_id}: {e}")
import traceback
traceback.print_exc()
- await db.update_job_status(
- job_id,
- JobStatus.FAILED,
- error_message=str(e)
- )
+ # Check if we have partial reviews saved
+ current_job = await db.get_job(job_id)
+ partial_count = current_job.get('reviews_count', 0) if current_job else 0
- # Broadcast job failed via SSE
- await broadcast_job_update(job_id_str, "job_failed", {
- "job_id": job_id_str,
- "status": "failed",
- "error_message": str(e),
- "logs": []
- })
-
- # Send failure webhook
- job = await db.get_job(job_id)
- if job and job.get('webhook_url'):
- webhook_manager = WebhookManager()
- await webhook_manager.send_job_completed_webhook(
- webhook_url=job['webhook_url'],
- job_id=job_id,
- status='failed',
+ if partial_count > 0:
+ # Mark as partial - we have some reviews saved
+ await db.mark_job_partial(
+ job_id,
error_message=str(e),
- secret=job.get('webhook_secret'),
- db=db
+ scrape_logs=log_capture.get_logs() if log_capture else None
)
+ log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
+
+ # Broadcast job partial via SSE
+ await broadcast_job_update(job_id_str, "job_partial", {
+ "job_id": job_id_str,
+ "status": "partial",
+ "reviews_count": partial_count,
+ "total_reviews": current_job.get('total_reviews'),
+ "error_message": str(e),
+ "logs": log_capture.get_logs() if log_capture else []
+ })
+
+ # Send partial webhook
+ if current_job and current_job.get('webhook_url'):
+ webhook_manager = WebhookManager()
+ await webhook_manager.send_job_completed_webhook(
+ webhook_url=current_job['webhook_url'],
+ job_id=job_id,
+ status='partial',
+ reviews_count=partial_count,
+ error_message=str(e),
+ secret=current_job.get('webhook_secret'),
+ db=db
+ )
+ else:
+ # No reviews saved - mark as failed
+ await db.update_job_status(
+ job_id,
+ JobStatus.FAILED,
+ error_message=str(e)
+ )
+
+ # Broadcast job failed via SSE
+ await broadcast_job_update(job_id_str, "job_failed", {
+ "job_id": job_id_str,
+ "status": "failed",
+ "error_message": str(e),
+ "logs": []
+ })
+
+ # Send failure webhook
+ if current_job and current_job.get('webhook_url'):
+ webhook_manager = WebhookManager()
+ await webhook_manager.send_job_completed_webhook(
+ webhook_url=current_job['webhook_url'],
+ job_id=job_id,
+ status='failed',
+ error_message=str(e),
+ secret=current_job.get('webhook_secret'),
+ db=db
+ )
+
if __name__ == "__main__":
import uvicorn
diff --git a/docker-compose.production.yml b/docker-compose.production.yml
index 856e585..5e35140 100644
--- a/docker-compose.production.yml
+++ b/docker-compose.production.yml
@@ -39,6 +39,8 @@ services:
- CHROME_BIN=/usr/bin/chromium
ports:
- "8000:8000"
+ - "5900:5900" # VNC port (for VNC client)
+ - "6080:6080" # noVNC web interface (browser access)
depends_on:
db:
condition: service_healthy
diff --git a/modules/database.py b/modules/database.py
index 8f112a1..7a660f8 100644
--- a/modules/database.py
+++ b/modules/database.py
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
+ PARTIAL = "partial" # Job crashed but has partial reviews saved
class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP,
completed_at TIMESTAMP,
+ updated_at TIMESTAMP,
reviews_count INTEGER,
total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
metadata JSONB,
scrape_logs JSONB,
- CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
+ CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
);
""")
@@ -88,6 +90,24 @@ class DatabaseManager:
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
+ # Add updated_at column if it doesn't exist (for incremental progress tracking)
+ await conn.execute("""
+ ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
+ """)
+
+ # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
+ await conn.execute("""
+ ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
+ """)
+
+ # Update constraint to include 'partial' status (for existing databases)
+ await conn.execute("""
+ ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
+ """)
+ await conn.execute("""
+ ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
+ """)
+
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
created_at,
started_at,
completed_at,
+ updated_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata,
- scrape_logs
+ scrape_logs,
+ review_topics
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:
return dict(row)
- async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
+ async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
+ include_partial: If True, also return reviews for running and partial jobs
Returns:
- List of reviews or None if not found/not completed
+ List of reviews or None if not found/no reviews
"""
async with self.pool.acquire() as conn:
- reviews_data = await conn.fetchval("""
- SELECT reviews_data
- FROM jobs
- WHERE job_id = $1 AND status = 'completed'
- """, job_id)
+ if include_partial:
+ # Return reviews for completed, running, or partial jobs
+ reviews_data = await conn.fetchval("""
+ SELECT reviews_data
+ FROM jobs
+ WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
+ """, job_id)
+ else:
+ # Only return reviews for completed jobs
+ reviews_data = await conn.fetchval("""
+ SELECT reviews_data
+ FROM jobs
+ WHERE job_id = $1 AND status = 'completed'
+ """, job_id)
if not reviews_data:
return None
@@ -278,7 +310,8 @@ class DatabaseManager:
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None,
- scrape_logs: Optional[List[Dict[str, Any]]] = None
+ scrape_logs: Optional[List[Dict[str, Any]]] = None,
+ review_topics: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
+ review_topics: List of topic filter dictionaries with topic and count
"""
async with self.pool.acquire() as conn:
+ # If reviews list is empty, check if job already has reviews from incremental saves
+ # This happens when flush_callback was used during scraping
+ if not reviews:
+ existing = await conn.fetchval(
+ "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
+ )
+ if existing and existing > 0:
+ # Job has reviews from incremental saves, don't overwrite reviews_data
+ await conn.execute("""
+ UPDATE jobs
+ SET
+ status = 'completed',
+ completed_at = NOW(),
+ total_reviews = COALESCE($2, total_reviews),
+ scrape_time = $3,
+ scrape_logs = $4::jsonb,
+ review_topics = $5::jsonb
+ WHERE job_id = $1
+ """, job_id, total_reviews, scrape_time,
+ json.dumps(scrape_logs) if scrape_logs else None,
+ json.dumps(review_topics) if review_topics else None)
+ log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
+ return
+
await conn.execute("""
UPDATE jobs
SET
@@ -300,13 +358,70 @@ class DatabaseManager:
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5,
- scrape_logs = $6::jsonb
+ scrape_logs = $6::jsonb,
+ review_topics = $7::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
- json.dumps(scrape_logs) if scrape_logs else None)
+ json.dumps(scrape_logs) if scrape_logs else None,
+ json.dumps(review_topics) if review_topics else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
+ async def save_reviews_incremental(
+ self,
+ job_id: UUID,
+ reviews: List[Dict[str, Any]],
+ total_reviews: Optional[int] = None
+ ):
+ """
+ Save reviews incrementally during scraping.
+ Called on each flush to preserve progress in case of crash.
+
+ Args:
+ job_id: Job UUID
+ reviews: ALL reviews collected so far (not just new ones)
+ total_reviews: Total reviews available (from page counter)
+ """
+ async with self.pool.acquire() as conn:
+ await conn.execute("""
+ UPDATE jobs
+ SET
+ reviews_count = $2,
+ total_reviews = COALESCE($3, total_reviews),
+ reviews_data = $4::jsonb,
+ updated_at = NOW()
+ WHERE job_id = $1 AND status = 'running'
+ """, job_id, len(reviews), total_reviews, json.dumps(reviews))
+
+ log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
+
+ async def mark_job_partial(
+ self,
+ job_id: UUID,
+ error_message: str,
+ scrape_logs: Optional[List[Dict[str, Any]]] = None
+ ):
+ """
+ Mark a job as partial (crashed but has some reviews saved).
+
+ Args:
+ job_id: Job UUID
+ error_message: Error that caused the crash
+ scrape_logs: Log entries from the scraper
+ """
+ async with self.pool.acquire() as conn:
+ await conn.execute("""
+ UPDATE jobs
+ SET
+ status = 'partial',
+ completed_at = NOW(),
+ error_message = $2,
+ scrape_logs = $3::jsonb
+ WHERE job_id = $1
+ """, job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
+
+ log.info(f"Marked job {job_id} as partial due to: {error_message}")
+
async def list_jobs(
self,
status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
- metadata
+ metadata,
+ review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
- metadata
+ metadata,
+ review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2
diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index a60d114..e364359 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
- progress_callback=None) -> dict:
+ progress_callback=None, validation_only: bool = False) -> dict:
"""
Scrape Google Maps reviews.
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
+ # Store business info extracted from overview (before clicking reviews tab)
+ business_info_cache = [None]
+
# Hard refresh counter
hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass
return None
- def setup_reviews_page(is_refresh=False):
+ def setup_reviews_page(is_refresh=False, validation_only_mode=False):
"""
Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh.
+
+ If validation_only_mode=True, returns early after extracting business info
+ without clicking reviews tab or finding scroll container.
"""
nonlocal total_reviews
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
+ # Reset browser state by navigating to blank page first
+ # This clears any stale state from pooled browser sessions
+ try:
+ driver.get("about:blank")
+ time.sleep(0.1)
+ except:
+ pass
log.info(f"🌐 Loading: {url[:80]}...")
else:
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Reload original URL after consent
log.info(" Reloading after consent...")
driver.get(url)
+ # Wait for page to settle after consent reload
+ time.sleep(1)
break
except:
pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
time.sleep(0.01) # 10ms - responsive but low CPU
- # Extract total review count BEFORE clicking reviews tab (it's on Overview)
+ # Extract business info and total review count BEFORE clicking reviews tab (on Overview)
+ # This captures name, rating, category, address while they're visible
# Only on first load (don't overwrite if we already have it)
- if total_reviews[0] is None:
+ if total_reviews[0] is None or business_info_cache[0] is None:
start = time.time()
while time.time() - start < 5:
try:
- count = driver.execute_script("""
- var reviewSpans = document.querySelectorAll('span[role="img"]');
- for (var i = 0; i < reviewSpans.length; i++) {
- var label = reviewSpans[i].getAttribute('aria-label') || '';
- var match = label.match(/^([\\d,\\.]+)\\s*review/i);
- if (match) {
- return parseInt(match[1].replace(/[,\\.]/g, ''));
+ info = driver.execute_script("""
+ var result = {
+ total_reviews: null,
+ name: null,
+ rating: null,
+ category: null,
+ address: null
+ };
+
+ // Business name from h1
+ var h1 = document.querySelector('h1');
+ if (h1) result.name = h1.textContent.trim();
+
+ // Category - use jsaction attribute (robust selector)
+ var catBtn = document.querySelector('button[jsaction*="category"]');
+ if (catBtn) result.category = catBtn.textContent.trim();
+
+ // Rating and review count from span[role="img"] aria-labels
+ var spans = document.querySelectorAll('span[role="img"]');
+ for (var i = 0; i < spans.length; i++) {
+ var label = spans[i].getAttribute('aria-label') || '';
+
+ // Rating: "4.8 stars"
+ var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
+ if (rMatch && !result.rating) {
+ result.rating = parseFloat(rMatch[1].replace(',', '.'));
+ }
+
+ // Reviews: "79 reviews"
+ var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
+ if (revMatch && !result.total_reviews) {
+ result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
}
}
- return null;
+
+ // Address from button
+ var addrBtn = document.querySelector('button[data-item-id="address"]');
+ if (addrBtn) {
+ var label = addrBtn.getAttribute('aria-label');
+ if (label) result.address = label.replace(/^Address:\\s*/i, '');
+ }
+
+ return result;
""")
- if count:
- total_reviews[0] = count
- log.info(f"📊 Total reviews on page: {count}")
- break
+
+ if info:
+ if info.get('total_reviews') and total_reviews[0] is None:
+ total_reviews[0] = info['total_reviews']
+ log.info(f"📊 Total reviews on page: {total_reviews[0]}")
+ if info.get('name') and business_info_cache[0] is None:
+ business_info_cache[0] = info
+ log.info(f"📍 Business: {info.get('name')}")
+ if total_reviews[0] and business_info_cache[0]:
+ break
except:
pass
time.sleep(0.1)
+ # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
+ if validation_only_mode:
+ log.info("📋 Validation mode: returning early (skipping reviews tab)")
+ return ("validation_done", None)
+
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
+ tabs_logged = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
+ # Log available tabs once for debugging
+ if not tabs_logged and tabs:
+ tabs_logged = True
+ tab_texts = [t.text for t in tabs]
+ log.info(f" Available tabs: {tab_texts}")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
log.info(f" Clicking reviews tab: '{tab.text}'")
+ # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
+ if total_reviews[0] is None:
+ import re
+ # Try pattern with parentheses: "Reviews (79)"
+ match = re.search(r'\((\d+)\)', tab.text)
+ if match:
+ total_reviews[0] = int(match.group(1))
+ log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
+ else:
+ # Try pattern with newline: "Reviews\n79"
+ match = re.search(r'(\d+)', tab.text)
+ if match:
+ total_reviews[0] = int(match.group(1))
+ log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
tab.click()
tab_clicked = True
break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return scroll_container, stop_scrolling
- # Initial page setup
- scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
+ # Helper to extract review topics from the reviews tab
+ def extract_review_topics():
+ """Extract review topic filters from radiogroup (robust selectors)."""
+ try:
+ topics = driver.execute_script("""
+ var topics = [];
+
+ // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
+ var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
+
+ if (!container) {
+ // Fallback: any radiogroup in the reviews area
+ container = document.querySelector('div[role="radiogroup"]');
+ }
+
+ if (container) {
+ var buttons = container.querySelectorAll('button[role="radio"]');
+ for (var btn of buttons) {
+ var label = btn.getAttribute('aria-label') || '';
+ // Parse "hair salon, mentioned in 4 reviews" format
+ var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
+ if (match) {
+ topics.push({
+ topic: match[1].trim(),
+ count: parseInt(match[2])
+ });
+ } else if (label && !label.toLowerCase().includes('all review')) {
+ // Fallback: try to extract from child spans
+ var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
+ var nameSpan = btn.querySelector('.uEubGf, span:first-child');
+ if (nameSpan) {
+ var name = nameSpan.textContent.trim();
+ var count = countSpan ? parseInt(countSpan.textContent) : 0;
+ if (name && name.toLowerCase() !== 'all') {
+ topics.push({topic: name, count: count || 0});
+ }
+ }
+ }
+ }
+ }
+
+ return topics;
+ """)
+ return topics or []
+ except:
+ return []
+
+ # Initial page setup (pass validation_only to skip unnecessary steps)
+ scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
+
+ # VALIDATION_ONLY MODE: Return early with just total_reviews and business info
+ # setup_reviews_page returns ("validation_done", None) in this case
+ if validation_only or scroll_container == "validation_done":
+ # Use the business info captured from Overview (before clicking reviews tab)
+ business_info = business_info_cache[0] or {}
+
+ return {
+ "reviews": [],
+ "total": total_reviews[0] or 0,
+ "scrolls": 0,
+ "error": None,
+ "validation_info": {
+ "name": business_info.get("name"),
+ "rating": business_info.get("rating"),
+ "category": business_info.get("category"),
+ "address": business_info.get("address"),
+ "total_reviews": total_reviews[0]
+ }
+ }
+
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
+ # Extract review topics after reviews tab is loaded (before scrolling begins)
+ time.sleep(0.5) # Brief wait for topic filters to render
+ review_topics = extract_review_topics()
+ if review_topics:
+ log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
+
def get_api_reviews():
"""Get reviews from intercepted API responses."""
api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url,
- "logs": log.get_logs()
+ "logs": log.get_logs(),
+ "review_topics": review_topics # Topic filters with mention counts
}
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False,
- log_capture: LogCapture = None):
+ log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
+ browser_fingerprint: dict = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver: Existing driver instance to reuse
return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access
+ browser_fingerprint: Optional dict with user's browser fingerprint:
+ - geolocation: {lat, lng}
+ - userAgent: string
+ - viewport: {width, height}
+ - timezone: string (e.g., "Europe/Madrid")
+ - language: string (e.g., "en-US")
+ - platform: string (e.g., "MacIntel")
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
log_capture = log_capture or LogCapture()
try:
+ # Extract fingerprint settings
+ fp = browser_fingerprint or {}
+ user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+ viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
+ geolocation = fp.get('geolocation')
+ timezone = fp.get('timezone')
+ language = fp.get('language', 'en-US')
+
# Create driver if not provided
if not driver:
driver = Driver(
uc=True,
headless=headless,
page_load_strategy="normal",
- agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+ agent=user_agent # Use user's actual user agent
)
- driver.set_window_size(1200, 900) # Proper viewport for Google Maps
+ # Set viewport to match user's screen
+ driver.set_window_size(viewport['width'], viewport['height'])
- # Set Chrome geolocation to US (Boston, MA) using CDP
- # This ensures Google Maps shows US results regardless of server location
+ # Apply browser fingerprint settings via CDP
try:
- driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
- 'latitude': 42.3601,
- 'longitude': -71.0589,
- 'accuracy': 100
- })
- log_capture.info("Set geolocation to US (Boston, MA)")
+ # Set timezone if provided
+ if timezone:
+ driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
+ log_capture.info(f"Set timezone to {timezone}")
+
+ # Set locale/language
+ driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
+
+ # Set geolocation
+ if geolocation and 'lat' in geolocation and 'lng' in geolocation:
+ driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+ 'latitude': geolocation['lat'],
+ 'longitude': geolocation['lng'],
+ 'accuracy': 1000 # ~1km accuracy for IP-based location
+ })
+ log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
+ else:
+ # Default to US (Boston, MA) if no geolocation provided
+ driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
+ 'latitude': 42.3601,
+ 'longitude': -71.0589,
+ 'accuracy': 100
+ })
+ log_capture.info("Set geolocation to US (Boston, MA) [default]")
+
+ if fp:
+ log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
except Exception as e:
- log_capture.warning(f"Could not set geolocation: {e}")
+ log_capture.warning(f"Could not apply fingerprint settings: {e}")
# Add URL parameters for consistent results
if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
if 'gl=' not in url:
url = f"{url}&gl=us"
- # Create progress wrapper if callback provided
- flush_callback = None
- if progress_callback:
+ # Create combined flush callback for progress + external handler
+ external_flush = flush_callback # Save external callback
+ internal_flush = None
+ if progress_callback or external_flush:
collected = [0]
- def flush_with_progress(reviews_batch):
- collected[0] += len(reviews_batch)
- progress_callback(collected[0], None)
- flush_callback = flush_with_progress
+ def combined_flush(reviews_batch):
+ collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
+ if progress_callback:
+ progress_callback(collected[0], None)
+ if external_flush:
+ external_flush(reviews_batch) # Pass reviews to external handler
+ internal_flush = combined_flush
# Run the scraper with progress callback for real-time updates
result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
url=url,
max_reviews=999999, # Effectively unlimited
timeout_no_new=15,
- flush_callback=flush_callback,
+ flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
- progress_callback=progress_callback # Pass through for real-time log updates
+ progress_callback=progress_callback, # Pass through for real-time log updates
+ validation_only=validation_only # Return early if just validating
)
elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed,
"success": True,
"error": None,
- "logs": result.get("logs", [])
+ "logs": result.get("logs", []),
+ "review_topics": result.get("review_topics", []) # Topic filters with mention counts
}
+ # Include validation_info if in validation_only mode
+ if validation_only and "validation_info" in result:
+ response["validation_info"] = result["validation_info"]
+
if return_driver:
response["driver"] = driver
elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
}
+def extract_about_info(driver, url: str = None) -> dict:
+ """
+ Extract About section info from Google Maps (Accessibility, Amenities, etc.).
+
+ This function should be called AFTER reviews are scraped if about info is needed,
+ as it navigates to a different tab.
+
+ Args:
+ driver: Selenium WebDriver instance (already on the business page)
+ url: Optional URL to navigate to first (if not already on the page)
+
+ Returns:
+ dict with section names as keys, each containing list of features
+ """
+ try:
+ # Navigate if URL provided
+ if url:
+ # Force English
+ if 'hl=' not in url:
+ separator = '&' if '?' in url else '?'
+ url = f"{url}{separator}hl=en"
+ if 'gl=' not in url:
+ url = f"{url}&gl=us"
+ driver.get(url)
+ time.sleep(1)
+
+ # Click About tab using robust selectors
+ clicked = driver.execute_script("""
+ // Try multiple selectors for about tab
+ var selectors = [
+ 'button[aria-label*="About"]',
+ 'button[data-tab-index="2"]',
+ 'div[role="tablist"] button:nth-child(3)',
+ 'button[jsaction*="about"]'
+ ];
+
+ for (var sel of selectors) {
+ var btn = document.querySelector(sel);
+ if (btn && btn.textContent.toLowerCase().includes('about')) {
+ btn.click();
+ return true;
+ }
+ }
+
+ // Fallback: find by text content
+ var buttons = document.querySelectorAll('button');
+ for (var btn of buttons) {
+ if (btn.textContent.trim().toLowerCase() === 'about') {
+ btn.click();
+ return true;
+ }
+ }
+ return false;
+ """)
+
+ if not clicked:
+ return {}
+
+ time.sleep(1.5) # Wait for about tab to load
+
+ # Extract about sections using aria-labels (robust)
+ about = driver.execute_script("""
+ var about = {};
+
+ // Find the about region by aria-label or role
+ var container = document.querySelector('div[role="region"][aria-label*="About"]');
+
+ if (!container) {
+ // Fallback: look for the scrollable area with sections
+ container = document.querySelector('.m6QErb[aria-label*="About"]');
+ }
+
+ if (!container) {
+ // Last resort: find sections by h2 headers
+ container = document;
+ }
+
+ // Find all section headers (h2 elements)
+ var sections = container.querySelectorAll('h2');
+
+ for (var h2 of sections) {
+ var sectionName = h2.textContent.trim();
+ var items = [];
+
+ // Find the ul list following this h2
+ var parent = h2.closest('.iP2t7d, div');
+ if (parent) {
+ var listItems = parent.querySelectorAll('li span[aria-label]');
+ for (var li of listItems) {
+ var label = li.getAttribute('aria-label');
+ if (label) {
+ // Parse "Has toilet" or "No wheelchair-accessible car park"
+ var hasFeature = !label.toLowerCase().startsWith('no ');
+ var featureName = label.replace(/^(Has |No )/i, '');
+ items.push({
+ feature: featureName,
+ available: hasFeature
+ });
+ }
+ }
+ }
+
+ if (sectionName && items.length > 0) {
+ about[sectionName] = items;
+ }
+ }
+
+ return about;
+ """)
+
+ return about or {}
+
+ except Exception as e:
+ return {"error": str(e)}
+
+
# Test function
if __name__ == "__main__":
from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
dict with: name, address, rating, total_reviews, success, error, time
"""
from seleniumbase import Driver
+ import logging
+ log = logging.getLogger(__name__)
start_time = time.time()
driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
except:
pass
- # Clear state if reusing a pooled driver (ensures clean page load)
- if driver_provided:
- try:
- driver.delete_all_cookies()
- driver.get("about:blank")
- except:
- pass
+ # Don't clear state - Google may serve different content based on session history
+ # The scraper doesn't reset state, so validation shouldn't either
+
+ # Force English interface for consistent parsing
+ if 'hl=' not in url:
+ separator = '&' if '?' in url else '?'
+ url = f"{url}{separator}hl=en"
+ if 'gl=' not in url:
+ url = f"{url}&gl=us"
# Navigate to URL
driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
while time.time() - start < 5:
if "consent.google" in driver.current_url:
try:
- for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
- txt = btn.text.lower()
- if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
- btn.click()
- driver.get(url)
- break
- except:
+ # Try multiple approaches to find and click accept button
+ clicked = False
+
+ # Method 1: Find by aria-label (most reliable for Google consent)
+ for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
+ btn.click()
+ clicked = True
+ break
+
+ # Method 2: Find by text content
+ if not clicked:
+ for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
+ txt = btn.text.lower()
+ if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
+ btn.click()
+ clicked = True
+ break
+
+ if clicked:
+ time.sleep(0.5) # Brief wait for consent to process
+ driver.get(url) # Reload the target URL
+ time.sleep(0.5) # Wait for reload
+ except Exception as e:
pass
break
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
break
time.sleep(0.01) # 10ms - responsive but low CPU
+ # Log current URL after consent handling
+ try:
+ current_url = driver.current_url
+ log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
+ except:
+ pass
+
+ # Wait for page to fully render before polling (tabs may load dynamically)
+ time.sleep(2)
+
# Poll for business info (same pattern as total_reviews extraction)
- info = {"name": None, "rating": None, "total_reviews": None, "address": None}
+ # Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
+ info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
start = time.time()
- while time.time() - start < 5:
+ debug_logged = False
+ while time.time() - start < 10:
try:
info = driver.execute_script("""
- var result = {name: null, rating: null, total_reviews: null, address: null};
+ var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
- // Rating and reviews from span[role="img"] aria-labels
- // Same pattern as scrape_reviews for consistency
+ // Category - use jsaction attribute (robust, survives class changes)
+ var catBtn = document.querySelector('button[jsaction*="category"]');
+ if (catBtn) result.category = catBtn.textContent.trim();
+
+ // Fallback: look for button after rating that's not a link
+ if (!result.category) {
+ var buttons = document.querySelectorAll('button');
+ for (var btn of buttons) {
+ var text = btn.textContent.trim();
+ // Categories are short words, no numbers, not navigation
+ if (text && text.length < 50 && !text.match(/^[0-9]/) &&
+ !text.match(/review|star|direction|save|share|photo/i)) {
+ // Check if it's near the rating area
+ var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
+ if (parent) {
+ result.category = text;
+ break;
+ }
+ }
+ }
+ }
+
+ // Rating from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
- // Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
- var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
+ // Collect debug info for all aria-labels
+ if (label) {
+ result.debug.push('img-aria: ' + label);
+ }
+
+ // Rating: "4.8 stars" (English forced via hl=en)
+ var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
- // Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
- // Plus Spanish "reseña" which doesn't contain "review"
- var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
+ // Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
+ // Try direct format first: "79 reviews"
+ var revMatch = label.match(/^([\\d,]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
- result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
+ result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
+ }
+
+ // Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
+ if (!result.total_reviews) {
+ var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
+ if (combinedMatch) {
+ var countStr = combinedMatch[1].replace(/,/g, '');
+ if (countStr.includes('k')) {
+ // Handle "9k+" format
+ result.total_reviews = parseInt(countStr) * 1000;
+ } else {
+ result.total_reviews = parseInt(countStr);
+ }
+ }
+ }
+ }
+
+ // Also collect tab button texts for debugging (include full text including numbers)
+ var tabs = document.querySelectorAll('button[role="tab"]');
+ for (var j = 0; j < tabs.length; j++) {
+ var tabText = tabs[j].textContent.trim();
+ result.debug.push('tab: ' + tabText);
+ // Also try to extract review count from tab text like "Reviews (79)"
+ if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
+ var tabMatch = tabText.match(/\\((\\d+)\\)/);
+ if (tabMatch) {
+ result.total_reviews = parseInt(tabMatch[1]);
+ result.debug.push('Found reviews in tab: ' + tabText);
+ }
+ }
+ }
+
+ // Also check ALL buttons for reviews count
+ var allButtons = document.querySelectorAll('button');
+ for (var b = 0; b < allButtons.length; b++) {
+ var btnText = allButtons[b].textContent || '';
+ if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
+ var numMatch = btnText.match(/\\((\\d+)\\)/);
+ if (numMatch && !result.total_reviews) {
+ result.total_reviews = parseInt(numMatch[1]);
+ result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
+ }
+ }
+ }
+
+ // Check if we're on search results vs place page
+ result.debug.push('title: ' + document.title);
+ result.debug.push('url: ' + window.location.href.substring(0, 80));
+
+ // Check for search results list
+ var searchResults = document.querySelectorAll('div[role="feed"] > div');
+ result.debug.push('search_results_count: ' + searchResults.length);
+
+ // Fallback: Get review count from Reviews tab button "Reviews (79)"
+ // Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
+ if (!result.total_reviews) {
+ var tabs = document.querySelectorAll('button[role="tab"]');
+ for (var tab of tabs) {
+ var text = tab.textContent.toLowerCase();
+ if (text.includes('review')) {
+ var match = tab.textContent.match(/\\((\\d+)\\)/);
+ if (match) {
+ result.total_reviews = parseInt(match[1]);
+ break;
+ }
+ }
+ }
+ }
+
+ // Fallback 2: Look for any button with "Reviews" and a number
+ if (!result.total_reviews) {
+ var buttons = document.querySelectorAll('button');
+ for (var btn of buttons) {
+ var text = btn.textContent;
+ if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
+ var numMatch = text.match(/\\((\\d+)\\)/);
+ if (numMatch) {
+ result.total_reviews = parseInt(numMatch[1]);
+ break;
+ }
+ }
}
}
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
- if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
+ if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
- # Exit early if we have the essentials
- if info.get("name") and info.get("total_reviews") is not None:
+ # Exit early if we have the essentials (name found AND reviews count > 0)
+ if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
break
+
+ # Log debug info once after 3 seconds
+ if not debug_logged and time.time() - start > 3:
+ debug_logged = True
+ debug_info = info.get("debug", [])
+ if debug_info:
+ log.info(f"🔍 Validation debug - URL: {url[:50]}...")
+ log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
+ for d in debug_info[:10]: # First 10 debug items
+ log.info(f" {d}")
except:
pass
time.sleep(0.1) # 100ms between polls
+ # Final debug log if still no reviews
+ if not info.get("total_reviews"):
+ debug_info = info.get("debug", [])
+ log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
+ if debug_info:
+ log.warning(f" Debug items: {debug_info[:10]}")
+
return {
"name": info.get("name"),
"address": info.get("address"),
"rating": info.get("rating"),
"total_reviews": info.get("total_reviews"),
+ "category": info.get("category"),
"success": bool(info.get("name")),
"error": None,
"time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
"address": None,
"rating": None,
"total_reviews": None,
+ "category": None,
"success": False,
"error": str(e),
"time": time.time() - start_time
diff --git a/web/app/page.tsx b/web/app/page.tsx
index a2478af..834865a 100644
--- a/web/app/page.tsx
+++ b/web/app/page.tsx
@@ -27,6 +27,8 @@ interface SelectedJob {
jobId: string;
newCount?: number;
previousJobId?: string;
+ businessCategory?: string;
+ reviewTopics?: { topic: string; count: number }[];
}
type ViewType = 'newScrape' | 'jobs' | 'reports';
@@ -106,6 +108,8 @@ export default function Home() {
jobId: job.job_id,
newCount: data.new_count,
previousJobId: previousJob?.job_id,
+ businessCategory: job.business_category || undefined,
+ reviewTopics: job.review_topics || undefined,
});
setActiveView('reports');
}
@@ -155,7 +159,7 @@ export default function Home() {
Back to Reports
-
+{reviewTopics.length - 15} more topics
+ )} +Partial Results
++ This job was interrupted but {job.reviews_count} reviews were saved. + {job.error_message && Reason: {job.error_message}} +
+