Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone,
  language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions

View File

@@ -39,6 +39,13 @@ RUN apt-get update \
&& apt-get install -y chromium chromium-driver \
&& rm -rf /var/lib/apt/lists/*
# Install VNC server and noVNC (browser-based VNC viewer)
RUN apt-get update && apt-get install -y \
x11vnc \
novnc \
websockify \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
COPY api_server_production.py .
COPY config.yaml .
# Create startup script for Xvfb + API server
# Create startup script for Xvfb + VNC + API server
RUN echo '#!/bin/bash\n\
# Start Xvfb (virtual display) in background\n\
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
# Wait for Xvfb to start\n\
sleep 2\n\
\n\
# Start VNC server (no password for local dev, binds to all interfaces)\n\
x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
\n\
# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
\n\
echo "VNC server running on port 5900"\n\
echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
\n\
# Start API server\n\
exec python api_server_production.py\n\
' > /app/start.sh && chmod +x /app/start.sh
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
USER scraper
# Expose port
EXPOSE 8000
# Expose ports: API (8000), VNC (5900), noVNC web (6080)
EXPOSE 8000 5900 6080
# Environment variables for Chromium in container
ENV DISPLAY=:99

View File

@@ -133,12 +133,36 @@ app.add_middleware(
# ==================== Request/Response Models ====================
class GeolocationModel(BaseModel):
"""Geolocation coordinates"""
lat: float = Field(..., description="Latitude")
lng: float = Field(..., description="Longitude")
class ViewportModel(BaseModel):
"""Browser viewport size"""
width: int = Field(..., description="Viewport width")
height: int = Field(..., description="Viewport height")
class BrowserFingerprintModel(BaseModel):
"""Browser fingerprint to replicate user's browser"""
geolocation: Optional[GeolocationModel] = None
userAgent: Optional[str] = Field(None, description="User agent string")
viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
class ScrapeRequest(BaseModel):
"""Request model for starting a scrape job"""
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
class JobResponse(BaseModel):
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
created_at: str
started_at: Optional[str] = None
completed_at: Optional[str] = None
updated_at: Optional[str] = None # Last update time for progress tracking
reviews_count: Optional[int] = None
total_reviews: Optional[int] = None # Total reviews available for this place
scrape_time: Optional[float] = None
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
# Business metadata
business_name: Optional[str] = None
business_address: Optional[str] = None
business_category: Optional[str] = None # Category (e.g., "Barber shop")
review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts
class ReviewsResponse(BaseModel):
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
raise HTTPException(status_code=500, detail="Database not initialized")
try:
# Merge browser fingerprint into metadata if provided
metadata = request.metadata or {}
if request.browser_fingerprint:
fp = request.browser_fingerprint
metadata['browser_fingerprint'] = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
elif request.geolocation:
metadata['geolocation'] = {
'lat': request.geolocation.lat,
'lng': request.geolocation.lng
}
# Create job in database
job_id = await db.create_job(
url=str(request.url),
webhook_url=str(request.webhook_url) if request.webhook_url else None,
webhook_secret=request.webhook_secret,
metadata=request.metadata
metadata=metadata
)
# Start scraping job in background
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
if not job:
raise HTTPException(status_code=404, detail="Job not found")
# Parse review_topics if it's a string (JSONB might be returned as string)
review_topics = job.get('review_topics')
if isinstance(review_topics, str):
try:
review_topics = json.loads(review_topics)
except:
review_topics = None
# Extract business info from metadata if available
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
business_name = metadata.get('business_name') if metadata else None
business_category = metadata.get('business_category') if metadata else None
return JobResponse(
job_id=str(job['job_id']),
status=job['status'],
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
created_at=job['created_at'].isoformat(),
started_at=job['started_at'].isoformat() if job['started_at'] else None,
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
reviews_count=job['reviews_count'],
total_reviews=job.get('total_reviews'),
scrape_time=job['scrape_time'],
error_message=job['error_message'],
webhook_url=job.get('webhook_url')
webhook_url=job.get('webhook_url'),
business_name=business_name,
business_category=business_category,
review_topics=review_topics
)
@@ -541,25 +611,32 @@ async def stream_all_jobs():
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
async def get_job_reviews(job_id: UUID):
"""
Get the actual reviews data for a completed job.
Get reviews data for a job.
Returns 404 if job not found or not completed yet.
Returns reviews for completed, partial, or running jobs (if reviews have been collected).
Returns 404 if job not found or no reviews available yet.
"""
if not db:
raise HTTPException(status_code=500, detail="Database not initialized")
reviews = await db.get_job_reviews(job_id)
# Get reviews (includes completed, running, and partial jobs)
reviews = await db.get_job_reviews(job_id, include_partial=True)
if reviews is None:
job = await db.get_job(job_id)
if not job:
raise HTTPException(status_code=404, detail="Job not found")
elif job['status'] != 'completed':
elif job['status'] == 'pending':
raise HTTPException(
status_code=400,
detail=f"Job not completed yet (current status: {job['status']})"
detail="Job has not started yet"
)
elif job['status'] == 'failed':
raise HTTPException(
status_code=400,
detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
)
else:
raise HTTPException(status_code=404, detail="Reviews data not available")
raise HTTPException(status_code=404, detail="No reviews data available yet")
return ReviewsResponse(
job_id=str(job_id),
@@ -603,6 +680,15 @@ async def list_jobs(
business_name = metadata.get('business_name') if metadata else None
business_address = metadata.get('business_address') if metadata else None
business_category = metadata.get('business_category') if metadata else None
# Parse review_topics if it's a string
review_topics = job.get('review_topics')
if isinstance(review_topics, str):
try:
review_topics = json.loads(review_topics)
except:
review_topics = None
result.append(JobResponse(
job_id=str(job['job_id']),
@@ -615,7 +701,9 @@ async def list_jobs(
scrape_time=job.get('scrape_time'),
error_message=job.get('error_message'),
business_name=business_name,
business_address=business_address
business_address=business_address,
business_category=business_category,
review_topics=review_topics
))
return result
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
Get business card information from Google Maps.
Returns business name, address, rating, and review count.
Uses pre-warmed Chrome worker from pool for instant response.
Creates a fresh Chrome instance for reliable results (same as full scraper).
This is used to show the business confirmation card in the UI.
"""
worker = None
recycle_worker = False
try:
url = str(request.url)
# Get pre-warmed worker from validation pool
worker = await asyncio.to_thread(get_validation_worker, timeout=10)
# Use the SAME scraper algorithm with validation_only=True for early return
# Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
# Pooled browsers can have cookies/state that cause Google to render pages differently
if worker:
log.info(f"Using worker {worker.worker_id} for business card extraction")
# Use the pooled worker (don't close it)
result = await asyncio.to_thread(
get_business_card_info,
url=url,
driver=worker.driver,
return_driver=True
)
# Check if the result indicates a session error
if not result['success'] and result.get('error'):
error_msg = result.get('error', '').lower()
if 'invalid session' in error_msg or 'session' in error_msg:
log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
recycle_worker = True
# Build fingerprint dict from request
fingerprint = None
if request.browser_fingerprint:
fp = request.browser_fingerprint
fingerprint = {
"userAgent": fp.userAgent,
"timezone": fp.timezone,
"language": fp.language,
"platform": fp.platform,
}
if fp.viewport:
fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
if fp.geolocation:
fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
elif request.geolocation:
fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
log.info(f"Creating Chrome with geolocation only")
else:
# Fallback: create temporary worker
log.warning("No pooled worker available, creating temporary instance")
log.info(f"Creating Chrome with default settings")
result = await asyncio.to_thread(
get_business_card_info,
url=url
fast_scrape_reviews,
url=url,
headless=False, # Use Xvfb display
validation_only=True, # Return early after getting total_reviews
browser_fingerprint=fingerprint # Pass user's browser fingerprint
)
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
# Let the actual scraper determine if reviews exist
has_business = bool(result.get('name') and result.get('rating'))
# Extract validation info from the result
validation_info = result.get('validation_info', {})
total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
name = validation_info.get('name')
rating = validation_info.get('rating')
category = validation_info.get('category')
address = validation_info.get('address')
# Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
has_reviews = bool(name and total_reviews > 0)
return {
"has_reviews": has_business, # Boolean: true if business exists
"total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
"name": result.get('name'),
"address": result.get('address'),
"rating": result.get('rating'),
"success": result['success'],
"has_reviews": has_reviews, # True if business has reviews
"total_reviews": total_reviews,
"name": name,
"address": address,
"rating": rating,
"category": category,
"success": result.get('success', True),
"error": result.get('error')
}
except Exception as e:
log.error(f"Error checking reviews: {e}")
# If it's a session error, recycle the worker
if worker:
error_msg = str(e).lower()
if 'invalid session' in error_msg or 'session' in error_msg:
recycle_worker = True
return {
"has_reviews": False,
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
"success": False,
"error": str(e)
}
finally:
# Release worker back to pool (or recycle if broken)
if worker:
await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
job = await db.get_job(job_id)
url = job['url']
# Extract browser fingerprint from metadata if available
browser_fingerprint = None
metadata = job.get('metadata')
if isinstance(metadata, str):
try:
metadata = json.loads(metadata)
except:
metadata = None
if metadata and 'browser_fingerprint' in metadata:
browser_fingerprint = metadata['browser_fingerprint']
log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
elif metadata and 'geolocation' in metadata:
browser_fingerprint = {'geolocation': metadata['geolocation']}
log.info(f"Using user geolocation only")
# Broadcast job started via SSE
await broadcast_job_update(job_id_str, "job_started", {
"job_id": job_id_str,
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
# Create log capture instance that we can access for real-time logs
log_capture = LogCapture()
# Track total reviews for incremental saves
total_reviews_seen = [None]
# Accumulate all reviews for incremental saves (flush_callback receives batches)
all_reviews_collected = []
# Progress callback to update job status with current/total counts AND logs
def progress_callback(current_count: int, total_count: int):
"""Update job progress and logs from worker thread"""
if total_count:
total_reviews_seen[0] = total_count
async def update():
# Get current logs from the shared log_capture
current_logs = log_capture.get_logs()
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
# Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(update(), loop)
# Flush callback to save reviews incrementally (crash recovery)
# Note: flush_callback receives batches, so we accumulate them
def flush_callback(reviews_batch: list):
"""Accumulate and save reviews to DB incrementally from worker thread"""
# Extend our collection with the new batch
all_reviews_collected.extend(reviews_batch)
async def save():
await db.save_reviews_incremental(
job_id=job_id,
reviews=all_reviews_collected, # Save ALL reviews so far
total_reviews=total_reviews_seen[0]
)
# Schedule the coroutine on the event loop
asyncio.run_coroutine_threadsafe(save(), loop)
# Run scraping with progress callback and shared log capture
# headless=False because Docker uses Xvfb virtual display
result = await asyncio.to_thread(
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
url=url,
headless=False,
progress_callback=progress_callback,
log_capture=log_capture
log_capture=log_capture,
flush_callback=flush_callback,
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
)
if result['success']:
# Save results to database (including scraper logs)
# Save results to database (including scraper logs and review topics)
await db.save_job_result(
job_id=job_id,
reviews=result['reviews'],
scrape_time=result['time'],
total_reviews=result.get('total_reviews'),
scrape_logs=result.get('logs')
scrape_logs=result.get('logs'),
review_topics=result.get('review_topics')
)
log.info(
@@ -898,7 +1030,44 @@ async def run_scraping_job(job_id: UUID):
)
else:
# Job failed - save logs for debugging
# Job failed - check if we have partial reviews saved
current_job = await db.get_job(job_id)
partial_count = current_job.get('reviews_count', 0) if current_job else 0
if partial_count > 0:
# Mark as partial - we have some reviews saved
await db.mark_job_partial(
job_id,
error_message=result.get('error', 'Unknown error'),
scrape_logs=result.get('logs')
)
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
# Broadcast job partial via SSE
await broadcast_job_update(job_id_str, "job_partial", {
"job_id": job_id_str,
"status": "partial",
"reviews_count": partial_count,
"total_reviews": current_job.get('total_reviews'),
"error_message": result.get('error'),
"logs": result.get('logs', [])
})
# Send partial webhook if configured
if job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
job_id=job_id,
status='partial',
reviews_count=partial_count,
error_message=result.get('error'),
secret=job.get('webhook_secret'),
db=db
)
else:
# No reviews saved - mark as failed
await db.update_job_status(
job_id,
JobStatus.FAILED,
@@ -933,6 +1102,44 @@ async def run_scraping_job(job_id: UUID):
import traceback
traceback.print_exc()
# Check if we have partial reviews saved
current_job = await db.get_job(job_id)
partial_count = current_job.get('reviews_count', 0) if current_job else 0
if partial_count > 0:
# Mark as partial - we have some reviews saved
await db.mark_job_partial(
job_id,
error_message=str(e),
scrape_logs=log_capture.get_logs() if log_capture else None
)
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
# Broadcast job partial via SSE
await broadcast_job_update(job_id_str, "job_partial", {
"job_id": job_id_str,
"status": "partial",
"reviews_count": partial_count,
"total_reviews": current_job.get('total_reviews'),
"error_message": str(e),
"logs": log_capture.get_logs() if log_capture else []
})
# Send partial webhook
if current_job and current_job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=current_job['webhook_url'],
job_id=job_id,
status='partial',
reviews_count=partial_count,
error_message=str(e),
secret=current_job.get('webhook_secret'),
db=db
)
else:
# No reviews saved - mark as failed
await db.update_job_status(
job_id,
JobStatus.FAILED,
@@ -948,15 +1155,14 @@ async def run_scraping_job(job_id: UUID):
})
# Send failure webhook
job = await db.get_job(job_id)
if job and job.get('webhook_url'):
if current_job and current_job.get('webhook_url'):
webhook_manager = WebhookManager()
await webhook_manager.send_job_completed_webhook(
webhook_url=job['webhook_url'],
webhook_url=current_job['webhook_url'],
job_id=job_id,
status='failed',
error_message=str(e),
secret=job.get('webhook_secret'),
secret=current_job.get('webhook_secret'),
db=db
)

View File

@@ -39,6 +39,8 @@ services:
- CHROME_BIN=/usr/bin/chromium
ports:
- "8000:8000"
- "5900:5900" # VNC port (for VNC client)
- "6080:6080" # noVNC web interface (browser access)
depends_on:
db:
condition: service_healthy

View File

@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
PARTIAL = "partial" # Job crashed but has partial reviews saved
class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP,
completed_at TIMESTAMP,
updated_at TIMESTAMP,
reviews_count INTEGER,
total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
);
""")
@@ -88,6 +90,24 @@ class DatabaseManager:
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Add updated_at column if it doesn't exist (for incremental progress tracking)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
""")
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
""")
# Update constraint to include 'partial' status (for existing databases)
await conn.execute("""
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
""")
await conn.execute("""
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
created_at,
started_at,
completed_at,
updated_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata,
scrape_logs
scrape_logs,
review_topics
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -203,17 +225,27 @@ class DatabaseManager:
return dict(row)
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
include_partial: If True, also return reviews for running and partial jobs
Returns:
List of reviews or None if not found/not completed
List of reviews or None if not found/no reviews
"""
async with self.pool.acquire() as conn:
if include_partial:
# Return reviews for completed, running, or partial jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
""", job_id)
else:
# Only return reviews for completed jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
@@ -278,7 +310,8 @@ class DatabaseManager:
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
scrape_logs: Optional[List[Dict[str, Any]]] = None,
review_topics: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
review_topics: List of topic filter dictionaries with topic and count
"""
async with self.pool.acquire() as conn:
# If reviews list is empty, check if job already has reviews from incremental saves
# This happens when flush_callback was used during scraping
if not reviews:
existing = await conn.fetchval(
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
)
if existing and existing > 0:
# Job has reviews from incremental saves, don't overwrite reviews_data
await conn.execute("""
UPDATE jobs
SET
status = 'completed',
completed_at = NOW(),
total_reviews = COALESCE($2, total_reviews),
scrape_time = $3,
scrape_logs = $4::jsonb,
review_topics = $5::jsonb
WHERE job_id = $1
""", job_id, total_reviews, scrape_time,
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
return
await conn.execute("""
UPDATE jobs
SET
@@ -300,13 +358,70 @@ class DatabaseManager:
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5,
scrape_logs = $6::jsonb
scrape_logs = $6::jsonb,
review_topics = $7::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def save_reviews_incremental(
self,
job_id: UUID,
reviews: List[Dict[str, Any]],
total_reviews: Optional[int] = None
):
"""
Save reviews incrementally during scraping.
Called on each flush to preserve progress in case of crash.
Args:
job_id: Job UUID
reviews: ALL reviews collected so far (not just new ones)
total_reviews: Total reviews available (from page counter)
"""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET
reviews_count = $2,
total_reviews = COALESCE($3, total_reviews),
reviews_data = $4::jsonb,
updated_at = NOW()
WHERE job_id = $1 AND status = 'running'
""", job_id, len(reviews), total_reviews, json.dumps(reviews))
log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
async def mark_job_partial(
self,
job_id: UUID,
error_message: str,
scrape_logs: Optional[List[Dict[str, Any]]] = None
):
"""
Mark a job as partial (crashed but has some reviews saved).
Args:
job_id: Job UUID
error_message: Error that caused the crash
scrape_logs: Log entries from the scraper
"""
async with self.pool.acquire() as conn:
await conn.execute("""
UPDATE jobs
SET
status = 'partial',
completed_at = NOW(),
error_message = $2,
scrape_logs = $3::jsonb
WHERE job_id = $1
""", job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
log.info(f"Marked job {job_id} as partial due to: {error_message}")
async def list_jobs(
self,
status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2

View File

@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None) -> dict:
progress_callback=None, validation_only: bool = False) -> dict:
"""
Scrape Google Maps reviews.
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
# Store business info extracted from overview (before clicking reviews tab)
business_info_cache = [None]
# Hard refresh counter
hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass
return None
def setup_reviews_page(is_refresh=False):
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
"""
Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh.
If validation_only_mode=True, returns early after extracting business info
without clicking reviews tab or finding scroll container.
"""
nonlocal total_reviews
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
# Reset browser state by navigating to blank page first
# This clears any stale state from pooled browser sessions
try:
driver.get("about:blank")
time.sleep(0.1)
except:
pass
log.info(f"🌐 Loading: {url[:80]}...")
else:
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Reload original URL after consent
log.info(" Reloading after consent...")
driver.get(url)
# Wait for page to settle after consent reload
time.sleep(1)
break
except:
pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
# This captures name, rating, category, address while they're visible
# Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None:
if total_reviews[0] is None or business_info_cache[0] is None:
start = time.time()
while time.time() - start < 5:
try:
count = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
info = driver.execute_script("""
var result = {
total_reviews: null,
name: null,
rating: null,
category: null,
address: null
};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Category - use jsaction attribute (robust selector)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Rating and review count from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars"
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: "79 reviews"
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
}
}
return null;
// Address from button
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
if count:
total_reviews[0] = count
log.info(f"📊 Total reviews on page: {count}")
if info:
if info.get('total_reviews') and total_reviews[0] is None:
total_reviews[0] = info['total_reviews']
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
if info.get('name') and business_info_cache[0] is None:
business_info_cache[0] = info
log.info(f"📍 Business: {info.get('name')}")
if total_reviews[0] and business_info_cache[0]:
break
except:
pass
time.sleep(0.1)
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
if validation_only_mode:
log.info("📋 Validation mode: returning early (skipping reviews tab)")
return ("validation_done", None)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
tabs_logged = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
# Log available tabs once for debugging
if not tabs_logged and tabs:
tabs_logged = True
tab_texts = [t.text for t in tabs]
log.info(f" Available tabs: {tab_texts}")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
log.info(f" Clicking reviews tab: '{tab.text}'")
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
if total_reviews[0] is None:
import re
# Try pattern with parentheses: "Reviews (79)"
match = re.search(r'\((\d+)\)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
else:
# Try pattern with newline: "Reviews\n79"
match = re.search(r'(\d+)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
tab.click()
tab_clicked = True
break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return scroll_container, stop_scrolling
# Initial page setup
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
# Helper to extract review topics from the reviews tab
def extract_review_topics():
"""Extract review topic filters from radiogroup (robust selectors)."""
try:
topics = driver.execute_script("""
var topics = [];
// Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
if (!container) {
// Fallback: any radiogroup in the reviews area
container = document.querySelector('div[role="radiogroup"]');
}
if (container) {
var buttons = container.querySelectorAll('button[role="radio"]');
for (var btn of buttons) {
var label = btn.getAttribute('aria-label') || '';
// Parse "hair salon, mentioned in 4 reviews" format
var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
if (match) {
topics.push({
topic: match[1].trim(),
count: parseInt(match[2])
});
} else if (label && !label.toLowerCase().includes('all review')) {
// Fallback: try to extract from child spans
var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
var nameSpan = btn.querySelector('.uEubGf, span:first-child');
if (nameSpan) {
var name = nameSpan.textContent.trim();
var count = countSpan ? parseInt(countSpan.textContent) : 0;
if (name && name.toLowerCase() !== 'all') {
topics.push({topic: name, count: count || 0});
}
}
}
}
}
return topics;
""")
return topics or []
except:
return []
# Initial page setup (pass validation_only to skip unnecessary steps)
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
# setup_reviews_page returns ("validation_done", None) in this case
if validation_only or scroll_container == "validation_done":
# Use the business info captured from Overview (before clicking reviews tab)
business_info = business_info_cache[0] or {}
return {
"reviews": [],
"total": total_reviews[0] or 0,
"scrolls": 0,
"error": None,
"validation_info": {
"name": business_info.get("name"),
"rating": business_info.get("rating"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"total_reviews": total_reviews[0]
}
}
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
# Extract review topics after reviews tab is loaded (before scrolling begins)
time.sleep(0.5) # Brief wait for topic filters to render
review_topics = extract_review_topics()
if review_topics:
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
def get_api_reviews():
"""Get reviews from intercepted API responses."""
api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url,
"logs": log.get_logs()
"logs": log.get_logs(),
"review_topics": review_topics # Topic filters with mention counts
}
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None):
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
browser_fingerprint: dict = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver: Existing driver instance to reuse
return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access
browser_fingerprint: Optional dict with user's browser fingerprint:
- geolocation: {lat, lng}
- userAgent: string
- viewport: {width, height}
- timezone: string (e.g., "Europe/Madrid")
- language: string (e.g., "en-US")
- platform: string (e.g., "MacIntel")
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
log_capture = log_capture or LogCapture()
try:
# Extract fingerprint settings
fp = browser_fingerprint or {}
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
geolocation = fp.get('geolocation')
timezone = fp.get('timezone')
language = fp.get('language', 'en-US')
# Create driver if not provided
if not driver:
driver = Driver(
uc=True,
headless=headless,
page_load_strategy="normal",
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
agent=user_agent # Use user's actual user agent
)
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
# Set viewport to match user's screen
driver.set_window_size(viewport['width'], viewport['height'])
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
# Apply browser fingerprint settings via CDP
try:
# Set timezone if provided
if timezone:
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
log_capture.info(f"Set timezone to {timezone}")
# Set locale/language
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
# Set geolocation
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': geolocation['lat'],
'longitude': geolocation['lng'],
'accuracy': 1000 # ~1km accuracy for IP-based location
})
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
else:
# Default to US (Boston, MA) if no geolocation provided
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA)")
log_capture.info("Set geolocation to US (Boston, MA) [default]")
if fp:
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
except Exception as e:
log_capture.warning(f"Could not set geolocation: {e}")
log_capture.warning(f"Could not apply fingerprint settings: {e}")
# Add URL parameters for consistent results
if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
if 'gl=' not in url:
url = f"{url}&gl=us"
# Create progress wrapper if callback provided
flush_callback = None
if progress_callback:
# Create combined flush callback for progress + external handler
external_flush = flush_callback # Save external callback
internal_flush = None
if progress_callback or external_flush:
collected = [0]
def flush_with_progress(reviews_batch):
collected[0] += len(reviews_batch)
def combined_flush(reviews_batch):
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
if progress_callback:
progress_callback(collected[0], None)
flush_callback = flush_with_progress
if external_flush:
external_flush(reviews_batch) # Pass reviews to external handler
internal_flush = combined_flush
# Run the scraper with progress callback for real-time updates
result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
url=url,
max_reviews=999999, # Effectively unlimited
timeout_no_new=15,
flush_callback=flush_callback,
flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback # Pass through for real-time log updates
progress_callback=progress_callback, # Pass through for real-time log updates
validation_only=validation_only # Return early if just validating
)
elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed,
"success": True,
"error": None,
"logs": result.get("logs", [])
"logs": result.get("logs", []),
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
}
# Include validation_info if in validation_only mode
if validation_only and "validation_info" in result:
response["validation_info"] = result["validation_info"]
if return_driver:
response["driver"] = driver
elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
}
def extract_about_info(driver, url: str = None) -> dict:
"""
Extract About section info from Google Maps (Accessibility, Amenities, etc.).
This function should be called AFTER reviews are scraped if about info is needed,
as it navigates to a different tab.
Args:
driver: Selenium WebDriver instance (already on the business page)
url: Optional URL to navigate to first (if not already on the page)
Returns:
dict with section names as keys, each containing list of features
"""
try:
# Navigate if URL provided
if url:
# Force English
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
driver.get(url)
time.sleep(1)
# Click About tab using robust selectors
clicked = driver.execute_script("""
// Try multiple selectors for about tab
var selectors = [
'button[aria-label*="About"]',
'button[data-tab-index="2"]',
'div[role="tablist"] button:nth-child(3)',
'button[jsaction*="about"]'
];
for (var sel of selectors) {
var btn = document.querySelector(sel);
if (btn && btn.textContent.toLowerCase().includes('about')) {
btn.click();
return true;
}
}
// Fallback: find by text content
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
if (btn.textContent.trim().toLowerCase() === 'about') {
btn.click();
return true;
}
}
return false;
""")
if not clicked:
return {}
time.sleep(1.5) # Wait for about tab to load
# Extract about sections using aria-labels (robust)
about = driver.execute_script("""
var about = {};
// Find the about region by aria-label or role
var container = document.querySelector('div[role="region"][aria-label*="About"]');
if (!container) {
// Fallback: look for the scrollable area with sections
container = document.querySelector('.m6QErb[aria-label*="About"]');
}
if (!container) {
// Last resort: find sections by h2 headers
container = document;
}
// Find all section headers (h2 elements)
var sections = container.querySelectorAll('h2');
for (var h2 of sections) {
var sectionName = h2.textContent.trim();
var items = [];
// Find the ul list following this h2
var parent = h2.closest('.iP2t7d, div');
if (parent) {
var listItems = parent.querySelectorAll('li span[aria-label]');
for (var li of listItems) {
var label = li.getAttribute('aria-label');
if (label) {
// Parse "Has toilet" or "No wheelchair-accessible car park"
var hasFeature = !label.toLowerCase().startsWith('no ');
var featureName = label.replace(/^(Has |No )/i, '');
items.push({
feature: featureName,
available: hasFeature
});
}
}
}
if (sectionName && items.length > 0) {
about[sectionName] = items;
}
}
return about;
""")
return about or {}
except Exception as e:
return {"error": str(e)}
# Test function
if __name__ == "__main__":
from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
dict with: name, address, rating, total_reviews, success, error, time
"""
from seleniumbase import Driver
import logging
log = logging.getLogger(__name__)
start_time = time.time()
driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
except:
pass
# Clear state if reusing a pooled driver (ensures clean page load)
if driver_provided:
try:
driver.delete_all_cookies()
driver.get("about:blank")
except:
pass
# Don't clear state - Google may serve different content based on session history
# The scraper doesn't reset state, so validation shouldn't either
# Force English interface for consistent parsing
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Navigate to URL
driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
while time.time() - start < 5:
if "consent.google" in driver.current_url:
try:
# Try multiple approaches to find and click accept button
clicked = False
# Method 1: Find by aria-label (most reliable for Google consent)
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
btn.click()
clicked = True
break
# Method 2: Find by text content
if not clicked:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
driver.get(url)
clicked = True
break
except:
if clicked:
time.sleep(0.5) # Brief wait for consent to process
driver.get(url) # Reload the target URL
time.sleep(0.5) # Wait for reload
except Exception as e:
pass
break
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Log current URL after consent handling
try:
current_url = driver.current_url
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
except:
pass
# Wait for page to fully render before polling (tabs may load dynamically)
time.sleep(2)
# Poll for business info (same pattern as total_reviews extraction)
info = {"name": None, "rating": None, "total_reviews": None, "address": None}
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
start = time.time()
while time.time() - start < 5:
debug_logged = False
while time.time() - start < 10:
try:
info = driver.execute_script("""
var result = {name: null, rating: null, total_reviews: null, address: null};
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Rating and reviews from span[role="img"] aria-labels
// Same pattern as scrape_reviews for consistency
// Category - use jsaction attribute (robust, survives class changes)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Fallback: look for button after rating that's not a link
if (!result.category) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent.trim();
// Categories are short words, no numbers, not navigation
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
!text.match(/review|star|direction|save|share|photo/i)) {
// Check if it's near the rating area
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
if (parent) {
result.category = text;
break;
}
}
}
}
// Rating from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
// Collect debug info for all aria-labels
if (label) {
result.debug.push('img-aria: ' + label);
}
// Rating: "4.8 stars" (English forced via hl=en)
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
// Plus Spanish "reseña" which doesn't contain "review"
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
// Try direct format first: "79 reviews"
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
}
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
if (!result.total_reviews) {
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
if (combinedMatch) {
var countStr = combinedMatch[1].replace(/,/g, '');
if (countStr.includes('k')) {
// Handle "9k+" format
result.total_reviews = parseInt(countStr) * 1000;
} else {
result.total_reviews = parseInt(countStr);
}
}
}
}
// Also collect tab button texts for debugging (include full text including numbers)
var tabs = document.querySelectorAll('button[role="tab"]');
for (var j = 0; j < tabs.length; j++) {
var tabText = tabs[j].textContent.trim();
result.debug.push('tab: ' + tabText);
// Also try to extract review count from tab text like "Reviews (79)"
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
var tabMatch = tabText.match(/\\((\\d+)\\)/);
if (tabMatch) {
result.total_reviews = parseInt(tabMatch[1]);
result.debug.push('Found reviews in tab: ' + tabText);
}
}
}
// Also check ALL buttons for reviews count
var allButtons = document.querySelectorAll('button');
for (var b = 0; b < allButtons.length; b++) {
var btnText = allButtons[b].textContent || '';
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
var numMatch = btnText.match(/\\((\\d+)\\)/);
if (numMatch && !result.total_reviews) {
result.total_reviews = parseInt(numMatch[1]);
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
}
}
}
// Check if we're on search results vs place page
result.debug.push('title: ' + document.title);
result.debug.push('url: ' + window.location.href.substring(0, 80));
// Check for search results list
var searchResults = document.querySelectorAll('div[role="feed"] > div');
result.debug.push('search_results_count: ' + searchResults.length);
// Fallback: Get review count from Reviews tab button "Reviews (79)"
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
if (!result.total_reviews) {
var tabs = document.querySelectorAll('button[role="tab"]');
for (var tab of tabs) {
var text = tab.textContent.toLowerCase();
if (text.includes('review')) {
var match = tab.textContent.match(/\\((\\d+)\\)/);
if (match) {
result.total_reviews = parseInt(match[1]);
break;
}
}
}
}
// Fallback 2: Look for any button with "Reviews" and a number
if (!result.total_reviews) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent;
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
var numMatch = text.match(/\\((\\d+)\\)/);
if (numMatch) {
result.total_reviews = parseInt(numMatch[1]);
break;
}
}
}
}
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
# Exit early if we have the essentials
if info.get("name") and info.get("total_reviews") is not None:
# Exit early if we have the essentials (name found AND reviews count > 0)
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
break
# Log debug info once after 3 seconds
if not debug_logged and time.time() - start > 3:
debug_logged = True
debug_info = info.get("debug", [])
if debug_info:
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
for d in debug_info[:10]: # First 10 debug items
log.info(f" {d}")
except:
pass
time.sleep(0.1) # 100ms between polls
# Final debug log if still no reviews
if not info.get("total_reviews"):
debug_info = info.get("debug", [])
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
if debug_info:
log.warning(f" Debug items: {debug_info[:10]}")
return {
"name": info.get("name"),
"address": info.get("address"),
"rating": info.get("rating"),
"total_reviews": info.get("total_reviews"),
"category": info.get("category"),
"success": bool(info.get("name")),
"error": None,
"time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
"address": None,
"rating": None,
"total_reviews": None,
"category": None,
"success": False,
"error": str(e),
"time": time.time() - start_time

View File

@@ -27,6 +27,8 @@ interface SelectedJob {
jobId: string;
newCount?: number;
previousJobId?: string;
businessCategory?: string;
reviewTopics?: { topic: string; count: number }[];
}
type ViewType = 'newScrape' | 'jobs' | 'reports';
@@ -106,6 +108,8 @@ export default function Home() {
jobId: job.job_id,
newCount: data.new_count,
previousJobId: previousJob?.job_id,
businessCategory: job.business_category || undefined,
reviewTopics: job.review_topics || undefined,
});
setActiveView('reports');
}
@@ -155,7 +159,7 @@ export default function Home() {
Back to Reports
</button>
</div>
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} />
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} businessCategory={selectedJob.businessCategory} reviewTopics={selectedJob.reviewTopics} />
</div>
) : (
<div className="h-full overflow-y-auto p-6">

View File

@@ -22,14 +22,21 @@ interface ReviewWithNew extends Review {
photo_urls?: string[] | null;
}
interface ReviewTopic {
topic: string;
count: number;
}
interface ReviewAnalyticsProps {
reviews: ReviewWithNew[];
businessName?: string;
businessUrl?: string;
newCount?: number;
businessCategory?: string;
reviewTopics?: ReviewTopic[];
}
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) {
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) {
const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
const [globalFilter, setGlobalFilter] = useState('');
@@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
{/* Header */}
<div className="flex items-center justify-between">
<div>
<div className="flex items-center gap-3">
<h2 className="text-3xl font-bold text-gray-900">
{businessName || 'Review Analytics'}
</h2>
{businessCategory && (
<span className="px-3 py-1 bg-purple-100 text-purple-800 text-sm font-medium rounded-full border border-purple-300">
{businessCategory}
</span>
)}
</div>
{businessUrl && (
<a
href={businessUrl}
@@ -821,6 +835,33 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
</div>
</div>
{/* Review Topics - from Google Maps */}
{reviewTopics && reviewTopics.length > 0 && (
<div className="bg-white border-2 border-gray-300 rounded-xl p-5 shadow-md">
<div className="flex items-center gap-2 mb-4">
<MessageSquare className="w-6 h-6 text-indigo-600" />
<h3 className="text-lg font-bold text-gray-900">What People Talk About</h3>
<span className="text-sm text-gray-500">({reviewTopics.length} topics from Google)</span>
</div>
<div className="flex flex-wrap gap-2">
{reviewTopics.slice(0, 15).map((topic, idx) => (
<div
key={idx}
className="px-3 py-1.5 bg-gradient-to-r from-indigo-50 to-purple-50 border border-indigo-200 rounded-full flex items-center gap-2"
>
<span className="text-sm font-medium text-indigo-800">{topic.topic}</span>
<span className="text-xs bg-indigo-200 text-indigo-900 px-1.5 py-0.5 rounded-full font-bold">
{topic.count}
</span>
</div>
))}
</div>
{reviewTopics.length > 15 && (
<p className="text-sm text-gray-500 mt-3">+{reviewTopics.length - 15} more topics</p>
)}
</div>
)}
{/* Rating & Volume Timeline */}
{timelineData.length > 0 && (
<div className={`bg-white rounded-xl p-6 shadow-md transition-all ${

View File

@@ -15,7 +15,7 @@ interface Review {
export interface JobStatus {
job_id: string;
status: 'pending' | 'running' | 'completed' | 'failed';
status: 'pending' | 'running' | 'completed' | 'failed' | 'partial';
url: string;
created_at: string;
started_at: string | null;
@@ -28,8 +28,11 @@ export interface JobStatus {
// Business metadata for tracking and comparison
business_name: string | null;
business_address: string | null;
business_category: string | null;
rating_snapshot: number | null;
total_reviews_snapshot: number | null;
// Review topics extracted from Google Maps
review_topics: { topic: string; count: number }[] | null;
}
interface ScraperTestProps {
@@ -56,7 +59,64 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
const [businessRating, setBusinessRating] = useState<number | null>(null);
const [businessImage, setBusinessImage] = useState<string | null>(null);
const [businessCategory, setBusinessCategory] = useState<string | null>(null);
const [userFingerprint, setUserFingerprint] = useState<{
geolocation?: {lat: number, lng: number},
userAgent?: string,
viewport?: {width: number, height: number},
timezone?: string,
language?: string,
platform?: string
}>({});
const debounceRef = useRef<NodeJS.Timeout | null>(null);
// Collect browser fingerprint on mount (no permissions needed)
useEffect(() => {
const collectFingerprint = async () => {
const fingerprint: typeof userFingerprint = {};
// User agent
fingerprint.userAgent = navigator.userAgent;
// Screen/viewport size
fingerprint.viewport = {
width: window.screen.width,
height: window.screen.height
};
// Timezone
fingerprint.timezone = Intl.DateTimeFormat().resolvedOptions().timeZone;
// Language
fingerprint.language = navigator.language;
// Platform
fingerprint.platform = navigator.platform;
// Get approximate location from IP (no permission needed)
try {
const response = await fetch('https://ipapi.co/json/', {
signal: AbortSignal.timeout(3000)
});
if (response.ok) {
const data = await response.json();
if (data.latitude && data.longitude) {
fingerprint.geolocation = {
lat: data.latitude,
lng: data.longitude
};
console.log('IP location:', data.city, data.country_name);
}
}
} catch (error) {
console.log('IP geolocation not available');
}
setUserFingerprint(fingerprint);
console.log('Browser fingerprint:', fingerprint);
};
collectFingerprint();
}, []);
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
const abortControllerRef = useRef<AbortController | null>(null);
@@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
setBusinessCategory(null);
setError('');
// Create new abort controller with 30 second timeout
// Create new abort controller with 60 second timeout (validation can be slow)
const controller = new AbortController();
abortControllerRef.current = controller;
const timeoutId = setTimeout(() => controller.abort(), 30000);
const timeoutId = setTimeout(() => controller.abort(), 60000);
try {
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`;
// Force English with hl=en parameter
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`;
const response = await fetch('/api/check-reviews', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ url }),
body: JSON.stringify({
url,
geolocation: userFingerprint.geolocation,
browser_fingerprint: userFingerprint // Pass full fingerprint
}),
signal: controller.signal,
});
@@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
} catch (err) {
clearTimeout(timeoutId);
// Ignore AbortError (happens when user starts a new validation)
// Check if this is a timeout abort vs user-initiated abort
if (err instanceof Error && err.name === 'AbortError') {
// Check if it was a timeout (controller still matches) or user started new search
if (abortControllerRef.current === controller) {
// Timeout - show error
console.error('Validation timed out');
setError('Validation timed out. Please try again.');
setHasReviews(false);
setAvailableReviewCount(0);
} else {
// User started a new search - just return silently
console.log('Validation cancelled (new validation started)');
return;
}
} else {
console.error('Error getting business info:', err);
// Error occurred
setHasReviews(false);
setAvailableReviewCount(0);
} finally {
// Only clear loading state if this controller wasn't aborted
if (!controller.signal.aborted) {
setIsCheckingReviews(false);
}
} finally {
clearTimeout(timeoutId);
// Always clear loading state (even on timeout)
setIsCheckingReviews(false);
}
};
@@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
return newMap;
});
// Stop polling if job is done
if (data.status === 'completed' || data.status === 'failed') {
// Stop polling if job is done (completed, failed, or partial)
if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') {
const interval = pollingIntervals.current.get(jobId);
if (interval) {
clearInterval(interval);
@@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
setIsSubmitting(true);
setShowConfirmModal(false);
// Use the search query to create a Google Maps search URL
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`;
// Use the search query to create a Google Maps search URL (force English)
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`;
try {
const response = await fetch('/api/scrape', {
@@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
business_address: businessAddress,
rating_snapshot: businessRating,
total_reviews_snapshot: availableReviewCount,
geolocation: userFingerprint.geolocation,
browser_fingerprint: userFingerprint, // Pass full fingerprint
}),
});
@@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
error_message: null,
business_name: businessName,
business_address: businessAddress,
business_category: businessCategory,
rating_snapshot: businessRating,
total_reviews_snapshot: availableReviewCount,
review_topics: null, // Will be populated when job completes
});
return newMap;
});
@@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
case 'completed': return 'text-green-700';
case 'running': return 'text-blue-700';
case 'failed': return 'text-red-700';
case 'partial': return 'text-orange-700';
default: return 'text-gray-800';
}
};
@@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
</svg>
);
case 'partial':
return (
<svg className="w-5 h-5 text-orange-500" fill="currentColor" viewBox="0 0 20 20">
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
</svg>
);
default:
return (
<svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
@@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
)}
</div>
{/* Action Buttons - Show when completed and has reviews */}
{job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && (
{/* Action Buttons - Show when completed, partial, or running with reviews */}
{(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
<div className="flex gap-3">
<button
onClick={async () => {
@@ -818,7 +903,13 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
}
}}
disabled={isLoadingReviews}
className="flex-1 py-4 bg-gradient-to-r from-blue-600 to-indigo-700 text-white rounded-xl font-bold hover:from-blue-700 hover:to-indigo-800 transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 border-blue-500"
className={`flex-1 py-4 text-white rounded-xl font-bold transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 ${
job.status === 'partial'
? 'bg-gradient-to-r from-orange-500 to-amber-600 hover:from-orange-600 hover:to-amber-700 border-orange-400'
: job.status === 'running'
? 'bg-gradient-to-r from-blue-500 to-cyan-600 hover:from-blue-600 hover:to-cyan-700 border-blue-400'
: 'bg-gradient-to-r from-blue-600 to-indigo-700 hover:from-blue-700 hover:to-indigo-800 border-blue-500'
}`}
>
{isLoadingReviews ? (
<>
@@ -830,7 +921,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
</svg>
📊 Open Analytics Dashboard
📊 {job.status === 'running' ? 'Preview Analytics' : job.status === 'partial' ? 'View Partial Data' : 'Open Analytics Dashboard'}
</>
)}
</button>
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `reviews-${job.job_id}.json`;
a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`;
a.click();
}
} catch (err) {
@@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
</div>
)}
{/* Partial Job Warning */}
{job.status === 'partial' && (
<div className="mt-4 p-4 bg-orange-100 border-2 border-orange-300 rounded-lg">
<div className="flex items-start gap-2">
<svg className="w-5 h-5 text-orange-700 flex-shrink-0 mt-0.5" fill="currentColor" viewBox="0 0 20 20">
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
</svg>
<div>
<p className="font-bold text-orange-900">Partial Results</p>
<p className="text-sm text-orange-800 mt-1">
This job was interrupted but {job.reviews_count} reviews were saved.
{job.error_message && <span className="block mt-1 text-orange-700">Reason: {job.error_message}</span>}
</p>
</div>
</div>
</div>
)}
{/* Error Message */}
{job.status === 'failed' && job.error_message && (
<div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">

View File

@@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
// Populate minDate/maxDate/centerDate on reviews for display
reviews.forEach(r => {
if (!r.minDate || !r.maxDate || !r.centerDate) {
const range = parseDateTextToRange(r.date_text);
// Handle both date_text and timestamp field names
const dateText = r.date_text || (r as any).timestamp || '';
const range = parseDateTextToRange(dateText);
r.minDate = range.minDate;
r.maxDate = range.maxDate;
// Calculate centerDate as midpoint
@@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
// Recent reviews (last 30 days - simplified check)
const recentReviews = reviews.filter(r => {
const text = r.date_text.toLowerCase();
return text.includes('day') || text.includes('week') || text.includes('hour');
const text = (r.date_text || (r as any).timestamp || '').toLowerCase();
return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second');
}).length;
// Rating distribution
@@ -278,6 +280,14 @@ function extractNumber(text: string): number {
*/
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
const now = new Date();
// Handle undefined/null dateText
if (!dateText) {
// Return a default range (assume recent - within last month)
const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000);
return { minDate: daysAgo(30), maxDate: now };
}
const text = dateText.toLowerCase();
// Remove "Edited " prefix if present
@@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R
// Filter range: [filterStart, filterEnd]
// Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
return reviews.filter(r => {
const { minDate, maxDate } = parseDateTextToRange(r.date_text);
const dateText = r.date_text || (r as any).timestamp || '';
const { minDate, maxDate } = parseDateTextToRange(dateText);
return minDate <= filterEnd && maxDate >= filterStart;
});
}
@@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
if (!fromDate && !toDate) return reviews;
return reviews.filter(r => {
const reviewDate = parseDateText(r.date_text);
const dateText = r.date_text || (r as any).timestamp || '';
const reviewDate = parseDateText(dateText);
// If only fromDate is set, filter reviews >= fromDate
if (fromDate && !toDate) {
@@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
// Sort reviews by date (newest first)
const sortedReviews = [...reviews]
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text) }))
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') }))
.sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
// Group by month