Add browser fingerprint support and analytics metadata display
- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping - Display review topics from Google Maps in analytics dashboard - Show business category badge in analytics header - Fix date_text null handling in analytics (handle undefined/timestamp fields) - Add review_topics and business_category to JobStatus interface Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
22
Dockerfile
22
Dockerfile
@@ -39,6 +39,13 @@ RUN apt-get update \
|
||||
&& apt-get install -y chromium chromium-driver \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install VNC server and noVNC (browser-based VNC viewer)
|
||||
RUN apt-get update && apt-get install -y \
|
||||
x11vnc \
|
||||
novnc \
|
||||
websockify \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
|
||||
COPY api_server_production.py .
|
||||
COPY config.yaml .
|
||||
|
||||
# Create startup script for Xvfb + API server
|
||||
# Create startup script for Xvfb + VNC + API server
|
||||
RUN echo '#!/bin/bash\n\
|
||||
# Start Xvfb (virtual display) in background\n\
|
||||
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
|
||||
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
|
||||
# Wait for Xvfb to start\n\
|
||||
sleep 2\n\
|
||||
\n\
|
||||
# Start VNC server (no password for local dev, binds to all interfaces)\n\
|
||||
x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
|
||||
\n\
|
||||
# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
|
||||
websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
|
||||
\n\
|
||||
echo "VNC server running on port 5900"\n\
|
||||
echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
|
||||
\n\
|
||||
# Start API server\n\
|
||||
exec python api_server_production.py\n\
|
||||
' > /app/start.sh && chmod +x /app/start.sh
|
||||
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
|
||||
|
||||
USER scraper
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
# Expose ports: API (8000), VNC (5900), noVNC web (6080)
|
||||
EXPOSE 8000 5900 6080
|
||||
|
||||
# Environment variables for Chromium in container
|
||||
ENV DISPLAY=:99
|
||||
|
||||
@@ -133,12 +133,36 @@ app.add_middleware(
|
||||
|
||||
# ==================== Request/Response Models ====================
|
||||
|
||||
class GeolocationModel(BaseModel):
|
||||
"""Geolocation coordinates"""
|
||||
lat: float = Field(..., description="Latitude")
|
||||
lng: float = Field(..., description="Longitude")
|
||||
|
||||
|
||||
class ViewportModel(BaseModel):
|
||||
"""Browser viewport size"""
|
||||
width: int = Field(..., description="Viewport width")
|
||||
height: int = Field(..., description="Viewport height")
|
||||
|
||||
|
||||
class BrowserFingerprintModel(BaseModel):
|
||||
"""Browser fingerprint to replicate user's browser"""
|
||||
geolocation: Optional[GeolocationModel] = None
|
||||
userAgent: Optional[str] = Field(None, description="User agent string")
|
||||
viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
|
||||
timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
|
||||
language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
|
||||
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
|
||||
|
||||
|
||||
class ScrapeRequest(BaseModel):
|
||||
"""Request model for starting a scrape job"""
|
||||
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
|
||||
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
|
||||
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
|
||||
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
|
||||
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
|
||||
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
|
||||
|
||||
|
||||
class JobResponse(BaseModel):
|
||||
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
|
||||
created_at: str
|
||||
started_at: Optional[str] = None
|
||||
completed_at: Optional[str] = None
|
||||
updated_at: Optional[str] = None # Last update time for progress tracking
|
||||
reviews_count: Optional[int] = None
|
||||
total_reviews: Optional[int] = None # Total reviews available for this place
|
||||
scrape_time: Optional[float] = None
|
||||
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
|
||||
# Business metadata
|
||||
business_name: Optional[str] = None
|
||||
business_address: Optional[str] = None
|
||||
business_category: Optional[str] = None # Category (e.g., "Barber shop")
|
||||
review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts
|
||||
|
||||
|
||||
class ReviewsResponse(BaseModel):
|
||||
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
|
||||
raise HTTPException(status_code=500, detail="Database not initialized")
|
||||
|
||||
try:
|
||||
# Merge browser fingerprint into metadata if provided
|
||||
metadata = request.metadata or {}
|
||||
if request.browser_fingerprint:
|
||||
fp = request.browser_fingerprint
|
||||
metadata['browser_fingerprint'] = {
|
||||
"userAgent": fp.userAgent,
|
||||
"timezone": fp.timezone,
|
||||
"language": fp.language,
|
||||
"platform": fp.platform,
|
||||
}
|
||||
if fp.viewport:
|
||||
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
|
||||
if fp.geolocation:
|
||||
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
|
||||
elif request.geolocation:
|
||||
metadata['geolocation'] = {
|
||||
'lat': request.geolocation.lat,
|
||||
'lng': request.geolocation.lng
|
||||
}
|
||||
|
||||
# Create job in database
|
||||
job_id = await db.create_job(
|
||||
url=str(request.url),
|
||||
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
||||
webhook_secret=request.webhook_secret,
|
||||
metadata=request.metadata
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
# Start scraping job in background
|
||||
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
|
||||
if not job:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
# Parse review_topics if it's a string (JSONB might be returned as string)
|
||||
review_topics = job.get('review_topics')
|
||||
if isinstance(review_topics, str):
|
||||
try:
|
||||
review_topics = json.loads(review_topics)
|
||||
except:
|
||||
review_topics = None
|
||||
|
||||
# Extract business info from metadata if available
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
|
||||
business_name = metadata.get('business_name') if metadata else None
|
||||
business_category = metadata.get('business_category') if metadata else None
|
||||
|
||||
return JobResponse(
|
||||
job_id=str(job['job_id']),
|
||||
status=job['status'],
|
||||
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
|
||||
created_at=job['created_at'].isoformat(),
|
||||
started_at=job['started_at'].isoformat() if job['started_at'] else None,
|
||||
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
|
||||
updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
|
||||
reviews_count=job['reviews_count'],
|
||||
total_reviews=job.get('total_reviews'),
|
||||
scrape_time=job['scrape_time'],
|
||||
error_message=job['error_message'],
|
||||
webhook_url=job.get('webhook_url')
|
||||
webhook_url=job.get('webhook_url'),
|
||||
business_name=business_name,
|
||||
business_category=business_category,
|
||||
review_topics=review_topics
|
||||
)
|
||||
|
||||
|
||||
@@ -541,25 +611,32 @@ async def stream_all_jobs():
|
||||
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
|
||||
async def get_job_reviews(job_id: UUID):
|
||||
"""
|
||||
Get the actual reviews data for a completed job.
|
||||
Get reviews data for a job.
|
||||
|
||||
Returns 404 if job not found or not completed yet.
|
||||
Returns reviews for completed, partial, or running jobs (if reviews have been collected).
|
||||
Returns 404 if job not found or no reviews available yet.
|
||||
"""
|
||||
if not db:
|
||||
raise HTTPException(status_code=500, detail="Database not initialized")
|
||||
|
||||
reviews = await db.get_job_reviews(job_id)
|
||||
# Get reviews (includes completed, running, and partial jobs)
|
||||
reviews = await db.get_job_reviews(job_id, include_partial=True)
|
||||
if reviews is None:
|
||||
job = await db.get_job(job_id)
|
||||
if not job:
|
||||
raise HTTPException(status_code=404, detail="Job not found")
|
||||
elif job['status'] != 'completed':
|
||||
elif job['status'] == 'pending':
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Job not completed yet (current status: {job['status']})"
|
||||
detail="Job has not started yet"
|
||||
)
|
||||
elif job['status'] == 'failed':
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
|
||||
)
|
||||
else:
|
||||
raise HTTPException(status_code=404, detail="Reviews data not available")
|
||||
raise HTTPException(status_code=404, detail="No reviews data available yet")
|
||||
|
||||
return ReviewsResponse(
|
||||
job_id=str(job_id),
|
||||
@@ -603,6 +680,15 @@ async def list_jobs(
|
||||
|
||||
business_name = metadata.get('business_name') if metadata else None
|
||||
business_address = metadata.get('business_address') if metadata else None
|
||||
business_category = metadata.get('business_category') if metadata else None
|
||||
|
||||
# Parse review_topics if it's a string
|
||||
review_topics = job.get('review_topics')
|
||||
if isinstance(review_topics, str):
|
||||
try:
|
||||
review_topics = json.loads(review_topics)
|
||||
except:
|
||||
review_topics = None
|
||||
|
||||
result.append(JobResponse(
|
||||
job_id=str(job['job_id']),
|
||||
@@ -615,7 +701,9 @@ async def list_jobs(
|
||||
scrape_time=job.get('scrape_time'),
|
||||
error_message=job.get('error_message'),
|
||||
business_name=business_name,
|
||||
business_address=business_address
|
||||
business_address=business_address,
|
||||
business_category=business_category,
|
||||
review_topics=review_topics
|
||||
))
|
||||
|
||||
return result
|
||||
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
|
||||
Get business card information from Google Maps.
|
||||
Returns business name, address, rating, and review count.
|
||||
|
||||
Uses pre-warmed Chrome worker from pool for instant response.
|
||||
Creates a fresh Chrome instance for reliable results (same as full scraper).
|
||||
This is used to show the business confirmation card in the UI.
|
||||
"""
|
||||
worker = None
|
||||
recycle_worker = False
|
||||
|
||||
try:
|
||||
url = str(request.url)
|
||||
|
||||
# Get pre-warmed worker from validation pool
|
||||
worker = await asyncio.to_thread(get_validation_worker, timeout=10)
|
||||
# Use the SAME scraper algorithm with validation_only=True for early return
|
||||
# Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
|
||||
# Pooled browsers can have cookies/state that cause Google to render pages differently
|
||||
|
||||
if worker:
|
||||
log.info(f"Using worker {worker.worker_id} for business card extraction")
|
||||
# Use the pooled worker (don't close it)
|
||||
result = await asyncio.to_thread(
|
||||
get_business_card_info,
|
||||
url=url,
|
||||
driver=worker.driver,
|
||||
return_driver=True
|
||||
)
|
||||
|
||||
# Check if the result indicates a session error
|
||||
if not result['success'] and result.get('error'):
|
||||
error_msg = result.get('error', '').lower()
|
||||
if 'invalid session' in error_msg or 'session' in error_msg:
|
||||
log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
|
||||
recycle_worker = True
|
||||
# Build fingerprint dict from request
|
||||
fingerprint = None
|
||||
if request.browser_fingerprint:
|
||||
fp = request.browser_fingerprint
|
||||
fingerprint = {
|
||||
"userAgent": fp.userAgent,
|
||||
"timezone": fp.timezone,
|
||||
"language": fp.language,
|
||||
"platform": fp.platform,
|
||||
}
|
||||
if fp.viewport:
|
||||
fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
|
||||
if fp.geolocation:
|
||||
fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
|
||||
log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
|
||||
elif request.geolocation:
|
||||
fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
|
||||
log.info(f"Creating Chrome with geolocation only")
|
||||
else:
|
||||
# Fallback: create temporary worker
|
||||
log.warning("No pooled worker available, creating temporary instance")
|
||||
log.info(f"Creating Chrome with default settings")
|
||||
|
||||
result = await asyncio.to_thread(
|
||||
get_business_card_info,
|
||||
url=url
|
||||
fast_scrape_reviews,
|
||||
url=url,
|
||||
headless=False, # Use Xvfb display
|
||||
validation_only=True, # Return early after getting total_reviews
|
||||
browser_fingerprint=fingerprint # Pass user's browser fingerprint
|
||||
)
|
||||
|
||||
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
|
||||
# Let the actual scraper determine if reviews exist
|
||||
has_business = bool(result.get('name') and result.get('rating'))
|
||||
# Extract validation info from the result
|
||||
validation_info = result.get('validation_info', {})
|
||||
total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
|
||||
name = validation_info.get('name')
|
||||
rating = validation_info.get('rating')
|
||||
category = validation_info.get('category')
|
||||
address = validation_info.get('address')
|
||||
|
||||
# Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
|
||||
has_reviews = bool(name and total_reviews > 0)
|
||||
|
||||
return {
|
||||
"has_reviews": has_business, # Boolean: true if business exists
|
||||
"total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
|
||||
"name": result.get('name'),
|
||||
"address": result.get('address'),
|
||||
"rating": result.get('rating'),
|
||||
"success": result['success'],
|
||||
"has_reviews": has_reviews, # True if business has reviews
|
||||
"total_reviews": total_reviews,
|
||||
"name": name,
|
||||
"address": address,
|
||||
"rating": rating,
|
||||
"category": category,
|
||||
"success": result.get('success', True),
|
||||
"error": result.get('error')
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error checking reviews: {e}")
|
||||
# If it's a session error, recycle the worker
|
||||
if worker:
|
||||
error_msg = str(e).lower()
|
||||
if 'invalid session' in error_msg or 'session' in error_msg:
|
||||
recycle_worker = True
|
||||
|
||||
return {
|
||||
"has_reviews": False,
|
||||
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
finally:
|
||||
# Release worker back to pool (or recycle if broken)
|
||||
if worker:
|
||||
await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
|
||||
|
||||
|
||||
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
|
||||
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
|
||||
job = await db.get_job(job_id)
|
||||
url = job['url']
|
||||
|
||||
# Extract browser fingerprint from metadata if available
|
||||
browser_fingerprint = None
|
||||
metadata = job.get('metadata')
|
||||
if isinstance(metadata, str):
|
||||
try:
|
||||
metadata = json.loads(metadata)
|
||||
except:
|
||||
metadata = None
|
||||
if metadata and 'browser_fingerprint' in metadata:
|
||||
browser_fingerprint = metadata['browser_fingerprint']
|
||||
log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
|
||||
elif metadata and 'geolocation' in metadata:
|
||||
browser_fingerprint = {'geolocation': metadata['geolocation']}
|
||||
log.info(f"Using user geolocation only")
|
||||
|
||||
# Broadcast job started via SSE
|
||||
await broadcast_job_update(job_id_str, "job_started", {
|
||||
"job_id": job_id_str,
|
||||
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
|
||||
# Create log capture instance that we can access for real-time logs
|
||||
log_capture = LogCapture()
|
||||
|
||||
# Track total reviews for incremental saves
|
||||
total_reviews_seen = [None]
|
||||
# Accumulate all reviews for incremental saves (flush_callback receives batches)
|
||||
all_reviews_collected = []
|
||||
|
||||
# Progress callback to update job status with current/total counts AND logs
|
||||
def progress_callback(current_count: int, total_count: int):
|
||||
"""Update job progress and logs from worker thread"""
|
||||
if total_count:
|
||||
total_reviews_seen[0] = total_count
|
||||
|
||||
async def update():
|
||||
# Get current logs from the shared log_capture
|
||||
current_logs = log_capture.get_logs()
|
||||
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
|
||||
# Schedule the coroutine on the event loop
|
||||
asyncio.run_coroutine_threadsafe(update(), loop)
|
||||
|
||||
# Flush callback to save reviews incrementally (crash recovery)
|
||||
# Note: flush_callback receives batches, so we accumulate them
|
||||
def flush_callback(reviews_batch: list):
|
||||
"""Accumulate and save reviews to DB incrementally from worker thread"""
|
||||
# Extend our collection with the new batch
|
||||
all_reviews_collected.extend(reviews_batch)
|
||||
|
||||
async def save():
|
||||
await db.save_reviews_incremental(
|
||||
job_id=job_id,
|
||||
reviews=all_reviews_collected, # Save ALL reviews so far
|
||||
total_reviews=total_reviews_seen[0]
|
||||
)
|
||||
# Schedule the coroutine on the event loop
|
||||
asyncio.run_coroutine_threadsafe(save(), loop)
|
||||
|
||||
# Run scraping with progress callback and shared log capture
|
||||
# headless=False because Docker uses Xvfb virtual display
|
||||
result = await asyncio.to_thread(
|
||||
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
|
||||
url=url,
|
||||
headless=False,
|
||||
progress_callback=progress_callback,
|
||||
log_capture=log_capture
|
||||
log_capture=log_capture,
|
||||
flush_callback=flush_callback,
|
||||
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
|
||||
)
|
||||
|
||||
if result['success']:
|
||||
# Save results to database (including scraper logs)
|
||||
# Save results to database (including scraper logs and review topics)
|
||||
await db.save_job_result(
|
||||
job_id=job_id,
|
||||
reviews=result['reviews'],
|
||||
scrape_time=result['time'],
|
||||
total_reviews=result.get('total_reviews'),
|
||||
scrape_logs=result.get('logs')
|
||||
scrape_logs=result.get('logs'),
|
||||
review_topics=result.get('review_topics')
|
||||
)
|
||||
|
||||
log.info(
|
||||
@@ -898,7 +1030,44 @@ async def run_scraping_job(job_id: UUID):
|
||||
)
|
||||
|
||||
else:
|
||||
# Job failed - save logs for debugging
|
||||
# Job failed - check if we have partial reviews saved
|
||||
current_job = await db.get_job(job_id)
|
||||
partial_count = current_job.get('reviews_count', 0) if current_job else 0
|
||||
|
||||
if partial_count > 0:
|
||||
# Mark as partial - we have some reviews saved
|
||||
await db.mark_job_partial(
|
||||
job_id,
|
||||
error_message=result.get('error', 'Unknown error'),
|
||||
scrape_logs=result.get('logs')
|
||||
)
|
||||
|
||||
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
|
||||
|
||||
# Broadcast job partial via SSE
|
||||
await broadcast_job_update(job_id_str, "job_partial", {
|
||||
"job_id": job_id_str,
|
||||
"status": "partial",
|
||||
"reviews_count": partial_count,
|
||||
"total_reviews": current_job.get('total_reviews'),
|
||||
"error_message": result.get('error'),
|
||||
"logs": result.get('logs', [])
|
||||
})
|
||||
|
||||
# Send partial webhook if configured
|
||||
if job.get('webhook_url'):
|
||||
webhook_manager = WebhookManager()
|
||||
await webhook_manager.send_job_completed_webhook(
|
||||
webhook_url=job['webhook_url'],
|
||||
job_id=job_id,
|
||||
status='partial',
|
||||
reviews_count=partial_count,
|
||||
error_message=result.get('error'),
|
||||
secret=job.get('webhook_secret'),
|
||||
db=db
|
||||
)
|
||||
else:
|
||||
# No reviews saved - mark as failed
|
||||
await db.update_job_status(
|
||||
job_id,
|
||||
JobStatus.FAILED,
|
||||
@@ -933,6 +1102,44 @@ async def run_scraping_job(job_id: UUID):
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Check if we have partial reviews saved
|
||||
current_job = await db.get_job(job_id)
|
||||
partial_count = current_job.get('reviews_count', 0) if current_job else 0
|
||||
|
||||
if partial_count > 0:
|
||||
# Mark as partial - we have some reviews saved
|
||||
await db.mark_job_partial(
|
||||
job_id,
|
||||
error_message=str(e),
|
||||
scrape_logs=log_capture.get_logs() if log_capture else None
|
||||
)
|
||||
|
||||
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
|
||||
|
||||
# Broadcast job partial via SSE
|
||||
await broadcast_job_update(job_id_str, "job_partial", {
|
||||
"job_id": job_id_str,
|
||||
"status": "partial",
|
||||
"reviews_count": partial_count,
|
||||
"total_reviews": current_job.get('total_reviews'),
|
||||
"error_message": str(e),
|
||||
"logs": log_capture.get_logs() if log_capture else []
|
||||
})
|
||||
|
||||
# Send partial webhook
|
||||
if current_job and current_job.get('webhook_url'):
|
||||
webhook_manager = WebhookManager()
|
||||
await webhook_manager.send_job_completed_webhook(
|
||||
webhook_url=current_job['webhook_url'],
|
||||
job_id=job_id,
|
||||
status='partial',
|
||||
reviews_count=partial_count,
|
||||
error_message=str(e),
|
||||
secret=current_job.get('webhook_secret'),
|
||||
db=db
|
||||
)
|
||||
else:
|
||||
# No reviews saved - mark as failed
|
||||
await db.update_job_status(
|
||||
job_id,
|
||||
JobStatus.FAILED,
|
||||
@@ -948,15 +1155,14 @@ async def run_scraping_job(job_id: UUID):
|
||||
})
|
||||
|
||||
# Send failure webhook
|
||||
job = await db.get_job(job_id)
|
||||
if job and job.get('webhook_url'):
|
||||
if current_job and current_job.get('webhook_url'):
|
||||
webhook_manager = WebhookManager()
|
||||
await webhook_manager.send_job_completed_webhook(
|
||||
webhook_url=job['webhook_url'],
|
||||
webhook_url=current_job['webhook_url'],
|
||||
job_id=job_id,
|
||||
status='failed',
|
||||
error_message=str(e),
|
||||
secret=job.get('webhook_secret'),
|
||||
secret=current_job.get('webhook_secret'),
|
||||
db=db
|
||||
)
|
||||
|
||||
|
||||
@@ -39,6 +39,8 @@ services:
|
||||
- CHROME_BIN=/usr/bin/chromium
|
||||
ports:
|
||||
- "8000:8000"
|
||||
- "5900:5900" # VNC port (for VNC client)
|
||||
- "6080:6080" # noVNC web interface (browser access)
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
|
||||
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
CANCELLED = "cancelled"
|
||||
PARTIAL = "partial" # Job crashed but has partial reviews saved
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
@@ -69,6 +70,7 @@ class DatabaseManager:
|
||||
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
updated_at TIMESTAMP,
|
||||
|
||||
reviews_count INTEGER,
|
||||
total_reviews INTEGER,
|
||||
@@ -79,7 +81,7 @@ class DatabaseManager:
|
||||
metadata JSONB,
|
||||
scrape_logs JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
|
||||
);
|
||||
""")
|
||||
|
||||
@@ -88,6 +90,24 @@ class DatabaseManager:
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||
""")
|
||||
|
||||
# Add updated_at column if it doesn't exist (for incremental progress tracking)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
|
||||
""")
|
||||
|
||||
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
|
||||
""")
|
||||
|
||||
# Update constraint to include 'partial' status (for existing databases)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
|
||||
""")
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
@@ -187,13 +207,15 @@ class DatabaseManager:
|
||||
created_at,
|
||||
started_at,
|
||||
completed_at,
|
||||
updated_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata,
|
||||
scrape_logs
|
||||
scrape_logs,
|
||||
review_topics
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -203,17 +225,27 @@ class DatabaseManager:
|
||||
|
||||
return dict(row)
|
||||
|
||||
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
|
||||
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
|
||||
"""
|
||||
Get reviews for a specific job.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
include_partial: If True, also return reviews for running and partial jobs
|
||||
|
||||
Returns:
|
||||
List of reviews or None if not found/not completed
|
||||
List of reviews or None if not found/no reviews
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
if include_partial:
|
||||
# Return reviews for completed, running, or partial jobs
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
|
||||
""", job_id)
|
||||
else:
|
||||
# Only return reviews for completed jobs
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
@@ -278,7 +310,8 @@ class DatabaseManager:
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None,
|
||||
review_topics: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
@@ -289,8 +322,33 @@ class DatabaseManager:
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
scrape_logs: List of log entries from the scraper
|
||||
review_topics: List of topic filter dictionaries with topic and count
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
# If reviews list is empty, check if job already has reviews from incremental saves
|
||||
# This happens when flush_callback was used during scraping
|
||||
if not reviews:
|
||||
existing = await conn.fetchval(
|
||||
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
|
||||
)
|
||||
if existing and existing > 0:
|
||||
# Job has reviews from incremental saves, don't overwrite reviews_data
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
status = 'completed',
|
||||
completed_at = NOW(),
|
||||
total_reviews = COALESCE($2, total_reviews),
|
||||
scrape_time = $3,
|
||||
scrape_logs = $4::jsonb,
|
||||
review_topics = $5::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, total_reviews, scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None,
|
||||
json.dumps(review_topics) if review_topics else None)
|
||||
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
|
||||
return
|
||||
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
@@ -300,13 +358,70 @@ class DatabaseManager:
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5,
|
||||
scrape_logs = $6::jsonb
|
||||
scrape_logs = $6::jsonb,
|
||||
review_topics = $7::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None)
|
||||
json.dumps(scrape_logs) if scrape_logs else None,
|
||||
json.dumps(review_topics) if review_topics else None)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def save_reviews_incremental(
|
||||
self,
|
||||
job_id: UUID,
|
||||
reviews: List[Dict[str, Any]],
|
||||
total_reviews: Optional[int] = None
|
||||
):
|
||||
"""
|
||||
Save reviews incrementally during scraping.
|
||||
Called on each flush to preserve progress in case of crash.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
reviews: ALL reviews collected so far (not just new ones)
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
reviews_count = $2,
|
||||
total_reviews = COALESCE($3, total_reviews),
|
||||
reviews_data = $4::jsonb,
|
||||
updated_at = NOW()
|
||||
WHERE job_id = $1 AND status = 'running'
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews))
|
||||
|
||||
log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def mark_job_partial(
|
||||
self,
|
||||
job_id: UUID,
|
||||
error_message: str,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Mark a job as partial (crashed but has some reviews saved).
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
error_message: Error that caused the crash
|
||||
scrape_logs: Log entries from the scraper
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
status = 'partial',
|
||||
completed_at = NOW(),
|
||||
error_message = $2,
|
||||
scrape_logs = $3::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
|
||||
|
||||
log.info(f"Marked job {job_id} as partial due to: {error_message}")
|
||||
|
||||
async def list_jobs(
|
||||
self,
|
||||
status: Optional[JobStatus] = None,
|
||||
@@ -337,7 +452,8 @@ class DatabaseManager:
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
@@ -355,7 +471,8 @@ class DatabaseManager:
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
|
||||
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
|
||||
|
||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
||||
progress_callback=None) -> dict:
|
||||
progress_callback=None, validation_only: bool = False) -> dict:
|
||||
"""
|
||||
Scrape Google Maps reviews.
|
||||
|
||||
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Track total reviews (persists across refreshes)
|
||||
total_reviews = [None] # Use list for closure mutation
|
||||
|
||||
# Store business info extracted from overview (before clicking reviews tab)
|
||||
business_info_cache = [None]
|
||||
|
||||
# Hard refresh counter
|
||||
hard_refresh_count = [0]
|
||||
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
||||
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return None
|
||||
|
||||
def setup_reviews_page(is_refresh=False):
|
||||
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
|
||||
"""
|
||||
Setup the reviews page for scraping.
|
||||
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
||||
Can be called after initial load or after a hard refresh.
|
||||
|
||||
If validation_only_mode=True, returns early after extracting business info
|
||||
without clicking reviews tab or finding scroll container.
|
||||
"""
|
||||
nonlocal total_reviews
|
||||
|
||||
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Navigate to URL (only on initial load or refresh)
|
||||
if not is_refresh:
|
||||
# Reset browser state by navigating to blank page first
|
||||
# This clears any stale state from pooled browser sessions
|
||||
try:
|
||||
driver.get("about:blank")
|
||||
time.sleep(0.1)
|
||||
except:
|
||||
pass
|
||||
log.info(f"🌐 Loading: {url[:80]}...")
|
||||
else:
|
||||
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Reload original URL after consent
|
||||
log.info(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
# Wait for page to settle after consent reload
|
||||
time.sleep(1)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
|
||||
# This captures name, rating, category, address while they're visible
|
||||
# Only on first load (don't overwrite if we already have it)
|
||||
if total_reviews[0] is None:
|
||||
if total_reviews[0] is None or business_info_cache[0] is None:
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
try:
|
||||
count = driver.execute_script("""
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
info = driver.execute_script("""
|
||||
var result = {
|
||||
total_reviews: null,
|
||||
name: null,
|
||||
rating: null,
|
||||
category: null,
|
||||
address: null
|
||||
};
|
||||
|
||||
// Business name from h1
|
||||
var h1 = document.querySelector('h1');
|
||||
if (h1) result.name = h1.textContent.trim();
|
||||
|
||||
// Category - use jsaction attribute (robust selector)
|
||||
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||
if (catBtn) result.category = catBtn.textContent.trim();
|
||||
|
||||
// Rating and review count from span[role="img"] aria-labels
|
||||
var spans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < spans.length; i++) {
|
||||
var label = spans[i].getAttribute('aria-label') || '';
|
||||
|
||||
// Rating: "4.8 stars"
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||
if (rMatch && !result.rating) {
|
||||
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||
}
|
||||
|
||||
// Reviews: "79 reviews"
|
||||
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (revMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
||||
// Address from button
|
||||
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||
if (addrBtn) {
|
||||
var label = addrBtn.getAttribute('aria-label');
|
||||
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||
}
|
||||
|
||||
return result;
|
||||
""")
|
||||
if count:
|
||||
total_reviews[0] = count
|
||||
log.info(f"📊 Total reviews on page: {count}")
|
||||
|
||||
if info:
|
||||
if info.get('total_reviews') and total_reviews[0] is None:
|
||||
total_reviews[0] = info['total_reviews']
|
||||
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
|
||||
if info.get('name') and business_info_cache[0] is None:
|
||||
business_info_cache[0] = info
|
||||
log.info(f"📍 Business: {info.get('name')}")
|
||||
if total_reviews[0] and business_info_cache[0]:
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
|
||||
if validation_only_mode:
|
||||
log.info("📋 Validation mode: returning early (skipping reviews tab)")
|
||||
return ("validation_done", None)
|
||||
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
start = time.time()
|
||||
tab_clicked = False
|
||||
tabs_logged = False
|
||||
while time.time() - start < 5: # Max 5s for tabs
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||
# Log available tabs once for debugging
|
||||
if not tabs_logged and tabs:
|
||||
tabs_logged = True
|
||||
tab_texts = [t.text for t in tabs]
|
||||
log.info(f" Available tabs: {tab_texts}")
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
if not is_refresh:
|
||||
log.info(f" Clicking reviews tab: '{tab.text}'")
|
||||
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
|
||||
if total_reviews[0] is None:
|
||||
import re
|
||||
# Try pattern with parentheses: "Reviews (79)"
|
||||
match = re.search(r'\((\d+)\)', tab.text)
|
||||
if match:
|
||||
total_reviews[0] = int(match.group(1))
|
||||
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||
else:
|
||||
# Try pattern with newline: "Reviews\n79"
|
||||
match = re.search(r'(\d+)', tab.text)
|
||||
if match:
|
||||
total_reviews[0] = int(match.group(1))
|
||||
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||
tab.click()
|
||||
tab_clicked = True
|
||||
break
|
||||
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
return scroll_container, stop_scrolling
|
||||
|
||||
# Initial page setup
|
||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
|
||||
# Helper to extract review topics from the reviews tab
|
||||
def extract_review_topics():
|
||||
"""Extract review topic filters from radiogroup (robust selectors)."""
|
||||
try:
|
||||
topics = driver.execute_script("""
|
||||
var topics = [];
|
||||
|
||||
// Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
|
||||
var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
|
||||
|
||||
if (!container) {
|
||||
// Fallback: any radiogroup in the reviews area
|
||||
container = document.querySelector('div[role="radiogroup"]');
|
||||
}
|
||||
|
||||
if (container) {
|
||||
var buttons = container.querySelectorAll('button[role="radio"]');
|
||||
for (var btn of buttons) {
|
||||
var label = btn.getAttribute('aria-label') || '';
|
||||
// Parse "hair salon, mentioned in 4 reviews" format
|
||||
var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
|
||||
if (match) {
|
||||
topics.push({
|
||||
topic: match[1].trim(),
|
||||
count: parseInt(match[2])
|
||||
});
|
||||
} else if (label && !label.toLowerCase().includes('all review')) {
|
||||
// Fallback: try to extract from child spans
|
||||
var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
|
||||
var nameSpan = btn.querySelector('.uEubGf, span:first-child');
|
||||
if (nameSpan) {
|
||||
var name = nameSpan.textContent.trim();
|
||||
var count = countSpan ? parseInt(countSpan.textContent) : 0;
|
||||
if (name && name.toLowerCase() !== 'all') {
|
||||
topics.push({topic: name, count: count || 0});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return topics;
|
||||
""")
|
||||
return topics or []
|
||||
except:
|
||||
return []
|
||||
|
||||
# Initial page setup (pass validation_only to skip unnecessary steps)
|
||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
|
||||
|
||||
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
|
||||
# setup_reviews_page returns ("validation_done", None) in this case
|
||||
if validation_only or scroll_container == "validation_done":
|
||||
# Use the business info captured from Overview (before clicking reviews tab)
|
||||
business_info = business_info_cache[0] or {}
|
||||
|
||||
return {
|
||||
"reviews": [],
|
||||
"total": total_reviews[0] or 0,
|
||||
"scrolls": 0,
|
||||
"error": None,
|
||||
"validation_info": {
|
||||
"name": business_info.get("name"),
|
||||
"rating": business_info.get("rating"),
|
||||
"category": business_info.get("category"),
|
||||
"address": business_info.get("address"),
|
||||
"total_reviews": total_reviews[0]
|
||||
}
|
||||
}
|
||||
|
||||
if not scroll_container:
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||
|
||||
# Extract review topics after reviews tab is loaded (before scrolling begins)
|
||||
time.sleep(0.5) # Brief wait for topic filters to render
|
||||
review_topics = extract_review_topics()
|
||||
if review_topics:
|
||||
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
|
||||
|
||||
def get_api_reviews():
|
||||
"""Get reviews from intercepted API responses."""
|
||||
api_revs = []
|
||||
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
"total_flushed": total_flushed[0],
|
||||
"checks": check_num,
|
||||
"url": url,
|
||||
"logs": log.get_logs()
|
||||
"logs": log.get_logs(),
|
||||
"review_topics": review_topics # Topic filters with mention counts
|
||||
}
|
||||
|
||||
|
||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||
progress_callback=None, driver=None, return_driver: bool = False,
|
||||
log_capture: LogCapture = None):
|
||||
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
||||
browser_fingerprint: dict = None):
|
||||
"""
|
||||
Production-compatible wrapper for scrape_reviews.
|
||||
Matches the API expected by job_manager.py.
|
||||
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
driver: Existing driver instance to reuse
|
||||
return_driver: If True, return driver in result
|
||||
log_capture: Optional LogCapture instance for real-time log access
|
||||
browser_fingerprint: Optional dict with user's browser fingerprint:
|
||||
- geolocation: {lat, lng}
|
||||
- userAgent: string
|
||||
- viewport: {width, height}
|
||||
- timezone: string (e.g., "Europe/Madrid")
|
||||
- language: string (e.g., "en-US")
|
||||
- platform: string (e.g., "MacIntel")
|
||||
|
||||
Returns:
|
||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
log_capture = log_capture or LogCapture()
|
||||
|
||||
try:
|
||||
# Extract fingerprint settings
|
||||
fp = browser_fingerprint or {}
|
||||
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
|
||||
geolocation = fp.get('geolocation')
|
||||
timezone = fp.get('timezone')
|
||||
language = fp.get('language', 'en-US')
|
||||
|
||||
# Create driver if not provided
|
||||
if not driver:
|
||||
driver = Driver(
|
||||
uc=True,
|
||||
headless=headless,
|
||||
page_load_strategy="normal",
|
||||
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
agent=user_agent # Use user's actual user agent
|
||||
)
|
||||
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
||||
# Set viewport to match user's screen
|
||||
driver.set_window_size(viewport['width'], viewport['height'])
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
# Apply browser fingerprint settings via CDP
|
||||
try:
|
||||
# Set timezone if provided
|
||||
if timezone:
|
||||
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
|
||||
log_capture.info(f"Set timezone to {timezone}")
|
||||
|
||||
# Set locale/language
|
||||
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
|
||||
|
||||
# Set geolocation
|
||||
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': geolocation['lat'],
|
||||
'longitude': geolocation['lng'],
|
||||
'accuracy': 1000 # ~1km accuracy for IP-based location
|
||||
})
|
||||
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
|
||||
else:
|
||||
# Default to US (Boston, MA) if no geolocation provided
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log_capture.info("Set geolocation to US (Boston, MA)")
|
||||
log_capture.info("Set geolocation to US (Boston, MA) [default]")
|
||||
|
||||
if fp:
|
||||
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
|
||||
except Exception as e:
|
||||
log_capture.warning(f"Could not set geolocation: {e}")
|
||||
log_capture.warning(f"Could not apply fingerprint settings: {e}")
|
||||
|
||||
# Add URL parameters for consistent results
|
||||
if 'hl=' not in url:
|
||||
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Create progress wrapper if callback provided
|
||||
flush_callback = None
|
||||
if progress_callback:
|
||||
# Create combined flush callback for progress + external handler
|
||||
external_flush = flush_callback # Save external callback
|
||||
internal_flush = None
|
||||
if progress_callback or external_flush:
|
||||
collected = [0]
|
||||
def flush_with_progress(reviews_batch):
|
||||
collected[0] += len(reviews_batch)
|
||||
def combined_flush(reviews_batch):
|
||||
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
|
||||
if progress_callback:
|
||||
progress_callback(collected[0], None)
|
||||
flush_callback = flush_with_progress
|
||||
if external_flush:
|
||||
external_flush(reviews_batch) # Pass reviews to external handler
|
||||
internal_flush = combined_flush
|
||||
|
||||
# Run the scraper with progress callback for real-time updates
|
||||
result = scrape_reviews(
|
||||
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
url=url,
|
||||
max_reviews=999999, # Effectively unlimited
|
||||
timeout_no_new=15,
|
||||
flush_callback=flush_callback,
|
||||
flush_callback=internal_flush,
|
||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||
log_capture=log_capture,
|
||||
progress_callback=progress_callback # Pass through for real-time log updates
|
||||
progress_callback=progress_callback, # Pass through for real-time log updates
|
||||
validation_only=validation_only # Return early if just validating
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"time": elapsed,
|
||||
"success": True,
|
||||
"error": None,
|
||||
"logs": result.get("logs", [])
|
||||
"logs": result.get("logs", []),
|
||||
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
|
||||
}
|
||||
|
||||
# Include validation_info if in validation_only mode
|
||||
if validation_only and "validation_info" in result:
|
||||
response["validation_info"] = result["validation_info"]
|
||||
|
||||
if return_driver:
|
||||
response["driver"] = driver
|
||||
elif should_close_driver:
|
||||
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
}
|
||||
|
||||
|
||||
def extract_about_info(driver, url: str = None) -> dict:
|
||||
"""
|
||||
Extract About section info from Google Maps (Accessibility, Amenities, etc.).
|
||||
|
||||
This function should be called AFTER reviews are scraped if about info is needed,
|
||||
as it navigates to a different tab.
|
||||
|
||||
Args:
|
||||
driver: Selenium WebDriver instance (already on the business page)
|
||||
url: Optional URL to navigate to first (if not already on the page)
|
||||
|
||||
Returns:
|
||||
dict with section names as keys, each containing list of features
|
||||
"""
|
||||
try:
|
||||
# Navigate if URL provided
|
||||
if url:
|
||||
# Force English
|
||||
if 'hl=' not in url:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
driver.get(url)
|
||||
time.sleep(1)
|
||||
|
||||
# Click About tab using robust selectors
|
||||
clicked = driver.execute_script("""
|
||||
// Try multiple selectors for about tab
|
||||
var selectors = [
|
||||
'button[aria-label*="About"]',
|
||||
'button[data-tab-index="2"]',
|
||||
'div[role="tablist"] button:nth-child(3)',
|
||||
'button[jsaction*="about"]'
|
||||
];
|
||||
|
||||
for (var sel of selectors) {
|
||||
var btn = document.querySelector(sel);
|
||||
if (btn && btn.textContent.toLowerCase().includes('about')) {
|
||||
btn.click();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: find by text content
|
||||
var buttons = document.querySelectorAll('button');
|
||||
for (var btn of buttons) {
|
||||
if (btn.textContent.trim().toLowerCase() === 'about') {
|
||||
btn.click();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
""")
|
||||
|
||||
if not clicked:
|
||||
return {}
|
||||
|
||||
time.sleep(1.5) # Wait for about tab to load
|
||||
|
||||
# Extract about sections using aria-labels (robust)
|
||||
about = driver.execute_script("""
|
||||
var about = {};
|
||||
|
||||
// Find the about region by aria-label or role
|
||||
var container = document.querySelector('div[role="region"][aria-label*="About"]');
|
||||
|
||||
if (!container) {
|
||||
// Fallback: look for the scrollable area with sections
|
||||
container = document.querySelector('.m6QErb[aria-label*="About"]');
|
||||
}
|
||||
|
||||
if (!container) {
|
||||
// Last resort: find sections by h2 headers
|
||||
container = document;
|
||||
}
|
||||
|
||||
// Find all section headers (h2 elements)
|
||||
var sections = container.querySelectorAll('h2');
|
||||
|
||||
for (var h2 of sections) {
|
||||
var sectionName = h2.textContent.trim();
|
||||
var items = [];
|
||||
|
||||
// Find the ul list following this h2
|
||||
var parent = h2.closest('.iP2t7d, div');
|
||||
if (parent) {
|
||||
var listItems = parent.querySelectorAll('li span[aria-label]');
|
||||
for (var li of listItems) {
|
||||
var label = li.getAttribute('aria-label');
|
||||
if (label) {
|
||||
// Parse "Has toilet" or "No wheelchair-accessible car park"
|
||||
var hasFeature = !label.toLowerCase().startsWith('no ');
|
||||
var featureName = label.replace(/^(Has |No )/i, '');
|
||||
items.push({
|
||||
feature: featureName,
|
||||
available: hasFeature
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (sectionName && items.length > 0) {
|
||||
about[sectionName] = items;
|
||||
}
|
||||
}
|
||||
|
||||
return about;
|
||||
""")
|
||||
|
||||
return about or {}
|
||||
|
||||
except Exception as e:
|
||||
return {"error": str(e)}
|
||||
|
||||
|
||||
# Test function
|
||||
if __name__ == "__main__":
|
||||
from seleniumbase import Driver
|
||||
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
dict with: name, address, rating, total_reviews, success, error, time
|
||||
"""
|
||||
from seleniumbase import Driver
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
start_time = time.time()
|
||||
driver_provided = driver is not None
|
||||
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clear state if reusing a pooled driver (ensures clean page load)
|
||||
if driver_provided:
|
||||
try:
|
||||
driver.delete_all_cookies()
|
||||
driver.get("about:blank")
|
||||
except:
|
||||
pass
|
||||
# Don't clear state - Google may serve different content based on session history
|
||||
# The scraper doesn't reset state, so validation shouldn't either
|
||||
|
||||
# Force English interface for consistent parsing
|
||||
if 'hl=' not in url:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Navigate to URL
|
||||
driver.get(url)
|
||||
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
while time.time() - start < 5:
|
||||
if "consent.google" in driver.current_url:
|
||||
try:
|
||||
# Try multiple approaches to find and click accept button
|
||||
clicked = False
|
||||
|
||||
# Method 1: Find by aria-label (most reliable for Google consent)
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
|
||||
btn.click()
|
||||
clicked = True
|
||||
break
|
||||
|
||||
# Method 2: Find by text content
|
||||
if not clicked:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
driver.get(url)
|
||||
clicked = True
|
||||
break
|
||||
except:
|
||||
|
||||
if clicked:
|
||||
time.sleep(0.5) # Brief wait for consent to process
|
||||
driver.get(url) # Reload the target URL
|
||||
time.sleep(0.5) # Wait for reload
|
||||
except Exception as e:
|
||||
pass
|
||||
break
|
||||
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Log current URL after consent handling
|
||||
try:
|
||||
current_url = driver.current_url
|
||||
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Wait for page to fully render before polling (tabs may load dynamically)
|
||||
time.sleep(2)
|
||||
|
||||
# Poll for business info (same pattern as total_reviews extraction)
|
||||
info = {"name": None, "rating": None, "total_reviews": None, "address": None}
|
||||
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
|
||||
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
debug_logged = False
|
||||
while time.time() - start < 10:
|
||||
try:
|
||||
info = driver.execute_script("""
|
||||
var result = {name: null, rating: null, total_reviews: null, address: null};
|
||||
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
|
||||
|
||||
// Business name from h1
|
||||
var h1 = document.querySelector('h1');
|
||||
if (h1) result.name = h1.textContent.trim();
|
||||
|
||||
// Rating and reviews from span[role="img"] aria-labels
|
||||
// Same pattern as scrape_reviews for consistency
|
||||
// Category - use jsaction attribute (robust, survives class changes)
|
||||
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||
if (catBtn) result.category = catBtn.textContent.trim();
|
||||
|
||||
// Fallback: look for button after rating that's not a link
|
||||
if (!result.category) {
|
||||
var buttons = document.querySelectorAll('button');
|
||||
for (var btn of buttons) {
|
||||
var text = btn.textContent.trim();
|
||||
// Categories are short words, no numbers, not navigation
|
||||
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
|
||||
!text.match(/review|star|direction|save|share|photo/i)) {
|
||||
// Check if it's near the rating area
|
||||
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
|
||||
if (parent) {
|
||||
result.category = text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rating from span[role="img"] aria-labels
|
||||
var spans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < spans.length; i++) {
|
||||
var label = spans[i].getAttribute('aria-label') || '';
|
||||
|
||||
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
|
||||
// Collect debug info for all aria-labels
|
||||
if (label) {
|
||||
result.debug.push('img-aria: ' + label);
|
||||
}
|
||||
|
||||
// Rating: "4.8 stars" (English forced via hl=en)
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||
if (rMatch && !result.rating) {
|
||||
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||
}
|
||||
|
||||
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
|
||||
// Plus Spanish "reseña" which doesn't contain "review"
|
||||
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
|
||||
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
|
||||
// Try direct format first: "79 reviews"
|
||||
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
|
||||
if (revMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
|
||||
}
|
||||
|
||||
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
|
||||
if (!result.total_reviews) {
|
||||
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
|
||||
if (combinedMatch) {
|
||||
var countStr = combinedMatch[1].replace(/,/g, '');
|
||||
if (countStr.includes('k')) {
|
||||
// Handle "9k+" format
|
||||
result.total_reviews = parseInt(countStr) * 1000;
|
||||
} else {
|
||||
result.total_reviews = parseInt(countStr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also collect tab button texts for debugging (include full text including numbers)
|
||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (var j = 0; j < tabs.length; j++) {
|
||||
var tabText = tabs[j].textContent.trim();
|
||||
result.debug.push('tab: ' + tabText);
|
||||
// Also try to extract review count from tab text like "Reviews (79)"
|
||||
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
|
||||
var tabMatch = tabText.match(/\\((\\d+)\\)/);
|
||||
if (tabMatch) {
|
||||
result.total_reviews = parseInt(tabMatch[1]);
|
||||
result.debug.push('Found reviews in tab: ' + tabText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also check ALL buttons for reviews count
|
||||
var allButtons = document.querySelectorAll('button');
|
||||
for (var b = 0; b < allButtons.length; b++) {
|
||||
var btnText = allButtons[b].textContent || '';
|
||||
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
|
||||
var numMatch = btnText.match(/\\((\\d+)\\)/);
|
||||
if (numMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(numMatch[1]);
|
||||
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we're on search results vs place page
|
||||
result.debug.push('title: ' + document.title);
|
||||
result.debug.push('url: ' + window.location.href.substring(0, 80));
|
||||
|
||||
// Check for search results list
|
||||
var searchResults = document.querySelectorAll('div[role="feed"] > div');
|
||||
result.debug.push('search_results_count: ' + searchResults.length);
|
||||
|
||||
// Fallback: Get review count from Reviews tab button "Reviews (79)"
|
||||
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
|
||||
if (!result.total_reviews) {
|
||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (var tab of tabs) {
|
||||
var text = tab.textContent.toLowerCase();
|
||||
if (text.includes('review')) {
|
||||
var match = tab.textContent.match(/\\((\\d+)\\)/);
|
||||
if (match) {
|
||||
result.total_reviews = parseInt(match[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback 2: Look for any button with "Reviews" and a number
|
||||
if (!result.total_reviews) {
|
||||
var buttons = document.querySelectorAll('button');
|
||||
for (var btn of buttons) {
|
||||
var text = btn.textContent;
|
||||
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
|
||||
var numMatch = text.match(/\\((\\d+)\\)/);
|
||||
if (numMatch) {
|
||||
result.total_reviews = parseInt(numMatch[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||
if (addrBtn) {
|
||||
var label = addrBtn.getAttribute('aria-label');
|
||||
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
|
||||
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||
}
|
||||
|
||||
return result;
|
||||
""")
|
||||
# Exit early if we have the essentials
|
||||
if info.get("name") and info.get("total_reviews") is not None:
|
||||
# Exit early if we have the essentials (name found AND reviews count > 0)
|
||||
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
|
||||
break
|
||||
|
||||
# Log debug info once after 3 seconds
|
||||
if not debug_logged and time.time() - start > 3:
|
||||
debug_logged = True
|
||||
debug_info = info.get("debug", [])
|
||||
if debug_info:
|
||||
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
|
||||
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
|
||||
for d in debug_info[:10]: # First 10 debug items
|
||||
log.info(f" {d}")
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1) # 100ms between polls
|
||||
|
||||
# Final debug log if still no reviews
|
||||
if not info.get("total_reviews"):
|
||||
debug_info = info.get("debug", [])
|
||||
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
|
||||
if debug_info:
|
||||
log.warning(f" Debug items: {debug_info[:10]}")
|
||||
|
||||
return {
|
||||
"name": info.get("name"),
|
||||
"address": info.get("address"),
|
||||
"rating": info.get("rating"),
|
||||
"total_reviews": info.get("total_reviews"),
|
||||
"category": info.get("category"),
|
||||
"success": bool(info.get("name")),
|
||||
"error": None,
|
||||
"time": time.time() - start_time
|
||||
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
"address": None,
|
||||
"rating": None,
|
||||
"total_reviews": None,
|
||||
"category": None,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"time": time.time() - start_time
|
||||
|
||||
@@ -27,6 +27,8 @@ interface SelectedJob {
|
||||
jobId: string;
|
||||
newCount?: number;
|
||||
previousJobId?: string;
|
||||
businessCategory?: string;
|
||||
reviewTopics?: { topic: string; count: number }[];
|
||||
}
|
||||
|
||||
type ViewType = 'newScrape' | 'jobs' | 'reports';
|
||||
@@ -106,6 +108,8 @@ export default function Home() {
|
||||
jobId: job.job_id,
|
||||
newCount: data.new_count,
|
||||
previousJobId: previousJob?.job_id,
|
||||
businessCategory: job.business_category || undefined,
|
||||
reviewTopics: job.review_topics || undefined,
|
||||
});
|
||||
setActiveView('reports');
|
||||
}
|
||||
@@ -155,7 +159,7 @@ export default function Home() {
|
||||
Back to Reports
|
||||
</button>
|
||||
</div>
|
||||
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} />
|
||||
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} businessCategory={selectedJob.businessCategory} reviewTopics={selectedJob.reviewTopics} />
|
||||
</div>
|
||||
) : (
|
||||
<div className="h-full overflow-y-auto p-6">
|
||||
|
||||
@@ -22,14 +22,21 @@ interface ReviewWithNew extends Review {
|
||||
photo_urls?: string[] | null;
|
||||
}
|
||||
|
||||
interface ReviewTopic {
|
||||
topic: string;
|
||||
count: number;
|
||||
}
|
||||
|
||||
interface ReviewAnalyticsProps {
|
||||
reviews: ReviewWithNew[];
|
||||
businessName?: string;
|
||||
businessUrl?: string;
|
||||
newCount?: number;
|
||||
businessCategory?: string;
|
||||
reviewTopics?: ReviewTopic[];
|
||||
}
|
||||
|
||||
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) {
|
||||
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) {
|
||||
const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
|
||||
const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
|
||||
const [globalFilter, setGlobalFilter] = useState('');
|
||||
@@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
|
||||
{/* Header */}
|
||||
<div className="flex items-center justify-between">
|
||||
<div>
|
||||
<div className="flex items-center gap-3">
|
||||
<h2 className="text-3xl font-bold text-gray-900">
|
||||
{businessName || 'Review Analytics'}
|
||||
</h2>
|
||||
{businessCategory && (
|
||||
<span className="px-3 py-1 bg-purple-100 text-purple-800 text-sm font-medium rounded-full border border-purple-300">
|
||||
{businessCategory}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{businessUrl && (
|
||||
<a
|
||||
href={businessUrl}
|
||||
@@ -821,6 +835,33 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Review Topics - from Google Maps */}
|
||||
{reviewTopics && reviewTopics.length > 0 && (
|
||||
<div className="bg-white border-2 border-gray-300 rounded-xl p-5 shadow-md">
|
||||
<div className="flex items-center gap-2 mb-4">
|
||||
<MessageSquare className="w-6 h-6 text-indigo-600" />
|
||||
<h3 className="text-lg font-bold text-gray-900">What People Talk About</h3>
|
||||
<span className="text-sm text-gray-500">({reviewTopics.length} topics from Google)</span>
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{reviewTopics.slice(0, 15).map((topic, idx) => (
|
||||
<div
|
||||
key={idx}
|
||||
className="px-3 py-1.5 bg-gradient-to-r from-indigo-50 to-purple-50 border border-indigo-200 rounded-full flex items-center gap-2"
|
||||
>
|
||||
<span className="text-sm font-medium text-indigo-800">{topic.topic}</span>
|
||||
<span className="text-xs bg-indigo-200 text-indigo-900 px-1.5 py-0.5 rounded-full font-bold">
|
||||
{topic.count}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
{reviewTopics.length > 15 && (
|
||||
<p className="text-sm text-gray-500 mt-3">+{reviewTopics.length - 15} more topics</p>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Rating & Volume Timeline */}
|
||||
{timelineData.length > 0 && (
|
||||
<div className={`bg-white rounded-xl p-6 shadow-md transition-all ${
|
||||
|
||||
@@ -15,7 +15,7 @@ interface Review {
|
||||
|
||||
export interface JobStatus {
|
||||
job_id: string;
|
||||
status: 'pending' | 'running' | 'completed' | 'failed';
|
||||
status: 'pending' | 'running' | 'completed' | 'failed' | 'partial';
|
||||
url: string;
|
||||
created_at: string;
|
||||
started_at: string | null;
|
||||
@@ -28,8 +28,11 @@ export interface JobStatus {
|
||||
// Business metadata for tracking and comparison
|
||||
business_name: string | null;
|
||||
business_address: string | null;
|
||||
business_category: string | null;
|
||||
rating_snapshot: number | null;
|
||||
total_reviews_snapshot: number | null;
|
||||
// Review topics extracted from Google Maps
|
||||
review_topics: { topic: string; count: number }[] | null;
|
||||
}
|
||||
|
||||
interface ScraperTestProps {
|
||||
@@ -56,7 +59,64 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
const [businessRating, setBusinessRating] = useState<number | null>(null);
|
||||
const [businessImage, setBusinessImage] = useState<string | null>(null);
|
||||
const [businessCategory, setBusinessCategory] = useState<string | null>(null);
|
||||
const [userFingerprint, setUserFingerprint] = useState<{
|
||||
geolocation?: {lat: number, lng: number},
|
||||
userAgent?: string,
|
||||
viewport?: {width: number, height: number},
|
||||
timezone?: string,
|
||||
language?: string,
|
||||
platform?: string
|
||||
}>({});
|
||||
const debounceRef = useRef<NodeJS.Timeout | null>(null);
|
||||
|
||||
// Collect browser fingerprint on mount (no permissions needed)
|
||||
useEffect(() => {
|
||||
const collectFingerprint = async () => {
|
||||
const fingerprint: typeof userFingerprint = {};
|
||||
|
||||
// User agent
|
||||
fingerprint.userAgent = navigator.userAgent;
|
||||
|
||||
// Screen/viewport size
|
||||
fingerprint.viewport = {
|
||||
width: window.screen.width,
|
||||
height: window.screen.height
|
||||
};
|
||||
|
||||
// Timezone
|
||||
fingerprint.timezone = Intl.DateTimeFormat().resolvedOptions().timeZone;
|
||||
|
||||
// Language
|
||||
fingerprint.language = navigator.language;
|
||||
|
||||
// Platform
|
||||
fingerprint.platform = navigator.platform;
|
||||
|
||||
// Get approximate location from IP (no permission needed)
|
||||
try {
|
||||
const response = await fetch('https://ipapi.co/json/', {
|
||||
signal: AbortSignal.timeout(3000)
|
||||
});
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
if (data.latitude && data.longitude) {
|
||||
fingerprint.geolocation = {
|
||||
lat: data.latitude,
|
||||
lng: data.longitude
|
||||
};
|
||||
console.log('IP location:', data.city, data.country_name);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('IP geolocation not available');
|
||||
}
|
||||
|
||||
setUserFingerprint(fingerprint);
|
||||
console.log('Browser fingerprint:', fingerprint);
|
||||
};
|
||||
|
||||
collectFingerprint();
|
||||
}, []);
|
||||
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
|
||||
const abortControllerRef = useRef<AbortController | null>(null);
|
||||
|
||||
@@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
setBusinessCategory(null);
|
||||
setError('');
|
||||
|
||||
// Create new abort controller with 30 second timeout
|
||||
// Create new abort controller with 60 second timeout (validation can be slow)
|
||||
const controller = new AbortController();
|
||||
abortControllerRef.current = controller;
|
||||
const timeoutId = setTimeout(() => controller.abort(), 30000);
|
||||
const timeoutId = setTimeout(() => controller.abort(), 60000);
|
||||
|
||||
try {
|
||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`;
|
||||
// Force English with hl=en parameter
|
||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`;
|
||||
|
||||
const response = await fetch('/api/check-reviews', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ url }),
|
||||
body: JSON.stringify({
|
||||
url,
|
||||
geolocation: userFingerprint.geolocation,
|
||||
browser_fingerprint: userFingerprint // Pass full fingerprint
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
@@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
} catch (err) {
|
||||
clearTimeout(timeoutId);
|
||||
|
||||
// Ignore AbortError (happens when user starts a new validation)
|
||||
// Check if this is a timeout abort vs user-initiated abort
|
||||
if (err instanceof Error && err.name === 'AbortError') {
|
||||
// Check if it was a timeout (controller still matches) or user started new search
|
||||
if (abortControllerRef.current === controller) {
|
||||
// Timeout - show error
|
||||
console.error('Validation timed out');
|
||||
setError('Validation timed out. Please try again.');
|
||||
setHasReviews(false);
|
||||
setAvailableReviewCount(0);
|
||||
} else {
|
||||
// User started a new search - just return silently
|
||||
console.log('Validation cancelled (new validation started)');
|
||||
return;
|
||||
}
|
||||
|
||||
} else {
|
||||
console.error('Error getting business info:', err);
|
||||
// Error occurred
|
||||
setHasReviews(false);
|
||||
setAvailableReviewCount(0);
|
||||
} finally {
|
||||
// Only clear loading state if this controller wasn't aborted
|
||||
if (!controller.signal.aborted) {
|
||||
setIsCheckingReviews(false);
|
||||
}
|
||||
} finally {
|
||||
clearTimeout(timeoutId);
|
||||
// Always clear loading state (even on timeout)
|
||||
setIsCheckingReviews(false);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
return newMap;
|
||||
});
|
||||
|
||||
// Stop polling if job is done
|
||||
if (data.status === 'completed' || data.status === 'failed') {
|
||||
// Stop polling if job is done (completed, failed, or partial)
|
||||
if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') {
|
||||
const interval = pollingIntervals.current.get(jobId);
|
||||
if (interval) {
|
||||
clearInterval(interval);
|
||||
@@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
setIsSubmitting(true);
|
||||
setShowConfirmModal(false);
|
||||
|
||||
// Use the search query to create a Google Maps search URL
|
||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`;
|
||||
// Use the search query to create a Google Maps search URL (force English)
|
||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`;
|
||||
|
||||
try {
|
||||
const response = await fetch('/api/scrape', {
|
||||
@@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
business_address: businessAddress,
|
||||
rating_snapshot: businessRating,
|
||||
total_reviews_snapshot: availableReviewCount,
|
||||
geolocation: userFingerprint.geolocation,
|
||||
browser_fingerprint: userFingerprint, // Pass full fingerprint
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
error_message: null,
|
||||
business_name: businessName,
|
||||
business_address: businessAddress,
|
||||
business_category: businessCategory,
|
||||
rating_snapshot: businessRating,
|
||||
total_reviews_snapshot: availableReviewCount,
|
||||
review_topics: null, // Will be populated when job completes
|
||||
});
|
||||
return newMap;
|
||||
});
|
||||
@@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
case 'completed': return 'text-green-700';
|
||||
case 'running': return 'text-blue-700';
|
||||
case 'failed': return 'text-red-700';
|
||||
case 'partial': return 'text-orange-700';
|
||||
default: return 'text-gray-800';
|
||||
}
|
||||
};
|
||||
@@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
|
||||
</svg>
|
||||
);
|
||||
case 'partial':
|
||||
return (
|
||||
<svg className="w-5 h-5 text-orange-500" fill="currentColor" viewBox="0 0 20 20">
|
||||
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
|
||||
</svg>
|
||||
);
|
||||
default:
|
||||
return (
|
||||
<svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
|
||||
@@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Action Buttons - Show when completed and has reviews */}
|
||||
{job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && (
|
||||
{/* Action Buttons - Show when completed, partial, or running with reviews */}
|
||||
{(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
|
||||
<div className="flex gap-3">
|
||||
<button
|
||||
onClick={async () => {
|
||||
@@ -818,7 +903,13 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
}
|
||||
}}
|
||||
disabled={isLoadingReviews}
|
||||
className="flex-1 py-4 bg-gradient-to-r from-blue-600 to-indigo-700 text-white rounded-xl font-bold hover:from-blue-700 hover:to-indigo-800 transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 border-blue-500"
|
||||
className={`flex-1 py-4 text-white rounded-xl font-bold transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 ${
|
||||
job.status === 'partial'
|
||||
? 'bg-gradient-to-r from-orange-500 to-amber-600 hover:from-orange-600 hover:to-amber-700 border-orange-400'
|
||||
: job.status === 'running'
|
||||
? 'bg-gradient-to-r from-blue-500 to-cyan-600 hover:from-blue-600 hover:to-cyan-700 border-blue-400'
|
||||
: 'bg-gradient-to-r from-blue-600 to-indigo-700 hover:from-blue-700 hover:to-indigo-800 border-blue-500'
|
||||
}`}
|
||||
>
|
||||
{isLoadingReviews ? (
|
||||
<>
|
||||
@@ -830,7 +921,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
|
||||
</svg>
|
||||
📊 Open Analytics Dashboard
|
||||
📊 {job.status === 'running' ? 'Preview Analytics' : job.status === 'partial' ? 'View Partial Data' : 'Open Analytics Dashboard'}
|
||||
</>
|
||||
)}
|
||||
</button>
|
||||
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `reviews-${job.job_id}.json`;
|
||||
a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`;
|
||||
a.click();
|
||||
}
|
||||
} catch (err) {
|
||||
@@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Partial Job Warning */}
|
||||
{job.status === 'partial' && (
|
||||
<div className="mt-4 p-4 bg-orange-100 border-2 border-orange-300 rounded-lg">
|
||||
<div className="flex items-start gap-2">
|
||||
<svg className="w-5 h-5 text-orange-700 flex-shrink-0 mt-0.5" fill="currentColor" viewBox="0 0 20 20">
|
||||
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
|
||||
</svg>
|
||||
<div>
|
||||
<p className="font-bold text-orange-900">Partial Results</p>
|
||||
<p className="text-sm text-orange-800 mt-1">
|
||||
This job was interrupted but {job.reviews_count} reviews were saved.
|
||||
{job.error_message && <span className="block mt-1 text-orange-700">Reason: {job.error_message}</span>}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Error Message */}
|
||||
{job.status === 'failed' && job.error_message && (
|
||||
<div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">
|
||||
|
||||
@@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
||||
// Populate minDate/maxDate/centerDate on reviews for display
|
||||
reviews.forEach(r => {
|
||||
if (!r.minDate || !r.maxDate || !r.centerDate) {
|
||||
const range = parseDateTextToRange(r.date_text);
|
||||
// Handle both date_text and timestamp field names
|
||||
const dateText = r.date_text || (r as any).timestamp || '';
|
||||
const range = parseDateTextToRange(dateText);
|
||||
r.minDate = range.minDate;
|
||||
r.maxDate = range.maxDate;
|
||||
// Calculate centerDate as midpoint
|
||||
@@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
||||
|
||||
// Recent reviews (last 30 days - simplified check)
|
||||
const recentReviews = reviews.filter(r => {
|
||||
const text = r.date_text.toLowerCase();
|
||||
return text.includes('day') || text.includes('week') || text.includes('hour');
|
||||
const text = (r.date_text || (r as any).timestamp || '').toLowerCase();
|
||||
return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second');
|
||||
}).length;
|
||||
|
||||
// Rating distribution
|
||||
@@ -278,6 +280,14 @@ function extractNumber(text: string): number {
|
||||
*/
|
||||
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
|
||||
const now = new Date();
|
||||
|
||||
// Handle undefined/null dateText
|
||||
if (!dateText) {
|
||||
// Return a default range (assume recent - within last month)
|
||||
const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000);
|
||||
return { minDate: daysAgo(30), maxDate: now };
|
||||
}
|
||||
|
||||
const text = dateText.toLowerCase();
|
||||
|
||||
// Remove "Edited " prefix if present
|
||||
@@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R
|
||||
// Filter range: [filterStart, filterEnd]
|
||||
// Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
|
||||
return reviews.filter(r => {
|
||||
const { minDate, maxDate } = parseDateTextToRange(r.date_text);
|
||||
const dateText = r.date_text || (r as any).timestamp || '';
|
||||
const { minDate, maxDate } = parseDateTextToRange(dateText);
|
||||
return minDate <= filterEnd && maxDate >= filterStart;
|
||||
});
|
||||
}
|
||||
@@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
|
||||
if (!fromDate && !toDate) return reviews;
|
||||
|
||||
return reviews.filter(r => {
|
||||
const reviewDate = parseDateText(r.date_text);
|
||||
const dateText = r.date_text || (r as any).timestamp || '';
|
||||
const reviewDate = parseDateText(dateText);
|
||||
|
||||
// If only fromDate is set, filter reviews >= fromDate
|
||||
if (fromDate && !toDate) {
|
||||
@@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
|
||||
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
|
||||
// Sort reviews by date (newest first)
|
||||
const sortedReviews = [...reviews]
|
||||
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text) }))
|
||||
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') }))
|
||||
.sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
|
||||
|
||||
// Group by month
|
||||
|
||||
Reference in New Issue
Block a user