Add browser fingerprint support and analytics metadata display
- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
22
Dockerfile
22
Dockerfile
@@ -39,6 +39,13 @@ RUN apt-get update \
|
|||||||
&& apt-get install -y chromium chromium-driver \
|
&& apt-get install -y chromium chromium-driver \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install VNC server and noVNC (browser-based VNC viewer)
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
x11vnc \
|
||||||
|
novnc \
|
||||||
|
websockify \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Set working directory
|
# Set working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
@@ -51,7 +58,7 @@ COPY modules/ ./modules/
|
|||||||
COPY api_server_production.py .
|
COPY api_server_production.py .
|
||||||
COPY config.yaml .
|
COPY config.yaml .
|
||||||
|
|
||||||
# Create startup script for Xvfb + API server
|
# Create startup script for Xvfb + VNC + API server
|
||||||
RUN echo '#!/bin/bash\n\
|
RUN echo '#!/bin/bash\n\
|
||||||
# Start Xvfb (virtual display) in background\n\
|
# Start Xvfb (virtual display) in background\n\
|
||||||
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
|
Xvfb :99 -screen 0 1920x1080x24 -ac +extension GLX +render -noreset &\n\
|
||||||
@@ -60,6 +67,15 @@ export DISPLAY=:99\n\
|
|||||||
# Wait for Xvfb to start\n\
|
# Wait for Xvfb to start\n\
|
||||||
sleep 2\n\
|
sleep 2\n\
|
||||||
\n\
|
\n\
|
||||||
|
# Start VNC server (no password for local dev, binds to all interfaces)\n\
|
||||||
|
x11vnc -display :99 -forever -shared -rfbport 5900 -nopw -bg\n\
|
||||||
|
\n\
|
||||||
|
# Start noVNC websocket proxy (browser access at http://localhost:6080/vnc.html)\n\
|
||||||
|
websockify --web=/usr/share/novnc/ 6080 localhost:5900 &\n\
|
||||||
|
\n\
|
||||||
|
echo "VNC server running on port 5900"\n\
|
||||||
|
echo "noVNC web interface at http://localhost:6080/vnc.html"\n\
|
||||||
|
\n\
|
||||||
# Start API server\n\
|
# Start API server\n\
|
||||||
exec python api_server_production.py\n\
|
exec python api_server_production.py\n\
|
||||||
' > /app/start.sh && chmod +x /app/start.sh
|
' > /app/start.sh && chmod +x /app/start.sh
|
||||||
@@ -71,8 +87,8 @@ RUN useradd -m -u 1000 scraper && \
|
|||||||
|
|
||||||
USER scraper
|
USER scraper
|
||||||
|
|
||||||
# Expose port
|
# Expose ports: API (8000), VNC (5900), noVNC web (6080)
|
||||||
EXPOSE 8000
|
EXPOSE 8000 5900 6080
|
||||||
|
|
||||||
# Environment variables for Chromium in container
|
# Environment variables for Chromium in container
|
||||||
ENV DISPLAY=:99
|
ENV DISPLAY=:99
|
||||||
|
|||||||
@@ -133,12 +133,36 @@ app.add_middleware(
|
|||||||
|
|
||||||
# ==================== Request/Response Models ====================
|
# ==================== Request/Response Models ====================
|
||||||
|
|
||||||
|
class GeolocationModel(BaseModel):
|
||||||
|
"""Geolocation coordinates"""
|
||||||
|
lat: float = Field(..., description="Latitude")
|
||||||
|
lng: float = Field(..., description="Longitude")
|
||||||
|
|
||||||
|
|
||||||
|
class ViewportModel(BaseModel):
|
||||||
|
"""Browser viewport size"""
|
||||||
|
width: int = Field(..., description="Viewport width")
|
||||||
|
height: int = Field(..., description="Viewport height")
|
||||||
|
|
||||||
|
|
||||||
|
class BrowserFingerprintModel(BaseModel):
|
||||||
|
"""Browser fingerprint to replicate user's browser"""
|
||||||
|
geolocation: Optional[GeolocationModel] = None
|
||||||
|
userAgent: Optional[str] = Field(None, description="User agent string")
|
||||||
|
viewport: Optional[ViewportModel] = Field(None, description="Screen resolution")
|
||||||
|
timezone: Optional[str] = Field(None, description="Timezone (e.g., Europe/Madrid)")
|
||||||
|
language: Optional[str] = Field(None, description="Browser language (e.g., en-US)")
|
||||||
|
platform: Optional[str] = Field(None, description="Platform (e.g., MacIntel, Win32)")
|
||||||
|
|
||||||
|
|
||||||
class ScrapeRequest(BaseModel):
|
class ScrapeRequest(BaseModel):
|
||||||
"""Request model for starting a scrape job"""
|
"""Request model for starting a scrape job"""
|
||||||
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
|
url: HttpUrl = Field(..., description="Google Maps URL to scrape")
|
||||||
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
|
webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for async notifications")
|
||||||
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
|
webhook_secret: Optional[str] = Field(None, description="Secret for webhook HMAC signature")
|
||||||
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
|
metadata: Optional[Dict[str, Any]] = Field(None, description="Optional custom metadata")
|
||||||
|
geolocation: Optional[GeolocationModel] = Field(None, description="User's geolocation for Chrome")
|
||||||
|
browser_fingerprint: Optional[BrowserFingerprintModel] = Field(None, description="User's browser fingerprint")
|
||||||
|
|
||||||
|
|
||||||
class JobResponse(BaseModel):
|
class JobResponse(BaseModel):
|
||||||
@@ -149,6 +173,7 @@ class JobResponse(BaseModel):
|
|||||||
created_at: str
|
created_at: str
|
||||||
started_at: Optional[str] = None
|
started_at: Optional[str] = None
|
||||||
completed_at: Optional[str] = None
|
completed_at: Optional[str] = None
|
||||||
|
updated_at: Optional[str] = None # Last update time for progress tracking
|
||||||
reviews_count: Optional[int] = None
|
reviews_count: Optional[int] = None
|
||||||
total_reviews: Optional[int] = None # Total reviews available for this place
|
total_reviews: Optional[int] = None # Total reviews available for this place
|
||||||
scrape_time: Optional[float] = None
|
scrape_time: Optional[float] = None
|
||||||
@@ -157,6 +182,8 @@ class JobResponse(BaseModel):
|
|||||||
# Business metadata
|
# Business metadata
|
||||||
business_name: Optional[str] = None
|
business_name: Optional[str] = None
|
||||||
business_address: Optional[str] = None
|
business_address: Optional[str] = None
|
||||||
|
business_category: Optional[str] = None # Category (e.g., "Barber shop")
|
||||||
|
review_topics: Optional[List[Dict[str, Any]]] = None # Topic filters with mention counts
|
||||||
|
|
||||||
|
|
||||||
class ReviewsResponse(BaseModel):
|
class ReviewsResponse(BaseModel):
|
||||||
@@ -206,12 +233,32 @@ async def start_scrape(request: ScrapeRequest):
|
|||||||
raise HTTPException(status_code=500, detail="Database not initialized")
|
raise HTTPException(status_code=500, detail="Database not initialized")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Merge browser fingerprint into metadata if provided
|
||||||
|
metadata = request.metadata or {}
|
||||||
|
if request.browser_fingerprint:
|
||||||
|
fp = request.browser_fingerprint
|
||||||
|
metadata['browser_fingerprint'] = {
|
||||||
|
"userAgent": fp.userAgent,
|
||||||
|
"timezone": fp.timezone,
|
||||||
|
"language": fp.language,
|
||||||
|
"platform": fp.platform,
|
||||||
|
}
|
||||||
|
if fp.viewport:
|
||||||
|
metadata['browser_fingerprint']['viewport'] = {"width": fp.viewport.width, "height": fp.viewport.height}
|
||||||
|
if fp.geolocation:
|
||||||
|
metadata['browser_fingerprint']['geolocation'] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
|
||||||
|
elif request.geolocation:
|
||||||
|
metadata['geolocation'] = {
|
||||||
|
'lat': request.geolocation.lat,
|
||||||
|
'lng': request.geolocation.lng
|
||||||
|
}
|
||||||
|
|
||||||
# Create job in database
|
# Create job in database
|
||||||
job_id = await db.create_job(
|
job_id = await db.create_job(
|
||||||
url=str(request.url),
|
url=str(request.url),
|
||||||
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
webhook_url=str(request.webhook_url) if request.webhook_url else None,
|
||||||
webhook_secret=request.webhook_secret,
|
webhook_secret=request.webhook_secret,
|
||||||
metadata=request.metadata
|
metadata=metadata
|
||||||
)
|
)
|
||||||
|
|
||||||
# Start scraping job in background
|
# Start scraping job in background
|
||||||
@@ -240,6 +287,25 @@ async def get_job(job_id: UUID):
|
|||||||
if not job:
|
if not job:
|
||||||
raise HTTPException(status_code=404, detail="Job not found")
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
|
|
||||||
|
# Parse review_topics if it's a string (JSONB might be returned as string)
|
||||||
|
review_topics = job.get('review_topics')
|
||||||
|
if isinstance(review_topics, str):
|
||||||
|
try:
|
||||||
|
review_topics = json.loads(review_topics)
|
||||||
|
except:
|
||||||
|
review_topics = None
|
||||||
|
|
||||||
|
# Extract business info from metadata if available
|
||||||
|
metadata = job.get('metadata')
|
||||||
|
if isinstance(metadata, str):
|
||||||
|
try:
|
||||||
|
metadata = json.loads(metadata)
|
||||||
|
except:
|
||||||
|
metadata = None
|
||||||
|
|
||||||
|
business_name = metadata.get('business_name') if metadata else None
|
||||||
|
business_category = metadata.get('business_category') if metadata else None
|
||||||
|
|
||||||
return JobResponse(
|
return JobResponse(
|
||||||
job_id=str(job['job_id']),
|
job_id=str(job['job_id']),
|
||||||
status=job['status'],
|
status=job['status'],
|
||||||
@@ -247,11 +313,15 @@ async def get_job(job_id: UUID):
|
|||||||
created_at=job['created_at'].isoformat(),
|
created_at=job['created_at'].isoformat(),
|
||||||
started_at=job['started_at'].isoformat() if job['started_at'] else None,
|
started_at=job['started_at'].isoformat() if job['started_at'] else None,
|
||||||
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
|
completed_at=job['completed_at'].isoformat() if job['completed_at'] else None,
|
||||||
|
updated_at=job['updated_at'].isoformat() if job.get('updated_at') else None,
|
||||||
reviews_count=job['reviews_count'],
|
reviews_count=job['reviews_count'],
|
||||||
total_reviews=job.get('total_reviews'),
|
total_reviews=job.get('total_reviews'),
|
||||||
scrape_time=job['scrape_time'],
|
scrape_time=job['scrape_time'],
|
||||||
error_message=job['error_message'],
|
error_message=job['error_message'],
|
||||||
webhook_url=job.get('webhook_url')
|
webhook_url=job.get('webhook_url'),
|
||||||
|
business_name=business_name,
|
||||||
|
business_category=business_category,
|
||||||
|
review_topics=review_topics
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -541,25 +611,32 @@ async def stream_all_jobs():
|
|||||||
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
|
@app.get("/jobs/{job_id}/reviews", response_model=ReviewsResponse, summary="Get Job Reviews")
|
||||||
async def get_job_reviews(job_id: UUID):
|
async def get_job_reviews(job_id: UUID):
|
||||||
"""
|
"""
|
||||||
Get the actual reviews data for a completed job.
|
Get reviews data for a job.
|
||||||
|
|
||||||
Returns 404 if job not found or not completed yet.
|
Returns reviews for completed, partial, or running jobs (if reviews have been collected).
|
||||||
|
Returns 404 if job not found or no reviews available yet.
|
||||||
"""
|
"""
|
||||||
if not db:
|
if not db:
|
||||||
raise HTTPException(status_code=500, detail="Database not initialized")
|
raise HTTPException(status_code=500, detail="Database not initialized")
|
||||||
|
|
||||||
reviews = await db.get_job_reviews(job_id)
|
# Get reviews (includes completed, running, and partial jobs)
|
||||||
|
reviews = await db.get_job_reviews(job_id, include_partial=True)
|
||||||
if reviews is None:
|
if reviews is None:
|
||||||
job = await db.get_job(job_id)
|
job = await db.get_job(job_id)
|
||||||
if not job:
|
if not job:
|
||||||
raise HTTPException(status_code=404, detail="Job not found")
|
raise HTTPException(status_code=404, detail="Job not found")
|
||||||
elif job['status'] != 'completed':
|
elif job['status'] == 'pending':
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=400,
|
status_code=400,
|
||||||
detail=f"Job not completed yet (current status: {job['status']})"
|
detail="Job has not started yet"
|
||||||
|
)
|
||||||
|
elif job['status'] == 'failed':
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Job failed without saving any reviews: {job.get('error_message', 'Unknown error')}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise HTTPException(status_code=404, detail="Reviews data not available")
|
raise HTTPException(status_code=404, detail="No reviews data available yet")
|
||||||
|
|
||||||
return ReviewsResponse(
|
return ReviewsResponse(
|
||||||
job_id=str(job_id),
|
job_id=str(job_id),
|
||||||
@@ -603,6 +680,15 @@ async def list_jobs(
|
|||||||
|
|
||||||
business_name = metadata.get('business_name') if metadata else None
|
business_name = metadata.get('business_name') if metadata else None
|
||||||
business_address = metadata.get('business_address') if metadata else None
|
business_address = metadata.get('business_address') if metadata else None
|
||||||
|
business_category = metadata.get('business_category') if metadata else None
|
||||||
|
|
||||||
|
# Parse review_topics if it's a string
|
||||||
|
review_topics = job.get('review_topics')
|
||||||
|
if isinstance(review_topics, str):
|
||||||
|
try:
|
||||||
|
review_topics = json.loads(review_topics)
|
||||||
|
except:
|
||||||
|
review_topics = None
|
||||||
|
|
||||||
result.append(JobResponse(
|
result.append(JobResponse(
|
||||||
job_id=str(job['job_id']),
|
job_id=str(job['job_id']),
|
||||||
@@ -615,7 +701,9 @@ async def list_jobs(
|
|||||||
scrape_time=job.get('scrape_time'),
|
scrape_time=job.get('scrape_time'),
|
||||||
error_message=job.get('error_message'),
|
error_message=job.get('error_message'),
|
||||||
business_name=business_name,
|
business_name=business_name,
|
||||||
business_address=business_address
|
business_address=business_address,
|
||||||
|
business_category=business_category,
|
||||||
|
review_topics=review_topics
|
||||||
))
|
))
|
||||||
|
|
||||||
return result
|
return result
|
||||||
@@ -640,63 +728,69 @@ async def check_reviews(request: ScrapeRequest):
|
|||||||
Get business card information from Google Maps.
|
Get business card information from Google Maps.
|
||||||
Returns business name, address, rating, and review count.
|
Returns business name, address, rating, and review count.
|
||||||
|
|
||||||
Uses pre-warmed Chrome worker from pool for instant response.
|
Creates a fresh Chrome instance for reliable results (same as full scraper).
|
||||||
This is used to show the business confirmation card in the UI.
|
This is used to show the business confirmation card in the UI.
|
||||||
"""
|
"""
|
||||||
worker = None
|
|
||||||
recycle_worker = False
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url = str(request.url)
|
url = str(request.url)
|
||||||
|
|
||||||
# Get pre-warmed worker from validation pool
|
# Use the SAME scraper algorithm with validation_only=True for early return
|
||||||
worker = await asyncio.to_thread(get_validation_worker, timeout=10)
|
# Creates a fresh Chrome instance (same as full scraper) to avoid stale browser state
|
||||||
|
# Pooled browsers can have cookies/state that cause Google to render pages differently
|
||||||
|
|
||||||
if worker:
|
# Build fingerprint dict from request
|
||||||
log.info(f"Using worker {worker.worker_id} for business card extraction")
|
fingerprint = None
|
||||||
# Use the pooled worker (don't close it)
|
if request.browser_fingerprint:
|
||||||
result = await asyncio.to_thread(
|
fp = request.browser_fingerprint
|
||||||
get_business_card_info,
|
fingerprint = {
|
||||||
url=url,
|
"userAgent": fp.userAgent,
|
||||||
driver=worker.driver,
|
"timezone": fp.timezone,
|
||||||
return_driver=True
|
"language": fp.language,
|
||||||
)
|
"platform": fp.platform,
|
||||||
|
}
|
||||||
# Check if the result indicates a session error
|
if fp.viewport:
|
||||||
if not result['success'] and result.get('error'):
|
fingerprint["viewport"] = {"width": fp.viewport.width, "height": fp.viewport.height}
|
||||||
error_msg = result.get('error', '').lower()
|
if fp.geolocation:
|
||||||
if 'invalid session' in error_msg or 'session' in error_msg:
|
fingerprint["geolocation"] = {"lat": fp.geolocation.lat, "lng": fp.geolocation.lng}
|
||||||
log.warning(f"Worker {worker.worker_id} has invalid session, will recycle")
|
log.info(f"Creating Chrome with user fingerprint: {fp.platform}, {fp.timezone}")
|
||||||
recycle_worker = True
|
elif request.geolocation:
|
||||||
|
fingerprint = {"geolocation": {"lat": request.geolocation.lat, "lng": request.geolocation.lng}}
|
||||||
|
log.info(f"Creating Chrome with geolocation only")
|
||||||
else:
|
else:
|
||||||
# Fallback: create temporary worker
|
log.info(f"Creating Chrome with default settings")
|
||||||
log.warning("No pooled worker available, creating temporary instance")
|
|
||||||
result = await asyncio.to_thread(
|
|
||||||
get_business_card_info,
|
|
||||||
url=url
|
|
||||||
)
|
|
||||||
|
|
||||||
# SIMPLIFIED VALIDATION: If we found a business (name + rating), assume it has reviews
|
result = await asyncio.to_thread(
|
||||||
# Let the actual scraper determine if reviews exist
|
fast_scrape_reviews,
|
||||||
has_business = bool(result.get('name') and result.get('rating'))
|
url=url,
|
||||||
|
headless=False, # Use Xvfb display
|
||||||
|
validation_only=True, # Return early after getting total_reviews
|
||||||
|
browser_fingerprint=fingerprint # Pass user's browser fingerprint
|
||||||
|
)
|
||||||
|
|
||||||
|
# Extract validation info from the result
|
||||||
|
validation_info = result.get('validation_info', {})
|
||||||
|
total_reviews = validation_info.get('total_reviews') or result.get('total_reviews') or 0
|
||||||
|
name = validation_info.get('name')
|
||||||
|
rating = validation_info.get('rating')
|
||||||
|
category = validation_info.get('category')
|
||||||
|
address = validation_info.get('address')
|
||||||
|
|
||||||
|
# Has reviews if we found a business with the Reviews tab (indicated by total_reviews > 0)
|
||||||
|
has_reviews = bool(name and total_reviews > 0)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"has_reviews": has_business, # Boolean: true if business exists
|
"has_reviews": has_reviews, # True if business has reviews
|
||||||
"total_reviews": result.get('total_reviews') or 0, # Show 0 if unknown
|
"total_reviews": total_reviews,
|
||||||
"name": result.get('name'),
|
"name": name,
|
||||||
"address": result.get('address'),
|
"address": address,
|
||||||
"rating": result.get('rating'),
|
"rating": rating,
|
||||||
"success": result['success'],
|
"category": category,
|
||||||
|
"success": result.get('success', True),
|
||||||
"error": result.get('error')
|
"error": result.get('error')
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Error checking reviews: {e}")
|
log.error(f"Error checking reviews: {e}")
|
||||||
# If it's a session error, recycle the worker
|
|
||||||
if worker:
|
|
||||||
error_msg = str(e).lower()
|
|
||||||
if 'invalid session' in error_msg or 'session' in error_msg:
|
|
||||||
recycle_worker = True
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"has_reviews": False,
|
"has_reviews": False,
|
||||||
@@ -704,10 +798,6 @@ async def check_reviews(request: ScrapeRequest):
|
|||||||
"success": False,
|
"success": False,
|
||||||
"error": str(e)
|
"error": str(e)
|
||||||
}
|
}
|
||||||
finally:
|
|
||||||
# Release worker back to pool (or recycle if broken)
|
|
||||||
if worker:
|
|
||||||
await asyncio.to_thread(release_validation_worker, worker, recycle=recycle_worker)
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
|
@app.get("/stats", response_model=StatsResponse, summary="Get Statistics")
|
||||||
@@ -808,6 +898,21 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
job = await db.get_job(job_id)
|
job = await db.get_job(job_id)
|
||||||
url = job['url']
|
url = job['url']
|
||||||
|
|
||||||
|
# Extract browser fingerprint from metadata if available
|
||||||
|
browser_fingerprint = None
|
||||||
|
metadata = job.get('metadata')
|
||||||
|
if isinstance(metadata, str):
|
||||||
|
try:
|
||||||
|
metadata = json.loads(metadata)
|
||||||
|
except:
|
||||||
|
metadata = None
|
||||||
|
if metadata and 'browser_fingerprint' in metadata:
|
||||||
|
browser_fingerprint = metadata['browser_fingerprint']
|
||||||
|
log.info(f"Using user fingerprint: {browser_fingerprint.get('platform')}, {browser_fingerprint.get('timezone')}")
|
||||||
|
elif metadata and 'geolocation' in metadata:
|
||||||
|
browser_fingerprint = {'geolocation': metadata['geolocation']}
|
||||||
|
log.info(f"Using user geolocation only")
|
||||||
|
|
||||||
# Broadcast job started via SSE
|
# Broadcast job started via SSE
|
||||||
await broadcast_job_update(job_id_str, "job_started", {
|
await broadcast_job_update(job_id_str, "job_started", {
|
||||||
"job_id": job_id_str,
|
"job_id": job_id_str,
|
||||||
@@ -821,9 +926,17 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
# Create log capture instance that we can access for real-time logs
|
# Create log capture instance that we can access for real-time logs
|
||||||
log_capture = LogCapture()
|
log_capture = LogCapture()
|
||||||
|
|
||||||
|
# Track total reviews for incremental saves
|
||||||
|
total_reviews_seen = [None]
|
||||||
|
# Accumulate all reviews for incremental saves (flush_callback receives batches)
|
||||||
|
all_reviews_collected = []
|
||||||
|
|
||||||
# Progress callback to update job status with current/total counts AND logs
|
# Progress callback to update job status with current/total counts AND logs
|
||||||
def progress_callback(current_count: int, total_count: int):
|
def progress_callback(current_count: int, total_count: int):
|
||||||
"""Update job progress and logs from worker thread"""
|
"""Update job progress and logs from worker thread"""
|
||||||
|
if total_count:
|
||||||
|
total_reviews_seen[0] = total_count
|
||||||
|
|
||||||
async def update():
|
async def update():
|
||||||
# Get current logs from the shared log_capture
|
# Get current logs from the shared log_capture
|
||||||
current_logs = log_capture.get_logs()
|
current_logs = log_capture.get_logs()
|
||||||
@@ -847,6 +960,22 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
# Schedule the coroutine on the event loop
|
# Schedule the coroutine on the event loop
|
||||||
asyncio.run_coroutine_threadsafe(update(), loop)
|
asyncio.run_coroutine_threadsafe(update(), loop)
|
||||||
|
|
||||||
|
# Flush callback to save reviews incrementally (crash recovery)
|
||||||
|
# Note: flush_callback receives batches, so we accumulate them
|
||||||
|
def flush_callback(reviews_batch: list):
|
||||||
|
"""Accumulate and save reviews to DB incrementally from worker thread"""
|
||||||
|
# Extend our collection with the new batch
|
||||||
|
all_reviews_collected.extend(reviews_batch)
|
||||||
|
|
||||||
|
async def save():
|
||||||
|
await db.save_reviews_incremental(
|
||||||
|
job_id=job_id,
|
||||||
|
reviews=all_reviews_collected, # Save ALL reviews so far
|
||||||
|
total_reviews=total_reviews_seen[0]
|
||||||
|
)
|
||||||
|
# Schedule the coroutine on the event loop
|
||||||
|
asyncio.run_coroutine_threadsafe(save(), loop)
|
||||||
|
|
||||||
# Run scraping with progress callback and shared log capture
|
# Run scraping with progress callback and shared log capture
|
||||||
# headless=False because Docker uses Xvfb virtual display
|
# headless=False because Docker uses Xvfb virtual display
|
||||||
result = await asyncio.to_thread(
|
result = await asyncio.to_thread(
|
||||||
@@ -854,17 +983,20 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
url=url,
|
url=url,
|
||||||
headless=False,
|
headless=False,
|
||||||
progress_callback=progress_callback,
|
progress_callback=progress_callback,
|
||||||
log_capture=log_capture
|
log_capture=log_capture,
|
||||||
|
flush_callback=flush_callback,
|
||||||
|
browser_fingerprint=browser_fingerprint # Pass user's browser fingerprint
|
||||||
)
|
)
|
||||||
|
|
||||||
if result['success']:
|
if result['success']:
|
||||||
# Save results to database (including scraper logs)
|
# Save results to database (including scraper logs and review topics)
|
||||||
await db.save_job_result(
|
await db.save_job_result(
|
||||||
job_id=job_id,
|
job_id=job_id,
|
||||||
reviews=result['reviews'],
|
reviews=result['reviews'],
|
||||||
scrape_time=result['time'],
|
scrape_time=result['time'],
|
||||||
total_reviews=result.get('total_reviews'),
|
total_reviews=result.get('total_reviews'),
|
||||||
scrape_logs=result.get('logs')
|
scrape_logs=result.get('logs'),
|
||||||
|
review_topics=result.get('review_topics')
|
||||||
)
|
)
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
@@ -898,68 +1030,142 @@ async def run_scraping_job(job_id: UUID):
|
|||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Job failed - save logs for debugging
|
# Job failed - check if we have partial reviews saved
|
||||||
await db.update_job_status(
|
current_job = await db.get_job(job_id)
|
||||||
job_id,
|
partial_count = current_job.get('reviews_count', 0) if current_job else 0
|
||||||
JobStatus.FAILED,
|
|
||||||
error_message=result.get('error', 'Unknown error'),
|
|
||||||
scrape_logs=result.get('logs')
|
|
||||||
)
|
|
||||||
|
|
||||||
log.error(f"Failed job {job_id}: {result.get('error')}")
|
if partial_count > 0:
|
||||||
|
# Mark as partial - we have some reviews saved
|
||||||
# Broadcast job failed via SSE
|
await db.mark_job_partial(
|
||||||
await broadcast_job_update(job_id_str, "job_failed", {
|
job_id,
|
||||||
"job_id": job_id_str,
|
error_message=result.get('error', 'Unknown error'),
|
||||||
"status": "failed",
|
scrape_logs=result.get('logs')
|
||||||
"error_message": result.get('error'),
|
|
||||||
"logs": result.get('logs', [])
|
|
||||||
})
|
|
||||||
|
|
||||||
# Send failure webhook if configured
|
|
||||||
if job.get('webhook_url'):
|
|
||||||
webhook_manager = WebhookManager()
|
|
||||||
await webhook_manager.send_job_completed_webhook(
|
|
||||||
webhook_url=job['webhook_url'],
|
|
||||||
job_id=job_id,
|
|
||||||
status='failed',
|
|
||||||
error_message=result.get('error'),
|
|
||||||
secret=job.get('webhook_secret'),
|
|
||||||
db=db
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before error: {result.get('error')}")
|
||||||
|
|
||||||
|
# Broadcast job partial via SSE
|
||||||
|
await broadcast_job_update(job_id_str, "job_partial", {
|
||||||
|
"job_id": job_id_str,
|
||||||
|
"status": "partial",
|
||||||
|
"reviews_count": partial_count,
|
||||||
|
"total_reviews": current_job.get('total_reviews'),
|
||||||
|
"error_message": result.get('error'),
|
||||||
|
"logs": result.get('logs', [])
|
||||||
|
})
|
||||||
|
|
||||||
|
# Send partial webhook if configured
|
||||||
|
if job.get('webhook_url'):
|
||||||
|
webhook_manager = WebhookManager()
|
||||||
|
await webhook_manager.send_job_completed_webhook(
|
||||||
|
webhook_url=job['webhook_url'],
|
||||||
|
job_id=job_id,
|
||||||
|
status='partial',
|
||||||
|
reviews_count=partial_count,
|
||||||
|
error_message=result.get('error'),
|
||||||
|
secret=job.get('webhook_secret'),
|
||||||
|
db=db
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# No reviews saved - mark as failed
|
||||||
|
await db.update_job_status(
|
||||||
|
job_id,
|
||||||
|
JobStatus.FAILED,
|
||||||
|
error_message=result.get('error', 'Unknown error'),
|
||||||
|
scrape_logs=result.get('logs')
|
||||||
|
)
|
||||||
|
|
||||||
|
log.error(f"Failed job {job_id}: {result.get('error')}")
|
||||||
|
|
||||||
|
# Broadcast job failed via SSE
|
||||||
|
await broadcast_job_update(job_id_str, "job_failed", {
|
||||||
|
"job_id": job_id_str,
|
||||||
|
"status": "failed",
|
||||||
|
"error_message": result.get('error'),
|
||||||
|
"logs": result.get('logs', [])
|
||||||
|
})
|
||||||
|
|
||||||
|
# Send failure webhook if configured
|
||||||
|
if job.get('webhook_url'):
|
||||||
|
webhook_manager = WebhookManager()
|
||||||
|
await webhook_manager.send_job_completed_webhook(
|
||||||
|
webhook_url=job['webhook_url'],
|
||||||
|
job_id=job_id,
|
||||||
|
status='failed',
|
||||||
|
error_message=result.get('error'),
|
||||||
|
secret=job.get('webhook_secret'),
|
||||||
|
db=db
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Error in scraping job {job_id}: {e}")
|
log.error(f"Error in scraping job {job_id}: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
await db.update_job_status(
|
# Check if we have partial reviews saved
|
||||||
job_id,
|
current_job = await db.get_job(job_id)
|
||||||
JobStatus.FAILED,
|
partial_count = current_job.get('reviews_count', 0) if current_job else 0
|
||||||
error_message=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Broadcast job failed via SSE
|
if partial_count > 0:
|
||||||
await broadcast_job_update(job_id_str, "job_failed", {
|
# Mark as partial - we have some reviews saved
|
||||||
"job_id": job_id_str,
|
await db.mark_job_partial(
|
||||||
"status": "failed",
|
job_id,
|
||||||
"error_message": str(e),
|
|
||||||
"logs": []
|
|
||||||
})
|
|
||||||
|
|
||||||
# Send failure webhook
|
|
||||||
job = await db.get_job(job_id)
|
|
||||||
if job and job.get('webhook_url'):
|
|
||||||
webhook_manager = WebhookManager()
|
|
||||||
await webhook_manager.send_job_completed_webhook(
|
|
||||||
webhook_url=job['webhook_url'],
|
|
||||||
job_id=job_id,
|
|
||||||
status='failed',
|
|
||||||
error_message=str(e),
|
error_message=str(e),
|
||||||
secret=job.get('webhook_secret'),
|
scrape_logs=log_capture.get_logs() if log_capture else None
|
||||||
db=db
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
log.warning(f"Partial job {job_id}: {partial_count} reviews saved before exception: {e}")
|
||||||
|
|
||||||
|
# Broadcast job partial via SSE
|
||||||
|
await broadcast_job_update(job_id_str, "job_partial", {
|
||||||
|
"job_id": job_id_str,
|
||||||
|
"status": "partial",
|
||||||
|
"reviews_count": partial_count,
|
||||||
|
"total_reviews": current_job.get('total_reviews'),
|
||||||
|
"error_message": str(e),
|
||||||
|
"logs": log_capture.get_logs() if log_capture else []
|
||||||
|
})
|
||||||
|
|
||||||
|
# Send partial webhook
|
||||||
|
if current_job and current_job.get('webhook_url'):
|
||||||
|
webhook_manager = WebhookManager()
|
||||||
|
await webhook_manager.send_job_completed_webhook(
|
||||||
|
webhook_url=current_job['webhook_url'],
|
||||||
|
job_id=job_id,
|
||||||
|
status='partial',
|
||||||
|
reviews_count=partial_count,
|
||||||
|
error_message=str(e),
|
||||||
|
secret=current_job.get('webhook_secret'),
|
||||||
|
db=db
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# No reviews saved - mark as failed
|
||||||
|
await db.update_job_status(
|
||||||
|
job_id,
|
||||||
|
JobStatus.FAILED,
|
||||||
|
error_message=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Broadcast job failed via SSE
|
||||||
|
await broadcast_job_update(job_id_str, "job_failed", {
|
||||||
|
"job_id": job_id_str,
|
||||||
|
"status": "failed",
|
||||||
|
"error_message": str(e),
|
||||||
|
"logs": []
|
||||||
|
})
|
||||||
|
|
||||||
|
# Send failure webhook
|
||||||
|
if current_job and current_job.get('webhook_url'):
|
||||||
|
webhook_manager = WebhookManager()
|
||||||
|
await webhook_manager.send_job_completed_webhook(
|
||||||
|
webhook_url=current_job['webhook_url'],
|
||||||
|
job_id=job_id,
|
||||||
|
status='failed',
|
||||||
|
error_message=str(e),
|
||||||
|
secret=current_job.get('webhook_secret'),
|
||||||
|
db=db
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ services:
|
|||||||
- CHROME_BIN=/usr/bin/chromium
|
- CHROME_BIN=/usr/bin/chromium
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8000:8000"
|
||||||
|
- "5900:5900" # VNC port (for VNC client)
|
||||||
|
- "6080:6080" # noVNC web interface (browser access)
|
||||||
depends_on:
|
depends_on:
|
||||||
db:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
|
|||||||
COMPLETED = "completed"
|
COMPLETED = "completed"
|
||||||
FAILED = "failed"
|
FAILED = "failed"
|
||||||
CANCELLED = "cancelled"
|
CANCELLED = "cancelled"
|
||||||
|
PARTIAL = "partial" # Job crashed but has partial reviews saved
|
||||||
|
|
||||||
|
|
||||||
class DatabaseManager:
|
class DatabaseManager:
|
||||||
@@ -69,6 +70,7 @@ class DatabaseManager:
|
|||||||
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||||
started_at TIMESTAMP,
|
started_at TIMESTAMP,
|
||||||
completed_at TIMESTAMP,
|
completed_at TIMESTAMP,
|
||||||
|
updated_at TIMESTAMP,
|
||||||
|
|
||||||
reviews_count INTEGER,
|
reviews_count INTEGER,
|
||||||
total_reviews INTEGER,
|
total_reviews INTEGER,
|
||||||
@@ -79,7 +81,7 @@ class DatabaseManager:
|
|||||||
metadata JSONB,
|
metadata JSONB,
|
||||||
scrape_logs JSONB,
|
scrape_logs JSONB,
|
||||||
|
|
||||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
|
||||||
);
|
);
|
||||||
""")
|
""")
|
||||||
|
|
||||||
@@ -88,6 +90,24 @@ class DatabaseManager:
|
|||||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
# Add updated_at column if it doesn't exist (for incremental progress tracking)
|
||||||
|
await conn.execute("""
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
|
||||||
|
await conn.execute("""
|
||||||
|
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Update constraint to include 'partial' status (for existing databases)
|
||||||
|
await conn.execute("""
|
||||||
|
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
|
||||||
|
""")
|
||||||
|
await conn.execute("""
|
||||||
|
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
|
||||||
|
""")
|
||||||
|
|
||||||
# Create indexes
|
# Create indexes
|
||||||
await conn.execute("""
|
await conn.execute("""
|
||||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||||
@@ -187,13 +207,15 @@ class DatabaseManager:
|
|||||||
created_at,
|
created_at,
|
||||||
started_at,
|
started_at,
|
||||||
completed_at,
|
completed_at,
|
||||||
|
updated_at,
|
||||||
reviews_count,
|
reviews_count,
|
||||||
total_reviews,
|
total_reviews,
|
||||||
reviews_data,
|
reviews_data,
|
||||||
scrape_time,
|
scrape_time,
|
||||||
error_message,
|
error_message,
|
||||||
metadata,
|
metadata,
|
||||||
scrape_logs
|
scrape_logs,
|
||||||
|
review_topics
|
||||||
FROM jobs
|
FROM jobs
|
||||||
WHERE job_id = $1
|
WHERE job_id = $1
|
||||||
""", job_id)
|
""", job_id)
|
||||||
@@ -203,22 +225,32 @@ class DatabaseManager:
|
|||||||
|
|
||||||
return dict(row)
|
return dict(row)
|
||||||
|
|
||||||
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
|
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
|
||||||
"""
|
"""
|
||||||
Get reviews for a specific job.
|
Get reviews for a specific job.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
job_id: Job UUID
|
job_id: Job UUID
|
||||||
|
include_partial: If True, also return reviews for running and partial jobs
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of reviews or None if not found/not completed
|
List of reviews or None if not found/no reviews
|
||||||
"""
|
"""
|
||||||
async with self.pool.acquire() as conn:
|
async with self.pool.acquire() as conn:
|
||||||
reviews_data = await conn.fetchval("""
|
if include_partial:
|
||||||
SELECT reviews_data
|
# Return reviews for completed, running, or partial jobs
|
||||||
FROM jobs
|
reviews_data = await conn.fetchval("""
|
||||||
WHERE job_id = $1 AND status = 'completed'
|
SELECT reviews_data
|
||||||
""", job_id)
|
FROM jobs
|
||||||
|
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
|
||||||
|
""", job_id)
|
||||||
|
else:
|
||||||
|
# Only return reviews for completed jobs
|
||||||
|
reviews_data = await conn.fetchval("""
|
||||||
|
SELECT reviews_data
|
||||||
|
FROM jobs
|
||||||
|
WHERE job_id = $1 AND status = 'completed'
|
||||||
|
""", job_id)
|
||||||
|
|
||||||
if not reviews_data:
|
if not reviews_data:
|
||||||
return None
|
return None
|
||||||
@@ -278,7 +310,8 @@ class DatabaseManager:
|
|||||||
reviews: List[Dict[str, Any]],
|
reviews: List[Dict[str, Any]],
|
||||||
scrape_time: float,
|
scrape_time: float,
|
||||||
total_reviews: Optional[int] = None,
|
total_reviews: Optional[int] = None,
|
||||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
scrape_logs: Optional[List[Dict[str, Any]]] = None,
|
||||||
|
review_topics: Optional[List[Dict[str, Any]]] = None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Save scraping results to database.
|
Save scraping results to database.
|
||||||
@@ -289,8 +322,33 @@ class DatabaseManager:
|
|||||||
scrape_time: Time taken to scrape in seconds
|
scrape_time: Time taken to scrape in seconds
|
||||||
total_reviews: Total reviews available (from page counter)
|
total_reviews: Total reviews available (from page counter)
|
||||||
scrape_logs: List of log entries from the scraper
|
scrape_logs: List of log entries from the scraper
|
||||||
|
review_topics: List of topic filter dictionaries with topic and count
|
||||||
"""
|
"""
|
||||||
async with self.pool.acquire() as conn:
|
async with self.pool.acquire() as conn:
|
||||||
|
# If reviews list is empty, check if job already has reviews from incremental saves
|
||||||
|
# This happens when flush_callback was used during scraping
|
||||||
|
if not reviews:
|
||||||
|
existing = await conn.fetchval(
|
||||||
|
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
|
||||||
|
)
|
||||||
|
if existing and existing > 0:
|
||||||
|
# Job has reviews from incremental saves, don't overwrite reviews_data
|
||||||
|
await conn.execute("""
|
||||||
|
UPDATE jobs
|
||||||
|
SET
|
||||||
|
status = 'completed',
|
||||||
|
completed_at = NOW(),
|
||||||
|
total_reviews = COALESCE($2, total_reviews),
|
||||||
|
scrape_time = $3,
|
||||||
|
scrape_logs = $4::jsonb,
|
||||||
|
review_topics = $5::jsonb
|
||||||
|
WHERE job_id = $1
|
||||||
|
""", job_id, total_reviews, scrape_time,
|
||||||
|
json.dumps(scrape_logs) if scrape_logs else None,
|
||||||
|
json.dumps(review_topics) if review_topics else None)
|
||||||
|
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
|
||||||
|
return
|
||||||
|
|
||||||
await conn.execute("""
|
await conn.execute("""
|
||||||
UPDATE jobs
|
UPDATE jobs
|
||||||
SET
|
SET
|
||||||
@@ -300,13 +358,70 @@ class DatabaseManager:
|
|||||||
total_reviews = $3,
|
total_reviews = $3,
|
||||||
reviews_data = $4::jsonb,
|
reviews_data = $4::jsonb,
|
||||||
scrape_time = $5,
|
scrape_time = $5,
|
||||||
scrape_logs = $6::jsonb
|
scrape_logs = $6::jsonb,
|
||||||
|
review_topics = $7::jsonb
|
||||||
WHERE job_id = $1
|
WHERE job_id = $1
|
||||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||||
json.dumps(scrape_logs) if scrape_logs else None)
|
json.dumps(scrape_logs) if scrape_logs else None,
|
||||||
|
json.dumps(review_topics) if review_topics else None)
|
||||||
|
|
||||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||||
|
|
||||||
|
async def save_reviews_incremental(
|
||||||
|
self,
|
||||||
|
job_id: UUID,
|
||||||
|
reviews: List[Dict[str, Any]],
|
||||||
|
total_reviews: Optional[int] = None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Save reviews incrementally during scraping.
|
||||||
|
Called on each flush to preserve progress in case of crash.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
job_id: Job UUID
|
||||||
|
reviews: ALL reviews collected so far (not just new ones)
|
||||||
|
total_reviews: Total reviews available (from page counter)
|
||||||
|
"""
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
await conn.execute("""
|
||||||
|
UPDATE jobs
|
||||||
|
SET
|
||||||
|
reviews_count = $2,
|
||||||
|
total_reviews = COALESCE($3, total_reviews),
|
||||||
|
reviews_data = $4::jsonb,
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE job_id = $1 AND status = 'running'
|
||||||
|
""", job_id, len(reviews), total_reviews, json.dumps(reviews))
|
||||||
|
|
||||||
|
log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
|
||||||
|
|
||||||
|
async def mark_job_partial(
|
||||||
|
self,
|
||||||
|
job_id: UUID,
|
||||||
|
error_message: str,
|
||||||
|
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Mark a job as partial (crashed but has some reviews saved).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
job_id: Job UUID
|
||||||
|
error_message: Error that caused the crash
|
||||||
|
scrape_logs: Log entries from the scraper
|
||||||
|
"""
|
||||||
|
async with self.pool.acquire() as conn:
|
||||||
|
await conn.execute("""
|
||||||
|
UPDATE jobs
|
||||||
|
SET
|
||||||
|
status = 'partial',
|
||||||
|
completed_at = NOW(),
|
||||||
|
error_message = $2,
|
||||||
|
scrape_logs = $3::jsonb
|
||||||
|
WHERE job_id = $1
|
||||||
|
""", job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
|
||||||
|
|
||||||
|
log.info(f"Marked job {job_id} as partial due to: {error_message}")
|
||||||
|
|
||||||
async def list_jobs(
|
async def list_jobs(
|
||||||
self,
|
self,
|
||||||
status: Optional[JobStatus] = None,
|
status: Optional[JobStatus] = None,
|
||||||
@@ -337,7 +452,8 @@ class DatabaseManager:
|
|||||||
total_reviews,
|
total_reviews,
|
||||||
scrape_time,
|
scrape_time,
|
||||||
error_message,
|
error_message,
|
||||||
metadata
|
metadata,
|
||||||
|
review_topics
|
||||||
FROM jobs
|
FROM jobs
|
||||||
WHERE status = $1
|
WHERE status = $1
|
||||||
ORDER BY created_at DESC
|
ORDER BY created_at DESC
|
||||||
@@ -355,7 +471,8 @@ class DatabaseManager:
|
|||||||
total_reviews,
|
total_reviews,
|
||||||
scrape_time,
|
scrape_time,
|
||||||
error_message,
|
error_message,
|
||||||
metadata
|
metadata,
|
||||||
|
review_topics
|
||||||
FROM jobs
|
FROM jobs
|
||||||
ORDER BY created_at DESC
|
ORDER BY created_at DESC
|
||||||
LIMIT $1 OFFSET $2
|
LIMIT $1 OFFSET $2
|
||||||
|
|||||||
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
|
|||||||
|
|
||||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||||
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
||||||
progress_callback=None) -> dict:
|
progress_callback=None, validation_only: bool = False) -> dict:
|
||||||
"""
|
"""
|
||||||
Scrape Google Maps reviews.
|
Scrape Google Maps reviews.
|
||||||
|
|
||||||
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
# Track total reviews (persists across refreshes)
|
# Track total reviews (persists across refreshes)
|
||||||
total_reviews = [None] # Use list for closure mutation
|
total_reviews = [None] # Use list for closure mutation
|
||||||
|
|
||||||
|
# Store business info extracted from overview (before clicking reviews tab)
|
||||||
|
business_info_cache = [None]
|
||||||
|
|
||||||
# Hard refresh counter
|
# Hard refresh counter
|
||||||
hard_refresh_count = [0]
|
hard_refresh_count = [0]
|
||||||
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
||||||
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def setup_reviews_page(is_refresh=False):
|
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
|
||||||
"""
|
"""
|
||||||
Setup the reviews page for scraping.
|
Setup the reviews page for scraping.
|
||||||
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
||||||
Can be called after initial load or after a hard refresh.
|
Can be called after initial load or after a hard refresh.
|
||||||
|
|
||||||
|
If validation_only_mode=True, returns early after extracting business info
|
||||||
|
without clicking reviews tab or finding scroll container.
|
||||||
"""
|
"""
|
||||||
nonlocal total_reviews
|
nonlocal total_reviews
|
||||||
|
|
||||||
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# Navigate to URL (only on initial load or refresh)
|
# Navigate to URL (only on initial load or refresh)
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
|
# Reset browser state by navigating to blank page first
|
||||||
|
# This clears any stale state from pooled browser sessions
|
||||||
|
try:
|
||||||
|
driver.get("about:blank")
|
||||||
|
time.sleep(0.1)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
log.info(f"🌐 Loading: {url[:80]}...")
|
log.info(f"🌐 Loading: {url[:80]}...")
|
||||||
else:
|
else:
|
||||||
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||||
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
# Reload original URL after consent
|
# Reload original URL after consent
|
||||||
log.info(" Reloading after consent...")
|
log.info(" Reloading after consent...")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
# Wait for page to settle after consent reload
|
||||||
|
time.sleep(1)
|
||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break
|
break
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
|
||||||
|
# This captures name, rating, category, address while they're visible
|
||||||
# Only on first load (don't overwrite if we already have it)
|
# Only on first load (don't overwrite if we already have it)
|
||||||
if total_reviews[0] is None:
|
if total_reviews[0] is None or business_info_cache[0] is None:
|
||||||
start = time.time()
|
start = time.time()
|
||||||
while time.time() - start < 5:
|
while time.time() - start < 5:
|
||||||
try:
|
try:
|
||||||
count = driver.execute_script("""
|
info = driver.execute_script("""
|
||||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
var result = {
|
||||||
for (var i = 0; i < reviewSpans.length; i++) {
|
total_reviews: null,
|
||||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
name: null,
|
||||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
rating: null,
|
||||||
if (match) {
|
category: null,
|
||||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
address: null
|
||||||
|
};
|
||||||
|
|
||||||
|
// Business name from h1
|
||||||
|
var h1 = document.querySelector('h1');
|
||||||
|
if (h1) result.name = h1.textContent.trim();
|
||||||
|
|
||||||
|
// Category - use jsaction attribute (robust selector)
|
||||||
|
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||||
|
if (catBtn) result.category = catBtn.textContent.trim();
|
||||||
|
|
||||||
|
// Rating and review count from span[role="img"] aria-labels
|
||||||
|
var spans = document.querySelectorAll('span[role="img"]');
|
||||||
|
for (var i = 0; i < spans.length; i++) {
|
||||||
|
var label = spans[i].getAttribute('aria-label') || '';
|
||||||
|
|
||||||
|
// Rating: "4.8 stars"
|
||||||
|
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||||
|
if (rMatch && !result.rating) {
|
||||||
|
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reviews: "79 reviews"
|
||||||
|
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||||
|
if (revMatch && !result.total_reviews) {
|
||||||
|
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
|
// Address from button
|
||||||
|
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||||
|
if (addrBtn) {
|
||||||
|
var label = addrBtn.getAttribute('aria-label');
|
||||||
|
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
""")
|
""")
|
||||||
if count:
|
|
||||||
total_reviews[0] = count
|
if info:
|
||||||
log.info(f"📊 Total reviews on page: {count}")
|
if info.get('total_reviews') and total_reviews[0] is None:
|
||||||
break
|
total_reviews[0] = info['total_reviews']
|
||||||
|
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
|
||||||
|
if info.get('name') and business_info_cache[0] is None:
|
||||||
|
business_info_cache[0] = info
|
||||||
|
log.info(f"📍 Business: {info.get('name')}")
|
||||||
|
if total_reviews[0] and business_info_cache[0]:
|
||||||
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
|
||||||
|
if validation_only_mode:
|
||||||
|
log.info("📋 Validation mode: returning early (skipping reviews tab)")
|
||||||
|
return ("validation_done", None)
|
||||||
|
|
||||||
# Click reviews tab - poll until found
|
# Click reviews tab - poll until found
|
||||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||||
start = time.time()
|
start = time.time()
|
||||||
tab_clicked = False
|
tab_clicked = False
|
||||||
|
tabs_logged = False
|
||||||
while time.time() - start < 5: # Max 5s for tabs
|
while time.time() - start < 5: # Max 5s for tabs
|
||||||
try:
|
try:
|
||||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||||
|
# Log available tabs once for debugging
|
||||||
|
if not tabs_logged and tabs:
|
||||||
|
tabs_logged = True
|
||||||
|
tab_texts = [t.text for t in tabs]
|
||||||
|
log.info(f" Available tabs: {tab_texts}")
|
||||||
for tab in tabs:
|
for tab in tabs:
|
||||||
tab_text = tab.text.lower()
|
tab_text = tab.text.lower()
|
||||||
if any(kw in tab_text for kw in review_keywords):
|
if any(kw in tab_text for kw in review_keywords):
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
log.info(f" Clicking reviews tab: '{tab.text}'")
|
log.info(f" Clicking reviews tab: '{tab.text}'")
|
||||||
|
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
|
||||||
|
if total_reviews[0] is None:
|
||||||
|
import re
|
||||||
|
# Try pattern with parentheses: "Reviews (79)"
|
||||||
|
match = re.search(r'\((\d+)\)', tab.text)
|
||||||
|
if match:
|
||||||
|
total_reviews[0] = int(match.group(1))
|
||||||
|
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||||
|
else:
|
||||||
|
# Try pattern with newline: "Reviews\n79"
|
||||||
|
match = re.search(r'(\d+)', tab.text)
|
||||||
|
if match:
|
||||||
|
total_reviews[0] = int(match.group(1))
|
||||||
|
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||||
tab.click()
|
tab.click()
|
||||||
tab_clicked = True
|
tab_clicked = True
|
||||||
break
|
break
|
||||||
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
return scroll_container, stop_scrolling
|
return scroll_container, stop_scrolling
|
||||||
|
|
||||||
# Initial page setup
|
# Helper to extract review topics from the reviews tab
|
||||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
|
def extract_review_topics():
|
||||||
|
"""Extract review topic filters from radiogroup (robust selectors)."""
|
||||||
|
try:
|
||||||
|
topics = driver.execute_script("""
|
||||||
|
var topics = [];
|
||||||
|
|
||||||
|
// Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
|
||||||
|
var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
|
||||||
|
|
||||||
|
if (!container) {
|
||||||
|
// Fallback: any radiogroup in the reviews area
|
||||||
|
container = document.querySelector('div[role="radiogroup"]');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (container) {
|
||||||
|
var buttons = container.querySelectorAll('button[role="radio"]');
|
||||||
|
for (var btn of buttons) {
|
||||||
|
var label = btn.getAttribute('aria-label') || '';
|
||||||
|
// Parse "hair salon, mentioned in 4 reviews" format
|
||||||
|
var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
|
||||||
|
if (match) {
|
||||||
|
topics.push({
|
||||||
|
topic: match[1].trim(),
|
||||||
|
count: parseInt(match[2])
|
||||||
|
});
|
||||||
|
} else if (label && !label.toLowerCase().includes('all review')) {
|
||||||
|
// Fallback: try to extract from child spans
|
||||||
|
var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
|
||||||
|
var nameSpan = btn.querySelector('.uEubGf, span:first-child');
|
||||||
|
if (nameSpan) {
|
||||||
|
var name = nameSpan.textContent.trim();
|
||||||
|
var count = countSpan ? parseInt(countSpan.textContent) : 0;
|
||||||
|
if (name && name.toLowerCase() !== 'all') {
|
||||||
|
topics.push({topic: name, count: count || 0});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return topics;
|
||||||
|
""")
|
||||||
|
return topics or []
|
||||||
|
except:
|
||||||
|
return []
|
||||||
|
|
||||||
|
# Initial page setup (pass validation_only to skip unnecessary steps)
|
||||||
|
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
|
||||||
|
|
||||||
|
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
|
||||||
|
# setup_reviews_page returns ("validation_done", None) in this case
|
||||||
|
if validation_only or scroll_container == "validation_done":
|
||||||
|
# Use the business info captured from Overview (before clicking reviews tab)
|
||||||
|
business_info = business_info_cache[0] or {}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"reviews": [],
|
||||||
|
"total": total_reviews[0] or 0,
|
||||||
|
"scrolls": 0,
|
||||||
|
"error": None,
|
||||||
|
"validation_info": {
|
||||||
|
"name": business_info.get("name"),
|
||||||
|
"rating": business_info.get("rating"),
|
||||||
|
"category": business_info.get("category"),
|
||||||
|
"address": business_info.get("address"),
|
||||||
|
"total_reviews": total_reviews[0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if not scroll_container:
|
if not scroll_container:
|
||||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||||
|
|
||||||
|
# Extract review topics after reviews tab is loaded (before scrolling begins)
|
||||||
|
time.sleep(0.5) # Brief wait for topic filters to render
|
||||||
|
review_topics = extract_review_topics()
|
||||||
|
if review_topics:
|
||||||
|
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
|
||||||
|
|
||||||
def get_api_reviews():
|
def get_api_reviews():
|
||||||
"""Get reviews from intercepted API responses."""
|
"""Get reviews from intercepted API responses."""
|
||||||
api_revs = []
|
api_revs = []
|
||||||
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
"total_flushed": total_flushed[0],
|
"total_flushed": total_flushed[0],
|
||||||
"checks": check_num,
|
"checks": check_num,
|
||||||
"url": url,
|
"url": url,
|
||||||
"logs": log.get_logs()
|
"logs": log.get_logs(),
|
||||||
|
"review_topics": review_topics # Topic filters with mention counts
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||||
progress_callback=None, driver=None, return_driver: bool = False,
|
progress_callback=None, driver=None, return_driver: bool = False,
|
||||||
log_capture: LogCapture = None):
|
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
||||||
|
browser_fingerprint: dict = None):
|
||||||
"""
|
"""
|
||||||
Production-compatible wrapper for scrape_reviews.
|
Production-compatible wrapper for scrape_reviews.
|
||||||
Matches the API expected by job_manager.py.
|
Matches the API expected by job_manager.py.
|
||||||
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
driver: Existing driver instance to reuse
|
driver: Existing driver instance to reuse
|
||||||
return_driver: If True, return driver in result
|
return_driver: If True, return driver in result
|
||||||
log_capture: Optional LogCapture instance for real-time log access
|
log_capture: Optional LogCapture instance for real-time log access
|
||||||
|
browser_fingerprint: Optional dict with user's browser fingerprint:
|
||||||
|
- geolocation: {lat, lng}
|
||||||
|
- userAgent: string
|
||||||
|
- viewport: {width, height}
|
||||||
|
- timezone: string (e.g., "Europe/Madrid")
|
||||||
|
- language: string (e.g., "en-US")
|
||||||
|
- platform: string (e.g., "MacIntel")
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||||
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
log_capture = log_capture or LogCapture()
|
log_capture = log_capture or LogCapture()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Extract fingerprint settings
|
||||||
|
fp = browser_fingerprint or {}
|
||||||
|
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
|
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
|
||||||
|
geolocation = fp.get('geolocation')
|
||||||
|
timezone = fp.get('timezone')
|
||||||
|
language = fp.get('language', 'en-US')
|
||||||
|
|
||||||
# Create driver if not provided
|
# Create driver if not provided
|
||||||
if not driver:
|
if not driver:
|
||||||
driver = Driver(
|
driver = Driver(
|
||||||
uc=True,
|
uc=True,
|
||||||
headless=headless,
|
headless=headless,
|
||||||
page_load_strategy="normal",
|
page_load_strategy="normal",
|
||||||
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
agent=user_agent # Use user's actual user agent
|
||||||
)
|
)
|
||||||
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
# Set viewport to match user's screen
|
||||||
|
driver.set_window_size(viewport['width'], viewport['height'])
|
||||||
|
|
||||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
# Apply browser fingerprint settings via CDP
|
||||||
# This ensures Google Maps shows US results regardless of server location
|
|
||||||
try:
|
try:
|
||||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
# Set timezone if provided
|
||||||
'latitude': 42.3601,
|
if timezone:
|
||||||
'longitude': -71.0589,
|
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
|
||||||
'accuracy': 100
|
log_capture.info(f"Set timezone to {timezone}")
|
||||||
})
|
|
||||||
log_capture.info("Set geolocation to US (Boston, MA)")
|
# Set locale/language
|
||||||
|
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
|
||||||
|
|
||||||
|
# Set geolocation
|
||||||
|
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
|
||||||
|
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||||
|
'latitude': geolocation['lat'],
|
||||||
|
'longitude': geolocation['lng'],
|
||||||
|
'accuracy': 1000 # ~1km accuracy for IP-based location
|
||||||
|
})
|
||||||
|
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
|
||||||
|
else:
|
||||||
|
# Default to US (Boston, MA) if no geolocation provided
|
||||||
|
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||||
|
'latitude': 42.3601,
|
||||||
|
'longitude': -71.0589,
|
||||||
|
'accuracy': 100
|
||||||
|
})
|
||||||
|
log_capture.info("Set geolocation to US (Boston, MA) [default]")
|
||||||
|
|
||||||
|
if fp:
|
||||||
|
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log_capture.warning(f"Could not set geolocation: {e}")
|
log_capture.warning(f"Could not apply fingerprint settings: {e}")
|
||||||
|
|
||||||
# Add URL parameters for consistent results
|
# Add URL parameters for consistent results
|
||||||
if 'hl=' not in url:
|
if 'hl=' not in url:
|
||||||
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
if 'gl=' not in url:
|
if 'gl=' not in url:
|
||||||
url = f"{url}&gl=us"
|
url = f"{url}&gl=us"
|
||||||
|
|
||||||
# Create progress wrapper if callback provided
|
# Create combined flush callback for progress + external handler
|
||||||
flush_callback = None
|
external_flush = flush_callback # Save external callback
|
||||||
if progress_callback:
|
internal_flush = None
|
||||||
|
if progress_callback or external_flush:
|
||||||
collected = [0]
|
collected = [0]
|
||||||
def flush_with_progress(reviews_batch):
|
def combined_flush(reviews_batch):
|
||||||
collected[0] += len(reviews_batch)
|
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
|
||||||
progress_callback(collected[0], None)
|
if progress_callback:
|
||||||
flush_callback = flush_with_progress
|
progress_callback(collected[0], None)
|
||||||
|
if external_flush:
|
||||||
|
external_flush(reviews_batch) # Pass reviews to external handler
|
||||||
|
internal_flush = combined_flush
|
||||||
|
|
||||||
# Run the scraper with progress callback for real-time updates
|
# Run the scraper with progress callback for real-time updates
|
||||||
result = scrape_reviews(
|
result = scrape_reviews(
|
||||||
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
url=url,
|
url=url,
|
||||||
max_reviews=999999, # Effectively unlimited
|
max_reviews=999999, # Effectively unlimited
|
||||||
timeout_no_new=15,
|
timeout_no_new=15,
|
||||||
flush_callback=flush_callback,
|
flush_callback=internal_flush,
|
||||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||||
log_capture=log_capture,
|
log_capture=log_capture,
|
||||||
progress_callback=progress_callback # Pass through for real-time log updates
|
progress_callback=progress_callback, # Pass through for real-time log updates
|
||||||
|
validation_only=validation_only # Return early if just validating
|
||||||
)
|
)
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
"time": elapsed,
|
"time": elapsed,
|
||||||
"success": True,
|
"success": True,
|
||||||
"error": None,
|
"error": None,
|
||||||
"logs": result.get("logs", [])
|
"logs": result.get("logs", []),
|
||||||
|
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Include validation_info if in validation_only mode
|
||||||
|
if validation_only and "validation_info" in result:
|
||||||
|
response["validation_info"] = result["validation_info"]
|
||||||
|
|
||||||
if return_driver:
|
if return_driver:
|
||||||
response["driver"] = driver
|
response["driver"] = driver
|
||||||
elif should_close_driver:
|
elif should_close_driver:
|
||||||
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _with_query_param(url: str, key: str, value: str) -> str:
    """Return ``url`` with ``key=value`` appended unless ``key=`` already occurs in it.

    The separator is chosen per call: ``?`` when the URL has no query string
    yet, ``&`` otherwise, so the result is well-formed regardless of which
    parameters were already present.
    """
    if f"{key}=" in url:
        return url
    separator = '&' if '?' in url else '?'
    return f"{url}{separator}{key}={value}"


def extract_about_info(driver, url: str = None) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).

    This function should be called AFTER reviews are scraped if about info is needed,
    as it navigates to a different tab.

    Args:
        driver: Selenium WebDriver instance (already on the business page)
        url: Optional URL to navigate to first (if not already on the page).
            ``hl=en`` and ``gl=us`` are appended when missing.

    Returns:
        dict mapping each section heading to a list of
        ``{"feature": str, "available": bool}`` items. An empty dict is
        returned when the About tab cannot be found or yields nothing.
        Any exception is caught and reported as ``{"error": "<message>"}``
        instead of being raised.
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English / US results so the text-based selectors below match.
            url = _with_query_param(url, 'hl', 'en')
            url = _with_query_param(url, 'gl', 'us')
            driver.get(url)
            time.sleep(1)

        # Click About tab using robust selectors
        clicked = driver.execute_script("""
            // Try multiple selectors for about tab
            var selectors = [
                'button[aria-label*="About"]',
                'button[data-tab-index="2"]',
                'div[role="tablist"] button:nth-child(3)',
                'button[jsaction*="about"]'
            ];

            for (var sel of selectors) {
                var btn = document.querySelector(sel);
                if (btn && btn.textContent.toLowerCase().includes('about')) {
                    btn.click();
                    return true;
                }
            }

            // Fallback: find by text content
            var buttons = document.querySelectorAll('button');
            for (var btn of buttons) {
                if (btn.textContent.trim().toLowerCase() === 'about') {
                    btn.click();
                    return true;
                }
            }
            return false;
        """)

        if not clicked:
            return {}

        time.sleep(1.5)  # Wait for about tab to load

        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
            var about = {};

            // Find the about region by aria-label or role
            var container = document.querySelector('div[role="region"][aria-label*="About"]');

            if (!container) {
                // Fallback: look for the scrollable area with sections
                container = document.querySelector('.m6QErb[aria-label*="About"]');
            }

            if (!container) {
                // Last resort: find sections by h2 headers
                container = document;
            }

            // Find all section headers (h2 elements)
            var sections = container.querySelectorAll('h2');

            for (var h2 of sections) {
                var sectionName = h2.textContent.trim();
                var items = [];

                // Find the ul list following this h2
                var parent = h2.closest('.iP2t7d, div');
                if (parent) {
                    var listItems = parent.querySelectorAll('li span[aria-label]');
                    for (var li of listItems) {
                        var label = li.getAttribute('aria-label');
                        if (label) {
                            // Parse "Has toilet" or "No wheelchair-accessible car park"
                            var hasFeature = !label.toLowerCase().startsWith('no ');
                            var featureName = label.replace(/^(Has |No )/i, '');
                            items.push({
                                feature: featureName,
                                available: hasFeature
                            });
                        }
                    }
                }

                if (sectionName && items.length > 0) {
                    about[sectionName] = items;
                }
            }

            return about;
        """)

        # execute_script may hand back None; normalise to an empty dict.
        return about or {}

    except Exception as e:
        # Best-effort extraction: report the failure to the caller instead of raising.
        return {"error": str(e)}
|
||||||
|
|
||||||
|
|
||||||
# Test function
|
# Test function
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
from seleniumbase import Driver
|
from seleniumbase import Driver
|
||||||
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
|||||||
dict with: name, address, rating, total_reviews, success, error, time
|
dict with: name, address, rating, total_reviews, success, error, time
|
||||||
"""
|
"""
|
||||||
from seleniumbase import Driver
|
from seleniumbase import Driver
|
||||||
|
import logging
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
driver_provided = driver is not None
|
driver_provided = driver is not None
|
||||||
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Clear state if reusing a pooled driver (ensures clean page load)
|
# Don't clear state - Google may serve different content based on session history
|
||||||
if driver_provided:
|
# The scraper doesn't reset state, so validation shouldn't either
|
||||||
try:
|
|
||||||
driver.delete_all_cookies()
|
# Force English interface for consistent parsing
|
||||||
driver.get("about:blank")
|
if 'hl=' not in url:
|
||||||
except:
|
separator = '&' if '?' in url else '?'
|
||||||
pass
|
url = f"{url}{separator}hl=en"
|
||||||
|
if 'gl=' not in url:
|
||||||
|
url = f"{url}&gl=us"
|
||||||
|
|
||||||
# Navigate to URL
|
# Navigate to URL
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
|||||||
while time.time() - start < 5:
|
while time.time() - start < 5:
|
||||||
if "consent.google" in driver.current_url:
|
if "consent.google" in driver.current_url:
|
||||||
try:
|
try:
|
||||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
# Try multiple approaches to find and click accept button
|
||||||
txt = btn.text.lower()
|
clicked = False
|
||||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
|
||||||
btn.click()
|
# Method 1: Find by aria-label (most reliable for Google consent)
|
||||||
driver.get(url)
|
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
|
||||||
break
|
btn.click()
|
||||||
except:
|
clicked = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# Method 2: Find by text content
|
||||||
|
if not clicked:
|
||||||
|
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||||
|
txt = btn.text.lower()
|
||||||
|
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||||
|
btn.click()
|
||||||
|
clicked = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if clicked:
|
||||||
|
time.sleep(0.5) # Brief wait for consent to process
|
||||||
|
driver.get(url) # Reload the target URL
|
||||||
|
time.sleep(0.5) # Wait for reload
|
||||||
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
break
|
break
|
||||||
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
|
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
|
||||||
break
|
break
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
|
# Log current URL after consent handling
|
||||||
|
try:
|
||||||
|
current_url = driver.current_url
|
||||||
|
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Wait for page to fully render before polling (tabs may load dynamically)
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
# Poll for business info (same pattern as total_reviews extraction)
|
# Poll for business info (same pattern as total_reviews extraction)
|
||||||
info = {"name": None, "rating": None, "total_reviews": None, "address": None}
|
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
|
||||||
|
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
|
||||||
start = time.time()
|
start = time.time()
|
||||||
while time.time() - start < 5:
|
debug_logged = False
|
||||||
|
while time.time() - start < 10:
|
||||||
try:
|
try:
|
||||||
info = driver.execute_script("""
|
info = driver.execute_script("""
|
||||||
var result = {name: null, rating: null, total_reviews: null, address: null};
|
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
|
||||||
|
|
||||||
// Business name from h1
|
// Business name from h1
|
||||||
var h1 = document.querySelector('h1');
|
var h1 = document.querySelector('h1');
|
||||||
if (h1) result.name = h1.textContent.trim();
|
if (h1) result.name = h1.textContent.trim();
|
||||||
|
|
||||||
// Rating and reviews from span[role="img"] aria-labels
|
// Category - use jsaction attribute (robust, survives class changes)
|
||||||
// Same pattern as scrape_reviews for consistency
|
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||||
|
if (catBtn) result.category = catBtn.textContent.trim();
|
||||||
|
|
||||||
|
// Fallback: look for button after rating that's not a link
|
||||||
|
if (!result.category) {
|
||||||
|
var buttons = document.querySelectorAll('button');
|
||||||
|
for (var btn of buttons) {
|
||||||
|
var text = btn.textContent.trim();
|
||||||
|
// Categories are short words, no numbers, not navigation
|
||||||
|
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
|
||||||
|
!text.match(/review|star|direction|save|share|photo/i)) {
|
||||||
|
// Check if it's near the rating area
|
||||||
|
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
|
||||||
|
if (parent) {
|
||||||
|
result.category = text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rating from span[role="img"] aria-labels
|
||||||
var spans = document.querySelectorAll('span[role="img"]');
|
var spans = document.querySelectorAll('span[role="img"]');
|
||||||
for (var i = 0; i < spans.length; i++) {
|
for (var i = 0; i < spans.length; i++) {
|
||||||
var label = spans[i].getAttribute('aria-label') || '';
|
var label = spans[i].getAttribute('aria-label') || '';
|
||||||
|
|
||||||
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
|
// Collect debug info for all aria-labels
|
||||||
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
|
if (label) {
|
||||||
|
result.debug.push('img-aria: ' + label);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rating: "4.8 stars" (English forced via hl=en)
|
||||||
|
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||||
if (rMatch && !result.rating) {
|
if (rMatch && !result.rating) {
|
||||||
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
|
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
|
||||||
// Plus Spanish "reseña" which doesn't contain "review"
|
// Try direct format first: "79 reviews"
|
||||||
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
|
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
|
||||||
if (revMatch && !result.total_reviews) {
|
if (revMatch && !result.total_reviews) {
|
||||||
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
|
||||||
|
if (!result.total_reviews) {
|
||||||
|
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
|
||||||
|
if (combinedMatch) {
|
||||||
|
var countStr = combinedMatch[1].replace(/,/g, '');
|
||||||
|
if (countStr.includes('k')) {
|
||||||
|
// Handle "9k+" format
|
||||||
|
result.total_reviews = parseInt(countStr) * 1000;
|
||||||
|
} else {
|
||||||
|
result.total_reviews = parseInt(countStr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also collect tab button texts for debugging (include full text including numbers)
|
||||||
|
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||||
|
for (var j = 0; j < tabs.length; j++) {
|
||||||
|
var tabText = tabs[j].textContent.trim();
|
||||||
|
result.debug.push('tab: ' + tabText);
|
||||||
|
// Also try to extract review count from tab text like "Reviews (79)"
|
||||||
|
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
|
||||||
|
var tabMatch = tabText.match(/\\((\\d+)\\)/);
|
||||||
|
if (tabMatch) {
|
||||||
|
result.total_reviews = parseInt(tabMatch[1]);
|
||||||
|
result.debug.push('Found reviews in tab: ' + tabText);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Also check ALL buttons for reviews count
|
||||||
|
var allButtons = document.querySelectorAll('button');
|
||||||
|
for (var b = 0; b < allButtons.length; b++) {
|
||||||
|
var btnText = allButtons[b].textContent || '';
|
||||||
|
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
|
||||||
|
var numMatch = btnText.match(/\\((\\d+)\\)/);
|
||||||
|
if (numMatch && !result.total_reviews) {
|
||||||
|
result.total_reviews = parseInt(numMatch[1]);
|
||||||
|
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we're on search results vs place page
|
||||||
|
result.debug.push('title: ' + document.title);
|
||||||
|
result.debug.push('url: ' + window.location.href.substring(0, 80));
|
||||||
|
|
||||||
|
// Check for search results list
|
||||||
|
var searchResults = document.querySelectorAll('div[role="feed"] > div');
|
||||||
|
result.debug.push('search_results_count: ' + searchResults.length);
|
||||||
|
|
||||||
|
// Fallback: Get review count from Reviews tab button "Reviews (79)"
|
||||||
|
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
|
||||||
|
if (!result.total_reviews) {
|
||||||
|
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||||
|
for (var tab of tabs) {
|
||||||
|
var text = tab.textContent.toLowerCase();
|
||||||
|
if (text.includes('review')) {
|
||||||
|
var match = tab.textContent.match(/\\((\\d+)\\)/);
|
||||||
|
if (match) {
|
||||||
|
result.total_reviews = parseInt(match[1]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback 2: Look for any button with "Reviews" and a number
|
||||||
|
if (!result.total_reviews) {
|
||||||
|
var buttons = document.querySelectorAll('button');
|
||||||
|
for (var btn of buttons) {
|
||||||
|
var text = btn.textContent;
|
||||||
|
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
|
||||||
|
var numMatch = text.match(/\\((\\d+)\\)/);
|
||||||
|
if (numMatch) {
|
||||||
|
result.total_reviews = parseInt(numMatch[1]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
|||||||
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||||
if (addrBtn) {
|
if (addrBtn) {
|
||||||
var label = addrBtn.getAttribute('aria-label');
|
var label = addrBtn.getAttribute('aria-label');
|
||||||
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
|
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
""")
|
""")
|
||||||
# Exit early if we have the essentials
|
# Exit early if we have the essentials (name found AND reviews count > 0)
|
||||||
if info.get("name") and info.get("total_reviews") is not None:
|
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Log debug info once after 3 seconds
|
||||||
|
if not debug_logged and time.time() - start > 3:
|
||||||
|
debug_logged = True
|
||||||
|
debug_info = info.get("debug", [])
|
||||||
|
if debug_info:
|
||||||
|
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
|
||||||
|
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
|
||||||
|
for d in debug_info[:10]: # First 10 debug items
|
||||||
|
log.info(f" {d}")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
time.sleep(0.1) # 100ms between polls
|
time.sleep(0.1) # 100ms between polls
|
||||||
|
|
||||||
|
# Final debug log if still no reviews
|
||||||
|
if not info.get("total_reviews"):
|
||||||
|
debug_info = info.get("debug", [])
|
||||||
|
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
|
||||||
|
if debug_info:
|
||||||
|
log.warning(f" Debug items: {debug_info[:10]}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"name": info.get("name"),
|
"name": info.get("name"),
|
||||||
"address": info.get("address"),
|
"address": info.get("address"),
|
||||||
"rating": info.get("rating"),
|
"rating": info.get("rating"),
|
||||||
"total_reviews": info.get("total_reviews"),
|
"total_reviews": info.get("total_reviews"),
|
||||||
|
"category": info.get("category"),
|
||||||
"success": bool(info.get("name")),
|
"success": bool(info.get("name")),
|
||||||
"error": None,
|
"error": None,
|
||||||
"time": time.time() - start_time
|
"time": time.time() - start_time
|
||||||
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
|||||||
"address": None,
|
"address": None,
|
||||||
"rating": None,
|
"rating": None,
|
||||||
"total_reviews": None,
|
"total_reviews": None,
|
||||||
|
"category": None,
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": str(e),
|
"error": str(e),
|
||||||
"time": time.time() - start_time
|
"time": time.time() - start_time
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ interface SelectedJob {
|
|||||||
jobId: string;
|
jobId: string;
|
||||||
newCount?: number;
|
newCount?: number;
|
||||||
previousJobId?: string;
|
previousJobId?: string;
|
||||||
|
businessCategory?: string;
|
||||||
|
reviewTopics?: { topic: string; count: number }[];
|
||||||
}
|
}
|
||||||
|
|
||||||
type ViewType = 'newScrape' | 'jobs' | 'reports';
|
type ViewType = 'newScrape' | 'jobs' | 'reports';
|
||||||
@@ -106,6 +108,8 @@ export default function Home() {
|
|||||||
jobId: job.job_id,
|
jobId: job.job_id,
|
||||||
newCount: data.new_count,
|
newCount: data.new_count,
|
||||||
previousJobId: previousJob?.job_id,
|
previousJobId: previousJob?.job_id,
|
||||||
|
businessCategory: job.business_category || undefined,
|
||||||
|
reviewTopics: job.review_topics || undefined,
|
||||||
});
|
});
|
||||||
setActiveView('reports');
|
setActiveView('reports');
|
||||||
}
|
}
|
||||||
@@ -155,7 +159,7 @@ export default function Home() {
|
|||||||
Back to Reports
|
Back to Reports
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} />
|
<ReviewAnalytics reviews={selectedJob.reviews} businessName={selectedJob.businessName} businessUrl={selectedJob.businessUrl} newCount={selectedJob.newCount} businessCategory={selectedJob.businessCategory} reviewTopics={selectedJob.reviewTopics} />
|
||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<div className="h-full overflow-y-auto p-6">
|
<div className="h-full overflow-y-auto p-6">
|
||||||
|
|||||||
@@ -22,14 +22,21 @@ interface ReviewWithNew extends Review {
|
|||||||
photo_urls?: string[] | null;
|
photo_urls?: string[] | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * One review topic shown in the analytics "What People Talk About" panel.
 * The topics come from the job's `review_topics` field.
 */
interface ReviewTopic {
|
||||||
|
/** Topic label, rendered as the chip text. */
topic: string;
|
||||||
|
/** Mention count for the topic, rendered in the badge next to the label. */
count: number;
|
||||||
|
}
|
||||||
|
|
||||||
interface ReviewAnalyticsProps {
|
interface ReviewAnalyticsProps {
|
||||||
reviews: ReviewWithNew[];
|
reviews: ReviewWithNew[];
|
||||||
businessName?: string;
|
businessName?: string;
|
||||||
businessUrl?: string;
|
businessUrl?: string;
|
||||||
newCount?: number;
|
newCount?: number;
|
||||||
|
businessCategory?: string;
|
||||||
|
reviewTopics?: ReviewTopic[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount }: ReviewAnalyticsProps) {
|
export default function ReviewAnalytics({ reviews, businessName, businessUrl, newCount, businessCategory, reviewTopics }: ReviewAnalyticsProps) {
|
||||||
const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
|
const [sorting, setSorting] = useState<SortingState>([{ id: 'date', desc: true }]); // Default: newest first
|
||||||
const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
|
const [columnFilters, setColumnFiltersState] = useState<ColumnFiltersState>([]);
|
||||||
const [globalFilter, setGlobalFilter] = useState('');
|
const [globalFilter, setGlobalFilter] = useState('');
|
||||||
@@ -476,9 +483,16 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
|
|||||||
{/* Header */}
|
{/* Header */}
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<div>
|
<div>
|
||||||
<h2 className="text-3xl font-bold text-gray-900">
|
<div className="flex items-center gap-3">
|
||||||
{businessName || 'Review Analytics'}
|
<h2 className="text-3xl font-bold text-gray-900">
|
||||||
</h2>
|
{businessName || 'Review Analytics'}
|
||||||
|
</h2>
|
||||||
|
{businessCategory && (
|
||||||
|
<span className="px-3 py-1 bg-purple-100 text-purple-800 text-sm font-medium rounded-full border border-purple-300">
|
||||||
|
{businessCategory}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
{businessUrl && (
|
{businessUrl && (
|
||||||
<a
|
<a
|
||||||
href={businessUrl}
|
href={businessUrl}
|
||||||
@@ -821,6 +835,33 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Review Topics - from Google Maps */}
|
||||||
|
{reviewTopics && reviewTopics.length > 0 && (
|
||||||
|
<div className="bg-white border-2 border-gray-300 rounded-xl p-5 shadow-md">
|
||||||
|
<div className="flex items-center gap-2 mb-4">
|
||||||
|
<MessageSquare className="w-6 h-6 text-indigo-600" />
|
||||||
|
<h3 className="text-lg font-bold text-gray-900">What People Talk About</h3>
|
||||||
|
<span className="text-sm text-gray-500">({reviewTopics.length} topics from Google)</span>
|
||||||
|
</div>
|
||||||
|
<div className="flex flex-wrap gap-2">
|
||||||
|
{reviewTopics.slice(0, 15).map((topic, idx) => (
|
||||||
|
<div
|
||||||
|
key={idx}
|
||||||
|
className="px-3 py-1.5 bg-gradient-to-r from-indigo-50 to-purple-50 border border-indigo-200 rounded-full flex items-center gap-2"
|
||||||
|
>
|
||||||
|
<span className="text-sm font-medium text-indigo-800">{topic.topic}</span>
|
||||||
|
<span className="text-xs bg-indigo-200 text-indigo-900 px-1.5 py-0.5 rounded-full font-bold">
|
||||||
|
{topic.count}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
{reviewTopics.length > 15 && (
|
||||||
|
<p className="text-sm text-gray-500 mt-3">+{reviewTopics.length - 15} more topics</p>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Rating & Volume Timeline */}
|
{/* Rating & Volume Timeline */}
|
||||||
{timelineData.length > 0 && (
|
{timelineData.length > 0 && (
|
||||||
<div className={`bg-white rounded-xl p-6 shadow-md transition-all ${
|
<div className={`bg-white rounded-xl p-6 shadow-md transition-all ${
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ interface Review {
|
|||||||
|
|
||||||
export interface JobStatus {
|
export interface JobStatus {
|
||||||
job_id: string;
|
job_id: string;
|
||||||
status: 'pending' | 'running' | 'completed' | 'failed';
|
status: 'pending' | 'running' | 'completed' | 'failed' | 'partial';
|
||||||
url: string;
|
url: string;
|
||||||
created_at: string;
|
created_at: string;
|
||||||
started_at: string | null;
|
started_at: string | null;
|
||||||
@@ -28,8 +28,11 @@ export interface JobStatus {
|
|||||||
// Business metadata for tracking and comparison
|
// Business metadata for tracking and comparison
|
||||||
business_name: string | null;
|
business_name: string | null;
|
||||||
business_address: string | null;
|
business_address: string | null;
|
||||||
|
business_category: string | null;
|
||||||
rating_snapshot: number | null;
|
rating_snapshot: number | null;
|
||||||
total_reviews_snapshot: number | null;
|
total_reviews_snapshot: number | null;
|
||||||
|
// Review topics extracted from Google Maps
|
||||||
|
review_topics: { topic: string; count: number }[] | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface ScraperTestProps {
|
interface ScraperTestProps {
|
||||||
@@ -56,7 +59,64 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
const [businessRating, setBusinessRating] = useState<number | null>(null);
|
const [businessRating, setBusinessRating] = useState<number | null>(null);
|
||||||
const [businessImage, setBusinessImage] = useState<string | null>(null);
|
const [businessImage, setBusinessImage] = useState<string | null>(null);
|
||||||
const [businessCategory, setBusinessCategory] = useState<string | null>(null);
|
const [businessCategory, setBusinessCategory] = useState<string | null>(null);
|
||||||
|
const [userFingerprint, setUserFingerprint] = useState<{
|
||||||
|
geolocation?: {lat: number, lng: number},
|
||||||
|
userAgent?: string,
|
||||||
|
viewport?: {width: number, height: number},
|
||||||
|
timezone?: string,
|
||||||
|
language?: string,
|
||||||
|
platform?: string
|
||||||
|
}>({});
|
||||||
const debounceRef = useRef<NodeJS.Timeout | null>(null);
|
const debounceRef = useRef<NodeJS.Timeout | null>(null);
|
||||||
|
|
||||||
|
// Collect browser fingerprint on mount (no permissions needed)
|
||||||
|
useEffect(() => {
  // Set by the cleanup function so a late IP-lookup response cannot update
  // state after the component has unmounted.
  let cancelled = false;

  /**
   * Gathers the browser fingerprint sent along with validation requests:
   * user agent, screen size (stored under `viewport`), timezone, language,
   * platform, and an approximate IP-based location from ipapi.co.
   * None of these require a permission prompt.
   */
  const collectFingerprint = async () => {
    const fingerprint: typeof userFingerprint = {
      userAgent: navigator.userAgent,
      viewport: {
        width: window.screen.width,
        height: window.screen.height,
      },
      timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
      language: navigator.language,
      platform: navigator.platform,
    };

    // Publish the synchronous fields immediately; the IP lookup below may
    // take up to 3 seconds and requests made meanwhile should not go out
    // with an empty fingerprint.
    setUserFingerprint({ ...fingerprint });

    // Get approximate location from IP (no permission needed)
    try {
      const response = await fetch('https://ipapi.co/json/', {
        signal: AbortSignal.timeout(3000),
      });
      if (response.ok) {
        const data: {
          latitude?: unknown;
          longitude?: unknown;
          city?: unknown;
          country_name?: unknown;
        } = await response.json();
        const { latitude, longitude } = data;
        // Type check instead of truthiness: 0 is a valid coordinate.
        if (
          typeof latitude === 'number' &&
          typeof longitude === 'number' &&
          Number.isFinite(latitude) &&
          Number.isFinite(longitude)
        ) {
          fingerprint.geolocation = { lat: latitude, lng: longitude };
          console.log('IP location:', data.city, data.country_name);
        }
      }
    } catch {
      // Best effort: network failure or timeout simply leaves geolocation unset.
      console.log('IP geolocation not available');
    }

    if (cancelled) {
      return;
    }

    setUserFingerprint(fingerprint);
    console.log('Browser fingerprint:', fingerprint);
  };

  // collectFingerprint handles its own errors; `void` marks the call as
  // intentionally not awaited.
  void collectFingerprint();

  return () => {
    cancelled = true;
  };
}, []);
|
||||||
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
|
const pollingIntervals = useRef<Map<string, NodeJS.Timeout>>(new Map());
|
||||||
const abortControllerRef = useRef<AbortController | null>(null);
|
const abortControllerRef = useRef<AbortController | null>(null);
|
||||||
|
|
||||||
@@ -121,18 +181,23 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
setBusinessCategory(null);
|
setBusinessCategory(null);
|
||||||
setError('');
|
setError('');
|
||||||
|
|
||||||
// Create new abort controller with 30 second timeout
|
// Create new abort controller with 60 second timeout (validation can be slow)
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
abortControllerRef.current = controller;
|
abortControllerRef.current = controller;
|
||||||
const timeoutId = setTimeout(() => controller.abort(), 30000);
|
const timeoutId = setTimeout(() => controller.abort(), 60000);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}`;
|
// Force English with hl=en parameter
|
||||||
|
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(query)}&hl=en`;
|
||||||
|
|
||||||
const response = await fetch('/api/check-reviews', {
|
const response = await fetch('/api/check-reviews', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ url }),
|
body: JSON.stringify({
|
||||||
|
url,
|
||||||
|
geolocation: userFingerprint.geolocation,
|
||||||
|
browser_fingerprint: userFingerprint // Pass full fingerprint
|
||||||
|
}),
|
||||||
signal: controller.signal,
|
signal: controller.signal,
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -157,21 +222,30 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
} catch (err) {
|
} catch (err) {
|
||||||
clearTimeout(timeoutId);
|
clearTimeout(timeoutId);
|
||||||
|
|
||||||
// Ignore AbortError (happens when user starts a new validation)
|
// Check if this is a timeout abort vs user-initiated abort
|
||||||
if (err instanceof Error && err.name === 'AbortError') {
|
if (err instanceof Error && err.name === 'AbortError') {
|
||||||
console.log('Validation cancelled (new validation started)');
|
// Check if it was a timeout (controller still matches) or user started new search
|
||||||
return;
|
if (abortControllerRef.current === controller) {
|
||||||
|
// Timeout - show error
|
||||||
|
console.error('Validation timed out');
|
||||||
|
setError('Validation timed out. Please try again.');
|
||||||
|
setHasReviews(false);
|
||||||
|
setAvailableReviewCount(0);
|
||||||
|
} else {
|
||||||
|
// User started a new search - just return silently
|
||||||
|
console.log('Validation cancelled (new validation started)');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
console.error('Error getting business info:', err);
|
||||||
|
// Error occurred
|
||||||
|
setHasReviews(false);
|
||||||
|
setAvailableReviewCount(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.error('Error getting business info:', err);
|
|
||||||
// Error occurred
|
|
||||||
setHasReviews(false);
|
|
||||||
setAvailableReviewCount(0);
|
|
||||||
} finally {
|
} finally {
|
||||||
// Only clear loading state if this controller wasn't aborted
|
clearTimeout(timeoutId);
|
||||||
if (!controller.signal.aborted) {
|
// Always clear loading state (even on timeout)
|
||||||
setIsCheckingReviews(false);
|
setIsCheckingReviews(false);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -192,8 +266,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
return newMap;
|
return newMap;
|
||||||
});
|
});
|
||||||
|
|
||||||
// Stop polling if job is done
|
// Stop polling if job is done (completed, failed, or partial)
|
||||||
if (data.status === 'completed' || data.status === 'failed') {
|
if (data.status === 'completed' || data.status === 'failed' || data.status === 'partial') {
|
||||||
const interval = pollingIntervals.current.get(jobId);
|
const interval = pollingIntervals.current.get(jobId);
|
||||||
if (interval) {
|
if (interval) {
|
||||||
clearInterval(interval);
|
clearInterval(interval);
|
||||||
@@ -244,8 +318,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
setIsSubmitting(true);
|
setIsSubmitting(true);
|
||||||
setShowConfirmModal(false);
|
setShowConfirmModal(false);
|
||||||
|
|
||||||
// Use the search query to create a Google Maps search URL
|
// Use the search query to create a Google Maps search URL (force English)
|
||||||
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}`;
|
const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await fetch('/api/scrape', {
|
const response = await fetch('/api/scrape', {
|
||||||
@@ -257,6 +331,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
business_address: businessAddress,
|
business_address: businessAddress,
|
||||||
rating_snapshot: businessRating,
|
rating_snapshot: businessRating,
|
||||||
total_reviews_snapshot: availableReviewCount,
|
total_reviews_snapshot: availableReviewCount,
|
||||||
|
geolocation: userFingerprint.geolocation,
|
||||||
|
browser_fingerprint: userFingerprint, // Pass full fingerprint
|
||||||
}),
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -283,8 +359,10 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
error_message: null,
|
error_message: null,
|
||||||
business_name: businessName,
|
business_name: businessName,
|
||||||
business_address: businessAddress,
|
business_address: businessAddress,
|
||||||
|
business_category: businessCategory,
|
||||||
rating_snapshot: businessRating,
|
rating_snapshot: businessRating,
|
||||||
total_reviews_snapshot: availableReviewCount,
|
total_reviews_snapshot: availableReviewCount,
|
||||||
|
review_topics: null, // Will be populated when job completes
|
||||||
});
|
});
|
||||||
return newMap;
|
return newMap;
|
||||||
});
|
});
|
||||||
@@ -305,6 +383,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
case 'completed': return 'text-green-700';
|
case 'completed': return 'text-green-700';
|
||||||
case 'running': return 'text-blue-700';
|
case 'running': return 'text-blue-700';
|
||||||
case 'failed': return 'text-red-700';
|
case 'failed': return 'text-red-700';
|
||||||
|
case 'partial': return 'text-orange-700';
|
||||||
default: return 'text-gray-800';
|
default: return 'text-gray-800';
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -325,6 +404,12 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
|
<path fillRule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clipRule="evenodd" />
|
||||||
</svg>
|
</svg>
|
||||||
);
|
);
|
||||||
|
case 'partial':
|
||||||
|
return (
|
||||||
|
<svg className="w-5 h-5 text-orange-500" fill="currentColor" viewBox="0 0 20 20">
|
||||||
|
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
|
||||||
|
</svg>
|
||||||
|
);
|
||||||
default:
|
default:
|
||||||
return (
|
return (
|
||||||
<svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
|
<svg className="w-5 h-5 text-gray-400" fill="currentColor" viewBox="0 0 20 20">
|
||||||
@@ -776,8 +861,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Action Buttons - Show when completed and has reviews */}
|
{/* Action Buttons - Show when completed, partial, or running with reviews */}
|
||||||
{job.status === 'completed' && job.reviews_count && job.reviews_count > 0 && (
|
{(job.status === 'completed' || job.status === 'partial' || (job.status === 'running' && job.reviews_count && job.reviews_count > 0)) && job.reviews_count && job.reviews_count > 0 && (
|
||||||
<div className="flex gap-3">
|
<div className="flex gap-3">
|
||||||
<button
|
<button
|
||||||
onClick={async () => {
|
onClick={async () => {
|
||||||
@@ -818,7 +903,13 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
disabled={isLoadingReviews}
|
disabled={isLoadingReviews}
|
||||||
className="flex-1 py-4 bg-gradient-to-r from-blue-600 to-indigo-700 text-white rounded-xl font-bold hover:from-blue-700 hover:to-indigo-800 transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 border-blue-500"
|
className={`flex-1 py-4 text-white rounded-xl font-bold transition-all flex items-center justify-center gap-2 shadow-lg disabled:opacity-50 disabled:cursor-not-allowed text-lg border-2 ${
|
||||||
|
job.status === 'partial'
|
||||||
|
? 'bg-gradient-to-r from-orange-500 to-amber-600 hover:from-orange-600 hover:to-amber-700 border-orange-400'
|
||||||
|
: job.status === 'running'
|
||||||
|
? 'bg-gradient-to-r from-blue-500 to-cyan-600 hover:from-blue-600 hover:to-cyan-700 border-blue-400'
|
||||||
|
: 'bg-gradient-to-r from-blue-600 to-indigo-700 hover:from-blue-700 hover:to-indigo-800 border-blue-500'
|
||||||
|
}`}
|
||||||
>
|
>
|
||||||
{isLoadingReviews ? (
|
{isLoadingReviews ? (
|
||||||
<>
|
<>
|
||||||
@@ -830,7 +921,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
<svg className="w-6 h-6" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 19v-6a2 2 0 00-2-2H5a2 2 0 00-2 2v6a2 2 0 002 2h2a2 2 0 002-2zm0 0V9a2 2 0 012-2h2a2 2 0 012 2v10m-6 0a2 2 0 002 2h2a2 2 0 002-2m0 0V5a2 2 0 012-2h2a2 2 0 012 2v14a2 2 0 01-2 2h-2a2 2 0 01-2-2z" />
|
||||||
</svg>
|
</svg>
|
||||||
📊 Open Analytics Dashboard
|
📊 {job.status === 'running' ? 'Preview Analytics' : job.status === 'partial' ? 'View Partial Data' : 'Open Analytics Dashboard'}
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
</button>
|
</button>
|
||||||
@@ -845,7 +936,7 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
const url = URL.createObjectURL(blob);
|
const url = URL.createObjectURL(blob);
|
||||||
const a = document.createElement('a');
|
const a = document.createElement('a');
|
||||||
a.href = url;
|
a.href = url;
|
||||||
a.download = `reviews-${job.job_id}.json`;
|
a.download = `reviews-${job.job_id}${job.status === 'partial' ? '-partial' : ''}.json`;
|
||||||
a.click();
|
a.click();
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
@@ -862,6 +953,24 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* Partial Job Warning */}
|
||||||
|
{job.status === 'partial' && (
|
||||||
|
<div className="mt-4 p-4 bg-orange-100 border-2 border-orange-300 rounded-lg">
|
||||||
|
<div className="flex items-start gap-2">
|
||||||
|
<svg className="w-5 h-5 text-orange-700 flex-shrink-0 mt-0.5" fill="currentColor" viewBox="0 0 20 20">
|
||||||
|
<path fillRule="evenodd" d="M8.257 3.099c.765-1.36 2.722-1.36 3.486 0l5.58 9.92c.75 1.334-.213 2.98-1.742 2.98H4.42c-1.53 0-2.493-1.646-1.743-2.98l5.58-9.92zM11 13a1 1 0 11-2 0 1 1 0 012 0zm-1-8a1 1 0 00-1 1v3a1 1 0 002 0V6a1 1 0 00-1-1z" clipRule="evenodd" />
|
||||||
|
</svg>
|
||||||
|
<div>
|
||||||
|
<p className="font-bold text-orange-900">Partial Results</p>
|
||||||
|
<p className="text-sm text-orange-800 mt-1">
|
||||||
|
This job was interrupted but {job.reviews_count} reviews were saved.
|
||||||
|
{job.error_message && <span className="block mt-1 text-orange-700">Reason: {job.error_message}</span>}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Error Message */}
|
{/* Error Message */}
|
||||||
{job.status === 'failed' && job.error_message && (
|
{job.status === 'failed' && job.error_message && (
|
||||||
<div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">
|
<div className="mt-4 p-4 bg-red-100 border-2 border-red-300 rounded-lg">
|
||||||
|
|||||||
@@ -66,7 +66,9 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
|||||||
// Populate minDate/maxDate/centerDate on reviews for display
|
// Populate minDate/maxDate/centerDate on reviews for display
|
||||||
reviews.forEach(r => {
|
reviews.forEach(r => {
|
||||||
if (!r.minDate || !r.maxDate || !r.centerDate) {
|
if (!r.minDate || !r.maxDate || !r.centerDate) {
|
||||||
const range = parseDateTextToRange(r.date_text);
|
// Handle both date_text and timestamp field names
|
||||||
|
const dateText = r.date_text || (r as any).timestamp || '';
|
||||||
|
const range = parseDateTextToRange(dateText);
|
||||||
r.minDate = range.minDate;
|
r.minDate = range.minDate;
|
||||||
r.maxDate = range.maxDate;
|
r.maxDate = range.maxDate;
|
||||||
// Calculate centerDate as midpoint
|
// Calculate centerDate as midpoint
|
||||||
@@ -96,8 +98,8 @@ export function calculateReviewStats(reviews: Review[]): ReviewStats {
|
|||||||
|
|
||||||
// Recent reviews (last 30 days - simplified check)
|
// Recent reviews (last 30 days - simplified check)
|
||||||
const recentReviews = reviews.filter(r => {
|
const recentReviews = reviews.filter(r => {
|
||||||
const text = r.date_text.toLowerCase();
|
const text = (r.date_text || (r as any).timestamp || '').toLowerCase();
|
||||||
return text.includes('day') || text.includes('week') || text.includes('hour');
|
return text.includes('day') || text.includes('week') || text.includes('hour') || text.includes('minute') || text.includes('second');
|
||||||
}).length;
|
}).length;
|
||||||
|
|
||||||
// Rating distribution
|
// Rating distribution
|
||||||
@@ -278,6 +280,14 @@ function extractNumber(text: string): number {
|
|||||||
*/
|
*/
|
||||||
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
|
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
|
||||||
const now = new Date();
|
const now = new Date();
|
||||||
|
|
||||||
|
// Handle undefined/null dateText
|
||||||
|
if (!dateText) {
|
||||||
|
// Return a default range (assume recent - within last month)
|
||||||
|
const daysAgo = (days: number) => new Date(now.getTime() - days * 24 * 60 * 60 * 1000);
|
||||||
|
return { minDate: daysAgo(30), maxDate: now };
|
||||||
|
}
|
||||||
|
|
||||||
const text = dateText.toLowerCase();
|
const text = dateText.toLowerCase();
|
||||||
|
|
||||||
// Remove "Edited " prefix if present
|
// Remove "Edited " prefix if present
|
||||||
@@ -396,7 +406,8 @@ export function filterReviewsByDateRange(reviews: Review[], range: DateRange): R
|
|||||||
// Filter range: [filterStart, filterEnd]
|
// Filter range: [filterStart, filterEnd]
|
||||||
// Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
|
// Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
|
||||||
return reviews.filter(r => {
|
return reviews.filter(r => {
|
||||||
const { minDate, maxDate } = parseDateTextToRange(r.date_text);
|
const dateText = r.date_text || (r as any).timestamp || '';
|
||||||
|
const { minDate, maxDate } = parseDateTextToRange(dateText);
|
||||||
return minDate <= filterEnd && maxDate >= filterStart;
|
return minDate <= filterEnd && maxDate >= filterStart;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -405,7 +416,8 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
|
|||||||
if (!fromDate && !toDate) return reviews;
|
if (!fromDate && !toDate) return reviews;
|
||||||
|
|
||||||
return reviews.filter(r => {
|
return reviews.filter(r => {
|
||||||
const reviewDate = parseDateText(r.date_text);
|
const dateText = r.date_text || (r as any).timestamp || '';
|
||||||
|
const reviewDate = parseDateText(dateText);
|
||||||
|
|
||||||
// If only fromDate is set, filter reviews >= fromDate
|
// If only fromDate is set, filter reviews >= fromDate
|
||||||
if (fromDate && !toDate) {
|
if (fromDate && !toDate) {
|
||||||
@@ -429,7 +441,7 @@ export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date
|
|||||||
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
|
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
|
||||||
// Sort reviews by date (newest first)
|
// Sort reviews by date (newest first)
|
||||||
const sortedReviews = [...reviews]
|
const sortedReviews = [...reviews]
|
||||||
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text) }))
|
.map(r => ({ ...r, parsedDate: parseDateText(r.date_text || (r as any).timestamp || '') }))
|
||||||
.sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
|
.sort((a, b) => b.parsedDate.getTime() - a.parsedDate.getTime());
|
||||||
|
|
||||||
// Group by month
|
// Group by month
|
||||||
|
|||||||
Reference in New Issue
Block a user