Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone,
  language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions

View File

@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
PARTIAL = "partial" # Job crashed but has partial reviews saved
class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP,
completed_at TIMESTAMP,
updated_at TIMESTAMP,
reviews_count INTEGER,
total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
);
""")
@@ -88,6 +90,24 @@ class DatabaseManager:
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Add updated_at column if it doesn't exist (for incremental progress tracking)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
""")
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
""")
# Update constraint to include 'partial' status (for existing databases)
await conn.execute("""
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
""")
await conn.execute("""
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
created_at,
started_at,
completed_at,
updated_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata,
scrape_logs
scrape_logs,
review_topics
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:
return dict(row)
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
include_partial: If True, also return reviews for running and partial jobs
Returns:
List of reviews or None if not found/not completed
List of reviews or None if not found/no reviews
"""
async with self.pool.acquire() as conn:
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if include_partial:
# Return reviews for completed, running, or partial jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
""", job_id)
else:
# Only return reviews for completed jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if not reviews_data:
return None
@@ -278,7 +310,8 @@ class DatabaseManager:
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
scrape_logs: Optional[List[Dict[str, Any]]] = None,
review_topics: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
review_topics: List of topic filter dictionaries with topic and count
"""
async with self.pool.acquire() as conn:
# If reviews list is empty, check if job already has reviews from incremental saves
# This happens when flush_callback was used during scraping
if not reviews:
existing = await conn.fetchval(
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
)
if existing and existing > 0:
# Job has reviews from incremental saves, don't overwrite reviews_data
await conn.execute("""
UPDATE jobs
SET
status = 'completed',
completed_at = NOW(),
total_reviews = COALESCE($2, total_reviews),
scrape_time = $3,
scrape_logs = $4::jsonb,
review_topics = $5::jsonb
WHERE job_id = $1
""", job_id, total_reviews, scrape_time,
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
return
await conn.execute("""
UPDATE jobs
SET
@@ -300,13 +358,70 @@ class DatabaseManager:
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5,
scrape_logs = $6::jsonb
scrape_logs = $6::jsonb,
review_topics = $7::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def save_reviews_incremental(
self,
job_id: UUID,
reviews: List[Dict[str, Any]],
total_reviews: Optional[int] = None
):
"""
Save reviews incrementally during scraping.
Called on each flush to preserve progress in case of crash.
Each call replaces the stored reviews_data with the list passed in and
sets reviews_count to its length, so the caller must pass the complete
set collected so far. It also stamps updated_at with the database NOW().
Only a row whose status is 'running' is updated; for any other status
(or an unknown job_id) the UPDATE matches no row and nothing is stored.
Args:
job_id: Job UUID
reviews: ALL reviews collected so far (not just new ones); serialized
with json.dumps, so every entry must be JSON-serializable
total_reviews: Total reviews available (from page counter); when None
the total_reviews value already stored on the job is kept
"""
async with self.pool.acquire() as conn:
# COALESCE keeps the stored total_reviews when $3 is NULL. The
# status = 'running' guard leaves jobs in any other state untouched.
await conn.execute("""
UPDATE jobs
SET
reviews_count = $2,
total_reviews = COALESCE($3, total_reviews),
reviews_data = $4::jsonb,
updated_at = NOW()
WHERE job_id = $1 AND status = 'running'
""", job_id, len(reviews), total_reviews, json.dumps(reviews))
# The result of execute() is not inspected, so this message is logged
# even when the UPDATE matched no row.
log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
async def mark_job_partial(
self,
job_id: UUID,
error_message: str,
scrape_logs: Optional[List[Dict[str, Any]]] = None
):
"""
Mark a job as partial (crashed but has some reviews saved).
Sets status to 'partial', stamps completed_at with the database NOW(),
and stores the error message and scrape logs. reviews_data and
reviews_count are not modified. The UPDATE is not conditional on the
job's current status, and this method does not itself check that any
reviews were saved.
NOTE(review): callers presumably verify that reviews exist before
choosing 'partial' over 'failed' - confirm at the call site.
Args:
job_id: Job UUID
error_message: Error that caused the crash
scrape_logs: Log entries from the scraper; None or an empty list is
written as NULL, replacing whatever scrape_logs the row held
"""
async with self.pool.acquire() as conn:
# A falsy scrape_logs (None or []) is passed as None so the column
# becomes SQL NULL rather than the JSON text "null" or "[]".
await conn.execute("""
UPDATE jobs
SET
status = 'partial',
completed_at = NOW(),
error_message = $2,
scrape_logs = $3::jsonb
WHERE job_id = $1
""", job_id, error_message, json.dumps(scrape_logs) if scrape_logs else None)
# Logged regardless of whether the UPDATE matched a row; the execute()
# result is not checked.
log.info(f"Marked job {job_id} as partial due to: {error_message}")
async def list_jobs(
self,
status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2