Add browser fingerprint support and analytics metadata display
- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
|
||||
COMPLETED = "completed"
|
||||
FAILED = "failed"
|
||||
CANCELLED = "cancelled"
|
||||
PARTIAL = "partial" # Job crashed but has partial reviews saved
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
@@ -69,6 +70,7 @@ class DatabaseManager:
|
||||
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
|
||||
started_at TIMESTAMP,
|
||||
completed_at TIMESTAMP,
|
||||
updated_at TIMESTAMP,
|
||||
|
||||
reviews_count INTEGER,
|
||||
total_reviews INTEGER,
|
||||
@@ -79,7 +81,7 @@ class DatabaseManager:
|
||||
metadata JSONB,
|
||||
scrape_logs JSONB,
|
||||
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
|
||||
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
|
||||
);
|
||||
""")
|
||||
|
||||
@@ -88,6 +90,24 @@ class DatabaseManager:
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
|
||||
""")
|
||||
|
||||
# Add updated_at column if it doesn't exist (for incremental progress tracking)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
|
||||
""")
|
||||
|
||||
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
|
||||
""")
|
||||
|
||||
# Update constraint to include 'partial' status (for existing databases)
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
|
||||
""")
|
||||
await conn.execute("""
|
||||
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
await conn.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
|
||||
@@ -187,13 +207,15 @@ class DatabaseManager:
|
||||
created_at,
|
||||
started_at,
|
||||
completed_at,
|
||||
updated_at,
|
||||
reviews_count,
|
||||
total_reviews,
|
||||
reviews_data,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata,
|
||||
scrape_logs
|
||||
scrape_logs,
|
||||
review_topics
|
||||
FROM jobs
|
||||
WHERE job_id = $1
|
||||
""", job_id)
|
||||
@@ -203,22 +225,32 @@ class DatabaseManager:
|
||||
|
||||
return dict(row)
|
||||
|
||||
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
|
||||
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
|
||||
"""
|
||||
Get reviews for a specific job.
|
||||
|
||||
Args:
|
||||
job_id: Job UUID
|
||||
include_partial: If True, also return reviews for running and partial jobs
|
||||
|
||||
Returns:
|
||||
List of reviews or None if not found/not completed
|
||||
List of reviews or None if not found/no reviews
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
WHERE job_id = $1 AND status = 'completed'
|
||||
""", job_id)
|
||||
if include_partial:
|
||||
# Return reviews for completed, running, or partial jobs
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
|
||||
""", job_id)
|
||||
else:
|
||||
# Only return reviews for completed jobs
|
||||
reviews_data = await conn.fetchval("""
|
||||
SELECT reviews_data
|
||||
FROM jobs
|
||||
WHERE job_id = $1 AND status = 'completed'
|
||||
""", job_id)
|
||||
|
||||
if not reviews_data:
|
||||
return None
|
||||
@@ -278,7 +310,8 @@ class DatabaseManager:
|
||||
reviews: List[Dict[str, Any]],
|
||||
scrape_time: float,
|
||||
total_reviews: Optional[int] = None,
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None
|
||||
scrape_logs: Optional[List[Dict[str, Any]]] = None,
|
||||
review_topics: Optional[List[Dict[str, Any]]] = None
|
||||
):
|
||||
"""
|
||||
Save scraping results to database.
|
||||
@@ -289,8 +322,33 @@ class DatabaseManager:
|
||||
scrape_time: Time taken to scrape in seconds
|
||||
total_reviews: Total reviews available (from page counter)
|
||||
scrape_logs: List of log entries from the scraper
|
||||
review_topics: List of topic filter dictionaries with topic and count
|
||||
"""
|
||||
async with self.pool.acquire() as conn:
|
||||
# If reviews list is empty, check if job already has reviews from incremental saves
|
||||
# This happens when flush_callback was used during scraping
|
||||
if not reviews:
|
||||
existing = await conn.fetchval(
|
||||
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
|
||||
)
|
||||
if existing and existing > 0:
|
||||
# Job has reviews from incremental saves, don't overwrite reviews_data
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
status = 'completed',
|
||||
completed_at = NOW(),
|
||||
total_reviews = COALESCE($2, total_reviews),
|
||||
scrape_time = $3,
|
||||
scrape_logs = $4::jsonb,
|
||||
review_topics = $5::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, total_reviews, scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None,
|
||||
json.dumps(review_topics) if review_topics else None)
|
||||
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
|
||||
return
|
||||
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET
|
||||
@@ -300,13 +358,70 @@ class DatabaseManager:
|
||||
total_reviews = $3,
|
||||
reviews_data = $4::jsonb,
|
||||
scrape_time = $5,
|
||||
scrape_logs = $6::jsonb
|
||||
scrape_logs = $6::jsonb,
|
||||
review_topics = $7::jsonb
|
||||
WHERE job_id = $1
|
||||
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
|
||||
json.dumps(scrape_logs) if scrape_logs else None)
|
||||
json.dumps(scrape_logs) if scrape_logs else None,
|
||||
json.dumps(review_topics) if review_topics else None)
|
||||
|
||||
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def save_reviews_incremental(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    total_reviews: Optional[int] = None
):
    """
    Save reviews incrementally during scraping.

    Called on each flush to preserve progress in case of crash. Only a row
    whose status is still 'running' is updated; if no such row exists the
    write is skipped and a warning is logged instead of a success message.

    Args:
        job_id: Job UUID
        reviews: ALL reviews collected so far (not just new ones)
        total_reviews: Total reviews available (from page counter);
            when None the stored value is kept (COALESCE)
    """
    async with self.pool.acquire() as conn:
        # execute() returns the command status tag (e.g. "UPDATE 1"),
        # which tells us whether the status guard matched a row.
        result = await conn.execute("""
            UPDATE jobs
            SET
                reviews_count = $2,
                total_reviews = COALESCE($3, total_reviews),
                reviews_data = $4::jsonb,
                updated_at = NOW()
            WHERE job_id = $1 AND status = 'running'
        """, job_id, len(reviews), total_reviews, json.dumps(reviews))

    if result == "UPDATE 0":
        # Nothing was written: the job does not exist or is no longer running.
        log.warning(
            f"Incremental save skipped for job {job_id}: "
            f"no row with status 'running' ({len(reviews)} reviews not persisted)"
        )
        return

    log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
|
||||
|
||||
async def mark_job_partial(
    self,
    job_id: UUID,
    error_message: str,
    scrape_logs: Optional[List[Dict[str, Any]]] = None
):
    """
    Flag a job as 'partial' (it crashed, but some reviews were saved).

    Sets the status to 'partial', stamps completed_at with the current
    database time, and records the error message and scraper logs.

    Args:
        job_id: Job UUID
        error_message: Error that caused the crash
        scrape_logs: Log entries from the scraper; a falsy value
            (None or an empty list) is stored as NULL
    """
    logs_json = json.dumps(scrape_logs) if scrape_logs else None
    update_sql = """
        UPDATE jobs
        SET
            status = 'partial',
            completed_at = NOW(),
            error_message = $2,
            scrape_logs = $3::jsonb
        WHERE job_id = $1
    """

    async with self.pool.acquire() as conn:
        await conn.execute(update_sql, job_id, error_message, logs_json)

    log.info(f"Marked job {job_id} as partial due to: {error_message}")
|
||||
|
||||
async def list_jobs(
|
||||
self,
|
||||
status: Optional[JobStatus] = None,
|
||||
@@ -337,7 +452,8 @@ class DatabaseManager:
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
WHERE status = $1
|
||||
ORDER BY created_at DESC
|
||||
@@ -355,7 +471,8 @@ class DatabaseManager:
|
||||
total_reviews,
|
||||
scrape_time,
|
||||
error_message,
|
||||
metadata
|
||||
metadata,
|
||||
review_topics
|
||||
FROM jobs
|
||||
ORDER BY created_at DESC
|
||||
LIMIT $1 OFFSET $2
|
||||
|
||||
Reference in New Issue
Block a user