Files
whyrating-engine-legacy/scripts/backfill_business_category.py
2026-02-02 18:19:00 +00:00

88 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
Backfill missing business_category for existing jobs.
Uses validation_only mode to quickly capture business info without re-scraping reviews.
"""
import asyncio
import asyncpg
import os
import sys
# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
async def backfill_categories():
"""Fetch and update missing business categories."""
# Connect to database
conn = await asyncpg.connect(DATABASE_URL)
try:
# Get jobs missing business_category
rows = await conn.fetch("""
SELECT job_id, url, business_name
FROM jobs
WHERE business_category IS NULL
AND status = 'completed'
ORDER BY created_at DESC
""")
print(f"Found {len(rows)} jobs missing business_category\n")
updated = 0
failed = 0
for row in rows:
job_id = row['job_id']
url = row['url']
name = row['business_name'] or 'Unknown'
print(f"Processing: {name[:50]}...")
try:
# Run validation-only scrape (fast - just captures business info)
result = await asyncio.to_thread(
fast_scrape_reviews,
url=url,
headless=True,
validation_only=True
)
# Extract category from validation_info
validation_info = result.get('validation_info', {})
category = validation_info.get('category')
if category:
# Update the database
await conn.execute("""
UPDATE jobs
SET business_category = $2,
updated_at = NOW()
WHERE job_id = $1
""", job_id, category)
print(f" ✓ Category: {category}")
updated += 1
else:
print(f" ✗ No category found")
failed += 1
except Exception as e:
print(f" ✗ Error: {e}")
failed += 1
print(f"\n{'='*50}")
print(f"Done! Updated: {updated}, Failed: {failed}")
finally:
await conn.close()
if __name__ == '__main__':
asyncio.run(backfill_categories())