#!/usr/bin/env python3 """ Backfill missing business_category for existing jobs. Uses validation_only mode to quickly capture business info without re-scraping reviews. """ import asyncio import asyncpg import os import sys # Add project root to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper') async def backfill_categories(): """Fetch and update missing business categories.""" # Connect to database conn = await asyncpg.connect(DATABASE_URL) try: # Get jobs missing business_category rows = await conn.fetch(""" SELECT job_id, url, business_name FROM jobs WHERE business_category IS NULL AND status = 'completed' ORDER BY created_at DESC """) print(f"Found {len(rows)} jobs missing business_category\n") updated = 0 failed = 0 for row in rows: job_id = row['job_id'] url = row['url'] name = row['business_name'] or 'Unknown' print(f"Processing: {name[:50]}...") try: # Run validation-only scrape (fast - just captures business info) result = await asyncio.to_thread( fast_scrape_reviews, url=url, headless=True, validation_only=True ) # Extract category from validation_info validation_info = result.get('validation_info', {}) category = validation_info.get('category') if category: # Update the database await conn.execute(""" UPDATE jobs SET business_category = $2, updated_at = NOW() WHERE job_id = $1 """, job_id, category) print(f" ✓ Category: {category}") updated += 1 else: print(f" ✗ No category found") failed += 1 except Exception as e: print(f" ✗ Error: {e}") failed += 1 print(f"\n{'='*50}") print(f"Done! Updated: {updated}, Failed: {failed}") finally: await conn.close() if __name__ == '__main__': asyncio.run(backfill_categories())