88 lines
2.5 KiB
Python
88 lines
2.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Backfill missing business_category for existing jobs.
|
|
Uses validation_only mode to quickly capture business info without re-scraping reviews.
|
|
"""
|
|
import asyncio
|
|
import asyncpg
|
|
import os
|
|
import sys
|
|
|
|
# Add project root to path
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
|
|
|
|
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
|
|
|
|
|
|
async def backfill_categories():
|
|
"""Fetch and update missing business categories."""
|
|
|
|
# Connect to database
|
|
conn = await asyncpg.connect(DATABASE_URL)
|
|
|
|
try:
|
|
# Get jobs missing business_category
|
|
rows = await conn.fetch("""
|
|
SELECT job_id, url, business_name
|
|
FROM jobs
|
|
WHERE business_category IS NULL
|
|
AND status = 'completed'
|
|
ORDER BY created_at DESC
|
|
""")
|
|
|
|
print(f"Found {len(rows)} jobs missing business_category\n")
|
|
|
|
updated = 0
|
|
failed = 0
|
|
|
|
for row in rows:
|
|
job_id = row['job_id']
|
|
url = row['url']
|
|
name = row['business_name'] or 'Unknown'
|
|
|
|
print(f"Processing: {name[:50]}...")
|
|
|
|
try:
|
|
# Run validation-only scrape (fast - just captures business info)
|
|
result = await asyncio.to_thread(
|
|
fast_scrape_reviews,
|
|
url=url,
|
|
headless=True,
|
|
validation_only=True
|
|
)
|
|
|
|
# Extract category from validation_info
|
|
validation_info = result.get('validation_info', {})
|
|
category = validation_info.get('category')
|
|
|
|
if category:
|
|
# Update the database
|
|
await conn.execute("""
|
|
UPDATE jobs
|
|
SET business_category = $2,
|
|
updated_at = NOW()
|
|
WHERE job_id = $1
|
|
""", job_id, category)
|
|
|
|
print(f" ✓ Category: {category}")
|
|
updated += 1
|
|
else:
|
|
print(f" ✗ No category found")
|
|
failed += 1
|
|
|
|
except Exception as e:
|
|
print(f" ✗ Error: {e}")
|
|
failed += 1
|
|
|
|
print(f"\n{'='*50}")
|
|
print(f"Done! Updated: {updated}, Failed: {failed}")
|
|
|
|
finally:
|
|
await conn.close()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
asyncio.run(backfill_categories())
|