Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
87
scripts/backfill_business_category.py
Normal file
87
scripts/backfill_business_category.py
Normal file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill missing business_category for existing jobs.
|
||||
Uses validation_only mode to quickly capture business info without re-scraping reviews.
|
||||
"""
|
||||
import asyncio
|
||||
import asyncpg
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scrapers.google_reviews.v1_1_0 import fast_scrape_reviews
|
||||
|
||||
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://scraper:scraper123@localhost:5437/scraper')
|
||||
|
||||
|
||||
async def backfill_categories():
|
||||
"""Fetch and update missing business categories."""
|
||||
|
||||
# Connect to database
|
||||
conn = await asyncpg.connect(DATABASE_URL)
|
||||
|
||||
try:
|
||||
# Get jobs missing business_category
|
||||
rows = await conn.fetch("""
|
||||
SELECT job_id, url, business_name
|
||||
FROM jobs
|
||||
WHERE business_category IS NULL
|
||||
AND status = 'completed'
|
||||
ORDER BY created_at DESC
|
||||
""")
|
||||
|
||||
print(f"Found {len(rows)} jobs missing business_category\n")
|
||||
|
||||
updated = 0
|
||||
failed = 0
|
||||
|
||||
for row in rows:
|
||||
job_id = row['job_id']
|
||||
url = row['url']
|
||||
name = row['business_name'] or 'Unknown'
|
||||
|
||||
print(f"Processing: {name[:50]}...")
|
||||
|
||||
try:
|
||||
# Run validation-only scrape (fast - just captures business info)
|
||||
result = await asyncio.to_thread(
|
||||
fast_scrape_reviews,
|
||||
url=url,
|
||||
headless=True,
|
||||
validation_only=True
|
||||
)
|
||||
|
||||
# Extract category from validation_info
|
||||
validation_info = result.get('validation_info', {})
|
||||
category = validation_info.get('category')
|
||||
|
||||
if category:
|
||||
# Update the database
|
||||
await conn.execute("""
|
||||
UPDATE jobs
|
||||
SET business_category = $2,
|
||||
updated_at = NOW()
|
||||
WHERE job_id = $1
|
||||
""", job_id, category)
|
||||
|
||||
print(f" ✓ Category: {category}")
|
||||
updated += 1
|
||||
else:
|
||||
print(f" ✗ No category found")
|
||||
failed += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f" ✗ Error: {e}")
|
||||
failed += 1
|
||||
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Done! Updated: {updated}, Failed: {failed}")
|
||||
|
||||
finally:
|
||||
await conn.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.run(backfill_categories())
|
||||
Reference in New Issue
Block a user