Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Apply the hierarchical recategorization to the database.
This script:
1. Gets all items currently in Other.Uncategorized
2. Applies the categorization rules
3. Updates the database with new paths
4. Creates new level 2/3 categories as needed
5. Updates category counts
"""
import os
import re
import sys
from collections import defaultdict

import psycopg2

# Import categorization functions
sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db')
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item
# Connection string for the scraper database. The SCRAPER_DB_URL environment
# variable, when set, overrides the built-in local-development default so that
# real credentials do not have to be committed to source control.
DB_URL = os.environ.get(
    "SCRAPER_DB_URL",
    "postgresql://scraper:scraper123@localhost:5437/scraper",
)
def slugify(text):
    """Return *text* as an underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; each run of whitespace and/or hyphens then becomes a single
    underscore, and underscores at either end are trimmed.
    """
    # Step 1: discard punctuation, keeping word chars, whitespace and '-'.
    without_punctuation = re.sub(r'[^\w\s-]', '', text)
    # Step 2: collapse separator runs into '_' and trim the edges.
    return re.sub(r'[-\s]+', '_', without_punctuation).strip('_')
def _ensure_category(cursor, existing_paths, name, slug, path, level, parent_path):
    """Make sure a category row exists at *path*.

    Args:
        cursor: open database cursor.
        existing_paths: dict mapping path text -> category id; updated in
            place with the id of the row at *path*.
        name, slug: values for the new row's ``name`` and ``slug`` columns.
        path: ltree path of the category to create.
        level: value for the new row's ``level`` column.
        parent_path: ltree path of the parent row; its id becomes ``parent_id``.

    Returns:
        True if this call inserted the row, False if it already existed.

    Raises:
        LookupError: if nothing was inserted and no row exists at *path*,
            i.e. the parent row at *parent_path* is missing.
    """
    if path in existing_paths:
        return False

    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))
    row = cursor.fetchone()
    inserted = row is not None

    if not inserted:
        # No row came back: either the path already exists (conflict) or the
        # parent SELECT matched nothing. Tell the two apart so an item is never
        # re-parented under a category that does not exist.
        cursor.execute(
            "SELECT id FROM gbp_categories WHERE path = %s::ltree", (path,)
        )
        row = cursor.fetchone()
        if row is None:
            raise LookupError(f"Parent category not found: {parent_path}")

    existing_paths[path] = row[0]
    return inserted


def _move_item(cursor, existing_paths, item_id, item_slug, sector_path,
               btype, btype_slug, new_paths):
    """Re-path one item under ``<sector>.<business type>.General``.

    Creates the level 2 and level 3 categories when needed, appending a
    ``(label, path)`` tuple to *new_paths* for each row actually inserted,
    then updates the item's ``path`` and ``parent_id``.
    """
    # Level 2: business type under the sector.
    btype_path = f"{sector_path}.{btype_slug}"
    if _ensure_category(cursor, existing_paths, btype, btype_slug,
                        btype_path, 2, sector_path):
        new_paths.append(("business type", btype_path))

    # Level 3: every moved item goes into the default "General" sub-category.
    subcat = "General"
    subcat_slug = "General"
    subcat_path = f"{btype_path}.{subcat_slug}"
    if _ensure_category(cursor, existing_paths, subcat, subcat_slug,
                        subcat_path, 3, btype_path):
        new_paths.append(("sub-category", subcat_path))

    new_path = f"{subcat_path}.{item_slug}"
    cursor.execute("""
        UPDATE gbp_categories
        SET path = %s::ltree,
            parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
        WHERE id = %s
    """, (new_path, subcat_path, item_id))


def _refresh_category_counts(cursor):
    """Recompute ``category_count`` as the number of direct children."""
    cursor.execute("""
        WITH counts AS (
            SELECT
                parent_id,
                COUNT(*) as cnt
            FROM gbp_categories
            WHERE parent_id IS NOT NULL
            GROUP BY parent_id
        )
        UPDATE gbp_categories g
        SET category_count = COALESCE(c.cnt, 0)
        FROM counts c
        WHERE g.id = c.parent_id
    """)
    # Also reset counts for categories that no longer have children
    cursor.execute("""
        UPDATE gbp_categories
        SET category_count = 0
        WHERE id NOT IN (
            SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
        )
        AND level < 4
    """)


def main():
    """Interactively move level 4 items out of ``Other.Uncategorized``.

    Steps:
      1. Load the level 4 rows under ``Other.Uncategorized`` and all paths.
      2. Classify each item with ``get_sector_for_item`` and
         ``get_business_type_for_item``; items whose sector is ``'Other'``
         are left where they are.
      3. Print the plan and ask for confirmation (only ``yes`` proceeds).
      4. Move each item inside its own savepoint, so one failing item is
         rolled back and reported without aborting the whole transaction.
      5. Recompute category counts, commit, and print a summary.

    The connection is always closed; closing without a commit discards any
    uncommitted work.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized")

        # Get existing paths
        cursor.execute("SELECT path::text, id FROM gbp_categories")
        existing_paths = {row[0]: row[1] for row in cursor.fetchall()}
        print(f"Found {len(existing_paths)} existing paths")

        # Categorize items
        moves = []  # (item_id, item_name, item_slug, new_sector, new_btype)
        stats = defaultdict(int)
        for item_id, name, slug in other_items:
            sector = get_sector_for_item(name)
            btype = get_business_type_for_item(name, sector)
            if sector != 'Other':
                moves.append((item_id, name, slug, sector, btype))
                stats[sector] += 1
            else:
                stats['Still_Other'] += 1

        print(f"\nCategorization results:")
        for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
            print(f"  {sector}: {count}")
        print(f"\nTotal to move: {len(moves)}")
        print(f"Remaining in Other: {stats.get('Still_Other', 0)}")

        # Ask for confirmation
        response = input("\nProceed with database updates? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborted.")
            return

        # Process moves
        created_paths = set()
        updated = 0
        errors = []
        for item_id, name, slug, sector, btype in moves:
            # (label, path) for rows inserted on behalf of this item; they are
            # undone together with the item if anything below fails.
            new_paths = []
            # A failed statement aborts the surrounding PostgreSQL transaction;
            # the savepoint confines that damage to the current item.
            cursor.execute("SAVEPOINT recategorize_item")
            try:
                sector_path = slugify(sector)
                btype_slug = slugify(btype)
                if sector_path not in existing_paths:
                    print(f"  [ERROR] Sector not found: {sector_path} for '{name}'")
                    errors.append((name, f"Sector not found: {sector_path}"))
                else:
                    _move_item(cursor, existing_paths, item_id, slug,
                               sector_path, btype, btype_slug, new_paths)
                    updated += 1
            except Exception as e:
                cursor.execute("ROLLBACK TO SAVEPOINT recategorize_item")
                cursor.execute("RELEASE SAVEPOINT recategorize_item")
                # The rollback removed these rows, so forget them in the cache
                # too; a later item must be able to create them again.
                for _, path in new_paths:
                    existing_paths.pop(path, None)
                errors.append((name, str(e)))
                print(f"  [ERROR] {name}: {e}")
            else:
                cursor.execute("RELEASE SAVEPOINT recategorize_item")
                for label, path in new_paths:
                    created_paths.add(path)
                    print(f"  [NEW] Created {label}: {path}")

        # Update category counts
        print("\nUpdating category counts...")
        _refresh_category_counts(cursor)
        conn.commit()

        print(f"\n{'='*60}")
        print(f"SUMMARY")
        print(f"{'='*60}")
        print(f"Items moved: {updated}")
        print(f"New paths created: {len(created_paths)}")
        print(f"Errors: {len(errors)}")
        if errors:
            print("\nErrors:")
            for name, err in errors[:10]:
                print(f"  - {name}: {err}")
            if len(errors) > 10:
                print(f"  ... and {len(errors) - 10} more")

        # Show final stats
        cursor.execute("""
            SELECT
                SPLIT_PART(path::text, '.', 1) as sector,
                COUNT(*) as count
            FROM gbp_categories
            WHERE level = 4
            GROUP BY sector
            ORDER BY count DESC
        """)
        print("\nFinal category distribution:")
        for sector, count in cursor.fetchall():
            print(f"  {sector}: {count}")
    finally:
        conn.close()
# Run the recategorization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()