#!/usr/bin/env python3 """ Apply the hierarchical recategorization to the database. This script: 1. Gets all items currently in Other.Uncategorized 2. Applies the categorization rules 3. Updates the database with new paths 4. Creates new level 2/3 categories as needed 5. Updates category counts """ import psycopg2 import re from collections import defaultdict # Import categorization functions import sys sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db') from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper" def slugify(text): """Convert text to slug format""" slug = re.sub(r'[^\w\s-]', '', text) slug = re.sub(r'[-\s]+', '_', slug) return slug.strip('_') def main(): conn = psycopg2.connect(DB_URL) cursor = conn.cursor() # Get all items in Other.Uncategorized cursor.execute(""" SELECT id, name, slug FROM gbp_categories WHERE path ~ 'Other.Uncategorized.*' AND level = 4 ORDER BY name """) other_items = cursor.fetchall() print(f"Found {len(other_items)} items in Other.Uncategorized") # Get existing paths cursor.execute("SELECT path::text, id FROM gbp_categories") existing_paths = {row[0]: row[1] for row in cursor.fetchall()} print(f"Found {len(existing_paths)} existing paths") # Categorize items moves = [] # (item_id, item_name, item_slug, new_sector, new_btype) stats = defaultdict(int) for item_id, name, slug in other_items: sector = get_sector_for_item(name) btype = get_business_type_for_item(name, sector) if sector != 'Other': moves.append((item_id, name, slug, sector, btype)) stats[sector] += 1 else: stats['Still_Other'] += 1 print(f"\nCategorization results:") for sector, count in sorted(stats.items(), key=lambda x: -x[1]): print(f" {sector}: {count}") print(f"\nTotal to move: {len(moves)}") print(f"Remaining in Other: {stats.get('Still_Other', 0)}") # Ask for confirmation response = input("\nProceed with database updates? (yes/no): ") if response.lower() != 'yes': print("Aborted.") conn.close() return # Process moves created_paths = set() updated = 0 errors = [] for item_id, name, slug, sector, btype in moves: try: sector_slug = slugify(sector) btype_slug = slugify(btype) # Check if sector exists sector_path = sector_slug if sector_path not in existing_paths: print(f" [ERROR] Sector not found: {sector_path} for '{name}'") errors.append((name, f"Sector not found: {sector_path}")) continue # Check/create business type (level 2) btype_path = f"{sector_path}.{btype_slug}" if btype_path not in existing_paths and btype_path not in created_paths: cursor.execute(""" INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count) SELECT %s, %s, %s::ltree, 2, id, 0 FROM gbp_categories WHERE path = %s::ltree ON CONFLICT (path) DO NOTHING RETURNING id """, (btype, btype_slug, btype_path, sector_path)) result = cursor.fetchone() if result: existing_paths[btype_path] = result[0] created_paths.add(btype_path) print(f" [NEW] Created business type: {btype_path}") # Check/create sub-category (level 3) - use "General" as default subcat = "General" subcat_slug = "General" subcat_path = f"{btype_path}.{subcat_slug}" if subcat_path not in existing_paths and subcat_path not in created_paths: cursor.execute(""" INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count) SELECT %s, %s, %s::ltree, 3, id, 0 FROM gbp_categories WHERE path = %s::ltree ON CONFLICT (path) DO NOTHING RETURNING id """, (subcat, subcat_slug, subcat_path, btype_path)) result = cursor.fetchone() if result: existing_paths[subcat_path] = result[0] created_paths.add(subcat_path) print(f" [NEW] Created sub-category: {subcat_path}") # Update the item's path new_path = f"{subcat_path}.{slug}" cursor.execute(""" UPDATE gbp_categories SET path = %s::ltree, parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree) WHERE id = %s """, (new_path, subcat_path, item_id)) updated += 1 except Exception as e: errors.append((name, str(e))) print(f" [ERROR] {name}: {e}") # Update category counts print("\nUpdating category counts...") cursor.execute(""" WITH counts AS ( SELECT parent_id, COUNT(*) as cnt FROM gbp_categories WHERE parent_id IS NOT NULL GROUP BY parent_id ) UPDATE gbp_categories g SET category_count = COALESCE(c.cnt, 0) FROM counts c WHERE g.id = c.parent_id """) # Also reset counts for categories that no longer have children cursor.execute(""" UPDATE gbp_categories SET category_count = 0 WHERE id NOT IN ( SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL ) AND level < 4 """) conn.commit() print(f"\n{'='*60}") print(f"SUMMARY") print(f"{'='*60}") print(f"Items moved: {updated}") print(f"New paths created: {len(created_paths)}") print(f"Errors: {len(errors)}") if errors: print("\nErrors:") for name, err in errors[:10]: print(f" - {name}: {err}") if len(errors) > 10: print(f" ... and {len(errors) - 10} more") # Show final stats cursor.execute(""" SELECT SPLIT_PART(path::text, '.', 1) as sector, COUNT(*) as count FROM gbp_categories WHERE level = 4 GROUP BY sector ORDER BY count DESC """) print("\nFinal category distribution:") for sector, count in cursor.fetchall(): print(f" {sector}: {count}") conn.close() if __name__ == '__main__': main()