203 lines
6.6 KiB
Python
203 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Apply the hierarchical recategorization to the database.
|
|
|
|
This script:
|
|
1. Gets all items currently in Other.Uncategorized
|
|
2. Applies the categorization rules
|
|
3. Updates the database with new paths
|
|
4. Creates new level 2/3 categories as needed
|
|
5. Updates category counts
|
|
"""
|
|
|
|
import psycopg2
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
# Import categorization functions
|
|
import sys
|
|
sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db')
|
|
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item
|
|
|
|
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
|
|
|
def slugify(text):
|
|
"""Convert text to slug format"""
|
|
slug = re.sub(r'[^\w\s-]', '', text)
|
|
slug = re.sub(r'[-\s]+', '_', slug)
|
|
return slug.strip('_')
|
|
|
|
def main():
|
|
conn = psycopg2.connect(DB_URL)
|
|
cursor = conn.cursor()
|
|
|
|
# Get all items in Other.Uncategorized
|
|
cursor.execute("""
|
|
SELECT id, name, slug
|
|
FROM gbp_categories
|
|
WHERE path ~ 'Other.Uncategorized.*' AND level = 4
|
|
ORDER BY name
|
|
""")
|
|
other_items = cursor.fetchall()
|
|
print(f"Found {len(other_items)} items in Other.Uncategorized")
|
|
|
|
# Get existing paths
|
|
cursor.execute("SELECT path::text, id FROM gbp_categories")
|
|
existing_paths = {row[0]: row[1] for row in cursor.fetchall()}
|
|
print(f"Found {len(existing_paths)} existing paths")
|
|
|
|
# Categorize items
|
|
moves = [] # (item_id, item_name, item_slug, new_sector, new_btype)
|
|
stats = defaultdict(int)
|
|
|
|
for item_id, name, slug in other_items:
|
|
sector = get_sector_for_item(name)
|
|
btype = get_business_type_for_item(name, sector)
|
|
|
|
if sector != 'Other':
|
|
moves.append((item_id, name, slug, sector, btype))
|
|
stats[sector] += 1
|
|
else:
|
|
stats['Still_Other'] += 1
|
|
|
|
print(f"\nCategorization results:")
|
|
for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
|
|
print(f" {sector}: {count}")
|
|
|
|
print(f"\nTotal to move: {len(moves)}")
|
|
print(f"Remaining in Other: {stats.get('Still_Other', 0)}")
|
|
|
|
# Ask for confirmation
|
|
response = input("\nProceed with database updates? (yes/no): ")
|
|
if response.lower() != 'yes':
|
|
print("Aborted.")
|
|
conn.close()
|
|
return
|
|
|
|
# Process moves
|
|
created_paths = set()
|
|
updated = 0
|
|
errors = []
|
|
|
|
for item_id, name, slug, sector, btype in moves:
|
|
try:
|
|
sector_slug = slugify(sector)
|
|
btype_slug = slugify(btype)
|
|
|
|
# Check if sector exists
|
|
sector_path = sector_slug
|
|
if sector_path not in existing_paths:
|
|
print(f" [ERROR] Sector not found: {sector_path} for '{name}'")
|
|
errors.append((name, f"Sector not found: {sector_path}"))
|
|
continue
|
|
|
|
# Check/create business type (level 2)
|
|
btype_path = f"{sector_path}.{btype_slug}"
|
|
if btype_path not in existing_paths and btype_path not in created_paths:
|
|
cursor.execute("""
|
|
INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
|
|
SELECT %s, %s, %s::ltree, 2, id, 0
|
|
FROM gbp_categories WHERE path = %s::ltree
|
|
ON CONFLICT (path) DO NOTHING
|
|
RETURNING id
|
|
""", (btype, btype_slug, btype_path, sector_path))
|
|
result = cursor.fetchone()
|
|
if result:
|
|
existing_paths[btype_path] = result[0]
|
|
created_paths.add(btype_path)
|
|
print(f" [NEW] Created business type: {btype_path}")
|
|
|
|
# Check/create sub-category (level 3) - use "General" as default
|
|
subcat = "General"
|
|
subcat_slug = "General"
|
|
subcat_path = f"{btype_path}.{subcat_slug}"
|
|
if subcat_path not in existing_paths and subcat_path not in created_paths:
|
|
cursor.execute("""
|
|
INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
|
|
SELECT %s, %s, %s::ltree, 3, id, 0
|
|
FROM gbp_categories WHERE path = %s::ltree
|
|
ON CONFLICT (path) DO NOTHING
|
|
RETURNING id
|
|
""", (subcat, subcat_slug, subcat_path, btype_path))
|
|
result = cursor.fetchone()
|
|
if result:
|
|
existing_paths[subcat_path] = result[0]
|
|
created_paths.add(subcat_path)
|
|
print(f" [NEW] Created sub-category: {subcat_path}")
|
|
|
|
# Update the item's path
|
|
new_path = f"{subcat_path}.{slug}"
|
|
cursor.execute("""
|
|
UPDATE gbp_categories
|
|
SET path = %s::ltree,
|
|
parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
|
|
WHERE id = %s
|
|
""", (new_path, subcat_path, item_id))
|
|
updated += 1
|
|
|
|
except Exception as e:
|
|
errors.append((name, str(e)))
|
|
print(f" [ERROR] {name}: {e}")
|
|
|
|
# Update category counts
|
|
print("\nUpdating category counts...")
|
|
cursor.execute("""
|
|
WITH counts AS (
|
|
SELECT
|
|
parent_id,
|
|
COUNT(*) as cnt
|
|
FROM gbp_categories
|
|
WHERE parent_id IS NOT NULL
|
|
GROUP BY parent_id
|
|
)
|
|
UPDATE gbp_categories g
|
|
SET category_count = COALESCE(c.cnt, 0)
|
|
FROM counts c
|
|
WHERE g.id = c.parent_id
|
|
""")
|
|
|
|
# Also reset counts for categories that no longer have children
|
|
cursor.execute("""
|
|
UPDATE gbp_categories
|
|
SET category_count = 0
|
|
WHERE id NOT IN (
|
|
SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
|
|
)
|
|
AND level < 4
|
|
""")
|
|
|
|
conn.commit()
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"SUMMARY")
|
|
print(f"{'='*60}")
|
|
print(f"Items moved: {updated}")
|
|
print(f"New paths created: {len(created_paths)}")
|
|
print(f"Errors: {len(errors)}")
|
|
|
|
if errors:
|
|
print("\nErrors:")
|
|
for name, err in errors[:10]:
|
|
print(f" - {name}: {err}")
|
|
if len(errors) > 10:
|
|
print(f" ... and {len(errors) - 10} more")
|
|
|
|
# Show final stats
|
|
cursor.execute("""
|
|
SELECT
|
|
SPLIT_PART(path::text, '.', 1) as sector,
|
|
COUNT(*) as count
|
|
FROM gbp_categories
|
|
WHERE level = 4
|
|
GROUP BY sector
|
|
ORDER BY count DESC
|
|
""")
|
|
print("\nFinal category distribution:")
|
|
for sector, count in cursor.fetchall():
|
|
print(f" {sector}: {count}")
|
|
|
|
conn.close()
|
|
|
|
if __name__ == '__main__':
|
|
main()
|