Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
202
db/apply_recategorization.py
Normal file
202
db/apply_recategorization.py
Normal file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Apply the hierarchical recategorization to the database.
|
||||
|
||||
This script:
|
||||
1. Gets all items currently in Other.Uncategorized
|
||||
2. Applies the categorization rules
|
||||
3. Updates the database with new paths
|
||||
4. Creates new level 2/3 categories as needed
|
||||
5. Updates category counts
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Import categorization functions
|
||||
import sys
|
||||
sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db')
|
||||
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item
|
||||
|
||||
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||
|
||||
def slugify(text):
|
||||
"""Convert text to slug format"""
|
||||
slug = re.sub(r'[^\w\s-]', '', text)
|
||||
slug = re.sub(r'[-\s]+', '_', slug)
|
||||
return slug.strip('_')
|
||||
|
||||
def main():
|
||||
conn = psycopg2.connect(DB_URL)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Get all items in Other.Uncategorized
|
||||
cursor.execute("""
|
||||
SELECT id, name, slug
|
||||
FROM gbp_categories
|
||||
WHERE path ~ 'Other.Uncategorized.*' AND level = 4
|
||||
ORDER BY name
|
||||
""")
|
||||
other_items = cursor.fetchall()
|
||||
print(f"Found {len(other_items)} items in Other.Uncategorized")
|
||||
|
||||
# Get existing paths
|
||||
cursor.execute("SELECT path::text, id FROM gbp_categories")
|
||||
existing_paths = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
print(f"Found {len(existing_paths)} existing paths")
|
||||
|
||||
# Categorize items
|
||||
moves = [] # (item_id, item_name, item_slug, new_sector, new_btype)
|
||||
stats = defaultdict(int)
|
||||
|
||||
for item_id, name, slug in other_items:
|
||||
sector = get_sector_for_item(name)
|
||||
btype = get_business_type_for_item(name, sector)
|
||||
|
||||
if sector != 'Other':
|
||||
moves.append((item_id, name, slug, sector, btype))
|
||||
stats[sector] += 1
|
||||
else:
|
||||
stats['Still_Other'] += 1
|
||||
|
||||
print(f"\nCategorization results:")
|
||||
for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
|
||||
print(f" {sector}: {count}")
|
||||
|
||||
print(f"\nTotal to move: {len(moves)}")
|
||||
print(f"Remaining in Other: {stats.get('Still_Other', 0)}")
|
||||
|
||||
# Ask for confirmation
|
||||
response = input("\nProceed with database updates? (yes/no): ")
|
||||
if response.lower() != 'yes':
|
||||
print("Aborted.")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
# Process moves
|
||||
created_paths = set()
|
||||
updated = 0
|
||||
errors = []
|
||||
|
||||
for item_id, name, slug, sector, btype in moves:
|
||||
try:
|
||||
sector_slug = slugify(sector)
|
||||
btype_slug = slugify(btype)
|
||||
|
||||
# Check if sector exists
|
||||
sector_path = sector_slug
|
||||
if sector_path not in existing_paths:
|
||||
print(f" [ERROR] Sector not found: {sector_path} for '{name}'")
|
||||
errors.append((name, f"Sector not found: {sector_path}"))
|
||||
continue
|
||||
|
||||
# Check/create business type (level 2)
|
||||
btype_path = f"{sector_path}.{btype_slug}"
|
||||
if btype_path not in existing_paths and btype_path not in created_paths:
|
||||
cursor.execute("""
|
||||
INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
|
||||
SELECT %s, %s, %s::ltree, 2, id, 0
|
||||
FROM gbp_categories WHERE path = %s::ltree
|
||||
ON CONFLICT (path) DO NOTHING
|
||||
RETURNING id
|
||||
""", (btype, btype_slug, btype_path, sector_path))
|
||||
result = cursor.fetchone()
|
||||
if result:
|
||||
existing_paths[btype_path] = result[0]
|
||||
created_paths.add(btype_path)
|
||||
print(f" [NEW] Created business type: {btype_path}")
|
||||
|
||||
# Check/create sub-category (level 3) - use "General" as default
|
||||
subcat = "General"
|
||||
subcat_slug = "General"
|
||||
subcat_path = f"{btype_path}.{subcat_slug}"
|
||||
if subcat_path not in existing_paths and subcat_path not in created_paths:
|
||||
cursor.execute("""
|
||||
INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
|
||||
SELECT %s, %s, %s::ltree, 3, id, 0
|
||||
FROM gbp_categories WHERE path = %s::ltree
|
||||
ON CONFLICT (path) DO NOTHING
|
||||
RETURNING id
|
||||
""", (subcat, subcat_slug, subcat_path, btype_path))
|
||||
result = cursor.fetchone()
|
||||
if result:
|
||||
existing_paths[subcat_path] = result[0]
|
||||
created_paths.add(subcat_path)
|
||||
print(f" [NEW] Created sub-category: {subcat_path}")
|
||||
|
||||
# Update the item's path
|
||||
new_path = f"{subcat_path}.{slug}"
|
||||
cursor.execute("""
|
||||
UPDATE gbp_categories
|
||||
SET path = %s::ltree,
|
||||
parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
|
||||
WHERE id = %s
|
||||
""", (new_path, subcat_path, item_id))
|
||||
updated += 1
|
||||
|
||||
except Exception as e:
|
||||
errors.append((name, str(e)))
|
||||
print(f" [ERROR] {name}: {e}")
|
||||
|
||||
# Update category counts
|
||||
print("\nUpdating category counts...")
|
||||
cursor.execute("""
|
||||
WITH counts AS (
|
||||
SELECT
|
||||
parent_id,
|
||||
COUNT(*) as cnt
|
||||
FROM gbp_categories
|
||||
WHERE parent_id IS NOT NULL
|
||||
GROUP BY parent_id
|
||||
)
|
||||
UPDATE gbp_categories g
|
||||
SET category_count = COALESCE(c.cnt, 0)
|
||||
FROM counts c
|
||||
WHERE g.id = c.parent_id
|
||||
""")
|
||||
|
||||
# Also reset counts for categories that no longer have children
|
||||
cursor.execute("""
|
||||
UPDATE gbp_categories
|
||||
SET category_count = 0
|
||||
WHERE id NOT IN (
|
||||
SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
|
||||
)
|
||||
AND level < 4
|
||||
""")
|
||||
|
||||
conn.commit()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"SUMMARY")
|
||||
print(f"{'='*60}")
|
||||
print(f"Items moved: {updated}")
|
||||
print(f"New paths created: {len(created_paths)}")
|
||||
print(f"Errors: {len(errors)}")
|
||||
|
||||
if errors:
|
||||
print("\nErrors:")
|
||||
for name, err in errors[:10]:
|
||||
print(f" - {name}: {err}")
|
||||
if len(errors) > 10:
|
||||
print(f" ... and {len(errors) - 10} more")
|
||||
|
||||
# Show final stats
|
||||
cursor.execute("""
|
||||
SELECT
|
||||
SPLIT_PART(path::text, '.', 1) as sector,
|
||||
COUNT(*) as count
|
||||
FROM gbp_categories
|
||||
WHERE level = 4
|
||||
GROUP BY sector
|
||||
ORDER BY count DESC
|
||||
""")
|
||||
print("\nFinal category distribution:")
|
||||
for sector, count in cursor.fetchall():
|
||||
print(f" {sector}: {count}")
|
||||
|
||||
conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user