Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Apply the hierarchical recategorization to the database.
This script:
1. Gets all items currently in Other.Uncategorized
2. Applies the categorization rules
3. Updates the database with new paths
4. Creates new level 2/3 categories as needed
5. Updates category counts
"""
import psycopg2
import re
from collections import defaultdict
# The categorization rules live in a sibling module, recategorize_hierarchical.
# NOTE(review): the directory added to sys.path below is an absolute path on one
# developer's machine; the import will presumably fail elsewhere unless the
# module is importable some other way - confirm before running on another host.
import sys
sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db')
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item
# Connection string passed to psycopg2.connect() in main().
# NOTE(review): user name and password are embedded in the URL.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
def slugify(text):
    """Return *text* as an underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    removed outright (so ``"Children's"`` becomes ``"Childrens"``). Each
    remaining run of hyphens and/or whitespace is then collapsed into a
    single underscore, and leading/trailing underscores are trimmed.
    """
    # Step 1: drop punctuation, keeping word chars, whitespace and hyphens.
    kept = re.sub(r'[^\w\s-]', '', text)
    # Step 2: turn separator runs into "_" and trim the edges in one go.
    return re.sub(r'[-\s]+', '_', kept).strip('_')
def _ensure_category(cursor, existing_paths, name, slug, path, level, parent_path):
    """Insert the category at *path* unless it is already known.

    The new row is attached to the row whose path equals *parent_path*.
    When a row is actually inserted, its id is recorded in *existing_paths*
    and True is returned. False is returned when the path was already
    cached, or when the INSERT produced no row (path conflict, or no row
    found for *parent_path*).
    """
    if path in existing_paths:
        return False
    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))
    row = cursor.fetchone()
    if row is None:
        return False
    existing_paths[path] = row[0]
    return True


def _recategorize(conn):
    """Run the interactive recategorization workflow on an open connection.

    Steps: load the level-4 rows under Other.Uncategorized, classify each
    one with the rule functions, ask the user for confirmation, move every
    classified row below ``<sector>.<business type>.General`` (creating the
    level 2/3 rows on demand), refresh ``category_count`` and commit.

    Each move runs under its own SAVEPOINT so that a failing statement only
    undoes that one item instead of leaving the whole transaction aborted.
    The caller owns *conn* and is responsible for closing it.
    """
    cursor = conn.cursor()

    # Get all items in Other.Uncategorized
    cursor.execute("""
        SELECT id, name, slug
        FROM gbp_categories
        WHERE path ~ 'Other.Uncategorized.*' AND level = 4
        ORDER BY name
    """)
    other_items = cursor.fetchall()
    print(f"Found {len(other_items)} items in Other.Uncategorized")

    # Cache of path -> id for every category row, kept up to date below.
    cursor.execute("SELECT path::text, id FROM gbp_categories")
    existing_paths = {path: cat_id for path, cat_id in cursor.fetchall()}
    print(f"Found {len(existing_paths)} existing paths")

    # Categorize items
    moves = []  # (item_id, item_name, item_slug, new_sector, new_btype)
    stats = defaultdict(int)
    for item_id, name, slug in other_items:
        sector = get_sector_for_item(name)
        btype = get_business_type_for_item(name, sector)
        if sector != 'Other':
            moves.append((item_id, name, slug, sector, btype))
            stats[sector] += 1
        else:
            stats['Still_Other'] += 1

    print("\nCategorization results:")
    for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
        print(f" {sector}: {count}")
    print(f"\nTotal to move: {len(moves)}")
    print(f"Remaining in Other: {stats.get('Still_Other', 0)}")

    # Ask for confirmation; surrounding whitespace in the answer is ignored.
    response = input("\nProceed with database updates? (yes/no): ")
    if response.strip().lower() != 'yes':
        print("Aborted.")
        return

    # Process moves
    created_paths = set()
    updated = 0
    errors = []
    for item_id, name, slug, sector, btype in moves:
        # Paths inserted on behalf of this item: (label, path). They are
        # dropped from the cache again if the item is rolled back.
        new_paths = []
        cursor.execute("SAVEPOINT recat_item")
        try:
            sector_path = slugify(sector)
            btype_slug = slugify(btype)

            # Sectors (level 1) are never created here; they must exist.
            if sector_path not in existing_paths:
                print(f" [ERROR] Sector not found: {sector_path} for '{name}'")
                errors.append((name, f"Sector not found: {sector_path}"))
                continue

            # Check/create business type (level 2)
            btype_path = f"{sector_path}.{btype_slug}"
            if _ensure_category(cursor, existing_paths, btype, btype_slug,
                                btype_path, 2, sector_path):
                new_paths.append(("business type", btype_path))

            # Check/create sub-category (level 3) - use "General" as default
            subcat = "General"
            subcat_slug = "General"
            subcat_path = f"{btype_path}.{subcat_slug}"
            if _ensure_category(cursor, existing_paths, subcat, subcat_slug,
                                subcat_path, 3, btype_path):
                new_paths.append(("sub-category", subcat_path))

            # Update the item's path
            new_path = f"{subcat_path}.{slug}"
            cursor.execute("""
                UPDATE gbp_categories
                SET path = %s::ltree,
                    parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
                WHERE id = %s
            """, (new_path, subcat_path, item_id))
        except Exception as e:
            # Deliberately broad: one bad item must not stop the batch.
            # Undo this item's statements so the transaction stays usable,
            # and forget cached ids of rows that no longer exist.
            cursor.execute("ROLLBACK TO SAVEPOINT recat_item")
            for _, path in new_paths:
                existing_paths.pop(path, None)
            errors.append((name, str(e)))
            print(f" [ERROR] {name}: {e}")
        else:
            # Only reached when the item was moved (not on `continue`).
            for label, path in new_paths:
                created_paths.add(path)
                print(f" [NEW] Created {label}: {path}")
            updated += 1
        finally:
            # The savepoint still exists after ROLLBACK TO; always drop it.
            cursor.execute("RELEASE SAVEPOINT recat_item")

    # Update category counts
    print("\nUpdating category counts...")
    cursor.execute("""
        WITH counts AS (
            SELECT
                parent_id,
                COUNT(*) as cnt
            FROM gbp_categories
            WHERE parent_id IS NOT NULL
            GROUP BY parent_id
        )
        UPDATE gbp_categories g
        SET category_count = COALESCE(c.cnt, 0)
        FROM counts c
        WHERE g.id = c.parent_id
    """)
    # Also reset counts for categories that no longer have children
    cursor.execute("""
        UPDATE gbp_categories
        SET category_count = 0
        WHERE id NOT IN (
            SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
        )
        AND level < 4
    """)
    conn.commit()

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Items moved: {updated}")
    print(f"New paths created: {len(created_paths)}")
    print(f"Errors: {len(errors)}")
    if errors:
        print("\nErrors:")
        for name, err in errors[:10]:
            print(f" - {name}: {err}")
        if len(errors) > 10:
            print(f" ... and {len(errors) - 10} more")

    # Show final stats
    cursor.execute("""
        SELECT
            SPLIT_PART(path::text, '.', 1) as sector,
            COUNT(*) as count
        FROM gbp_categories
        WHERE level = 4
        GROUP BY sector
        ORDER BY count DESC
    """)
    print("\nFinal category distribution:")
    for sector, count in cursor.fetchall():
        print(f" {sector}: {count}")


def main():
    """Connect to DB_URL, run the recategorization and always close the connection.

    Closing in ``finally`` means an unexpected error never leaks the
    connection; work that was not committed is discarded on close.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        _recategorize(conn)
    finally:
        conn.close()
if __name__ == '__main__':
main()

977
db/import_categories.py Normal file
View File

@@ -0,0 +1,977 @@
#!/usr/bin/env python3
"""
Import Google Business Profile categories into PostgreSQL with ltree hierarchy.
Usage:
python import_categories.py [--csv-path PATH] [--db-url URL]
Example:
python import_categories.py --csv-path ./categories.csv --db-url postgresql://scraper:scraper123@localhost:5437/scraper
"""
import csv
import re
import os
import argparse
from typing import Optional
try:
import psycopg2
from psycopg2.extras import execute_values
HAS_PSYCOPG2 = True
except ImportError:
HAS_PSYCOPG2 = False
# Default paths
DEFAULT_CSV_PATH = os.path.expanduser("~/Downloads/Google Business Profile Categories (2025 List) - Category List (English).csv")
DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
def slugify(text: str) -> str:
    """Turn *text* into a slug made only of ASCII letters, digits and underscores.

    Every run of other characters becomes a single underscore, and
    underscores at either end are removed. A slug that would not start
    with a letter is prefixed with ``cat_``; if nothing is left at all,
    ``'unknown'`` is returned.
    """
    candidate = re.sub(r'[^a-zA-Z0-9]+', '_', text).strip('_')
    if not candidate:
        # Input was empty or contained no ASCII alphanumerics.
        return 'unknown'
    if candidate[0].isalpha():
        return candidate
    # Starts with a digit: prefix it so the label begins with a letter.
    return 'cat_' + candidate
def categorize_category(cat: str) -> tuple:
"""
Categorize a GBP category into 4-level hierarchy.
Returns: (level1, level2, level3, level4)
"""
c = cat.lower()
# === FOOD & DINING ===
if 'restaurant' in c:
if any(x in c for x in ['fast food', 'drive-in', 'takeaway', 'takeout', 'quick service']):
return ("Food & Dining", "Restaurants", "Fast Food & Quick Service", cat)
# Cuisine types
return ("Food & Dining", "Restaurants", "By Cuisine", cat)
if any(x in c for x in ['cafe', 'coffee shop', 'tea house', 'tea room', 'espresso bar']):
return ("Food & Dining", "Cafes & Coffee", "Coffee Shops", cat)
if any(x in c for x in ['bar', 'pub', 'nightclub', 'night club', 'cocktail', 'wine bar', 'beer', 'lounge']):
if 'gay' in c or 'lesbian' in c:
return ("Food & Dining", "Bars & Nightlife", "LGBTQ+ Venues", cat)
if 'karaoke' in c:
return ("Food & Dining", "Bars & Nightlife", "Karaoke", cat)
return ("Food & Dining", "Bars & Nightlife", "Bars & Pubs", cat)
if any(x in c for x in ['bakery', 'pastry', 'cake', 'donut', 'dessert', 'ice cream', 'frozen yogurt', 'candy', 'chocolate', 'confection']):
return ("Food & Dining", "Bakeries & Desserts", "Sweet Shops", cat)
if any(x in c for x in ['caterer', 'catering']):
return ("Food & Dining", "Food Services", "Catering", cat)
if any(x in c for x in ['brewery', 'winery', 'distillery', 'vineyard']):
return ("Food & Dining", "Beverage Production", "Producers", cat)
if any(x in c for x in ['food truck', 'food stand', 'food stall', 'food court']):
return ("Food & Dining", "Quick Service", "Street Food", cat)
# === RETAIL & SHOPPING ===
if 'store' in c or 'shop' in c:
if any(x in c for x in ['clothing', 'fashion', 'shoe', 'dress', 'apparel', 'wear', 'boutique', 'tailor']):
return ("Retail & Shopping", "Clothing & Fashion", "Apparel Stores", cat)
if any(x in c for x in ['electronic', 'computer', 'phone', 'appliance', 'tv', 'audio', 'video game']):
return ("Retail & Shopping", "Electronics", "Electronics Stores", cat)
if any(x in c for x in ['furniture', 'home decor', 'kitchen', 'bed', 'mattress', 'carpet', 'curtain', 'lighting']):
return ("Retail & Shopping", "Home & Garden", "Home Furnishings", cat)
if any(x in c for x in ['grocery', 'supermarket', 'food', 'beverage', 'wine', 'liquor', 'butcher', 'fish', 'fruit', 'vegetable']):
return ("Retail & Shopping", "Food & Grocery", "Grocery Stores", cat)
if any(x in c for x in ['book', 'stationery', 'office supply', 'paper']):
return ("Retail & Shopping", "Books & Office", "Book Stores", cat)
if any(x in c for x in ['pet', 'animal']):
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
if any(x in c for x in ['toy', 'game', 'hobby']):
return ("Retail & Shopping", "Toys & Hobbies", "Toy Stores", cat)
if any(x in c for x in ['jewelry', 'watch', 'gold', 'diamond']):
return ("Retail & Shopping", "Jewelry & Watches", "Jewelry Stores", cat)
if any(x in c for x in ['sport', 'athletic', 'fitness', 'outdoor', 'camping', 'fishing', 'hunting']):
return ("Retail & Shopping", "Sports & Outdoors", "Sporting Goods", cat)
if any(x in c for x in ['music', 'instrument', 'record', 'vinyl']):
return ("Retail & Shopping", "Music & Entertainment", "Music Stores", cat)
if any(x in c for x in ['art', 'craft', 'fabric', 'sewing', 'yarn', 'knitting']):
return ("Retail & Shopping", "Arts & Crafts", "Art Supply Stores", cat)
if any(x in c for x in ['beauty', 'cosmetic', 'perfume', 'makeup']):
return ("Retail & Shopping", "Beauty & Cosmetics", "Beauty Stores", cat)
if any(x in c for x in ['pharmacy', 'drug', 'medicine', 'health']):
return ("Retail & Shopping", "Health & Pharmacy", "Pharmacies", cat)
if any(x in c for x in ['garden', 'plant', 'flower', 'nursery', 'landscap']):
return ("Retail & Shopping", "Home & Garden", "Garden Centers", cat)
if any(x in c for x in ['hardware', 'tool', 'building', 'lumber', 'paint']):
return ("Retail & Shopping", "Hardware & Building", "Hardware Stores", cat)
if any(x in c for x in ['antique', 'vintage', 'thrift', 'consignment', 'second hand', 'used']):
return ("Retail & Shopping", "Secondhand & Vintage", "Thrift Stores", cat)
return ("Retail & Shopping", "Specialty Retail", "Other Stores", cat)
if any(x in c for x in ['supplier', 'wholesaler', 'distributor', 'exporter', 'importer']):
if any(x in c for x in ['food', 'beverage', 'meat', 'seafood', 'produce']):
return ("Retail & Shopping", "Wholesale & Distribution", "Food Wholesale", cat)
if any(x in c for x in ['building', 'construction', 'lumber', 'concrete', 'steel']):
return ("Retail & Shopping", "Wholesale & Distribution", "Building Materials", cat)
if any(x in c for x in ['industrial', 'machinery', 'equipment']):
return ("Retail & Shopping", "Wholesale & Distribution", "Industrial Supplies", cat)
return ("Retail & Shopping", "Wholesale & Distribution", "General Wholesale", cat)
if 'market' in c and 'marketing' not in c:
if 'flea' in c or 'antique' in c:
return ("Retail & Shopping", "Markets", "Flea Markets", cat)
if 'farmer' in c:
return ("Retail & Shopping", "Markets", "Farmers Markets", cat)
return ("Retail & Shopping", "Markets", "General Markets", cat)
# === AUTOMOTIVE ===
if 'dealer' in c:
car_brands = ['abarth', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'cupra', 'dacia', 'daihatsu',
'dodge', 'ferrari', 'fiat', 'ford', 'genesis', 'gmc', 'honda', 'hummer', 'hyundai',
'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lancia', 'land rover',
'lexus', 'lincoln', 'lotus', 'maserati', 'mazda', 'mclaren', 'mercedes', 'mini',
'mitsubishi', 'nissan', 'opel', 'peugeot', 'porsche', 'ram', 'renault', 'rolls-royce',
'saab', 'seat', 'skoda', 'smart', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen',
'volvo', 'yamaha', 'harley', 'ducati', 'kawasaki', 'triumph', 'vespa', 'piaggio']
if any(b in c for b in car_brands):
if 'motorcycle' in c or any(x in c for x in ['harley', 'ducati', 'kawasaki', 'triumph', 'vespa']):
return ("Automotive", "Dealers", "Motorcycle Brands", cat)
return ("Automotive", "Dealers", "Car Brands", cat)
if any(x in c for x in ['motorcycle', 'scooter', 'moped']):
return ("Automotive", "Dealers", "Motorcycle Dealers", cat)
if any(x in c for x in ['truck', 'commercial vehicle', 'trailer']):
return ("Automotive", "Dealers", "Truck & Commercial", cat)
if any(x in c for x in ['boat', 'yacht', 'marine', 'jet ski']):
return ("Automotive", "Dealers", "Marine & Boats", cat)
if any(x in c for x in ['rv', 'camper', 'motorhome', 'caravan']):
return ("Automotive", "Dealers", "RV & Campers", cat)
if any(x in c for x in ['atv', 'quad', 'off-road', 'utv']):
return ("Automotive", "Dealers", "ATV & Off-Road", cat)
if 'used' in c or 'pre-owned' in c:
return ("Automotive", "Dealers", "Used Vehicles", cat)
return ("Automotive", "Dealers", "Other Dealers", cat)
if any(x in c for x in ['car wash', 'auto detailing', 'car detailing']):
return ("Automotive", "Vehicle Care", "Cleaning & Detailing", cat)
if any(x in c for x in ['car rental', 'auto rental', 'vehicle rental', 'truck rental']):
return ("Automotive", "Rental Services", "Vehicle Rental", cat)
if any(x in c for x in ['car repair', 'auto repair', 'mechanic', 'garage', 'auto body', 'collision']):
return ("Automotive", "Repair & Maintenance", "Auto Repair", cat)
if any(x in c for x in ['tire', 'tyre', 'wheel']):
return ("Automotive", "Parts & Accessories", "Tires & Wheels", cat)
if any(x in c for x in ['auto part', 'car part', 'auto accessories']):
return ("Automotive", "Parts & Accessories", "Auto Parts", cat)
if any(x in c for x in ['driving school', 'driving instruction']):
return ("Automotive", "Training", "Driving Schools", cat)
if any(x in c for x in ['parking', 'car park', 'garage']):
if 'repair' not in c and 'mechanic' not in c:
return ("Automotive", "Parking", "Parking Facilities", cat)
if any(x in c for x in ['gas station', 'petrol', 'fuel', 'charging station', 'ev charging']):
return ("Automotive", "Fuel & Charging", "Fuel Stations", cat)
# === HEALTHCARE ===
if any(x in c for x in ['hospital']):
if 'animal' in c or 'veterinar' in c:
return ("Healthcare", "Veterinary", "Animal Hospitals", cat)
if 'children' in c or 'pediatric' in c:
return ("Healthcare", "Hospitals", "Pediatric Hospitals", cat)
if 'mental' in c or 'psychiatric' in c:
return ("Healthcare", "Mental Health", "Psychiatric Hospitals", cat)
return ("Healthcare", "Hospitals", "General Hospitals", cat)
if any(x in c for x in ['clinic']):
if 'dental' in c:
return ("Healthcare", "Dental", "Dental Clinics", cat)
if 'eye' in c or 'vision' in c or 'optical' in c:
return ("Healthcare", "Vision Care", "Eye Clinics", cat)
if 'fertility' in c or 'ivf' in c:
return ("Healthcare", "Specialty Care", "Fertility Clinics", cat)
if 'skin' in c or 'dermatol' in c:
return ("Healthcare", "Specialty Care", "Dermatology", cat)
if 'physical therapy' in c or 'physiotherapy' in c or 'rehab' in c:
return ("Healthcare", "Rehabilitation", "Physical Therapy", cat)
return ("Healthcare", "Clinics", "Medical Clinics", cat)
if any(x in c for x in ['doctor', 'physician']):
return ("Healthcare", "Medical Practitioners", "Doctors", cat)
if any(x in c for x in ['dentist', 'dental', 'orthodont', 'endodont', 'periodont']):
return ("Healthcare", "Dental", "Dental Services", cat)
if any(x in c for x in ['surgeon', 'surgery']):
if 'plastic' in c or 'cosmetic' in c:
return ("Healthcare", "Specialty Care", "Cosmetic Surgery", cat)
return ("Healthcare", "Medical Practitioners", "Surgeons", cat)
if any(x in c for x in ['psycholog', 'psychiatr', 'mental health', 'counselor', 'therapist']):
if 'marriage' in c or 'family' in c:
return ("Healthcare", "Mental Health", "Family Counseling", cat)
if 'addiction' in c or 'substance' in c:
return ("Healthcare", "Mental Health", "Addiction Treatment", cat)
return ("Healthcare", "Mental Health", "Mental Health Services", cat)
if any(x in c for x in ['chiropract']):
return ("Healthcare", "Alternative Medicine", "Chiropractic", cat)
if any(x in c for x in ['acupuncture', 'acupuncturist']):
return ("Healthcare", "Alternative Medicine", "Acupuncture", cat)
if any(x in c for x in ['naturopath', 'homeopath', 'ayurved', 'holistic']):
return ("Healthcare", "Alternative Medicine", "Natural Medicine", cat)
if any(x in c for x in ['optometrist', 'optician', 'eye doctor', 'ophthalmol']):
return ("Healthcare", "Vision Care", "Eye Care", cat)
if any(x in c for x in ['pharmacy', 'drugstore', 'apothecary']):
return ("Healthcare", "Pharmacies", "Retail Pharmacies", cat)
if any(x in c for x in ['veterinar', 'vet ', 'animal clinic', 'pet clinic']):
return ("Healthcare", "Veterinary", "Veterinary Services", cat)
if any(x in c for x in ['nursing home', 'assisted living', 'senior care', 'elder care', 'retirement home']):
return ("Healthcare", "Senior Care", "Senior Living", cat)
if any(x in c for x in ['lab', 'laboratory', 'diagnostic', 'imaging', 'x-ray', 'mri', 'radiology']):
return ("Healthcare", "Diagnostics", "Medical Labs", cat)
if any(x in c for x in ['ambulance', 'emergency', 'urgent care']):
return ("Healthcare", "Emergency Services", "Emergency Care", cat)
# === EDUCATION ===
if 'school' in c or 'academy' in c:
if any(x in c for x in ['preschool', 'kindergarten', 'nursery', 'daycare', 'pre-school']):
return ("Education", "Early Childhood", "Preschools", cat)
if any(x in c for x in ['elementary', 'primary']):
return ("Education", "K-12 Schools", "Elementary Schools", cat)
if any(x in c for x in ['middle', 'junior high']):
return ("Education", "K-12 Schools", "Middle Schools", cat)
if any(x in c for x in ['high school', 'secondary']):
return ("Education", "K-12 Schools", "High Schools", cat)
if any(x in c for x in ['boarding']):
return ("Education", "K-12 Schools", "Boarding Schools", cat)
if any(x in c for x in ['driving']):
return ("Automotive", "Training", "Driving Schools", cat)
if any(x in c for x in ['language', 'english', 'spanish', 'french', 'german', 'chinese', 'japanese']):
return ("Education", "Language Learning", "Language Schools", cat)
if any(x in c for x in ['art', 'music', 'dance', 'drama', 'theater', 'acting']):
return ("Education", "Arts Education", "Arts Schools", cat)
if any(x in c for x in ['martial art', 'karate', 'judo', 'taekwondo', 'kung fu', 'aikido', 'boxing']):
return ("Education", "Sports Training", "Martial Arts Schools", cat)
if any(x in c for x in ['beauty', 'cosmetology', 'barber']):
return ("Education", "Vocational Training", "Beauty Schools", cat)
if any(x in c for x in ['cooking', 'culinary', 'chef']):
return ("Education", "Vocational Training", "Culinary Schools", cat)
if any(x in c for x in ['business', 'mba']):
return ("Education", "Higher Education", "Business Schools", cat)
if any(x in c for x in ['medical', 'nursing', 'dental']):
return ("Education", "Higher Education", "Medical Schools", cat)
if any(x in c for x in ['law']):
return ("Education", "Higher Education", "Law Schools", cat)
if any(x in c for x in ['flight', 'aviation', 'pilot']):
return ("Education", "Vocational Training", "Aviation Schools", cat)
if any(x in c for x in ['computer', 'it ', 'coding', 'programming', 'software']):
return ("Education", "Technology Training", "Computer Schools", cat)
if any(x in c for x in ['trade', 'technical', 'vocational']):
return ("Education", "Vocational Training", "Trade Schools", cat)
return ("Education", "Specialty Schools", "Other Schools", cat)
if any(x in c for x in ['university', 'college']):
if 'community' in c:
return ("Education", "Higher Education", "Community Colleges", cat)
return ("Education", "Higher Education", "Universities", cat)
if any(x in c for x in ['tutor', 'tutoring']):
return ("Education", "Tutoring", "Private Tutoring", cat)
if any(x in c for x in ['training center', 'training program', 'training institute']):
return ("Education", "Professional Training", "Training Centers", cat)
if any(x in c for x in ['library']):
return ("Education", "Libraries", "Public Libraries", cat)
# === PROFESSIONAL SERVICES ===
if any(x in c for x in ['lawyer', 'attorney', 'law firm', 'legal']):
if any(x in c for x in ['immigration']):
return ("Professional Services", "Legal", "Immigration Law", cat)
if any(x in c for x in ['criminal', 'defense']):
return ("Professional Services", "Legal", "Criminal Law", cat)
if any(x in c for x in ['family', 'divorce']):
return ("Professional Services", "Legal", "Family Law", cat)
if any(x in c for x in ['personal injury', 'accident']):
return ("Professional Services", "Legal", "Personal Injury", cat)
if any(x in c for x in ['real estate', 'property']):
return ("Professional Services", "Legal", "Real Estate Law", cat)
if any(x in c for x in ['business', 'corporate', 'commercial']):
return ("Professional Services", "Legal", "Business Law", cat)
return ("Professional Services", "Legal", "General Legal", cat)
if any(x in c for x in ['accountant', 'accounting', 'bookkeep', 'tax']):
return ("Professional Services", "Financial Services", "Accounting", cat)
if any(x in c for x in ['consultant', 'consulting', 'advisor']):
if any(x in c for x in ['business', 'management']):
return ("Professional Services", "Consulting", "Business Consulting", cat)
if any(x in c for x in ['it ', 'technology', 'computer']):
return ("Professional Services", "Consulting", "IT Consulting", cat)
if any(x in c for x in ['marketing', 'advertising']):
return ("Professional Services", "Consulting", "Marketing Consulting", cat)
return ("Professional Services", "Consulting", "General Consulting", cat)
if any(x in c for x in ['notary', 'notarial']):
return ("Professional Services", "Legal", "Notary Services", cat)
if any(x in c for x in ['architect', 'architecture']):
return ("Professional Services", "Design", "Architecture", cat)
if any(x in c for x in ['engineer', 'engineering']):
if 'civil' in c:
return ("Professional Services", "Engineering", "Civil Engineering", cat)
if 'structural' in c:
return ("Professional Services", "Engineering", "Structural Engineering", cat)
if 'mechanical' in c:
return ("Professional Services", "Engineering", "Mechanical Engineering", cat)
if 'electrical' in c:
return ("Professional Services", "Engineering", "Electrical Engineering", cat)
return ("Professional Services", "Engineering", "General Engineering", cat)
if any(x in c for x in ['agency']):
if any(x in c for x in ['advertising', 'marketing', 'creative', 'digital']):
return ("Professional Services", "Marketing & Advertising", "Agencies", cat)
if any(x in c for x in ['real estate', 'property']):
return ("Real Estate", "Agencies", "Real Estate Agencies", cat)
if any(x in c for x in ['insurance']):
return ("Finance & Insurance", "Insurance", "Insurance Agencies", cat)
if any(x in c for x in ['travel', 'tour']):
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
if any(x in c for x in ['employment', 'staffing', 'recruitment', 'temp']):
return ("Professional Services", "HR Services", "Staffing Agencies", cat)
return ("Professional Services", "Agencies", "Other Agencies", cat)
if any(x in c for x in ['photographer', 'photography', 'photo studio']):
return ("Professional Services", "Creative Services", "Photography", cat)
if any(x in c for x in ['graphic design', 'web design', 'design studio']):
return ("Professional Services", "Creative Services", "Design Services", cat)
if any(x in c for x in ['translator', 'translation', 'interpreter']):
return ("Professional Services", "Language Services", "Translation", cat)
if any(x in c for x in ['printing', 'print shop', 'copy']):
return ("Professional Services", "Business Services", "Printing Services", cat)
# === HOME SERVICES ===
if any(x in c for x in ['plumber', 'plumbing']):
return ("Home Services", "Plumbing", "Plumbers", cat)
if any(x in c for x in ['electrician', 'electrical']):
if 'contractor' in c or 'service' in c:
return ("Home Services", "Electrical", "Electricians", cat)
if any(x in c for x in ['hvac', 'air conditioning', 'heating', 'furnace']):
return ("Home Services", "HVAC", "Heating & Cooling", cat)
if any(x in c for x in ['roofing', 'roofer']):
return ("Home Services", "Roofing", "Roofing Services", cat)
if any(x in c for x in ['painter', 'painting']):
if 'house' in c or 'residential' in c or 'contractor' in c:
return ("Home Services", "Painting", "House Painters", cat)
if any(x in c for x in ['landscap', 'lawn', 'garden']):
if 'service' in c or 'company' in c or 'contractor' in c:
return ("Home Services", "Landscaping", "Landscaping Services", cat)
if any(x in c for x in ['cleaning service', 'maid', 'housekeep', 'janitorial']):
return ("Home Services", "Cleaning", "Cleaning Services", cat)
if any(x in c for x in ['pest control', 'exterminator']):
return ("Home Services", "Pest Control", "Exterminators", cat)
if any(x in c for x in ['locksmith']):
return ("Home Services", "Security", "Locksmiths", cat)
if any(x in c for x in ['moving company', 'mover', 'relocation']):
return ("Home Services", "Moving", "Moving Services", cat)
if any(x in c for x in ['contractor']):
if 'general' in c:
return ("Home Services", "Construction", "General Contractors", cat)
return ("Home Services", "Construction", "Contractors", cat)
if any(x in c for x in ['carpenter', 'carpentry']):
return ("Home Services", "Construction", "Carpenters", cat)
if any(x in c for x in ['flooring', 'floor']):
if 'service' in c or 'contractor' in c or 'installation' in c:
return ("Home Services", "Flooring", "Floor Installation", cat)
if any(x in c for x in ['window', 'glass']):
if 'repair' in c or 'installation' in c or 'service' in c:
return ("Home Services", "Windows & Doors", "Window Services", cat)
if any(x in c for x in ['pool', 'spa']):
if 'service' in c or 'cleaning' in c or 'maintenance' in c:
return ("Home Services", "Pool & Spa", "Pool Services", cat)
if any(x in c for x in ['appliance repair', 'appliance service']):
return ("Home Services", "Appliance Repair", "Appliance Services", cat)
if any(x in c for x in ['handyman']):
return ("Home Services", "General Repair", "Handyman Services", cat)
if any(x in c for x in ['interior design', 'decorator']):
return ("Home Services", "Design", "Interior Design", cat)
# === PERSONAL SERVICES ===
if any(x in c for x in ['salon', 'hair', 'hairdress', 'stylist']):
return ("Personal Services", "Hair Care", "Hair Salons", cat)
if any(x in c for x in ['barber']):
if 'shop' in c or not 'school' in c:
return ("Personal Services", "Hair Care", "Barber Shops", cat)
if any(x in c for x in ['nail', 'manicure', 'pedicure']):
return ("Personal Services", "Nail Care", "Nail Salons", cat)
if any(x in c for x in ['spa']):
if 'day spa' in c or 'medical spa' in c or ('service' not in c and 'pool' not in c):
return ("Personal Services", "Spa & Wellness", "Day Spas", cat)
if any(x in c for x in ['massage']):
return ("Personal Services", "Massage", "Massage Therapy", cat)
if any(x in c for x in ['beauty']):
if 'salon' in c or 'parlor' in c:
return ("Personal Services", "Beauty", "Beauty Salons", cat)
if any(x in c for x in ['tattoo']):
return ("Personal Services", "Body Art", "Tattoo Shops", cat)
if any(x in c for x in ['piercing']):
return ("Personal Services", "Body Art", "Piercing Studios", cat)
if any(x in c for x in ['tanning']):
return ("Personal Services", "Tanning", "Tanning Salons", cat)
if any(x in c for x in ['tailor', 'alteration', 'seamstress']):
return ("Personal Services", "Clothing Care", "Tailoring", cat)
if any(x in c for x in ['dry clean', 'laundry', 'laundromat']):
return ("Personal Services", "Laundry", "Laundry Services", cat)
if any(x in c for x in ['personal trainer', 'fitness trainer']):
return ("Personal Services", "Fitness", "Personal Training", cat)
# === ENTERTAINMENT & RECREATION ===
if any(x in c for x in ['movie theater', 'cinema', 'multiplex']):
return ("Entertainment", "Movies", "Movie Theaters", cat)
if any(x in c for x in ['theater', 'theatre']):
if 'movie' not in c:
return ("Entertainment", "Performing Arts", "Theaters", cat)
if any(x in c for x in ['museum']):
if 'art' in c:
return ("Entertainment", "Museums", "Art Museums", cat)
if 'history' in c or 'historical' in c:
return ("Entertainment", "Museums", "History Museums", cat)
if 'science' in c or 'natural' in c:
return ("Entertainment", "Museums", "Science Museums", cat)
if 'children' in c or 'kid' in c:
return ("Entertainment", "Museums", "Children's Museums", cat)
return ("Entertainment", "Museums", "General Museums", cat)
if any(x in c for x in ['art gallery', 'gallery']):
return ("Entertainment", "Arts", "Art Galleries", cat)
if any(x in c for x in ['amusement park', 'theme park', 'water park']):
return ("Entertainment", "Amusement", "Theme Parks", cat)
if any(x in c for x in ['zoo', 'aquarium', 'wildlife']):
return ("Entertainment", "Wildlife", "Zoos & Aquariums", cat)
if any(x in c for x in ['bowling']):
return ("Entertainment", "Games & Recreation", "Bowling", cat)
if any(x in c for x in ['arcade', 'video game']):
return ("Entertainment", "Games & Recreation", "Arcades", cat)
if any(x in c for x in ['escape room']):
return ("Entertainment", "Games & Recreation", "Escape Rooms", cat)
if any(x in c for x in ['casino', 'gambling']):
return ("Entertainment", "Gambling", "Casinos", cat)
if any(x in c for x in ['concert', 'music venue', 'live music']):
return ("Entertainment", "Music Venues", "Concert Halls", cat)
if any(x in c for x in ['gym', 'fitness center', 'health club']):
return ("Entertainment", "Fitness", "Gyms", cat)
if any(x in c for x in ['yoga']):
if 'studio' in c or 'center' in c:
return ("Entertainment", "Fitness", "Yoga Studios", cat)
if any(x in c for x in ['pilates']):
return ("Entertainment", "Fitness", "Pilates Studios", cat)
if any(x in c for x in ['swimming pool', 'swim']):
return ("Entertainment", "Sports", "Swimming Pools", cat)
if any(x in c for x in ['golf']):
if 'course' in c or 'club' in c:
return ("Entertainment", "Sports", "Golf Courses", cat)
if any(x in c for x in ['tennis']):
return ("Entertainment", "Sports", "Tennis Courts", cat)
if any(x in c for x in ['stadium', 'arena', 'sports complex']):
return ("Entertainment", "Venues", "Sports Venues", cat)
if any(x in c for x in ['park']):
if 'amusement' not in c and 'theme' not in c:
if 'national' in c or 'state' in c:
return ("Entertainment", "Parks", "National Parks", cat)
if 'dog' in c:
return ("Entertainment", "Parks", "Dog Parks", cat)
return ("Entertainment", "Parks", "Public Parks", cat)
if any(x in c for x in ['recreation center', 'community center']):
return ("Entertainment", "Recreation", "Community Centers", cat)
if any(x in c for x in ['club']):
if 'night' in c:
return ("Food & Dining", "Bars & Nightlife", "Night Clubs", cat)
if 'country' in c:
return ("Entertainment", "Sports", "Country Clubs", cat)
if 'sport' in c or 'athletic' in c:
return ("Entertainment", "Sports", "Sports Clubs", cat)
if 'social' in c:
return ("Entertainment", "Social", "Social Clubs", cat)
# === HOSPITALITY & TRAVEL ===
if any(x in c for x in ['hotel', 'motel', 'inn']):
if 'boutique' in c:
return ("Hospitality & Travel", "Lodging", "Boutique Hotels", cat)
if 'resort' in c:
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
if 'budget' in c or 'economy' in c:
return ("Hospitality & Travel", "Lodging", "Budget Hotels", cat)
return ("Hospitality & Travel", "Lodging", "Hotels", cat)
if any(x in c for x in ['hostel']):
return ("Hospitality & Travel", "Lodging", "Hostels", cat)
if any(x in c for x in ['bed and breakfast', 'b&b', 'bnb']):
return ("Hospitality & Travel", "Lodging", "B&Bs", cat)
if any(x in c for x in ['resort']):
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
if any(x in c for x in ['vacation rental', 'holiday rental']):
return ("Hospitality & Travel", "Lodging", "Vacation Rentals", cat)
if any(x in c for x in ['campground', 'camping', 'rv park']):
return ("Hospitality & Travel", "Lodging", "Campgrounds", cat)
if any(x in c for x in ['travel agency', 'tour operator', 'travel agent']):
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
if any(x in c for x in ['airline', 'airport']):
return ("Hospitality & Travel", "Transportation", "Airlines & Airports", cat)
if any(x in c for x in ['cruise']):
return ("Hospitality & Travel", "Travel Services", "Cruises", cat)
if any(x in c for x in ['tourist', 'attraction', 'sightseeing']):
return ("Hospitality & Travel", "Attractions", "Tourist Attractions", cat)
# === FINANCE & INSURANCE ===
if any(x in c for x in ['bank', 'banking', 'credit union']):
return ("Finance & Insurance", "Banking", "Banks", cat)
if any(x in c for x in ['atm', 'cash machine']):
return ("Finance & Insurance", "Banking", "ATMs", cat)
if any(x in c for x in ['insurance']):
if 'health' in c or 'medical' in c:
return ("Finance & Insurance", "Insurance", "Health Insurance", cat)
if 'auto' in c or 'car' in c:
return ("Finance & Insurance", "Insurance", "Auto Insurance", cat)
if 'home' in c or 'property' in c:
return ("Finance & Insurance", "Insurance", "Home Insurance", cat)
if 'life' in c:
return ("Finance & Insurance", "Insurance", "Life Insurance", cat)
return ("Finance & Insurance", "Insurance", "Insurance Services", cat)
if any(x in c for x in ['loan', 'mortgage', 'lending']):
return ("Finance & Insurance", "Lending", "Loans", cat)
if any(x in c for x in ['investment', 'financial advisor', 'wealth management', 'financial planner']):
return ("Finance & Insurance", "Investment", "Financial Services", cat)
if any(x in c for x in ['currency exchange', 'money transfer', 'wire transfer']):
return ("Finance & Insurance", "Money Services", "Currency Services", cat)
if any(x in c for x in ['pawn']):
return ("Finance & Insurance", "Money Services", "Pawn Shops", cat)
# === REAL ESTATE ===
if any(x in c for x in ['real estate', 'property', 'realty', 'realtor']):
if 'agent' in c or 'agency' in c or 'broker' in c:
return ("Real Estate", "Agencies", "Real Estate Agents", cat)
if 'developer' in c or 'development' in c:
return ("Real Estate", "Development", "Developers", cat)
if 'management' in c:
return ("Real Estate", "Management", "Property Management", cat)
if 'commercial' in c:
return ("Real Estate", "Commercial", "Commercial Real Estate", cat)
return ("Real Estate", "Services", "Real Estate Services", cat)
if any(x in c for x in ['apartment', 'condo', 'rental']):
if 'complex' in c or 'building' in c:
return ("Real Estate", "Residential", "Apartment Complexes", cat)
if any(x in c for x in ['storage', 'self storage', 'warehouse']):
if 'self' in c or 'mini' in c:
return ("Real Estate", "Storage", "Self Storage", cat)
# === RELIGIOUS ===
if any(x in c for x in ['church']):
if 'catholic' in c:
return ("Religious", "Christian", "Catholic Churches", cat)
if 'baptist' in c:
return ("Religious", "Christian", "Baptist Churches", cat)
if 'methodist' in c:
return ("Religious", "Christian", "Methodist Churches", cat)
if 'lutheran' in c:
return ("Religious", "Christian", "Lutheran Churches", cat)
if 'orthodox' in c:
return ("Religious", "Christian", "Orthodox Churches", cat)
if 'pentecostal' in c:
return ("Religious", "Christian", "Pentecostal Churches", cat)
return ("Religious", "Christian", "Churches", cat)
if any(x in c for x in ['mosque', 'islamic', 'muslim']):
return ("Religious", "Islam", "Mosques", cat)
if any(x in c for x in ['synagogue', 'jewish', 'temple']):
if 'jewish' in c or 'synagogue' in c:
return ("Religious", "Judaism", "Synagogues", cat)
if 'hindu' in c:
return ("Religious", "Hinduism", "Hindu Temples", cat)
if 'buddhist' in c:
return ("Religious", "Buddhism", "Buddhist Temples", cat)
return ("Religious", "Other", "Temples", cat)
if any(x in c for x in ['abbey', 'monastery', 'convent']):
return ("Religious", "Christian", "Monasteries", cat)
if any(x in c for x in ['gurdwara', 'sikh']):
return ("Religious", "Sikhism", "Gurdwaras", cat)
# === GOVERNMENT & PUBLIC SERVICES ===
if any(x in c for x in ['government', 'city hall', 'town hall', 'municipal']):
return ("Government", "Local Government", "Government Offices", cat)
if any(x in c for x in ['court', 'courthouse']):
return ("Government", "Legal", "Courts", cat)
if any(x in c for x in ['police', 'sheriff']):
return ("Government", "Public Safety", "Police", cat)
if any(x in c for x in ['fire station', 'fire department']):
return ("Government", "Public Safety", "Fire Departments", cat)
if any(x in c for x in ['post office', 'postal']):
return ("Government", "Postal", "Post Offices", cat)
if any(x in c for x in ['embassy', 'consulate']):
return ("Government", "International", "Embassies", cat)
if any(x in c for x in ['dmv', 'motor vehicle', 'driver license']):
return ("Government", "Transportation", "DMV", cat)
if any(x in c for x in ['social security', 'welfare', 'social services']):
return ("Government", "Social Services", "Social Services", cat)
# === INDUSTRIAL & MANUFACTURING ===
if any(x in c for x in ['manufacturer', 'manufacturing', 'factory', 'plant']):
if any(x in c for x in ['food', 'beverage', 'bakery']):
return ("Industrial", "Manufacturing", "Food Manufacturing", cat)
if any(x in c for x in ['textile', 'clothing', 'garment']):
return ("Industrial", "Manufacturing", "Textile Manufacturing", cat)
if any(x in c for x in ['electronics', 'computer', 'semiconductor']):
return ("Industrial", "Manufacturing", "Electronics Manufacturing", cat)
if any(x in c for x in ['auto', 'car', 'vehicle']):
return ("Industrial", "Manufacturing", "Auto Manufacturing", cat)
if any(x in c for x in ['chemical', 'pharmaceutical']):
return ("Industrial", "Manufacturing", "Chemical Manufacturing", cat)
if any(x in c for x in ['metal', 'steel', 'iron']):
return ("Industrial", "Manufacturing", "Metal Manufacturing", cat)
if any(x in c for x in ['plastic', 'rubber']):
return ("Industrial", "Manufacturing", "Plastics Manufacturing", cat)
if any(x in c for x in ['furniture', 'wood']):
return ("Industrial", "Manufacturing", "Furniture Manufacturing", cat)
return ("Industrial", "Manufacturing", "General Manufacturing", cat)
if any(x in c for x in ['mining', 'quarry']):
return ("Industrial", "Mining", "Mining Operations", cat)
if any(x in c for x in ['construction company', 'builder']):
return ("Industrial", "Construction", "Construction Companies", cat)
# === TECHNOLOGY ===
if any(x in c for x in ['software', 'app developer', 'web developer']):
return ("Technology", "Software", "Software Development", cat)
if any(x in c for x in ['it service', 'computer service', 'tech support']):
return ("Technology", "IT Services", "IT Support", cat)
if any(x in c for x in ['data center', 'hosting', 'cloud']):
return ("Technology", "Infrastructure", "Data Services", cat)
if any(x in c for x in ['telecommunication', 'telecom', 'internet service']):
return ("Technology", "Telecommunications", "Telecom Services", cat)
# === TRANSPORTATION & LOGISTICS ===
if any(x in c for x in ['shipping', 'freight', 'cargo', 'logistics']):
return ("Transportation", "Logistics", "Shipping & Freight", cat)
if any(x in c for x in ['courier', 'delivery', 'express']):
return ("Transportation", "Delivery", "Courier Services", cat)
if any(x in c for x in ['taxi', 'cab', 'ride', 'limo', 'chauffeur']):
return ("Transportation", "Passenger", "Taxi & Ride Services", cat)
if any(x in c for x in ['bus', 'coach', 'shuttle']):
if 'station' in c or 'terminal' in c or 'stop' in c:
return ("Transportation", "Public Transit", "Bus Stations", cat)
return ("Transportation", "Passenger", "Bus Services", cat)
if any(x in c for x in ['train', 'rail', 'subway', 'metro']):
if 'station' in c or 'terminal' in c:
return ("Transportation", "Public Transit", "Train Stations", cat)
return ("Transportation", "Public Transit", "Rail Services", cat)
if any(x in c for x in ['towing', 'tow truck']):
return ("Transportation", "Vehicle Services", "Towing", cat)
# === AGRICULTURE ===
if any(x in c for x in ['farm', 'ranch', 'orchard', 'vineyard']):
return ("Agriculture", "Farming", "Farms", cat)
if any(x in c for x in ['agricultural', 'agri']):
return ("Agriculture", "Services", "Agricultural Services", cat)
# === PETS & ANIMALS ===
if any(x in c for x in ['pet', 'dog', 'cat']):
if 'grooming' in c or 'groomer' in c:
return ("Pets & Animals", "Pet Services", "Pet Grooming", cat)
if 'boarding' in c or 'kennel' in c or 'sitting' in c or 'daycare' in c:
return ("Pets & Animals", "Pet Services", "Pet Boarding", cat)
if 'training' in c or 'trainer' in c:
return ("Pets & Animals", "Pet Services", "Pet Training", cat)
if 'adoption' in c or 'shelter' in c or 'rescue' in c:
return ("Pets & Animals", "Animal Welfare", "Shelters", cat)
if 'store' in c or 'shop' in c:
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
# === EVENTS & WEDDINGS ===
if any(x in c for x in ['wedding', 'bridal']):
if 'venue' in c or 'hall' in c:
return ("Events & Weddings", "Venues", "Wedding Venues", cat)
if 'planner' in c:
return ("Events & Weddings", "Planning", "Wedding Planners", cat)
if 'dress' in c or 'gown' in c:
return ("Events & Weddings", "Attire", "Bridal Shops", cat)
return ("Events & Weddings", "Services", "Wedding Services", cat)
if any(x in c for x in ['event', 'party', 'banquet']):
if 'venue' in c or 'hall' in c or 'center' in c:
return ("Events & Weddings", "Venues", "Event Venues", cat)
if 'planner' in c or 'planning' in c:
return ("Events & Weddings", "Planning", "Event Planners", cat)
if 'rental' in c or 'supply' in c:
return ("Events & Weddings", "Rentals", "Event Rentals", cat)
return ("Events & Weddings", "Services", "Event Services", cat)
if any(x in c for x in ['florist', 'flower']):
if 'shop' in c or 'store' not in c:
return ("Events & Weddings", "Florists", "Flower Shops", cat)
if any(x in c for x in ['funeral', 'mortuary', 'cremation', 'cemetery']):
return ("Events & Weddings", "Memorial", "Funeral Services", cat)
# === NON-PROFIT & COMMUNITY ===
if any(x in c for x in ['non-profit', 'nonprofit', 'charity', 'foundation']):
return ("Non-Profit", "Charities", "Non-Profit Organizations", cat)
if any(x in c for x in ['community', 'civic', 'volunteer']):
if 'center' in c:
return ("Non-Profit", "Community", "Community Centers", cat)
return ("Non-Profit", "Community", "Community Organizations", cat)
if any(x in c for x in ['association', 'organization', 'society']):
if 'professional' in c or 'trade' in c or 'business' in c:
return ("Non-Profit", "Professional", "Professional Associations", cat)
return ("Non-Profit", "General", "Organizations", cat)
# Default fallback
return ("Other", "Uncategorized", "General", cat)
def _read_category_names(csv_path):
    """Return the non-empty, stripped first-column values of the CSV, skipping the header row."""
    # newline='' is what the csv module expects so quoted fields containing
    # line breaks are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; an empty file simply yields no categories
        return [row[0].strip() for row in reader if row and row[0].strip()]


def _build_category_tree(categories):
    """Map every node path to ``(name, level, parent_path)`` for all four levels of each category."""
    tree = {}  # path -> (name, level, parent_path)
    for cat in categories:
        l1, l2, l3, l4 = categorize_category(cat)
        names = (l1, l2, l3, l4)
        slugs = [slugify(name) for name in names]

        parent_path = None
        for level, name in enumerate(names, start=1):
            # The path of a node is the dot-joined slugs of itself and its ancestors.
            path = '.'.join(slugs[:level])
            # First writer wins: a later category that produces the same path
            # does not overwrite the node that is already registered.
            tree.setdefault(path, (name, level, parent_path))
            parent_path = path
    return tree


def main():
    """Import GBP categories from a CSV file into the ``gbp_categories`` table.

    Steps:
      1. Read category names from the first column of ``--csv-path``.
      2. Classify each name with ``categorize_category`` and build a 4-level
         tree keyed by dot-separated slug paths.
      3. Print per-level statistics; with ``--dry-run`` print a preview and stop.
      4. Otherwise run the init SQL (if present next to this script), truncate
         the table, insert all nodes parents-first, and refresh
         ``category_count``.

    The truncate, inserts and count update are committed together at the end,
    so a failure part-way leaves the previous table contents in place.
    """
    parser = argparse.ArgumentParser(description='Import GBP categories into PostgreSQL with ltree')
    parser.add_argument('--csv-path', default=DEFAULT_CSV_PATH, help='Path to categories CSV')
    parser.add_argument('--db-url', default=DEFAULT_DB_URL, help='PostgreSQL connection URL')
    parser.add_argument('--dry-run', action='store_true', help='Print categories without importing')
    args = parser.parse_args()

    # Read categories
    print(f"Reading categories from: {args.csv_path}")
    categories = _read_category_names(args.csv_path)
    print(f"Found {len(categories)} categories")

    # Build tree structure
    tree = _build_category_tree(categories)

    # Print statistics
    level_counts = {1: 0, 2: 0, 3: 0, 4: 0}
    for _name, level, _parent in tree.values():
        level_counts[level] += 1

    print("\nTree structure:")
    print(f" Level 1 (Sectors): {level_counts[1]}")
    print(f" Level 2 (Business Types): {level_counts[2]}")
    print(f" Level 3 (Sub-categories): {level_counts[3]}")
    print(f" Level 4 (Categories): {level_counts[4]}")
    print(f" Total nodes: {len(tree)}")

    if args.dry_run:
        preview_limit = 20
        print("\n[DRY RUN] Would insert these nodes:")
        for path in sorted(tree)[:preview_limit]:
            name, level, _parent = tree[path]
            print(f" {' ' * (level-1)}{name} ({path})")
        # Only mention a remainder when there is one; a small tree would
        # otherwise report a negative count.
        remaining = len(tree) - preview_limit
        if remaining > 0:
            print(f" ... and {remaining} more")
        return

    # Check for psycopg2
    if not HAS_PSYCOPG2:
        print("\nERROR: psycopg2 is required for database import.")
        print("Install it with: pip install psycopg2-binary")
        return

    # Connect to database
    print(f"\nConnecting to database...")
    conn = psycopg2.connect(args.db_url)
    try:
        cur = conn.cursor()

        # Run init SQL first
        init_sql_path = os.path.join(os.path.dirname(__file__), 'init', '01_create_categories.sql')
        if os.path.exists(init_sql_path):
            print(f"Running init SQL: {init_sql_path}")
            with open(init_sql_path, 'r', encoding='utf-8') as f:
                cur.execute(f.read())
            conn.commit()

        # Clear existing data (not committed until the inserts succeed)
        print("Clearing existing categories...")
        cur.execute("TRUNCATE TABLE gbp_categories RESTART IDENTITY CASCADE")

        # Insert nodes in order (parents first)
        print("Inserting categories...")
        path_to_id = {}
        # Sort by level to ensure parents are inserted first
        sorted_items = sorted(tree.items(), key=lambda item: item[1][1])
        for path, (name, level, parent_path) in sorted_items:
            parent_id = path_to_id.get(parent_path) if parent_path else None
            slug = path.split('.')[-1]
            cur.execute("""
                INSERT INTO gbp_categories (name, slug, path, level, parent_id)
                VALUES (%s, %s, %s, %s, %s)
                RETURNING id
            """, (name, slug, path, level, parent_id))
            path_to_id[path] = cur.fetchone()[0]

        # Update category counts: number of strict descendants of each node
        print("Updating category counts...")
        cur.execute("""
            UPDATE gbp_categories p
            SET category_count = (
                SELECT COUNT(*) FROM gbp_categories c
                WHERE c.path <@ p.path AND c.path != p.path
            )
        """)
        conn.commit()

        # Verify
        cur.execute("SELECT COUNT(*) FROM gbp_categories")
        count = cur.fetchone()[0]
        print(f"\nSuccess! Inserted {count} nodes into gbp_categories table")

        # Show tree stats
        cur.execute("SELECT * FROM category_tree_stats")
        print("\nTree statistics:")
        for row in cur.fetchall():
            print(f" Level {row[0]}: {row[1]} nodes")

        cur.close()
    finally:
        # Always release the connection; closing discards any uncommitted
        # work, so a failed import does not leave the table truncated.
        conn.close()

    print("\nDone!")


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,120 @@
-- Enable ltree extension for hierarchical data
CREATE EXTENSION IF NOT EXISTS ltree;
-- Categories tree table: one row per node of the category hierarchy.
CREATE TABLE IF NOT EXISTS gbp_categories (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL,                           -- human-readable node name
    slug TEXT NOT NULL,                           -- presumably the last label of path; confirm against the importer
    path ltree NOT NULL,                          -- dot-separated path from the root to this node
    level INT NOT NULL DEFAULT 1,                 -- depth of the node in the tree (1 = top level)
    parent_id INT REFERENCES gbp_categories(id),  -- self-reference to the parent row
    category_count INT DEFAULT 0,                 -- NOTE(review): looks like a descendant count filled in by the importer; confirm
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(path)                                  -- each path identifies exactly one node
);
-- Indexes for fast hierarchical queries
-- GiST index on path: supports the ltree containment operators (e.g. <@).
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path ON gbp_categories USING GIST (path);
-- NOTE(review): the UNIQUE(path) constraint already creates a btree index on path,
-- so this one is probably redundant -- confirm before dropping.
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path_btree ON gbp_categories USING BTREE (path);
-- Plain lookup indexes on the remaining filter columns.
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name ON gbp_categories (name);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_slug ON gbp_categories (slug);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_level ON gbp_categories (level);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_parent ON gbp_categories (parent_id);
-- Enable trigram extension for fuzzy search.
-- This must run BEFORE the index below: the gin_trgm_ops operator class is
-- provided by pg_trgm, and CREATE INDEX fails on a fresh database if the
-- extension has not been installed yet.
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- Trigram index on name: accelerates ILIKE '%...%' and trigram similarity matching.
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name_trgm ON gbp_categories USING GIN (name gin_trgm_ops);
-- Trigger function: stamp the row being written with the current
-- transaction timestamp in its updated_at column, then let the write proceed.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $body$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$body$;
-- Trigger for auto-updating timestamp
-- Dropped first so that re-running this script recreates the trigger instead of failing.
DROP TRIGGER IF EXISTS update_gbp_categories_updated_at ON gbp_categories;
-- Fires once per updated row, before the row is written, so the function can modify NEW.
CREATE TRIGGER update_gbp_categories_updated_at
    BEFORE UPDATE ON gbp_categories
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
-- Helper function: return every strict descendant of parent_path, ordered by path.
-- Note that this covers all depths below parent_path, not only direct children;
-- the row for parent_path itself is excluded.
CREATE OR REPLACE FUNCTION get_category_children(parent_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
)
LANGUAGE plpgsql
AS $fn$
BEGIN
    RETURN QUERY
        SELECT node.id, node.name, node.slug, node.path, node.level
        FROM gbp_categories AS node
        WHERE node.path <> parent_path     -- drop the starting node itself
          AND node.path <@ parent_path     -- keep nodes at or below parent_path
        ORDER BY node.path;
END;
$fn$;
-- Helper function: return every strict ancestor of category_path,
-- ordered from the top of the tree downwards (ascending level).
-- The row for category_path itself is excluded.
CREATE OR REPLACE FUNCTION get_category_ancestors(category_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
)
LANGUAGE plpgsql
AS $fn$
BEGIN
    RETURN QUERY
        SELECT node.id, node.name, node.slug, node.path, node.level
        FROM gbp_categories AS node
        WHERE node.path <> category_path   -- drop the starting node itself
          AND node.path @> category_path   -- keep nodes at or above category_path
        ORDER BY node.level;
END;
$fn$;
-- Helper function: fuzzy search of categories by name.
-- A row matches when its name contains search_term (case-insensitive) or when
-- its trigram similarity to search_term exceeds 0.3. Results are ranked by
-- similarity (best first), then by level and name, and capped at limit_count.
CREATE OR REPLACE FUNCTION search_categories(search_term TEXT, limit_count INT DEFAULT 20)
RETURNS TABLE (
    id INT,
    name TEXT,
    path ltree,
    level INT,
    similarity REAL
)
LANGUAGE plpgsql
AS $fn$
BEGIN
    RETURN QUERY
        SELECT hit.id,
               hit.name,
               hit.path,
               hit.level,
               similarity(hit.name, search_term) AS score
        FROM gbp_categories AS hit
        WHERE similarity(hit.name, search_term) > 0.3
           OR hit.name ILIKE '%' || search_term || '%'
        ORDER BY score DESC, hit.level, hit.name
        LIMIT limit_count;
END;
$fn$;
-- View for tree statistics
-- One row per level with the number of nodes at that level in "count".
-- Because rows are grouped by level, each of the four FILTER columns is
-- non-zero only on the row of its own level (where it equals "count").
CREATE OR REPLACE VIEW category_tree_stats AS
SELECT
    level,
    COUNT(*) as count,
    COUNT(*) FILTER (WHERE level = 1) as sectors,
    COUNT(*) FILTER (WHERE level = 2) as business_types,
    COUNT(*) FILTER (WHERE level = 3) as sub_categories,
    COUNT(*) FILTER (WHERE level = 4) as leaf_categories
FROM gbp_categories
GROUP BY level
ORDER BY level;
-- Catalog descriptions for the table and its key columns.
COMMENT ON TABLE gbp_categories IS 'Google Business Profile categories organized in a 4-level hierarchy using ltree';
COMMENT ON COLUMN gbp_categories.path IS 'Hierarchical path using ltree (e.g., Food_Dining.Restaurants.By_Cuisine.Afghan_restaurant)';
COMMENT ON COLUMN gbp_categories.level IS '1=Sector, 2=Business Type, 3=Sub-category, 4=Specific Category';

View File

@@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Hierarchical categorization of Other items.
APPROACH:
1. First pass: Assign to Level 1 (Sector) - items that don't match go to sector's "Other" business type
2. Second pass: Within each sector, refine Level 2 (Business Type)
3. Third pass: Within each business type, refine Level 3 (Sub-category)
This creates:
- Sector.Other.Uncategorized for sector-level unknowns
- Sector.BusinessType.Other for business-type-level unknowns
EXISTING SECTORS (21 + Other):
Agriculture, Automotive, Education, Entertainment, Events_Weddings, Finance_Insurance,
Food_Dining, Government, Healthcare, Home_Services, Hospitality_Travel, Industrial,
Non_Profit, Personal_Services, Pets_Animals, Professional_Services, Real_Estate,
Religious, Retail_Shopping, Technology, Transportation, Other
"""
import re
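# ==================== LEVEL 1: SECTOR ASSIGNMENT ====================
# Each entry is a (regex, sector) pair. get_sector_for_item() tries the entries
# top to bottom with re.search() on the lower-cased name, so order matters: the
# first entry that matches anywhere in the name wins.
# These are broad patterns to catch as much as possible at sector level, but
# they are unanchored substrings - e.g. 'rail' (Transportation) also matches
# 'trail', so 'hiking trail' never reaches the Entertainment entry further down.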
SECTOR_PATTERNS = [
# HEALTHCARE - Medical professionals, facilities, services
(r'(doctor|clinic|hospital|medical|health\s|dental|dentist|therapy|therapist|psycho|chiropract|optom|optician|pharmacy|pharmacist|nurse|surgeon|physician|cardiolog|dermatol|pediatr|orthoped|neurolog|oncolog|urolog|allergist|anesthesiol|audiolog|blood\sbank|blood\sdonat|blood\stest|dialysis|fertility|hospice|rehab|physiother|acupunct|naturopath|homeopath|osteopath|midwife|birth\scenter|prenatal|maternity|wellness\s(clinic|center)|diagnostic|x-ray|mri|ultrasound|laboratory|patholog|radiolog|pulmonolog|gastroenter|endocrin|rheumatol|immunolog|geriatr|podiatr|ophthalmolog|otolaryng|hematolog|nephrolog|proctolog|physiatrist|diabetolog|toxicolog|epidemiolog|oncology|assisted\sliving|nursing\shome|senior\scare|aged\scare|elder\scare|ambulance|emergency\sroom|urgent\scare|first\said|denture|diabetes\scenter|eye\scare|hiv\stest|perinatal|physical\sexam|pregnancy\scare|surgical\scenter|mammograph|std\stest|drug\stest|lactation|doula|bonesetting|hearing\said|prosthetic|orthotic|oxygen|ostomy|sleep\sclinic|sleep\slab|fertility|ivf|sperm\sbank|stem\scell|general\spractitioner|gynecolog|obstetrician|hepatolog|intensivist|internist|neurophysiol|orthoptist|prosthodontist|sexolog|venereolog|nutritionist|dietitian|endoscopist|kinesiolog|pedorthist|seitai|foot\scare|internal\smedicine|family\smedic|family\sdoctor|gp\s|medical\sward)', 'Healthcare'),
# EDUCATION - Schools, training, learning
(r'(school|university|college|academy|training\scenter|training\sschool|lesson|instructor|tutor|education|library|kindergarten|preschool|pre-?school|daycare|day\scare|learning\scenter|vocational|apprentice|faculty|campus|institute|seminary|boarding\sschool|private\sschool|public\sschool|elementary|middle\sschool|high\sschool|montessori|waldorf|charter\sschool|language\sschool|driving\sschool|flight\sschool|cooking\sclass|art\sclass|music\sclass|dance\sclass|acting\sclass|drama\sclass|conservatory|music\sacademy|ballet\sacademy|film\sschool|design\sschool|fashion\sschool|culinary|bartending|beauty\sschool|cosmetology|esthetician|barber\sschool|massage\sschool|yoga\steacher|yoga\straining|meditation\sclass|self-?defense\sclass|swimming\slesson|tennis\slesson|golf\slesson|ski\sschool|surf\sschool|scuba|sailing\sschool|studying\scenter|test\sprep|sat\sprep|gre\sprep|cram\sschool|juku|hagwon|coaching\scenter|head\sstart|early\shead|childminder|assistante\smaternelle|au\spair|nanny\sagency|student\sdormitor|student\shousing|student\scareer|career\scounseling|english\slanguage\scamp|language\scamp|summer\scamp|science\scamp|coding\scamp|academic\sdepartment)', 'Education'),
# AUTOMOTIVE - Vehicles, parts, services
(r'(auto\s|car\s|vehicle|motor\s|tire\s|tyre\s|mechanic|garage(?!\sdoor)|parking\s(lot|garage|facility)|driving|truck\s|motorcycle|motorbike|scooter\s|atv\s|automotive|car\swash|car\sdetail|car\sdealer|car\srental|car\slease|car\sinspect|car\sauction|smog\scheck|oil\schange|brake\s|transmission|radiator|exhaust|muffler|auto\sbody|collision|windshield|car\sstorage|towing|roadside)', 'Automotive'),
# TRANSPORTATION - Moving people/goods
(r'(airport|airline|aviation(?!\sschool)|aircraft|airplane|airfield|airstrip|heliport|seaplane|ferry|cruise|port\sauthority|port\soperating|harbor|dock\s|pier\s|marina|shipping|freight|cargo|trucking|logistics|warehouse|courier|messenger|delivery\sservice|taxi|cab\sservice|limo|chauffeur|bus\sstation|bus\sterminal|train\sstation|rail|metro|subway|transit|rickshaw|bicycle\srental|boat\srental|bike\sshare|car\sshare)', 'Transportation'),
# GOVERNMENT - Public administration, military, legal system
(r'(government|military|army\s|navy\s|naval\sbase|air\sforce|marine\s|coast\sguard|national\sguard|police|sheriff|law\senforce|fire\sstation|fire\sdepartment|courthouse|court\s|embassy|consulate|city\shall|municipal|county\s|district\soffice|passport|immigration|citizenship|dmv|tax\soffice|social\ssecurity|border|customs|post\soffice|postal|public\srecord|voter|election|legislature|parliament|congress|senate|mayor|governor|council|permit|license\s(office|bureau)|civil\sdefense|emergency\smanagement|public\ssafety|prison|jail|detention|correctional|probation|parole|aadhaar|agenzia\sentrate|anganwadi|asylum\scenter|city\sclerk|environment\soffice|land\sregistry|patent\soffice|pension\soffice|registration\soffice|registry\soffice|unemployment|employment\scenter|citizen\sinformation|consumer\sadvice|state\sarchive|national\sarchive|public\sarchive|guardia\scivil|highway\spatrol|department\sof|ministry\sof|bureau\sof|board\sof\seducation|public\sworks|sanitation|water\sauthority|housing\sauthority|port\sauthority|transit\sauthority)', 'Government'),
# RELIGIOUS - Places of worship, spiritual
(r'(church|temple|mosque|masjid|synagogue|chapel|cathedral|basilica|parish|religious|spiritual|ashram|monastery|convent|abbey|priory|buddhist|hindu|christian|catholic|protestant|orthodox|baptist|methodist|lutheran|presbyterian|pentecostal|evangelical|muslim|islamic|jewish|judai|sikh|gurdwara|gurudwara|baha.?i|shinto|taoist|quaker|mennonite|amish|latter-?day|jehovah|scientolog|meditation\scenter|retreat\scenter|pilgrimage|shrine|pagoda|wat\s|vihara|mission(?!\scontrol)|musalla|place\sof\sworship|rectory|yeshiva|marae|congregation|spiritist|priest|mohel|botanica)', 'Religious'),
# ENTERTAINMENT - Fun, recreation, sports, arts, culture
(r'(sports\s|sport\s|club(?!\shouse)|field$|court\s|gym\s|gymnasium|fitness|athletic|stadium|arena|pool\s|swimming|track\s|golf\s|tennis|soccer|football|basketball|baseball|hockey|volleyball|badminton|squash|racquetball|bowling|billiard|snooker|boxing|martial\sart|karate|judo|taekwondo|aikido|wrestling|fencing|archery|shooting\srange|gun\sclub|yoga\s|pilates|crossfit|cycling|skating|skateboard|skiing|snowboard|surfing|diving|climbing|bouldering|trampoline|gymnastics|dance\s|ballet|museum|theater|theatre|cinema|movie|art\sgallery|art\scenter|art\sstudio|gallery|music\svenue|concert|entertainment|amusement|theme\spark|water\spark|zoo|aquarium|wildlife|safari|botanical|arboretum|casino|gambling|betting|arcade|game\scenter|escape\sroom|laser\stag|paintball|go-?kart|mini\sgolf|comedy\sclub|jazz\sclub|blues\sclub|karaoke|nightclub|disco|rave|circus|carnival|fair\s|rodeo|bullring|race\strack|racecourse|hippodrome|velodrome|skate\spark|bmx|motocross|off-?road|aquatic\scenter|batting\scage|bungee|hang\sglid|paraglid|skydiv|indoor\ssnow|leisure\scenter|recreation\scenter|cultural\scenter|exhibit|festival|philharmon|opera\shouse|opera\scompany|symphony|orchestra|planetarium|observatory|science\scenter|discovery\scenter|children.*amusement|funfair|bouncy\scastle|inflatab|playground|adventure\spark|treetop|zipline|zip\sline|ropes\scourse|obstacle\scourse|ninja\swarrior|canoeing|kayaking|rafting|fishing\spond|fishing\sarea|bird\swatch|nature\sreserve|nature\scenter|hiking\strail|walking\strail|hiking\sarea|beach\spavil|beach\sresort|waterfront|promenade|pier\s(?!fishing)|boardwalk|scenic\spoint|scenic\sspot|lookout|viewpoint|observation|monument|landmark|castle|palace|fortress|historic\ssite|heritage|ruins|amphitheater|bandstand|gazebo|pavilion|curling\shall|scout\shall|scout\shome|village\shall|community\shall|social\shall|civic\scenter|convention\scenter|exhibition\scenter|artist$|band$|choir|musician|entertainer|magician|pyrotechnician|performing\sarts|stage$|sculpture|statuar
y|painting$|roller\scoaster|haunted\shouse|fairground|ghost\stown|lido|rugby|rugby\sfield|softball\sfield|little\sleague\sfield|water\spolo|cricket\sground|rowing\sarea|weightlifting|off\sroading|prawn\sfishing|raft\strip|mountaineering|summer\stoboggan|pumpkin\spatch|picnic\sground|national\sforest|national\sreserve|national\spark|nature\spreserve|protected\sarea|reenactment|sambodrome|pachinko|mahjong\shouse|children\shall|children.*camp|outdoor\sactivity|outdoor\sbath|onsen|thermal\sbath|day-?use\sonsen|foot\sbath)', 'Entertainment'),
# FOOD & DINING - Restaurants, bars, food production
(r'(restaurant|cafe(?!\steria)|café|coffee\s|espresso|bar\s(?!association)|pub\s|tavern|lounge|brewery|taproom|brewpub|winery|distillery|bakery|patisserie|pastry|dessert|ice\scream|gelato|frozen\syogurt|pizzeria|pizza\s|taco|burrito|sushi|ramen|noodle|dim\ssum|dumpling|steakhouse|steak\shouse|seafood|grill|bbq|barbecue|diner|bistro|brasserie|eatery|canteen|cafeteria|food\scourt|food\struck|food\scart|catering|caterer|buffet|brunch|breakfast|lunch|dinner|takeout|take-?away|delivery\sfood|meal|kitchen(?!\scabinet)|chef\s|cook\s|juice\sbar|smoothie|tea\shouse|traditional\steahouse|bubble\stea|boba|wine\sbar|wine\scellar|cocktail|speakeasy|gastropub|chophouse|crab\shouse|fish\s&\schips|curry|indian\srestaurant|chinese\srestaurant|chinese\stakeaway|italian\srestaurant|mexican\srestaurant|thai\srestaurant|japanese\srestaurant|korean\srestaurant|vietnamese|french\srestaurant|greek\srestaurant|mediterranean|middle\seastern|african\srestaurant|caribbean|latin\samerican|american\srestaurant|fast\sfood|quick\sservice|drive-?thru|dhaba|tiffin|hawker|churreria|creperie|crepe|pastelaria|pasteleria|tapas|izakaya|yakiniku|okonomiyaki|tempura|udon|soba|tonkatsu|kaiseki|robatayaki|teppanyaki|kushiyaki|yakitori|gyudon|poke\sbowl|acai|falafel|shawarma|kebab|gyro|pita|hummus|mezze|tagine|injera|pho|banh\smi|bibimbap|bulgogi|kimchi|hotpot|fondue|raclette|schnitzel|bratwurst|currywurst|pierogi|borscht|blini|pelmeni|empanada|arepa|pupusa|ceviche|asado|churrasco|rodizio|feijoada|moqueca|acaraje|jerk|oxtail|doubles|roti|samosa|biryani|tandoori|masala|tikka|naan|dosa|idli|vada|chaat|thali|satay|laksa|rendang|nasi\sgoreng|pad\sthai|som\stam|tom\syum|green\scurry|massaman|poutine|smoked\smeat|lobster\sroll|clam\schowder|po.?boy|gumbo|jambalaya|soul\sfood|southern\sfood|cajun|creole|carvery|dairy$|frituur|fruit\sparlor|meyhane|sugar\shack|yakatabune|olive\soil\scooperative|soy\ssauce)', 'Food_Dining'),
# HOME SERVICES - Home improvement, maintenance, repair
(r'(plumb|electrician|electrical\scontract|hvac|heating|air\scondition|cooling|roof|landscap|lawn\s|garden\sservice|gardener|arborist|tree\sservice|clean\s(service|company)|cleaning\sservice|cleaners$|pest\scontrol|exterminator|paint\scontract|painter(?!\sartist)|paint\sstrip|carpent|cabinet\smaker|flooring|tile\sinstall|hardwood|carpet\sinstall|repair\sservice|contractor|remodel|renovation|handyman|locksmith\sservice|moving\scompany|mover\s|moving\sand\sstorage|piano\smoving|appliance\srepair|garage\sdoor|gutter|chimney|window\sinstall|door\sinstall|double\sglazing|glass\srepair|fence\s|deck\sbuild|patio|drywall|insulation|siding|masonry|brick|concrete|paving|asphalt|pool\sservice|pool\scleaning|spa\sservice|septic|sewer|drain|water\sheater|well\sdrill|solar\sinstall|solar\spanel\smaintenance|security\ssystem|alarm\sinstall|home\sinspect|building\sinspect|surveyor|interior\sdesign|home\sstaging|pressure\swash|graffiti\sremoval|debris\sremoval|junk\sremoval|house\sclearance|snow\sremoval|antenna\sservice|satellite\sinstall|gasfitter|gas\sinstall|height\sworks|impermeabilization|wallpaper\sinstall|airbrushing|home\shelp|stall\sinstall)', 'Home_Services'),
# RETAIL & SHOPPING - Stores, shops, markets
(r'(store\s|shop\s(?!service)|retail|boutique|market(?!ing)|mall\s|outlet|dealer(?!ship)|supplier|wholesale|distributor|supermarket|grocery|convenience|department\sstore|discount|thrift|consignment|pawn|antique|vintage|secondhand|used\s|book\sstore|stationery|office\ssupply|toy\sstore|game\sstore|hobby|craft\sstore|art\ssupply|music\sstore|record\sstore|electronics|computer\sstore|phone\sstore|appliance\sstore|furniture\sstore|home\sdecor|bedding|mattress|kitchenware|hardware|tool\sstore|building\ssupply|lumber|garden\scenter|plant\snursery|florist|flower\sshop|pet\sstore|pet\ssupply|clothing|fashion|apparel|shoe\sstore|jewelry|watch\sstore|cosmetic|beauty\ssupply|pharmacy|drugstore|health\sstore|vitamin|supplement|sporting\sgoods|outdoor\sstore|bicycle\sshop|gun\sshop|hunting|fishing\sstore|camping|liquor|wine\sshop|beer\sstore|tobacco|cigar|vape|smoke\sshop|candy|chocolate|confection|bakery\sshop|cheese\sshop|spice|tea\sshop|coffee\sshop(?!\scafe)|newsstand|kiosk|vending|bazar|bazaar|hawker\scenter|flea\smarket|farmers\smarket|night\smarket|food\shall|food\scourt|deli(?!very)|delicatessen|charcuterie|butcher|fishmonger|greengrocer|produce|fruit\sstand|flower\sstand|fabric|textile\sshop|yarn|knitting|sewing\sshop|craft\ssuppl|frame\sshop|framing|trophy|engraving|gift\sshop|souvenir|duty\sfree|airport\sshop|convenience|corner\sstore|general\sstore|variety|dollar\sstore|pound\sshop|euro\sshop|99\scent|surplus|closeout|liquidat|outlet\small|factory\soutlet|warehouse\sstore|membership\sclub|costco|sam.*club)', 'Retail_Shopping'),
# PROFESSIONAL SERVICES - Business services, consulting, legal, creative
(r'(lawyer|attorney|law\sfirm|legal\sservice|accountant|accounting|bookkeep|cpa\s|tax\s(prepar|service|consult)|consultant|consulting|architect(?!ure)|engineer(?!ing\sschool)|survey\scompany|land\ssurvey|topograph|agency(?!\sgovernment)|staffing|recruiting|recruiter|employment\sagency|hr\sservice|marketing|advertis|pr\sfirm|public\srelations|graphic\sdesign|web\sdesign|website\sdesign|photography|photographer|videograph|film\sproduction|animation\sstudio|recording\sstudio|rehearsal\sstudio|production\sstudio|portrait\sstudio|model\sportfolio\sstudio|painting\sstudio|translation|interpret|transcription|notary|commissioner\sfor\soaths|private\sinvestigat|detective|appraiser|appraisal|estate\sappraiser|auditor|financial\saudit|actuary|financial\splanner|wealth\smanag|investment\sadvis|business\sconsult|management\sconsult|it\sconsult|media\scompany|media\shouse|record\scompany|scenograph|model\sdesign|telemarket|direct\smail|copywriter|editor|proofreader|technical\swriter|ghostwriter|literary\sagent|talent\sagent|booking\sagent|casting|modeling\sagent|artist\smanage|court\sreport|patent\sagent|trademark|intellectual\sproperty|customs\sbroker|freight\sforward|import\sexport|export\scompany|geological\sresearch|geological\sservice|environmental\sconsult|safety\sconsult|quality\sconsult|process\sserv|skip\strac|bail\senforce|collection\sagent|factoring|mezzanine\sfinance|conveyancer|executor|genealogist|gemologist|loss\sadjuster|foreclosure|insolvency|judicial\sscrivener|commercial\sagent|executive\ssearch|payroll\sservice|resume\sservice|typing\sservice|fax\sservice|mailing\sservice|shredding\sservice|blueprint|drafting|mapping\sservice|research\sand\sproduct|information\sservice|news\sservice|music\smanagement|yacht\sbroker|finance\sbroker|food\sbroker)', 'Professional_Services'),
# INDUSTRIAL - Manufacturing, construction, mining, utilities, trades
(r'(factory|plant(?!\snursery)|mill$|mill\s|manufactur|industrial|mining|mine\s|quarry|production|foundry|forge|smelter|refinery|chemical\s|pharmaceutical\scompan|textile|garment\sfactory|food\sprocessing|cannery|bottling|assembly|fabricat|machine\sshop|metal\swork|metal\sprocess|metallurg|welding|welder|steel|iron\sworks|aluminum|plastic|rubber|paper\smill|lumber\smill|sawmill|saw\smill|print\sshop|commercial\sprint|digital\sprint|packaging|recycling|waste\smanagement|construction\scompany|general\scontractor|building\scompany|building\sfirm|developer|civil\sengineering|demolition|excavat|crane\sservice|scaffold|heavy\sequipment|blacksmith|coppersmith|goldsmith|silversmith|horseshoe|locksmith(?!\sservice)|tinsmith|gunsmith|bladesmith|knifesmith|boilermaker|machinist|millwright|pipefitter|rigger|sheet\smetal|ironwork|structural\ssteel|precast|concrete\splant|asphalt\splant|gravel|aggregate|sand\s&\sgravel|earth\sworks|anodizing|electroplat|galvaniz|powder\scoat|metal\spolish|metal\sfinish|sandblast|shot\sblast|heat\streat|tempering|hardening|casting|die\scast|injection\smold|blow\smold|extrusion|stamping|forging|cnc|lathe|milling\smachine|grinding|boring|drilling|water\sutility|electric\sutility|gas\scompany|power\sstation|power\splant|nuclear\spower|solar\senergy|wind\sfarm|hydroelectric|substation|transformer|utility\scompany|water\spurification|sewage|wastewater|biotechnolog|shipbuilding|ship\srepair|shipyard|dry\sdock|boatyard|marine\sengine|propeller|cotton\smill|flour\smill|rice\smill|jute\smill|water\smill|weaving\smill|cider\smill|slaughterhouse|tannery|dyeworks|meat\spacker|meat\sprocessor|fruit.*processing|glass\sindustry|sewing\scompany|turnery|toolroom|machine\sconstruct|stone\scutter|stone\scarving|joiner|woodworker|plasterer|glazier|plating\sservice|embossing|lamination|laser\scutting|water\sjet|salvage\syard|junkyard|garbage\sdump|waste\stransfer|coalfield|oilfield)', 'Industrial'),
# HOSPITALITY & TRAVEL - Lodging, tourism
(r'(hotel|motel|inn\s|resort|hostel|lodge\s|bed\s&\sbreakfast|bed\sand\sbreakfast|b&b|guesthouse|guest\shouse|vacation\srental|holiday\s(rental|apartment|home)|cabin\srental|cottage\srental|cottage(?!\sindustry)|chalet|airbnb|vrbo|travel\sagent|travel\sagency|tour\soperator|tour\sguide|tourist\s(information|office|attraction)|sightseeing|excursion|cruise|camping|campground|caravan\spark|rv\spark|glamping|youth\shostel|retreat\scenter(?!\sreligious)|boarding\shouse|rooming\shouse|dormitory(?!\sstudent)|rest\sstop|rest\sarea|truck\sstop|service\sarea|visitor\scenter|welcome\scenter|country\shouse|manor\shouse|estate\shouse|villa\srental|apartment\shotel|extended\sstay|residence\sinn|suite\shotel|capsule\shotel|love\shotel|ryokan|minshuku|pension\s|agriturismo|pousada|parador|paradores)', 'Hospitality_Travel'),
# PERSONAL SERVICES - Beauty, wellness, personal care
(r'(salon\s|spa\s(?!automotive)|massage(?!\schair)|tattoo|piercing|body\sart|barber|beauty\s(?!supply|store)|nail\s|manicure|pedicure|hair\s(salon|stylist|dresser|cut)|waxing|threading|lash|brow|eyelash|makeup\sartist|esthetician|cosmetolog|tanning|sunbed|sauna|steam\sroom|bathhouse|hammam|laundry|laundromat|dry\sclean|tailor|alteration|seamstress|shoe\srepair|cobbler|watch\srepair|key\scutting|weight\sloss|diet\scenter|personal\strainer|life\scoach|dating\sservice|matchmak)', 'Personal_Services'),
# FINANCE & INSURANCE - Banks, financial services
(r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|pawn(?!shop)|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany)', 'Finance_Insurance'),
# REAL ESTATE - Property, housing, storage
(r'(real\sestate|realtor|property\s(agent|management|company)|apartment\s(complex|building|rental)|condo|condominium|housing|home\sbuilder|land\sdeveloper|commercial\sreal|office\sspace|coworking|business\scenter|storage\s(facility|unit)|self.?storage|mini\sstorage|warehouse\sspace|parking\sspace|mobile\shome\spark|trailer\spark)', 'Real_Estate'),
# EVENTS & WEDDINGS - Event services, funeral
(r'(funeral|mortuary|cremation|crematorium|cemetery|memorial\s|casket|burial|wedding\s(planner|venue|dress|photographer)|event\s(planner|venue|center)|party\s(planner|supply|rental)|banquet\shall|reception\shall|conference\scenter|convention|meeting\sroom|catering\shall|dj\sservice|disc\sjockey|band\sfor\shire|balloon|decoration\sservice|tent\srental|photo\sbooth|florist(?!\sshop))', 'Events_Weddings'),
# NON-PROFIT - Charities, community organizations, social services
(r'(charity|charitable|non-?profit|ngo\s|foundation(?!\srepair)|community\scenter|community\sorganiz|civic\s|volunteer|food\sbank|soup\skitchen|homeless\s(shelter|service)|social\sservice|social\sworker|welfare\soffice|crisis\scenter|hotline|support\sgroup|self-?help|aa\s|alcoholics|narcotics\sanonymous|veteran|vfw|american\slegion|rotary|lions\sclub|kiwanis|elks|freemason|masonic|fraternal|chamber\sof\scommerce|chamber\sof\shandicrafts|trade\sassociation|professional\sassociation|labor\sunion|tenant.*union|indigenous|aboriginal|tribal|youth\scenter|youth\scare|youth\sgroup|senior\scitizen\scenter|women.s\s(shelter|center|protection)|domestic\sviolence|battered|abuse\s(shelter|center)|halfway\shouse|sober\sliving|addiction\s(center|service)|recovery\scenter|rehab\scenter(?!ilitation)|detox|mental\shealth\sadvocacy|disability\s(service|advocacy)|deaf\sservice|blind\sservice|immigrant\s(service|aid)|refugee\s(service|aid|camp)|legal\said|pro\sbono|family\sservice|family\splanning|birth\scontrol|child\swelfare|foster\scare|adoption\sagency|big\sbrothers|big\ssisters|boys\s&\sgirls|ymca|ywca|jewish\scommunity|jcc|salvation\sarmy|goodwill|habitat\sfor\shumanity|red\scross|united\sway|make-?a-?wish|special\solympics|donations\scenter|thrift(?!\sstore)|donation\sdrop|orphanage|children.*home|group\shome|shelter$|scouting|literacy\sprogram|crime\svictim|mediation\sservice|special\seducator|playgroup|student\sunion)', 'Non_Profit'),
# TECHNOLOGY - IT, software, telecom
(r'(software|app\sdevelop|web\sdevelop|it\sservice|it\ssupport|computer\sservice|computer\srepair|computer\ssecurity|computer\snetwork|tech\ssupport|data\scenter|data\srecovery|data\sentry|database|server\s(farm|hosting)|cloud\sservice|internet\sservice|isp\s|broadband|telecom|telephone\scompany|mobile\s(operator|network)|cell\sphone\sservice|fiber\soptic|satellite\s(communication|service)|cable\sprovider|cybersecurity|network|systems\sintegrat|bpo|call\scenter|outsourc|automation\scompany|home\sautomation|robotics|ai\scompany|machine\slearning|e-?commerce|digital\smarketing|seo|web\shost|domain\sregist|ssl|vpn|managed\sservice|msp|helpdesk|remote\ssupport|pc\srepair)', 'Technology'),
# AGRICULTURE - Farming, ranching
(r'(farm(?!acy|er.s\smarket)|ranch|agriculture|livestock|cattle|poultry|dairy\sfarm|pig\sfarm|sheep|goat|horse\sfarm|stable(?!\sservice)|equestrian\scenter|riding\sschool|crop|orchard|vineyard(?!\swinery)|plantation|greenhouse|horticulture|nursery(?!school)|floricult|aquaculture|fish\sfarm|beekeep|apiary|agronomy|fertilizer|seed\scompany|farm\sequipment|tractor|irrigation|grain|silo|feed\sstore|livestock\sauction|veterinari.*(large|farm|livestock))', 'Agriculture'),
# PETS & ANIMALS - Pet services, animal welfare
(r'(pet\s(?!rol)|animal\s(?!hospital|clinic)|dog\s(?!hot)|cat\s|bird\s(?!watch)|fish\s(?!market|restaurant)|reptile|aquarium\sstore|vet(?!eran)|veterinar(?!.*large|.*farm)|kennel|doggy\sdaycare|pet\sgrooming|pet\sboarding|pet\ssitting|dog\swalk|pet\strain|animal\sshelter|animal\srescue|animal\scontrol|humane\ssociety|spca|aspca|wildlife\srehab|sanctuary|cattery|aviary|breeder|stud\sservice|horse\sboarding|stable(?!\sindustry)|equine|farrier|horse\sshoe)', 'Pets_Animals'),
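# FINANCE & INSURANCE (second entry) - extends the earlier Finance_Insurance entry with investment, trading and cryptocurrency terms; only reached by names that no entry above matched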
(r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany|venture\scapital|private\sequity|hedge\sfund|asset\smanag|trust\scompany|escrow|title\scompany|credit\scounseling|financial\splanning|retirement\splanning|pension\sfund|401k|ira|annuity|securities|commodities|futures|options|trading|brokerage|fintech|mobile\smoney|digital\swallet|cryptocurrency|bitcoin|blockchain)', 'Finance_Insurance'),
# Catch more rentals and specialized services
(r'(equipment\srental|tool\srental|party\srental|tent\srental|chair\srental|table\srental|linen\srental|costume\srental|tuxedo\srental|dress\srental|appliance\srental|furniture\srental|office\sequipment\srental|audiovisual.*rental|av\srental|musical\sinstrument\srental|ski\srental|snowboard\srental|snowmobile\srental|jet\sski\srental|boat\srental|kayak\srental|canoe\srental|bicycle\srental|scooter\srental|segway|atv\srental|motorcycle\srental|rv\srental|camper\srental|trailer\srental|truck\srental|van\srental|car\srental|forklift\srental|crane\srental|scaffolding\srental|construction.*rental|dumpster\srental|portable\stoilet|porta.*potty)', 'Retail_Shopping'),
# Specialized restoration and repair services
(r'(restoration\sservice|furniture\srestoration|antique\srestoration|art\srestoration|photo\srestoration|document\srestoration|clock\srepair|watch\srepair|jewelry\srepair|shoe\srepair|luggage\srepair|leather\srepair|upholstery\srepair|musical\sinstrument\srepair|piano\stuning|guitar\srepair|violin\srepair|camera\srepair|electronics\srepair|phone\srepair|screen\srepair|computer\srepair|printer\srepair|copier\srepair|typewriter|sewing\smachine\srepair|vacuum\srepair|small\sengine\srepair|lawn\smower\srepair|chainsaw|power\stool\srepair|fire\sextinguisher\sservice|scale\srepair|calibration|water\sdamage\srestoration|fire\sdamage|smoke\sdamage|mold\sremediation|biohazard|crime\sscene\sclean|hoarding\sclean)', 'Home_Services'),
# Specialized trades and craftspeople
(r'(clock\smaker|watch\smaker|furniture\smaker|cabinet\smaker|instrument\smaker|stringed\sinstrument\smaker|piano\smaker|organ\sbuilder|luthier|bookbinder|print\smaker|engraver|etcher|lithograph|screen\sprint|sign\smaker|sign\spainter|glass\sblower|stained\sglass|ceramic|pottery|potter|sculptor|woodcarver|wood\sturner|basket\smaker|weaver|spinner|knitter|quilter|longarm|embroidery|monogram|tailor|seamstress|dressmaker|milliner|cobbler|saddle|harness|leather\scraft|upholster|framemaker|gilder|conservator|taxiderm|model\smaker|prop\smaker|costume\smaker|wig\smaker|prosthetic|mask\smaker|puppet|doll\smaker|toy\smaker)', 'Industrial'),
# Specialized testing and inspection services
(r'(testing\sservice|inspection\sservice|asbestos\stest|lead\stest|radon\stest|water\stest|soil\stest|air\squality|environmental\stest|mold\stest|home\sinspect|building\sinspect|property\sinspect|roof\sinspect|termite\sinspect|pest\sinspect|pool\sinspect|chimney\sinspect|septic\sinspect|well\sinspect|electrical\sinspect|plumbing\sinspect|hvac\sinspect|fire\sinspect|safety\sinspect|code\senforcement|energy\saudit|blower\sdoor|duct\stest|infrared|thermal\simag)', 'Professional_Services'),
# Personal and lifestyle services
(r'(psychic|astrologer|fortune\steller|fortune\stelling|palm\sread|tarot|medium|spiritual\sadvis|feng\shui|numerolog|grapholog|hypnotherap|hypnosis|past\slife|akashic|aura|chakra|reiki|energy\shealing|crystal\shealing|sound\shealing|aromatherap|reflexolog|iridolog|kinesiology|craniosacral|rolfing|alexander\stechnique|feldenkrais|pilates\sinstructor|yoga\sinstructor|meditation\sinstructor|breathwork|pranayama|ayurved|traditional\schinese|tcm|herbalist|naturopath|homeopath|beautician|esthetician|esthetics|body\sshaping|boot\scamp|loctician|mehandi|mehndi|teeth\swhitening|wellness\sprogram|alternative\smedicine\spractitioner)', 'Personal_Services'),
# More Government patterns
(r'(archive$|birth\scertificate|city\semployment|state\semployment|company\sregistry|district\sjustice|justice\sdepartment|land\splanning|urban\splanning|toll\sstation|traffic\sofficer|weigh\sstation|sanitary\sinspect|smog\sinspect|superfund|water\sworks|weather\sforecast|ground\sself\sdefense|united\sstates\sarmed|radio\sbroadcaster|television\sstation|closed\scircuit|communications\stower)', 'Government'),
# More Transportation patterns
(r'(boat\sramp|container\sterminal|helicopter\scharter|river\sport|transportation\sservice|transportation\sescort|fixed-?base\soperator|handicapped\stransportation|carpooling)', 'Transportation'),
# More Finance patterns
(r'(diamond\sbuyer|financial\sinstitution|holding\scompany|leasing\sservice|stock\sexchange|money\sorder|payment\sterminal)', 'Finance_Insurance'),
# More Real Estate patterns
(r'(corporate\soffice|display\shome|townhouse\scomplex|villa$|serviced\s(accommodation|apartment)|function\sroom|virtual\soffice)', 'Real_Estate'),
# More Entertainment/Sports patterns
(r'(fishing\s(camp|charter|pier)|horseback\sriding|horse\srental|equestrian\sfacility|outdoor\sequestrian|salsa\sclass|wood\sworking\sclass|stitching\sclass|childbirth\sclass|mehandi\sclass)', 'Entertainment'),
# More Industrial/Repair patterns
(r'(engine\srebuilding|machine\smaintenance|saw\ssharpening|skate\ssharpening|sharpening\sservice|lpg\sconversion|cng\sfitment|boat\sdetailing|rv\sdetailing|rv\srepair|bike\swash|fire\sprotection|elevator\sservice|drone\sservice)', 'Industrial'),
# More Retail patterns
(r'(haberdashery|jeweler$|lapidary|glass\smerchant|furniture\saccessories|showroom$|tesla\sshowroom|bottle.*redemption|coin\soperated)', 'Retail_Shopping'),
# More Professional Services patterns
(r'(building\sdesigner|polygraph|professional\sorganizer|video\s(conferencing|duplication|editing)|meeting\splanning|personal\sconcierge|house\ssitter|marriage\scelebrant|singing\stelegram|roommate\sreferral)', 'Professional_Services'),
# Miscellaneous remaining - specific items explicitly mapped to 'Other' (the same result as matching no pattern at all)
(r'(agistment|auction\shouse|appliances\scustomer|bicycle\srack|bridge$|building\sequipment\shire|container\sservice|distribution\sservice|diaper\sservice|divorce\sservice|drinking\swater\sfountain|energy\sequipment|environment\srenewable|forestry\sservice|fur\sservice|garbage\scollection|garden$|handicraft|hiking\sguide|homekill|judicial\sauction|key\sduplication|land\sallotment|line\smark|livery\scompany|lodge$|lodging$|lyceum|mailbox\srental|marquee\shire|memorial$|mercantile|mineral\swater\scompany|mold\smaker|office\srefurbish|oil\sand\sgas\sexploration|orchid\sgrower|package\slocker|pedestrian\szone|road\ssafety\stown|sacem|sailmaker|seating\ssystems|security\s(guard|service)|shoe\sshining|societe|staple\sfood|tenant\sownership|ticket\soffice|weir|wi-?fi\sspot)', 'Other'),
]
def get_sector_for_item(name):
    """
    Determine which sector an item belongs to.

    The name is lower-cased and tested against every (pattern, sector) entry of
    SECTOR_PATTERNS in list order using re.search, so the first entry whose
    pattern occurs anywhere in the name decides the result.

    Args:
        name: Category name to classify. None or an empty string is treated
            as unclassifiable.

    Returns:
        Sector slug of the first matching entry, or 'Other' if nothing matches
        or no name was supplied.
    """
    # A missing name cannot match any pattern; report it as 'Other' instead of
    # raising AttributeError on name.lower().
    if not name:
        return 'Other'

    name_lower = name.lower()
    for pattern, sector in SECTOR_PATTERNS:
        if re.search(pattern, name_lower, re.IGNORECASE):
            return sector
    return 'Other'
# ==================== LEVEL 2: BUSINESS TYPE PATTERNS ====================
# These are more specific patterns within each sector.
# Maps sector slug -> ordered list of (regex, business type). Within a sector
# the entries are tried in list order and the first match wins, so earlier
# entries shadow later ones that match the same text. Sectors without a key
# here have no level-2 refinement.
BUSINESS_TYPE_PATTERNS = {
    'Entertainment': [
        (r'(fitness|gym|workout|crossfit|pilates|yoga|aerobic|exercise|weight\s(room|training)|spin\sclass|bootcamp)', 'Fitness'),
        (r'(sports\s|athletic|stadium|arena|field\s|court\s|track\s|league|team\s)', 'Sports'),
        (r'(museum|exhibit|gallery|art\s(center|gallery)|sculpture)', 'Museums'),
        (r'(theater|theatre|playhouse|opera|ballet|symphony|orchestra|concert|performance|show)', 'Performing Arts'),
        (r'(cinema|movie|film|drive-?in)', 'Movies'),
        # 'park' must not fire on 'parking': the lookahead rejects a directly
        # following 'ing' (it previously read (?!\sing), which only rejected
        # 'park' + whitespace + 'ing' and therefore let 'parking' through).
        # NOTE(review): because this entry precedes 'Amusement', names such as
        # 'theme park' / 'water park' resolve to 'Parks' - confirm that is intended.
        (r'(park(?!ing)|playground|recreation|picnic|garden|botanical|arboretum|nature|trail)', 'Parks'),
        (r'(amusement|theme\spark|water\spark|carnival|fair|ride|attraction)', 'Amusement'),
        (r'(arcade|game|escape\sroom|laser|paintball|go.?kart|bowling|billiard|mini\sgolf)', 'Games & Recreation'),
        (r'(casino|gambling|betting|poker|slot)', 'Gambling'),
        (r'(club|nightclub|disco|bar|lounge)', 'Social'),
        (r'(zoo|aquarium|wildlife|safari|sanctuary)', 'Wildlife'),
        # NOTE(review): 'concert' is already claimed by 'Performing Arts' above.
        (r'(music|concert|jazz|blues|rock|karaoke)', 'Music Venues'),
    ],
    'Healthcare': [
        (r'(hospital|medical\scenter|health\scenter)', 'Hospitals'),
        (r'(clinic|office|practice|urgent\scare)', 'Clinics'),
        (r'(dentist|dental|orthodont|oral\ssurg|periodont|endodont)', 'Dental'),
        (r'(eye|vision|optom|optician|ophthalmolog)', 'Vision Care'),
        (r'(mental|psych|counsel|therapist|psychiatr)', 'Mental Health'),
        (r'(chiropract|acupunct|naturopath|homeopath|osteopath|alternative|holistic)', 'Alternative Medicine'),
        (r'(physical\stherap|occupational|speech|rehab)', 'Rehabilitation'),
        (r'(lab|diagnostic|patholog|radiology|x-?ray|imaging|blood\stest)', 'Diagnostics'),
        (r'(pharmacy|drugstore|prescription)', 'Pharmacies'),
        (r'(senior|aged|elder|nursing\shome|assisted)', 'Senior Care'),
        (r'(emergency|ambulance|paramedic|first\said|urgent)', 'Emergency Services'),
        (r'(veterinar|vet\s|animal\s(hospital|clinic))', 'Veterinary'),
        (r'(doctor|physician|surgeon|specialist|practitioner)', 'Medical Practitioners'),
    ],
    'Food_Dining': [
        (r'(restaurant|eatery|dining|bistro|brasserie|grill|steakhouse)', 'Restaurants'),
        (r'(cafe|café|coffee|espresso|tea\shouse)', 'Cafes & Coffee'),
        (r'(bar\s|pub|tavern|brewery|taproom|lounge|cocktail|wine\sbar)', 'Bars & Nightlife'),
        # Two entries feed the same 'Bakeries & Desserts' business type.
        (r'(bakery|patisserie|pastry|bread|donut|bagel)', 'Bakeries & Desserts'),
        (r'(ice\scream|gelato|dessert|frozen\syogurt|candy|chocolate)', 'Bakeries & Desserts'),
        (r'(fast\sfood|quick\sservice|drive.?thru|takeout|take.?away)', 'Quick Service'),
        (r'(caterer|catering|food\sservice|meal\sprep)', 'Food Services'),
        (r'(winery|distillery|vineyard)', 'Beverage Production'),
    ],
    'Home_Services': [
        (r'(plumb|pipe|drain|sewer|septic)', 'Plumbing'),
        (r'(electric|wiring|panel|outlet)', 'Electrical'),
        (r'(hvac|heat|cool|air\scondition|furnace)', 'HVAC'),
        (r'(roof|gutter|shingle)', 'Roofing'),
        (r'(landscap|lawn|garden|tree|arbor)', 'Landscaping'),
        (r'(clean|maid|janitor|housekeep)', 'Cleaning'),
        (r'(pest|exterminator|termite)', 'Pest Control'),
        (r'(paint|drywall|plaster|wallpaper)', 'Construction'),
        (r'(floor|carpet|tile|hardwood)', 'Flooring'),
        (r'(window|door|glass)', 'Windows & Doors'),
        (r'(pool|spa|hot\stub)', 'Pool & Spa'),
        (r'(security|alarm|lock|safe)', 'Security'),
        (r'(appliance|washer|dryer|refrigerator)', 'Appliance Repair'),
        (r'(handyman|repair|fix|maintenance)', 'General Repair'),
        (r'(construct|build|remodel|renovation|contractor)', 'Construction'),
        (r'(mov(er|ing)|relocat)', 'Moving'),
        (r'(interior|decor|design|stag)', 'Design'),
    ],
}
def get_business_type_for_item(name, sector):
    """
    Pick the business type for an item inside an already-chosen sector.

    The sector's (pattern, business_type) pairs from BUSINESS_TYPE_PATTERNS
    are tried in order and the first pattern found in the lower-cased name
    decides the result. Returns 'Other' when the sector has no pattern list
    or when none of its patterns match.
    """
    if sector in BUSINESS_TYPE_PATTERNS:
        lowered = name.lower()
        for regex, business_type in BUSINESS_TYPE_PATTERNS[sector]:
            # First hit wins: the order of the pattern list is significant.
            if re.search(regex, lowered, re.IGNORECASE) is not None:
                return business_type
    return 'Other'
def main():
    """Categorize item names and print a per-sector report.

    Item names are read one per line from the file named by the first
    command-line argument, or from standard input when no argument is given.
    Blank lines are ignored and surrounding whitespace is stripped.

    Each name is assigned a sector and a business type, then two reports are
    printed: a per-sector summary (total items and how many fell into the
    'Other' business type) and a detailed breakdown that lists every name for
    groups of up to 10 items and only the first 5 (sorted) for larger groups.
    """
    import sys

    # Read items from the given file, or from stdin when no path is supplied.
    if len(sys.argv) > 1:
        # Decode explicitly as UTF-8: category names contain non-ASCII text
        # (e.g. "café") and the platform default encoding is not guaranteed
        # to be UTF-8 everywhere.
        with open(sys.argv[1], encoding="utf-8") as f:
            items = [line.strip() for line in f if line.strip()]
    else:
        items = [line.strip() for line in sys.stdin if line.strip()]

    # Group names as by_sector[sector][business_type] -> [names], keeping the
    # input order inside each group.
    by_sector = {}
    for name in items:
        sector = get_sector_for_item(name)
        btype = get_business_type_for_item(name, sector)
        by_sector.setdefault(sector, {}).setdefault(btype, []).append(name)

    # Print summary
    print(f"Total items: {len(items)}\n")

    # Print sector summary
    print("=" * 60)
    print("SECTOR SUMMARY")
    print("=" * 60)
    for sector in sorted(by_sector):
        groups = by_sector[sector]
        total = sum(len(names) for names in groups.values())
        other_count = len(groups.get('Other', []))
        print(f"{sector}: {total} items ({other_count} in Other)")

    print("\n" + "=" * 60)
    print("DETAILED BREAKDOWN")
    print("=" * 60)
    for sector in sorted(by_sector):
        print(f"\n### {sector} ###")
        for btype in sorted(by_sector[sector]):
            names = by_sector[sector][btype]
            print(f" {btype}: {len(names)}")
            if len(names) <= 10:
                for name in sorted(names):
                    print(f" - {name}")
            else:
                # Large groups: show a sample of 5 and summarise the rest.
                for name in sorted(names)[:5]:
                    print(f" - {name}")
                print(f" ... and {len(names) - 5} more")


if __name__ == '__main__':
    main()

555
db/recategorize_other.py Normal file
View File

@@ -0,0 +1,555 @@
#!/usr/bin/env python3
"""
Recategorize items from Other.Uncategorized into appropriate existing categories.
RULES:
1. NEVER create new Level 1 (Sector) categories
2. Only create new Level 2 (Business Type) if >10 items would use it
3. Only create new Level 3 (Sub-category) if >5 items would use it
4. Prefer matching to existing categories at all times
5. If uncertain, leave in Other
EXISTING SECTORS (21 non-Other):
- Agriculture: Farming, Services
- Automotive: Dealers, Fuel & Charging, Parking, Parts & Accessories, Rental Services, Repair & Maintenance, Training, Vehicle Care
- Education: Arts Education, Early Childhood, Higher Education, K-12 Schools, Language Learning, Libraries, Professional Training, Specialty Schools, Sports Training, Technology Training, Tutoring, Vocational Training
- Entertainment: Amusement, Arts, Fitness, Gambling, Games & Recreation, Movies, Museums, Music Venues, Parks, Performing Arts, Recreation, Social, Sports, Venues, Wildlife
- Events_Weddings: Attire, Florists, Memorial, Planning, Rentals, Services, Venues
- Finance_Insurance: Banking, Insurance, Investment, Lending, Money Services
- Food_Dining: Bakeries & Desserts, Bars & Nightlife, Beverage Production, Cafes & Coffee, Food Services, Quick Service, Restaurants
- Government: International, Legal, Local Government, Postal, Public Safety, Social Services, Transportation
- Healthcare: Alternative Medicine, Clinics, Dental, Diagnostics, Emergency Services, Hospitals, Medical Practitioners, Mental Health, Pharmacies, Rehabilitation, Senior Care, Specialty Care, Veterinary, Vision Care
- Home_Services: Appliance Repair, Cleaning, Construction, Design, Electrical, Flooring, General Repair, HVAC, Landscaping, Moving, Pest Control, Plumbing, Pool & Spa, Roofing, Security, Windows & Doors
- Hospitality_Travel: Attractions, Lodging, Transportation, Travel Services
- Industrial: Construction, Manufacturing, Mining
- Non_Profit: Charities, Community, General, Professional
- Personal_Services: Body Art, Clothing Care, Fitness, Hair Care, Laundry, Massage, Spa & Wellness
- Pets_Animals: Animal Welfare, Pet Services
- Professional_Services: Agencies, Business Services, Consulting, Creative Services, Design, Engineering, Financial Services, HR Services, Language Services, Legal, Marketing & Advertising
- Real_Estate: Agencies, Commercial, Development, Management, Residential, Services, Storage
- Religious: Buddhism, Christian, Hinduism, Islam, Judaism, Other
- Retail_Shopping: Arts & Crafts, Beauty & Cosmetics, Books & Office, Clothing & Fashion, Electronics, Food & Grocery, Hardware & Building, Health & Pharmacy, Home & Garden, Jewelry & Watches, Markets, Music & Entertainment, Pet Supplies, Secondhand & Vintage, Specialty Retail, Sports & Outdoors, Toys & Hobbies, Wholesale & Distribution
- Technology: IT Services, Infrastructure, Software, Telecommunications
- Transportation: Delivery, Logistics, Passenger, Public Transit, Vehicle Services
"""
import psycopg2
import re
from collections import defaultdict
# Database connection string (PostgreSQL on localhost, port 5437).
# NOTE(review): the user name and password are hard-coded in this URL; consider
# reading the connection string from the environment before this script is
# used anywhere other than a local development database.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
def slugify(text):
    """Turn a display name into an underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped; every run of hyphens/whitespace then becomes one underscore,
    and underscores at either end are trimmed. Letter case is kept.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text)
    return re.sub(r'[-\s]+', '_', cleaned).strip('_')
# ==================== CATEGORIZATION RULES ====================
# Format: (keyword_pattern, sector, business_type, sub_category)
# Use regex patterns for flexibility.
#
# Rules are tried top to bottom and the first match wins (see
# categorize_item), so a specific rule must appear before any generic rule
# that would also match the same name.
#
# Exclusions whose qualifier normally comes BEFORE the keyword ("massage
# therapist", "country club", "food bank") are written as a whole-string guard
# at the start of the pattern:  ^(?!.*\bqualifier\b).*\bkeyword\b
# A lookahead placed after the keyword only inspects the text that follows
# it, so it is kept only where the qualifier follows ("art supply store").
#
# Stems that must match inflected words are written as stem\w* because a bare
# stem followed by \b (e.g. landscap\b) can never match "landscaping".
#
# NOTE(review): a few specific rules are still shadowed by earlier generic
# ones (e.g. "juice bar" hits the generic bar rule, "marina"/"airport"/"lodge"
# are claimed by earlier sections); reorder them if those sub-categories matter.
CATEGORIZATION_RULES = [
    # ==================== SPORTS & FITNESS (→ Entertainment.Sports or Entertainment.Fitness) ====================
    # Sports clubs and facilities
    (r'\b(basketball|baseball|football|soccer|tennis|golf|hockey|rugby|cricket|volleyball|badminton|squash|racquetball)\b.*(club|court|field|ground|stadium|arena|complex)', 'Entertainment', 'Sports', 'Facilities'),
    (r'\b(swimming|diving|aquatic|pool)\b.*(club|center|pool|facility)', 'Entertainment', 'Sports', 'Aquatic'),
    (r'\b(gym|fitness|workout|crossfit|aerobic|pilates|yoga|zumba)\b.*(center|studio|club|class)', 'Entertainment', 'Fitness', 'Studios'),
    (r'\b(martial arts|karate|judo|taekwondo|aikido|boxing|kickboxing|mma|wrestling|fencing)\b.*(club|school|academy|dojo|studio)', 'Entertainment', 'Sports', 'Martial_Arts'),
    (r'\b(archery|shooting|rifle|gun)\b.*(range|club|center)', 'Entertainment', 'Sports', 'Shooting'),
    (r'\b(skateboard|skate park|bmx|cycling|bicycle)\b.*(park|venue|club|center)', 'Entertainment', 'Sports', 'Cycling_Skating'),
    (r'\b(climbing|bouldering|rock climbing)\b.*(gym|wall|center|club)', 'Entertainment', 'Fitness', 'Climbing'),
    (r'\b(dance|ballet|ballroom|salsa|tango)\b.*(studio|school|class|instructor)', 'Entertainment', 'Performing Arts', 'Dance'),
    (r'\bsports\b.*(center|complex|facility|club)', 'Entertainment', 'Sports', 'General'),
    (r'\bathletic\b.*(field|track|club|center)', 'Entertainment', 'Sports', 'Facilities'),
    (r'\b(rowing|canoeing|kayaking|sailing|boat)\b.*(club|center|school)', 'Entertainment', 'Sports', 'Water_Sports'),
    (r'\b(equestrian|horse|polo|riding)\b.*(club|center|school|stable|arena)', 'Entertainment', 'Sports', 'Equestrian'),
    (r'\b(ski|snowboard|ice skating|ice rink)\b.*(resort|center|club|rink)', 'Entertainment', 'Sports', 'Winter_Sports'),
    # Instructors and trainers
    (r'\b(fitness|personal|sports|athletic)\b.*\b(trainer|instructor|coach)\b', 'Entertainment', 'Fitness', 'Trainers'),
    (r'\baerobic.*instructor\b', 'Entertainment', 'Fitness', 'Trainers'),
    # ==================== HEALTHCARE (various) ====================
    # Medical specialists
    (r'\b(allergist|anesthesiologist|cardiologist|dermatologist|endocrinologist|gastroenterologist|geriatrician|hematologist|immunologist|nephrologist|neurologist|oncologist|ophthalmologist|orthopedist|otolaryngologist|pathologist|pediatrician|physiatrist|podiatrist|proctologist|pulmonologist|radiologist|rheumatologist|urologist)\b', 'Healthcare', 'Medical Practitioners', 'Specialists'),
    (r'\b(audiologist|speech therapist|occupational therapist|physical therapist)\b', 'Healthcare', 'Rehabilitation', 'Therapists'),
    # Guard: "massage therapist" belongs to Personal_Services.Massage below.
    (r'^(?!.*\bmassage\b).*\b(psychologist|psychiatrist|counselor|therapist)\b', 'Healthcare', 'Mental Health', 'Practitioners'),
    (r'\b(chiropractor|osteopath|naturopath|homeopath|acupuncturist|herbalist)\b', 'Healthcare', 'Alternative Medicine', 'Practitioners'),
    (r'\b(optometrist|optician)\b', 'Healthcare', 'Vision Care', 'Practitioners'),
    (r'\b(medical|health)\b.*(center|clinic|office|practice)', 'Healthcare', 'Clinics', 'General'),
    (r'\b(aged care|elder care|senior care|nursing home|assisted living|retirement)\b', 'Healthcare', 'Senior Care', 'Facilities'),
    (r'\b(blood bank|blood donation|plasma)\b', 'Healthcare', 'Diagnostics', 'Blood_Services'),
    (r'\b(dialysis|kidney)\b.*(center|clinic)', 'Healthcare', 'Specialty Care', 'Dialysis'),
    (r'\b(fertility|ivf|reproductive)\b.*(clinic|center)', 'Healthcare', 'Specialty Care', 'Fertility'),
    (r'\b(hospice|palliative)\b', 'Healthcare', 'Senior Care', 'Hospice'),
    (r'\b(medical lab|laboratory|pathology|diagnostic)\b.*(center|lab)', 'Healthcare', 'Diagnostics', 'Labs'),
    (r'\b(ambulance|emergency|paramedic|first aid)\b', 'Healthcare', 'Emergency Services', 'EMS'),
    # ==================== AUTOMOTIVE (various) ====================
    (r'\bauto\b.*(body|paint|dent|collision|restoration|upholster)', 'Automotive', 'Repair & Maintenance', 'Body_Work'),
    (r'\bauto\b.*(repair|mechanic|service|tune.?up|brake|transmission|radiator)', 'Automotive', 'Repair & Maintenance', 'Mechanical'),
    (r'\bauto\b.*(auction|broker|dealer)', 'Automotive', 'Dealers', 'Used_Vehicles'),
    (r'\bauto\b.*(wrecker|salvage|junk|dismantl)', 'Automotive', 'Parts & Accessories', 'Salvage'),
    (r'\b(car|vehicle|auto)\b.*(wash|detail|clean|wax)', 'Automotive', 'Vehicle Care', 'Cleaning'),
    (r'\b(car|vehicle|auto)\b.*(rental|hire|lease)', 'Automotive', 'Rental Services', 'Vehicles'),
    (r'\b(car|vehicle|auto)\b.*(storage|parking)', 'Automotive', 'Parking', 'Storage'),
    (r'\b(motorcycle|motorbike|scooter|atv|quad)\b.*(dealer|shop|rental|repair)', 'Automotive', 'Dealers', 'Motorcycles'),
    (r'\b(tire|tyre|wheel)\b.*(shop|store|service|dealer)', 'Automotive', 'Parts & Accessories', 'Tires'),
    (r'\b(driving|driver)\b.*(school|training|instructor|lesson)', 'Automotive', 'Training', 'Driving_Schools'),
    (r'\btruck\b.*(stop|dealer|rental|repair)', 'Automotive', 'Dealers', 'Trucks'),
    (r'\b(rickshaw|auto rickshaw)\b', 'Transportation', 'Passenger', 'Local'),
    # ==================== GOVERNMENT & MILITARY ====================
    (r'\b(air force|army|navy|military|armed forces)\b.*(base|facility|office|recruitment)', 'Government', 'Public Safety', 'Military'),
    (r'\b(police|sheriff|law enforcement)\b.*(station|department|office)', 'Government', 'Public Safety', 'Police'),
    (r'\b(fire|firefighter)\b.*(station|department)', 'Government', 'Public Safety', 'Fire'),
    (r'\b(court|courthouse|tribunal|judiciary)\b', 'Government', 'Legal', 'Courts'),
    (r'\b(embassy|consulate|visa)\b.*(office|center)', 'Government', 'International', 'Diplomatic'),
    (r'\b(city|town|municipal|county|district|borough)\b.*(hall|office|government|administration)', 'Government', 'Local Government', 'Offices'),
    (r'\b(social services|welfare|unemployment|disability)\b.*(office|center)', 'Government', 'Social Services', 'Welfare'),
    (r'\b(dmv|driver.*license|vehicle registration|motor vehicle)\b', 'Government', 'Transportation', 'DMV'),
    (r'\b(passport|immigration|citizenship)\b.*(office|center)', 'Government', 'International', 'Immigration'),
    (r'\b(aadhaar|agenzia entrate|tax)\b.*(office|center)', 'Government', 'Local Government', 'Tax'),
    (r'\b(asylum|refugee)\b.*(center|office)', 'Government', 'Social Services', 'Refugee'),
    # ==================== PETS & ANIMALS ====================
    (r'\b(animal|pet)\b.*(shelter|rescue|adoption|welfare|pound|sanctuary)', 'Pets_Animals', 'Animal Welfare', 'Shelters'),
    (r'\b(animal|pet)\b.*(hospital|clinic|vet|veterinary)', 'Healthcare', 'Veterinary', 'Clinics'),
    (r'\b(animal|pet)\b.*(grooming|boarding|kennel|daycare|sitting|walking)', 'Pets_Animals', 'Pet Services', 'Care'),
    (r'\b(animal|pet)\b.*(training|obedience|behavior)', 'Pets_Animals', 'Pet Services', 'Training'),
    (r'\b(dog|cat|bird|fish|reptile|aquarium)\b.*(breeder|shop|store)', 'Retail_Shopping', 'Pet Supplies', 'Breeders'),
    (r'\bzoo\b|aquarium|wildlife.*park|safari', 'Entertainment', 'Wildlife', 'Zoos'),
    # ==================== RELIGIOUS ====================
    (r'\b(church|chapel|cathedral|basilica|parish)\b', 'Religious', 'Christian', 'Churches'),
    (r'\b(temple|mandir|hindu)\b', 'Religious', 'Hinduism', 'Temples'),
    (r'\b(mosque|masjid|islamic)\b', 'Religious', 'Islam', 'Mosques'),
    (r'\b(synagogue|jewish|judaism)\b', 'Religious', 'Judaism', 'Synagogues'),
    (r'\b(buddhist|buddha|monastery|zen|meditation center)\b', 'Religious', 'Buddhism', 'Temples'),
    (r'\b(ashram|spiritual|guru)\b', 'Religious', 'Other', 'Spiritual'),
    (r'\b(baha.*i|sikh|gurdwara|shinto)\b', 'Religious', 'Other', 'Houses_of_Worship'),
    # ==================== EDUCATION ====================
    (r'\b(university|college|faculty|academic department)\b', 'Education', 'Higher Education', 'Universities'),
    # Guard: animal daycare/nursery is not early-childhood education.
    (r'^(?!.*\banimals?\b).*\b(preschool|kindergarten|nursery|daycare|child.*care|creche)\b', 'Education', 'Early Childhood', 'Preschools'),
    # Guard: specialty schools ("music school", "flight academy") are handled
    # by their own rules; the qualifier usually precedes "school"/"academy".
    (r'^(?!.*\b(driving|martial|dance|music|arts?|beauty|cooking|flight)\b).*\b(school|academy)\b', 'Education', 'K-12 Schools', 'General'),
    (r'\b(language|esl|english)\b.*(school|class|course|learning)', 'Education', 'Language Learning', 'Schools'),
    (r'\b(art|drawing|painting)\b.*(school|class|studio)', 'Education', 'Arts Education', 'Visual_Arts'),
    (r'\b(music|piano|guitar|violin|drum)\b.*(school|lesson|instructor|teacher)', 'Education', 'Arts Education', 'Music'),
    (r'\b(acting|theater|drama)\b.*(school|class|academy)', 'Education', 'Arts Education', 'Performing'),
    (r'\b(tutoring|tutor|coaching)\b.*(center|service)', 'Education', 'Tutoring', 'General'),
    # Special libraries must be tested before the generic library rule,
    # otherwise that rule claims every name containing "library".
    (r'\b(archive|historical|museum)\b.*library', 'Education', 'Libraries', 'Special'),
    (r'\b(library|public library)\b', 'Education', 'Libraries', 'Public'),
    (r'\b(vocational|trade|technical)\b.*(school|training|institute)', 'Education', 'Vocational Training', 'General'),
    (r'\b(apprentice|internship)\b', 'Education', 'Vocational Training', 'Apprenticeships'),
    (r'\b(flight|aviation|pilot)\b.*(school|training|academy)', 'Education', 'Specialty Schools', 'Aviation'),
    (r'\b(cooking|culinary|chef)\b.*(school|class|academy)', 'Education', 'Specialty Schools', 'Culinary'),
    (r'\b(beauty|cosmetology|esthetician)\b.*(school|academy)', 'Education', 'Specialty Schools', 'Beauty'),
    # ==================== HOME SERVICES ====================
    (r'\b(bathroom|kitchen)\b.*(remodel|renovation|contractor)', 'Home_Services', 'Construction', 'Remodeling'),
    (r'\b(general|home)\b.*contractor', 'Home_Services', 'Construction', 'General'),
    # Guard: automotive painting is not a home service.
    (r'^(?!.*\bauto(motive)?\b).*\b(painter|painting)\b.*(contractor|service|company)', 'Home_Services', 'Construction', 'Painting'),
    (r'\b(carpenter|carpentry|cabinet|woodwork)\b', 'Home_Services', 'Construction', 'Carpentry'),
    (r'\b(mason|masonry|brick|concrete|stone)\b.*(contractor|service|company)', 'Home_Services', 'Construction', 'Masonry'),
    (r'\b(electrician|electrical)\b.*(contractor|service|company)', 'Home_Services', 'Electrical', 'Contractors'),
    (r'\b(plumber|plumbing)\b.*(contractor|service|company)', 'Home_Services', 'Plumbing', 'Contractors'),
    (r'\b(hvac|heating|air conditioning|furnace)\b.*(contractor|service|company)', 'Home_Services', 'HVAC', 'Contractors'),
    (r'\b(roofer|roofing)\b.*(contractor|service|company)', 'Home_Services', 'Roofing', 'Contractors'),
    (r'\b(landscap\w*|lawn|garden)\b.*(service|company|contractor)(?!.*store|.*center)', 'Home_Services', 'Landscaping', 'Services'),
    (r'\b(pool|spa)\b.*(service|cleaning|maintenance|contractor)', 'Home_Services', 'Pool & Spa', 'Services'),
    (r'\b(pest|exterminator|termite)\b.*(control|service)', 'Home_Services', 'Pest Control', 'Services'),
    (r'\b(cleaning|maid|janitorial|housekeeping)\b.*(service|company)', 'Home_Services', 'Cleaning', 'Services'),
    (r'\b(window)\b.*(cleaning|wash)', 'Home_Services', 'Cleaning', 'Window'),
    (r'\b(appliance)\b.*(repair|service)', 'Home_Services', 'Appliance Repair', 'Services'),
    (r'\b(handyman|odd job|home repair)\b', 'Home_Services', 'General Repair', 'Handyman'),
    (r'\b(moving|movers|relocation)\b.*(company|service)', 'Home_Services', 'Moving', 'Services'),
    (r'\b(locksmith)\b', 'Home_Services', 'Security', 'Locksmith'),
    (r'\b(alarm|security system)\b.*(company|service|installer)', 'Home_Services', 'Security', 'Systems'),
    (r'\b(arborist|tree)\b.*(service|removal|trimming)', 'Home_Services', 'Landscaping', 'Tree_Service'),
    (r'\b(fence)\b.*(contractor|company|install)', 'Home_Services', 'Construction', 'Fencing'),
    (r'\b(garage door)\b.*(service|repair|install)', 'Home_Services', 'General Repair', 'Garage_Doors'),
    (r'\b(gutter)\b.*(cleaning|service|install)', 'Home_Services', 'Construction', 'Gutters'),
    (r'\b(insulation)\b.*(contractor|company)', 'Home_Services', 'Construction', 'Insulation'),
    (r'\b(deck|patio)\b.*(builder|contractor)', 'Home_Services', 'Construction', 'Outdoor'),
    (r'\b(drywall|sheetrock)\b', 'Home_Services', 'Construction', 'Drywall'),
    (r'\b(flooring|carpet|tile|hardwood)\b.*(install|contractor|company)(?!.*store)', 'Home_Services', 'Flooring', 'Installation'),
    (r'\b(window|door)\b.*(install|replacement|contractor)', 'Home_Services', 'Windows & Doors', 'Installation'),
    # Guard: auto glass is not a home service.
    (r'^(?!.*\bauto(motive)?\b).*\b(glass)\b.*(repair|replacement|company)', 'Home_Services', 'Windows & Doors', 'Glass'),
    (r'\b(chimney)\b.*(sweep|cleaning|repair)', 'Home_Services', 'General Repair', 'Chimney'),
    (r'\b(septic|sewer)\b.*(service|pumping|cleaning)', 'Home_Services', 'Plumbing', 'Septic'),
    (r'\b(well)\b.*(drilling|service|pump)', 'Home_Services', 'Plumbing', 'Wells'),
    (r'\b(solar)\b.*(install|contractor|company)', 'Home_Services', 'Electrical', 'Solar'),
    # ==================== RETAIL & SHOPPING ====================
    (r'\b(antique|vintage|secondhand|thrift|consignment|pawn)\b.*(shop|store)', 'Retail_Shopping', 'Secondhand & Vintage', 'Stores'),
    (r'\b(auction)\b.*(house|company)', 'Retail_Shopping', 'Secondhand & Vintage', 'Auctions'),
    (r'\b(art|craft|hobby)\b.*(supply|store|shop)', 'Retail_Shopping', 'Arts & Crafts', 'Supplies'),
    (r'\b(toy|game|hobby)\b.*(store|shop)', 'Retail_Shopping', 'Toys & Hobbies', 'Stores'),
    (r'\b(book|stationery|office supply)\b.*(store|shop)', 'Retail_Shopping', 'Books & Office', 'Stores'),
    (r'\b(music|instrument|record|vinyl)\b.*(store|shop)', 'Retail_Shopping', 'Music & Entertainment', 'Stores'),
    (r'\b(sporting|sports|outdoor|camping|fishing|hunting)\b.*(goods|store|shop)', 'Retail_Shopping', 'Sports & Outdoors', 'Stores'),
    (r'\b(electronics|computer|phone|appliance)\b.*(store|shop|retailer)', 'Retail_Shopping', 'Electronics', 'Stores'),
    (r'\b(furniture|home decor|bedding|mattress)\b.*(store|shop)', 'Retail_Shopping', 'Home & Garden', 'Stores'),
    (r'\b(clothing|fashion|apparel|boutique|shoe)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Stores'),
    (r'\b(jewelry|watch|gem)\b.*(store|shop)', 'Retail_Shopping', 'Jewelry & Watches', 'Stores'),
    (r'\b(hardware|tool|building supply|lumber)\b.*(store|shop)', 'Retail_Shopping', 'Hardware & Building', 'Stores'),
    (r'\b(garden|nursery|plant)\b.*(center|store|shop)', 'Retail_Shopping', 'Home & Garden', 'Garden_Centers'),
    (r'\b(pharmacy|drugstore)\b', 'Retail_Shopping', 'Health & Pharmacy', 'Pharmacies'),
    (r'\b(cosmetic|beauty|makeup)\b.*(store|shop)', 'Retail_Shopping', 'Beauty & Cosmetics', 'Stores'),
    (r'\b(grocery|supermarket|food|convenience)\b.*(store|market|shop)', 'Retail_Shopping', 'Food & Grocery', 'Stores'),
    (r'\b(liquor|wine|beer|alcohol)\b.*(store|shop)', 'Retail_Shopping', 'Food & Grocery', 'Liquor'),
    (r'\b(tobacco|cigar|vape|smoke)\b.*(shop|store)', 'Retail_Shopping', 'Specialty Retail', 'Tobacco'),
    (r'\b(mobile phone|cell phone)\b.*(store|shop|dealer)', 'Retail_Shopping', 'Electronics', 'Phones'),
    (r'\b(optical|eyewear|glasses|sunglass)\b.*(store|shop)', 'Retail_Shopping', 'Health & Pharmacy', 'Optical'),
    (r'\b(florist|flower)\b.*(shop|store)', 'Events_Weddings', 'Florists', 'Shops'),
    (r'\b(bridal|wedding)\b.*(shop|store|boutique)', 'Events_Weddings', 'Attire', 'Bridal'),
    (r'\b(uniform|workwear)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Specialty'),
    # ==================== PROFESSIONAL SERVICES ====================
    (r'\b(lawyer|attorney|law firm|legal)\b.*(office|firm|service)', 'Professional_Services', 'Legal', 'Firms'),
    # Guard: government tax offices are handled in the Government section.
    (r'^(?!.*\bgovernment\b).*\b(accountant|accounting|bookkeep\w*|tax)\b.*(firm|service|office)', 'Professional_Services', 'Financial Services', 'Accounting'),
    (r'\b(architect|architecture)\b.*(firm|office|studio)', 'Professional_Services', 'Engineering', 'Architecture'),
    (r'\b(engineer|engineering)\b.*(firm|office|company)', 'Professional_Services', 'Engineering', 'Firms'),
    (r'\b(surveyor|surveying|land survey)\b', 'Professional_Services', 'Engineering', 'Surveying'),
    (r'\b(consultant|consulting)\b.*(firm|company|service)', 'Professional_Services', 'Consulting', 'General'),
    (r'\b(marketing|advertising|pr|public relations)\b.*(agency|firm|company)', 'Professional_Services', 'Marketing & Advertising', 'Agencies'),
    (r'\b(graphic|web|design)\b.*(studio|agency|firm)', 'Professional_Services', 'Creative Services', 'Design'),
    (r'\b(photography|photographer|video|videograph\w*)\b.*(studio|service)', 'Professional_Services', 'Creative Services', 'Photography'),
    (r'\b(translation|interpreter|language)\b.*service', 'Professional_Services', 'Language Services', 'Translation'),
    (r'\b(staffing|recruiting|employment|hr)\b.*(agency|service|firm)', 'Professional_Services', 'HR Services', 'Agencies'),
    (r'\b(notary|notarial)\b', 'Professional_Services', 'Legal', 'Notary'),
    (r'\b(private investigator|detective)\b', 'Professional_Services', 'Agencies', 'Investigation'),
    (r'\b(appraiser|appraisal|valuation)\b', 'Professional_Services', 'Financial Services', 'Appraisal'),
    (r'\b(auditor|audit)\b.*(firm|service)', 'Professional_Services', 'Financial Services', 'Audit'),
    (r'\b(courier|messenger|delivery)\b.*service', 'Transportation', 'Delivery', 'Courier'),
    # ==================== ARTS & CULTURE ====================
    (r'\b(art|gallery|exhibition)\b(?!.*supply|.*store|.*school)', 'Entertainment', 'Arts', 'Galleries'),
    (r'\b(museum)\b', 'Entertainment', 'Museums', 'General'),
    (r'\b(theater|theatre|playhouse|opera house)\b', 'Entertainment', 'Performing Arts', 'Venues'),
    (r'\b(cinema|movie theater|multiplex)\b', 'Entertainment', 'Movies', 'Theaters'),
    (r'\b(concert|music)\b.*(hall|venue)', 'Entertainment', 'Music Venues', 'Concert_Halls'),
    (r'\b(band|orchestra|choir|ensemble)\b', 'Entertainment', 'Performing Arts', 'Groups'),
    (r'\b(comedian|comedy club)\b', 'Entertainment', 'Performing Arts', 'Comedy'),
    # Guard: "makeup artist" belongs to Personal_Services below.
    (r'^(?!.*\bmakeup\b).*\b(artist|sculptor|painter)\b', 'Entertainment', 'Arts', 'Artists'),
    (r'\b(animation|animator)\b.*(studio|company)', 'Professional_Services', 'Creative Services', 'Animation'),
    (r'\b(recording|music)\b.*studio', 'Professional_Services', 'Creative Services', 'Recording'),
    (r'\b(art restoration|restoration service)\b', 'Professional_Services', 'Creative Services', 'Restoration'),
    # ==================== ENTERTAINMENT & RECREATION ====================
    (r'\b(amusement|theme)\b.*park', 'Entertainment', 'Amusement', 'Parks'),
    (r'\b(arcade|game center|gaming)\b', 'Entertainment', 'Games & Recreation', 'Arcades'),
    (r'\b(escape room|puzzle room)\b', 'Entertainment', 'Games & Recreation', 'Escape_Rooms'),
    (r'\b(bowling)\b.*(alley|center)', 'Entertainment', 'Games & Recreation', 'Bowling'),
    (r'\b(billiard|pool hall|snooker)\b', 'Entertainment', 'Games & Recreation', 'Billiards'),
    (r'\b(karaoke)\b', 'Entertainment', 'Music Venues', 'Karaoke'),
    (r'\b(casino|gambling|betting)\b', 'Entertainment', 'Gambling', 'Casinos'),
    # Named club types must be tested before the generic nightclub/club rule.
    (r'\b(country club|private club|social club)\b', 'Entertainment', 'Social', 'Clubs'),
    # Guard: golf/country/tennis clubs are not nightlife.
    (r'^(?!.*\b(golf|country|tennis)\b).*\b(nightclub|disco|club)\b', 'Food_Dining', 'Bars & Nightlife', 'Nightclubs'),
    (r'\b(botanical garden|arboretum)\b', 'Entertainment', 'Parks', 'Gardens'),
    # Guard: theme/water/trailer/mobile-home parks are not public parks.
    (r'^(?!.*\b(theme|water|trailer|mobile)\b).*\b(park|playground|recreation area)\b', 'Entertainment', 'Parks', 'Public'),
    (r'\b(beach|waterfront|marina)\b(?!.*hotel)', 'Entertainment', 'Parks', 'Beaches'),
    (r'\b(campground|camping|rv park|caravan)\b', 'Hospitality_Travel', 'Lodging', 'Camping'),
    (r'\b(go.?kart|kart|karting)\b', 'Entertainment', 'Games & Recreation', 'Karting'),
    (r'\b(laser tag|paintball)\b', 'Entertainment', 'Games & Recreation', 'Adventure'),
    (r'\b(trampoline|bounce|jump)\b.*(park|center)', 'Entertainment', 'Games & Recreation', 'Trampoline'),
    (r'\b(mini golf|miniature golf|putt.?putt)\b', 'Entertainment', 'Games & Recreation', 'Mini_Golf'),
    (r'\b(water park|aqua park)\b', 'Entertainment', 'Amusement', 'Water_Parks'),
    (r'\b(haunted|horror)\b.*(house|attraction)', 'Entertainment', 'Amusement', 'Attractions'),
    (r'\b(circus|carnival|fair)\b', 'Entertainment', 'Amusement', 'Shows'),
    (r'\b(planetarium|observatory)\b', 'Entertainment', 'Museums', 'Science'),
    # ==================== FOOD & DINING ====================
    # Guard: "brow bar" / "eyebrow bar" belongs to Personal_Services below.
    (r'^(?!.*\b(brow|eyebrow)s?\b).*\b(bar|pub|tavern|lounge|brewery|taproom|brewpub)\b', 'Food_Dining', 'Bars & Nightlife', 'Bars'),
    (r'\b(cafe|coffee|espresso)\b.*(shop|house|bar)', 'Food_Dining', 'Cafes & Coffee', 'Cafes'),
    (r'\b(restaurant|eatery|diner|bistro|brasserie|grill)\b', 'Food_Dining', 'Restaurants', 'General'),
    (r'\b(bakery|patisserie|pastry)\b', 'Food_Dining', 'Bakeries & Desserts', 'Bakeries'),
    (r'\b(ice cream|gelato|frozen yogurt|dessert)\b.*(shop|parlor|store)', 'Food_Dining', 'Bakeries & Desserts', 'Desserts'),
    (r'\b(caterer|catering)\b', 'Food_Dining', 'Food Services', 'Catering'),
    (r'\b(food truck|food cart)\b', 'Food_Dining', 'Quick Service', 'Mobile'),
    (r'\b(juice|smoothie)\b.*(bar|shop)', 'Food_Dining', 'Cafes & Coffee', 'Juice'),
    (r'\b(tea|bubble tea|boba)\b.*(shop|house|room)', 'Food_Dining', 'Cafes & Coffee', 'Tea'),
    (r'\b(winery|vineyard|wine)\b.*(tasting|cellar)', 'Food_Dining', 'Beverage Production', 'Wineries'),
    (r'\b(distillery|spirit)\b', 'Food_Dining', 'Beverage Production', 'Distilleries'),
    (r'\b(butcher|meat)\b.*shop', 'Retail_Shopping', 'Food & Grocery', 'Butchers'),
    (r'\b(fish|seafood)\b.*market', 'Retail_Shopping', 'Food & Grocery', 'Seafood'),
    (r'\b(deli|delicatessen)\b', 'Retail_Shopping', 'Food & Grocery', 'Delis'),
    (r'\b(candy|chocolate|sweet|confection)\b.*(shop|store)', 'Retail_Shopping', 'Food & Grocery', 'Confectionery'),
    # ==================== PERSONAL SERVICES ====================
    (r'\b(barber|hair)\b.*(shop|salon|stylist)', 'Personal_Services', 'Hair Care', 'Salons'),
    (r'\b(beauty|nail|manicure|pedicure)\b.*(salon|spa|studio)', 'Personal_Services', 'Spa & Wellness', 'Beauty'),
    (r'\b(tattoo|piercing|body art)\b.*(shop|studio|parlor)', 'Personal_Services', 'Body Art', 'Studios'),
    (r'\b(massage)\b.*(therapist|spa|parlor|studio)', 'Personal_Services', 'Massage', 'Studios'),
    (r'\b(spa|wellness|day spa)\b', 'Personal_Services', 'Spa & Wellness', 'Spas'),
    (r'\b(tanning|sunbed)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Tanning'),
    (r'\b(laundry|laundromat|dry clean|tailor|alteration|seamstress)\b', 'Personal_Services', 'Laundry', 'Services'),
    (r'\b(shoe repair|cobbler)\b', 'Personal_Services', 'Clothing Care', 'Shoe_Repair'),
    (r'\b(brow|eyebrow|lash|eyelash)\b.*(bar|salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Brows_Lashes'),
    (r'\b(makeup artist|stylist)\b', 'Personal_Services', 'Spa & Wellness', 'Makeup'),
    (r'\b(sauna|steam room|bathhouse|hammam)\b', 'Personal_Services', 'Spa & Wellness', 'Baths'),
    (r'\b(waxing)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Waxing'),
    # ==================== HOSPITALITY & TRAVEL ====================
    (r'\b(hotel|motel|inn|resort|hostel|lodge|bed and breakfast|b&b|guesthouse)\b', 'Hospitality_Travel', 'Lodging', 'Hotels'),
    (r'\b(travel|tour)\b.*(agency|operator|company)', 'Hospitality_Travel', 'Travel Services', 'Agencies'),
    (r'\b(airline|airport|aviation)\b(?!.*school)', 'Transportation', 'Passenger', 'Air'),
    (r'\b(cruise|ferry)\b.*(line|terminal|port)', 'Transportation', 'Passenger', 'Water'),
    (r'\b(train|rail)\b.*(station|service)', 'Transportation', 'Passenger', 'Rail'),
    (r'\b(bus|coach)\b.*(station|terminal|service|company)', 'Transportation', 'Passenger', 'Bus'),
    (r'\b(taxi|cab|ride|uber|lyft|limo|limousine|chauffeur)\b.*(service|company|stand)', 'Transportation', 'Passenger', 'Taxi'),
    (r'\b(tourist|visitor)\b.*(information|center|bureau)', 'Hospitality_Travel', 'Travel Services', 'Information'),
    (r'\b(rental)\b.*\b(cabin|cottage|vacation|holiday)\b', 'Hospitality_Travel', 'Lodging', 'Rentals'),
    # ==================== INDUSTRIAL & MANUFACTURING ====================
    (r'\b(factory|plant|mill|manufacturing)\b', 'Industrial', 'Manufacturing', 'General'),
    (r'\b(warehouse|distribution|logistics)\b.*(center|facility)', 'Transportation', 'Logistics', 'Warehouses'),
    (r'\b(machine|machinist|metalwork|welding|welder)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Metal'),
    (r'\b(print|printing|press)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Printing'),
    (r'\b(textile|fabric|garment)\b.*(factory|mill|manufacturer)', 'Industrial', 'Manufacturing', 'Textile'),
    (r'\b(chemical|pharmaceutical)\b.*(company|manufacturer|plant)', 'Industrial', 'Manufacturing', 'Chemical'),
    (r'\b(construction|building)\b.*(company|contractor|firm)', 'Industrial', 'Construction', 'General'),
    (r'\b(quarry|gravel|sand|aggregate)\b', 'Industrial', 'Mining', 'Quarries'),
    (r'\b(sawmill|lumber)\b.*(mill|yard)', 'Industrial', 'Manufacturing', 'Wood'),
    (r'\b(steel|iron|aluminum)\b.*(plant|manufacturer|company)', 'Industrial', 'Manufacturing', 'Metal'),
    (r'\b(packaging|container)\b.*(company|manufacturer)', 'Industrial', 'Manufacturing', 'Packaging'),
    (r'\b(recycling|waste)\b.*(center|facility|company)', 'Industrial', 'Manufacturing', 'Recycling'),
    # ==================== REAL ESTATE ====================
    (r'\b(real estate|realtor|property)\b.*(agent|agency|company)', 'Real_Estate', 'Agencies', 'Agents'),
    (r'\b(property management|apartment|rental)\b.*(company|agency)', 'Real_Estate', 'Management', 'Residential'),
    (r'\b(storage|self storage|mini storage)\b.*(facility|unit)', 'Real_Estate', 'Storage', 'Self_Storage'),
    (r'\b(office|commercial)\b.*(space|building|complex)', 'Real_Estate', 'Commercial', 'Office'),
    (r'\b(apartment|condo|housing)\b.*(complex|building|community)', 'Real_Estate', 'Residential', 'Apartments'),
    (r'\b(home builder|housing development)\b', 'Real_Estate', 'Development', 'Residential'),
    # ==================== NON-PROFIT & COMMUNITY ====================
    # Guard: investment funds are handled in the Finance section.
    (r'^(?!.*\binvestments?\b).*\b(charity|charitable|foundation|fund)\b', 'Non_Profit', 'Charities', 'General'),
    (r'\b(non.?profit|ngo|association)\b', 'Non_Profit', 'General', 'Organizations'),
    (r'\b(community|civic|neighborhood)\b.*(center|organization|association)', 'Non_Profit', 'Community', 'Centers'),
    (r'\b(youth|boys|girls|scout)\b.*(club|organization|center)', 'Non_Profit', 'Community', 'Youth'),
    # Guard: senior *care* centers are healthcare, not community clubs.
    (r'^(?!.*\bcare\b).*\b(senior|elder)\b.*(center|club)', 'Non_Profit', 'Community', 'Seniors'),
    (r'\b(veterans|vfw|american legion)\b', 'Non_Profit', 'Community', 'Veterans'),
    (r'\b(rotary|lions|kiwanis|elks|freemason|lodge)\b', 'Non_Profit', 'Community', 'Fraternal'),
    (r'\b(union|labor)\b.*(hall|organization)', 'Non_Profit', 'Professional', 'Unions'),
    (r'\b(chamber of commerce|business association)\b', 'Non_Profit', 'Professional', 'Business'),
    (r'\b(aboriginal|indigenous|tribal)\b.*(organization|center)', 'Non_Profit', 'Community', 'Indigenous'),
    # ==================== TECHNOLOGY ====================
    (r'\b(software|app|web)\b.*(developer|development|company)', 'Technology', 'Software', 'Development'),
    (r'\b(it|computer|tech)\b.*(service|support|repair)', 'Technology', 'IT Services', 'Support'),
    (r'\b(data center|server|cloud)\b', 'Technology', 'Infrastructure', 'Data_Centers'),
    (r'\b(internet|isp|broadband|telecom)\b.*(provider|service|company)', 'Technology', 'Telecommunications', 'Providers'),
    (r'\b(bpo|call center|outsourc\w*)\b', 'Technology', 'IT Services', 'BPO'),
    (r'\b(automation|robot)\b.*(company|service)', 'Technology', 'Software', 'Automation'),
    # ==================== FINANCE & INSURANCE ====================
    # Guard: blood banks and food banks are not financial institutions.
    (r'^(?!.*\b(blood|food)\b).*\b(bank|credit union|savings)\b', 'Finance_Insurance', 'Banking', 'Banks'),
    (r'\b(atm|cash machine)\b', 'Finance_Insurance', 'Banking', 'ATMs'),
    (r'\b(insurance)\b.*(agent|agency|company|broker)', 'Finance_Insurance', 'Insurance', 'Agents'),
    (r'\b(mortgage|loan|lending|finance)\b.*(company|broker|service)', 'Finance_Insurance', 'Lending', 'Lenders'),
    (r'\b(investment|wealth|portfolio|financial advisor)\b', 'Finance_Insurance', 'Investment', 'Advisors'),
    (r'\b(money transfer|remittance|western union|moneygram)\b', 'Finance_Insurance', 'Money Services', 'Transfer'),
    (r'\b(currency exchange|forex)\b', 'Finance_Insurance', 'Money Services', 'Exchange'),
    (r'\b(bail bond)\b', 'Professional_Services', 'Legal', 'Bail'),
    # ==================== EVENTS & WEDDINGS ====================
    (r'\b(funeral|mortuary|cremation|cemetery|memorial)\b', 'Events_Weddings', 'Memorial', 'Funeral'),
    (r'\b(event|party|wedding)\b.*(planner|planning|coordinator)', 'Events_Weddings', 'Planning', 'Planners'),
    (r'\b(banquet|event|reception|wedding)\b.*(hall|venue|center)', 'Events_Weddings', 'Venues', 'Halls'),
    (r'\b(dj|disc jockey|entertainment)\b.*service', 'Events_Weddings', 'Services', 'Entertainment'),
    (r'\b(balloon|party supply|decoration)\b', 'Events_Weddings', 'Services', 'Decorations'),
    # Guard: car/truck rental is automotive, not event equipment.
    (r'^(?!.*\b(car|truck)\b).*\b(tent|equipment)\b.*rental', 'Events_Weddings', 'Rentals', 'Equipment'),
    (r'\b(photo booth|photobooth)\b', 'Events_Weddings', 'Services', 'Photography'),
    # ==================== AGRICULTURE ====================
    (r'\b(farm|ranch|orchard|vineyard)\b(?!.*winery)', 'Agriculture', 'Farming', 'Farms'),
    (r'\b(agriculture|farming|crop)\b.*(service|supply|equipment)', 'Agriculture', 'Services', 'Supplies'),
    (r'\b(livestock|cattle|poultry|dairy)\b', 'Agriculture', 'Farming', 'Livestock'),
    (r'\b(nursery|greenhouse|horticulture)\b.*(wholesale|grower)', 'Agriculture', 'Farming', 'Horticulture'),
    (r'\b(agistment|horse boarding|stable)\b', 'Agriculture', 'Services', 'Equine'),
    (r'\b(veterinarian|vet)\b.*(livestock|farm|large animal)', 'Agriculture', 'Services', 'Veterinary'),
    # ==================== TRANSPORTATION ====================
    (r'\b(shipping|freight|cargo|trucking)\b.*(company|service)', 'Transportation', 'Logistics', 'Shipping'),
    (r'\b(courier|messenger|express)\b.*(service|delivery)', 'Transportation', 'Delivery', 'Courier'),
    (r'\b(airport|airfield|airstrip|heliport)\b', 'Transportation', 'Passenger', 'Airports'),
    (r'\b(port|harbor|dock|pier|marina)\b(?!.*wine)', 'Transportation', 'Logistics', 'Ports'),
    (r'\b(parking)\b.*(lot|garage|structure)', 'Automotive', 'Parking', 'Lots'),
    (r'\b(towing|tow truck)\b', 'Transportation', 'Vehicle Services', 'Towing'),
]
def categorize_item(name):
    """
    Classify one item name against CATEGORIZATION_RULES.

    Rules are tried in list order and the first pattern that matches wins.
    Returns the matching rule's (sector, business_type, sub_category)
    tuple, or None when no rule matches.
    """
    lowered = name.lower()
    hits = (
        (sector, btype, subcat)
        for pattern, sector, btype, subcat in CATEGORIZATION_RULES
        if re.search(pattern, lowered, re.IGNORECASE)
    )
    # The generator is lazy, so evaluation stops at the first matching rule.
    return next(hits, None)
def get_existing_paths(cursor):
    """Collect every path stored in gbp_categories into a set of strings"""
    cursor.execute("SELECT path::text FROM gbp_categories")
    known_paths = set()
    for record in cursor.fetchall():
        # Each row has a single column: the path cast to text.
        known_paths.add(record[0])
    return known_paths
def _ensure_category(cursor, name, slug, path, level, parent_path, label, existing_paths):
    """
    Make sure a category row exists at `path`, inserting it under
    `parent_path` when it is not already known.

    `existing_paths` is the in-memory cache of known paths and is updated
    in place. `label` is only used in the progress messages.
    Returns True when the row is known to exist afterwards, False otherwise.
    """
    if path in existing_paths:
        return True

    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))
    if cursor.fetchone():
        existing_paths.add(path)
        print(f" [NEW] Created {label}: {path}")
        return True

    # RETURNING yields no row in two different situations: the insert was
    # skipped by ON CONFLICT (the row already exists), or the parent SELECT
    # matched nothing (no row was created). Tell them apart explicitly.
    cursor.execute(
        "SELECT 1 FROM gbp_categories WHERE path = %s::ltree",
        (path,),
    )
    if cursor.fetchone():
        # Cache the path so later items do not retry the insert.
        existing_paths.add(path)
        return True

    print(f" [SKIP] Could not create {label}: {path}")
    return False


def get_or_create_path(cursor, sector, btype, subcat, existing_paths):
    """
    Get or create the full path for a category.

    The sector (level 1) must already exist; the business type (level 2)
    and sub-category (level 3) are created on demand. `existing_paths` is
    the set of known paths and is updated in place as rows are created.

    Returns the level 3 path that the item should be placed under, or None
    when the sector is missing or a required level could not be created.
    """
    sector_slug = slugify(sector)
    btype_slug = slugify(btype)
    subcat_slug = slugify(subcat)

    # Level 1: Sector
    sector_path = sector_slug
    if sector_path not in existing_paths:
        # Don't create new sectors - return None
        print(f" [SKIP] Would need new sector: {sector_path}")
        return None

    # Level 2: Business Type
    btype_path = f"{sector_path}.{btype_slug}"
    if not _ensure_category(cursor, btype, btype_slug, btype_path, 2,
                            sector_path, "business type", existing_paths):
        return None

    # Level 3: Sub-category
    subcat_path = f"{btype_path}.{subcat_slug}"
    if not _ensure_category(cursor, subcat, subcat_slug, subcat_path, 3,
                            btype_path, "sub-category", existing_paths):
        return None

    return subcat_path
def main():
    """
    Interactively move items out of Other.Uncategorized.

    Steps:
    1. Load the level 4 items under Other.Uncategorized and all known paths.
    2. Classify each item with categorize_item and print a preview.
    3. After the user answers "yes", re-parent every classified item,
       refresh category_count and commit.

    Nothing is written unless the user confirms. The connection is closed
    on every exit path; if an error occurs before the commit, the pending
    changes are discarded when the connection closes.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized\n")

        # Get existing paths
        existing_paths = get_existing_paths(cursor)

        # Categorize items
        categorized = []
        uncategorized = []
        category_counts = defaultdict(int)
        for item_id, name, slug in other_items:
            result = categorize_item(name)
            if result:
                sector, btype, subcat = result
                categorized.append((item_id, name, slug, sector, btype, subcat))
                category_counts[(sector, btype, subcat)] += 1
            else:
                uncategorized.append((item_id, name))

        print(f"Categorized: {len(categorized)}")
        print(f"Still uncategorized: {len(uncategorized)}")
        print()

        # Show category distribution (30 largest targets)
        print("Category distribution:")
        for (sector, btype, subcat), count in sorted(category_counts.items(), key=lambda x: -x[1])[:30]:
            print(f" {sector}.{btype}.{subcat}: {count}")
        print()

        # Show some uncategorized items
        print("Sample uncategorized items (first 50):")
        for _, name in uncategorized[:50]:
            print(f" - {name}")
        print()

        # Ask for confirmation; tolerate stray whitespace around the answer
        response = input("Proceed with database updates? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborted.")
            return

        # Update database
        updated = 0
        for item_id, name, slug, sector, btype, subcat in categorized:
            parent_path = get_or_create_path(cursor, sector, btype, subcat, existing_paths)
            if parent_path:
                new_path = f"{parent_path}.{slug}"
                # Update the item
                cursor.execute("""
                    UPDATE gbp_categories
                    SET path = %s::ltree,
                        parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
                    WHERE id = %s
                """, (new_path, parent_path, item_id))
                updated += 1

        # Update category counts: category_count = number of direct children.
        # Rows that still have children are always refreshed. Rows at
        # levels 1-3 with no children left (e.g. a parent whose items were
        # all moved away) are reset to 0 instead of keeping a stale count.
        # NOTE(review): assumes levels 1-3 are pure containers - confirm.
        cursor.execute("""
            UPDATE gbp_categories g
            SET category_count = (
                SELECT COUNT(*) FROM gbp_categories c WHERE c.parent_id = g.id
            )
            WHERE g.level < 4
               OR EXISTS (SELECT 1 FROM gbp_categories c WHERE c.parent_id = g.id)
        """)
        conn.commit()
        print(f"\nUpdated {updated} items")

        # Show final stats
        cursor.execute("""
            SELECT path, name, category_count
            FROM gbp_categories
            WHERE level = 1
            ORDER BY category_count DESC
        """)
        print("\nFinal sector counts:")
        for path, name, count in cursor.fetchall():
            print(f" {name}: {count}")
    finally:
        # Always release the connection, including on errors and on abort.
        conn.close()
# Run the interactive recategorization only when this file is executed
# directly as a script, not when it is imported as a module.
if __name__ == '__main__':
    main()