978 lines
46 KiB
Python
978 lines
46 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Import Google Business Profile categories into PostgreSQL with ltree hierarchy.
|
|
|
|
Usage:
|
|
python import_categories.py [--csv-path PATH] [--db-url URL]
|
|
|
|
Example:
|
|
python import_categories.py --csv-path ./categories.csv --db-url postgresql://scraper:scraper123@localhost:5437/scraper
|
|
"""
|
|
|
|
import csv
|
|
import re
|
|
import os
|
|
import argparse
|
|
from typing import Optional
|
|
|
|
try:
|
|
import psycopg2
|
|
from psycopg2.extras import execute_values
|
|
HAS_PSYCOPG2 = True
|
|
except ImportError:
|
|
HAS_PSYCOPG2 = False
|
|
|
|
# Default paths
|
|
# CSV export of the category list, looked up in the current user's
# ~/Downloads directory ("~" is expanded once, at import time).
# Presumably the fallback for --csv-path; verify against the argument parser.
DEFAULT_CSV_PATH = os.path.expanduser(
    "~/Downloads/"
    "Google Business Profile Categories (2025 List) - Category List (English).csv"
)

# PostgreSQL connection URL for a server on localhost:5437, database "scraper".
# NOTE(review): credentials are embedded in this default - confirm they are
# development-only before this file is shared or deployed.
DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
|
|
|
|
|
def slugify(text: str) -> str:
    """Turn ``text`` into a slug intended for use as one ltree path label.

    Each run of characters other than ASCII letters and digits collapses to a
    single underscore, and underscores at either end are dropped. Letter case
    is kept as-is.

    Args:
        text: Arbitrary display text, e.g. a category name.

    Returns:
        The slug; ``'unknown'`` when nothing usable remains, or the slug
        prefixed with ``'cat_'`` when it would otherwise start with a
        non-letter.
    """
    collapsed = re.sub(r'[^a-zA-Z0-9]+', '_', text).strip('_')

    # Nothing but separators (or an empty string) was supplied.
    if not collapsed:
        return 'unknown'

    # Already letter-led: usable unchanged.
    if collapsed[0].isalpha():
        return collapsed

    # Prefix so the resulting label always begins with a letter.
    return 'cat_' + collapsed
|
|
|
|
|
|
def categorize_category(cat: str) -> tuple:
|
|
"""
|
|
Categorize a GBP category into 4-level hierarchy.
|
|
Returns: (level1, level2, level3, level4)
|
|
"""
|
|
c = cat.lower()
|
|
|
|
# === FOOD & DINING ===
|
|
if 'restaurant' in c:
|
|
if any(x in c for x in ['fast food', 'drive-in', 'takeaway', 'takeout', 'quick service']):
|
|
return ("Food & Dining", "Restaurants", "Fast Food & Quick Service", cat)
|
|
# Cuisine types
|
|
return ("Food & Dining", "Restaurants", "By Cuisine", cat)
|
|
|
|
if any(x in c for x in ['cafe', 'coffee shop', 'tea house', 'tea room', 'espresso bar']):
|
|
return ("Food & Dining", "Cafes & Coffee", "Coffee Shops", cat)
|
|
|
|
if any(x in c for x in ['bar', 'pub', 'nightclub', 'night club', 'cocktail', 'wine bar', 'beer', 'lounge']):
|
|
if 'gay' in c or 'lesbian' in c:
|
|
return ("Food & Dining", "Bars & Nightlife", "LGBTQ+ Venues", cat)
|
|
if 'karaoke' in c:
|
|
return ("Food & Dining", "Bars & Nightlife", "Karaoke", cat)
|
|
return ("Food & Dining", "Bars & Nightlife", "Bars & Pubs", cat)
|
|
|
|
if any(x in c for x in ['bakery', 'pastry', 'cake', 'donut', 'dessert', 'ice cream', 'frozen yogurt', 'candy', 'chocolate', 'confection']):
|
|
return ("Food & Dining", "Bakeries & Desserts", "Sweet Shops", cat)
|
|
|
|
if any(x in c for x in ['caterer', 'catering']):
|
|
return ("Food & Dining", "Food Services", "Catering", cat)
|
|
|
|
if any(x in c for x in ['brewery', 'winery', 'distillery', 'vineyard']):
|
|
return ("Food & Dining", "Beverage Production", "Producers", cat)
|
|
|
|
if any(x in c for x in ['food truck', 'food stand', 'food stall', 'food court']):
|
|
return ("Food & Dining", "Quick Service", "Street Food", cat)
|
|
|
|
# === RETAIL & SHOPPING ===
|
|
if 'store' in c or 'shop' in c:
|
|
if any(x in c for x in ['clothing', 'fashion', 'shoe', 'dress', 'apparel', 'wear', 'boutique', 'tailor']):
|
|
return ("Retail & Shopping", "Clothing & Fashion", "Apparel Stores", cat)
|
|
if any(x in c for x in ['electronic', 'computer', 'phone', 'appliance', 'tv', 'audio', 'video game']):
|
|
return ("Retail & Shopping", "Electronics", "Electronics Stores", cat)
|
|
if any(x in c for x in ['furniture', 'home decor', 'kitchen', 'bed', 'mattress', 'carpet', 'curtain', 'lighting']):
|
|
return ("Retail & Shopping", "Home & Garden", "Home Furnishings", cat)
|
|
if any(x in c for x in ['grocery', 'supermarket', 'food', 'beverage', 'wine', 'liquor', 'butcher', 'fish', 'fruit', 'vegetable']):
|
|
return ("Retail & Shopping", "Food & Grocery", "Grocery Stores", cat)
|
|
if any(x in c for x in ['book', 'stationery', 'office supply', 'paper']):
|
|
return ("Retail & Shopping", "Books & Office", "Book Stores", cat)
|
|
if any(x in c for x in ['pet', 'animal']):
|
|
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
|
if any(x in c for x in ['toy', 'game', 'hobby']):
|
|
return ("Retail & Shopping", "Toys & Hobbies", "Toy Stores", cat)
|
|
if any(x in c for x in ['jewelry', 'watch', 'gold', 'diamond']):
|
|
return ("Retail & Shopping", "Jewelry & Watches", "Jewelry Stores", cat)
|
|
if any(x in c for x in ['sport', 'athletic', 'fitness', 'outdoor', 'camping', 'fishing', 'hunting']):
|
|
return ("Retail & Shopping", "Sports & Outdoors", "Sporting Goods", cat)
|
|
if any(x in c for x in ['music', 'instrument', 'record', 'vinyl']):
|
|
return ("Retail & Shopping", "Music & Entertainment", "Music Stores", cat)
|
|
if any(x in c for x in ['art', 'craft', 'fabric', 'sewing', 'yarn', 'knitting']):
|
|
return ("Retail & Shopping", "Arts & Crafts", "Art Supply Stores", cat)
|
|
if any(x in c for x in ['beauty', 'cosmetic', 'perfume', 'makeup']):
|
|
return ("Retail & Shopping", "Beauty & Cosmetics", "Beauty Stores", cat)
|
|
if any(x in c for x in ['pharmacy', 'drug', 'medicine', 'health']):
|
|
return ("Retail & Shopping", "Health & Pharmacy", "Pharmacies", cat)
|
|
if any(x in c for x in ['garden', 'plant', 'flower', 'nursery', 'landscap']):
|
|
return ("Retail & Shopping", "Home & Garden", "Garden Centers", cat)
|
|
if any(x in c for x in ['hardware', 'tool', 'building', 'lumber', 'paint']):
|
|
return ("Retail & Shopping", "Hardware & Building", "Hardware Stores", cat)
|
|
if any(x in c for x in ['antique', 'vintage', 'thrift', 'consignment', 'second hand', 'used']):
|
|
return ("Retail & Shopping", "Secondhand & Vintage", "Thrift Stores", cat)
|
|
return ("Retail & Shopping", "Specialty Retail", "Other Stores", cat)
|
|
|
|
if any(x in c for x in ['supplier', 'wholesaler', 'distributor', 'exporter', 'importer']):
|
|
if any(x in c for x in ['food', 'beverage', 'meat', 'seafood', 'produce']):
|
|
return ("Retail & Shopping", "Wholesale & Distribution", "Food Wholesale", cat)
|
|
if any(x in c for x in ['building', 'construction', 'lumber', 'concrete', 'steel']):
|
|
return ("Retail & Shopping", "Wholesale & Distribution", "Building Materials", cat)
|
|
if any(x in c for x in ['industrial', 'machinery', 'equipment']):
|
|
return ("Retail & Shopping", "Wholesale & Distribution", "Industrial Supplies", cat)
|
|
return ("Retail & Shopping", "Wholesale & Distribution", "General Wholesale", cat)
|
|
|
|
if 'market' in c and 'marketing' not in c:
|
|
if 'flea' in c or 'antique' in c:
|
|
return ("Retail & Shopping", "Markets", "Flea Markets", cat)
|
|
if 'farmer' in c:
|
|
return ("Retail & Shopping", "Markets", "Farmers Markets", cat)
|
|
return ("Retail & Shopping", "Markets", "General Markets", cat)
|
|
|
|
# === AUTOMOTIVE ===
|
|
if 'dealer' in c:
|
|
car_brands = ['abarth', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
|
|
'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'cupra', 'dacia', 'daihatsu',
|
|
'dodge', 'ferrari', 'fiat', 'ford', 'genesis', 'gmc', 'honda', 'hummer', 'hyundai',
|
|
'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lancia', 'land rover',
|
|
'lexus', 'lincoln', 'lotus', 'maserati', 'mazda', 'mclaren', 'mercedes', 'mini',
|
|
'mitsubishi', 'nissan', 'opel', 'peugeot', 'porsche', 'ram', 'renault', 'rolls-royce',
|
|
'saab', 'seat', 'skoda', 'smart', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen',
|
|
'volvo', 'yamaha', 'harley', 'ducati', 'kawasaki', 'triumph', 'vespa', 'piaggio']
|
|
if any(b in c for b in car_brands):
|
|
if 'motorcycle' in c or any(x in c for x in ['harley', 'ducati', 'kawasaki', 'triumph', 'vespa']):
|
|
return ("Automotive", "Dealers", "Motorcycle Brands", cat)
|
|
return ("Automotive", "Dealers", "Car Brands", cat)
|
|
if any(x in c for x in ['motorcycle', 'scooter', 'moped']):
|
|
return ("Automotive", "Dealers", "Motorcycle Dealers", cat)
|
|
if any(x in c for x in ['truck', 'commercial vehicle', 'trailer']):
|
|
return ("Automotive", "Dealers", "Truck & Commercial", cat)
|
|
if any(x in c for x in ['boat', 'yacht', 'marine', 'jet ski']):
|
|
return ("Automotive", "Dealers", "Marine & Boats", cat)
|
|
if any(x in c for x in ['rv', 'camper', 'motorhome', 'caravan']):
|
|
return ("Automotive", "Dealers", "RV & Campers", cat)
|
|
if any(x in c for x in ['atv', 'quad', 'off-road', 'utv']):
|
|
return ("Automotive", "Dealers", "ATV & Off-Road", cat)
|
|
if 'used' in c or 'pre-owned' in c:
|
|
return ("Automotive", "Dealers", "Used Vehicles", cat)
|
|
return ("Automotive", "Dealers", "Other Dealers", cat)
|
|
|
|
if any(x in c for x in ['car wash', 'auto detailing', 'car detailing']):
|
|
return ("Automotive", "Vehicle Care", "Cleaning & Detailing", cat)
|
|
|
|
if any(x in c for x in ['car rental', 'auto rental', 'vehicle rental', 'truck rental']):
|
|
return ("Automotive", "Rental Services", "Vehicle Rental", cat)
|
|
|
|
if any(x in c for x in ['car repair', 'auto repair', 'mechanic', 'garage', 'auto body', 'collision']):
|
|
return ("Automotive", "Repair & Maintenance", "Auto Repair", cat)
|
|
|
|
if any(x in c for x in ['tire', 'tyre', 'wheel']):
|
|
return ("Automotive", "Parts & Accessories", "Tires & Wheels", cat)
|
|
|
|
if any(x in c for x in ['auto part', 'car part', 'auto accessories']):
|
|
return ("Automotive", "Parts & Accessories", "Auto Parts", cat)
|
|
|
|
if any(x in c for x in ['driving school', 'driving instruction']):
|
|
return ("Automotive", "Training", "Driving Schools", cat)
|
|
|
|
if any(x in c for x in ['parking', 'car park', 'garage']):
|
|
if 'repair' not in c and 'mechanic' not in c:
|
|
return ("Automotive", "Parking", "Parking Facilities", cat)
|
|
|
|
if any(x in c for x in ['gas station', 'petrol', 'fuel', 'charging station', 'ev charging']):
|
|
return ("Automotive", "Fuel & Charging", "Fuel Stations", cat)
|
|
|
|
# === HEALTHCARE ===
|
|
if any(x in c for x in ['hospital']):
|
|
if 'animal' in c or 'veterinar' in c:
|
|
return ("Healthcare", "Veterinary", "Animal Hospitals", cat)
|
|
if 'children' in c or 'pediatric' in c:
|
|
return ("Healthcare", "Hospitals", "Pediatric Hospitals", cat)
|
|
if 'mental' in c or 'psychiatric' in c:
|
|
return ("Healthcare", "Mental Health", "Psychiatric Hospitals", cat)
|
|
return ("Healthcare", "Hospitals", "General Hospitals", cat)
|
|
|
|
if any(x in c for x in ['clinic']):
|
|
if 'dental' in c:
|
|
return ("Healthcare", "Dental", "Dental Clinics", cat)
|
|
if 'eye' in c or 'vision' in c or 'optical' in c:
|
|
return ("Healthcare", "Vision Care", "Eye Clinics", cat)
|
|
if 'fertility' in c or 'ivf' in c:
|
|
return ("Healthcare", "Specialty Care", "Fertility Clinics", cat)
|
|
if 'skin' in c or 'dermatol' in c:
|
|
return ("Healthcare", "Specialty Care", "Dermatology", cat)
|
|
if 'physical therapy' in c or 'physiotherapy' in c or 'rehab' in c:
|
|
return ("Healthcare", "Rehabilitation", "Physical Therapy", cat)
|
|
return ("Healthcare", "Clinics", "Medical Clinics", cat)
|
|
|
|
if any(x in c for x in ['doctor', 'physician']):
|
|
return ("Healthcare", "Medical Practitioners", "Doctors", cat)
|
|
|
|
if any(x in c for x in ['dentist', 'dental', 'orthodont', 'endodont', 'periodont']):
|
|
return ("Healthcare", "Dental", "Dental Services", cat)
|
|
|
|
if any(x in c for x in ['surgeon', 'surgery']):
|
|
if 'plastic' in c or 'cosmetic' in c:
|
|
return ("Healthcare", "Specialty Care", "Cosmetic Surgery", cat)
|
|
return ("Healthcare", "Medical Practitioners", "Surgeons", cat)
|
|
|
|
if any(x in c for x in ['psycholog', 'psychiatr', 'mental health', 'counselor', 'therapist']):
|
|
if 'marriage' in c or 'family' in c:
|
|
return ("Healthcare", "Mental Health", "Family Counseling", cat)
|
|
if 'addiction' in c or 'substance' in c:
|
|
return ("Healthcare", "Mental Health", "Addiction Treatment", cat)
|
|
return ("Healthcare", "Mental Health", "Mental Health Services", cat)
|
|
|
|
if any(x in c for x in ['chiropract']):
|
|
return ("Healthcare", "Alternative Medicine", "Chiropractic", cat)
|
|
|
|
if any(x in c for x in ['acupuncture', 'acupuncturist']):
|
|
return ("Healthcare", "Alternative Medicine", "Acupuncture", cat)
|
|
|
|
if any(x in c for x in ['naturopath', 'homeopath', 'ayurved', 'holistic']):
|
|
return ("Healthcare", "Alternative Medicine", "Natural Medicine", cat)
|
|
|
|
if any(x in c for x in ['optometrist', 'optician', 'eye doctor', 'ophthalmol']):
|
|
return ("Healthcare", "Vision Care", "Eye Care", cat)
|
|
|
|
if any(x in c for x in ['pharmacy', 'drugstore', 'apothecary']):
|
|
return ("Healthcare", "Pharmacies", "Retail Pharmacies", cat)
|
|
|
|
if any(x in c for x in ['veterinar', 'vet ', 'animal clinic', 'pet clinic']):
|
|
return ("Healthcare", "Veterinary", "Veterinary Services", cat)
|
|
|
|
if any(x in c for x in ['nursing home', 'assisted living', 'senior care', 'elder care', 'retirement home']):
|
|
return ("Healthcare", "Senior Care", "Senior Living", cat)
|
|
|
|
if any(x in c for x in ['lab', 'laboratory', 'diagnostic', 'imaging', 'x-ray', 'mri', 'radiology']):
|
|
return ("Healthcare", "Diagnostics", "Medical Labs", cat)
|
|
|
|
if any(x in c for x in ['ambulance', 'emergency', 'urgent care']):
|
|
return ("Healthcare", "Emergency Services", "Emergency Care", cat)
|
|
|
|
# === EDUCATION ===
|
|
if 'school' in c or 'academy' in c:
|
|
if any(x in c for x in ['preschool', 'kindergarten', 'nursery', 'daycare', 'pre-school']):
|
|
return ("Education", "Early Childhood", "Preschools", cat)
|
|
if any(x in c for x in ['elementary', 'primary']):
|
|
return ("Education", "K-12 Schools", "Elementary Schools", cat)
|
|
if any(x in c for x in ['middle', 'junior high']):
|
|
return ("Education", "K-12 Schools", "Middle Schools", cat)
|
|
if any(x in c for x in ['high school', 'secondary']):
|
|
return ("Education", "K-12 Schools", "High Schools", cat)
|
|
if any(x in c for x in ['boarding']):
|
|
return ("Education", "K-12 Schools", "Boarding Schools", cat)
|
|
if any(x in c for x in ['driving']):
|
|
return ("Automotive", "Training", "Driving Schools", cat)
|
|
if any(x in c for x in ['language', 'english', 'spanish', 'french', 'german', 'chinese', 'japanese']):
|
|
return ("Education", "Language Learning", "Language Schools", cat)
|
|
if any(x in c for x in ['art', 'music', 'dance', 'drama', 'theater', 'acting']):
|
|
return ("Education", "Arts Education", "Arts Schools", cat)
|
|
if any(x in c for x in ['martial art', 'karate', 'judo', 'taekwondo', 'kung fu', 'aikido', 'boxing']):
|
|
return ("Education", "Sports Training", "Martial Arts Schools", cat)
|
|
if any(x in c for x in ['beauty', 'cosmetology', 'barber']):
|
|
return ("Education", "Vocational Training", "Beauty Schools", cat)
|
|
if any(x in c for x in ['cooking', 'culinary', 'chef']):
|
|
return ("Education", "Vocational Training", "Culinary Schools", cat)
|
|
if any(x in c for x in ['business', 'mba']):
|
|
return ("Education", "Higher Education", "Business Schools", cat)
|
|
if any(x in c for x in ['medical', 'nursing', 'dental']):
|
|
return ("Education", "Higher Education", "Medical Schools", cat)
|
|
if any(x in c for x in ['law']):
|
|
return ("Education", "Higher Education", "Law Schools", cat)
|
|
if any(x in c for x in ['flight', 'aviation', 'pilot']):
|
|
return ("Education", "Vocational Training", "Aviation Schools", cat)
|
|
if any(x in c for x in ['computer', 'it ', 'coding', 'programming', 'software']):
|
|
return ("Education", "Technology Training", "Computer Schools", cat)
|
|
if any(x in c for x in ['trade', 'technical', 'vocational']):
|
|
return ("Education", "Vocational Training", "Trade Schools", cat)
|
|
return ("Education", "Specialty Schools", "Other Schools", cat)
|
|
|
|
if any(x in c for x in ['university', 'college']):
|
|
if 'community' in c:
|
|
return ("Education", "Higher Education", "Community Colleges", cat)
|
|
return ("Education", "Higher Education", "Universities", cat)
|
|
|
|
if any(x in c for x in ['tutor', 'tutoring']):
|
|
return ("Education", "Tutoring", "Private Tutoring", cat)
|
|
|
|
if any(x in c for x in ['training center', 'training program', 'training institute']):
|
|
return ("Education", "Professional Training", "Training Centers", cat)
|
|
|
|
if any(x in c for x in ['library']):
|
|
return ("Education", "Libraries", "Public Libraries", cat)
|
|
|
|
# === PROFESSIONAL SERVICES ===
|
|
if any(x in c for x in ['lawyer', 'attorney', 'law firm', 'legal']):
|
|
if any(x in c for x in ['immigration']):
|
|
return ("Professional Services", "Legal", "Immigration Law", cat)
|
|
if any(x in c for x in ['criminal', 'defense']):
|
|
return ("Professional Services", "Legal", "Criminal Law", cat)
|
|
if any(x in c for x in ['family', 'divorce']):
|
|
return ("Professional Services", "Legal", "Family Law", cat)
|
|
if any(x in c for x in ['personal injury', 'accident']):
|
|
return ("Professional Services", "Legal", "Personal Injury", cat)
|
|
if any(x in c for x in ['real estate', 'property']):
|
|
return ("Professional Services", "Legal", "Real Estate Law", cat)
|
|
if any(x in c for x in ['business', 'corporate', 'commercial']):
|
|
return ("Professional Services", "Legal", "Business Law", cat)
|
|
return ("Professional Services", "Legal", "General Legal", cat)
|
|
|
|
if any(x in c for x in ['accountant', 'accounting', 'bookkeep', 'tax']):
|
|
return ("Professional Services", "Financial Services", "Accounting", cat)
|
|
|
|
if any(x in c for x in ['consultant', 'consulting', 'advisor']):
|
|
if any(x in c for x in ['business', 'management']):
|
|
return ("Professional Services", "Consulting", "Business Consulting", cat)
|
|
if any(x in c for x in ['it ', 'technology', 'computer']):
|
|
return ("Professional Services", "Consulting", "IT Consulting", cat)
|
|
if any(x in c for x in ['marketing', 'advertising']):
|
|
return ("Professional Services", "Consulting", "Marketing Consulting", cat)
|
|
return ("Professional Services", "Consulting", "General Consulting", cat)
|
|
|
|
if any(x in c for x in ['notary', 'notarial']):
|
|
return ("Professional Services", "Legal", "Notary Services", cat)
|
|
|
|
if any(x in c for x in ['architect', 'architecture']):
|
|
return ("Professional Services", "Design", "Architecture", cat)
|
|
|
|
if any(x in c for x in ['engineer', 'engineering']):
|
|
if 'civil' in c:
|
|
return ("Professional Services", "Engineering", "Civil Engineering", cat)
|
|
if 'structural' in c:
|
|
return ("Professional Services", "Engineering", "Structural Engineering", cat)
|
|
if 'mechanical' in c:
|
|
return ("Professional Services", "Engineering", "Mechanical Engineering", cat)
|
|
if 'electrical' in c:
|
|
return ("Professional Services", "Engineering", "Electrical Engineering", cat)
|
|
return ("Professional Services", "Engineering", "General Engineering", cat)
|
|
|
|
if any(x in c for x in ['agency']):
|
|
if any(x in c for x in ['advertising', 'marketing', 'creative', 'digital']):
|
|
return ("Professional Services", "Marketing & Advertising", "Agencies", cat)
|
|
if any(x in c for x in ['real estate', 'property']):
|
|
return ("Real Estate", "Agencies", "Real Estate Agencies", cat)
|
|
if any(x in c for x in ['insurance']):
|
|
return ("Finance & Insurance", "Insurance", "Insurance Agencies", cat)
|
|
if any(x in c for x in ['travel', 'tour']):
|
|
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
|
if any(x in c for x in ['employment', 'staffing', 'recruitment', 'temp']):
|
|
return ("Professional Services", "HR Services", "Staffing Agencies", cat)
|
|
return ("Professional Services", "Agencies", "Other Agencies", cat)
|
|
|
|
if any(x in c for x in ['photographer', 'photography', 'photo studio']):
|
|
return ("Professional Services", "Creative Services", "Photography", cat)
|
|
|
|
if any(x in c for x in ['graphic design', 'web design', 'design studio']):
|
|
return ("Professional Services", "Creative Services", "Design Services", cat)
|
|
|
|
if any(x in c for x in ['translator', 'translation', 'interpreter']):
|
|
return ("Professional Services", "Language Services", "Translation", cat)
|
|
|
|
if any(x in c for x in ['printing', 'print shop', 'copy']):
|
|
return ("Professional Services", "Business Services", "Printing Services", cat)
|
|
|
|
# === HOME SERVICES ===
|
|
if any(x in c for x in ['plumber', 'plumbing']):
|
|
return ("Home Services", "Plumbing", "Plumbers", cat)
|
|
|
|
if any(x in c for x in ['electrician', 'electrical']):
|
|
if 'contractor' in c or 'service' in c:
|
|
return ("Home Services", "Electrical", "Electricians", cat)
|
|
|
|
if any(x in c for x in ['hvac', 'air conditioning', 'heating', 'furnace']):
|
|
return ("Home Services", "HVAC", "Heating & Cooling", cat)
|
|
|
|
if any(x in c for x in ['roofing', 'roofer']):
|
|
return ("Home Services", "Roofing", "Roofing Services", cat)
|
|
|
|
if any(x in c for x in ['painter', 'painting']):
|
|
if 'house' in c or 'residential' in c or 'contractor' in c:
|
|
return ("Home Services", "Painting", "House Painters", cat)
|
|
|
|
if any(x in c for x in ['landscap', 'lawn', 'garden']):
|
|
if 'service' in c or 'company' in c or 'contractor' in c:
|
|
return ("Home Services", "Landscaping", "Landscaping Services", cat)
|
|
|
|
if any(x in c for x in ['cleaning service', 'maid', 'housekeep', 'janitorial']):
|
|
return ("Home Services", "Cleaning", "Cleaning Services", cat)
|
|
|
|
if any(x in c for x in ['pest control', 'exterminator']):
|
|
return ("Home Services", "Pest Control", "Exterminators", cat)
|
|
|
|
if any(x in c for x in ['locksmith']):
|
|
return ("Home Services", "Security", "Locksmiths", cat)
|
|
|
|
if any(x in c for x in ['moving company', 'mover', 'relocation']):
|
|
return ("Home Services", "Moving", "Moving Services", cat)
|
|
|
|
if any(x in c for x in ['contractor']):
|
|
if 'general' in c:
|
|
return ("Home Services", "Construction", "General Contractors", cat)
|
|
return ("Home Services", "Construction", "Contractors", cat)
|
|
|
|
if any(x in c for x in ['carpenter', 'carpentry']):
|
|
return ("Home Services", "Construction", "Carpenters", cat)
|
|
|
|
if any(x in c for x in ['flooring', 'floor']):
|
|
if 'service' in c or 'contractor' in c or 'installation' in c:
|
|
return ("Home Services", "Flooring", "Floor Installation", cat)
|
|
|
|
if any(x in c for x in ['window', 'glass']):
|
|
if 'repair' in c or 'installation' in c or 'service' in c:
|
|
return ("Home Services", "Windows & Doors", "Window Services", cat)
|
|
|
|
if any(x in c for x in ['pool', 'spa']):
|
|
if 'service' in c or 'cleaning' in c or 'maintenance' in c:
|
|
return ("Home Services", "Pool & Spa", "Pool Services", cat)
|
|
|
|
if any(x in c for x in ['appliance repair', 'appliance service']):
|
|
return ("Home Services", "Appliance Repair", "Appliance Services", cat)
|
|
|
|
if any(x in c for x in ['handyman']):
|
|
return ("Home Services", "General Repair", "Handyman Services", cat)
|
|
|
|
if any(x in c for x in ['interior design', 'decorator']):
|
|
return ("Home Services", "Design", "Interior Design", cat)
|
|
|
|
# === PERSONAL SERVICES ===
|
|
if any(x in c for x in ['salon', 'hair', 'hairdress', 'stylist']):
|
|
return ("Personal Services", "Hair Care", "Hair Salons", cat)
|
|
|
|
if any(x in c for x in ['barber']):
|
|
if 'shop' in c or not 'school' in c:
|
|
return ("Personal Services", "Hair Care", "Barber Shops", cat)
|
|
|
|
if any(x in c for x in ['nail', 'manicure', 'pedicure']):
|
|
return ("Personal Services", "Nail Care", "Nail Salons", cat)
|
|
|
|
if any(x in c for x in ['spa']):
|
|
if 'day spa' in c or 'medical spa' in c or ('service' not in c and 'pool' not in c):
|
|
return ("Personal Services", "Spa & Wellness", "Day Spas", cat)
|
|
|
|
if any(x in c for x in ['massage']):
|
|
return ("Personal Services", "Massage", "Massage Therapy", cat)
|
|
|
|
if any(x in c for x in ['beauty']):
|
|
if 'salon' in c or 'parlor' in c:
|
|
return ("Personal Services", "Beauty", "Beauty Salons", cat)
|
|
|
|
if any(x in c for x in ['tattoo']):
|
|
return ("Personal Services", "Body Art", "Tattoo Shops", cat)
|
|
|
|
if any(x in c for x in ['piercing']):
|
|
return ("Personal Services", "Body Art", "Piercing Studios", cat)
|
|
|
|
if any(x in c for x in ['tanning']):
|
|
return ("Personal Services", "Tanning", "Tanning Salons", cat)
|
|
|
|
if any(x in c for x in ['tailor', 'alteration', 'seamstress']):
|
|
return ("Personal Services", "Clothing Care", "Tailoring", cat)
|
|
|
|
if any(x in c for x in ['dry clean', 'laundry', 'laundromat']):
|
|
return ("Personal Services", "Laundry", "Laundry Services", cat)
|
|
|
|
if any(x in c for x in ['personal trainer', 'fitness trainer']):
|
|
return ("Personal Services", "Fitness", "Personal Training", cat)
|
|
|
|
# === ENTERTAINMENT & RECREATION ===
|
|
if any(x in c for x in ['movie theater', 'cinema', 'multiplex']):
|
|
return ("Entertainment", "Movies", "Movie Theaters", cat)
|
|
|
|
if any(x in c for x in ['theater', 'theatre']):
|
|
if 'movie' not in c:
|
|
return ("Entertainment", "Performing Arts", "Theaters", cat)
|
|
|
|
if any(x in c for x in ['museum']):
|
|
if 'art' in c:
|
|
return ("Entertainment", "Museums", "Art Museums", cat)
|
|
if 'history' in c or 'historical' in c:
|
|
return ("Entertainment", "Museums", "History Museums", cat)
|
|
if 'science' in c or 'natural' in c:
|
|
return ("Entertainment", "Museums", "Science Museums", cat)
|
|
if 'children' in c or 'kid' in c:
|
|
return ("Entertainment", "Museums", "Children's Museums", cat)
|
|
return ("Entertainment", "Museums", "General Museums", cat)
|
|
|
|
if any(x in c for x in ['art gallery', 'gallery']):
|
|
return ("Entertainment", "Arts", "Art Galleries", cat)
|
|
|
|
if any(x in c for x in ['amusement park', 'theme park', 'water park']):
|
|
return ("Entertainment", "Amusement", "Theme Parks", cat)
|
|
|
|
if any(x in c for x in ['zoo', 'aquarium', 'wildlife']):
|
|
return ("Entertainment", "Wildlife", "Zoos & Aquariums", cat)
|
|
|
|
if any(x in c for x in ['bowling']):
|
|
return ("Entertainment", "Games & Recreation", "Bowling", cat)
|
|
|
|
if any(x in c for x in ['arcade', 'video game']):
|
|
return ("Entertainment", "Games & Recreation", "Arcades", cat)
|
|
|
|
if any(x in c for x in ['escape room']):
|
|
return ("Entertainment", "Games & Recreation", "Escape Rooms", cat)
|
|
|
|
if any(x in c for x in ['casino', 'gambling']):
|
|
return ("Entertainment", "Gambling", "Casinos", cat)
|
|
|
|
if any(x in c for x in ['concert', 'music venue', 'live music']):
|
|
return ("Entertainment", "Music Venues", "Concert Halls", cat)
|
|
|
|
if any(x in c for x in ['gym', 'fitness center', 'health club']):
|
|
return ("Entertainment", "Fitness", "Gyms", cat)
|
|
|
|
if any(x in c for x in ['yoga']):
|
|
if 'studio' in c or 'center' in c:
|
|
return ("Entertainment", "Fitness", "Yoga Studios", cat)
|
|
|
|
if any(x in c for x in ['pilates']):
|
|
return ("Entertainment", "Fitness", "Pilates Studios", cat)
|
|
|
|
if any(x in c for x in ['swimming pool', 'swim']):
|
|
return ("Entertainment", "Sports", "Swimming Pools", cat)
|
|
|
|
if any(x in c for x in ['golf']):
|
|
if 'course' in c or 'club' in c:
|
|
return ("Entertainment", "Sports", "Golf Courses", cat)
|
|
|
|
if any(x in c for x in ['tennis']):
|
|
return ("Entertainment", "Sports", "Tennis Courts", cat)
|
|
|
|
if any(x in c for x in ['stadium', 'arena', 'sports complex']):
|
|
return ("Entertainment", "Venues", "Sports Venues", cat)
|
|
|
|
if any(x in c for x in ['park']):
|
|
if 'amusement' not in c and 'theme' not in c:
|
|
if 'national' in c or 'state' in c:
|
|
return ("Entertainment", "Parks", "National Parks", cat)
|
|
if 'dog' in c:
|
|
return ("Entertainment", "Parks", "Dog Parks", cat)
|
|
return ("Entertainment", "Parks", "Public Parks", cat)
|
|
|
|
if any(x in c for x in ['recreation center', 'community center']):
|
|
return ("Entertainment", "Recreation", "Community Centers", cat)
|
|
|
|
if any(x in c for x in ['club']):
|
|
if 'night' in c:
|
|
return ("Food & Dining", "Bars & Nightlife", "Night Clubs", cat)
|
|
if 'country' in c:
|
|
return ("Entertainment", "Sports", "Country Clubs", cat)
|
|
if 'sport' in c or 'athletic' in c:
|
|
return ("Entertainment", "Sports", "Sports Clubs", cat)
|
|
if 'social' in c:
|
|
return ("Entertainment", "Social", "Social Clubs", cat)
|
|
|
|
# === HOSPITALITY & TRAVEL ===
|
|
if any(x in c for x in ['hotel', 'motel', 'inn']):
|
|
if 'boutique' in c:
|
|
return ("Hospitality & Travel", "Lodging", "Boutique Hotels", cat)
|
|
if 'resort' in c:
|
|
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
|
if 'budget' in c or 'economy' in c:
|
|
return ("Hospitality & Travel", "Lodging", "Budget Hotels", cat)
|
|
return ("Hospitality & Travel", "Lodging", "Hotels", cat)
|
|
|
|
if any(x in c for x in ['hostel']):
|
|
return ("Hospitality & Travel", "Lodging", "Hostels", cat)
|
|
|
|
if any(x in c for x in ['bed and breakfast', 'b&b', 'bnb']):
|
|
return ("Hospitality & Travel", "Lodging", "B&Bs", cat)
|
|
|
|
if any(x in c for x in ['resort']):
|
|
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
|
|
|
if any(x in c for x in ['vacation rental', 'holiday rental']):
|
|
return ("Hospitality & Travel", "Lodging", "Vacation Rentals", cat)
|
|
|
|
if any(x in c for x in ['campground', 'camping', 'rv park']):
|
|
return ("Hospitality & Travel", "Lodging", "Campgrounds", cat)
|
|
|
|
if any(x in c for x in ['travel agency', 'tour operator', 'travel agent']):
|
|
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
|
|
|
if any(x in c for x in ['airline', 'airport']):
|
|
return ("Hospitality & Travel", "Transportation", "Airlines & Airports", cat)
|
|
|
|
if any(x in c for x in ['cruise']):
|
|
return ("Hospitality & Travel", "Travel Services", "Cruises", cat)
|
|
|
|
if any(x in c for x in ['tourist', 'attraction', 'sightseeing']):
|
|
return ("Hospitality & Travel", "Attractions", "Tourist Attractions", cat)
|
|
|
|
# === FINANCE & INSURANCE ===
|
|
if any(x in c for x in ['bank', 'banking', 'credit union']):
|
|
return ("Finance & Insurance", "Banking", "Banks", cat)
|
|
|
|
if any(x in c for x in ['atm', 'cash machine']):
|
|
return ("Finance & Insurance", "Banking", "ATMs", cat)
|
|
|
|
if any(x in c for x in ['insurance']):
|
|
if 'health' in c or 'medical' in c:
|
|
return ("Finance & Insurance", "Insurance", "Health Insurance", cat)
|
|
if 'auto' in c or 'car' in c:
|
|
return ("Finance & Insurance", "Insurance", "Auto Insurance", cat)
|
|
if 'home' in c or 'property' in c:
|
|
return ("Finance & Insurance", "Insurance", "Home Insurance", cat)
|
|
if 'life' in c:
|
|
return ("Finance & Insurance", "Insurance", "Life Insurance", cat)
|
|
return ("Finance & Insurance", "Insurance", "Insurance Services", cat)
|
|
|
|
if any(x in c for x in ['loan', 'mortgage', 'lending']):
|
|
return ("Finance & Insurance", "Lending", "Loans", cat)
|
|
|
|
if any(x in c for x in ['investment', 'financial advisor', 'wealth management', 'financial planner']):
|
|
return ("Finance & Insurance", "Investment", "Financial Services", cat)
|
|
|
|
if any(x in c for x in ['currency exchange', 'money transfer', 'wire transfer']):
|
|
return ("Finance & Insurance", "Money Services", "Currency Services", cat)
|
|
|
|
if any(x in c for x in ['pawn']):
|
|
return ("Finance & Insurance", "Money Services", "Pawn Shops", cat)
|
|
|
|
# === REAL ESTATE ===
|
|
if any(x in c for x in ['real estate', 'property', 'realty', 'realtor']):
|
|
if 'agent' in c or 'agency' in c or 'broker' in c:
|
|
return ("Real Estate", "Agencies", "Real Estate Agents", cat)
|
|
if 'developer' in c or 'development' in c:
|
|
return ("Real Estate", "Development", "Developers", cat)
|
|
if 'management' in c:
|
|
return ("Real Estate", "Management", "Property Management", cat)
|
|
if 'commercial' in c:
|
|
return ("Real Estate", "Commercial", "Commercial Real Estate", cat)
|
|
return ("Real Estate", "Services", "Real Estate Services", cat)
|
|
|
|
if any(x in c for x in ['apartment', 'condo', 'rental']):
|
|
if 'complex' in c or 'building' in c:
|
|
return ("Real Estate", "Residential", "Apartment Complexes", cat)
|
|
|
|
if any(x in c for x in ['storage', 'self storage', 'warehouse']):
|
|
if 'self' in c or 'mini' in c:
|
|
return ("Real Estate", "Storage", "Self Storage", cat)
|
|
|
|
# === RELIGIOUS ===
|
|
if any(x in c for x in ['church']):
|
|
if 'catholic' in c:
|
|
return ("Religious", "Christian", "Catholic Churches", cat)
|
|
if 'baptist' in c:
|
|
return ("Religious", "Christian", "Baptist Churches", cat)
|
|
if 'methodist' in c:
|
|
return ("Religious", "Christian", "Methodist Churches", cat)
|
|
if 'lutheran' in c:
|
|
return ("Religious", "Christian", "Lutheran Churches", cat)
|
|
if 'orthodox' in c:
|
|
return ("Religious", "Christian", "Orthodox Churches", cat)
|
|
if 'pentecostal' in c:
|
|
return ("Religious", "Christian", "Pentecostal Churches", cat)
|
|
return ("Religious", "Christian", "Churches", cat)
|
|
|
|
if any(x in c for x in ['mosque', 'islamic', 'muslim']):
|
|
return ("Religious", "Islam", "Mosques", cat)
|
|
|
|
if any(x in c for x in ['synagogue', 'jewish', 'temple']):
|
|
if 'jewish' in c or 'synagogue' in c:
|
|
return ("Religious", "Judaism", "Synagogues", cat)
|
|
if 'hindu' in c:
|
|
return ("Religious", "Hinduism", "Hindu Temples", cat)
|
|
if 'buddhist' in c:
|
|
return ("Religious", "Buddhism", "Buddhist Temples", cat)
|
|
return ("Religious", "Other", "Temples", cat)
|
|
|
|
if any(x in c for x in ['abbey', 'monastery', 'convent']):
|
|
return ("Religious", "Christian", "Monasteries", cat)
|
|
|
|
if any(x in c for x in ['gurdwara', 'sikh']):
|
|
return ("Religious", "Sikhism", "Gurdwaras", cat)
|
|
|
|
# === GOVERNMENT & PUBLIC SERVICES ===
|
|
if any(x in c for x in ['government', 'city hall', 'town hall', 'municipal']):
|
|
return ("Government", "Local Government", "Government Offices", cat)
|
|
|
|
if any(x in c for x in ['court', 'courthouse']):
|
|
return ("Government", "Legal", "Courts", cat)
|
|
|
|
if any(x in c for x in ['police', 'sheriff']):
|
|
return ("Government", "Public Safety", "Police", cat)
|
|
|
|
if any(x in c for x in ['fire station', 'fire department']):
|
|
return ("Government", "Public Safety", "Fire Departments", cat)
|
|
|
|
if any(x in c for x in ['post office', 'postal']):
|
|
return ("Government", "Postal", "Post Offices", cat)
|
|
|
|
if any(x in c for x in ['embassy', 'consulate']):
|
|
return ("Government", "International", "Embassies", cat)
|
|
|
|
if any(x in c for x in ['dmv', 'motor vehicle', 'driver license']):
|
|
return ("Government", "Transportation", "DMV", cat)
|
|
|
|
if any(x in c for x in ['social security', 'welfare', 'social services']):
|
|
return ("Government", "Social Services", "Social Services", cat)
|
|
|
|
# === INDUSTRIAL & MANUFACTURING ===
|
|
if any(x in c for x in ['manufacturer', 'manufacturing', 'factory', 'plant']):
|
|
if any(x in c for x in ['food', 'beverage', 'bakery']):
|
|
return ("Industrial", "Manufacturing", "Food Manufacturing", cat)
|
|
if any(x in c for x in ['textile', 'clothing', 'garment']):
|
|
return ("Industrial", "Manufacturing", "Textile Manufacturing", cat)
|
|
if any(x in c for x in ['electronics', 'computer', 'semiconductor']):
|
|
return ("Industrial", "Manufacturing", "Electronics Manufacturing", cat)
|
|
if any(x in c for x in ['auto', 'car', 'vehicle']):
|
|
return ("Industrial", "Manufacturing", "Auto Manufacturing", cat)
|
|
if any(x in c for x in ['chemical', 'pharmaceutical']):
|
|
return ("Industrial", "Manufacturing", "Chemical Manufacturing", cat)
|
|
if any(x in c for x in ['metal', 'steel', 'iron']):
|
|
return ("Industrial", "Manufacturing", "Metal Manufacturing", cat)
|
|
if any(x in c for x in ['plastic', 'rubber']):
|
|
return ("Industrial", "Manufacturing", "Plastics Manufacturing", cat)
|
|
if any(x in c for x in ['furniture', 'wood']):
|
|
return ("Industrial", "Manufacturing", "Furniture Manufacturing", cat)
|
|
return ("Industrial", "Manufacturing", "General Manufacturing", cat)
|
|
|
|
if any(x in c for x in ['mining', 'quarry']):
|
|
return ("Industrial", "Mining", "Mining Operations", cat)
|
|
|
|
if any(x in c for x in ['construction company', 'builder']):
|
|
return ("Industrial", "Construction", "Construction Companies", cat)
|
|
|
|
# === TECHNOLOGY ===
|
|
if any(x in c for x in ['software', 'app developer', 'web developer']):
|
|
return ("Technology", "Software", "Software Development", cat)
|
|
|
|
if any(x in c for x in ['it service', 'computer service', 'tech support']):
|
|
return ("Technology", "IT Services", "IT Support", cat)
|
|
|
|
if any(x in c for x in ['data center', 'hosting', 'cloud']):
|
|
return ("Technology", "Infrastructure", "Data Services", cat)
|
|
|
|
if any(x in c for x in ['telecommunication', 'telecom', 'internet service']):
|
|
return ("Technology", "Telecommunications", "Telecom Services", cat)
|
|
|
|
# === TRANSPORTATION & LOGISTICS ===
|
|
if any(x in c for x in ['shipping', 'freight', 'cargo', 'logistics']):
|
|
return ("Transportation", "Logistics", "Shipping & Freight", cat)
|
|
|
|
if any(x in c for x in ['courier', 'delivery', 'express']):
|
|
return ("Transportation", "Delivery", "Courier Services", cat)
|
|
|
|
if any(x in c for x in ['taxi', 'cab', 'ride', 'limo', 'chauffeur']):
|
|
return ("Transportation", "Passenger", "Taxi & Ride Services", cat)
|
|
|
|
if any(x in c for x in ['bus', 'coach', 'shuttle']):
|
|
if 'station' in c or 'terminal' in c or 'stop' in c:
|
|
return ("Transportation", "Public Transit", "Bus Stations", cat)
|
|
return ("Transportation", "Passenger", "Bus Services", cat)
|
|
|
|
if any(x in c for x in ['train', 'rail', 'subway', 'metro']):
|
|
if 'station' in c or 'terminal' in c:
|
|
return ("Transportation", "Public Transit", "Train Stations", cat)
|
|
return ("Transportation", "Public Transit", "Rail Services", cat)
|
|
|
|
if any(x in c for x in ['towing', 'tow truck']):
|
|
return ("Transportation", "Vehicle Services", "Towing", cat)
|
|
|
|
# === AGRICULTURE ===
|
|
if any(x in c for x in ['farm', 'ranch', 'orchard', 'vineyard']):
|
|
return ("Agriculture", "Farming", "Farms", cat)
|
|
|
|
if any(x in c for x in ['agricultural', 'agri']):
|
|
return ("Agriculture", "Services", "Agricultural Services", cat)
|
|
|
|
# === PETS & ANIMALS ===
|
|
if any(x in c for x in ['pet', 'dog', 'cat']):
|
|
if 'grooming' in c or 'groomer' in c:
|
|
return ("Pets & Animals", "Pet Services", "Pet Grooming", cat)
|
|
if 'boarding' in c or 'kennel' in c or 'sitting' in c or 'daycare' in c:
|
|
return ("Pets & Animals", "Pet Services", "Pet Boarding", cat)
|
|
if 'training' in c or 'trainer' in c:
|
|
return ("Pets & Animals", "Pet Services", "Pet Training", cat)
|
|
if 'adoption' in c or 'shelter' in c or 'rescue' in c:
|
|
return ("Pets & Animals", "Animal Welfare", "Shelters", cat)
|
|
if 'store' in c or 'shop' in c:
|
|
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
|
|
|
# === EVENTS & WEDDINGS ===
|
|
if any(x in c for x in ['wedding', 'bridal']):
|
|
if 'venue' in c or 'hall' in c:
|
|
return ("Events & Weddings", "Venues", "Wedding Venues", cat)
|
|
if 'planner' in c:
|
|
return ("Events & Weddings", "Planning", "Wedding Planners", cat)
|
|
if 'dress' in c or 'gown' in c:
|
|
return ("Events & Weddings", "Attire", "Bridal Shops", cat)
|
|
return ("Events & Weddings", "Services", "Wedding Services", cat)
|
|
|
|
if any(x in c for x in ['event', 'party', 'banquet']):
|
|
if 'venue' in c or 'hall' in c or 'center' in c:
|
|
return ("Events & Weddings", "Venues", "Event Venues", cat)
|
|
if 'planner' in c or 'planning' in c:
|
|
return ("Events & Weddings", "Planning", "Event Planners", cat)
|
|
if 'rental' in c or 'supply' in c:
|
|
return ("Events & Weddings", "Rentals", "Event Rentals", cat)
|
|
return ("Events & Weddings", "Services", "Event Services", cat)
|
|
|
|
if any(x in c for x in ['florist', 'flower']):
|
|
if 'shop' in c or 'store' not in c:
|
|
return ("Events & Weddings", "Florists", "Flower Shops", cat)
|
|
|
|
if any(x in c for x in ['funeral', 'mortuary', 'cremation', 'cemetery']):
|
|
return ("Events & Weddings", "Memorial", "Funeral Services", cat)
|
|
|
|
# === NON-PROFIT & COMMUNITY ===
|
|
if any(x in c for x in ['non-profit', 'nonprofit', 'charity', 'foundation']):
|
|
return ("Non-Profit", "Charities", "Non-Profit Organizations", cat)
|
|
|
|
if any(x in c for x in ['community', 'civic', 'volunteer']):
|
|
if 'center' in c:
|
|
return ("Non-Profit", "Community", "Community Centers", cat)
|
|
return ("Non-Profit", "Community", "Community Organizations", cat)
|
|
|
|
if any(x in c for x in ['association', 'organization', 'society']):
|
|
if 'professional' in c or 'trade' in c or 'business' in c:
|
|
return ("Non-Profit", "Professional", "Professional Associations", cat)
|
|
return ("Non-Profit", "General", "Organizations", cat)
|
|
|
|
# Default fallback
|
|
return ("Other", "Uncategorized", "General", cat)
|
|
|
|
|
|
def _read_categories(csv_path):
    """Return the stripped, non-empty first-column values of the CSV at *csv_path*.

    The first row is treated as a header and skipped.
    """
    # newline='' lets the csv module handle line endings and quoted newlines itself.
    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # The default keeps an empty file from raising StopIteration.
        next(reader, None)  # Skip header
        return [row[0].strip() for row in reader if row and row[0].strip()]


def _build_tree(categories):
    """Map each dotted ltree path to ``(name, level, parent_path)``.

    Every category is expanded through ``categorize_category`` into its
    hierarchy levels; each level contributes one node whose path is the
    dot-joined slugs of itself and its ancestors. The first name seen for a
    path wins, so shared ancestors are stored only once.
    """
    tree = {}  # path -> (name, level, parent_path)
    for cat in categories:
        parent_path = None
        slugs = []
        for level, name in enumerate(categorize_category(cat), start=1):
            slugs.append(slugify(name))
            path = '.'.join(slugs)
            tree.setdefault(path, (name, level, parent_path))
            parent_path = path
    return tree


def _insert_tree(cur, tree):
    """Insert every node of *tree* into gbp_categories using cursor *cur*.

    Nodes are inserted in ascending level order so that a parent's generated
    id is already known when its children are written.
    """
    path_to_id = {}
    # sorted() is stable, so nodes of the same level keep their discovery order.
    ordered = sorted(tree.items(), key=lambda item: item[1][1])
    for path, (name, level, parent_path) in ordered:
        parent_id = path_to_id.get(parent_path) if parent_path else None
        slug = path.split('.')[-1]
        cur.execute(
            """
            INSERT INTO gbp_categories (name, slug, path, level, parent_id)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING id
            """,
            (name, slug, path, level, parent_id),
        )
        path_to_id[path] = cur.fetchone()[0]


def main():
    """Command-line entry point: load the category CSV and import it as a tree.

    Steps:
      1. Parse ``--csv-path``, ``--db-url`` and ``--dry-run``.
      2. Read the category names from the CSV and build the path tree.
      3. Print per-level node counts.
      4. With ``--dry-run``, print a preview of at most 20 nodes and stop.
      5. Otherwise run the optional init SQL, truncate gbp_categories,
         insert all nodes, refresh ``category_count`` and print a summary.

    Returns None in every case, including when psycopg2 is not installed
    (an error message is printed instead of importing).
    """
    parser = argparse.ArgumentParser(description='Import GBP categories into PostgreSQL with ltree')
    parser.add_argument('--csv-path', default=DEFAULT_CSV_PATH, help='Path to categories CSV')
    parser.add_argument('--db-url', default=DEFAULT_DB_URL, help='PostgreSQL connection URL')
    parser.add_argument('--dry-run', action='store_true', help='Print categories without importing')
    args = parser.parse_args()

    # Read categories
    print(f"Reading categories from: {args.csv_path}")
    categories = _read_categories(args.csv_path)
    print(f"Found {len(categories)} categories")

    # Build tree structure
    tree = _build_tree(categories)

    # Print statistics
    level_counts = {1: 0, 2: 0, 3: 0, 4: 0}
    for _, level, _ in tree.values():
        level_counts[level] += 1

    print("\nTree structure:")
    print(f" Level 1 (Sectors): {level_counts[1]}")
    print(f" Level 2 (Business Types): {level_counts[2]}")
    print(f" Level 3 (Sub-categories): {level_counts[3]}")
    print(f" Level 4 (Categories): {level_counts[4]}")
    print(f" Total nodes: {len(tree)}")

    if args.dry_run:
        preview_limit = 20
        print("\n[DRY RUN] Would insert these nodes:")
        for path in sorted(tree)[:preview_limit]:
            name, level, _ = tree[path]
            print(f" {' ' * (level-1)}{name} ({path})")
        # Only report a remainder when there is one; a small tree would
        # otherwise print a negative count.
        remaining = len(tree) - preview_limit
        if remaining > 0:
            print(f" ... and {remaining} more")
        return

    # Check for psycopg2
    if not HAS_PSYCOPG2:
        print("\nERROR: psycopg2 is required for database import.")
        print("Install it with: pip install psycopg2-binary")
        return

    # Connect to database
    print("\nConnecting to database...")
    conn = psycopg2.connect(args.db_url)
    try:
        with conn.cursor() as cur:
            # Run init SQL first
            init_sql_path = os.path.join(os.path.dirname(__file__), 'init', '01_create_categories.sql')
            if os.path.exists(init_sql_path):
                print(f"Running init SQL: {init_sql_path}")
                with open(init_sql_path, 'r', encoding='utf-8') as f:
                    cur.execute(f.read())
                conn.commit()

            # Clear existing data
            print("Clearing existing categories...")
            cur.execute("TRUNCATE TABLE gbp_categories RESTART IDENTITY CASCADE")

            # Insert nodes in order (parents first)
            print("Inserting categories...")
            _insert_tree(cur, tree)

            # Update category counts: each node gets the number of its
            # strict descendants (ltree <@ match, excluding the node itself).
            print("Updating category counts...")
            cur.execute(
                """
                UPDATE gbp_categories p
                SET category_count = (
                    SELECT COUNT(*) FROM gbp_categories c
                    WHERE c.path <@ p.path AND c.path != p.path
                )
                """
            )

            # Truncate, inserts and count update are committed together.
            conn.commit()

            # Verify
            cur.execute("SELECT COUNT(*) FROM gbp_categories")
            count = cur.fetchone()[0]
            print(f"\nSuccess! Inserted {count} nodes into gbp_categories table")

            # Show tree stats
            cur.execute("SELECT * FROM category_tree_stats")
            print("\nTree statistics:")
            for row in cur.fetchall():
                print(f" Level {row[0]}: {row[1]} nodes")
    finally:
        # Always release the connection, even when a statement above fails.
        conn.close()

    print("\nDone!")
|
|
|
|
|
|
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()
|