#!/usr/bin/env python3 """ Import Google Business Profile categories into PostgreSQL with ltree hierarchy. Usage: python import_categories.py [--csv-path PATH] [--db-url URL] Example: python import_categories.py --csv-path ./categories.csv --db-url postgresql://scraper:scraper123@localhost:5437/scraper """ import csv import re import os import argparse from typing import Optional try: import psycopg2 from psycopg2.extras import execute_values HAS_PSYCOPG2 = True except ImportError: HAS_PSYCOPG2 = False # Default paths DEFAULT_CSV_PATH = os.path.expanduser("~/Downloads/Google Business Profile Categories (2025 List) - Category List (English).csv") DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper" def slugify(text: str) -> str: """Convert text to ltree-safe slug.""" # Replace special characters with underscores slug = re.sub(r'[^a-zA-Z0-9]+', '_', text) # Remove leading/trailing underscores slug = slug.strip('_') # Ensure it starts with a letter (ltree requirement) if slug and not slug[0].isalpha(): slug = 'cat_' + slug return slug or 'unknown' def categorize_category(cat: str) -> tuple: """ Categorize a GBP category into 4-level hierarchy. Returns: (level1, level2, level3, level4) """ c = cat.lower() # === FOOD & DINING === if 'restaurant' in c: if any(x in c for x in ['fast food', 'drive-in', 'takeaway', 'takeout', 'quick service']): return ("Food & Dining", "Restaurants", "Fast Food & Quick Service", cat) # Cuisine types return ("Food & Dining", "Restaurants", "By Cuisine", cat) if any(x in c for x in ['cafe', 'coffee shop', 'tea house', 'tea room', 'espresso bar']): return ("Food & Dining", "Cafes & Coffee", "Coffee Shops", cat) if any(x in c for x in ['bar', 'pub', 'nightclub', 'night club', 'cocktail', 'wine bar', 'beer', 'lounge']): if 'gay' in c or 'lesbian' in c: return ("Food & Dining", "Bars & Nightlife", "LGBTQ+ Venues", cat) if 'karaoke' in c: return ("Food & Dining", "Bars & Nightlife", "Karaoke", cat) return ("Food & Dining", "Bars & Nightlife", "Bars & Pubs", cat) if any(x in c for x in ['bakery', 'pastry', 'cake', 'donut', 'dessert', 'ice cream', 'frozen yogurt', 'candy', 'chocolate', 'confection']): return ("Food & Dining", "Bakeries & Desserts", "Sweet Shops", cat) if any(x in c for x in ['caterer', 'catering']): return ("Food & Dining", "Food Services", "Catering", cat) if any(x in c for x in ['brewery', 'winery', 'distillery', 'vineyard']): return ("Food & Dining", "Beverage Production", "Producers", cat) if any(x in c for x in ['food truck', 'food stand', 'food stall', 'food court']): return ("Food & Dining", "Quick Service", "Street Food", cat) # === RETAIL & SHOPPING === if 'store' in c or 'shop' in c: if any(x in c for x in ['clothing', 'fashion', 'shoe', 'dress', 'apparel', 'wear', 'boutique', 'tailor']): return ("Retail & Shopping", "Clothing & Fashion", "Apparel Stores", cat) if any(x in c for x in ['electronic', 'computer', 'phone', 'appliance', 'tv', 'audio', 'video game']): return ("Retail & Shopping", "Electronics", "Electronics Stores", cat) if any(x in c for x in ['furniture', 'home decor', 'kitchen', 'bed', 'mattress', 'carpet', 'curtain', 'lighting']): return ("Retail & Shopping", "Home & Garden", "Home Furnishings", cat) if any(x in c for x in ['grocery', 'supermarket', 'food', 'beverage', 'wine', 'liquor', 'butcher', 'fish', 'fruit', 'vegetable']): return ("Retail & Shopping", "Food & Grocery", "Grocery Stores", cat) if any(x in c for x in ['book', 'stationery', 'office supply', 'paper']): return ("Retail & Shopping", "Books & Office", "Book Stores", cat) if any(x in c for x in ['pet', 'animal']): return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat) if any(x in c for x in ['toy', 'game', 'hobby']): return ("Retail & Shopping", "Toys & Hobbies", "Toy Stores", cat) if any(x in c for x in ['jewelry', 'watch', 'gold', 'diamond']): return ("Retail & Shopping", "Jewelry & Watches", "Jewelry Stores", cat) if any(x in c for x in ['sport', 'athletic', 'fitness', 'outdoor', 'camping', 'fishing', 'hunting']): return ("Retail & Shopping", "Sports & Outdoors", "Sporting Goods", cat) if any(x in c for x in ['music', 'instrument', 'record', 'vinyl']): return ("Retail & Shopping", "Music & Entertainment", "Music Stores", cat) if any(x in c for x in ['art', 'craft', 'fabric', 'sewing', 'yarn', 'knitting']): return ("Retail & Shopping", "Arts & Crafts", "Art Supply Stores", cat) if any(x in c for x in ['beauty', 'cosmetic', 'perfume', 'makeup']): return ("Retail & Shopping", "Beauty & Cosmetics", "Beauty Stores", cat) if any(x in c for x in ['pharmacy', 'drug', 'medicine', 'health']): return ("Retail & Shopping", "Health & Pharmacy", "Pharmacies", cat) if any(x in c for x in ['garden', 'plant', 'flower', 'nursery', 'landscap']): return ("Retail & Shopping", "Home & Garden", "Garden Centers", cat) if any(x in c for x in ['hardware', 'tool', 'building', 'lumber', 'paint']): return ("Retail & Shopping", "Hardware & Building", "Hardware Stores", cat) if any(x in c for x in ['antique', 'vintage', 'thrift', 'consignment', 'second hand', 'used']): return ("Retail & Shopping", "Secondhand & Vintage", "Thrift Stores", cat) return ("Retail & Shopping", "Specialty Retail", "Other Stores", cat) if any(x in c for x in ['supplier', 'wholesaler', 'distributor', 'exporter', 'importer']): if any(x in c for x in ['food', 'beverage', 'meat', 'seafood', 'produce']): return ("Retail & Shopping", "Wholesale & Distribution", "Food Wholesale", cat) if any(x in c for x in ['building', 'construction', 'lumber', 'concrete', 'steel']): return ("Retail & Shopping", "Wholesale & Distribution", "Building Materials", cat) if any(x in c for x in ['industrial', 'machinery', 'equipment']): return ("Retail & Shopping", "Wholesale & Distribution", "Industrial Supplies", cat) return ("Retail & Shopping", "Wholesale & Distribution", "General Wholesale", cat) if 'market' in c and 'marketing' not in c: if 'flea' in c or 'antique' in c: return ("Retail & Shopping", "Markets", "Flea Markets", cat) if 'farmer' in c: return ("Retail & Shopping", "Markets", "Farmers Markets", cat) return ("Retail & Shopping", "Markets", "General Markets", cat) # === AUTOMOTIVE === if 'dealer' in c: car_brands = ['abarth', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'cupra', 'dacia', 'daihatsu', 'dodge', 'ferrari', 'fiat', 'ford', 'genesis', 'gmc', 'honda', 'hummer', 'hyundai', 'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lancia', 'land rover', 'lexus', 'lincoln', 'lotus', 'maserati', 'mazda', 'mclaren', 'mercedes', 'mini', 'mitsubishi', 'nissan', 'opel', 'peugeot', 'porsche', 'ram', 'renault', 'rolls-royce', 'saab', 'seat', 'skoda', 'smart', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen', 'volvo', 'yamaha', 'harley', 'ducati', 'kawasaki', 'triumph', 'vespa', 'piaggio'] if any(b in c for b in car_brands): if 'motorcycle' in c or any(x in c for x in ['harley', 'ducati', 'kawasaki', 'triumph', 'vespa']): return ("Automotive", "Dealers", "Motorcycle Brands", cat) return ("Automotive", "Dealers", "Car Brands", cat) if any(x in c for x in ['motorcycle', 'scooter', 'moped']): return ("Automotive", "Dealers", "Motorcycle Dealers", cat) if any(x in c for x in ['truck', 'commercial vehicle', 'trailer']): return ("Automotive", "Dealers", "Truck & Commercial", cat) if any(x in c for x in ['boat', 'yacht', 'marine', 'jet ski']): return ("Automotive", "Dealers", "Marine & Boats", cat) if any(x in c for x in ['rv', 'camper', 'motorhome', 'caravan']): return ("Automotive", "Dealers", "RV & Campers", cat) if any(x in c for x in ['atv', 'quad', 'off-road', 'utv']): return ("Automotive", "Dealers", "ATV & Off-Road", cat) if 'used' in c or 'pre-owned' in c: return ("Automotive", "Dealers", "Used Vehicles", cat) return ("Automotive", "Dealers", "Other Dealers", cat) if any(x in c for x in ['car wash', 'auto detailing', 'car detailing']): return ("Automotive", "Vehicle Care", "Cleaning & Detailing", cat) if any(x in c for x in ['car rental', 'auto rental', 'vehicle rental', 'truck rental']): return ("Automotive", "Rental Services", "Vehicle Rental", cat) if any(x in c for x in ['car repair', 'auto repair', 'mechanic', 'garage', 'auto body', 'collision']): return ("Automotive", "Repair & Maintenance", "Auto Repair", cat) if any(x in c for x in ['tire', 'tyre', 'wheel']): return ("Automotive", "Parts & Accessories", "Tires & Wheels", cat) if any(x in c for x in ['auto part', 'car part', 'auto accessories']): return ("Automotive", "Parts & Accessories", "Auto Parts", cat) if any(x in c for x in ['driving school', 'driving instruction']): return ("Automotive", "Training", "Driving Schools", cat) if any(x in c for x in ['parking', 'car park', 'garage']): if 'repair' not in c and 'mechanic' not in c: return ("Automotive", "Parking", "Parking Facilities", cat) if any(x in c for x in ['gas station', 'petrol', 'fuel', 'charging station', 'ev charging']): return ("Automotive", "Fuel & Charging", "Fuel Stations", cat) # === HEALTHCARE === if any(x in c for x in ['hospital']): if 'animal' in c or 'veterinar' in c: return ("Healthcare", "Veterinary", "Animal Hospitals", cat) if 'children' in c or 'pediatric' in c: return ("Healthcare", "Hospitals", "Pediatric Hospitals", cat) if 'mental' in c or 'psychiatric' in c: return ("Healthcare", "Mental Health", "Psychiatric Hospitals", cat) return ("Healthcare", "Hospitals", "General Hospitals", cat) if any(x in c for x in ['clinic']): if 'dental' in c: return ("Healthcare", "Dental", "Dental Clinics", cat) if 'eye' in c or 'vision' in c or 'optical' in c: return ("Healthcare", "Vision Care", "Eye Clinics", cat) if 'fertility' in c or 'ivf' in c: return ("Healthcare", "Specialty Care", "Fertility Clinics", cat) if 'skin' in c or 'dermatol' in c: return ("Healthcare", "Specialty Care", "Dermatology", cat) if 'physical therapy' in c or 'physiotherapy' in c or 'rehab' in c: return ("Healthcare", "Rehabilitation", "Physical Therapy", cat) return ("Healthcare", "Clinics", "Medical Clinics", cat) if any(x in c for x in ['doctor', 'physician']): return ("Healthcare", "Medical Practitioners", "Doctors", cat) if any(x in c for x in ['dentist', 'dental', 'orthodont', 'endodont', 'periodont']): return ("Healthcare", "Dental", "Dental Services", cat) if any(x in c for x in ['surgeon', 'surgery']): if 'plastic' in c or 'cosmetic' in c: return ("Healthcare", "Specialty Care", "Cosmetic Surgery", cat) return ("Healthcare", "Medical Practitioners", "Surgeons", cat) if any(x in c for x in ['psycholog', 'psychiatr', 'mental health', 'counselor', 'therapist']): if 'marriage' in c or 'family' in c: return ("Healthcare", "Mental Health", "Family Counseling", cat) if 'addiction' in c or 'substance' in c: return ("Healthcare", "Mental Health", "Addiction Treatment", cat) return ("Healthcare", "Mental Health", "Mental Health Services", cat) if any(x in c for x in ['chiropract']): return ("Healthcare", "Alternative Medicine", "Chiropractic", cat) if any(x in c for x in ['acupuncture', 'acupuncturist']): return ("Healthcare", "Alternative Medicine", "Acupuncture", cat) if any(x in c for x in ['naturopath', 'homeopath', 'ayurved', 'holistic']): return ("Healthcare", "Alternative Medicine", "Natural Medicine", cat) if any(x in c for x in ['optometrist', 'optician', 'eye doctor', 'ophthalmol']): return ("Healthcare", "Vision Care", "Eye Care", cat) if any(x in c for x in ['pharmacy', 'drugstore', 'apothecary']): return ("Healthcare", "Pharmacies", "Retail Pharmacies", cat) if any(x in c for x in ['veterinar', 'vet ', 'animal clinic', 'pet clinic']): return ("Healthcare", "Veterinary", "Veterinary Services", cat) if any(x in c for x in ['nursing home', 'assisted living', 'senior care', 'elder care', 'retirement home']): return ("Healthcare", "Senior Care", "Senior Living", cat) if any(x in c for x in ['lab', 'laboratory', 'diagnostic', 'imaging', 'x-ray', 'mri', 'radiology']): return ("Healthcare", "Diagnostics", "Medical Labs", cat) if any(x in c for x in ['ambulance', 'emergency', 'urgent care']): return ("Healthcare", "Emergency Services", "Emergency Care", cat) # === EDUCATION === if 'school' in c or 'academy' in c: if any(x in c for x in ['preschool', 'kindergarten', 'nursery', 'daycare', 'pre-school']): return ("Education", "Early Childhood", "Preschools", cat) if any(x in c for x in ['elementary', 'primary']): return ("Education", "K-12 Schools", "Elementary Schools", cat) if any(x in c for x in ['middle', 'junior high']): return ("Education", "K-12 Schools", "Middle Schools", cat) if any(x in c for x in ['high school', 'secondary']): return ("Education", "K-12 Schools", "High Schools", cat) if any(x in c for x in ['boarding']): return ("Education", "K-12 Schools", "Boarding Schools", cat) if any(x in c for x in ['driving']): return ("Automotive", "Training", "Driving Schools", cat) if any(x in c for x in ['language', 'english', 'spanish', 'french', 'german', 'chinese', 'japanese']): return ("Education", "Language Learning", "Language Schools", cat) if any(x in c for x in ['art', 'music', 'dance', 'drama', 'theater', 'acting']): return ("Education", "Arts Education", "Arts Schools", cat) if any(x in c for x in ['martial art', 'karate', 'judo', 'taekwondo', 'kung fu', 'aikido', 'boxing']): return ("Education", "Sports Training", "Martial Arts Schools", cat) if any(x in c for x in ['beauty', 'cosmetology', 'barber']): return ("Education", "Vocational Training", "Beauty Schools", cat) if any(x in c for x in ['cooking', 'culinary', 'chef']): return ("Education", "Vocational Training", "Culinary Schools", cat) if any(x in c for x in ['business', 'mba']): return ("Education", "Higher Education", "Business Schools", cat) if any(x in c for x in ['medical', 'nursing', 'dental']): return ("Education", "Higher Education", "Medical Schools", cat) if any(x in c for x in ['law']): return ("Education", "Higher Education", "Law Schools", cat) if any(x in c for x in ['flight', 'aviation', 'pilot']): return ("Education", "Vocational Training", "Aviation Schools", cat) if any(x in c for x in ['computer', 'it ', 'coding', 'programming', 'software']): return ("Education", "Technology Training", "Computer Schools", cat) if any(x in c for x in ['trade', 'technical', 'vocational']): return ("Education", "Vocational Training", "Trade Schools", cat) return ("Education", "Specialty Schools", "Other Schools", cat) if any(x in c for x in ['university', 'college']): if 'community' in c: return ("Education", "Higher Education", "Community Colleges", cat) return ("Education", "Higher Education", "Universities", cat) if any(x in c for x in ['tutor', 'tutoring']): return ("Education", "Tutoring", "Private Tutoring", cat) if any(x in c for x in ['training center', 'training program', 'training institute']): return ("Education", "Professional Training", "Training Centers", cat) if any(x in c for x in ['library']): return ("Education", "Libraries", "Public Libraries", cat) # === PROFESSIONAL SERVICES === if any(x in c for x in ['lawyer', 'attorney', 'law firm', 'legal']): if any(x in c for x in ['immigration']): return ("Professional Services", "Legal", "Immigration Law", cat) if any(x in c for x in ['criminal', 'defense']): return ("Professional Services", "Legal", "Criminal Law", cat) if any(x in c for x in ['family', 'divorce']): return ("Professional Services", "Legal", "Family Law", cat) if any(x in c for x in ['personal injury', 'accident']): return ("Professional Services", "Legal", "Personal Injury", cat) if any(x in c for x in ['real estate', 'property']): return ("Professional Services", "Legal", "Real Estate Law", cat) if any(x in c for x in ['business', 'corporate', 'commercial']): return ("Professional Services", "Legal", "Business Law", cat) return ("Professional Services", "Legal", "General Legal", cat) if any(x in c for x in ['accountant', 'accounting', 'bookkeep', 'tax']): return ("Professional Services", "Financial Services", "Accounting", cat) if any(x in c for x in ['consultant', 'consulting', 'advisor']): if any(x in c for x in ['business', 'management']): return ("Professional Services", "Consulting", "Business Consulting", cat) if any(x in c for x in ['it ', 'technology', 'computer']): return ("Professional Services", "Consulting", "IT Consulting", cat) if any(x in c for x in ['marketing', 'advertising']): return ("Professional Services", "Consulting", "Marketing Consulting", cat) return ("Professional Services", "Consulting", "General Consulting", cat) if any(x in c for x in ['notary', 'notarial']): return ("Professional Services", "Legal", "Notary Services", cat) if any(x in c for x in ['architect', 'architecture']): return ("Professional Services", "Design", "Architecture", cat) if any(x in c for x in ['engineer', 'engineering']): if 'civil' in c: return ("Professional Services", "Engineering", "Civil Engineering", cat) if 'structural' in c: return ("Professional Services", "Engineering", "Structural Engineering", cat) if 'mechanical' in c: return ("Professional Services", "Engineering", "Mechanical Engineering", cat) if 'electrical' in c: return ("Professional Services", "Engineering", "Electrical Engineering", cat) return ("Professional Services", "Engineering", "General Engineering", cat) if any(x in c for x in ['agency']): if any(x in c for x in ['advertising', 'marketing', 'creative', 'digital']): return ("Professional Services", "Marketing & Advertising", "Agencies", cat) if any(x in c for x in ['real estate', 'property']): return ("Real Estate", "Agencies", "Real Estate Agencies", cat) if any(x in c for x in ['insurance']): return ("Finance & Insurance", "Insurance", "Insurance Agencies", cat) if any(x in c for x in ['travel', 'tour']): return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat) if any(x in c for x in ['employment', 'staffing', 'recruitment', 'temp']): return ("Professional Services", "HR Services", "Staffing Agencies", cat) return ("Professional Services", "Agencies", "Other Agencies", cat) if any(x in c for x in ['photographer', 'photography', 'photo studio']): return ("Professional Services", "Creative Services", "Photography", cat) if any(x in c for x in ['graphic design', 'web design', 'design studio']): return ("Professional Services", "Creative Services", "Design Services", cat) if any(x in c for x in ['translator', 'translation', 'interpreter']): return ("Professional Services", "Language Services", "Translation", cat) if any(x in c for x in ['printing', 'print shop', 'copy']): return ("Professional Services", "Business Services", "Printing Services", cat) # === HOME SERVICES === if any(x in c for x in ['plumber', 'plumbing']): return ("Home Services", "Plumbing", "Plumbers", cat) if any(x in c for x in ['electrician', 'electrical']): if 'contractor' in c or 'service' in c: return ("Home Services", "Electrical", "Electricians", cat) if any(x in c for x in ['hvac', 'air conditioning', 'heating', 'furnace']): return ("Home Services", "HVAC", "Heating & Cooling", cat) if any(x in c for x in ['roofing', 'roofer']): return ("Home Services", "Roofing", "Roofing Services", cat) if any(x in c for x in ['painter', 'painting']): if 'house' in c or 'residential' in c or 'contractor' in c: return ("Home Services", "Painting", "House Painters", cat) if any(x in c for x in ['landscap', 'lawn', 'garden']): if 'service' in c or 'company' in c or 'contractor' in c: return ("Home Services", "Landscaping", "Landscaping Services", cat) if any(x in c for x in ['cleaning service', 'maid', 'housekeep', 'janitorial']): return ("Home Services", "Cleaning", "Cleaning Services", cat) if any(x in c for x in ['pest control', 'exterminator']): return ("Home Services", "Pest Control", "Exterminators", cat) if any(x in c for x in ['locksmith']): return ("Home Services", "Security", "Locksmiths", cat) if any(x in c for x in ['moving company', 'mover', 'relocation']): return ("Home Services", "Moving", "Moving Services", cat) if any(x in c for x in ['contractor']): if 'general' in c: return ("Home Services", "Construction", "General Contractors", cat) return ("Home Services", "Construction", "Contractors", cat) if any(x in c for x in ['carpenter', 'carpentry']): return ("Home Services", "Construction", "Carpenters", cat) if any(x in c for x in ['flooring', 'floor']): if 'service' in c or 'contractor' in c or 'installation' in c: return ("Home Services", "Flooring", "Floor Installation", cat) if any(x in c for x in ['window', 'glass']): if 'repair' in c or 'installation' in c or 'service' in c: return ("Home Services", "Windows & Doors", "Window Services", cat) if any(x in c for x in ['pool', 'spa']): if 'service' in c or 'cleaning' in c or 'maintenance' in c: return ("Home Services", "Pool & Spa", "Pool Services", cat) if any(x in c for x in ['appliance repair', 'appliance service']): return ("Home Services", "Appliance Repair", "Appliance Services", cat) if any(x in c for x in ['handyman']): return ("Home Services", "General Repair", "Handyman Services", cat) if any(x in c for x in ['interior design', 'decorator']): return ("Home Services", "Design", "Interior Design", cat) # === PERSONAL SERVICES === if any(x in c for x in ['salon', 'hair', 'hairdress', 'stylist']): return ("Personal Services", "Hair Care", "Hair Salons", cat) if any(x in c for x in ['barber']): if 'shop' in c or not 'school' in c: return ("Personal Services", "Hair Care", "Barber Shops", cat) if any(x in c for x in ['nail', 'manicure', 'pedicure']): return ("Personal Services", "Nail Care", "Nail Salons", cat) if any(x in c for x in ['spa']): if 'day spa' in c or 'medical spa' in c or ('service' not in c and 'pool' not in c): return ("Personal Services", "Spa & Wellness", "Day Spas", cat) if any(x in c for x in ['massage']): return ("Personal Services", "Massage", "Massage Therapy", cat) if any(x in c for x in ['beauty']): if 'salon' in c or 'parlor' in c: return ("Personal Services", "Beauty", "Beauty Salons", cat) if any(x in c for x in ['tattoo']): return ("Personal Services", "Body Art", "Tattoo Shops", cat) if any(x in c for x in ['piercing']): return ("Personal Services", "Body Art", "Piercing Studios", cat) if any(x in c for x in ['tanning']): return ("Personal Services", "Tanning", "Tanning Salons", cat) if any(x in c for x in ['tailor', 'alteration', 'seamstress']): return ("Personal Services", "Clothing Care", "Tailoring", cat) if any(x in c for x in ['dry clean', 'laundry', 'laundromat']): return ("Personal Services", "Laundry", "Laundry Services", cat) if any(x in c for x in ['personal trainer', 'fitness trainer']): return ("Personal Services", "Fitness", "Personal Training", cat) # === ENTERTAINMENT & RECREATION === if any(x in c for x in ['movie theater', 'cinema', 'multiplex']): return ("Entertainment", "Movies", "Movie Theaters", cat) if any(x in c for x in ['theater', 'theatre']): if 'movie' not in c: return ("Entertainment", "Performing Arts", "Theaters", cat) if any(x in c for x in ['museum']): if 'art' in c: return ("Entertainment", "Museums", "Art Museums", cat) if 'history' in c or 'historical' in c: return ("Entertainment", "Museums", "History Museums", cat) if 'science' in c or 'natural' in c: return ("Entertainment", "Museums", "Science Museums", cat) if 'children' in c or 'kid' in c: return ("Entertainment", "Museums", "Children's Museums", cat) return ("Entertainment", "Museums", "General Museums", cat) if any(x in c for x in ['art gallery', 'gallery']): return ("Entertainment", "Arts", "Art Galleries", cat) if any(x in c for x in ['amusement park', 'theme park', 'water park']): return ("Entertainment", "Amusement", "Theme Parks", cat) if any(x in c for x in ['zoo', 'aquarium', 'wildlife']): return ("Entertainment", "Wildlife", "Zoos & Aquariums", cat) if any(x in c for x in ['bowling']): return ("Entertainment", "Games & Recreation", "Bowling", cat) if any(x in c for x in ['arcade', 'video game']): return ("Entertainment", "Games & Recreation", "Arcades", cat) if any(x in c for x in ['escape room']): return ("Entertainment", "Games & Recreation", "Escape Rooms", cat) if any(x in c for x in ['casino', 'gambling']): return ("Entertainment", "Gambling", "Casinos", cat) if any(x in c for x in ['concert', 'music venue', 'live music']): return ("Entertainment", "Music Venues", "Concert Halls", cat) if any(x in c for x in ['gym', 'fitness center', 'health club']): return ("Entertainment", "Fitness", "Gyms", cat) if any(x in c for x in ['yoga']): if 'studio' in c or 'center' in c: return ("Entertainment", "Fitness", "Yoga Studios", cat) if any(x in c for x in ['pilates']): return ("Entertainment", "Fitness", "Pilates Studios", cat) if any(x in c for x in ['swimming pool', 'swim']): return ("Entertainment", "Sports", "Swimming Pools", cat) if any(x in c for x in ['golf']): if 'course' in c or 'club' in c: return ("Entertainment", "Sports", "Golf Courses", cat) if any(x in c for x in ['tennis']): return ("Entertainment", "Sports", "Tennis Courts", cat) if any(x in c for x in ['stadium', 'arena', 'sports complex']): return ("Entertainment", "Venues", "Sports Venues", cat) if any(x in c for x in ['park']): if 'amusement' not in c and 'theme' not in c: if 'national' in c or 'state' in c: return ("Entertainment", "Parks", "National Parks", cat) if 'dog' in c: return ("Entertainment", "Parks", "Dog Parks", cat) return ("Entertainment", "Parks", "Public Parks", cat) if any(x in c for x in ['recreation center', 'community center']): return ("Entertainment", "Recreation", "Community Centers", cat) if any(x in c for x in ['club']): if 'night' in c: return ("Food & Dining", "Bars & Nightlife", "Night Clubs", cat) if 'country' in c: return ("Entertainment", "Sports", "Country Clubs", cat) if 'sport' in c or 'athletic' in c: return ("Entertainment", "Sports", "Sports Clubs", cat) if 'social' in c: return ("Entertainment", "Social", "Social Clubs", cat) # === HOSPITALITY & TRAVEL === if any(x in c for x in ['hotel', 'motel', 'inn']): if 'boutique' in c: return ("Hospitality & Travel", "Lodging", "Boutique Hotels", cat) if 'resort' in c: return ("Hospitality & Travel", "Lodging", "Resorts", cat) if 'budget' in c or 'economy' in c: return ("Hospitality & Travel", "Lodging", "Budget Hotels", cat) return ("Hospitality & Travel", "Lodging", "Hotels", cat) if any(x in c for x in ['hostel']): return ("Hospitality & Travel", "Lodging", "Hostels", cat) if any(x in c for x in ['bed and breakfast', 'b&b', 'bnb']): return ("Hospitality & Travel", "Lodging", "B&Bs", cat) if any(x in c for x in ['resort']): return ("Hospitality & Travel", "Lodging", "Resorts", cat) if any(x in c for x in ['vacation rental', 'holiday rental']): return ("Hospitality & Travel", "Lodging", "Vacation Rentals", cat) if any(x in c for x in ['campground', 'camping', 'rv park']): return ("Hospitality & Travel", "Lodging", "Campgrounds", cat) if any(x in c for x in ['travel agency', 'tour operator', 'travel agent']): return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat) if any(x in c for x in ['airline', 'airport']): return ("Hospitality & Travel", "Transportation", "Airlines & Airports", cat) if any(x in c for x in ['cruise']): return ("Hospitality & Travel", "Travel Services", "Cruises", cat) if any(x in c for x in ['tourist', 'attraction', 'sightseeing']): return ("Hospitality & Travel", "Attractions", "Tourist Attractions", cat) # === FINANCE & INSURANCE === if any(x in c for x in ['bank', 'banking', 'credit union']): return ("Finance & Insurance", "Banking", "Banks", cat) if any(x in c for x in ['atm', 'cash machine']): return ("Finance & Insurance", "Banking", "ATMs", cat) if any(x in c for x in ['insurance']): if 'health' in c or 'medical' in c: return ("Finance & Insurance", "Insurance", "Health Insurance", cat) if 'auto' in c or 'car' in c: return ("Finance & Insurance", "Insurance", "Auto Insurance", cat) if 'home' in c or 'property' in c: return ("Finance & Insurance", "Insurance", "Home Insurance", cat) if 'life' in c: return ("Finance & Insurance", "Insurance", "Life Insurance", cat) return ("Finance & Insurance", "Insurance", "Insurance Services", cat) if any(x in c for x in ['loan', 'mortgage', 'lending']): return ("Finance & Insurance", "Lending", "Loans", cat) if any(x in c for x in ['investment', 'financial advisor', 'wealth management', 'financial planner']): return ("Finance & Insurance", "Investment", "Financial Services", cat) if any(x in c for x in ['currency exchange', 'money transfer', 'wire transfer']): return ("Finance & Insurance", "Money Services", "Currency Services", cat) if any(x in c for x in ['pawn']): return ("Finance & Insurance", "Money Services", "Pawn Shops", cat) # === REAL ESTATE === if any(x in c for x in ['real estate', 'property', 'realty', 'realtor']): if 'agent' in c or 'agency' in c or 'broker' in c: return ("Real Estate", "Agencies", "Real Estate Agents", cat) if 'developer' in c or 'development' in c: return ("Real Estate", "Development", "Developers", cat) if 'management' in c: return ("Real Estate", "Management", "Property Management", cat) if 'commercial' in c: return ("Real Estate", "Commercial", "Commercial Real Estate", cat) return ("Real Estate", "Services", "Real Estate Services", cat) if any(x in c for x in ['apartment', 'condo', 'rental']): if 'complex' in c or 'building' in c: return ("Real Estate", "Residential", "Apartment Complexes", cat) if any(x in c for x in ['storage', 'self storage', 'warehouse']): if 'self' in c or 'mini' in c: return ("Real Estate", "Storage", "Self Storage", cat) # === RELIGIOUS === if any(x in c for x in ['church']): if 'catholic' in c: return ("Religious", "Christian", "Catholic Churches", cat) if 'baptist' in c: return ("Religious", "Christian", "Baptist Churches", cat) if 'methodist' in c: return ("Religious", "Christian", "Methodist Churches", cat) if 'lutheran' in c: return ("Religious", "Christian", "Lutheran Churches", cat) if 'orthodox' in c: return ("Religious", "Christian", "Orthodox Churches", cat) if 'pentecostal' in c: return ("Religious", "Christian", "Pentecostal Churches", cat) return ("Religious", "Christian", "Churches", cat) if any(x in c for x in ['mosque', 'islamic', 'muslim']): return ("Religious", "Islam", "Mosques", cat) if any(x in c for x in ['synagogue', 'jewish', 'temple']): if 'jewish' in c or 'synagogue' in c: return ("Religious", "Judaism", "Synagogues", cat) if 'hindu' in c: return ("Religious", "Hinduism", "Hindu Temples", cat) if 'buddhist' in c: return ("Religious", "Buddhism", "Buddhist Temples", cat) return ("Religious", "Other", "Temples", cat) if any(x in c for x in ['abbey', 'monastery', 'convent']): return ("Religious", "Christian", "Monasteries", cat) if any(x in c for x in ['gurdwara', 'sikh']): return ("Religious", "Sikhism", "Gurdwaras", cat) # === GOVERNMENT & PUBLIC SERVICES === if any(x in c for x in ['government', 'city hall', 'town hall', 'municipal']): return ("Government", "Local Government", "Government Offices", cat) if any(x in c for x in ['court', 'courthouse']): return ("Government", "Legal", "Courts", cat) if any(x in c for x in ['police', 'sheriff']): return ("Government", "Public Safety", "Police", cat) if any(x in c for x in ['fire station', 'fire department']): return ("Government", "Public Safety", "Fire Departments", cat) if any(x in c for x in ['post office', 'postal']): return ("Government", "Postal", "Post Offices", cat) if any(x in c for x in ['embassy', 'consulate']): return ("Government", "International", "Embassies", cat) if any(x in c for x in ['dmv', 'motor vehicle', 'driver license']): return ("Government", "Transportation", "DMV", cat) if any(x in c for x in ['social security', 'welfare', 'social services']): return ("Government", "Social Services", "Social Services", cat) # === INDUSTRIAL & MANUFACTURING === if any(x in c for x in ['manufacturer', 'manufacturing', 'factory', 'plant']): if any(x in c for x in ['food', 'beverage', 'bakery']): return ("Industrial", "Manufacturing", "Food Manufacturing", cat) if any(x in c for x in ['textile', 'clothing', 'garment']): return ("Industrial", "Manufacturing", "Textile Manufacturing", cat) if any(x in c for x in ['electronics', 'computer', 'semiconductor']): return ("Industrial", "Manufacturing", "Electronics Manufacturing", cat) if any(x in c for x in ['auto', 'car', 'vehicle']): return ("Industrial", "Manufacturing", "Auto Manufacturing", cat) if any(x in c for x in ['chemical', 'pharmaceutical']): return ("Industrial", "Manufacturing", "Chemical Manufacturing", cat) if any(x in c for x in ['metal', 'steel', 'iron']): return ("Industrial", "Manufacturing", "Metal Manufacturing", cat) if any(x in c for x in ['plastic', 'rubber']): return ("Industrial", "Manufacturing", "Plastics Manufacturing", cat) if any(x in c for x in ['furniture', 'wood']): return ("Industrial", "Manufacturing", "Furniture Manufacturing", cat) return ("Industrial", "Manufacturing", "General Manufacturing", cat) if any(x in c for x in ['mining', 'quarry']): return ("Industrial", "Mining", "Mining Operations", cat) if any(x in c for x in ['construction company', 'builder']): return ("Industrial", "Construction", "Construction Companies", cat) # === TECHNOLOGY === if any(x in c for x in ['software', 'app developer', 'web developer']): return ("Technology", "Software", "Software Development", cat) if any(x in c for x in ['it service', 'computer service', 'tech support']): return ("Technology", "IT Services", "IT Support", cat) if any(x in c for x in ['data center', 'hosting', 'cloud']): return ("Technology", "Infrastructure", "Data Services", cat) if any(x in c for x in ['telecommunication', 'telecom', 'internet service']): return ("Technology", "Telecommunications", "Telecom Services", cat) # === TRANSPORTATION & LOGISTICS === if any(x in c for x in ['shipping', 'freight', 'cargo', 'logistics']): return ("Transportation", "Logistics", "Shipping & Freight", cat) if any(x in c for x in ['courier', 'delivery', 'express']): return ("Transportation", "Delivery", "Courier Services", cat) if any(x in c for x in ['taxi', 'cab', 'ride', 'limo', 'chauffeur']): return ("Transportation", "Passenger", "Taxi & Ride Services", cat) if any(x in c for x in ['bus', 'coach', 'shuttle']): if 'station' in c or 'terminal' in c or 'stop' in c: return ("Transportation", "Public Transit", "Bus Stations", cat) return ("Transportation", "Passenger", "Bus Services", cat) if any(x in c for x in ['train', 'rail', 'subway', 'metro']): if 'station' in c or 'terminal' in c: return ("Transportation", "Public Transit", "Train Stations", cat) return ("Transportation", "Public Transit", "Rail Services", cat) if any(x in c for x in ['towing', 'tow truck']): return ("Transportation", "Vehicle Services", "Towing", cat) # === AGRICULTURE === if any(x in c for x in ['farm', 'ranch', 'orchard', 'vineyard']): return ("Agriculture", "Farming", "Farms", cat) if any(x in c for x in ['agricultural', 'agri']): return ("Agriculture", "Services", "Agricultural Services", cat) # === PETS & ANIMALS === if any(x in c for x in ['pet', 'dog', 'cat']): if 'grooming' in c or 'groomer' in c: return ("Pets & Animals", "Pet Services", "Pet Grooming", cat) if 'boarding' in c or 'kennel' in c or 'sitting' in c or 'daycare' in c: return ("Pets & Animals", "Pet Services", "Pet Boarding", cat) if 'training' in c or 'trainer' in c: return ("Pets & Animals", "Pet Services", "Pet Training", cat) if 'adoption' in c or 'shelter' in c or 'rescue' in c: return ("Pets & Animals", "Animal Welfare", "Shelters", cat) if 'store' in c or 'shop' in c: return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat) # === EVENTS & WEDDINGS === if any(x in c for x in ['wedding', 'bridal']): if 'venue' in c or 'hall' in c: return ("Events & Weddings", "Venues", "Wedding Venues", cat) if 'planner' in c: return ("Events & Weddings", "Planning", "Wedding Planners", cat) if 'dress' in c or 'gown' in c: return ("Events & Weddings", "Attire", "Bridal Shops", cat) return ("Events & Weddings", "Services", "Wedding Services", cat) if any(x in c for x in ['event', 'party', 'banquet']): if 'venue' in c or 'hall' in c or 'center' in c: return ("Events & Weddings", "Venues", "Event Venues", cat) if 'planner' in c or 'planning' in c: return ("Events & Weddings", "Planning", "Event Planners", cat) if 'rental' in c or 'supply' in c: return ("Events & Weddings", "Rentals", "Event Rentals", cat) return ("Events & Weddings", "Services", "Event Services", cat) if any(x in c for x in ['florist', 'flower']): if 'shop' in c or 'store' not in c: return ("Events & Weddings", "Florists", "Flower Shops", cat) if any(x in c for x in ['funeral', 'mortuary', 'cremation', 'cemetery']): return ("Events & Weddings", "Memorial", "Funeral Services", cat) # === NON-PROFIT & COMMUNITY === if any(x in c for x in ['non-profit', 'nonprofit', 'charity', 'foundation']): return ("Non-Profit", "Charities", "Non-Profit Organizations", cat) if any(x in c for x in ['community', 'civic', 'volunteer']): if 'center' in c: return ("Non-Profit", "Community", "Community Centers", cat) return ("Non-Profit", "Community", "Community Organizations", cat) if any(x in c for x in ['association', 'organization', 'society']): if 'professional' in c or 'trade' in c or 'business' in c: return ("Non-Profit", "Professional", "Professional Associations", cat) return ("Non-Profit", "General", "Organizations", cat) # Default fallback return ("Other", "Uncategorized", "General", cat) def main(): parser = argparse.ArgumentParser(description='Import GBP categories into PostgreSQL with ltree') parser.add_argument('--csv-path', default=DEFAULT_CSV_PATH, help='Path to categories CSV') parser.add_argument('--db-url', default=DEFAULT_DB_URL, help='PostgreSQL connection URL') parser.add_argument('--dry-run', action='store_true', help='Print categories without importing') args = parser.parse_args() # Read categories print(f"Reading categories from: {args.csv_path}") categories = [] with open(args.csv_path, 'r', encoding='utf-8') as f: reader = csv.reader(f) next(reader) # Skip header for row in reader: if row and row[0].strip(): categories.append(row[0].strip()) print(f"Found {len(categories)} categories") # Build tree structure tree = {} # path -> (name, level, parent_path) for cat in categories: l1, l2, l3, l4 = categorize_category(cat) # Build paths l1_slug = slugify(l1) l2_slug = slugify(l2) l3_slug = slugify(l3) l4_slug = slugify(l4) # Level 1 (Sector) l1_path = l1_slug if l1_path not in tree: tree[l1_path] = (l1, 1, None) # Level 2 (Business Type) l2_path = f"{l1_slug}.{l2_slug}" if l2_path not in tree: tree[l2_path] = (l2, 2, l1_path) # Level 3 (Sub-category) l3_path = f"{l1_slug}.{l2_slug}.{l3_slug}" if l3_path not in tree: tree[l3_path] = (l3, 3, l2_path) # Level 4 (Specific Category) l4_path = f"{l1_slug}.{l2_slug}.{l3_slug}.{l4_slug}" if l4_path not in tree: tree[l4_path] = (l4, 4, l3_path) # Print statistics level_counts = {1: 0, 2: 0, 3: 0, 4: 0} for path, (name, level, parent) in tree.items(): level_counts[level] += 1 print(f"\nTree structure:") print(f" Level 1 (Sectors): {level_counts[1]}") print(f" Level 2 (Business Types): {level_counts[2]}") print(f" Level 3 (Sub-categories): {level_counts[3]}") print(f" Level 4 (Categories): {level_counts[4]}") print(f" Total nodes: {len(tree)}") if args.dry_run: print("\n[DRY RUN] Would insert these nodes:") for path in sorted(tree.keys())[:20]: name, level, parent = tree[path] print(f" {' ' * (level-1)}{name} ({path})") print(f" ... and {len(tree) - 20} more") return # Check for psycopg2 if not HAS_PSYCOPG2: print("\nERROR: psycopg2 is required for database import.") print("Install it with: pip install psycopg2-binary") return # Connect to database print(f"\nConnecting to database...") conn = psycopg2.connect(args.db_url) cur = conn.cursor() # Run init SQL first init_sql_path = os.path.join(os.path.dirname(__file__), 'init', '01_create_categories.sql') if os.path.exists(init_sql_path): print(f"Running init SQL: {init_sql_path}") with open(init_sql_path, 'r') as f: cur.execute(f.read()) conn.commit() # Clear existing data print("Clearing existing categories...") cur.execute("TRUNCATE TABLE gbp_categories RESTART IDENTITY CASCADE") # Insert nodes in order (parents first) print("Inserting categories...") path_to_id = {} # Sort by level to ensure parents are inserted first sorted_items = sorted(tree.items(), key=lambda x: x[1][1]) for path, (name, level, parent_path) in sorted_items: parent_id = path_to_id.get(parent_path) if parent_path else None slug = path.split('.')[-1] cur.execute(""" INSERT INTO gbp_categories (name, slug, path, level, parent_id) VALUES (%s, %s, %s, %s, %s) RETURNING id """, (name, slug, path, level, parent_id)) path_to_id[path] = cur.fetchone()[0] # Update category counts print("Updating category counts...") cur.execute(""" UPDATE gbp_categories p SET category_count = ( SELECT COUNT(*) FROM gbp_categories c WHERE c.path <@ p.path AND c.path != p.path ) """) conn.commit() # Verify cur.execute("SELECT COUNT(*) FROM gbp_categories") count = cur.fetchone()[0] print(f"\nSuccess! Inserted {count} nodes into gbp_categories table") # Show tree stats cur.execute("SELECT * FROM category_tree_stats") print("\nTree statistics:") for row in cur.fetchall(): print(f" Level {row[0]}: {row[1]} nodes") cur.close() conn.close() print("\nDone!") if __name__ == '__main__': main()