Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
202
db/apply_recategorization.py
Normal file
202
db/apply_recategorization.py
Normal file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Apply the hierarchical recategorization to the database.
|
||||
|
||||
This script:
|
||||
1. Gets all items currently in Other.Uncategorized
|
||||
2. Applies the categorization rules
|
||||
3. Updates the database with new paths
|
||||
4. Creates new level 2/3 categories as needed
|
||||
5. Updates category counts
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Import categorization functions
|
||||
import sys
|
||||
# Put the db/ directory first on the module search path so that the
# recategorize_hierarchical import below can be resolved.
# NOTE(review): this is an absolute path into one user's Desktop folder; on
# any other machine the entry points at a directory that does not exist.
sys.path.insert(0, '/Users/agutierrez/Desktop/google-reviews-scraper-pro/db')
|
||||
from recategorize_hierarchical import get_sector_for_item, get_business_type_for_item
|
||||
|
||||
# PostgreSQL connection string passed to psycopg2.connect() in main().
# NOTE(review): user name and password are embedded in source here.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||
|
||||
def slugify(text):
    """Turn *text* into an underscore-separated slug.

    Characters that are neither word characters, whitespace nor hyphens are
    dropped; each remaining run of hyphens and/or whitespace becomes a single
    underscore; leading and trailing underscores are trimmed.
    """
    # Step 1: discard punctuation such as "&" or "'" outright.
    kept = re.sub(r'[^\w\s-]', '', text)
    # Step 2: join the surviving words with single underscores and trim.
    return re.sub(r'[-\s]+', '_', kept).strip('_')
|
||||
|
||||
def _ensure_category(cursor, name, slug, path, parent_path, level,
                     existing_paths, created_paths, registered):
    """Insert the category row at ``path`` unless the path is already known.

    The row is inserted as a child of the row whose path equals
    ``parent_path``. When that parent row does not exist, or a row with
    ``path`` is already in the table, the INSERT returns no row and nothing
    is recorded.

    Args:
        cursor: Open psycopg2 cursor.
        name: Value for the ``name`` column.
        slug: Value for the ``slug`` column.
        path: Full ltree path of the category to create.
        parent_path: ltree path of the parent row.
        level: Value for the ``level`` column.
        existing_paths: Dict mapping path text to row id; updated on insert.
        created_paths: Set of paths created during this run; updated on insert.
        registered: List that collects the paths recorded by this call, so the
            caller can forget them again if it rolls the insert back.

    Returns:
        True if a new row was inserted, False otherwise.
    """
    if path in existing_paths or path in created_paths:
        return False

    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))
    row = cursor.fetchone()
    if not row:
        return False

    existing_paths[path] = row[0]
    created_paths.add(path)
    registered.append(path)
    return True


def _apply_moves(cursor, moves, existing_paths):
    """Re-parent every item in ``moves`` under ``<sector>.<business type>.General``.

    Each item is processed inside its own savepoint. A failing SQL statement
    would otherwise leave the surrounding transaction aborted, making every
    later statement (and the final count refresh) fail as well.

    Args:
        cursor: Open psycopg2 cursor.
        moves: Iterable of ``(item_id, name, slug, sector, btype)`` tuples.
        existing_paths: Dict mapping path text to row id; extended with any
            categories created here.

    Returns:
        Tuple ``(updated, created_paths, errors)``: the number of items moved,
        the set of category paths created, and a list of ``(name, message)``
        pairs for items that could not be moved.
    """
    created_paths = set()
    updated = 0
    errors = []

    for item_id, name, slug, sector, btype in moves:
        # Paths cached while handling this item; dropped again on rollback.
        registered = []
        in_savepoint = False
        try:
            sector_path = slugify(sector)
            btype_slug = slugify(btype)

            # The sector (level 1) must already exist; it is never created here.
            if sector_path not in existing_paths:
                print(f" [ERROR] Sector not found: {sector_path} for '{name}'")
                errors.append((name, f"Sector not found: {sector_path}"))
                continue

            cursor.execute("SAVEPOINT recategorize_item")
            in_savepoint = True

            # Check/create business type (level 2).
            btype_path = f"{sector_path}.{btype_slug}"
            if _ensure_category(cursor, btype, btype_slug, btype_path,
                                sector_path, 2, existing_paths,
                                created_paths, registered):
                print(f" [NEW] Created business type: {btype_path}")

            # Check/create sub-category (level 3); "General" is the default.
            subcat_path = f"{btype_path}.General"
            if _ensure_category(cursor, "General", "General", subcat_path,
                                btype_path, 3, existing_paths,
                                created_paths, registered):
                print(f" [NEW] Created sub-category: {subcat_path}")

            # Move the item itself below the sub-category.
            new_path = f"{subcat_path}.{slug}"
            cursor.execute("""
                UPDATE gbp_categories
                SET path = %s::ltree,
                    parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
                WHERE id = %s
            """, (new_path, subcat_path, item_id))

            cursor.execute("RELEASE SAVEPOINT recategorize_item")
            in_savepoint = False
            updated += 1

        except Exception as e:
            if in_savepoint:
                # Undo only this item's statements so the transaction stays
                # usable, and forget categories whose INSERT was just undone.
                cursor.execute("ROLLBACK TO SAVEPOINT recategorize_item")
                cursor.execute("RELEASE SAVEPOINT recategorize_item")
                for path in registered:
                    existing_paths.pop(path, None)
                    created_paths.discard(path)
            errors.append((name, str(e)))
            print(f" [ERROR] {name}: {e}")

    return updated, created_paths, errors


def _refresh_category_counts(cursor):
    """Recompute ``category_count`` as the number of direct children per row."""
    cursor.execute("""
        WITH counts AS (
            SELECT
                parent_id,
                COUNT(*) as cnt
            FROM gbp_categories
            WHERE parent_id IS NOT NULL
            GROUP BY parent_id
        )
        UPDATE gbp_categories g
        SET category_count = COALESCE(c.cnt, 0)
        FROM counts c
        WHERE g.id = c.parent_id
    """)

    # Also reset counts for categories that no longer have children.
    cursor.execute("""
        UPDATE gbp_categories
        SET category_count = 0
        WHERE id NOT IN (
            SELECT DISTINCT parent_id FROM gbp_categories WHERE parent_id IS NOT NULL
        )
        AND level < 4
    """)


def main():
    """Interactively move items out of Other.Uncategorized.

    Steps:
      1. Load every level-4 row whose path matches ``Other.Uncategorized.*``.
      2. Ask ``get_sector_for_item`` / ``get_business_type_for_item`` for a
         sector and business type; items whose sector is ``'Other'`` stay put.
      3. Print the planned distribution and ask for confirmation on stdin.
      4. Create missing level-2/level-3 categories and re-parent each item.
      5. Refresh ``category_count``, commit, and print a summary.

    The connection is always closed, including on abort or on an unexpected
    error; uncommitted work is then discarded with the connection.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized")

        # Get existing paths
        cursor.execute("SELECT path::text, id FROM gbp_categories")
        existing_paths = {row[0]: row[1] for row in cursor.fetchall()}
        print(f"Found {len(existing_paths)} existing paths")

        # Categorize items
        moves = []  # (item_id, item_name, item_slug, new_sector, new_btype)
        stats = defaultdict(int)

        for item_id, name, slug in other_items:
            sector = get_sector_for_item(name)
            btype = get_business_type_for_item(name, sector)

            if sector != 'Other':
                moves.append((item_id, name, slug, sector, btype))
                stats[sector] += 1
            else:
                stats['Still_Other'] += 1

        print("\nCategorization results:")
        for sector, count in sorted(stats.items(), key=lambda x: -x[1]):
            print(f" {sector}: {count}")

        print(f"\nTotal to move: {len(moves)}")
        print(f"Remaining in Other: {stats.get('Still_Other', 0)}")

        # Ask for confirmation; surrounding whitespace in the answer is ignored.
        response = input("\nProceed with database updates? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborted.")
            return

        # Process moves
        updated, created_paths, errors = _apply_moves(cursor, moves, existing_paths)

        # Update category counts
        print("\nUpdating category counts...")
        _refresh_category_counts(cursor)

        conn.commit()

        print(f"\n{'='*60}")
        print("SUMMARY")
        print(f"{'='*60}")
        print(f"Items moved: {updated}")
        print(f"New paths created: {len(created_paths)}")
        print(f"Errors: {len(errors)}")

        if errors:
            print("\nErrors:")
            for name, err in errors[:10]:
                print(f" - {name}: {err}")
            if len(errors) > 10:
                print(f" ... and {len(errors) - 10} more")

        # Show final stats
        cursor.execute("""
            SELECT
                SPLIT_PART(path::text, '.', 1) as sector,
                COUNT(*) as count
            FROM gbp_categories
            WHERE level = 4
            GROUP BY sector
            ORDER BY count DESC
        """)
        print("\nFinal category distribution:")
        for sector, count in cursor.fetchall():
            print(f" {sector}: {count}")
    finally:
        conn.close()
|
||||
|
||||
# Run the recategorization only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()
|
||||
977
db/import_categories.py
Normal file
977
db/import_categories.py
Normal file
@@ -0,0 +1,977 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Import Google Business Profile categories into PostgreSQL with ltree hierarchy.
|
||||
|
||||
Usage:
|
||||
python import_categories.py [--csv-path PATH] [--db-url URL]
|
||||
|
||||
Example:
|
||||
python import_categories.py --csv-path ./categories.csv --db-url postgresql://scraper:scraper123@localhost:5437/scraper
|
||||
"""
|
||||
|
||||
import csv
|
||||
import re
|
||||
import os
|
||||
import argparse
|
||||
from typing import Optional
|
||||
|
||||
try:
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
HAS_PSYCOPG2 = True
|
||||
except ImportError:
|
||||
HAS_PSYCOPG2 = False
|
||||
|
||||
# Default paths
|
||||
# Category list CSV inside the current user's ~/Downloads folder.
# Presumably the fallback when --csv-path is not given (usage is outside
# this view -- confirm).
DEFAULT_CSV_PATH = os.path.expanduser("~/Downloads/Google Business Profile Categories (2025 List) - Category List (English).csv")
|
||||
# PostgreSQL connection string; same value as the --db-url example in the
# module docstring. NOTE(review): user name and password are embedded here.
DEFAULT_DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||
|
||||
|
||||
def slugify(text: str) -> str:
    """Build a slug from *text* for use as a path label.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` is replaced
    by one underscore and leading/trailing underscores are trimmed. A result
    that does not begin with a letter is prefixed with ``cat_``; an empty
    result becomes ``'unknown'``.
    """
    label = re.sub(r'[^a-zA-Z0-9]+', '_', text).strip('_')
    if not label:
        # Nothing usable survived the clean-up.
        return 'unknown'
    if label[0].isalpha():
        return label
    # Leading digit: add a prefix so the label starts with a letter.
    return 'cat_' + label
|
||||
|
||||
|
||||
def categorize_category(cat: str) -> tuple:
|
||||
"""
|
||||
Categorize a GBP category into 4-level hierarchy.
|
||||
Returns: (level1, level2, level3, level4)
|
||||
"""
|
||||
c = cat.lower()
|
||||
|
||||
# === FOOD & DINING ===
|
||||
if 'restaurant' in c:
|
||||
if any(x in c for x in ['fast food', 'drive-in', 'takeaway', 'takeout', 'quick service']):
|
||||
return ("Food & Dining", "Restaurants", "Fast Food & Quick Service", cat)
|
||||
# Cuisine types
|
||||
return ("Food & Dining", "Restaurants", "By Cuisine", cat)
|
||||
|
||||
if any(x in c for x in ['cafe', 'coffee shop', 'tea house', 'tea room', 'espresso bar']):
|
||||
return ("Food & Dining", "Cafes & Coffee", "Coffee Shops", cat)
|
||||
|
||||
if any(x in c for x in ['bar', 'pub', 'nightclub', 'night club', 'cocktail', 'wine bar', 'beer', 'lounge']):
|
||||
if 'gay' in c or 'lesbian' in c:
|
||||
return ("Food & Dining", "Bars & Nightlife", "LGBTQ+ Venues", cat)
|
||||
if 'karaoke' in c:
|
||||
return ("Food & Dining", "Bars & Nightlife", "Karaoke", cat)
|
||||
return ("Food & Dining", "Bars & Nightlife", "Bars & Pubs", cat)
|
||||
|
||||
if any(x in c for x in ['bakery', 'pastry', 'cake', 'donut', 'dessert', 'ice cream', 'frozen yogurt', 'candy', 'chocolate', 'confection']):
|
||||
return ("Food & Dining", "Bakeries & Desserts", "Sweet Shops", cat)
|
||||
|
||||
if any(x in c for x in ['caterer', 'catering']):
|
||||
return ("Food & Dining", "Food Services", "Catering", cat)
|
||||
|
||||
if any(x in c for x in ['brewery', 'winery', 'distillery', 'vineyard']):
|
||||
return ("Food & Dining", "Beverage Production", "Producers", cat)
|
||||
|
||||
if any(x in c for x in ['food truck', 'food stand', 'food stall', 'food court']):
|
||||
return ("Food & Dining", "Quick Service", "Street Food", cat)
|
||||
|
||||
# === RETAIL & SHOPPING ===
|
||||
if 'store' in c or 'shop' in c:
|
||||
if any(x in c for x in ['clothing', 'fashion', 'shoe', 'dress', 'apparel', 'wear', 'boutique', 'tailor']):
|
||||
return ("Retail & Shopping", "Clothing & Fashion", "Apparel Stores", cat)
|
||||
if any(x in c for x in ['electronic', 'computer', 'phone', 'appliance', 'tv', 'audio', 'video game']):
|
||||
return ("Retail & Shopping", "Electronics", "Electronics Stores", cat)
|
||||
if any(x in c for x in ['furniture', 'home decor', 'kitchen', 'bed', 'mattress', 'carpet', 'curtain', 'lighting']):
|
||||
return ("Retail & Shopping", "Home & Garden", "Home Furnishings", cat)
|
||||
if any(x in c for x in ['grocery', 'supermarket', 'food', 'beverage', 'wine', 'liquor', 'butcher', 'fish', 'fruit', 'vegetable']):
|
||||
return ("Retail & Shopping", "Food & Grocery", "Grocery Stores", cat)
|
||||
if any(x in c for x in ['book', 'stationery', 'office supply', 'paper']):
|
||||
return ("Retail & Shopping", "Books & Office", "Book Stores", cat)
|
||||
if any(x in c for x in ['pet', 'animal']):
|
||||
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
||||
if any(x in c for x in ['toy', 'game', 'hobby']):
|
||||
return ("Retail & Shopping", "Toys & Hobbies", "Toy Stores", cat)
|
||||
if any(x in c for x in ['jewelry', 'watch', 'gold', 'diamond']):
|
||||
return ("Retail & Shopping", "Jewelry & Watches", "Jewelry Stores", cat)
|
||||
if any(x in c for x in ['sport', 'athletic', 'fitness', 'outdoor', 'camping', 'fishing', 'hunting']):
|
||||
return ("Retail & Shopping", "Sports & Outdoors", "Sporting Goods", cat)
|
||||
if any(x in c for x in ['music', 'instrument', 'record', 'vinyl']):
|
||||
return ("Retail & Shopping", "Music & Entertainment", "Music Stores", cat)
|
||||
if any(x in c for x in ['art', 'craft', 'fabric', 'sewing', 'yarn', 'knitting']):
|
||||
return ("Retail & Shopping", "Arts & Crafts", "Art Supply Stores", cat)
|
||||
if any(x in c for x in ['beauty', 'cosmetic', 'perfume', 'makeup']):
|
||||
return ("Retail & Shopping", "Beauty & Cosmetics", "Beauty Stores", cat)
|
||||
if any(x in c for x in ['pharmacy', 'drug', 'medicine', 'health']):
|
||||
return ("Retail & Shopping", "Health & Pharmacy", "Pharmacies", cat)
|
||||
if any(x in c for x in ['garden', 'plant', 'flower', 'nursery', 'landscap']):
|
||||
return ("Retail & Shopping", "Home & Garden", "Garden Centers", cat)
|
||||
if any(x in c for x in ['hardware', 'tool', 'building', 'lumber', 'paint']):
|
||||
return ("Retail & Shopping", "Hardware & Building", "Hardware Stores", cat)
|
||||
if any(x in c for x in ['antique', 'vintage', 'thrift', 'consignment', 'second hand', 'used']):
|
||||
return ("Retail & Shopping", "Secondhand & Vintage", "Thrift Stores", cat)
|
||||
return ("Retail & Shopping", "Specialty Retail", "Other Stores", cat)
|
||||
|
||||
if any(x in c for x in ['supplier', 'wholesaler', 'distributor', 'exporter', 'importer']):
|
||||
if any(x in c for x in ['food', 'beverage', 'meat', 'seafood', 'produce']):
|
||||
return ("Retail & Shopping", "Wholesale & Distribution", "Food Wholesale", cat)
|
||||
if any(x in c for x in ['building', 'construction', 'lumber', 'concrete', 'steel']):
|
||||
return ("Retail & Shopping", "Wholesale & Distribution", "Building Materials", cat)
|
||||
if any(x in c for x in ['industrial', 'machinery', 'equipment']):
|
||||
return ("Retail & Shopping", "Wholesale & Distribution", "Industrial Supplies", cat)
|
||||
return ("Retail & Shopping", "Wholesale & Distribution", "General Wholesale", cat)
|
||||
|
||||
if 'market' in c and 'marketing' not in c:
|
||||
if 'flea' in c or 'antique' in c:
|
||||
return ("Retail & Shopping", "Markets", "Flea Markets", cat)
|
||||
if 'farmer' in c:
|
||||
return ("Retail & Shopping", "Markets", "Farmers Markets", cat)
|
||||
return ("Retail & Shopping", "Markets", "General Markets", cat)
|
||||
|
||||
# === AUTOMOTIVE ===
|
||||
if 'dealer' in c:
|
||||
car_brands = ['abarth', 'acura', 'alfa romeo', 'aston martin', 'audi', 'bentley', 'bmw', 'bugatti',
|
||||
'buick', 'cadillac', 'chevrolet', 'chrysler', 'citroen', 'cupra', 'dacia', 'daihatsu',
|
||||
'dodge', 'ferrari', 'fiat', 'ford', 'genesis', 'gmc', 'honda', 'hummer', 'hyundai',
|
||||
'infiniti', 'isuzu', 'jaguar', 'jeep', 'kia', 'lamborghini', 'lancia', 'land rover',
|
||||
'lexus', 'lincoln', 'lotus', 'maserati', 'mazda', 'mclaren', 'mercedes', 'mini',
|
||||
'mitsubishi', 'nissan', 'opel', 'peugeot', 'porsche', 'ram', 'renault', 'rolls-royce',
|
||||
'saab', 'seat', 'skoda', 'smart', 'subaru', 'suzuki', 'tesla', 'toyota', 'volkswagen',
|
||||
'volvo', 'yamaha', 'harley', 'ducati', 'kawasaki', 'triumph', 'vespa', 'piaggio']
|
||||
if any(b in c for b in car_brands):
|
||||
if 'motorcycle' in c or any(x in c for x in ['harley', 'ducati', 'kawasaki', 'triumph', 'vespa']):
|
||||
return ("Automotive", "Dealers", "Motorcycle Brands", cat)
|
||||
return ("Automotive", "Dealers", "Car Brands", cat)
|
||||
if any(x in c for x in ['motorcycle', 'scooter', 'moped']):
|
||||
return ("Automotive", "Dealers", "Motorcycle Dealers", cat)
|
||||
if any(x in c for x in ['truck', 'commercial vehicle', 'trailer']):
|
||||
return ("Automotive", "Dealers", "Truck & Commercial", cat)
|
||||
if any(x in c for x in ['boat', 'yacht', 'marine', 'jet ski']):
|
||||
return ("Automotive", "Dealers", "Marine & Boats", cat)
|
||||
if any(x in c for x in ['rv', 'camper', 'motorhome', 'caravan']):
|
||||
return ("Automotive", "Dealers", "RV & Campers", cat)
|
||||
if any(x in c for x in ['atv', 'quad', 'off-road', 'utv']):
|
||||
return ("Automotive", "Dealers", "ATV & Off-Road", cat)
|
||||
if 'used' in c or 'pre-owned' in c:
|
||||
return ("Automotive", "Dealers", "Used Vehicles", cat)
|
||||
return ("Automotive", "Dealers", "Other Dealers", cat)
|
||||
|
||||
if any(x in c for x in ['car wash', 'auto detailing', 'car detailing']):
|
||||
return ("Automotive", "Vehicle Care", "Cleaning & Detailing", cat)
|
||||
|
||||
if any(x in c for x in ['car rental', 'auto rental', 'vehicle rental', 'truck rental']):
|
||||
return ("Automotive", "Rental Services", "Vehicle Rental", cat)
|
||||
|
||||
if any(x in c for x in ['car repair', 'auto repair', 'mechanic', 'garage', 'auto body', 'collision']):
|
||||
return ("Automotive", "Repair & Maintenance", "Auto Repair", cat)
|
||||
|
||||
if any(x in c for x in ['tire', 'tyre', 'wheel']):
|
||||
return ("Automotive", "Parts & Accessories", "Tires & Wheels", cat)
|
||||
|
||||
if any(x in c for x in ['auto part', 'car part', 'auto accessories']):
|
||||
return ("Automotive", "Parts & Accessories", "Auto Parts", cat)
|
||||
|
||||
if any(x in c for x in ['driving school', 'driving instruction']):
|
||||
return ("Automotive", "Training", "Driving Schools", cat)
|
||||
|
||||
if any(x in c for x in ['parking', 'car park', 'garage']):
|
||||
if 'repair' not in c and 'mechanic' not in c:
|
||||
return ("Automotive", "Parking", "Parking Facilities", cat)
|
||||
|
||||
if any(x in c for x in ['gas station', 'petrol', 'fuel', 'charging station', 'ev charging']):
|
||||
return ("Automotive", "Fuel & Charging", "Fuel Stations", cat)
|
||||
|
||||
# === HEALTHCARE ===
|
||||
if any(x in c for x in ['hospital']):
|
||||
if 'animal' in c or 'veterinar' in c:
|
||||
return ("Healthcare", "Veterinary", "Animal Hospitals", cat)
|
||||
if 'children' in c or 'pediatric' in c:
|
||||
return ("Healthcare", "Hospitals", "Pediatric Hospitals", cat)
|
||||
if 'mental' in c or 'psychiatric' in c:
|
||||
return ("Healthcare", "Mental Health", "Psychiatric Hospitals", cat)
|
||||
return ("Healthcare", "Hospitals", "General Hospitals", cat)
|
||||
|
||||
if any(x in c for x in ['clinic']):
|
||||
if 'dental' in c:
|
||||
return ("Healthcare", "Dental", "Dental Clinics", cat)
|
||||
if 'eye' in c or 'vision' in c or 'optical' in c:
|
||||
return ("Healthcare", "Vision Care", "Eye Clinics", cat)
|
||||
if 'fertility' in c or 'ivf' in c:
|
||||
return ("Healthcare", "Specialty Care", "Fertility Clinics", cat)
|
||||
if 'skin' in c or 'dermatol' in c:
|
||||
return ("Healthcare", "Specialty Care", "Dermatology", cat)
|
||||
if 'physical therapy' in c or 'physiotherapy' in c or 'rehab' in c:
|
||||
return ("Healthcare", "Rehabilitation", "Physical Therapy", cat)
|
||||
return ("Healthcare", "Clinics", "Medical Clinics", cat)
|
||||
|
||||
if any(x in c for x in ['doctor', 'physician']):
|
||||
return ("Healthcare", "Medical Practitioners", "Doctors", cat)
|
||||
|
||||
if any(x in c for x in ['dentist', 'dental', 'orthodont', 'endodont', 'periodont']):
|
||||
return ("Healthcare", "Dental", "Dental Services", cat)
|
||||
|
||||
if any(x in c for x in ['surgeon', 'surgery']):
|
||||
if 'plastic' in c or 'cosmetic' in c:
|
||||
return ("Healthcare", "Specialty Care", "Cosmetic Surgery", cat)
|
||||
return ("Healthcare", "Medical Practitioners", "Surgeons", cat)
|
||||
|
||||
if any(x in c for x in ['psycholog', 'psychiatr', 'mental health', 'counselor', 'therapist']):
|
||||
if 'marriage' in c or 'family' in c:
|
||||
return ("Healthcare", "Mental Health", "Family Counseling", cat)
|
||||
if 'addiction' in c or 'substance' in c:
|
||||
return ("Healthcare", "Mental Health", "Addiction Treatment", cat)
|
||||
return ("Healthcare", "Mental Health", "Mental Health Services", cat)
|
||||
|
||||
if any(x in c for x in ['chiropract']):
|
||||
return ("Healthcare", "Alternative Medicine", "Chiropractic", cat)
|
||||
|
||||
if any(x in c for x in ['acupuncture', 'acupuncturist']):
|
||||
return ("Healthcare", "Alternative Medicine", "Acupuncture", cat)
|
||||
|
||||
if any(x in c for x in ['naturopath', 'homeopath', 'ayurved', 'holistic']):
|
||||
return ("Healthcare", "Alternative Medicine", "Natural Medicine", cat)
|
||||
|
||||
if any(x in c for x in ['optometrist', 'optician', 'eye doctor', 'ophthalmol']):
|
||||
return ("Healthcare", "Vision Care", "Eye Care", cat)
|
||||
|
||||
if any(x in c for x in ['pharmacy', 'drugstore', 'apothecary']):
|
||||
return ("Healthcare", "Pharmacies", "Retail Pharmacies", cat)
|
||||
|
||||
if any(x in c for x in ['veterinar', 'vet ', 'animal clinic', 'pet clinic']):
|
||||
return ("Healthcare", "Veterinary", "Veterinary Services", cat)
|
||||
|
||||
if any(x in c for x in ['nursing home', 'assisted living', 'senior care', 'elder care', 'retirement home']):
|
||||
return ("Healthcare", "Senior Care", "Senior Living", cat)
|
||||
|
||||
if any(x in c for x in ['lab', 'laboratory', 'diagnostic', 'imaging', 'x-ray', 'mri', 'radiology']):
|
||||
return ("Healthcare", "Diagnostics", "Medical Labs", cat)
|
||||
|
||||
if any(x in c for x in ['ambulance', 'emergency', 'urgent care']):
|
||||
return ("Healthcare", "Emergency Services", "Emergency Care", cat)
|
||||
|
||||
# === EDUCATION ===
|
||||
if 'school' in c or 'academy' in c:
|
||||
if any(x in c for x in ['preschool', 'kindergarten', 'nursery', 'daycare', 'pre-school']):
|
||||
return ("Education", "Early Childhood", "Preschools", cat)
|
||||
if any(x in c for x in ['elementary', 'primary']):
|
||||
return ("Education", "K-12 Schools", "Elementary Schools", cat)
|
||||
if any(x in c for x in ['middle', 'junior high']):
|
||||
return ("Education", "K-12 Schools", "Middle Schools", cat)
|
||||
if any(x in c for x in ['high school', 'secondary']):
|
||||
return ("Education", "K-12 Schools", "High Schools", cat)
|
||||
if any(x in c for x in ['boarding']):
|
||||
return ("Education", "K-12 Schools", "Boarding Schools", cat)
|
||||
if any(x in c for x in ['driving']):
|
||||
return ("Automotive", "Training", "Driving Schools", cat)
|
||||
if any(x in c for x in ['language', 'english', 'spanish', 'french', 'german', 'chinese', 'japanese']):
|
||||
return ("Education", "Language Learning", "Language Schools", cat)
|
||||
if any(x in c for x in ['art', 'music', 'dance', 'drama', 'theater', 'acting']):
|
||||
return ("Education", "Arts Education", "Arts Schools", cat)
|
||||
if any(x in c for x in ['martial art', 'karate', 'judo', 'taekwondo', 'kung fu', 'aikido', 'boxing']):
|
||||
return ("Education", "Sports Training", "Martial Arts Schools", cat)
|
||||
if any(x in c for x in ['beauty', 'cosmetology', 'barber']):
|
||||
return ("Education", "Vocational Training", "Beauty Schools", cat)
|
||||
if any(x in c for x in ['cooking', 'culinary', 'chef']):
|
||||
return ("Education", "Vocational Training", "Culinary Schools", cat)
|
||||
if any(x in c for x in ['business', 'mba']):
|
||||
return ("Education", "Higher Education", "Business Schools", cat)
|
||||
if any(x in c for x in ['medical', 'nursing', 'dental']):
|
||||
return ("Education", "Higher Education", "Medical Schools", cat)
|
||||
if any(x in c for x in ['law']):
|
||||
return ("Education", "Higher Education", "Law Schools", cat)
|
||||
if any(x in c for x in ['flight', 'aviation', 'pilot']):
|
||||
return ("Education", "Vocational Training", "Aviation Schools", cat)
|
||||
if any(x in c for x in ['computer', 'it ', 'coding', 'programming', 'software']):
|
||||
return ("Education", "Technology Training", "Computer Schools", cat)
|
||||
if any(x in c for x in ['trade', 'technical', 'vocational']):
|
||||
return ("Education", "Vocational Training", "Trade Schools", cat)
|
||||
return ("Education", "Specialty Schools", "Other Schools", cat)
|
||||
|
||||
if any(x in c for x in ['university', 'college']):
|
||||
if 'community' in c:
|
||||
return ("Education", "Higher Education", "Community Colleges", cat)
|
||||
return ("Education", "Higher Education", "Universities", cat)
|
||||
|
||||
if any(x in c for x in ['tutor', 'tutoring']):
|
||||
return ("Education", "Tutoring", "Private Tutoring", cat)
|
||||
|
||||
if any(x in c for x in ['training center', 'training program', 'training institute']):
|
||||
return ("Education", "Professional Training", "Training Centers", cat)
|
||||
|
||||
if any(x in c for x in ['library']):
|
||||
return ("Education", "Libraries", "Public Libraries", cat)
|
||||
|
||||
# === PROFESSIONAL SERVICES ===
|
||||
if any(x in c for x in ['lawyer', 'attorney', 'law firm', 'legal']):
|
||||
if any(x in c for x in ['immigration']):
|
||||
return ("Professional Services", "Legal", "Immigration Law", cat)
|
||||
if any(x in c for x in ['criminal', 'defense']):
|
||||
return ("Professional Services", "Legal", "Criminal Law", cat)
|
||||
if any(x in c for x in ['family', 'divorce']):
|
||||
return ("Professional Services", "Legal", "Family Law", cat)
|
||||
if any(x in c for x in ['personal injury', 'accident']):
|
||||
return ("Professional Services", "Legal", "Personal Injury", cat)
|
||||
if any(x in c for x in ['real estate', 'property']):
|
||||
return ("Professional Services", "Legal", "Real Estate Law", cat)
|
||||
if any(x in c for x in ['business', 'corporate', 'commercial']):
|
||||
return ("Professional Services", "Legal", "Business Law", cat)
|
||||
return ("Professional Services", "Legal", "General Legal", cat)
|
||||
|
||||
if any(x in c for x in ['accountant', 'accounting', 'bookkeep', 'tax']):
|
||||
return ("Professional Services", "Financial Services", "Accounting", cat)
|
||||
|
||||
if any(x in c for x in ['consultant', 'consulting', 'advisor']):
|
||||
if any(x in c for x in ['business', 'management']):
|
||||
return ("Professional Services", "Consulting", "Business Consulting", cat)
|
||||
if any(x in c for x in ['it ', 'technology', 'computer']):
|
||||
return ("Professional Services", "Consulting", "IT Consulting", cat)
|
||||
if any(x in c for x in ['marketing', 'advertising']):
|
||||
return ("Professional Services", "Consulting", "Marketing Consulting", cat)
|
||||
return ("Professional Services", "Consulting", "General Consulting", cat)
|
||||
|
||||
if any(x in c for x in ['notary', 'notarial']):
|
||||
return ("Professional Services", "Legal", "Notary Services", cat)
|
||||
|
||||
if any(x in c for x in ['architect', 'architecture']):
|
||||
return ("Professional Services", "Design", "Architecture", cat)
|
||||
|
||||
if any(x in c for x in ['engineer', 'engineering']):
|
||||
if 'civil' in c:
|
||||
return ("Professional Services", "Engineering", "Civil Engineering", cat)
|
||||
if 'structural' in c:
|
||||
return ("Professional Services", "Engineering", "Structural Engineering", cat)
|
||||
if 'mechanical' in c:
|
||||
return ("Professional Services", "Engineering", "Mechanical Engineering", cat)
|
||||
if 'electrical' in c:
|
||||
return ("Professional Services", "Engineering", "Electrical Engineering", cat)
|
||||
return ("Professional Services", "Engineering", "General Engineering", cat)
|
||||
|
||||
if any(x in c for x in ['agency']):
|
||||
if any(x in c for x in ['advertising', 'marketing', 'creative', 'digital']):
|
||||
return ("Professional Services", "Marketing & Advertising", "Agencies", cat)
|
||||
if any(x in c for x in ['real estate', 'property']):
|
||||
return ("Real Estate", "Agencies", "Real Estate Agencies", cat)
|
||||
if any(x in c for x in ['insurance']):
|
||||
return ("Finance & Insurance", "Insurance", "Insurance Agencies", cat)
|
||||
if any(x in c for x in ['travel', 'tour']):
|
||||
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
||||
if any(x in c for x in ['employment', 'staffing', 'recruitment', 'temp']):
|
||||
return ("Professional Services", "HR Services", "Staffing Agencies", cat)
|
||||
return ("Professional Services", "Agencies", "Other Agencies", cat)
|
||||
|
||||
if any(x in c for x in ['photographer', 'photography', 'photo studio']):
|
||||
return ("Professional Services", "Creative Services", "Photography", cat)
|
||||
|
||||
if any(x in c for x in ['graphic design', 'web design', 'design studio']):
|
||||
return ("Professional Services", "Creative Services", "Design Services", cat)
|
||||
|
||||
if any(x in c for x in ['translator', 'translation', 'interpreter']):
|
||||
return ("Professional Services", "Language Services", "Translation", cat)
|
||||
|
||||
if any(x in c for x in ['printing', 'print shop', 'copy']):
|
||||
return ("Professional Services", "Business Services", "Printing Services", cat)
|
||||
|
||||
# === HOME SERVICES ===
|
||||
if any(x in c for x in ['plumber', 'plumbing']):
|
||||
return ("Home Services", "Plumbing", "Plumbers", cat)
|
||||
|
||||
if any(x in c for x in ['electrician', 'electrical']):
|
||||
if 'contractor' in c or 'service' in c:
|
||||
return ("Home Services", "Electrical", "Electricians", cat)
|
||||
|
||||
if any(x in c for x in ['hvac', 'air conditioning', 'heating', 'furnace']):
|
||||
return ("Home Services", "HVAC", "Heating & Cooling", cat)
|
||||
|
||||
if any(x in c for x in ['roofing', 'roofer']):
|
||||
return ("Home Services", "Roofing", "Roofing Services", cat)
|
||||
|
||||
if any(x in c for x in ['painter', 'painting']):
|
||||
if 'house' in c or 'residential' in c or 'contractor' in c:
|
||||
return ("Home Services", "Painting", "House Painters", cat)
|
||||
|
||||
if any(x in c for x in ['landscap', 'lawn', 'garden']):
|
||||
if 'service' in c or 'company' in c or 'contractor' in c:
|
||||
return ("Home Services", "Landscaping", "Landscaping Services", cat)
|
||||
|
||||
if any(x in c for x in ['cleaning service', 'maid', 'housekeep', 'janitorial']):
|
||||
return ("Home Services", "Cleaning", "Cleaning Services", cat)
|
||||
|
||||
if any(x in c for x in ['pest control', 'exterminator']):
|
||||
return ("Home Services", "Pest Control", "Exterminators", cat)
|
||||
|
||||
if any(x in c for x in ['locksmith']):
|
||||
return ("Home Services", "Security", "Locksmiths", cat)
|
||||
|
||||
if any(x in c for x in ['moving company', 'mover', 'relocation']):
|
||||
return ("Home Services", "Moving", "Moving Services", cat)
|
||||
|
||||
if any(x in c for x in ['contractor']):
|
||||
if 'general' in c:
|
||||
return ("Home Services", "Construction", "General Contractors", cat)
|
||||
return ("Home Services", "Construction", "Contractors", cat)
|
||||
|
||||
if any(x in c for x in ['carpenter', 'carpentry']):
|
||||
return ("Home Services", "Construction", "Carpenters", cat)
|
||||
|
||||
if any(x in c for x in ['flooring', 'floor']):
|
||||
if 'service' in c or 'contractor' in c or 'installation' in c:
|
||||
return ("Home Services", "Flooring", "Floor Installation", cat)
|
||||
|
||||
if any(x in c for x in ['window', 'glass']):
|
||||
if 'repair' in c or 'installation' in c or 'service' in c:
|
||||
return ("Home Services", "Windows & Doors", "Window Services", cat)
|
||||
|
||||
if any(x in c for x in ['pool', 'spa']):
|
||||
if 'service' in c or 'cleaning' in c or 'maintenance' in c:
|
||||
return ("Home Services", "Pool & Spa", "Pool Services", cat)
|
||||
|
||||
if any(x in c for x in ['appliance repair', 'appliance service']):
|
||||
return ("Home Services", "Appliance Repair", "Appliance Services", cat)
|
||||
|
||||
if any(x in c for x in ['handyman']):
|
||||
return ("Home Services", "General Repair", "Handyman Services", cat)
|
||||
|
||||
if any(x in c for x in ['interior design', 'decorator']):
|
||||
return ("Home Services", "Design", "Interior Design", cat)
|
||||
|
||||
# === PERSONAL SERVICES ===
|
||||
if any(x in c for x in ['salon', 'hair', 'hairdress', 'stylist']):
|
||||
return ("Personal Services", "Hair Care", "Hair Salons", cat)
|
||||
|
||||
if any(x in c for x in ['barber']):
|
||||
if 'shop' in c or not 'school' in c:
|
||||
return ("Personal Services", "Hair Care", "Barber Shops", cat)
|
||||
|
||||
if any(x in c for x in ['nail', 'manicure', 'pedicure']):
|
||||
return ("Personal Services", "Nail Care", "Nail Salons", cat)
|
||||
|
||||
if any(x in c for x in ['spa']):
|
||||
if 'day spa' in c or 'medical spa' in c or ('service' not in c and 'pool' not in c):
|
||||
return ("Personal Services", "Spa & Wellness", "Day Spas", cat)
|
||||
|
||||
if any(x in c for x in ['massage']):
|
||||
return ("Personal Services", "Massage", "Massage Therapy", cat)
|
||||
|
||||
if any(x in c for x in ['beauty']):
|
||||
if 'salon' in c or 'parlor' in c:
|
||||
return ("Personal Services", "Beauty", "Beauty Salons", cat)
|
||||
|
||||
if any(x in c for x in ['tattoo']):
|
||||
return ("Personal Services", "Body Art", "Tattoo Shops", cat)
|
||||
|
||||
if any(x in c for x in ['piercing']):
|
||||
return ("Personal Services", "Body Art", "Piercing Studios", cat)
|
||||
|
||||
if any(x in c for x in ['tanning']):
|
||||
return ("Personal Services", "Tanning", "Tanning Salons", cat)
|
||||
|
||||
if any(x in c for x in ['tailor', 'alteration', 'seamstress']):
|
||||
return ("Personal Services", "Clothing Care", "Tailoring", cat)
|
||||
|
||||
if any(x in c for x in ['dry clean', 'laundry', 'laundromat']):
|
||||
return ("Personal Services", "Laundry", "Laundry Services", cat)
|
||||
|
||||
if any(x in c for x in ['personal trainer', 'fitness trainer']):
|
||||
return ("Personal Services", "Fitness", "Personal Training", cat)
|
||||
|
||||
# === ENTERTAINMENT & RECREATION ===
|
||||
if any(x in c for x in ['movie theater', 'cinema', 'multiplex']):
|
||||
return ("Entertainment", "Movies", "Movie Theaters", cat)
|
||||
|
||||
if any(x in c for x in ['theater', 'theatre']):
|
||||
if 'movie' not in c:
|
||||
return ("Entertainment", "Performing Arts", "Theaters", cat)
|
||||
|
||||
if any(x in c for x in ['museum']):
|
||||
if 'art' in c:
|
||||
return ("Entertainment", "Museums", "Art Museums", cat)
|
||||
if 'history' in c or 'historical' in c:
|
||||
return ("Entertainment", "Museums", "History Museums", cat)
|
||||
if 'science' in c or 'natural' in c:
|
||||
return ("Entertainment", "Museums", "Science Museums", cat)
|
||||
if 'children' in c or 'kid' in c:
|
||||
return ("Entertainment", "Museums", "Children's Museums", cat)
|
||||
return ("Entertainment", "Museums", "General Museums", cat)
|
||||
|
||||
if any(x in c for x in ['art gallery', 'gallery']):
|
||||
return ("Entertainment", "Arts", "Art Galleries", cat)
|
||||
|
||||
if any(x in c for x in ['amusement park', 'theme park', 'water park']):
|
||||
return ("Entertainment", "Amusement", "Theme Parks", cat)
|
||||
|
||||
if any(x in c for x in ['zoo', 'aquarium', 'wildlife']):
|
||||
return ("Entertainment", "Wildlife", "Zoos & Aquariums", cat)
|
||||
|
||||
if any(x in c for x in ['bowling']):
|
||||
return ("Entertainment", "Games & Recreation", "Bowling", cat)
|
||||
|
||||
if any(x in c for x in ['arcade', 'video game']):
|
||||
return ("Entertainment", "Games & Recreation", "Arcades", cat)
|
||||
|
||||
if any(x in c for x in ['escape room']):
|
||||
return ("Entertainment", "Games & Recreation", "Escape Rooms", cat)
|
||||
|
||||
if any(x in c for x in ['casino', 'gambling']):
|
||||
return ("Entertainment", "Gambling", "Casinos", cat)
|
||||
|
||||
if any(x in c for x in ['concert', 'music venue', 'live music']):
|
||||
return ("Entertainment", "Music Venues", "Concert Halls", cat)
|
||||
|
||||
if any(x in c for x in ['gym', 'fitness center', 'health club']):
|
||||
return ("Entertainment", "Fitness", "Gyms", cat)
|
||||
|
||||
if any(x in c for x in ['yoga']):
|
||||
if 'studio' in c or 'center' in c:
|
||||
return ("Entertainment", "Fitness", "Yoga Studios", cat)
|
||||
|
||||
if any(x in c for x in ['pilates']):
|
||||
return ("Entertainment", "Fitness", "Pilates Studios", cat)
|
||||
|
||||
if any(x in c for x in ['swimming pool', 'swim']):
|
||||
return ("Entertainment", "Sports", "Swimming Pools", cat)
|
||||
|
||||
if any(x in c for x in ['golf']):
|
||||
if 'course' in c or 'club' in c:
|
||||
return ("Entertainment", "Sports", "Golf Courses", cat)
|
||||
|
||||
if any(x in c for x in ['tennis']):
|
||||
return ("Entertainment", "Sports", "Tennis Courts", cat)
|
||||
|
||||
if any(x in c for x in ['stadium', 'arena', 'sports complex']):
|
||||
return ("Entertainment", "Venues", "Sports Venues", cat)
|
||||
|
||||
if any(x in c for x in ['park']):
|
||||
if 'amusement' not in c and 'theme' not in c:
|
||||
if 'national' in c or 'state' in c:
|
||||
return ("Entertainment", "Parks", "National Parks", cat)
|
||||
if 'dog' in c:
|
||||
return ("Entertainment", "Parks", "Dog Parks", cat)
|
||||
return ("Entertainment", "Parks", "Public Parks", cat)
|
||||
|
||||
if any(x in c for x in ['recreation center', 'community center']):
|
||||
return ("Entertainment", "Recreation", "Community Centers", cat)
|
||||
|
||||
if any(x in c for x in ['club']):
|
||||
if 'night' in c:
|
||||
return ("Food & Dining", "Bars & Nightlife", "Night Clubs", cat)
|
||||
if 'country' in c:
|
||||
return ("Entertainment", "Sports", "Country Clubs", cat)
|
||||
if 'sport' in c or 'athletic' in c:
|
||||
return ("Entertainment", "Sports", "Sports Clubs", cat)
|
||||
if 'social' in c:
|
||||
return ("Entertainment", "Social", "Social Clubs", cat)
|
||||
|
||||
# === HOSPITALITY & TRAVEL ===
|
||||
if any(x in c for x in ['hotel', 'motel', 'inn']):
|
||||
if 'boutique' in c:
|
||||
return ("Hospitality & Travel", "Lodging", "Boutique Hotels", cat)
|
||||
if 'resort' in c:
|
||||
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
||||
if 'budget' in c or 'economy' in c:
|
||||
return ("Hospitality & Travel", "Lodging", "Budget Hotels", cat)
|
||||
return ("Hospitality & Travel", "Lodging", "Hotels", cat)
|
||||
|
||||
if any(x in c for x in ['hostel']):
|
||||
return ("Hospitality & Travel", "Lodging", "Hostels", cat)
|
||||
|
||||
if any(x in c for x in ['bed and breakfast', 'b&b', 'bnb']):
|
||||
return ("Hospitality & Travel", "Lodging", "B&Bs", cat)
|
||||
|
||||
if any(x in c for x in ['resort']):
|
||||
return ("Hospitality & Travel", "Lodging", "Resorts", cat)
|
||||
|
||||
if any(x in c for x in ['vacation rental', 'holiday rental']):
|
||||
return ("Hospitality & Travel", "Lodging", "Vacation Rentals", cat)
|
||||
|
||||
if any(x in c for x in ['campground', 'camping', 'rv park']):
|
||||
return ("Hospitality & Travel", "Lodging", "Campgrounds", cat)
|
||||
|
||||
if any(x in c for x in ['travel agency', 'tour operator', 'travel agent']):
|
||||
return ("Hospitality & Travel", "Travel Services", "Travel Agencies", cat)
|
||||
|
||||
if any(x in c for x in ['airline', 'airport']):
|
||||
return ("Hospitality & Travel", "Transportation", "Airlines & Airports", cat)
|
||||
|
||||
if any(x in c for x in ['cruise']):
|
||||
return ("Hospitality & Travel", "Travel Services", "Cruises", cat)
|
||||
|
||||
if any(x in c for x in ['tourist', 'attraction', 'sightseeing']):
|
||||
return ("Hospitality & Travel", "Attractions", "Tourist Attractions", cat)
|
||||
|
||||
# === FINANCE & INSURANCE ===
|
||||
if any(x in c for x in ['bank', 'banking', 'credit union']):
|
||||
return ("Finance & Insurance", "Banking", "Banks", cat)
|
||||
|
||||
if any(x in c for x in ['atm', 'cash machine']):
|
||||
return ("Finance & Insurance", "Banking", "ATMs", cat)
|
||||
|
||||
if any(x in c for x in ['insurance']):
|
||||
if 'health' in c or 'medical' in c:
|
||||
return ("Finance & Insurance", "Insurance", "Health Insurance", cat)
|
||||
if 'auto' in c or 'car' in c:
|
||||
return ("Finance & Insurance", "Insurance", "Auto Insurance", cat)
|
||||
if 'home' in c or 'property' in c:
|
||||
return ("Finance & Insurance", "Insurance", "Home Insurance", cat)
|
||||
if 'life' in c:
|
||||
return ("Finance & Insurance", "Insurance", "Life Insurance", cat)
|
||||
return ("Finance & Insurance", "Insurance", "Insurance Services", cat)
|
||||
|
||||
if any(x in c for x in ['loan', 'mortgage', 'lending']):
|
||||
return ("Finance & Insurance", "Lending", "Loans", cat)
|
||||
|
||||
if any(x in c for x in ['investment', 'financial advisor', 'wealth management', 'financial planner']):
|
||||
return ("Finance & Insurance", "Investment", "Financial Services", cat)
|
||||
|
||||
if any(x in c for x in ['currency exchange', 'money transfer', 'wire transfer']):
|
||||
return ("Finance & Insurance", "Money Services", "Currency Services", cat)
|
||||
|
||||
if any(x in c for x in ['pawn']):
|
||||
return ("Finance & Insurance", "Money Services", "Pawn Shops", cat)
|
||||
|
||||
# === REAL ESTATE ===
|
||||
if any(x in c for x in ['real estate', 'property', 'realty', 'realtor']):
|
||||
if 'agent' in c or 'agency' in c or 'broker' in c:
|
||||
return ("Real Estate", "Agencies", "Real Estate Agents", cat)
|
||||
if 'developer' in c or 'development' in c:
|
||||
return ("Real Estate", "Development", "Developers", cat)
|
||||
if 'management' in c:
|
||||
return ("Real Estate", "Management", "Property Management", cat)
|
||||
if 'commercial' in c:
|
||||
return ("Real Estate", "Commercial", "Commercial Real Estate", cat)
|
||||
return ("Real Estate", "Services", "Real Estate Services", cat)
|
||||
|
||||
if any(x in c for x in ['apartment', 'condo', 'rental']):
|
||||
if 'complex' in c or 'building' in c:
|
||||
return ("Real Estate", "Residential", "Apartment Complexes", cat)
|
||||
|
||||
if any(x in c for x in ['storage', 'self storage', 'warehouse']):
|
||||
if 'self' in c or 'mini' in c:
|
||||
return ("Real Estate", "Storage", "Self Storage", cat)
|
||||
|
||||
# === RELIGIOUS ===
|
||||
if any(x in c for x in ['church']):
|
||||
if 'catholic' in c:
|
||||
return ("Religious", "Christian", "Catholic Churches", cat)
|
||||
if 'baptist' in c:
|
||||
return ("Religious", "Christian", "Baptist Churches", cat)
|
||||
if 'methodist' in c:
|
||||
return ("Religious", "Christian", "Methodist Churches", cat)
|
||||
if 'lutheran' in c:
|
||||
return ("Religious", "Christian", "Lutheran Churches", cat)
|
||||
if 'orthodox' in c:
|
||||
return ("Religious", "Christian", "Orthodox Churches", cat)
|
||||
if 'pentecostal' in c:
|
||||
return ("Religious", "Christian", "Pentecostal Churches", cat)
|
||||
return ("Religious", "Christian", "Churches", cat)
|
||||
|
||||
if any(x in c for x in ['mosque', 'islamic', 'muslim']):
|
||||
return ("Religious", "Islam", "Mosques", cat)
|
||||
|
||||
if any(x in c for x in ['synagogue', 'jewish', 'temple']):
|
||||
if 'jewish' in c or 'synagogue' in c:
|
||||
return ("Religious", "Judaism", "Synagogues", cat)
|
||||
if 'hindu' in c:
|
||||
return ("Religious", "Hinduism", "Hindu Temples", cat)
|
||||
if 'buddhist' in c:
|
||||
return ("Religious", "Buddhism", "Buddhist Temples", cat)
|
||||
return ("Religious", "Other", "Temples", cat)
|
||||
|
||||
if any(x in c for x in ['abbey', 'monastery', 'convent']):
|
||||
return ("Religious", "Christian", "Monasteries", cat)
|
||||
|
||||
if any(x in c for x in ['gurdwara', 'sikh']):
|
||||
return ("Religious", "Sikhism", "Gurdwaras", cat)
|
||||
|
||||
# === GOVERNMENT & PUBLIC SERVICES ===
|
||||
if any(x in c for x in ['government', 'city hall', 'town hall', 'municipal']):
|
||||
return ("Government", "Local Government", "Government Offices", cat)
|
||||
|
||||
if any(x in c for x in ['court', 'courthouse']):
|
||||
return ("Government", "Legal", "Courts", cat)
|
||||
|
||||
if any(x in c for x in ['police', 'sheriff']):
|
||||
return ("Government", "Public Safety", "Police", cat)
|
||||
|
||||
if any(x in c for x in ['fire station', 'fire department']):
|
||||
return ("Government", "Public Safety", "Fire Departments", cat)
|
||||
|
||||
if any(x in c for x in ['post office', 'postal']):
|
||||
return ("Government", "Postal", "Post Offices", cat)
|
||||
|
||||
if any(x in c for x in ['embassy', 'consulate']):
|
||||
return ("Government", "International", "Embassies", cat)
|
||||
|
||||
if any(x in c for x in ['dmv', 'motor vehicle', 'driver license']):
|
||||
return ("Government", "Transportation", "DMV", cat)
|
||||
|
||||
if any(x in c for x in ['social security', 'welfare', 'social services']):
|
||||
return ("Government", "Social Services", "Social Services", cat)
|
||||
|
||||
# === INDUSTRIAL & MANUFACTURING ===
|
||||
if any(x in c for x in ['manufacturer', 'manufacturing', 'factory', 'plant']):
|
||||
if any(x in c for x in ['food', 'beverage', 'bakery']):
|
||||
return ("Industrial", "Manufacturing", "Food Manufacturing", cat)
|
||||
if any(x in c for x in ['textile', 'clothing', 'garment']):
|
||||
return ("Industrial", "Manufacturing", "Textile Manufacturing", cat)
|
||||
if any(x in c for x in ['electronics', 'computer', 'semiconductor']):
|
||||
return ("Industrial", "Manufacturing", "Electronics Manufacturing", cat)
|
||||
if any(x in c for x in ['auto', 'car', 'vehicle']):
|
||||
return ("Industrial", "Manufacturing", "Auto Manufacturing", cat)
|
||||
if any(x in c for x in ['chemical', 'pharmaceutical']):
|
||||
return ("Industrial", "Manufacturing", "Chemical Manufacturing", cat)
|
||||
if any(x in c for x in ['metal', 'steel', 'iron']):
|
||||
return ("Industrial", "Manufacturing", "Metal Manufacturing", cat)
|
||||
if any(x in c for x in ['plastic', 'rubber']):
|
||||
return ("Industrial", "Manufacturing", "Plastics Manufacturing", cat)
|
||||
if any(x in c for x in ['furniture', 'wood']):
|
||||
return ("Industrial", "Manufacturing", "Furniture Manufacturing", cat)
|
||||
return ("Industrial", "Manufacturing", "General Manufacturing", cat)
|
||||
|
||||
if any(x in c for x in ['mining', 'quarry']):
|
||||
return ("Industrial", "Mining", "Mining Operations", cat)
|
||||
|
||||
if any(x in c for x in ['construction company', 'builder']):
|
||||
return ("Industrial", "Construction", "Construction Companies", cat)
|
||||
|
||||
# === TECHNOLOGY ===
|
||||
if any(x in c for x in ['software', 'app developer', 'web developer']):
|
||||
return ("Technology", "Software", "Software Development", cat)
|
||||
|
||||
if any(x in c for x in ['it service', 'computer service', 'tech support']):
|
||||
return ("Technology", "IT Services", "IT Support", cat)
|
||||
|
||||
if any(x in c for x in ['data center', 'hosting', 'cloud']):
|
||||
return ("Technology", "Infrastructure", "Data Services", cat)
|
||||
|
||||
if any(x in c for x in ['telecommunication', 'telecom', 'internet service']):
|
||||
return ("Technology", "Telecommunications", "Telecom Services", cat)
|
||||
|
||||
# === TRANSPORTATION & LOGISTICS ===
|
||||
if any(x in c for x in ['shipping', 'freight', 'cargo', 'logistics']):
|
||||
return ("Transportation", "Logistics", "Shipping & Freight", cat)
|
||||
|
||||
if any(x in c for x in ['courier', 'delivery', 'express']):
|
||||
return ("Transportation", "Delivery", "Courier Services", cat)
|
||||
|
||||
if any(x in c for x in ['taxi', 'cab', 'ride', 'limo', 'chauffeur']):
|
||||
return ("Transportation", "Passenger", "Taxi & Ride Services", cat)
|
||||
|
||||
if any(x in c for x in ['bus', 'coach', 'shuttle']):
|
||||
if 'station' in c or 'terminal' in c or 'stop' in c:
|
||||
return ("Transportation", "Public Transit", "Bus Stations", cat)
|
||||
return ("Transportation", "Passenger", "Bus Services", cat)
|
||||
|
||||
if any(x in c for x in ['train', 'rail', 'subway', 'metro']):
|
||||
if 'station' in c or 'terminal' in c:
|
||||
return ("Transportation", "Public Transit", "Train Stations", cat)
|
||||
return ("Transportation", "Public Transit", "Rail Services", cat)
|
||||
|
||||
if any(x in c for x in ['towing', 'tow truck']):
|
||||
return ("Transportation", "Vehicle Services", "Towing", cat)
|
||||
|
||||
# === AGRICULTURE ===
|
||||
if any(x in c for x in ['farm', 'ranch', 'orchard', 'vineyard']):
|
||||
return ("Agriculture", "Farming", "Farms", cat)
|
||||
|
||||
if any(x in c for x in ['agricultural', 'agri']):
|
||||
return ("Agriculture", "Services", "Agricultural Services", cat)
|
||||
|
||||
# === PETS & ANIMALS ===
|
||||
if any(x in c for x in ['pet', 'dog', 'cat']):
|
||||
if 'grooming' in c or 'groomer' in c:
|
||||
return ("Pets & Animals", "Pet Services", "Pet Grooming", cat)
|
||||
if 'boarding' in c or 'kennel' in c or 'sitting' in c or 'daycare' in c:
|
||||
return ("Pets & Animals", "Pet Services", "Pet Boarding", cat)
|
||||
if 'training' in c or 'trainer' in c:
|
||||
return ("Pets & Animals", "Pet Services", "Pet Training", cat)
|
||||
if 'adoption' in c or 'shelter' in c or 'rescue' in c:
|
||||
return ("Pets & Animals", "Animal Welfare", "Shelters", cat)
|
||||
if 'store' in c or 'shop' in c:
|
||||
return ("Retail & Shopping", "Pet Supplies", "Pet Stores", cat)
|
||||
|
||||
# === EVENTS & WEDDINGS ===
|
||||
if any(x in c for x in ['wedding', 'bridal']):
|
||||
if 'venue' in c or 'hall' in c:
|
||||
return ("Events & Weddings", "Venues", "Wedding Venues", cat)
|
||||
if 'planner' in c:
|
||||
return ("Events & Weddings", "Planning", "Wedding Planners", cat)
|
||||
if 'dress' in c or 'gown' in c:
|
||||
return ("Events & Weddings", "Attire", "Bridal Shops", cat)
|
||||
return ("Events & Weddings", "Services", "Wedding Services", cat)
|
||||
|
||||
if any(x in c for x in ['event', 'party', 'banquet']):
|
||||
if 'venue' in c or 'hall' in c or 'center' in c:
|
||||
return ("Events & Weddings", "Venues", "Event Venues", cat)
|
||||
if 'planner' in c or 'planning' in c:
|
||||
return ("Events & Weddings", "Planning", "Event Planners", cat)
|
||||
if 'rental' in c or 'supply' in c:
|
||||
return ("Events & Weddings", "Rentals", "Event Rentals", cat)
|
||||
return ("Events & Weddings", "Services", "Event Services", cat)
|
||||
|
||||
if any(x in c for x in ['florist', 'flower']):
|
||||
if 'shop' in c or 'store' not in c:
|
||||
return ("Events & Weddings", "Florists", "Flower Shops", cat)
|
||||
|
||||
if any(x in c for x in ['funeral', 'mortuary', 'cremation', 'cemetery']):
|
||||
return ("Events & Weddings", "Memorial", "Funeral Services", cat)
|
||||
|
||||
# === NON-PROFIT & COMMUNITY ===
|
||||
if any(x in c for x in ['non-profit', 'nonprofit', 'charity', 'foundation']):
|
||||
return ("Non-Profit", "Charities", "Non-Profit Organizations", cat)
|
||||
|
||||
if any(x in c for x in ['community', 'civic', 'volunteer']):
|
||||
if 'center' in c:
|
||||
return ("Non-Profit", "Community", "Community Centers", cat)
|
||||
return ("Non-Profit", "Community", "Community Organizations", cat)
|
||||
|
||||
if any(x in c for x in ['association', 'organization', 'society']):
|
||||
if 'professional' in c or 'trade' in c or 'business' in c:
|
||||
return ("Non-Profit", "Professional", "Professional Associations", cat)
|
||||
return ("Non-Profit", "General", "Organizations", cat)
|
||||
|
||||
# Default fallback
|
||||
return ("Other", "Uncategorized", "General", cat)
|
||||
|
||||
|
||||
def main():
    """Import GBP categories from a CSV file into the ``gbp_categories`` table.

    Steps:
      1. Parse ``--csv-path``, ``--db-url`` and ``--dry-run`` from the command line.
      2. Read the first column of every non-empty CSV row (the first row is
         treated as a header and skipped).
      3. Classify each category with ``categorize_category`` into four levels
         and build a dict mapping dotted slug path -> (name, level, parent_path).
      4. Print per-level node counts. With ``--dry-run`` show a preview of at
         most 20 nodes and stop.
      5. Otherwise run the init SQL (if present next to this script), truncate
         ``gbp_categories``, insert every node parents-first, refresh
         ``category_count`` and commit.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    parser = argparse.ArgumentParser(description='Import GBP categories into PostgreSQL with ltree')
    parser.add_argument('--csv-path', default=DEFAULT_CSV_PATH, help='Path to categories CSV')
    parser.add_argument('--db-url', default=DEFAULT_DB_URL, help='PostgreSQL connection URL')
    parser.add_argument('--dry-run', action='store_true', help='Print categories without importing')
    args = parser.parse_args()

    # Read categories
    print(f"Reading categories from: {args.csv_path}")
    # newline='' is what the csv module expects so that it can handle line
    # breaks embedded in quoted fields itself.
    with open(args.csv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header; an empty file simply yields no rows
        categories = [row[0].strip() for row in reader if row and row[0].strip()]

    print(f"Found {len(categories)} categories")

    # Build tree structure
    tree = {}  # path -> (name, level, parent_path)

    for cat in categories:
        # Unpacking keeps the original contract: exactly four levels per category.
        l1, l2, l3, l4 = categorize_category(cat)

        # Levels: 1=Sector, 2=Business Type, 3=Sub-category, 4=Specific Category.
        parent_path = None
        for level, name in enumerate((l1, l2, l3, l4), start=1):
            slug = slugify(name)
            path = slug if parent_path is None else f"{parent_path}.{slug}"
            # First occurrence of a path wins; later duplicates are ignored.
            tree.setdefault(path, (name, level, parent_path))
            parent_path = path

    # Print statistics
    level_counts = {1: 0, 2: 0, 3: 0, 4: 0}
    for _name, level, _parent in tree.values():
        level_counts[level] += 1

    print("\nTree structure:")
    print(f" Level 1 (Sectors): {level_counts[1]}")
    print(f" Level 2 (Business Types): {level_counts[2]}")
    print(f" Level 3 (Sub-categories): {level_counts[3]}")
    print(f" Level 4 (Categories): {level_counts[4]}")
    print(f" Total nodes: {len(tree)}")

    if args.dry_run:
        print("\n[DRY RUN] Would insert these nodes:")
        preview = sorted(tree)[:20]
        for path in preview:
            name, level, _parent = tree[path]
            print(f" {' ' * (level-1)}{name} ({path})")
        # Only mention the remainder when there is one; the old code printed a
        # negative number for trees with fewer than 20 nodes.
        remaining = len(tree) - len(preview)
        if remaining > 0:
            print(f" ... and {remaining} more")
        return

    # Check for psycopg2
    # NOTE(review): HAS_PSYCOPG2 is defined elsewhere in this file, presumably
    # by a guarded import of psycopg2 - confirm.
    if not HAS_PSYCOPG2:
        print("\nERROR: psycopg2 is required for database import.")
        print("Install it with: pip install psycopg2-binary")
        return

    # Connect to database
    print("\nConnecting to database...")
    conn = psycopg2.connect(args.db_url)
    try:
        with conn.cursor() as cur:
            # Run init SQL first
            init_sql_path = os.path.join(os.path.dirname(__file__), 'init', '01_create_categories.sql')
            if os.path.exists(init_sql_path):
                print(f"Running init SQL: {init_sql_path}")
                with open(init_sql_path, 'r', encoding='utf-8') as f:
                    cur.execute(f.read())
                conn.commit()

            # Clear existing data. This runs in the same transaction as the
            # inserts below, so a failed import does not leave the table empty.
            print("Clearing existing categories...")
            cur.execute("TRUNCATE TABLE gbp_categories RESTART IDENTITY CASCADE")

            # Insert nodes in order (parents first)
            print("Inserting categories...")
            path_to_id = {}

            # Sort by level to ensure parents are inserted first
            sorted_items = sorted(tree.items(), key=lambda item: item[1][1])

            for path, (name, level, parent_path) in sorted_items:
                parent_id = path_to_id.get(parent_path) if parent_path else None
                slug = path.split('.')[-1]

                cur.execute("""
                    INSERT INTO gbp_categories (name, slug, path, level, parent_id)
                    VALUES (%s, %s, %s, %s, %s)
                    RETURNING id
                """, (name, slug, path, level, parent_id))

                path_to_id[path] = cur.fetchone()[0]

            # Update category counts (number of strict descendants per node)
            print("Updating category counts...")
            cur.execute("""
                UPDATE gbp_categories p
                SET category_count = (
                    SELECT COUNT(*) FROM gbp_categories c
                    WHERE c.path <@ p.path AND c.path != p.path
                )
            """)

            conn.commit()

            # Verify
            cur.execute("SELECT COUNT(*) FROM gbp_categories")
            count = cur.fetchone()[0]
            print(f"\nSuccess! Inserted {count} nodes into gbp_categories table")

            # Show tree stats
            cur.execute("SELECT * FROM category_tree_stats")
            print("\nTree statistics:")
            for row in cur.fetchall():
                print(f" Level {row[0]}: {row[1]} nodes")
    finally:
        # Always release the connection; closing discards any uncommitted
        # work, so an error part-way through leaves the previous data intact.
        conn.close()

    print("\nDone!")


if __name__ == '__main__':
    main()
|
||||
120
db/init/01_create_categories.sql
Normal file
120
db/init/01_create_categories.sql
Normal file
@@ -0,0 +1,120 @@
|
||||
-- Extensions must exist before any object that depends on them:
--   ltree   -> type of gbp_categories.path and its GiST index
--   pg_trgm -> gin_trgm_ops operator class used by the trigram index on name
-- (Previously pg_trgm was created AFTER the index that needs it, which made
-- this script fail on a database where the extension was not yet installed.)
CREATE EXTENSION IF NOT EXISTS ltree;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- Categories tree table
CREATE TABLE IF NOT EXISTS gbp_categories (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL,
    slug TEXT NOT NULL,
    path ltree NOT NULL,
    level INT NOT NULL DEFAULT 1,
    parent_id INT REFERENCES gbp_categories(id),
    category_count INT DEFAULT 0,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE(path)
);

-- Indexes for fast hierarchical queries
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path ON gbp_categories USING GIST (path);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_path_btree ON gbp_categories USING BTREE (path);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name ON gbp_categories (name);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_slug ON gbp_categories (slug);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_level ON gbp_categories (level);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_parent ON gbp_categories (parent_id);

-- Trigram index on name for fuzzy / substring search (requires pg_trgm, created above)
CREATE INDEX IF NOT EXISTS idx_gbp_categories_name_trgm ON gbp_categories USING GIN (name gin_trgm_ops);
|
||||
|
||||
-- Trigger function: stamp the row being written with the current timestamp
-- in its updated_at column and let the write proceed.
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
-- Unquoted language name: the quoted form ('plpgsql') is accepted only for
-- backward compatibility, and the other functions in this script use this form.
$$ LANGUAGE plpgsql;

-- Trigger for auto-updating timestamp. Dropped first so the script can be
-- re-run without failing on an already existing trigger.
DROP TRIGGER IF EXISTS update_gbp_categories_updated_at ON gbp_categories;
CREATE TRIGGER update_gbp_categories_updated_at
    BEFORE UPDATE ON gbp_categories
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at_column();
|
||||
|
||||
-- Helper function: list everything below a category.
-- Returns every row whose path lies under parent_path (all descendants, not
-- only direct children), excluding the row at parent_path itself, ordered by path.
CREATE OR REPLACE FUNCTION get_category_children(parent_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
) AS $$
BEGIN
    RETURN QUERY
        SELECT node.id, node.name, node.slug, node.path, node.level
        FROM gbp_categories AS node
        WHERE parent_path @> node.path      -- parent_path is an ancestor of (or equal to) node
          AND node.path <> parent_path      -- drop the starting node itself
        ORDER BY node.path;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Helper function: list everything above a category.
-- Returns every row whose path is a proper prefix of category_path,
-- ordered from the top of the tree (lowest level number) downwards.
CREATE OR REPLACE FUNCTION get_category_ancestors(category_path ltree)
RETURNS TABLE (
    id INT,
    name TEXT,
    slug TEXT,
    path ltree,
    level INT
) AS $$
BEGIN
    RETURN QUERY
        SELECT node.id, node.name, node.slug, node.path, node.level
        FROM gbp_categories AS node
        WHERE node.path @> category_path    -- node is an ancestor of (or equal to) category_path
          AND node.path <> category_path    -- drop the category itself
        ORDER BY node.level;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Helper function: fuzzy search of categories by name.
-- A row matches when its name contains search_term (case-insensitive) or its
-- trigram similarity to search_term exceeds 0.3. Best matches come first,
-- ties are broken by level and then name; at most limit_count rows are returned.
CREATE OR REPLACE FUNCTION search_categories(search_term TEXT, limit_count INT DEFAULT 20)
RETURNS TABLE (
    id INT,
    name TEXT,
    path ltree,
    level INT,
    similarity REAL
) AS $$
BEGIN
    RETURN QUERY
        SELECT
            node.id,
            node.name,
            node.path,
            node.level,
            similarity(node.name, search_term) AS sim
        FROM gbp_categories AS node
        WHERE node.name ILIKE '%' || search_term || '%'
           OR similarity(node.name, search_term) > 0.3
        ORDER BY sim DESC, node.level, node.name
        LIMIT limit_count;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- View: node counts per tree level.
-- One row per level; `count` is the number of nodes at that level. The four
-- named columns repeat that count in the column matching the row's level and
-- are 0 elsewhere, because each group contains a single level value.
CREATE OR REPLACE VIEW category_tree_stats AS
SELECT
    level,
    COUNT(*)                            AS count,
    COUNT(*) FILTER (WHERE level = 1)   AS sectors,
    COUNT(*) FILTER (WHERE level = 2)   AS business_types,
    COUNT(*) FILTER (WHERE level = 3)   AS sub_categories,
    COUNT(*) FILTER (WHERE level = 4)   AS leaf_categories
FROM gbp_categories
GROUP BY level
ORDER BY level;

-- Catalog comments shown by psql's \d+ and other schema browsers
COMMENT ON TABLE gbp_categories IS 'Google Business Profile categories organized in a 4-level hierarchy using ltree';
COMMENT ON COLUMN gbp_categories.path IS 'Hierarchical path using ltree (e.g., Food_Dining.Restaurants.By_Cuisine.Afghan_restaurant)';
COMMENT ON COLUMN gbp_categories.level IS '1=Sector, 2=Business Type, 3=Sub-category, 4=Specific Category';
|
||||
293
db/recategorize_hierarchical.py
Normal file
293
db/recategorize_hierarchical.py
Normal file
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hierarchical categorization of Other items.
|
||||
|
||||
APPROACH:
|
||||
1. First pass: Assign to Level 1 (Sector) - items that don't match go to sector's "Other" business type
|
||||
2. Second pass: Within each sector, refine Level 2 (Business Type)
|
||||
3. Third pass: Within each business type, refine Level 3 (Sub-category)
|
||||
|
||||
This creates:
|
||||
- Sector.Other.Uncategorized for sector-level unknowns
|
||||
- Sector.BusinessType.Other for business-type-level unknowns
|
||||
|
||||
EXISTING SECTORS (21 + Other):
|
||||
Agriculture, Automotive, Education, Entertainment, Events_Weddings, Finance_Insurance,
|
||||
Food_Dining, Government, Healthcare, Home_Services, Hospitality_Travel, Industrial,
|
||||
Non_Profit, Personal_Services, Pets_Animals, Professional_Services, Real_Estate,
|
||||
Religious, Retail_Shopping, Technology, Transportation, Other
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# ==================== LEVEL 1: SECTOR ASSIGNMENT ====================
# Maps keyword patterns to sectors. Order matters - first match wins.
# These are broad patterns to catch as much as possible at sector level.
#
# Each entry is a (regex, sector_slug) pair. get_sector_for_item() searches the
# lower-cased item name with re.search(), so a pattern may hit anywhere in the
# name. Notes for maintainers:
#   * An alternative ending in `\s` (e.g. `auto\s`) needs a whitespace character
#     after the keyword, so it does not match a keyword at the very end of a name.
#   * An alternative ending in `$` (e.g. `field$`) only matches at the end.
#   * Because the first matching entry wins, an alternative in a later entry is
#     only reachable for names that no earlier entry matched.

SECTOR_PATTERNS = [
    # HEALTHCARE - Medical professionals, facilities, services
    (r'(doctor|clinic|hospital|medical|health\s|dental|dentist|therapy|therapist|psycho|chiropract|optom|optician|pharmacy|pharmacist|nurse|surgeon|physician|cardiolog|dermatol|pediatr|orthoped|neurolog|oncolog|urolog|allergist|anesthesiol|audiolog|blood\sbank|blood\sdonat|blood\stest|dialysis|fertility|hospice|rehab|physiother|acupunct|naturopath|homeopath|osteopath|midwife|birth\scenter|prenatal|maternity|wellness\s(clinic|center)|diagnostic|x-ray|mri|ultrasound|laboratory|patholog|radiolog|pulmonolog|gastroenter|endocrin|rheumatol|immunolog|geriatr|podiatr|ophthalmolog|otolaryng|hematolog|nephrolog|proctolog|physiatrist|diabetolog|toxicolog|epidemiolog|oncology|assisted\sliving|nursing\shome|senior\scare|aged\scare|elder\scare|ambulance|emergency\sroom|urgent\scare|first\said|denture|diabetes\scenter|eye\scare|hiv\stest|perinatal|physical\sexam|pregnancy\scare|surgical\scenter|mammograph|std\stest|drug\stest|lactation|doula|bonesetting|hearing\said|prosthetic|orthotic|oxygen|ostomy|sleep\sclinic|sleep\slab|fertility|ivf|sperm\sbank|stem\scell|general\spractitioner|gynecolog|obstetrician|hepatolog|intensivist|internist|neurophysiol|orthoptist|prosthodontist|sexolog|venereolog|nutritionist|dietitian|endoscopist|kinesiolog|pedorthist|seitai|foot\scare|internal\smedicine|family\smedic|family\sdoctor|gp\s|medical\sward)', 'Healthcare'),

    # EDUCATION - Schools, training, learning
    (r'(school|university|college|academy|training\scenter|training\sschool|lesson|instructor|tutor|education|library|kindergarten|preschool|pre-?school|daycare|day\scare|learning\scenter|vocational|apprentice|faculty|campus|institute|seminary|boarding\sschool|private\sschool|public\sschool|elementary|middle\sschool|high\sschool|montessori|waldorf|charter\sschool|language\sschool|driving\sschool|flight\sschool|cooking\sclass|art\sclass|music\sclass|dance\sclass|acting\sclass|drama\sclass|conservatory|music\sacademy|ballet\sacademy|film\sschool|design\sschool|fashion\sschool|culinary|bartending|beauty\sschool|cosmetology|esthetician|barber\sschool|massage\sschool|yoga\steacher|yoga\straining|meditation\sclass|self-?defense\sclass|swimming\slesson|tennis\slesson|golf\slesson|ski\sschool|surf\sschool|scuba|sailing\sschool|studying\scenter|test\sprep|sat\sprep|gre\sprep|cram\sschool|juku|hagwon|coaching\scenter|head\sstart|early\shead|childminder|assistante\smaternelle|au\spair|nanny\sagency|student\sdormitor|student\shousing|student\scareer|career\scounseling|english\slanguage\scamp|language\scamp|summer\scamp|science\scamp|coding\scamp|academic\sdepartment)', 'Education'),

    # AUTOMOTIVE - Vehicles, parts, services
    (r'(auto\s|car\s|vehicle|motor\s|tire\s|tyre\s|mechanic|garage(?!\sdoor)|parking\s(lot|garage|facility)|driving|truck\s|motorcycle|motorbike|scooter\s|atv\s|automotive|car\swash|car\sdetail|car\sdealer|car\srental|car\slease|car\sinspect|car\sauction|smog\scheck|oil\schange|brake\s|transmission|radiator|exhaust|muffler|auto\sbody|collision|windshield|car\sstorage|towing|roadside)', 'Automotive'),

    # TRANSPORTATION - Moving people/goods
    (r'(airport|airline|aviation(?!\sschool)|aircraft|airplane|airfield|airstrip|heliport|seaplane|ferry|cruise|port\sauthority|port\soperating|harbor|dock\s|pier\s|marina|shipping|freight|cargo|trucking|logistics|warehouse|courier|messenger|delivery\sservice|taxi|cab\sservice|limo|chauffeur|bus\sstation|bus\sterminal|train\sstation|rail|metro|subway|transit|rickshaw|bicycle\srental|boat\srental|bike\sshare|car\sshare)', 'Transportation'),

    # GOVERNMENT - Public administration, military, legal system
    (r'(government|military|army\s|navy\s|naval\sbase|air\sforce|marine\s|coast\sguard|national\sguard|police|sheriff|law\senforce|fire\sstation|fire\sdepartment|courthouse|court\s|embassy|consulate|city\shall|municipal|county\s|district\soffice|passport|immigration|citizenship|dmv|tax\soffice|social\ssecurity|border|customs|post\soffice|postal|public\srecord|voter|election|legislature|parliament|congress|senate|mayor|governor|council|permit|license\s(office|bureau)|civil\sdefense|emergency\smanagement|public\ssafety|prison|jail|detention|correctional|probation|parole|aadhaar|agenzia\sentrate|anganwadi|asylum\scenter|city\sclerk|environment\soffice|land\sregistry|patent\soffice|pension\soffice|registration\soffice|registry\soffice|unemployment|employment\scenter|citizen\sinformation|consumer\sadvice|state\sarchive|national\sarchive|public\sarchive|guardia\scivil|highway\spatrol|department\sof|ministry\sof|bureau\sof|board\sof\seducation|public\sworks|sanitation|water\sauthority|housing\sauthority|port\sauthority|transit\sauthority)', 'Government'),

    # RELIGIOUS - Places of worship, spiritual
    (r'(church|temple|mosque|masjid|synagogue|chapel|cathedral|basilica|parish|religious|spiritual|ashram|monastery|convent|abbey|priory|buddhist|hindu|christian|catholic|protestant|orthodox|baptist|methodist|lutheran|presbyterian|pentecostal|evangelical|muslim|islamic|jewish|judai|sikh|gurdwara|gurudwara|baha.?i|shinto|taoist|quaker|mennonite|amish|latter-?day|jehovah|scientolog|meditation\scenter|retreat\scenter|pilgrimage|shrine|pagoda|wat\s|vihara|mission(?!\scontrol)|musalla|place\sof\sworship|rectory|yeshiva|marae|congregation|spiritist|priest|mohel|botanica)', 'Religious'),

    # ENTERTAINMENT - Fun, recreation, sports, arts, culture
    (r'(sports\s|sport\s|club(?!\shouse)|field$|court\s|gym\s|gymnasium|fitness|athletic|stadium|arena|pool\s|swimming|track\s|golf\s|tennis|soccer|football|basketball|baseball|hockey|volleyball|badminton|squash|racquetball|bowling|billiard|snooker|boxing|martial\sart|karate|judo|taekwondo|aikido|wrestling|fencing|archery|shooting\srange|gun\sclub|yoga\s|pilates|crossfit|cycling|skating|skateboard|skiing|snowboard|surfing|diving|climbing|bouldering|trampoline|gymnastics|dance\s|ballet|museum|theater|theatre|cinema|movie|art\sgallery|art\scenter|art\sstudio|gallery|music\svenue|concert|entertainment|amusement|theme\spark|water\spark|zoo|aquarium|wildlife|safari|botanical|arboretum|casino|gambling|betting|arcade|game\scenter|escape\sroom|laser\stag|paintball|go-?kart|mini\sgolf|comedy\sclub|jazz\sclub|blues\sclub|karaoke|nightclub|disco|rave|circus|carnival|fair\s|rodeo|bullring|race\strack|racecourse|hippodrome|velodrome|skate\spark|bmx|motocross|off-?road|aquatic\scenter|batting\scage|bungee|hang\sglid|paraglid|skydiv|indoor\ssnow|leisure\scenter|recreation\scenter|cultural\scenter|exhibit|festival|philharmon|opera\shouse|opera\scompany|symphony|orchestra|planetarium|observatory|science\scenter|discovery\scenter|children.*amusement|funfair|bouncy\scastle|inflatab|playground|adventure\spark|treetop|zipline|zip\sline|ropes\scourse|obstacle\scourse|ninja\swarrior|canoeing|kayaking|rafting|fishing\spond|fishing\sarea|bird\swatch|nature\sreserve|nature\scenter|hiking\strail|walking\strail|hiking\sarea|beach\spavil|beach\sresort|waterfront|promenade|pier\s(?!fishing)|boardwalk|scenic\spoint|scenic\sspot|lookout|viewpoint|observation|monument|landmark|castle|palace|fortress|historic\ssite|heritage|ruins|amphitheater|bandstand|gazebo|pavilion|curling\shall|scout\shall|scout\shome|village\shall|community\shall|social\shall|civic\scenter|convention\scenter|exhibition\scenter|artist$|band$|choir|musician|entertainer|magician|pyrotechnician|performing\sarts|stage$|sculpture|statuary|painting$|roller\scoaster|haunted\shouse|fairground|ghost\stown|lido|rugby|rugby\sfield|softball\sfield|little\sleague\sfield|water\spolo|cricket\sground|rowing\sarea|weightlifting|off\sroading|prawn\sfishing|raft\strip|mountaineering|summer\stoboggan|pumpkin\spatch|picnic\sground|national\sforest|national\sreserve|national\spark|nature\spreserve|protected\sarea|reenactment|sambodrome|pachinko|mahjong\shouse|children\shall|children.*camp|outdoor\sactivity|outdoor\sbath|onsen|thermal\sbath|day-?use\sonsen|foot\sbath)', 'Entertainment'),

    # FOOD & DINING - Restaurants, bars, food production
    (r'(restaurant|cafe(?!\steria)|café|coffee\s|espresso|bar\s(?!association)|pub\s|tavern|lounge|brewery|taproom|brewpub|winery|distillery|bakery|patisserie|pastry|dessert|ice\scream|gelato|frozen\syogurt|pizzeria|pizza\s|taco|burrito|sushi|ramen|noodle|dim\ssum|dumpling|steakhouse|steak\shouse|seafood|grill|bbq|barbecue|diner|bistro|brasserie|eatery|canteen|cafeteria|food\scourt|food\struck|food\scart|catering|caterer|buffet|brunch|breakfast|lunch|dinner|takeout|take-?away|delivery\sfood|meal|kitchen(?!\scabinet)|chef\s|cook\s|juice\sbar|smoothie|tea\shouse|traditional\steahouse|bubble\stea|boba|wine\sbar|wine\scellar|cocktail|speakeasy|gastropub|chophouse|crab\shouse|fish\s&\schips|curry|indian\srestaurant|chinese\srestaurant|chinese\stakeaway|italian\srestaurant|mexican\srestaurant|thai\srestaurant|japanese\srestaurant|korean\srestaurant|vietnamese|french\srestaurant|greek\srestaurant|mediterranean|middle\seastern|african\srestaurant|caribbean|latin\samerican|american\srestaurant|fast\sfood|quick\sservice|drive-?thru|dhaba|tiffin|hawker|churreria|creperie|crepe|pastelaria|pasteleria|tapas|izakaya|yakiniku|okonomiyaki|tempura|udon|soba|tonkatsu|kaiseki|robatayaki|teppanyaki|kushiyaki|yakitori|gyudon|poke\sbowl|acai|falafel|shawarma|kebab|gyro|pita|hummus|mezze|tagine|injera|pho|banh\smi|bibimbap|bulgogi|kimchi|hotpot|fondue|raclette|schnitzel|bratwurst|currywurst|pierogi|borscht|blini|pelmeni|empanada|arepa|pupusa|ceviche|asado|churrasco|rodizio|feijoada|moqueca|acaraje|jerk|oxtail|doubles|roti|samosa|biryani|tandoori|masala|tikka|naan|dosa|idli|vada|chaat|thali|satay|laksa|rendang|nasi\sgoreng|pad\sthai|som\stam|tom\syum|green\scurry|massaman|poutine|smoked\smeat|lobster\sroll|clam\schowder|po.?boy|gumbo|jambalaya|soul\sfood|southern\sfood|cajun|creole|carvery|dairy$|frituur|fruit\sparlor|meyhane|sugar\shack|yakatabune|olive\soil\scooperative|soy\ssauce)', 'Food_Dining'),

    # HOME SERVICES - Home improvement, maintenance, repair
    (r'(plumb|electrician|electrical\scontract|hvac|heating|air\scondition|cooling|roof|landscap|lawn\s|garden\sservice|gardener|arborist|tree\sservice|clean\s(service|company)|cleaning\sservice|cleaners$|pest\scontrol|exterminator|paint\scontract|painter(?!\sartist)|paint\sstrip|carpent|cabinet\smaker|flooring|tile\sinstall|hardwood|carpet\sinstall|repair\sservice|contractor|remodel|renovation|handyman|locksmith\sservice|moving\scompany|mover\s|moving\sand\sstorage|piano\smoving|appliance\srepair|garage\sdoor|gutter|chimney|window\sinstall|door\sinstall|double\sglazing|glass\srepair|fence\s|deck\sbuild|patio|drywall|insulation|siding|masonry|brick|concrete|paving|asphalt|pool\sservice|pool\scleaning|spa\sservice|septic|sewer|drain|water\sheater|well\sdrill|solar\sinstall|solar\spanel\smaintenance|security\ssystem|alarm\sinstall|home\sinspect|building\sinspect|surveyor|interior\sdesign|home\sstaging|pressure\swash|graffiti\sremoval|debris\sremoval|junk\sremoval|house\sclearance|snow\sremoval|antenna\sservice|satellite\sinstall|gasfitter|gas\sinstall|height\sworks|impermeabilization|wallpaper\sinstall|airbrushing|home\shelp|stall\sinstall)', 'Home_Services'),

    # RETAIL & SHOPPING - Stores, shops, markets
    (r'(store\s|shop\s(?!service)|retail|boutique|market(?!ing)|mall\s|outlet|dealer(?!ship)|supplier|wholesale|distributor|supermarket|grocery|convenience|department\sstore|discount|thrift|consignment|pawn|antique|vintage|secondhand|used\s|book\sstore|stationery|office\ssupply|toy\sstore|game\sstore|hobby|craft\sstore|art\ssupply|music\sstore|record\sstore|electronics|computer\sstore|phone\sstore|appliance\sstore|furniture\sstore|home\sdecor|bedding|mattress|kitchenware|hardware|tool\sstore|building\ssupply|lumber|garden\scenter|plant\snursery|florist|flower\sshop|pet\sstore|pet\ssupply|clothing|fashion|apparel|shoe\sstore|jewelry|watch\sstore|cosmetic|beauty\ssupply|pharmacy|drugstore|health\sstore|vitamin|supplement|sporting\sgoods|outdoor\sstore|bicycle\sshop|gun\sshop|hunting|fishing\sstore|camping|liquor|wine\sshop|beer\sstore|tobacco|cigar|vape|smoke\sshop|candy|chocolate|confection|bakery\sshop|cheese\sshop|spice|tea\sshop|coffee\sshop(?!\scafe)|newsstand|kiosk|vending|bazar|bazaar|hawker\scenter|flea\smarket|farmers\smarket|night\smarket|food\shall|food\scourt|deli(?!very)|delicatessen|charcuterie|butcher|fishmonger|greengrocer|produce|fruit\sstand|flower\sstand|fabric|textile\sshop|yarn|knitting|sewing\sshop|craft\ssuppl|frame\sshop|framing|trophy|engraving|gift\sshop|souvenir|duty\sfree|airport\sshop|convenience|corner\sstore|general\sstore|variety|dollar\sstore|pound\sshop|euro\sshop|99\scent|surplus|closeout|liquidat|outlet\small|factory\soutlet|warehouse\sstore|membership\sclub|costco|sam.*club)', 'Retail_Shopping'),

    # PROFESSIONAL SERVICES - Business services, consulting, legal, creative
    (r'(lawyer|attorney|law\sfirm|legal\sservice|accountant|accounting|bookkeep|cpa\s|tax\s(prepar|service|consult)|consultant|consulting|architect(?!ure)|engineer(?!ing\sschool)|survey\scompany|land\ssurvey|topograph|agency(?!\sgovernment)|staffing|recruiting|recruiter|employment\sagency|hr\sservice|marketing|advertis|pr\sfirm|public\srelations|graphic\sdesign|web\sdesign|website\sdesign|photography|photographer|videograph|film\sproduction|animation\sstudio|recording\sstudio|rehearsal\sstudio|production\sstudio|portrait\sstudio|model\sportfolio\sstudio|painting\sstudio|translation|interpret|transcription|notary|commissioner\sfor\soaths|private\sinvestigat|detective|appraiser|appraisal|estate\sappraiser|auditor|financial\saudit|actuary|financial\splanner|wealth\smanag|investment\sadvis|business\sconsult|management\sconsult|it\sconsult|media\scompany|media\shouse|record\scompany|scenograph|model\sdesign|telemarket|direct\smail|copywriter|editor|proofreader|technical\swriter|ghostwriter|literary\sagent|talent\sagent|booking\sagent|casting|modeling\sagent|artist\smanage|court\sreport|patent\sagent|trademark|intellectual\sproperty|customs\sbroker|freight\sforward|import\sexport|export\scompany|geological\sresearch|geological\sservice|environmental\sconsult|safety\sconsult|quality\sconsult|process\sserv|skip\strac|bail\senforce|collection\sagent|factoring|mezzanine\sfinance|conveyancer|executor|genealogist|gemologist|loss\sadjuster|foreclosure|insolvency|judicial\sscrivener|commercial\sagent|executive\ssearch|payroll\sservice|resume\sservice|typing\sservice|fax\sservice|mailing\sservice|shredding\sservice|blueprint|drafting|mapping\sservice|research\sand\sproduct|information\sservice|news\sservice|music\smanagement|yacht\sbroker|finance\sbroker|food\sbroker)', 'Professional_Services'),

    # INDUSTRIAL - Manufacturing, construction, mining, utilities, trades
    (r'(factory|plant(?!\snursery)|mill$|mill\s|manufactur|industrial|mining|mine\s|quarry|production|foundry|forge|smelter|refinery|chemical\s|pharmaceutical\scompan|textile|garment\sfactory|food\sprocessing|cannery|bottling|assembly|fabricat|machine\sshop|metal\swork|metal\sprocess|metallurg|welding|welder|steel|iron\sworks|aluminum|plastic|rubber|paper\smill|lumber\smill|sawmill|saw\smill|print\sshop|commercial\sprint|digital\sprint|packaging|recycling|waste\smanagement|construction\scompany|general\scontractor|building\scompany|building\sfirm|developer|civil\sengineering|demolition|excavat|crane\sservice|scaffold|heavy\sequipment|blacksmith|coppersmith|goldsmith|silversmith|horseshoe|locksmith(?!\sservice)|tinsmith|gunsmith|bladesmith|knifesmith|boilermaker|machinist|millwright|pipefitter|rigger|sheet\smetal|ironwork|structural\ssteel|precast|concrete\splant|asphalt\splant|gravel|aggregate|sand\s&\sgravel|earth\sworks|anodizing|electroplat|galvaniz|powder\scoat|metal\spolish|metal\sfinish|sandblast|shot\sblast|heat\streat|tempering|hardening|casting|die\scast|injection\smold|blow\smold|extrusion|stamping|forging|cnc|lathe|milling\smachine|grinding|boring|drilling|water\sutility|electric\sutility|gas\scompany|power\sstation|power\splant|nuclear\spower|solar\senergy|wind\sfarm|hydroelectric|substation|transformer|utility\scompany|water\spurification|sewage|wastewater|biotechnolog|shipbuilding|ship\srepair|shipyard|dry\sdock|boatyard|marine\sengine|propeller|cotton\smill|flour\smill|rice\smill|jute\smill|water\smill|weaving\smill|cider\smill|slaughterhouse|tannery|dyeworks|meat\spacker|meat\sprocessor|fruit.*processing|glass\sindustry|sewing\scompany|turnery|toolroom|machine\sconstruct|stone\scutter|stone\scarving|joiner|woodworker|plasterer|glazier|plating\sservice|embossing|lamination|laser\scutting|water\sjet|salvage\syard|junkyard|garbage\sdump|waste\stransfer|coalfield|oilfield)', 'Industrial'),

    # HOSPITALITY & TRAVEL - Lodging, tourism
    (r'(hotel|motel|inn\s|resort|hostel|lodge\s|bed\s&\sbreakfast|bed\sand\sbreakfast|b&b|guesthouse|guest\shouse|vacation\srental|holiday\s(rental|apartment|home)|cabin\srental|cottage\srental|cottage(?!\sindustry)|chalet|airbnb|vrbo|travel\sagent|travel\sagency|tour\soperator|tour\sguide|tourist\s(information|office|attraction)|sightseeing|excursion|cruise|camping|campground|caravan\spark|rv\spark|glamping|youth\shostel|retreat\scenter(?!\sreligious)|boarding\shouse|rooming\shouse|dormitory(?!\sstudent)|rest\sstop|rest\sarea|truck\sstop|service\sarea|visitor\scenter|welcome\scenter|country\shouse|manor\shouse|estate\shouse|villa\srental|apartment\shotel|extended\sstay|residence\sinn|suite\shotel|capsule\shotel|love\shotel|ryokan|minshuku|pension\s|agriturismo|pousada|parador|paradores)', 'Hospitality_Travel'),

    # PERSONAL SERVICES - Beauty, wellness, personal care
    (r'(salon\s|spa\s(?!automotive)|massage(?!\schair)|tattoo|piercing|body\sart|barber|beauty\s(?!supply|store)|nail\s|manicure|pedicure|hair\s(salon|stylist|dresser|cut)|waxing|threading|lash|brow|eyelash|makeup\sartist|esthetician|cosmetolog|tanning|sunbed|sauna|steam\sroom|bathhouse|hammam|laundry|laundromat|dry\sclean|tailor|alteration|seamstress|shoe\srepair|cobbler|watch\srepair|key\scutting|weight\sloss|diet\scenter|personal\strainer|life\scoach|dating\sservice|matchmak)', 'Personal_Services'),

    # FINANCE & INSURANCE - Banks, financial services
    (r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|pawn(?!shop)|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany)', 'Finance_Insurance'),

    # REAL ESTATE - Property, housing, storage
    (r'(real\sestate|realtor|property\s(agent|management|company)|apartment\s(complex|building|rental)|condo|condominium|housing|home\sbuilder|land\sdeveloper|commercial\sreal|office\sspace|coworking|business\scenter|storage\s(facility|unit)|self.?storage|mini\sstorage|warehouse\sspace|parking\sspace|mobile\shome\spark|trailer\spark)', 'Real_Estate'),

    # EVENTS & WEDDINGS - Event services, funeral
    (r'(funeral|mortuary|cremation|crematorium|cemetery|memorial\s|casket|burial|wedding\s(planner|venue|dress|photographer)|event\s(planner|venue|center)|party\s(planner|supply|rental)|banquet\shall|reception\shall|conference\scenter|convention|meeting\sroom|catering\shall|dj\sservice|disc\sjockey|band\sfor\shire|balloon|decoration\sservice|tent\srental|photo\sbooth|florist(?!\sshop))', 'Events_Weddings'),

    # NON-PROFIT - Charities, community organizations, social services
    (r'(charity|charitable|non-?profit|ngo\s|foundation(?!\srepair)|community\scenter|community\sorganiz|civic\s|volunteer|food\sbank|soup\skitchen|homeless\s(shelter|service)|social\sservice|social\sworker|welfare\soffice|crisis\scenter|hotline|support\sgroup|self-?help|aa\s|alcoholics|narcotics\sanonymous|veteran|vfw|american\slegion|rotary|lions\sclub|kiwanis|elks|freemason|masonic|fraternal|chamber\sof\scommerce|chamber\sof\shandicrafts|trade\sassociation|professional\sassociation|labor\sunion|tenant.*union|indigenous|aboriginal|tribal|youth\scenter|youth\scare|youth\sgroup|senior\scitizen\scenter|women.s\s(shelter|center|protection)|domestic\sviolence|battered|abuse\s(shelter|center)|halfway\shouse|sober\sliving|addiction\s(center|service)|recovery\scenter|rehab\scenter(?!ilitation)|detox|mental\shealth\sadvocacy|disability\s(service|advocacy)|deaf\sservice|blind\sservice|immigrant\s(service|aid)|refugee\s(service|aid|camp)|legal\said|pro\sbono|family\sservice|family\splanning|birth\scontrol|child\swelfare|foster\scare|adoption\sagency|big\sbrothers|big\ssisters|boys\s&\sgirls|ymca|ywca|jewish\scommunity|jcc|salvation\sarmy|goodwill|habitat\sfor\shumanity|red\scross|united\sway|make-?a-?wish|special\solympics|donations\scenter|thrift(?!\sstore)|donation\sdrop|orphanage|children.*home|group\shome|shelter$|scouting|literacy\sprogram|crime\svictim|mediation\sservice|special\seducator|playgroup|student\sunion)', 'Non_Profit'),

    # TECHNOLOGY - IT, software, telecom
    (r'(software|app\sdevelop|web\sdevelop|it\sservice|it\ssupport|computer\sservice|computer\srepair|computer\ssecurity|computer\snetwork|tech\ssupport|data\scenter|data\srecovery|data\sentry|database|server\s(farm|hosting)|cloud\sservice|internet\sservice|isp\s|broadband|telecom|telephone\scompany|mobile\s(operator|network)|cell\sphone\sservice|fiber\soptic|satellite\s(communication|service)|cable\sprovider|cybersecurity|network|systems\sintegrat|bpo|call\scenter|outsourc|automation\scompany|home\sautomation|robotics|ai\scompany|machine\slearning|e-?commerce|digital\smarketing|seo|web\shost|domain\sregist|ssl|vpn|managed\sservice|msp|helpdesk|remote\ssupport|pc\srepair)', 'Technology'),

    # AGRICULTURE - Farming, ranching
    (r'(farm(?!acy|er.s\smarket)|ranch|agriculture|livestock|cattle|poultry|dairy\sfarm|pig\sfarm|sheep|goat|horse\sfarm|stable(?!\sservice)|equestrian\scenter|riding\sschool|crop|orchard|vineyard(?!\swinery)|plantation|greenhouse|horticulture|nursery(?!school)|floricult|aquaculture|fish\sfarm|beekeep|apiary|agronomy|fertilizer|seed\scompany|farm\sequipment|tractor|irrigation|grain|silo|feed\sstore|livestock\sauction|veterinari.*(large|farm|livestock))', 'Agriculture'),

    # PETS & ANIMALS - Pet services, animal welfare
    (r'(pet\s(?!rol)|animal\s(?!hospital|clinic)|dog\s(?!hot)|cat\s|bird\s(?!watch)|fish\s(?!market|restaurant)|reptile|aquarium\sstore|vet(?!eran)|veterinar(?!.*large|.*farm)|kennel|doggy\sdaycare|pet\sgrooming|pet\sboarding|pet\ssitting|dog\swalk|pet\strain|animal\sshelter|animal\srescue|animal\scontrol|humane\ssociety|spca|aspca|wildlife\srehab|sanctuary|cattery|aviary|breeder|stud\sservice|horse\sboarding|stable(?!\sindustry)|equine|farrier|horse\sshoe)', 'Pets_Animals'),

    # FINANCE & INSURANCE (second entry) - repeats the earlier Finance keywords
    # and adds further financial terms (venture capital, escrow, fintech, ...).
    # Only names that matched nothing above reach this entry.
    (r'(bank(?!\sfood)|credit\sunion|savings\s&\sloan|atm\s|insurance\s(agent|agency|company|broker)|mortgage|loan\s(company|officer|broker)|lending|finance\scompany|financial\sservic|investment\s(firm|company|bank)|stock\sbroker|wealth\smanage|money\stransfer|remittance|currency\sexchange|forex|check\scash|payday\sloan|bail\sbond|credit\srepair|debt\scollect|factoring|leasing\scompany|venture\scapital|private\sequity|hedge\sfund|asset\smanag|trust\scompany|escrow|title\scompany|credit\scounseling|financial\splanning|retirement\splanning|pension\sfund|401k|ira|annuity|securities|commodities|futures|options|trading|brokerage|fintech|mobile\smoney|digital\swallet|cryptocurrency|bitcoin|blockchain)', 'Finance_Insurance'),

    # Catch more rentals and specialized services
    (r'(equipment\srental|tool\srental|party\srental|tent\srental|chair\srental|table\srental|linen\srental|costume\srental|tuxedo\srental|dress\srental|appliance\srental|furniture\srental|office\sequipment\srental|audiovisual.*rental|av\srental|musical\sinstrument\srental|ski\srental|snowboard\srental|snowmobile\srental|jet\sski\srental|boat\srental|kayak\srental|canoe\srental|bicycle\srental|scooter\srental|segway|atv\srental|motorcycle\srental|rv\srental|camper\srental|trailer\srental|truck\srental|van\srental|car\srental|forklift\srental|crane\srental|scaffolding\srental|construction.*rental|dumpster\srental|portable\stoilet|porta.*potty)', 'Retail_Shopping'),

    # Specialized restoration and repair services
    (r'(restoration\sservice|furniture\srestoration|antique\srestoration|art\srestoration|photo\srestoration|document\srestoration|clock\srepair|watch\srepair|jewelry\srepair|shoe\srepair|luggage\srepair|leather\srepair|upholstery\srepair|musical\sinstrument\srepair|piano\stuning|guitar\srepair|violin\srepair|camera\srepair|electronics\srepair|phone\srepair|screen\srepair|computer\srepair|printer\srepair|copier\srepair|typewriter|sewing\smachine\srepair|vacuum\srepair|small\sengine\srepair|lawn\smower\srepair|chainsaw|power\stool\srepair|fire\sextinguisher\sservice|scale\srepair|calibration|water\sdamage\srestoration|fire\sdamage|smoke\sdamage|mold\sremediation|biohazard|crime\sscene\sclean|hoarding\sclean)', 'Home_Services'),

    # Specialized trades and craftspeople
    (r'(clock\smaker|watch\smaker|furniture\smaker|cabinet\smaker|instrument\smaker|stringed\sinstrument\smaker|piano\smaker|organ\sbuilder|luthier|bookbinder|print\smaker|engraver|etcher|lithograph|screen\sprint|sign\smaker|sign\spainter|glass\sblower|stained\sglass|ceramic|pottery|potter|sculptor|woodcarver|wood\sturner|basket\smaker|weaver|spinner|knitter|quilter|longarm|embroidery|monogram|tailor|seamstress|dressmaker|milliner|cobbler|saddle|harness|leather\scraft|upholster|framemaker|gilder|conservator|taxiderm|model\smaker|prop\smaker|costume\smaker|wig\smaker|prosthetic|mask\smaker|puppet|doll\smaker|toy\smaker)', 'Industrial'),

    # Specialized testing and inspection services
    (r'(testing\sservice|inspection\sservice|asbestos\stest|lead\stest|radon\stest|water\stest|soil\stest|air\squality|environmental\stest|mold\stest|home\sinspect|building\sinspect|property\sinspect|roof\sinspect|termite\sinspect|pest\sinspect|pool\sinspect|chimney\sinspect|septic\sinspect|well\sinspect|electrical\sinspect|plumbing\sinspect|hvac\sinspect|fire\sinspect|safety\sinspect|code\senforcement|energy\saudit|blower\sdoor|duct\stest|infrared|thermal\simag)', 'Professional_Services'),

    # Personal and lifestyle services
    (r'(psychic|astrologer|fortune\steller|fortune\stelling|palm\sread|tarot|medium|spiritual\sadvis|feng\shui|numerolog|grapholog|hypnotherap|hypnosis|past\slife|akashic|aura|chakra|reiki|energy\shealing|crystal\shealing|sound\shealing|aromatherap|reflexolog|iridolog|kinesiology|craniosacral|rolfing|alexander\stechnique|feldenkrais|pilates\sinstructor|yoga\sinstructor|meditation\sinstructor|breathwork|pranayama|ayurved|traditional\schinese|tcm|herbalist|naturopath|homeopath|beautician|esthetician|esthetics|body\sshaping|boot\scamp|loctician|mehandi|mehndi|teeth\swhitening|wellness\sprogram|alternative\smedicine\spractitioner)', 'Personal_Services'),

    # More Government patterns
    (r'(archive$|birth\scertificate|city\semployment|state\semployment|company\sregistry|district\sjustice|justice\sdepartment|land\splanning|urban\splanning|toll\sstation|traffic\sofficer|weigh\sstation|sanitary\sinspect|smog\sinspect|superfund|water\sworks|weather\sforecast|ground\sself\sdefense|united\sstates\sarmed|radio\sbroadcaster|television\sstation|closed\scircuit|communications\stower)', 'Government'),

    # More Transportation patterns
    (r'(boat\sramp|container\sterminal|helicopter\scharter|river\sport|transportation\sservice|transportation\sescort|fixed-?base\soperator|handicapped\stransportation|carpooling)', 'Transportation'),

    # More Finance patterns
    (r'(diamond\sbuyer|financial\sinstitution|holding\scompany|leasing\sservice|stock\sexchange|money\sorder|payment\sterminal)', 'Finance_Insurance'),

    # More Real Estate patterns
    (r'(corporate\soffice|display\shome|townhouse\scomplex|villa$|serviced\s(accommodation|apartment)|function\sroom|virtual\soffice)', 'Real_Estate'),

    # More Entertainment/Sports patterns
    (r'(fishing\s(camp|charter|pier)|horseback\sriding|horse\srental|equestrian\sfacility|outdoor\sequestrian|salsa\sclass|wood\sworking\sclass|stitching\sclass|childbirth\sclass|mehandi\sclass)', 'Entertainment'),

    # More Industrial/Repair patterns
    (r'(engine\srebuilding|machine\smaintenance|saw\ssharpening|skate\ssharpening|sharpening\sservice|lpg\sconversion|cng\sfitment|boat\sdetailing|rv\sdetailing|rv\srepair|bike\swash|fire\sprotection|elevator\sservice|drone\sservice)', 'Industrial'),

    # More Retail patterns
    (r'(haberdashery|jeweler$|lapidary|glass\smerchant|furniture\saccessories|showroom$|tesla\sshowroom|bottle.*redemption|coin\soperated)', 'Retail_Shopping'),

    # More Professional Services patterns
    (r'(building\sdesigner|polygraph|professional\sorganizer|video\s(conferencing|duplication|editing)|meeting\splanning|personal\sconcierge|house\ssitter|marriage\scelebrant|singing\stelegram|roommate\sreferral)', 'Professional_Services'),

    # Miscellaneous remaining - catch-all for specific items
    (r'(agistment|auction\shouse|appliances\scustomer|bicycle\srack|bridge$|building\sequipment\shire|container\sservice|distribution\sservice|diaper\sservice|divorce\sservice|drinking\swater\sfountain|energy\sequipment|environment\srenewable|forestry\sservice|fur\sservice|garbage\scollection|garden$|handicraft|hiking\sguide|homekill|judicial\sauction|key\sduplication|land\sallotment|line\smark|livery\scompany|lodge$|lodging$|lyceum|mailbox\srental|marquee\shire|memorial$|mercantile|mineral\swater\scompany|mold\smaker|office\srefurbish|oil\sand\sgas\sexploration|orchid\sgrower|package\slocker|pedestrian\szone|road\ssafety\stown|sacem|sailmaker|seating\ssystems|security\s(guard|service)|shoe\sshining|societe|staple\sfood|tenant\sownership|ticket\soffice|weir|wi-?fi\sspot)', 'Other'),
]
|
||||
|
||||
def get_sector_for_item(name):
    """
    Map an item name to a Level 1 sector slug.

    The entries of SECTOR_PATTERNS are tried in list order against the
    lower-cased name; the first regex found anywhere in it decides the result.

    Returns the matching sector slug, or 'Other' when no pattern matches.
    """
    lowered = name.lower()
    hits = (
        sector_slug
        for regex, sector_slug in SECTOR_PATTERNS
        if re.search(regex, lowered, re.IGNORECASE)
    )
    # The generator is lazy, so scanning stops at the first matching pattern.
    return next(hits, 'Other')
|
||||
|
||||
|
||||
# ==================== LEVEL 2: BUSINESS TYPE PATTERNS ====================
|
||||
# These are more specific patterns within each sector
|
||||
|
||||
# Maps a sector slug to an ordered list of (regex, business_type) pairs.
# The lookup walks each list top to bottom and stops at the first regex that
# matches, so ORDER MATTERS: an entry must come before any broader entry whose
# keywords it contains, or it can never be selected.
BUSINESS_TYPE_PATTERNS = {
    'Entertainment': [
        (r'(fitness|gym|workout|crossfit|pilates|yoga|aerobic|exercise|weight\s(room|training)|spin\sclass|bootcamp)', 'Fitness'),
        (r'(sports\s|athletic|stadium|arena|field\s|court\s|track\s|league|team\s)', 'Sports'),
        (r'(museum|exhibit|gallery|art\s(center|gallery)|sculpture)', 'Museums'),
        (r'(theater|theatre|playhouse|opera|ballet|symphony|orchestra|concert|performance|show)', 'Performing Arts'),
        (r'(cinema|movie|film|drive-?in)', 'Movies'),
        # Amusement must precede Parks: "theme park" / "water park" /
        # "amusement park" all contain "park" and would otherwise be
        # swallowed by the generic Parks pattern.
        (r'(amusement|theme\spark|water\spark|carnival|fair|ride|attraction)', 'Amusement'),
        # `park(?!ing)` keeps "parking ..." out of Parks (the lookahead must
        # not require whitespace, since "parking" has none after "park").
        (r'(park(?!ing)|playground|recreation|picnic|garden|botanical|arboretum|nature|trail)', 'Parks'),
        (r'(arcade|game|escape\sroom|laser|paintball|go.?kart|bowling|billiard|mini\sgolf)', 'Games & Recreation'),
        (r'(casino|gambling|betting|poker|slot)', 'Gambling'),
        (r'(club|nightclub|disco|bar|lounge)', 'Social'),
        (r'(zoo|aquarium|wildlife|safari|sanctuary)', 'Wildlife'),
        # NOTE(review): `concert` here is shadowed by Performing Arts above;
        # decide which category concerts belong to and drop the other.
        (r'(music|concert|jazz|blues|rock|karaoke)', 'Music Venues'),
    ],
    'Healthcare': [
        # Veterinary must be first: "animal hospital" / "animal clinic" /
        # "veterinary clinic" would otherwise land in Hospitals or Clinics.
        (r'(veterinar|vet\s|animal\s(hospital|clinic))', 'Veterinary'),
        (r'(hospital|medical\scenter|health\scenter)', 'Hospitals'),
        (r'(clinic|office|practice|urgent\scare)', 'Clinics'),
        (r'(dentist|dental|orthodont|oral\ssurg|periodont|endodont)', 'Dental'),
        (r'(eye|vision|optom|optician|ophthalmolog)', 'Vision Care'),
        # NOTE(review): `therapist` also captures "physical therapist" and
        # "speech therapist" before Rehabilitation is tried - confirm intent.
        (r'(mental|psych|counsel|therapist|psychiatr)', 'Mental Health'),
        (r'(chiropract|acupunct|naturopath|homeopath|osteopath|alternative|holistic)', 'Alternative Medicine'),
        (r'(physical\stherap|occupational|speech|rehab)', 'Rehabilitation'),
        (r'(lab|diagnostic|patholog|radiology|x-?ray|imaging|blood\stest)', 'Diagnostics'),
        (r'(pharmacy|drugstore|prescription)', 'Pharmacies'),
        (r'(senior|aged|elder|nursing\shome|assisted)', 'Senior Care'),
        (r'(emergency|ambulance|paramedic|first\said|urgent)', 'Emergency Services'),
        (r'(doctor|physician|surgeon|specialist|practitioner)', 'Medical Practitioners'),
    ],
    'Food_Dining': [
        (r'(restaurant|eatery|dining|bistro|brasserie|grill|steakhouse)', 'Restaurants'),
        (r'(cafe|café|coffee|espresso|tea\shouse)', 'Cafes & Coffee'),
        (r'(bar\s|pub|tavern|brewery|taproom|lounge|cocktail|wine\sbar)', 'Bars & Nightlife'),
        # Two separate keyword groups deliberately share one business type.
        (r'(bakery|patisserie|pastry|bread|donut|bagel)', 'Bakeries & Desserts'),
        (r'(ice\scream|gelato|dessert|frozen\syogurt|candy|chocolate)', 'Bakeries & Desserts'),
        (r'(fast\sfood|quick\sservice|drive.?thru|takeout|take.?away)', 'Quick Service'),
        (r'(caterer|catering|food\sservice|meal\sprep)', 'Food Services'),
        (r'(winery|distillery|vineyard)', 'Beverage Production'),
    ],
    'Home_Services': [
        (r'(plumb|pipe|drain|sewer|septic)', 'Plumbing'),
        (r'(electric|wiring|panel|outlet)', 'Electrical'),
        (r'(hvac|heat|cool|air\scondition|furnace)', 'HVAC'),
        (r'(roof|gutter|shingle)', 'Roofing'),
        (r'(landscap|lawn|garden|tree|arbor)', 'Landscaping'),
        (r'(clean|maid|janitor|housekeep)', 'Cleaning'),
        (r'(pest|exterminator|termite)', 'Pest Control'),
        # Finishing trades map to Construction here; the general
        # construction keywords further down map to it as well.
        (r'(paint|drywall|plaster|wallpaper)', 'Construction'),
        (r'(floor|carpet|tile|hardwood)', 'Flooring'),
        (r'(window|door|glass)', 'Windows & Doors'),
        (r'(pool|spa|hot\stub)', 'Pool & Spa'),
        (r'(security|alarm|lock|safe)', 'Security'),
        (r'(appliance|washer|dryer|refrigerator)', 'Appliance Repair'),
        (r'(handyman|repair|fix|maintenance)', 'General Repair'),
        (r'(construct|build|remodel|renovation|contractor)', 'Construction'),
        (r'(mov(er|ing)|relocat)', 'Moving'),
        (r'(interior|decor|design|stag)', 'Design'),
    ],
}
|
||||
|
||||
|
||||
def get_business_type_for_item(name, sector):
    """
    Map an item name to a Level 2 business type inside the given sector.

    Only the pattern list registered for `sector` in BUSINESS_TYPE_PATTERNS
    is consulted; its entries are tried in order and the first regex found
    in the lower-cased name wins.

    Returns the business type, or 'Other' when the sector has no pattern
    list or none of its patterns match.
    """
    sector_rules = BUSINESS_TYPE_PATTERNS.get(sector)
    if sector_rules is None:
        # Sector without Level 2 rules: nothing to refine.
        return 'Other'

    lowered = name.lower()
    hits = (
        business_type
        for regex, business_type in sector_rules
        if re.search(regex, lowered, re.IGNORECASE)
    )
    return next(hits, 'Other')
|
||||
|
||||
|
||||
def main():
    """Categorize item names and print a per-sector report to stdout.

    Item names are read one per line (blank lines skipped) from the file
    named by the first command-line argument, or from stdin when no
    argument is given.
    """
    import sys

    # Load the item names
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as handle:
            items = [row.strip() for row in handle if row.strip()]
    else:
        items = [row.strip() for row in sys.stdin if row.strip()]

    # Bucket every item: sector -> business type -> names (input order)
    by_sector = {}
    for item in items:
        sector = get_sector_for_item(item)
        btype = get_business_type_for_item(item, sector)
        by_sector.setdefault(sector, {}).setdefault(btype, []).append(item)

    print(f"Total items: {len(items)}\n")

    divider = "=" * 60
    ordered_sectors = sorted(by_sector)

    # One line per sector: total size and how many fell through to 'Other'
    print(divider)
    print("SECTOR SUMMARY")
    print(divider)
    for sector in ordered_sectors:
        buckets = by_sector[sector]
        total = sum(map(len, buckets.values()))
        other_count = len(buckets.get('Other', []))
        print(f"{sector}: {total} items ({other_count} in Other)")

    print("\n" + divider)
    print("DETAILED BREAKDOWN")
    print(divider)

    for sector in ordered_sectors:
        print(f"\n### {sector} ###")
        buckets = by_sector[sector]
        for btype in sorted(buckets):
            names = sorted(buckets[btype])
            print(f" {btype}: {len(names)}")
            # Buckets of up to 10 names are listed in full; larger ones
            # show the first 5 followed by a count of the remainder.
            truncated = len(names) > 10
            for name in (names[:5] if truncated else names):
                print(f" - {name}")
            if truncated:
                print(f" ... and {len(names) - 5} more")


if __name__ == '__main__':
    main()
|
||||
555
db/recategorize_other.py
Normal file
555
db/recategorize_other.py
Normal file
@@ -0,0 +1,555 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Recategorize items from Other.Uncategorized into appropriate existing categories.
|
||||
|
||||
RULES:
|
||||
1. NEVER create new Level 1 (Sector) categories
|
||||
2. Only create new Level 2 (Business Type) if >10 items would use it
|
||||
3. Only create new Level 3 (Sub-category) if >5 items would use it
|
||||
4. Prefer matching to existing categories at all times
|
||||
5. If uncertain, leave in Other
|
||||
|
||||
EXISTING SECTORS (21 non-Other):
|
||||
- Agriculture: Farming, Services
|
||||
- Automotive: Dealers, Fuel & Charging, Parking, Parts & Accessories, Rental Services, Repair & Maintenance, Training, Vehicle Care
|
||||
- Education: Arts Education, Early Childhood, Higher Education, K-12 Schools, Language Learning, Libraries, Professional Training, Specialty Schools, Sports Training, Technology Training, Tutoring, Vocational Training
|
||||
- Entertainment: Amusement, Arts, Fitness, Gambling, Games & Recreation, Movies, Museums, Music Venues, Parks, Performing Arts, Recreation, Social, Sports, Venues, Wildlife
|
||||
- Events_Weddings: Attire, Florists, Memorial, Planning, Rentals, Services, Venues
|
||||
- Finance_Insurance: Banking, Insurance, Investment, Lending, Money Services
|
||||
- Food_Dining: Bakeries & Desserts, Bars & Nightlife, Beverage Production, Cafes & Coffee, Food Services, Quick Service, Restaurants
|
||||
- Government: International, Legal, Local Government, Postal, Public Safety, Social Services, Transportation
|
||||
- Healthcare: Alternative Medicine, Clinics, Dental, Diagnostics, Emergency Services, Hospitals, Medical Practitioners, Mental Health, Pharmacies, Rehabilitation, Senior Care, Specialty Care, Veterinary, Vision Care
|
||||
- Home_Services: Appliance Repair, Cleaning, Construction, Design, Electrical, Flooring, General Repair, HVAC, Landscaping, Moving, Pest Control, Plumbing, Pool & Spa, Roofing, Security, Windows & Doors
|
||||
- Hospitality_Travel: Attractions, Lodging, Transportation, Travel Services
|
||||
- Industrial: Construction, Manufacturing, Mining
|
||||
- Non_Profit: Charities, Community, General, Professional
|
||||
- Personal_Services: Body Art, Clothing Care, Fitness, Hair Care, Laundry, Massage, Spa & Wellness
|
||||
- Pets_Animals: Animal Welfare, Pet Services
|
||||
- Professional_Services: Agencies, Business Services, Consulting, Creative Services, Design, Engineering, Financial Services, HR Services, Language Services, Legal, Marketing & Advertising
|
||||
- Real_Estate: Agencies, Commercial, Development, Management, Residential, Services, Storage
|
||||
- Religious: Buddhism, Christian, Hinduism, Islam, Judaism, Other
|
||||
- Retail_Shopping: Arts & Crafts, Beauty & Cosmetics, Books & Office, Clothing & Fashion, Electronics, Food & Grocery, Hardware & Building, Health & Pharmacy, Home & Garden, Jewelry & Watches, Markets, Music & Entertainment, Pet Supplies, Secondhand & Vintage, Specialty Retail, Sports & Outdoors, Toys & Hobbies, Wholesale & Distribution
|
||||
- Technology: IT Services, Infrastructure, Software, Telecommunications
|
||||
- Transportation: Delivery, Logistics, Passenger, Public Transit, Vehicle Services
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Database connection
|
||||
# NOTE(review): the user name, password, host and port are hard-coded in this
# connection string; consider loading it from the environment or a config file
# so credentials do not live in version control.
DB_URL = "postgresql://scraper:scraper123@localhost:5437/scraper"
|
||||
|
||||
def slugify(text):
    """Turn free text into an underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    removed, each run of hyphens/whitespace becomes a single underscore,
    and underscores at either end are trimmed. Letter case is kept.
    """
    # Step 1: discard punctuation and symbols (e.g. '&', ',', '!').
    cleaned = re.sub(r'[^\w\s-]', '', text)
    # Step 2: collapse separators, then trim edge underscores.
    return re.sub(r'[-\s]+', '_', cleaned).strip('_')
|
||||
|
||||
# ==================== CATEGORIZATION RULES ====================
|
||||
# Format: (keyword_pattern, sector, business_type, sub_category)
|
||||
# Use regex patterns for flexibility
|
||||
|
||||
CATEGORIZATION_RULES = [
|
||||
# ==================== SPORTS & FITNESS (→ Entertainment.Sports or Entertainment.Fitness) ====================
|
||||
# Sports clubs and facilities
|
||||
(r'\b(basketball|baseball|football|soccer|tennis|golf|hockey|rugby|cricket|volleyball|badminton|squash|racquetball)\b.*(club|court|field|ground|stadium|arena|complex)', 'Entertainment', 'Sports', 'Facilities'),
|
||||
(r'\b(swimming|diving|aquatic|pool)\b.*(club|center|pool|facility)', 'Entertainment', 'Sports', 'Aquatic'),
|
||||
(r'\b(gym|fitness|workout|crossfit|aerobic|pilates|yoga|zumba)\b.*(center|studio|club|class)', 'Entertainment', 'Fitness', 'Studios'),
|
||||
(r'\b(martial arts|karate|judo|taekwondo|aikido|boxing|kickboxing|mma|wrestling|fencing)\b.*(club|school|academy|dojo|studio)', 'Entertainment', 'Sports', 'Martial_Arts'),
|
||||
(r'\b(archery|shooting|rifle|gun)\b.*(range|club|center)', 'Entertainment', 'Sports', 'Shooting'),
|
||||
(r'\b(skateboard|skate park|bmx|cycling|bicycle)\b.*(park|venue|club|center)', 'Entertainment', 'Sports', 'Cycling_Skating'),
|
||||
(r'\b(climbing|bouldering|rock climbing)\b.*(gym|wall|center|club)', 'Entertainment', 'Fitness', 'Climbing'),
|
||||
(r'\b(dance|ballet|ballroom|salsa|tango)\b.*(studio|school|class|instructor)', 'Entertainment', 'Performing Arts', 'Dance'),
|
||||
(r'\bsports\b.*(center|complex|facility|club)', 'Entertainment', 'Sports', 'General'),
|
||||
(r'\bathletic\b.*(field|track|club|center)', 'Entertainment', 'Sports', 'Facilities'),
|
||||
(r'\b(rowing|canoeing|kayaking|sailing|boat)\b.*(club|center|school)', 'Entertainment', 'Sports', 'Water_Sports'),
|
||||
(r'\b(equestrian|horse|polo|riding)\b.*(club|center|school|stable|arena)', 'Entertainment', 'Sports', 'Equestrian'),
|
||||
(r'\b(ski|snowboard|ice skating|ice rink)\b.*(resort|center|club|rink)', 'Entertainment', 'Sports', 'Winter_Sports'),
|
||||
|
||||
# Instructors and trainers
|
||||
(r'\b(fitness|personal|sports|athletic)\b.*\b(trainer|instructor|coach)\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||
(r'\baerobic.*instructor\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||
|
||||
# ==================== HEALTHCARE (various) ====================
|
||||
# Medical specialists
|
||||
(r'\b(allergist|anesthesiologist|cardiologist|dermatologist|endocrinologist|gastroenterologist|geriatrician|hematologist|immunologist|nephrologist|neurologist|oncologist|ophthalmologist|orthopedist|otolaryngologist|pathologist|pediatrician|physiatrist|podiatrist|proctologist|pulmonologist|radiologist|rheumatologist|urologist)\b', 'Healthcare', 'Medical Practitioners', 'Specialists'),
|
||||
(r'\b(audiologist|speech therapist|occupational therapist|physical therapist)\b', 'Healthcare', 'Rehabilitation', 'Therapists'),
|
||||
(r'\b(psychologist|psychiatrist|counselor|therapist)\b(?!.*massage)', 'Healthcare', 'Mental Health', 'Practitioners'),
|
||||
(r'\b(chiropractor|osteopath|naturopath|homeopath|acupuncturist|herbalist)\b', 'Healthcare', 'Alternative Medicine', 'Practitioners'),
|
||||
(r'\b(optometrist|optician)\b', 'Healthcare', 'Vision Care', 'Practitioners'),
|
||||
(r'\b(medical|health)\b.*(center|clinic|office|practice)', 'Healthcare', 'Clinics', 'General'),
|
||||
(r'\b(aged care|elder care|senior care|nursing home|assisted living|retirement)\b', 'Healthcare', 'Senior Care', 'Facilities'),
|
||||
(r'\b(blood bank|blood donation|plasma)\b', 'Healthcare', 'Diagnostics', 'Blood_Services'),
|
||||
(r'\b(dialysis|kidney)\b.*(center|clinic)', 'Healthcare', 'Specialty Care', 'Dialysis'),
|
||||
(r'\b(fertility|ivf|reproductive)\b.*(clinic|center)', 'Healthcare', 'Specialty Care', 'Fertility'),
|
||||
(r'\b(hospice|palliative)\b', 'Healthcare', 'Senior Care', 'Hospice'),
|
||||
(r'\b(medical lab|laboratory|pathology|diagnostic)\b.*(center|lab)', 'Healthcare', 'Diagnostics', 'Labs'),
|
||||
(r'\b(ambulance|emergency|paramedic|first aid)\b', 'Healthcare', 'Emergency Services', 'EMS'),
|
||||
|
||||
# ==================== AUTOMOTIVE (various) ====================
|
||||
(r'\bauto\b.*(body|paint|dent|collision|restoration|upholster)', 'Automotive', 'Repair & Maintenance', 'Body_Work'),
|
||||
(r'\bauto\b.*(repair|mechanic|service|tune.?up|brake|transmission|radiator)', 'Automotive', 'Repair & Maintenance', 'Mechanical'),
|
||||
(r'\bauto\b.*(auction|broker|dealer)', 'Automotive', 'Dealers', 'Used_Vehicles'),
|
||||
(r'\bauto\b.*(wrecker|salvage|junk|dismantl)', 'Automotive', 'Parts & Accessories', 'Salvage'),
|
||||
(r'\b(car|vehicle|auto)\b.*(wash|detail|clean|wax)', 'Automotive', 'Vehicle Care', 'Cleaning'),
|
||||
(r'\b(car|vehicle|auto)\b.*(rental|hire|lease)', 'Automotive', 'Rental Services', 'Vehicles'),
|
||||
(r'\b(car|vehicle|auto)\b.*(storage|parking)', 'Automotive', 'Parking', 'Storage'),
|
||||
(r'\b(motorcycle|motorbike|scooter|atv|quad)\b.*(dealer|shop|rental|repair)', 'Automotive', 'Dealers', 'Motorcycles'),
|
||||
(r'\b(tire|tyre|wheel)\b.*(shop|store|service|dealer)', 'Automotive', 'Parts & Accessories', 'Tires'),
|
||||
(r'\b(driving|driver)\b.*(school|training|instructor|lesson)', 'Automotive', 'Training', 'Driving_Schools'),
|
||||
(r'\btruck\b.*(stop|dealer|rental|repair)', 'Automotive', 'Dealers', 'Trucks'),
|
||||
(r'\b(rickshaw|auto rickshaw)\b', 'Transportation', 'Passenger', 'Local'),
|
||||
|
||||
# ==================== GOVERNMENT & MILITARY ====================
|
||||
(r'\b(air force|army|navy|military|armed forces)\b.*(base|facility|office|recruitment)', 'Government', 'Public Safety', 'Military'),
|
||||
(r'\b(police|sheriff|law enforcement)\b.*(station|department|office)', 'Government', 'Public Safety', 'Police'),
|
||||
(r'\b(fire|firefighter)\b.*(station|department)', 'Government', 'Public Safety', 'Fire'),
|
||||
(r'\b(court|courthouse|tribunal|judiciary)\b', 'Government', 'Legal', 'Courts'),
|
||||
(r'\b(embassy|consulate|visa)\b.*(office|center)', 'Government', 'International', 'Diplomatic'),
|
||||
(r'\b(city|town|municipal|county|district|borough)\b.*(hall|office|government|administration)', 'Government', 'Local Government', 'Offices'),
|
||||
(r'\b(social services|welfare|unemployment|disability)\b.*(office|center)', 'Government', 'Social Services', 'Welfare'),
|
||||
(r'\b(dmv|driver.*license|vehicle registration|motor vehicle)\b', 'Government', 'Transportation', 'DMV'),
|
||||
(r'\b(passport|immigration|citizenship)\b.*(office|center)', 'Government', 'International', 'Immigration'),
|
||||
(r'\b(aadhaar|agenzia entrate|tax)\b.*(office|center)', 'Government', 'Local Government', 'Tax'),
|
||||
(r'\b(asylum|refugee)\b.*(center|office)', 'Government', 'Social Services', 'Refugee'),
|
||||
|
||||
# ==================== PETS & ANIMALS ====================
|
||||
(r'\b(animal|pet)\b.*(shelter|rescue|adoption|welfare|pound|sanctuary)', 'Pets_Animals', 'Animal Welfare', 'Shelters'),
|
||||
(r'\b(animal|pet)\b.*(hospital|clinic|vet|veterinary)', 'Healthcare', 'Veterinary', 'Clinics'),
|
||||
(r'\b(animal|pet)\b.*(grooming|boarding|kennel|daycare|sitting|walking)', 'Pets_Animals', 'Pet Services', 'Care'),
|
||||
(r'\b(animal|pet)\b.*(training|obedience|behavior)', 'Pets_Animals', 'Pet Services', 'Training'),
|
||||
(r'\b(dog|cat|bird|fish|reptile|aquarium)\b.*(breeder|shop|store)', 'Retail_Shopping', 'Pet Supplies', 'Breeders'),
|
||||
(r'\bzoo\b|aquarium|wildlife.*park|safari', 'Entertainment', 'Wildlife', 'Zoos'),
|
||||
|
||||
# ==================== RELIGIOUS ====================
|
||||
(r'\b(church|chapel|cathedral|basilica|parish)\b', 'Religious', 'Christian', 'Churches'),
|
||||
(r'\b(temple|mandir|hindu)\b', 'Religious', 'Hinduism', 'Temples'),
|
||||
(r'\b(mosque|masjid|islamic)\b', 'Religious', 'Islam', 'Mosques'),
|
||||
(r'\b(synagogue|jewish|judaism)\b', 'Religious', 'Judaism', 'Synagogues'),
|
||||
(r'\b(buddhist|buddha|monastery|zen|meditation center)\b', 'Religious', 'Buddhism', 'Temples'),
|
||||
(r'\b(ashram|spiritual|guru)\b', 'Religious', 'Other', 'Spiritual'),
|
||||
(r'\b(baha.*i|sikh|gurdwara|shinto)\b', 'Religious', 'Other', 'Houses_of_Worship'),
|
||||
|
||||
# ==================== EDUCATION ====================
|
||||
(r'\b(university|college|faculty|academic department)\b', 'Education', 'Higher Education', 'Universities'),
|
||||
(r'\b(preschool|kindergarten|nursery|daycare|child.*care|creche)\b(?!.*animal)', 'Education', 'Early Childhood', 'Preschools'),
|
||||
(r'\b(school|academy)\b(?!.*driving|.*martial|.*dance|.*music|.*art|.*beauty|.*cooking|.*flight)', 'Education', 'K-12 Schools', 'General'),
|
||||
(r'\b(language|esl|english)\b.*(school|class|course|learning)', 'Education', 'Language Learning', 'Schools'),
|
||||
(r'\b(art|drawing|painting)\b.*(school|class|studio)', 'Education', 'Arts Education', 'Visual_Arts'),
|
||||
(r'\b(music|piano|guitar|violin|drum)\b.*(school|lesson|instructor|teacher)', 'Education', 'Arts Education', 'Music'),
|
||||
(r'\b(acting|theater|drama)\b.*(school|class|academy)', 'Education', 'Arts Education', 'Performing'),
|
||||
(r'\b(tutoring|tutor|coaching)\b.*(center|service)', 'Education', 'Tutoring', 'General'),
|
||||
(r'\b(library|public library)\b', 'Education', 'Libraries', 'Public'),
|
||||
(r'\b(archive|historical|museum)\b.*library', 'Education', 'Libraries', 'Special'),
|
||||
(r'\b(vocational|trade|technical)\b.*(school|training|institute)', 'Education', 'Vocational Training', 'General'),
|
||||
(r'\b(apprentice|internship)\b', 'Education', 'Vocational Training', 'Apprenticeships'),
|
||||
(r'\b(flight|aviation|pilot)\b.*(school|training|academy)', 'Education', 'Specialty Schools', 'Aviation'),
|
||||
(r'\b(cooking|culinary|chef)\b.*(school|class|academy)', 'Education', 'Specialty Schools', 'Culinary'),
|
||||
(r'\b(beauty|cosmetology|esthetician)\b.*(school|academy)', 'Education', 'Specialty Schools', 'Beauty'),
|
||||
|
||||
# ==================== HOME SERVICES ====================
|
||||
(r'\b(bathroom|kitchen)\b.*(remodel|renovation|contractor)', 'Home_Services', 'Construction', 'Remodeling'),
|
||||
(r'\b(general|home)\b.*contractor', 'Home_Services', 'Construction', 'General'),
|
||||
(r'\b(painter|painting)\b.*(contractor|service|company)(?!.*auto)', 'Home_Services', 'Construction', 'Painting'),
|
||||
(r'\b(carpenter|carpentry|cabinet|woodwork)\b', 'Home_Services', 'Construction', 'Carpentry'),
|
||||
(r'\b(mason|masonry|brick|concrete|stone)\b.*(contractor|service|company)', 'Home_Services', 'Construction', 'Masonry'),
|
||||
(r'\b(electrician|electrical)\b.*(contractor|service|company)', 'Home_Services', 'Electrical', 'Contractors'),
|
||||
(r'\b(plumber|plumbing)\b.*(contractor|service|company)', 'Home_Services', 'Plumbing', 'Contractors'),
|
||||
(r'\b(hvac|heating|air conditioning|furnace)\b.*(contractor|service|company)', 'Home_Services', 'HVAC', 'Contractors'),
|
||||
(r'\b(roofer|roofing)\b.*(contractor|service|company)', 'Home_Services', 'Roofing', 'Contractors'),
|
||||
(r'\b(landscap|lawn|garden)\b.*(service|company|contractor)(?!.*store|.*center)', 'Home_Services', 'Landscaping', 'Services'),
|
||||
(r'\b(pool|spa)\b.*(service|cleaning|maintenance|contractor)', 'Home_Services', 'Pool & Spa', 'Services'),
|
||||
(r'\b(pest|exterminator|termite)\b.*(control|service)', 'Home_Services', 'Pest Control', 'Services'),
|
||||
(r'\b(cleaning|maid|janitorial|housekeeping)\b.*(service|company)', 'Home_Services', 'Cleaning', 'Services'),
|
||||
(r'\b(window)\b.*(cleaning|wash)', 'Home_Services', 'Cleaning', 'Window'),
|
||||
(r'\b(appliance)\b.*(repair|service)', 'Home_Services', 'Appliance Repair', 'Services'),
|
||||
(r'\b(handyman|odd job|home repair)\b', 'Home_Services', 'General Repair', 'Handyman'),
|
||||
(r'\b(moving|movers|relocation)\b.*(company|service)', 'Home_Services', 'Moving', 'Services'),
|
||||
(r'\b(locksmith)\b', 'Home_Services', 'Security', 'Locksmith'),
|
||||
(r'\b(alarm|security system)\b.*(company|service|installer)', 'Home_Services', 'Security', 'Systems'),
|
||||
(r'\b(arborist|tree)\b.*(service|removal|trimming)', 'Home_Services', 'Landscaping', 'Tree_Service'),
|
||||
(r'\b(fence)\b.*(contractor|company|install)', 'Home_Services', 'Construction', 'Fencing'),
|
||||
(r'\b(garage door)\b.*(service|repair|install)', 'Home_Services', 'General Repair', 'Garage_Doors'),
|
||||
(r'\b(gutter)\b.*(cleaning|service|install)', 'Home_Services', 'Construction', 'Gutters'),
|
||||
(r'\b(insulation)\b.*(contractor|company)', 'Home_Services', 'Construction', 'Insulation'),
|
||||
(r'\b(deck|patio)\b.*(builder|contractor)', 'Home_Services', 'Construction', 'Outdoor'),
|
||||
(r'\b(drywall|sheetrock)\b', 'Home_Services', 'Construction', 'Drywall'),
|
||||
(r'\b(flooring|carpet|tile|hardwood)\b.*(install|contractor|company)(?!.*store)', 'Home_Services', 'Flooring', 'Installation'),
|
||||
(r'\b(window|door)\b.*(install|replacement|contractor)', 'Home_Services', 'Windows & Doors', 'Installation'),
|
||||
(r'\b(glass)\b.*(repair|replacement|company)(?!.*auto)', 'Home_Services', 'Windows & Doors', 'Glass'),
|
||||
(r'\b(chimney)\b.*(sweep|cleaning|repair)', 'Home_Services', 'General Repair', 'Chimney'),
|
||||
(r'\b(septic|sewer)\b.*(service|pumping|cleaning)', 'Home_Services', 'Plumbing', 'Septic'),
|
||||
(r'\b(well)\b.*(drilling|service|pump)', 'Home_Services', 'Plumbing', 'Wells'),
|
||||
(r'\b(solar)\b.*(install|contractor|company)', 'Home_Services', 'Electrical', 'Solar'),
|
||||
|
||||
# ==================== RETAIL & SHOPPING ====================
|
||||
(r'\b(antique|vintage|secondhand|thrift|consignment|pawn)\b.*(shop|store)', 'Retail_Shopping', 'Secondhand & Vintage', 'Stores'),
|
||||
(r'\b(auction)\b.*(house|company)', 'Retail_Shopping', 'Secondhand & Vintage', 'Auctions'),
|
||||
(r'\b(art|craft|hobby)\b.*(supply|store|shop)', 'Retail_Shopping', 'Arts & Crafts', 'Supplies'),
|
||||
(r'\b(toy|game|hobby)\b.*(store|shop)', 'Retail_Shopping', 'Toys & Hobbies', 'Stores'),
|
||||
(r'\b(book|stationery|office supply)\b.*(store|shop)', 'Retail_Shopping', 'Books & Office', 'Stores'),
|
||||
(r'\b(music|instrument|record|vinyl)\b.*(store|shop)', 'Retail_Shopping', 'Music & Entertainment', 'Stores'),
|
||||
(r'\b(sporting|sports|outdoor|camping|fishing|hunting)\b.*(goods|store|shop)', 'Retail_Shopping', 'Sports & Outdoors', 'Stores'),
|
||||
(r'\b(electronics|computer|phone|appliance)\b.*(store|shop|retailer)', 'Retail_Shopping', 'Electronics', 'Stores'),
|
||||
(r'\b(furniture|home decor|bedding|mattress)\b.*(store|shop)', 'Retail_Shopping', 'Home & Garden', 'Stores'),
|
||||
(r'\b(clothing|fashion|apparel|boutique|shoe)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Stores'),
|
||||
(r'\b(jewelry|watch|gem)\b.*(store|shop)', 'Retail_Shopping', 'Jewelry & Watches', 'Stores'),
|
||||
(r'\b(hardware|tool|building supply|lumber)\b.*(store|shop)', 'Retail_Shopping', 'Hardware & Building', 'Stores'),
|
||||
(r'\b(garden|nursery|plant)\b.*(center|store|shop)', 'Retail_Shopping', 'Home & Garden', 'Garden_Centers'),
|
||||
(r'\b(pharmacy|drugstore)\b', 'Retail_Shopping', 'Health & Pharmacy', 'Pharmacies'),
|
||||
(r'\b(cosmetic|beauty|makeup)\b.*(store|shop)', 'Retail_Shopping', 'Beauty & Cosmetics', 'Stores'),
|
||||
(r'\b(grocery|supermarket|food|convenience)\b.*(store|market|shop)', 'Retail_Shopping', 'Food & Grocery', 'Stores'),
|
||||
(r'\b(liquor|wine|beer|alcohol)\b.*(store|shop)', 'Retail_Shopping', 'Food & Grocery', 'Liquor'),
|
||||
(r'\b(tobacco|cigar|vape|smoke)\b.*(shop|store)', 'Retail_Shopping', 'Specialty Retail', 'Tobacco'),
|
||||
(r'\b(mobile phone|cell phone)\b.*(store|shop|dealer)', 'Retail_Shopping', 'Electronics', 'Phones'),
|
||||
(r'\b(optical|eyewear|glasses|sunglass)\b.*(store|shop)', 'Retail_Shopping', 'Health & Pharmacy', 'Optical'),
|
||||
(r'\b(florist|flower)\b.*(shop|store)', 'Events_Weddings', 'Florists', 'Shops'),
|
||||
(r'\b(bridal|wedding)\b.*(shop|store|boutique)', 'Events_Weddings', 'Attire', 'Bridal'),
|
||||
(r'\b(uniform|workwear)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Specialty'),
|
||||
|
||||
# ==================== PROFESSIONAL SERVICES ====================
|
||||
(r'\b(lawyer|attorney|law firm|legal)\b.*(office|firm|service)', 'Professional_Services', 'Legal', 'Firms'),
|
||||
(r'\b(accountant|accounting|bookkeep|tax)\b.*(firm|service|office)(?!.*government)', 'Professional_Services', 'Financial Services', 'Accounting'),
|
||||
(r'\b(architect|architecture)\b.*(firm|office|studio)', 'Professional_Services', 'Engineering', 'Architecture'),
|
||||
(r'\b(engineer|engineering)\b.*(firm|office|company)', 'Professional_Services', 'Engineering', 'Firms'),
|
||||
(r'\b(surveyor|surveying|land survey)\b', 'Professional_Services', 'Engineering', 'Surveying'),
|
||||
(r'\b(consultant|consulting)\b.*(firm|company|service)', 'Professional_Services', 'Consulting', 'General'),
|
||||
(r'\b(marketing|advertising|pr|public relations)\b.*(agency|firm|company)', 'Professional_Services', 'Marketing & Advertising', 'Agencies'),
|
||||
(r'\b(graphic|web|design)\b.*(studio|agency|firm)', 'Professional_Services', 'Creative Services', 'Design'),
|
||||
(r'\b(photography|photographer|video|videograph)\b.*(studio|service)', 'Professional_Services', 'Creative Services', 'Photography'),
|
||||
(r'\b(translation|interpreter|language)\b.*service', 'Professional_Services', 'Language Services', 'Translation'),
|
||||
(r'\b(staffing|recruiting|employment|hr)\b.*(agency|service|firm)', 'Professional_Services', 'HR Services', 'Agencies'),
|
||||
(r'\b(notary|notarial)\b', 'Professional_Services', 'Legal', 'Notary'),
|
||||
(r'\b(private investigator|detective)\b', 'Professional_Services', 'Agencies', 'Investigation'),
|
||||
(r'\b(appraiser|appraisal|valuation)\b', 'Professional_Services', 'Financial Services', 'Appraisal'),
|
||||
(r'\b(auditor|audit)\b.*(firm|service)', 'Professional_Services', 'Financial Services', 'Audit'),
|
||||
(r'\b(courier|messenger|delivery)\b.*service', 'Transportation', 'Delivery', 'Courier'),
|
||||
|
||||
# ==================== ARTS & CULTURE ====================
|
||||
(r'\b(art|gallery|exhibition)\b(?!.*supply|.*store|.*school)', 'Entertainment', 'Arts', 'Galleries'),
|
||||
(r'\b(museum)\b', 'Entertainment', 'Museums', 'General'),
|
||||
(r'\b(theater|theatre|playhouse|opera house)\b', 'Entertainment', 'Performing Arts', 'Venues'),
|
||||
(r'\b(cinema|movie theater|multiplex)\b', 'Entertainment', 'Movies', 'Theaters'),
|
||||
(r'\b(concert|music)\b.*(hall|venue)', 'Entertainment', 'Music Venues', 'Concert_Halls'),
|
||||
(r'\b(band|orchestra|choir|ensemble)\b', 'Entertainment', 'Performing Arts', 'Groups'),
|
||||
(r'\b(comedian|comedy club)\b', 'Entertainment', 'Performing Arts', 'Comedy'),
|
||||
(r'\b(artist|sculptor|painter)\b(?!.*makeup)', 'Entertainment', 'Arts', 'Artists'),
|
||||
(r'\b(animation|animator)\b.*(studio|company)', 'Professional_Services', 'Creative Services', 'Animation'),
|
||||
(r'\b(recording|music)\b.*studio', 'Professional_Services', 'Creative Services', 'Recording'),
|
||||
(r'\b(art restoration|restoration service)\b', 'Professional_Services', 'Creative Services', 'Restoration'),
|
||||
|
||||
# ==================== ENTERTAINMENT & RECREATION ====================
|
||||
(r'\b(amusement|theme)\b.*park', 'Entertainment', 'Amusement', 'Parks'),
|
||||
(r'\b(arcade|game center|gaming)\b', 'Entertainment', 'Games & Recreation', 'Arcades'),
|
||||
(r'\b(escape room|puzzle room)\b', 'Entertainment', 'Games & Recreation', 'Escape_Rooms'),
|
||||
(r'\b(bowling)\b.*(alley|center)', 'Entertainment', 'Games & Recreation', 'Bowling'),
|
||||
(r'\b(billiard|pool hall|snooker)\b', 'Entertainment', 'Games & Recreation', 'Billiards'),
|
||||
(r'\b(karaoke)\b', 'Entertainment', 'Music Venues', 'Karaoke'),
|
||||
(r'\b(casino|gambling|betting)\b', 'Entertainment', 'Gambling', 'Casinos'),
|
||||
(r'\b(nightclub|disco|club)\b(?!.*golf|.*country|.*tennis)', 'Food_Dining', 'Bars & Nightlife', 'Nightclubs'),
|
||||
(r'\b(country club|private club|social club)\b', 'Entertainment', 'Social', 'Clubs'),
|
||||
(r'\b(botanical garden|arboretum)\b', 'Entertainment', 'Parks', 'Gardens'),
|
||||
(r'\b(park|playground|recreation area)\b(?!.*theme|.*water|.*trailer|.*mobile)', 'Entertainment', 'Parks', 'Public'),
|
||||
(r'\b(beach|waterfront|marina)\b(?!.*hotel)', 'Entertainment', 'Parks', 'Beaches'),
|
||||
(r'\b(campground|camping|rv park|caravan)\b', 'Hospitality_Travel', 'Lodging', 'Camping'),
|
||||
(r'\b(go.?kart|kart|karting)\b', 'Entertainment', 'Games & Recreation', 'Karting'),
|
||||
(r'\b(laser tag|paintball)\b', 'Entertainment', 'Games & Recreation', 'Adventure'),
|
||||
(r'\b(trampoline|bounce|jump)\b.*(park|center)', 'Entertainment', 'Games & Recreation', 'Trampoline'),
|
||||
(r'\b(mini golf|miniature golf|putt.?putt)\b', 'Entertainment', 'Games & Recreation', 'Mini_Golf'),
|
||||
(r'\b(water park|aqua park)\b', 'Entertainment', 'Amusement', 'Water_Parks'),
|
||||
(r'\b(haunted|horror)\b.*(house|attraction)', 'Entertainment', 'Amusement', 'Attractions'),
|
||||
(r'\b(circus|carnival|fair)\b', 'Entertainment', 'Amusement', 'Shows'),
|
||||
(r'\b(planetarium|observatory)\b', 'Entertainment', 'Museums', 'Science'),
|
||||
|
||||
# ==================== FOOD & DINING ====================
|
||||
(r'\b(bar|pub|tavern|lounge|brewery|taproom|brewpub)\b(?!.*brow|.*eyebrow)', 'Food_Dining', 'Bars & Nightlife', 'Bars'),
|
||||
(r'\b(cafe|coffee|espresso)\b.*(shop|house|bar)', 'Food_Dining', 'Cafes & Coffee', 'Cafes'),
|
||||
(r'\b(restaurant|eatery|diner|bistro|brasserie|grill)\b', 'Food_Dining', 'Restaurants', 'General'),
|
||||
(r'\b(bakery|patisserie|pastry)\b', 'Food_Dining', 'Bakeries & Desserts', 'Bakeries'),
|
||||
(r'\b(ice cream|gelato|frozen yogurt|dessert)\b.*(shop|parlor|store)', 'Food_Dining', 'Bakeries & Desserts', 'Desserts'),
|
||||
(r'\b(caterer|catering)\b', 'Food_Dining', 'Food Services', 'Catering'),
|
||||
(r'\b(food truck|food cart)\b', 'Food_Dining', 'Quick Service', 'Mobile'),
|
||||
(r'\b(juice|smoothie)\b.*(bar|shop)', 'Food_Dining', 'Cafes & Coffee', 'Juice'),
|
||||
(r'\b(tea|bubble tea|boba)\b.*(shop|house|room)', 'Food_Dining', 'Cafes & Coffee', 'Tea'),
|
||||
(r'\b(winery|vineyard|wine)\b.*(tasting|cellar)', 'Food_Dining', 'Beverage Production', 'Wineries'),
|
||||
(r'\b(distillery|spirit)\b', 'Food_Dining', 'Beverage Production', 'Distilleries'),
|
||||
(r'\b(butcher|meat)\b.*shop', 'Retail_Shopping', 'Food & Grocery', 'Butchers'),
|
||||
(r'\b(fish|seafood)\b.*market', 'Retail_Shopping', 'Food & Grocery', 'Seafood'),
|
||||
(r'\b(deli|delicatessen)\b', 'Retail_Shopping', 'Food & Grocery', 'Delis'),
|
||||
(r'\b(candy|chocolate|sweet|confection)\b.*(shop|store)', 'Retail_Shopping', 'Food & Grocery', 'Confectionery'),
|
||||
|
||||
# ==================== PERSONAL SERVICES ====================
|
||||
(r'\b(barber|hair)\b.*(shop|salon|stylist)', 'Personal_Services', 'Hair Care', 'Salons'),
|
||||
(r'\b(beauty|nail|manicure|pedicure)\b.*(salon|spa|studio)', 'Personal_Services', 'Spa & Wellness', 'Beauty'),
|
||||
(r'\b(tattoo|piercing|body art)\b.*(shop|studio|parlor)', 'Personal_Services', 'Body Art', 'Studios'),
|
||||
(r'\b(massage)\b.*(therapist|spa|parlor|studio)', 'Personal_Services', 'Massage', 'Studios'),
|
||||
(r'\b(spa|wellness|day spa)\b', 'Personal_Services', 'Spa & Wellness', 'Spas'),
|
||||
(r'\b(tanning|sunbed)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Tanning'),
|
||||
(r'\b(laundry|laundromat|dry clean|tailor|alteration|seamstress)\b', 'Personal_Services', 'Laundry', 'Services'),
|
||||
(r'\b(shoe repair|cobbler)\b', 'Personal_Services', 'Clothing Care', 'Shoe_Repair'),
|
||||
(r'\b(brow|eyebrow|lash|eyelash)\b.*(bar|salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Brows_Lashes'),
|
||||
(r'\b(makeup artist|stylist)\b', 'Personal_Services', 'Spa & Wellness', 'Makeup'),
|
||||
(r'\b(sauna|steam room|bathhouse|hammam)\b', 'Personal_Services', 'Spa & Wellness', 'Baths'),
|
||||
(r'\b(waxing)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Waxing'),
|
||||
|
||||
# ==================== HOSPITALITY & TRAVEL ====================
|
||||
(r'\b(hotel|motel|inn|resort|hostel|lodge|bed and breakfast|b&b|guesthouse)\b', 'Hospitality_Travel', 'Lodging', 'Hotels'),
|
||||
(r'\b(travel|tour)\b.*(agency|operator|company)', 'Hospitality_Travel', 'Travel Services', 'Agencies'),
|
||||
(r'\b(airline|airport|aviation)\b(?!.*school)', 'Transportation', 'Passenger', 'Air'),
|
||||
(r'\b(cruise|ferry)\b.*(line|terminal|port)', 'Transportation', 'Passenger', 'Water'),
|
||||
(r'\b(train|rail)\b.*(station|service)', 'Transportation', 'Passenger', 'Rail'),
|
||||
(r'\b(bus|coach)\b.*(station|terminal|service|company)', 'Transportation', 'Passenger', 'Bus'),
|
||||
(r'\b(taxi|cab|ride|uber|lyft|limo|limousine|chauffeur)\b.*(service|company|stand)', 'Transportation', 'Passenger', 'Taxi'),
|
||||
(r'\b(tourist|visitor)\b.*(information|center|bureau)', 'Hospitality_Travel', 'Travel Services', 'Information'),
|
||||
(r'\b(rental)\b.*\b(cabin|cottage|vacation|holiday)\b', 'Hospitality_Travel', 'Lodging', 'Rentals'),
|
||||
|
||||
# ==================== INDUSTRIAL & MANUFACTURING ====================
|
||||
(r'\b(factory|plant|mill|manufacturing)\b', 'Industrial', 'Manufacturing', 'General'),
|
||||
(r'\b(warehouse|distribution|logistics)\b.*(center|facility)', 'Transportation', 'Logistics', 'Warehouses'),
|
||||
(r'\b(machine|machinist|metalwork|welding|welder)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||
(r'\b(print|printing|press)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Printing'),
|
||||
(r'\b(textile|fabric|garment)\b.*(factory|mill|manufacturer)', 'Industrial', 'Manufacturing', 'Textile'),
|
||||
(r'\b(chemical|pharmaceutical)\b.*(company|manufacturer|plant)', 'Industrial', 'Manufacturing', 'Chemical'),
|
||||
(r'\b(construction|building)\b.*(company|contractor|firm)', 'Industrial', 'Construction', 'General'),
|
||||
(r'\b(quarry|gravel|sand|aggregate)\b', 'Industrial', 'Mining', 'Quarries'),
|
||||
(r'\b(sawmill|lumber)\b.*(mill|yard)', 'Industrial', 'Manufacturing', 'Wood'),
|
||||
(r'\b(steel|iron|aluminum)\b.*(plant|manufacturer|company)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||
(r'\b(packaging|container)\b.*(company|manufacturer)', 'Industrial', 'Manufacturing', 'Packaging'),
|
||||
(r'\b(recycling|waste)\b.*(center|facility|company)', 'Industrial', 'Manufacturing', 'Recycling'),
|
||||
|
||||
# ==================== REAL ESTATE ====================
|
||||
(r'\b(real estate|realtor|property)\b.*(agent|agency|company)', 'Real_Estate', 'Agencies', 'Agents'),
|
||||
(r'\b(property management|apartment|rental)\b.*(company|agency)', 'Real_Estate', 'Management', 'Residential'),
|
||||
(r'\b(storage|self storage|mini storage)\b.*(facility|unit)', 'Real_Estate', 'Storage', 'Self_Storage'),
|
||||
(r'\b(office|commercial)\b.*(space|building|complex)', 'Real_Estate', 'Commercial', 'Office'),
|
||||
(r'\b(apartment|condo|housing)\b.*(complex|building|community)', 'Real_Estate', 'Residential', 'Apartments'),
|
||||
(r'\b(home builder|housing development)\b', 'Real_Estate', 'Development', 'Residential'),
|
||||
|
||||
# ==================== NON-PROFIT & COMMUNITY ====================
|
||||
(r'\b(charity|charitable|foundation|fund)\b(?!.*investment)', 'Non_Profit', 'Charities', 'General'),
|
||||
(r'\b(non.?profit|ngo|association)\b', 'Non_Profit', 'General', 'Organizations'),
|
||||
(r'\b(community|civic|neighborhood)\b.*(center|organization|association)', 'Non_Profit', 'Community', 'Centers'),
|
||||
(r'\b(youth|boys|girls|scout)\b.*(club|organization|center)', 'Non_Profit', 'Community', 'Youth'),
|
||||
(r'\b(senior|elder)\b.*(center|club)(?!.*care)', 'Non_Profit', 'Community', 'Seniors'),
|
||||
(r'\b(veterans|vfw|american legion)\b', 'Non_Profit', 'Community', 'Veterans'),
|
||||
(r'\b(rotary|lions|kiwanis|elks|freemason|lodge)\b', 'Non_Profit', 'Community', 'Fraternal'),
|
||||
(r'\b(union|labor)\b.*(hall|organization)', 'Non_Profit', 'Professional', 'Unions'),
|
||||
(r'\b(chamber of commerce|business association)\b', 'Non_Profit', 'Professional', 'Business'),
|
||||
(r'\b(aboriginal|indigenous|tribal)\b.*(organization|center)', 'Non_Profit', 'Community', 'Indigenous'),
|
||||
|
||||
# ==================== TECHNOLOGY ====================
|
||||
(r'\b(software|app|web)\b.*(developer|development|company)', 'Technology', 'Software', 'Development'),
|
||||
(r'\b(it|computer|tech)\b.*(service|support|repair)', 'Technology', 'IT Services', 'Support'),
|
||||
(r'\b(data center|server|cloud)\b', 'Technology', 'Infrastructure', 'Data_Centers'),
|
||||
(r'\b(internet|isp|broadband|telecom)\b.*(provider|service|company)', 'Technology', 'Telecommunications', 'Providers'),
|
||||
(r'\b(bpo|call center|outsourc)\b', 'Technology', 'IT Services', 'BPO'),
|
||||
(r'\b(automation|robot)\b.*(company|service)', 'Technology', 'Software', 'Automation'),
|
||||
|
||||
# ==================== FINANCE & INSURANCE ====================
|
||||
(r'\b(bank|credit union|savings)\b(?!.*blood|.*food)', 'Finance_Insurance', 'Banking', 'Banks'),
|
||||
(r'\b(atm|cash machine)\b', 'Finance_Insurance', 'Banking', 'ATMs'),
|
||||
(r'\b(insurance)\b.*(agent|agency|company|broker)', 'Finance_Insurance', 'Insurance', 'Agents'),
|
||||
(r'\b(mortgage|loan|lending|finance)\b.*(company|broker|service)', 'Finance_Insurance', 'Lending', 'Lenders'),
|
||||
(r'\b(investment|wealth|portfolio|financial advisor)\b', 'Finance_Insurance', 'Investment', 'Advisors'),
|
||||
(r'\b(money transfer|remittance|western union|moneygram)\b', 'Finance_Insurance', 'Money Services', 'Transfer'),
|
||||
(r'\b(currency exchange|forex)\b', 'Finance_Insurance', 'Money Services', 'Exchange'),
|
||||
(r'\b(bail bond)\b', 'Professional_Services', 'Legal', 'Bail'),
|
||||
|
||||
# ==================== EVENTS & WEDDINGS ====================
|
||||
(r'\b(funeral|mortuary|cremation|cemetery|memorial)\b', 'Events_Weddings', 'Memorial', 'Funeral'),
|
||||
(r'\b(event|party|wedding)\b.*(planner|planning|coordinator)', 'Events_Weddings', 'Planning', 'Planners'),
|
||||
(r'\b(banquet|event|reception|wedding)\b.*(hall|venue|center)', 'Events_Weddings', 'Venues', 'Halls'),
|
||||
(r'\b(dj|disc jockey|entertainment)\b.*service', 'Events_Weddings', 'Services', 'Entertainment'),
|
||||
(r'\b(balloon|party supply|decoration)\b', 'Events_Weddings', 'Services', 'Decorations'),
|
||||
(r'\b(tent|equipment)\b.*rental(?!.*car|.*truck)', 'Events_Weddings', 'Rentals', 'Equipment'),
|
||||
(r'\b(photo booth|photobooth)\b', 'Events_Weddings', 'Services', 'Photography'),
|
||||
|
||||
# ==================== AGRICULTURE ====================
|
||||
(r'\b(farm|ranch|orchard|vineyard)\b(?!.*winery)', 'Agriculture', 'Farming', 'Farms'),
|
||||
(r'\b(agriculture|farming|crop)\b.*(service|supply|equipment)', 'Agriculture', 'Services', 'Supplies'),
|
||||
(r'\b(livestock|cattle|poultry|dairy)\b', 'Agriculture', 'Farming', 'Livestock'),
|
||||
(r'\b(nursery|greenhouse|horticulture)\b.*(wholesale|grower)', 'Agriculture', 'Farming', 'Horticulture'),
|
||||
(r'\b(agistment|horse boarding|stable)\b', 'Agriculture', 'Services', 'Equine'),
|
||||
(r'\b(veterinarian|vet)\b.*(livestock|farm|large animal)', 'Agriculture', 'Services', 'Veterinary'),
|
||||
|
||||
# ==================== TRANSPORTATION ====================
|
||||
(r'\b(shipping|freight|cargo|trucking)\b.*(company|service)', 'Transportation', 'Logistics', 'Shipping'),
|
||||
(r'\b(courier|messenger|express)\b.*(service|delivery)', 'Transportation', 'Delivery', 'Courier'),
|
||||
(r'\b(airport|airfield|airstrip|heliport)\b', 'Transportation', 'Passenger', 'Airports'),
|
||||
(r'\b(port|harbor|dock|pier|marina)\b(?!.*wine)', 'Transportation', 'Logistics', 'Ports'),
|
||||
(r'\b(parking)\b.*(lot|garage|structure)', 'Automotive', 'Parking', 'Lots'),
|
||||
(r'\b(towing|tow truck)\b', 'Transportation', 'Vehicle Services', 'Towing'),
|
||||
]
|
||||
|
||||
def categorize_item(name):
    """
    Categorize a single item based on rules.

    Walks CATEGORIZATION_RULES in order and returns the target of the first
    rule whose pattern is found in the lower-cased name, so earlier rules
    take precedence over later ones.

    Args:
        name: Category name to classify. ``None`` or an empty string is
            treated as "no match" instead of raising.

    Returns:
        (sector, business_type, sub_category) or None if no match.
    """
    if not name:
        # A missing name (e.g. a NULL column value) has nothing to match
        # against; report "no match" rather than failing on .lower().
        return None

    name_lower = name.lower()

    for pattern, sector, btype, subcat in CATEGORIZATION_RULES:
        if re.search(pattern, name_lower, re.IGNORECASE):
            # First matching rule wins.
            return (sector, btype, subcat)

    return None
|
||||
|
||||
def get_existing_paths(cursor):
    """Return the set of every category path (as text) stored in gbp_categories."""
    cursor.execute("SELECT path::text FROM gbp_categories")

    known_paths = set()
    for record in cursor.fetchall():
        # Each record is a one-column row; the path is its only field.
        known_paths.add(record[0])
    return known_paths
|
||||
|
||||
def _ensure_category(cursor, name, slug, path, level, parent_path, label, existing_paths):
    """
    Make sure the category row at `path` exists, inserting it if needed.

    The row is inserted as a child of the row at `parent_path`. On success the
    path is recorded in `existing_paths` so later calls skip the database.

    Returns:
        True if the row exists after the call (newly created or already
        present), False if it could not be created (e.g. the parent row is
        missing, so the INSERT ... SELECT produced nothing).
    """
    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))

    if cursor.fetchone():
        print(f" [NEW] Created {label}: {path}")
    else:
        # RETURNING yields no row both when the path already existed
        # (conflict) and when the parent was not found; tell the two apart.
        cursor.execute(
            "SELECT 1 FROM gbp_categories WHERE path = %s::ltree",
            (path,),
        )
        if cursor.fetchone() is None:
            print(f" [SKIP] Could not create {label}: {path}")
            return False

    existing_paths.add(path)
    return True


def get_or_create_path(cursor, sector, btype, subcat, existing_paths):
    """
    Get or create the full path for a category.

    Level 1 (sector) rows are never created here; level 2 (business type)
    and level 3 (sub-category) rows are inserted on demand.

    Args:
        cursor: Open database cursor used for the inserts.
        sector: Sector name (level 1); must already exist.
        btype: Business type name (level 2).
        subcat: Sub-category name (level 3).
        existing_paths: Set of known paths; updated in place with any path
            confirmed or created by this call.

    Returns:
        The parent path (level 3) for the item, or None when the sector does
        not exist or a required level could not be created.
    """
    sector_slug = slugify(sector)
    btype_slug = slugify(btype)
    subcat_slug = slugify(subcat)

    # Level 1: Sector
    sector_path = sector_slug
    if sector_path not in existing_paths:
        # Don't create new sectors - return None
        print(f" [SKIP] Would need new sector: {sector_path}")
        return None

    # Level 2: Business Type
    btype_path = f"{sector_path}.{btype_slug}"
    if btype_path not in existing_paths:
        if not _ensure_category(cursor, btype, btype_slug, btype_path, 2,
                                sector_path, "business type", existing_paths):
            return None

    # Level 3: Sub-category
    subcat_path = f"{btype_path}.{subcat_slug}"
    if subcat_path not in existing_paths:
        if not _ensure_category(cursor, subcat, subcat_slug, subcat_path, 3,
                                btype_path, "sub-category", existing_paths):
            return None

    return subcat_path
|
||||
|
||||
def main():
    """
    Preview and apply rule-based recategorization of Other.Uncategorized items.

    Steps:
      1. Load every level-4 row whose path matches 'Other.Uncategorized.*'.
      2. Classify each name with categorize_item() and print a summary
         (top 30 target categories, first 50 unmatched names).
      3. After the operator answers 'yes', move each matched row under its
         target path via get_or_create_path(), refresh category_count for
         parent rows, and commit.
      4. Print category_count for every level-1 row.

    The connection is closed on every exit path. Closing a psycopg2
    connection discards uncommitted work, so a failure before the commit
    leaves the database unchanged.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized\n")

        # Get existing paths
        existing_paths = get_existing_paths(cursor)

        # Categorize items
        categorized = []
        uncategorized = []
        category_counts = defaultdict(int)

        for item_id, name, slug in other_items:
            result = categorize_item(name)
            if result:
                sector, btype, subcat = result
                categorized.append((item_id, name, slug, sector, btype, subcat))
                category_counts[(sector, btype, subcat)] += 1
            else:
                uncategorized.append((item_id, name))

        print(f"Categorized: {len(categorized)}")
        print(f"Still uncategorized: {len(uncategorized)}")
        print()

        # Show category distribution (30 largest targets)
        print("Category distribution:")
        for (sector, btype, subcat), count in sorted(category_counts.items(), key=lambda x: -x[1])[:30]:
            print(f" {sector}.{btype}.{subcat}: {count}")
        print()

        # Show some uncategorized items
        print("Sample uncategorized items (first 50):")
        for _item_id, name in uncategorized[:50]:
            print(f" - {name}")
        print()

        # Ask for confirmation; nothing has been written up to this point
        response = input("Proceed with database updates? (yes/no): ")
        if response.lower() != 'yes':
            print("Aborted.")
            return

        # Update database
        updated = 0
        for item_id, name, slug, sector, btype, subcat in categorized:
            parent_path = get_or_create_path(cursor, sector, btype, subcat, existing_paths)
            if parent_path:
                new_path = f"{parent_path}.{slug}"
                # Update the item: new path plus the id of its new level-3 parent
                cursor.execute("""
                    UPDATE gbp_categories
                    SET path = %s::ltree,
                        parent_id = (SELECT id FROM gbp_categories WHERE path = %s::ltree)
                    WHERE id = %s
                """, (new_path, parent_path, item_id))
                updated += 1

        # Update category counts (number of direct children per parent).
        # NOTE(review): only rows that still have at least one child appear in
        # `counts`, so a parent that lost all of its children keeps its old
        # category_count - confirm whether that is intended.
        cursor.execute("""
            WITH counts AS (
                SELECT
                    parent_id,
                    COUNT(*) as cnt
                FROM gbp_categories
                WHERE parent_id IS NOT NULL
                GROUP BY parent_id
            )
            UPDATE gbp_categories g
            SET category_count = COALESCE(c.cnt, 0)
            FROM counts c
            WHERE g.id = c.parent_id
        """)

        conn.commit()
        print(f"\nUpdated {updated} items")

        # Show final stats
        cursor.execute("""
            SELECT path, name, category_count
            FROM gbp_categories
            WHERE level = 1
            ORDER BY category_count DESC
        """)
        print("\nFinal sector counts:")
        for _path, name, count in cursor.fetchall():
            print(f" {name}: {count}")
    finally:
        # Always release the connection, including when a step above raised.
        conn.close()
|
||||
|
||||
# Run the recategorization only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user