Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
555
db/recategorize_other.py
Normal file
555
db/recategorize_other.py
Normal file
@@ -0,0 +1,555 @@
|
||||
#!/usr/bin/env python3
"""
Recategorize items from Other.Uncategorized into appropriate existing categories.

RULES:
1. NEVER create new Level 1 (Sector) categories
2. Only create new Level 2 (Business Type) if >10 items would use it
3. Only create new Level 3 (Sub-category) if >5 items would use it
4. Prefer matching to existing categories at all times
5. If uncertain, leave in Other

EXISTING SECTORS (21 non-Other):
- Agriculture: Farming, Services
- Automotive: Dealers, Fuel & Charging, Parking, Parts & Accessories, Rental Services, Repair & Maintenance, Training, Vehicle Care
- Education: Arts Education, Early Childhood, Higher Education, K-12 Schools, Language Learning, Libraries, Professional Training, Specialty Schools, Sports Training, Technology Training, Tutoring, Vocational Training
- Entertainment: Amusement, Arts, Fitness, Gambling, Games & Recreation, Movies, Museums, Music Venues, Parks, Performing Arts, Recreation, Social, Sports, Venues, Wildlife
- Events_Weddings: Attire, Florists, Memorial, Planning, Rentals, Services, Venues
- Finance_Insurance: Banking, Insurance, Investment, Lending, Money Services
- Food_Dining: Bakeries & Desserts, Bars & Nightlife, Beverage Production, Cafes & Coffee, Food Services, Quick Service, Restaurants
- Government: International, Legal, Local Government, Postal, Public Safety, Social Services, Transportation
- Healthcare: Alternative Medicine, Clinics, Dental, Diagnostics, Emergency Services, Hospitals, Medical Practitioners, Mental Health, Pharmacies, Rehabilitation, Senior Care, Specialty Care, Veterinary, Vision Care
- Home_Services: Appliance Repair, Cleaning, Construction, Design, Electrical, Flooring, General Repair, HVAC, Landscaping, Moving, Pest Control, Plumbing, Pool & Spa, Roofing, Security, Windows & Doors
- Hospitality_Travel: Attractions, Lodging, Transportation, Travel Services
- Industrial: Construction, Manufacturing, Mining
- Non_Profit: Charities, Community, General, Professional
- Personal_Services: Body Art, Clothing Care, Fitness, Hair Care, Laundry, Massage, Spa & Wellness
- Pets_Animals: Animal Welfare, Pet Services
- Professional_Services: Agencies, Business Services, Consulting, Creative Services, Design, Engineering, Financial Services, HR Services, Language Services, Legal, Marketing & Advertising
- Real_Estate: Agencies, Commercial, Development, Management, Residential, Services, Storage
- Religious: Buddhism, Christian, Hinduism, Islam, Judaism, Other
- Retail_Shopping: Arts & Crafts, Beauty & Cosmetics, Books & Office, Clothing & Fashion, Electronics, Food & Grocery, Hardware & Building, Health & Pharmacy, Home & Garden, Jewelry & Watches, Markets, Music & Entertainment, Pet Supplies, Secondhand & Vintage, Specialty Retail, Sports & Outdoors, Toys & Hobbies, Wholesale & Distribution
- Technology: IT Services, Infrastructure, Software, Telecommunications
- Transportation: Delivery, Logistics, Passenger, Public Transit, Vehicle Services
"""
|
||||
|
||||
import os
import re
from collections import defaultdict

import psycopg2
|
||||
|
||||
# Database connection
|
||||
# Connection string for the scraper database.
# The DATABASE_URL environment variable, when set, takes precedence so real
# credentials do not have to be committed; the literal below is only the
# local-development fallback and keeps the previous behaviour when the
# variable is absent.
DB_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql://scraper:scraper123@localhost:5437/scraper",
)
|
||||
|
||||
def slugify(text: str) -> str:
    """Convert a display name into the slug used for category path labels.

    Every character that is not a word character, whitespace or a hyphen is
    dropped, each run of hyphens/whitespace collapses into a single
    underscore, and leading/trailing underscores are trimmed.

    Args:
        text: Human-readable name, e.g. ``"Repair & Maintenance"``.

    Returns:
        The slug, e.g. ``"Repair_Maintenance"``. Letter case is preserved.
    """
    # Strip punctuation first so that "A & B" does not leave a stray symbol
    # between the two separators that are collapsed in the next step.
    slug = re.sub(r'[^\w\s-]', '', text)
    slug = re.sub(r'[-\s]+', '_', slug)
    return slug.strip('_')
|
||||
|
||||
# ==================== CATEGORIZATION RULES ====================
|
||||
# Format: (keyword_pattern, sector, business_type, sub_category)
|
||||
# Use regex patterns for flexibility
|
||||
|
||||
CATEGORIZATION_RULES = [
|
||||
# ==================== SPORTS & FITNESS (→ Entertainment.Sports or Entertainment.Fitness) ====================
|
||||
# Sports clubs and facilities
|
||||
(r'\b(basketball|baseball|football|soccer|tennis|golf|hockey|rugby|cricket|volleyball|badminton|squash|racquetball)\b.*(club|court|field|ground|stadium|arena|complex)', 'Entertainment', 'Sports', 'Facilities'),
|
||||
(r'\b(swimming|diving|aquatic|pool)\b.*(club|center|pool|facility)', 'Entertainment', 'Sports', 'Aquatic'),
|
||||
(r'\b(gym|fitness|workout|crossfit|aerobic|pilates|yoga|zumba)\b.*(center|studio|club|class)', 'Entertainment', 'Fitness', 'Studios'),
|
||||
(r'\b(martial arts|karate|judo|taekwondo|aikido|boxing|kickboxing|mma|wrestling|fencing)\b.*(club|school|academy|dojo|studio)', 'Entertainment', 'Sports', 'Martial_Arts'),
|
||||
(r'\b(archery|shooting|rifle|gun)\b.*(range|club|center)', 'Entertainment', 'Sports', 'Shooting'),
|
||||
(r'\b(skateboard|skate park|bmx|cycling|bicycle)\b.*(park|venue|club|center)', 'Entertainment', 'Sports', 'Cycling_Skating'),
|
||||
(r'\b(climbing|bouldering|rock climbing)\b.*(gym|wall|center|club)', 'Entertainment', 'Fitness', 'Climbing'),
|
||||
(r'\b(dance|ballet|ballroom|salsa|tango)\b.*(studio|school|class|instructor)', 'Entertainment', 'Performing Arts', 'Dance'),
|
||||
(r'\bsports\b.*(center|complex|facility|club)', 'Entertainment', 'Sports', 'General'),
|
||||
(r'\bathletic\b.*(field|track|club|center)', 'Entertainment', 'Sports', 'Facilities'),
|
||||
(r'\b(rowing|canoeing|kayaking|sailing|boat)\b.*(club|center|school)', 'Entertainment', 'Sports', 'Water_Sports'),
|
||||
(r'\b(equestrian|horse|polo|riding)\b.*(club|center|school|stable|arena)', 'Entertainment', 'Sports', 'Equestrian'),
|
||||
(r'\b(ski|snowboard|ice skating|ice rink)\b.*(resort|center|club|rink)', 'Entertainment', 'Sports', 'Winter_Sports'),
|
||||
|
||||
# Instructors and trainers
|
||||
(r'\b(fitness|personal|sports|athletic)\b.*\b(trainer|instructor|coach)\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||
(r'\baerobic.*instructor\b', 'Entertainment', 'Fitness', 'Trainers'),
|
||||
|
||||
# ==================== HEALTHCARE (various) ====================
|
||||
# Medical specialists
|
||||
(r'\b(allergist|anesthesiologist|cardiologist|dermatologist|endocrinologist|gastroenterologist|geriatrician|hematologist|immunologist|nephrologist|neurologist|oncologist|ophthalmologist|orthopedist|otolaryngologist|pathologist|pediatrician|physiatrist|podiatrist|proctologist|pulmonologist|radiologist|rheumatologist|urologist)\b', 'Healthcare', 'Medical Practitioners', 'Specialists'),
|
||||
(r'\b(audiologist|speech therapist|occupational therapist|physical therapist)\b', 'Healthcare', 'Rehabilitation', 'Therapists'),
|
||||
(r'\b(psychologist|psychiatrist|counselor|therapist)\b(?!.*massage)', 'Healthcare', 'Mental Health', 'Practitioners'),
|
||||
(r'\b(chiropractor|osteopath|naturopath|homeopath|acupuncturist|herbalist)\b', 'Healthcare', 'Alternative Medicine', 'Practitioners'),
|
||||
(r'\b(optometrist|optician)\b', 'Healthcare', 'Vision Care', 'Practitioners'),
|
||||
(r'\b(medical|health)\b.*(center|clinic|office|practice)', 'Healthcare', 'Clinics', 'General'),
|
||||
(r'\b(aged care|elder care|senior care|nursing home|assisted living|retirement)\b', 'Healthcare', 'Senior Care', 'Facilities'),
|
||||
(r'\b(blood bank|blood donation|plasma)\b', 'Healthcare', 'Diagnostics', 'Blood_Services'),
|
||||
(r'\b(dialysis|kidney)\b.*(center|clinic)', 'Healthcare', 'Specialty Care', 'Dialysis'),
|
||||
(r'\b(fertility|ivf|reproductive)\b.*(clinic|center)', 'Healthcare', 'Specialty Care', 'Fertility'),
|
||||
(r'\b(hospice|palliative)\b', 'Healthcare', 'Senior Care', 'Hospice'),
|
||||
(r'\b(medical lab|laboratory|pathology|diagnostic)\b.*(center|lab)', 'Healthcare', 'Diagnostics', 'Labs'),
|
||||
(r'\b(ambulance|emergency|paramedic|first aid)\b', 'Healthcare', 'Emergency Services', 'EMS'),
|
||||
|
||||
# ==================== AUTOMOTIVE (various) ====================
|
||||
(r'\bauto\b.*(body|paint|dent|collision|restoration|upholster)', 'Automotive', 'Repair & Maintenance', 'Body_Work'),
|
||||
(r'\bauto\b.*(repair|mechanic|service|tune.?up|brake|transmission|radiator)', 'Automotive', 'Repair & Maintenance', 'Mechanical'),
|
||||
(r'\bauto\b.*(auction|broker|dealer)', 'Automotive', 'Dealers', 'Used_Vehicles'),
|
||||
(r'\bauto\b.*(wrecker|salvage|junk|dismantl)', 'Automotive', 'Parts & Accessories', 'Salvage'),
|
||||
(r'\b(car|vehicle|auto)\b.*(wash|detail|clean|wax)', 'Automotive', 'Vehicle Care', 'Cleaning'),
|
||||
(r'\b(car|vehicle|auto)\b.*(rental|hire|lease)', 'Automotive', 'Rental Services', 'Vehicles'),
|
||||
(r'\b(car|vehicle|auto)\b.*(storage|parking)', 'Automotive', 'Parking', 'Storage'),
|
||||
(r'\b(motorcycle|motorbike|scooter|atv|quad)\b.*(dealer|shop|rental|repair)', 'Automotive', 'Dealers', 'Motorcycles'),
|
||||
(r'\b(tire|tyre|wheel)\b.*(shop|store|service|dealer)', 'Automotive', 'Parts & Accessories', 'Tires'),
|
||||
(r'\b(driving|driver)\b.*(school|training|instructor|lesson)', 'Automotive', 'Training', 'Driving_Schools'),
|
||||
(r'\btruck\b.*(stop|dealer|rental|repair)', 'Automotive', 'Dealers', 'Trucks'),
|
||||
(r'\b(rickshaw|auto rickshaw)\b', 'Transportation', 'Passenger', 'Local'),
|
||||
|
||||
# ==================== GOVERNMENT & MILITARY ====================
|
||||
(r'\b(air force|army|navy|military|armed forces)\b.*(base|facility|office|recruitment)', 'Government', 'Public Safety', 'Military'),
|
||||
(r'\b(police|sheriff|law enforcement)\b.*(station|department|office)', 'Government', 'Public Safety', 'Police'),
|
||||
(r'\b(fire|firefighter)\b.*(station|department)', 'Government', 'Public Safety', 'Fire'),
|
||||
(r'\b(court|courthouse|tribunal|judiciary)\b', 'Government', 'Legal', 'Courts'),
|
||||
(r'\b(embassy|consulate|visa)\b.*(office|center)', 'Government', 'International', 'Diplomatic'),
|
||||
(r'\b(city|town|municipal|county|district|borough)\b.*(hall|office|government|administration)', 'Government', 'Local Government', 'Offices'),
|
||||
(r'\b(social services|welfare|unemployment|disability)\b.*(office|center)', 'Government', 'Social Services', 'Welfare'),
|
||||
(r'\b(dmv|driver.*license|vehicle registration|motor vehicle)\b', 'Government', 'Transportation', 'DMV'),
|
||||
(r'\b(passport|immigration|citizenship)\b.*(office|center)', 'Government', 'International', 'Immigration'),
|
||||
(r'\b(aadhaar|agenzia entrate|tax)\b.*(office|center)', 'Government', 'Local Government', 'Tax'),
|
||||
(r'\b(asylum|refugee)\b.*(center|office)', 'Government', 'Social Services', 'Refugee'),
|
||||
|
||||
# ==================== PETS & ANIMALS ====================
|
||||
(r'\b(animal|pet)\b.*(shelter|rescue|adoption|welfare|pound|sanctuary)', 'Pets_Animals', 'Animal Welfare', 'Shelters'),
|
||||
(r'\b(animal|pet)\b.*(hospital|clinic|vet|veterinary)', 'Healthcare', 'Veterinary', 'Clinics'),
|
||||
(r'\b(animal|pet)\b.*(grooming|boarding|kennel|daycare|sitting|walking)', 'Pets_Animals', 'Pet Services', 'Care'),
|
||||
(r'\b(animal|pet)\b.*(training|obedience|behavior)', 'Pets_Animals', 'Pet Services', 'Training'),
|
||||
(r'\b(dog|cat|bird|fish|reptile|aquarium)\b.*(breeder|shop|store)', 'Retail_Shopping', 'Pet Supplies', 'Breeders'),
|
||||
(r'\bzoo\b|aquarium|wildlife.*park|safari', 'Entertainment', 'Wildlife', 'Zoos'),
|
||||
|
||||
# ==================== RELIGIOUS ====================
|
||||
(r'\b(church|chapel|cathedral|basilica|parish)\b', 'Religious', 'Christian', 'Churches'),
|
||||
(r'\b(temple|mandir|hindu)\b', 'Religious', 'Hinduism', 'Temples'),
|
||||
(r'\b(mosque|masjid|islamic)\b', 'Religious', 'Islam', 'Mosques'),
|
||||
(r'\b(synagogue|jewish|judaism)\b', 'Religious', 'Judaism', 'Synagogues'),
|
||||
(r'\b(buddhist|buddha|monastery|zen|meditation center)\b', 'Religious', 'Buddhism', 'Temples'),
|
||||
(r'\b(ashram|spiritual|guru)\b', 'Religious', 'Other', 'Spiritual'),
|
||||
(r'\b(baha.*i|sikh|gurdwara|shinto)\b', 'Religious', 'Other', 'Houses_of_Worship'),
|
||||
|
||||
# ==================== EDUCATION ====================
|
||||
(r'\b(university|college|faculty|academic department)\b', 'Education', 'Higher Education', 'Universities'),
|
||||
(r'\b(preschool|kindergarten|nursery|daycare|child.*care|creche)\b(?!.*animal)', 'Education', 'Early Childhood', 'Preschools'),
|
||||
(r'\b(school|academy)\b(?!.*driving|.*martial|.*dance|.*music|.*art|.*beauty|.*cooking|.*flight)', 'Education', 'K-12 Schools', 'General'),
|
||||
(r'\b(language|esl|english)\b.*(school|class|course|learning)', 'Education', 'Language Learning', 'Schools'),
|
||||
(r'\b(art|drawing|painting)\b.*(school|class|studio)', 'Education', 'Arts Education', 'Visual_Arts'),
|
||||
(r'\b(music|piano|guitar|violin|drum)\b.*(school|lesson|instructor|teacher)', 'Education', 'Arts Education', 'Music'),
|
||||
(r'\b(acting|theater|drama)\b.*(school|class|academy)', 'Education', 'Arts Education', 'Performing'),
|
||||
(r'\b(tutoring|tutor|coaching)\b.*(center|service)', 'Education', 'Tutoring', 'General'),
|
||||
(r'\b(library|public library)\b', 'Education', 'Libraries', 'Public'),
|
||||
(r'\b(archive|historical|museum)\b.*library', 'Education', 'Libraries', 'Special'),
|
||||
(r'\b(vocational|trade|technical)\b.*(school|training|institute)', 'Education', 'Vocational Training', 'General'),
|
||||
(r'\b(apprentice|internship)\b', 'Education', 'Vocational Training', 'Apprenticeships'),
|
||||
(r'\b(flight|aviation|pilot)\b.*(school|training|academy)', 'Education', 'Specialty Schools', 'Aviation'),
|
||||
(r'\b(cooking|culinary|chef)\b.*(school|class|academy)', 'Education', 'Specialty Schools', 'Culinary'),
|
||||
(r'\b(beauty|cosmetology|esthetician)\b.*(school|academy)', 'Education', 'Specialty Schools', 'Beauty'),
|
||||
|
||||
# ==================== HOME SERVICES ====================
|
||||
(r'\b(bathroom|kitchen)\b.*(remodel|renovation|contractor)', 'Home_Services', 'Construction', 'Remodeling'),
|
||||
(r'\b(general|home)\b.*contractor', 'Home_Services', 'Construction', 'General'),
|
||||
(r'\b(painter|painting)\b.*(contractor|service|company)(?!.*auto)', 'Home_Services', 'Construction', 'Painting'),
|
||||
(r'\b(carpenter|carpentry|cabinet|woodwork)\b', 'Home_Services', 'Construction', 'Carpentry'),
|
||||
(r'\b(mason|masonry|brick|concrete|stone)\b.*(contractor|service|company)', 'Home_Services', 'Construction', 'Masonry'),
|
||||
(r'\b(electrician|electrical)\b.*(contractor|service|company)', 'Home_Services', 'Electrical', 'Contractors'),
|
||||
(r'\b(plumber|plumbing)\b.*(contractor|service|company)', 'Home_Services', 'Plumbing', 'Contractors'),
|
||||
(r'\b(hvac|heating|air conditioning|furnace)\b.*(contractor|service|company)', 'Home_Services', 'HVAC', 'Contractors'),
|
||||
(r'\b(roofer|roofing)\b.*(contractor|service|company)', 'Home_Services', 'Roofing', 'Contractors'),
|
||||
(r'\b(landscap|lawn|garden)\b.*(service|company|contractor)(?!.*store|.*center)', 'Home_Services', 'Landscaping', 'Services'),
|
||||
(r'\b(pool|spa)\b.*(service|cleaning|maintenance|contractor)', 'Home_Services', 'Pool & Spa', 'Services'),
|
||||
(r'\b(pest|exterminator|termite)\b.*(control|service)', 'Home_Services', 'Pest Control', 'Services'),
|
||||
(r'\b(cleaning|maid|janitorial|housekeeping)\b.*(service|company)', 'Home_Services', 'Cleaning', 'Services'),
|
||||
(r'\b(window)\b.*(cleaning|wash)', 'Home_Services', 'Cleaning', 'Window'),
|
||||
(r'\b(appliance)\b.*(repair|service)', 'Home_Services', 'Appliance Repair', 'Services'),
|
||||
(r'\b(handyman|odd job|home repair)\b', 'Home_Services', 'General Repair', 'Handyman'),
|
||||
(r'\b(moving|movers|relocation)\b.*(company|service)', 'Home_Services', 'Moving', 'Services'),
|
||||
(r'\b(locksmith)\b', 'Home_Services', 'Security', 'Locksmith'),
|
||||
(r'\b(alarm|security system)\b.*(company|service|installer)', 'Home_Services', 'Security', 'Systems'),
|
||||
(r'\b(arborist|tree)\b.*(service|removal|trimming)', 'Home_Services', 'Landscaping', 'Tree_Service'),
|
||||
(r'\b(fence)\b.*(contractor|company|install)', 'Home_Services', 'Construction', 'Fencing'),
|
||||
(r'\b(garage door)\b.*(service|repair|install)', 'Home_Services', 'General Repair', 'Garage_Doors'),
|
||||
(r'\b(gutter)\b.*(cleaning|service|install)', 'Home_Services', 'Construction', 'Gutters'),
|
||||
(r'\b(insulation)\b.*(contractor|company)', 'Home_Services', 'Construction', 'Insulation'),
|
||||
(r'\b(deck|patio)\b.*(builder|contractor)', 'Home_Services', 'Construction', 'Outdoor'),
|
||||
(r'\b(drywall|sheetrock)\b', 'Home_Services', 'Construction', 'Drywall'),
|
||||
(r'\b(flooring|carpet|tile|hardwood)\b.*(install|contractor|company)(?!.*store)', 'Home_Services', 'Flooring', 'Installation'),
|
||||
(r'\b(window|door)\b.*(install|replacement|contractor)', 'Home_Services', 'Windows & Doors', 'Installation'),
|
||||
(r'\b(glass)\b.*(repair|replacement|company)(?!.*auto)', 'Home_Services', 'Windows & Doors', 'Glass'),
|
||||
(r'\b(chimney)\b.*(sweep|cleaning|repair)', 'Home_Services', 'General Repair', 'Chimney'),
|
||||
(r'\b(septic|sewer)\b.*(service|pumping|cleaning)', 'Home_Services', 'Plumbing', 'Septic'),
|
||||
(r'\b(well)\b.*(drilling|service|pump)', 'Home_Services', 'Plumbing', 'Wells'),
|
||||
(r'\b(solar)\b.*(install|contractor|company)', 'Home_Services', 'Electrical', 'Solar'),
|
||||
|
||||
# ==================== RETAIL & SHOPPING ====================
|
||||
(r'\b(antique|vintage|secondhand|thrift|consignment|pawn)\b.*(shop|store)', 'Retail_Shopping', 'Secondhand & Vintage', 'Stores'),
|
||||
(r'\b(auction)\b.*(house|company)', 'Retail_Shopping', 'Secondhand & Vintage', 'Auctions'),
|
||||
(r'\b(art|craft|hobby)\b.*(supply|store|shop)', 'Retail_Shopping', 'Arts & Crafts', 'Supplies'),
|
||||
(r'\b(toy|game|hobby)\b.*(store|shop)', 'Retail_Shopping', 'Toys & Hobbies', 'Stores'),
|
||||
(r'\b(book|stationery|office supply)\b.*(store|shop)', 'Retail_Shopping', 'Books & Office', 'Stores'),
|
||||
(r'\b(music|instrument|record|vinyl)\b.*(store|shop)', 'Retail_Shopping', 'Music & Entertainment', 'Stores'),
|
||||
(r'\b(sporting|sports|outdoor|camping|fishing|hunting)\b.*(goods|store|shop)', 'Retail_Shopping', 'Sports & Outdoors', 'Stores'),
|
||||
(r'\b(electronics|computer|phone|appliance)\b.*(store|shop|retailer)', 'Retail_Shopping', 'Electronics', 'Stores'),
|
||||
(r'\b(furniture|home decor|bedding|mattress)\b.*(store|shop)', 'Retail_Shopping', 'Home & Garden', 'Stores'),
|
||||
(r'\b(clothing|fashion|apparel|boutique|shoe)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Stores'),
|
||||
(r'\b(jewelry|watch|gem)\b.*(store|shop)', 'Retail_Shopping', 'Jewelry & Watches', 'Stores'),
|
||||
(r'\b(hardware|tool|building supply|lumber)\b.*(store|shop)', 'Retail_Shopping', 'Hardware & Building', 'Stores'),
|
||||
(r'\b(garden|nursery|plant)\b.*(center|store|shop)', 'Retail_Shopping', 'Home & Garden', 'Garden_Centers'),
|
||||
(r'\b(pharmacy|drugstore)\b', 'Retail_Shopping', 'Health & Pharmacy', 'Pharmacies'),
|
||||
(r'\b(cosmetic|beauty|makeup)\b.*(store|shop)', 'Retail_Shopping', 'Beauty & Cosmetics', 'Stores'),
|
||||
(r'\b(grocery|supermarket|food|convenience)\b.*(store|market|shop)', 'Retail_Shopping', 'Food & Grocery', 'Stores'),
|
||||
(r'\b(liquor|wine|beer|alcohol)\b.*(store|shop)', 'Retail_Shopping', 'Food & Grocery', 'Liquor'),
|
||||
(r'\b(tobacco|cigar|vape|smoke)\b.*(shop|store)', 'Retail_Shopping', 'Specialty Retail', 'Tobacco'),
|
||||
(r'\b(mobile phone|cell phone)\b.*(store|shop|dealer)', 'Retail_Shopping', 'Electronics', 'Phones'),
|
||||
(r'\b(optical|eyewear|glasses|sunglass)\b.*(store|shop)', 'Retail_Shopping', 'Health & Pharmacy', 'Optical'),
|
||||
(r'\b(florist|flower)\b.*(shop|store)', 'Events_Weddings', 'Florists', 'Shops'),
|
||||
(r'\b(bridal|wedding)\b.*(shop|store|boutique)', 'Events_Weddings', 'Attire', 'Bridal'),
|
||||
(r'\b(uniform|workwear)\b.*(store|shop)', 'Retail_Shopping', 'Clothing & Fashion', 'Specialty'),
|
||||
|
||||
# ==================== PROFESSIONAL SERVICES ====================
|
||||
(r'\b(lawyer|attorney|law firm|legal)\b.*(office|firm|service)', 'Professional_Services', 'Legal', 'Firms'),
|
||||
(r'\b(accountant|accounting|bookkeep|tax)\b.*(firm|service|office)(?!.*government)', 'Professional_Services', 'Financial Services', 'Accounting'),
|
||||
(r'\b(architect|architecture)\b.*(firm|office|studio)', 'Professional_Services', 'Engineering', 'Architecture'),
|
||||
(r'\b(engineer|engineering)\b.*(firm|office|company)', 'Professional_Services', 'Engineering', 'Firms'),
|
||||
(r'\b(surveyor|surveying|land survey)\b', 'Professional_Services', 'Engineering', 'Surveying'),
|
||||
(r'\b(consultant|consulting)\b.*(firm|company|service)', 'Professional_Services', 'Consulting', 'General'),
|
||||
(r'\b(marketing|advertising|pr|public relations)\b.*(agency|firm|company)', 'Professional_Services', 'Marketing & Advertising', 'Agencies'),
|
||||
(r'\b(graphic|web|design)\b.*(studio|agency|firm)', 'Professional_Services', 'Creative Services', 'Design'),
|
||||
(r'\b(photography|photographer|video|videograph)\b.*(studio|service)', 'Professional_Services', 'Creative Services', 'Photography'),
|
||||
(r'\b(translation|interpreter|language)\b.*service', 'Professional_Services', 'Language Services', 'Translation'),
|
||||
(r'\b(staffing|recruiting|employment|hr)\b.*(agency|service|firm)', 'Professional_Services', 'HR Services', 'Agencies'),
|
||||
(r'\b(notary|notarial)\b', 'Professional_Services', 'Legal', 'Notary'),
|
||||
(r'\b(private investigator|detective)\b', 'Professional_Services', 'Agencies', 'Investigation'),
|
||||
(r'\b(appraiser|appraisal|valuation)\b', 'Professional_Services', 'Financial Services', 'Appraisal'),
|
||||
(r'\b(auditor|audit)\b.*(firm|service)', 'Professional_Services', 'Financial Services', 'Audit'),
|
||||
(r'\b(courier|messenger|delivery)\b.*service', 'Transportation', 'Delivery', 'Courier'),
|
||||
|
||||
# ==================== ARTS & CULTURE ====================
|
||||
(r'\b(art|gallery|exhibition)\b(?!.*supply|.*store|.*school)', 'Entertainment', 'Arts', 'Galleries'),
|
||||
(r'\b(museum)\b', 'Entertainment', 'Museums', 'General'),
|
||||
(r'\b(theater|theatre|playhouse|opera house)\b', 'Entertainment', 'Performing Arts', 'Venues'),
|
||||
(r'\b(cinema|movie theater|multiplex)\b', 'Entertainment', 'Movies', 'Theaters'),
|
||||
(r'\b(concert|music)\b.*(hall|venue)', 'Entertainment', 'Music Venues', 'Concert_Halls'),
|
||||
(r'\b(band|orchestra|choir|ensemble)\b', 'Entertainment', 'Performing Arts', 'Groups'),
|
||||
(r'\b(comedian|comedy club)\b', 'Entertainment', 'Performing Arts', 'Comedy'),
|
||||
(r'\b(artist|sculptor|painter)\b(?!.*makeup)', 'Entertainment', 'Arts', 'Artists'),
|
||||
(r'\b(animation|animator)\b.*(studio|company)', 'Professional_Services', 'Creative Services', 'Animation'),
|
||||
(r'\b(recording|music)\b.*studio', 'Professional_Services', 'Creative Services', 'Recording'),
|
||||
(r'\b(art restoration|restoration service)\b', 'Professional_Services', 'Creative Services', 'Restoration'),
|
||||
|
||||
# ==================== ENTERTAINMENT & RECREATION ====================
|
||||
(r'\b(amusement|theme)\b.*park', 'Entertainment', 'Amusement', 'Parks'),
|
||||
(r'\b(arcade|game center|gaming)\b', 'Entertainment', 'Games & Recreation', 'Arcades'),
|
||||
(r'\b(escape room|puzzle room)\b', 'Entertainment', 'Games & Recreation', 'Escape_Rooms'),
|
||||
(r'\b(bowling)\b.*(alley|center)', 'Entertainment', 'Games & Recreation', 'Bowling'),
|
||||
(r'\b(billiard|pool hall|snooker)\b', 'Entertainment', 'Games & Recreation', 'Billiards'),
|
||||
(r'\b(karaoke)\b', 'Entertainment', 'Music Venues', 'Karaoke'),
|
||||
(r'\b(casino|gambling|betting)\b', 'Entertainment', 'Gambling', 'Casinos'),
|
||||
(r'\b(nightclub|disco|club)\b(?!.*golf|.*country|.*tennis)', 'Food_Dining', 'Bars & Nightlife', 'Nightclubs'),
|
||||
(r'\b(country club|private club|social club)\b', 'Entertainment', 'Social', 'Clubs'),
|
||||
(r'\b(botanical garden|arboretum)\b', 'Entertainment', 'Parks', 'Gardens'),
|
||||
(r'\b(park|playground|recreation area)\b(?!.*theme|.*water|.*trailer|.*mobile)', 'Entertainment', 'Parks', 'Public'),
|
||||
(r'\b(beach|waterfront|marina)\b(?!.*hotel)', 'Entertainment', 'Parks', 'Beaches'),
|
||||
(r'\b(campground|camping|rv park|caravan)\b', 'Hospitality_Travel', 'Lodging', 'Camping'),
|
||||
(r'\b(go.?kart|kart|karting)\b', 'Entertainment', 'Games & Recreation', 'Karting'),
|
||||
(r'\b(laser tag|paintball)\b', 'Entertainment', 'Games & Recreation', 'Adventure'),
|
||||
(r'\b(trampoline|bounce|jump)\b.*(park|center)', 'Entertainment', 'Games & Recreation', 'Trampoline'),
|
||||
(r'\b(mini golf|miniature golf|putt.?putt)\b', 'Entertainment', 'Games & Recreation', 'Mini_Golf'),
|
||||
(r'\b(water park|aqua park)\b', 'Entertainment', 'Amusement', 'Water_Parks'),
|
||||
(r'\b(haunted|horror)\b.*(house|attraction)', 'Entertainment', 'Amusement', 'Attractions'),
|
||||
(r'\b(circus|carnival|fair)\b', 'Entertainment', 'Amusement', 'Shows'),
|
||||
(r'\b(planetarium|observatory)\b', 'Entertainment', 'Museums', 'Science'),
|
||||
|
||||
# ==================== FOOD & DINING ====================
|
||||
(r'\b(bar|pub|tavern|lounge|brewery|taproom|brewpub)\b(?!.*brow|.*eyebrow)', 'Food_Dining', 'Bars & Nightlife', 'Bars'),
|
||||
(r'\b(cafe|coffee|espresso)\b.*(shop|house|bar)', 'Food_Dining', 'Cafes & Coffee', 'Cafes'),
|
||||
(r'\b(restaurant|eatery|diner|bistro|brasserie|grill)\b', 'Food_Dining', 'Restaurants', 'General'),
|
||||
(r'\b(bakery|patisserie|pastry)\b', 'Food_Dining', 'Bakeries & Desserts', 'Bakeries'),
|
||||
(r'\b(ice cream|gelato|frozen yogurt|dessert)\b.*(shop|parlor|store)', 'Food_Dining', 'Bakeries & Desserts', 'Desserts'),
|
||||
(r'\b(caterer|catering)\b', 'Food_Dining', 'Food Services', 'Catering'),
|
||||
(r'\b(food truck|food cart)\b', 'Food_Dining', 'Quick Service', 'Mobile'),
|
||||
(r'\b(juice|smoothie)\b.*(bar|shop)', 'Food_Dining', 'Cafes & Coffee', 'Juice'),
|
||||
(r'\b(tea|bubble tea|boba)\b.*(shop|house|room)', 'Food_Dining', 'Cafes & Coffee', 'Tea'),
|
||||
(r'\b(winery|vineyard|wine)\b.*(tasting|cellar)', 'Food_Dining', 'Beverage Production', 'Wineries'),
|
||||
(r'\b(distillery|spirit)\b', 'Food_Dining', 'Beverage Production', 'Distilleries'),
|
||||
(r'\b(butcher|meat)\b.*shop', 'Retail_Shopping', 'Food & Grocery', 'Butchers'),
|
||||
(r'\b(fish|seafood)\b.*market', 'Retail_Shopping', 'Food & Grocery', 'Seafood'),
|
||||
(r'\b(deli|delicatessen)\b', 'Retail_Shopping', 'Food & Grocery', 'Delis'),
|
||||
(r'\b(candy|chocolate|sweet|confection)\b.*(shop|store)', 'Retail_Shopping', 'Food & Grocery', 'Confectionery'),
|
||||
|
||||
# ==================== PERSONAL SERVICES ====================
|
||||
(r'\b(barber|hair)\b.*(shop|salon|stylist)', 'Personal_Services', 'Hair Care', 'Salons'),
|
||||
(r'\b(beauty|nail|manicure|pedicure)\b.*(salon|spa|studio)', 'Personal_Services', 'Spa & Wellness', 'Beauty'),
|
||||
(r'\b(tattoo|piercing|body art)\b.*(shop|studio|parlor)', 'Personal_Services', 'Body Art', 'Studios'),
|
||||
(r'\b(massage)\b.*(therapist|spa|parlor|studio)', 'Personal_Services', 'Massage', 'Studios'),
|
||||
(r'\b(spa|wellness|day spa)\b', 'Personal_Services', 'Spa & Wellness', 'Spas'),
|
||||
(r'\b(tanning|sunbed)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Tanning'),
|
||||
(r'\b(laundry|laundromat|dry clean|tailor|alteration|seamstress)\b', 'Personal_Services', 'Laundry', 'Services'),
|
||||
(r'\b(shoe repair|cobbler)\b', 'Personal_Services', 'Clothing Care', 'Shoe_Repair'),
|
||||
(r'\b(brow|eyebrow|lash|eyelash)\b.*(bar|salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Brows_Lashes'),
|
||||
(r'\b(makeup artist|stylist)\b', 'Personal_Services', 'Spa & Wellness', 'Makeup'),
|
||||
(r'\b(sauna|steam room|bathhouse|hammam)\b', 'Personal_Services', 'Spa & Wellness', 'Baths'),
|
||||
(r'\b(waxing)\b.*(salon|studio)', 'Personal_Services', 'Spa & Wellness', 'Waxing'),
|
||||
|
||||
# ==================== HOSPITALITY & TRAVEL ====================
|
||||
(r'\b(hotel|motel|inn|resort|hostel|lodge|bed and breakfast|b&b|guesthouse)\b', 'Hospitality_Travel', 'Lodging', 'Hotels'),
|
||||
(r'\b(travel|tour)\b.*(agency|operator|company)', 'Hospitality_Travel', 'Travel Services', 'Agencies'),
|
||||
(r'\b(airline|airport|aviation)\b(?!.*school)', 'Transportation', 'Passenger', 'Air'),
|
||||
(r'\b(cruise|ferry)\b.*(line|terminal|port)', 'Transportation', 'Passenger', 'Water'),
|
||||
(r'\b(train|rail)\b.*(station|service)', 'Transportation', 'Passenger', 'Rail'),
|
||||
(r'\b(bus|coach)\b.*(station|terminal|service|company)', 'Transportation', 'Passenger', 'Bus'),
|
||||
(r'\b(taxi|cab|ride|uber|lyft|limo|limousine|chauffeur)\b.*(service|company|stand)', 'Transportation', 'Passenger', 'Taxi'),
|
||||
(r'\b(tourist|visitor)\b.*(information|center|bureau)', 'Hospitality_Travel', 'Travel Services', 'Information'),
|
||||
(r'\b(rental)\b.*\b(cabin|cottage|vacation|holiday)\b', 'Hospitality_Travel', 'Lodging', 'Rentals'),
|
||||
|
||||
# ==================== INDUSTRIAL & MANUFACTURING ====================
|
||||
(r'\b(factory|plant|mill|manufacturing)\b', 'Industrial', 'Manufacturing', 'General'),
|
||||
(r'\b(warehouse|distribution|logistics)\b.*(center|facility)', 'Transportation', 'Logistics', 'Warehouses'),
|
||||
(r'\b(machine|machinist|metalwork|welding|welder)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||
(r'\b(print|printing|press)\b.*(shop|company|service)', 'Industrial', 'Manufacturing', 'Printing'),
|
||||
(r'\b(textile|fabric|garment)\b.*(factory|mill|manufacturer)', 'Industrial', 'Manufacturing', 'Textile'),
|
||||
(r'\b(chemical|pharmaceutical)\b.*(company|manufacturer|plant)', 'Industrial', 'Manufacturing', 'Chemical'),
|
||||
(r'\b(construction|building)\b.*(company|contractor|firm)', 'Industrial', 'Construction', 'General'),
|
||||
(r'\b(quarry|gravel|sand|aggregate)\b', 'Industrial', 'Mining', 'Quarries'),
|
||||
(r'\b(sawmill|lumber)\b.*(mill|yard)', 'Industrial', 'Manufacturing', 'Wood'),
|
||||
(r'\b(steel|iron|aluminum)\b.*(plant|manufacturer|company)', 'Industrial', 'Manufacturing', 'Metal'),
|
||||
(r'\b(packaging|container)\b.*(company|manufacturer)', 'Industrial', 'Manufacturing', 'Packaging'),
|
||||
(r'\b(recycling|waste)\b.*(center|facility|company)', 'Industrial', 'Manufacturing', 'Recycling'),
|
||||
|
||||
# ==================== REAL ESTATE ====================
|
||||
(r'\b(real estate|realtor|property)\b.*(agent|agency|company)', 'Real_Estate', 'Agencies', 'Agents'),
|
||||
(r'\b(property management|apartment|rental)\b.*(company|agency)', 'Real_Estate', 'Management', 'Residential'),
|
||||
(r'\b(storage|self storage|mini storage)\b.*(facility|unit)', 'Real_Estate', 'Storage', 'Self_Storage'),
|
||||
(r'\b(office|commercial)\b.*(space|building|complex)', 'Real_Estate', 'Commercial', 'Office'),
|
||||
(r'\b(apartment|condo|housing)\b.*(complex|building|community)', 'Real_Estate', 'Residential', 'Apartments'),
|
||||
(r'\b(home builder|housing development)\b', 'Real_Estate', 'Development', 'Residential'),
|
||||
|
||||
# ==================== NON-PROFIT & COMMUNITY ====================
|
||||
(r'\b(charity|charitable|foundation|fund)\b(?!.*investment)', 'Non_Profit', 'Charities', 'General'),
|
||||
(r'\b(non.?profit|ngo|association)\b', 'Non_Profit', 'General', 'Organizations'),
|
||||
(r'\b(community|civic|neighborhood)\b.*(center|organization|association)', 'Non_Profit', 'Community', 'Centers'),
|
||||
(r'\b(youth|boys|girls|scout)\b.*(club|organization|center)', 'Non_Profit', 'Community', 'Youth'),
|
||||
(r'\b(senior|elder)\b.*(center|club)(?!.*care)', 'Non_Profit', 'Community', 'Seniors'),
|
||||
(r'\b(veterans|vfw|american legion)\b', 'Non_Profit', 'Community', 'Veterans'),
|
||||
(r'\b(rotary|lions|kiwanis|elks|freemason|lodge)\b', 'Non_Profit', 'Community', 'Fraternal'),
|
||||
(r'\b(union|labor)\b.*(hall|organization)', 'Non_Profit', 'Professional', 'Unions'),
|
||||
(r'\b(chamber of commerce|business association)\b', 'Non_Profit', 'Professional', 'Business'),
|
||||
(r'\b(aboriginal|indigenous|tribal)\b.*(organization|center)', 'Non_Profit', 'Community', 'Indigenous'),
|
||||
|
||||
# ==================== TECHNOLOGY ====================
|
||||
(r'\b(software|app|web)\b.*(developer|development|company)', 'Technology', 'Software', 'Development'),
|
||||
(r'\b(it|computer|tech)\b.*(service|support|repair)', 'Technology', 'IT Services', 'Support'),
|
||||
(r'\b(data center|server|cloud)\b', 'Technology', 'Infrastructure', 'Data_Centers'),
|
||||
(r'\b(internet|isp|broadband|telecom)\b.*(provider|service|company)', 'Technology', 'Telecommunications', 'Providers'),
|
||||
(r'\b(bpo|call center|outsourc)\b', 'Technology', 'IT Services', 'BPO'),
|
||||
(r'\b(automation|robot)\b.*(company|service)', 'Technology', 'Software', 'Automation'),
|
||||
|
||||
# ==================== FINANCE & INSURANCE ====================
|
||||
(r'\b(bank|credit union|savings)\b(?!.*blood|.*food)', 'Finance_Insurance', 'Banking', 'Banks'),
|
||||
(r'\b(atm|cash machine)\b', 'Finance_Insurance', 'Banking', 'ATMs'),
|
||||
(r'\b(insurance)\b.*(agent|agency|company|broker)', 'Finance_Insurance', 'Insurance', 'Agents'),
|
||||
(r'\b(mortgage|loan|lending|finance)\b.*(company|broker|service)', 'Finance_Insurance', 'Lending', 'Lenders'),
|
||||
(r'\b(investment|wealth|portfolio|financial advisor)\b', 'Finance_Insurance', 'Investment', 'Advisors'),
|
||||
(r'\b(money transfer|remittance|western union|moneygram)\b', 'Finance_Insurance', 'Money Services', 'Transfer'),
|
||||
(r'\b(currency exchange|forex)\b', 'Finance_Insurance', 'Money Services', 'Exchange'),
|
||||
(r'\b(bail bond)\b', 'Professional_Services', 'Legal', 'Bail'),
|
||||
|
||||
# ==================== EVENTS & WEDDINGS ====================
|
||||
(r'\b(funeral|mortuary|cremation|cemetery|memorial)\b', 'Events_Weddings', 'Memorial', 'Funeral'),
|
||||
(r'\b(event|party|wedding)\b.*(planner|planning|coordinator)', 'Events_Weddings', 'Planning', 'Planners'),
|
||||
(r'\b(banquet|event|reception|wedding)\b.*(hall|venue|center)', 'Events_Weddings', 'Venues', 'Halls'),
|
||||
(r'\b(dj|disc jockey|entertainment)\b.*service', 'Events_Weddings', 'Services', 'Entertainment'),
|
||||
(r'\b(balloon|party supply|decoration)\b', 'Events_Weddings', 'Services', 'Decorations'),
|
||||
(r'\b(tent|equipment)\b.*rental(?!.*car|.*truck)', 'Events_Weddings', 'Rentals', 'Equipment'),
|
||||
(r'\b(photo booth|photobooth)\b', 'Events_Weddings', 'Services', 'Photography'),
|
||||
|
||||
# ==================== AGRICULTURE ====================
|
||||
(r'\b(farm|ranch|orchard|vineyard)\b(?!.*winery)', 'Agriculture', 'Farming', 'Farms'),
|
||||
(r'\b(agriculture|farming|crop)\b.*(service|supply|equipment)', 'Agriculture', 'Services', 'Supplies'),
|
||||
(r'\b(livestock|cattle|poultry|dairy)\b', 'Agriculture', 'Farming', 'Livestock'),
|
||||
(r'\b(nursery|greenhouse|horticulture)\b.*(wholesale|grower)', 'Agriculture', 'Farming', 'Horticulture'),
|
||||
(r'\b(agistment|horse boarding|stable)\b', 'Agriculture', 'Services', 'Equine'),
|
||||
(r'\b(veterinarian|vet)\b.*(livestock|farm|large animal)', 'Agriculture', 'Services', 'Veterinary'),
|
||||
|
||||
# ==================== TRANSPORTATION ====================
|
||||
(r'\b(shipping|freight|cargo|trucking)\b.*(company|service)', 'Transportation', 'Logistics', 'Shipping'),
|
||||
(r'\b(courier|messenger|express)\b.*(service|delivery)', 'Transportation', 'Delivery', 'Courier'),
|
||||
(r'\b(airport|airfield|airstrip|heliport)\b', 'Transportation', 'Passenger', 'Airports'),
|
||||
(r'\b(port|harbor|dock|pier|marina)\b(?!.*wine)', 'Transportation', 'Logistics', 'Ports'),
|
||||
(r'\b(parking)\b.*(lot|garage|structure)', 'Automotive', 'Parking', 'Lots'),
|
||||
(r'\b(towing|tow truck)\b', 'Transportation', 'Vehicle Services', 'Towing'),
|
||||
]
|
||||
|
||||
def categorize_item(name):
    """
    Categorize a single item by matching its name against CATEGORIZATION_RULES.

    Rules are tried in list order and the first pattern found anywhere in the
    lower-cased name wins, so earlier rules take precedence over later ones.

    Args:
        name: Display name of the item. May be None (presumably when the
            source row has no name); such items are treated as unmatched.

    Returns:
        (sector, business_type, sub_category) of the first matching rule,
        or None if name is None or no rule matches.
    """
    if name is None:
        # A missing name cannot match any rule; report "no match" instead of
        # raising AttributeError on None.lower().
        return None

    name_lower = name.lower()

    for pattern, sector, btype, subcat in CATEGORIZATION_RULES:
        # IGNORECASE is kept so a rule written with upper-case literals still
        # matches the lower-cased name.
        if re.search(pattern, name_lower, re.IGNORECASE):
            return (sector, btype, subcat)

    return None
|
||||
|
||||
def get_existing_paths(cursor):
    """Return the set of all category paths (as text) stored in gbp_categories."""
    cursor.execute("SELECT path::text FROM gbp_categories")

    # Each fetched row has a single column: the path rendered as text.
    known_paths = set()
    for record in cursor.fetchall():
        known_paths.add(record[0])
    return known_paths
|
||||
|
||||
def _ensure_category(cursor, name, slug, path, level, parent_path,
                     existing_paths, label):
    """
    Make sure the category at `path` exists, inserting it under `parent_path`
    when it is not already known.

    Args:
        cursor: Open database cursor.
        name: Display name for a newly created category.
        slug: Slug for a newly created category.
        path: Full ltree path of the category.
        level: Hierarchy level stored on a newly created row (2 or 3 here).
        parent_path: ltree path of the parent row; the INSERT selects the
            parent's id from it.
        existing_paths: Set of known paths; updated in place when the path is
            created or found.
        label: Human-readable kind used in log output.

    Returns:
        True if the path exists after the call, False otherwise.
    """
    if path in existing_paths:
        return True

    cursor.execute("""
        INSERT INTO gbp_categories (name, slug, path, level, parent_id, category_count)
        SELECT %s, %s, %s::ltree, %s, id, 0
        FROM gbp_categories WHERE path = %s::ltree
        ON CONFLICT (path) DO NOTHING
        RETURNING id
    """, (name, slug, path, level, parent_path))

    if cursor.fetchone():
        existing_paths.add(path)
        print(f" [NEW] Created {label}: {path}")
        return True

    # No row came back. Either the path already existed (ON CONFLICT DO
    # NOTHING) or the parent row was not found so nothing was inserted.
    # Only the first case is a success, so check which one happened.
    cursor.execute(
        "SELECT 1 FROM gbp_categories WHERE path = %s::ltree", (path,)
    )
    if cursor.fetchone():
        existing_paths.add(path)
        return True

    print(f" [SKIP] Could not create {label}: {path}")
    return False


def get_or_create_path(cursor, sector, btype, subcat, existing_paths):
    """
    Get or create the full category path for (sector, btype, subcat).

    New sectors (level 1) are never created. Missing business types (level 2)
    and sub-categories (level 3) are inserted on demand.

    NOTE(review): the module header says level 2 should only be created when
    more than 10 items would use it and level 3 when more than 5 would. This
    function creates them unconditionally; confirm whether the caller is
    expected to enforce those thresholds.

    Args:
        cursor: Open database cursor.
        sector: Level 1 category name.
        btype: Level 2 category name.
        subcat: Level 3 category name.
        existing_paths: Set of known paths; updated in place as paths are
            created or discovered.

    Returns:
        The level 3 path to use as the item's parent, or None when the sector
        does not exist or a required level could not be created.
    """
    sector_slug = slugify(sector)
    btype_slug = slugify(btype)
    subcat_slug = slugify(subcat)

    # Level 1: Sector - must already exist, never created here.
    sector_path = sector_slug
    if sector_path not in existing_paths:
        print(f" [SKIP] Would need new sector: {sector_path}")
        return None

    # Level 2: Business Type
    btype_path = f"{sector_path}.{btype_slug}"
    if not _ensure_category(cursor, btype, btype_slug, btype_path, 2,
                            sector_path, existing_paths, "business type"):
        return None

    # Level 3: Sub-category
    subcat_path = f"{btype_path}.{subcat_slug}"
    if not _ensure_category(cursor, subcat, subcat_slug, subcat_path, 3,
                            btype_path, existing_paths, "sub-category"):
        return None

    return subcat_path
|
||||
|
||||
def _print_preview(categorized, uncategorized, category_counts):
    """Print the dry-run summary shown before the confirmation prompt."""
    print(f"Categorized: {len(categorized)}")
    print(f"Still uncategorized: {len(uncategorized)}")
    print()

    # Show category distribution (30 largest target categories).
    print("Category distribution:")
    top_targets = sorted(category_counts.items(), key=lambda x: -x[1])[:30]
    for (sector, btype, subcat), count in top_targets:
        print(f" {sector}.{btype}.{subcat}: {count}")
    print()

    # Show some uncategorized items.
    print("Sample uncategorized items (first 50):")
    for _item_id, name in uncategorized[:50]:
        print(f" - {name}")
    print()


def _refresh_category_counts(cursor):
    """Recompute category_count as the number of direct children per row."""
    cursor.execute("""
        WITH counts AS (
            SELECT
                parent_id,
                COUNT(*) as cnt
            FROM gbp_categories
            WHERE parent_id IS NOT NULL
            GROUP BY parent_id
        )
        UPDATE gbp_categories g
        SET category_count = c.cnt
        FROM counts c
        WHERE g.id = c.parent_id
    """)

    # The join above only touches rows that still have children. A parent
    # that lost all of its children (e.g. after items were moved out of it)
    # would otherwise keep its stale count, so reset those explicitly.
    # Level 4 rows are the items this script moves; their count is left
    # untouched, exactly as the join above leaves it.
    cursor.execute("""
        UPDATE gbp_categories g
        SET category_count = 0
        WHERE g.level < 4
          AND g.category_count IS DISTINCT FROM 0
          AND NOT EXISTS (
              SELECT 1 FROM gbp_categories c WHERE c.parent_id = g.id
          )
    """)


def main():
    """
    Interactively move level 4 items out of Other.Uncategorized.

    Steps: load the items, match each name with categorize_item(), print a
    preview, ask for confirmation, then re-parent the matched items, refresh
    category_count and commit. All writes happen in one transaction; any
    error rolls it back, and the connection is always closed.

    NOTE(review): the module header describes minimum item counts for
    creating new level 2 / level 3 categories; category_counts is only used
    for the preview here and no threshold is enforced. Confirm intent.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cursor = conn.cursor()

        # Get all items in Other.Uncategorized
        cursor.execute("""
            SELECT id, name, slug
            FROM gbp_categories
            WHERE path ~ 'Other.Uncategorized.*' AND level = 4
            ORDER BY name
        """)
        other_items = cursor.fetchall()
        print(f"Found {len(other_items)} items in Other.Uncategorized\n")

        # Get existing paths
        existing_paths = get_existing_paths(cursor)

        # Categorize items
        categorized = []
        uncategorized = []
        category_counts = defaultdict(int)

        for item_id, name, slug in other_items:
            result = categorize_item(name)
            if result:
                sector, btype, subcat = result
                categorized.append((item_id, name, slug, sector, btype, subcat))
                category_counts[(sector, btype, subcat)] += 1
            else:
                uncategorized.append((item_id, name))

        _print_preview(categorized, uncategorized, category_counts)

        # Ask for confirmation; surrounding whitespace is ignored.
        response = input("Proceed with database updates? (yes/no): ")
        if response.strip().lower() != 'yes':
            print("Aborted.")
            return

        # Update database
        updated = 0
        for item_id, name, slug, sector, btype, subcat in categorized:
            parent_path = get_or_create_path(
                cursor, sector, btype, subcat, existing_paths
            )
            if not parent_path:
                continue

            new_path = f"{parent_path}.{slug}"
            # Join against the parent row instead of using a scalar subquery:
            # if the parent does not exist the statement matches nothing,
            # rather than moving the item and leaving parent_id NULL.
            cursor.execute("""
                UPDATE gbp_categories AS g
                SET path = %s::ltree,
                    parent_id = p.id
                FROM gbp_categories AS p
                WHERE g.id = %s
                  AND p.path = %s::ltree
            """, (new_path, item_id, parent_path))
            # Count only rows that were actually changed.
            updated += cursor.rowcount

        # Update category counts
        _refresh_category_counts(cursor)

        conn.commit()
        print(f"\nUpdated {updated} items")

        # Show final stats
        cursor.execute("""
            SELECT path, name, category_count
            FROM gbp_categories
            WHERE level = 1
            ORDER BY category_count DESC
        """)
        print("\nFinal sector counts:")
        for _path, name, count in cursor.fetchall():
            print(f" {name}: {count}")
    except Exception:
        # Discard any partial, uncommitted changes before propagating.
        conn.rollback()
        raise
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: run the interactive recategorization only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user