-- ============================================================================= -- Migration: 014_primitive_classification_system.sql -- Purpose: Create primitive classification system for context-aware review analysis -- ============================================================================= -- -- This migration introduces a "primitive" classification system that allows -- industry-specific and category-specific configuration of what aspects to -- look for when classifying reviews. -- -- Components: -- 1. pipeline.primitives - Frozen dictionary of primitives (quality dimensions) -- 2. ALTER public.gbp_categories - Add primitive_configs and business_context -- 3. pipeline.jsonb_deep_merge() - Recursive JSONB merge function -- 4. pipeline.resolve_primitive_config() - Resolve configs through category tree -- 5. pipeline.get_classification_context() - Get full classification context -- -- Date: 2026-01-31 -- ============================================================================= -- ============================================================================= -- SECTION 1: PRIMITIVES TABLE (Frozen Dictionary) -- ============================================================================= CREATE TABLE IF NOT EXISTS pipeline.primitives ( code VARCHAR(30) PRIMARY KEY, dimension VARCHAR(20) NOT NULL, -- quality, service, process, environment, value, trust, resolution, loyalty, escape name VARCHAR(100) NOT NULL, definition TEXT NOT NULL, is_meta BOOLEAN DEFAULT FALSE, -- true for always-active primitives (HONESTY, ETHICS, etc.) base_positive_signals TEXT[], base_negative_signals TEXT[], created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() ); -- Index on dimension for filtering CREATE INDEX IF NOT EXISTS idx_primitives_dimension ON pipeline.primitives(dimension); -- Index on is_meta for quick access to always-active primitives CREATE INDEX IF NOT EXISTS idx_primitives_is_meta ON pipeline.primitives(is_meta) WHERE is_meta = TRUE; COMMENT ON TABLE pipeline.primitives IS 'Frozen dictionary of classification primitives (quality dimensions)'; COMMENT ON COLUMN pipeline.primitives.code IS 'Unique identifier for the primitive (e.g., FOOD_TASTE, SERVICE_SPEED)'; COMMENT ON COLUMN pipeline.primitives.dimension IS 'Category of the primitive (quality, service, process, etc.)'; COMMENT ON COLUMN pipeline.primitives.is_meta IS 'If true, this primitive is always active regardless of category config'; COMMENT ON COLUMN pipeline.primitives.base_positive_signals IS 'Default positive signal keywords for this primitive'; COMMENT ON COLUMN pipeline.primitives.base_negative_signals IS 'Default negative signal keywords for this primitive'; -- ============================================================================= -- SECTION 2: ALTER gbp_categories TABLE -- ============================================================================= -- Add primitive_configs column (JSONB for flexible config) ALTER TABLE public.gbp_categories ADD COLUMN IF NOT EXISTS primitive_configs JSONB DEFAULT '{}'; -- Add business_context column (JSONB for industry-specific context) ALTER TABLE public.gbp_categories ADD COLUMN IF NOT EXISTS business_context JSONB DEFAULT '{}'; -- Add config versioning columns ALTER TABLE public.gbp_categories ADD COLUMN IF NOT EXISTS config_version VARCHAR(20); ALTER TABLE public.gbp_categories ADD COLUMN IF NOT EXISTS config_generated_by VARCHAR(100); ALTER TABLE public.gbp_categories ADD COLUMN IF NOT EXISTS config_updated_at TIMESTAMP WITH TIME ZONE; -- GIN indexes for JSONB containment queries CREATE INDEX IF NOT EXISTS idx_gbp_categories_primitive_configs ON public.gbp_categories USING GIN (primitive_configs); CREATE INDEX IF NOT EXISTS idx_gbp_categories_business_context ON public.gbp_categories USING GIN (business_context); -- Index for config version lookups CREATE INDEX IF NOT EXISTS idx_gbp_categories_config_version ON public.gbp_categories(config_version) WHERE config_version IS NOT NULL; COMMENT ON COLUMN public.gbp_categories.primitive_configs IS 'JSONB config for primitives at this category level (inherits from ancestors)'; COMMENT ON COLUMN public.gbp_categories.business_context IS 'JSONB business context for this category (industry-specific terminology, etc.)'; COMMENT ON COLUMN public.gbp_categories.config_version IS 'Version of the primitive config (for cache invalidation)'; COMMENT ON COLUMN public.gbp_categories.config_generated_by IS 'Tool/model that generated this config'; COMMENT ON COLUMN public.gbp_categories.config_updated_at IS 'When the config was last updated'; -- ============================================================================= -- SECTION 3: JSONB DEEP MERGE FUNCTION -- ============================================================================= -- Recursive JSONB merge function -- - Objects: recursively merge (override wins on conflicts) -- - Arrays: union with dedup by default -- - If object has "__replace__": true, replace entirely instead of merge CREATE OR REPLACE FUNCTION pipeline.jsonb_deep_merge( base JSONB, override JSONB ) RETURNS JSONB AS $$ DECLARE result JSONB; key TEXT; base_value JSONB; override_value JSONB; merged_array JSONB; BEGIN -- Handle NULL cases IF base IS NULL THEN RETURN override; END IF; IF override IS NULL THEN RETURN base; END IF; -- If override has __replace__ flag, return override without the flag IF jsonb_typeof(override) = 'object' AND override ? '__replace__' AND (override->>'__replace__')::boolean = true THEN RETURN override - '__replace__'; END IF; -- If both are not objects, override wins IF jsonb_typeof(base) != 'object' OR jsonb_typeof(override) != 'object' THEN RETURN override; END IF; -- Both are objects, merge recursively result := base; FOR key IN SELECT jsonb_object_keys(override) LOOP override_value := override->key; IF NOT (base ? key) THEN -- Key doesn't exist in base, just add it result := result || jsonb_build_object(key, override_value); ELSE base_value := base->key; -- Check for __replace__ flag in the override value IF jsonb_typeof(override_value) = 'object' AND override_value ? '__replace__' AND (override_value->>'__replace__')::boolean = true THEN -- Replace entirely (without the __replace__ flag) result := result || jsonb_build_object(key, override_value - '__replace__'); -- If both are objects, recurse ELSIF jsonb_typeof(base_value) = 'object' AND jsonb_typeof(override_value) = 'object' THEN result := result || jsonb_build_object( key, pipeline.jsonb_deep_merge(base_value, override_value) ); -- If both are arrays, union with dedup ELSIF jsonb_typeof(base_value) = 'array' AND jsonb_typeof(override_value) = 'array' THEN -- Union arrays, remove duplicates -- Using a subquery to deduplicate SELECT jsonb_agg(DISTINCT elem) INTO merged_array FROM ( SELECT jsonb_array_elements(base_value) AS elem UNION SELECT jsonb_array_elements(override_value) AS elem ) AS combined; result := result || jsonb_build_object(key, COALESCE(merged_array, '[]'::jsonb)); -- Otherwise, override wins ELSE result := result || jsonb_build_object(key, override_value); END IF; END IF; END LOOP; RETURN result; END; $$ LANGUAGE plpgsql IMMUTABLE; COMMENT ON FUNCTION pipeline.jsonb_deep_merge(JSONB, JSONB) IS 'Recursively merges two JSONB objects. Objects are merged recursively (override wins on conflicts). Arrays are unioned with dedup. Use {"__replace__": true, ...} to replace instead of merge.'; -- ============================================================================= -- SECTION 4: RESOLVE PRIMITIVE CONFIG FUNCTION -- ============================================================================= -- Resolves primitive config by merging ancestor configs (general -> specific) CREATE OR REPLACE FUNCTION pipeline.resolve_primitive_config( p_path ltree ) RETURNS JSONB AS $$ DECLARE result JSONB := '{}'; row_config JSONB; BEGIN -- Fetch all ancestor nodes (including self), ordered by level ASC (general -> specific) -- Uses @> operator: p_path is a descendant of (or equal to) the node's path FOR row_config IN SELECT primitive_configs FROM public.gbp_categories WHERE p_path <@ path -- p_path is descendant of or equal to path ORDER BY level ASC LOOP -- Skip NULL or empty configs IF row_config IS NOT NULL AND row_config != '{}' THEN result := pipeline.jsonb_deep_merge(result, row_config); END IF; END LOOP; RETURN result; END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION pipeline.resolve_primitive_config(ltree) IS 'Resolves the full primitive config for a category path by merging all ancestor configs from general to specific.'; -- ============================================================================= -- SECTION 5: RESOLVE BUSINESS CONTEXT FUNCTION -- ============================================================================= -- Resolves business context by merging ancestor contexts (general -> specific) CREATE OR REPLACE FUNCTION pipeline.resolve_business_context( p_path ltree ) RETURNS JSONB AS $$ DECLARE result JSONB := '{}'; row_context JSONB; BEGIN -- Fetch all ancestor nodes (including self), ordered by level ASC (general -> specific) FOR row_context IN SELECT business_context FROM public.gbp_categories WHERE p_path <@ path ORDER BY level ASC LOOP -- Skip NULL or empty contexts IF row_context IS NOT NULL AND row_context != '{}' THEN result := pipeline.jsonb_deep_merge(result, row_context); END IF; END LOOP; RETURN result; END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION pipeline.resolve_business_context(ltree) IS 'Resolves the full business context for a category path by merging all ancestor contexts from general to specific.'; -- ============================================================================= -- SECTION 6: GET CLASSIFICATION CONTEXT FUNCTION -- ============================================================================= -- Returns complete classification context for a category path CREATE OR REPLACE FUNCTION pipeline.get_classification_context( p_path ltree ) RETURNS JSONB AS $$ DECLARE resolved_primitives JSONB; resolved_context JSONB; primitives_dict JSONB; BEGIN -- Resolve the primitive config for this path resolved_primitives := pipeline.resolve_primitive_config(p_path); -- Resolve the business context for this path resolved_context := pipeline.resolve_business_context(p_path); -- Build the primitives dictionary from the primitives table SELECT jsonb_object_agg( code, jsonb_build_object( 'code', code, 'dimension', dimension, 'name', name, 'definition', definition, 'is_meta', is_meta, 'base_positive_signals', COALESCE(to_jsonb(base_positive_signals), '[]'::jsonb), 'base_negative_signals', COALESCE(to_jsonb(base_negative_signals), '[]'::jsonb) ) ) INTO primitives_dict FROM pipeline.primitives; -- Handle case where primitives table is empty IF primitives_dict IS NULL THEN primitives_dict := '{}'::jsonb; END IF; -- Return combined context object RETURN jsonb_build_object( 'primitive_configs', resolved_primitives, 'business_context', resolved_context, 'primitives_dictionary', primitives_dict, 'category_path', p_path::text, 'resolved_at', NOW() ); END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION pipeline.get_classification_context(ltree) IS 'Returns complete classification context for a category path, including resolved primitive configs, business context, and the full primitives dictionary.'; -- ============================================================================= -- SECTION 7: HELPER FUNCTION - GET ACTIVE PRIMITIVES -- ============================================================================= -- Returns the list of active primitive codes for a category path -- (includes meta primitives + enabled primitives from config) CREATE OR REPLACE FUNCTION pipeline.get_active_primitives( p_path ltree ) RETURNS TEXT[] AS $$ DECLARE resolved_config JSONB; active_codes TEXT[]; meta_codes TEXT[]; config_enabled TEXT[]; config_disabled TEXT[]; BEGIN -- Get resolved config resolved_config := pipeline.resolve_primitive_config(p_path); -- Get all meta primitives (always active) SELECT array_agg(code) INTO meta_codes FROM pipeline.primitives WHERE is_meta = TRUE; -- Get enabled primitives from config IF resolved_config ? 'enabled' THEN SELECT array_agg(elem::text) INTO config_enabled FROM jsonb_array_elements_text(resolved_config->'enabled') AS elem; END IF; -- Get disabled primitives from config IF resolved_config ? 'disabled' THEN SELECT array_agg(elem::text) INTO config_disabled FROM jsonb_array_elements_text(resolved_config->'disabled') AS elem; END IF; -- Combine: meta + enabled, minus disabled active_codes := COALESCE(meta_codes, ARRAY[]::TEXT[]) || COALESCE(config_enabled, ARRAY[]::TEXT[]); -- Remove disabled primitives IF config_disabled IS NOT NULL THEN active_codes := array( SELECT unnest(active_codes) EXCEPT SELECT unnest(config_disabled) ); END IF; -- Remove duplicates active_codes := array(SELECT DISTINCT unnest(active_codes)); RETURN active_codes; END; $$ LANGUAGE plpgsql STABLE; COMMENT ON FUNCTION pipeline.get_active_primitives(ltree) IS 'Returns array of active primitive codes for a category path (meta primitives + enabled - disabled).'; -- ============================================================================= -- SECTION 8: SEED INITIAL PRIMITIVES (Examples) -- ============================================================================= -- Insert some example primitives (can be extended later) INSERT INTO pipeline.primitives (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals) VALUES -- Meta primitives (always active) ('HONESTY', 'trust', 'Honesty & Truthfulness', 'Whether the business is perceived as honest and truthful in their dealings', TRUE, ARRAY['honest', 'truthful', 'transparent', 'upfront', 'no hidden'], ARRAY['lied', 'dishonest', 'deceptive', 'misleading', 'hidden fees', 'bait and switch']), ('ETHICS', 'trust', 'Ethical Behavior', 'Whether the business behaves ethically and with integrity', TRUE, ARRAY['ethical', 'integrity', 'fair', 'principled', 'moral'], ARRAY['unethical', 'scam', 'fraud', 'cheat', 'ripoff', 'shady']), ('SAFETY', 'trust', 'Safety & Security', 'Whether customers feel safe and secure', TRUE, ARRAY['safe', 'secure', 'protected', 'trust'], ARRAY['unsafe', 'dangerous', 'security concern', 'risk', 'hazard']), -- Quality dimension primitives ('FOOD_TASTE', 'quality', 'Food Taste & Flavor', 'Quality and taste of food items', FALSE, ARRAY['delicious', 'tasty', 'flavorful', 'yummy', 'amazing taste', 'perfectly seasoned'], ARRAY['bland', 'tasteless', 'bad taste', 'over-seasoned', 'under-seasoned', 'disgusting']), ('FOOD_FRESHNESS', 'quality', 'Food Freshness', 'Freshness of ingredients and food items', FALSE, ARRAY['fresh', 'crisp', 'just made', 'homemade', 'organic'], ARRAY['stale', 'old', 'not fresh', 'frozen', 'reheated', 'expired']), ('FOOD_PORTION', 'quality', 'Portion Size', 'Size and quantity of food servings', FALSE, ARRAY['generous portions', 'large serving', 'filling', 'plenty of food'], ARRAY['small portions', 'tiny', 'not enough', 'skimpy', 'overpriced for size']), ('PRODUCT_QUALITY', 'quality', 'Product Quality', 'Overall quality of products', FALSE, ARRAY['high quality', 'well made', 'premium', 'durable', 'excellent quality'], ARRAY['poor quality', 'cheap', 'flimsy', 'broke easily', 'defective']), -- Service dimension primitives ('SERVICE_SPEED', 'service', 'Service Speed', 'Speed and timeliness of service', FALSE, ARRAY['fast', 'quick', 'prompt', 'efficient', 'no wait'], ARRAY['slow', 'long wait', 'took forever', 'delayed', 'waited too long']), ('SERVICE_FRIENDLINESS', 'service', 'Staff Friendliness', 'Friendliness and warmth of staff', FALSE, ARRAY['friendly', 'welcoming', 'warm', 'nice', 'pleasant', 'smiled'], ARRAY['rude', 'unfriendly', 'cold', 'dismissive', 'attitude', 'ignored']), ('SERVICE_KNOWLEDGE', 'service', 'Staff Knowledge', 'Knowledge and expertise of staff', FALSE, ARRAY['knowledgeable', 'expert', 'helpful advice', 'knew their stuff', 'professional'], ARRAY['clueless', 'didnt know', 'unhelpful', 'inexperienced', 'untrained']), ('SERVICE_ATTENTIVENESS', 'service', 'Staff Attentiveness', 'How attentive staff are to customer needs', FALSE, ARRAY['attentive', 'checked on us', 'responsive', 'available', 'proactive'], ARRAY['inattentive', 'ignored', 'couldnt find anyone', 'had to flag down', 'neglected']), -- Environment dimension primitives ('ENV_CLEANLINESS', 'environment', 'Cleanliness', 'Cleanliness of the establishment', FALSE, ARRAY['clean', 'spotless', 'tidy', 'well-maintained', 'hygienic'], ARRAY['dirty', 'filthy', 'messy', 'gross', 'sticky', 'unhygienic']), ('ENV_AMBIANCE', 'environment', 'Ambiance & Atmosphere', 'Overall atmosphere and vibe', FALSE, ARRAY['great atmosphere', 'nice ambiance', 'cozy', 'relaxing', 'beautiful decor'], ARRAY['bad atmosphere', 'uncomfortable', 'loud', 'cramped', 'depressing']), ('ENV_PARKING', 'environment', 'Parking Availability', 'Availability and convenience of parking', FALSE, ARRAY['easy parking', 'plenty of parking', 'free parking', 'valet available'], ARRAY['no parking', 'hard to park', 'paid parking', 'had to park far']), -- Value dimension primitives ('VALUE_PRICE', 'value', 'Price Level', 'Perception of price levels', FALSE, ARRAY['affordable', 'reasonable prices', 'cheap', 'good deal', 'budget-friendly'], ARRAY['expensive', 'overpriced', 'pricey', 'not worth the price', 'too costly']), ('VALUE_WORTH', 'value', 'Value for Money', 'Whether the experience is worth the cost', FALSE, ARRAY['worth it', 'great value', 'bang for buck', 'money well spent'], ARRAY['not worth it', 'waste of money', 'rip off', 'should be cheaper']), -- Process dimension primitives ('PROCESS_BOOKING', 'process', 'Booking & Reservations', 'Ease of making reservations or appointments', FALSE, ARRAY['easy to book', 'simple reservation', 'available appointments', 'online booking'], ARRAY['hard to book', 'no availability', 'complicated booking', 'had to call multiple times']), ('PROCESS_WAIT', 'process', 'Wait Times', 'Time spent waiting for service', FALSE, ARRAY['no wait', 'seated immediately', 'quick turnaround'], ARRAY['long wait', 'waited forever', 'always busy', 'need to wait in line']), -- Resolution dimension primitives ('RESOLUTION_RESPONSE', 'resolution', 'Problem Response', 'How problems and complaints are handled', FALSE, ARRAY['fixed the issue', 'made it right', 'apologized', 'took responsibility'], ARRAY['ignored complaint', 'didnt care', 'blamed me', 'no resolution', 'refused to help']), -- Loyalty dimension primitives ('LOYALTY_RETURN', 'loyalty', 'Return Intent', 'Whether customers intend to return', FALSE, ARRAY['will be back', 'coming back', 'regular customer', 'my go-to place'], ARRAY['never again', 'wont return', 'last time', 'not coming back']), ('LOYALTY_RECOMMEND', 'loyalty', 'Recommendation Intent', 'Whether customers would recommend to others', FALSE, ARRAY['highly recommend', 'tell everyone', 'bring friends', 'must try'], ARRAY['dont recommend', 'avoid', 'stay away', 'warned friends']), -- Escape dimension primitives (when customers leave early or avoid) ('ESCAPE_LEFT', 'escape', 'Early Departure', 'Whether customers left early or walked out', FALSE, ARRAY[]::TEXT[], -- No positive signals for escape ARRAY['walked out', 'left early', 'didnt finish', 'had to leave', 'couldnt stay']) ON CONFLICT (code) DO UPDATE SET dimension = EXCLUDED.dimension, name = EXCLUDED.name, definition = EXCLUDED.definition, is_meta = EXCLUDED.is_meta, base_positive_signals = EXCLUDED.base_positive_signals, base_negative_signals = EXCLUDED.base_negative_signals; -- ============================================================================= -- SECTION 9: EXAMPLE CATEGORY CONFIGS -- ============================================================================= -- Example: Set primitive config for Food & Dining sector (level 1) -- This would enable food-related primitives for all food businesses UPDATE public.gbp_categories SET primitive_configs = '{ "enabled": ["FOOD_TASTE", "FOOD_FRESHNESS", "FOOD_PORTION", "SERVICE_SPEED", "SERVICE_FRIENDLINESS", "ENV_CLEANLINESS", "ENV_AMBIANCE", "VALUE_PRICE", "VALUE_WORTH", "PROCESS_WAIT"], "weights": { "FOOD_TASTE": 1.5, "FOOD_FRESHNESS": 1.3, "SERVICE_SPEED": 1.2 } }'::jsonb, business_context = '{ "terminology": { "staff": ["server", "waiter", "waitress", "host", "hostess", "bartender"], "product": ["food", "dish", "meal", "appetizer", "entree", "dessert", "drink"] }, "industry": "food_service" }'::jsonb, config_version = 'v1.0.0', config_generated_by = 'migration_014', config_updated_at = NOW() WHERE slug = 'food_dining' AND level = 1; -- Example: Override config for Restaurants (level 2) - adds more specific settings UPDATE public.gbp_categories SET primitive_configs = '{ "enabled": ["PROCESS_BOOKING", "ENV_PARKING"], "weights": { "PROCESS_WAIT": 1.3 }, "signals": { "FOOD_TASTE": { "positive": ["perfectly cooked", "chef special", "signature dish"], "negative": ["undercooked", "overcooked", "cold food"] } } }'::jsonb, business_context = '{ "terminology": { "staff": ["chef", "cook", "sous chef", "kitchen staff"] }, "typical_visit_duration": "1-2 hours", "reservation_common": true }'::jsonb, config_version = 'v1.0.0', config_generated_by = 'migration_014', config_updated_at = NOW() WHERE slug = 'restaurants' AND level = 2; -- ============================================================================= -- VERIFICATION QUERIES (can be removed in production) -- ============================================================================= -- Verify primitives table DO $$ BEGIN RAISE NOTICE 'Primitives table created with % rows', (SELECT COUNT(*) FROM pipeline.primitives); END $$; -- Verify functions exist DO $$ BEGIN -- Test jsonb_deep_merge ASSERT pipeline.jsonb_deep_merge('{"a": 1}'::jsonb, '{"b": 2}'::jsonb) = '{"a": 1, "b": 2}'::jsonb, 'jsonb_deep_merge basic test failed'; -- Test __replace__ flag ASSERT pipeline.jsonb_deep_merge('{"a": {"x": 1, "y": 2}}'::jsonb, '{"a": {"__replace__": true, "z": 3}}'::jsonb) = '{"a": {"z": 3}}'::jsonb, 'jsonb_deep_merge __replace__ test failed'; RAISE NOTICE 'All function tests passed'; END $$;