Files
whyrating-engine-legacy/migrations/versions/014_primitive_classification_system.sql
2026-02-02 18:19:00 +00:00

562 lines
24 KiB
PL/PgSQL

-- =============================================================================
-- Migration: 014_primitive_classification_system.sql
-- Purpose: Create primitive classification system for context-aware review analysis
-- =============================================================================
--
-- This migration introduces a "primitive" classification system that allows
-- industry-specific and category-specific configuration of what aspects to
-- look for when classifying reviews.
--
-- Components:
-- 1. pipeline.primitives - Frozen dictionary of primitives (quality dimensions)
-- 2. ALTER public.gbp_categories - Add primitive_configs and business_context
-- 3. pipeline.jsonb_deep_merge() - Recursive JSONB merge function
-- 4. pipeline.resolve_primitive_config() - Resolve configs through category tree
-- 5. pipeline.get_classification_context() - Get full classification context
--
-- Date: 2026-01-31
-- =============================================================================
-- =============================================================================
-- SECTION 1: PRIMITIVES TABLE (Frozen Dictionary)
-- =============================================================================
CREATE TABLE IF NOT EXISTS pipeline.primitives (
code VARCHAR(30) PRIMARY KEY,
dimension VARCHAR(20) NOT NULL, -- quality, service, process, environment, value, trust, resolution, loyalty, escape
name VARCHAR(100) NOT NULL,
definition TEXT NOT NULL,
is_meta BOOLEAN DEFAULT FALSE, -- true for always-active primitives (HONESTY, ETHICS, etc.)
base_positive_signals TEXT[],
base_negative_signals TEXT[],
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);
-- Index on dimension for filtering
CREATE INDEX IF NOT EXISTS idx_primitives_dimension ON pipeline.primitives(dimension);
-- Index on is_meta for quick access to always-active primitives
CREATE INDEX IF NOT EXISTS idx_primitives_is_meta ON pipeline.primitives(is_meta) WHERE is_meta = TRUE;
COMMENT ON TABLE pipeline.primitives IS 'Frozen dictionary of classification primitives (quality dimensions)';
COMMENT ON COLUMN pipeline.primitives.code IS 'Unique identifier for the primitive (e.g., FOOD_TASTE, SERVICE_SPEED)';
COMMENT ON COLUMN pipeline.primitives.dimension IS 'Category of the primitive (quality, service, process, etc.)';
COMMENT ON COLUMN pipeline.primitives.is_meta IS 'If true, this primitive is always active regardless of category config';
COMMENT ON COLUMN pipeline.primitives.base_positive_signals IS 'Default positive signal keywords for this primitive';
COMMENT ON COLUMN pipeline.primitives.base_negative_signals IS 'Default negative signal keywords for this primitive';
-- =============================================================================
-- SECTION 2: ALTER gbp_categories TABLE
-- =============================================================================
-- Add primitive_configs column (JSONB for flexible config)
ALTER TABLE public.gbp_categories
ADD COLUMN IF NOT EXISTS primitive_configs JSONB DEFAULT '{}';
-- Add business_context column (JSONB for industry-specific context)
ALTER TABLE public.gbp_categories
ADD COLUMN IF NOT EXISTS business_context JSONB DEFAULT '{}';
-- Add config versioning columns
ALTER TABLE public.gbp_categories
ADD COLUMN IF NOT EXISTS config_version VARCHAR(20);
ALTER TABLE public.gbp_categories
ADD COLUMN IF NOT EXISTS config_generated_by VARCHAR(100);
ALTER TABLE public.gbp_categories
ADD COLUMN IF NOT EXISTS config_updated_at TIMESTAMP WITH TIME ZONE;
-- GIN indexes for JSONB containment queries
CREATE INDEX IF NOT EXISTS idx_gbp_categories_primitive_configs
ON public.gbp_categories USING GIN (primitive_configs);
CREATE INDEX IF NOT EXISTS idx_gbp_categories_business_context
ON public.gbp_categories USING GIN (business_context);
-- Index for config version lookups
CREATE INDEX IF NOT EXISTS idx_gbp_categories_config_version
ON public.gbp_categories(config_version) WHERE config_version IS NOT NULL;
COMMENT ON COLUMN public.gbp_categories.primitive_configs IS 'JSONB config for primitives at this category level (inherits from ancestors)';
COMMENT ON COLUMN public.gbp_categories.business_context IS 'JSONB business context for this category (industry-specific terminology, etc.)';
COMMENT ON COLUMN public.gbp_categories.config_version IS 'Version of the primitive config (for cache invalidation)';
COMMENT ON COLUMN public.gbp_categories.config_generated_by IS 'Tool/model that generated this config';
COMMENT ON COLUMN public.gbp_categories.config_updated_at IS 'When the config was last updated';
-- =============================================================================
-- SECTION 3: JSONB DEEP MERGE FUNCTION
-- =============================================================================
-- Recursive JSONB merge function
-- - Objects: recursively merge (override wins on conflicts)
-- - Arrays: union with dedup by default
-- - If object has "__replace__": true, replace entirely instead of merge
CREATE OR REPLACE FUNCTION pipeline.jsonb_deep_merge(
base JSONB,
override JSONB
) RETURNS JSONB AS $$
DECLARE
result JSONB;
key TEXT;
base_value JSONB;
override_value JSONB;
merged_array JSONB;
BEGIN
-- Handle NULL cases
IF base IS NULL THEN
RETURN override;
END IF;
IF override IS NULL THEN
RETURN base;
END IF;
-- If override has __replace__ flag, return override without the flag
IF jsonb_typeof(override) = 'object' AND override ? '__replace__' AND (override->>'__replace__')::boolean = true THEN
RETURN override - '__replace__';
END IF;
-- If both are not objects, override wins
IF jsonb_typeof(base) != 'object' OR jsonb_typeof(override) != 'object' THEN
RETURN override;
END IF;
-- Both are objects, merge recursively
result := base;
FOR key IN SELECT jsonb_object_keys(override)
LOOP
override_value := override->key;
IF NOT (base ? key) THEN
-- Key doesn't exist in base, just add it
result := result || jsonb_build_object(key, override_value);
ELSE
base_value := base->key;
-- Check for __replace__ flag in the override value
IF jsonb_typeof(override_value) = 'object'
AND override_value ? '__replace__'
AND (override_value->>'__replace__')::boolean = true THEN
-- Replace entirely (without the __replace__ flag)
result := result || jsonb_build_object(key, override_value - '__replace__');
-- If both are objects, recurse
ELSIF jsonb_typeof(base_value) = 'object' AND jsonb_typeof(override_value) = 'object' THEN
result := result || jsonb_build_object(
key,
pipeline.jsonb_deep_merge(base_value, override_value)
);
-- If both are arrays, union with dedup
ELSIF jsonb_typeof(base_value) = 'array' AND jsonb_typeof(override_value) = 'array' THEN
-- Union arrays, remove duplicates
-- Using a subquery to deduplicate
SELECT jsonb_agg(DISTINCT elem)
INTO merged_array
FROM (
SELECT jsonb_array_elements(base_value) AS elem
UNION
SELECT jsonb_array_elements(override_value) AS elem
) AS combined;
result := result || jsonb_build_object(key, COALESCE(merged_array, '[]'::jsonb));
-- Otherwise, override wins
ELSE
result := result || jsonb_build_object(key, override_value);
END IF;
END IF;
END LOOP;
RETURN result;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
COMMENT ON FUNCTION pipeline.jsonb_deep_merge(JSONB, JSONB) IS
'Recursively merges two JSONB objects. Objects are merged recursively (override wins on conflicts).
Arrays are unioned with dedup. Use {"__replace__": true, ...} to replace instead of merge.';
-- =============================================================================
-- SECTION 4: RESOLVE PRIMITIVE CONFIG FUNCTION
-- =============================================================================
-- Resolves primitive config by merging ancestor configs (general -> specific)
CREATE OR REPLACE FUNCTION pipeline.resolve_primitive_config(
p_path ltree
) RETURNS JSONB AS $$
DECLARE
result JSONB := '{}';
row_config JSONB;
BEGIN
-- Fetch all ancestor nodes (including self), ordered by level ASC (general -> specific)
-- Uses @> operator: p_path is a descendant of (or equal to) the node's path
FOR row_config IN
SELECT primitive_configs
FROM public.gbp_categories
WHERE p_path <@ path -- p_path is descendant of or equal to path
ORDER BY level ASC
LOOP
-- Skip NULL or empty configs
IF row_config IS NOT NULL AND row_config != '{}' THEN
result := pipeline.jsonb_deep_merge(result, row_config);
END IF;
END LOOP;
RETURN result;
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION pipeline.resolve_primitive_config(ltree) IS
'Resolves the full primitive config for a category path by merging all ancestor configs from general to specific.';
-- =============================================================================
-- SECTION 5: RESOLVE BUSINESS CONTEXT FUNCTION
-- =============================================================================
-- Resolves business context by merging ancestor contexts (general -> specific)
CREATE OR REPLACE FUNCTION pipeline.resolve_business_context(
p_path ltree
) RETURNS JSONB AS $$
DECLARE
result JSONB := '{}';
row_context JSONB;
BEGIN
-- Fetch all ancestor nodes (including self), ordered by level ASC (general -> specific)
FOR row_context IN
SELECT business_context
FROM public.gbp_categories
WHERE p_path <@ path
ORDER BY level ASC
LOOP
-- Skip NULL or empty contexts
IF row_context IS NOT NULL AND row_context != '{}' THEN
result := pipeline.jsonb_deep_merge(result, row_context);
END IF;
END LOOP;
RETURN result;
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION pipeline.resolve_business_context(ltree) IS
'Resolves the full business context for a category path by merging all ancestor contexts from general to specific.';
-- =============================================================================
-- SECTION 6: GET CLASSIFICATION CONTEXT FUNCTION
-- =============================================================================
-- Returns complete classification context for a category path
CREATE OR REPLACE FUNCTION pipeline.get_classification_context(
p_path ltree
) RETURNS JSONB AS $$
DECLARE
resolved_primitives JSONB;
resolved_context JSONB;
primitives_dict JSONB;
BEGIN
-- Resolve the primitive config for this path
resolved_primitives := pipeline.resolve_primitive_config(p_path);
-- Resolve the business context for this path
resolved_context := pipeline.resolve_business_context(p_path);
-- Build the primitives dictionary from the primitives table
SELECT jsonb_object_agg(
code,
jsonb_build_object(
'code', code,
'dimension', dimension,
'name', name,
'definition', definition,
'is_meta', is_meta,
'base_positive_signals', COALESCE(to_jsonb(base_positive_signals), '[]'::jsonb),
'base_negative_signals', COALESCE(to_jsonb(base_negative_signals), '[]'::jsonb)
)
)
INTO primitives_dict
FROM pipeline.primitives;
-- Handle case where primitives table is empty
IF primitives_dict IS NULL THEN
primitives_dict := '{}'::jsonb;
END IF;
-- Return combined context object
RETURN jsonb_build_object(
'primitive_configs', resolved_primitives,
'business_context', resolved_context,
'primitives_dictionary', primitives_dict,
'category_path', p_path::text,
'resolved_at', NOW()
);
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION pipeline.get_classification_context(ltree) IS
'Returns complete classification context for a category path, including resolved primitive configs,
business context, and the full primitives dictionary.';
-- =============================================================================
-- SECTION 7: HELPER FUNCTION - GET ACTIVE PRIMITIVES
-- =============================================================================
-- Returns the list of active primitive codes for a category path
-- (includes meta primitives + enabled primitives from config)
CREATE OR REPLACE FUNCTION pipeline.get_active_primitives(
p_path ltree
) RETURNS TEXT[] AS $$
DECLARE
resolved_config JSONB;
active_codes TEXT[];
meta_codes TEXT[];
config_enabled TEXT[];
config_disabled TEXT[];
BEGIN
-- Get resolved config
resolved_config := pipeline.resolve_primitive_config(p_path);
-- Get all meta primitives (always active)
SELECT array_agg(code)
INTO meta_codes
FROM pipeline.primitives
WHERE is_meta = TRUE;
-- Get enabled primitives from config
IF resolved_config ? 'enabled' THEN
SELECT array_agg(elem::text)
INTO config_enabled
FROM jsonb_array_elements_text(resolved_config->'enabled') AS elem;
END IF;
-- Get disabled primitives from config
IF resolved_config ? 'disabled' THEN
SELECT array_agg(elem::text)
INTO config_disabled
FROM jsonb_array_elements_text(resolved_config->'disabled') AS elem;
END IF;
-- Combine: meta + enabled, minus disabled
active_codes := COALESCE(meta_codes, ARRAY[]::TEXT[]) || COALESCE(config_enabled, ARRAY[]::TEXT[]);
-- Remove disabled primitives
IF config_disabled IS NOT NULL THEN
active_codes := array(
SELECT unnest(active_codes)
EXCEPT
SELECT unnest(config_disabled)
);
END IF;
-- Remove duplicates
active_codes := array(SELECT DISTINCT unnest(active_codes));
RETURN active_codes;
END;
$$ LANGUAGE plpgsql STABLE;
COMMENT ON FUNCTION pipeline.get_active_primitives(ltree) IS
'Returns array of active primitive codes for a category path (meta primitives + enabled - disabled).';
-- =============================================================================
-- SECTION 8: SEED INITIAL PRIMITIVES (Examples)
-- =============================================================================
-- Insert some example primitives (can be extended later)
INSERT INTO pipeline.primitives (code, dimension, name, definition, is_meta, base_positive_signals, base_negative_signals)
VALUES
-- Meta primitives (always active)
('HONESTY', 'trust', 'Honesty & Truthfulness', 'Whether the business is perceived as honest and truthful in their dealings', TRUE,
ARRAY['honest', 'truthful', 'transparent', 'upfront', 'no hidden'],
ARRAY['lied', 'dishonest', 'deceptive', 'misleading', 'hidden fees', 'bait and switch']),
('ETHICS', 'trust', 'Ethical Behavior', 'Whether the business behaves ethically and with integrity', TRUE,
ARRAY['ethical', 'integrity', 'fair', 'principled', 'moral'],
ARRAY['unethical', 'scam', 'fraud', 'cheat', 'ripoff', 'shady']),
('SAFETY', 'trust', 'Safety & Security', 'Whether customers feel safe and secure', TRUE,
ARRAY['safe', 'secure', 'protected', 'trust'],
ARRAY['unsafe', 'dangerous', 'security concern', 'risk', 'hazard']),
-- Quality dimension primitives
('FOOD_TASTE', 'quality', 'Food Taste & Flavor', 'Quality and taste of food items', FALSE,
ARRAY['delicious', 'tasty', 'flavorful', 'yummy', 'amazing taste', 'perfectly seasoned'],
ARRAY['bland', 'tasteless', 'bad taste', 'over-seasoned', 'under-seasoned', 'disgusting']),
('FOOD_FRESHNESS', 'quality', 'Food Freshness', 'Freshness of ingredients and food items', FALSE,
ARRAY['fresh', 'crisp', 'just made', 'homemade', 'organic'],
ARRAY['stale', 'old', 'not fresh', 'frozen', 'reheated', 'expired']),
('FOOD_PORTION', 'quality', 'Portion Size', 'Size and quantity of food servings', FALSE,
ARRAY['generous portions', 'large serving', 'filling', 'plenty of food'],
ARRAY['small portions', 'tiny', 'not enough', 'skimpy', 'overpriced for size']),
('PRODUCT_QUALITY', 'quality', 'Product Quality', 'Overall quality of products', FALSE,
ARRAY['high quality', 'well made', 'premium', 'durable', 'excellent quality'],
ARRAY['poor quality', 'cheap', 'flimsy', 'broke easily', 'defective']),
-- Service dimension primitives
('SERVICE_SPEED', 'service', 'Service Speed', 'Speed and timeliness of service', FALSE,
ARRAY['fast', 'quick', 'prompt', 'efficient', 'no wait'],
ARRAY['slow', 'long wait', 'took forever', 'delayed', 'waited too long']),
('SERVICE_FRIENDLINESS', 'service', 'Staff Friendliness', 'Friendliness and warmth of staff', FALSE,
ARRAY['friendly', 'welcoming', 'warm', 'nice', 'pleasant', 'smiled'],
ARRAY['rude', 'unfriendly', 'cold', 'dismissive', 'attitude', 'ignored']),
('SERVICE_KNOWLEDGE', 'service', 'Staff Knowledge', 'Knowledge and expertise of staff', FALSE,
ARRAY['knowledgeable', 'expert', 'helpful advice', 'knew their stuff', 'professional'],
ARRAY['clueless', 'didnt know', 'unhelpful', 'inexperienced', 'untrained']),
('SERVICE_ATTENTIVENESS', 'service', 'Staff Attentiveness', 'How attentive staff are to customer needs', FALSE,
ARRAY['attentive', 'checked on us', 'responsive', 'available', 'proactive'],
ARRAY['inattentive', 'ignored', 'couldnt find anyone', 'had to flag down', 'neglected']),
-- Environment dimension primitives
('ENV_CLEANLINESS', 'environment', 'Cleanliness', 'Cleanliness of the establishment', FALSE,
ARRAY['clean', 'spotless', 'tidy', 'well-maintained', 'hygienic'],
ARRAY['dirty', 'filthy', 'messy', 'gross', 'sticky', 'unhygienic']),
('ENV_AMBIANCE', 'environment', 'Ambiance & Atmosphere', 'Overall atmosphere and vibe', FALSE,
ARRAY['great atmosphere', 'nice ambiance', 'cozy', 'relaxing', 'beautiful decor'],
ARRAY['bad atmosphere', 'uncomfortable', 'loud', 'cramped', 'depressing']),
('ENV_PARKING', 'environment', 'Parking Availability', 'Availability and convenience of parking', FALSE,
ARRAY['easy parking', 'plenty of parking', 'free parking', 'valet available'],
ARRAY['no parking', 'hard to park', 'paid parking', 'had to park far']),
-- Value dimension primitives
('VALUE_PRICE', 'value', 'Price Level', 'Perception of price levels', FALSE,
ARRAY['affordable', 'reasonable prices', 'cheap', 'good deal', 'budget-friendly'],
ARRAY['expensive', 'overpriced', 'pricey', 'not worth the price', 'too costly']),
('VALUE_WORTH', 'value', 'Value for Money', 'Whether the experience is worth the cost', FALSE,
ARRAY['worth it', 'great value', 'bang for buck', 'money well spent'],
ARRAY['not worth it', 'waste of money', 'rip off', 'should be cheaper']),
-- Process dimension primitives
('PROCESS_BOOKING', 'process', 'Booking & Reservations', 'Ease of making reservations or appointments', FALSE,
ARRAY['easy to book', 'simple reservation', 'available appointments', 'online booking'],
ARRAY['hard to book', 'no availability', 'complicated booking', 'had to call multiple times']),
('PROCESS_WAIT', 'process', 'Wait Times', 'Time spent waiting for service', FALSE,
ARRAY['no wait', 'seated immediately', 'quick turnaround'],
ARRAY['long wait', 'waited forever', 'always busy', 'need to wait in line']),
-- Resolution dimension primitives
('RESOLUTION_RESPONSE', 'resolution', 'Problem Response', 'How problems and complaints are handled', FALSE,
ARRAY['fixed the issue', 'made it right', 'apologized', 'took responsibility'],
ARRAY['ignored complaint', 'didnt care', 'blamed me', 'no resolution', 'refused to help']),
-- Loyalty dimension primitives
('LOYALTY_RETURN', 'loyalty', 'Return Intent', 'Whether customers intend to return', FALSE,
ARRAY['will be back', 'coming back', 'regular customer', 'my go-to place'],
ARRAY['never again', 'wont return', 'last time', 'not coming back']),
('LOYALTY_RECOMMEND', 'loyalty', 'Recommendation Intent', 'Whether customers would recommend to others', FALSE,
ARRAY['highly recommend', 'tell everyone', 'bring friends', 'must try'],
ARRAY['dont recommend', 'avoid', 'stay away', 'warned friends']),
-- Escape dimension primitives (when customers leave early or avoid)
('ESCAPE_LEFT', 'escape', 'Early Departure', 'Whether customers left early or walked out', FALSE,
ARRAY[]::TEXT[], -- No positive signals for escape
ARRAY['walked out', 'left early', 'didnt finish', 'had to leave', 'couldnt stay'])
ON CONFLICT (code) DO UPDATE SET
dimension = EXCLUDED.dimension,
name = EXCLUDED.name,
definition = EXCLUDED.definition,
is_meta = EXCLUDED.is_meta,
base_positive_signals = EXCLUDED.base_positive_signals,
base_negative_signals = EXCLUDED.base_negative_signals;
-- =============================================================================
-- SECTION 9: EXAMPLE CATEGORY CONFIGS
-- =============================================================================
-- Example: Set primitive config for Food & Dining sector (level 1)
-- This would enable food-related primitives for all food businesses
UPDATE public.gbp_categories
SET
primitive_configs = '{
"enabled": ["FOOD_TASTE", "FOOD_FRESHNESS", "FOOD_PORTION", "SERVICE_SPEED", "SERVICE_FRIENDLINESS", "ENV_CLEANLINESS", "ENV_AMBIANCE", "VALUE_PRICE", "VALUE_WORTH", "PROCESS_WAIT"],
"weights": {
"FOOD_TASTE": 1.5,
"FOOD_FRESHNESS": 1.3,
"SERVICE_SPEED": 1.2
}
}'::jsonb,
business_context = '{
"terminology": {
"staff": ["server", "waiter", "waitress", "host", "hostess", "bartender"],
"product": ["food", "dish", "meal", "appetizer", "entree", "dessert", "drink"]
},
"industry": "food_service"
}'::jsonb,
config_version = 'v1.0.0',
config_generated_by = 'migration_014',
config_updated_at = NOW()
WHERE slug = 'food_dining' AND level = 1;
-- Example: Override config for Restaurants (level 2) - adds more specific settings
UPDATE public.gbp_categories
SET
primitive_configs = '{
"enabled": ["PROCESS_BOOKING", "ENV_PARKING"],
"weights": {
"PROCESS_WAIT": 1.3
},
"signals": {
"FOOD_TASTE": {
"positive": ["perfectly cooked", "chef special", "signature dish"],
"negative": ["undercooked", "overcooked", "cold food"]
}
}
}'::jsonb,
business_context = '{
"terminology": {
"staff": ["chef", "cook", "sous chef", "kitchen staff"]
},
"typical_visit_duration": "1-2 hours",
"reservation_common": true
}'::jsonb,
config_version = 'v1.0.0',
config_generated_by = 'migration_014',
config_updated_at = NOW()
WHERE slug = 'restaurants' AND level = 2;
-- =============================================================================
-- VERIFICATION QUERIES (can be removed in production)
-- =============================================================================
-- Verify primitives table
DO $$
BEGIN
RAISE NOTICE 'Primitives table created with % rows', (SELECT COUNT(*) FROM pipeline.primitives);
END $$;
-- Verify functions exist
DO $$
BEGIN
-- Test jsonb_deep_merge
ASSERT pipeline.jsonb_deep_merge('{"a": 1}'::jsonb, '{"b": 2}'::jsonb) = '{"a": 1, "b": 2}'::jsonb,
'jsonb_deep_merge basic test failed';
-- Test __replace__ flag
ASSERT pipeline.jsonb_deep_merge('{"a": {"x": 1, "y": 2}}'::jsonb, '{"a": {"__replace__": true, "z": 3}}'::jsonb) = '{"a": {"z": 3}}'::jsonb,
'jsonb_deep_merge __replace__ test failed';
RAISE NOTICE 'All function tests passed';
END $$;