From 43fd1515d2f70e73703f0cdfd83ffff72ab29fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 16:21:21 +0000 Subject: [PATCH] Align artifacts with canonical URT v5.1 specification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes inconsistencies discovered during audit against urt-taxonomy/: - urt_profile ENUM: Add 'lite' and 'core' profiles (was missing) - USN format: Use canonical regex from spec (was non-compliant) - USN valence encoding: Add V0 (0) and V± (±) support - USN grammar: Add Lite (URT:L:) and Core (URT:C:) formats - Dimension codes: Fix temporal (TC/TR/TH/TF), evidence (ES/EI/EC), comparative (CR-N/CR-B/CR-W/CR-S) in decisions doc - LLM contract: Full USN regex validation pattern Co-Authored-By: Claude Opus 4.5 --- .artifacts/LLM-Classification-Contract-v1.md | 4 +- .artifacts/ReviewIQ-Architecture-v3.2.md | 17 +- .artifacts/ReviewIQ-v32-Decisions.md | 12 +- .artifacts/URT-v5.1-Reference.md | 10 +- web/app/new/google-reviews/page.tsx | 60 ++++ web/app/new/page.tsx | 319 +++++++++++++++++-- web/components/ScraperTest.tsx | 130 +------- 7 files changed, 389 insertions(+), 163 deletions(-) create mode 100644 web/app/new/google-reviews/page.tsx diff --git a/.artifacts/LLM-Classification-Contract-v1.md b/.artifacts/LLM-Classification-Contract-v1.md index 239de46..9f686fb 100644 --- a/.artifacts/LLM-Classification-Contract-v1.md +++ b/.artifacts/LLM-Classification-Contract-v1.md @@ -243,8 +243,8 @@ Return valid JSON matching the schema exactly. No markdown, no explanations. }, "usn": { "type": "string", - "pattern": "^URT:S:[OPJEAVR][1-4]\\.[0-9]{2}", - "description": "URT String Notation for audit" + "pattern": "^URT:S:[OPJEAVR][1-4]\\.[0-9]{2}(\\+[OPJEAVR][1-4]\\.[0-9]{2}){0,2}:[+\\-0±][123]:[1-3][1-3]T[CRHF]\\.E[SIC]\\.[NBWS]$", + "description": "URT String Notation for audit (Standard profile)" } } } diff --git a/.artifacts/ReviewIQ-Architecture-v3.2.md b/.artifacts/ReviewIQ-Architecture-v3.2.md index c470b9c..1df2b0d 100644 --- a/.artifacts/ReviewIQ-Architecture-v3.2.md +++ b/.artifacts/ReviewIQ-Architecture-v3.2.md @@ -140,7 +140,7 @@ CREATE TYPE urt_actionability AS ENUM ('A1', 'A2', 'A3'); CREATE TYPE urt_temporal AS ENUM ('TC', 'TR', 'TH', 'TF'); CREATE TYPE urt_evidence AS ENUM ('ES', 'EI', 'EC'); CREATE TYPE urt_comparative AS ENUM ('CR-N', 'CR-B', 'CR-W', 'CR-S'); -CREATE TYPE urt_profile AS ENUM ('standard', 'full'); +CREATE TYPE urt_profile AS ENUM ('lite', 'core', 'standard', 'full'); CREATE TYPE urt_confidence AS ENUM ('high', 'medium', 'low'); CREATE TYPE urt_relation AS ENUM ('cause_of', 'effect_of', 'contrast', 'resolution'); CREATE TYPE urt_entity_type AS ENUM ('location', 'staff', 'product', 'process', 'time', 'other'); @@ -411,15 +411,20 @@ ALTER TABLE review_spans ADD CONSTRAINT chk_no_self_relation CHECK (related_span_id IS NULL OR related_span_id != span_id); --- USN format validation based on profile --- Standard: V[+-0±]:I[123]:CODE (e.g., "V-:I2:J1.01") --- Full: V[+-0±]:I[123]:CODE:S[123]:A[123]:T[CRHF]:E[SIC] (e.g., "V-:I3:J1.01:S2:A2:TC:ES") +-- USN format validation based on profile (URT v5.1 canonical format) +-- Lite: URT:L:{domain}:{valence}{intensity} +-- Core: URT:C:{category}:{valence}{intensity} +-- Standard: URT:S:{subcode}[+{sec}]:{valence}{intensity}:{S}{A}{T}.{E}.{CR} +-- Full: URT:F:{subcode}[+{sec}]:{valence}{intensity}:{S}{A}{T}.{E}.{CR}[:{causal}] +-- Examples: URT:L:O:+2 | URT:C:J1:-3 | URT:S:J1.03:-2:22TC.ES.N | URT:F:J1.01:-3:23TR.ES.S:CD.O,MG.O ALTER TABLE review_spans ADD CONSTRAINT chk_usn_format CHECK ( usn IS NULL OR - (profile = 'standard' AND usn ~ '^V[+\-0±]:I[123]:[OPJEAVR][1-4]\.[0-9]{2}$') OR - (profile = 'full' AND usn ~ '^V[+\-0±]:I[123]:[OPJEAVR][1-4]\.[0-9]{2}:S[123]:A[123]:T[CRHF]:E[SIC]$') + (profile = 'lite' AND usn ~ '^URT:L:[OPJEAVR]:[+\-0±][123]$') OR + (profile = 'core' AND usn ~ '^URT:C:[OPJEAVR][1-4]:[+\-0±][123]$') OR + (profile = 'standard' AND usn ~ '^URT:S:[OPJEAVR][1-4]\.[0-9]{2}(\+[OPJEAVR][1-4]\.[0-9]{2}){0,2}:[+\-0±][123]:[1-3][1-3]T[CRHF]\.E[SIC]\.[NBWS]$') OR + (profile = 'full' AND usn ~ '^URT:F:[OPJEAVR][1-4]\.[0-9]{2}(\+[OPJEAVR][1-4]\.[0-9]{2}){0,2}:[+\-0±][123]:[1-3][1-3]T[CRHF]\.E[SIC]\.[NBWS](:(CD|MG|SY)\.[STEOFRPCSHX](,(CD|MG|SY)\.[STEOFRPCSHX])*)?$') ); -- Foreign keys for review_spans diff --git a/.artifacts/ReviewIQ-v32-Decisions.md b/.artifacts/ReviewIQ-v32-Decisions.md index 28e0a38..b3ce44e 100644 --- a/.artifacts/ReviewIQ-v32-Decisions.md +++ b/.artifacts/ReviewIQ-v32-Decisions.md @@ -76,15 +76,15 @@ Based on: v3.1.2 (commit f998277) - `urt_actionability` — A1, A2, A3 **Context & Evidence:** -- `urt_temporal` — T1, T2, T3 -- `urt_evidence` — E1, E2, E3 -- `urt_comparative` — CR1, CR2, CR3 +- `urt_temporal` — TC (current), TR (recent), TH (historical), TF (future) +- `urt_evidence` — ES (stated), EI (inferred), EC (contextual) +- `urt_comparative` — CR-N (none), CR-B (better), CR-W (worse), CR-S (same) **Classification:** -- `urt_profile` — factual, emotional, comparative, etc. +- `urt_profile` — lite, core, standard, full - `urt_confidence` — low, medium, high -- `urt_relation` — elaborates, contrasts, causes, etc. -- `urt_entity_type` — person, product, location, etc. +- `urt_relation` — cause_of, effect_of, contrast, resolution +- `urt_entity_type` — location, staff, product, process, time, other --- diff --git a/.artifacts/URT-v5.1-Reference.md b/.artifacts/URT-v5.1-Reference.md index 77138aa..1696279 100644 --- a/.artifacts/URT-v5.1-Reference.md +++ b/.artifacts/URT-v5.1-Reference.md @@ -6,7 +6,7 @@ The Universal Review Taxonomy (URT) is a classification system for customer feed ### Key Characteristics -- **Three Profiles**: Core, Standard, Full (increasing detail) +- **Four Profiles**: Lite, Core, Standard, Full (increasing detail) - **Seven Domains**: Covering all aspects of customer experience - **Tier-3 Canonical Codes**: Format `X#.##` (e.g., J1.02, P2.15) - **Dimensional Annotation**: Valence, intensity, specificity, and more @@ -129,8 +129,10 @@ USN is a compact string encoding for URT annotations. ### Grammar ``` -Standard: URT:S:{codes}:{V}{I}:{S}{A}{T}.{E}.{CR} -Full: URT:F:{codes}:{V}{I}:{S}{A}{T}.{E}.{CR}:{causal} +Lite: URT:L:{domain}:{V}{I} +Core: URT:C:{category}:{V}{I} +Standard: URT:S:{subcode}[+{sec}]:{V}{I}:{S}{A}{T}.{E}.{CR} +Full: URT:F:{subcode}[+{sec}]:{V}{I}:{S}{A}{T}.{E}.{CR}[:{causal}] ``` ### Encoding Rules @@ -138,6 +140,8 @@ Full: URT:F:{codes}:{V}{I}:{S}{A}{T}.{E}.{CR}:{causal} **Valence**: - `+` for V+ - `-` for V- +- `0` for V0 +- `±` for V± **Intensity**: - `1` for I1 diff --git a/web/app/new/google-reviews/page.tsx b/web/app/new/google-reviews/page.tsx new file mode 100644 index 0000000..273b714 --- /dev/null +++ b/web/app/new/google-reviews/page.tsx @@ -0,0 +1,60 @@ +'use client'; + +import { useRouter } from 'next/navigation'; +import { useCallback } from 'react'; +import ScraperTest from '@/components/ScraperTest'; +import { useJobs } from '@/contexts/JobsContext'; +import { JobStatus } from '@/components/ScraperTest'; +import Link from 'next/link'; + +export default function GoogleReviewsScraperPage() { + const router = useRouter(); + const { addJob } = useJobs(); + + const handleJobsChange = useCallback((jobs: JobStatus[]) => { + // Add new jobs to context (addJob handles deduplication) + jobs.forEach(job => addJob(job)); + }, [addJob]); + + const handleSelectReviews = (reviews: unknown[], businessName: string, jobId: string) => { + // Navigate to analytics page for this job + router.push(`/analytics/${jobId}`); + }; + + return ( +
+ {/* Breadcrumb */} +
+ +
+ + {/* Header */} +
+
+
+ + + +
+
+

Google Reviews Scraper

+

Extract reviews from any Google Maps business listing

+
+
+
+ + +
+ ); +} diff --git a/web/app/new/page.tsx b/web/app/new/page.tsx index 63f648d..62e0123 100644 --- a/web/app/new/page.tsx +++ b/web/app/new/page.tsx @@ -1,31 +1,312 @@ 'use client'; -import { useRouter } from 'next/navigation'; -import { useCallback } from 'react'; -import ScraperTest from '@/components/ScraperTest'; -import { useJobs } from '@/contexts/JobsContext'; -import { JobStatus } from '@/components/ScraperTest'; +import { useState, useEffect, useCallback } from 'react'; +import Link from 'next/link'; + +const API_BASE = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000'; + +interface ScraperInfo { + job_type: string; + version: string; + variant: string; + traffic_pct: number; + deprecated_at: string | null; +} + +interface ScraperTypeCard { + job_type: string; + name: string; + description: string; + icon: React.ReactNode; + route: string; + color: string; + available: boolean; + versions: string[]; +} + +// Define scraper type metadata (icons, descriptions, routes) +const SCRAPER_METADATA: Record> = { + 'google-reviews': { + name: 'Google Reviews', + description: 'Extract reviews from Google Maps business listings. Supports any business with a Google Maps presence.', + icon: ( + + + + ), + route: '/new/google-reviews', + color: 'from-blue-500 to-indigo-600', + }, + 'google_reviews': { + name: 'Google Reviews', + description: 'Extract reviews from Google Maps business listings. Supports any business with a Google Maps presence.', + icon: ( + + + + ), + route: '/new/google-reviews', + color: 'from-blue-500 to-indigo-600', + }, + 'yelp-reviews': { + name: 'Yelp Reviews', + description: 'Extract reviews from Yelp business pages. Perfect for restaurants, services, and local businesses.', + icon: ( + + + + ), + route: '/new/yelp-reviews', + color: 'from-red-500 to-rose-600', + }, + 'tripadvisor-reviews': { + name: 'TripAdvisor Reviews', + description: 'Extract reviews from TripAdvisor. Ideal for hotels, restaurants, and tourist attractions.', + icon: ( + + + + ), + route: '/new/tripadvisor-reviews', + color: 'from-green-500 to-emerald-600', + }, +}; + +// Fallback for unknown scraper types +const DEFAULT_METADATA = { + name: 'Unknown Scraper', + description: 'A scraper for extracting reviews.', + icon: ( + + + + ), + route: '/new', + color: 'from-gray-500 to-gray-600', +}; export default function NewScrapePage() { - const router = useRouter(); - const { addJob } = useJobs(); + const [scrapers, setScrapers] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); - const handleJobsChange = useCallback((jobs: JobStatus[]) => { - // Add new jobs to context (addJob handles deduplication) - jobs.forEach(job => addJob(job)); - }, [addJob]); + const fetchScrapers = useCallback(async () => { + try { + const response = await fetch(`${API_BASE}/api/admin/scrapers`); + if (!response.ok) throw new Error('Failed to fetch scrapers'); - const handleSelectReviews = (reviews: unknown[], businessName: string, jobId: string) => { - // Navigate to analytics page for this job - router.push(`/analytics/${jobId}`); - }; + const data: ScraperInfo[] = await response.json(); + + // Group by job_type and collect versions + const scrapersByType = data.reduce((acc, scraper) => { + const key = scraper.job_type; + if (!acc[key]) { + acc[key] = { + job_type: key, + versions: [], + hasActive: false, + }; + } + acc[key].versions.push(`v${scraper.version}${scraper.variant !== 'stable' ? ` (${scraper.variant})` : ''}`); + if (!scraper.deprecated_at && scraper.traffic_pct > 0) { + acc[key].hasActive = true; + } + return acc; + }, {} as Record); + + // Transform to ScraperTypeCard array + const cards: ScraperTypeCard[] = Object.values(scrapersByType).map(({ job_type, versions, hasActive }) => { + const metadata = SCRAPER_METADATA[job_type] || { + ...DEFAULT_METADATA, + name: job_type.split(/[-_]/).map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' '), + route: `/new/${job_type}`, + }; + + return { + job_type, + ...metadata, + available: hasActive, + versions, + }; + }); + + setScrapers(cards); + } catch (err) { + console.error('Failed to fetch scrapers:', err); + setError('Failed to load available scrapers'); + // Fallback to showing Google Reviews as available + setScrapers([{ + job_type: 'google-reviews', + ...SCRAPER_METADATA['google-reviews'], + available: true, + versions: ['v1.0.0'], + }]); + } finally { + setLoading(false); + } + }, []); + + useEffect(() => { + fetchScrapers(); + }, [fetchScrapers]); + + // Coming soon scrapers (not in registry yet) + const comingSoonScrapers: ScraperTypeCard[] = [ + { + job_type: 'yelp-reviews', + ...SCRAPER_METADATA['yelp-reviews'], + available: false, + versions: [], + }, + { + job_type: 'tripadvisor-reviews', + ...SCRAPER_METADATA['tripadvisor-reviews'], + available: false, + versions: [], + }, + ].filter(s => !scrapers.some(existing => existing.job_type === s.job_type)); return (
- +
+ {/* Header */} +
+

New Scrape Job

+

Select a scraper type to start extracting reviews

+
+ + {/* Error State */} + {error && ( +
+
+ + + + {error} +
+
+ )} + + {/* Loading State */} + {loading ? ( +
+ {[1, 2].map(i => ( +
+
+
+
+
+
+
+
+
+
+ ))} +
+ ) : ( + <> + {/* Available Scrapers */} +
+

+ + Available Scrapers +

+
+ {scrapers.filter(s => s.available).map(scraper => ( + +
+
+ {scraper.icon} +
+
+
+

+ {scraper.name} +

+ + Active + +
+

{scraper.description}

+
+ {scraper.versions.slice(0, 2).map(v => ( + + {v} + + ))} + {scraper.versions.length > 2 && ( + +{scraper.versions.length - 2} more + )} +
+
+ + + +
+ + ))} +
+
+ + {/* Coming Soon Scrapers */} + {comingSoonScrapers.length > 0 && ( +
+

+ + Coming Soon +

+
+ {comingSoonScrapers.map(scraper => ( +
+
+
+ {scraper.icon} +
+
+
+

+ {scraper.name} +

+ + Coming Soon + +
+

{scraper.description}

+
+
+
+ ))} +
+
+ )} + + )} + + {/* Help Section */} +
+
+
+ + + +
+
+

Need a different scraper?

+

+ We're constantly adding new scrapers. If you need reviews from a platform not listed here,{' '} + let us know. +

+
+
+
+
); } diff --git a/web/components/ScraperTest.tsx b/web/components/ScraperTest.tsx index dc92dce..e422063 100644 --- a/web/components/ScraperTest.tsx +++ b/web/components/ScraperTest.tsx @@ -1,17 +1,8 @@ 'use client'; -import { useState, useEffect, useRef, useCallback } from 'react'; +import { useState, useEffect, useRef } from 'react'; import ReviewAnalytics from './ReviewAnalytics'; -const API_BASE = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000'; - -interface ScraperType { - job_type: string; - version: string; - variant: string; - label: string; -} - interface Review { author: string; rating: number; @@ -69,10 +60,6 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe const [businessImage, setBusinessImage] = useState(null); const [businessCategory, setBusinessCategory] = useState(null); - // Scraper type selection - const [availableScrapers, setAvailableScrapers] = useState([]); - const [selectedScraper, setSelectedScraper] = useState(null); - const [scrapersLoading, setScrapersLoading] = useState(true); const [userFingerprint, setUserFingerprint] = useState<{ geolocation?: {lat: number, lng: number}, userAgent?: string, @@ -132,48 +119,6 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe collectFingerprint(); }, []); - // Fetch available scraper types on mount - const fetchScrapers = useCallback(async () => { - try { - const response = await fetch(`${API_BASE}/api/admin/scrapers`); - if (response.ok) { - const data = await response.json(); - // Transform to ScraperType format and filter to active scrapers - const scrapers: ScraperType[] = data - .filter((s: { deprecated_at: string | null; traffic_pct: number }) => !s.deprecated_at && s.traffic_pct > 0) - .map((s: { job_type: string; version: string; variant: string }) => ({ - job_type: s.job_type, - version: s.version, - variant: s.variant, - // Format job_type nicely: google_reviews or google-reviews -> "Google Reviews" - label: `${s.job_type.split(/[-_]/).map((w: string) => w.charAt(0).toUpperCase() + w.slice(1)).join(' ')} v${s.version}${s.variant !== 'stable' ? ` (${s.variant})` : ''}`, - })); - setAvailableScrapers(scrapers); - // Auto-select first scraper (usually google-reviews stable) - if (scrapers.length > 0 && !selectedScraper) { - setSelectedScraper(scrapers[0]); - } - } - } catch (err) { - console.error('Failed to fetch scrapers:', err); - // Fallback to default google-reviews - const defaultScraper: ScraperType = { - job_type: 'google-reviews', - version: '1.0.0', - variant: 'stable', - label: 'Google Reviews v1.0.0', - }; - setAvailableScrapers([defaultScraper]); - setSelectedScraper(defaultScraper); - } finally { - setScrapersLoading(false); - } - }, [selectedScraper]); - - useEffect(() => { - fetchScrapers(); - }, [fetchScrapers]); - const pollingIntervals = useRef>(new Map()); const abortControllerRef = useRef(null); @@ -379,11 +324,6 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe const url = `https://www.google.com/maps/search/?api=1&query=${encodeURIComponent(searchedQuery)}&hl=en`; try { - // Use selected scraper or default to google-reviews - const jobType = selectedScraper?.job_type || 'google-reviews'; - const scraperVersion = selectedScraper?.version; - const scraperVariant = selectedScraper?.variant; - const response = await fetch('/api/scrape', { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -395,10 +335,8 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe total_reviews_snapshot: availableReviewCount, geolocation: userFingerprint.geolocation, browser_fingerprint: userFingerprint, // Pass full fingerprint - // Include scraper selection - job_type: jobType, - scraper_version: scraperVersion, - scraper_variant: scraperVariant, + // Google Reviews scraper (this component is specific to Google Reviews) + job_type: 'google-reviews', }), }); @@ -502,68 +440,6 @@ export default function ScraperTest({ onJobsChange, onSelectReviews }: ScraperTe return (
- {/* Scraper Type Selection */} -
-
-
-
- - - -
-
- -

Select the type of data to scrape

-
-
- - {scrapersLoading ? ( -
-
- Loading... -
- ) : ( - - )} -
- - {/* Show selected scraper info */} - {selectedScraper && ( -
- - {selectedScraper.job_type.split(/[-_]/).map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ')} - - - v{selectedScraper.version} - - {selectedScraper.variant !== 'stable' && ( - - {selectedScraper.variant} - - )} -
- )} -
- {/* Test URL Quick Select */}