feat: Add ScraperV1Adapter and real data pipeline test

- Add ScraperV1Adapter to transform scraped reviews into pipeline format
  - Handles relative timestamps (centerDate)
  - Generates deterministic IDs for DOM-sourced reviews
  - Filters out empty (rating-only) reviews

- Add sample barbershop reviews (79 reviews, 46 with text)
  - Real data from Las Palmas barbershop
  - Multi-language: Spanish, English, German, Norwegian, Italian

- Add test_pipeline_real_data.py for E2E testing with real data
  - Uses mock classifier based on keywords and rating
  - Full pipeline flow: raw -> enriched -> spans -> issues -> facts

Test results with real data:
- 46 reviews processed
- 6 languages detected (es: 35, en: 7, de: 1, no: 1, it: 1, ca: 1)
- 3 issues identified from negative reviews
- 29 fact records aggregated across date range 2017-2025

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 18:35:09 +00:00
parent 3e57c887e9
commit e2d7f6f118
4 changed files with 1733 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
"""Ingestion adapters for various review data formats."""
from reviewiq_pipeline.adapters.scraper_v1 import ScraperV1Adapter
__all__ = ["ScraperV1Adapter"]

View File

@@ -0,0 +1,242 @@
"""
Adapter for Scraper V1 output format.
This adapter transforms the raw scraped review format into the pipeline's
expected RawReview format for Stage 1 processing.
Input format (from scraper):
{
"text": "Review text...",
"author": "Author Name",
"rating": 5,
"source": "api" | "dom",
"review_id": "ABC123...", # Optional for DOM-sourced
"timestamp": "2 months ago",
"minDate": "2025-10-27T18:31:09.843Z",
"maxDate": "2025-11-25T18:31:09.843Z",
"centerDate": "2025-11-11T06:31:09.843Z"
}
Output format (for pipeline):
{
"review_id": "ABC123...", # or "DOM-<16 hex chars>" when the scraper gave no ID
"text": "Review text...",
"rating": 5,
"author_name": "Author Name",
"author_id": None,
"review_time": "2025-11-11T06:31:09.843000+00:00", # centerDate re-serialized via datetime.isoformat()
"relative_time": "2 months ago",
"raw_payload": {...} # Original data
}
"""
from __future__ import annotations
import hashlib
import logging
from datetime import datetime, timezone
from typing import Any
from uuid import uuid4
logger = logging.getLogger(__name__)
class ScraperV1Adapter:
"""
Adapter to transform Scraper V1 output into pipeline-compatible format.
The scraper produces reviews with relative timestamps ("2 months ago") and
estimated date ranges. This adapter normalizes them into absolute timestamps
using the centerDate field.
"""
def __init__(
self,
business_id: str,
place_id: str,
source: str = "google",
):
"""
Initialize the adapter.
Args:
business_id: Business identifier for the reviews
place_id: Google Maps place ID
source: Review source platform (default: "google")
"""
self.business_id = business_id
self.place_id = place_id
self.source = source
self._seen_ids: set[str] = set()
def transform(self, scraped_reviews: list[dict[str, Any]]) -> list[dict[str, Any]]:
"""
Transform a list of scraped reviews into pipeline format.
Args:
scraped_reviews: List of reviews from scraper output
Returns:
List of reviews in pipeline RawReview format
"""
transformed = []
skipped_empty = 0
skipped_duplicate = 0
for review in scraped_reviews:
result = self.transform_single(review)
if result is None:
skipped_empty += 1
continue
if result["review_id"] in self._seen_ids:
skipped_duplicate += 1
continue
self._seen_ids.add(result["review_id"])
transformed.append(result)
logger.info(
f"Transformed {len(transformed)} reviews "
f"(skipped {skipped_empty} empty, {skipped_duplicate} duplicates)"
)
return transformed
def transform_single(self, review: dict[str, Any]) -> dict[str, Any] | None:
"""
Transform a single scraped review into pipeline format.
Args:
review: Single review from scraper output
Returns:
Review in pipeline RawReview format, or None if should be skipped
"""
text = (review.get("text") or "").strip()
# Skip empty reviews (rating-only)
if not text:
return None
# Generate review_id if missing (DOM-sourced reviews)
review_id = review.get("review_id")
if not review_id:
# Generate deterministic ID from content
content_hash = hashlib.sha256(
f"{review.get('author', '')}:{text}:{review.get('rating', 0)}".encode()
).hexdigest()[:16]
review_id = f"DOM-{content_hash}"
# Parse review time from centerDate (best estimate)
review_time = self._parse_review_time(review)
return {
"review_id": review_id,
"text": text,
"rating": review.get("rating", 5),
"author_name": review.get("author", "Anonymous"),
"author_id": None, # Not available in scraper output
"review_time": review_time,
"relative_time": review.get("timestamp", ""),
"raw_payload": review, # Preserve original data
}
def _parse_review_time(self, review: dict[str, Any]) -> str:
"""
Parse the review time from available date fields.
Priority:
1. centerDate (best estimate from relative time)
2. minDate (earliest possible date)
3. Current time (fallback)
Args:
review: Review data with date fields
Returns:
ISO 8601 timestamp string
"""
# Try centerDate first (most accurate estimate)
center_date = review.get("centerDate")
if center_date:
try:
# Parse and re-format to ensure consistency
dt = datetime.fromisoformat(center_date.replace("Z", "+00:00"))
return dt.isoformat()
except (ValueError, TypeError):
pass
# Try minDate as fallback
min_date = review.get("minDate")
if min_date:
try:
dt = datetime.fromisoformat(min_date.replace("Z", "+00:00"))
return dt.isoformat()
except (ValueError, TypeError):
pass
# Final fallback: current time
return datetime.now(timezone.utc).isoformat()
def to_scraper_output(
self,
scraped_reviews: list[dict[str, Any]],
job_id: str | None = None,
) -> dict[str, Any]:
"""
Create a full scraper output envelope for the pipeline.
This creates the complete structure expected by Pipeline.process().
Args:
scraped_reviews: List of reviews from scraper
job_id: Optional job ID (generates UUID if not provided)
Returns:
Complete scraper output dict for pipeline ingestion
"""
if job_id is None:
job_id = str(uuid4())
transformed = self.transform(scraped_reviews)
return {
"job_id": job_id,
"status": "completed",
"business_id": self.business_id,
"place_id": self.place_id,
"business_info": {
"name": self.business_id,
"place_id": self.place_id,
},
"reviews": transformed,
"scrape_time_ms": 0,
"reviews_scraped": len(transformed),
"scraper_version": "v1.0.0",
}
def load_and_transform(
    file_path: str,
    business_id: str,
    place_id: str,
) -> dict[str, Any]:
    """
    Convenience function to load a JSON file and transform it.

    The file is read as UTF-8 regardless of the platform's default encoding,
    so non-ASCII review text is decoded the same way everywhere.

    Args:
        file_path: Path to JSON file with scraped reviews
        business_id: Business identifier
        place_id: Google Maps place ID

    Returns:
        Complete scraper output dict for pipeline ingestion
    """
    import json
    from pathlib import Path

    # Explicit encoding: the locale default is not UTF-8 on every platform.
    data = json.loads(Path(file_path).read_text(encoding="utf-8"))
    adapter = ScraperV1Adapter(business_id, place_id)
    return adapter.to_scraper_output(data)