feat: Add ScraperV1Adapter and real data pipeline test

- Add ScraperV1Adapter to transform scraped reviews into pipeline format
  - Handles relative timestamps (centerDate)
  - Generates deterministic IDs for DOM-sourced reviews
  - Filters out empty (rating-only) reviews
- Add sample barbershop reviews (79 reviews, 46 with text)
  - Real data from Las Palmas barbershop
  - Multi-language: Spanish, English, German, Norwegian, Italian
- Add test_pipeline_real_data.py for E2E testing with real data
  - Uses mock classifier based on keywords and rating
  - Full pipeline flow: raw -> enriched -> spans -> issues -> facts

Test results with real data:
- 46 reviews processed
- 6 languages detected (es: 35, en: 7, de: 1, no: 1, it: 1, ca: 1)
- 3 issues identified from negative reviews
- 29 fact records aggregated across date range 2017-2025

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 18:35:09 +00:00
parent 3e57c887e9
commit e2d7f6f118
4 changed files with 1733 additions and 0 deletions
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/adapters/__init__.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/adapters/__init__.py
@@ -0,0 +1,5 @@
+"""Ingestion adapters for various review data formats."""
+
+from reviewiq_pipeline.adapters.scraper_v1 import ScraperV1Adapter
+
+__all__ = ["ScraperV1Adapter"]  # names exported by a star-import of this package
--- a/packages/reviewiq-pipeline/src/reviewiq_pipeline/adapters/scraper_v1.py
+++ b/packages/reviewiq-pipeline/src/reviewiq_pipeline/adapters/scraper_v1.py
@@ -0,0 +1,242 @@
+"""
+Adapter for Scraper V1 output format.
+
+This adapter transforms the raw scraped review format into the pipeline's
+expected RawReview format for Stage 1 processing.
+
+Input format (from scraper):
+{
+    "text": "Review text...",
+    "author": "Author Name",
+    "rating": 5,
+    "source": "api" | "dom",
+    "review_id": "ABC123...",  # Optional for DOM-sourced
+    "timestamp": "2 months ago",
+    "minDate": "2025-10-27T18:31:09.843Z",
+    "maxDate": "2025-11-25T18:31:09.843Z",
+    "centerDate": "2025-11-11T06:31:09.843Z"
+}
+
+Output format (for pipeline):
+{
+    "review_id": "ABC123...",
+    "text": "Review text...",
+    "rating": 5,
+    "author_name": "Author Name",
+    "author_id": None,
+    "review_time": "2025-11-11T06:31:09.843Z",
+    "relative_time": "2 months ago",
+    "raw_payload": {...}  # Original data
+}
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from datetime import datetime, timezone
+from typing import Any
+from uuid import uuid4
+
+logger = logging.getLogger(__name__)
+
+
+class ScraperV1Adapter:
+    """
+    Adapter to transform Scraper V1 output into pipeline-compatible format.
+
+    The scraper reports relative timestamps ("2 months ago") plus an estimated
+    date range; review_time is taken from centerDate, then minDate. Review IDs
+    seen by transform() are remembered, so duplicates are dropped across calls.
+    """
+
+    def __init__(
+        self,
+        business_id: str,
+        place_id: str,
+        source: str = "google",
+    ) -> None:
+        """
+        Initialize the adapter.
+
+        Args:
+            business_id: Business identifier for the reviews
+            place_id: Google Maps place ID
+            source: Review source platform (default: "google")
+        """
+        self.business_id = business_id
+        self.place_id = place_id
+        self.source = source
+        self._seen_ids: set[str] = set()  # review IDs already emitted
+
+    def transform(self, scraped_reviews: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """
+        Transform a list of scraped reviews into pipeline format.
+
+        Reviews without text are dropped, as are reviews whose ID was already
+        emitted by this adapter instance (in this call or an earlier one).
+
+        Args:
+            scraped_reviews: List of reviews from scraper output
+
+        Returns:
+            List of reviews in pipeline RawReview format
+        """
+        transformed: list[dict[str, Any]] = []
+        skipped_empty = 0
+        skipped_duplicate = 0
+
+        for review in scraped_reviews:
+            result = self.transform_single(review)
+            if result is None:
+                skipped_empty += 1
+            elif result["review_id"] in self._seen_ids:
+                skipped_duplicate += 1
+            else:
+                self._seen_ids.add(result["review_id"])
+                transformed.append(result)
+
+        # Lazy %-formatting: the message is only built if INFO is enabled.
+        logger.info(
+            "Transformed %d reviews (skipped %d empty, %d duplicates)",
+            len(transformed), skipped_empty, skipped_duplicate,
+        )
+        return transformed
+
+    def transform_single(self, review: dict[str, Any]) -> dict[str, Any] | None:
+        """
+        Transform a single scraped review into pipeline format.
+
+        Null or empty author/timestamp values fall back to "Anonymous" / "",
+        the same as when the key is absent.
+
+        Args:
+            review: Single review from scraper output
+
+        Returns:
+            Review in pipeline RawReview format, or None if the review has no
+            text (rating-only) and should be skipped
+        """
+        text = (review.get("text") or "").strip()
+        if not text:
+            return None
+
+        # DOM-sourced reviews may lack an ID: derive a deterministic one from
+        # the content so re-scraping the same review yields the same ID.
+        review_id = review.get("review_id")
+        if not review_id:
+            seed = f"{review.get('author', '')}:{text}:{review.get('rating', 0)}"
+            review_id = f"DOM-{hashlib.sha256(seed.encode()).hexdigest()[:16]}"
+
+        return {
+            "review_id": review_id,
+            "text": text,
+            # NOTE(review): a missing rating is reported as 5 stars; confirm
+            # this default is what downstream stages expect.
+            "rating": review.get("rating", 5),
+            # `or` (not a .get default) so JSON null / "" also fall back.
+            "author_name": review.get("author") or "Anonymous",
+            "author_id": None,  # Not available in scraper output
+            "review_time": self._parse_review_time(review),
+            "relative_time": review.get("timestamp") or "",
+            "raw_payload": review,  # Preserve original data
+        }
+
+    def _parse_review_time(self, review: dict[str, Any]) -> str:
+        """
+        Parse the review time from available date fields.
+
+        Priority:
+        1. centerDate (best estimate from relative time)
+        2. minDate (earliest possible date)
+        3. Current time (fallback)
+
+        Values that are not ISO 8601 strings are ignored, not raised.
+
+        Args:
+            review: Review data with date fields
+
+        Returns:
+            ISO 8601 timestamp string
+        """
+        for key in ("centerDate", "minDate"):
+            value = review.get(key)
+            # Non-string values (e.g. a numeric epoch) cannot be ISO strings;
+            # skip them instead of crashing on .replace().
+            if not value or not isinstance(value, str):
+                continue
+            try:
+                # fromisoformat() on older Pythons rejects a trailing "Z".
+                parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+            except ValueError:
+                continue
+            # Re-format to ensure consistency
+            return parsed.isoformat()
+
+        # Final fallback: current time. Logged because this invents a date.
+        logger.warning(
+            "Review has no parseable centerDate/minDate; using current time"
+        )
+        return datetime.now(timezone.utc).isoformat()
+
+    def to_scraper_output(
+        self,
+        scraped_reviews: list[dict[str, Any]],
+        job_id: str | None = None,
+    ) -> dict[str, Any]:
+        """
+        Create a full scraper output envelope for the pipeline.
+
+        This creates the complete structure expected by Pipeline.process().
+
+        Args:
+            scraped_reviews: List of reviews from scraper
+            job_id: Optional job ID (generates UUID if not provided)
+
+        Returns:
+            Complete scraper output dict for pipeline ingestion
+        """
+        if job_id is None:
+            job_id = str(uuid4())
+
+        transformed = self.transform(scraped_reviews)
+
+        return {
+            "job_id": job_id,
+            "status": "completed",
+            "business_id": self.business_id,
+            "place_id": self.place_id,
+            "business_info": {
+                "name": self.business_id,  # business_id doubles as the name
+                "place_id": self.place_id,
+            },
+            "reviews": transformed,
+            "scrape_time_ms": 0,  # hard-coded; the adapter does not time scrapes
+            "reviews_scraped": len(transformed),  # after empty/duplicate filtering
+            "scraper_version": "v1.0.0",
+        }
+
+
+def load_and_transform(
+    file_path: str,
+    business_id: str,
+    place_id: str,
+) -> dict[str, Any]:
+    """
+    Load scraped reviews from a JSON file and build the pipeline envelope.
+
+    Args:
+        file_path: Path to a UTF-8 JSON file holding a list of scraped reviews
+        business_id: Business identifier
+        place_id: Google Maps place ID
+
+    Returns:
+        Complete scraper output dict for pipeline ingestion
+    """
+    import json
+    from pathlib import Path
+
+    # Decode explicitly as UTF-8 rather than with the platform locale encoding.
+    data = json.loads(Path(file_path).read_text(encoding="utf-8"))
+    adapter = ScraperV1Adapter(business_id, place_id)
+    return adapter.to_scraper_output(data)