feat: Add ScraperV1Adapter and real data pipeline test
- Add ScraperV1Adapter to transform scraped reviews into pipeline format - Handles relative timestamps (centerDate) - Generates deterministic IDs for DOM-sourced reviews - Filters out empty (rating-only) reviews - Add sample barbershop reviews (79 reviews, 46 with text) - Real data from Las Palmas barbershop - Multi-language: Spanish, English, German, Norwegian, Italian - Add test_pipeline_real_data.py for E2E testing with real data - Uses mock classifier based on keywords and rating - Full pipeline flow: raw -> enriched -> spans -> issues -> facts Test results with real data: - 46 reviews processed - 6 languages detected (es: 35, en: 7, de: 1, no: 1, it: 1, ca: 1) - 3 issues identified from negative reviews - 29 fact records aggregated across date range 2017-2025 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""Ingestion adapters for various review data formats."""
|
||||
|
||||
from reviewiq_pipeline.adapters.scraper_v1 import ScraperV1Adapter
|
||||
|
||||
__all__ = ["ScraperV1Adapter"]
|
||||
@@ -0,0 +1,242 @@
|
||||
"""
|
||||
Adapter for Scraper V1 output format.
|
||||
|
||||
This adapter transforms the raw scraped review format into the pipeline's
|
||||
expected RawReview format for Stage 1 processing.
|
||||
|
||||
Input format (from scraper):
|
||||
{
|
||||
"text": "Review text...",
|
||||
"author": "Author Name",
|
||||
"rating": 5,
|
||||
"source": "api" | "dom",
|
||||
"review_id": "ABC123...", # Optional for DOM-sourced
|
||||
"timestamp": "2 months ago",
|
||||
"minDate": "2025-10-27T18:31:09.843Z",
|
||||
"maxDate": "2025-11-25T18:31:09.843Z",
|
||||
"centerDate": "2025-11-11T06:31:09.843Z"
|
||||
}
|
||||
|
||||
Output format (for pipeline):
|
||||
{
|
||||
"review_id": "ABC123...",
|
||||
"text": "Review text...",
|
||||
"rating": 5,
|
||||
"author_name": "Author Name",
|
||||
"author_id": None,
|
||||
"review_time": "2025-11-11T06:31:09.843Z",
|
||||
"relative_time": "2 months ago",
|
||||
"raw_payload": {...} # Original data
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ScraperV1Adapter:
|
||||
"""
|
||||
Adapter to transform Scraper V1 output into pipeline-compatible format.
|
||||
|
||||
The scraper produces reviews with relative timestamps ("2 months ago") and
|
||||
estimated date ranges. This adapter normalizes them into absolute timestamps
|
||||
using the centerDate field.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
business_id: str,
|
||||
place_id: str,
|
||||
source: str = "google",
|
||||
):
|
||||
"""
|
||||
Initialize the adapter.
|
||||
|
||||
Args:
|
||||
business_id: Business identifier for the reviews
|
||||
place_id: Google Maps place ID
|
||||
source: Review source platform (default: "google")
|
||||
"""
|
||||
self.business_id = business_id
|
||||
self.place_id = place_id
|
||||
self.source = source
|
||||
self._seen_ids: set[str] = set()
|
||||
|
||||
def transform(self, scraped_reviews: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Transform a list of scraped reviews into pipeline format.
|
||||
|
||||
Args:
|
||||
scraped_reviews: List of reviews from scraper output
|
||||
|
||||
Returns:
|
||||
List of reviews in pipeline RawReview format
|
||||
"""
|
||||
transformed = []
|
||||
skipped_empty = 0
|
||||
skipped_duplicate = 0
|
||||
|
||||
for review in scraped_reviews:
|
||||
result = self.transform_single(review)
|
||||
|
||||
if result is None:
|
||||
skipped_empty += 1
|
||||
continue
|
||||
|
||||
if result["review_id"] in self._seen_ids:
|
||||
skipped_duplicate += 1
|
||||
continue
|
||||
|
||||
self._seen_ids.add(result["review_id"])
|
||||
transformed.append(result)
|
||||
|
||||
logger.info(
|
||||
f"Transformed {len(transformed)} reviews "
|
||||
f"(skipped {skipped_empty} empty, {skipped_duplicate} duplicates)"
|
||||
)
|
||||
return transformed
|
||||
|
||||
def transform_single(self, review: dict[str, Any]) -> dict[str, Any] | None:
|
||||
"""
|
||||
Transform a single scraped review into pipeline format.
|
||||
|
||||
Args:
|
||||
review: Single review from scraper output
|
||||
|
||||
Returns:
|
||||
Review in pipeline RawReview format, or None if should be skipped
|
||||
"""
|
||||
text = (review.get("text") or "").strip()
|
||||
|
||||
# Skip empty reviews (rating-only)
|
||||
if not text:
|
||||
return None
|
||||
|
||||
# Generate review_id if missing (DOM-sourced reviews)
|
||||
review_id = review.get("review_id")
|
||||
if not review_id:
|
||||
# Generate deterministic ID from content
|
||||
content_hash = hashlib.sha256(
|
||||
f"{review.get('author', '')}:{text}:{review.get('rating', 0)}".encode()
|
||||
).hexdigest()[:16]
|
||||
review_id = f"DOM-{content_hash}"
|
||||
|
||||
# Parse review time from centerDate (best estimate)
|
||||
review_time = self._parse_review_time(review)
|
||||
|
||||
return {
|
||||
"review_id": review_id,
|
||||
"text": text,
|
||||
"rating": review.get("rating", 5),
|
||||
"author_name": review.get("author", "Anonymous"),
|
||||
"author_id": None, # Not available in scraper output
|
||||
"review_time": review_time,
|
||||
"relative_time": review.get("timestamp", ""),
|
||||
"raw_payload": review, # Preserve original data
|
||||
}
|
||||
|
||||
def _parse_review_time(self, review: dict[str, Any]) -> str:
|
||||
"""
|
||||
Parse the review time from available date fields.
|
||||
|
||||
Priority:
|
||||
1. centerDate (best estimate from relative time)
|
||||
2. minDate (earliest possible date)
|
||||
3. Current time (fallback)
|
||||
|
||||
Args:
|
||||
review: Review data with date fields
|
||||
|
||||
Returns:
|
||||
ISO 8601 timestamp string
|
||||
"""
|
||||
# Try centerDate first (most accurate estimate)
|
||||
center_date = review.get("centerDate")
|
||||
if center_date:
|
||||
try:
|
||||
# Parse and re-format to ensure consistency
|
||||
dt = datetime.fromisoformat(center_date.replace("Z", "+00:00"))
|
||||
return dt.isoformat()
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Try minDate as fallback
|
||||
min_date = review.get("minDate")
|
||||
if min_date:
|
||||
try:
|
||||
dt = datetime.fromisoformat(min_date.replace("Z", "+00:00"))
|
||||
return dt.isoformat()
|
||||
except (ValueError, TypeError):
|
||||
pass
|
||||
|
||||
# Final fallback: current time
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
def to_scraper_output(
|
||||
self,
|
||||
scraped_reviews: list[dict[str, Any]],
|
||||
job_id: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Create a full scraper output envelope for the pipeline.
|
||||
|
||||
This creates the complete structure expected by Pipeline.process().
|
||||
|
||||
Args:
|
||||
scraped_reviews: List of reviews from scraper
|
||||
job_id: Optional job ID (generates UUID if not provided)
|
||||
|
||||
Returns:
|
||||
Complete scraper output dict for pipeline ingestion
|
||||
"""
|
||||
if job_id is None:
|
||||
job_id = str(uuid4())
|
||||
|
||||
transformed = self.transform(scraped_reviews)
|
||||
|
||||
return {
|
||||
"job_id": job_id,
|
||||
"status": "completed",
|
||||
"business_id": self.business_id,
|
||||
"place_id": self.place_id,
|
||||
"business_info": {
|
||||
"name": self.business_id,
|
||||
"place_id": self.place_id,
|
||||
},
|
||||
"reviews": transformed,
|
||||
"scrape_time_ms": 0,
|
||||
"reviews_scraped": len(transformed),
|
||||
"scraper_version": "v1.0.0",
|
||||
}
|
||||
|
||||
|
||||
def load_and_transform(
    file_path: str,
    business_id: str,
    place_id: str,
) -> dict[str, Any]:
    """
    Convenience function to load a JSON file and transform it.

    The file is decoded as UTF-8 regardless of the platform's locale encoding,
    so non-ASCII review text is read the same way on every platform.

    Args:
        file_path: Path to JSON file with scraped reviews
        business_id: Business identifier
        place_id: Google Maps place ID

    Returns:
        Complete scraper output dict for pipeline ingestion
    """
    import json
    from pathlib import Path

    # Explicit encoding: the locale default (e.g. cp1252 on Windows) would
    # mis-decode or reject accented / non-Latin characters in the reviews.
    data = json.loads(Path(file_path).read_text(encoding="utf-8"))

    adapter = ScraperV1Adapter(business_id, place_id)
    return adapter.to_scraper_output(data)
|
||||
Reference in New Issue
Block a user