Initial commit - WhyRating Engine (Google Reviews Scraper)

2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions
--- a/packages/reviewiq-pipeline/scripts/backfill_review_facts.py
+++ b/packages/reviewiq-pipeline/scripts/backfill_review_facts.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+"""
+Backfill review_facts_v1 from public.jobs.reviews_data.
+
+Parses relative timestamps ("17 hours ago", "2 weeks ago") into absolute
+timestamps anchored to job.created_at.
+
+Usage:
+    python backfill_review_facts.py
+    python backfill_review_facts.py --dry-run
+    python backfill_review_facts.py --job-id <uuid>
+"""
+
+import argparse
+import asyncio
+import json
+import os
+import re
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import asyncpg
+
+# Database URL
+DB_URL = os.environ.get(
+    "DATABASE_URL",
+    "postgresql://scraper:scraper123@localhost:5437/scraper"
+)
+
+
+# =============================================================================
+# RELATIVE TIMESTAMP PARSER
+# =============================================================================
+
+# Regex patterns for relative timestamps
+RELATIVE_PATTERNS = [
+    # "17 hours ago", "2 weeks ago", "a month ago"
+    (r"(?:edited\s+)?(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", "standard"),
+    # "just now"
+    (r"just\s+now", "just_now"),
+    # "yesterday"
+    (r"yesterday", "yesterday"),
+    # "today"
+    (r"today", "today"),
+]
+
+# Time unit multipliers (in seconds)
+TIME_UNITS = {
+    "second": 1,
+    "minute": 60,
+    "hour": 3600,
+    "day": 86400,
+    "week": 604800,
+    "month": 2592000,  # 30 days
+    "year": 31536000,  # 365 days
+}
+
+
+def parse_relative_timestamp(raw: str, reference_time: datetime) -> datetime | None:
+    """
+    Parse a relative timestamp string into an absolute datetime.
+
+    Args:
+        raw: Relative timestamp like "17 hours ago", "Edited 2 weeks ago"
+        reference_time: The reference point (usually job.created_at)
+
+    Returns:
+        Absolute datetime or None if parsing failed
+    """
+    if not raw:
+        return None
+
+    text = raw.lower().strip()
+
+    # Handle "just now"
+    if "just now" in text:
+        return reference_time
+
+    # Handle "yesterday"
+    if text == "yesterday":
+        return reference_time - timedelta(days=1)
+
+    # Handle "today"
+    if text == "today":
+        return reference_time
+
+    # Handle standard relative format
+    # Remove "edited " prefix if present
+    text = re.sub(r"^edited\s+", "", text)
+
+    # Match "N unit(s) ago"
+    match = re.match(r"(\d+|a|an)\s+(second|minute|hour|day|week|month|year)s?\s+ago", text)
+    if match:
+        quantity_str = match.group(1)
+        unit = match.group(2)
+
+        # Convert "a"/"an" to 1
+        if quantity_str in ("a", "an"):
+            quantity = 1
+        else:
+            quantity = int(quantity_str)
+
+        seconds = quantity * TIME_UNITS.get(unit, 0)
+        return reference_time - timedelta(seconds=seconds)
+
+    # Unknown format
+    return None
+
+
+def parse_relative_timestamp_safe(raw: str, reference_time: datetime) -> tuple[datetime | None, bool]:
+    """
+    Safe wrapper that returns (parsed_time, success).
+    """
+    try:
+        result = parse_relative_timestamp(raw, reference_time)
+        return result, result is not None
+    except Exception:
+        return None, False
+
+
+# =============================================================================
+# BACKFILL LOGIC
+# =============================================================================
+
+async def get_jobs_with_reviews(pool: asyncpg.Pool, job_id: str | None = None) -> list[dict]:
+    """Get all jobs with reviews_data."""
+    if job_id:
+        query = """
+            SELECT job_id, created_at, reviews_data,
+                   COALESCE(metadata->>'business_name', url) as business_id
+            FROM public.jobs
+            WHERE job_id = $1
+              AND reviews_data IS NOT NULL
+              AND jsonb_typeof(reviews_data) = 'array'
+        """
+        rows = await pool.fetch(query, job_id)
+    else:
+        query = """
+            SELECT job_id, created_at, reviews_data,
+                   COALESCE(metadata->>'business_name', url) as business_id
+            FROM public.jobs
+            WHERE reviews_data IS NOT NULL
+              AND jsonb_typeof(reviews_data) = 'array'
+            ORDER BY created_at DESC
+        """
+        rows = await pool.fetch(query)
+
+    return [dict(r) for r in rows]
+
+
+async def get_run_id_for_job(pool: asyncpg.Pool, job_id: str) -> str | None:
+    """Get the run_id associated with a job from detected_spans_v2."""
+    row = await pool.fetchrow("""
+        SELECT DISTINCT run_id FROM pipeline.detected_spans_v2
+        WHERE job_id = $1 AND run_id IS NOT NULL
+        LIMIT 1
+    """, job_id)
+    return str(row["run_id"]) if row and row["run_id"] else None
+
+
+async def get_language_for_review(pool: asyncpg.Pool, review_id: str) -> str | None:
+    """Get detected language for a review from spans."""
+    row = await pool.fetchrow("""
+        SELECT language FROM pipeline.detected_spans_v2
+        WHERE review_id = $1 AND language IS NOT NULL
+        LIMIT 1
+    """, review_id)
+    return row["language"] if row else None
+
+
+async def upsert_review_facts(
+    pool: asyncpg.Pool,
+    facts: list[dict],
+    dry_run: bool = False,
+) -> tuple[int, int]:
+    """
+    Upsert review facts into the database.
+
+    Returns:
+        (inserted_count, updated_count)
+    """
+    if dry_run or not facts:
+        return 0, 0
+
+    # Use executemany with ON CONFLICT
+    query = """
+        INSERT INTO pipeline.review_facts_v1
+        (review_id, business_id, job_id, run_id, rating, review_time_utc, raw_timestamp, author, language)
+        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+        ON CONFLICT (review_id) DO UPDATE SET
+            business_id = EXCLUDED.business_id,
+            job_id = EXCLUDED.job_id,
+            run_id = COALESCE(EXCLUDED.run_id, pipeline.review_facts_v1.run_id),
+            rating = EXCLUDED.rating,
+            review_time_utc = EXCLUDED.review_time_utc,
+            raw_timestamp = EXCLUDED.raw_timestamp,
+            author = EXCLUDED.author,
+            language = COALESCE(EXCLUDED.language, pipeline.review_facts_v1.language)
+    """
+
+    # Prepare records
+    records = [
+        (
+            f["review_id"],
+            f["business_id"],
+            f["job_id"],
+            f.get("run_id"),
+            f.get("rating"),
+            f.get("review_time_utc"),
+            f.get("raw_timestamp"),
+            f.get("author"),
+            f.get("language"),
+        )
+        for f in facts
+    ]
+
+    await pool.executemany(query, records)
+    return len(records), 0
+
+
+async def backfill_job(
+    pool: asyncpg.Pool,
+    job: dict,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> dict[str, Any]:
+    """
+    Backfill review facts for a single job.
+
+    Returns:
+        Stats dict with counts and errors
+    """
+    job_id = job["job_id"]
+    job_created = job["created_at"]
+    business_id = job["business_id"]
+    reviews_data = job["reviews_data"]
+
+    # asyncpg may return JSONB as string
+    if isinstance(reviews_data, str):
+        reviews_data = json.loads(reviews_data)
+
+    # Make job_created timezone-aware if it isn't
+    if job_created.tzinfo is None:
+        job_created = job_created.replace(tzinfo=timezone.utc)
+
+    # Get run_id for this job
+    run_id = await get_run_id_for_job(pool, str(job_id))
+
+    stats = {
+        "job_id": str(job_id),
+        "total_reviews": 0,
+        "parsed_ok": 0,
+        "parsed_failed": 0,
+        "inserted": 0,
+        "sample_failures": [],
+    }
+
+    facts = []
+
+    for review in reviews_data:
+        stats["total_reviews"] += 1
+
+        # Handle both dict and JSON string
+        if isinstance(review, str):
+            try:
+                review = json.loads(review)
+            except json.JSONDecodeError:
+                continue
+
+        review_id = review.get("review_id")
+        if not review_id:
+            continue
+
+        raw_timestamp = review.get("timestamp", "")
+        review_time, success = parse_relative_timestamp_safe(raw_timestamp, job_created)
+
+        if success:
+            stats["parsed_ok"] += 1
+        else:
+            stats["parsed_failed"] += 1
+            if len(stats["sample_failures"]) < 5:
+                stats["sample_failures"].append(raw_timestamp)
+
+        # Get language from spans if available
+        language = await get_language_for_review(pool, review_id) if not dry_run else None
+
+        facts.append({
+            "review_id": review_id,
+            "business_id": business_id,
+            "job_id": job_id,
+            "run_id": run_id,
+            "rating": review.get("rating"),
+            "review_time_utc": review_time,
+            "raw_timestamp": raw_timestamp,
+            "author": review.get("author"),
+            "language": language,
+        })
+
+    # Upsert
+    inserted, _ = await upsert_review_facts(pool, facts, dry_run=dry_run)
+    stats["inserted"] = inserted
+
+    if verbose:
+        print(f"  Job {job_id}: {stats['total_reviews']} reviews, "
+              f"{stats['parsed_ok']} parsed OK, {stats['parsed_failed']} failed")
+        if stats["sample_failures"]:
+            print(f"    Sample failures: {stats['sample_failures'][:3]}")
+
+    return stats
+
+
+async def backfill_all(
+    pool: asyncpg.Pool,
+    job_id: str | None = None,
+    dry_run: bool = False,
+    verbose: bool = False,
+) -> dict[str, Any]:
+    """
+    Backfill review facts for all jobs (or a specific job).
+
+    Returns:
+        Aggregate stats
+    """
+    jobs = await get_jobs_with_reviews(pool, job_id)
+
+    print(f"\n{'[DRY RUN] ' if dry_run else ''}Backfilling review_facts_v1 from {len(jobs)} jobs...")
+
+    aggregate = {
+        "jobs_processed": 0,
+        "total_reviews": 0,
+        "parsed_ok": 0,
+        "parsed_failed": 0,
+        "inserted": 0,
+        "unique_failure_patterns": set(),
+    }
+
+    for i, job in enumerate(jobs, 1):
+        if verbose:
+            print(f"\n[{i}/{len(jobs)}] Processing job {job['job_id']}...")
+
+        stats = await backfill_job(pool, job, dry_run=dry_run, verbose=verbose)
+
+        aggregate["jobs_processed"] += 1
+        aggregate["total_reviews"] += stats["total_reviews"]
+        aggregate["parsed_ok"] += stats["parsed_ok"]
+        aggregate["parsed_failed"] += stats["parsed_failed"]
+        aggregate["inserted"] += stats["inserted"]
+        aggregate["unique_failure_patterns"].update(stats["sample_failures"])
+
+    # Convert set to list for JSON serialization
+    aggregate["unique_failure_patterns"] = list(aggregate["unique_failure_patterns"])[:20]
+
+    return aggregate
+
+
+# =============================================================================
+# CLI
+# =============================================================================
+
+async def main_async(args):
+    """Main async entry point."""
+    pool = await asyncpg.create_pool(DB_URL)
+
+    try:
+        stats = await backfill_all(
+            pool,
+            job_id=args.job_id,
+            dry_run=args.dry_run,
+            verbose=args.verbose,
+        )
+
+        print("\n" + "=" * 60)
+        print("BACKFILL COMPLETE")
+        print("=" * 60)
+        print(f"Jobs processed:    {stats['jobs_processed']}")
+        print(f"Total reviews:     {stats['total_reviews']}")
+        print(f"Timestamps parsed: {stats['parsed_ok']} ({stats['parsed_ok']/max(stats['total_reviews'],1)*100:.1f}%)")
+        print(f"Timestamps failed: {stats['parsed_failed']} ({stats['parsed_failed']/max(stats['total_reviews'],1)*100:.1f}%)")
+        if not args.dry_run:
+            print(f"Records upserted:  {stats['inserted']}")
+
+        if stats["unique_failure_patterns"]:
+            print(f"\nUnparsed timestamp patterns ({len(stats['unique_failure_patterns'])}):")
+            for p in stats["unique_failure_patterns"][:10]:
+                print(f"  - \"{p}\"")
+
+        # Calculate coverage
+        coverage = stats['parsed_ok'] / max(stats['total_reviews'], 1) * 100
+        if coverage < 90:
+            print(f"\n⚠️  WARNING: Timestamp coverage is {coverage:.1f}% (target: >90%)")
+        else:
+            print(f"\n✅ Timestamp coverage: {coverage:.1f}%")
+
+    finally:
+        await pool.close()
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Backfill review_facts_v1")
+    parser.add_argument("--job-id", help="Process a specific job only")
+    parser.add_argument("--dry-run", action="store_true", help="Don't write to database")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+
+    args = parser.parse_args()
+    asyncio.run(main_async(args))
+
+
+if __name__ == "__main__":
+    main()