Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions
--- a/core/__init__.py
+++ b/core/__init__.py
--- a/core/config.py
+++ b/core/config.py
@@ -0,0 +1,82 @@
+"""
+Configuration management for Google Maps Reviews Scraper.
+"""
+
+import logging
+from pathlib import Path
+from typing import Dict, Any
+
+import yaml
+
+# Configure root logging as a side effect of importing this module.
+# The level comes from the LOG_LEVEL environment variable (case-insensitive);
+# a missing variable or a name that `logging` does not define falls back to INFO.
+import os
+log_level = getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO)
+logging.basicConfig(level=log_level, format="[%(asctime)s] %(levelname)s: %(message)s")
+log = logging.getLogger("scraper")
+
+# Default configuration path (relative, so it resolves against the current working directory)
+DEFAULT_CONFIG_PATH = Path("config.yaml")
+
+# Built-in defaults. load_config() starts from these values and merges the
+# contents of the YAML config file on top of them.
+DEFAULT_CONFIG = {
+    "url": "https://maps.app.goo.gl/6tkNMDjcj3SS6LJe9",
+    "headless": True,
+    "sort_by": "relevance",
+    "stop_on_match": False,
+    "overwrite_existing": False,
+    "use_mongodb": True,
+    # Nested section: merged key by key with a user-supplied "mongodb" mapping
+    "mongodb": {
+        "uri": "mongodb://localhost:27017",
+        "database": "reviews",
+        "collection": "google_reviews"
+    },
+    "backup_to_json": True,
+    "json_path": "google_reviews.json",
+    "seen_ids_path": "google_reviews.ids",
+    "convert_dates": True,
+    "download_images": True,
+    "image_dir": "review_images",
+    "download_threads": 4,
+    "store_local_paths": True,  # Option to control storing local image paths
+    "replace_urls": False,  # Option to control URL replacement
+    "custom_url_base": "https://mycustomurl.com",  # Base URL for replacement
+    "custom_url_profiles": "/profiles/",  # Path for profile images
+    "custom_url_reviews": "/reviews/",  # Path for review images
+    "preserve_original_urls": True,  # Option to preserve original URLs
+    "custom_params": {  # Custom parameters to add to each document
+        "company": "Thaitours",  # Default example
+        "source": "Google Maps"  # Default example
+    }
+}
+
+
+def load_config(config_path: Path = DEFAULT_CONFIG_PATH) -> Dict[str, Any]:
+    """
+    Load configuration from a YAML file, falling back to the built-in defaults.
+
+    The result always starts from a private deep copy of DEFAULT_CONFIG, so
+    merging user values never alters the module-level defaults (a shallow
+    copy would share the nested "mongodb" / "custom_params" dicts).
+
+    Args:
+        config_path: Path of the YAML file to read. When it does not exist,
+            a file holding the defaults is written there on a best-effort basis.
+
+    Returns:
+        The merged configuration dictionary. If the file cannot be read or
+        parsed, the error is logged and the defaults are returned.
+    """
+    # Local import keeps this function self-contained within the module.
+    import copy
+
+    def deep_update(base, overrides):
+        # Merge `overrides` into `base` in place; nested dicts are merged
+        # key by key instead of being replaced wholesale.
+        for key, value in overrides.items():
+            if isinstance(value, dict) and isinstance(base.get(key), dict):
+                deep_update(base[key], value)
+            else:
+                base[key] = value
+
+    config = copy.deepcopy(DEFAULT_CONFIG)
+
+    if config_path.exists():
+        try:
+            with open(config_path, 'r', encoding='utf-8') as f:
+                user_config = yaml.safe_load(f)
+            # An empty file parses to None: nothing to merge.
+            if user_config:
+                deep_update(config, user_config)
+                log.info(f"Loaded configuration from {config_path}")
+        except Exception as e:
+            log.error(f"Error loading config from {config_path}: {e}")
+            log.info("Using default configuration")
+    else:
+        log.info(f"Config file {config_path} not found, using default configuration")
+        # Create a default config file for future use. This is a convenience
+        # only, so a read-only or missing directory must not abort loading.
+        try:
+            with open(config_path, 'w', encoding='utf-8') as f:
+                yaml.dump(config, f, default_flow_style=False)
+            log.info(f"Created default configuration file at {config_path}")
+        except OSError as e:
+            log.warning(f"Could not create default configuration file at {config_path}: {e}")
+
+    return config
--- a/core/database.py
+++ b/core/database.py
@@ -0,0 +1,873 @@
+#!/usr/bin/env python3
+"""
+PostgreSQL database module for production microservice.
+Stores job metadata and reviews as JSONB.
+"""
+import asyncpg
+import json
+from datetime import datetime
+from typing import Optional, List, Dict, Any
+from uuid import UUID, uuid4
+import logging
+
+from core.enums import JobStatus
+
+log = logging.getLogger(__name__)
+
+
+class DatabaseManager:
+    """PostgreSQL database manager with connection pooling"""
+
+    def __init__(self, database_url: str):
+        """
+        Remember the connection URL; no connection is opened here.
+
+        Args:
+            database_url: PostgreSQL connection URL
+                         Format: postgresql://user:password@host:port/database
+        """
+        # The pool stays unset until connect() has been awaited.
+        self.pool: Optional[asyncpg.Pool] = None
+        self.database_url = database_url
+
+    async def connect(self):
+        """Open the asyncpg connection pool and keep it on self.pool"""
+        log.info("Connecting to PostgreSQL database...")
+        # Pool sizing and per-command timeout (seconds) passed to asyncpg.
+        pool_options = {
+            "min_size": 5,
+            "max_size": 20,
+            "command_timeout": 60,
+        }
+        self.pool = await asyncpg.create_pool(self.database_url, **pool_options)
+        log.info("Database connection pool created")
+
+    async def disconnect(self):
+        """Close the connection pool; does nothing when no pool is set"""
+        if not self.pool:
+            return
+        await self.pool.close()
+        log.info("Database connection pool closed")
+
+    async def initialize_schema(self):
+        """
+        Create database schema if it doesn't exist.
+
+        Runs on a single pooled connection and issues, in order, the DDL for
+        the jobs, canary_results, webhook_attempts and crash_reports tables,
+        their indexes, and the columns added to jobs after its first version.
+        Every CREATE / ADD COLUMN uses IF NOT EXISTS, so the method can be
+        called again on an existing database. The one exception is the
+        valid_status CHECK constraint, which is dropped and re-added on
+        every call.
+
+        self.pool must already be set (see connect()); statement order
+        matters because later tables reference jobs(job_id).
+        """
+        async with self.pool.acquire() as conn:
+            # Create jobs table
+            # NOTE(review): gen_random_uuid() must be provided by the server
+            # (core function or extension) - confirm for the target PostgreSQL version.
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS jobs (
+                    job_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+                    status VARCHAR(20) NOT NULL DEFAULT 'pending',
+                    url TEXT NOT NULL,
+                    webhook_url TEXT,
+                    webhook_secret TEXT,
+
+                    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+                    started_at TIMESTAMP,
+                    completed_at TIMESTAMP,
+                    updated_at TIMESTAMP,
+
+                    reviews_count INTEGER,
+                    total_reviews INTEGER,
+                    reviews_data JSONB,
+                    scrape_time REAL,
+
+                    error_message TEXT,
+                    metadata JSONB,
+                    scrape_logs JSONB,
+
+                    CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
+                );
+            """)
+
+            # Add scrape_logs column if it doesn't exist (for existing databases)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
+            """)
+
+            # Add updated_at column if it doesn't exist (for incremental progress tracking)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
+            """)
+
+            # Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
+            """)
+
+            # Update constraint to include 'partial' status (for existing databases).
+            # Done as DROP followed by ADD in two separate statements, so the
+            # constraint is briefly absent between them.
+            await conn.execute("""
+                ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
+            """)
+            await conn.execute("""
+                ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
+            """)
+
+            # Create indexes: by status, by newest-first creation time, and a
+            # partial index covering only jobs that have a webhook_url
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
+            """)
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_jobs_created_at ON jobs(created_at DESC);
+            """)
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_jobs_webhook ON jobs(webhook_url) WHERE webhook_url IS NOT NULL;
+            """)
+
+            # Create canary results table
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS canary_results (
+                    id SERIAL PRIMARY KEY,
+                    timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
+                    success BOOLEAN NOT NULL,
+                    reviews_count INTEGER,
+                    scrape_time REAL,
+                    error_message TEXT,
+                    metadata JSONB
+                );
+            """)
+
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_canary_timestamp ON canary_results(timestamp DESC);
+            """)
+
+            # Create webhook attempts table (for retry tracking).
+            # Rows are removed together with their job (ON DELETE CASCADE).
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS webhook_attempts (
+                    id SERIAL PRIMARY KEY,
+                    job_id UUID NOT NULL REFERENCES jobs(job_id) ON DELETE CASCADE,
+                    attempt_number INTEGER NOT NULL,
+                    timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
+                    success BOOLEAN NOT NULL,
+                    status_code INTEGER,
+                    error_message TEXT,
+                    response_time_ms REAL
+                );
+            """)
+
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
+            """)
+
+            # Add session_fingerprint and metrics_history columns to jobs table
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
+            """)
+            await conn.execute("""
+                ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
+            """)
+
+            # Create crash_reports table.
+            # job_id is nullable here, and reports are removed together with
+            # their job (ON DELETE CASCADE).
+            await conn.execute("""
+                CREATE TABLE IF NOT EXISTS crash_reports (
+                    crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+                    job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
+                    created_at TIMESTAMP NOT NULL DEFAULT NOW(),
+                    crash_type VARCHAR(50) NOT NULL,
+                    error_message TEXT,
+                    state JSONB NOT NULL,
+                    metrics_history JSONB,
+                    logs_before_crash JSONB,
+                    analysis JSONB,
+                    screenshot_url TEXT,
+                    dom_snapshot_id UUID
+                );
+            """)
+
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
+            """)
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
+            """)
+            await conn.execute("""
+                CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
+            """)
+
+            log.info("Database schema initialized")
+
+    # ==================== Job Operations ====================
+
+    async def create_job(
+        self,
+        url: str,
+        webhook_url: Optional[str] = None,
+        webhook_secret: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> UUID:
+        """
+        Insert a new row into jobs and return its generated id.
+
+        Args:
+            url: Google Maps URL to scrape
+            webhook_url: Optional webhook URL for notifications
+            webhook_secret: Optional secret for webhook signature
+            metadata: Optional additional metadata; stored as JSON, or NULL
+                when omitted or empty
+
+        Returns:
+            UUID of created job
+        """
+        async with self.pool.acquire() as conn:
+            # Empty/absent metadata is stored as NULL rather than '{}' or 'null'.
+            metadata_json = json.dumps(metadata) if metadata else None
+            insert_sql = """
+                INSERT INTO jobs (url, webhook_url, webhook_secret, metadata)
+                VALUES ($1, $2, $3, $4)
+                RETURNING job_id
+            """
+            new_job_id = await conn.fetchval(
+                insert_sql, url, webhook_url, webhook_secret, metadata_json
+            )
+
+            # Only the first 80 characters of the URL are logged.
+            log.info(f"Created job {new_job_id} for URL: {url[:80]}...")
+            return new_job_id
+
+    async def get_job(self, job_id: UUID) -> Optional[Dict[str, Any]]:
+        """
+        Fetch a single job row by its id.
+
+        The selected columns do not include webhook_secret.
+
+        Args:
+            job_id: Job UUID
+
+        Returns:
+            Job dictionary or None if not found
+        """
+        async with self.pool.acquire() as conn:
+            select_sql = """
+                SELECT
+                    job_id,
+                    status,
+                    url,
+                    webhook_url,
+                    created_at,
+                    started_at,
+                    completed_at,
+                    updated_at,
+                    reviews_count,
+                    total_reviews,
+                    reviews_data,
+                    scrape_time,
+                    error_message,
+                    metadata,
+                    scrape_logs,
+                    review_topics
+                FROM jobs
+                WHERE job_id = $1
+            """
+            record = await conn.fetchrow(select_sql, job_id)
+
+            return dict(record) if record else None
+
+    async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
+        """
+        Return the stored reviews of one job.
+
+        Args:
+            job_id: Job UUID
+            include_partial: If True, also return reviews for running and partial jobs
+
+        Returns:
+            List of reviews, or None when the job does not exist, is not in an
+            accepted status, or has no (or empty) reviews_data
+        """
+        async with self.pool.acquire() as conn:
+            if not include_partial:
+                # Strict mode: completed jobs only
+                payload = await conn.fetchval("""
+                    SELECT reviews_data
+                    FROM jobs
+                    WHERE job_id = $1 AND status = 'completed'
+                """, job_id)
+            else:
+                # Lenient mode: completed, running, or partial jobs
+                payload = await conn.fetchval("""
+                    SELECT reviews_data
+                    FROM jobs
+                    WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
+                """, job_id)
+
+            if not payload:
+                return None
+
+            # The JSONB value may arrive as a JSON string; decode it in that
+            # case and pass through anything already decoded.
+            return json.loads(payload) if isinstance(payload, str) else payload
+
+    async def update_job_status(
+        self,
+        job_id: UUID,
+        status: JobStatus,
+        **kwargs
+    ):
+        """
+        Update job status and optional fields.
+
+        started_at is filled in automatically when moving to RUNNING, and
+        completed_at when moving to COMPLETED, FAILED or CANCELLED, unless
+        the caller supplies them. Both use datetime.now() (naive local time).
+
+        Args:
+            job_id: Job UUID
+            status: New status
+            **kwargs: Additional columns to update (started_at, completed_at,
+                error_message, etc.). Values for JSONB columns may be passed
+                as Python objects or as pre-serialized JSON strings.
+
+        Raises:
+            ValueError: If a keyword is not a plain identifier. Column names
+                are interpolated into the SQL text, so anything else is
+                rejected before the query is built.
+        """
+        # Columns declared as JSONB on the jobs table; their values must
+        # reach asyncpg as JSON text.
+        jsonb_columns = {
+            'reviews_data',
+            'metadata',
+            'scrape_logs',
+            'review_topics',
+            'session_fingerprint',
+            'metrics_history',
+        }
+
+        if status == JobStatus.RUNNING and 'started_at' not in kwargs:
+            kwargs['started_at'] = datetime.now()
+        elif status in [JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED] and 'completed_at' not in kwargs:
+            kwargs['completed_at'] = datetime.now()
+
+        # Build dynamic UPDATE query: $1 is the job id, $2 the status,
+        # extra columns are numbered from $3.
+        set_clauses = ["status = $2"]
+        params = [job_id, status.value]
+
+        for param_idx, (key, value) in enumerate(kwargs.items(), start=3):
+            if not key.isidentifier():
+                raise ValueError(f"Invalid column name: {key!r}")
+
+            if key in jsonb_columns and value is not None:
+                set_clauses.append(f"{key} = ${param_idx}::jsonb")
+                params.append(value if isinstance(value, str) else json.dumps(value))
+            else:
+                # Plain columns, and JSONB columns being set to NULL
+                set_clauses.append(f"{key} = ${param_idx}")
+                params.append(value)
+
+        query = f"""
+            UPDATE jobs
+            SET {', '.join(set_clauses)}
+            WHERE job_id = $1
+        """
+
+        async with self.pool.acquire() as conn:
+            await conn.execute(query, *params)
+
+    async def save_job_result(
+        self,
+        job_id: UUID,
+        reviews: List[Dict[str, Any]],
+        scrape_time: float,
+        total_reviews: Optional[int] = None,
+        scrape_logs: Optional[List[Dict[str, Any]]] = None,
+        review_topics: Optional[List[Dict[str, Any]]] = None
+    ):
+        """
+        Mark a job as completed and store its final results.
+
+        When `reviews` is empty but the row already has a positive
+        reviews_count (written by incremental saves during scraping), the
+        stored reviews_data and reviews_count are left untouched and only
+        the completion fields are written.
+
+        Args:
+            job_id: Job UUID
+            reviews: List of review dictionaries
+            scrape_time: Time taken to scrape in seconds
+            total_reviews: Total reviews available (from page counter)
+            scrape_logs: List of log entries from the scraper
+            review_topics: List of topic filter dictionaries with topic and count
+        """
+        async with self.pool.acquire() as conn:
+            if not reviews:
+                # Nothing passed in: see whether earlier flushes already
+                # stored reviews for this job.
+                saved_count = await conn.fetchval(
+                    "SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
+                )
+                if saved_count and saved_count > 0:
+                    logs_json = json.dumps(scrape_logs) if scrape_logs else None
+                    topics_json = json.dumps(review_topics) if review_topics else None
+                    # Finish the job without overwriting reviews_data
+                    await conn.execute("""
+                        UPDATE jobs
+                        SET
+                            status = 'completed',
+                            completed_at = NOW(),
+                            total_reviews = COALESCE($2, total_reviews),
+                            scrape_time = $3,
+                            scrape_logs = $4::jsonb,
+                            review_topics = $5::jsonb
+                        WHERE job_id = $1
+                    """, job_id, total_reviews, scrape_time, logs_json, topics_json)
+                    log.info(f"Completed job {job_id} with {saved_count} reviews (from incremental saves)")
+                    return
+
+            # Regular path: write the full result set in one UPDATE
+            review_count = len(reviews)
+            reviews_json = json.dumps(reviews)
+            logs_json = json.dumps(scrape_logs) if scrape_logs else None
+            topics_json = json.dumps(review_topics) if review_topics else None
+            await conn.execute("""
+                UPDATE jobs
+                SET
+                    status = 'completed',
+                    completed_at = NOW(),
+                    reviews_count = $2,
+                    total_reviews = $3,
+                    reviews_data = $4::jsonb,
+                    scrape_time = $5,
+                    scrape_logs = $6::jsonb,
+                    review_topics = $7::jsonb
+                WHERE job_id = $1
+            """, job_id, review_count, total_reviews, reviews_json, scrape_time,
+                logs_json, topics_json)
+
+            log.info(f"Saved {review_count} reviews for job {job_id}")
+
+    async def save_reviews_incremental(
+        self,
+        job_id: UUID,
+        reviews: List[Dict[str, Any]],
+        total_reviews: Optional[int] = None
+    ):
+        """
+        Persist the reviews gathered so far while a scrape is in progress.
+
+        The row is only updated while the job's status is 'running'; for any
+        other status the statement matches nothing.
+
+        Args:
+            job_id: Job UUID
+            reviews: ALL reviews collected so far (not just new ones)
+            total_reviews: Total reviews available (from page counter);
+                None keeps the value already stored
+        """
+        async with self.pool.acquire() as conn:
+            collected = len(reviews)
+            snapshot_json = json.dumps(reviews)
+            update_sql = """
+                UPDATE jobs
+                SET
+                    reviews_count = $2,
+                    total_reviews = COALESCE($3, total_reviews),
+                    reviews_data = $4::jsonb,
+                    updated_at = NOW()
+                WHERE job_id = $1 AND status = 'running'
+            """
+            await conn.execute(update_sql, job_id, collected, total_reviews, snapshot_json)
+
+            log.debug(f"Incremental save: {collected} reviews for job {job_id}")
+
+    async def update_session_fingerprint(
+        self,
+        job_id: UUID,
+        session_fingerprint: Dict[str, Any]
+    ):
+        """
+        Store the browser session fingerprint on a job row.
+
+        Intended to be called early in a scrape, once the browser
+        fingerprint has been captured, so the browser characteristics are
+        available for bot detection analysis. Also refreshes updated_at.
+
+        Args:
+            job_id: Job UUID
+            session_fingerprint: Dictionary containing browser fingerprint data:
+                - user_agent: Browser user agent string
+                - platform: OS platform
+                - language: Primary language
+                - languages: List of accepted languages
+                - timezone: Timezone string
+                - screen: {width, height, colorDepth}
+                - viewport: {width, height}
+                - webgl_vendor: WebGL vendor string
+                - webgl_renderer: WebGL renderer string
+                - canvas_fingerprint: Canvas fingerprint hash
+                - hardware_concurrency: Number of CPU cores
+                - device_memory: Device memory in GB
+                - bot_detection_tests: {webdriver_hidden, chrome_runtime, permissions_query}
+                - captured_at: ISO timestamp when fingerprint was captured
+        """
+        async with self.pool.acquire() as conn:
+            fingerprint_json = json.dumps(session_fingerprint)
+            update_sql = """
+                UPDATE jobs
+                SET
+                    session_fingerprint = $2::jsonb,
+                    updated_at = NOW()
+                WHERE job_id = $1
+            """
+            await conn.execute(update_sql, job_id, fingerprint_json)
+
+            log.debug(f"Updated session fingerprint for job {job_id}")
+
+    async def mark_job_partial(
+        self,
+        job_id: UUID,
+        error_message: str,
+        scrape_logs: Optional[List[Dict[str, Any]]] = None
+    ):
+        """
+        Flag a job as 'partial': it crashed, but some reviews were saved.
+
+        Sets completed_at to the current database time and overwrites
+        error_message and scrape_logs.
+
+        Args:
+            job_id: Job UUID
+            error_message: Error that caused the crash
+            scrape_logs: Log entries from the scraper; stored as NULL when
+                omitted or empty
+        """
+        async with self.pool.acquire() as conn:
+            logs_json = json.dumps(scrape_logs) if scrape_logs else None
+            update_sql = """
+                UPDATE jobs
+                SET
+                    status = 'partial',
+                    completed_at = NOW(),
+                    error_message = $2,
+                    scrape_logs = $3::jsonb
+                WHERE job_id = $1
+            """
+            await conn.execute(update_sql, job_id, error_message, logs_json)
+
+            log.info(f"Marked job {job_id} as partial due to: {error_message}")
+
+    async def list_jobs(
+        self,
+        status: Optional[JobStatus] = None,
+        limit: int = 100,
+        offset: int = 0
+    ) -> List[Dict[str, Any]]:
+        """
+        Return a page of jobs, newest first, without their review payloads.
+
+        Args:
+            status: Optional status filter
+            limit: Maximum number of jobs to return
+            offset: Number of jobs to skip
+
+        Returns:
+            List of job dictionaries
+        """
+        async with self.pool.acquire() as conn:
+            if not status:
+                # No filter: page through every job
+                records = await conn.fetch("""
+                    SELECT
+                        job_id,
+                        status,
+                        url,
+                        created_at,
+                        completed_at,
+                        reviews_count,
+                        total_reviews,
+                        scrape_time,
+                        error_message,
+                        metadata,
+                        review_topics
+                    FROM jobs
+                    ORDER BY created_at DESC
+                    LIMIT $1 OFFSET $2
+                """, limit, offset)
+            else:
+                # Filter on the enum's stored value
+                records = await conn.fetch("""
+                    SELECT
+                        job_id,
+                        status,
+                        url,
+                        created_at,
+                        completed_at,
+                        reviews_count,
+                        total_reviews,
+                        scrape_time,
+                        error_message,
+                        metadata,
+                        review_topics
+                    FROM jobs
+                    WHERE status = $1
+                    ORDER BY created_at DESC
+                    LIMIT $2 OFFSET $3
+                """, status.value, limit, offset)
+
+            return list(map(dict, records))
+
+    async def get_pending_jobs_with_webhooks(self, limit: int = 100) -> List[Dict[str, Any]]:
+        """
+        Return finished jobs whose webhook has not been delivered yet.
+
+        "Pending" refers to the webhook, not the job: a job qualifies when it
+        has a webhook_url, its status is 'completed' or 'failed', and no
+        successful row exists for it in webhook_attempts. Oldest completions
+        come first.
+
+        Args:
+            limit: Maximum number of jobs to return
+
+        Returns:
+            List of job dictionaries with webhook info
+        """
+        async with self.pool.acquire() as conn:
+            select_sql = """
+                SELECT
+                    job_id,
+                    status,
+                    url,
+                    webhook_url,
+                    webhook_secret,
+                    reviews_count,
+                    scrape_time,
+                    error_message,
+                    completed_at
+                FROM jobs
+                WHERE webhook_url IS NOT NULL
+                  AND status IN ('completed', 'failed')
+                  AND job_id NOT IN (
+                      SELECT job_id
+                      FROM webhook_attempts
+                      WHERE success = true
+                  )
+                ORDER BY completed_at ASC
+                LIMIT $1
+            """
+            records = await conn.fetch(select_sql, limit)
+
+            return list(map(dict, records))
+
+    async def delete_job(self, job_id: UUID) -> bool:
+        """
+        Remove one job row by id.
+
+        Args:
+            job_id: Job UUID
+
+        Returns:
+            True if deleted, False if not found
+        """
+        async with self.pool.acquire() as conn:
+            status_tag = await conn.execute("""
+                DELETE FROM jobs WHERE job_id = $1
+            """, job_id)
+
+            # The last word of the command status is the affected row count.
+            row_count = status_tag.split()[-1]
+            if row_count != "1":
+                return False
+
+            log.info(f"Deleted job {job_id}")
+            return True
+
+    async def cleanup_old_jobs(self, max_age_days: int = 30):
+        """
+        Delete old completed/failed/cancelled jobs.
+
+        Jobs in any other status (including 'partial') are never removed here.
+
+        Args:
+            max_age_days: Maximum age in days before deletion, measured from
+                completed_at against the database clock
+        """
+        async with self.pool.acquire() as conn:
+            # asyncpg only binds $n placeholders. A printf-style '%s' inside
+            # the interval literal is never substituted, so the age is passed
+            # as a real parameter and turned into an interval server-side.
+            result = await conn.execute("""
+                DELETE FROM jobs
+                WHERE status IN ('completed', 'failed', 'cancelled')
+                  AND completed_at < NOW() - make_interval(days => $1)
+            """, max_age_days)
+
+            # The last word of the command status is the deleted row count.
+            deleted_count = int(result.split()[-1])
+            if deleted_count > 0:
+                log.info(f"Cleaned up {deleted_count} old jobs")
+
+    # ==================== Statistics ====================
+
+    async def get_stats(self) -> Dict[str, Any]:
+        """
+        Aggregate job counts and scrape metrics over the whole jobs table.
+
+        Returns:
+            Statistics dictionary with total_jobs, one count per status
+            (pending, running, completed, failed, cancelled), and
+            avg_scrape_time / total_reviews computed over completed jobs only
+        """
+        async with self.pool.acquire() as conn:
+            stats_sql = """
+                SELECT
+                    COUNT(*) as total_jobs,
+                    COUNT(*) FILTER (WHERE status = 'pending') as pending,
+                    COUNT(*) FILTER (WHERE status = 'running') as running,
+                    COUNT(*) FILTER (WHERE status = 'completed') as completed,
+                    COUNT(*) FILTER (WHERE status = 'failed') as failed,
+                    COUNT(*) FILTER (WHERE status = 'cancelled') as cancelled,
+                    AVG(scrape_time) FILTER (WHERE status = 'completed') as avg_scrape_time,
+                    SUM(reviews_count) FILTER (WHERE status = 'completed') as total_reviews
+                FROM jobs
+            """
+            summary_row = await conn.fetchrow(stats_sql)
+
+            return dict(summary_row)
+
+    # ==================== Canary Operations ====================
+
+    async def save_canary_result(
+        self,
+        success: bool,
+        reviews_count: Optional[int] = None,
+        scrape_time: Optional[float] = None,
+        error_message: Optional[str] = None,
+        metadata: Optional[Dict[str, Any]] = None
+    ):
+        """
+        Record the outcome of one canary test run.
+
+        Args:
+            success: Whether canary test succeeded
+            reviews_count: Number of reviews scraped
+            scrape_time: Time taken in seconds
+            error_message: Error message if failed
+            metadata: Additional metadata; stored as JSON, or NULL when
+                omitted or empty
+        """
+        async with self.pool.acquire() as conn:
+            metadata_json = json.dumps(metadata) if metadata else None
+            insert_sql = """
+                INSERT INTO canary_results (success, reviews_count, scrape_time, error_message, metadata)
+                VALUES ($1, $2, $3, $4, $5)
+            """
+            await conn.execute(
+                insert_sql, success, reviews_count, scrape_time, error_message, metadata_json
+            )
+
+    async def get_canary_history(self, limit: int = 100) -> List[Dict[str, Any]]:
+        """
+        Return the most recent canary results, newest first.
+
+        The metadata column is not part of the result.
+
+        Args:
+            limit: Maximum number of results to return
+
+        Returns:
+            List of canary result dictionaries
+        """
+        async with self.pool.acquire() as conn:
+            select_sql = """
+                SELECT
+                    timestamp,
+                    success,
+                    reviews_count,
+                    scrape_time,
+                    error_message
+                FROM canary_results
+                ORDER BY timestamp DESC
+                LIMIT $1
+            """
+            records = await conn.fetch(select_sql, limit)
+
+            return list(map(dict, records))
+
+    # ==================== Webhook Attempts ====================
+
+    async def log_webhook_attempt(
+        self,
+        job_id: UUID,
+        attempt_number: int,
+        success: bool,
+        status_code: Optional[int] = None,
+        error_message: Optional[str] = None,
+        response_time_ms: Optional[float] = None
+    ):
+        """
+        Record one webhook delivery attempt for a job.
+
+        Args:
+            job_id: Job UUID
+            attempt_number: Attempt number (1, 2, 3...)
+            success: Whether delivery succeeded
+            status_code: HTTP status code
+            error_message: Error message if failed
+            response_time_ms: Response time in milliseconds
+        """
+        # Values in the same order as the column list of the INSERT below
+        attempt_values = (
+            job_id,
+            attempt_number,
+            success,
+            status_code,
+            error_message,
+            response_time_ms,
+        )
+        async with self.pool.acquire() as conn:
+            await conn.execute("""
+                INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
+                VALUES ($1, $2, $3, $4, $5, $6)
+            """, *attempt_values)
+
+    # ==================== Crash Reports ====================
+
+    async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
+        """
+        Save a crash report and return the crash_id.
+
+        Args:
+            job_id: Job UUID as string (a UUID instance is accepted as well)
+            crash_data: Dictionary containing crash report data:
+                - crash_type: Type of crash (required)
+                - error_message: Error message (optional)
+                - state: Current state at crash time (required; a missing
+                  key is stored as an empty JSON object)
+                - metrics_history: Historical metrics (optional)
+                - logs_before_crash: Log entries before crash (optional)
+                - analysis: Crash analysis data (optional)
+                - screenshot_url: URL to screenshot (optional)
+                - dom_snapshot_id: UUID of DOM snapshot, as string or UUID
+                  instance (optional)
+
+        Returns:
+            UUID of created crash report as string
+
+        Raises:
+            ValueError: If job_id or dom_snapshot_id is a string that is not
+                a valid UUID.
+        """
+        def as_uuid(value):
+            # Accept both textual and already-parsed UUIDs; UUID() itself
+            # only takes the textual form.
+            return value if isinstance(value, UUID) else UUID(value)
+
+        def optional_json(key):
+            # Optional JSONB fields are stored as NULL when absent or empty.
+            value = crash_data.get(key)
+            return json.dumps(value) if value else None
+
+        async with self.pool.acquire() as conn:
+            job_uuid = as_uuid(job_id)
+
+            snapshot_id = crash_data.get('dom_snapshot_id')
+            snapshot_uuid = as_uuid(snapshot_id) if snapshot_id else None
+
+            crash_id = await conn.fetchval("""
+                INSERT INTO crash_reports (
+                    job_id,
+                    crash_type,
+                    error_message,
+                    state,
+                    metrics_history,
+                    logs_before_crash,
+                    analysis,
+                    screenshot_url,
+                    dom_snapshot_id
+                )
+                VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
+                RETURNING crash_id
+            """,
+                job_uuid,
+                crash_data.get('crash_type'),
+                crash_data.get('error_message'),
+                json.dumps(crash_data.get('state', {})),
+                optional_json('metrics_history'),
+                optional_json('logs_before_crash'),
+                optional_json('analysis'),
+                crash_data.get('screenshot_url'),
+                snapshot_uuid
+            )
+
+            log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
+            return str(crash_id)
+
+    async def get_crash_report(self, job_id: str) -> Optional[dict]:
+        """
+        Get crash report for a job, if any.
+
+        Args:
+            job_id: Job UUID as string
+
+        Returns:
+            Crash report dictionary or None if not found
+        """
+        async with self.pool.acquire() as conn:
+            job_uuid = UUID(job_id) if isinstance(job_id, str) else job_id
+
+            row = await conn.fetchrow("""
+                SELECT
+                    crash_id,
+                    job_id,
+                    created_at,
+                    crash_type,
+                    error_message,
+                    state,
+                    metrics_history,
+                    logs_before_crash,
+                    analysis,
+                    screenshot_url,
+                    dom_snapshot_id
+                FROM crash_reports
+                WHERE job_id = $1
+                ORDER BY created_at DESC
+                LIMIT 1
+            """, job_uuid)
+
+            if not row:
+                return None
+
+            result = dict(row)
+            # Convert UUIDs to strings for JSON serialization
+            result['crash_id'] = str(result['crash_id'])
+            result['job_id'] = str(result['job_id'])
+            if result.get('dom_snapshot_id'):
+                result['dom_snapshot_id'] = str(result['dom_snapshot_id'])
+
+            return result
+
+    async def get_crash_stats(self, days: int = 7) -> dict:
+        """
+        Get crash statistics for the last N days.
+
+        Args:
+            days: Number of days to look back (default: 7)
+
+        Returns:
+            Dictionary with:
+                - total: Total number of crashes
+                - by_type: Dict mapping crash type to count
+                - by_day: List of dicts with date and count
+        """
+        async with self.pool.acquire() as conn:
+            # Get total count
+            total = await conn.fetchval("""
+                SELECT COUNT(*)
+                FROM crash_reports
+                WHERE created_at >= NOW() - INTERVAL '%s days'
+            """, days)
+
+            # Get counts by type
+            type_rows = await conn.fetch("""
+                SELECT crash_type, COUNT(*) as count
+                FROM crash_reports
+                WHERE created_at >= NOW() - INTERVAL '%s days'
+                GROUP BY crash_type
+                ORDER BY count DESC
+            """, days)
+
+            by_type = {row['crash_type']: row['count'] for row in type_rows}
+
+            # Get counts by day
+            day_rows = await conn.fetch("""
+                SELECT DATE(created_at) as date, COUNT(*) as count
+                FROM crash_reports
+                WHERE created_at >= NOW() - INTERVAL '%s days'
+                GROUP BY DATE(created_at)
+                ORDER BY date DESC
+            """, days)
+
+            by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]
+
+            return {
+                'total': total or 0,
+                'by_type': by_type,
+                'by_day': by_day
+            }
--- a/core/enums.py
+++ b/core/enums.py
@@ -0,0 +1,14 @@
+"""
+Enumerations for the ReviewIQ project.
+"""
+from enum import Enum
+
+
+class JobStatus(str, Enum):
+    """
+    Job status enumeration.
+
+    Members also subclass ``str``, so each one compares equal to its
+    lowercase string value (e.g. ``JobStatus.PENDING == "pending"``).
+    """
+    PENDING = "pending"
+    RUNNING = "running"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    CANCELLED = "cancelled"
+    PARTIAL = "partial"  # Job crashed but has partial reviews saved
--- a/core/models.py
+++ b/core/models.py
@@ -0,0 +1,93 @@
+"""
+Data models for Google Maps Reviews Scraper.
+"""
+import re
+from dataclasses import dataclass, field
+
+from selenium.webdriver.remote.webelement import WebElement
+
+from utils.helpers import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
+
+
+@dataclass
+class RawReview:
+    """
+    Data class representing a raw review extracted from Google Maps.
+    """
+    id: str = ""
+    author: str = ""
+    rating: float = 0.0
+    date: str = ""
+    lang: str = "und"
+    text: str = ""
+    likes: int = 0
+    photos: list[str] = field(default_factory=list)
+    profile: str = ""
+    avatar: str = ""  # URL to profile picture
+    owner_date: str = ""
+    owner_text: str = ""
+    review_date: str = ""  # ISO format date
+
+    # Translation fields
+    translations: dict = field(default_factory=dict)  # Store translations by language code
+
+    # CSS Selectors for review elements
+    MORE_BTN = "button.kyuRq"
+    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
+    PHOTO_BTN = "button.Tya61d"
+    OWNER_RESP = "div.CDe7pd"
+
+    @classmethod
+    def from_card(cls, card: WebElement) -> "RawReview":
+        """Factory method to create a RawReview from a WebElement"""
+        # expand "More" - non-blocking approach
+        for b in try_find(card, cls.MORE_BTN, all=True):
+            try:
+                b.click()
+            except Exception:
+                pass
+
+        # Try to get data-review-id from the card itself, or from a child element
+        rid = card.get_attribute("data-review-id") or ""
+        if not rid:
+            # Try to find it in a child element
+            review_id_elem = try_find(card, "[data-review-id]")
+            if review_id_elem:
+                rid = review_id_elem[0].get_attribute("data-review-id") or ""
+        author = first_text(card, 'div[class*="d4r55"]')
+        profile = first_attr(card, 'button[data-review-id]', "data-href")
+        avatar = first_attr(card, 'button[data-review-id] img', "src")
+
+        label = first_attr(card, 'span[role="img"]', "aria-label")
+        num = re.search(r"[\d\.]+", label.replace(",", ".")) if label else None
+        rating = float(num.group()) if num else 0.0
+
+        date = first_text(card, 'span[class*="rsqaWe"]')
+        # Parse the date string to ISO format
+        review_date = parse_date_to_iso(date)
+
+        text = ""
+        for sel in ('span[jsname="bN97Pc"]',
+                    'span[jsname="fbQN7e"]',
+                    'div.MyEned span.wiI7pd'):
+            text = first_text(card, sel)
+            if text: break
+        lang = detect_lang(text)
+
+        likes = 0
+        if (btn := try_find(card, cls.LIKE_BTN)):
+            likes = safe_int(btn[0].text or btn[0].get_attribute("aria-label"))
+
+        photos: list[str] = []
+        for btn in try_find(card, cls.PHOTO_BTN, all=True):
+            if (m := re.search(r'url\("([^"]+)"', btn.get_attribute("style") or "")):
+                photos.append(m.group(1))
+
+        owner_date = owner_text = ""
+        if (box := try_find(card, cls.OWNER_RESP)):
+            box = box[0]
+            owner_date = first_text(box, "span.DZSIDd")
+            owner_text = first_text(box, "div.wiI7pd")
+
+        return cls(rid, author, rating, date, lang, text, likes,
+                   photos, profile, avatar, owner_date, owner_text, review_date)