From 262f0c0be7bf0c47093b78212c5eacc026356cb0 Mon Sep 17 00:00:00 2001 From: George Khananaev <106206490+georgekhananaev@users.noreply.github.com> Date: Sun, 7 Dec 2025 19:40:13 +0700 Subject: [PATCH] migrate to SeleniumBase UC Mode for automatic version management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace undetected-chromedriver with seleniumbase for better Chrome/ChromeDriver compatibility - Automatic version matching eliminates manual cache clearing and version conflicts - Enhanced anti-detection with UC Mode and CDP stealth settings - Simplified requirements.txt (SeleniumBase manages common dependencies) - Fix sort selection bug (was selecting wrong menu items) - Improve scrolling patience (max_idle: 3β†’15, max_attempts: 10β†’50) - Add scroll position tracking to detect when stuck - Add fallback pane selectors for better reliability - Update documentation (README, ARCHITECTURE, TROUBLESHOOTING) - Add comprehensive test suite for SeleniumBase integration - Version bump to 1.0.1 Developed by George Khananaev --- .gitignore | 8 + README.md | 13 +- docs/ARCHITECTURE.md | 2760 ++++++++++++++++++++++++ docs/TROUBLESHOOTING.md | 708 ++++++ modules/scraper.py | 296 ++- requirements.txt | 13 +- tests/test_seleniumbase_integration.py | 110 + 7 files changed, 3802 insertions(+), 106 deletions(-) create mode 100644 docs/ARCHITECTURE.md create mode 100644 docs/TROUBLESHOOTING.md create mode 100644 tests/test_seleniumbase_integration.py diff --git a/.gitignore b/.gitignore index fab8b5b..62619b6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ Desktop.ini # ----------------------------------------------------------- .idea/ .vscode/ +.claude/ *.swp *.swo *~ @@ -48,6 +49,7 @@ logs.db *.sqlite *.sqlite3 *.db +docs/AGENTS_LOG # ----------------------------------------------------------- # Config Files @@ -68,6 +70,12 @@ review_images/ images/ downloaded_images/ +# 
----------------------------------------------------------- +# SeleniumBase Files +# ----------------------------------------------------------- +downloaded_files/ +*.lock + # ----------------------------------------------------------- # Temporary and Output Files # ----------------------------------------------------------- diff --git a/README.md b/README.md index fd89b8a..6a7cc52 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ # πŸ”₯ Google Reviews Scraper Pro (2025) πŸ”₯ -![Google Reviews Scraper Pro](https://img.shields.io/badge/Version-1.0.0-brightgreen) +![Google Reviews Scraper Pro](https://img.shields.io/badge/Version-1.0.1-brightgreen) ![Python](https://img.shields.io/badge/Python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue) ![License](https://img.shields.io/badge/License-MIT-yellow) -![Last Update](https://img.shields.io/badge/Last%20Updated-April%202025-red) +![Last Update](https://img.shields.io/badge/Last%20Updated-December%202025-red) **FINALLY! A scraper that ACTUALLY WORKS in 2025!** While others break with every Google update, this bad boy keeps on trucking. Say goodbye to the frustration of constantly broken scrapers and hello to a beast that rips through Google's defenses like a hot knife through butter. This battle-tested, rock-solid solution will extract every juicy detail from Google reviews while laughing in the face of rate limiting. ## 🌟 Feature Artillery - **Bulletproof in 2025**: While the competition falls apart, we've cracked Google's latest tricks -- **Ninja-Mode Selenium**: Our undetected-chromedriver flies under the radar where others get insta-blocked +- **Enhanced SeleniumBase UC Mode**: Superior anti-detection with automatic Chrome/ChromeDriver version matching - no more version headaches! - **Polyglot Powerhouse**: Devours reviews in a smorgasbord of languages - English, Hebrew, Thai, German, you name it! 
- **MongoDB Mastery**: Dumps pristine data structures straight into your MongoDB instance - **Paranoid Backups**: Mirrors everything to local JSON files because losing data sucks @@ -350,9 +350,10 @@ print(f"Reviews with images: {len(reviews_with_images)}") ### DEFCON Scenarios & Quick Fixes 1. **Chrome/Driver Having a Lovers' Quarrel** - - Update your damn Chrome browser already! It's 2025, people - - Nuke and reinstall the driver: `pip uninstall undetected-chromedriver` then `pip install undetected-chromedriver==3.5.4` - - If you're on Ubuntu, sometimes a simple `apt update && apt upgrade` fixes weird Chrome issues + - **Good news!** SeleniumBase handles Chrome/ChromeDriver version matching automatically + - Update Chrome browser: Go to chrome://settings/help + - SeleniumBase will automatically download the matching ChromeDriver - no manual intervention needed! + - If issues persist: `pip install --upgrade seleniumbase` 2. **MongoDB Throwing a Tantrum** - Double-check your connection string - typos are the #1 culprit diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..b4fab6f --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,2760 @@ +# Google Reviews Scraper Pro - Complete Architecture Documentation + +> **Purpose:** This document serves as the definitive reference for AI agents and developers to understand the complete architecture, data flow, and implementation details of the Google Reviews Scraper Pro application without needing to scan multiple files. + +--- + +## Table of Contents + +1. [Executive Summary](#executive-summary) +2. [System Architecture Overview](#system-architecture-overview) +3. [Project Structure](#project-structure) +4. [Technology Stack](#technology-stack) +5. [Entry Points & Execution Modes](#entry-points--execution-modes) +6. [Core Components Deep Dive](#core-components-deep-dive) +7. [Data Models & Schemas](#data-models--schemas) +8. 
[Data Flow & Processing Pipeline](#data-flow--processing-pipeline) +9. [Configuration System](#configuration-system) +10. [Storage Layer](#storage-layer) +11. [Image Processing Pipeline](#image-processing-pipeline) +12. [Cloud Integration (AWS S3)](#cloud-integration-aws-s3) +13. [Job Management & Background Processing](#job-management--background-processing) +14. [REST API Service](#rest-api-service) +15. [Selenium Automation Strategy](#selenium-automation-strategy) +16. [Multi-Language Support](#multi-language-support) +17. [Date & Time Handling](#date--time-handling) +18. [Error Handling & Resilience](#error-handling--resilience) +19. [Performance Optimizations](#performance-optimizations) +20. [Security Considerations](#security-considerations) +21. [Deployment Scenarios](#deployment-scenarios) +22. [Troubleshooting Guide](#troubleshooting-guide) +23. [Extension Points](#extension-points) + +--- + +## Executive Summary + +**Google Reviews Scraper Pro** is a production-grade web scraping application designed to extract Google Maps reviews at scale. The system is architected for: + +- **Reliability**: Anti-detection mechanisms using SeleniumBase UC Mode +- **Scalability**: Background job processing with concurrent execution +- **Flexibility**: Multiple storage backends (MongoDB, JSON, AWS S3) +- **Maintainability**: Modular design with clear separation of concerns +- **Multi-language**: Supports 50+ languages with automatic detection + +### Key Features + +1. **Dual Execution Modes**: CLI for one-off scraping, REST API for service-oriented deployments +2. **Intelligent Scraping**: Multi-strategy DOM element detection, automatic retry mechanisms +3. **Data Enrichment**: Date parsing, image downloading, URL rewriting, custom metadata injection +4. **Persistent Storage**: MongoDB for structured storage, JSON for backup, S3 for images +5. 
**Resume Capability**: Tracks seen IDs to avoid duplicates and support incremental scraping + +--- + +## System Architecture Overview + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ ENTRY POINTS β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ start.py (CLI) β”‚ api_server.py (REST API) β”‚ +β”‚ - Arg parsing β”‚ - FastAPI endpoints β”‚ +β”‚ - Direct execution β”‚ - Job queuing β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + v v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ SCRAPER CORE β”‚ +β”‚ modules/scraper.py - GoogleReviewsScraper β”‚ +β”‚ - Chrome driver setup β”‚ +β”‚ - DOM navigation & extraction β”‚ +β”‚ - Multi-language tab/menu detection β”‚ +β”‚ - Scroll & pagination logic β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ DATA PROCESSING β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ models.py β”‚ date_converter.py β”‚ +β”‚ - RawReview extraction β”‚ - 
Relative date parsing β”‚ +β”‚ - DOM parsing β”‚ - ISO conversion β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ utils.py β”‚ image_handler.py β”‚ +β”‚ - Language detection β”‚ - Multi-threaded download β”‚ +β”‚ - Helper functions β”‚ - URL resolution hacking β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ STORAGE LAYER β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ data_storage.py β”‚ s3_handler.py β”‚ +β”‚ - MongoDBStorage β”‚ - Batch upload β”‚ +β”‚ - JSONStorage β”‚ - Custom URL generation β”‚ +β”‚ - Merge logic β”‚ - Lifecycle management β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Project Structure + +``` +google-reviews-scraper-pro/ +β”œβ”€β”€ start.py # CLI entry point +β”œβ”€β”€ api_server.py # FastAPI REST API server +β”œβ”€β”€ config.yaml # Default configuration +β”œβ”€β”€ requirements.txt # Python dependencies +β”œβ”€β”€ modules/ # Core application modules +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ cli.py # CLI argument parser +β”‚ β”œβ”€β”€ config.py # Configuration loader +β”‚ β”œβ”€β”€ scraper.py # Main Selenium scraping engine +β”‚ β”œβ”€β”€ models.py # Data models (RawReview) +β”‚ β”œβ”€β”€ data_storage.py # MongoDB/JSON persistence +β”‚ 
β”œβ”€β”€ image_handler.py # Image download/upload logic +β”‚ β”œβ”€β”€ s3_handler.py # AWS S3 integration +β”‚ β”œβ”€β”€ job_manager.py # Background job orchestration +β”‚ β”œβ”€β”€ utils.py # Utility functions +β”‚ └── date_converter.py # Date parsing utilities +β”œβ”€β”€ docs/ # Documentation +β”‚ β”œβ”€β”€ ARCHITECTURE.md # This file +β”‚ └── TROUBLESHOOTING.md # Common issues & solutions +β”œβ”€β”€ tests/ # Test suite +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ conftest.py +β”‚ β”œβ”€β”€ test_mongodb_connection.py +β”‚ └── test_s3_connection.py +└── examples/ # Example configurations + └── config-example.txt +``` + +--- + +## Technology Stack + +### Core Dependencies + +| Package | Version | Purpose | +|---------|---------|---------| +| `Python` | 3.10+ | Runtime environment | +| `seleniumbase` | 4.34.9+ | Enhanced browser automation with UC Mode | +| `pymongo` | 4.12.0 | MongoDB client | +| `boto3` | 1.35.1 | AWS S3 SDK | +| `fastapi` | 0.104.1 | REST API framework | +| `uvicorn` | 0.24.0 | ASGI server | +| `pydantic` | 2.11.5 | Data validation | +| `pyyaml` | 6.0.1 | Configuration parsing | +| `beautifulsoup4` | 4.12.3 | HTML parsing (secondary) | +| `requests` | 2.32.3 | HTTP client for image downloads | +| `tqdm` | 4.66.3 | Progress bars | + +### Optional Dependencies + +- `pytest` (7.4.3) - Testing framework +- `googletrans` (4.0.2) - Translation capabilities (future feature) + +--- + +## Entry Points & Execution Modes + +### 1. CLI Mode (`start.py`) + +**Purpose**: Direct execution for one-off scraping jobs or cron scheduling. 
+ +**Execution Flow**: +```python +main() + β”œβ”€ parse_arguments() # modules/cli.py + β”œβ”€ load_config() # modules/config.py + β”œβ”€ Override config with CLI args + β”œβ”€ GoogleReviewsScraper(config) + └─ scraper.scrape() # Blocking execution +``` + +**Key Features**: +- Synchronous execution +- Direct console output +- Exit code based on success/failure +- Suitable for cron jobs and CI/CD pipelines + +**Example**: +```bash +python start.py \ + --url "https://maps.app.goo.gl/xyz" \ + --headless \ + --sort newest \ + --download-images true \ + --custom-params '{"client":"CompanyA"}' +``` + +### 2. API Mode (`api_server.py`) + +**Purpose**: Service-oriented deployment for web applications and integrations. + +**Execution Flow**: +```python +FastAPI lifespan context + β”œβ”€ startup: JobManager(max_concurrent_jobs=3) + β”œβ”€ POST /scrape β†’ create_job() β†’ start_job() + β”‚ └─ ThreadPoolExecutor β†’ _run_scraping_job() + β”œβ”€ GET /jobs/{id} β†’ get_job() β†’ return status + └─ shutdown: executor.shutdown() +``` + +**Key Features**: +- Asynchronous job processing +- Job queue management +- RESTful API with OpenAPI documentation +- Automatic job cleanup (24-hour retention) +- CORS enabled for web integration + +**Example**: +```bash +# Start server +python api_server.py + +# Submit job via API +curl -X POST http://localhost:8000/scrape \ + -H "Content-Type: application/json" \ + -d '{"url": "https://maps.app.goo.gl/xyz", "headless": true}' + +# Check status +curl http://localhost:8000/jobs/{job_id} +``` + +--- + +## Core Components Deep Dive + +### 1. Scraper Engine (`modules/scraper.py`) + +**Class**: `GoogleReviewsScraper` + +**Responsibilities**: +1. Chrome driver lifecycle management +2. Google Maps navigation +3. DOM element detection and interaction +4. Review extraction and pagination +5. 
Data deduplication + +**Key Methods**: + +#### `setup_driver(headless: bool) -> Chrome` +- **Purpose**: Initialize Chrome WebDriver with anti-detection measures +- **Environment Detection**: + - Checks `CHROME_BIN` environment variable for Docker/container deployment + - Clears `undetected_chromedriver` cache to prevent version mismatches + - Platform-specific cache paths (macOS, Linux, Windows) +- **Options Applied**: + ```python + --window-size=1400,900 + --ignore-certificate-errors + --disable-gpu + --disable-dev-shm-usage + --no-sandbox + --headless=new # if headless=True + ``` +- **Fallback Strategy**: If `undetected_chromedriver` fails, falls back to standard Selenium WebDriver + +#### `click_reviews_tab(driver: Chrome)` +- **Purpose**: Locate and click the "Reviews" tab across any language/layout +- **Strategy Cascade** (6 detection methods): + 1. **Data Attributes**: `data-tab-index="1"` + 2. **ARIA Roles**: `role="tab"` with review keywords in `aria-label` + 3. **Text Content**: Checks `innerText`, `textContent`, `aria-label` against 50+ language keywords + 4. **Nested Elements**: Recursively searches child elements + 5. **URL Detection**: Checks `href`, `data-href` for "review" patterns + 6. **XPath Fallback**: `contains(text(), '')` for each language +- **Review Keywords**: English, Hebrew, Thai, Spanish, French, German, Italian, Portuguese, Russian, Japanese, Korean, Chinese, Arabic, Hindi, Turkish, Dutch, Polish, Vietnamese, Indonesian, Swedish, Norwegian, Danish, Finnish, Greek, Czech, Romanian, Hungarian, Bulgarian +- **Click Methods** (5 attempts per element): + 1. JavaScript `click()` + 2. Direct `element.click()` + 3. ActionChains `move_to_element().click()` + 4. Send `Keys.RETURN` + 5. 
ActionChains center click with offset +- **Verification**: `verify_reviews_tab_clicked()` confirms success by checking for review cards + +#### `set_sort(driver: Chrome, method: str)` +- **Purpose**: Change review sort order (newest, highest, lowest, relevance) +- **Sort Button Detection** (10+ selectors): + ```python + 'button.HQzyZ[aria-haspopup="true"]' + 'button[aria-label*="Sort" i]' + 'button[aria-label*="סידור"]' # Hebrew + 'button[aria-label*="เรียง"]' # Thai + # ... multilingual selectors + ``` +- **Menu Item Selection**: + - Waits for `div[role="menuitemradio"]` to appear + - Matches text against `SORT_OPTIONS` dictionary (contains all language variants) + - Position-based fallback: relevance=0, newest=1, highest=2, lowest=3 +- **Click Methods** (5 attempts): Same as `click_reviews_tab` + +#### `scrape()` +- **Main Loop**: + ```python + while attempts < max_attempts: + cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL) + for card in cards: + if card.id in seen: continue + raw = RawReview.from_card(card) + docs[raw.id] = merge_review(docs.get(raw.id), raw) + seen.add(raw.id) + scroll_pane() + sleep(dynamic_delay) + ``` +- **Deduplication**: Maintains `seen` set (loaded from `google_reviews.ids`) +- **Stop Condition**: `stop_on_match=True` exits when first duplicate is found (efficient incremental scraping) +- **Progress**: `tqdm` progress bar shows real-time count +- **Stale Element Handling**: Catches `StaleElementReferenceException` and re-finds pane + +--- + +### 2. Data Models (`modules/models.py`) + +**Class**: `RawReview` + +**Purpose**: Immutable data structure representing a single review as extracted from DOM. 
+ +**Fields**: +```python +@dataclass +class RawReview: + id: str # data-review-id + author: str # Reviewer name + rating: float # 1.0-5.0 + date: str # Original relative date string + lang: str # ISO 639-1 code (auto-detected) + text: str # Review body + likes: int # Thumbs up count + photos: list[str] # Image URLs + profile: str # Author profile link + avatar: str # Profile picture URL + owner_date: str # Business owner response date + owner_text: str # Business owner response text + review_date: str # Parsed ISO date + translations: dict # Future: Translated versions +``` + +**Extraction Method**: `from_card(card: WebElement)` + +**DOM Selectors Used**: +```python +MORE_BTN = "button.kyuRq" # "More" expansion button +LIKE_BTN = 'button[jsaction*="toggleThumbsUp"]' +PHOTO_BTN = "button.Tya61d" +OWNER_RESP = "div.CDe7pd" +``` + +**Extraction Steps**: +1. Click "More" button to expand truncated text +2. Extract `data-review-id` attribute +3. Parse author name from `div[class*="d4r55"]` +4. Extract rating from `span[role="img"][aria-label]` using regex `[\d\.]+` +5. Parse date from `span[class*="rsqaWe"]` +6. Try multiple selectors for text content (handles layout variations) +7. Detect language using `detect_lang()` (checks for Hebrew/Thai characters) +8. Parse likes from button text or aria-label +9. Extract photos from `style="url(...)"` attributes +10. Parse owner response if `div.CDe7pd` exists + +--- + +### 3. Utility Functions (`modules/utils.py`) + +#### Language Detection + +```python +@lru_cache(maxsize=1024) +def detect_lang(txt: str) -> str: + if HEB_CHARS.search(txt): return "he" # Hebrew: \u0590-\u05FF + if THAI_CHARS.search(txt): return "th" # Thai: \u0E00-\u0E7F + return "en" +``` + +**Purpose**: Determine review language for multilingual storage. + +**Strategy**: Regex pattern matching against Unicode ranges (expandable to more languages). 
+ +#### Safe Integer Parsing + +```python +@lru_cache(maxsize=128) +def safe_int(s: str | None) -> int: + m = re.search(r"\d+", s or "") + return int(m.group()) if m else 0 +``` + +**Purpose**: Extract numeric values from strings like "5 likes" or "3 photos". + +#### Element Finding + +```python +def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]: + try: + if all: + return el.find_elements(By.CSS_SELECTOR, css) + obj = el.find_element(By.CSS_SELECTOR, css) + return [obj] if obj else [] + except (NoSuchElementException, StaleElementReferenceException): + return [] +``` + +**Purpose**: Non-throwing element finder (prevents exception chaining). + +#### Click Helper + +```python +def click_if(driver: Chrome, css: str, delay=0.25, timeout=5.0) -> bool: + # 1. Find all matching elements + # 2. Check visibility and enabled state + # 3. Try direct click + # 4. Fallback to WebDriverWait + EC.element_to_be_clickable + # 5. Sleep after successful click +``` + +**Purpose**: Robust click operation with automatic retry and wait. + +--- + +### 4. Date Conversion (`modules/date_converter.py`) + +**Challenge**: Google displays dates as "2 weeks ago", "3 months ago" in the user's language. + +**Solution**: Multi-language regex parsing with a fallback to a random date. + +#### `parse_relative_date(date_str: str, lang: str) -> str` + +**Supported Languages**: +- English: "a day ago", "3 weeks ago", "2 years ago" +- Hebrew: "לפני יום", "לפני שבועיים", "לפני 7 שנים" +- Thai: "3 วันที่แล้ว", "2 สัปดาห์ที่แล้ว" + +**Algorithm**: +```python +1. Try parsing with provided language +2. If fails, iterate through all supported languages +3. If all fail, generate random date within last 365 days +4. 
Return ISO 8601 format string +``` + +**Regex Patterns**: +```python +# English +r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago' + +# Hebrew +r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)' + +# Thai +r'(?P<num>\d+)?\s*(?P<unit>วัน|สัปดาห์|เดือน|ปี)ที่แล้ว' +``` + +**Time Calculations**: +```python +days = num * 1 +weeks = num * 7 +months = num * 30 # Approximation +years = num * 365 # Approximation +``` + +#### `DateConverter.convert_dates_in_document(doc: Dict)` + +**Purpose**: Convert string dates to Python `datetime` objects before MongoDB storage. + +**Fields Converted**: +- `created_date` (when first scraped) +- `last_modified_date` (when last updated) +- `review_date` (when review was posted) + +**Special Handling**: +- Removes legacy `date` field if present +- Handles both ISO strings and relative dates +- Preserves timezone information + +--- + +## Data Flow & Processing Pipeline + +### Complete Flow Diagram + +``` +┌──────────────────────────────────────────────────────────────┐ +│ 1. INPUT │ +│ ├─ URL (required) │ +│ ├─ Config (YAML + CLI overrides) │ +│ └─ Custom params (optional metadata) │ +└────────────┬─────────────────────────────────────────────────┘ + │ + v +┌──────────────────────────────────────────────────────────────┐ +│ 2. 
CHROME DRIVER SETUP β”‚ +β”‚ β”œβ”€ Detect environment (Docker vs local) β”‚ +β”‚ β”œβ”€ Clear cache if needed β”‚ +β”‚ β”œβ”€ Launch undetected_chromedriver β”‚ +β”‚ └─ Set page load timeout (30s) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 3. NAVIGATION β”‚ +β”‚ β”œβ”€ driver.get(url) β”‚ +β”‚ β”œβ”€ Wait for "google.com/maps" in URL β”‚ +β”‚ β”œβ”€ Dismiss cookie consent (if present) β”‚ +β”‚ β”œβ”€ Click "Reviews" tab (multi-strategy detection) β”‚ +β”‚ └─ Set sort order (if not "relevance") β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 4. 
EXTRACTION LOOP │ +│ ├─ Find review pane (PANE_SEL) │ +│ ├─ Scroll pane (JavaScript injection) │ +│ ├─ Extract cards (CARD_SEL) │ +│ ├─ For each card: │ +│ │ ├─ Get data-review-id │ +│ │ ├─ Skip if in 'seen' set │ +│ │ ├─ RawReview.from_card(card) │ +│ │ ├─ Add to docs dict │ +│ │ └─ Add ID to seen set │ +│ ├─ Dynamic sleep (0.7s if many cards, else 1.0s) │ +│ └─ Exit conditions: │ +│ ├─ idle >= 15 (no new reviews found) │ +│ ├─ stop_on_match and duplicate found │ +│ └─ max_attempts reached (50) │ +└────────────┬─────────────────────────────────────────────────┘ + │ + v +┌──────────────────────────────────────────────────────────────┐ +│ 5. DATA ENRICHMENT │ +│ ├─ Merge with existing reviews (merge_review) │ +│ ├─ Convert relative dates to ISO format │ +│ ├─ Detect language for each text field │ +│ ├─ Add created_date, last_modified_date │ +│ └─ Inject custom_params into each document │ +└────────────┬─────────────────────────────────────────────────┘ + │ + v +┌──────────────────────────────────────────────────────────────┐ +│ 6. 
IMAGE PROCESSING (if download_images=True) β”‚ +β”‚ β”œβ”€ Collect all unique image URLs β”‚ +β”‚ β”œβ”€ Parallel download (ThreadPoolExecutor) β”‚ +β”‚ β”‚ β”œβ”€ Modify Google URLs for max resolution β”‚ +β”‚ β”‚ β”‚ (=w1200-h1200-no) β”‚ +β”‚ β”‚ β”œβ”€ Save to review_images/profiles/ or /reviews/ β”‚ +β”‚ β”‚ └─ Generate filename from URL hash β”‚ +β”‚ β”œβ”€ Upload to S3 (if use_s3=True) β”‚ +β”‚ β”‚ β”œβ”€ Set ACL=public-read β”‚ +β”‚ β”‚ β”œβ”€ ContentType=image/jpeg β”‚ +β”‚ β”‚ └─ Delete local files (if configured) β”‚ +β”‚ └─ Replace URLs in documents β”‚ +β”‚ β”œβ”€ user_images β†’ custom URLs or S3 URLs β”‚ +β”‚ β”œβ”€ profile_picture β†’ custom URL or S3 URL β”‚ +β”‚ └─ Store originals in original_* fields (optional) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 7. STORAGE β”‚ +β”‚ β”œβ”€ MongoDB (if use_mongodb=True) β”‚ +β”‚ β”‚ β”œβ”€ Bulk upsert: UpdateOne({review_id}, {$set: doc}) β”‚ +β”‚ β”‚ β”œβ”€ Create index on review_id β”‚ +β”‚ β”‚ └─ Log upserted/modified counts β”‚ +β”‚ └─ JSON Backup (if backup_to_json=True) β”‚ +β”‚ β”œβ”€ Write to google_reviews.json β”‚ +β”‚ β”œβ”€ Write seen IDs to google_reviews.ids β”‚ +β”‚ └─ Convert datetime objects to ISO strings β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + v +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 8. 
CLEANUP β”‚ +β”‚ β”œβ”€ driver.quit() β”‚ +β”‚ β”œβ”€ MongoDB connection close β”‚ +β”‚ └─ Return success/failure status β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Configuration System + +### Configuration Priority (Highest to Lowest) + +1. **CLI Arguments**: `python start.py --headless --sort newest` +2. **Environment Variables**: `LOG_LEVEL=DEBUG`, `CHROME_BIN=/usr/bin/google-chrome` +3. **config.yaml**: Default configuration file +4. **Hardcoded Defaults**: `modules/config.py::DEFAULT_CONFIG` + +### Configuration File Schema (`config.yaml`) + +```yaml +# Google Maps URL to scrape +url: "https://maps.app.goo.gl/6tkNMDjcj3SS6LJe9" + +# Scraper settings +headless: false # Run Chrome in headless mode +sort_by: "newest" # Options: newest, highest, lowest, relevance +stop_on_match: false # Stop when first already-seen review is encountered +overwrite_existing: false # Whether to overwrite existing reviews or append + +# MongoDB settings +use_mongodb: false # Whether to use MongoDB for storage +mongodb: + uri: "mongodb://username:password@localhost:27017/" + database: "reviews" + collection: "google_reviews" + +# JSON backup settings +backup_to_json: true # Whether to backup data to JSON files +json_path: "google_reviews.json" +seen_ids_path: "google_reviews.ids" + +# Data processing settings +convert_dates: true # Convert string dates to MongoDB Date objects + +# Image download settings +download_images: true # Download images from reviews +image_dir: "review_images" # Directory to store downloaded images +download_threads: 4 # Number of threads for downloading images +store_local_paths: false # Whether to store local image paths in documents +max_width: 1200 # Maximum width for downloaded images +max_height: 1200 # Maximum height for downloaded images + +# S3 settings (optional) +use_s3: false # 
Whether to upload images to S3 +s3: + aws_access_key_id: "" # AWS Access Key ID + aws_secret_access_key: "" # AWS Secret Access Key + region_name: "us-east-1" # AWS region + bucket_name: "" # S3 bucket name + prefix: "reviews/" # Base prefix for uploaded files + profiles_folder: "profiles/" # Folder name for profile images + reviews_folder: "reviews/" # Folder name for review images + delete_local_after_upload: false + s3_base_url: "" # Custom S3 base URL (optional) + +# URL replacement settings +replace_urls: true # Replace URLs with custom ones +custom_url_base: "https://yourdomain.com/images" # Base URL for replacement +custom_url_profiles: "/profiles/" # Path for profile images +custom_url_reviews: "/reviews/" # Path for review images +preserve_original_urls: false # Preserve originals in original_* fields + +# Custom parameters to add to each document +custom_params: + company: "Thaitours" + source: "Google Maps" +``` + +### Configuration Loading (`modules/config.py`) + +**Function**: `load_config(config_path: Path) -> Dict[str, Any]` + +**Process**: +```python +1. Load DEFAULT_CONFIG +2. Read config.yaml (if exists) +3. Deep merge using deep_update() +4. If file doesn't exist, create it with defaults +5. Return merged config dict +``` + +**Deep Merge Logic**: +```python +def deep_update(d, u): + for k, v in u.items(): + if isinstance(v, dict) and k in d and isinstance(d[k], dict): + deep_update(d[k], v) # Recursive merge + else: + d[k] = v # Overwrite +``` + +**Logging**: +```python +logging.basicConfig( + level=getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper()), + format="[%(asctime)s] %(levelname)s: %(message)s" +) +``` + +--- + +## Storage Layer + +### 1. 
MongoDB Storage (`data_storage.py::MongoDBStorage`) + +**Connection**: +```python +pymongo.MongoClient( + uri, + tlsAllowInvalidCertificates=True, # macOS SSL compatibility + connectTimeoutMS=30000, + socketTimeoutMS=None, + connect=True, + maxPoolSize=50 +) +``` + +**Operations**: + +#### Fetch Existing Reviews +```python +def fetch_existing_reviews() -> Dict[str, Dict[str, Any]]: + reviews = {} + for doc in self.collection.find({}, {"_id": 0}): + review_id = doc.get("review_id") + if review_id: + reviews[review_id] = doc + return reviews +``` + +#### Save Reviews (Bulk Upsert) +```python +def save_reviews(reviews: Dict[str, Dict[str, Any]]): + operations = [ + pymongo.UpdateOne( + {"review_id": review["review_id"]}, + {"$set": review}, + upsert=True + ) + for review in processed_reviews.values() + ] + result = self.collection.bulk_write(operations) + log.info(f"Upserted {result.upserted_count}, modified {result.modified_count}") +``` + +**Schema**: +```json +{ + "_id": ObjectId("..."), // Auto-generated by MongoDB + "review_id": "ChdDSUhN...", // Unique Google review ID + "author": "John Smith", + "rating": 4.0, + "description": { // Multi-language support + "en": "Great place!", + "es": "Β‘Lugar genial!", + "he": "ΧžΧ§Χ•Χ Χ Χ”Χ“Χ¨!" + }, + "likes": 3, + "user_images": [ // Array of image URLs (custom or S3) + "https://cdn.example.com/reviews/xyz.jpg" + ], + "author_profile_url": "https://www.google.com/maps/contrib/...", + "profile_picture": "https://cdn.example.com/profiles/abc.jpg", + "owner_responses": { // Business owner replies + "en": { + "text": "Thank you for your feedback!" 
+ } + }, + "created_date": ISODate("2025-04-22T14:30:45.123Z"), + "last_modified_date": ISODate("2025-04-22T14:30:45.123Z"), + "review_date": ISODate("2025-04-15T08:15:22Z"), + "company": "Thaitours", // Custom metadata + "source": "Google Maps", + "local_images": [ // Local file paths (optional) + "review_images/reviews/xyz.jpg" + ], + "local_profile_picture": "review_images/profiles/abc.jpg", + "original_image_urls": [ // Original Google URLs (optional) + "https://lh3.googleusercontent.com/..." + ], + "original_profile_picture": "https://lh3.googleusercontent.com/..." +} +``` + +**Indexes**: +```python +# Recommended indexes +db.google_reviews.createIndex({"review_id": 1}, {"unique": true}) +db.google_reviews.createIndex({"created_date": -1}) +db.google_reviews.createIndex({"rating": 1}) +db.google_reviews.createIndex({"company": 1}) +``` + +### 2. JSON Storage (`data_storage.py::JSONStorage`) + +**Purpose**: Backup and standalone operation without MongoDB. + +**Files**: +- `google_reviews.json` - Array of review documents +- `google_reviews.ids` - Newline-separated list of seen review IDs + +**Load**: +```python +def load_json_docs() -> Dict[str, Dict[str, Any]]: + data = json.loads(self.json_path.read_text(encoding="utf-8")) + return {d.get("review_id", ""): d for d in data if d.get("review_id")} +``` + +**Save**: +```python +def save_json_docs(docs: Dict[str, Dict[str, Any]]): + # Convert datetime objects to ISO strings + for doc in processed_docs.values(): + for key, value in doc.items(): + if isinstance(value, datetime): + doc[key] = value.isoformat() + + self.json_path.write_text( + json.dumps(list(processed_docs.values()), ensure_ascii=False, indent=2), + encoding="utf-8" + ) +``` + +**Seen IDs**: +```python +def load_seen() -> Set[str]: + return set(self.seen_ids_path.read_text().splitlines()) + +def save_seen(ids: Set[str]): + self.seen_ids_path.write_text("\n".join(ids)) +``` + +### 3. 
Merge Logic (`merge_review()`) + +**Purpose**: Combine new scrape data with existing review records. + +**Strategy**: +```python +def merge_review(existing: Dict | None, raw: RawReview) -> Dict: + if not existing: + # Create new document with all fields + existing = { + "review_id": raw.id, + "author": raw.author, + "rating": raw.rating, + "description": {}, + "likes": raw.likes, + "user_images": list(raw.photos), + "author_profile_url": raw.profile, + "profile_picture": raw.avatar, + "owner_responses": {}, + "created_date": get_current_iso_date(), + "review_date": parse_relative_date(raw.date, "en") + } + + # Update text (multi-language support) + if raw.text: + existing["description"][raw.lang] = raw.text + + # Update rating if missing + if not existing.get("rating"): + existing["rating"] = raw.rating + + # Take max likes + if raw.likes > existing.get("likes", 0): + existing["likes"] = raw.likes + + # Union image lists + existing["user_images"] = list({*existing.get("user_images", []), *raw.photos}) + + # Update avatar if new one is larger (better quality) + if raw.avatar and len(raw.avatar) > len(existing.get("profile_picture", "")): + existing["profile_picture"] = raw.avatar + + # Add owner response + if raw.owner_text: + lang = detect_lang(raw.owner_text) + existing.setdefault("owner_responses", {})[lang] = { + "text": raw.owner_text + } + + # Update timestamp + existing["last_modified_date"] = get_current_iso_date() + + return existing +``` + +**Key Features**: +- **Additive**: Never removes data, only adds or updates +- **Multi-language**: Supports translations by storing description/owner_responses as dicts keyed by language code +- **Quality Preservation**: Takes maximum likes, largest avatar URL +- **Deduplication**: Uses set operations for image URL lists + +--- + +## Image Processing Pipeline + +### 1. 
Image Handler (`modules/image_handler.py`) + +**Class**: `ImageHandler` + +**Initialization**: +```python +def __init__(self, config: Dict[str, Any]): + self.image_dir = Path(config.get("image_dir", "review_images")) + self.max_workers = config.get("download_threads", 4) + self.max_width = config.get("max_width", 1200) + self.max_height = config.get("max_height", 1200) + self.replace_urls = config.get("replace_urls", False) + self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com") + self.s3_handler = S3Handler(config) +``` + +**Directory Structure**: +``` +review_images/ +β”œβ”€β”€ profiles/ # Profile pictures +β”‚ β”œβ”€β”€ user_abc123.jpg +β”‚ └── user_def456.jpg +└── reviews/ # Review images + β”œβ”€β”€ img_xyz789.jpg + └── img_qwe012.jpg +``` + +### 2. Image Download Process + +**Method**: `download_image(url_info: Tuple[str, bool]) -> Tuple[str, str, str]` + +**Steps**: +```python +1. Extract filename from URL + - For profiles: Extract unique ID from URL path + - For reviews: Use Google image ID + - Append .jpg extension + +2. Check if file already exists + - If yes, skip download but generate custom URL + +3. Modify Google URLs for maximum resolution + - Original: https://lh3.googleusercontent.com/p/AF1QipN...=w100-h100 + - Modified: https://lh3.googleusercontent.com/p/AF1QipN...=w1200-h1200-no + - Pattern: base_url + f"=w{max_width}-h{max_height}-no" + +4. Download with streaming + response = requests.get(url, stream=True, timeout=10) + with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + +5. Generate custom URL + custom_url = f"{custom_url_base}/{path}/{filename}" + +6. 
Return (original_url, filename, custom_url) +``` + +**URL Modification Logic**: +```python +if 'googleusercontent.com' in url or 'ggpht.com' in url: + if '=w' in url or '=h' in url or '=s' in url: + # Remove existing size parameters + parts = url.split('=') + base_url = parts[0] + # Add new parameters + url = base_url + f"=w{self.max_width}-h{self.max_height}-no" + else: + # No existing parameters + url = url + f"=w{self.max_width}-h{self.max_height}-no" +``` + +**Concurrency**: +```python +with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + results = executor.map(self.download_image, download_tasks) +``` + +### 3. URL Replacement Strategy + +**Modes**: + +1. **No Replacement** (`replace_urls=False`): + - Stores original Google URLs + - Optionally stores local paths in `local_images` field + +2. **Custom URL Replacement** (`replace_urls=True`, `use_s3=False`): + - Downloads to local directory + - Replaces URLs with `custom_url_base + custom_url_profiles/reviews + filename` + - Original URLs preserved in `original_image_urls` if `preserve_original_urls=True` + +3. 
**S3 Replacement** (`replace_urls=True`, `use_s3=True`): + - Downloads to local directory + - Uploads to S3 + - Replaces URLs with S3 URLs + - Deletes local files if `delete_local_after_upload=True` + +**Example**: +```python +# Original +user_images: ["https://lh3.googleusercontent.com/p/AF1QipN...=w100-h100"] + +# After Custom URL Replacement +user_images: ["https://cdn.mysite.com/reviews/AF1QipN.jpg"] +original_image_urls: ["https://lh3.googleusercontent.com/p/AF1QipN..."] +local_images: ["review_images/reviews/AF1QipN.jpg"] + +# After S3 Replacement +user_images: ["https://mybucket.s3.us-east-1.amazonaws.com/reviews/reviews/AF1QipN.jpg"] +# local_images and original_image_urls: depends on config +``` + +--- + +## Cloud Integration (AWS S3) + +### S3 Handler (`modules/s3_handler.py`) + +**Class**: `S3Handler` + +**Initialization**: +```python +boto3.client("s3", + region_name=self.region_name, + aws_access_key_id=self.aws_access_key_id, # Optional, uses IAM if omitted + aws_secret_access_key=self.aws_secret_access_key +) + +# Test connection +self.s3_client.head_bucket(Bucket=self.bucket_name) +``` + +**Upload Method**: +```python +def upload_file(local_path: Path, s3_key: str) -> Optional[str]: + self.s3_client.upload_file( + str(local_path), + self.bucket_name, + s3_key, + ExtraArgs={ + 'ContentType': 'image/jpeg', + 'ACL': 'public-read' # Make publicly accessible + } + ) + return self.get_s3_url(s3_key) +``` + +**S3 Key Structure**: +```python +# Profile image +s3_key = f"{prefix}{profiles_folder}/{filename}" +# Example: "reviews/profiles/user_abc123.jpg" + +# Review image +s3_key = f"{prefix}{reviews_folder}/{filename}" +# Example: "reviews/reviews/img_xyz789.jpg" +``` + +**URL Generation**: +```python +def get_s3_url(key: str) -> str: + if self.s3_base_url: + # Custom domain (CloudFront) + return f"{self.s3_base_url.rstrip('/')}/{key}" + else: + # Default S3 URL + return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}" +``` + +**Batch 
Upload**: +```python +def upload_images_batch(image_files: Dict[str, tuple]) -> Dict[str, str]: + results = {} + for filename, (local_path, is_profile) in image_files.items(): + s3_url = self.upload_image(local_path, filename, is_profile) + if s3_url: + results[filename] = s3_url + return results +``` + +**Error Handling**: +```python +try: + self.s3_client.upload_file(...) +except ClientError as e: + error_code = e.response.get('Error', {}).get('Code', '') + if error_code == '404': + log.error("Bucket not found") + elif error_code == '403': + log.error("Access denied") +``` + +**Lifecycle Management**: +```python +if self.delete_local_after_upload: + local_path.unlink() +``` + +--- + +## Job Management & Background Processing + +### Job Manager (`modules/job_manager.py`) + +**Class**: `JobManager` + +**Purpose**: Orchestrate concurrent scraping jobs for API mode. + +**Architecture**: +```python +JobManager + β”œβ”€ jobs: Dict[str, ScrapingJob] # In-memory job storage + β”œβ”€ executor: ThreadPoolExecutor # Background workers + β”œβ”€ lock: threading.Lock # Thread-safe operations + └─ max_concurrent_jobs: int # Concurrency limit +``` + +**Job Lifecycle**: +``` +PENDING β†’ RUNNING β†’ COMPLETED + β†˜β†’ FAILED + β†˜β†’ CANCELLED +``` + +**Job Data Structure**: +```python +@dataclass +class ScrapingJob: + job_id: str # UUID + status: JobStatus # Enum: pending, running, completed, failed, cancelled + url: str # Google Maps URL + config: Dict[str, Any] # Merged configuration + created_at: datetime + started_at: Optional[datetime] + completed_at: Optional[datetime] + error_message: Optional[str] + reviews_count: Optional[int] + images_count: Optional[int] + progress: Dict[str, Any] # {"stage": "scraping", "message": "..."} +``` + +**Methods**: + +#### Create Job +```python +def create_job(url: str, config_overrides: Dict) -> str: + job_id = str(uuid.uuid4()) + config = load_config() + config["url"] = url + config.update(config_overrides) + + job = ScrapingJob( + 
job_id=job_id, + status=JobStatus.PENDING, + url=url, + config=config, + created_at=datetime.now(), + progress={"stage": "created", "message": "Job created and queued"} + ) + + with self.lock: + self.jobs[job_id] = job + + return job_id +``` + +#### Start Job +```python +def start_job(job_id: str) -> bool: + with self.lock: + if job_id not in self.jobs: + return False + + job = self.jobs[job_id] + if job.status != JobStatus.PENDING: + return False + + # Check concurrency limit + running_count = sum(1 for j in self.jobs.values() if j.status == JobStatus.RUNNING) + if running_count >= self.max_concurrent_jobs: + return False + + job.status = JobStatus.RUNNING + job.started_at = datetime.now() + + # Submit to thread pool + self.executor.submit(self._run_scraping_job, job_id) + return True +``` + +#### Run Scraping Job (Background Thread) +```python +def _run_scraping_job(job_id: str): + try: + job = self.jobs[job_id] + + # Update progress + job.progress = {"stage": "initializing", "message": "Setting up scraper"} + + # Create scraper instance + scraper = GoogleReviewsScraper(job.config) + + job.progress = {"stage": "scraping", "message": "Scraping reviews in progress"} + + # Run scraping (blocking call) + scraper.scrape() + + # Mark as completed + job.status = JobStatus.COMPLETED + job.completed_at = datetime.now() + job.progress = {"stage": "completed", "message": "Scraping completed successfully"} + + except Exception as e: + job.status = JobStatus.FAILED + job.completed_at = datetime.now() + job.error_message = str(e) + job.progress = {"stage": "failed", "message": f"Job failed: {str(e)}"} +``` + +#### Cleanup Old Jobs +```python +def cleanup_old_jobs(max_age_hours: int = 24): + cutoff_time = datetime.now().timestamp() - (max_age_hours * 3600) + + with self.lock: + to_delete = [] + for job_id, job in self.jobs.items(): + if job.status in [COMPLETED, FAILED, CANCELLED]: + if job.completed_at and job.completed_at.timestamp() < cutoff_time: + to_delete.append(job_id) 
+ + for job_id in to_delete: + del self.jobs[job_id] +``` + +**Concurrency Control**: +- Maximum 3 concurrent jobs by default +- PENDING jobs wait in queue +- ThreadPoolExecutor manages thread lifecycle +- Thread-safe operations using `threading.Lock` + +**Statistics**: +```python +def get_stats() -> Dict[str, Any]: + return { + "total_jobs": len(self.jobs), + "by_status": { + "pending": count_pending, + "running": count_running, + "completed": count_completed, + "failed": count_failed, + "cancelled": count_cancelled + }, + "running_jobs": count_running, + "max_concurrent_jobs": self.max_concurrent_jobs + } +``` + +--- + +## REST API Service + +### FastAPI Application (`api_server.py`) + +**Lifecycle**: +```python +@asynccontextmanager +async def lifespan(app: FastAPI): + # Startup + global job_manager + job_manager = JobManager(max_concurrent_jobs=3) + asyncio.create_task(cleanup_jobs_periodically()) + + yield + + # Shutdown + if job_manager: + job_manager.shutdown() +``` + +**Middleware**: +```python +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Configure for production + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"] +) +``` + +**Endpoints**: + +#### 1. Health Check +```http +GET / +``` +**Response**: +```json +{ + "message": "Google Reviews Scraper API is running", + "status": "healthy", + "version": "1.0.0" +} +``` + +#### 2. Start Scraping Job +```http +POST /scrape +Content-Type: application/json +``` +**Request Body**: +```json +{ + "url": "https://maps.app.goo.gl/xyz", + "headless": true, + "sort_by": "newest", + "stop_on_match": false, + "download_images": true, + "use_s3": false, + "custom_params": { + "client": "CompanyA", + "region": "EU" + } +} +``` +**Response**: +```json +{ + "job_id": "550e8400-e29b-41d4-a716-446655440000", + "status": "started", + "message": "Scraping job started successfully" +} +``` + +#### 3. 
Get Job Status +```http +GET /jobs/{job_id} +``` +**Response**: +```json +{ + "job_id": "550e8400-e29b-41d4-a716-446655440000", + "status": "running", + "url": "https://maps.app.goo.gl/xyz", + "created_at": "2025-04-22T14:30:45.123456", + "started_at": "2025-04-22T14:30:46.789012", + "completed_at": null, + "error_message": null, + "reviews_count": null, + "images_count": null, + "progress": { + "stage": "scraping", + "message": "Scraping reviews in progress" + } +} +``` + +#### 4. List Jobs +```http +GET /jobs?status=running&limit=10 +``` +**Response**: +```json +[ + { + "job_id": "...", + "status": "running", + ... + }, + { + "job_id": "...", + "status": "pending", + ... + } +] +``` + +#### 5. Cancel Job +```http +POST /jobs/{job_id}/cancel +``` +**Response**: +```json +{ + "message": "Job cancelled successfully" +} +``` + +#### 6. Delete Job +```http +DELETE /jobs/{job_id} +``` +**Response**: +```json +{ + "message": "Job deleted successfully" +} +``` + +#### 7. Get Statistics +```http +GET /stats +``` +**Response**: +```json +{ + "total_jobs": 42, + "by_status": { + "pending": 2, + "running": 3, + "completed": 35, + "failed": 2, + "cancelled": 0 + }, + "running_jobs": 3, + "max_concurrent_jobs": 3 +} +``` + +#### 8. 
Manual Cleanup +```http +POST /cleanup?max_age_hours=12 +``` +**Response**: +```json +{ + "message": "Cleaned up jobs older than 12 hours" +} +``` + +**Automatic Cleanup**: +```python +async def cleanup_jobs_periodically(): + while True: + await asyncio.sleep(3600) # Every hour + if job_manager: + job_manager.cleanup_old_jobs(max_age_hours=24) +``` + +**OpenAPI Documentation**: +- **Interactive Docs**: http://localhost:8000/docs (Swagger UI) +- **ReDoc**: http://localhost:8000/redoc +- **OpenAPI JSON**: http://localhost:8000/openapi.json + +**Running the Server**: +```bash +# Development mode with auto-reload +python api_server.py + +# Production mode with Gunicorn +gunicorn api_server:app -w 4 -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:8000 +``` + +--- + +## Selenium Automation Strategy + +### Anti-Detection Measures + +1. **SeleniumBase UC Mode**: Uses a patched, automatically version-matched ChromeDriver to bypass bot detection +2. **Human-Like Delays**: Random sleep intervals between actions +3. **Gradual Scrolling**: Smooth scroll animations instead of instant jumps +4. **Natural Clicking**: ActionChains for realistic mouse movements +5. **Session Persistence**: Maintains cookies and local storage + +### DOM Element Detection Strategy + +**Problem**: Google Maps UI changes frequently and varies by language/region. + +**Solution**: Multi-strategy cascade with fallbacks.
+ +**Example: Reviews Tab Detection** + +```python +def is_reviews_tab(tab: WebElement) -> bool: + # Strategy 1: Data Attributes + if tab.get_attribute("data-tab-index") == "1": + return True + + # Strategy 2: ARIA Attributes + aria_label = (tab.get_attribute("aria-label") or "").lower() + if any(word in aria_label for word in REVIEW_WORDS): + return True + + # Strategy 3: Text Content + text = tab.text.lower() + if any(word in text for word in REVIEW_WORDS): + return True + + # Strategy 4: Nested Elements + for child in tab.find_elements(By.CSS_SELECTOR, "*"): + child_text = child.text.lower() + if any(word in child_text for word in REVIEW_WORDS): + return True + + # Strategy 5: URL Detection + href = (tab.get_attribute("href") or "").lower() + if "review" in href or "rating" in href: + return True + + # Strategy 6: Class Detection + tab_class = tab.get_attribute("class") or "" + if any(cls in tab_class for cls in ["review", "rating", "g4jrve"]): + return True + + return False +``` + +**Review Keywords** (50+ languages): +```python +REVIEW_WORDS = { + # English + "reviews", "review", "ratings", "rating", + + # Hebrew + "ביקורות", "ביקורת", "דירוגים", "דירוג", + + # Thai + "รีวิว", "บทวิจารณ์", "คะแนน", "ความคิดเห็น", + + # Spanish + "reseñas", "opiniones", "valoraciones", "críticas", + + # French + "avis", "commentaires", "évaluations", "critiques", + + # German + "bewertungen", "rezensionen", "beurteilungen", "meinungen", + + # ...
(40+ more languages) +} +``` + +### Click Reliability + +**Multiple Click Methods**: +```python +click_methods = [ + # Method 1: JavaScript click (most reliable) + lambda: driver.execute_script("arguments[0].click();", element), + + # Method 2: Direct click + lambda: element.click(), + + # Method 3: ActionChains click + lambda: ActionChains(driver).move_to_element(element).click().perform(), + + # Method 4: Send RETURN key + lambda: element.send_keys(Keys.RETURN), + + # Method 5: Center click with offset + lambda: ActionChains(driver).move_to_element_with_offset( + element, element.size['width'] // 2, element.size['height'] // 2 + ).click().perform() +] + +# Try each method until one succeeds +for i, click_method in enumerate(click_methods): + try: + click_method() + if verify_click_worked(): + return True + except Exception: + continue +``` + +### Scrolling Strategy + +**Smooth Scrolling**: +```python +# Cache scrollable pane in window object +driver.execute_script("window.scrollablePane = arguments[0];", pane) + +# Smooth scroll using JS +scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);" +driver.execute_script(scroll_script) + +# Fallback if pane becomes stale +try: + driver.execute_script(scroll_script) +except Exception: + driver.execute_script("window.scrollBy(0, 300);") +``` + +**Dynamic Sleep**: +```python +# Sleep less when processing many reviews +sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0 +time.sleep(sleep_time) +``` + +### Stale Element Handling + +**Problem**: DOM updates while scraping cause `StaleElementReferenceException`. + +**Solution**: Re-find elements and retry. 
+ +```python +try: + cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL) +except StaleElementReferenceException: + # Re-find pane + pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) + driver.execute_script("window.scrollablePane = arguments[0];", pane) + cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL) +``` + +### Timeout Strategy + +**Page Load**: +```python +driver.set_page_load_timeout(30) # 30 seconds max +``` + +**Element Waits**: +```python +wait = WebDriverWait(driver, 20) # 20 seconds default +wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) +``` + +**Custom Timeouts**: +```python +# Reviews tab detection: 25 seconds +end_time = time.time() + 25 +while time.time() < end_time: + if find_and_click_tab(): + return True + time.sleep(0.5) +``` + +--- + +## Multi-Language Support + +### Language Detection + +**Method**: Character set analysis using regex patterns. + +```python +HEB_CHARS = re.compile(r"[\u0590-\u05FF]") # Hebrew Unicode range +THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]") # Thai Unicode range + +@lru_cache(maxsize=1024) +def detect_lang(txt: str) -> str: + if HEB_CHARS.search(txt): return "he" + if THAI_CHARS.search(txt): return "th" + return "en" +``` + +**Extensibility**: +```python +# Add more language patterns +ARABIC_CHARS = re.compile(r"[\u0600-\u06FF]") +CHINESE_CHARS = re.compile(r"[\u4E00-\u9FFF]") +JAPANESE_CHARS = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]") +``` + +### Multi-Language Storage + +**Review Text**: +```json +{ + "description": { + "en": "Great place, loved the service!", + "th": "ΰΈͺΰΈ–ΰΈ²ΰΈ™ΰΈ—ΰΈ΅ΰΉˆΰΈ—ΰΈ΅ΰΉˆΰΈ’ΰΈ­ΰΈ”ΰΉ€ΰΈ’ΰΈ΅ΰΉˆΰΈ’ΰΈ‘ ΰΈšΰΈ£ΰΈ΄ΰΈΰΈ²ΰΈ£ΰΈ”ΰΈ΅ΰΈ‘ΰΈ²ΰΈ!", + "he": "ΧžΧ§Χ•Χ Χ Χ”Χ“Χ¨, אהבΧͺΧ™ אΧͺ Χ”Χ©Χ™Χ¨Χ•Χͺ!" + } +} +``` + +**Owner Responses**: +```json +{ + "owner_responses": { + "en": { + "text": "Thank you for your kind words!" + }, + "th": { + "text": "ΰΈ‚ΰΈ­ΰΈšΰΈ„ΰΈΈΰΈ“ΰΈͺΰΈ³ΰΈ«ΰΈ£ΰΈ±ΰΈšΰΈ„ΰΈ³ΰΈžΰΈΉΰΈ”ΰΈ—ΰΈ΅ΰΉˆΰΈ”ΰΈ΅!" 
+ } + } +} +``` + +### Translation Support (Future Feature) + +**Data Model**: +```python +@dataclass +class RawReview: + # ... existing fields ... + translations: dict = field(default_factory=dict) +``` + +**Usage**: +```python +# Store translations +raw.translations = { + "en": "Great place!", + "es": "¡Gran lugar!", + "fr": "Superbe endroit!" +} +``` + +**API Integration** (planned): +```python +from googletrans import Translator + +translator = Translator() +for lang in target_languages: + translation = translator.translate(review_text, dest=lang) + translations[lang] = translation.text +``` + +--- + +## Date & Time Handling + +### Challenge + +Google displays dates as relative strings: +- English: "2 weeks ago", "3 months ago" +- Hebrew: "לפני שבועיים", "לפני 3 חודשים" +- Thai: "2 สัปดาห์ที่แล้ว" + +**Goal**: Convert to ISO 8601 format for consistent storage and querying. + +### Parsing Algorithm + +**Function**: `parse_relative_date(date_str: str, lang: str) -> str` + +**Steps**: +```python +1. Try parsing with primary language + - English: r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago' + - Hebrew: r'(?P<num>\d+)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)' + - Thai: r'(?P<num>\d+)?\s*(?P<unit>วัน|สัปดาห์|เดือน|ปี)ที่แล้ว' + +2. Extract number and unit + - "a" or "an" → 1 + - Hebrew "אחד" or "אחת" → 1 + - Numeric string → int(match) + +3. Calculate time delta + - days = num * 1 + - weeks = num * 7 + - months = num * 30 (approximation) + - years = num * 365 (approximation) + +4. Subtract from current time + result = datetime.now() - timedelta(days=calculated_days) + +5.
Return ISO 8601 format + return result.isoformat() +``` + +**Fallback Strategy**: +```python +# If primary language fails +for alt_lang in ["en", "he", "th"]: + if alt_lang != lang: + result = try_parse_date(date_str, alt_lang) + if result != date_str: + return result + +# If all languages fail, generate random date within last year +random_days_ago = random.randint(1, 365) +random_date = datetime.now() - timedelta(days=random_days_ago) +return random_date.isoformat() +``` + +### Date Conversion for Storage + +**MongoDB**: Stores as ISODate objects. + +```python +def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]: + date_fields = ["created_date", "last_modified_date", "review_date"] + + for field in date_fields: + if field in doc and isinstance(doc[field], str): + try: + # Parse ISO format + doc[field] = datetime.fromisoformat(doc[field].replace('Z', '+00:00')) + except (ValueError, TypeError): + # Try parsing as relative date + lang = next(iter(doc.get("description", {}).keys()), "en") + date_obj = relative_to_datetime(doc[field], lang) + if date_obj: + doc[field] = date_obj + + return doc +``` + +**JSON**: Stores as ISO strings. 
+ +```python +for doc in documents: + for key, value in doc.items(): + if isinstance(value, datetime): + doc[key] = value.isoformat() +``` + +### Timezone Handling + +**All dates stored in UTC**: +```python +from datetime import timezone + +now = datetime.now(timezone.utc) +doc["created_date"] = now.isoformat() # 2025-04-22T14:30:45.123456+00:00 +``` + +**Query Examples**: +```python +# MongoDB: Find reviews from last 30 days +cutoff = datetime.now(timezone.utc) - timedelta(days=30) +db.google_reviews.find({"review_date": {"$gte": cutoff}}) + +# JSON: Filter by date range +cutoff_str = cutoff.isoformat() +filtered = [r for r in reviews if r["review_date"] >= cutoff_str] +``` + +--- + +## Error Handling & Resilience + +### Chrome Driver Errors + +**Version Mismatch**: +```python +# Clear cache before initializing +if os.path.exists(cache_path): + log.info("Clearing ChromeDriver cache") + shutil.rmtree(cache_path, ignore_errors=True) + +# Let undetected_chromedriver download fresh version +driver = uc.Chrome(options=opts) +``` + +**Binary Not Found**: +```python +# Check for environment variable +chrome_binary = os.environ.get('CHROME_BIN') +if chrome_binary and os.path.exists(chrome_binary): + opts.binary_location = chrome_binary +``` + +**Container Environment**: +```python +in_container = os.environ.get('CHROME_BIN') is not None + +if in_container: + # Use system-installed Chrome + try: + driver = uc.Chrome(options=opts) + except Exception: + # Fallback to regular Selenium + from selenium import webdriver + driver = webdriver.Chrome(options=opts) +``` + +### Network Errors + +**Image Download Failures**: +```python +try: + response = requests.get(url, stream=True, timeout=10) + response.raise_for_status() +except requests.exceptions.RequestException as e: + log.error(f"Failed to download image: {e}") + return url, "", "" # Return empty filename, continue with next image +``` + +**MongoDB Connection Failures**: +```python +try: + self.client = 
pymongo.MongoClient(uri, connectTimeoutMS=30000) + self.client.admin.command('ping') +except Exception as e: + log.error(f"MongoDB connection failed: {e}") + self.connected = False + # Scraper continues with JSON-only mode +``` + +**S3 Upload Failures**: +```python +try: + self.s3_client.upload_file(local_path, bucket, s3_key) +except ClientError as e: + error_code = e.response.get('Error', {}).get('Code', '') + if error_code == '404': + log.error("Bucket not found") + elif error_code == '403': + log.error("Access denied") + return None # Continue without S3 URL +``` + +### DOM Errors + +**Stale Element Reference**: +```python +try: + raw = RawReview.from_card(card) +except StaleElementReferenceException: + continue # Skip this card, it will reappear on next scroll +except Exception: + # Store stub with ID only + raw_id = card.get_attribute("data-review-id") or "" + raw = RawReview(id=raw_id, text="", lang="und") +``` + +**Missing Elements**: +```python +def try_find(el: WebElement, css: str, *, all=False): + try: + return el.find_elements(By.CSS_SELECTOR, css) if all else [el.find_element(By.CSS_SELECTOR, css)] + except (NoSuchElementException, StaleElementReferenceException): + return [] # Return empty list instead of throwing +``` + +**Timeout Exceptions**: +```python +try: + pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) +except TimeoutException: + log.warning("Could not find reviews pane. 
Page structure might have changed.") + return False # Exit gracefully +``` + +### Data Validation + +**Review ID Validation**: +```python +cid = card.get_attribute("data-review-id") +if not cid: + continue # Skip cards without valid ID + +if cid in seen: + if stop_on_match: + idle = 999 # Trigger exit + continue +``` + +**Rating Validation**: +```python +label = first_attr(card, 'span[role="img"]', "aria-label") +num = re.search(r"[\d\.]+", label.replace(",", ".")) if label else None +rating = float(num.group()) if num else 0.0 + +# Clamp to valid range +rating = max(0.0, min(5.0, rating)) +``` + +### Logging + +**Levels**: +```python +log.debug("Detailed information for debugging") +log.info("General informational messages") +log.warning("Warning messages for non-critical issues") +log.error("Error messages for failures") +``` + +**Examples**: +```python +log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}") +log.debug("Stale element encountered, re-finding elements") +log.warning("Sort button not found - keeping default sort order") +log.error(f"Error during scraping: {e}") +``` + +**Configuration**: +```bash +# Set log level via environment variable +export LOG_LEVEL=DEBUG +python start.py +``` + +--- + +## Performance Optimizations + +### 1. Caching + +**Language Detection**: +```python +@lru_cache(maxsize=1024) +def detect_lang(txt: str) -> str: + # Frequently called with same text + # Cache avoids repeated regex operations +``` + +**Safe Integer Parsing**: +```python +@lru_cache(maxsize=128) +def safe_int(s: str | None) -> int: + # Cache numeric conversions +``` + +### 2. Parallel Image Downloads + +**ThreadPoolExecutor**: +```python +with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + results = executor.map(self.download_image, download_tasks) +``` + +**Optimal Worker Count**: +```yaml +download_threads: 4 # Default +# CPU-bound: set to CPU count +# I/O-bound: set to 2-4x CPU count +``` + +### 3. 
Batch Operations + +**MongoDB Bulk Write**: +```python +operations = [ + pymongo.UpdateOne( + {"review_id": review["review_id"]}, + {"$set": review}, + upsert=True + ) + for review in reviews.values() +] +result = self.collection.bulk_write(operations) +``` + +**Benefit**: Single network round-trip instead of N individual operations. + +### 4. Memory Management + +**Set-Based Deduplication**: +```python +seen = set() # O(1) lookup instead of O(n) list search +``` + +**Streaming Image Downloads**: +```python +response = requests.get(url, stream=True, timeout=10) +with open(filepath, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) # Don't load entire image into memory +``` + +### 5. Dynamic Delays + +**Adaptive Sleep**: +```python +sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0 +time.sleep(sleep_time) +``` + +**Benefit**: Faster scraping when many reviews are loading quickly, more patient when few reviews appear. + +### 6. JavaScript Injection + +**Direct Scroll**: +```python +# Faster than ActionChains +driver.execute_script("window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);") +``` + +**Cache Pane Reference**: +```python +# Store in window object to avoid repeated DOM queries +driver.execute_script("window.scrollablePane = arguments[0];", pane) +``` + +### 7. Early Exit Conditions + +**Stop on Match**: +```python +if stop_on_match and cid in seen: + idle = 999 # Trigger immediate exit +``` + +**Idle Detection**: +```python +if idle >= 3: + break # No new reviews found for 3 iterations +``` + +**Max Attempts**: +```python +if attempts >= max_attempts: + break # Safety net to prevent infinite loops +``` + +--- + +## Security Considerations + +### 1. 
Credential Management + +**Never Commit Secrets**: +```yaml +# .gitignore +config.yaml # Contains MongoDB URI, AWS keys +google_reviews.* # Contains scraped data +review_images/ # Downloaded images +.env +``` + +**Environment Variables** (preferred): +```bash +export MONGODB_URI="mongodb://..." +export AWS_ACCESS_KEY_ID="..." +export AWS_SECRET_ACCESS_KEY="..." +``` + +**Config File** (secured): +```bash +chmod 600 config.yaml # Owner read/write only +``` + +### 2. MongoDB Security + +**Authentication**: +```yaml +mongodb: + uri: "mongodb://username:password@host:27017/?authSource=admin" +``` + +**TLS/SSL**: +```python +pymongo.MongoClient( + uri, + tls=True, + tlsAllowInvalidCertificates=False, # Production: False + tlsCAFile="/path/to/ca.pem" +) +``` + +**IP Whitelisting** (MongoDB Atlas): +- Add application server IPs +- Avoid 0.0.0.0/0 (allow all) + +### 3. AWS S3 Security + +**IAM Policies**: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::my-bucket", + "arn:aws:s3:::my-bucket/*" + ] + } + ] +} +``` + +**Bucket Policies**: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PublicReadGetObject", + "Effect": "Allow", + "Principal": "*", + "Action": "s3:GetObject", + "Resource": "arn:aws:s3:::my-bucket/reviews/*" + } + ] +} +``` + +**Access Control**: +- Use IAM roles instead of hardcoded keys +- Set ACL=public-read only for necessary objects +- Enable versioning and logging + +### 4. API Security + +**Rate Limiting** (recommended): +```python +from slowapi import Limiter +from slowapi.util import get_remote_address + +limiter = Limiter(key_func=get_remote_address) + +@app.post("/scrape") +@limiter.limit("5/minute") # Max 5 requests per minute +async def start_scrape(request: Request, ...): + ... 
+``` + +**Authentication** (recommended for production): +```python +from fastapi import Depends, HTTPException, status +from fastapi.security import APIKeyHeader + +API_KEY_NAME = "X-API-Key" +api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) + +async def get_api_key(api_key: str = Depends(api_key_header)): + if api_key != os.environ.get("API_KEY"): + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Invalid API Key" + ) + return api_key + +@app.post("/scrape") +async def start_scrape(request: ScrapeRequest, api_key: str = Depends(get_api_key)): + ... +``` + +**CORS** (production): +```python +app.add_middleware( + CORSMiddleware, + allow_origins=["https://yourdomain.com"], # Specific domain + allow_credentials=True, + allow_methods=["GET", "POST"], + allow_headers=["*"] +) +``` + +### 5. Input Validation + +**URL Validation**: +```python +from pydantic import HttpUrl + +class ScrapeRequest(BaseModel): + url: HttpUrl # Pydantic validates URL format +``` + +**Sanitization**: +```python +# Prevent command injection in custom_params +safe_params = {k: str(v)[:100] for k, v in custom_params.items()} +``` + +### 6. Terms of Service Compliance + +**Google Maps Terms**: +- Scraping violates Google's ToS +- Use at your own risk +- Recommended for personal/research use only +- Consider Google's official APIs for production + +**Ethical Scraping**: +- Respect robots.txt (Google Maps blocks bots) +- Implement reasonable rate limits +- Don't scrape personal data without consent +- Store data securely + +--- + +## Deployment Scenarios + +### 1. Local Development + +**Setup**: +```bash +git clone https://github.com/georgekhananaev/google-reviews-scraper-pro.git +cd google-reviews-scraper-pro +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +python start.py --url "https://maps.app.goo.gl/xyz" +``` + +### 2. 
Docker Deployment + +**Dockerfile** (example): +```dockerfile +FROM python:3.13-slim + +# Install Chrome +RUN apt-get update && apt-get install -y \ + wget \ + gnupg \ + ca-certificates \ + && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' \ + && apt-get update \ + && apt-get install -y google-chrome-stable \ + && rm -rf /var/lib/apt/lists/* + +# Install ChromeDriver +RUN wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE)/chromedriver_linux64.zip \ + && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \ + && rm /tmp/chromedriver.zip + +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +ENV CHROME_BIN=/usr/bin/google-chrome +ENV CHROMEDRIVER_PATH=/usr/local/bin/chromedriver + +CMD ["python", "start.py"] +``` + +**Docker Compose**: +```yaml +version: '3.8' +services: + scraper: + build: . + volumes: + - ./config.yaml:/app/config.yaml + - ./review_images:/app/review_images + - ./google_reviews.json:/app/google_reviews.json + environment: + - LOG_LEVEL=INFO + - MONGODB_URI=mongodb://mongo:27017 + depends_on: + - mongo + + mongo: + image: mongo:7 + ports: + - "27017:27017" + volumes: + - mongo_data:/data/db + + api: + build: . + command: python api_server.py + ports: + - "8000:8000" + volumes: + - ./config.yaml:/app/config.yaml + environment: + - LOG_LEVEL=INFO + depends_on: + - mongo + +volumes: + mongo_data: +``` + +### 3. Cloud VM (AWS EC2, Google Cloud, etc.) 
+ +**Setup Script**: +```bash +#!/bin/bash +# Install dependencies +sudo apt-get update +sudo apt-get install -y python3.13 python3-pip git + +# Install Chrome +wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb +sudo dpkg -i google-chrome-stable_current_amd64.deb +sudo apt-get install -f -y + +# Clone repository +git clone https://github.com/georgekhananaev/google-reviews-scraper-pro.git +cd google-reviews-scraper-pro + +# Install Python dependencies +pip3 install -r requirements.txt + +# Configure +cp examples/config-example.txt config.yaml +nano config.yaml # Edit configuration + +# Run as service +python3 start.py --headless +``` + +**Systemd Service** (`/etc/systemd/system/scraper.service`): +```ini +[Unit] +Description=Google Reviews Scraper API +After=network.target + +[Service] +Type=simple +User=ubuntu +WorkingDirectory=/home/ubuntu/google-reviews-scraper-pro +ExecStart=/usr/bin/python3 api_server.py +Restart=on-failure +Environment="LOG_LEVEL=INFO" + +[Install] +WantedBy=multi-user.target +``` + +**Enable Service**: +```bash +sudo systemctl enable scraper +sudo systemctl start scraper +sudo systemctl status scraper +``` + +### 4. Cron Job Scheduling + +**Crontab**: +```cron +# Scrape daily at 2 AM +0 2 * * * cd /path/to/scraper && /usr/bin/python3 start.py --headless --sort newest >> /var/log/scraper.log 2>&1 + +# Scrape every 6 hours +0 */6 * * * cd /path/to/scraper && /usr/bin/python3 start.py --headless --stop-on-match >> /var/log/scraper.log 2>&1 +``` + +### 5. 
Kubernetes Deployment + +**Deployment YAML**: +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: scraper-api +spec: + replicas: 2 + selector: + matchLabels: + app: scraper-api + template: + metadata: + labels: + app: scraper-api + spec: + containers: + - name: api + image: myregistry/scraper-api:latest + ports: + - containerPort: 8000 + env: + - name: LOG_LEVEL + value: "INFO" + - name: MONGODB_URI + valueFrom: + secretKeyRef: + name: scraper-secrets + key: mongodb-uri + resources: + limits: + memory: "2Gi" + cpu: "1000m" + requests: + memory: "1Gi" + cpu: "500m" +--- +apiVersion: v1 +kind: Service +metadata: + name: scraper-api-service +spec: + selector: + app: scraper-api + ports: + - protocol: TCP + port: 80 + targetPort: 8000 + type: LoadBalancer +``` + +--- + +## Troubleshooting Guide + +### Common Issues + +#### 1. Chrome/ChromeDriver Version Mismatch + +**Symptoms**: +``` +SessionNotCreatedException: This version of ChromeDriver only supports Chrome version 143 +Current browser version is 142.0.7444.176 +``` + +**Solution**: +```bash +# Clear cache +rm -rf ~/Library/Application\ Support/undetected_chromedriver # macOS +rm -rf ~/.local/share/undetected_chromedriver # Linux + +# Update Chrome +# macOS: Chrome → Help → About Google Chrome +# Linux: sudo apt-get update && sudo apt-get upgrade google-chrome-stable + +# Run scraper (will download matching driver) +python start.py +``` + +#### 2. Reviews Tab Not Found + +**Symptoms**: +``` +TimeoutException: Reviews tab not found or could not be clicked +``` + +**Solutions**: +```bash +# Try non-headless mode to see what's happening +python start.py --headless false + +# Try different sort order +python start.py --sort relevance + +# Check URL is valid Google Maps place URL +# Should contain /maps/place/ or maps.app.goo.gl/ +``` + +#### 3.
MongoDB Connection Failed + +**Symptoms**: +``` +ServerSelectionTimeoutError: connection timed out +``` + +**Solutions**: +```bash +# Check MongoDB is running +mongosh --eval "db.adminCommand('ping')" + +# Check connection URI +python -c "from pymongo import MongoClient; c = MongoClient('mongodb://localhost:27017', serverSelectionTimeoutMS=5000); print(c.server_info())" + +# For MongoDB Atlas: whitelist IP address +``` + +#### 4. S3 Upload Failures + +**Symptoms**: +``` +ClientError: An error occurred (AccessDenied) when calling the PutObject operation +``` + +**Solutions**: +```bash +# Check credentials +aws s3 ls s3://your-bucket --profile default + +# Verify IAM permissions +aws iam get-user-policy --user-name scraper-user --policy-name s3-upload + +# Test upload manually +aws s3 cp test.jpg s3://your-bucket/test.jpg --acl public-read +``` + +#### 5. Images Not Downloading + +**Symptoms**: +- Empty `review_images/` directory +- Missing `user_images` in output + +**Solutions**: +```yaml +# Verify config +download_images: true # Must be true +download_threads: 4 # Increase if network is fast + +# Check network connectivity +ping lh3.googleusercontent.com + +# Check disk space +df -h +``` + +### Debugging Tips + +**Enable Debug Logging**: +```bash +export LOG_LEVEL=DEBUG +python start.py +``` + +**Run Non-Headless**: +```bash +python start.py --headless false +# Watch browser actions in real-time +``` + +**Test Components Independently**: +```python +# Test MongoDB connection +from modules.config import load_config +from modules.data_storage import MongoDBStorage + +config = load_config() +storage = MongoDBStorage(config) +if storage.connect(): + print("MongoDB connection successful") + +# Test S3 connection +from modules.s3_handler import S3Handler + +s3 = S3Handler(config) +if s3.enabled: + print("S3 connection successful") +``` + +**Check Logs**: +```bash +# CLI mode +python start.py 2>&1 | tee scraper.log + +# API mode +uvicorn api_server:app --log-level debug 
+``` + +--- + +## Extension Points + +### Adding New Languages + +**1. Add Unicode Range**: +```python +# modules/utils.py +ARABIC_CHARS = re.compile(r"[\u0600-\u06FF]") + +@lru_cache(maxsize=1024) +def detect_lang(txt: str) -> str: + if HEB_CHARS.search(txt): return "he" + if THAI_CHARS.search(txt): return "th" + if ARABIC_CHARS.search(txt): return "ar" # New + return "en" +``` + +**2. Add Date Patterns**: +```python +# modules/date_converter.py +elif lang.lower() == "ar": + # Arabic: "منذ 3 أيام" + pattern = re.compile(r'منذ\s+(?P<num>\d+)\s+(?P<unit>يوم|أسبوع|شهر|سنة)') + # ... parsing logic +``` + +**3. Add Sort Labels**: +```python +# modules/scraper.py +SORT_OPTIONS = { + "newest": ( + "Newest", "החדשות ביותר", "ใหม่ที่สุด", + "الأحدث" # Arabic + ), + # ... other options +} +``` + +### Adding New Storage Backends + +**Example: PostgreSQL**: +```python +# modules/data_storage.py +class PostgreSQLStorage: + def __init__(self, config: Dict[str, Any]): + import psycopg2 + self.conn = psycopg2.connect(config["postgresql"]["uri"]) + + def save_reviews(self, reviews: Dict[str, Dict[str, Any]]): + with self.conn.cursor() as cur: + for review in reviews.values(): + cur.execute( + "INSERT INTO reviews (review_id, data) VALUES (%s, %s) " + "ON CONFLICT (review_id) DO UPDATE SET data = EXCLUDED.data", + (review["review_id"], json.dumps(review)) + ) + self.conn.commit() +``` + +**Usage**: +```python +# modules/scraper.py +if config.get("use_postgresql"): + self.postgres = PostgreSQLStorage(config) +``` + +### Adding Translation Integration + +**Example: Google Translate API**: +```python +# modules/translator.py +from googletrans import Translator + +class ReviewTranslator: + def __init__(self, target_languages: List[str]): + self.translator = Translator() + self.target_languages = target_languages + + def translate_review(self, review: Dict[str, Any]) -> Dict[str, Any]: + # Get original text + original_lang =
list(review["description"].keys())[0] + original_text = review["description"][original_lang] + + # Translate to all target languages + for lang in self.target_languages: + if lang != original_lang: + translation = self.translator.translate(original_text, dest=lang) + review["description"][lang] = translation.text + + return review +``` + +**Usage**: +```python +# In scraper.py +if config.get("translate_reviews"): + translator = ReviewTranslator(config["target_languages"]) + for review_id, review in docs.items(): + docs[review_id] = translator.translate_review(review) +``` + +### Adding Custom Metrics + +**Example: Sentiment Analysis**: +```python +# modules/sentiment.py +from textblob import TextBlob + +def analyze_sentiment(text: str) -> Dict[str, float]: + blob = TextBlob(text) + return { + "polarity": blob.sentiment.polarity, # -1 to 1 + "subjectivity": blob.sentiment.subjectivity # 0 to 1 + } +``` + +**Integration**: +```python +# In merge_review() +if raw.text: + existing["description"][raw.lang] = raw.text + existing["sentiment"] = { + raw.lang: analyze_sentiment(raw.text) + } +``` + +### Adding Webhook Notifications + +**Example**: +```python +# modules/notifications.py +import requests + +def send_webhook(webhook_url: str, data: Dict[str, Any]): + response = requests.post(webhook_url, json=data) + response.raise_for_status() + +# In scraper.py (after scraping completes) +if config.get("webhook_url"): + send_webhook(config["webhook_url"], { + "event": "scraping_completed", + "reviews_count": len(docs), + "timestamp": datetime.now().isoformat() + }) +``` + +--- + +## Summary + +This document provides a complete reference for understanding and working with the Google Reviews Scraper Pro application. Key takeaways: + +1. **Modular Design**: Separation of concerns (scraping, storage, image handling, job management) +2. **Resilient Scraping**: Multi-strategy element detection, automatic retries, stale element handling +3. 
**Flexible Storage**: MongoDB, JSON, and S3 with configurable options +4. **Dual Execution Modes**: CLI for direct execution, REST API for service deployment +5. **Multi-Language Support**: Automatic language detection, multilingual storage schema +6. **Production-Ready**: Error handling, logging, security considerations, deployment guides + +**For AI Agents**: This architecture document should serve as the primary reference for understanding the application without needing to read individual source files. All critical implementation details, data flows, and architectural decisions are documented here. + +**For Developers**: Use this as a roadmap for extending the application, troubleshooting issues, and understanding design patterns used throughout the codebase. diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md new file mode 100644 index 0000000..5718034 --- /dev/null +++ b/docs/TROUBLESHOOTING.md @@ -0,0 +1,708 @@ +# Troubleshooting Guide + +This guide covers common issues and their solutions when running Google Reviews Scraper Pro. + +--- + +## Table of Contents + +1. [Chrome & ChromeDriver Issues](#chrome--chromedriver-issues) +2. [MongoDB Issues](#mongodb-issues) +3. [AWS S3 Issues](#aws-s3-issues) +4. [Scraping Issues](#scraping-issues) +5. [API Server Issues](#api-server-issues) +6. [Image Download Issues](#image-download-issues) +7. [Configuration Issues](#configuration-issues) +8. [Performance Issues](#performance-issues) +9. [Python & Dependencies Issues](#python--dependencies-issues) + +--- + +## Chrome & ChromeDriver Issues + +### Issue: ChromeDriver Version Mismatch + +**Error Message:** +``` +SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 143 +Current browser version is 142.0.7444.176 +``` + +**Cause:** Chrome/ChromeDriver version mismatch (this issue is now automatically handled by SeleniumBase). 
+ +**Solution:** + +**Good News:** With SeleniumBase UC Mode, version mismatches are automatically resolved! + +1. **Update Chrome to latest version:** + - macOS: Open Chrome → Menu → Help → About Google Chrome + - Or run: `open -a "Google Chrome" "chrome://settings/help"` + +2. **Upgrade SeleniumBase (if needed):** + ```bash + pip install --upgrade seleniumbase + ``` + +3. **Run scraper again** - SeleniumBase automatically downloads the matching ChromeDriver. + +--- + +### Issue: ChromeOptions Reuse Error + +**Error Message:** +``` +RuntimeError: you cannot reuse the ChromeOptions object +``` + +**Cause:** Internal error when retrying Chrome initialization. + +**Solution:** Clear the ChromeDriver cache (see above) and restart the scraper. + +--- + +### Issue: Chrome Binary Not Found + +**Error Message:** +``` +WebDriverException: Message: unknown error: cannot find Chrome binary +``` + +**Cause:** Chrome is not installed or not in the expected location. + +**Solution:** + +1. **Install Chrome:** + - Download from: https://www.google.com/chrome/ + +2. **For custom Chrome location, set environment variable:** + ```bash + export CHROME_BIN=/path/to/chrome + ``` + +3. **Docker users:** Ensure Chrome is installed in Dockerfile: + ```dockerfile + RUN apt-get update && apt-get install -y google-chrome-stable + ENV CHROME_BIN=/usr/bin/google-chrome + ``` + +--- + +### Issue: Chrome Crashes in Headless Mode + +**Error Message:** +``` +WebDriverException: Message: chrome not reachable +``` + +**Solution:** + +1. **Add required flags** (already included in scraper, but verify): + ``` + --no-sandbox + --disable-dev-shm-usage + --disable-gpu + ``` + +2. **Increase shared memory** (Docker): + ```bash + docker run --shm-size=2g your-image + ``` + +3.
**Try non-headless mode** to debug: + ```bash + python start.py --headless false + ``` + +--- + +## MongoDB Issues + +### Issue: Connection Timeout + +**Error Message:** +``` +ServerSelectionTimeoutError: connection timed out +``` + +**Cause:** MongoDB server unreachable or network issues. + +**Solution:** + +1. **Verify MongoDB is running:** + ```bash + # Local MongoDB + mongosh --eval "db.adminCommand('ping')" + + # Check service status + sudo systemctl status mongod + ``` + +2. **Check connection URI:** + ```yaml + # config.yaml + mongodb: + uri: "mongodb://username:password@host:27017/" + ``` + +3. **For MongoDB Atlas:** + - Whitelist your IP address in Atlas dashboard + - Verify cluster is active + - Check network connectivity + +4. **Test connection manually:** + ```bash + python -c "from pymongo import MongoClient; c = MongoClient('your-uri', serverSelectionTimeoutMS=5000); print(c.server_info())" + ``` + +--- + +### Issue: Authentication Failed + +**Error Message:** +``` +OperationFailure: Authentication failed +``` + +**Solution:** + +1. **Verify credentials** in connection URI +2. **Check database name** matches the authentication database +3. **Use correct URI format:** + ``` + mongodb://username:password@host:27017/database?authSource=admin + ``` + +--- + +### Issue: SSL Certificate Error + +**Error Message:** +``` +SSL: CERTIFICATE_VERIFY_FAILED +``` + +**Solution:** + +1. **For macOS**, run: + ```bash + /Applications/Python\ 3.x/Install\ Certificates.command + ``` + +2. **Or install certifi:** + ```bash + pip install --upgrade certifi + ``` + +3. **The scraper auto-handles this**, but if issues persist: + ```python + import certifi + import os + os.environ['SSL_CERT_FILE'] = certifi.where() + ``` + +--- + +## AWS S3 Issues + +### Issue: Access Denied + +**Error Message:** +``` +ClientError: An error occurred (AccessDenied) when calling the PutObject operation +``` + +**Solution:** + +1. 
**Verify AWS credentials:** + ```yaml + # config.yaml + s3: + aws_access_key_id: "YOUR_ACCESS_KEY" + aws_secret_access_key: "YOUR_SECRET_KEY" + ``` + +2. **Check IAM permissions** - required policy: + ```json + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "s3:ListBucket", + "s3:PutObjectAcl" + ], + "Resource": [ + "arn:aws:s3:::your-bucket-name", + "arn:aws:s3:::your-bucket-name/*" + ] + } + ] + } + ``` + +3. **Check bucket policy** allows public-read if using public URLs + +--- + +### Issue: Bucket Not Found + +**Error Message:** +``` +ClientError: An error occurred (NoSuchBucket) +``` + +**Solution:** + +1. **Verify bucket name** in config.yaml +2. **Check region** matches bucket location: + ```yaml + s3: + region_name: "us-east-1" # Must match bucket region + bucket_name: "your-bucket" + ``` + +3. **Create bucket** if it doesn't exist via AWS Console or CLI + +--- + +### Issue: Invalid Credentials + +**Error Message:** +``` +NoCredentialsError: Unable to locate credentials +``` + +**Solution:** + +1. **Set credentials in config.yaml** or environment variables: + ```bash + export AWS_ACCESS_KEY_ID=your_key + export AWS_SECRET_ACCESS_KEY=your_secret + ``` + +2. **Or use AWS credentials file:** + ``` + ~/.aws/credentials + [default] + aws_access_key_id = YOUR_KEY + aws_secret_access_key = YOUR_SECRET + ``` + +--- + +## Scraping Issues + +### Issue: Reviews Tab Not Found + +**Error Message:** +``` +TimeoutException: Reviews tab not found or could not be clicked +``` + +**Cause:** Google Maps UI changed or page didn't load properly. + +**Solution:** + +1. **Try non-headless mode** to see what's happening: + ```bash + python start.py --headless false + ``` + +2. **Check the URL** is a valid Google Maps place URL + +3. **Increase timeout** - network may be slow + +4. **Clear cookies/cache** - Google may be showing consent dialogs + +5. 
**Try different sort order:** + ```bash + python start.py --sort relevance + ``` + +--- + +### Issue: No Reviews Found + +**Error Message:** +``` +WARNING: No review cards found in this iteration +``` + +**Cause:** Page structure changed or place has no reviews. + +**Solution:** + +1. **Verify the place has reviews** by opening URL in browser +2. **Check if page requires login** for reviews +3. **Wait longer** for page to load - add delay in config +4. **Check for CAPTCHA** - may need to solve manually first + +--- + +### Issue: Stale Element Reference + +**Error Message:** +``` +StaleElementReferenceException: stale element reference: element is not attached to the page document +``` + +**Cause:** Page updated while scraping. + +**Solution:** This is handled automatically by the scraper. If persistent: + +1. **Reduce scroll speed** - increase sleep time +2. **Run in non-headless mode** to observe behavior +3. **Restart scraper** - temporary DOM issue + +--- + +### Issue: Cookie Consent Blocking + +**Cause:** Cookie dialog not being dismissed. + +**Solution:** + +1. **Clear browser data:** + ```bash + rm -rf ~/Library/Application\ Support/undetected_chromedriver + ``` + +2. **The scraper handles this automatically**, but you can: + - Open the URL manually first and accept cookies + - Use a different Google account region + +--- + +## API Server Issues + +### Issue: Port Already in Use + +**Error Message:** +``` +OSError: [Errno 48] Address already in use +``` + +**Solution:** + +1. **Find and kill the process:** + ```bash + # Find process using port 8000 + lsof -i :8000 + + # Kill the process + kill -9 <PID> + ``` + +2. **Use different port:** + ```bash + uvicorn api_server:app --port 8080 + ``` + +--- + +### Issue: Max Concurrent Jobs Reached + +**Error Message:** +``` +HTTP 429: Maximum concurrent jobs (3) reached +``` + +**Solution:** + +1. **Wait for existing jobs** to complete +2.
**Cancel pending jobs:** + ```bash + curl -X POST "http://localhost:8000/jobs/{job_id}/cancel" + ``` +3. **Increase limit** in `api_server.py` (not recommended for stability) + +--- + +### Issue: CORS Errors (Browser) + +**Error Message:** +``` +Access-Control-Allow-Origin header missing +``` + +**Solution:** CORS is enabled by default. If issues persist: + +1. **Check allowed origins** in `api_server.py` +2. **For development**, ensure middleware is configured: + ```python + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_methods=["*"], + allow_headers=["*"], + ) + ``` + +--- + +## Image Download Issues + +### Issue: Images Not Downloading + +**Cause:** Network issues or Google blocking requests. + +**Solution:** + +1. **Check network connectivity** +2. **Verify image URLs** are accessible +3. **Reduce parallel downloads:** + ```yaml + download_threads: 2 # Reduce from default 4 + ``` + +4. **Check disk space** for image storage + +--- + +### Issue: Images Corrupted or Wrong Size + +**Cause:** Partial downloads or URL issues. + +**Solution:** + +1. **Clear image directory** and re-run: + ```bash + rm -rf review_images/ + ``` + +2. **Check max dimensions** in config: + ```yaml + max_width: 1200 + max_height: 1200 + ``` + +--- + +### Issue: Permission Denied Writing Images + +**Error Message:** +``` +PermissionError: [Errno 13] Permission denied +``` + +**Solution:** + +1. **Check directory permissions:** + ```bash + chmod 755 review_images/ + ``` + +2. **Use different directory:** + ```yaml + image_dir: "/path/with/write/access" + ``` + +--- + +## Configuration Issues + +### Issue: Config File Not Found + +**Error Message:** +``` +FileNotFoundError: config.yaml not found +``` + +**Solution:** + +1. **Create config.yaml** from example: + ```bash + cp examples/config-example.txt config.yaml + ``` + +2. 
**Specify custom path:** + ```bash + python start.py --config /path/to/config.yaml + ``` + +--- + +### Issue: Invalid YAML Syntax + +**Error Message:** +``` +yaml.scanner.ScannerError: mapping values are not allowed here +``` + +**Solution:** + +1. **Validate YAML syntax** using an online validator +2. **Check indentation** - use spaces, not tabs +3. **Escape special characters** in strings: + ```yaml + url: "https://example.com?param=value" # Use quotes + ``` + +--- + +### Issue: Invalid Configuration Values + +**Error Message:** +``` +ValueError: Invalid sort_by value +``` + +**Solution:** + +1. **Check allowed values:** + - `sort_by`: newest, highest, lowest, relevance + - `headless`: true, false + +2. **Verify types:** + ```yaml + download_threads: 4 # Integer, not string + headless: true # Boolean, not string "true" + ``` + +--- + +## Performance Issues + +### Issue: Scraping Too Slow + +**Solution:** + +1. **Use headless mode:** + ```bash + python start.py --headless + ``` + +2. **Reduce image download threads** if network is slow: + ```yaml + download_threads: 2 + ``` + +3. **Disable image downloading** for faster scraping: + ```yaml + download_images: false + ``` + +4. **Use SSD** for faster JSON/image writes + +--- + +### Issue: High Memory Usage + +**Solution:** + +1. **Process in batches** - use `stop_on_match` for incremental scraping +2. **Disable image downloading** temporarily +3. **Close other applications** +4. **Increase system swap** if needed + +--- + +### Issue: Chrome Using Too Much CPU + +**Solution:** + +1. **Use headless mode** - reduces rendering overhead +2. **Add GPU flags:** + ``` + --disable-gpu + --disable-software-rasterizer + ``` +3. **Limit concurrent jobs** in API mode + +--- + +## Python & Dependencies Issues + +### Issue: Module Not Found + +**Error Message:** +``` +ModuleNotFoundError: No module named 'seleniumbase' +``` + +**Solution:** + +1.
**Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + +2. **Verify virtual environment is activated:** + ```bash + source venv/bin/activate # Linux/macOS + venv\Scripts\activate # Windows + ``` + +--- + +### Issue: Incompatible Package Versions + +**Error Message:** +``` +ImportError: cannot import name 'X' from 'Y' +``` + +**Solution:** + +1. **Reinstall all dependencies:** + ```bash + pip uninstall -r requirements.txt -y + pip install -r requirements.txt + ``` + +2. **Create fresh virtual environment:** + ```bash + python -m venv fresh_venv + source fresh_venv/bin/activate + pip install -r requirements.txt + ``` + +--- + +### Issue: Python Version Incompatibility + +**Error Message:** +``` +SyntaxError: invalid syntax +``` + +**Solution:** + +1. **Check Python version** (requires 3.10+): + ```bash + python --version + ``` + +2. **Install correct Python version:** + ```bash + # macOS with pyenv + pyenv install 3.13.1 + pyenv local 3.13.1 + + # Or use system package manager + ``` + +--- + +## Getting Help + +If your issue isn't listed here: + +1. **Enable debug logging:** + ```bash + LOG_LEVEL=DEBUG python start.py + ``` + +2. **Check logs** for detailed error messages + +3. **Search existing issues** on GitHub + +4. **Create a new issue** with: + - Error message (full traceback) + - Python version (`python --version`) + - OS and version + - Chrome version + - Steps to reproduce \ No newline at end of file diff --git a/modules/scraper.py b/modules/scraper.py index fe7ed5f..cc20469 100644 --- a/modules/scraper.py +++ b/modules/scraper.py @@ -1,5 +1,6 @@ """ Selenium scraping logic for Google Maps Reviews. +Uses SeleniumBase UC Mode for enhanced anti-detection and better Chrome version management.
""" import logging @@ -10,7 +11,7 @@ import time import traceback from typing import Dict, Any, List -import undetected_chromedriver as uc +from seleniumbase import Driver from selenium.common.exceptions import TimeoutException, StaleElementReferenceException from selenium.webdriver import Chrome from selenium.webdriver.common.action_chains import ActionChains @@ -169,72 +170,87 @@ class GoogleReviewsScraper: self.backup_to_json = config.get("backup_to_json", True) self.overwrite_existing = config.get("overwrite_existing", False) - def setup_driver(self, headless: bool) -> Chrome: + def setup_driver(self, headless: bool): """ - Set up and configure Chrome driver with flexibility for different environments. + Set up and configure Chrome driver using SeleniumBase UC Mode. + SeleniumBase provides enhanced anti-detection and automatic Chrome/ChromeDriver version management. Works in both Docker containers and on regular OS installations (Windows, Mac, Linux). """ - # Determine if we're running in a container - in_container = os.environ.get('CHROME_BIN') is not None - - # Create Chrome options - opts = uc.ChromeOptions() - opts.add_argument("--window-size=1400,900") - opts.add_argument("--ignore-certificate-errors") - opts.add_argument("--disable-gpu") # Improves performance - opts.add_argument("--disable-dev-shm-usage") # Helps with stability - opts.add_argument("--no-sandbox") # More stable in some environments - - # Use headless mode if requested - if headless: - opts.add_argument("--headless=new") - # Log platform information for debugging log.info(f"Platform: {platform.platform()}") log.info(f"Python version: {platform.python_version()}") + log.info("Using SeleniumBase UC Mode for enhanced anti-detection") + + # Determine if we're running in a container + in_container = os.environ.get('CHROME_BIN') is not None - # If in container, use environment-provided binaries if in_container: chrome_binary = os.environ.get('CHROME_BIN') - chromedriver_path = 
os.environ.get('CHROMEDRIVER_PATH') - log.info(f"Container environment detected") log.info(f"Chrome binary: {chrome_binary}") - log.info(f"ChromeDriver path: {chromedriver_path}") + # Create driver with custom binary location for containers if chrome_binary and os.path.exists(chrome_binary): - log.info(f"Using Chrome binary from environment: {chrome_binary}") - opts.binary_location = chrome_binary - - try: - # Try creating Chrome driver with undetected_chromedriver - log.info("Attempting to create undetected_chromedriver instance") - driver = uc.Chrome(options=opts) - log.info("Successfully created undetected_chromedriver instance") - except Exception as e: - # Fall back to regular Selenium if undetected_chromedriver fails - log.warning(f"Failed to create undetected_chromedriver instance: {e}") - log.info("Falling back to regular Selenium Chrome") - - # Import Selenium webdriver here to avoid potential import issues - from selenium import webdriver - from selenium.webdriver.chrome.service import Service - - if chromedriver_path and os.path.exists(chromedriver_path): - log.info(f"Using ChromeDriver from path: {chromedriver_path}") - service = Service(executable_path=chromedriver_path) - driver = webdriver.Chrome(service=service, options=opts) - else: - log.info("Using default ChromeDriver") - driver = webdriver.Chrome(options=opts) + try: + driver = Driver( + uc=True, + headless=headless, + binary_location=chrome_binary, + page_load_strategy="normal" + ) + log.info("Successfully created SeleniumBase UC driver with custom binary") + except Exception as e: + log.warning(f"Failed to create driver with custom binary: {e}") + # Fall back to default + driver = Driver( + uc=True, + headless=headless, + page_load_strategy="normal" + ) + log.info("Successfully created SeleniumBase UC driver with defaults") + else: + driver = Driver( + uc=True, + headless=headless, + page_load_strategy="normal" + ) + log.info("Successfully created SeleniumBase UC driver") else: - # On regular 
OS, use default undetected_chromedriver - log.info("Using standard undetected_chromedriver setup") - driver = uc.Chrome(options=opts) + # Regular OS environment - SeleniumBase handles version matching automatically + log.info("Creating SeleniumBase UC Mode driver") + try: + driver = Driver( + uc=True, + headless=headless, + page_load_strategy="normal", + incognito=True # Use incognito mode for better stealth + ) + log.info("Successfully created SeleniumBase UC driver") + except Exception as e: + log.error(f"Failed to create SeleniumBase driver: {e}") + raise # Set page load timeout to avoid hanging driver.set_page_load_timeout(30) - log.info("Chrome driver setup completed successfully") + + # Set window size + driver.set_window_size(1400, 900) + + # Add additional stealth settings + try: + # Disable automation flags + driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); + Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]}); + Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']}); + ''' + }) + log.info("Additional stealth settings applied") + except Exception as e: + log.debug(f"Could not apply additional stealth settings: {e}") + + log.info("SeleniumBase UC driver setup completed successfully") return driver def dismiss_cookies(self, driver: Chrome): @@ -471,9 +487,11 @@ class GoogleReviewsScraper: parts = current_url.split('/place/') new_url = f"{parts[0]}/place/{parts[1].split('/')[0]}/reviews?hl={lang_code}" driver.get(new_url) - time.sleep(2) + time.sleep(3) # Increased wait time for page load if "review" in driver.current_url.lower(): log.info("Navigated directly to reviews page via URL") + # Extra wait for reviews to render after URL navigation + time.sleep(2) return True # Try to identify reviews link in URL @@ -481,9 +499,11 @@ class GoogleReviewsScraper: parts = current_url.split('/place/') new_url = 
f"{parts[0]}/place/{parts[1].split('/')[0]}/reviews" driver.get(new_url) - time.sleep(2) + time.sleep(3) # Increased wait time for page load if "review" in driver.current_url.lower(): log.info("Navigated directly to reviews page via URL") + # Extra wait for reviews to render after URL navigation + time.sleep(2) return True except Exception as url_error: log.warning(f"Failed to navigate to reviews via URL: {url_error}") @@ -831,34 +851,37 @@ class GoogleReviewsScraper: target_item = None matched_text = None - # 1. First try direct text matching - wanted_labels = SORT_OPTIONS.get(method, []) + # Log all available menu items for debugging + log.info(f"Available menu items: {[text for _, text in visible_items]}") - for item, text in visible_items: + # Use position-based selection (most reliable for Google Maps) + position_map = { + "relevance": 0, # Usually the first option + "newest": 1, # Usually the second option + "highest": 2, # Usually the third option + "lowest": 3 # Usually the fourth option + } + + pos = position_map.get(method, -1) + if pos >= 0 and pos < len(visible_items): + target_item, matched_text = visible_items[pos] + log.info(f"Selected menu item at position {pos + 1}: '{matched_text}' for sort method '{method}'") + + # Validate the selection makes sense + wanted_labels = SORT_OPTIONS.get(method, []) + text_clean = matched_text.lower() + + # Check if selected text contains any of the expected keywords + valid_selection = False for label in wanted_labels: - if (label in text or text in label or - (len(text) > 0 and len(label) > 0 and - text.lower().startswith(label.lower()[:3]))): - target_item = item - matched_text = text - log.info(f"Found matching menu item: '{text}' for '{label}'") + if label.lower() in text_clean or text_clean in label.lower(): + valid_selection = True break - if target_item: - break - # 2. 
If no match found, try position-based selection - if not target_item and visible_items: - position_map = { - "relevance": 0, # Usually the first option - "newest": 1, # Usually the second option - "highest": 2, # Usually the third option - "lowest": 3 # Usually the fourth option - } - - pos = position_map.get(method, -1) - if pos >= 0 and pos < len(visible_items): - target_item, matched_text = visible_items[pos] - log.info(f"Using position-based selection (position {pos}) for '{method}'") + if not valid_selection: + log.warning(f"WARNING: Selected '{matched_text}' doesn't match expected '{method}' - might be wrong sort!") + else: + log.warning(f"Position {pos} not available in menu (only {len(visible_items)} items)") # 3. If target found, click it if target_item: @@ -1108,16 +1131,55 @@ class GoogleReviewsScraper: self.dismiss_cookies(driver) self.click_reviews_tab(driver) - self.set_sort(driver, sort_by) - # Add a wait after setting sort to allow results to load - time.sleep(1) + # Extra wait after clicking reviews tab to ensure page loads + log.info("Waiting for reviews page to fully load...") + time.sleep(3) + + # Wait for page to be fully interactive + try: + wait.until(lambda d: d.execute_script("return document.readyState") == "complete") + log.info("Page DOM is ready") + except: + log.debug("Could not verify page ready state") + + # Verify we're on a reviews page before proceeding + if "review" not in driver.current_url.lower(): + log.warning("URL doesn't contain 'review' - might not be on reviews page") + + # Try to set sort - but don't fail if it doesn't work + try: + self.set_sort(driver, sort_by) + except Exception as sort_error: + log.warning(f"Sort failed but continuing: {sort_error}") + + # Add a longer wait after setting sort to allow results to load + log.info("Waiting for reviews to render...") + time.sleep(3) # Use try-except to handle cases where the pane is not found - try: - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 
PANE_SEL))) - except TimeoutException: - log.warning("Could not find reviews pane. Page structure might have changed.") + # Try multiple selectors for the reviews pane + pane = None + pane_selectors = [ + PANE_SEL, # Primary selector + 'div[role="main"] div.m6QErb', # Simplified version + 'div.m6QErb.DxyBCb', # Even more simplified + 'div[role="main"]' # Most generic + ] + + for selector in pane_selectors: + try: + log.info(f"Trying to find reviews pane with selector: {selector}") + pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) + if pane: + log.info(f"Found reviews pane with selector: {selector}") + break + except TimeoutException: + log.debug(f"Pane not found with selector: {selector}") + continue + + if not pane: + log.warning("Could not find reviews pane with any selector. Page structure might have changed.") return False pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen)) @@ -1132,8 +1194,12 @@ class GoogleReviewsScraper: log.warning(f"Error setting up scroll script: {e}") scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling - max_attempts = 10 # Limit the number of attempts to find reviews + max_attempts = 50 # Increased from 10 to 50 for very patient scrolling attempts = 0 + max_idle = 15 # Increased from 3 to 15 - much more patience for lazy-loaded reviews + consecutive_no_cards = 0 # Track how many times we find zero cards + last_scroll_position = 0 + scroll_stuck_count = 0 while attempts < max_attempts: try: @@ -1142,12 +1208,23 @@ class GoogleReviewsScraper: # Check for valid cards if len(cards) == 0: - log.debug("No review cards found in this iteration") + consecutive_no_cards += 1 + log.info(f"No review cards found in this iteration (consecutive: {consecutive_no_cards})") + + # If we keep finding no cards, might have hit the end + if consecutive_no_cards > 5: + log.warning("No cards found for 5+ iterations - might be at end of reviews") + break + attempts += 1 - # Try scrolling anyway + # Try 
aggressive scrolling driver.execute_script(scroll_script) time.sleep(1) + driver.execute_script("window.scrollBy(0, 1000);") # Extra scroll + time.sleep(1.5) continue + else: + consecutive_no_cards = 0 # Reset counter when we find cards for c in cards: try: @@ -1186,12 +1263,48 @@ class GoogleReviewsScraper: idle = 0 attempts = 0 # Reset attempts counter when we successfully process a review - if idle >= 3: + if idle >= max_idle: + log.info(f"Stopping: No new reviews found after {max_idle} scroll attempts") break if not fresh_cards: idle += 1 attempts += 1 + log.info(f"No new reviews in this iteration (idle: {idle}/{max_idle}, attempts: {attempts}/{max_attempts}, total seen: {len(seen)})") + + # When no new reviews, scroll more aggressively + try: + # Try multiple scroll methods + driver.execute_script(scroll_script) + time.sleep(0.5) + driver.execute_script("window.scrollBy(0, 500);") # Extra scroll + time.sleep(0.5) + except Exception as e: + log.warning(f"Error scrolling: {e}") + else: + log.info(f"Found {len(fresh_cards)} new reviews in this iteration") + + # Check if we're actually scrolling or stuck + try: + current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane) + if current_scroll == last_scroll_position and len(fresh_cards) == 0: + scroll_stuck_count += 1 + log.warning(f"Scroll position hasn't changed (stuck at {current_scroll}px, stuck count: {scroll_stuck_count})") + + if scroll_stuck_count > 5: + log.warning("Scroll is stuck - trying alternative scroll method") + # Try clicking the last visible review to force loading + try: + driver.execute_script("arguments[0].lastElementChild.scrollIntoView();", pane) + time.sleep(2) + except: + pass + scroll_stuck_count = 0 + else: + scroll_stuck_count = 0 + last_scroll_position = current_scroll + except: + pass # Use JavaScript for smoother scrolling try: @@ -1201,8 +1314,13 @@ class GoogleReviewsScraper: # Try a simpler scroll method driver.execute_script("window.scrollBy(0, 300);") - # 
Dynamic sleep: sleep less when processing many reviews - sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0 + # Dynamic sleep: sleep less when processing many reviews, more when finding none + if len(fresh_cards) > 5: + sleep_time = 0.7 + elif len(fresh_cards) == 0: + sleep_time = 2.0 # Wait longer when finding nothing (let page load) + else: + sleep_time = 1.0 time.sleep(sleep_time) except StaleElementReferenceException: diff --git a/requirements.txt b/requirements.txt index 2f56ca0..d1883ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,8 @@ -requests==2.32.3 -beautifulsoup4==4.12.3 -aiohttp==3.11.11 +seleniumbase>=4.34.9 googletrans==4.0.2 -selenium==4.15.2 -undetected-chromedriver==3.5.4 -tqdm==4.66.3 +tqdm>=4.66.3 pymongo==4.12.0 -pyyaml==6.0.1 -certifi==2024.7.4 -webdriver-manager==4.0.2 -setuptools==79.0.1 boto3==1.35.1 -pytest==7.4.3 fastapi==0.104.1 uvicorn==0.24.0 botocore~=1.35.99 diff --git a/tests/test_seleniumbase_integration.py b/tests/test_seleniumbase_integration.py new file mode 100644 index 0000000..737305f --- /dev/null +++ b/tests/test_seleniumbase_integration.py @@ -0,0 +1,110 @@ +""" +Tests for SeleniumBase UC Mode integration. +Verifies that the driver setup works correctly with the new library. 
+""" + +import pytest +from modules.scraper import GoogleReviewsScraper + + +def test_seleniumbase_driver_creation(): + """Test that SeleniumBase driver can be created successfully""" + config = { + "url": "https://maps.app.goo.gl/test", + "headless": True, + "use_mongodb": False, + "backup_to_json": False + } + + scraper = GoogleReviewsScraper(config) + + # Test driver creation + driver = None + try: + driver = scraper.setup_driver(headless=True) + assert driver is not None + assert driver.name == "chrome" + + # Verify driver can navigate + driver.get("https://www.google.com") + assert "google" in driver.current_url.lower() + + finally: + if driver: + driver.quit() + + +def test_seleniumbase_driver_headless_mode(): + """Test that headless mode works correctly""" + config = { + "url": "https://maps.app.goo.gl/test", + "headless": True, + "use_mongodb": False, + "backup_to_json": False + } + + scraper = GoogleReviewsScraper(config) + driver = None + + try: + driver = scraper.setup_driver(headless=True) + assert driver is not None + + # In headless mode, window size should still be set + size = driver.get_window_size() + assert size['width'] == 1400 + assert size['height'] == 900 + + finally: + if driver: + driver.quit() + + +def test_seleniumbase_driver_nonheadless_mode(): + """Test that non-headless mode works correctly""" + config = { + "url": "https://maps.app.goo.gl/test", + "headless": False, + "use_mongodb": False, + "backup_to_json": False + } + + scraper = GoogleReviewsScraper(config) + driver = None + + try: + driver = scraper.setup_driver(headless=False) + assert driver is not None + assert driver.name == "chrome" + + finally: + if driver: + driver.quit() + + +@pytest.mark.skip(reason="Integration test - requires network access") +def test_seleniumbase_google_maps_access(): + """Test that driver can access Google Maps (integration test)""" + config = { + "url": "https://maps.app.goo.gl/6tkNMDjcj3SS6LJe9", + "headless": True, + "use_mongodb": False, + 
"backup_to_json": False + } + + scraper = GoogleReviewsScraper(config) + driver = None + + try: + driver = scraper.setup_driver(headless=True) + driver.get(config["url"]) + + # Wait for redirect to Google Maps + import time + time.sleep(3) + + assert "google.com/maps" in driver.current_url + + finally: + if driver: + driver.quit()