Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions
--- a/modules/_legacy/data_storage.py
+++ b/modules/_legacy/data_storage.py
@@ -0,0 +1,349 @@
+"""
+Data storage modules for Google Maps Reviews Scraper.
+"""
+
+import json
+import logging
+import ssl
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, Set
+
+import pymongo
+
+from modules.date_converter import parse_relative_date, DateConverter
+from modules.image_handler import ImageHandler
+from modules.models import RawReview
+from modules.utils import detect_lang, get_current_iso_date
+
+# Replace the ssl module's default HTTPS context factory with the unverified
+# one (original note: "Configure SSL for MongoDB connection").
+# NOTE(review): this is a module-level assignment on the shared ssl module, so
+# it takes effect for the whole process as soon as this file is imported and
+# turns off certificate verification for any code relying on that default
+# factory. Whether the pymongo connection actually depends on it is not
+# visible here (MongoDBStorage.connect passes its own
+# tlsAllowInvalidCertificates flag) - confirm, then scope or remove.
+ssl._create_default_https_context = ssl._create_unverified_context  # macOS SSL fix
+
+# Logger registered under the "scraper" name.
+log = logging.getLogger("scraper")
+
+# Language code passed to parse_relative_date when interpreting RawReview.date.
+RAW_LANG = "en"
+
+
+class MongoDBStorage:
+    """MongoDB storage handler for Google Maps reviews.
+
+    Connection settings are read from ``config["mongodb"]`` (``uri``,
+    ``database``, ``collection``); post-processing switches are read from the
+    top level of ``config``. The connection is opened on demand:
+    ``fetch_existing_reviews`` and ``save_reviews`` call ``connect`` when the
+    handler is not connected yet.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Read settings from ``config``; the MongoDB connection is not opened here."""
+        mongodb_config = config.get("mongodb", {})
+        self.uri = mongodb_config.get("uri")
+        self.db_name = mongodb_config.get("database")
+        self.collection_name = mongodb_config.get("collection")
+        self.client = None
+        self.collection = None
+        self.connected = False
+        self.convert_dates = config.get("convert_dates", True)
+        self.download_images = config.get("download_images", False)
+        self.store_local_paths = config.get("store_local_paths", True)
+        self.replace_urls = config.get("replace_urls", False)
+        self.preserve_original_urls = config.get("preserve_original_urls", True)
+        self.custom_params = config.get("custom_params", {})
+        # The image handler is only built when image downloading is enabled.
+        self.image_handler = ImageHandler(config) if self.download_images else None
+
+    def connect(self) -> bool:
+        """Open the client, ping the server and bind the target collection.
+
+        Returns:
+            True when the ping succeeded and ``self.collection`` is set;
+            False on any error (the error is logged, not raised).
+        """
+        try:
+            # Use the correct TLS parameters for newer PyMongo versions.
+            # NOTE(review): tlsAllowInvalidCertificates=True disables server
+            # certificate validation - confirm this is acceptable for the
+            # deployments that use this handler.
+            self.client = pymongo.MongoClient(
+                self.uri,
+                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
+                connectTimeoutMS=30000,
+                socketTimeoutMS=None,
+                connect=True,
+                maxPoolSize=50
+            )
+            # Test connection
+            self.client.admin.command('ping')
+            db = self.client[self.db_name]
+            self.collection = db[self.collection_name]
+            self.connected = True
+            log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
+            return True
+        except Exception as e:
+            log.error(f"Failed to connect to MongoDB: {e}")
+            self.connected = False
+            return False
+
+    def close(self):
+        """Close the client if one was created and mark the handler disconnected."""
+        if self.client:
+            self.client.close()
+            self.connected = False
+
+    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
+        """Return every stored review keyed by its ``review_id``.
+
+        Documents are fetched without ``_id``; documents lacking a truthy
+        ``review_id`` are ignored. Returns an empty dict when the connection
+        cannot be established or the query fails (both are logged).
+        """
+        if not self.connected and not self.connect():
+            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
+            return {}
+
+        try:
+            reviews = {}
+            for doc in self.collection.find({}, {"_id": 0}):
+                review_id = doc.get("review_id")
+                if review_id:
+                    reviews[review_id] = doc
+            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
+            return reviews
+        except Exception as e:
+            log.error(f"Error fetching reviews from MongoDB: {e}")
+            return {}
+
+    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
+        """Upsert ``reviews`` into the collection using one bulk write.
+
+        Each review is matched on ``review_id`` and written with ``$set`` and
+        ``upsert=True``. Before writing, dates are converted, images are
+        downloaded and custom parameters are merged in, depending on the
+        configured switches. Errors are logged, not raised.
+
+        Args:
+            reviews: Mapping of review id to review document. The documents
+                passed in are not modified by the key additions/removals
+                performed here.
+        """
+        if not reviews:
+            log.info("No reviews to save to MongoDB")
+            return
+
+        if not self.connected and not self.connect():
+            log.warning("Cannot save reviews - MongoDB connection failed")
+            return
+
+        try:
+            # Copy every review dict, not just the outer mapping: the steps
+            # below delete and add keys, and a shallow copy of the mapping
+            # would apply those changes to the caller's documents.
+            processed_reviews = {review_id: review.copy() for review_id, review in reviews.items()}
+
+            # Convert string dates to datetime objects if enabled
+            if self.convert_dates:
+                processed_reviews = DateConverter.convert_dates_in_reviews(processed_reviews)
+
+            # Download and process images if enabled
+            if self.download_images and self.image_handler:
+                processed_reviews = self.image_handler.download_all_images(processed_reviews)
+
+                # If not storing local paths, remove them from the documents
+                if not self.store_local_paths:
+                    for review in processed_reviews.values():
+                        review.pop("local_images", None)
+                        review.pop("local_profile_picture", None)
+
+                # If not preserving original URLs, remove them from the documents
+                if self.replace_urls and not self.preserve_original_urls:
+                    for review in processed_reviews.values():
+                        review.pop("original_image_urls", None)
+                        review.pop("original_profile_picture", None)
+
+            # Add custom parameters to each document
+            if self.custom_params:
+                log.info(f"Adding custom parameters to {len(processed_reviews)} documents")
+                for review in processed_reviews.values():
+                    review.update(self.custom_params)
+
+            operations = []
+            for review in processed_reviews.values():
+                # Exclude _id from the $set payload, MongoDB will generate it
+                review.pop("_id", None)
+
+                # A document without review_id cannot be matched for upsert;
+                # skip it instead of letting a KeyError abort the whole batch.
+                if "review_id" not in review:
+                    log.warning("Skipping review without review_id - cannot upsert")
+                    continue
+
+                operations.append(
+                    pymongo.UpdateOne(
+                        {"review_id": review["review_id"]},
+                        {"$set": review},
+                        upsert=True
+                    )
+                )
+
+            if operations:
+                result = self.collection.bulk_write(operations)
+                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
+        except Exception as e:
+            log.error(f"Error saving reviews to MongoDB: {e}")
+
+
+class JSONStorage:
+    """JSON file-based storage handler for Google Maps reviews.
+
+    Reviews are kept as a JSON array in ``json_path``; the ids of reviews
+    already seen are kept one per line in ``seen_ids_path``.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Read file locations and post-processing switches from ``config``."""
+        self.json_path = Path(config.get("json_path", "google_reviews.json"))
+        self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
+        self.convert_dates = config.get("convert_dates", True)
+        self.download_images = config.get("download_images", False)
+        self.store_local_paths = config.get("store_local_paths", True)
+        self.replace_urls = config.get("replace_urls", False)
+        self.preserve_original_urls = config.get("preserve_original_urls", True)
+        self.custom_params = config.get("custom_params", {})
+        # The image handler is only built when image downloading is enabled.
+        self.image_handler = ImageHandler(config) if self.download_images else None
+
+    def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
+        """Load reviews from the JSON file, keyed by ``review_id``.
+
+        Returns an empty dict when the file does not exist or does not contain
+        valid JSON. Entries without a truthy ``review_id`` are dropped.
+        """
+        if not self.json_path.exists():
+            return {}
+        try:
+            data = json.loads(self.json_path.read_text(encoding="utf-8"))
+            # Index by review_id for fast lookups
+            return {d.get("review_id", ""): d for d in data if d.get("review_id")}
+        except json.JSONDecodeError:
+            log.warning("⚠️ Error reading JSON file, starting with empty data")
+            return {}
+
+    def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
+        """Write ``docs`` to the JSON file as a list of review documents.
+
+        Works on per-review copies so key additions/removals do not touch the
+        caller's documents. Depending on the configured switches, dates are
+        converted, images are downloaded and custom parameters are merged in
+        before writing. ``datetime`` values at any nesting depth are written
+        as ISO-8601 strings.
+        """
+        # Create a copy of the docs to avoid modifying the original
+        processed_docs = {review_id: review.copy() for review_id, review in docs.items()}
+
+        # Convert string dates to datetime objects if enabled
+        if self.convert_dates:
+            processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)
+
+        # Download and process images if enabled
+        if self.download_images and self.image_handler:
+            processed_docs = self.image_handler.download_all_images(processed_docs)
+
+            # If not storing local paths, remove them from the documents
+            if not self.store_local_paths:
+                for review in processed_docs.values():
+                    review.pop("local_images", None)
+                    review.pop("local_profile_picture", None)
+
+            # If not preserving original URLs, remove them from the documents
+            if self.replace_urls and not self.preserve_original_urls:
+                for review in processed_docs.values():
+                    review.pop("original_image_urls", None)
+                    review.pop("original_profile_picture", None)
+
+        # Add custom parameters to each document
+        if self.custom_params:
+            log.info(f"Adding custom parameters to {len(processed_docs)} documents")
+            for review in processed_docs.values():
+                review.update(self.custom_params)
+
+        # datetime objects are turned back into strings by the ``default``
+        # hook, which json.dumps invokes for unsupported values at any depth
+        # (a top-level-only conversion would fail on nested datetimes).
+        payload = json.dumps(
+            list(processed_docs.values()),
+            ensure_ascii=False,
+            indent=2,
+            default=self._json_default,
+        )
+        self.json_path.write_text(payload, encoding="utf-8")
+
+    @staticmethod
+    def _json_default(value: Any) -> str:
+        """Serialize ``datetime`` as ISO-8601; reject any other unsupported type."""
+        if isinstance(value, datetime):
+            return value.isoformat()
+        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
+
+    def load_seen(self) -> Set[str]:
+        """Load the set of already seen review IDs (empty if the file is missing)."""
+        if not self.seen_ids_path.exists():
+            return set()
+        return set(self.seen_ids_path.read_text(encoding="utf-8").splitlines())
+
+    def save_seen(self, ids: Set[str]):
+        """Save the already seen review IDs, one per line.
+
+        IDs are written in sorted order so the file content does not depend on
+        set iteration order.
+        """
+        self.seen_ids_path.write_text("\n".join(sorted(ids)), encoding="utf-8")
+
+
+def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
+    """
+    Merge a raw review with an existing review document.
+
+    Creates a new document if ``existing`` is None or empty. Otherwise
+    ``existing`` is updated in place (legacy field names are migrated first)
+    and the same dict is returned.
+
+    Args:
+        existing: Previously stored review document, or None.
+        raw: Freshly scraped review; this function reads ``id``, ``author``,
+            ``rating``, ``likes``, ``photos``, ``profile``, ``avatar``,
+            ``date``, ``text``, ``lang`` and ``owner_text`` from it.
+
+    Returns:
+        The merged review document with ``last_modified_date`` refreshed.
+    """
+    if not existing:
+        # Create a new review with the updated field names
+        existing = {
+            "review_id": raw.id,
+            "author": raw.author,
+            "rating": raw.rating,
+            "description": {},  # renamed from "texts"
+            "likes": raw.likes,
+            "user_images": list(raw.photos),  # renamed from "photo_urls"
+            "author_profile_url": raw.profile,  # renamed from "profile_link"
+            "profile_picture": raw.avatar,  # renamed from "avatar_url"
+            "owner_responses": {},
+            "created_date": get_current_iso_date(),
+            "review_date": parse_relative_date(raw.date, RAW_LANG),
+        }
+    else:
+        # Handle existing reviews with old field names - migrate them.
+        # A legacy key is only moved when the new key is not already present.
+        legacy_to_current = (
+            ("texts", "description"),
+            ("photo_urls", "user_images"),
+            ("profile_link", "author_profile_url"),
+            ("avatar_url", "profile_picture"),
+        )
+        for old_key, new_key in legacy_to_current:
+            if old_key in existing and new_key not in existing:
+                existing[new_key] = existing.pop(old_key)
+
+        # Add ISO dates if not present
+        if "created_date" not in existing:
+            existing["created_date"] = get_current_iso_date()
+
+        if "review_date" not in existing:
+            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)
+
+        # Remove the 'date' field if it exists
+        existing.pop("date", None)
+
+    if raw.text:
+        # setdefault: a stored document may have neither "texts" nor
+        # "description", in which case plain indexing would raise KeyError.
+        existing.setdefault("description", {})[raw.lang] = raw.text
+
+    if not existing.get("rating"):
+        existing["rating"] = raw.rating
+
+    # "or 0" tolerates a stored None, which cannot be compared with an int.
+    if raw.likes > (existing.get("likes") or 0):
+        existing["likes"] = raw.likes
+
+    # Union of stored and new image URLs. dict.fromkeys de-duplicates while
+    # keeping first-seen order, so the list does not get reshuffled on every
+    # merge the way a set-based union would.
+    existing["user_images"] = list(dict.fromkeys([*(existing.get("user_images") or []), *raw.photos]))
+
+    # Update avatar/profile picture: take the new one when none is stored or
+    # when the new URL string is longer than the stored one.
+    if raw.avatar and (
+            not existing.get("profile_picture") or len(raw.avatar) > len(existing.get("profile_picture", ""))):
+        existing["profile_picture"] = raw.avatar
+
+    if raw.owner_text:
+        lang = detect_lang(raw.owner_text)
+        # Don't store the date string in owner_responses
+        existing.setdefault("owner_responses", {})[lang] = {
+            "text": raw.owner_text,
+        }
+
+    # Update last_modified timestamp
+    existing["last_modified_date"] = get_current_iso_date()
+
+    return existing
+
+
+def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
+    """
+    Merge ``raw`` into ``existing`` with optional translation bookkeeping.
+
+    The base merge is always delegated to ``merge_review``. The extra
+    translation steps run only when ``append_translations`` is set, an
+    existing (non-empty) document was supplied and ``raw`` carries text.
+    """
+    result = merge_review(existing, raw)
+
+    # Guard clause: outside translation mode the standard merge is the answer.
+    if not (append_translations and existing and raw.text):
+        return result
+
+    # Translation mode: record this language version of the review text.
+    result["description"][raw.lang] = raw.text
+
+    # Owner responses are merged per detected language as well.
+    if raw.owner_text:
+        reply_language = detect_lang(raw.owner_text)
+        owner_responses = result.setdefault("owner_responses", {})
+        owner_responses[reply_language] = {
+            "text": raw.owner_text,
+        }
+
+    # Keep a trail of when each translation was added.
+    history = result.setdefault("translation_history", [])
+    history.append({
+        "language": raw.lang,
+        "added_date": get_current_iso_date(),
+        "source": "regional_scraping"
+    })
+
+    return result
--- a/modules/_legacy/image_handler.py
+++ b/modules/_legacy/image_handler.py
@@ -0,0 +1,342 @@
+"""
+Image downloading and handling for Google Maps Reviews Scraper.
+"""
+
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict, Any, Set, Tuple
+from urllib.parse import urlparse
+
+import requests
+
+from modules.s3_handler import S3Handler
+
+# Logger
+log = logging.getLogger("scraper")
+
+
+class ImageHandler:
+    """Handler for downloading and managing review images.
+
+    Review images and profile pictures are stored in the ``reviews`` and
+    ``profiles`` subdirectories of ``image_dir``. Depending on configuration,
+    review documents are annotated with local filenames and their image URLs
+    are replaced with S3 URLs or URLs built from ``custom_url_base``.
+    """
+
+    def __init__(self, config: Dict[str, Any]):
+        """Initialize image handler with configuration"""
+        self.image_dir = Path(config.get("image_dir", "review_images"))
+        self.max_workers = config.get("download_threads", 4)
+        self.store_local_paths = config.get("store_local_paths", True)
+
+        # Image dimension settings
+        self.max_width = config.get("max_width", 1200)
+        self.max_height = config.get("max_height", 1200)
+
+        # URL replacement settings
+        self.replace_urls = config.get("replace_urls", False)
+        self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
+        self.custom_url_profiles = config.get("custom_url_profiles", "/profiles/")
+        self.custom_url_reviews = config.get("custom_url_reviews", "/reviews/")
+        self.preserve_original_urls = config.get("preserve_original_urls", True)
+
+        # Subdirectories for different image types
+        self.profile_dir = self.image_dir / "profiles"
+        self.review_dir = self.image_dir / "reviews"
+
+        # Initialize S3 handler
+        self.s3_handler = S3Handler(config)
+        self.use_s3 = config.get("use_s3", False)
+
+    def ensure_directories(self):
+        """Ensure all image directories exist"""
+        self.profile_dir.mkdir(parents=True, exist_ok=True)
+        self.review_dir.mkdir(parents=True, exist_ok=True)
+
+    def is_not_custom_url(self, url: str) -> bool:
+        """Return True for a non-empty URL that does not start with ``custom_url_base``."""
+        if not url:
+            return False
+
+        # Check if the URL starts with our custom URL base - if so, skip it
+        if self.custom_url_base and url.startswith(self.custom_url_base):
+            return False
+
+        return True
+
+    def get_filename_from_url(self, url: str, is_profile: bool = False) -> str:
+        """Derive a ``.jpg`` filename from ``url``.
+
+        Returns an empty string for empty URLs and for our own custom URLs.
+        Everything from the first ``=`` on (the size parameters) is dropped
+        before the last path segment is taken.
+        """
+        if not url:
+            return ""
+
+        # Skip our custom URLs
+        if not self.is_not_custom_url(url):
+            return ""
+
+        # For profile pictures: last path segment, or the one before it when
+        # the URL ends with a slash.
+        if is_profile:
+            parts = url.split('/')
+            if len(parts) > 1:
+                filename = parts[-2] if parts[-1] == '' else parts[-1]
+                filename = filename.split('=')[0]
+                return f"{filename}.jpg"
+
+        # For review images (and profile URLs without any slash)
+        url = url.split('=')[0]
+        filename = url.split('/')[-1]
+        return f"{filename}.jpg"
+
+    def get_custom_url(self, filename: str, is_profile: bool = False) -> str:
+        """Build the custom URL for ``filename``; empty when replacement is off or no filename."""
+        if not self.replace_urls or not filename:
+            return ""
+
+        base_url = self.custom_url_base.rstrip('/')
+        path = self.custom_url_profiles if is_profile else self.custom_url_reviews
+        path = path.strip('/')
+
+        return f"{base_url}/{path}/{filename}"
+
+    def _build_download_url(self, url: str) -> str:
+        """Return the URL to fetch for ``url`` with the configured size applied."""
+        # For Google images, modify resolution parameters
+        if 'googleusercontent.com' in url or 'ggpht.com' in url or 'gstatic.com' in url:
+            size_suffix = f"=w{self.max_width}-h{self.max_height}-no"
+            # Check if URL already has size parameters (=w... or =h... or =s...)
+            if '=w' in url or '=h' in url or '=s' in url:
+                # Drop the existing parameters and rebuild with ours
+                return url.split('=')[0] + size_suffix
+            # No existing size parameters, just append them
+            return url + size_suffix
+
+        # For non-Google URLs, just remove parameters after =
+        return url.split("=")[0]
+
+    @staticmethod
+    def _fetch_to_file(download_url: str, filepath: Path) -> None:
+        """Stream ``download_url`` into ``filepath`` without leaving partial files."""
+        # Write to a sibling ".part" file and rename on success: an existing
+        # target file is treated as "already downloaded", so a truncated file
+        # left behind by a failed transfer would never be retried.
+        tmp_path = filepath.with_name(f"{filepath.name}.part")
+        try:
+            with requests.get(download_url, stream=True, timeout=10) as response:
+                response.raise_for_status()
+                with open(tmp_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+            tmp_path.replace(filepath)
+        except Exception:
+            if tmp_path.exists():
+                tmp_path.unlink()
+            raise
+
+    def download_image(self, url_info: Tuple[str, bool]) -> Tuple[str, str, str]:
+        """
+        Download an image from URL and save to disk.
+
+        Args:
+            url_info: Tuple of (url, is_profile)
+
+        Returns:
+            Tuple of (url, local filename, custom url). ``url`` is always the
+            URL that was passed in, so callers can key lookups on it; the
+            filename and custom URL are empty strings when the image was
+            skipped or the download failed (failures are logged).
+        """
+        url, is_profile = url_info
+
+        # Skip our custom URLs
+        if not self.is_not_custom_url(url):
+            return url, "", ""
+
+        try:
+            filename = self.get_filename_from_url(url, is_profile)
+            if not filename:
+                return url, "", ""
+
+            # Choose directory based on image type
+            target_dir = self.profile_dir if is_profile else self.review_dir
+            filepath = target_dir / filename
+
+            # Generate custom URL even if file exists
+            custom_url = self.get_custom_url(filename, is_profile)
+
+            # Skip the download if the file already exists
+            if not filepath.exists():
+                # The resized URL is kept separate from ``url``: returning the
+                # rewritten URL would break the caller's lookups, which use
+                # the URLs stored in the review documents.
+                self._fetch_to_file(self._build_download_url(url), filepath)
+
+            return url, filename, custom_url
+
+        except Exception as e:
+            log.error(f"Error downloading image from {url}: {e}")
+            return url, "", ""
+
+    def download_all_images(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+        """
+        Download all images (review images and profile pictures) for all reviews.
+
+        The review documents in ``reviews`` are updated in place.
+
+        Args:
+            reviews: Dictionary of review documents
+
+        Returns:
+            Updated reviews with local image paths and custom URLs
+        """
+        self.ensure_directories()
+
+        # Collect all unique image URLs (both review images and profile pictures)
+        # Exclude custom URLs
+        review_urls: Set[str] = set()
+        profile_urls: Set[str] = set()
+
+        for review in reviews.values():
+            # Collect review images - exclude custom URLs
+            if "user_images" in review and isinstance(review["user_images"], list):
+                for url in review["user_images"]:
+                    if self.is_not_custom_url(url):
+                        review_urls.add(url)
+                # If we have original image URLs stored separately, add those too
+                if "original_image_urls" in review and isinstance(review["original_image_urls"], list):
+                    for orig_url in review["original_image_urls"]:
+                        if self.is_not_custom_url(orig_url):
+                            review_urls.add(orig_url)
+
+            # Collect profile pictures - exclude custom URLs
+            if "profile_picture" in review and review["profile_picture"]:
+                profile_url = review["profile_picture"]
+                if self.is_not_custom_url(profile_url):
+                    profile_urls.add(profile_url)
+                # If we have original profile URL stored separately, add that too
+                if "original_profile_picture" in review and review["original_profile_picture"]:
+                    orig_profile_url = review["original_profile_picture"]
+                    if self.is_not_custom_url(orig_profile_url):
+                        profile_urls.add(orig_profile_url)
+
+        # Prepare download tasks with URL type info
+        download_tasks = [(url, False) for url in review_urls] + [(url, True) for url in profile_urls]
+
+        if not download_tasks:
+            log.info("No images to download")
+            return reviews
+
+        log.info(
+            f"Downloading {len(download_tasks)} images ({len(profile_urls)} profiles, {len(review_urls)} review images)...")
+
+        # Create URL to filename and URL to custom URL mappings
+        url_to_filename = {}
+        url_to_custom_url = {}
+
+        # Download images in parallel
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            results = executor.map(self.download_image, download_tasks)
+            for url, filename, custom_url in results:
+                if filename:
+                    url_to_filename[url] = filename
+                if custom_url:
+                    url_to_custom_url[url] = custom_url
+
+        # Upload to S3 if enabled
+        s3_url_mapping = {}
+        if self.use_s3 and self.s3_handler.enabled and url_to_filename:
+            log.info("Uploading images to S3...")
+
+            # Prepare files for S3 upload
+            files_to_upload = {}
+            for url, filename in url_to_filename.items():
+                # Determine if it's a profile image (set membership)
+                is_profile = url in profile_urls
+
+                # Get local file path
+                local_path = (self.profile_dir if is_profile else self.review_dir) / filename
+
+                if local_path.exists():
+                    files_to_upload[filename] = (local_path, is_profile)
+
+            # Upload to S3
+            s3_results = self.s3_handler.upload_images_batch(files_to_upload)
+
+            # Create mapping from original URL to S3 URL
+            for url, filename in url_to_filename.items():
+                if filename in s3_results:
+                    s3_url_mapping[url] = s3_results[filename]
+
+        # Update review documents
+        for review_id, review in reviews.items():
+            # Find the original URLs to use for lookup - important for both user_images and profile_picture
+            user_images_original = []
+            profile_picture_original = ""
+
+            # For user_images, either use original URLs if we have them, or the current user_images
+            if "original_image_urls" in review and isinstance(review["original_image_urls"], list):
+                user_images_original = review["original_image_urls"]
+            elif "user_images" in review and isinstance(review["user_images"], list):
+                user_images_original = review["user_images"].copy()
+
+            # For profile_picture, either use original URL if we have it, or the current profile_picture
+            if "original_profile_picture" in review and review["original_profile_picture"]:
+                profile_picture_original = review["original_profile_picture"]
+            elif "profile_picture" in review:
+                profile_picture_original = review["profile_picture"]
+
+            # Process user_images
+            if "user_images" in review and isinstance(review["user_images"], list):
+                # Add local image paths if enabled
+                if self.store_local_paths:
+                    local_images = [url_to_filename.get(url, "") for url in user_images_original
+                                    if url and self.is_not_custom_url(url)]
+                    review["local_images"] = [img for img in local_images if img]
+
+                # Replace URLs if enabled
+                if self.replace_urls:
+                    # Store original URLs if needed and not already stored
+                    if self.preserve_original_urls and "original_image_urls" not in review:
+                        review["original_image_urls"] = review["user_images"].copy()
+
+                    # Create custom URLs for each image
+                    custom_images = []
+                    for url in user_images_original:
+                        # Prefer S3 URL if available
+                        if url in s3_url_mapping:
+                            custom_images.append(s3_url_mapping[url])
+                        elif url in url_to_custom_url:
+                            custom_images.append(url_to_custom_url[url])
+                        elif not self.is_not_custom_url(url):  # Already a custom URL
+                            custom_images.append(url)
+
+                    # Replace with custom URLs if we have them
+                    if custom_images:
+                        review["user_images"] = custom_images
+
+            # Process profile_picture
+            if "profile_picture" in review and review["profile_picture"]:
+                # Add local profile picture path if enabled
+                if self.store_local_paths and profile_picture_original in url_to_filename:
+                    review["local_profile_picture"] = url_to_filename[profile_picture_original]
+
+                # Replace profile_picture URL if enabled
+                if self.replace_urls:
+                    # Store original URL if needed and not already stored
+                    if self.preserve_original_urls and "original_profile_picture" not in review:
+                        review["original_profile_picture"] = review["profile_picture"]
+
+                    # Replace with S3 URL if available, otherwise use custom URL
+                    if profile_picture_original in s3_url_mapping:
+                        review["profile_picture"] = s3_url_mapping[profile_picture_original]
+                    elif profile_picture_original in url_to_custom_url:
+                        review["profile_picture"] = url_to_custom_url[profile_picture_original]
+                    elif not self.is_not_custom_url(review["profile_picture"]):
+                        # If current URL is already a custom URL, keep it
+                        pass
+                    elif profile_picture_original:
+                        # If we don't have a custom URL but have a filename, generate one
+                        filename = url_to_filename.get(profile_picture_original, "")
+                        if filename:
+                            custom_url = self.get_custom_url(filename, True)
+                            if custom_url:
+                                review["profile_picture"] = custom_url
+
+        log.info(f"Downloaded {len(url_to_filename)} images")
+        if self.use_s3 and s3_url_mapping:
+            log.info(f"Uploaded {len(s3_url_mapping)} images to S3")
+        if self.replace_urls:
+            # Count each source URL once even when it has both an S3 URL and a custom URL.
+            total_replaced = len(set(s3_url_mapping) | set(url_to_custom_url))
+            log.info(f"Replaced URLs for {total_replaced} images")
+
+        return reviews
--- a/modules/_legacy/s3_handler.py
+++ b/modules/_legacy/s3_handler.py
@@ -0,0 +1,177 @@
+"""
+S3 upload handler for Google Maps Reviews Scraper.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import Dict, Any, Optional
+
+import boto3
+from botocore.exceptions import ClientError
+
+log = logging.getLogger("scraper")
+
+
+class S3Handler:
+    """Handler for uploading images to AWS S3.
+
+    The handler switches itself off (``enabled = False``) when ``use_s3`` is
+    false, when ``bucket_name`` is empty, or when the bucket check made during
+    construction fails. The upload methods then return ``None`` / ``{}``
+    instead of raising.
+    """
+
+    # Class-level fallbacks mirroring the config defaults, so a disabled or
+    # partially initialized handler still exposes every attribute the
+    # methods read (get_s3_url used to raise AttributeError in that state).
+    s3_client = None
+    aws_access_key_id = ""
+    aws_secret_access_key = ""
+    region_name = "us-east-1"
+    bucket_name = ""
+    prefix = "reviews/"
+    profiles_folder = "profiles"
+    reviews_folder = "reviews"
+    delete_local_after_upload = False
+    s3_base_url = ""
+    acl = "public-read"
+
+    def __init__(self, config: Dict[str, Any]):
+        """Initialize S3 handler with configuration.
+
+        Args:
+            config: Mapping holding the ``use_s3`` flag and a nested ``s3``
+                section (credentials, region, bucket, key prefix, folder
+                names, ``delete_local_after_upload``, ``s3_base_url`` and the
+                optional ``acl``).
+        """
+        self.enabled = config.get("use_s3", False)
+
+        if not self.enabled:
+            return
+
+        s3_config = config.get("s3", {})
+
+        self.aws_access_key_id = s3_config.get("aws_access_key_id", "")
+        self.aws_secret_access_key = s3_config.get("aws_secret_access_key", "")
+        self.region_name = s3_config.get("region_name", "us-east-1")
+        self.bucket_name = s3_config.get("bucket_name", "")
+        # Normalize so that prefix always ends with exactly one "/" and the
+        # folder names carry no slashes of their own.
+        self.prefix = s3_config.get("prefix", "reviews/").rstrip("/") + "/"
+        self.profiles_folder = s3_config.get("profiles_folder", "profiles/").strip("/")
+        self.reviews_folder = s3_config.get("reviews_folder", "reviews/").strip("/")
+        self.delete_local_after_upload = s3_config.get("delete_local_after_upload", False)
+        self.s3_base_url = s3_config.get("s3_base_url", "")
+        # Canned ACL sent with every upload. Set "acl" to "" (or null) for
+        # buckets that have ACLs disabled, since S3 rejects uploads that
+        # carry an ACL there. Defaults to the previously hard-coded value.
+        self.acl = s3_config.get("acl", "public-read")
+
+        # Validate required settings
+        if not self.bucket_name:
+            log.error("S3 bucket_name is required when use_s3 is enabled")
+            self.enabled = False
+            return
+
+        # Initialize S3 client
+        try:
+            session_kwargs = {"region_name": self.region_name}
+
+            # Use credentials if provided, otherwise rely on environment/IAM
+            if self.aws_access_key_id and self.aws_secret_access_key:
+                session_kwargs.update({
+                    "aws_access_key_id": self.aws_access_key_id,
+                    "aws_secret_access_key": self.aws_secret_access_key
+                })
+
+            self.s3_client = boto3.client("s3", **session_kwargs)
+
+            # Test connection by checking if bucket exists
+            self.s3_client.head_bucket(Bucket=self.bucket_name)
+            log.info(f"S3 handler initialized successfully for bucket: {self.bucket_name}")
+
+        except ClientError as e:
+            # Any connection problem disables the handler rather than
+            # propagating out of the constructor.
+            error_code = e.response.get('Error', {}).get('Code', '')
+            if error_code == '404':
+                log.error(f"S3 bucket '{self.bucket_name}' not found")
+            elif error_code == '403':
+                log.error(f"Access denied to S3 bucket '{self.bucket_name}'")
+            else:
+                log.error(f"Error connecting to S3: {e}")
+            self.enabled = False
+
+        except Exception as e:
+            log.error(f"Error initializing S3 client: {e}")
+            self.enabled = False
+
+    def get_s3_url(self, key: str) -> str:
+        """Generate S3 URL for uploaded file.
+
+        Uses ``s3_base_url`` (trailing slash ignored) when it is set,
+        otherwise builds the ``<bucket>.s3.<region>.amazonaws.com`` address.
+
+        Args:
+            key: S3 key of the object
+
+        Returns:
+            URL string for the object
+        """
+        if self.s3_base_url:
+            return f"{self.s3_base_url.rstrip('/')}/{key}"
+        return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}"
+
+    @staticmethod
+    def _guess_content_type(local_path: Path) -> str:
+        """Return the MIME type implied by the file name, or JPEG if unknown."""
+        # Function-scope import: mimetypes is stdlib but is not among the
+        # module-level imports of this file.
+        import mimetypes
+
+        guessed, _ = mimetypes.guess_type(local_path.name)
+        return guessed or "image/jpeg"
+
+    def upload_file(self, local_path: Path, s3_key: str) -> Optional[str]:
+        """
+        Upload a file to S3.
+
+        Args:
+            local_path: Path to local file
+            s3_key: S3 key (path) for the uploaded file
+
+        Returns:
+            S3 URL if successful, None if the handler is disabled, the local
+            file is missing, or the upload failed
+        """
+        if not self.enabled:
+            return None
+
+        if not local_path.exists():
+            log.warning(f"Local file does not exist: {local_path}")
+            return None
+
+        # Label the object with its actual type; previously every file was
+        # sent as image/jpeg regardless of extension.
+        extra_args = {"ContentType": self._guess_content_type(local_path)}
+        if self.acl:
+            extra_args["ACL"] = self.acl
+
+        try:
+            # Upload file
+            self.s3_client.upload_file(
+                str(local_path),
+                self.bucket_name,
+                s3_key,
+                ExtraArgs=extra_args
+            )
+
+            # Generate S3 URL
+            s3_url = self.get_s3_url(s3_key)
+
+            # Delete local file if requested; a failed delete is only a
+            # warning because the upload itself has already succeeded.
+            if self.delete_local_after_upload:
+                try:
+                    local_path.unlink()
+                    log.debug(f"Deleted local file: {local_path}")
+                except Exception as e:
+                    log.warning(f"Failed to delete local file {local_path}: {e}")
+
+            log.debug(f"Uploaded {local_path} to s3://{self.bucket_name}/{s3_key}")
+            return s3_url
+
+        except ClientError as e:
+            log.error(f"Failed to upload {local_path} to S3: {e}")
+            return None
+        except Exception as e:
+            log.error(f"Unexpected error uploading {local_path} to S3: {e}")
+            return None
+
+    def upload_image(self, local_path: Path, filename: str, is_profile: bool = False) -> Optional[str]:
+        """
+        Upload an image to S3 with appropriate folder structure.
+
+        The key is ``<prefix><folder>/<filename>``, where the folder is
+        ``profiles_folder`` for profile images and ``reviews_folder``
+        otherwise.
+
+        Args:
+            local_path: Path to local image file
+            filename: Name of the file
+            is_profile: Whether this is a profile image
+
+        Returns:
+            S3 URL if successful, None if failed
+        """
+        if not self.enabled:
+            return None
+
+        # Create S3 key with appropriate folder structure
+        folder = self.profiles_folder if is_profile else self.reviews_folder
+        s3_key = f"{self.prefix}{folder}/{filename}"
+
+        return self.upload_file(local_path, s3_key)
+
+    def upload_images_batch(self, image_files: Dict[str, tuple]) -> Dict[str, str]:
+        """
+        Upload multiple images to S3.
+
+        Images are uploaded one at a time; failed uploads are left out of
+        the result.
+
+        Args:
+            image_files: Dict mapping filename to (local_path, is_profile) tuple
+
+        Returns:
+            Dict mapping filename to S3 URL for successful uploads
+        """
+        if not self.enabled:
+            return {}
+
+        results = {}
+
+        for filename, (local_path, is_profile) in image_files.items():
+            s3_url = self.upload_image(local_path, filename, is_profile)
+            if s3_url:
+                results[filename] = s3_url
+
+        if results:
+            log.info(f"Successfully uploaded {len(results)} images to S3")
+
+        return results