Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

View File

@@ -0,0 +1,349 @@
"""
Data storage modules for Google Maps Reviews Scraper.
"""
import json
import logging
import ssl
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Set
import pymongo
from modules.date_converter import parse_relative_date, DateConverter
from modules.image_handler import ImageHandler
from modules.models import RawReview
from modules.utils import detect_lang, get_current_iso_date
# Configure SSL for MongoDB connection
# NOTE(review): this rebinds the ssl module's default HTTPS context factory to
# the unverified one. That switches off certificate verification for every
# standard-library HTTPS client in the process that relies on that default
# (see PEP 476), not just for MongoDB. MongoDBStorage.connect() already passes
# tlsAllowInvalidCertificates=True to pymongo.MongoClient directly -- confirm
# whether this process-wide override is still required.
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
# Logger
# Logger named "scraper", used by the storage classes in this module.
log = logging.getLogger("scraper")
# Language code handed to parse_relative_date() when merge_review() derives
# "review_date" from the raw date string.
RAW_LANG = "en"
class MongoDBStorage:
    """MongoDB storage handler for Google Maps reviews.

    Reads its connection settings from ``config["mongodb"]`` (``uri``,
    ``database``, ``collection``) and its post-processing switches from the
    top level of ``config``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize MongoDB storage with configuration.

        No connection is opened here; ``connect()`` is called lazily by
        ``fetch_existing_reviews()`` and ``save_reviews()``.
        """
        mongodb_config = config.get("mongodb", {})
        self.uri = mongodb_config.get("uri")
        self.db_name = mongodb_config.get("database")
        self.collection_name = mongodb_config.get("collection")
        self.client = None
        self.collection = None
        self.connected = False
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        self.image_handler = ImageHandler(config) if self.download_images else None

    def connect(self) -> bool:
        """Connect to MongoDB and verify the connection with a ping.

        Returns:
            True when the ping succeeded and ``self.collection`` is ready,
            False otherwise (the error is logged, not raised).
        """
        try:
            # NOTE(review): tlsAllowInvalidCertificates=True disables server
            # certificate validation for every connection; kept as-is for
            # backward compatibility.
            self.client = pymongo.MongoClient(
                self.uri,
                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
                connectTimeoutMS=30000,
                socketTimeoutMS=None,
                connect=True,
                maxPoolSize=50,
            )
            # Test connection
            self.client.admin.command('ping')
            self.collection = self.client[self.db_name][self.collection_name]
            self.connected = True
            log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
            return True
        except Exception as e:
            log.error(f"Failed to connect to MongoDB: {e}")
            # Do not leak a half-initialised client when the ping fails.
            if self.client is not None:
                try:
                    self.client.close()
                except Exception:
                    pass
            self.client = None
            self.collection = None
            self.connected = False
            return False

    def close(self):
        """Close the MongoDB connection and drop the stale handles."""
        if self.client:
            self.client.close()
            self.client = None
            self.collection = None
            self.connected = False

    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
        """Fetch existing reviews from MongoDB.

        Returns:
            Mapping of ``review_id`` to the stored document (without ``_id``).
            Empty when the connection or the query fails.
        """
        if not self.connected and not self.connect():
            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
            return {}
        try:
            reviews = {}
            for doc in self.collection.find({}, {"_id": 0}):
                review_id = doc.get("review_id")
                if review_id:
                    reviews[review_id] = doc
            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
            return reviews
        except Exception as e:
            log.error(f"Error fetching reviews from MongoDB: {e}")
            return {}

    def _prepare_reviews(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return post-processed copies of ``reviews`` ready to be written.

        Each review dict is copied first so that the field removals and
        additions below never leak into the caller's documents.
        """
        processed = {review_id: dict(review) for review_id, review in reviews.items()}
        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed = DateConverter.convert_dates_in_reviews(processed)
        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed = self.image_handler.download_all_images(processed)
        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)
        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)
        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed)} documents")
            for review in processed.values():
                review.update(self.custom_params)
        return processed

    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
        """Save reviews to MongoDB using one bulk upsert keyed on ``review_id``.

        Documents without a ``review_id`` are skipped (and reported) instead
        of aborting the whole batch. Errors are logged, not raised.
        """
        if not reviews:
            log.info("No reviews to save to MongoDB")
            return
        if not self.connected and not self.connect():
            log.warning("Cannot save reviews - MongoDB connection failed")
            return
        try:
            processed_reviews = self._prepare_reviews(reviews)
            operations = []
            skipped = 0
            for review in processed_reviews.values():
                # Exclude _id, MongoDB will generate it on insert
                review.pop("_id", None)
                review_id = review.get("review_id")
                if not review_id:
                    skipped += 1
                    continue
                operations.append(
                    pymongo.UpdateOne(
                        {"review_id": review_id},
                        {"$set": review},
                        upsert=True,
                    )
                )
            if skipped:
                log.warning(f"Skipped {skipped} reviews without a review_id")
            if operations:
                result = self.collection.bulk_write(operations)
                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
        except Exception as e:
            log.error(f"Error saving reviews to MongoDB: {e}")
class JSONStorage:
"""JSON file-based storage handler for Google Maps reviews"""
def __init__(self, config: Dict[str, Any]):
"""Initialize JSON storage with configuration"""
self.json_path = Path(config.get("json_path", "google_reviews.json"))
self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
self.convert_dates = config.get("convert_dates", True)
self.download_images = config.get("download_images", False)
self.store_local_paths = config.get("store_local_paths", True)
self.replace_urls = config.get("replace_urls", False)
self.preserve_original_urls = config.get("preserve_original_urls", True)
self.custom_params = config.get("custom_params", {})
self.image_handler = ImageHandler(config) if self.download_images else None
def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
"""Load reviews from JSON file"""
if not self.json_path.exists():
return {}
try:
data = json.loads(self.json_path.read_text(encoding="utf-8"))
# Index by review_id for fast lookups
return {d.get("review_id", ""): d for d in data if d.get("review_id")}
except json.JSONDecodeError:
log.warning("⚠️ Error reading JSON file, starting with empty data")
return {}
def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
"""Save reviews to JSON file"""
# Create a copy of the docs to avoid modifying the original
processed_docs = {review_id: review.copy() for review_id, review in docs.items()}
# Process reviews before saving
# Convert string dates to datetime objects if enabled
if self.convert_dates:
processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)
# Download and process images if enabled
if self.download_images and self.image_handler:
processed_docs = self.image_handler.download_all_images(processed_docs)
# If not storing local paths, remove them from the documents
if not self.store_local_paths:
for review in processed_docs.values():
if "local_images" in review:
del review["local_images"]
if "local_profile_picture" in review:
del review["local_profile_picture"]
# If not preserving original URLs, remove them from the documents
if self.replace_urls and not self.preserve_original_urls:
for review in processed_docs.values():
if "original_image_urls" in review:
del review["original_image_urls"]
if "original_profile_picture" in review:
del review["original_profile_picture"]
# Add custom parameters to each document
if self.custom_params:
log.info(f"Adding custom parameters to {len(processed_docs)} documents")
for review in processed_docs.values():
for key, value in self.custom_params.items():
review[key] = value
# Convert datetime objects back to strings for JSON serialization
for doc in processed_docs.values():
for key, value in doc.items():
if isinstance(value, datetime):
doc[key] = value.isoformat()
# Write to JSON file
self.json_path.write_text(json.dumps(list(processed_docs.values()),
ensure_ascii=False, indent=2), encoding="utf-8")
def load_seen(self) -> Set[str]:
"""Load set of already seen review IDs"""
return set(
self.seen_ids_path.read_text(encoding="utf-8").splitlines()) if self.seen_ids_path.exists() else set()
def save_seen(self, ids: Set[str]):
"""Save set of already seen review IDs"""
self.seen_ids_path.write_text("\n".join(ids), encoding="utf-8")
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
    """
    Merge a raw review with an existing review document.

    Creates a new document if ``existing`` is None (or empty). Otherwise
    ``existing`` is migrated from the old field names and updated IN PLACE,
    and the same dict is returned.

    Args:
        existing: Previously stored review document, or None.
        raw: Freshly scraped review.

    Returns:
        The merged review document.
    """
    if not existing:
        # Create a new review with the updated field names
        existing = {
            "review_id": raw.id,
            "author": raw.author,
            "rating": raw.rating,
            "description": {},  # renamed from "texts"
            "likes": raw.likes,
            "user_images": list(raw.photos),  # renamed from "photo_urls"
            "author_profile_url": raw.profile,  # renamed from "profile_link"
            "profile_picture": raw.avatar,  # renamed from "avatar_url"
            "owner_responses": {},
            "created_date": get_current_iso_date(),
            "review_date": parse_relative_date(raw.date, RAW_LANG),
        }
    else:
        # Handle existing reviews with old field names - migrate them
        legacy_fields = (
            ("texts", "description"),
            ("photo_urls", "user_images"),
            ("profile_link", "author_profile_url"),
            ("avatar_url", "profile_picture"),
        )
        for old_name, new_name in legacy_fields:
            if old_name in existing and new_name not in existing:
                existing[new_name] = existing.pop(old_name)
        # Add ISO dates if not present
        if "created_date" not in existing:
            existing["created_date"] = get_current_iso_date()
        if "review_date" not in existing:
            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)
        # Remove the 'date' field if it exists
        existing.pop("date", None)

    if raw.text:
        # setdefault: an existing document may carry neither "description"
        # nor the legacy "texts" field.
        existing.setdefault("description", {})[raw.lang] = raw.text
    if not existing.get("rating"):
        existing["rating"] = raw.rating
    # "or 0" also covers a stored likes value of None.
    if raw.likes > (existing.get("likes") or 0):
        existing["likes"] = raw.likes
    # Update the images list: de-duplicate while keeping first-seen order, so
    # the stored list does not get reshuffled on every merge.
    existing["user_images"] = list(
        dict.fromkeys([*(existing.get("user_images") or []), *raw.photos])
    )
    # Update avatar/profile picture: take the new one when none is stored or
    # when the new URL is longer than the stored one.
    current_avatar = existing.get("profile_picture")
    if raw.avatar and (not current_avatar or len(raw.avatar) > len(current_avatar)):
        existing["profile_picture"] = raw.avatar
    if raw.owner_text:
        lang = detect_lang(raw.owner_text)
        # Don't store the date string in owner_responses
        existing.setdefault("owner_responses", {})[lang] = {
            "text": raw.owner_text,
        }
    # Update last_modified timestamp
    existing["last_modified_date"] = get_current_iso_date()
    return existing
def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
    """
    Enhanced merge function that supports translation mode.

    The merge itself is delegated to ``merge_review``, which already stores
    the review text under ``raw.lang`` and the owner response under its
    detected language. When ``append_translations`` is True and the review
    already existed, a ``translation_history`` entry is recorded the first
    time a language version is added.

    Args:
        existing: Previously stored review document, or None.
        raw: Freshly scraped review.
        append_translations: Enable translation-history tracking.

    Returns:
        The merged review document.
    """
    # Snapshot the languages already stored BEFORE merging: merge_review
    # updates ``existing`` in place, so afterwards raw.lang is always present.
    known_languages = set()
    if existing:
        stored_texts = existing.get("description") or existing.get("texts")
        if isinstance(stored_texts, dict):
            known_languages = set(stored_texts)

    # Use the standard merge for the base functionality
    merged = merge_review(existing, raw)

    if append_translations and existing and raw.text and raw.lang not in known_languages:
        # Add metadata about when this translation was added. Recording it
        # only for new languages keeps the history from growing on every
        # re-scrape of a language that is already stored.
        merged.setdefault("translation_history", []).append({
            "language": raw.lang,
            "added_date": get_current_iso_date(),
            "source": "regional_scraping"
        })
    return merged

View File

@@ -0,0 +1,342 @@
"""
Image downloading and handling for Google Maps Reviews Scraper.
"""
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, Any, Set, Tuple
from urllib.parse import urlparse
import requests
from modules.s3_handler import S3Handler
# Logger
# Logger named "scraper", used by ImageHandler for progress and error messages.
log = logging.getLogger("scraper")
class ImageHandler:
    """Handler for downloading and managing review images.

    Downloads review photos and profile pictures into ``image_dir``,
    optionally uploads them to S3, and rewrites the review documents with
    local filenames and/or replacement URLs.
    """

    # Hosts for which download_image() rewrites the "=..." size suffix.
    _GOOGLE_IMAGE_HOSTS = ("googleusercontent.com", "ggpht.com", "gstatic.com")

    def __init__(self, config: Dict[str, Any]):
        """Initialize image handler with configuration."""
        self.image_dir = Path(config.get("image_dir", "review_images"))
        self.max_workers = config.get("download_threads", 4)
        self.store_local_paths = config.get("store_local_paths", True)
        # Image dimension settings
        self.max_width = config.get("max_width", 1200)
        self.max_height = config.get("max_height", 1200)
        # URL replacement settings
        self.replace_urls = config.get("replace_urls", False)
        self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
        self.custom_url_profiles = config.get("custom_url_profiles", "/profiles/")
        self.custom_url_reviews = config.get("custom_url_reviews", "/reviews/")
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        # Subdirectories for different image types
        self.profile_dir = self.image_dir / "profiles"
        self.review_dir = self.image_dir / "reviews"
        # Initialize S3 handler
        self.s3_handler = S3Handler(config)
        self.use_s3 = config.get("use_s3", False)

    def ensure_directories(self):
        """Ensure all image directories exist."""
        self.profile_dir.mkdir(parents=True, exist_ok=True)
        self.review_dir.mkdir(parents=True, exist_ok=True)

    def is_not_custom_url(self, url: str) -> bool:
        """Return True for a non-empty URL that does not start with our custom base."""
        if not url:
            return False
        # Check if the URL starts with our custom URL base - if so, skip it
        if self.custom_url_base and url.startswith(self.custom_url_base):
            return False
        return True

    def get_filename_from_url(self, url: str, is_profile: bool = False) -> str:
        """Derive a local ``<name>.jpg`` filename from an image URL.

        Everything from the first ``=`` on is dropped and ``.jpg`` is
        appended. Returns "" for empty URLs and for our own custom URLs.
        """
        if not url:
            return ""
        # Skip our custom URLs
        if not self.is_not_custom_url(url):
            return ""
        if is_profile:
            # Profile pictures: last path segment, or the one before a
            # trailing slash.
            parts = url.split('/')
            if len(parts) > 1:
                segment = parts[-2] if parts[-1] == '' else parts[-1]
                return f"{segment.split('=')[0]}.jpg"
        # Review images (and profile URLs without any slash)
        return f"{url.split('=')[0].split('/')[-1]}.jpg"

    def get_custom_url(self, filename: str, is_profile: bool = False) -> str:
        """Generate a custom URL for the image, or "" when replacement is off."""
        if not self.replace_urls or not filename:
            return ""
        base_url = self.custom_url_base.rstrip('/')
        path = self.custom_url_profiles if is_profile else self.custom_url_reviews
        path = path.strip('/')
        return f"{base_url}/{path}/{filename}"

    def _build_fetch_url(self, url: str) -> str:
        """Return the URL to actually request for ``url``.

        Google-hosted images get their size suffix replaced by (or, when
        absent, extended with) ``=w<max_width>-h<max_height>-no``; any other
        URL is cut at the first ``=``.
        """
        if any(host in url for host in self._GOOGLE_IMAGE_HOSTS):
            has_size_suffix = '=w' in url or '=h' in url or '=s' in url
            base_url = url.split('=')[0] if has_size_suffix else url
            return f"{base_url}=w{self.max_width}-h{self.max_height}-no"
        return url.split("=")[0]

    def download_image(self, url_info: Tuple[str, bool]) -> Tuple[str, str, str]:
        """
        Download an image from URL and save to disk.

        Args:
            url_info: Tuple of (url, is_profile)

        Returns:
            Tuple of (url, local filename, custom url). ``url`` is always the
            URL that was passed in, so callers can key their mappings on the
            URL stored in the review. Filename and custom URL are "" when
            the image was skipped or the download failed.
        """
        url, is_profile = url_info
        # Skip our custom URLs
        if not self.is_not_custom_url(url):
            return url, "", ""
        try:
            filename = self.get_filename_from_url(url, is_profile)
            if not filename:
                return url, "", ""
            # Choose directory based on image type
            target_dir = self.profile_dir if is_profile else self.review_dir
            filepath = target_dir / filename
            custom_url = self.get_custom_url(filename, is_profile)
            # Skip the download if the file already exists
            if filepath.exists():
                return url, filename, custom_url
            # The rewritten URL is used for the request only; it must not
            # replace ``url`` in the return value.
            fetch_url = self._build_fetch_url(url)
            # Stream into a temporary sibling and rename on success, so a
            # failed download never leaves a truncated file that the
            # exists() check above would accept on the next run.
            partial_path = filepath.with_name(filepath.name + ".part")
            try:
                with requests.get(fetch_url, stream=True, timeout=10) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                partial_path.replace(filepath)
            except Exception:
                partial_path.unlink(missing_ok=True)
                raise
            return url, filename, custom_url
        except Exception as e:
            log.error(f"Error downloading image from {url}: {e}")
            return url, "", ""

    def _collect_urls(self, reviews: Dict[str, Dict[str, Any]]) -> Tuple[Set[str], Set[str]]:
        """Collect the unique non-custom (review image URLs, profile picture URLs)."""
        review_urls: Set[str] = set()
        profile_urls: Set[str] = set()
        for review in reviews.values():
            # Current and separately stored original review images
            for field in ("user_images", "original_image_urls"):
                urls = review.get(field)
                if isinstance(urls, list):
                    review_urls.update(u for u in urls if self.is_not_custom_url(u))
            # Current and separately stored original profile picture
            for field in ("profile_picture", "original_profile_picture"):
                profile_url = review.get(field)
                if profile_url and self.is_not_custom_url(profile_url):
                    profile_urls.add(profile_url)
        return review_urls, profile_urls

    def _upload_to_s3(self, url_to_filename: Dict[str, str], profile_urls: Set[str]) -> Dict[str, str]:
        """Upload the downloaded files and return a source-URL -> S3-URL mapping."""
        files_to_upload = {}
        for url, filename in url_to_filename.items():
            is_profile = url in profile_urls
            local_path = (self.profile_dir if is_profile else self.review_dir) / filename
            if local_path.exists():
                files_to_upload[filename] = (local_path, is_profile)
        s3_results = self.s3_handler.upload_images_batch(files_to_upload)
        return {
            url: s3_results[filename]
            for url, filename in url_to_filename.items()
            if filename in s3_results
        }

    def _update_review(self, review: Dict[str, Any], url_to_filename: Dict[str, str],
                       url_to_custom_url: Dict[str, str], s3_url_mapping: Dict[str, str]) -> None:
        """Write local filenames and replacement URLs into one review, in place."""
        has_user_images = isinstance(review.get("user_images"), list)
        # URLs used for lookups: the stored originals when present, otherwise
        # the current values.
        if isinstance(review.get("original_image_urls"), list):
            lookup_images = review["original_image_urls"]
        elif has_user_images:
            lookup_images = review["user_images"].copy()
        else:
            lookup_images = []
        if review.get("original_profile_picture"):
            lookup_profile = review["original_profile_picture"]
        elif "profile_picture" in review:
            lookup_profile = review["profile_picture"]
        else:
            lookup_profile = ""

        # Process user_images
        if has_user_images:
            # Add local image paths if enabled
            if self.store_local_paths:
                local_images = [url_to_filename.get(u, "") for u in lookup_images
                                if u and self.is_not_custom_url(u)]
                review["local_images"] = [name for name in local_images if name]
            # Replace URLs if enabled
            if self.replace_urls:
                # Store original URLs if needed and not already stored
                if self.preserve_original_urls and "original_image_urls" not in review:
                    review["original_image_urls"] = review["user_images"].copy()
                custom_images = []
                for u in lookup_images:
                    # Prefer S3 URL if available
                    if u in s3_url_mapping:
                        custom_images.append(s3_url_mapping[u])
                    elif u in url_to_custom_url:
                        custom_images.append(url_to_custom_url[u])
                    elif not self.is_not_custom_url(u):  # Already a custom URL
                        custom_images.append(u)
                # Replace with custom URLs if we have them
                if custom_images:
                    review["user_images"] = custom_images

        # Process profile_picture
        if review.get("profile_picture"):
            # Add local profile picture path if enabled
            if self.store_local_paths and lookup_profile in url_to_filename:
                review["local_profile_picture"] = url_to_filename[lookup_profile]
            # Replace profile_picture URL if enabled
            if self.replace_urls:
                # Store original URL if needed and not already stored
                if self.preserve_original_urls and "original_profile_picture" not in review:
                    review["original_profile_picture"] = review["profile_picture"]
                # Replace with S3 URL if available, otherwise use custom URL
                if lookup_profile in s3_url_mapping:
                    review["profile_picture"] = s3_url_mapping[lookup_profile]
                elif lookup_profile in url_to_custom_url:
                    review["profile_picture"] = url_to_custom_url[lookup_profile]
                elif not self.is_not_custom_url(review["profile_picture"]):
                    # If current URL is already a custom URL, keep it
                    pass
                elif lookup_profile:
                    # If we don't have a custom URL but have a filename, generate one
                    filename = url_to_filename.get(lookup_profile, "")
                    if filename:
                        custom_url = self.get_custom_url(filename, True)
                        if custom_url:
                            review["profile_picture"] = custom_url

    def download_all_images(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Download all images (review images and profile pictures) for all reviews.

        Args:
            reviews: Dictionary of review documents (updated in place)

        Returns:
            The same ``reviews`` dict, with local image paths and custom URLs
            filled in according to the configuration.
        """
        self.ensure_directories()
        # Collect all unique image URLs, excluding our own custom URLs
        review_urls, profile_urls = self._collect_urls(reviews)
        # Prepare download tasks with URL type info
        download_tasks = [(url, False) for url in review_urls] + [(url, True) for url in profile_urls]
        if not download_tasks:
            log.info("No images to download")
            return reviews
        log.info(
            f"Downloading {len(download_tasks)} images ({len(profile_urls)} profiles, {len(review_urls)} review images)...")
        # Source URL -> local filename / custom URL
        url_to_filename = {}
        url_to_custom_url = {}
        # Download images in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for url, filename, custom_url in executor.map(self.download_image, download_tasks):
                if filename:
                    url_to_filename[url] = filename
                    if custom_url:
                        url_to_custom_url[url] = custom_url
        # Upload to S3 if enabled
        s3_url_mapping = {}
        if self.use_s3 and self.s3_handler.enabled and url_to_filename:
            log.info("Uploading images to S3...")
            s3_url_mapping = self._upload_to_s3(url_to_filename, profile_urls)
        # Update review documents
        for review in reviews.values():
            self._update_review(review, url_to_filename, url_to_custom_url, s3_url_mapping)
        log.info(f"Downloaded {len(url_to_filename)} images")
        if self.use_s3 and s3_url_mapping:
            log.info(f"Uploaded {len(s3_url_mapping)} images to S3")
        if self.replace_urls:
            total_replaced = len(s3_url_mapping) + len(url_to_custom_url)
            log.info(f"Replaced URLs for {total_replaced} images")
        return reviews

View File

@@ -0,0 +1,177 @@
"""
S3 upload handler for Google Maps Reviews Scraper.
"""
import logging
import os
from pathlib import Path
from typing import Dict, Any, Optional
import boto3
from botocore.exceptions import ClientError
# Logger named "scraper", used by S3Handler for status and error messages.
log = logging.getLogger("scraper")
class S3Handler:
    """Handler for uploading images to AWS S3.

    Settings are read from ``config["s3"]``; the handler is only active when
    ``config["use_s3"]`` is true and the bucket could be reached.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize S3 handler with configuration.

        All attributes are set even when S3 is disabled, so every method can
        be called safely on a disabled handler. On any configuration or
        connection problem the handler logs the error and disables itself
        instead of raising.
        """
        self.enabled = config.get("use_s3", False)
        s3_config = config.get("s3") or {}
        self.aws_access_key_id = s3_config.get("aws_access_key_id", "")
        self.aws_secret_access_key = s3_config.get("aws_secret_access_key", "")
        self.region_name = s3_config.get("region_name", "us-east-1")
        self.bucket_name = s3_config.get("bucket_name", "")
        # Exactly one trailing slash, or no prefix at all: an empty prefix
        # must not turn into "/" and produce keys with a leading slash.
        trimmed_prefix = (s3_config.get("prefix", "reviews/") or "").rstrip("/")
        self.prefix = f"{trimmed_prefix}/" if trimmed_prefix else ""
        self.profiles_folder = (s3_config.get("profiles_folder", "profiles/") or "").strip("/")
        self.reviews_folder = (s3_config.get("reviews_folder", "reviews/") or "").strip("/")
        self.delete_local_after_upload = s3_config.get("delete_local_after_upload", False)
        self.s3_base_url = s3_config.get("s3_base_url", "")
        # Canned ACL applied to uploads. Set "acl" to "" or None for buckets
        # that have ACLs disabled, which reject uploads carrying an ACL.
        self.acl = s3_config.get("acl", "public-read")
        self.s3_client = None
        if not self.enabled:
            return
        # Validate required settings
        if not self.bucket_name:
            log.error("S3 bucket_name is required when use_s3 is enabled")
            self.enabled = False
            return
        # Initialize S3 client
        try:
            session_kwargs = {"region_name": self.region_name}
            # Use credentials if provided, otherwise rely on environment/IAM
            if self.aws_access_key_id and self.aws_secret_access_key:
                session_kwargs.update({
                    "aws_access_key_id": self.aws_access_key_id,
                    "aws_secret_access_key": self.aws_secret_access_key
                })
            self.s3_client = boto3.client("s3", **session_kwargs)
            # Test connection by checking if bucket exists
            self.s3_client.head_bucket(Bucket=self.bucket_name)
            log.info(f"S3 handler initialized successfully for bucket: {self.bucket_name}")
        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code', '')
            if error_code == '404':
                log.error(f"S3 bucket '{self.bucket_name}' not found")
            elif error_code == '403':
                log.error(f"Access denied to S3 bucket '{self.bucket_name}'")
            else:
                log.error(f"Error connecting to S3: {e}")
            self.enabled = False
        except Exception as e:
            log.error(f"Error initializing S3 client: {e}")
            self.enabled = False

    def get_s3_url(self, key: str) -> str:
        """Generate the public URL for ``key``.

        Uses ``s3_base_url`` when configured, otherwise the regional
        virtual-hosted-style bucket URL.
        """
        if self.s3_base_url:
            return f"{self.s3_base_url.rstrip('/')}/{key}"
        return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}"

    def _build_key(self, filename: str, is_profile: bool) -> str:
        """Build the S3 key ``<prefix><folder>/<filename>``, skipping an empty folder."""
        folder = self.profiles_folder if is_profile else self.reviews_folder
        return self.prefix + "/".join(part for part in (folder, filename) if part)

    def upload_file(self, local_path: Path, s3_key: str) -> Optional[str]:
        """
        Upload a file to S3.

        Args:
            local_path: Path to local file
            s3_key: S3 key (path) for the uploaded file

        Returns:
            S3 URL if successful, None if failed
        """
        if not self.enabled:
            return None
        if not local_path.exists():
            log.warning(f"Local file does not exist: {local_path}")
            return None
        extra_args = {'ContentType': 'image/jpeg'}
        if self.acl:
            # Default 'public-read' makes images publicly readable
            extra_args['ACL'] = self.acl
        try:
            # Upload file
            self.s3_client.upload_file(
                str(local_path),
                self.bucket_name,
                s3_key,
                ExtraArgs=extra_args,
            )
            # Generate S3 URL
            s3_url = self.get_s3_url(s3_key)
            # Delete local file if requested
            if self.delete_local_after_upload:
                try:
                    local_path.unlink()
                    log.debug(f"Deleted local file: {local_path}")
                except Exception as e:
                    log.warning(f"Failed to delete local file {local_path}: {e}")
            log.debug(f"Uploaded {local_path} to s3://{self.bucket_name}/{s3_key}")
            return s3_url
        except ClientError as e:
            log.error(f"Failed to upload {local_path} to S3: {e}")
            return None
        except Exception as e:
            log.error(f"Unexpected error uploading {local_path} to S3: {e}")
            return None

    def upload_image(self, local_path: Path, filename: str, is_profile: bool = False) -> Optional[str]:
        """
        Upload an image to S3 with appropriate folder structure.

        Args:
            local_path: Path to local image file
            filename: Name of the file
            is_profile: Whether this is a profile image

        Returns:
            S3 URL if successful, None if failed
        """
        if not self.enabled:
            return None
        return self.upload_file(local_path, self._build_key(filename, is_profile))

    def upload_images_batch(self, image_files: Dict[str, tuple]) -> Dict[str, str]:
        """
        Upload multiple images to S3.

        Args:
            image_files: Dict mapping filename to (local_path, is_profile) tuple

        Returns:
            Dict mapping filename to S3 URL for successful uploads
        """
        if not self.enabled:
            return {}
        results = {}
        for filename, (local_path, is_profile) in image_files.items():
            s3_url = self.upload_image(local_path, filename, is_profile)
            if s3_url:
                results[filename] = s3_url
        if results:
            log.info(f"Successfully uploaded {len(results)} images to S3")
        return results