Phase 0: Project restructure to ReviewIQ platform architecture
New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
349
modules/_legacy/data_storage.py
Normal file
349
modules/_legacy/data_storage.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Data storage modules for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Set
|
||||
|
||||
import pymongo
|
||||
|
||||
from modules.date_converter import parse_relative_date, DateConverter
|
||||
from modules.image_handler import ImageHandler
|
||||
from modules.models import RawReview
|
||||
from modules.utils import detect_lang, get_current_iso_date
|
||||
|
||||
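# Disable certificate verification for the ssl module's default HTTPS context (process-wide).
# NOTE(review): this affects stdlib HTTPS clients that rely on that default, not just MongoDB;
# MongoClient is configured separately via tlsAllowInvalidCertificates - confirm this is still needed.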
|
||||
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
RAW_LANG = "en"
|
||||
|
||||
|
||||
class MongoDBStorage:
    """MongoDB storage handler for Google Maps reviews.

    Connection settings are read from ``config["mongodb"]`` (``uri``,
    ``database``, ``collection``); processing switches are read from the top
    level of ``config``.  ``fetch_existing_reviews`` and ``save_reviews`` call
    ``connect`` themselves when no connection is open yet.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize MongoDB storage with configuration.

        No connection is opened here.  An ``ImageHandler`` is only created
        when ``download_images`` is enabled.
        """
        mongodb_config = config.get("mongodb", {})
        self.uri = mongodb_config.get("uri")
        self.db_name = mongodb_config.get("database")
        self.collection_name = mongodb_config.get("collection")
        self.client = None
        self.collection = None
        self.connected = False
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        self.image_handler = ImageHandler(config) if self.download_images else None

    def connect(self) -> bool:
        """Connect to MongoDB and select the configured collection.

        Returns:
            True when the server answered a ``ping``; False otherwise.  Errors
            are logged, never raised.
        """
        client = None
        try:
            # Use the correct TLS parameters for newer PyMongo versions
            client = pymongo.MongoClient(
                self.uri,
                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
                connectTimeoutMS=30000,
                socketTimeoutMS=None,
                connect=True,
                maxPoolSize=50,
            )
            # Test connection
            client.admin.command('ping')
            collection = client[self.db_name][self.collection_name]
        except Exception as e:
            log.error(f"Failed to connect to MongoDB: {e}")
            # Do not keep a half-initialised client (and its sockets) around.
            if client is not None:
                client.close()
            self.client = None
            self.connected = False
            return False

        self.client = client
        self.collection = collection
        self.connected = True
        log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
        return True

    def close(self):
        """Close MongoDB connection"""
        if self.client:
            self.client.close()
            self.connected = False

    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
        """Fetch existing reviews from MongoDB.

        Returns:
            Mapping of ``review_id`` to the stored document (without ``_id``).
            Documents lacking a ``review_id`` are skipped.  An empty dict is
            returned when the connection or the query fails.
        """
        if not self.connected and not self.connect():
            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
            return {}

        try:
            reviews = {}
            for doc in self.collection.find({}, {"_id": 0}):
                review_id = doc.get("review_id")
                if review_id:
                    reviews[review_id] = doc
            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
            return reviews
        except Exception as e:
            log.error(f"Error fetching reviews from MongoDB: {e}")
            return {}

    def _prepare_documents(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return processed per-review copies of ``reviews`` ready for upserting.

        Each review dict is copied first so that the key removals and custom
        parameters applied below never leak into the caller's documents.
        """
        processed = {review_id: dict(review) for review_id, review in reviews.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed = DateConverter.convert_dates_in_reviews(processed)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed = self.image_handler.download_all_images(processed)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed)} documents")
            for review in processed.values():
                review.update(self.custom_params)

        # Exclude _id for upserts, MongoDB will generate it
        for review in processed.values():
            review.pop("_id", None)

        return processed

    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
        """Save reviews to MongoDB using one bulk upsert keyed on ``review_id``.

        The input documents are not modified.  Reviews without a ``review_id``
        are skipped with a warning instead of aborting the whole batch.
        Connection and write errors are logged, not raised.
        """
        if not reviews:
            log.info("No reviews to save to MongoDB")
            return

        if not self.connected and not self.connect():
            log.warning("Cannot save reviews - MongoDB connection failed")
            return

        try:
            processed_reviews = self._prepare_documents(reviews)

            operations = []
            for review in processed_reviews.values():
                if "review_id" not in review:
                    log.warning("Skipping review without review_id")
                    continue
                operations.append(
                    pymongo.UpdateOne(
                        {"review_id": review["review_id"]},
                        {"$set": review},
                        upsert=True,
                    )
                )

            if operations:
                result = self.collection.bulk_write(operations)
                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
        except Exception as e:
            log.error(f"Error saving reviews to MongoDB: {e}")
|
||||
|
||||
|
||||
class JSONStorage:
    """JSON file-based storage handler for Google Maps reviews.

    Reviews are stored as a JSON list in ``json_path``; the ids of reviews
    already seen are stored one per line in ``seen_ids_path``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize JSON storage with configuration.

        An ``ImageHandler`` is only created when ``download_images`` is enabled.
        """
        self.json_path = Path(config.get("json_path", "google_reviews.json"))
        self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        self.image_handler = ImageHandler(config) if self.download_images else None

    @staticmethod
    def _write_text_atomically(path: Path, text: str) -> None:
        """Write ``text`` to ``path`` via a sibling temp file and a rename.

        An interrupted write then leaves the previous file intact instead of
        a truncated one.
        """
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)

    def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
        """Load reviews from the JSON file, indexed by ``review_id``.

        Returns an empty dict when the file does not exist or is not valid
        JSON.  Entries that are not objects or have no ``review_id`` are
        ignored.
        """
        if not self.json_path.exists():
            return {}
        try:
            data = json.loads(self.json_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}
        # Index by review_id for fast lookups
        return {
            d["review_id"]: d
            for d in data
            if isinstance(d, dict) and d.get("review_id")
        }

    def _prepare_documents(self, docs: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return processed per-review copies of ``docs``; the input is not modified."""
        processed = {review_id: review.copy() for review_id, review in docs.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed = DateConverter.convert_dates_in_reviews(processed)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed = self.image_handler.download_all_images(processed)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed)} documents")
            for review in processed.values():
                review.update(self.custom_params)

        return processed

    def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
        """Save reviews to the JSON file as a list of documents.

        Top-level ``datetime`` values are written as ISO-8601 strings.  The
        file is replaced atomically.
        """
        processed_docs = self._prepare_documents(docs)

        # Convert datetime objects back to strings for JSON serialization.
        # Only existing keys are reassigned, so iterating items() is safe.
        for doc in processed_docs.values():
            for key, value in doc.items():
                if isinstance(value, datetime):
                    doc[key] = value.isoformat()

        payload = json.dumps(list(processed_docs.values()), ensure_ascii=False, indent=2)
        self._write_text_atomically(self.json_path, payload)

    def load_seen(self) -> Set[str]:
        """Load set of already seen review IDs (empty when the file is missing)."""
        if not self.seen_ids_path.exists():
            return set()
        return set(self.seen_ids_path.read_text(encoding="utf-8").splitlines())

    def save_seen(self, ids: Set[str]):
        """Save set of already seen review IDs, one per line in sorted order."""
        # Sorting keeps the file stable between runs; set order is arbitrary.
        self.seen_ids_path.write_text("\n".join(sorted(ids)), encoding="utf-8")
|
||||
|
||||
|
||||
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
    """
    Merge a raw review with an existing review document.

    Creates a new document if ``existing`` is None or empty; otherwise
    ``existing`` is updated in place (legacy field names are migrated first)
    and returned.

    Args:
        existing: Previously stored review document, or None.
        raw: Freshly scraped review.

    Returns:
        The merged review document.
    """
    if not existing:
        # Create a new review with the updated field names
        existing = {
            "review_id": raw.id,
            "author": raw.author,
            "rating": raw.rating,
            "description": {},  # renamed from "texts"
            "likes": raw.likes,
            "user_images": list(raw.photos),  # renamed from "photo_urls"
            "author_profile_url": raw.profile,  # renamed from "profile_link"
            "profile_picture": raw.avatar,  # renamed from "avatar_url"
            "owner_responses": {},
            "created_date": get_current_iso_date(),
            "review_date": parse_relative_date(raw.date, RAW_LANG),
        }
    else:
        # Handle existing reviews with old field names - migrate them
        renamed_fields = (
            ("texts", "description"),
            ("photo_urls", "user_images"),
            ("profile_link", "author_profile_url"),
            ("avatar_url", "profile_picture"),
        )
        for old_key, new_key in renamed_fields:
            if old_key in existing and new_key not in existing:
                existing[new_key] = existing.pop(old_key)

        # Add ISO dates if not present
        if "created_date" not in existing:
            existing["created_date"] = get_current_iso_date()
        if "review_date" not in existing:
            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)

        # Remove the 'date' field if it exists
        existing.pop("date", None)

    if raw.text:
        # setdefault: legacy documents may have neither "texts" nor "description"
        existing.setdefault("description", {})[raw.lang] = raw.text

    if not existing.get("rating"):
        existing["rating"] = raw.rating

    # "or 0" also covers a stored None, which cannot be compared with an int
    if raw.likes > (existing.get("likes") or 0):
        existing["likes"] = raw.likes

    # Update the images list: de-duplicate while keeping first-seen order, so
    # repeated merges do not reshuffle the stored list.
    existing["user_images"] = list(
        dict.fromkeys([*(existing.get("user_images") or []), *raw.photos])
    )

    # Update avatar/profile picture: take the scraped one when nothing is
    # stored yet or when the scraped URL is longer than the stored one.
    current_picture = existing.get("profile_picture")
    if raw.avatar and (not current_picture or len(raw.avatar) > len(current_picture)):
        existing["profile_picture"] = raw.avatar

    if raw.owner_text:
        lang = detect_lang(raw.owner_text)
        # Don't store the date string in owner_responses
        existing.setdefault("owner_responses", {})[lang] = {
            "text": raw.owner_text,
        }

    # Update last_modified timestamp
    existing["last_modified_date"] = get_current_iso_date()

    return existing
|
||||
|
||||
|
||||
def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
    """
    Enhanced merge function that supports translation mode.

    Delegates to ``merge_review`` for the base merge.  When
    ``append_translations`` is True and the review already existed, the text
    in ``raw.lang`` and any owner response are stored as additional language
    versions, and ``translation_history`` records the language the first
    time it is added to the review.

    Args:
        existing: Previously stored review document, or None.
        raw: Freshly scraped review.
        append_translations: Enable translation mode.

    Returns:
        The merged review document.
    """
    # Capture the pre-merge state: merge_review mutates `existing` in place,
    # so afterwards the new language would always look "already stored".
    had_existing = bool(existing)
    previous_texts = None
    if had_existing:
        # Legacy documents keep their texts under "texts" until migrated.
        previous_texts = existing.get("description", existing.get("texts"))
    language_already_stored = isinstance(previous_texts, dict) and raw.lang in previous_texts

    # Use the standard merge for the base functionality
    merged = merge_review(existing, raw)

    if append_translations and had_existing and raw.text:
        # In translation mode, always add the new language version
        # even if we already have content for this review
        merged.setdefault("description", {})[raw.lang] = raw.text

        # Also merge owner responses in translation mode
        if raw.owner_text:
            owner_lang = detect_lang(raw.owner_text)
            merged.setdefault("owner_responses", {})[owner_lang] = {
                "text": raw.owner_text,
            }

        # Add metadata about when this translation was added - only when the
        # language is new, otherwise every re-scrape would grow the history.
        if not language_already_stored:
            merged.setdefault("translation_history", []).append({
                "language": raw.lang,
                "added_date": get_current_iso_date(),
                "source": "regional_scraping"
            })

    return merged
|
||||
342
modules/_legacy/image_handler.py
Normal file
342
modules/_legacy/image_handler.py
Normal file
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Image downloading and handling for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Set, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from modules.s3_handler import S3Handler
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class ImageHandler:
    """Handler for downloading and managing review images.

    Images are stored under ``image_dir/profiles`` and ``image_dir/reviews``.
    Depending on configuration, review documents receive local file names,
    custom (self-hosted) URLs and/or S3 URLs in place of the scraped URLs.
    """

    # Hosts whose image URLs take "=w<width>-h<height>" style size suffixes.
    _GOOGLE_IMAGE_HOSTS = ("googleusercontent.com", "ggpht.com", "gstatic.com")

    def __init__(self, config: Dict[str, Any]):
        """Initialize image handler with configuration"""
        self.image_dir = Path(config.get("image_dir", "review_images"))
        self.max_workers = config.get("download_threads", 4)
        self.store_local_paths = config.get("store_local_paths", True)

        # Image dimension settings
        self.max_width = config.get("max_width", 1200)
        self.max_height = config.get("max_height", 1200)

        # URL replacement settings
        self.replace_urls = config.get("replace_urls", False)
        self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
        self.custom_url_profiles = config.get("custom_url_profiles", "/profiles/")
        self.custom_url_reviews = config.get("custom_url_reviews", "/reviews/")
        self.preserve_original_urls = config.get("preserve_original_urls", True)

        # Subdirectories for different image types
        self.profile_dir = self.image_dir / "profiles"
        self.review_dir = self.image_dir / "reviews"

        # Initialize S3 handler
        self.s3_handler = S3Handler(config)
        self.use_s3 = config.get("use_s3", False)

    def ensure_directories(self):
        """Ensure all image directories exist"""
        self.profile_dir.mkdir(parents=True, exist_ok=True)
        self.review_dir.mkdir(parents=True, exist_ok=True)

    def is_not_custom_url(self, url: str) -> bool:
        """Return True for a non-empty URL that does not start with ``custom_url_base``."""
        if not url:
            return False
        return not (self.custom_url_base and url.startswith(self.custom_url_base))

    def get_filename_from_url(self, url: str, is_profile: bool = False) -> str:
        """Derive a local ``.jpg`` file name from an image URL.

        The name is the last path segment with everything from the first
        ``=`` (size parameters) removed.  For profile pictures a trailing
        ``/`` is tolerated.

        Returns:
            The file name, or "" for empty URLs, our own custom URLs, and
            URLs that yield no usable name.
        """
        if not url or not self.is_not_custom_url(url):
            return ""

        stem = None
        if is_profile:
            # Extract unique identifier from profile URL
            parts = url.split('/')
            if len(parts) > 1:
                stem = (parts[-1] or parts[-2]).split('=')[0]
        if stem is None:
            # Review images (and profile URLs without any '/')
            stem = url.split('=')[0].split('/')[-1]

        return f"{stem}.jpg" if stem else ""

    def get_custom_url(self, filename: str, is_profile: bool = False) -> str:
        """Generate a custom URL for the image ("" when replacement is off or no filename)."""
        if not self.replace_urls or not filename:
            return ""

        base_url = self.custom_url_base.rstrip('/')
        path = self.custom_url_profiles if is_profile else self.custom_url_reviews
        path = path.strip('/')
        return f"{base_url}/{path}/{filename}"

    def _build_download_url(self, url: str) -> str:
        """Return the URL to fetch for ``url`` with size parameters adjusted."""
        if any(host in url for host in self._GOOGLE_IMAGE_HOSTS):
            # Rebuild with configurable resolution parameters (using -no suffix)
            size_suffix = f"=w{self.max_width}-h{self.max_height}-no"
            if '=w' in url or '=h' in url or '=s' in url:
                # Replace the existing size parameters
                return url.split('=')[0] + size_suffix
            # No existing size parameters, just append them
            return url + size_suffix
        # For non-Google URLs, just remove parameters after =
        return url.split("=")[0]

    def download_image(self, url_info: Tuple[str, bool]) -> Tuple[str, str, str]:
        """
        Download an image from URL and save to disk.

        The file is streamed to a ``.part`` file and renamed on success, so
        an interrupted download is never mistaken for a finished one.

        Args:
            url_info: Tuple of (url, is_profile)

        Returns:
            Tuple of (url, local filename, custom url).  ``url`` is always
            the URL that was passed in, so callers can key lookups on it;
            filename and custom url are "" when nothing was stored.
        """
        url, is_profile = url_info

        # Skip our custom URLs
        if not self.is_not_custom_url(url):
            return url, "", ""

        download_url = url
        tmp_path = None
        try:
            filename = self.get_filename_from_url(url, is_profile)
            if not filename:
                return url, "", ""

            # Choose directory based on image type
            target_dir = self.profile_dir if is_profile else self.review_dir
            filepath = target_dir / filename

            # Skip the download if the file already exists
            if not filepath.exists():
                download_url = self._build_download_url(url)
                tmp_path = filepath.with_name(filename + ".part")
                with requests.get(download_url, stream=True, timeout=10) as response:
                    response.raise_for_status()
                    with open(tmp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                tmp_path.replace(filepath)

            return url, filename, self.get_custom_url(filename, is_profile)

        except Exception as e:
            log.error(f"Error downloading image from {download_url}: {e}")
            if tmp_path is not None:
                try:
                    tmp_path.unlink(missing_ok=True)
                except OSError:
                    # Best-effort cleanup; the download failure is already logged.
                    pass
            return url, "", ""

    def _collect_urls(self, reviews: Dict[str, Dict[str, Any]]) -> Tuple[Set[str], Set[str]]:
        """Collect unique (review image URLs, profile picture URLs), excluding custom URLs."""
        review_urls: Set[str] = set()
        profile_urls: Set[str] = set()

        for review in reviews.values():
            # Review images, plus original URLs if stored separately
            for key in ("user_images", "original_image_urls"):
                urls = review.get(key)
                if isinstance(urls, list):
                    review_urls.update(u for u in urls if self.is_not_custom_url(u))

            # Profile picture, plus the original URL if stored separately
            for key in ("profile_picture", "original_profile_picture"):
                profile_url = review.get(key)
                if profile_url and self.is_not_custom_url(profile_url):
                    profile_urls.add(profile_url)

        return review_urls, profile_urls

    def _upload_to_s3(self, url_to_filename: Dict[str, str], profile_urls: Set[str]) -> Dict[str, str]:
        """Upload downloaded files to S3 and return a mapping of original URL to S3 URL."""
        log.info("Uploading images to S3...")

        # Prepare files for S3 upload
        files_to_upload = {}
        for url, filename in url_to_filename.items():
            is_profile = url in profile_urls
            local_path = (self.profile_dir if is_profile else self.review_dir) / filename
            if local_path.exists():
                files_to_upload[filename] = (local_path, is_profile)

        s3_results = self.s3_handler.upload_images_batch(files_to_upload)

        return {
            url: s3_results[filename]
            for url, filename in url_to_filename.items()
            if filename in s3_results
        }

    def _apply_to_review(self, review: Dict[str, Any], url_to_filename: Dict[str, str],
                         url_to_custom_url: Dict[str, str], s3_url_mapping: Dict[str, str]) -> None:
        """Write local paths and replacement URLs into one review document (in place)."""
        # Lookups use the original URLs when they were stored on an earlier
        # run, otherwise the current values.
        if isinstance(review.get("original_image_urls"), list):
            user_images_original = review["original_image_urls"]
        elif isinstance(review.get("user_images"), list):
            user_images_original = review["user_images"].copy()
        else:
            user_images_original = []

        profile_picture_original = (
            review.get("original_profile_picture") or review.get("profile_picture", "")
        )

        # Process user_images
        if isinstance(review.get("user_images"), list):
            # Add local image paths if enabled
            if self.store_local_paths:
                local_images = [url_to_filename.get(url, "") for url in user_images_original
                                if url and self.is_not_custom_url(url)]
                review["local_images"] = [img for img in local_images if img]

            # Replace URLs if enabled
            if self.replace_urls:
                # Store original URLs if needed and not already stored
                if self.preserve_original_urls and "original_image_urls" not in review:
                    review["original_image_urls"] = review["user_images"].copy()

                custom_images = []
                for url in user_images_original:
                    # Prefer S3 URL if available
                    if url in s3_url_mapping:
                        custom_images.append(s3_url_mapping[url])
                    elif url in url_to_custom_url:
                        custom_images.append(url_to_custom_url[url])
                    elif not self.is_not_custom_url(url):  # Already a custom URL
                        custom_images.append(url)

                # Replace with custom URLs if we have them
                if custom_images:
                    review["user_images"] = custom_images

        # Process profile_picture
        if review.get("profile_picture"):
            # Add local profile picture path if enabled
            if self.store_local_paths and profile_picture_original in url_to_filename:
                review["local_profile_picture"] = url_to_filename[profile_picture_original]

            # Replace profile_picture URL if enabled
            if self.replace_urls:
                # Store original URL if needed and not already stored
                if self.preserve_original_urls and "original_profile_picture" not in review:
                    review["original_profile_picture"] = review["profile_picture"]

                # Prefer the S3 URL, then the custom URL; otherwise keep the
                # current value (e.g. it already is a custom URL).
                replacement = (s3_url_mapping.get(profile_picture_original)
                               or url_to_custom_url.get(profile_picture_original))
                if replacement:
                    review["profile_picture"] = replacement

    def download_all_images(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Download all images (review images and profile pictures) for all reviews.

        Args:
            reviews: Dictionary of review documents (updated in place)

        Returns:
            The same ``reviews`` mapping, with local image paths and
            custom/S3 URLs added according to the configuration
        """
        self.ensure_directories()

        review_urls, profile_urls = self._collect_urls(reviews)

        # Prepare download tasks with URL type info
        download_tasks = [(url, False) for url in review_urls] + [(url, True) for url in profile_urls]

        if not download_tasks:
            log.info("No images to download")
            return reviews

        log.info(
            f"Downloading {len(download_tasks)} images ({len(profile_urls)} profiles, {len(review_urls)} review images)...")

        # Original URL -> local filename / custom URL
        url_to_filename = {}
        url_to_custom_url = {}

        # Download images in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for url, filename, custom_url in executor.map(self.download_image, download_tasks):
                if filename:
                    url_to_filename[url] = filename
                if custom_url:
                    url_to_custom_url[url] = custom_url

        # Upload to S3 if enabled
        s3_url_mapping = {}
        if self.use_s3 and self.s3_handler.enabled and url_to_filename:
            s3_url_mapping = self._upload_to_s3(url_to_filename, profile_urls)

        # Update review documents
        for review in reviews.values():
            self._apply_to_review(review, url_to_filename, url_to_custom_url, s3_url_mapping)

        log.info(f"Downloaded {len(url_to_filename)} images")
        if self.use_s3 and s3_url_mapping:
            log.info(f"Uploaded {len(s3_url_mapping)} images to S3")
        if self.replace_urls:
            total_replaced = len(s3_url_mapping) + len(url_to_custom_url)
            log.info(f"Replaced URLs for {total_replaced} images")

        return reviews
|
||||
177
modules/_legacy/s3_handler.py
Normal file
177
modules/_legacy/s3_handler.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
S3 upload handler for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class S3Handler:
    """Handler for uploading images to AWS S3.

    Settings are read from ``config["s3"]``; the handler is active only when
    ``config["use_s3"]`` is true, a bucket name is configured and the bucket
    can be reached.  Every upload method is a no-op while ``enabled`` is False.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize S3 handler with configuration.

        All attributes are always set, so a disabled handler can still be
        inspected safely.  When enabled, a boto3 client is created and the
        bucket is probed with ``head_bucket``; any failure is logged and
        disables the handler instead of raising.
        """
        self.enabled = config.get("use_s3", False)

        s3_config = config.get("s3") or {}

        self.aws_access_key_id = s3_config.get("aws_access_key_id", "")
        self.aws_secret_access_key = s3_config.get("aws_secret_access_key", "")
        self.region_name = s3_config.get("region_name", "us-east-1")
        self.bucket_name = s3_config.get("bucket_name", "")
        # Normalise to "<prefix>/"; an empty prefix stays empty so keys do not
        # start with a slash.
        prefix = s3_config.get("prefix", "reviews/").rstrip("/")
        self.prefix = f"{prefix}/" if prefix else ""
        self.profiles_folder = s3_config.get("profiles_folder", "profiles/").strip("/")
        self.reviews_folder = s3_config.get("reviews_folder", "reviews/").strip("/")
        self.delete_local_after_upload = s3_config.get("delete_local_after_upload", False)
        self.s3_base_url = s3_config.get("s3_base_url", "")
        # Canned ACL applied to uploads; set to "" or None for buckets that
        # have ACLs disabled.
        self.acl = s3_config.get("acl", "public-read")
        self.s3_client = None

        if not self.enabled:
            return

        # Validate required settings
        if not self.bucket_name:
            log.error("S3 bucket_name is required when use_s3 is enabled")
            self.enabled = False
            return

        # Initialize S3 client
        try:
            session_kwargs = {"region_name": self.region_name}

            # Use credentials if provided, otherwise rely on environment/IAM
            if self.aws_access_key_id and self.aws_secret_access_key:
                session_kwargs.update({
                    "aws_access_key_id": self.aws_access_key_id,
                    "aws_secret_access_key": self.aws_secret_access_key
                })

            self.s3_client = boto3.client("s3", **session_kwargs)

            # Test connection by checking if bucket exists
            self.s3_client.head_bucket(Bucket=self.bucket_name)
            log.info(f"S3 handler initialized successfully for bucket: {self.bucket_name}")

        except ClientError as e:
            error_code = e.response.get('Error', {}).get('Code', '')
            if error_code == '404':
                log.error(f"S3 bucket '{self.bucket_name}' not found")
            elif error_code == '403':
                log.error(f"Access denied to S3 bucket '{self.bucket_name}'")
            else:
                log.error(f"Error connecting to S3: {e}")
            self.enabled = False

        except Exception as e:
            log.error(f"Error initializing S3 client: {e}")
            self.enabled = False

    def get_s3_url(self, key: str) -> str:
        """Generate the public URL for ``key``.

        Uses ``s3_base_url`` when configured, otherwise the regional
        virtual-hosted S3 URL of the bucket.
        """
        if self.s3_base_url:
            return f"{self.s3_base_url.rstrip('/')}/{key}"
        return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}"

    def upload_file(self, local_path: Path, s3_key: str) -> Optional[str]:
        """
        Upload a file to S3.

        Args:
            local_path: Path to local file
            s3_key: S3 key (path) for the uploaded file

        Returns:
            S3 URL if successful, None if failed
        """
        if not self.enabled:
            return None

        if not local_path.exists():
            log.warning(f"Local file does not exist: {local_path}")
            return None

        extra_args = {'ContentType': 'image/jpeg'}
        if self.acl:
            extra_args['ACL'] = self.acl  # default makes images publicly readable

        try:
            self.s3_client.upload_file(
                str(local_path),
                self.bucket_name,
                s3_key,
                ExtraArgs=extra_args,
            )

            s3_url = self.get_s3_url(s3_key)

            # Delete local file if requested; failing to delete does not
            # fail the upload.
            if self.delete_local_after_upload:
                try:
                    local_path.unlink()
                    log.debug(f"Deleted local file: {local_path}")
                except Exception as e:
                    log.warning(f"Failed to delete local file {local_path}: {e}")

            log.debug(f"Uploaded {local_path} to s3://{self.bucket_name}/{s3_key}")
            return s3_url

        except ClientError as e:
            log.error(f"Failed to upload {local_path} to S3: {e}")
            return None
        except Exception as e:
            log.error(f"Unexpected error uploading {local_path} to S3: {e}")
            return None

    def upload_image(self, local_path: Path, filename: str, is_profile: bool = False) -> Optional[str]:
        """
        Upload an image to S3 with appropriate folder structure.

        The key is ``<prefix><profiles_folder|reviews_folder>/<filename>``.

        Args:
            local_path: Path to local image file
            filename: Name of the file
            is_profile: Whether this is a profile image

        Returns:
            S3 URL if successful, None if failed
        """
        if not self.enabled:
            return None

        folder = self.profiles_folder if is_profile else self.reviews_folder
        s3_key = f"{self.prefix}{folder}/{filename}"

        return self.upload_file(local_path, s3_key)

    def upload_images_batch(self, image_files: Dict[str, tuple]) -> Dict[str, str]:
        """
        Upload multiple images to S3.

        Args:
            image_files: Dict mapping filename to (local_path, is_profile) tuple

        Returns:
            Dict mapping filename to S3 URL for successful uploads
        """
        if not self.enabled:
            return {}

        results = {}
        for filename, (local_path, is_profile) in image_files.items():
            s3_url = self.upload_image(local_path, filename, is_profile)
            if s3_url:
                results[filename] = s3_url

        if results:
            log.info(f"Successfully uploaded {len(results)} images to S3")

        return results
|
||||
Reference in New Issue
Block a user