Added pytest + some tests.
Added AWS S3 Support (optional, for cloud image storage)
This commit is contained in:
@@ -11,6 +11,8 @@ from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from modules.s3_handler import S3Handler
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
@@ -34,6 +36,10 @@ class ImageHandler:
|
||||
# Subdirectories for different image types
|
||||
self.profile_dir = self.image_dir / "profiles"
|
||||
self.review_dir = self.image_dir / "reviews"
|
||||
|
||||
# Initialize S3 handler
|
||||
self.s3_handler = S3Handler(config)
|
||||
self.use_s3 = config.get("use_s3", False)
|
||||
|
||||
def ensure_directories(self):
|
||||
"""Ensure all image directories exist"""
|
||||
@@ -206,6 +212,31 @@ class ImageHandler:
|
||||
if custom_url:
|
||||
url_to_custom_url[url] = custom_url
|
||||
|
||||
# Upload to S3 if enabled
|
||||
s3_url_mapping = {}
|
||||
if self.use_s3 and self.s3_handler.enabled and url_to_filename:
|
||||
log.info("Uploading images to S3...")
|
||||
|
||||
# Prepare files for S3 upload
|
||||
files_to_upload = {}
|
||||
for url, filename in url_to_filename.items():
|
||||
# Determine if it's a profile image
|
||||
is_profile = any(url == profile_url for profile_url in profile_urls)
|
||||
|
||||
# Get local file path
|
||||
local_path = (self.profile_dir if is_profile else self.review_dir) / filename
|
||||
|
||||
if local_path.exists():
|
||||
files_to_upload[filename] = (local_path, is_profile)
|
||||
|
||||
# Upload to S3
|
||||
s3_results = self.s3_handler.upload_images_batch(files_to_upload)
|
||||
|
||||
# Create mapping from original URL to S3 URL
|
||||
for url, filename in url_to_filename.items():
|
||||
if filename in s3_results:
|
||||
s3_url_mapping[url] = s3_results[filename]
|
||||
|
||||
# Update review documents
|
||||
for review_id, review in reviews.items():
|
||||
# Find the original URLs to use for lookup - important for both user_images and profile_picture
|
||||
@@ -241,7 +272,10 @@ class ImageHandler:
|
||||
# Create custom URLs for each image
|
||||
custom_images = []
|
||||
for url in user_images_original:
|
||||
if url in url_to_custom_url:
|
||||
# Prefer S3 URL if available
|
||||
if url in s3_url_mapping:
|
||||
custom_images.append(s3_url_mapping[url])
|
||||
elif url in url_to_custom_url:
|
||||
custom_images.append(url_to_custom_url[url])
|
||||
elif not self.is_not_custom_url(url): # Already a custom URL
|
||||
custom_images.append(url)
|
||||
@@ -262,8 +296,10 @@ class ImageHandler:
|
||||
if self.preserve_original_urls and "original_profile_picture" not in review:
|
||||
review["original_profile_picture"] = review["profile_picture"]
|
||||
|
||||
# Replace with custom URL if we have one for this profile image
|
||||
if profile_picture_original in url_to_custom_url:
|
||||
# Replace with S3 URL if available, otherwise use custom URL
|
||||
if profile_picture_original in s3_url_mapping:
|
||||
review["profile_picture"] = s3_url_mapping[profile_picture_original]
|
||||
elif profile_picture_original in url_to_custom_url:
|
||||
review["profile_picture"] = url_to_custom_url[profile_picture_original]
|
||||
elif not self.is_not_custom_url(review["profile_picture"]):
|
||||
# If current URL is already a custom URL, keep it
|
||||
@@ -277,7 +313,10 @@ class ImageHandler:
|
||||
review["profile_picture"] = custom_url
|
||||
|
||||
log.info(f"Downloaded {len(url_to_filename)} images")
|
||||
if self.use_s3 and s3_url_mapping:
|
||||
log.info(f"Uploaded {len(s3_url_mapping)} images to S3")
|
||||
if self.replace_urls:
|
||||
log.info(f"Replaced URLs for {len(url_to_custom_url)} images")
|
||||
total_replaced = len(s3_url_mapping) + len(url_to_custom_url)
|
||||
log.info(f"Replaced URLs for {total_replaced} images")
|
||||
|
||||
return reviews
|
||||
|
||||
177
modules/s3_handler.py
Normal file
177
modules/s3_handler.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""
|
||||
S3 upload handler for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class S3Handler:
|
||||
"""Handler for uploading images to AWS S3"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
"""Initialize S3 handler with configuration"""
|
||||
self.enabled = config.get("use_s3", False)
|
||||
|
||||
if not self.enabled:
|
||||
return
|
||||
|
||||
s3_config = config.get("s3", {})
|
||||
|
||||
self.aws_access_key_id = s3_config.get("aws_access_key_id", "")
|
||||
self.aws_secret_access_key = s3_config.get("aws_secret_access_key", "")
|
||||
self.region_name = s3_config.get("region_name", "us-east-1")
|
||||
self.bucket_name = s3_config.get("bucket_name", "")
|
||||
self.prefix = s3_config.get("prefix", "reviews/").rstrip("/") + "/"
|
||||
self.profiles_folder = s3_config.get("profiles_folder", "profiles/").strip("/")
|
||||
self.reviews_folder = s3_config.get("reviews_folder", "reviews/").strip("/")
|
||||
self.delete_local_after_upload = s3_config.get("delete_local_after_upload", False)
|
||||
self.s3_base_url = s3_config.get("s3_base_url", "")
|
||||
|
||||
# Validate required settings
|
||||
if not self.bucket_name:
|
||||
log.error("S3 bucket_name is required when use_s3 is enabled")
|
||||
self.enabled = False
|
||||
return
|
||||
|
||||
# Initialize S3 client
|
||||
try:
|
||||
session_kwargs = {"region_name": self.region_name}
|
||||
|
||||
# Use credentials if provided, otherwise rely on environment/IAM
|
||||
if self.aws_access_key_id and self.aws_secret_access_key:
|
||||
session_kwargs.update({
|
||||
"aws_access_key_id": self.aws_access_key_id,
|
||||
"aws_secret_access_key": self.aws_secret_access_key
|
||||
})
|
||||
|
||||
self.s3_client = boto3.client("s3", **session_kwargs)
|
||||
|
||||
# Test connection by checking if bucket exists
|
||||
self.s3_client.head_bucket(Bucket=self.bucket_name)
|
||||
log.info(f"S3 handler initialized successfully for bucket: {self.bucket_name}")
|
||||
|
||||
except ClientError as e:
|
||||
error_code = e.response.get('Error', {}).get('Code', '')
|
||||
if error_code == '404':
|
||||
log.error(f"S3 bucket '{self.bucket_name}' not found")
|
||||
elif error_code == '403':
|
||||
log.error(f"Access denied to S3 bucket '{self.bucket_name}'")
|
||||
else:
|
||||
log.error(f"Error connecting to S3: {e}")
|
||||
self.enabled = False
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error initializing S3 client: {e}")
|
||||
self.enabled = False
|
||||
|
||||
def get_s3_url(self, key: str) -> str:
|
||||
"""Generate S3 URL for uploaded file"""
|
||||
if self.s3_base_url:
|
||||
return f"{self.s3_base_url.rstrip('/')}/{key}"
|
||||
else:
|
||||
return f"https://{self.bucket_name}.s3.{self.region_name}.amazonaws.com/{key}"
|
||||
|
||||
def upload_file(self, local_path: Path, s3_key: str) -> Optional[str]:
|
||||
"""
|
||||
Upload a file to S3.
|
||||
|
||||
Args:
|
||||
local_path: Path to local file
|
||||
s3_key: S3 key (path) for the uploaded file
|
||||
|
||||
Returns:
|
||||
S3 URL if successful, None if failed
|
||||
"""
|
||||
if not self.enabled:
|
||||
return None
|
||||
|
||||
if not local_path.exists():
|
||||
log.warning(f"Local file does not exist: {local_path}")
|
||||
return None
|
||||
|
||||
try:
|
||||
# Upload file
|
||||
self.s3_client.upload_file(
|
||||
str(local_path),
|
||||
self.bucket_name,
|
||||
s3_key,
|
||||
ExtraArgs={
|
||||
'ContentType': 'image/jpeg',
|
||||
'ACL': 'public-read' # Make images publicly readable
|
||||
}
|
||||
)
|
||||
|
||||
# Generate S3 URL
|
||||
s3_url = self.get_s3_url(s3_key)
|
||||
|
||||
# Delete local file if requested
|
||||
if self.delete_local_after_upload:
|
||||
try:
|
||||
local_path.unlink()
|
||||
log.debug(f"Deleted local file: {local_path}")
|
||||
except Exception as e:
|
||||
log.warning(f"Failed to delete local file {local_path}: {e}")
|
||||
|
||||
log.debug(f"Uploaded {local_path} to s3://{self.bucket_name}/{s3_key}")
|
||||
return s3_url
|
||||
|
||||
except ClientError as e:
|
||||
log.error(f"Failed to upload {local_path} to S3: {e}")
|
||||
return None
|
||||
except Exception as e:
|
||||
log.error(f"Unexpected error uploading {local_path} to S3: {e}")
|
||||
return None
|
||||
|
||||
def upload_image(self, local_path: Path, filename: str, is_profile: bool = False) -> Optional[str]:
|
||||
"""
|
||||
Upload an image to S3 with appropriate folder structure.
|
||||
|
||||
Args:
|
||||
local_path: Path to local image file
|
||||
filename: Name of the file
|
||||
is_profile: Whether this is a profile image
|
||||
|
||||
Returns:
|
||||
S3 URL if successful, None if failed
|
||||
"""
|
||||
if not self.enabled:
|
||||
return None
|
||||
|
||||
# Create S3 key with appropriate folder structure
|
||||
folder = self.profiles_folder if is_profile else self.reviews_folder
|
||||
s3_key = f"{self.prefix}{folder}/{filename}"
|
||||
|
||||
return self.upload_file(local_path, s3_key)
|
||||
|
||||
def upload_images_batch(self, image_files: Dict[str, tuple]) -> Dict[str, str]:
|
||||
"""
|
||||
Upload multiple images to S3.
|
||||
|
||||
Args:
|
||||
image_files: Dict mapping filename to (local_path, is_profile) tuple
|
||||
|
||||
Returns:
|
||||
Dict mapping filename to S3 URL for successful uploads
|
||||
"""
|
||||
if not self.enabled:
|
||||
return {}
|
||||
|
||||
results = {}
|
||||
|
||||
for filename, (local_path, is_profile) in image_files.items():
|
||||
s3_url = self.upload_image(local_path, filename, is_profile)
|
||||
if s3_url:
|
||||
results[filename] = s3_url
|
||||
|
||||
if results:
|
||||
log.info(f"Successfully uploaded {len(results)} images to S3")
|
||||
|
||||
return results
|
||||
Reference in New Issue
Block a user