This commit is contained in:
George Khananaev
2025-08-20 02:38:12 +07:00
parent dddf388422
commit 6b60b02eec
2 changed files with 23 additions and 2 deletions

View File

@@ -96,6 +96,8 @@ download_images: true # Download images from reviews
image_dir: "review_images" # Directory to store downloaded images
download_threads: 4 # Number of threads for downloading images
store_local_paths: true # Whether to store local image paths in documents
max_width: 1200 # Maximum width requested for Google-hosted images (sent as the w<N> URL size parameter)
max_height: 1200 # Maximum height requested for Google-hosted images (sent as the h<N> URL size parameter)
# S3 settings (optional)
use_s3: false # Whether to upload images to S3

View File

@@ -3,7 +3,6 @@ Image downloading and handling for Google Maps Reviews Scraper.
"""
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, Any, Set, Tuple
@@ -26,6 +25,10 @@ class ImageHandler:
self.max_workers = config.get("download_threads", 4)
self.store_local_paths = config.get("store_local_paths", True)
# Image dimension settings
self.max_width = config.get("max_width", 1200)
self.max_height = config.get("max_height", 1200)
# URL replacement settings
self.replace_urls = config.get("replace_urls", False)
self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
@@ -134,7 +137,23 @@ class ImageHandler:
return url, filename, custom_url
# Download the image
# For Google images, modify resolution parameters
if 'googleusercontent.com' in url or 'ggpht.com' in url or 'gstatic.com' in url:
# Check if URL already has size parameters (=w... or =h... or =s...)
if '=w' in url or '=h' in url or '=s' in url:
# Drop the existing size suffix by keeping only the text
# before the first '=' in the URL
parts = url.split('=')
base_url = parts[0]
# Rebuild with configurable resolution parameters (using -no suffix)
url = base_url + f"=w{self.max_width}-h{self.max_height}-no"
else:
# No existing size parameters, just append them
url = url + f"=w{self.max_width}-h{self.max_height}-no"
else:
# For non-Google URLs, keep only the part of the URL before the first '='
url = url.split("=")[0]
response = requests.get(url, stream=True, timeout=10)
response.raise_for_status()