This commit is contained in:
George Khananaev
2025-08-20 02:38:12 +07:00
parent dddf388422
commit 6b60b02eec
2 changed files with 23 additions and 2 deletions

View File

@@ -96,6 +96,8 @@ download_images: true # Download images from reviews
image_dir: "review_images" # Directory to store downloaded images image_dir: "review_images" # Directory to store downloaded images
download_threads: 4 # Number of threads for downloading images download_threads: 4 # Number of threads for downloading images
store_local_paths: true # Whether to store local image paths in documents store_local_paths: true # Whether to store local image paths in documents
max_width: 1200 # Maximum width requested for downloaded images (applied to Google-hosted image URLs only)
max_height: 1200 # Maximum height requested for downloaded images (applied to Google-hosted image URLs only)
# S3 settings (optional) # S3 settings (optional)
use_s3: false # Whether to upload images to S3 use_s3: false # Whether to upload images to S3

View File

@@ -3,7 +3,6 @@ Image downloading and handling for Google Maps Reviews Scraper.
""" """
import logging import logging
import re
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from pathlib import Path from pathlib import Path
from typing import Dict, Any, Set, Tuple from typing import Dict, Any, Set, Tuple
@@ -26,6 +25,10 @@ class ImageHandler:
self.max_workers = config.get("download_threads", 4) self.max_workers = config.get("download_threads", 4)
self.store_local_paths = config.get("store_local_paths", True) self.store_local_paths = config.get("store_local_paths", True)
# Image dimension settings
self.max_width = config.get("max_width", 1200)
self.max_height = config.get("max_height", 1200)
# URL replacement settings # URL replacement settings
self.replace_urls = config.get("replace_urls", False) self.replace_urls = config.get("replace_urls", False)
self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com") self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
@@ -134,7 +137,23 @@ class ImageHandler:
return url, filename, custom_url return url, filename, custom_url
# Download the image # Download the image
url = url.split("=")[0] # For Google images, modify resolution parameters
if 'googleusercontent.com' in url or 'ggpht.com' in url or 'gstatic.com' in url:
# Check if URL already has size parameters (=w... or =h... or =s...)
if '=w' in url or '=h' in url or '=s' in url:
# Remove existing size parameters
# Split at = to get base URL and parameters
parts = url.split('=')
base_url = parts[0]
# Rebuild with configurable resolution parameters (using -no suffix)
url = base_url + f"=w{self.max_width}-h{self.max_height}-no"
else:
# No existing size parameters, just append them
url = url + f"=w{self.max_width}-h{self.max_height}-no"
else:
# For non-Google URLs, keep only the part of the URL before the first "="
url = url.split("=")[0]
response = requests.get(url, stream=True, timeout=10) response = requests.get(url, stream=True, timeout=10)
response.raise_for_status() response.raise_for_status()