Test
This commit is contained in:
@@ -96,6 +96,8 @@ download_images: true # Download images from reviews
|
|||||||
image_dir: "review_images" # Directory to store downloaded images
|
image_dir: "review_images" # Directory to store downloaded images
|
||||||
download_threads: 4 # Number of threads for downloading images
|
download_threads: 4 # Number of threads for downloading images
|
||||||
store_local_paths: true # Whether to store local image paths in documents
|
store_local_paths: true # Whether to store local image paths in documents
|
||||||
|
max_width: 1200 # Maximum width for downloaded images (Google images)
|
||||||
|
max_height: 1200 # Maximum height for downloaded images (Google images)
|
||||||
|
|
||||||
# S3 settings (optional)
|
# S3 settings (optional)
|
||||||
use_s3: false # Whether to upload images to S3
|
use_s3: false # Whether to upload images to S3
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ Image downloading and handling for Google Maps Reviews Scraper.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any, Set, Tuple
|
from typing import Dict, Any, Set, Tuple
|
||||||
@@ -26,6 +25,10 @@ class ImageHandler:
|
|||||||
self.max_workers = config.get("download_threads", 4)
|
self.max_workers = config.get("download_threads", 4)
|
||||||
self.store_local_paths = config.get("store_local_paths", True)
|
self.store_local_paths = config.get("store_local_paths", True)
|
||||||
|
|
||||||
|
# Image dimension settings
|
||||||
|
self.max_width = config.get("max_width", 1200)
|
||||||
|
self.max_height = config.get("max_height", 1200)
|
||||||
|
|
||||||
# URL replacement settings
|
# URL replacement settings
|
||||||
self.replace_urls = config.get("replace_urls", False)
|
self.replace_urls = config.get("replace_urls", False)
|
||||||
self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
|
self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
|
||||||
@@ -134,7 +137,23 @@ class ImageHandler:
|
|||||||
return url, filename, custom_url
|
return url, filename, custom_url
|
||||||
|
|
||||||
# Download the image
|
# Download the image
|
||||||
url = url.split("=")[0]
|
# For Google images, modify resolution parameters
|
||||||
|
if 'googleusercontent.com' in url or 'ggpht.com' in url or 'gstatic.com' in url:
|
||||||
|
# Check if URL already has size parameters (=w... or =h... or =s...)
|
||||||
|
if '=w' in url or '=h' in url or '=s' in url:
|
||||||
|
# Remove existing size parameters
|
||||||
|
# Split at = to get base URL and parameters
|
||||||
|
parts = url.split('=')
|
||||||
|
base_url = parts[0]
|
||||||
|
# Rebuild with configurable resolution parameters (using -no suffix)
|
||||||
|
url = base_url + f"=w{self.max_width}-h{self.max_height}-no"
|
||||||
|
else:
|
||||||
|
# No existing size parameters, just append them
|
||||||
|
url = url + f"=w{self.max_width}-h{self.max_height}-no"
|
||||||
|
else:
|
||||||
|
# For non-Google URLs, keep only the part before the first "=" (drops the "=" and everything after it)
|
||||||
|
url = url.split("=")[0]
|
||||||
|
|
||||||
response = requests.get(url, stream=True, timeout=10)
|
response = requests.get(url, stream=True, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user