Release Google Reviews Scraper Pro v1.0.0 (2025)

Initial release with multi-language support, MongoDB integration, image handling, URL replacement, and robust error handling. Includes detailed documentation, usage examples, and recommended usage guidelines. Built to effectively handle Google's 2025 interface changes.
This commit is contained in:
George Khananaev
2025-04-24 22:12:07 +07:00
commit 5bbaf455d8
14 changed files with 4032 additions and 0 deletions

0
modules/__init__.py Normal file
View File

76
modules/cli.py Normal file
View File

@@ -0,0 +1,76 @@
"""
Command line interface handling for Google Maps Reviews Scraper.
"""
import argparse
import json
from pathlib import Path
from modules.config import DEFAULT_CONFIG_PATH
def _str_to_bool(value):
    """Parse a command-line boolean value.

    ``argparse`` with ``type=bool`` calls ``bool("false")``, which is True for
    every non-empty string. This parser understands the usual spellings
    instead. The empty string maps to False because that was the only way to
    obtain False under the previous ``type=bool`` behaviour.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}")


def parse_arguments():
    """Parse command line arguments.

    Returns:
        argparse.Namespace with one attribute per option. Options that were
        not supplied are None (or False for the ``store_true`` flags) so the
        caller can tell "not given" from an explicit value. ``config`` is
        always a path: the supplied one wrapped in ``Path`` or
        ``DEFAULT_CONFIG_PATH``. ``custom_params`` is the decoded JSON value,
        or None when it was absent or not valid JSON.
    """
    ap = argparse.ArgumentParser(description="GoogleMaps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first alreadyseen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    args = ap.parse_args()

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided; bad JSON is reported and dropped
    # rather than aborting the run.
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None

    return args

319
modules/data_storage.py Normal file
View File

@@ -0,0 +1,319 @@
"""
Data storage modules for Google Maps Reviews Scraper.
"""
import json
import logging
import ssl
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Set
import pymongo
from modules.date_converter import parse_relative_date, DateConverter
from modules.image_handler import ImageHandler
from modules.models import RawReview
from modules.utils import detect_lang, get_current_iso_date
# Configure SSL for MongoDB connection
# NOTE(review): this replaces the ssl module's private default-HTTPS-context
# factory with the unverified one at import time, so it is process-wide rather
# than limited to MongoDB. Presumably every stdlib HTTPS client that relies on
# that factory stops verifying certificates - confirm this is intended.
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
# Logger shared under the "scraper" name
log = logging.getLogger("scraper")
# Language code passed to parse_relative_date() when merge_review() converts
# the scraped relative date string into an ISO date.
RAW_LANG = "en"
class MongoDBStorage:
    """MongoDB storage handler for Google Maps reviews."""

    def __init__(self, config: Dict[str, Any]):
        """Read connection and post-processing settings from ``config``.

        No connection is opened here; it is established lazily by
        :meth:`connect` (called from the fetch/save methods when needed).
        """
        mongodb_config = config.get("mongodb", {})
        self.uri = mongodb_config.get("uri")
        self.db_name = mongodb_config.get("database")
        self.collection_name = mongodb_config.get("collection")
        self.client = None
        self.collection = None
        self.connected = False
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when image downloading is enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    def connect(self) -> bool:
        """Connect to MongoDB and select the configured collection.

        Returns:
            True when the server answered a ping, False otherwise (the error
            is logged, never raised).
        """
        try:
            # Use the correct TLS parameters for newer PyMongo versions.
            # NOTE(review): tlsAllowInvalidCertificates=True disables
            # certificate validation for this connection - confirm intended.
            self.client = pymongo.MongoClient(
                self.uri,
                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
                connectTimeoutMS=30000,
                socketTimeoutMS=None,
                connect=True,
                maxPoolSize=50
            )
            # Test connection
            self.client.admin.command('ping')
            db = self.client[self.db_name]
            self.collection = db[self.collection_name]
            self.connected = True
            log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
            return True
        except Exception as e:
            log.error(f"Failed to connect to MongoDB: {e}")
            self.connected = False
            return False

    def close(self):
        """Close the MongoDB connection if one was opened."""
        if self.client:
            self.client.close()
        self.connected = False

    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
        """Fetch existing reviews from MongoDB.

        Returns:
            Mapping of ``review_id`` to the stored document (without ``_id``).
            Documents lacking a ``review_id`` are ignored. An empty dict is
            returned when the connection or the query fails.
        """
        if not self.connected and not self.connect():
            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
            return {}
        try:
            reviews = {}
            for doc in self.collection.find({}, {"_id": 0}):
                review_id = doc.get("review_id")
                if review_id:
                    reviews[review_id] = doc
            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
            return reviews
        except Exception as e:
            log.error(f"Error fetching reviews from MongoDB: {e}")
            return {}

    def _prepare_documents(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return post-processed copies of ``reviews`` ready to be upserted.

        Each review dict is copied first so the caller's documents keep their
        top-level fields (matching ``JSONStorage.save_json_docs``). Nested
        values are still shared with the originals.
        """
        processed = {review_id: dict(review) for review_id, review in reviews.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed = DateConverter.convert_dates_in_reviews(processed)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed = self.image_handler.download_all_images(processed)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed)} documents")
            for review in processed.values():
                review.update(self.custom_params)

        # Never send _id inside $set; MongoDB generates it on insert.
        for review in processed.values():
            review.pop("_id", None)
        return processed

    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
        """Save reviews to MongoDB using one bulk upsert keyed on ``review_id``.

        Reviews without a ``review_id`` are skipped with a warning instead of
        aborting the whole batch. Errors are logged, never raised.
        """
        if not reviews:
            log.info("No reviews to save to MongoDB")
            return
        if not self.connected and not self.connect():
            log.warning("Cannot save reviews - MongoDB connection failed")
            return
        try:
            processed_reviews = self._prepare_documents(reviews)
            operations = []
            for review in processed_reviews.values():
                review_id = review.get("review_id")
                if not review_id:
                    log.warning("Skipping review without review_id - cannot upsert it")
                    continue
                operations.append(
                    pymongo.UpdateOne(
                        {"review_id": review_id},
                        {"$set": review},
                        upsert=True
                    )
                )
            if operations:
                result = self.collection.bulk_write(operations)
                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
        except Exception as e:
            log.error(f"Error saving reviews to MongoDB: {e}")
class JSONStorage:
    """JSON file-based storage handler for Google Maps reviews."""

    def __init__(self, config: Dict[str, Any]):
        """Read file locations and post-processing settings from ``config``."""
        self.json_path = Path(config.get("json_path", "google_reviews.json"))
        self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when image downloading is enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
        """Load reviews from the JSON file, indexed by ``review_id``.

        Returns:
            Mapping of ``review_id`` to document. Entries that are not objects
            or have no ``review_id`` are ignored. An empty dict is returned
            when the file is missing, is not valid JSON, or does not contain a
            list.
        """
        if not self.json_path.exists():
            return {}
        try:
            data = json.loads(self.json_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}
        if not isinstance(data, list):
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}
        # Index by review_id for fast lookups; tolerate stray non-dict entries.
        return {
            entry["review_id"]: entry
            for entry in data
            if isinstance(entry, dict) and entry.get("review_id")
        }

    def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
        """Post-process copies of ``docs`` and write them to the JSON file.

        The file holds a JSON list of the documents. Top-level ``datetime``
        values are written as ISO 8601 strings. The caller's dicts are
        shallow-copied, so their top-level fields are left untouched.
        """
        # Create a copy of the docs to avoid modifying the original
        processed_docs = {review_id: review.copy() for review_id, review in docs.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed_docs = self.image_handler.download_all_images(processed_docs)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed_docs.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed_docs.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed_docs)} documents")
            for review in processed_docs.values():
                review.update(self.custom_params)

        # Convert datetime objects back to strings for JSON serialization.
        # Only existing keys are reassigned, which is safe while iterating.
        for doc in processed_docs.values():
            for key, value in doc.items():
                if isinstance(value, datetime):
                    doc[key] = value.isoformat()

        # Write to JSON file
        self.json_path.write_text(
            json.dumps(list(processed_docs.values()), ensure_ascii=False, indent=2),
            encoding="utf-8")

    def load_seen(self) -> Set[str]:
        """Load the set of already seen review IDs (one per line)."""
        if not self.seen_ids_path.exists():
            return set()
        return set(self.seen_ids_path.read_text(encoding="utf-8").splitlines())

    def save_seen(self, ids: Set[str]):
        """Save the set of already seen review IDs, one per line.

        IDs are written in sorted order so the file is stable between runs.
        """
        self.seen_ids_path.write_text("\n".join(sorted(ids)), encoding="utf-8")
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
    """
    Merge a raw review with an existing review document.

    Creates a new document if ``existing`` is None (or empty); otherwise the
    existing dict is migrated from the old field names and updated in place.

    Args:
        existing: Previously stored document for this review, or None.
        raw: Freshly scraped review.

    Returns:
        The merged document (the same object as ``existing`` when one was
        given), with ``last_modified_date`` refreshed.
    """
    if not existing:
        # Create a new review with the updated field names
        existing = {
            "review_id": raw.id,
            "author": raw.author,
            "rating": raw.rating,
            "description": {},  # renamed from "texts"
            "likes": raw.likes,
            "user_images": list(raw.photos),  # renamed from "photo_urls"
            "author_profile_url": raw.profile,  # renamed from "profile_link"
            "profile_picture": raw.avatar,  # renamed from "avatar_url"
            "owner_responses": {},
            "created_date": get_current_iso_date(),
            "review_date": parse_relative_date(raw.date, RAW_LANG),
        }
    else:
        # Handle existing reviews with old field names - migrate them
        renamed_fields = (
            ("texts", "description"),
            ("photo_urls", "user_images"),
            ("profile_link", "author_profile_url"),
            ("avatar_url", "profile_picture"),
        )
        for old_key, new_key in renamed_fields:
            if old_key in existing and new_key not in existing:
                existing[new_key] = existing.pop(old_key)

        # Add ISO dates if not present
        if "created_date" not in existing:
            existing["created_date"] = get_current_iso_date()
        if "review_date" not in existing:
            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)

        # Remove the 'date' field if it exists
        existing.pop("date", None)

    if raw.text:
        # An older document may have neither "texts" nor "description" (or a
        # null value); create the mapping instead of failing with KeyError.
        if not isinstance(existing.get("description"), dict):
            existing["description"] = {}
        existing["description"][raw.lang] = raw.text

    if not existing.get("rating"):
        existing["rating"] = raw.rating

    # A stored null counts as zero likes.
    if raw.likes > (existing.get("likes") or 0):
        existing["likes"] = raw.likes

    # Update the images list: de-duplicate while keeping first-seen order so
    # the stored list does not get reshuffled on every merge.
    existing["user_images"] = list(
        dict.fromkeys([*(existing.get("user_images") or []), *raw.photos]))

    # Update avatar/profile picture: keep the longer URL of the two.
    if raw.avatar and (
            not existing.get("profile_picture") or len(raw.avatar) > len(existing.get("profile_picture", ""))):
        existing["profile_picture"] = raw.avatar

    if raw.owner_text:
        lang = detect_lang(raw.owner_text)
        # Don't store the date string in owner_responses
        existing.setdefault("owner_responses", {})[lang] = {
            "text": raw.owner_text,
        }

    # Update last_modified timestamp
    existing["last_modified_date"] = get_current_iso_date()
    return existing

391
modules/date_converter.py Normal file
View File

@@ -0,0 +1,391 @@
"""
Date conversion utilities for Google Maps reviews.
"""
import logging
import re
from datetime import datetime, timedelta
from typing import Dict, Any, Optional
# Module logger, registered under the shared "scraper" logger name
log = logging.getLogger("scraper")
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is parsed with ``try_parse_date`` directly, first in ``lang``
    and then in the other supported languages. ``parse_relative_date`` is
    deliberately not used here: it substitutes a random date when parsing
    fails, which would make an unparseable string look like a real date.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code to try first ("en", "he" or "th")

    Returns:
        Naive datetime relative to the current UTC time, or None if the
        string is empty or cannot be parsed in any supported language.
    """
    if not date_str:
        return None
    try:
        now = datetime.utcnow()
        candidates = [lang, *(code for code in ("en", "he", "th") if code != lang.lower())]
        for code in candidates:
            iso_date = try_parse_date(date_str, code, now)
            # try_parse_date hands the original string back when it cannot parse it
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)
        return None
    except Exception as e:
        log.debug(f"Failed to convert relative date '{date_str}': {e}")
        return None
class DateConverter:
    """Turns the string dates of review documents into datetime objects."""

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert the date strings of a single document, in place.

        The legacy ``date`` field is dropped (after being used to fill an
        empty ``review_date`` when it can be parsed), the known date fields
        are parsed into datetimes, and ``date`` entries inside
        ``owner_responses`` are removed.

        Args:
            doc: Review document, possibly holding string dates

        Returns:
            The same document object, updated
        """
        # Drop the legacy relative-date field, salvaging it for review_date.
        if "date" in doc:
            legacy_date = doc.pop("date")
            if not doc.get("review_date"):
                first_lang = next(iter(doc.get("description", {}).keys()), "en")
                recovered = relative_to_datetime(legacy_date, first_lang)
                if recovered:
                    doc["review_date"] = recovered

        for field_name in ("created_date", "last_modified_date", "review_date"):
            current = doc.get(field_name)
            if not isinstance(current, str):
                continue
            try:
                # ISO format first; a trailing "Z" is rewritten as a UTC offset
                doc[field_name] = datetime.fromisoformat(current.replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # Not ISO: fall back to relative-date parsing
                first_lang = next(iter(doc.get("description", {}).keys()), "en")
                recovered = relative_to_datetime(current, first_lang)
                if recovered:
                    doc[field_name] = recovered

        # Owner responses must not keep their date strings either.
        owner_responses = doc.get("owner_responses")
        if isinstance(owner_responses, dict):
            for response in owner_responses.values():
                if isinstance(response, dict):
                    response.pop("date", None)
        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Apply :meth:`convert_dates_in_document` to every review.

        Args:
            reviews: Review documents keyed by review id

        Returns:
            The same mapping, with every document converted
        """
        log.info("Converting string dates to datetime objects...")
        for review_id in reviews:
            reviews[review_id] = DateConverter.convert_dates_in_document(reviews[review_id])
        return reviews
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Turn a relative review date such as "a week ago" or "לפני 7 שנים" into an
    ISO 8601 datetime string.

    The string is parsed with ``try_parse_date`` in the requested language
    first and then in each remaining supported language ("en", "he", "th").

    Parameters:
    - date_str (str): the relative date string.
    - lang (str): language code to try first.
    - now (Optional[datetime]): reference datetime; when None the current
      UTC time (naive) is used.

    Returns:
    The reference time minus the parsed offset, as an ISO 8601 string.
    If no language can parse the string, a random date between 1 and 365
    days before the reference time is returned instead.
    """
    import random

    reference = datetime.utcnow() if now is None else now  # use UTC for consistency

    # Requested language first, then every other supported one.
    other_langs = [code for code in ("en", "he", "th") if code != lang.lower()]
    for code in [lang, *other_langs]:
        attempt = try_parse_date(date_str, code, reference)
        # try_parse_date returns its input unchanged when it cannot parse it
        if attempt != date_str:
            return attempt

    # Nothing matched: fabricate a date somewhere within the last year.
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
# English: a number or "a"/"an", then a unit, then "ago".
_EN_RELATIVE_RE = re.compile(
    r'(?P<num>a|an|\d+)\s+(?P<unit>minute|hour|day|week|month|year)s?\s+ago', re.IGNORECASE)

# Hebrew dual forms, where the number and the unit are a single word.
_HE_SPECIAL = {
    "חודשיים": (2, "month"),
    "שבועיים": (2, "week"),
    "יומיים": (2, "day"),
    "שנתיים": (2, "year"),
    "שעתיים": (2, "hour"),
}

# Hebrew units (singular and plural) mapped to English unit names.
_HE_UNITS = {
    "דקה": "minute", "דקות": "minute",
    "שעה": "hour", "שעות": "hour",
    "יום": "day", "ימים": "day",
    "שבוע": "week", "שבועות": "week",
    "חודש": "month", "חודשים": "month",
    "שנה": "year", "שנים": "year",
}
# Optional number (digits or the word "one"), then a unit; longest unit first.
_HE_RELATIVE_RE = re.compile(
    r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>'
    + "|".join(sorted(_HE_UNITS, key=len, reverse=True)) + ')',
    re.IGNORECASE)

# Thai units mapped to English unit names, e.g. "3 วันที่แล้ว" (3 days ago).
_TH_UNITS = {
    "นาที": "minute",
    "ชั่วโมง": "hour",
    "วัน": "day",
    "สัปดาห์": "week",
    "เดือน": "month",
    "ปี": "year",
}
_TH_RELATIVE_RE = re.compile(
    r'(?P<num>\d+)?\s*(?P<unit>'
    + "|".join(sorted(_TH_UNITS, key=len, reverse=True)) + ')ที่แล้ว',
    re.IGNORECASE)


def _relative_delta(num: int, unit: str) -> timedelta:
    """Return the timedelta for ``num`` units; months and years are approximate."""
    if unit == "month":
        return timedelta(days=30 * num)  # approximate
    if unit == "year":
        return timedelta(days=365 * num)  # approximate
    # minute / hour / day / week map directly onto timedelta keywords
    return timedelta(**{unit + "s": num})


def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Attempt to parse a relative date string in one specific language.

    Supported languages are English ("en"), Hebrew ("he") and Thai ("th");
    supported units are minutes, hours, days, weeks, months (30 days) and
    years (365 days).

    Args:
        date_str: Relative date text, e.g. "3 weeks ago" or "לפני שבוע".
        lang: Language code, case-insensitive.
        now: Reference time the offset is subtracted from.

    Returns:
        ``now`` minus the parsed offset as an ISO 8601 string, or ``date_str``
        unchanged when the language is unsupported or the text does not match.
    """
    lang_code = lang.lower()
    delta = None

    if lang_code == "en":
        m = _EN_RELATIVE_RE.search(date_str)
        if m:
            num_str = m.group("num").lower()
            num = 1 if num_str in ("a", "an") else int(num_str)
            delta = _relative_delta(num, m.group("unit").lower())

    elif lang_code == "he":
        # Remove the "לפני" ("ago") prefix if present
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()
        if text in _HE_SPECIAL:
            num, unit = _HE_SPECIAL[text]
            delta = _relative_delta(num, unit)
        else:
            m = _HE_RELATIVE_RE.search(text)
            if m:
                num_str = m.group("num")
                try:
                    num = int(num_str) if num_str else 1
                except ValueError:
                    num = 1  # the number was spelled out as the word "one"
                delta = _relative_delta(num, _HE_UNITS[m.group("unit")])

    elif lang_code == "th":
        m = _TH_RELATIVE_RE.search(date_str)
        if m:
            num_str = m.group("num")
            num = int(num_str) if num_str else 1
            delta = _relative_delta(num, _TH_UNITS[m.group("unit")])

    # Hand the original string back when nothing matched.
    if delta is None:
        return date_str
    return (now - delta).isoformat()
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
# """
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
# into an ISO formatted datetime string (UTC).
#
# For English, supported formats include:
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
# For Hebrew, supported formats include:
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
#
# Parameters:
# - date_str (str): the relative date string.
# - lang (str): "en" for English or "he" for Hebrew.
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
#
# Returns:
# A string representing the calculated absolute datetime in ISO 8601 format,
# or the original date_str if parsing fails.
# """
# if now is None:
# now = datetime.utcnow() # use UTC for consistency
#
# delta = timedelta(0)
#
# if lang.lower() == "en":
# # Pattern: capture number or "a"/"an", then unit.
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
# m = pattern.search(date_str)
# if m:
# num_str = m.group("num").lower()
# num = 1 if num_str in ("a", "an") else int(num_str)
# unit = m.group("unit").lower()
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
# else:
# return date_str # return original if not matched
# elif lang.lower() == "he":
# # Remove the "לפני" prefix if present
# text = date_str.strip()
# if text.startswith("לפני"):
# text = text[len("לפני"):].strip()
#
# # Handle special cases where the number and unit are combined:
# special = {
# "חודשיים": (2, "month"),
# "שבועיים": (2, "week"),
# "יומיים": (2, "day"),
# }
# if text in special:
# num, unit = special[text]
# else:
# # Match optional number (or assume 1) and then a unit.
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
# re.IGNORECASE)
# m = pattern.search(text)
# if m:
# num_str = m.group("num")
# if not num_str:
# num = 1
# else:
# try:
# num = int(num_str)
# except ValueError:
# num = 1
# unit_he = m.group("unit")
# # Map the Hebrew unit (both singular and plural) to English unit names
# if unit_he in ("יום", "ימים"):
# unit = "day"
# elif unit_he in ("שבוע", "שבועות"):
# unit = "week"
# elif unit_he in ("חודש", "חודשים"):
# unit = "month"
# elif unit_he in ("שנה", "שנים"):
# unit = "year"
# else:
# unit = "day" # fallback
# else:
# return date_str # if nothing matches, return original text
#
# if unit == "day":
# delta = timedelta(days=num)
# elif unit == "week":
# delta = timedelta(weeks=num)
# elif unit == "month":
# delta = timedelta(days=30 * num) # approximate
# elif unit == "year":
# delta = timedelta(days=365 * num) # approximate
#
# result = now - delta
# return result.isoformat()
# --- Example usage ---
if __name__ == "__main__":
    # Pin the reference time so the printed dates are reproducible.
    reference_time = datetime(2025, 2, 5, 12, 0, 0)

    # (relative date text, language hint); the first pair passes an English
    # string with a Hebrew hint.
    samples = (
        ("a week ago", "he"),
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    )
    for text, lang in samples:
        iso_date = parse_relative_date(text, lang, now=reference_time)
        print(f"Original: {text} ({lang}) => ISO: {iso_date}")

283
modules/image_handler.py Normal file
View File

@@ -0,0 +1,283 @@
"""
Image downloading and handling for Google Maps Reviews Scraper.
"""
import logging
import re
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, Any, Set, Tuple
from urllib.parse import urlparse
import requests
# Module logger, registered under the shared "scraper" logger name
log = logging.getLogger("scraper")
class ImageHandler:
    """Handler for downloading and managing review images."""

    def __init__(self, config: Dict[str, Any]):
        """Read image directories, thread count and URL-replacement settings."""
        self.image_dir = Path(config.get("image_dir", "review_images"))
        self.max_workers = config.get("download_threads", 4)
        self.store_local_paths = config.get("store_local_paths", True)
        # URL replacement settings
        self.replace_urls = config.get("replace_urls", False)
        self.custom_url_base = config.get("custom_url_base", "https://mycustomurl.com")
        self.custom_url_profiles = config.get("custom_url_profiles", "/profiles/")
        self.custom_url_reviews = config.get("custom_url_reviews", "/reviews/")
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        # Subdirectories for different image types
        self.profile_dir = self.image_dir / "profiles"
        self.review_dir = self.image_dir / "reviews"

    def ensure_directories(self):
        """Ensure all image directories exist."""
        self.profile_dir.mkdir(parents=True, exist_ok=True)
        self.review_dir.mkdir(parents=True, exist_ok=True)

    def is_not_custom_url(self, url: str) -> bool:
        """Return True for a non-empty URL that does not start with our custom base."""
        if not url:
            return False
        if self.custom_url_base and url.startswith(self.custom_url_base):
            return False
        return True

    def get_filename_from_url(self, url: str, is_profile: bool = False) -> str:
        """Derive the local ``.jpg`` filename for an image URL.

        Returns an empty string for empty URLs and for our own custom URLs.
        """
        if not url:
            return ""
        # Skip our custom URLs
        if not self.is_not_custom_url(url):
            return ""

        # For profile pictures: use the last path segment, or the one before
        # it when the last segment is empty or the known size suffix.
        if is_profile:
            parts = url.split('/')
            if len(parts) > 1:
                filename = parts[-2] if parts[-1] in ('', 'w72-h72-p-rp-mo-ba4-br100') else parts[-1]
                return f"{filename}.jpg"

        # For review images: use the "AIHoz..." id found before the "=" sign
        match = re.search(r'AIHoz[^=]+=', url)
        if match:
            return f"{match.group(0).rstrip('=')}w600-h450-p.jpg"

        # Fallback to using the last part of the URL path
        filename = urlparse(url).path.split('/')[-1]
        if not filename.lower().endswith('.jpg'):
            filename += ".jpg"
        return filename

    def get_custom_url(self, filename: str, is_profile: bool = False) -> str:
        """Build the custom URL for ``filename``; empty when replacement is off."""
        if not self.replace_urls or not filename:
            return ""
        base_url = self.custom_url_base.rstrip('/')
        path = self.custom_url_profiles if is_profile else self.custom_url_reviews
        path = path.strip('/')
        return f"{base_url}/{path}/{filename}"

    def download_image(self, url_info: Tuple[str, bool]) -> Tuple[str, str, str]:
        """
        Download an image from URL and save it to disk.

        The body is streamed into a ``.part`` file that is renamed once the
        download completes, so an interrupted download never leaves a
        truncated image that later runs would treat as already downloaded.
        The HTTP response is always closed.

        Args:
            url_info: Tuple of (url, is_profile)

        Returns:
            Tuple of (url, local filename, custom url). Filename and custom
            URL are empty strings when the URL is skipped or the download
            fails (failures are logged, not raised).
        """
        url, is_profile = url_info
        # Skip our custom URLs
        if not self.is_not_custom_url(url):
            return url, "", ""

        partial_path = None
        try:
            filename = self.get_filename_from_url(url, is_profile)
            if not filename:
                return url, "", ""

            # Choose directory based on image type
            target_dir = self.profile_dir if is_profile else self.review_dir
            filepath = target_dir / filename

            # Only download when the file is not already on disk
            if not filepath.exists():
                partial_path = filepath.with_name(filepath.name + ".part")
                with requests.get(url, stream=True, timeout=10) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                partial_path.replace(filepath)
                partial_path = None

            # Generate custom URL even if the file already existed
            return url, filename, self.get_custom_url(filename, is_profile)
        except Exception as e:
            log.error(f"Error downloading image from {url}: {e}")
            if partial_path is not None:
                # Best-effort cleanup of the incomplete download.
                try:
                    partial_path.unlink()
                except OSError:
                    pass
            return url, "", ""

    def download_all_images(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Download all images (review images and profile pictures) for all reviews.

        The review documents are updated in place: ``local_images`` /
        ``local_profile_picture`` are added when local paths are stored, and
        ``user_images`` / ``profile_picture`` are rewritten to custom URLs when
        URL replacement is enabled (originals are kept in ``original_*``
        fields when requested).

        Args:
            reviews: Dictionary of review documents

        Returns:
            The same dictionary, with local image paths and custom URLs applied
        """
        self.ensure_directories()

        # Collect all unique image URLs, excluding our own custom URLs
        review_urls: Set[str] = set()
        profile_urls: Set[str] = set()
        for review in reviews.values():
            for key in ("user_images", "original_image_urls"):
                urls = review.get(key)
                if isinstance(urls, list):
                    review_urls.update(u for u in urls if self.is_not_custom_url(u))
            for key in ("profile_picture", "original_profile_picture"):
                profile_url = review.get(key)
                if profile_url and self.is_not_custom_url(profile_url):
                    profile_urls.add(profile_url)

        # Prepare download tasks with URL type info
        download_tasks = [(url, False) for url in review_urls] + [(url, True) for url in profile_urls]
        if not download_tasks:
            log.info("No images to download")
            return reviews
        log.info(
            f"Downloading {len(download_tasks)} images ({len(profile_urls)} profiles, {len(review_urls)} review images)...")

        # URL -> local filename and URL -> custom URL mappings
        url_to_filename = {}
        url_to_custom_url = {}

        # Download images in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for url, filename, custom_url in executor.map(self.download_image, download_tasks):
                if filename:
                    url_to_filename[url] = filename
                if custom_url:
                    url_to_custom_url[url] = custom_url

        # Update review documents
        for review in reviews.values():
            # Look-ups use the original URLs when they were preserved,
            # otherwise the current values.
            if isinstance(review.get("original_image_urls"), list):
                source_images = review["original_image_urls"]
            elif isinstance(review.get("user_images"), list):
                source_images = review["user_images"].copy()
            else:
                source_images = []

            if review.get("original_profile_picture"):
                source_profile = review["original_profile_picture"]
            elif "profile_picture" in review:
                source_profile = review["profile_picture"]
            else:
                source_profile = ""

            # Process user_images
            if isinstance(review.get("user_images"), list):
                # Add local image paths if enabled
                if self.store_local_paths:
                    local_images = [url_to_filename.get(url, "") for url in source_images
                                    if url and self.is_not_custom_url(url)]
                    review["local_images"] = [img for img in local_images if img]

                # Replace URLs if enabled
                if self.replace_urls:
                    # Store original URLs if needed and not already stored
                    if self.preserve_original_urls and "original_image_urls" not in review:
                        review["original_image_urls"] = review["user_images"].copy()

                    custom_images = []
                    for url in source_images:
                        if url in url_to_custom_url:
                            custom_images.append(url_to_custom_url[url])
                        elif not self.is_not_custom_url(url):  # Already a custom URL
                            custom_images.append(url)

                    # Replace with custom URLs if we have them
                    if custom_images:
                        review["user_images"] = custom_images

            # Process profile_picture
            if review.get("profile_picture"):
                # Add local profile picture path if enabled
                if self.store_local_paths and source_profile in url_to_filename:
                    review["local_profile_picture"] = url_to_filename[source_profile]

                # Replace profile_picture URL if enabled
                if self.replace_urls:
                    # Store original URL if needed and not already stored
                    if self.preserve_original_urls and "original_profile_picture" not in review:
                        review["original_profile_picture"] = review["profile_picture"]

                    # Every downloaded file gets a custom URL while replacement
                    # is on, so a lookup miss means there is nothing to replace
                    # (failed download or already a custom URL).
                    if source_profile in url_to_custom_url:
                        review["profile_picture"] = url_to_custom_url[source_profile]

        log.info(f"Downloaded {len(url_to_filename)} images")
        if self.replace_urls:
            log.info(f"Replaced URLs for {len(url_to_custom_url)} images")
        return reviews

84
modules/models.py Normal file
View File

@@ -0,0 +1,84 @@
"""
Data models for Google Maps Reviews Scraper.
"""
import re
from dataclasses import dataclass, field
from selenium.webdriver.remote.webelement import WebElement
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
@dataclass
class RawReview:
    """
    Data class representing a raw review extracted from Google Maps.

    Instances are normally built with :meth:`from_card`, which reads every
    field from a single review card element. All fields have empty/zero
    defaults so a partially populated card still yields a valid object.
    """
    id: str = ""  # value of the card's "data-review-id" attribute
    author: str = ""
    rating: float = 0.0  # first number found in the rating aria-label, else 0.0
    date: str = ""  # date text exactly as shown on the card
    lang: str = "und"
    text: str = ""
    likes: int = 0
    photos: list[str] = field(default_factory=list)  # URLs taken from photo button styles
    profile: str = ""  # "data-href" of the reviewer button
    avatar: str = ""  # URL to profile picture
    owner_date: str = ""
    owner_text: str = ""
    review_date: str = ""  # ISO format date

    # CSS Selectors for review elements
    MORE_BTN = "button.kyuRq"
    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
    PHOTO_BTN = "button.Tya61d"
    OWNER_RESP = "div.CDe7pd"

    @classmethod
    def from_card(cls, card: WebElement) -> "RawReview":
        """
        Factory method to create a RawReview from a review card WebElement.

        Args:
            card: The element wrapping one review.

        Returns:
            A RawReview populated from the card; fields whose elements are
            missing keep their empty/zero values.
        """
        # Expand "More" buttons. Best effort only: a failed click must not
        # prevent the rest of the card from being read.
        for more_button in try_find(card, cls.MORE_BTN, all=True):
            try:
                more_button.click()
            except Exception:
                pass

        rid = card.get_attribute("data-review-id") or ""
        author = first_text(card, 'div[class*="d4r55"]')
        profile = first_attr(card, 'button[data-review-id]', "data-href")
        avatar = first_attr(card, 'button[data-review-id] img', "src")

        # The label may use a decimal comma, so normalise it to a period first.
        # Match only a well-formed decimal: the previous "[\d\.]+" pattern could
        # capture a lone "." (or "1.2.3") and make float() raise ValueError.
        label = first_attr(card, 'span[role="img"]', "aria-label")
        num = None
        if label:
            num = re.search(r"\d+(?:\.\d+)?|\.\d+", label.replace(",", "."))
        rating = float(num.group()) if num else 0.0

        date = first_text(card, 'span[class*="rsqaWe"]')
        # Parse the date string to ISO format
        review_date = parse_date_to_iso(date)

        # Try the known review-text selectors in order; first non-empty wins.
        text = ""
        for sel in ('span[jsname="bN97Pc"]',
                    'span[jsname="fbQN7e"]',
                    'div.MyEned span.wiI7pd'):
            text = first_text(card, sel)
            if text:
                break
        lang = detect_lang(text)

        likes = 0
        if (like_btn := try_find(card, cls.LIKE_BTN)):
            # Fall back to the aria-label when the button has no visible text.
            likes = safe_int(like_btn[0].text or like_btn[0].get_attribute("aria-label"))

        # Photo URLs are embedded in each photo button's inline style: url("...")
        photos: list[str] = []
        for photo_btn in try_find(card, cls.PHOTO_BTN, all=True):
            style = photo_btn.get_attribute("style") or ""
            if (m := re.search(r'url\("([^"]+)"', style)):
                photos.append(m.group(1))

        owner_date = owner_text = ""
        if (box := try_find(card, cls.OWNER_RESP)):
            owner_box = box[0]
            owner_date = first_text(owner_box, "span.DZSIDd")
            owner_text = first_text(owner_box, "div.wiI7pd")

        return cls(
            id=rid,
            author=author,
            rating=rating,
            date=date,
            lang=lang,
            text=text,
            likes=likes,
            photos=photos,
            profile=profile,
            avatar=avatar,
            owner_date=owner_date,
            owner_text=owner_text,
            review_date=review_date,
        )

1921
modules/scraper.py Normal file

File diff suppressed because it is too large Load Diff

307
modules/utils.py Normal file
View File

@@ -0,0 +1,307 @@
"""
Utility functions for Google Maps Reviews Scraper.
"""
import datetime
import logging
import re
import time
from datetime import timezone
from functools import lru_cache
from typing import List
from selenium.common.exceptions import (NoSuchElementException,
StaleElementReferenceException,
TimeoutException)
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Module-level logger, registered under the name "scraper"
log = logging.getLogger("scraper")
# Unicode block patterns consulted by detect_lang
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """
    Guess a language code from the scripts present in ``txt``.

    Returns "he" when any character in U+0590-U+05FF is found, otherwise
    "th" when any character in U+0E00-U+0E7F is found, otherwise "en".
    Hebrew is tested first, so mixed text containing both reports "he".
    """
    script_codes = ((HEB_CHARS, "he"), (THAI_CHARS, "th"))
    for pattern, code in script_codes:
        if pattern.search(txt):
            return code
    return "en"
@lru_cache(maxsize=128)
def safe_int(s: str | None) -> int:
"""Safely convert string to integer, returning 0 if not possible"""
m = re.search(r"\d+", s or "")
return int(m.group()) if m else 0
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """
    Look up descendants of ``el`` by CSS selector, never raising on a miss.

    With ``all=True`` every match is returned; otherwise the result is a list
    holding at most the first match. A missing or stale element produces an
    empty list instead of an exception.
    """
    try:
        if not all:
            match = el.find_element(By.CSS_SELECTOR, css)
            return [match] if match else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
def first_text(el: WebElement, css: str) -> str:
    """
    Return the stripped text of the first match of ``css`` that has any text.

    Matches whose text is empty, or that have gone stale, are skipped.
    Returns "" when no match carries text.
    """
    for candidate in try_find(el, css, all=True):
        try:
            stripped = candidate.text.strip()
        except StaleElementReferenceException:
            # Element was detached from the DOM; move on to the next match.
            continue
        if stripped:
            return stripped
    return ""
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.

    Labels containing "ago" are treated as relative: the first number in the
    label (1 when there is none, e.g. "a week ago") is multiplied by the unit
    and subtracted from the current UTC time. Months are approximated as
    30 days and years as 365 days. Any other non-empty label, including
    absolute dates such as "January 2023", is not parsed and maps to the
    current UTC time.

    Args:
        date_str: Date text as displayed on a review.

    Returns:
        A timezone-aware ISO 8601 string (UTC, second precision), or an empty
        string when ``date_str`` is empty or the offset cannot be computed.
    """
    # Imported locally on purpose: at module level ``datetime`` is bound to the
    # module (so ``datetime.now`` does not exist) and ``timezone`` has no
    # ``timedelta`` attribute. Those two lookups previously raised
    # AttributeError, which the blanket except turned into "" for every input.
    from datetime import datetime, timedelta, timezone

    if not date_str:
        return ""

    now = datetime.now(timezone.utc).replace(microsecond=0)
    label = date_str.lower()

    if "ago" not in label:
        # Handle absolute dates (month year format)
        # This is a simplification - would need more robust parsing for production
        return now.isoformat()

    # Checked in this order; the first unit keyword present in the label wins.
    units = (
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(days=30)),   # approximate months as 30 days
        ("year", timedelta(days=365)),   # approximate years as 365 days
    )

    try:
        number = re.search(r"\d+", date_str)
        count = int(number.group()) if number else 1
        for keyword, step in units:
            if keyword in label:
                return (now - count * step).isoformat()
    except (OverflowError, ValueError):
        # Absurdly large counts cannot be represented as a date.
        return ""

    # "ago" without a recognised unit: default to the current time.
    return now.isoformat()
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """
    Return the stripped value of ``attr`` from the first match of ``css``
    where that value is non-empty.

    Matches with a missing or blank attribute, or that have gone stale, are
    skipped. Returns "" when no match provides a value.
    """
    for candidate in try_find(el, css, all=True):
        try:
            raw = candidate.get_attribute(attr)
        except StaleElementReferenceException:
            # Element was detached from the DOM; move on to the next match.
            continue
        value = (raw or "").strip()
        if value:
            return value
    return ""
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first usable element matching ``css``, if there is one.

    Elements already present are tried in order; the first one that is
    displayed, enabled and accepts the click wins. If none of them can be
    clicked, an explicit wait for a clickable match is used as a fallback.

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Time to sleep after a successful click (seconds)
        timeout: Maximum time the fallback wait may take (seconds)

    Returns:
        True if an element was clicked, False otherwise (including when any
        unexpected error occurs, which is logged at debug level).
    """
    try:
        locator = (By.CSS_SELECTOR, css)

        # Nothing in the DOM matches: skip the fallback wait entirely.
        candidates = driver.find_elements(*locator)
        if not candidates:
            return False

        # Try each existing match; any failure moves on to the next one.
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                continue

        # No direct click worked: wait until a match becomes clickable.
        try:
            waiter = WebDriverWait(driver, timeout)
            clickable = waiter.until(EC.element_to_be_clickable(locator))
            clickable.click()
            time.sleep(delay)
            return True
        except TimeoutException:
            return False
    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
def get_current_iso_date() -> str:
    """Return the current time as a timezone-aware (UTC) ISO 8601 string."""
    # Local import: the module-level name ``datetime`` refers to the module.
    import datetime as _dt

    utc_now = _dt.datetime.now(tz=_dt.timezone.utc)
    return utc_now.isoformat()
# """
# Utility functions for Google Maps Reviews Scraper.
# """
#
# import re
# import time
# import logging
# from datetime import datetime, timezone
# from functools import lru_cache
# from typing import List, Optional
#
# from selenium.common.exceptions import (NoSuchElementException,
# StaleElementReferenceException,
# TimeoutException)
# from selenium.webdriver import Chrome
# from selenium.webdriver.common.by import By
# from selenium.webdriver.remote.webelement import WebElement
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
#
# # Constants for language detection
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
#
# # Logger
# log = logging.getLogger("scraper")
#
#
# @lru_cache(maxsize=1024)
# def detect_lang(txt: str) -> str:
# """Detect language based on character sets"""
# if HEB_CHARS.search(txt): return "he"
# if THAI_CHARS.search(txt): return "th"
# return "en"
#
#
# @lru_cache(maxsize=128)
# def safe_int(s: str | None) -> int:
# """Safely convert string to integer, returning 0 if not possible"""
# m = re.search(r"\d+", s or "")
# return int(m.group()) if m else 0
#
#
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
# """Safely find elements by CSS selector without raising exceptions"""
# try:
# if all:
# return el.find_elements(By.CSS_SELECTOR, css)
# obj = el.find_element(By.CSS_SELECTOR, css)
# return [obj] if obj else []
# except (NoSuchElementException, StaleElementReferenceException):
# return []
#
#
# def first_text(el: WebElement, css: str) -> str:
# """Get text from the first matching element that has non-empty text"""
# for e in try_find(el, css, all=True):
# if (t := e.text.strip()):
# return t
# return ""
#
#
# def first_attr(el: WebElement, css: str, attr: str) -> str:
# """Get attribute value from the first matching element that has a non-empty value"""
# for e in try_find(el, css, all=True):
# if (v := (e.get_attribute(attr) or "").strip()):
# return v
# return ""
#
#
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
# """Click element if it exists and is clickable, with timeout"""
# try:
# WebDriverWait(driver, timeout).until(
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
# ).click()
# time.sleep(delay)
# return True
# except TimeoutException:
# return False
#
#
# def parse_date_to_iso(date_str: str) -> str:
# """
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
# Returns a best-effort ISO string, or empty string if parsing fails.
# """
# if not date_str:
# return ""
#
# try:
# now = datetime.now(timezone.utc)
#
# # Handle relative dates
# if "ago" in date_str.lower():
# # For simplicity, map to approximate dates
# if "minute" in date_str.lower():
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
# elif "hour" in date_str.lower():
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
# elif "day" in date_str.lower():
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
# elif "week" in date_str.lower():
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
# elif "month" in date_str.lower():
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate months as 30 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
# elif "year" in date_str.lower():
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate years as 365 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
# else:
# # Default to current time if can't parse
# dt = now.replace(microsecond=0)
# else:
# # Handle absolute dates (month year format)
# # This is a simplification - would need more robust parsing for production
# dt = now.replace(microsecond=0)
#
# return dt.isoformat()
# except Exception:
# # If parsing fails, return empty string
# return ""
#
#
# def get_current_iso_date() -> str:
# """Return current UTC time in ISO format."""
# return datetime.now(timezone.utc).isoformat()