Release Google Reviews Scraper Pro v1.0.0 (2025)
Initial release with multi-language support, MongoDB integration, image handling, URL replacement, and robust error handling. Includes detailed documentation, usage examples, and recommended usage guidelines. Built to effectively handle Google's 2025 interface changes.
This commit is contained in:
0
modules/__init__.py
Normal file
0
modules/__init__.py
Normal file
76
modules/cli.py
Normal file
76
modules/cli.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Command line interface handling for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from modules.config import DEFAULT_CONFIG_PATH
|
||||
|
||||
|
||||
def _str_to_bool(value: str) -> bool:
    """Convert a command-line token such as ``true`` / ``no`` / ``1`` to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw text, so every
    non-empty string -- including ``"false"`` -- becomes ``True``.  This
    converter interprets the text instead.  The empty string maps to ``False``
    because that is what ``bool("")`` used to yield.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean word.
    """
    token = value.strip().lower()
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value (true/false), got {value!r}")


def parse_arguments():
    """Parse command line arguments.

    Reads ``sys.argv`` through ``argparse``.  Options that were not supplied
    stay ``None`` (the ``store_true`` flags stay ``False``).

    Returns:
        argparse.Namespace with two post-processed attributes:
        ``config`` is always a path (the given one wrapped in ``Path`` or
        ``DEFAULT_CONFIG_PATH``), and ``custom_params`` is either a dict decoded
        from the JSON string or ``None`` when absent or invalid.
    """
    ap = argparse.ArgumentParser(description="Google‑Maps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first already‑seen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    args = ap.parse_args()

    # Handle config path
    args.config = Path(args.config) if args.config is not None else DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            decoded = json.loads(args.custom_params)
        except json.JSONDecodeError:
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            decoded = None
        else:
            # The storage classes in this project iterate custom params with
            # ``.items()``, so anything but a JSON object is unusable.
            if not isinstance(decoded, dict):
                print(f"Warning: Custom params must be a JSON object: {args.custom_params}")
                decoded = None
        args.custom_params = decoded

    return args
|
||||
319
modules/data_storage.py
Normal file
319
modules/data_storage.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""
|
||||
Data storage modules for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Set
|
||||
|
||||
import pymongo
|
||||
|
||||
from modules.date_converter import parse_relative_date, DateConverter
|
||||
from modules.image_handler import ImageHandler
|
||||
from modules.models import RawReview
|
||||
from modules.utils import detect_lang, get_current_iso_date
|
||||
|
||||
# Configure SSL for MongoDB connection
|
||||
# NOTE(review): this swaps the ssl module's default HTTPS context factory for the
# unverified one, so any code in this process that relies on that default skips
# certificate verification -- a security trade-off worth revisiting.  Whether the
# MongoDB connection needs it is not evident from this file (MongoClient receives
# its own TLS option in connect()); confirm before relying on or removing it.
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
RAW_LANG = "en"
|
||||
|
||||
|
||||
class MongoDBStorage:
    """MongoDB storage handler for Google Maps reviews.

    Connection settings come from ``config["mongodb"]`` (``uri``, ``database``,
    ``collection``); the remaining top-level config keys control how review
    documents are post-processed before they are written.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize MongoDB storage with configuration.

        No connection is opened here; see :meth:`connect`.
        """
        mongodb_config = config.get("mongodb", {})
        self.uri = mongodb_config.get("uri")
        self.db_name = mongodb_config.get("database")
        self.collection_name = mongodb_config.get("collection")
        self.client = None
        self.collection = None
        self.connected = False
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when image downloading is enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    def connect(self) -> bool:
        """Connect to MongoDB and verify the connection with a ping.

        Returns:
            True on success, False on any failure (the error is logged).
        """
        try:
            # Use the correct TLS parameters for newer PyMongo versions
            self.client = pymongo.MongoClient(
                self.uri,
                # SECURITY NOTE(review): certificate validation is disabled here.
                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
                connectTimeoutMS=30000,
                socketTimeoutMS=None,
                connect=True,
                maxPoolSize=50
            )
            # Test connection
            self.client.admin.command('ping')
            db = self.client[self.db_name]
            self.collection = db[self.collection_name]
            self.connected = True
            log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
            return True
        except Exception as e:
            log.error(f"Failed to connect to MongoDB: {e}")
            self.connected = False
            return False

    def close(self):
        """Close MongoDB connection (no-op if no client was created)."""
        if self.client:
            self.client.close()
            self.connected = False

    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
        """Fetch existing reviews from MongoDB.

        Returns:
            Mapping of ``review_id`` to document (without ``_id``).  Documents
            lacking a ``review_id`` are skipped.  An empty dict is returned when
            the connection or the query fails.
        """
        if not self.connected and not self.connect():
            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
            return {}

        try:
            reviews = {}
            for doc in self.collection.find({}, {"_id": 0}):
                review_id = doc.get("review_id")
                if review_id:
                    reviews[review_id] = doc
            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
            return reviews
        except Exception as e:
            log.error(f"Error fetching reviews from MongoDB: {e}")
            return {}

    def _prepare_reviews(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return post-processed copies of *reviews* ready to be written.

        Each review dict is copied first so that the key removals and additions
        below do not leak into the caller's documents (nested values are still
        shared, as in ``JSONStorage.save_json_docs``).
        """
        processed = {review_id: dict(review) for review_id, review in reviews.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed = DateConverter.convert_dates_in_reviews(processed)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed = self.image_handler.download_all_images(processed)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed)} documents")
            for review in processed.values():
                review.update(self.custom_params)

        return processed

    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
        """Save reviews to MongoDB using one bulk upsert keyed on ``review_id``.

        The caller's dicts are not modified.  Reviews without a ``review_id``
        are skipped with a warning instead of aborting the whole batch.  Any
        exception during processing or writing is logged, not raised.
        """
        if not reviews:
            log.info("No reviews to save to MongoDB")
            return

        if not self.connected and not self.connect():
            log.warning("Cannot save reviews - MongoDB connection failed")
            return

        try:
            processed_reviews = self._prepare_reviews(reviews)

            operations = []
            for review in processed_reviews.values():
                # Exclude _id so MongoDB generates it on insert and $set never
                # tries to change it on update.
                review.pop("_id", None)

                review_id = review.get("review_id")
                if not review_id:
                    log.warning("Skipping review without review_id")
                    continue

                operations.append(
                    pymongo.UpdateOne(
                        {"review_id": review_id},
                        {"$set": review},
                        upsert=True
                    )
                )

            if operations:
                result = self.collection.bulk_write(operations)
                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
        except Exception as e:
            log.error(f"Error saving reviews to MongoDB: {e}")
|
||||
|
||||
|
||||
class JSONStorage:
    """JSON file-based storage handler for Google Maps reviews.

    Reviews are kept as a JSON list in ``json_path``; the IDs of reviews that
    were already seen are kept one per line in ``seen_ids_path``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize JSON storage with configuration."""
        self.json_path = Path(config.get("json_path", "google_reviews.json"))
        self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when image downloading is enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
        """Load reviews from the JSON file, indexed by ``review_id``.

        Returns:
            Mapping of ``review_id`` to document.  Returns an empty dict when
            the file is missing, is not valid JSON, or does not hold a list.
            Entries that are not objects or have no ``review_id`` are skipped.
        """
        if not self.json_path.exists():
            return {}
        try:
            data = json.loads(self.json_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}

        if not isinstance(data, list):
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}

        # Index by review_id for fast lookups
        return {
            d["review_id"]: d
            for d in data
            if isinstance(d, dict) and d.get("review_id")
        }

    def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
        """Post-process *docs* and write them to the JSON file as a list.

        Works on per-review shallow copies so the caller's dicts keep their
        top-level keys and values.
        """
        # Create a copy of the docs to avoid modifying the original
        processed_docs = {review_id: review.copy() for review_id, review in docs.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed_docs = self.image_handler.download_all_images(processed_docs)

        # If not storing local paths, remove them from the documents
        if not self.store_local_paths:
            for review in processed_docs.values():
                review.pop("local_images", None)
                review.pop("local_profile_picture", None)

        # If not preserving original URLs, remove them from the documents
        if self.replace_urls and not self.preserve_original_urls:
            for review in processed_docs.values():
                review.pop("original_image_urls", None)
                review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed_docs)} documents")
            for review in processed_docs.values():
                review.update(self.custom_params)

        # Convert datetime objects back to strings for JSON serialization
        # (top-level fields only; reassigning existing keys is safe while iterating).
        for doc in processed_docs.values():
            for key, value in doc.items():
                if isinstance(value, datetime):
                    doc[key] = value.isoformat()

        # Write to JSON file
        payload = json.dumps(list(processed_docs.values()), ensure_ascii=False, indent=2)
        self.json_path.write_text(payload, encoding="utf-8")

    def load_seen(self) -> Set[str]:
        """Load the set of already seen review IDs (blank lines are ignored)."""
        if not self.seen_ids_path.exists():
            return set()
        lines = self.seen_ids_path.read_text(encoding="utf-8").splitlines()
        return {line for line in lines if line}

    def save_seen(self, ids: Set[str]):
        """Save the set of already seen review IDs, one per line.

        IDs are written sorted so the file content is deterministic.
        """
        self.seen_ids_path.write_text("\n".join(sorted(ids)), encoding="utf-8")
|
||||
|
||||
|
||||
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
    """
    Merge a raw review with an existing review document.

    Creates a new document if ``existing`` is None (or empty); otherwise
    ``existing`` is updated in place and returned.

    Args:
        existing: Previously stored document for this review, if any.  Legacy
            field names (``texts``, ``photo_urls``, ``profile_link``,
            ``avatar_url``, ``date``) are migrated or dropped.
        raw: Freshly scraped review.  This function reads its ``id``, ``author``,
            ``rating``, ``likes``, ``photos``, ``profile``, ``avatar``, ``date``,
            ``text``, ``lang`` and ``owner_text`` attributes.

    Returns:
        The merged document, with ``last_modified_date`` refreshed.
    """
    if not existing:
        # Create a new review with the updated field names
        existing = {
            "review_id": raw.id,
            "author": raw.author,
            "rating": raw.rating,
            "description": {},  # renamed from "texts"
            "likes": raw.likes,
            "user_images": list(raw.photos),  # renamed from "photo_urls"
            "author_profile_url": raw.profile,  # renamed from "profile_link"
            "profile_picture": raw.avatar,  # renamed from "avatar_url"
            "owner_responses": {},
            "created_date": get_current_iso_date(),
            "review_date": parse_relative_date(raw.date, RAW_LANG),
        }
    else:
        # Handle existing reviews with old field names - migrate them
        legacy_renames = (
            ("texts", "description"),
            ("photo_urls", "user_images"),
            ("profile_link", "author_profile_url"),
            ("avatar_url", "profile_picture"),
        )
        for old_key, new_key in legacy_renames:
            if old_key in existing and new_key not in existing:
                existing[new_key] = existing.pop(old_key)

        # Add ISO dates if not present
        if "created_date" not in existing:
            existing["created_date"] = get_current_iso_date()

        if "review_date" not in existing:
            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)

        # Remove the 'date' field if it exists
        existing.pop("date", None)

    if raw.text:
        # An existing document may lack "description" entirely (or hold a
        # non-dict value); create the mapping rather than raising KeyError.
        descriptions = existing.get("description")
        if not isinstance(descriptions, dict):
            descriptions = existing["description"] = {}
        descriptions[raw.lang] = raw.text

    if not existing.get("rating"):
        existing["rating"] = raw.rating

    # ``or 0`` tolerates a stored ``None`` as well as a missing key.
    if raw.likes > (existing.get("likes") or 0):
        existing["likes"] = raw.likes

    # Update the images list: de-duplicate while keeping first-seen order, so
    # the stored list does not get reshuffled on every merge.
    existing["user_images"] = list(
        dict.fromkeys([*(existing.get("user_images") or []), *raw.photos])
    )

    # Update avatar/profile picture: take the new URL when none is stored or
    # when the new URL is longer than the stored one.
    current_avatar = existing.get("profile_picture") or ""
    if raw.avatar and (not current_avatar or len(raw.avatar) > len(current_avatar)):
        existing["profile_picture"] = raw.avatar

    if raw.owner_text:
        lang = detect_lang(raw.owner_text)
        # Don't store the date string in owner_responses
        existing.setdefault("owner_responses", {})[lang] = {
            "text": raw.owner_text,
        }

    # Update last_modified timestamp
    existing["last_modified_date"] = get_current_iso_date()

    return existing
|
||||
391
modules/date_converter.py
Normal file
391
modules/date_converter.py
Normal file
@@ -0,0 +1,391 @@
|
||||
"""
|
||||
Date conversion utilities for Google Maps reviews.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
    """
    Convert a relative date string to a datetime object.

    The string is tried against ``lang`` first and then against the other
    languages handled by ``try_parse_date`` ("en", "he", "th").  This calls
    ``try_parse_date`` directly, not ``parse_relative_date``, because the
    latter substitutes a random date for unparseable input, which would make
    the failure case undetectable here.

    Args:
        date_str: The relative date string (e.g., "2 years ago")
        lang: Language code to try first ("en", "he" or "th")

    Returns:
        datetime object (relative to the current UTC time) or None if the
        string is empty or does not match any supported format
    """
    if not date_str:
        return None

    try:
        now = datetime.utcnow()
        first_choice = lang.lower()
        candidates = [first_choice] + [alt for alt in ("en", "he", "th") if alt != first_choice]
        for candidate in candidates:
            iso_date = try_parse_date(date_str, candidate, now)
            # try_parse_date hands the input back unchanged when nothing matched
            if iso_date != date_str:
                return datetime.fromisoformat(iso_date)
    except Exception as e:
        log.debug(f"Failed to convert relative date '{date_str}': {e}")

    return None
|
||||
|
||||
|
||||
class DateConverter:
    """Turns the string date fields of review documents into datetime objects."""

    @staticmethod
    def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
        """
        Replace string dates in a single review document with datetimes.

        The document is modified in place and also returned.  A legacy
        top-level ``date`` string is always dropped; it is used to fill
        ``review_date`` only when that field is missing or empty.  Values that
        cannot be parsed are left as they are.

        Args:
            doc: Review document that may carry string dates

        Returns:
            The same document after conversion
        """
        # Drop the legacy free-text date, salvaging it for review_date if needed
        if "date" in doc:
            legacy_date = doc.pop("date")
            if not doc.get("review_date"):
                first_lang = next(iter(doc.get("description", {}).keys()), "en")
                recovered = relative_to_datetime(legacy_date, first_lang)
                if recovered:
                    doc["review_date"] = recovered

        # Top-level fields expected to hold dates
        for name in ("created_date", "last_modified_date", "review_date"):
            raw_value = doc.get(name)
            if not isinstance(raw_value, str):
                continue
            try:
                # ISO 8601 first ("Z" suffix rewritten as an explicit offset)
                doc[name] = datetime.fromisoformat(raw_value.replace('Z', '+00:00'))
            except (ValueError, TypeError):
                # Not ISO: fall back to relative-date parsing
                first_lang = next(iter(doc.get("description", {}).keys()), "en")
                parsed = relative_to_datetime(raw_value, first_lang)
                if parsed:
                    doc[name] = parsed

        # Owner responses must not keep their date strings
        responses = doc.get("owner_responses")
        if isinstance(responses, dict):
            for reply in responses.values():
                if isinstance(reply, dict):
                    reply.pop("date", None)

        return doc

    @staticmethod
    def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """
        Apply :meth:`convert_dates_in_document` to every review.

        Args:
            reviews: Mapping of review id to review document

        Returns:
            The same mapping, with each document converted
        """
        log.info("Converting string dates to datetime objects...")

        for key in reviews:
            reviews[key] = DateConverter.convert_dates_in_document(reviews[key])

        return reviews
|
||||
|
||||
|
||||
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
    """
    Turn a relative review date such as "a week ago" or "לפני 7 שנים" into an
    ISO 8601 datetime string.

    The string is handed to ``try_parse_date`` for ``lang`` first and then for
    each remaining language among "en", "he" and "th".  The first attempt that
    yields something other than the input wins.

    Parameters:
    - date_str (str): the relative date string.
    - lang (str): language code to try first.
    - now (Optional[datetime]): reference datetime; when None,
      ``datetime.utcnow()`` is used.

    Returns:
        The computed datetime in ISO 8601 format.  If no language matches, the
        result is NOT the input: a date picked at random between 1 and 365 days
        before ``now`` is returned instead.
    """
    import random

    reference = datetime.utcnow() if now is None else now  # UTC for consistency

    # Requested language first, then every other supported language
    attempt_order = [lang] + [code for code in ("en", "he", "th") if code != lang.lower()]
    for code in attempt_order:
        candidate = try_parse_date(date_str, code, reference)
        if candidate != date_str:
            return candidate

    # Nothing matched: fabricate a date between 1 and 365 days in the past
    days_back = random.randint(1, 365)
    return (reference - timedelta(days=days_back)).isoformat()
|
||||
|
||||
|
||||
def _relative_delta(unit: str, num: int) -> timedelta:
    """Return the timedelta for *num* units; months and years are approximated."""
    if unit == "month":
        return timedelta(days=30 * num)  # approximate
    if unit == "year":
        return timedelta(days=365 * num)  # approximate
    # second / minute / hour / day / week map straight onto timedelta keywords
    return timedelta(**{unit + "s": num})


def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
    """
    Helper function that attempts to parse a date string in a specific language.

    Supported languages are English ("en"), Hebrew ("he") and Thai ("th"), each
    with second/minute/hour (minute/hour only for Hebrew and Thai), day, week,
    month and year units.  Hebrew dual forms such as "שנתיים" (two years) are
    recognised as well.

    Args:
        date_str: Relative date text, e.g. "2 hours ago" or "לפני 3 שנים".
        lang: Language code; matched case-insensitively.
        now: Reference datetime the offset is subtracted from.

    Returns:
        The ISO formatted date if successful, or the original string if not
        (including when ``lang`` is not a supported language).
    """
    lang_code = lang.lower()
    delta = None

    if lang_code == "en":
        # Pattern: capture number or "a"/"an", then unit.
        m = re.search(
            r'(?P<num>\ban?|\d+)\s+(?P<unit>second|minute|hour|day|week|month|year)s?\s+ago',
            date_str, re.IGNORECASE)
        if m:
            num_str = m.group("num").lower()
            num = 1 if num_str in ("a", "an") else int(num_str)
            delta = _relative_delta(m.group("unit").lower(), num)

    elif lang_code == "he":
        # Remove the "לפני" prefix if present
        text = date_str.strip()
        if text.startswith("לפני"):
            text = text[len("לפני"):].strip()

        # Dual forms, where the number and the unit are a single word
        he_dual = {
            "שנתיים": (2, "year"),
            "חודשיים": (2, "month"),
            "שבועיים": (2, "week"),
            "יומיים": (2, "day"),
            "שעתיים": (2, "hour"),
        }
        # Singular and plural unit words mapped to English unit names
        he_units = {
            "דקה": "minute", "דקות": "minute",
            "שעה": "hour", "שעות": "hour",
            "יום": "day", "ימים": "day",
            "שבוע": "week", "שבועות": "week",
            "חודש": "month", "חודשים": "month",
            "שנה": "year", "שנים": "year",
        }

        dual = next((value for word, value in he_dual.items() if word in text), None)
        if dual is not None:
            num, unit = dual
            delta = _relative_delta(unit, num)
        else:
            # Match optional number (or assume 1) and then a unit; longer unit
            # words go first so a plural is not cut short by its singular.
            unit_alternatives = "|".join(sorted(he_units, key=len, reverse=True))
            m = re.search(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>' + unit_alternatives + ')', text)
            if m:
                num_str = m.group("num")
                # "אחד"/"אחת" (one) and a missing number both mean 1
                num = int(num_str) if num_str and num_str.isdecimal() else 1
                delta = _relative_delta(he_units[m.group("unit")], num)

    elif lang_code == "th":
        # Thai patterns like "3 วันที่แล้ว" (3 days ago); a missing number means 1
        th_units = {
            "นาที": "minute",
            "ชั่วโมง": "hour",
            "วัน": "day",
            "สัปดาห์": "week",
            "เดือน": "month",
            "ปี": "year",
        }
        m = re.search(
            r'(?P<num>\d+)?\s*(?P<unit>' + "|".join(th_units) + r')\s*(?:ที่แล้ว|ที่ผ่านมา)',
            date_str)
        if m:
            num_str = m.group("num")
            num = int(num_str) if num_str else 1
            delta = _relative_delta(th_units[m.group("unit")], num)

    # Return the calculated date if parsing was successful, otherwise the original string
    if delta is None:
        return date_str
    return (now - delta).isoformat()
|
||||
|
||||
|
||||
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
|
||||
# """
|
||||
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
|
||||
# into an ISO formatted datetime string (UTC).
|
||||
#
|
||||
# For English, supported formats include:
|
||||
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
|
||||
# For Hebrew, supported formats include:
|
||||
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
|
||||
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
|
||||
#
|
||||
# Parameters:
|
||||
# - date_str (str): the relative date string.
|
||||
# - lang (str): "en" for English or "he" for Hebrew.
|
||||
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
|
||||
#
|
||||
# Returns:
|
||||
# A string representing the calculated absolute datetime in ISO 8601 format,
|
||||
# or the original date_str if parsing fails.
|
||||
# """
|
||||
# if now is None:
|
||||
# now = datetime.utcnow() # use UTC for consistency
|
||||
#
|
||||
# delta = timedelta(0)
|
||||
#
|
||||
# if lang.lower() == "en":
|
||||
# # Pattern: capture number or "a"/"an", then unit.
|
||||
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
|
||||
# m = pattern.search(date_str)
|
||||
# if m:
|
||||
# num_str = m.group("num").lower()
|
||||
# num = 1 if num_str in ("a", "an") else int(num_str)
|
||||
# unit = m.group("unit").lower()
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
# else:
|
||||
# return date_str # return original if not matched
|
||||
# elif lang.lower() == "he":
|
||||
# # Remove the "לפני" prefix if present
|
||||
# text = date_str.strip()
|
||||
# if text.startswith("לפני"):
|
||||
# text = text[len("לפני"):].strip()
|
||||
#
|
||||
# # Handle special cases where the number and unit are combined:
|
||||
# special = {
|
||||
# "חודשיים": (2, "month"),
|
||||
# "שבועיים": (2, "week"),
|
||||
# "יומיים": (2, "day"),
|
||||
# }
|
||||
# if text in special:
|
||||
# num, unit = special[text]
|
||||
# else:
|
||||
# # Match optional number (or assume 1) and then a unit.
|
||||
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
|
||||
# re.IGNORECASE)
|
||||
# m = pattern.search(text)
|
||||
# if m:
|
||||
# num_str = m.group("num")
|
||||
# if not num_str:
|
||||
# num = 1
|
||||
# else:
|
||||
# try:
|
||||
# num = int(num_str)
|
||||
# except ValueError:
|
||||
# num = 1
|
||||
# unit_he = m.group("unit")
|
||||
# # Map the Hebrew unit (both singular and plural) to English unit names
|
||||
# if unit_he in ("יום", "ימים"):
|
||||
# unit = "day"
|
||||
# elif unit_he in ("שבוע", "שבועות"):
|
||||
# unit = "week"
|
||||
# elif unit_he in ("חודש", "חודשים"):
|
||||
# unit = "month"
|
||||
# elif unit_he in ("שנה", "שנים"):
|
||||
# unit = "year"
|
||||
# else:
|
||||
# unit = "day" # fallback
|
||||
# else:
|
||||
# return date_str # if nothing matches, return original text
|
||||
#
|
||||
# if unit == "day":
|
||||
# delta = timedelta(days=num)
|
||||
# elif unit == "week":
|
||||
# delta = timedelta(weeks=num)
|
||||
# elif unit == "month":
|
||||
# delta = timedelta(days=30 * num) # approximate
|
||||
# elif unit == "year":
|
||||
# delta = timedelta(days=365 * num) # approximate
|
||||
#
|
||||
# result = now - delta
|
||||
# return result.isoformat()
|
||||
|
||||
|
||||
# --- Example usage ---
|
||||
if __name__ == "__main__":
    # Demo: convert a few sample strings against a pinned reference time so
    # the printed output is reproducible.
    reference = datetime(2025, 2, 5, 12, 0, 0)
    samples = (
        ("a week ago", "he"),  # language hint does not match the text on purpose or by accident
        ("4 weeks ago", "en"),
        ("לפני 7 שנים", "he"),
        ("לפני חודשיים", "he"),
    )
    for sample_text, sample_lang in samples:
        converted = parse_relative_date(sample_text, sample_lang, now=reference)
        print(f"Original: {sample_text} ({sample_lang}) => ISO: {converted}")
|
||||
283
modules/image_handler.py
Normal file
283
modules/image_handler.py
Normal file
@@ -0,0 +1,283 @@
|
||||
"""
|
||||
Image downloading and handling for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Set, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
|
||||
class ImageHandler:
|
||||
"""Handler for downloading and managing review images"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
    """Read image-related settings from *config*, applying defaults for missing keys."""
    setting = config.get

    # Where images go and how many worker threads download them
    image_root = Path(setting("image_dir", "review_images"))
    self.image_dir = image_root
    self.max_workers = setting("download_threads", 4)
    self.store_local_paths = setting("store_local_paths", True)

    # URL replacement settings
    self.replace_urls = setting("replace_urls", False)
    self.custom_url_base = setting("custom_url_base", "https://mycustomurl.com")
    self.custom_url_profiles = setting("custom_url_profiles", "/profiles/")
    self.custom_url_reviews = setting("custom_url_reviews", "/reviews/")
    self.preserve_original_urls = setting("preserve_original_urls", True)

    # One subdirectory per image type
    self.profile_dir = image_root / "profiles"
    self.review_dir = image_root / "reviews"
|
||||
|
||||
def ensure_directories(self):
|
||||
"""Ensure all image directories exist"""
|
||||
self.profile_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.review_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def is_not_custom_url(self, url: str) -> bool:
|
||||
"""Check if the URL is not one of our custom URLs"""
|
||||
if not url:
|
||||
return False
|
||||
|
||||
# Check if the URL starts with our custom URL base - if so, skip it
|
||||
if self.custom_url_base and url.startswith(self.custom_url_base):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def get_filename_from_url(self, url: str, is_profile: bool = False) -> str:
|
||||
"""Extract filename from URL and add .jpg extension"""
|
||||
if not url:
|
||||
return ""
|
||||
|
||||
# Skip our custom URLs
|
||||
if not self.is_not_custom_url(url):
|
||||
return ""
|
||||
|
||||
# For profile pictures
|
||||
if is_profile:
|
||||
# Extract unique identifier from profile URL
|
||||
parts = url.split('/')
|
||||
if len(parts) > 1:
|
||||
filename = parts[-2] if parts[-1] in ('', 'w72-h72-p-rp-mo-ba4-br100') else parts[-1]
|
||||
return f"{filename}.jpg"
|
||||
|
||||
# For review images
|
||||
match = re.search(r'AIHoz[^=]+=', url)
|
||||
if match:
|
||||
# Use the ID as filename
|
||||
return f"{match.group(0).rstrip('=')}w600-h450-p.jpg"
|
||||
|
||||
# Fallback to using the last part of the URL path
|
||||
parsed = urlparse(url)
|
||||
path = parsed.path
|
||||
filename = path.split('/')[-1]
|
||||
|
||||
# Add .jpg extension if not present
|
||||
if not filename.lower().endswith('.jpg'):
|
||||
filename += ".jpg"
|
||||
|
||||
return filename
|
||||
|
||||
def get_custom_url(self, filename: str, is_profile: bool = False) -> str:
|
||||
"""Generate a custom URL for the image"""
|
||||
if not self.replace_urls or not filename:
|
||||
return ""
|
||||
|
||||
base_url = self.custom_url_base.rstrip('/')
|
||||
path = self.custom_url_profiles if is_profile else self.custom_url_reviews
|
||||
path = path.strip('/')
|
||||
|
||||
return f"{base_url}/{path}/{filename}"
|
||||
|
||||
def download_image(self, url_info: Tuple[str, bool]) -> Tuple[str, str, str]:
|
||||
"""
|
||||
Download an image from URL and save to disk.
|
||||
|
||||
Args:
|
||||
url_info: Tuple of (url, is_profile)
|
||||
|
||||
Returns:
|
||||
Tuple of (url, local filename, custom url)
|
||||
"""
|
||||
url, is_profile = url_info
|
||||
|
||||
# Skip our custom URLs
|
||||
if not self.is_not_custom_url(url):
|
||||
return url, "", ""
|
||||
|
||||
try:
|
||||
filename = self.get_filename_from_url(url, is_profile)
|
||||
if not filename:
|
||||
return url, "", ""
|
||||
|
||||
# Choose directory based on image type
|
||||
target_dir = self.profile_dir if is_profile else self.review_dir
|
||||
filepath = target_dir / filename
|
||||
|
||||
# Skip if file already exists
|
||||
if filepath.exists():
|
||||
# Generate custom URL even if file exists
|
||||
custom_url = self.get_custom_url(filename, is_profile)
|
||||
return url, filename, custom_url
|
||||
|
||||
# Download the image
|
||||
response = requests.get(url, stream=True, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
with open(filepath, 'wb') as f:
|
||||
for chunk in response.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
|
||||
# Generate custom URL
|
||||
custom_url = self.get_custom_url(filename, is_profile)
|
||||
return url, filename, custom_url
|
||||
|
||||
except Exception as e:
|
||||
log.error(f"Error downloading image from {url}: {e}")
|
||||
return url, "", ""
|
||||
|
||||
def download_all_images(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
|
||||
"""
|
||||
Download all images (review images and profile pictures) for all reviews.
|
||||
|
||||
Args:
|
||||
reviews: Dictionary of review documents
|
||||
|
||||
Returns:
|
||||
Updated reviews with local image paths and custom URLs
|
||||
"""
|
||||
self.ensure_directories()
|
||||
|
||||
# Collect all unique image URLs (both review images and profile pictures)
|
||||
# Exclude custom URLs
|
||||
review_urls: Set[str] = set()
|
||||
profile_urls: Set[str] = set()
|
||||
|
||||
for review in reviews.values():
|
||||
# Collect review images - exclude custom URLs
|
||||
if "user_images" in review and isinstance(review["user_images"], list):
|
||||
for url in review["user_images"]:
|
||||
if self.is_not_custom_url(url):
|
||||
review_urls.add(url)
|
||||
# If we have original image URLs stored separately, add those too
|
||||
if "original_image_urls" in review and isinstance(review["original_image_urls"], list):
|
||||
for orig_url in review["original_image_urls"]:
|
||||
if self.is_not_custom_url(orig_url):
|
||||
review_urls.add(orig_url)
|
||||
|
||||
# Collect profile pictures - exclude custom URLs
|
||||
if "profile_picture" in review and review["profile_picture"]:
|
||||
profile_url = review["profile_picture"]
|
||||
if self.is_not_custom_url(profile_url):
|
||||
profile_urls.add(profile_url)
|
||||
# If we have original profile URL stored separately, add that too
|
||||
if "original_profile_picture" in review and review["original_profile_picture"]:
|
||||
orig_profile_url = review["original_profile_picture"]
|
||||
if self.is_not_custom_url(orig_profile_url):
|
||||
profile_urls.add(orig_profile_url)
|
||||
|
||||
# Prepare download tasks with URL type info
|
||||
download_tasks = [(url, False) for url in review_urls] + [(url, True) for url in profile_urls]
|
||||
|
||||
if not download_tasks:
|
||||
log.info("No images to download")
|
||||
return reviews
|
||||
|
||||
log.info(
|
||||
f"Downloading {len(download_tasks)} images ({len(profile_urls)} profiles, {len(review_urls)} review images)...")
|
||||
|
||||
# Create URL to filename and URL to custom URL mappings
|
||||
url_to_filename = {}
|
||||
url_to_custom_url = {}
|
||||
|
||||
# Download images in parallel
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
results = executor.map(self.download_image, download_tasks)
|
||||
for url, filename, custom_url in results:
|
||||
if filename:
|
||||
url_to_filename[url] = filename
|
||||
if custom_url:
|
||||
url_to_custom_url[url] = custom_url
|
||||
|
||||
# Update review documents
|
||||
for review_id, review in reviews.items():
|
||||
# Find the original URLs to use for lookup - important for both user_images and profile_picture
|
||||
user_images_original = []
|
||||
profile_picture_original = ""
|
||||
|
||||
# For user_images, either use original URLs if we have them, or the current user_images
|
||||
if "original_image_urls" in review and isinstance(review["original_image_urls"], list):
|
||||
user_images_original = review["original_image_urls"]
|
||||
elif "user_images" in review and isinstance(review["user_images"], list):
|
||||
user_images_original = review["user_images"].copy()
|
||||
|
||||
# For profile_picture, either use original URL if we have it, or the current profile_picture
|
||||
if "original_profile_picture" in review and review["original_profile_picture"]:
|
||||
profile_picture_original = review["original_profile_picture"]
|
||||
elif "profile_picture" in review:
|
||||
profile_picture_original = review["profile_picture"]
|
||||
|
||||
# Process user_images
|
||||
if "user_images" in review and isinstance(review["user_images"], list):
|
||||
# Add local image paths if enabled
|
||||
if self.store_local_paths:
|
||||
local_images = [url_to_filename.get(url, "") for url in user_images_original
|
||||
if url and self.is_not_custom_url(url)]
|
||||
review["local_images"] = [img for img in local_images if img]
|
||||
|
||||
# Replace URLs if enabled
|
||||
if self.replace_urls:
|
||||
# Store original URLs if needed and not already stored
|
||||
if self.preserve_original_urls and "original_image_urls" not in review:
|
||||
review["original_image_urls"] = review["user_images"].copy()
|
||||
|
||||
# Create custom URLs for each image
|
||||
custom_images = []
|
||||
for url in user_images_original:
|
||||
if url in url_to_custom_url:
|
||||
custom_images.append(url_to_custom_url[url])
|
||||
elif not self.is_not_custom_url(url): # Already a custom URL
|
||||
custom_images.append(url)
|
||||
|
||||
# Replace with custom URLs if we have them
|
||||
if custom_images:
|
||||
review["user_images"] = custom_images
|
||||
|
||||
# Process profile_picture
|
||||
if "profile_picture" in review and review["profile_picture"]:
|
||||
# Add local profile picture path if enabled
|
||||
if self.store_local_paths and profile_picture_original in url_to_filename:
|
||||
review["local_profile_picture"] = url_to_filename[profile_picture_original]
|
||||
|
||||
# Replace profile_picture URL if enabled
|
||||
if self.replace_urls:
|
||||
# Store original URL if needed and not already stored
|
||||
if self.preserve_original_urls and "original_profile_picture" not in review:
|
||||
review["original_profile_picture"] = review["profile_picture"]
|
||||
|
||||
# Replace with custom URL if we have one for this profile image
|
||||
if profile_picture_original in url_to_custom_url:
|
||||
review["profile_picture"] = url_to_custom_url[profile_picture_original]
|
||||
elif not self.is_not_custom_url(review["profile_picture"]):
|
||||
# If current URL is already a custom URL, keep it
|
||||
pass
|
||||
elif profile_picture_original:
|
||||
# If we don't have a custom URL but have a filename, generate one
|
||||
filename = url_to_filename.get(profile_picture_original, "")
|
||||
if filename:
|
||||
custom_url = self.get_custom_url(filename, True)
|
||||
if custom_url:
|
||||
review["profile_picture"] = custom_url
|
||||
|
||||
log.info(f"Downloaded {len(url_to_filename)} images")
|
||||
if self.replace_urls:
|
||||
log.info(f"Replaced URLs for {len(url_to_custom_url)} images")
|
||||
|
||||
return reviews
|
||||
84
modules/models.py
Normal file
84
modules/models.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""
|
||||
Data models for Google Maps Reviews Scraper.
|
||||
"""
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
|
||||
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
|
||||
|
||||
|
||||
@dataclass
class RawReview:
    """
    Data class representing a raw review extracted from Google Maps.

    Instances are normally built with :meth:`from_card`, which reads every
    field from a review card element.
    """
    id: str = ""
    author: str = ""
    rating: float = 0.0
    date: str = ""  # date text as shown on the card
    lang: str = "und"
    text: str = ""
    likes: int = 0
    photos: list[str] = field(default_factory=list)
    profile: str = ""
    avatar: str = ""  # URL to profile picture
    owner_date: str = ""
    owner_text: str = ""
    review_date: str = ""  # ISO format date

    # CSS Selectors for review elements
    MORE_BTN = "button.kyuRq"
    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
    PHOTO_BTN = "button.Tya61d"
    OWNER_RESP = "div.CDe7pd"

    @staticmethod
    def _parse_rating(label: str) -> float:
        """Return the first number in a rating aria-label, or 0.0 if none.

        Commas are treated as decimal separators. The pattern only matches
        well-formed numbers, so stray punctuation (a lone "." or a trailing
        sentence period) can no longer reach ``float()`` and raise.
        """
        if not label:
            return 0.0
        found = re.search(r"\d+(?:\.\d+)?|\.\d+", label.replace(",", "."))
        return float(found.group()) if found else 0.0

    @classmethod
    def from_card(cls, card: WebElement) -> "RawReview":
        """Factory method to create a RawReview from a review card WebElement."""
        # Expand "More" buttons; best-effort, a failed click is ignored.
        for more_button in try_find(card, cls.MORE_BTN, all=True):
            try:
                more_button.click()
            except Exception:
                pass

        rid = card.get_attribute("data-review-id") or ""
        author = first_text(card, 'div[class*="d4r55"]')
        profile = first_attr(card, 'button[data-review-id]', "data-href")
        avatar = first_attr(card, 'button[data-review-id] img', "src")

        rating = cls._parse_rating(first_attr(card, 'span[role="img"]', "aria-label"))

        date = first_text(card, 'span[class*="rsqaWe"]')
        # Parse the date string to ISO format
        review_date = parse_date_to_iso(date)

        # Try the known text selectors in order and keep the first non-empty hit.
        text = ""
        for sel in ('span[jsname="bN97Pc"]',
                    'span[jsname="fbQN7e"]',
                    'div.MyEned span.wiI7pd'):
            text = first_text(card, sel)
            if text:
                break
        lang = detect_lang(text)

        likes = 0
        if (btn := try_find(card, cls.LIKE_BTN)):
            # Use the button text, falling back to its aria-label.
            likes = safe_int(btn[0].text or btn[0].get_attribute("aria-label"))

        # Photo URLs are taken from url("...") in each photo button's style.
        photos: list[str] = []
        for btn in try_find(card, cls.PHOTO_BTN, all=True):
            if (m := re.search(r'url\("([^"]+)"', btn.get_attribute("style") or "")):
                photos.append(m.group(1))

        owner_date = owner_text = ""
        if (box := try_find(card, cls.OWNER_RESP)):
            box = box[0]
            owner_date = first_text(box, "span.DZSIDd")
            owner_text = first_text(box, "div.wiI7pd")

        return cls(
            id=rid,
            author=author,
            rating=rating,
            date=date,
            lang=lang,
            text=text,
            likes=likes,
            photos=photos,
            profile=profile,
            avatar=avatar,
            owner_date=owner_date,
            owner_text=owner_text,
            review_date=review_date,
        )
|
||||
1921
modules/scraper.py
Normal file
1921
modules/scraper.py
Normal file
File diff suppressed because it is too large
Load Diff
307
modules/utils.py
Normal file
307
modules/utils.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
Utility functions for Google Maps Reviews Scraper.
|
||||
"""
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from datetime import timezone
|
||||
from functools import lru_cache
|
||||
from typing import List
|
||||
|
||||
from selenium.common.exceptions import (NoSuchElementException,
|
||||
StaleElementReferenceException,
|
||||
TimeoutException)
|
||||
from selenium.webdriver import Chrome
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.remote.webelement import WebElement
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
# Character-range patterns used for script-based language detection
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """Guess a language code from the characters present in *txt*.

    Returns "he" if any character falls in U+0590-U+05FF, otherwise "th" if
    any falls in U+0E00-U+0E7F, otherwise "en". The Hebrew check runs first.
    """
    for code, pattern in (("he", HEB_CHARS), ("th", THAI_CHARS)):
        if pattern.search(txt) is not None:
            return code
    return "en"
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def safe_int(s: str | None) -> int:
|
||||
"""Safely convert string to integer, returning 0 if not possible"""
|
||||
m = re.search(r"\d+", s or "")
|
||||
return int(m.group()) if m else 0
|
||||
|
||||
|
||||
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """Look up elements under *el* by CSS selector, never raising on a miss.

    With ``all=True`` every match is returned; otherwise a list holding only
    the first match. A missing or stale element yields an empty list.
    """
    try:
        if not all:
            found = el.find_element(By.CSS_SELECTOR, css)
            return [found] if found else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
|
||||
|
||||
|
||||
def first_text(el: WebElement, css: str) -> str:
    """Return the stripped text of the first match for *css* that has any.

    Matches that have gone stale are skipped. Returns "" when no match has
    non-empty text.
    """
    for candidate in try_find(el, css, all=True):
        try:
            stripped = candidate.text.strip()
        except StaleElementReferenceException:
            continue
        if stripped:
            return stripped
    return ""
|
||||
|
||||
|
||||
def parse_date_to_iso(date_str: str) -> str:
    """
    Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.

    Relative English dates ("N minutes/hours/days/weeks/months/years ago")
    are subtracted from the current UTC time. A missing number ("a week ago")
    counts as 1; months are approximated as 30 days and years as 365 days.
    Anything else, including absolute dates such as "January 2023" and
    "ago" strings with no recognised unit, maps to the current UTC time.

    Args:
        date_str: Date text as displayed on the review.

    Returns:
        A best-effort ISO 8601 string (UTC, second precision), or an empty
        string if *date_str* is empty or the date cannot be computed.
    """
    # Import the classes locally: at module level ``datetime`` is bound to the
    # *module* (``import datetime``), so ``datetime.now`` does not exist there,
    # and ``timedelta`` is not an attribute of ``timezone``.
    from datetime import datetime, timedelta, timezone

    if not date_str:
        return ""

    try:
        now = datetime.now(timezone.utc).replace(microsecond=0)
        lowered = date_str.lower()

        # Absolute dates are not parsed; fall back to the current time.
        if "ago" not in lowered:
            return now.isoformat()

        number = re.search(r"\d+", date_str)
        count = int(number.group()) if number else 1

        # Checked in this order; the first unit found in the text wins.
        units = (
            ("minute", timedelta(minutes=1)),
            ("hour", timedelta(hours=1)),
            ("day", timedelta(days=1)),
            ("week", timedelta(weeks=1)),
            ("month", timedelta(days=30)),  # approximate months as 30 days
            ("year", timedelta(days=365)),  # approximate years as 365 days
        )
        for unit, step in units:
            if unit in lowered:
                return (now - count * step).isoformat()

        # "ago" without a recognised unit: default to the current time.
        return now.isoformat()
    except (OverflowError, ValueError):
        # Count too large to represent as a date; report failure.
        return ""
|
||||
|
||||
|
||||
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """Return the stripped *attr* value of the first match for *css* that has one.

    Missing attributes count as empty, and matches that have gone stale are
    skipped. Returns "" when no match has a non-empty value.
    """
    for candidate in try_find(el, css, all=True):
        try:
            raw = candidate.get_attribute(attr)
        except StaleElementReferenceException:
            continue
        value = (raw or "").strip()
        if value:
            return value
    return ""
|
||||
|
||||
|
||||
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """
    Click the first clickable element matching a CSS selector.

    Elements already present are tried first; if none could be clicked, the
    function waits up to ``timeout`` seconds for one to become clickable.

    Args:
        driver: WebDriver instance
        css: CSS selector for the element to click
        delay: Time to sleep after a successful click (seconds)
        timeout: Maximum time to wait for a clickable element (seconds)

    Returns:
        True if an element was clicked, False otherwise (no match, timeout,
        or any other error, which is logged at debug level)
    """
    locator = (By.CSS_SELECTOR, css)
    try:
        candidates = driver.find_elements(*locator)
        # Nothing matches at all: do not bother waiting.
        if not candidates:
            return False

        # Direct attempt on each element that is visible and enabled.
        for candidate in candidates:
            try:
                if not (candidate.is_displayed() and candidate.is_enabled()):
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This one failed; move on to the next candidate.
                continue

        # None of the direct clicks worked, so fall back to an explicit wait.
        try:
            clickable = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable(locator)
            )
            clickable.click()
            time.sleep(delay)
        except TimeoutException:
            return False
        return True

    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
|
||||
|
||||
|
||||
def get_current_iso_date() -> str:
    """Return the current time, in UTC, as an ISO 8601 string."""
    from datetime import datetime, timezone

    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat()
|
||||
|
||||
# """
|
||||
# Utility functions for Google Maps Reviews Scraper.
|
||||
# """
|
||||
#
|
||||
# import re
|
||||
# import time
|
||||
# import logging
|
||||
# from datetime import datetime, timezone
|
||||
# from functools import lru_cache
|
||||
# from typing import List, Optional
|
||||
#
|
||||
# from selenium.common.exceptions import (NoSuchElementException,
|
||||
# StaleElementReferenceException,
|
||||
# TimeoutException)
|
||||
# from selenium.webdriver import Chrome
|
||||
# from selenium.webdriver.common.by import By
|
||||
# from selenium.webdriver.remote.webelement import WebElement
|
||||
# from selenium.webdriver.support import expected_conditions as EC
|
||||
# from selenium.webdriver.support.ui import WebDriverWait
|
||||
#
|
||||
# # Constants for language detection
|
||||
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
|
||||
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
|
||||
#
|
||||
# # Logger
|
||||
# log = logging.getLogger("scraper")
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=1024)
|
||||
# def detect_lang(txt: str) -> str:
|
||||
# """Detect language based on character sets"""
|
||||
# if HEB_CHARS.search(txt): return "he"
|
||||
# if THAI_CHARS.search(txt): return "th"
|
||||
# return "en"
|
||||
#
|
||||
#
|
||||
# @lru_cache(maxsize=128)
|
||||
# def safe_int(s: str | None) -> int:
|
||||
# """Safely convert string to integer, returning 0 if not possible"""
|
||||
# m = re.search(r"\d+", s or "")
|
||||
# return int(m.group()) if m else 0
|
||||
#
|
||||
#
|
||||
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
|
||||
# """Safely find elements by CSS selector without raising exceptions"""
|
||||
# try:
|
||||
# if all:
|
||||
# return el.find_elements(By.CSS_SELECTOR, css)
|
||||
# obj = el.find_element(By.CSS_SELECTOR, css)
|
||||
# return [obj] if obj else []
|
||||
# except (NoSuchElementException, StaleElementReferenceException):
|
||||
# return []
|
||||
#
|
||||
#
|
||||
# def first_text(el: WebElement, css: str) -> str:
|
||||
# """Get text from the first matching element that has non-empty text"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (t := e.text.strip()):
|
||||
# return t
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def first_attr(el: WebElement, css: str, attr: str) -> str:
|
||||
# """Get attribute value from the first matching element that has a non-empty value"""
|
||||
# for e in try_find(el, css, all=True):
|
||||
# if (v := (e.get_attribute(attr) or "").strip()):
|
||||
# return v
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
|
||||
# """Click element if it exists and is clickable, with timeout"""
|
||||
# try:
|
||||
# WebDriverWait(driver, timeout).until(
|
||||
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
|
||||
# ).click()
|
||||
# time.sleep(delay)
|
||||
# return True
|
||||
# except TimeoutException:
|
||||
# return False
|
||||
#
|
||||
#
|
||||
# def parse_date_to_iso(date_str: str) -> str:
|
||||
# """
|
||||
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
|
||||
# Returns a best-effort ISO string, or empty string if parsing fails.
|
||||
# """
|
||||
# if not date_str:
|
||||
# return ""
|
||||
#
|
||||
# try:
|
||||
# now = datetime.now(timezone.utc)
|
||||
#
|
||||
# # Handle relative dates
|
||||
# if "ago" in date_str.lower():
|
||||
# # For simplicity, map to approximate dates
|
||||
# if "minute" in date_str.lower():
|
||||
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
|
||||
# elif "hour" in date_str.lower():
|
||||
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
|
||||
# elif "day" in date_str.lower():
|
||||
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
|
||||
# elif "week" in date_str.lower():
|
||||
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
|
||||
# elif "month" in date_str.lower():
|
||||
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate months as 30 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
|
||||
# elif "year" in date_str.lower():
|
||||
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
|
||||
# # Approximate years as 365 days
|
||||
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
|
||||
# else:
|
||||
# # Default to current time if can't parse
|
||||
# dt = now.replace(microsecond=0)
|
||||
# else:
|
||||
# # Handle absolute dates (month year format)
|
||||
# # This is a simplification - would need more robust parsing for production
|
||||
# dt = now.replace(microsecond=0)
|
||||
#
|
||||
# return dt.isoformat()
|
||||
# except Exception:
|
||||
# # If parsing fails, return empty string
|
||||
# return ""
|
||||
#
|
||||
#
|
||||
# def get_current_iso_date() -> str:
|
||||
# """Return current UTC time in ISO format."""
|
||||
# return datetime.now(timezone.utc).isoformat()
|
||||
Reference in New Issue
Block a user