Phase 0: Project restructure to ReviewIQ platform architecture
New structure: - scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py) - scrapers/base.py (BaseScraper interface) - scrapers/registry.py (ScraperRegistry for version routing) - core/database.py, models.py, config.py, enums.py - utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py - workers/chrome_pool.py - services/webhook_service.py - api/ routes structure (empty, ready for Phase 2) - tests/ structure mirroring source All imports updated in: - api_server_production.py (7 import paths updated) - utils/health_checks.py (scraper import path) Legacy modules moved to modules/_legacy/: - data_storage.py, image_handler.py, s3_handler.py (unused) Syntax verified, frontend build passing. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
349
modules/_legacy/data_storage.py
Normal file
349
modules/_legacy/data_storage.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
Data storage modules for Google Maps Reviews Scraper.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Set
|
||||
|
||||
import pymongo
|
||||
|
||||
from modules.date_converter import parse_relative_date, DateConverter
|
||||
from modules.image_handler import ImageHandler
|
||||
from modules.models import RawReview
|
||||
from modules.utils import detect_lang, get_current_iso_date
|
||||
|
||||
# Configure SSL for MongoDB connection
|
||||
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
RAW_LANG = "en"
|
||||
|
||||
|
||||
class MongoDBStorage:
|
||||
"""MongoDB storage handler for Google Maps reviews"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
"""Initialize MongoDB storage with configuration"""
|
||||
mongodb_config = config.get("mongodb", {})
|
||||
self.uri = mongodb_config.get("uri")
|
||||
self.db_name = mongodb_config.get("database")
|
||||
self.collection_name = mongodb_config.get("collection")
|
||||
self.client = None
|
||||
self.collection = None
|
||||
self.connected = False
|
||||
self.convert_dates = config.get("convert_dates", True)
|
||||
self.download_images = config.get("download_images", False)
|
||||
self.store_local_paths = config.get("store_local_paths", True)
|
||||
self.replace_urls = config.get("replace_urls", False)
|
||||
self.preserve_original_urls = config.get("preserve_original_urls", True)
|
||||
self.custom_params = config.get("custom_params", {})
|
||||
self.image_handler = ImageHandler(config) if self.download_images else None
|
||||
|
||||
def connect(self) -> bool:
|
||||
"""Connect to MongoDB"""
|
||||
try:
|
||||
# Use the correct TLS parameters for newer PyMongo versions
|
||||
self.client = pymongo.MongoClient(
|
||||
self.uri,
|
||||
tlsAllowInvalidCertificates=True, # Equivalent to ssl_cert_reqs=CERT_NONE
|
||||
connectTimeoutMS=30000,
|
||||
socketTimeoutMS=None,
|
||||
connect=True,
|
||||
maxPoolSize=50
|
||||
)
|
||||
# Test connection
|
||||
self.client.admin.command('ping')
|
||||
db = self.client[self.db_name]
|
||||
self.collection = db[self.collection_name]
|
||||
self.connected = True
|
||||
log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
|
||||
return True
|
||||
except Exception as e:
|
||||
log.error(f"Failed to connect to MongoDB: {e}")
|
||||
self.connected = False
|
||||
return False
|
||||
|
||||
def close(self):
|
||||
"""Close MongoDB connection"""
|
||||
if self.client:
|
||||
self.client.close()
|
||||
self.connected = False
|
||||
|
||||
def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
|
||||
"""Fetch existing reviews from MongoDB"""
|
||||
if not self.connected and not self.connect():
|
||||
log.warning("Cannot fetch existing reviews - MongoDB connection failed")
|
||||
return {}
|
||||
|
||||
try:
|
||||
reviews = {}
|
||||
for doc in self.collection.find({}, {"_id": 0}):
|
||||
review_id = doc.get("review_id")
|
||||
if review_id:
|
||||
reviews[review_id] = doc
|
||||
log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
|
||||
return reviews
|
||||
except Exception as e:
|
||||
log.error(f"Error fetching reviews from MongoDB: {e}")
|
||||
return {}
|
||||
|
||||
def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
|
||||
"""Save reviews to MongoDB using bulk operations"""
|
||||
if not reviews:
|
||||
log.info("No reviews to save to MongoDB")
|
||||
return
|
||||
|
||||
if not self.connected and not self.connect():
|
||||
log.warning("Cannot save reviews - MongoDB connection failed")
|
||||
return
|
||||
|
||||
try:
|
||||
# Process reviews before saving
|
||||
processed_reviews = reviews.copy()
|
||||
|
||||
# Convert string dates to datetime objects if enabled
|
||||
if self.convert_dates:
|
||||
processed_reviews = DateConverter.convert_dates_in_reviews(processed_reviews)
|
||||
|
||||
# Download and process images if enabled
|
||||
if self.download_images and self.image_handler:
|
||||
processed_reviews = self.image_handler.download_all_images(processed_reviews)
|
||||
|
||||
# If not storing local paths, remove them from the documents
|
||||
if not self.store_local_paths:
|
||||
for review in processed_reviews.values():
|
||||
if "local_images" in review:
|
||||
del review["local_images"]
|
||||
if "local_profile_picture" in review:
|
||||
del review["local_profile_picture"]
|
||||
|
||||
# If not preserving original URLs, remove them from the documents
|
||||
if self.replace_urls and not self.preserve_original_urls:
|
||||
for review in processed_reviews.values():
|
||||
if "original_image_urls" in review:
|
||||
del review["original_image_urls"]
|
||||
if "original_profile_picture" in review:
|
||||
del review["original_profile_picture"]
|
||||
|
||||
# Add custom parameters to each document
|
||||
if self.custom_params:
|
||||
log.info(f"Adding custom parameters to {len(processed_reviews)} documents")
|
||||
for review in processed_reviews.values():
|
||||
for key, value in self.custom_params.items():
|
||||
review[key] = value
|
||||
|
||||
operations = []
|
||||
for review in processed_reviews.values():
|
||||
# Convert to proper MongoDB document
|
||||
# Exclude _id for inserts, MongoDB will generate it
|
||||
if "_id" in review:
|
||||
del review["_id"]
|
||||
|
||||
operations.append(
|
||||
pymongo.UpdateOne(
|
||||
{"review_id": review["review_id"]},
|
||||
{"$set": review},
|
||||
upsert=True
|
||||
)
|
||||
)
|
||||
|
||||
if operations:
|
||||
result = self.collection.bulk_write(operations)
|
||||
log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
|
||||
except Exception as e:
|
||||
log.error(f"Error saving reviews to MongoDB: {e}")
|
||||
|
||||
|
||||
class JSONStorage:
|
||||
"""JSON file-based storage handler for Google Maps reviews"""
|
||||
|
||||
def __init__(self, config: Dict[str, Any]):
|
||||
"""Initialize JSON storage with configuration"""
|
||||
self.json_path = Path(config.get("json_path", "google_reviews.json"))
|
||||
self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
|
||||
self.convert_dates = config.get("convert_dates", True)
|
||||
self.download_images = config.get("download_images", False)
|
||||
self.store_local_paths = config.get("store_local_paths", True)
|
||||
self.replace_urls = config.get("replace_urls", False)
|
||||
self.preserve_original_urls = config.get("preserve_original_urls", True)
|
||||
self.custom_params = config.get("custom_params", {})
|
||||
self.image_handler = ImageHandler(config) if self.download_images else None
|
||||
|
||||
def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
|
||||
"""Load reviews from JSON file"""
|
||||
if not self.json_path.exists():
|
||||
return {}
|
||||
try:
|
||||
data = json.loads(self.json_path.read_text(encoding="utf-8"))
|
||||
# Index by review_id for fast lookups
|
||||
return {d.get("review_id", ""): d for d in data if d.get("review_id")}
|
||||
except json.JSONDecodeError:
|
||||
log.warning("⚠️ Error reading JSON file, starting with empty data")
|
||||
return {}
|
||||
|
||||
def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
|
||||
"""Save reviews to JSON file"""
|
||||
# Create a copy of the docs to avoid modifying the original
|
||||
processed_docs = {review_id: review.copy() for review_id, review in docs.items()}
|
||||
|
||||
# Process reviews before saving
|
||||
# Convert string dates to datetime objects if enabled
|
||||
if self.convert_dates:
|
||||
processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)
|
||||
|
||||
# Download and process images if enabled
|
||||
if self.download_images and self.image_handler:
|
||||
processed_docs = self.image_handler.download_all_images(processed_docs)
|
||||
|
||||
# If not storing local paths, remove them from the documents
|
||||
if not self.store_local_paths:
|
||||
for review in processed_docs.values():
|
||||
if "local_images" in review:
|
||||
del review["local_images"]
|
||||
if "local_profile_picture" in review:
|
||||
del review["local_profile_picture"]
|
||||
|
||||
# If not preserving original URLs, remove them from the documents
|
||||
if self.replace_urls and not self.preserve_original_urls:
|
||||
for review in processed_docs.values():
|
||||
if "original_image_urls" in review:
|
||||
del review["original_image_urls"]
|
||||
if "original_profile_picture" in review:
|
||||
del review["original_profile_picture"]
|
||||
|
||||
# Add custom parameters to each document
|
||||
if self.custom_params:
|
||||
log.info(f"Adding custom parameters to {len(processed_docs)} documents")
|
||||
for review in processed_docs.values():
|
||||
for key, value in self.custom_params.items():
|
||||
review[key] = value
|
||||
|
||||
# Convert datetime objects back to strings for JSON serialization
|
||||
for doc in processed_docs.values():
|
||||
for key, value in doc.items():
|
||||
if isinstance(value, datetime):
|
||||
doc[key] = value.isoformat()
|
||||
|
||||
# Write to JSON file
|
||||
self.json_path.write_text(json.dumps(list(processed_docs.values()),
|
||||
ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
def load_seen(self) -> Set[str]:
|
||||
"""Load set of already seen review IDs"""
|
||||
return set(
|
||||
self.seen_ids_path.read_text(encoding="utf-8").splitlines()) if self.seen_ids_path.exists() else set()
|
||||
|
||||
def save_seen(self, ids: Set[str]):
|
||||
"""Save set of already seen review IDs"""
|
||||
self.seen_ids_path.write_text("\n".join(ids), encoding="utf-8")
|
||||
|
||||
|
||||
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
|
||||
"""
|
||||
Merge a raw review with an existing review document.
|
||||
Creates a new document if existing is None.
|
||||
"""
|
||||
if not existing:
|
||||
# Create a new review with the updated field names
|
||||
existing = {
|
||||
"review_id": raw.id,
|
||||
"author": raw.author,
|
||||
"rating": raw.rating,
|
||||
"description": {}, # renamed from "texts"
|
||||
"likes": raw.likes,
|
||||
"user_images": list(raw.photos), # renamed from "photo_urls"
|
||||
"author_profile_url": raw.profile, # renamed from "profile_link"
|
||||
"profile_picture": raw.avatar, # renamed from "avatar_url"
|
||||
"owner_responses": {},
|
||||
"created_date": get_current_iso_date(),
|
||||
"review_date": parse_relative_date(raw.date, RAW_LANG),
|
||||
}
|
||||
else:
|
||||
# Handle existing reviews with old field names - migrate them
|
||||
if "texts" in existing and "description" not in existing:
|
||||
existing["description"] = existing.pop("texts")
|
||||
|
||||
if "photo_urls" in existing and "user_images" not in existing:
|
||||
existing["user_images"] = existing.pop("photo_urls")
|
||||
|
||||
if "profile_link" in existing and "author_profile_url" not in existing:
|
||||
existing["author_profile_url"] = existing.pop("profile_link")
|
||||
|
||||
if "avatar_url" in existing and "profile_picture" not in existing:
|
||||
existing["profile_picture"] = existing.pop("avatar_url")
|
||||
|
||||
# Add ISO dates if not present
|
||||
if "created_date" not in existing:
|
||||
existing["created_date"] = get_current_iso_date()
|
||||
|
||||
if "review_date" not in existing:
|
||||
existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)
|
||||
|
||||
# Remove the 'date' field if it exists
|
||||
if "date" in existing:
|
||||
del existing["date"]
|
||||
|
||||
if raw.text:
|
||||
existing["description"][raw.lang] = raw.text
|
||||
|
||||
if not existing.get("rating"):
|
||||
existing["rating"] = raw.rating
|
||||
|
||||
if raw.likes > existing.get("likes", 0):
|
||||
existing["likes"] = raw.likes
|
||||
|
||||
# Update the images list
|
||||
existing["user_images"] = list({*existing.get("user_images", []), *raw.photos})
|
||||
|
||||
# Update avatar/profile picture
|
||||
if raw.avatar and (
|
||||
not existing.get("profile_picture") or len(raw.avatar) > len(existing.get("profile_picture", ""))):
|
||||
existing["profile_picture"] = raw.avatar
|
||||
|
||||
if raw.owner_text:
|
||||
lang = detect_lang(raw.owner_text)
|
||||
# Don't store the date string in owner_responses
|
||||
existing.setdefault("owner_responses", {})[lang] = {
|
||||
"text": raw.owner_text,
|
||||
}
|
||||
|
||||
# Update last_modified timestamp
|
||||
existing["last_modified_date"] = get_current_iso_date()
|
||||
|
||||
return existing
|
||||
|
||||
|
||||
def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
|
||||
"""
|
||||
Enhanced merge function that supports translation mode.
|
||||
When append_translations is True, it adds new language versions to existing reviews.
|
||||
"""
|
||||
# Use the standard merge for the base functionality
|
||||
merged = merge_review(existing, raw)
|
||||
|
||||
if append_translations and existing and raw.text:
|
||||
# In translation mode, always add the new language version
|
||||
# even if we already have content for this review
|
||||
merged["description"][raw.lang] = raw.text
|
||||
|
||||
# Also merge owner responses in translation mode
|
||||
if raw.owner_text:
|
||||
owner_lang = detect_lang(raw.owner_text)
|
||||
merged.setdefault("owner_responses", {})[owner_lang] = {
|
||||
"text": raw.owner_text,
|
||||
}
|
||||
|
||||
# Add metadata about when this translation was added
|
||||
merged.setdefault("translation_history", []).append({
|
||||
"language": raw.lang,
|
||||
"added_date": get_current_iso_date(),
|
||||
"source": "regional_scraping"
|
||||
})
|
||||
|
||||
return merged
|
||||
Reference in New Issue
Block a user