Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

View File

@@ -0,0 +1,349 @@
"""
Data storage modules for Google Maps Reviews Scraper.
"""
import json
import logging
import ssl
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, Set
import pymongo
from modules.date_converter import parse_relative_date, DateConverter
from modules.image_handler import ImageHandler
from modules.models import RawReview
from modules.utils import detect_lang, get_current_iso_date
# Configure SSL for MongoDB connection
# NOTE(review): this rebinds a private `ssl` factory to the unverified-context
# constructor, which turns off certificate verification for every consumer of
# that default in the whole process, not just this module. It is kept as the
# macOS workaround the original author needed; confirm it is still required and
# whether it has any effect on the MongoDB client at all, since that client is
# given its own TLS options when it is constructed.
ssl._create_default_https_context = ssl._create_unverified_context # macOS SSL fix
# Logger
# Shared "scraper" logger used by every class and function in this module.
log = logging.getLogger("scraper")
# Language code handed to parse_relative_date() when converting the scraped
# relative date text of a review into a stored review date.
RAW_LANG = "en"
class MongoDBStorage:
    """MongoDB storage handler for Google Maps reviews.

    Connection settings come from ``config["mongodb"]`` (``uri``, ``database``
    and ``collection``); the post-processing switches are read from the top
    level of ``config``. The connection is opened lazily by :meth:`connect`.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize MongoDB storage with configuration.

        Args:
            config: Scraper configuration mapping. Missing keys fall back to
                the defaults visible below; no network I/O happens here.
        """
        mongodb_config = config.get("mongodb", {})
        self.uri = mongodb_config.get("uri")
        self.db_name = mongodb_config.get("database")
        self.collection_name = mongodb_config.get("collection")
        self.client = None
        self.collection = None
        self.connected = False
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when downloads are enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    def connect(self) -> bool:
        """Open the MongoDB connection and verify it with a ``ping``.

        Returns:
            True when the server answered the ping and the collection handle
            was obtained; False otherwise (the error is logged, not raised).
        """
        client = None
        try:
            # Use the correct TLS parameters for newer PyMongo versions.
            # NOTE(review): tlsAllowInvalidCertificates=True disables server
            # certificate validation; kept unchanged for compatibility.
            client = pymongo.MongoClient(
                self.uri,
                tlsAllowInvalidCertificates=True,  # Equivalent to ssl_cert_reqs=CERT_NONE
                connectTimeoutMS=30000,
                socketTimeoutMS=None,
                connect=True,
                maxPoolSize=50,
            )
            # Test connection
            client.admin.command('ping')
            collection = client[self.db_name][self.collection_name]
        except Exception as e:
            log.error(f"Failed to connect to MongoDB: {e}")
            # Release the half-initialised client so repeated connection
            # attempts do not accumulate open clients.
            if client is not None:
                client.close()
            self.connected = False
            return False

        self.client = client
        self.collection = collection
        self.connected = True
        log.info(f"Connected to MongoDB: {self.db_name}.{self.collection_name}")
        return True

    def close(self):
        """Close the MongoDB connection, if one is open."""
        if self.client is not None:
            self.client.close()
        self.connected = False

    def fetch_existing_reviews(self) -> Dict[str, Dict[str, Any]]:
        """Fetch existing reviews from MongoDB.

        Returns:
            Mapping of ``review_id`` to the stored document (without ``_id``).
            Documents lacking a ``review_id`` are ignored. An empty mapping is
            returned when the connection or the query fails.
        """
        if not self.connected and not self.connect():
            log.warning("Cannot fetch existing reviews - MongoDB connection failed")
            return {}
        try:
            reviews = {}
            for doc in self.collection.find({}, {"_id": 0}):
                review_id = doc.get("review_id")
                if review_id:
                    reviews[review_id] = doc
            log.info(f"Fetched {len(reviews)} existing reviews from MongoDB")
            return reviews
        except Exception as e:
            log.error(f"Error fetching reviews from MongoDB: {e}")
            return {}

    def _prepare_documents(self, reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
        """Return post-processed copies of ``reviews`` ready to be written."""
        # Copy every review: the steps below add and delete keys, and the
        # caller's documents must not be altered as a side effect of saving.
        documents = {review_id: review.copy() for review_id, review in reviews.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            documents = DateConverter.convert_dates_in_reviews(documents)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            documents = self.image_handler.download_all_images(documents)

            # NOTE(review): the source this was reconstructed from had lost its
            # indentation; pruning is assumed to belong to the download step.
            # If not storing local paths, remove them from the documents
            if not self.store_local_paths:
                for review in documents.values():
                    review.pop("local_images", None)
                    review.pop("local_profile_picture", None)

            # If not preserving original URLs, remove them from the documents
            if self.replace_urls and not self.preserve_original_urls:
                for review in documents.values():
                    review.pop("original_image_urls", None)
                    review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(documents)} documents")
            for review in documents.values():
                review.update(self.custom_params)

        return documents

    def save_reviews(self, reviews: Dict[str, Dict[str, Any]]):
        """Save reviews to MongoDB using bulk operations.

        Each review is upserted by its ``review_id``. Reviews without a
        ``review_id`` are skipped with a warning instead of failing the whole
        batch. Errors are logged rather than raised.

        Args:
            reviews: Mapping of review documents; it is not modified.
        """
        if not reviews:
            log.info("No reviews to save to MongoDB")
            return
        if not self.connected and not self.connect():
            log.warning("Cannot save reviews - MongoDB connection failed")
            return
        try:
            processed_reviews = self._prepare_documents(reviews)

            operations = []
            skipped = 0
            for review in processed_reviews.values():
                # Exclude _id for inserts, MongoDB will generate it
                review.pop("_id", None)
                review_id = review.get("review_id")
                if not review_id:
                    skipped += 1
                    continue
                operations.append(
                    pymongo.UpdateOne(
                        {"review_id": review_id},
                        {"$set": review},
                        upsert=True,
                    )
                )

            if skipped:
                log.warning(f"Skipped {skipped} reviews without a review_id")
            if operations:
                result = self.collection.bulk_write(operations)
                log.info(f"MongoDB: Upserted {result.upserted_count}, modified {result.modified_count} reviews")
        except Exception as e:
            log.error(f"Error saving reviews to MongoDB: {e}")
class JSONStorage:
    """JSON file-based storage handler for Google Maps reviews.

    Reviews are kept as a JSON list in ``json_path``; the IDs of reviews that
    were already seen are kept one per line in ``seen_ids_path``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Initialize JSON storage with configuration.

        Args:
            config: Scraper configuration mapping. Missing keys fall back to
                the defaults visible below.
        """
        self.json_path = Path(config.get("json_path", "google_reviews.json"))
        self.seen_ids_path = Path(config.get("seen_ids_path", "google_reviews.ids"))
        self.convert_dates = config.get("convert_dates", True)
        self.download_images = config.get("download_images", False)
        self.store_local_paths = config.get("store_local_paths", True)
        self.replace_urls = config.get("replace_urls", False)
        self.preserve_original_urls = config.get("preserve_original_urls", True)
        self.custom_params = config.get("custom_params", {})
        # The image handler is only built when downloads are enabled.
        self.image_handler = ImageHandler(config) if self.download_images else None

    @staticmethod
    def _json_default(value: Any) -> str:
        """Serialize values ``json`` cannot handle; only datetimes are supported."""
        if isinstance(value, datetime):
            return value.isoformat()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def load_json_docs(self) -> Dict[str, Dict[str, Any]]:
        """Load reviews from the JSON file, indexed by ``review_id``.

        Returns:
            Mapping of ``review_id`` to document. Entries that are not objects
            or have no ``review_id`` are ignored. An empty mapping is returned
            when the file is missing, is not valid JSON, or does not hold a
            list.
        """
        if not self.json_path.exists():
            return {}
        try:
            data = json.loads(self.json_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}
        if not isinstance(data, list):
            # A valid JSON document of the wrong shape is treated like a
            # corrupt file instead of crashing on the first non-dict entry.
            log.warning("⚠️ Error reading JSON file, starting with empty data")
            return {}
        # Index by review_id for fast lookups
        return {
            entry["review_id"]: entry
            for entry in data
            if isinstance(entry, dict) and entry.get("review_id")
        }

    def save_json_docs(self, docs: Dict[str, Dict[str, Any]]):
        """Post-process ``docs`` and write them to the JSON file as a list.

        Args:
            docs: Mapping of review documents; it is not modified.
        """
        # Create a copy of the docs to avoid modifying the original
        processed_docs = {review_id: review.copy() for review_id, review in docs.items()}

        # Convert string dates to datetime objects if enabled
        if self.convert_dates:
            processed_docs = DateConverter.convert_dates_in_reviews(processed_docs)

        # Download and process images if enabled
        if self.download_images and self.image_handler:
            processed_docs = self.image_handler.download_all_images(processed_docs)

            # NOTE(review): the source this was reconstructed from had lost its
            # indentation; pruning is assumed to belong to the download step.
            # If not storing local paths, remove them from the documents
            if not self.store_local_paths:
                for review in processed_docs.values():
                    review.pop("local_images", None)
                    review.pop("local_profile_picture", None)

            # If not preserving original URLs, remove them from the documents
            if self.replace_urls and not self.preserve_original_urls:
                for review in processed_docs.values():
                    review.pop("original_image_urls", None)
                    review.pop("original_profile_picture", None)

        # Add custom parameters to each document
        if self.custom_params:
            log.info(f"Adding custom parameters to {len(processed_docs)} documents")
            for review in processed_docs.values():
                review.update(self.custom_params)

        # Datetimes are turned into ISO strings during serialization, so
        # values nested inside sub-documents are covered as well as top-level
        # fields, and no document needs to be rewritten beforehand.
        payload = json.dumps(
            list(processed_docs.values()),
            ensure_ascii=False,
            indent=2,
            default=self._json_default,
        )
        self.json_path.write_text(payload, encoding="utf-8")

    def load_seen(self) -> Set[str]:
        """Load the set of already seen review IDs (empty if no file exists)."""
        if not self.seen_ids_path.exists():
            return set()
        lines = self.seen_ids_path.read_text(encoding="utf-8").splitlines()
        # Blank lines would otherwise show up as an empty-string "ID".
        return {line for line in lines if line}

    def save_seen(self, ids: Set[str]):
        """Save the set of already seen review IDs, one per line."""
        # Sorted so the file content is stable from one run to the next.
        self.seen_ids_path.write_text("\n".join(sorted(ids)), encoding="utf-8")
def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, Any]:
    """
    Merge a raw review with an existing review document.
    Creates a new document if existing is None.

    A non-empty ``existing`` document is updated in place and returned; legacy
    field names are migrated to the current ones on the way. Text, likes,
    images, profile picture and owner response are only ever added or
    upgraded, never removed.

    Args:
        existing: Stored review document, or None/empty for a new review.
        raw: Freshly scraped review.

    Returns:
        The merged review document, stamped with ``last_modified_date``.
    """
    if not existing:
        # Create a new review with the updated field names
        existing = {
            "review_id": raw.id,
            "author": raw.author,
            "rating": raw.rating,
            "description": {},  # renamed from "texts"
            "likes": raw.likes,
            "user_images": list(raw.photos),  # renamed from "photo_urls"
            "author_profile_url": raw.profile,  # renamed from "profile_link"
            "profile_picture": raw.avatar,  # renamed from "avatar_url"
            "owner_responses": {},
            "created_date": get_current_iso_date(),
            "review_date": parse_relative_date(raw.date, RAW_LANG),
        }
    else:
        # Handle existing reviews with old field names - migrate them
        legacy_names = (
            ("texts", "description"),
            ("photo_urls", "user_images"),
            ("profile_link", "author_profile_url"),
            ("avatar_url", "profile_picture"),
        )
        for old_name, new_name in legacy_names:
            if old_name in existing and new_name not in existing:
                existing[new_name] = existing.pop(old_name)
        # Add ISO dates if not present
        if "created_date" not in existing:
            existing["created_date"] = get_current_iso_date()
        if "review_date" not in existing:
            existing["review_date"] = parse_relative_date(raw.date, RAW_LANG)
        # Remove the 'date' field if it exists
        existing.pop("date", None)

    if raw.text:
        # A stored document may have neither "texts" nor "description" (or a
        # null value); start an empty mapping instead of raising.
        if existing.get("description") is None:
            existing["description"] = {}
        existing["description"][raw.lang] = raw.text

    if not existing.get("rating"):
        existing["rating"] = raw.rating

    # Treat a missing or null like count as zero on either side.
    if (raw.likes or 0) > (existing.get("likes") or 0):
        existing["likes"] = raw.likes

    # Update the images list: de-duplicate while keeping first-seen order so
    # the stored list does not get reshuffled from one run to the next.
    existing["user_images"] = list(
        dict.fromkeys([*(existing.get("user_images") or []), *raw.photos])
    )

    # Update avatar/profile picture: fill it in when missing, or replace it
    # when the scraped URL is longer than the stored one.
    current_picture = existing.get("profile_picture")
    if raw.avatar and (not current_picture or len(raw.avatar) > len(current_picture)):
        existing["profile_picture"] = raw.avatar

    if raw.owner_text:
        lang = detect_lang(raw.owner_text)
        # Don't store the date string in owner_responses
        existing.setdefault("owner_responses", {})[lang] = {
            "text": raw.owner_text,
        }

    # Update last_modified timestamp
    existing["last_modified_date"] = get_current_iso_date()
    return existing
def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
    """
    Merge ``raw`` into ``existing`` with optional translation bookkeeping.

    The regular merge always runs first. When ``append_translations`` is set,
    an existing document was supplied and the raw review carries text, the
    text is stored under its language, the owner response (if any) is stored
    under its detected language, and an entry is appended to
    ``translation_history``.
    """
    # The base merge provides everything that is not translation specific.
    result = merge_review(existing, raw)

    # Outside translation mode there is nothing more to do.
    if not (append_translations and existing and raw.text):
        return result

    # Translation mode: always record this language version, even when the
    # review already has content.
    descriptions = result["description"]
    descriptions[raw.lang] = raw.text

    # Owner responses are merged per detected language as well.
    if raw.owner_text:
        reply_lang = detect_lang(raw.owner_text)
        owner_replies = result.setdefault("owner_responses", {})
        owner_replies[reply_lang] = {
            "text": raw.owner_text,
        }

    # Keep a record of when this translation was added.
    history_entry = {
        "language": raw.lang,
        "added_date": get_current_iso_date(),
        "source": "regional_scraping"
    }
    result.setdefault("translation_history", []).append(history_entry)
    return result