New structure: - scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py) - scrapers/base.py (BaseScraper interface) - scrapers/registry.py (ScraperRegistry for version routing) - core/database.py, models.py, config.py, enums.py - utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py - workers/chrome_pool.py - services/webhook_service.py - api/ routes structure (empty, ready for Phase 2) - tests/ structure mirroring source All imports updated in: - api_server_production.py (7 import paths updated) - utils/health_checks.py (scraper import path) Legacy modules moved to modules/_legacy/: - data_storage.py, image_handler.py, s3_handler.py (unused) Syntax verified, frontend build passing. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
392 lines
15 KiB
Python
392 lines
15 KiB
Python
"""
|
|
Date conversion utilities for Google Maps reviews.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from datetime import datetime, timedelta
|
|
from typing import Dict, Any, Optional
|
|
|
|
# Logger
|
|
log = logging.getLogger("scraper")
|
|
|
|
|
|
def relative_to_datetime(date_str: str, lang: str = "en") -> Optional[datetime]:
|
|
"""
|
|
Convert a relative date string to a datetime object.
|
|
|
|
Args:
|
|
date_str: The relative date string (e.g., "2 years ago")
|
|
lang: Language code ("en" or "he")
|
|
|
|
Returns:
|
|
datetime object or None if conversion fails
|
|
"""
|
|
if not date_str:
|
|
return None
|
|
|
|
try:
|
|
# Convert to ISO format first
|
|
iso_date = parse_relative_date(date_str, lang)
|
|
|
|
# If original string was returned, it wasn't in the expected format
|
|
if iso_date == date_str:
|
|
return None
|
|
|
|
# Parse the ISO format into datetime
|
|
return datetime.fromisoformat(iso_date)
|
|
except Exception as e:
|
|
log.debug(f"Failed to convert relative date '{date_str}': {e}")
|
|
return None
|
|
|
|
|
|
class DateConverter:
|
|
"""Handler for converting string dates to datetime objects in MongoDB"""
|
|
|
|
@staticmethod
|
|
def convert_dates_in_document(doc: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Convert string dates to datetime objects in a document.
|
|
|
|
Args:
|
|
doc: MongoDB document with string dates
|
|
|
|
Returns:
|
|
Document with string dates converted to datetime objects
|
|
"""
|
|
# Remove the original date string field if it exists
|
|
if "date" in doc:
|
|
original_date = doc.pop("date")
|
|
|
|
# Try to use the original date to fix review_date if needed
|
|
if "review_date" not in doc or not doc["review_date"]:
|
|
lang = next(iter(doc.get("description", {}).keys()), "en")
|
|
date_obj = relative_to_datetime(original_date, lang)
|
|
if date_obj:
|
|
doc["review_date"] = date_obj
|
|
|
|
# Fields that should be converted to dates
|
|
date_fields = ["created_date", "last_modified_date", "review_date"]
|
|
|
|
# Convert date fields to datetime
|
|
for field in date_fields:
|
|
if field in doc and isinstance(doc[field], str):
|
|
try:
|
|
# Try to parse as ISO format first
|
|
doc[field] = datetime.fromisoformat(doc[field].replace('Z', '+00:00'))
|
|
except (ValueError, TypeError):
|
|
# If that fails, try parsing as relative date
|
|
lang = next(iter(doc.get("description", {}).keys()), "en")
|
|
date_obj = relative_to_datetime(doc[field], lang)
|
|
if date_obj:
|
|
doc[field] = date_obj
|
|
|
|
# Handle nested date fields in owner_responses
|
|
if "owner_responses" in doc and isinstance(doc["owner_responses"], dict):
|
|
for lang, response in doc["owner_responses"].items():
|
|
if isinstance(response, dict) and "date" in response:
|
|
# Remove the date string field from owner responses
|
|
del response["date"]
|
|
|
|
return doc
|
|
|
|
@staticmethod
|
|
def convert_dates_in_reviews(reviews: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
|
|
"""
|
|
Convert string dates to datetime objects for all reviews.
|
|
|
|
Args:
|
|
reviews: Dictionary of review documents
|
|
|
|
Returns:
|
|
Reviews with dates converted to datetime objects
|
|
"""
|
|
log.info("Converting string dates to datetime objects...")
|
|
|
|
for review_id, review in reviews.items():
|
|
reviews[review_id] = DateConverter.convert_dates_in_document(review)
|
|
|
|
return reviews
|
|
|
|
|
|
def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
|
|
"""
|
|
Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
|
|
into an ISO formatted datetime string (UTC).
|
|
|
|
For English, supported formats include:
|
|
- "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
|
|
For Hebrew, supported formats include:
|
|
- "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
|
|
"לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
|
|
|
|
Parameters:
|
|
- date_str (str): the relative date string.
|
|
- lang (str): "en" for English or "he" for Hebrew.
|
|
- now (Optional[datetime]): reference datetime; if None, current local time is used.
|
|
|
|
Returns:
|
|
A string representing the calculated absolute datetime in ISO 8601 format.
|
|
If parsing fails in all supported languages, returns a random date within the last year.
|
|
"""
|
|
import random
|
|
|
|
if now is None:
|
|
now = datetime.utcnow() # use UTC for consistency
|
|
|
|
# Try with the provided language first
|
|
result = try_parse_date(date_str, lang, now)
|
|
if result != date_str:
|
|
return result
|
|
|
|
# If the provided language failed, try other supported languages
|
|
supported_langs = ["en", "he", "th"]
|
|
for alt_lang in supported_langs:
|
|
if alt_lang != lang.lower():
|
|
result = try_parse_date(date_str, alt_lang, now)
|
|
if result != date_str:
|
|
return result
|
|
|
|
# If all parsing attempts failed, generate a random date within the last year
|
|
# This creates a date between 1 day ago and 365 days ago
|
|
random_days_ago = random.randint(1, 365)
|
|
random_date = now - timedelta(days=random_days_ago)
|
|
return random_date.isoformat()
|
|
|
|
|
|
def try_parse_date(date_str: str, lang: str, now: datetime) -> str:
|
|
"""
|
|
Helper function that attempts to parse a date string in a specific language.
|
|
|
|
Returns the ISO formatted date if successful, or the original string if not.
|
|
"""
|
|
delta = timedelta(0)
|
|
parsed = False
|
|
|
|
if lang.lower() == "en":
|
|
# Pattern: capture number or "a"/"an", then unit.
|
|
pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
|
|
m = pattern.search(date_str)
|
|
if m:
|
|
num_str = m.group("num").lower()
|
|
num = 1 if num_str in ("a", "an") else int(num_str)
|
|
unit = m.group("unit").lower()
|
|
if unit == "day":
|
|
delta = timedelta(days=num)
|
|
elif unit == "week":
|
|
delta = timedelta(weeks=num)
|
|
elif unit == "month":
|
|
delta = timedelta(days=30 * num) # approximate
|
|
elif unit == "year":
|
|
delta = timedelta(days=365 * num) # approximate
|
|
parsed = True
|
|
elif lang.lower() == "he":
|
|
# Remove the "לפני" prefix if present
|
|
text = date_str.strip()
|
|
if text.startswith("לפני"):
|
|
text = text[len("לפני"):].strip()
|
|
|
|
# Handle special cases where the number and unit are combined:
|
|
special = {
|
|
"חודשיים": (2, "month"),
|
|
"שבועיים": (2, "week"),
|
|
"יומיים": (2, "day"),
|
|
}
|
|
if text in special:
|
|
num, unit = special[text]
|
|
if unit == "day":
|
|
delta = timedelta(days=num)
|
|
elif unit == "week":
|
|
delta = timedelta(weeks=num)
|
|
elif unit == "month":
|
|
delta = timedelta(days=30 * num) # approximate
|
|
parsed = True
|
|
else:
|
|
# Match optional number (or assume 1) and then a unit.
|
|
pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
|
|
re.IGNORECASE)
|
|
m = pattern.search(text)
|
|
if m:
|
|
num_str = m.group("num")
|
|
if not num_str:
|
|
num = 1
|
|
else:
|
|
try:
|
|
num = int(num_str)
|
|
except ValueError:
|
|
num = 1
|
|
unit_he = m.group("unit")
|
|
# Map the Hebrew unit (both singular and plural) to English unit names
|
|
if unit_he in ("יום", "ימים"):
|
|
unit = "day"
|
|
elif unit_he in ("שבוע", "שבועות"):
|
|
unit = "week"
|
|
elif unit_he in ("חודש", "חודשים"):
|
|
unit = "month"
|
|
elif unit_he in ("שנה", "שנים"):
|
|
unit = "year"
|
|
else:
|
|
unit = "day" # fallback
|
|
|
|
if unit == "day":
|
|
delta = timedelta(days=num)
|
|
elif unit == "week":
|
|
delta = timedelta(weeks=num)
|
|
elif unit == "month":
|
|
delta = timedelta(days=30 * num) # approximate
|
|
elif unit == "year":
|
|
delta = timedelta(days=365 * num) # approximate
|
|
parsed = True
|
|
elif lang.lower() == "th":
|
|
# Thai language patterns (simplified)
|
|
# Check for Thai patterns like "3 วันที่แล้ว" (3 days ago)
|
|
thai_pattern = re.compile(r'(?P<num>\d+)?\s*(?P<unit>วัน|สัปดาห์|เดือน|ปี)ที่แล้ว', re.IGNORECASE)
|
|
m = thai_pattern.search(date_str)
|
|
if m:
|
|
num_str = m.group("num")
|
|
num = 1 if not num_str else int(num_str)
|
|
unit_th = m.group("unit")
|
|
|
|
# Map Thai units to English
|
|
if unit_th == "วัน":
|
|
unit = "day"
|
|
elif unit_th == "สัปดาห์":
|
|
unit = "week"
|
|
elif unit_th == "เดือน":
|
|
unit = "month"
|
|
elif unit_th == "ปี":
|
|
unit = "year"
|
|
else:
|
|
unit = "day" # fallback
|
|
|
|
if unit == "day":
|
|
delta = timedelta(days=num)
|
|
elif unit == "week":
|
|
delta = timedelta(weeks=num)
|
|
elif unit == "month":
|
|
delta = timedelta(days=30 * num) # approximate
|
|
elif unit == "year":
|
|
delta = timedelta(days=365 * num) # approximate
|
|
parsed = True
|
|
|
|
# Return the calculated date if parsing was successful, otherwise return the original string
|
|
if parsed:
|
|
result = now - delta
|
|
return result.isoformat()
|
|
else:
|
|
return date_str
|
|
|
|
|
|
# def parse_relative_date(date_str: str, lang: str, now: Optional[datetime] = None) -> str:
|
|
# """
|
|
# Converts a relative review_date (in English or Hebrew) such as "a week ago" or "לפני 7 שנים"
|
|
# into an ISO formatted datetime string (UTC).
|
|
#
|
|
# For English, supported formats include:
|
|
# - "a day ago", "an hour ago", "3 weeks ago", "4 months ago", "2 years ago", etc.
|
|
# For Hebrew, supported formats include:
|
|
# - "לפני יום", "לפני 2 ימים", "לפני שבוע", "לפני שבועיים", "לפני חודש",
|
|
# "לפני חודשיים", "לפני 10 חודשים", "לפני שנה", "לפני 3 שנים", etc.
|
|
#
|
|
# Parameters:
|
|
# - date_str (str): the relative date string.
|
|
# - lang (str): "en" for English or "he" for Hebrew.
|
|
# - now (Optional[datetime]): reference datetime; if None, current local time is used.
|
|
#
|
|
# Returns:
|
|
# A string representing the calculated absolute datetime in ISO 8601 format,
|
|
# or the original date_str if parsing fails.
|
|
# """
|
|
# if now is None:
|
|
# now = datetime.utcnow() # use UTC for consistency
|
|
#
|
|
# delta = timedelta(0)
|
|
#
|
|
# if lang.lower() == "en":
|
|
# # Pattern: capture number or "a"/"an", then unit.
|
|
# pattern = re.compile(r'(?P<num>a|an|\d+)\s+(?P<unit>day|week|month|year)s?\s+ago', re.IGNORECASE)
|
|
# m = pattern.search(date_str)
|
|
# if m:
|
|
# num_str = m.group("num").lower()
|
|
# num = 1 if num_str in ("a", "an") else int(num_str)
|
|
# unit = m.group("unit").lower()
|
|
# if unit == "day":
|
|
# delta = timedelta(days=num)
|
|
# elif unit == "week":
|
|
# delta = timedelta(weeks=num)
|
|
# elif unit == "month":
|
|
# delta = timedelta(days=30 * num) # approximate
|
|
# elif unit == "year":
|
|
# delta = timedelta(days=365 * num) # approximate
|
|
# else:
|
|
# return date_str # return original if not matched
|
|
# elif lang.lower() == "he":
|
|
# # Remove the "לפני" prefix if present
|
|
# text = date_str.strip()
|
|
# if text.startswith("לפני"):
|
|
# text = text[len("לפני"):].strip()
|
|
#
|
|
# # Handle special cases where the number and unit are combined:
|
|
# special = {
|
|
# "חודשיים": (2, "month"),
|
|
# "שבועיים": (2, "week"),
|
|
# "יומיים": (2, "day"),
|
|
# }
|
|
# if text in special:
|
|
# num, unit = special[text]
|
|
# else:
|
|
# # Match optional number (or assume 1) and then a unit.
|
|
# pattern = re.compile(r'(?P<num>\d+|אחד|אחת)?\s*(?P<unit>שנה|שנים|חודש|חודשים|יום|ימים|שבוע|שבועות)',
|
|
# re.IGNORECASE)
|
|
# m = pattern.search(text)
|
|
# if m:
|
|
# num_str = m.group("num")
|
|
# if not num_str:
|
|
# num = 1
|
|
# else:
|
|
# try:
|
|
# num = int(num_str)
|
|
# except ValueError:
|
|
# num = 1
|
|
# unit_he = m.group("unit")
|
|
# # Map the Hebrew unit (both singular and plural) to English unit names
|
|
# if unit_he in ("יום", "ימים"):
|
|
# unit = "day"
|
|
# elif unit_he in ("שבוע", "שבועות"):
|
|
# unit = "week"
|
|
# elif unit_he in ("חודש", "חודשים"):
|
|
# unit = "month"
|
|
# elif unit_he in ("שנה", "שנים"):
|
|
# unit = "year"
|
|
# else:
|
|
# unit = "day" # fallback
|
|
# else:
|
|
# return date_str # if nothing matches, return original text
|
|
#
|
|
# if unit == "day":
|
|
# delta = timedelta(days=num)
|
|
# elif unit == "week":
|
|
# delta = timedelta(weeks=num)
|
|
# elif unit == "month":
|
|
# delta = timedelta(days=30 * num) # approximate
|
|
# elif unit == "year":
|
|
# delta = timedelta(days=365 * num) # approximate
|
|
#
|
|
# result = now - delta
|
|
# return result.isoformat()
|
|
|
|
|
|
# --- Example usage ---
|
|
if __name__ == "__main__":
|
|
# Fixed reference time for reproducibility:
|
|
fixed_now = datetime(2025, 2, 5, 12, 0, 0)
|
|
examples = [
|
|
("a week ago", "he"),
|
|
("4 weeks ago", "en"),
|
|
("לפני 7 שנים", "he"),
|
|
("לפני חודשיים", "he")
|
|
]
|
|
for text, lang in examples:
|
|
iso_date = parse_relative_date(text, lang, now=fixed_now)
|
|
print(f"Original: {text} ({lang}) => ISO: {iso_date}")
|