Phase 0: Project restructure to ReviewIQ platform architecture

New structure:
- scrapers/google_reviews/v1_0_0.py (was modules/scraper_clean.py)
- scrapers/base.py (BaseScraper interface)
- scrapers/registry.py (ScraperRegistry for version routing)
- core/database.py, models.py, config.py, enums.py
- utils/logger.py, crash_analyzer.py, health_checks.py, helpers.py, date_converter.py
- workers/chrome_pool.py
- services/webhook_service.py
- api/ routes structure (empty, ready for Phase 2)
- tests/ structure mirroring source

All imports updated in:
- api_server_production.py (7 import paths updated)
- utils/health_checks.py (scraper import path)

Legacy modules moved to modules/_legacy/:
- data_storage.py, image_handler.py, s3_handler.py (unused)

Syntax verified, frontend build passing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 15:22:08 +00:00
parent bb0291f265
commit 544e028c3f
37 changed files with 5782 additions and 30 deletions

307
utils/helpers.py Normal file
View File

@@ -0,0 +1,307 @@
"""
Utility functions for Google Maps Reviews Scraper.
"""
import datetime
import logging
import re
import time
from datetime import timezone
from functools import lru_cache
from typing import List
from selenium.common.exceptions import (NoSuchElementException,
StaleElementReferenceException,
TimeoutException)
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Module-level logger. It is looked up under the fixed name "scraper" rather
# than __name__, so every helper in this file logs to that one named logger.
log = logging.getLogger("scraper")
# Character-class patterns used by detect_lang(): the Hebrew (U+0590-U+05FF)
# and Thai (U+0E00-U+0E7F) Unicode ranges.
HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")


@lru_cache(maxsize=1024)
def detect_lang(txt: str) -> str:
    """Guess the language code of *txt* from the scripts it contains.

    Hebrew is checked before Thai, so text containing both scripts is
    reported as Hebrew. Anything with neither script (including the empty
    string) is reported as English.

    Args:
        txt: Text to classify.

    Returns:
        "he", "th" or "en".
    """
    # Order matters: the first script that matches wins.
    for pattern, code in ((HEB_CHARS, "he"), (THAI_CHARS, "th")):
        if pattern.search(txt) is not None:
            return code
    return "en"
@lru_cache(maxsize=128)
def safe_int(s: str | None) -> int:
"""Safely convert string to integer, returning 0 if not possible"""
m = re.search(r"\d+", s or "")
return int(m.group()) if m else 0
def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
    """Look up descendants of *el* by CSS selector, never raising on a miss.

    Args:
        el: Element to search under.
        css: CSS selector to match.
        all: When true, return every match; otherwise return at most one.

    Returns:
        A list of matching elements. The list is empty when nothing matches
        or when the lookup raises NoSuchElementException or
        StaleElementReferenceException.
    """
    try:
        if not all:
            match = el.find_element(By.CSS_SELECTOR, css)
            # Wrap the single hit so callers always receive a list.
            return [match] if match else []
        return el.find_elements(By.CSS_SELECTOR, css)
    except (NoSuchElementException, StaleElementReferenceException):
        return []
def first_text(el: WebElement, css: str) -> str:
    """Return the stripped text of the first match that has any text.

    Args:
        el: Element to search under.
        css: CSS selector for candidate elements.

    Returns:
        The first non-empty, whitespace-stripped text among the matches, or
        "" when no match has text. Matches that raise
        StaleElementReferenceException while being read are skipped.
    """
    candidates = try_find(el, css, all=True)
    for node in candidates:
        try:
            stripped = node.text.strip()
        except StaleElementReferenceException:
            # This node is unreadable; move on to the next candidate.
            continue
        if stripped:
            return stripped
    return ""
def parse_date_to_iso(date_str: str) -> str:
    """Convert a relative date such as "2 weeks ago" to an ISO-8601 string.

    The result is an approximation measured back from the current UTC time
    (truncated to whole seconds). Months are treated as 30 days and years
    as 365 days. When the text has no number (e.g. "a month ago"), a
    quantity of 1 is assumed.

    Absolute dates (e.g. "January 2023") and "ago" strings with no known
    unit are not parsed yet; for those the current UTC time is returned.

    Args:
        date_str: Human-readable date text.

    Returns:
        A timezone-aware ISO-8601 timestamp, or "" when *date_str* is empty
        or cannot be processed.
    """
    if not date_str:
        return ""

    # Keyword -> length of one unit. Checked in this order; first hit wins.
    units = (
        ("minute", datetime.timedelta(minutes=1)),
        ("hour", datetime.timedelta(hours=1)),
        ("day", datetime.timedelta(days=1)),
        ("week", datetime.timedelta(weeks=1)),
        ("month", datetime.timedelta(days=30)),   # approximation
        ("year", datetime.timedelta(days=365)),   # approximation
    )

    try:
        # The module imports `datetime` as a module, so the class must be
        # reached as datetime.datetime and timedelta as datetime.timedelta.
        now = datetime.datetime.now(timezone.utc).replace(microsecond=0)
        lowered = date_str.lower()

        if "ago" not in lowered:
            # Absolute dates are not parsed; fall back to "now".
            return now.isoformat()

        number = re.search(r"\d+", date_str)
        amount = int(number.group()) if number else 1

        for keyword, unit in units:
            if keyword in lowered:
                return (now - amount * unit).isoformat()

        # "ago" with no recognised unit: fall back to "now".
        return now.isoformat()
    except (AttributeError, OverflowError, TypeError, ValueError):
        # Non-string input, or an offset too large to represent as a date.
        # Deliberately narrow so that programming errors are not hidden.
        return ""
def first_attr(el: WebElement, css: str, attr: str) -> str:
    """Return the first non-empty value of *attr* among matching elements.

    Args:
        el: Element to search under.
        css: CSS selector for candidate elements.
        attr: Name of the attribute to read from each candidate.

    Returns:
        The first whitespace-stripped, non-empty attribute value, or ""
        when no candidate has one. Candidates that raise
        StaleElementReferenceException while being read are skipped.
    """
    candidates = try_find(el, css, all=True)
    for node in candidates:
        try:
            raw = node.get_attribute(attr)
            # A missing attribute comes back falsy; normalise it to "".
            value = (raw or "").strip()
        except StaleElementReferenceException:
            continue
        if value:
            return value
    return ""
def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
    """Best-effort click on an element matching *css*.

    Strategy:
      1. If no element matches right now, give up immediately.
      2. Otherwise click the first match that is displayed and enabled.
      3. If none of the current matches could be clicked, wait up to
         *timeout* seconds for a match to become clickable and click that.

    Args:
        driver: WebDriver instance.
        css: CSS selector for the element to click.
        delay: Seconds to sleep after a successful click.
        timeout: Maximum seconds to wait in the fallback step.

    Returns:
        True if an element was clicked, False otherwise. Unexpected errors
        are logged at debug level and reported as False.
    """
    try:
        locator = (By.CSS_SELECTOR, css)
        candidates = driver.find_elements(*locator)
        if len(candidates) == 0:
            return False

        # Direct attempt: first candidate that is visible and enabled.
        for candidate in candidates:
            try:
                clickable = candidate.is_displayed() and candidate.is_enabled()
                if not clickable:
                    continue
                candidate.click()
                time.sleep(delay)
                return True
            except Exception:
                # This candidate failed; fall through to the next one.
                continue

        # Fallback: wait for any match to become clickable.
        try:
            waiter = WebDriverWait(driver, timeout)
            target = waiter.until(EC.element_to_be_clickable(locator))
            target.click()
            time.sleep(delay)
            return True
        except TimeoutException:
            return False
    except Exception as e:
        log.debug(f"Error in click_if: {str(e)}")
        return False
def get_current_iso_date() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    # Imported locally under an alias so the lookup does not depend on how
    # the module-level `datetime` name is bound.
    import datetime as _dt

    utc_now = _dt.datetime.now(tz=_dt.timezone.utc)
    return utc_now.isoformat()
# """
# Utility functions for Google Maps Reviews Scraper.
# """
#
# import re
# import time
# import logging
# from datetime import datetime, timezone
# from functools import lru_cache
# from typing import List, Optional
#
# from selenium.common.exceptions import (NoSuchElementException,
# StaleElementReferenceException,
# TimeoutException)
# from selenium.webdriver import Chrome
# from selenium.webdriver.common.by import By
# from selenium.webdriver.remote.webelement import WebElement
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.ui import WebDriverWait
#
# # Constants for language detection
# HEB_CHARS = re.compile(r"[\u0590-\u05FF]")
# THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")
#
# # Logger
# log = logging.getLogger("scraper")
#
#
# @lru_cache(maxsize=1024)
# def detect_lang(txt: str) -> str:
# """Detect language based on character sets"""
# if HEB_CHARS.search(txt): return "he"
# if THAI_CHARS.search(txt): return "th"
# return "en"
#
#
# @lru_cache(maxsize=128)
# def safe_int(s: str | None) -> int:
# """Safely convert string to integer, returning 0 if not possible"""
# m = re.search(r"\d+", s or "")
# return int(m.group()) if m else 0
#
#
# def try_find(el: WebElement, css: str, *, all=False) -> List[WebElement]:
# """Safely find elements by CSS selector without raising exceptions"""
# try:
# if all:
# return el.find_elements(By.CSS_SELECTOR, css)
# obj = el.find_element(By.CSS_SELECTOR, css)
# return [obj] if obj else []
# except (NoSuchElementException, StaleElementReferenceException):
# return []
#
#
# def first_text(el: WebElement, css: str) -> str:
# """Get text from the first matching element that has non-empty text"""
# for e in try_find(el, css, all=True):
# if (t := e.text.strip()):
# return t
# return ""
#
#
# def first_attr(el: WebElement, css: str, attr: str) -> str:
# """Get attribute value from the first matching element that has a non-empty value"""
# for e in try_find(el, css, all=True):
# if (v := (e.get_attribute(attr) or "").strip()):
# return v
# return ""
#
#
# def click_if(driver: Chrome, css: str, delay: float = .25, timeout: float = 5.0) -> bool:
# """Click element if it exists and is clickable, with timeout"""
# try:
# WebDriverWait(driver, timeout).until(
# EC.element_to_be_clickable((By.CSS_SELECTOR, css))
# ).click()
# time.sleep(delay)
# return True
# except TimeoutException:
# return False
#
#
# def parse_date_to_iso(date_str: str) -> str:
# """
# Parse date strings like "2 weeks ago", "January 2023", etc. into ISO format.
# Returns a best-effort ISO string, or empty string if parsing fails.
# """
# if not date_str:
# return ""
#
# try:
# now = datetime.now(timezone.utc)
#
# # Handle relative dates
# if "ago" in date_str.lower():
# # For simplicity, map to approximate dates
# if "minute" in date_str.lower():
# minutes = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(minutes=minutes)
# elif "hour" in date_str.lower():
# hours = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(hours=hours)
# elif "day" in date_str.lower():
# days = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(days=days)
# elif "week" in date_str.lower():
# weeks = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# dt = now.replace(microsecond=0) - timezone.timedelta(weeks=weeks)
# elif "month" in date_str.lower():
# months = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate months as 30 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=30 * months)
# elif "year" in date_str.lower():
# years = int(re.search(r'\d+', date_str).group()) if re.search(r'\d+', date_str) else 1
# # Approximate years as 365 days
# dt = now.replace(microsecond=0) - timezone.timedelta(days=365 * years)
# else:
# # Default to current time if can't parse
# dt = now.replace(microsecond=0)
# else:
# # Handle absolute dates (month year format)
# # This is a simplification - would need more robust parsing for production
# dt = now.replace(microsecond=0)
#
# return dt.isoformat()
# except Exception:
# # If parsing fails, return empty string
# return ""
#
#
# def get_current_iso_date() -> str:
# """Return current UTC time in ISO format."""
# return datetime.now(timezone.utc).isoformat()