Files
whyrating-engine-legacy/reverse_engineer_date_formatter_v2.py
Alejandro Gutiérrez faa0704737 Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:49:24 +00:00

176 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting patterns by scraping reviews in English
"""
import json
import re
from collections import defaultdict

from modules.fast_scraper import fast_scrape_reviews
# Google Maps place whose reviews are scraped; `hl=en` requests the English UI
# so the relative dates come back as "N units ago" strings.
REVIEWS_URL = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"

# Where the JSON summary of the analysis is written.
OUTPUT_PATH = '/tmp/google_date_patterns_english.json'

SECTION_RULE = "=" * 80

# (keyword, bucket) pairs tested in this order; the first keyword found in the
# lower-cased date string decides the bucket.
_UNIT_KEYWORDS = (
    ('second', 'seconds'),
    ('minute', 'minutes'),
    ('hour', 'hours'),
    ('day', 'days'),
    ('week', 'weeks'),
    ('month', 'months'),
    ('year', 'years'),
)

# Hypothesis line printed for each bucket that was actually observed.
_UNIT_HYPOTHESES = (
    ('seconds', " - Seconds: Used for 0-59 seconds ago"),
    ('minutes', " - Minutes: Used for 1-59 minutes ago"),
    ('hours', " - Hours: Used for 1-23 hours ago"),
    ('days', " - Days: Used for 1-6 days ago"),
    ('weeks', " - Weeks: Used for 1-3 weeks ago"),
    ('months', " - Months: Used for 1-11 months ago"),
    ('years', " - Years: Used for 1+ years ago"),
)

# "<number> <unit> ago", anchored at the start of the (lower-cased) string.
_NUMERIC_AGO = re.compile(r'(\d+)\s+(\w+)\s+ago')


def collect_date_strings(reviews):
    """Return the set of distinct, truthy ``date_text`` values in *reviews*."""
    found = set()
    for rev in reviews:
        date_text = rev.get('date_text')
        if date_text:
            found.add(date_text)
    return found


def group_by_unit(date_strings):
    """Bucket date strings by the first time-unit keyword they contain.

    Returns a dict with one (possibly empty) list per bucket in
    ``_UNIT_KEYWORDS``; strings that mention no known unit are dropped.
    """
    buckets = {bucket: [] for _, bucket in _UNIT_KEYWORDS}
    # Sorted iteration keeps the result independent of set ordering.
    for ds in sorted(date_strings):
        lowered = ds.lower()
        for keyword, bucket in _UNIT_KEYWORDS:
            if keyword in lowered:
                buckets[bucket].append(ds)
                break
    return buckets


def split_singular_plural(date_strings):
    """Split date strings into ``(singular, plural)`` sorted lists.

    Singular strings start with the article "a " or "an " (the latter covers
    "an hour ago"); plural strings start with an all-digit token. Anything
    else, including whitespace-only strings, lands in neither list.
    """
    singular = []
    plural = []
    for ds in sorted(date_strings):
        if ds.startswith(('a ', 'an ')):
            singular.append(ds)
            continue
        tokens = ds.split()
        # `tokens` is empty for whitespace-only input; guard before indexing.
        if tokens and tokens[0].isdigit():
            plural.append(ds)
    return singular, plural


def extract_unit_values(date_strings):
    """Map each unit to the numbers seen in "<N> <unit>s ago" strings.

    The unit key has trailing "s" characters stripped ("months" -> "month").
    """
    unit_values = defaultdict(list)
    for ds in sorted(date_strings):
        match = _NUMERIC_AGO.match(ds.lower())
        if match:
            unit = match.group(2).rstrip('s')
            unit_values[unit].append(int(match.group(1)))
    return dict(unit_values)


def analyze_reviews(reviews):
    """Build the JSON-serialisable summary of date formats found in *reviews*.

    The returned dict has the keys ``total_reviews``, ``unique_date_formats``,
    ``all_date_strings``, ``patterns_by_unit``, ``singular_patterns``,
    ``plural_patterns`` and ``value_ranges``.
    """
    date_strings = collect_date_strings(reviews)
    singular, plural = split_singular_plural(date_strings)
    return {
        'total_reviews': len(reviews),
        'unique_date_formats': len(date_strings),
        'all_date_strings': sorted(date_strings),
        'patterns_by_unit': {
            bucket: examples
            for bucket, examples in group_by_unit(date_strings).items()
            if examples
        },
        'singular_patterns': singular,
        'plural_patterns': plural,
        'value_ranges': {
            unit: {
                'min': min(values),
                'max': max(values),
                'values': sorted(set(values)),
            }
            for unit, values in extract_unit_values(date_strings).items()
        },
    }


def unit_hypothesis_lines(patterns_by_unit):
    """Return the unit-selection hypothesis lines for the observed buckets.

    A bucket counts as observed when it has at least one example in
    *patterns_by_unit*, whether that example is singular ("an hour ago") or
    numeric ("3 hours ago").
    """
    return [
        line
        for bucket, line in _UNIT_HYPOTHESES
        if patterns_by_unit.get(bucket)
    ]


def print_pattern_report(analysis):
    """Print the date strings, per-unit groups, singular/plural split and ranges."""
    date_strings = analysis['all_date_strings']
    print(f"\nFound {len(date_strings)} unique date formats:")
    for ds in date_strings:
        print(f" '{ds}'")

    print("\n" + SECTION_RULE)
    print("PATTERN ANALYSIS:")
    print(SECTION_RULE)
    for unit, examples in sorted(analysis['patterns_by_unit'].items()):
        print(f"\n{unit.upper()} ({len(examples)} patterns):")
        for ex in examples:
            print(f" '{ex}'")

    print("\n" + SECTION_RULE)
    print("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
    print(SECTION_RULE)
    print("\nPattern Structure:")
    print("-" * 80)
    singular = analysis['singular_patterns']
    plural = analysis['plural_patterns']
    print(f"\nSingular (a X ago): {len(singular)} patterns")
    for p in singular:
        print(f" '{p}'")
    print(f"\nPlural (N Xs ago): {len(plural)} patterns")
    for p in plural:
        print(f" '{p}'")

    print("\n" + SECTION_RULE)
    print("TIME RANGE BOUNDARIES:")
    print(SECTION_RULE)
    for unit, stats in sorted(analysis['value_ranges'].items()):
        print(f"\n{unit.upper()}:")
        print(f" Range: {stats['min']} - {stats['max']}")
        print(f" Values found: {stats['values']}")


def print_algorithm_hypothesis(patterns_by_unit):
    """Print the hypothesised formatting rules, listing only observed units."""
    print("\n" + SECTION_RULE)
    print("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")
    print(SECTION_RULE)
    print("\nBased on the patterns, Google's relative date formatter:")
    print("-" * 80)
    print("\n1. FORMAT STRUCTURE:")
    print(" Single unit: 'a {unit} ago'")
    print(" Multiple: '{number} {unit}s ago'")
    print("\n2. UNIT SELECTION (hypothesis):")
    for line in unit_hypothesis_lines(patterns_by_unit):
        print(line)
    print("\n3. BOUNDARY THRESHOLDS (estimated):")
    print(" 60 seconds = switch to minutes")
    print(" 60 minutes = switch to hours")
    print(" 24 hours = switch to days")
    print(" 7 days = switch to weeks")
    print(" ~30 days (4 weeks) = switch to months")
    print(" 12 months = switch to years")
    print("\n4. UNCERTAINTY RANGES:")
    print(" 'a month ago' = 30-59 days ago (±15 days)")
    print(" '2 months ago' = 60-89 days ago (±15 days)")
    print(" 'a year ago' = 365-729 days ago (±6 months)")


def main():
    """Scrape the reviews, print the pattern analysis and save it as JSON."""
    print("Scraping reviews in English...")
    result = fast_scrape_reviews(REVIEWS_URL, headless=True)
    # `or []` also covers a 'reviews' key that is present but None.
    reviews = result.get('reviews') or []
    print(f"\nExtracted {len(reviews)} reviews")
    if not reviews:
        print("No reviews extracted!")
        return

    analysis = analyze_reviews(reviews)
    print_pattern_report(analysis)

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(analysis, f, indent=2)
    print("\n" + SECTION_RULE)
    print(f"Analysis saved to: {OUTPUT_PATH}")
    print(SECTION_RULE)

    print_algorithm_hypothesis(analysis['patterns_by_unit'])


if __name__ == "__main__":
    main()