Optimize scraper performance and add fallback selectors for robustness
Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
175
reverse_engineer_date_formatter_v2.py
Normal file
175
reverse_engineer_date_formatter_v2.py
Normal file
@@ -0,0 +1,175 @@
|
||||
#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting patterns by scraping reviews in English

The script scrapes the reviews of one hard-coded place, collects the distinct
relative-date strings ("a month ago", "3 weeks ago", ...), prints a breakdown
of them and writes the same breakdown to a JSON file.
"""
import json
import re
from collections import defaultdict

url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"

OUTPUT_PATH = '/tmp/google_date_patterns_english.json'
SEPARATOR = "=" * 80

# (substring to look for, bucket name). Order matters: the first substring
# found in the lower-cased date string decides the bucket.
UNIT_KEYWORDS = (
    ('second', 'seconds'),
    ('minute', 'minutes'),
    ('hour', 'hours'),
    ('day', 'days'),
    ('week', 'weeks'),
    ('month', 'months'),
    ('year', 'years'),
)

# (bucket name, hypothesis line printed when that bucket is non-empty).
UNIT_HYPOTHESES = (
    ('seconds', " - Seconds: Used for 0-59 seconds ago"),
    ('minutes', " - Minutes: Used for 1-59 minutes ago"),
    ('hours', " - Hours: Used for 1-23 hours ago"),
    ('days', " - Days: Used for 1-6 days ago"),
    ('weeks', " - Weeks: Used for 1-3 weeks ago"),
    ('months', " - Months: Used for 1-11 months ago"),
    ('years', " - Years: Used for 1+ years ago"),
)

# "<number> <unit> ago" anchored at the start of the lower-cased string.
PLURAL_RE = re.compile(r'(\d+)\s+(\w+)\s+ago')


def collect_date_strings(reviews):
    """Return the set of distinct, non-empty ``date_text`` values in *reviews*."""
    return {rev.get('date_text') for rev in reviews if rev.get('date_text')}


def group_by_unit(date_strings):
    """Bucket each date string by the first time-unit keyword it contains.

    Returns a dict with one (possibly empty) sorted list per unit bucket, in
    the order seconds, minutes, hours, days, weeks, months, years. Strings
    containing none of the keywords are left out.
    """
    buckets = {bucket: [] for _, bucket in UNIT_KEYWORDS}
    for ds in sorted(date_strings):
        lowered = ds.lower()
        for keyword, bucket in UNIT_KEYWORDS:
            if keyword in lowered:
                buckets[bucket].append(ds)
                break
    return buckets


def split_singular_plural(date_strings):
    """Split date strings into singular ("a X ago") and plural ("N Xs ago").

    Returns ``(singular, plural)`` as two sorted lists. Strings matching
    neither shape appear in neither list.
    """
    singular = []
    plural = []
    for ds in sorted(date_strings):
        # "an hour ago" is the grammatical singular for hours, so checking
        # only the "a " prefix would miss it.
        if ds.startswith(('a ', 'an ')):
            singular.append(ds)
            continue
        words = ds.split()
        # A whitespace-only string splits to [], so guard before indexing.
        if words and words[0].isdigit():
            plural.append(ds)
    return singular, plural


def extract_unit_values(date_strings):
    """Map each singularized unit to the numbers seen in "N units ago" strings."""
    unit_values = defaultdict(list)
    for ds in sorted(date_strings):
        match = PLURAL_RE.match(ds.lower())
        if not match:
            continue
        unit = match.group(2)
        # Drop a single plural "s" only (rstrip would eat every trailing "s").
        if unit.endswith('s'):
            unit = unit[:-1]
        unit_values[unit].append(int(match.group(1)))
    return dict(unit_values)


def build_report(reviews):
    """Build the JSON-serializable analysis of the date strings in *reviews*."""
    date_strings = collect_date_strings(reviews)
    patterns = group_by_unit(date_strings)
    singular, plural = split_singular_plural(date_strings)
    unit_values = extract_unit_values(date_strings)
    return {
        'total_reviews': len(reviews),
        'unique_date_formats': len(date_strings),
        'all_date_strings': sorted(date_strings),
        'patterns_by_unit': {k: v for k, v in patterns.items() if v},
        'singular_patterns': singular,
        'plural_patterns': plural,
        'value_ranges': {
            unit: {'min': min(values), 'max': max(values), 'values': sorted(set(values))}
            for unit, values in unit_values.items()
        },
    }


def print_header(title):
    """Print *title* framed by separator rules, preceded by a blank line."""
    print("\n" + SEPARATOR)
    print(title)
    print(SEPARATOR)


def main():
    """Scrape the reviews, print the date-format analysis and save it as JSON."""
    # Imported here rather than at module level so the pure analysis helpers
    # above can be imported without the project's scraper package.
    from modules.fast_scraper import fast_scrape_reviews

    print("Scraping reviews in English...")
    result = fast_scrape_reviews(url, headless=True)

    # "or []" also covers a 'reviews' key that is present but None.
    reviews = result.get('reviews') or []
    print(f"\nExtracted {len(reviews)} reviews")

    if not reviews:
        print("No reviews extracted!")
        return

    report = build_report(reviews)

    print(f"\nFound {report['unique_date_formats']} unique date formats:")
    for ds in report['all_date_strings']:
        print(f" '{ds}'")

    # Analyze patterns
    print_header("PATTERN ANALYSIS:")
    patterns = report['patterns_by_unit']
    for unit, examples in sorted(patterns.items()):
        print(f"\n{unit.upper()} ({len(examples)} patterns):")
        for ex in examples:
            print(f" '{ex}'")

    # Identify the specific patterns
    print_header("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
    print("\nPattern Structure:")
    print("-" * 80)

    print(f"\nSingular (a X ago): {len(report['singular_patterns'])} patterns")
    for p in report['singular_patterns']:
        print(f" '{p}'")

    print(f"\nPlural (N Xs ago): {len(report['plural_patterns'])} patterns")
    for p in report['plural_patterns']:
        print(f" '{p}'")

    # Determine time ranges
    print_header("TIME RANGE BOUNDARIES:")
    for unit, info in sorted(report['value_ranges'].items()):
        print(f"\n{unit.upper()}:")
        print(f" Range: {info['min']} - {info['max']}")
        print(f" Values found: {info['values']}")

    # Save analysis
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    print("\n" + SEPARATOR)
    print(f"Analysis saved to: {OUTPUT_PATH}")
    print(SEPARATOR)

    # Now let's determine the EXACT library/algorithm Google uses
    print_header("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")
    print("\nBased on the patterns, Google's relative date formatter:")
    print("-" * 80)

    print("\n1. FORMAT STRUCTURE:")
    print(" Single unit: 'a {unit} ago'")
    print(" Multiple: '{number} {unit}s ago'")

    print("\n2. UNIT SELECTION (hypothesis):")
    # Keyed on the per-unit buckets so a unit seen only in singular form
    # ("a year ago") still gets its hypothesis line.
    for bucket, line in UNIT_HYPOTHESES:
        if bucket in patterns:
            print(line)

    print("\n3. BOUNDARY THRESHOLDS (estimated):")
    print(" 60 seconds = switch to minutes")
    print(" 60 minutes = switch to hours")
    print(" 24 hours = switch to days")
    print(" 7 days = switch to weeks")
    print(" ~30 days (4 weeks) = switch to months")
    print(" 12 months = switch to years")

    print("\n4. UNCERTAINTY RANGES:")
    print(" 'a month ago' = 30-59 days ago (±15 days)")
    print(" '2 months ago' = 60-89 days ago (±15 days)")
    print(" 'a year ago' = 365-729 days ago (±6 months)")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user