#!/usr/bin/env python3
"""
Reverse-engineer Google's date formatting patterns by scraping reviews in English.

The script scrapes the reviews of one Google Maps place, collects every
distinct ``date_text`` string, groups the strings by time unit and by
singular/plural form, prints the findings, and saves them as JSON.

Run it as a script; importing the module only defines the helpers below and
does not start a scrape.
"""
import json
import re
from collections import defaultdict

url = "https://www.google.com/maps/place/Soho+Club/data=!4m7!3m6!1s0x46dd947294b213bf:0x864c7a232527adb4!8m2!3d54.67869!4d25.2667181!16s%2Fg%2F1thhj5ml!19sChIJvxOylHKU3UYRtK0nJSN6TIY?authuser=0&hl=en&rclk=1"

# Destination of the JSON report (also echoed to the console at the end).
OUTPUT_PATH = '/tmp/google_date_patterns_english.json'

# (report key, keyword) pairs, checked in this order: the first keyword found
# in a date string decides its bucket.
UNIT_KEYWORDS = (
    ('seconds', 'second'),
    ('minutes', 'minute'),
    ('hours', 'hour'),
    ('days', 'day'),
    ('weeks', 'week'),
    ('months', 'month'),
    ('years', 'year'),
)

# Hypothesis line printed for each unit that showed up with a numeric count.
UNIT_HYPOTHESES = (
    ('second', " - Seconds: Used for 0-59 seconds ago"),
    ('minute', " - Minutes: Used for 1-59 minutes ago"),
    ('hour', " - Hours: Used for 1-23 hours ago"),
    ('day', " - Days: Used for 1-6 days ago"),
    ('week', " - Weeks: Used for 1-3 weeks ago"),
    ('month', " - Months: Used for 1-11 months ago"),
    ('year', " - Years: Used for 1+ years ago"),
)

# "<number> <unit> ago", anchored at the start of the (lower-cased) string.
_PLURAL_RE = re.compile(r'(\d+)\s+(\w+)\s+ago')


def collect_date_strings(reviews):
    """Return the set of distinct, non-empty ``date_text`` values in *reviews*.

    Each review is expected to be a mapping; reviews without a ``date_text``
    key, or with a falsy value for it, are ignored.
    """
    texts = (rev.get('date_text') for rev in reviews)
    return {text for text in texts if text}


def group_by_unit(date_strings):
    """Bucket *date_strings* by the first time-unit keyword they contain.

    Returns a dict with one list per key of ``UNIT_KEYWORDS`` (always all
    seven keys, possibly empty). Matching is case-insensitive; strings that
    contain none of the keywords are dropped.
    """
    patterns = {key: [] for key, _ in UNIT_KEYWORDS}
    for ds in date_strings:
        ds_lower = ds.lower()
        for key, keyword in UNIT_KEYWORDS:
            if keyword in ds_lower:
                patterns[key].append(ds)
                break
    return patterns


def split_singular_plural(date_strings):
    """Split *date_strings* into singular and plural relative dates.

    Singular strings start with the article "a " or "an " ("a month ago",
    "an hour ago"); plural strings start with a digit-only token
    ("3 months ago"). Anything else is left out of both lists.

    Returns ``(singular, plural)``, each sorted alphabetically.
    """
    singular, plural = [], []
    for ds in sorted(date_strings):
        # "an " must be accepted too, otherwise "an hour ago" is lost.
        if ds.startswith(('a ', 'an ')):
            singular.append(ds)
            continue
        words = ds.split()
        # Whitespace-only text splits to an empty list; don't index into it.
        if words and words[0].isdigit():
            plural.append(ds)
    return singular, plural


def extract_unit_values(date_strings):
    """Map each unit to the numbers seen in "<number> <unit> ago" strings.

    The unit is lower-cased and stripped of its trailing plural "s"
    ("months" -> "month"). Strings with an article instead of a number
    ("a month ago") do not contribute a value.
    """
    unit_values = defaultdict(list)
    for ds in date_strings:
        match = _PLURAL_RE.match(ds.lower())
        if match:
            unit = match.group(2).rstrip('s')  # Remove plural 's'
            unit_values[unit].append(int(match.group(1)))
    return dict(unit_values)


def _print_banner(title):
    """Print *title* framed by two 80-character rules, preceded by a blank line."""
    print("\n" + "="*80)
    print(title)
    print("="*80)


def _print_quoted(items):
    """Print each item of *items* in sorted order, wrapped in single quotes."""
    for item in sorted(items):
        print(f" '{item}'")


def main():
    """Scrape the reviews, print the date-format analysis and save it as JSON."""
    # Imported here rather than at module level so the pure helpers above can
    # be imported without the scraper package and its dependencies.
    from modules.fast_scraper import fast_scrape_reviews

    print("Scraping reviews in English...")
    result = fast_scrape_reviews(url, headless=True)

    reviews = result.get('reviews', [])
    print(f"\nExtracted {len(reviews)} reviews")

    if not reviews:
        print("No reviews extracted!")
        return

    # Collect all unique date strings
    date_strings = collect_date_strings(reviews)
    print(f"\nFound {len(date_strings)} unique date formats:")
    _print_quoted(date_strings)

    # Analyze patterns
    _print_banner("PATTERN ANALYSIS:")
    patterns = group_by_unit(date_strings)
    for unit, examples in sorted(patterns.items()):
        if examples:
            print(f"\n{unit.upper()} ({len(examples)} patterns):")
            _print_quoted(examples)

    # Identify the specific patterns
    _print_banner("GOOGLE MAPS DATE FORMAT PATTERNS (English):")
    print("\nPattern Structure:")
    print("-" * 80)

    single_unit_patterns, plural_patterns = split_singular_plural(date_strings)

    print(f"\nSingular (a X ago): {len(single_unit_patterns)} patterns")
    _print_quoted(single_unit_patterns)

    print(f"\nPlural (N Xs ago): {len(plural_patterns)} patterns")
    _print_quoted(plural_patterns)

    # Determine time ranges
    _print_banner("TIME RANGE BOUNDARIES:")
    unit_values = extract_unit_values(date_strings)
    for unit, values in sorted(unit_values.items()):
        if values:
            print(f"\n{unit.upper()}:")
            print(f" Range: {min(values)} - {max(values)}")
            print(f" Values found: {sorted(set(values))}")

    # Save analysis
    output = {
        'total_reviews': len(reviews),
        'unique_date_formats': len(date_strings),
        'all_date_strings': sorted(date_strings),
        'patterns_by_unit': {k: sorted(v) for k, v in patterns.items() if v},
        'singular_patterns': sorted(single_unit_patterns),
        'plural_patterns': sorted(plural_patterns),
        'value_ranges': {
            unit: {
                'min': min(values),
                'max': max(values),
                'values': sorted(set(values)),
            }
            for unit, values in unit_values.items()
            if values
        },
    }

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)

    _print_banner(f"Analysis saved to: {OUTPUT_PATH}")

    # Now let's determine the EXACT library/algorithm Google uses
    _print_banner("REVERSE-ENGINEERING GOOGLE'S ALGORITHM:")

    print("\nBased on the patterns, Google's relative date formatter:")
    print("-" * 80)

    print("\n1. FORMAT STRUCTURE:")
    print(" Single unit: 'a {unit} ago'")
    print(" Multiple: '{number} {unit}s ago'")

    print("\n2. UNIT SELECTION (hypothesis):")
    # Only units that appeared with a numeric count get a hypothesis line.
    for unit, line in UNIT_HYPOTHESES:
        if unit in unit_values:
            print(line)

    print("\n3. BOUNDARY THRESHOLDS (estimated):")
    print(" 60 seconds = switch to minutes")
    print(" 60 minutes = switch to hours")
    print(" 24 hours = switch to days")
    print(" 7 days = switch to weeks")
    print(" ~30 days (4 weeks) = switch to months")
    print(" 12 months = switch to years")

    print("\n4. UNCERTAINTY RANGES:")
    print(" 'a month ago' = 30-59 days ago (±15 days)")
    print(" '2 months ago' = 60-89 days ago (±15 days)")
    print(" 'a year ago' = 365-729 days ago (±6 months)")


if __name__ == "__main__":
    main()