diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 69fcdd2..a52592e 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -25,6 +25,15 @@ def parse_api_review(raw: list) -> dict: if not (1 <= rating <= 5): return None + # Filter out garbage data (language codes, metadata, etc.) + if len(author) <= 3: # Real names are longer than 3 chars + return None + if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']: + return None + # Timestamp should look like a date, not a URL or language code + if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3): + return None + # Owner response owner_response = None for idx in [9, 18]: