From 218927bd9bb190f32e7dbf1408291a7710564275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Wed, 21 Jan 2026 20:47:08 +0000 Subject: [PATCH] Filter out garbage API data (language codes, metadata) - Reject authors with <= 3 chars (language codes like "es", "it", "no") - Reject known non-review authors ("google", "maps", etc.) - Reject timestamps that are URLs or very short strings Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 69fcdd2..a52592e 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -25,6 +25,15 @@ def parse_api_review(raw: list) -> dict: if not (1 <= rating <= 5): return None + # Filter out garbage data (language codes, metadata, etc.) + if len(author) <= 3: # Real names are longer than 3 chars + return None + if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']: + return None + # Timestamp should look like a date, not a URL or language code + if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3): + return None + # Owner response owner_response = None for idx in [9, 18]: