Filter out garbage API data (language codes, metadata)

- Reject author names of 3 characters or fewer (e.g. language codes like "es", "it", "no")
- Reject known non-review authors ("google", "maps", etc.)
- Reject timestamps that contain "http" (i.e. URLs) or are 3 characters or fewer

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-21 20:47:08 +00:00
parent 0e8a711a9c
commit 218927bd9b

View File

@@ -25,6 +25,15 @@ def parse_api_review(raw: list) -> dict:
if not (1 <= rating <= 5):
return None
# Filter out garbage data (language codes, metadata, etc.)
if len(author) <= 3: # Real names are longer than 3 chars
return None
if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']:
return None
# Timestamp should look like a date, not a URL or language code
if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3):
return None
# Owner response
owner_response = None
for idx in [9, 18]: