Filter out garbage API data (language codes, metadata)
- Reject author names of 3 characters or fewer (language codes like "es", "it", "no")
- Reject known non-review authors ("google", "maps", etc.)
- Reject timestamps that contain "http" (URLs) or are 3 characters or fewer
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -25,6 +25,15 @@ def parse_api_review(raw: list) -> dict:
|
||||
if not (1 <= rating <= 5):
|
||||
return None
|
||||
|
||||
# Filter out garbage data (language codes, metadata, etc.)
|
||||
if len(author) <= 3: # Real names are longer than 3 chars
|
||||
return None
|
||||
if author.lower() in ['google', 'maps', 'reviews', 'es', 'en', 'it', 'no', 'de', 'fr', 'pt']:
|
||||
return None
|
||||
# Timestamp should look like a date, not a URL or language code
|
||||
if timestamp and ('http' in str(timestamp) or len(str(timestamp)) <= 3):
|
||||
return None
|
||||
|
||||
# Owner response
|
||||
owner_response = None
|
||||
for idx in [9, 18]:
|
||||
|
||||
Reference in New Issue
Block a user