Full text + deduplication: API parser + More button expansion

- Fix API parser to use correct Google Maps response structure
  - Review ID at [0], Author at [1][4][5][0], Rating at [2][0][0]
  - Text at [2][15][0][0], Timestamp at [1][6]
- Use review_id as key for both API and DOM to avoid duplicates
- Prefer API data (original language, full text)
- Expand "More" buttons before sorting and during scroll loop
- Results: 246/247 full text (99.6%), up from 36/247 before

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 13:09:40 +00:00
parent b4fae38027
commit 7abff25dc6

View File

@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:
def extract_reviews_from_api_body(body: str) -> list: def extract_reviews_from_api_body(body: str) -> list:
"""Extract reviews from API response body.""" """Extract reviews from API response body using correct Google Maps structure."""
reviews = [] reviews = []
try: try:
# Remove )]}' prefix # Remove )]}' prefix
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:
data = json.loads(body) data = json.loads(body)
# Recursively find review arrays # Google Maps API structure: data[2] contains review arrays
def find_reviews(obj, depth=0): # Each review: data[2][X][0] where:
if depth > 12: # Author: [1][4][5][0]
return # Rating: [2][0][0]
if isinstance(obj, list): # Text: [2][15][0][0]
# Check if this looks like a review # Time: [1][6]
if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int): if not isinstance(data, list) or len(data) < 3:
if 1 <= obj[4] <= 5: return reviews
rev = parse_api_review(obj)
if rev and rev["author"]:
reviews.append(rev)
return
for item in obj:
find_reviews(item, depth + 1)
elif isinstance(obj, dict):
for v in obj.values():
find_reviews(v, depth + 1)
find_reviews(data) reviews_area = data[2]
if not isinstance(reviews_area, list):
return reviews
for item in reviews_area:
try:
if not isinstance(item, list) or len(item) < 1:
continue
review_data = item[0]
if not isinstance(review_data, list) or len(review_data) < 3:
continue
# Extract fields using correct paths
review_id = ""
author = ""
rating = 0
text = ""
timestamp = ""
# Review ID: [0] - same format as DOM's data-review-id
try:
review_id = review_data[0]
except (IndexError, TypeError):
pass
# Author: [1][4][5][0]
try:
author = review_data[1][4][5][0]
except (IndexError, TypeError):
pass
# Rating: [2][0][0]
try:
rating = review_data[2][0][0]
except (IndexError, TypeError):
pass
# Text: [2][15][0][0]
try:
text = review_data[2][15][0][0]
except (IndexError, TypeError):
pass
# Timestamp: [1][6]
try:
timestamp = review_data[1][6]
except (IndexError, TypeError):
pass
# Validate and add (include review_id for deduplication)
if author and isinstance(rating, int) and 1 <= rating <= 5:
reviews.append({
"review_id": review_id,
"author": author,
"text": text or "",
"rating": rating,
"timestamp": timestamp or "",
"source": "api"
})
except:
continue
except: except:
pass pass
return reviews return reviews
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
except: except:
pass pass
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
try:
expanded = driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
var count = 0;
for (var i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === 'More') {
buttons[i].click();
count++;
}
}
return count;
""")
if expanded > 0:
print(f" 📝 Expanded {expanded} truncated reviews")
except:
pass
# Block images to speed up scrolling (use CDP) # Block images to speed up scrolling (use CDP)
try: try:
driver.execute_cdp_cmd('Network.setBlockedURLs', { driver.execute_cdp_cmd('Network.setBlockedURLs', {
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
cycle_delta = t0 - cycle_start cycle_delta = t0 - cycle_start
cycle_start = t0 cycle_start = t0
# Collect from API (doesn't affect scroll) # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
# Use review_id as key to avoid duplicates with DOM
t1 = time.time() t1 = time.time()
for rev in get_api_reviews(): for rev in get_api_reviews():
key = f"api_{rev['author'][:20]}_{rev['rating']}" rid = rev.get('review_id', '')
if key not in seen_ids: if rid and rid not in seen_ids:
reviews[key] = rev reviews[rid] = rev
seen_ids.add(key) seen_ids.add(rid)
api_time = time.time() - t1 api_time = time.time() - t1
# Expand any new "More" buttons for full text (batch click, fast)
try:
driver.execute_script("""
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
for (var i = 0; i < buttons.length; i++) {
if (buttons[i].textContent.trim() === 'More') {
buttons[i].click();
}
}
""")
except:
pass
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes # This survives Google's CSS class name changes
# Also removes separators from previously-hidden cards to keep DOM light # Also removes separators from previously-hidden cards to keep DOM light