Full text + deduplication: API parser + More button expansion
- Fix API parser to use correct Google Maps response structure
  - Review ID at [0], Author at [1][4][5][0], Rating at [2][0][0]
  - Text at [2][15][0][0], Timestamp at [1][6]
- Use review_id as key for both API and DOM to avoid duplicates
- Prefer API data (original language, full text)
- Expand "More" buttons before sorting and during scroll loop
- Results: 246/247 full text (99.6%), up from 36/247 before

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:
|
||||
|
||||
|
||||
def extract_reviews_from_api_body(body: str) -> list:
|
||||
"""Extract reviews from API response body."""
|
||||
"""Extract reviews from API response body using correct Google Maps structure."""
|
||||
reviews = []
|
||||
try:
|
||||
# Remove )]}' prefix
|
||||
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:
|
||||
|
||||
data = json.loads(body)
|
||||
|
||||
# Recursively find review arrays
|
||||
def find_reviews(obj, depth=0):
|
||||
if depth > 12:
|
||||
return
|
||||
if isinstance(obj, list):
|
||||
# Check if this looks like a review
|
||||
if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int):
|
||||
if 1 <= obj[4] <= 5:
|
||||
rev = parse_api_review(obj)
|
||||
if rev and rev["author"]:
|
||||
reviews.append(rev)
|
||||
return
|
||||
for item in obj:
|
||||
find_reviews(item, depth + 1)
|
||||
elif isinstance(obj, dict):
|
||||
for v in obj.values():
|
||||
find_reviews(v, depth + 1)
|
||||
# Google Maps API structure: data[2] contains review arrays
|
||||
# Each review: data[2][X][0] where:
|
||||
# Author: [1][4][5][0]
|
||||
# Rating: [2][0][0]
|
||||
# Text: [2][15][0][0]
|
||||
# Time: [1][6]
|
||||
if not isinstance(data, list) or len(data) < 3:
|
||||
return reviews
|
||||
|
||||
find_reviews(data)
|
||||
reviews_area = data[2]
|
||||
if not isinstance(reviews_area, list):
|
||||
return reviews
|
||||
|
||||
for item in reviews_area:
|
||||
try:
|
||||
if not isinstance(item, list) or len(item) < 1:
|
||||
continue
|
||||
review_data = item[0]
|
||||
if not isinstance(review_data, list) or len(review_data) < 3:
|
||||
continue
|
||||
|
||||
# Extract fields using correct paths
|
||||
review_id = ""
|
||||
author = ""
|
||||
rating = 0
|
||||
text = ""
|
||||
timestamp = ""
|
||||
|
||||
# Review ID: [0] - same format as DOM's data-review-id
|
||||
try:
|
||||
review_id = review_data[0]
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
# Author: [1][4][5][0]
|
||||
try:
|
||||
author = review_data[1][4][5][0]
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
# Rating: [2][0][0]
|
||||
try:
|
||||
rating = review_data[2][0][0]
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
# Text: [2][15][0][0]
|
||||
try:
|
||||
text = review_data[2][15][0][0]
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
# Timestamp: [1][6]
|
||||
try:
|
||||
timestamp = review_data[1][6]
|
||||
except (IndexError, TypeError):
|
||||
pass
|
||||
|
||||
# Validate and add (include review_id for deduplication)
|
||||
if author and isinstance(rating, int) and 1 <= rating <= 5:
|
||||
reviews.append({
|
||||
"review_id": review_id,
|
||||
"author": author,
|
||||
"text": text or "",
|
||||
"rating": rating,
|
||||
"timestamp": timestamp or "",
|
||||
"source": "api"
|
||||
})
|
||||
except:
|
||||
continue
|
||||
except:
|
||||
pass
|
||||
return reviews
|
||||
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
except:
|
||||
pass
|
||||
|
||||
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
|
||||
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
|
||||
try:
|
||||
expanded = driver.execute_script("""
|
||||
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||
var count = 0;
|
||||
for (var i = 0; i < buttons.length; i++) {
|
||||
if (buttons[i].textContent.trim() === 'More') {
|
||||
buttons[i].click();
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
""")
|
||||
if expanded > 0:
|
||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Block images to speed up scrolling (use CDP)
|
||||
try:
|
||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
cycle_delta = t0 - cycle_start
|
||||
cycle_start = t0
|
||||
|
||||
# Collect from API (doesn't affect scroll)
|
||||
# Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
|
||||
# Use review_id as key to avoid duplicates with DOM
|
||||
t1 = time.time()
|
||||
for rev in get_api_reviews():
|
||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||
if key not in seen_ids:
|
||||
reviews[key] = rev
|
||||
seen_ids.add(key)
|
||||
rid = rev.get('review_id', '')
|
||||
if rid and rid not in seen_ids:
|
||||
reviews[rid] = rev
|
||||
seen_ids.add(rid)
|
||||
api_time = time.time() - t1
|
||||
|
||||
# Expand any new "More" buttons for full text (batch click, fast)
|
||||
try:
|
||||
driver.execute_script("""
|
||||
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||
for (var i = 0; i < buttons.length; i++) {
|
||||
if (buttons[i].textContent.trim() === 'More') {
|
||||
buttons[i].click();
|
||||
}
|
||||
}
|
||||
""")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||
# This survives Google's CSS class name changes
|
||||
# Also removes separators from previously-hidden cards to keep DOM light
|
||||
|
||||
Reference in New Issue
Block a user