Full text + deduplication: API parser + More button expansion
- Fix API parser to use correct Google Maps response structure - Review ID at [0], Author at [1][4][5][0], Rating at [2][0][0] - Text at [2][15][0][0], Timestamp at [1][6] - Use review_id as key for both API and DOM to avoid duplicates - Prefer API data (original language, full text) - Expand "More" buttons before sorting and during scroll loop - Results: 246/247 full text (99.6%), down from 36/247 before Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
def extract_reviews_from_api_body(body: str) -> list:
|
def extract_reviews_from_api_body(body: str) -> list:
|
||||||
"""Extract reviews from API response body."""
|
"""Extract reviews from API response body using correct Google Maps structure."""
|
||||||
reviews = []
|
reviews = []
|
||||||
try:
|
try:
|
||||||
# Remove )]}' prefix
|
# Remove )]}' prefix
|
||||||
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:
|
|||||||
|
|
||||||
data = json.loads(body)
|
data = json.loads(body)
|
||||||
|
|
||||||
# Recursively find review arrays
|
# Google Maps API structure: data[2] contains review arrays
|
||||||
def find_reviews(obj, depth=0):
|
# Each review: data[2][X][0] where:
|
||||||
if depth > 12:
|
# Author: [1][4][5][0]
|
||||||
return
|
# Rating: [2][0][0]
|
||||||
if isinstance(obj, list):
|
# Text: [2][15][0][0]
|
||||||
# Check if this looks like a review
|
# Time: [1][6]
|
||||||
if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int):
|
if not isinstance(data, list) or len(data) < 3:
|
||||||
if 1 <= obj[4] <= 5:
|
return reviews
|
||||||
rev = parse_api_review(obj)
|
|
||||||
if rev and rev["author"]:
|
|
||||||
reviews.append(rev)
|
|
||||||
return
|
|
||||||
for item in obj:
|
|
||||||
find_reviews(item, depth + 1)
|
|
||||||
elif isinstance(obj, dict):
|
|
||||||
for v in obj.values():
|
|
||||||
find_reviews(v, depth + 1)
|
|
||||||
|
|
||||||
find_reviews(data)
|
reviews_area = data[2]
|
||||||
|
if not isinstance(reviews_area, list):
|
||||||
|
return reviews
|
||||||
|
|
||||||
|
for item in reviews_area:
|
||||||
|
try:
|
||||||
|
if not isinstance(item, list) or len(item) < 1:
|
||||||
|
continue
|
||||||
|
review_data = item[0]
|
||||||
|
if not isinstance(review_data, list) or len(review_data) < 3:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract fields using correct paths
|
||||||
|
review_id = ""
|
||||||
|
author = ""
|
||||||
|
rating = 0
|
||||||
|
text = ""
|
||||||
|
timestamp = ""
|
||||||
|
|
||||||
|
# Review ID: [0] - same format as DOM's data-review-id
|
||||||
|
try:
|
||||||
|
review_id = review_data[0]
|
||||||
|
except (IndexError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Author: [1][4][5][0]
|
||||||
|
try:
|
||||||
|
author = review_data[1][4][5][0]
|
||||||
|
except (IndexError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Rating: [2][0][0]
|
||||||
|
try:
|
||||||
|
rating = review_data[2][0][0]
|
||||||
|
except (IndexError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Text: [2][15][0][0]
|
||||||
|
try:
|
||||||
|
text = review_data[2][15][0][0]
|
||||||
|
except (IndexError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Timestamp: [1][6]
|
||||||
|
try:
|
||||||
|
timestamp = review_data[1][6]
|
||||||
|
except (IndexError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Validate and add (include review_id for deduplication)
|
||||||
|
if author and isinstance(rating, int) and 1 <= rating <= 5:
|
||||||
|
reviews.append({
|
||||||
|
"review_id": review_id,
|
||||||
|
"author": author,
|
||||||
|
"text": text or "",
|
||||||
|
"rating": rating,
|
||||||
|
"timestamp": timestamp or "",
|
||||||
|
"source": "api"
|
||||||
|
})
|
||||||
|
except:
|
||||||
|
continue
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return reviews
|
return reviews
|
||||||
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
|
||||||
|
# This batch-clicks all "More" buttons at once (fast, no waiting per button)
|
||||||
|
try:
|
||||||
|
expanded = driver.execute_script("""
|
||||||
|
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||||
|
var count = 0;
|
||||||
|
for (var i = 0; i < buttons.length; i++) {
|
||||||
|
if (buttons[i].textContent.trim() === 'More') {
|
||||||
|
buttons[i].click();
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
""")
|
||||||
|
if expanded > 0:
|
||||||
|
print(f" 📝 Expanded {expanded} truncated reviews")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Block images to speed up scrolling (use CDP)
|
# Block images to speed up scrolling (use CDP)
|
||||||
try:
|
try:
|
||||||
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
driver.execute_cdp_cmd('Network.setBlockedURLs', {
|
||||||
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
cycle_delta = t0 - cycle_start
|
cycle_delta = t0 - cycle_start
|
||||||
cycle_start = t0
|
cycle_start = t0
|
||||||
|
|
||||||
# Collect from API (doesn't affect scroll)
|
# Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
|
||||||
|
# Use review_id as key to avoid duplicates with DOM
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
for rev in get_api_reviews():
|
for rev in get_api_reviews():
|
||||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
rid = rev.get('review_id', '')
|
||||||
if key not in seen_ids:
|
if rid and rid not in seen_ids:
|
||||||
reviews[key] = rev
|
reviews[rid] = rev
|
||||||
seen_ids.add(key)
|
seen_ids.add(rid)
|
||||||
api_time = time.time() - t1
|
api_time = time.time() - t1
|
||||||
|
|
||||||
|
# Expand any new "More" buttons for full text (batch click, fast)
|
||||||
|
try:
|
||||||
|
driver.execute_script("""
|
||||||
|
var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
|
||||||
|
for (var i = 0; i < buttons.length; i++) {
|
||||||
|
if (buttons[i].textContent.trim() === 'More') {
|
||||||
|
buttons[i].click();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
""")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||||
# This survives Google's CSS class name changes
|
# This survives Google's CSS class name changes
|
||||||
# Also removes separators from previously-hidden cards to keep DOM light
|
# Also removes separators from previously-hidden cards to keep DOM light
|
||||||
|
|||||||
Reference in New Issue
Block a user