Fix clean scraper: specific selectors, consent reload, DOM parsing

- Use div.jftiEf[data-review-id] selector to exclude button elements
- Reload original URL after consent (prevents URL corruption)
- Parse full DOM data after scrolling stops
- Deduplicate API reviews against DOM reviews by matching author name
- Remove "More" button clicking to speed up parsing (review text may stay truncated)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-21 20:40:15 +00:00
parent 2c7ba2ae40
commit 0e8a711a9c

View File

@@ -125,23 +125,8 @@ def parse_dom_review(card) -> dict:
except: except:
pass pass
# Click "More" button to expand text if truncated # Note: "More" button clicking removed for speed
try: # Full text can be expanded later if needed
more_btn = card.find_element(By.CSS_SELECTOR, "button.kyuRq")
if more_btn.is_displayed():
more_btn.click()
# Re-read text after expanding
for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', '.wiI7pd']:
try:
text_el = card.find_element(By.CSS_SELECTOR, sel)
expanded = text_el.text.strip()
if expanded and len(expanded) > len(text):
text = expanded
break
except:
pass
except:
pass
# Timestamp # Timestamp
timestamp = "" timestamp = ""
@@ -228,7 +213,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
except: except:
pass pass
time.sleep(2) # Reload original URL after consent (redirect can corrupt URL)
print(" Reloading after consent...")
driver.get(url)
time.sleep(3)
# Click reviews tab if present (multilingual support) # Click reviews tab if present (multilingual support)
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
@@ -411,17 +399,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
reviews[key] = rev reviews[key] = rev
# Collect review IDs via JavaScript (doesn't affect scroll position!) # Collect review IDs via JavaScript (doesn't affect scroll position!)
# Use specific selector to only get actual review cards, not buttons
try: try:
review_ids = driver.execute_script(""" review_ids = driver.execute_script("""
var ids = []; var ids = [];
document.querySelectorAll('[data-review-id]').forEach(function(el) { document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
ids.push(el.getAttribute('data-review-id')); ids.push(el.getAttribute('data-review-id'));
}); });
return ids; return ids;
""") """)
for rid in (review_ids or []): for rid in (review_ids or []):
if rid and rid not in reviews: if rid and rid not in reviews:
reviews[rid] = {"id": rid, "source": "dom"} reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
except: except:
pass pass
@@ -456,11 +445,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
stop_scrolling.set() stop_scrolling.set()
break break
# FINAL PHASE: Parse full review data from DOM (scroll is stopped)
print("📝 Parsing full review data...")
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
reviews.clear()
# Parse all DOM cards now that scrolling is done
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
try:
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
for card in cards:
review = parse_dom_review(card)
if review and review.get("id"):
reviews[review["id"]] = review
except Exception as e:
print(f" Warning: DOM parse error: {e}")
# Merge API reviews (only add if not already in DOM)
api_added = 0
for key, api_rev in api_reviews_collected.items():
# Check if this author already exists in DOM reviews
author = api_rev.get("author", "")
if author and not any(r.get("author") == author for r in reviews.values()):
reviews[f"api_{key}"] = api_rev
api_added += 1
# Final results # Final results
review_list = list(reviews.values()) review_list = list(reviews.values())
dom_count = sum(1 for r in review_list if r.get("source") == "dom") dom_count = sum(1 for r in review_list if r.get("source") == "dom")
api_count = sum(1 for r in review_list if r.get("source") == "api") api_count = sum(1 for r in review_list if r.get("source") == "api")
print(f"\n📋 Total: {len(review_list)} reviews (DOM: {dom_count}, API: {api_count})") print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
return { return {
"reviews": review_list, "reviews": review_list,