From 0e8a711a9c5d73a4f9148dcf7e1bb13f6a1fbf82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Wed, 21 Jan 2026 20:40:15 +0000 Subject: [PATCH] Fix clean scraper: specific selectors, consent reload, DOM parsing - Use div.jftiEf[data-review-id] selector to exclude button elements - Reload original URL after consent (prevents URL corruption) - Parse full DOM data after scrolling stops - Deduplicate API reviews by author match - Remove slow "More" button clicking for speed Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 56 +++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 49945f0..69fcdd2 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -125,23 +125,8 @@ def parse_dom_review(card) -> dict: except: pass - # Click "More" button to expand text if truncated - try: - more_btn = card.find_element(By.CSS_SELECTOR, "button.kyuRq") - if more_btn.is_displayed(): - more_btn.click() - # Re-read text after expanding - for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', '.wiI7pd']: - try: - text_el = card.find_element(By.CSS_SELECTOR, sel) - expanded = text_el.text.strip() - if expanded and len(expanded) > len(text): - text = expanded - break - except: - pass - except: - pass + # Note: "More" button clicking removed for speed + # Full text can be expanded later if needed # Timestamp timestamp = "" @@ -228,7 +213,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break except: pass - time.sleep(2) + # Reload original URL after consent (redirect can corrupt URL) + print(" Reloading after consent...") + driver.get(url) + time.sleep(3) # Click reviews tab if present (multilingual support) review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] @@ -411,17 +399,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in reviews[key] = rev # Collect review IDs via JavaScript (doesn't affect scroll position!) + # Use specific selector to only get actual review cards, not buttons try: review_ids = driver.execute_script(""" var ids = []; - document.querySelectorAll('[data-review-id]').forEach(function(el) { + document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) { ids.push(el.getAttribute('data-review-id')); }); return ids; """) for rid in (review_ids or []): if rid and rid not in reviews: - reviews[rid] = {"id": rid, "source": "dom"} + reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True} except: pass @@ -456,11 +445,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in stop_scrolling.set() break + # FINAL PHASE: Parse full review data from DOM (scroll is stopped) + print("📝 Parsing full review data...") + api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"} + reviews.clear() + + # Parse all DOM cards now that scrolling is done + # Use specific selector to only get actual review cards (div.jftiEf), not buttons + try: + cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]") + for card in cards: + review = parse_dom_review(card) + if review and review.get("id"): + reviews[review["id"]] = review + except Exception as e: + print(f" Warning: DOM parse error: {e}") + + # Merge API reviews (only add if not already in DOM) + api_added = 0 + for key, api_rev in api_reviews_collected.items(): + # Check if this author already exists in DOM reviews + author = api_rev.get("author", "") + if author and not any(r.get("author") == author for r in reviews.values()): + reviews[f"api_{key}"] = api_rev + api_added += 1 + # Final results review_list = list(reviews.values()) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api") - print(f"\n📋 Total: {len(review_list)} reviews (DOM: {dom_count}, API: {api_count})") + print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") return { "reviews": review_list,