diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 49945f0..69fcdd2 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -125,23 +125,8 @@ def parse_dom_review(card) -> dict: except: pass - # Click "More" button to expand text if truncated - try: - more_btn = card.find_element(By.CSS_SELECTOR, "button.kyuRq") - if more_btn.is_displayed(): - more_btn.click() - # Re-read text after expanding - for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', '.wiI7pd']: - try: - text_el = card.find_element(By.CSS_SELECTOR, sel) - expanded = text_el.text.strip() - if expanded and len(expanded) > len(text): - text = expanded - break - except: - pass - except: - pass + # Note: "More" button clicking removed for speed + # Full text can be expanded later if needed # Timestamp timestamp = "" @@ -228,7 +213,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break except: pass - time.sleep(2) + # Reload original URL after consent (redirect can corrupt URL) + print(" Reloading after consent...") + driver.get(url) + time.sleep(3) # Click reviews tab if present (multilingual support) review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"] @@ -411,17 +399,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in reviews[key] = rev # Collect review IDs via JavaScript (doesn't affect scroll position!) + # Use specific selector to only get actual review cards, not buttons try: review_ids = driver.execute_script(""" var ids = []; - document.querySelectorAll('[data-review-id]').forEach(function(el) { + document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) { ids.push(el.getAttribute('data-review-id')); }); return ids; """) for rid in (review_ids or []): if rid and rid not in reviews: - reviews[rid] = {"id": rid, "source": "dom"} + reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True} except: pass @@ -456,11 +445,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in stop_scrolling.set() break + # FINAL PHASE: Parse full review data from DOM (scroll is stopped) + print("📝 Parsing full review data...") + api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"} + reviews.clear() + + # Parse all DOM cards now that scrolling is done + # Use specific selector to only get actual review cards (div.jftiEf), not buttons + try: + cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]") + for card in cards: + review = parse_dom_review(card) + if review and review.get("id"): + reviews[review["id"]] = review + except Exception as e: + print(f" Warning: DOM parse error: {e}") + + # Merge API reviews (only add if not already in DOM) + api_added = 0 + for key, api_rev in api_reviews_collected.items(): + # Check if this author already exists in DOM reviews + author = api_rev.get("author", "") + if author and not any(r.get("author") == author for r in reviews.values()): + reviews[f"api_{key}"] = api_rev + api_added += 1 + # Final results review_list = list(reviews.values()) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api") - print(f"\n📋 Total: {len(review_list)} reviews (DOM: {dom_count}, API: {api_count})") + print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") return { "reviews": review_list,