From f1f1aa078582d1023d89469ceca7ce18f9e12b9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Thu, 22 Jan 2026 13:17:11 +0000 Subject: [PATCH] Sort output by DOM visual order + fix browser issue - Track DOM order for all reviews (review_order dict) - Sort output by DOM position (preserves "Newest" sort order) - API content + DOM order = best of both - Remove click in recovery method 4 to avoid opening profile pages Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 49 ++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 65ef441..fd17943 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -256,6 +256,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in reviews = {} # review_id -> review seen_ids = set() # Track all IDs we've seen (persists after flush) total_flushed = [0] # Use list for closure mutation + review_order = {} # review_id -> position (DOM visual order for sorting) + order_counter = [0] # Current order position # Don't force language - let Google show all reviews in user's locale @@ -544,15 +546,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in if (p) p.scrollTop = p.scrollHeight; """) else: - # Method 4: Click last review card to focus, then scroll + # Method 4: Scroll last card into view, then scroll pane (no click to avoid opening profile) driver.execute_script(""" - var cards = document.querySelectorAll('div.jftiEf[data-review-id]'); + var cards = document.querySelectorAll('[data-review-id]'); if (cards.length > 0) { - cards[cards.length - 1].scrollIntoView({block: 'end'}); - cards[cards.length - 1].click(); + cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'}); } """) - time.sleep(0.2) + time.sleep(0.3) driver.execute_script(""" var p = window.scrollablePane; if (p) p.scrollTop = p.scrollHeight; @@ -639,13 +640,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in continue; } - // Skip duplicates and already-seen - if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue; + // Skip if no ID or already processed this cycle + if (!rid || processedIds.has(rid)) continue; // Only process top-level review cards (have aria-label with author name) if (!card.getAttribute('aria-label')) continue; processedIds.add(rid); + // Already seen from API - just track order, skip content + if (seenSet.has(rid)) { + results.push({id: rid, orderOnly: true}); + continue; + } + var author = '', text = '', rating = 0, timestamp = ''; // AUTHOR: Extract from "Photo of {Name}" button aria-label @@ -692,6 +699,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in if (author && rating >= 1 && rating <= 5) { results.push({ id: rid, + orderOnly: false, author: author, text: text, rating: rating, @@ -710,19 +718,28 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] for rev in new_reviews: rid = rev.pop('id') - reviews[rid] = rev - seen_ids.add(rid) + order_only = rev.pop('orderOnly', False) + # Track DOM order for ALL reviews (for sorting output) + if rid not in review_order: + review_order[rid] = order_counter[0] + order_counter[0] += 1 + # Only add content for new reviews (not already from API) + if not order_only: + reviews[rid] = rev + seen_ids.add(rid) except Exception as e: print(f" ❌ DOM parse error: {e}") dom_time = time.time() - t2 # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory + # Sort by DOM order before flushing t3 = time.time() if flush_callback and len(reviews) >= flush_batch_size: print(f" 💾 Flushing {len(reviews)} reviews to disk...") - flush_callback(list(reviews.values())) + sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) - reviews.clear() # Free memory, but keep seen_ids + reviews.clear() # Free memory, but keep seen_ids and review_order flush_time = time.time() - t3 current_count = total_flushed[0] + len(reviews) @@ -814,18 +831,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in stop_scrolling.set() break - # Flush any remaining reviews + # Flush any remaining reviews (sorted by DOM order) if flush_callback and reviews: print(f" 💾 Final flush: {len(reviews)} reviews...") - flush_callback(list(reviews.values())) + sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + flush_callback([r for _, r in sorted_reviews]) total_flushed[0] += len(reviews) reviews.clear() # Reviews already parsed during scrolling (real-time parsing) print("📝 Finalizing review data...") - # Final results - review_list = list(reviews.values()) + # Final results (sorted by DOM order) + sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) + review_list = [r for _, r in sorted_items] grand_total = total_flushed[0] + len(review_list) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api")