Sort output by DOM visual order + fix browser issue
- Track DOM order for all reviews (review_order dict)
- Sort output by DOM position (preserves "Newest" sort order)
- API content + DOM order = best of both
- Remove click in recovery method 4 to avoid opening profile pages

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -256,6 +256,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
reviews = {} # review_id -> review
|
reviews = {} # review_id -> review
|
||||||
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
||||||
total_flushed = [0] # Use list for closure mutation
|
total_flushed = [0] # Use list for closure mutation
|
||||||
|
review_order = {} # review_id -> position (DOM visual order for sorting)
|
||||||
|
order_counter = [0] # Current order position
|
||||||
|
|
||||||
# Don't force language - let Google show all reviews in user's locale
|
# Don't force language - let Google show all reviews in user's locale
|
||||||
|
|
||||||
@@ -544,15 +546,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
if (p) p.scrollTop = p.scrollHeight;
|
if (p) p.scrollTop = p.scrollHeight;
|
||||||
""")
|
""")
|
||||||
else:
|
else:
|
||||||
# Method 4: Click last review card to focus, then scroll
|
# Method 4: Scroll last card into view, then scroll pane (no click to avoid opening profile)
|
||||||
driver.execute_script("""
|
driver.execute_script("""
|
||||||
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
var cards = document.querySelectorAll('[data-review-id]');
|
||||||
if (cards.length > 0) {
|
if (cards.length > 0) {
|
||||||
cards[cards.length - 1].scrollIntoView({block: 'end'});
|
cards[cards.length - 1].scrollIntoView({block: 'end', behavior: 'smooth'});
|
||||||
cards[cards.length - 1].click();
|
|
||||||
}
|
}
|
||||||
""")
|
""")
|
||||||
time.sleep(0.2)
|
time.sleep(0.3)
|
||||||
driver.execute_script("""
|
driver.execute_script("""
|
||||||
var p = window.scrollablePane;
|
var p = window.scrollablePane;
|
||||||
if (p) p.scrollTop = p.scrollHeight;
|
if (p) p.scrollTop = p.scrollHeight;
|
||||||
@@ -639,13 +640,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip duplicates and already-seen
|
// Skip if no ID or already processed this cycle
|
||||||
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
|
if (!rid || processedIds.has(rid)) continue;
|
||||||
|
|
||||||
// Only process top-level review cards (have aria-label with author name)
|
// Only process top-level review cards (have aria-label with author name)
|
||||||
if (!card.getAttribute('aria-label')) continue;
|
if (!card.getAttribute('aria-label')) continue;
|
||||||
processedIds.add(rid);
|
processedIds.add(rid);
|
||||||
|
|
||||||
|
// Already seen from API - just track order, skip content
|
||||||
|
if (seenSet.has(rid)) {
|
||||||
|
results.push({id: rid, orderOnly: true});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
var author = '', text = '', rating = 0, timestamp = '';
|
var author = '', text = '', rating = 0, timestamp = '';
|
||||||
|
|
||||||
// AUTHOR: Extract from "Photo of {Name}" button aria-label
|
// AUTHOR: Extract from "Photo of {Name}" button aria-label
|
||||||
@@ -692,6 +699,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
if (author && rating >= 1 && rating <= 5) {
|
if (author && rating >= 1 && rating <= 5) {
|
||||||
results.push({
|
results.push({
|
||||||
id: rid,
|
id: rid,
|
||||||
|
orderOnly: false,
|
||||||
author: author,
|
author: author,
|
||||||
text: text,
|
text: text,
|
||||||
rating: rating,
|
rating: rating,
|
||||||
@@ -710,6 +718,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
||||||
for rev in new_reviews:
|
for rev in new_reviews:
|
||||||
rid = rev.pop('id')
|
rid = rev.pop('id')
|
||||||
|
order_only = rev.pop('orderOnly', False)
|
||||||
|
# Track DOM order for ALL reviews (for sorting output)
|
||||||
|
if rid not in review_order:
|
||||||
|
review_order[rid] = order_counter[0]
|
||||||
|
order_counter[0] += 1
|
||||||
|
# Only add content for new reviews (not already from API)
|
||||||
|
if not order_only:
|
||||||
reviews[rid] = rev
|
reviews[rid] = rev
|
||||||
seen_ids.add(rid)
|
seen_ids.add(rid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -717,12 +732,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
dom_time = time.time() - t2
|
dom_time = time.time() - t2
|
||||||
|
|
||||||
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
||||||
|
# Sort by DOM order before flushing
|
||||||
t3 = time.time()
|
t3 = time.time()
|
||||||
if flush_callback and len(reviews) >= flush_batch_size:
|
if flush_callback and len(reviews) >= flush_batch_size:
|
||||||
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||||
flush_callback(list(reviews.values()))
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
|
flush_callback([r for _, r in sorted_reviews])
|
||||||
total_flushed[0] += len(reviews)
|
total_flushed[0] += len(reviews)
|
||||||
reviews.clear() # Free memory, but keep seen_ids
|
reviews.clear() # Free memory, but keep seen_ids and review_order
|
||||||
flush_time = time.time() - t3
|
flush_time = time.time() - t3
|
||||||
|
|
||||||
current_count = total_flushed[0] + len(reviews)
|
current_count = total_flushed[0] + len(reviews)
|
||||||
@@ -814,18 +831,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
# Flush any remaining reviews
|
# Flush any remaining reviews (sorted by DOM order)
|
||||||
if flush_callback and reviews:
|
if flush_callback and reviews:
|
||||||
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||||
flush_callback(list(reviews.values()))
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
|
flush_callback([r for _, r in sorted_reviews])
|
||||||
total_flushed[0] += len(reviews)
|
total_flushed[0] += len(reviews)
|
||||||
reviews.clear()
|
reviews.clear()
|
||||||
|
|
||||||
# Reviews already parsed during scrolling (real-time parsing)
|
# Reviews already parsed during scrolling (real-time parsing)
|
||||||
print("📝 Finalizing review data...")
|
print("📝 Finalizing review data...")
|
||||||
|
|
||||||
# Final results
|
# Final results (sorted by DOM order)
|
||||||
review_list = list(reviews.values())
|
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
|
review_list = [r for _, r in sorted_items]
|
||||||
grand_total = total_flushed[0] + len(review_list)
|
grand_total = total_flushed[0] + len(review_list)
|
||||||
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
||||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||||
|
|||||||
Reference in New Issue
Block a user