Add delayed separator removal to keep DOM light
- Remove separators (AyRUI, TFQHme) adjacent to already-hidden cards
- Separators are removed on the next cycle, not immediately (preserves scroll)
- DOM growth reduced by ~50% during long scrapes
- Tested: 2000 reviews in 103s (19.3/s) with all features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -533,6 +533,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||
# This survives Google's CSS class name changes
|
||||
# Also removes separators from previously-hidden cards to keep DOM light
|
||||
t2 = time.time()
|
||||
dom_cards = 0
|
||||
try:
|
||||
@@ -541,6 +542,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
var seenSet = new Set(arguments[0]);
|
||||
var results = [];
|
||||
var processedIds = new Set();
|
||||
var sepsRemoved = 0;
|
||||
|
||||
// ROBUST: Find cards by data attribute only (not class names)
|
||||
var cards = document.querySelectorAll('[data-review-id]');
|
||||
@@ -548,6 +550,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
for (var i = 0; i < cards.length; i++) {
|
||||
var card = cards[i];
|
||||
var rid = card.getAttribute('data-review-id');
|
||||
var isHidden = card.style.display === 'none';
|
||||
|
||||
// CLEANUP: Remove separators adjacent to already-hidden cards
|
||||
// This keeps DOM light without breaking Google's virtual scroll
|
||||
if (isHidden) {
|
||||
var sibling = card.nextElementSibling;
|
||||
while (sibling) {
|
||||
var nextSib = sibling.nextElementSibling;
|
||||
var classes = sibling.className || '';
|
||||
if (classes.includes('AyRUI') || classes.includes('TFQHme')) {
|
||||
sibling.remove();
|
||||
sepsRemoved++;
|
||||
sibling = nextSib;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip duplicates and already-seen
|
||||
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
|
||||
@@ -608,12 +629,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
timestamp: timestamp,
|
||||
source: 'dom'
|
||||
});
|
||||
// Hide processed card to keep DOM light
|
||||
// Hide processed card (separators removed on next cycle)
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
}
|
||||
}
|
||||
return {reviews: results, cardCount: cards.length};
|
||||
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
|
||||
""", seen_list)
|
||||
|
||||
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
||||
|
||||
Reference in New Issue
Block a user