Add delayed separator removal to keep DOM light

- Remove separators (AyRUI, TFQHme) that immediately follow already-hidden cards
- Separators are removed on the next cycle rather than when the card is hidden (preserves scroll)
- DOM growth reduced by ~50% during long scrapes
- Tested: 2000 reviews in 103s (19.3/s) with all features

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 12:18:50 +00:00
parent cbc2e9c617
commit 10b32244d7

View File

@@ -533,6 +533,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes # This survives Google's CSS class name changes
# Also removes separators from previously-hidden cards to keep DOM light
t2 = time.time() t2 = time.time()
dom_cards = 0 dom_cards = 0
try: try:
@@ -541,6 +542,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
var seenSet = new Set(arguments[0]); var seenSet = new Set(arguments[0]);
var results = []; var results = [];
var processedIds = new Set(); var processedIds = new Set();
var sepsRemoved = 0;
// ROBUST: Find cards by data attribute only (not class names) // ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]'); var cards = document.querySelectorAll('[data-review-id]');
@@ -548,6 +550,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
for (var i = 0; i < cards.length; i++) { for (var i = 0; i < cards.length; i++) {
var card = cards[i]; var card = cards[i];
var rid = card.getAttribute('data-review-id'); var rid = card.getAttribute('data-review-id');
var isHidden = card.style.display === 'none';
// CLEANUP: Remove separators adjacent to already-hidden cards
// This keeps DOM light without breaking Google's virtual scroll
if (isHidden) {
var sibling = card.nextElementSibling;
while (sibling) {
var nextSib = sibling.nextElementSibling;
var classes = sibling.className || '';
if (classes.includes('AyRUI') || classes.includes('TFQHme')) {
sibling.remove();
sepsRemoved++;
sibling = nextSib;
} else {
break;
}
}
continue;
}
// Skip duplicates and already-seen // Skip duplicates and already-seen
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue; if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
@@ -608,12 +629,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
timestamp: timestamp, timestamp: timestamp,
source: 'dom' source: 'dom'
}); });
// Hide processed card to keep DOM light // Hide processed card (separators removed on next cycle)
card.style.display = 'none'; card.style.display = 'none';
card.innerHTML = ''; card.innerHTML = '';
} }
} }
return {reviews: results, cardCount: cards.length}; return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
""", seen_list) """, seen_list)
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0