fix: Prevent Chrome tab crash by removing processed DOM cards

Root cause: processed cards were hidden but never removed from the DOM,
so memory built up (400+ nodes) until the Chrome tab crashed.

Changes:
- Actually remove processed cards from the DOM (not just hide them); a card is hidden and cleared when processed, then removed on the next cycle
- Keep last 50 cards for scroll reference/continuity
- Remove adjacent separator elements along with cards
- Add logging when DOM cleanup removes cards
- Cards near the scroll end (the last 50) are not hidden, so they stay visible for reference

This should prevent "tab crashed" errors during long scraping
sessions with 500+ reviews.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 17:17:21 +00:00
parent 65eb979c12
commit 6b3f055760

View File

@@ -1330,7 +1330,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes
# Also removes separators from previously-hidden cards to keep DOM light
# MEMORY FIX: Actually remove processed cards from DOM (not just hide)
# Keep last N cards for scroll continuity
t2 = time.time()
dom_cards = 0
try:
@@ -1340,18 +1341,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
var results = [];
var processedIds = new Set();
var sepsRemoved = 0;
var cardsRemoved = 0;
var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference
// ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]');
var cardsArray = Array.from(cards);
var totalCards = cardsArray.length;
for (var i = 0; i < cards.length; i++) {
var card = cards[i];
for (var i = 0; i < cardsArray.length; i++) {
var card = cardsArray[i];
var rid = card.getAttribute('data-review-id');
var isHidden = card.style.display === 'none';
var isNearEnd = i >= totalCards - KEEP_LAST_N;
// CLEANUP: Remove separators adjacent to already-hidden cards
// This keeps DOM light without breaking Google's virtual scroll
if (isHidden) {
// AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end
// This prevents memory buildup that causes tab crashes
if (isHidden && !isNearEnd) {
// Remove separators first
var sibling = card.nextElementSibling;
while (sibling) {
var nextSib = sibling.nextElementSibling;
@@ -1364,9 +1371,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break;
}
}
// Remove the card itself from DOM
card.remove();
cardsRemoved++;
continue;
}
// Skip already hidden cards near end (keep for scroll reference)
if (isHidden) continue;
// Skip if no ID or already processed this cycle
if (!rid || processedIds.has(rid)) continue;
@@ -1439,15 +1452,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
});
}
// ALWAYS hide processed cards to keep DOM light
// (even if extraction failed - we've seen this card)
// Mark card as processed (hide + clear) - will be removed on next cycle
// Keep near-end cards visible for scroll reference
if (!isNearEnd) {
card.style.display = 'none';
card.innerHTML = '';
}
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
}
return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved};
""", seen_list)
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0
if cards_removed > 0:
log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed})
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
for rev in new_reviews:
rid = rev.pop('id')