fix: Prevent Chrome tab crash by removing processed DOM cards

Root cause: Processed cards were hidden but never removed from the DOM,
causing memory buildup (400+ nodes) that crashed Chrome tabs.

Changes:
- Actually remove processed cards from the DOM on the next cycle (not just hide them)
- Keep last 50 cards for scroll reference/continuity
- Remove adjacent separator elements along with cards
- Add logging when DOM cleanup removes cards
- Cards near the scroll end (within the last 50) are not hidden or cleared, so they stay visible for reference

This should prevent "tab crashed" errors during long scraping
sessions with 500+ reviews.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 17:17:21 +00:00
parent 65eb979c12
commit 6b3f055760

View File

@@ -1330,7 +1330,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes # This survives Google's CSS class name changes
# Also removes separators from previously-hidden cards to keep DOM light # MEMORY FIX: Actually remove processed cards from DOM (not just hide)
# Keep last N cards for scroll continuity
t2 = time.time() t2 = time.time()
dom_cards = 0 dom_cards = 0
try: try:
@@ -1340,18 +1341,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
var results = []; var results = [];
var processedIds = new Set(); var processedIds = new Set();
var sepsRemoved = 0; var sepsRemoved = 0;
var cardsRemoved = 0;
var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference
// ROBUST: Find cards by data attribute only (not class names) // ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]'); var cards = document.querySelectorAll('[data-review-id]');
var cardsArray = Array.from(cards);
var totalCards = cardsArray.length;
for (var i = 0; i < cards.length; i++) { for (var i = 0; i < cardsArray.length; i++) {
var card = cards[i]; var card = cardsArray[i];
var rid = card.getAttribute('data-review-id'); var rid = card.getAttribute('data-review-id');
var isHidden = card.style.display === 'none'; var isHidden = card.style.display === 'none';
var isNearEnd = i >= totalCards - KEEP_LAST_N;
// CLEANUP: Remove separators adjacent to already-hidden cards // AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end
// This keeps DOM light without breaking Google's virtual scroll // This prevents memory buildup that causes tab crashes
if (isHidden) { if (isHidden && !isNearEnd) {
// Remove separators first
var sibling = card.nextElementSibling; var sibling = card.nextElementSibling;
while (sibling) { while (sibling) {
var nextSib = sibling.nextElementSibling; var nextSib = sibling.nextElementSibling;
@@ -1364,9 +1371,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break; break;
} }
} }
// Remove the card itself from DOM
card.remove();
cardsRemoved++;
continue; continue;
} }
// Skip already hidden cards near end (keep for scroll reference)
if (isHidden) continue;
// Skip if no ID or already processed this cycle // Skip if no ID or already processed this cycle
if (!rid || processedIds.has(rid)) continue; if (!rid || processedIds.has(rid)) continue;
@@ -1439,15 +1452,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}); });
} }
// ALWAYS hide processed cards to keep DOM light // Mark card as processed (hide + clear) - will be removed on next cycle
// (even if extraction failed - we've seen this card) // Keep near-end cards visible for scroll reference
card.style.display = 'none'; if (!isNearEnd) {
card.innerHTML = ''; card.style.display = 'none';
card.innerHTML = '';
}
} }
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved}; return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved};
""", seen_list) """, seen_list)
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0
if cards_removed > 0:
log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed})
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
for rev in new_reviews: for rev in new_reviews:
rid = rev.pop('id') rid = rev.pop('id')