From 6b3f055760e189d7714e4172aeafe1078a30817a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Sat, 24 Jan 2026 17:17:21 +0000 Subject: [PATCH] fix: Prevent Chrome tab crash by removing processed DOM cards Root cause: Cards were hidden but not removed from DOM, causing memory buildup (400+ nodes) that crashed Chrome tabs. Changes: - Actually remove processed cards from DOM (not just hide them) - Keep last 50 cards for scroll reference/continuity - Remove adjacent separator elements along with cards - Add logging when DOM cleanup removes cards - Cards near scroll end stay visible for reference This should prevent "tab crashed" errors during long scraping sessions with 500+ reviews. Co-Authored-By: Claude Opus 4.5 --- scrapers/google_reviews/v1_0_0.py | 40 ++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/scrapers/google_reviews/v1_0_0.py b/scrapers/google_reviews/v1_0_0.py index 7595f70..791bd80 100644 --- a/scrapers/google_reviews/v1_0_0.py +++ b/scrapers/google_reviews/v1_0_0.py @@ -1330,7 +1330,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) # This survives Google's CSS class name changes - # Also removes separators from previously-hidden cards to keep DOM light + # MEMORY FIX: Actually remove processed cards from DOM (not just hide) + # Keep last N cards for scroll continuity t2 = time.time() dom_cards = 0 try: @@ -1340,18 +1341,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in var results = []; var processedIds = new Set(); var sepsRemoved = 0; + var cardsRemoved = 0; + var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference // ROBUST: Find cards by data attribute only (not class names) var cards = document.querySelectorAll('[data-review-id]'); + var cardsArray = Array.from(cards); + var totalCards = cardsArray.length; - for (var i = 0; i < cards.length; i++) { - var card = cards[i]; + for (var i = 0; i < cardsArray.length; i++) { + var card = cardsArray[i]; var rid = card.getAttribute('data-review-id'); var isHidden = card.style.display === 'none'; + var isNearEnd = i >= totalCards - KEEP_LAST_N; - // CLEANUP: Remove separators adjacent to already-hidden cards - // This keeps DOM light without breaking Google's virtual scroll - if (isHidden) { + // AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end + // This prevents memory buildup that causes tab crashes + if (isHidden && !isNearEnd) { + // Remove separators first var sibling = card.nextElementSibling; while (sibling) { var nextSib = sibling.nextElementSibling; @@ -1364,9 +1371,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in break; } } + // Remove the card itself from DOM + card.remove(); + cardsRemoved++; continue; } + // Skip already hidden cards near end (keep for scroll reference) + if (isHidden) continue; + // Skip if no ID or already processed this cycle if (!rid || processedIds.has(rid)) continue; @@ -1439,15 +1452,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in }); } - // ALWAYS hide processed cards to keep DOM light - // (even if extraction failed - we've seen this card) - card.style.display = 'none'; - card.innerHTML = ''; + // Mark card as processed (hide + clear) - will be removed on next cycle + // Keep near-end cards visible for scroll reference + if (!isNearEnd) { + card.style.display = 'none'; + card.innerHTML = ''; + } } - return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved}; + return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved}; """, seen_list) dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 + cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0 + if cards_removed > 0: + log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed}) new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] for rev in new_reviews: rid = rev.pop('id')