fix: Prevent Chrome tab crash by removing processed DOM cards
Root cause: Cards were hidden but not removed from the DOM, causing memory buildup (400+ nodes) that crashed Chrome tabs.

Changes:
- Actually remove processed cards from the DOM (not just hide them)
- Keep the last 50 cards for scroll reference/continuity
- Remove adjacent separator elements along with cards
- Add logging when DOM cleanup removes cards
- Cards near the scroll end stay visible for reference

This should prevent "tab crashed" errors during long scraping sessions with 500+ reviews.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1330,7 +1330,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||||
# This survives Google's CSS class name changes
|
# This survives Google's CSS class name changes
|
||||||
# Also removes separators from previously-hidden cards to keep DOM light
|
# MEMORY FIX: Actually remove processed cards from DOM (not just hide)
|
||||||
|
# Keep last N cards for scroll continuity
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
dom_cards = 0
|
dom_cards = 0
|
||||||
try:
|
try:
|
||||||
@@ -1340,18 +1341,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
var results = [];
|
var results = [];
|
||||||
var processedIds = new Set();
|
var processedIds = new Set();
|
||||||
var sepsRemoved = 0;
|
var sepsRemoved = 0;
|
||||||
|
var cardsRemoved = 0;
|
||||||
|
var KEEP_LAST_N = 50; // Keep last 50 cards for scroll reference
|
||||||
|
|
||||||
// ROBUST: Find cards by data attribute only (not class names)
|
// ROBUST: Find cards by data attribute only (not class names)
|
||||||
var cards = document.querySelectorAll('[data-review-id]');
|
var cards = document.querySelectorAll('[data-review-id]');
|
||||||
|
var cardsArray = Array.from(cards);
|
||||||
|
var totalCards = cardsArray.length;
|
||||||
|
|
||||||
for (var i = 0; i < cards.length; i++) {
|
for (var i = 0; i < cardsArray.length; i++) {
|
||||||
var card = cards[i];
|
var card = cardsArray[i];
|
||||||
var rid = card.getAttribute('data-review-id');
|
var rid = card.getAttribute('data-review-id');
|
||||||
var isHidden = card.style.display === 'none';
|
var isHidden = card.style.display === 'none';
|
||||||
|
var isNearEnd = i >= totalCards - KEEP_LAST_N;
|
||||||
|
|
||||||
// CLEANUP: Remove separators adjacent to already-hidden cards
|
// AGGRESSIVE CLEANUP: Remove hidden cards that are NOT near the scroll end
|
||||||
// This keeps DOM light without breaking Google's virtual scroll
|
// This prevents memory buildup that causes tab crashes
|
||||||
if (isHidden) {
|
if (isHidden && !isNearEnd) {
|
||||||
|
// Remove separators first
|
||||||
var sibling = card.nextElementSibling;
|
var sibling = card.nextElementSibling;
|
||||||
while (sibling) {
|
while (sibling) {
|
||||||
var nextSib = sibling.nextElementSibling;
|
var nextSib = sibling.nextElementSibling;
|
||||||
@@ -1364,9 +1371,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Remove the card itself from DOM
|
||||||
|
card.remove();
|
||||||
|
cardsRemoved++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Skip already hidden cards near end (keep for scroll reference)
|
||||||
|
if (isHidden) continue;
|
||||||
|
|
||||||
// Skip if no ID or already processed this cycle
|
// Skip if no ID or already processed this cycle
|
||||||
if (!rid || processedIds.has(rid)) continue;
|
if (!rid || processedIds.has(rid)) continue;
|
||||||
|
|
||||||
@@ -1439,15 +1452,20 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// ALWAYS hide processed cards to keep DOM light
|
// Mark card as processed (hide + clear) - will be removed on next cycle
|
||||||
// (even if extraction failed - we've seen this card)
|
// Keep near-end cards visible for scroll reference
|
||||||
card.style.display = 'none';
|
if (!isNearEnd) {
|
||||||
card.innerHTML = '';
|
card.style.display = 'none';
|
||||||
|
card.innerHTML = '';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
|
return {reviews: results, cardCount: totalCards, cardsRemoved: cardsRemoved, sepsRemoved: sepsRemoved};
|
||||||
""", seen_list)
|
""", seen_list)
|
||||||
|
|
||||||
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
||||||
|
cards_removed = parsed_reviews.get('cardsRemoved', 0) if parsed_reviews else 0
|
||||||
|
if cards_removed > 0:
|
||||||
|
log.info('system', f"DOM cleanup: removed {cards_removed} cards to prevent memory buildup", metrics={'cards_removed': cards_removed, 'cards_remaining': dom_cards - cards_removed})
|
||||||
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
||||||
for rev in new_reviews:
|
for rev in new_reviews:
|
||||||
rid = rev.pop('id')
|
rid = rev.pop('id')
|
||||||
|
|||||||
Reference in New Issue
Block a user