7x faster scraping with JS parsing + batch flushing

Performance improvements:
- JS-based DOM parsing (single browser call vs Selenium round-trips)
- Batch flushing through an optional flush_callback every flush_batch_size reviews (default 500), e.g. to write to disk and free memory
- Hide parsed elements (display:none) and clear their innerHTML to reduce DOM overhead
- Cycle timing instrumentation for debugging slowdowns

Results: 2826 reviews in 6.7min (7.1/sec) vs 2190 in 37min (1.0/sec)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 10:01:22 +00:00
parent 0778b2e07d
commit d989178119

View File

@@ -183,7 +183,8 @@ def parse_dom_review(card) -> dict:
return None
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500) -> dict:
"""
Scrape Google Maps reviews.
@@ -192,6 +193,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
url: Google Maps place URL
max_reviews: Maximum reviews to collect
timeout_no_new: Seconds to wait with no new reviews before stopping
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
This allows streaming data to disk and freeing memory
flush_batch_size: Number of reviews to collect before flushing (default 500)
Returns:
dict with reviews list and metadata
@@ -199,6 +203,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Storage - use review ID as key
reviews = {} # review_id -> review
seen_ids = set() # Track all IDs we've seen (persists after flush)
total_flushed = [0] # Use list for closure mutation
# Don't force language - let Google show all reviews in user's locale
@@ -506,31 +512,105 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
cycle_start = time.time()
while True:
check_num += 1
time.sleep(1.0) # Check every second
# TIMING: Track cycle performance
t0 = time.time()
cycle_delta = t0 - cycle_start
cycle_start = t0
# Collect from API (doesn't affect scroll)
t1 = time.time()
for rev in get_api_reviews():
if not any(r.get("author") == rev["author"] for r in reviews.values()):
key = f"api_{rev['author'][:20]}_{rev['rating']}"
key = f"api_{rev['author'][:20]}_{rev['rating']}"
if key not in seen_ids:
reviews[key] = rev
seen_ids.add(key)
api_time = time.time() - t1
# Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
# We must parse NOW, not later
# Parse reviews in real-time using JavaScript (FAST - single browser call)
# This replaces slow Python loop with Selenium round-trips
t2 = time.time()
dom_cards = 0
try:
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
for card in cards:
rid = card.get_attribute("data-review-id")
if rid and rid not in reviews:
# Parse immediately - element may be gone later!
review = parse_dom_review(card)
if review:
reviews[rid] = review
except:
pass
# Pass seen_ids to JS so it can skip already-processed reviews
seen_list = list(seen_ids)
parsed_reviews = driver.execute_script("""
var seenSet = new Set(arguments[0]);
var results = [];
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
current_count = len(reviews)
for (var i = 0; i < cards.length; i++) {
var card = cards[i];
var rid = card.getAttribute('data-review-id');
if (!rid || seenSet.has(rid)) continue;
// Parse review data
var author = '', text = '', rating = 0, timestamp = '';
// Author name
var authorEl = card.querySelector('.d4r55');
if (authorEl) author = authorEl.textContent.trim();
// Rating from aria-label (e.g., "5 stars")
var ratingEl = card.querySelector('[aria-label*="star"]');
if (ratingEl) {
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
if (match) rating = parseInt(match[1]);
}
// Review text (check for expanded version first)
var textEl = card.querySelector('.wiI7pd');
if (textEl) text = textEl.textContent.trim();
// Timestamp
var timeEl = card.querySelector('.rsqaWe');
if (timeEl) timestamp = timeEl.textContent.trim();
if (author && rating >= 1 && rating <= 5) {
results.push({
id: rid,
author: author,
text: text,
rating: rating,
timestamp: timestamp,
source: 'dom'
});
// Hide the card and empty its contents (faster than remove, less disruptive)
card.style.display = 'none';
card.innerHTML = '';
}
}
return {reviews: results, cardCount: cards.length};
""", seen_list)
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
for rev in new_reviews:
rid = rev.pop('id')
reviews[rid] = rev
seen_ids.add(rid)
except Exception as e:
print(f" ❌ DOM parse error: {e}")
dom_time = time.time() - t2
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
t3 = time.time()
if flush_callback and len(reviews) >= flush_batch_size:
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
flush_callback(list(reviews.values()))
total_flushed[0] += len(reviews)
reviews.clear() # Free memory, but keep seen_ids
flush_time = time.time() - t3
current_count = total_flushed[0] + len(reviews)
# TIMING: Print if cycle is slow (>2s)
if cycle_delta > 2.0:
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
# Check for new reviews
if current_count > last_count:
@@ -615,31 +695,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
stop_scrolling.set()
break
# Flush any remaining reviews
if flush_callback and reviews:
print(f" 💾 Final flush: {len(reviews)} reviews...")
flush_callback(list(reviews.values()))
total_flushed[0] += len(reviews)
reviews.clear()
# Reviews already parsed during scrolling (real-time parsing)
print("📝 Finalizing review data...")
# Separate API and DOM reviews
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
# Merge API reviews (only add if not already in DOM)
api_added = 0
for key, api_rev in api_reviews_collected.items():
# Check if this author already exists in DOM reviews
author = api_rev.get("author", "")
if author and not any(r.get("author") == author for r in reviews.values()):
reviews[f"api_{key}"] = api_rev
api_added += 1
# Final results
review_list = list(reviews.values())
grand_total = total_flushed[0] + len(review_list)
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
api_count = sum(1 for r in review_list if r.get("source") == "api")
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
if total_flushed[0] > 0:
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
else:
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
return {
"reviews": review_list,
"total": len(review_list),
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url
}