7x faster scraping with JS parsing + batch flushing
Performance improvements: - JS-based DOM parsing (single browser call vs Selenium round-trips) - Batch flushing to disk every 500 reviews to free memory - Hide parsed elements (display:none) to reduce DOM overhead - Cycle timing instrumentation for debugging slowdowns Results: 2826 reviews in 6.7min (7.1/sec) vs 2190 in 37min (1.0/sec) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -183,7 +183,8 @@ def parse_dom_review(card) -> dict:
|
||||
return None
|
||||
|
||||
|
||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15) -> dict:
|
||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||
flush_callback=None, flush_batch_size: int = 500) -> dict:
|
||||
"""
|
||||
Scrape Google Maps reviews.
|
||||
|
||||
@@ -192,6 +193,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
url: Google Maps place URL
|
||||
max_reviews: Maximum reviews to collect
|
||||
timeout_no_new: Seconds to wait with no new reviews before stopping
|
||||
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
||||
This allows streaming data to disk and freeing memory
|
||||
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
||||
|
||||
Returns:
|
||||
dict with reviews list and metadata
|
||||
@@ -199,6 +203,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Storage - use review ID as key
|
||||
reviews = {} # review_id -> review
|
||||
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
||||
total_flushed = [0] # Use list for closure mutation
|
||||
|
||||
# Don't force language - let Google show all reviews in user's locale
|
||||
|
||||
@@ -506,31 +512,105 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
|
||||
|
||||
cycle_start = time.time()
|
||||
while True:
|
||||
check_num += 1
|
||||
time.sleep(1.0) # Check every second
|
||||
|
||||
# TIMING: Track cycle performance
|
||||
t0 = time.time()
|
||||
cycle_delta = t0 - cycle_start
|
||||
cycle_start = t0
|
||||
|
||||
# Collect from API (doesn't affect scroll)
|
||||
t1 = time.time()
|
||||
for rev in get_api_reviews():
|
||||
if not any(r.get("author") == rev["author"] for r in reviews.values()):
|
||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||
if key not in seen_ids:
|
||||
reviews[key] = rev
|
||||
seen_ids.add(key)
|
||||
api_time = time.time() - t1
|
||||
|
||||
# Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
|
||||
# We must parse NOW, not later
|
||||
# Parse reviews in real-time using JavaScript (FAST - single browser call)
|
||||
# This replaces slow Python loop with Selenium round-trips
|
||||
t2 = time.time()
|
||||
dom_cards = 0
|
||||
try:
|
||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||
for card in cards:
|
||||
rid = card.get_attribute("data-review-id")
|
||||
if rid and rid not in reviews:
|
||||
# Parse immediately - element may be gone later!
|
||||
review = parse_dom_review(card)
|
||||
if review:
|
||||
reviews[rid] = review
|
||||
except:
|
||||
pass
|
||||
# Pass seen_ids to JS so it can skip already-processed reviews
|
||||
seen_list = list(seen_ids)
|
||||
parsed_reviews = driver.execute_script("""
|
||||
var seenSet = new Set(arguments[0]);
|
||||
var results = [];
|
||||
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
||||
|
||||
current_count = len(reviews)
|
||||
for (var i = 0; i < cards.length; i++) {
|
||||
var card = cards[i];
|
||||
var rid = card.getAttribute('data-review-id');
|
||||
if (!rid || seenSet.has(rid)) continue;
|
||||
|
||||
// Parse review data
|
||||
var author = '', text = '', rating = 0, timestamp = '';
|
||||
|
||||
// Author name
|
||||
var authorEl = card.querySelector('.d4r55');
|
||||
if (authorEl) author = authorEl.textContent.trim();
|
||||
|
||||
// Rating from aria-label (e.g., "5 stars")
|
||||
var ratingEl = card.querySelector('[aria-label*="star"]');
|
||||
if (ratingEl) {
|
||||
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
||||
if (match) rating = parseInt(match[1]);
|
||||
}
|
||||
|
||||
// Review text (check for expanded version first)
|
||||
var textEl = card.querySelector('.wiI7pd');
|
||||
if (textEl) text = textEl.textContent.trim();
|
||||
|
||||
// Timestamp
|
||||
var timeEl = card.querySelector('.rsqaWe');
|
||||
if (timeEl) timestamp = timeEl.textContent.trim();
|
||||
|
||||
if (author && rating >= 1 && rating <= 5) {
|
||||
results.push({
|
||||
id: rid,
|
||||
author: author,
|
||||
text: text,
|
||||
rating: rating,
|
||||
timestamp: timestamp,
|
||||
source: 'dom'
|
||||
});
|
||||
// Just hide the card (faster than remove, less disruptive)
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
}
|
||||
}
|
||||
return {reviews: results, cardCount: cards.length};
|
||||
""", seen_list)
|
||||
|
||||
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
||||
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
||||
for rev in new_reviews:
|
||||
rid = rev.pop('id')
|
||||
reviews[rid] = rev
|
||||
seen_ids.add(rid)
|
||||
except Exception as e:
|
||||
print(f" ❌ DOM parse error: {e}")
|
||||
dom_time = time.time() - t2
|
||||
|
||||
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
||||
t3 = time.time()
|
||||
if flush_callback and len(reviews) >= flush_batch_size:
|
||||
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||
flush_callback(list(reviews.values()))
|
||||
total_flushed[0] += len(reviews)
|
||||
reviews.clear() # Free memory, but keep seen_ids
|
||||
flush_time = time.time() - t3
|
||||
|
||||
current_count = total_flushed[0] + len(reviews)
|
||||
|
||||
# TIMING: Print if cycle is slow (>2s)
|
||||
if cycle_delta > 2.0:
|
||||
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
||||
|
||||
# Check for new reviews
|
||||
if current_count > last_count:
|
||||
@@ -615,31 +695,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# Flush any remaining reviews
|
||||
if flush_callback and reviews:
|
||||
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||
flush_callback(list(reviews.values()))
|
||||
total_flushed[0] += len(reviews)
|
||||
reviews.clear()
|
||||
|
||||
# Reviews already parsed during scrolling (real-time parsing)
|
||||
print("📝 Finalizing review data...")
|
||||
|
||||
# Separate API and DOM reviews
|
||||
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
||||
dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
|
||||
|
||||
# Merge API reviews (only add if not already in DOM)
|
||||
api_added = 0
|
||||
for key, api_rev in api_reviews_collected.items():
|
||||
# Check if this author already exists in DOM reviews
|
||||
author = api_rev.get("author", "")
|
||||
if author and not any(r.get("author") == author for r in reviews.values()):
|
||||
reviews[f"api_{key}"] = api_rev
|
||||
api_added += 1
|
||||
|
||||
# Final results
|
||||
review_list = list(reviews.values())
|
||||
grand_total = total_flushed[0] + len(review_list)
|
||||
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||
|
||||
if total_flushed[0] > 0:
|
||||
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
||||
else:
|
||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||
|
||||
return {
|
||||
"reviews": review_list,
|
||||
"total": len(review_list),
|
||||
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||
"total": grand_total,
|
||||
"total_flushed": total_flushed[0],
|
||||
"checks": check_num,
|
||||
"url": url
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user