7x faster scraping with JS parsing + batch flushing
Performance improvements: - JS-based DOM parsing (single browser call vs Selenium round-trips) - Batch flushing to disk every 500 reviews to free memory - Hide parsed elements (display:none) to reduce DOM overhead - Cycle timing instrumentation for debugging slowdowns Results: 2826 reviews in 6.7min (7.1/sec) vs 2190 in 37min (1.0/sec) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -183,7 +183,8 @@ def parse_dom_review(card) -> dict:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15) -> dict:
|
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||||
|
flush_callback=None, flush_batch_size: int = 500) -> dict:
|
||||||
"""
|
"""
|
||||||
Scrape Google Maps reviews.
|
Scrape Google Maps reviews.
|
||||||
|
|
||||||
@@ -192,6 +193,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
url: Google Maps place URL
|
url: Google Maps place URL
|
||||||
max_reviews: Maximum reviews to collect
|
max_reviews: Maximum reviews to collect
|
||||||
timeout_no_new: Seconds to wait with no new reviews before stopping
|
timeout_no_new: Seconds to wait with no new reviews before stopping
|
||||||
|
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
||||||
|
This allows streaming data to disk and freeing memory
|
||||||
|
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict with reviews list and metadata
|
dict with reviews list and metadata
|
||||||
@@ -199,6 +203,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# Storage - use review ID as key
|
# Storage - use review ID as key
|
||||||
reviews = {} # review_id -> review
|
reviews = {} # review_id -> review
|
||||||
|
seen_ids = set() # Track all IDs we've seen (persists after flush)
|
||||||
|
total_flushed = [0] # Use list for closure mutation
|
||||||
|
|
||||||
# Don't force language - let Google show all reviews in user's locale
|
# Don't force language - let Google show all reviews in user's locale
|
||||||
|
|
||||||
@@ -506,31 +512,105 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
|
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
|
||||||
|
|
||||||
|
cycle_start = time.time()
|
||||||
while True:
|
while True:
|
||||||
check_num += 1
|
check_num += 1
|
||||||
time.sleep(1.0) # Check every second
|
time.sleep(1.0) # Check every second
|
||||||
|
|
||||||
|
# TIMING: Track cycle performance
|
||||||
|
t0 = time.time()
|
||||||
|
cycle_delta = t0 - cycle_start
|
||||||
|
cycle_start = t0
|
||||||
|
|
||||||
# Collect from API (doesn't affect scroll)
|
# Collect from API (doesn't affect scroll)
|
||||||
|
t1 = time.time()
|
||||||
for rev in get_api_reviews():
|
for rev in get_api_reviews():
|
||||||
if not any(r.get("author") == rev["author"] for r in reviews.values()):
|
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
||||||
key = f"api_{rev['author'][:20]}_{rev['rating']}"
|
if key not in seen_ids:
|
||||||
reviews[key] = rev
|
reviews[key] = rev
|
||||||
|
seen_ids.add(key)
|
||||||
|
api_time = time.time() - t1
|
||||||
|
|
||||||
# Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!)
|
# Parse reviews in real-time using JavaScript (FAST - single browser call)
|
||||||
# We must parse NOW, not later
|
# This replaces slow Python loop with Selenium round-trips
|
||||||
|
t2 = time.time()
|
||||||
|
dom_cards = 0
|
||||||
try:
|
try:
|
||||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
# Pass seen_ids to JS so it can skip already-processed reviews
|
||||||
for card in cards:
|
seen_list = list(seen_ids)
|
||||||
rid = card.get_attribute("data-review-id")
|
parsed_reviews = driver.execute_script("""
|
||||||
if rid and rid not in reviews:
|
var seenSet = new Set(arguments[0]);
|
||||||
# Parse immediately - element may be gone later!
|
var results = [];
|
||||||
review = parse_dom_review(card)
|
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
||||||
if review:
|
|
||||||
reviews[rid] = review
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
current_count = len(reviews)
|
for (var i = 0; i < cards.length; i++) {
|
||||||
|
var card = cards[i];
|
||||||
|
var rid = card.getAttribute('data-review-id');
|
||||||
|
if (!rid || seenSet.has(rid)) continue;
|
||||||
|
|
||||||
|
// Parse review data
|
||||||
|
var author = '', text = '', rating = 0, timestamp = '';
|
||||||
|
|
||||||
|
// Author name
|
||||||
|
var authorEl = card.querySelector('.d4r55');
|
||||||
|
if (authorEl) author = authorEl.textContent.trim();
|
||||||
|
|
||||||
|
// Rating from aria-label (e.g., "5 stars")
|
||||||
|
var ratingEl = card.querySelector('[aria-label*="star"]');
|
||||||
|
if (ratingEl) {
|
||||||
|
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
||||||
|
if (match) rating = parseInt(match[1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Review text (check for expanded version first)
|
||||||
|
var textEl = card.querySelector('.wiI7pd');
|
||||||
|
if (textEl) text = textEl.textContent.trim();
|
||||||
|
|
||||||
|
// Timestamp
|
||||||
|
var timeEl = card.querySelector('.rsqaWe');
|
||||||
|
if (timeEl) timestamp = timeEl.textContent.trim();
|
||||||
|
|
||||||
|
if (author && rating >= 1 && rating <= 5) {
|
||||||
|
results.push({
|
||||||
|
id: rid,
|
||||||
|
author: author,
|
||||||
|
text: text,
|
||||||
|
rating: rating,
|
||||||
|
timestamp: timestamp,
|
||||||
|
source: 'dom'
|
||||||
|
});
|
||||||
|
// Just hide the card (faster than remove, less disruptive)
|
||||||
|
card.style.display = 'none';
|
||||||
|
card.innerHTML = '';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return {reviews: results, cardCount: cards.length};
|
||||||
|
""", seen_list)
|
||||||
|
|
||||||
|
dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0
|
||||||
|
new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else []
|
||||||
|
for rev in new_reviews:
|
||||||
|
rid = rev.pop('id')
|
||||||
|
reviews[rid] = rev
|
||||||
|
seen_ids.add(rid)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ DOM parse error: {e}")
|
||||||
|
dom_time = time.time() - t2
|
||||||
|
|
||||||
|
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
||||||
|
t3 = time.time()
|
||||||
|
if flush_callback and len(reviews) >= flush_batch_size:
|
||||||
|
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||||
|
flush_callback(list(reviews.values()))
|
||||||
|
total_flushed[0] += len(reviews)
|
||||||
|
reviews.clear() # Free memory, but keep seen_ids
|
||||||
|
flush_time = time.time() - t3
|
||||||
|
|
||||||
|
current_count = total_flushed[0] + len(reviews)
|
||||||
|
|
||||||
|
# TIMING: Print if cycle is slow (>2s)
|
||||||
|
if cycle_delta > 2.0:
|
||||||
|
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
||||||
|
|
||||||
# Check for new reviews
|
# Check for new reviews
|
||||||
if current_count > last_count:
|
if current_count > last_count:
|
||||||
@@ -615,31 +695,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Flush any remaining reviews
|
||||||
|
if flush_callback and reviews:
|
||||||
|
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||||
|
flush_callback(list(reviews.values()))
|
||||||
|
total_flushed[0] += len(reviews)
|
||||||
|
reviews.clear()
|
||||||
|
|
||||||
# Reviews already parsed during scrolling (real-time parsing)
|
# Reviews already parsed during scrolling (real-time parsing)
|
||||||
print("📝 Finalizing review data...")
|
print("📝 Finalizing review data...")
|
||||||
|
|
||||||
# Separate API and DOM reviews
|
|
||||||
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
|
||||||
dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"}
|
|
||||||
|
|
||||||
# Merge API reviews (only add if not already in DOM)
|
|
||||||
api_added = 0
|
|
||||||
for key, api_rev in api_reviews_collected.items():
|
|
||||||
# Check if this author already exists in DOM reviews
|
|
||||||
author = api_rev.get("author", "")
|
|
||||||
if author and not any(r.get("author") == author for r in reviews.values()):
|
|
||||||
reviews[f"api_{key}"] = api_rev
|
|
||||||
api_added += 1
|
|
||||||
|
|
||||||
# Final results
|
# Final results
|
||||||
review_list = list(reviews.values())
|
review_list = list(reviews.values())
|
||||||
|
grand_total = total_flushed[0] + len(review_list)
|
||||||
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
||||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
|
||||||
|
if total_flushed[0] > 0:
|
||||||
|
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
||||||
|
else:
|
||||||
|
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reviews": review_list,
|
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||||
"total": len(review_list),
|
"total": grand_total,
|
||||||
|
"total_flushed": total_flushed[0],
|
||||||
"checks": check_num,
|
"checks": check_num,
|
||||||
"url": url
|
"url": url
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user