Fix DOM cleanup: hide cards from API interception too
The `continue` statement was skipping the `card.style.display = 'none'` and `card.innerHTML = ''` cleanup for cards already seen via API interception. This caused the DOM to grow without bound during long scrapes. Now ALL processed cards are hidden regardless of data source.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -8,9 +8,41 @@ import re
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
import threading
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
|
|
||||||
|
class LogCapture:
    """Captures scraper logs for storage and viewing.

    Every entry is a dict with the keys ``timestamp``, ``level``,
    ``source`` and ``message``. Entries are appended to ``self.logs`` in
    the order they are logged, and each message is also printed.
    """

    def __init__(self):
        # Log entries, oldest first.
        self.logs = []

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry with timestamp and echo the message to stdout.

        Args:
            message: Text to record; also printed verbatim.
            level: Severity label stored in the entry (default "INFO").
            source: Label for the component that produced the message.
        """
        # Imported here because the module-level import only brings in
        # ``datetime`` itself.
        from datetime import timezone

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC time and drop tzinfo so isoformat() emits no "+00:00" offset
        # ahead of the explicit "Z" suffix (same format as before).
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
        entry = {
            "timestamp": now_utc.isoformat() + "Z",
            "level": level,
            "source": source,
            "message": message,
        }
        self.logs.append(entry)
        # Also print for console visibility
        print(message, flush=True)

    def info(self, message: str, source: str = "scraper"):
        """Record ``message`` at INFO level."""
        self.log(message, "INFO", source)

    def warning(self, message: str, source: str = "scraper"):
        """Record ``message`` at WARNING level."""
        self.log(message, "WARNING", source)

    def error(self, message: str, source: str = "scraper"):
        """Record ``message`` at ERROR level."""
        self.log(message, "ERROR", source)

    def get_logs(self):
        """Return the list of entries logged so far.

        This is the internal list itself, not a copy, so entries logged
        later show up in a previously returned reference.
        """
        return self.logs
|
||||||
|
|
||||||
|
|
||||||
def parse_api_review(raw: list) -> dict:
|
def parse_api_review(raw: list) -> dict:
|
||||||
"""Parse a review from API response array."""
|
"""Parse a review from API response array."""
|
||||||
try:
|
try:
|
||||||
@@ -235,7 +267,8 @@ def parse_dom_review(card) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||||
flush_callback=None, flush_batch_size: int = 500) -> dict:
|
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
||||||
|
progress_callback=None) -> dict:
|
||||||
"""
|
"""
|
||||||
Scrape Google Maps reviews.
|
Scrape Google Maps reviews.
|
||||||
|
|
||||||
@@ -247,10 +280,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews
|
||||||
This allows streaming data to disk and freeing memory
|
This allows streaming data to disk and freeing memory
|
||||||
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
flush_batch_size: Number of reviews to collect before flushing (default 500)
|
||||||
|
log_capture: Optional LogCapture instance for storing logs
|
||||||
|
progress_callback: Optional callback(current_count, total_count) called every iteration
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict with reviews list and metadata
|
dict with reviews list and metadata
|
||||||
"""
|
"""
|
||||||
|
# Use provided log_capture or create a dummy that just prints
|
||||||
|
log = log_capture or LogCapture()
|
||||||
|
|
||||||
# Storage - use review ID as key
|
# Storage - use review ID as key
|
||||||
reviews = {} # review_id -> review
|
reviews = {} # review_id -> review
|
||||||
@@ -298,23 +335,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# Navigate to URL (only on initial load or refresh)
|
# Navigate to URL (only on initial load or refresh)
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
print(f"🌐 Loading: {url[:80]}...")
|
log.info(f"🌐 Loading: {url[:80]}...")
|
||||||
else:
|
else:
|
||||||
print(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
|
||||||
# Handle consent popup if redirected (poll with tiny sleep)
|
# Handle consent popup if redirected (poll with tiny sleep)
|
||||||
start = time.time()
|
start = time.time()
|
||||||
while time.time() - start < 5: # Max 5s for consent
|
while time.time() - start < 5: # Max 5s for consent
|
||||||
if "consent.google" in driver.current_url:
|
if "consent.google" in driver.current_url:
|
||||||
print(" Handling consent popup...")
|
log.info(" Handling consent popup...")
|
||||||
try:
|
try:
|
||||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||||
txt = btn.text.lower()
|
txt = btn.text.lower()
|
||||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||||
btn.click()
|
btn.click()
|
||||||
# Reload original URL after consent
|
# Reload original URL after consent
|
||||||
print(" Reloading after consent...")
|
log.info(" Reloading after consent...")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
@@ -344,7 +381,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
""")
|
""")
|
||||||
if count:
|
if count:
|
||||||
total_reviews[0] = count
|
total_reviews[0] = count
|
||||||
print(f"📊 Total reviews on page: {count}")
|
log.info(f"📊 Total reviews on page: {count}")
|
||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
@@ -361,7 +398,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
tab_text = tab.text.lower()
|
tab_text = tab.text.lower()
|
||||||
if any(kw in tab_text for kw in review_keywords):
|
if any(kw in tab_text for kw in review_keywords):
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
print(f" Clicking reviews tab: '{tab.text}'")
|
log.info(f" Clicking reviews tab: '{tab.text}'")
|
||||||
tab.click()
|
tab.click()
|
||||||
tab_clicked = True
|
tab_clicked = True
|
||||||
break
|
break
|
||||||
@@ -381,24 +418,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break
|
break
|
||||||
elapsed = int(time.time() - start)
|
elapsed = int(time.time() - start)
|
||||||
if elapsed > last_print:
|
if elapsed > last_print:
|
||||||
print(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)")
|
||||||
last_print = elapsed
|
last_print = elapsed
|
||||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||||
|
|
||||||
if not scroll_container:
|
if not scroll_container:
|
||||||
print(f"❌ Could not find reviews scroll container{refresh_label}")
|
log.error(f"❌ Could not find reviews scroll container{refresh_label}")
|
||||||
try:
|
try:
|
||||||
print("Page title:", driver.title)
|
log.error(f"Page title: {driver.title}")
|
||||||
print("Current URL:", driver.current_url[:100])
|
log.error(f"Current URL: {driver.current_url[:100]}")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
print(f"✅ Found scroll container{refresh_label}")
|
log.info(f"✅ Found scroll container{refresh_label}")
|
||||||
|
|
||||||
# Inject API interceptor (needs to be re-injected after refresh)
|
# Inject API interceptor (needs to be re-injected after refresh)
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
print("🔌 Injecting API interceptor...")
|
log.info("🔌 Injecting API interceptor...")
|
||||||
driver.execute_script("""
|
driver.execute_script("""
|
||||||
// Always re-setup on refresh
|
// Always re-setup on refresh
|
||||||
window.__reviewInterceptorInjected = true;
|
window.__reviewInterceptorInjected = true;
|
||||||
@@ -472,12 +509,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
}
|
}
|
||||||
""")
|
""")
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
print(" 📅 Sorted by newest")
|
log.info(" 📅 Sorted by newest")
|
||||||
# Re-find scroll container after sorting (DOM may be recreated)
|
# Re-find scroll container after sorting (DOM may be recreated)
|
||||||
new_container = find_scroll_container()
|
new_container = find_scroll_container()
|
||||||
if new_container:
|
if new_container:
|
||||||
scroll_container = new_container
|
scroll_container = new_container
|
||||||
print(" 🔄 Refreshed scroll container reference")
|
log.info(" 🔄 Refreshed scroll container reference")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -495,7 +532,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
return count;
|
return count;
|
||||||
""")
|
""")
|
||||||
if expanded > 0:
|
if expanded > 0:
|
||||||
print(f" 📝 Expanded {expanded} truncated reviews")
|
log.info(f" 📝 Expanded {expanded} truncated reviews")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -506,7 +543,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
})
|
})
|
||||||
driver.execute_cdp_cmd('Network.enable', {})
|
driver.execute_cdp_cmd('Network.enable', {})
|
||||||
if not is_refresh:
|
if not is_refresh:
|
||||||
print(" 🚫 Blocking images for faster scrolling")
|
log.info(" 🚫 Blocking images for faster scrolling")
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -605,7 +642,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
hard_refresh_count[0] += 1
|
hard_refresh_count[0] += 1
|
||||||
|
|
||||||
if hard_refresh_count[0] > max_hard_refreshes:
|
if hard_refresh_count[0] > max_hard_refreshes:
|
||||||
print(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
log.warning(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Stop current scroll worker
|
# Stop current scroll worker
|
||||||
@@ -618,10 +655,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
scroll_container = new_container
|
scroll_container = new_container
|
||||||
stop_scrolling = new_stop
|
stop_scrolling = new_stop
|
||||||
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
recovery_count[0] = 0 # Reset recovery count after successful refresh
|
||||||
print(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
log.info(f" ✅ Hard refresh successful, resuming with {len(seen_ids)} reviews already collected")
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
print(f" ❌ Hard refresh failed to find scroll container")
|
log.error(f" ❌ Hard refresh failed to find scroll container")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Main collection loop
|
# Main collection loop
|
||||||
@@ -629,7 +666,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
last_count = len(reviews)
|
last_count = len(reviews)
|
||||||
check_num = 0
|
check_num = 0
|
||||||
|
|
||||||
print(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True)
|
log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)")
|
||||||
|
|
||||||
cycle_start = time.time()
|
cycle_start = time.time()
|
||||||
while True:
|
while True:
|
||||||
@@ -711,8 +748,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
processedIds.add(rid);
|
processedIds.add(rid);
|
||||||
|
|
||||||
// Already seen from API - just track order, skip content
|
// Already seen from API - just track order, skip content
|
||||||
|
// BUT still hide the card to keep DOM light!
|
||||||
if (seenSet.has(rid)) {
|
if (seenSet.has(rid)) {
|
||||||
results.push({id: rid, orderOnly: true});
|
results.push({id: rid, orderOnly: true});
|
||||||
|
// Hide this card since we already have its data from API
|
||||||
|
card.style.display = 'none';
|
||||||
|
card.innerHTML = '';
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -769,10 +810,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
timestamp: timestamp,
|
timestamp: timestamp,
|
||||||
source: 'dom'
|
source: 'dom'
|
||||||
});
|
});
|
||||||
// Hide processed card (separators removed on next cycle)
|
|
||||||
card.style.display = 'none';
|
|
||||||
card.innerHTML = '';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ALWAYS hide processed cards to keep DOM light
|
||||||
|
// (even if extraction failed - we've seen this card)
|
||||||
|
card.style.display = 'none';
|
||||||
|
card.innerHTML = '';
|
||||||
}
|
}
|
||||||
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
|
return {reviews: results, cardCount: cards.length, sepsRemoved: sepsRemoved};
|
||||||
""", seen_list)
|
""", seen_list)
|
||||||
@@ -791,14 +834,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
reviews[rid] = rev
|
reviews[rid] = rev
|
||||||
seen_ids.add(rid)
|
seen_ids.add(rid)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" ❌ DOM parse error: {e}")
|
log.error(f" ❌ DOM parse error: {e}")
|
||||||
dom_time = time.time() - t2
|
dom_time = time.time() - t2
|
||||||
|
|
||||||
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
|
||||||
# Sort by DOM order before flushing
|
# Sort by DOM order before flushing
|
||||||
t3 = time.time()
|
t3 = time.time()
|
||||||
if flush_callback and len(reviews) >= flush_batch_size:
|
if flush_callback and len(reviews) >= flush_batch_size:
|
||||||
print(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
log.info(f" 💾 Flushing {len(reviews)} reviews to disk...")
|
||||||
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
flush_callback([r for _, r in sorted_reviews])
|
flush_callback([r for _, r in sorted_reviews])
|
||||||
total_flushed[0] += len(reviews)
|
total_flushed[0] += len(reviews)
|
||||||
@@ -809,7 +852,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
|
|
||||||
# TIMING: Print if cycle is slow (>2s)
|
# TIMING: Print if cycle is slow (>2s)
|
||||||
if cycle_delta > 2.0:
|
if cycle_delta > 2.0:
|
||||||
print(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
log.warning(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})")
|
||||||
|
|
||||||
# Check for new reviews
|
# Check for new reviews
|
||||||
if current_count > last_count:
|
if current_count > last_count:
|
||||||
@@ -844,19 +887,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
elapsed = time.time() - last_new_time
|
elapsed = time.time() - last_new_time
|
||||||
if total_reviews[0]:
|
if total_reviews[0]:
|
||||||
pct = (current_count / total_reviews[0]) * 100
|
pct = (current_count / total_reviews[0]) * 100
|
||||||
print(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", flush=True)
|
log.info(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s")
|
||||||
else:
|
else:
|
||||||
print(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", flush=True)
|
log.info(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s")
|
||||||
|
|
||||||
|
# Call progress callback on every iteration (for real-time log updates)
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(current_count, total_reviews[0])
|
||||||
|
|
||||||
# Stop conditions - check BEFORE recovery attempts
|
# Stop conditions - check BEFORE recovery attempts
|
||||||
if current_count >= max_reviews:
|
if current_count >= max_reviews:
|
||||||
print(f"✅ Reached max: {current_count}")
|
log.info(f"✅ Reached max: {current_count}")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
# Also stop if we have all reviews from the page
|
# Also stop if we have all reviews from the page
|
||||||
if total_reviews[0] and current_count >= total_reviews[0]:
|
if total_reviews[0] and current_count >= total_reviews[0]:
|
||||||
print(f"✅ All {current_count} reviews collected")
|
log.info(f"✅ All {current_count} reviews collected")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -865,12 +912,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
if elapsed >= 3 and int(elapsed) % 3 == 0:
|
||||||
# After 8+ failed recovery attempts, try hard refresh
|
# After 8+ failed recovery attempts, try hard refresh
|
||||||
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
|
||||||
print(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...", flush=True)
|
log.info(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...")
|
||||||
if do_hard_refresh():
|
if do_hard_refresh():
|
||||||
last_new_time = time.time() # Reset timer after refresh
|
last_new_time = time.time() # Reset timer after refresh
|
||||||
continue # Skip to next iteration
|
continue # Skip to next iteration
|
||||||
else:
|
else:
|
||||||
print(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...", flush=True)
|
log.info(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...")
|
||||||
unstick_scroll()
|
unstick_scroll()
|
||||||
|
|
||||||
# Check scroll state - track if content is still being added
|
# Check scroll state - track if content is still being added
|
||||||
@@ -906,24 +953,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
if truly_done or timeout_hit:
|
if truly_done or timeout_hit:
|
||||||
# Last chance: try hard refresh before giving up
|
# Last chance: try hard refresh before giving up
|
||||||
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
|
||||||
print(f" 🔄 Timeout reached, trying hard refresh before giving up...", flush=True)
|
log.info(f" 🔄 Timeout reached, trying hard refresh before giving up...")
|
||||||
if do_hard_refresh():
|
if do_hard_refresh():
|
||||||
last_new_time = time.time()
|
last_new_time = time.time()
|
||||||
continue # Keep trying
|
continue # Keep trying
|
||||||
print(f"✅ All reviews loaded: {current_count}")
|
log.info(f"✅ All reviews loaded: {current_count}")
|
||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
# Flush any remaining reviews (sorted by DOM order)
|
# Flush any remaining reviews (sorted by DOM order)
|
||||||
if flush_callback and reviews:
|
if flush_callback and reviews:
|
||||||
print(f" 💾 Final flush: {len(reviews)} reviews...")
|
log.info(f" 💾 Final flush: {len(reviews)} reviews...")
|
||||||
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
flush_callback([r for _, r in sorted_reviews])
|
flush_callback([r for _, r in sorted_reviews])
|
||||||
total_flushed[0] += len(reviews)
|
total_flushed[0] += len(reviews)
|
||||||
reviews.clear()
|
reviews.clear()
|
||||||
|
|
||||||
# Reviews already parsed during scrolling (real-time parsing)
|
# Reviews already parsed during scrolling (real-time parsing)
|
||||||
print("📝 Finalizing review data...")
|
log.info("📝 Finalizing review data...")
|
||||||
|
|
||||||
# Final results (sorted by DOM order)
|
# Final results (sorted by DOM order)
|
||||||
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
|
||||||
@@ -933,21 +980,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||||
|
|
||||||
if total_flushed[0] > 0:
|
if total_flushed[0] > 0:
|
||||||
print(f"\n📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})")
|
||||||
else:
|
else:
|
||||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||||
"total": grand_total,
|
"total": grand_total,
|
||||||
"total_flushed": total_flushed[0],
|
"total_flushed": total_flushed[0],
|
||||||
"checks": check_num,
|
"checks": check_num,
|
||||||
"url": url
|
"url": url,
|
||||||
|
"logs": log.get_logs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||||
progress_callback=None, driver=None, return_driver: bool = False):
|
progress_callback=None, driver=None, return_driver: bool = False,
|
||||||
|
log_capture: LogCapture = None):
|
||||||
"""
|
"""
|
||||||
Production-compatible wrapper for scrape_reviews.
|
Production-compatible wrapper for scrape_reviews.
|
||||||
Matches the API expected by job_manager.py.
|
Matches the API expected by job_manager.py.
|
||||||
@@ -959,9 +1008,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
progress_callback: Optional callback(current_count, total_count) for progress
|
progress_callback: Optional callback(current_count, total_count) for progress
|
||||||
driver: Existing driver instance to reuse
|
driver: Existing driver instance to reuse
|
||||||
return_driver: If True, return driver in result
|
return_driver: If True, return driver in result
|
||||||
|
log_capture: Optional LogCapture instance for real-time log access
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver
|
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||||
"""
|
"""
|
||||||
from seleniumbase import Driver
|
from seleniumbase import Driver
|
||||||
|
|
||||||
@@ -969,6 +1019,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
driver_provided = driver is not None
|
driver_provided = driver is not None
|
||||||
should_close_driver = not return_driver and not driver_provided
|
should_close_driver = not return_driver and not driver_provided
|
||||||
|
|
||||||
|
# Use provided log_capture or create new one
|
||||||
|
log_capture = log_capture or LogCapture()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Create driver if not provided
|
# Create driver if not provided
|
||||||
if not driver:
|
if not driver:
|
||||||
@@ -980,6 +1033,25 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
)
|
)
|
||||||
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
||||||
|
|
||||||
|
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||||
|
# This ensures Google Maps shows US results regardless of server location
|
||||||
|
try:
|
||||||
|
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||||
|
'latitude': 42.3601,
|
||||||
|
'longitude': -71.0589,
|
||||||
|
'accuracy': 100
|
||||||
|
})
|
||||||
|
log_capture.info("Set geolocation to US (Boston, MA)")
|
||||||
|
except Exception as e:
|
||||||
|
log_capture.warning(f"Could not set geolocation: {e}")
|
||||||
|
|
||||||
|
# Add URL parameters for consistent results
|
||||||
|
if 'hl=' not in url:
|
||||||
|
separator = '&' if '?' in url else '?'
|
||||||
|
url = f"{url}{separator}hl=en"
|
||||||
|
if 'gl=' not in url:
|
||||||
|
url = f"{url}&gl=us"
|
||||||
|
|
||||||
# Create progress wrapper if callback provided
|
# Create progress wrapper if callback provided
|
||||||
flush_callback = None
|
flush_callback = None
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
@@ -989,14 +1061,16 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
progress_callback(collected[0], None)
|
progress_callback(collected[0], None)
|
||||||
flush_callback = flush_with_progress
|
flush_callback = flush_with_progress
|
||||||
|
|
||||||
# Run the scraper
|
# Run the scraper with progress callback for real-time updates
|
||||||
result = scrape_reviews(
|
result = scrape_reviews(
|
||||||
driver=driver,
|
driver=driver,
|
||||||
url=url,
|
url=url,
|
||||||
max_reviews=999999, # Effectively unlimited
|
max_reviews=999999, # Effectively unlimited
|
||||||
timeout_no_new=15,
|
timeout_no_new=15,
|
||||||
flush_callback=flush_callback,
|
flush_callback=flush_callback,
|
||||||
flush_batch_size=100 # Smaller batches for more frequent progress
|
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||||
|
log_capture=log_capture,
|
||||||
|
progress_callback=progress_callback # Pass through for real-time log updates
|
||||||
)
|
)
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
@@ -1008,7 +1082,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
"total_reviews": result.get("total", 0),
|
"total_reviews": result.get("total", 0),
|
||||||
"time": elapsed,
|
"time": elapsed,
|
||||||
"success": True,
|
"success": True,
|
||||||
"error": None
|
"error": None,
|
||||||
|
"logs": result.get("logs", [])
|
||||||
}
|
}
|
||||||
|
|
||||||
if return_driver:
|
if return_driver:
|
||||||
@@ -1030,6 +1105,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Log error to the existing log_capture
|
||||||
|
log_capture.error(f"Scraper failed: {str(e)}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reviews": [],
|
"reviews": [],
|
||||||
"count": 0,
|
"count": 0,
|
||||||
@@ -1037,7 +1115,8 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
"time": elapsed,
|
"time": elapsed,
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": str(e),
|
"error": str(e),
|
||||||
"driver": driver if return_driver else None
|
"driver": driver if return_driver else None,
|
||||||
|
"logs": log_capture.get_logs()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user