diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 7881b00..2fb8681 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -183,7 +183,8 @@ def parse_dom_review(card) -> dict: return None -def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15) -> dict: +def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15, + flush_callback=None, flush_batch_size: int = 500) -> dict: """ Scrape Google Maps reviews. @@ -192,6 +193,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in url: Google Maps place URL max_reviews: Maximum reviews to collect timeout_no_new: Seconds to wait with no new reviews before stopping + flush_callback: Optional callback(reviews_list) called every flush_batch_size reviews + This allows streaming data to disk and freeing memory + flush_batch_size: Number of reviews to collect before flushing (default 500) Returns: dict with reviews list and metadata @@ -199,6 +203,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in # Storage - use review ID as key reviews = {} # review_id -> review + seen_ids = set() # Track all IDs we've seen (persists after flush) + total_flushed = [0] # Use list for closure mutation # Don't force language - let Google show all reviews in user's locale @@ -506,31 +512,105 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in print(f"šŸ”„ Scrolling... (timeout: {timeout_no_new}s with no new)", flush=True) + cycle_start = time.time() while True: check_num += 1 time.sleep(1.0) # Check every second + # TIMING: Track cycle performance + t0 = time.time() + cycle_delta = t0 - cycle_start + cycle_start = t0 + # Collect from API (doesn't affect scroll) + t1 = time.time() for rev in get_api_reviews(): - if not any(r.get("author") == rev["author"] for r in reviews.values()): - key = f"api_{rev['author'][:20]}_{rev['rating']}" + key = f"api_{rev['author'][:20]}_{rev['rating']}" + if key not in seen_ids: reviews[key] = rev + seen_ids.add(key) + api_time = time.time() - t1 - # Parse reviews in real-time (Google Maps uses virtual scroll - elements get removed!) - # We must parse NOW, not later + # Parse reviews in real-time using JavaScript (FAST - single browser call) + # This replaces slow Python loop with Selenium round-trips + t2 = time.time() + dom_cards = 0 try: - cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]") - for card in cards: - rid = card.get_attribute("data-review-id") - if rid and rid not in reviews: - # Parse immediately - element may be gone later! - review = parse_dom_review(card) - if review: - reviews[rid] = review - except: - pass + # Pass seen_ids to JS so it can skip already-processed reviews + seen_list = list(seen_ids) + parsed_reviews = driver.execute_script(""" + var seenSet = new Set(arguments[0]); + var results = []; + var cards = document.querySelectorAll('div.jftiEf[data-review-id]'); - current_count = len(reviews) + for (var i = 0; i < cards.length; i++) { + var card = cards[i]; + var rid = card.getAttribute('data-review-id'); + if (!rid || seenSet.has(rid)) continue; + + // Parse review data + var author = '', text = '', rating = 0, timestamp = ''; + + // Author name + var authorEl = card.querySelector('.d4r55'); + if (authorEl) author = authorEl.textContent.trim(); + + // Rating from aria-label (e.g., "5 stars") + var ratingEl = card.querySelector('[aria-label*="star"]'); + if (ratingEl) { + var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); + if (match) rating = parseInt(match[1]); + } + + // Review text (check for expanded version first) + var textEl = card.querySelector('.wiI7pd'); + if (textEl) text = textEl.textContent.trim(); + + // Timestamp + var timeEl = card.querySelector('.rsqaWe'); + if (timeEl) timestamp = timeEl.textContent.trim(); + + if (author && rating >= 1 && rating <= 5) { + results.push({ + id: rid, + author: author, + text: text, + rating: rating, + timestamp: timestamp, + source: 'dom' + }); + // Just hide the card (faster than remove, less disruptive) + card.style.display = 'none'; + card.innerHTML = ''; + } + } + return {reviews: results, cardCount: cards.length}; + """, seen_list) + + dom_cards = parsed_reviews.get('cardCount', 0) if parsed_reviews else 0 + new_reviews = parsed_reviews.get('reviews', []) if parsed_reviews else [] + for rev in new_reviews: + rid = rev.pop('id') + reviews[rid] = rev + seen_ids.add(rid) + except Exception as e: + print(f" āŒ DOM parse error: {e}") + dom_time = time.time() - t2 + + # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory + t3 = time.time() + if flush_callback and len(reviews) >= flush_batch_size: + print(f" šŸ’¾ Flushing {len(reviews)} reviews to disk...") + flush_callback(list(reviews.values())) + total_flushed[0] += len(reviews) + reviews.clear() # Free memory, but keep seen_ids + flush_time = time.time() - t3 + + current_count = total_flushed[0] + len(reviews) + + # TIMING: Print if cycle is slow (>2s) + if cycle_delta > 2.0: + print(f" āš ļø SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") # Check for new reviews if current_count > last_count: @@ -615,31 +695,31 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in stop_scrolling.set() break + # Flush any remaining reviews + if flush_callback and reviews: + print(f" šŸ’¾ Final flush: {len(reviews)} reviews...") + flush_callback(list(reviews.values())) + total_flushed[0] += len(reviews) + reviews.clear() + # Reviews already parsed during scrolling (real-time parsing) print("šŸ“ Finalizing review data...") - # Separate API and DOM reviews - api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"} - dom_reviews = {k: v for k, v in reviews.items() if v.get("source") == "dom"} - - # Merge API reviews (only add if not already in DOM) - api_added = 0 - for key, api_rev in api_reviews_collected.items(): - # Check if this author already exists in DOM reviews - author = api_rev.get("author", "") - if author and not any(r.get("author") == author for r in reviews.values()): - reviews[f"api_{key}"] = api_rev - api_added += 1 - # Final results review_list = list(reviews.values()) + grand_total = total_flushed[0] + len(review_list) dom_count = sum(1 for r in review_list if r.get("source") == "dom") api_count = sum(1 for r in review_list if r.get("source") == "api") - print(f"\nšŸ“‹ Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") + + if total_flushed[0] > 0: + print(f"\nšŸ“‹ Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") + else: + print(f"\nšŸ“‹ Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") return { - "reviews": review_list, - "total": len(review_list), + "reviews": review_list, # Only unflushed reviews (flushed already sent to callback) + "total": grand_total, + "total_flushed": total_flushed[0], "checks": check_num, "url": url }