From cbc2e9c61798f0b0a4a8a5dca71532e8eacd6a95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Thu, 22 Jan 2026 10:20:51 +0000 Subject: [PATCH] Robust selectors: Replace CSS class names with data/aria attributes - Use [data-review-id] + aria-label check for review cards - Extract author from button[aria-label^="Photo of"] - Use span[role="img"][aria-label*="star"] for rating - Pattern matching for timestamp ("X time ago") - Longest text span heuristic for review text A/B tested: 100% match with old class-based selectors. Survives Google's CSS class name changes. Co-Authored-By: Claude Opus 4.5 --- modules/scraper_clean.py | 65 +++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py index 2fb8681..3a2950a 100644 --- a/modules/scraper_clean.py +++ b/modules/scraper_clean.py @@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in seen_ids.add(key) api_time = time.time() - t1 - # Parse reviews in real-time using JavaScript (FAST - single browser call) - # This replaces slow Python loop with Selenium round-trips + # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes) + # This survives Google's CSS class name changes t2 = time.time() dom_cards = 0 try: - # Pass seen_ids to JS so it can skip already-processed reviews seen_list = list(seen_ids) parsed_reviews = driver.execute_script(""" var seenSet = new Set(arguments[0]); var results = []; - var cards = document.querySelectorAll('div.jftiEf[data-review-id]'); + var processedIds = new Set(); + + // ROBUST: Find cards by data attribute only (not class names) + var cards = document.querySelectorAll('[data-review-id]'); for (var i = 0; i < cards.length; i++) { var card = cards[i]; var rid = card.getAttribute('data-review-id'); - if (!rid || seenSet.has(rid)) continue; - // Parse review data + // Skip duplicates and already-seen + if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue; + + // Only process top-level review cards (have aria-label with author name) + if (!card.getAttribute('aria-label')) continue; + processedIds.add(rid); + var author = '', text = '', rating = 0, timestamp = ''; - // Author name - var authorEl = card.querySelector('.d4r55'); - if (authorEl) author = authorEl.textContent.trim(); + // AUTHOR: Extract from "Photo of {Name}" button aria-label + var photoBtn = card.querySelector('button[aria-label^="Photo of"]'); + if (photoBtn) { + author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim(); + } + // Fallback: card's own aria-label is the author name + if (!author) { + author = card.getAttribute('aria-label') || ''; + } - // Rating from aria-label (e.g., "5 stars") - var ratingEl = card.querySelector('[aria-label*="star"]'); + // RATING: span with role="img" and aria-label containing "star" + var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]'); if (ratingEl) { var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); if (match) rating = parseInt(match[1]); } - // Review text (check for expanded version first) - var textEl = card.querySelector('.wiI7pd'); - if (textEl) text = textEl.textContent.trim(); + // TIMESTAMP: Find span with "X time ago" pattern + var spans = card.querySelectorAll('span'); + for (var j = 0; j < spans.length; j++) { + var spanText = spans[j].textContent.trim(); + if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) { + timestamp = spanText; + break; + } + } - // Timestamp - var timeEl = card.querySelector('.rsqaWe'); - if (timeEl) timestamp = timeEl.textContent.trim(); + // TEXT: Find longest text span (not timestamp/UI elements) + var longestText = ''; + for (var j = 0; j < spans.length; j++) { + var spanText = spans[j].textContent.trim(); + if (spanText === timestamp) continue; + if (spanText.match(/^\\d+ stars?$/i)) continue; + if (spanText === 'More' || spanText === 'Less') continue; + if (spanText.match(/^(Like\\d*|Share)$/)) continue; + if (spanText.length > longestText.length && spanText.length > 10) { + longestText = spanText; + } + } + text = longestText; if (author && rating >= 1 && rating <= 5) { results.push({ @@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in timestamp: timestamp, source: 'dom' }); - // Just hide the card (faster than remove, less disruptive) + // Hide processed card to keep DOM light card.style.display = 'none'; card.innerHTML = ''; }