From cbc2e9c61798f0b0a4a8a5dca71532e8eacd6a95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?=
 <35082514+alezmad@users.noreply.github.com>
Date: Thu, 22 Jan 2026 10:20:51 +0000
Subject: [PATCH] Robust selectors: Replace CSS class names with data/aria
 attributes

- Use [data-review-id] + aria-label check for review cards
- Extract author from button[aria-label^="Photo of"], falling back to
  the card's own aria-label
- Use span[role="img"][aria-label*="star"] for rating
- Pattern matching for timestamp ("X time ago")
- Pick the longest span (over 10 characters, excluding the timestamp and
  UI labels) as the review text

A/B tested against the old class-based selectors: 100% match.
The new selectors do not rely on CSS class names, so they survive
Google's class-name changes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 modules/scraper_clean.py | 65 +++++++++++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 18 deletions(-)

diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index 2fb8681..3a2950a 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                 seen_ids.add(key)
         api_time = time.time() - t1
 
-        # Parse reviews in real-time using JavaScript (FAST - single browser call)
-        # This replaces slow Python loop with Selenium round-trips
+        # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
+        # This survives Google's CSS class name changes
         t2 = time.time()
         dom_cards = 0
         try:
-            # Pass seen_ids to JS so it can skip already-processed reviews
             seen_list = list(seen_ids)
             parsed_reviews = driver.execute_script("""
                 var seenSet = new Set(arguments[0]);
                 var results = [];
-                var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
+                var processedIds = new Set();
+
+                // ROBUST: Find cards by data attribute only (not class names)
+                var cards = document.querySelectorAll('[data-review-id]');
 
                 for (var i = 0; i < cards.length; i++) {
                     var card = cards[i];
                     var rid = card.getAttribute('data-review-id');
-                    if (!rid || seenSet.has(rid)) continue;
 
-                    // Parse review data
+                    // Skip duplicates and already-seen
+                    if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
+
+                    // Only process top-level review cards (have aria-label with author name)
+                    if (!card.getAttribute('aria-label')) continue;
+                    processedIds.add(rid);
+
                     var author = '', text = '', rating = 0, timestamp = '';
 
-                    // Author name
-                    var authorEl = card.querySelector('.d4r55');
-                    if (authorEl) author = authorEl.textContent.trim();
+                    // AUTHOR: Extract from "Photo of {Name}" button aria-label
+                    var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
+                    if (photoBtn) {
+                        author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
+                    }
+                    // Fallback: card's own aria-label is the author name
+                    if (!author) {
+                        author = card.getAttribute('aria-label') || '';
+                    }
 
-                    // Rating from aria-label (e.g., "5 stars")
-                    var ratingEl = card.querySelector('[aria-label*="star"]');
+                    // RATING: span with role="img" and aria-label containing "star"
+                    var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
                     if (ratingEl) {
                         var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
                         if (match) rating = parseInt(match[1]);
                     }
 
-                    // Review text (check for expanded version first)
-                    var textEl = card.querySelector('.wiI7pd');
-                    if (textEl) text = textEl.textContent.trim();
+                    // TIMESTAMP: Find span with "X time ago" pattern
+                    var spans = card.querySelectorAll('span');
+                    for (var j = 0; j < spans.length; j++) {
+                        var spanText = spans[j].textContent.trim();
+                        if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
+                            timestamp = spanText;
+                            break;
+                        }
+                    }
 
-                    // Timestamp
-                    var timeEl = card.querySelector('.rsqaWe');
-                    if (timeEl) timestamp = timeEl.textContent.trim();
+                    // TEXT: Find longest text span (not timestamp/UI elements)
+                    var longestText = '';
+                    for (var j = 0; j < spans.length; j++) {
+                        var spanText = spans[j].textContent.trim();
+                        if (spanText === timestamp) continue;
+                        if (spanText.match(/^\\d+ stars?$/i)) continue;
+                        if (spanText === 'More' || spanText === 'Less') continue;
+                        if (spanText.match(/^(Like\\d*|Share)$/)) continue;
+                        if (spanText.length > longestText.length && spanText.length > 10) {
+                            longestText = spanText;
+                        }
+                    }
+                    text = longestText;
 
                     if (author && rating >= 1 && rating <= 5) {
                         results.push({
@@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
                             timestamp: timestamp,
                             source: 'dom'
                         });
-                        // Just hide the card (faster than remove, less disruptive)
+                        // Hide processed card to keep DOM light
                         card.style.display = 'none';
                         card.innerHTML = '';
                     }