Robust selectors: Replace CSS class names with data/aria attributes

- Select review cards by [data-review-id], keeping only elements that also have an aria-label
- Extract author from button[aria-label^="Photo of"]
- Use span[role="img"][aria-label*="star"] for rating
- Match timestamp spans against an "X <unit> ago" text pattern (e.g. "2 weeks ago", "a month ago")
- Take the longest span text (more than 10 characters, skipping timestamp and UI labels) as the review text

A/B tested: 100% match with old class-based selectors.
Survives Google's CSS class name changes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 10:20:51 +00:00
parent d989178119
commit cbc2e9c617

View File

@@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
seen_ids.add(key) seen_ids.add(key)
api_time = time.time() - t1 api_time = time.time() - t1
# Parse reviews in real-time using JavaScript (FAST - single browser call) # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This replaces slow Python loop with Selenium round-trips # This survives Google's CSS class name changes
t2 = time.time() t2 = time.time()
dom_cards = 0 dom_cards = 0
try: try:
# Pass seen_ids to JS so it can skip already-processed reviews
seen_list = list(seen_ids) seen_list = list(seen_ids)
parsed_reviews = driver.execute_script(""" parsed_reviews = driver.execute_script("""
var seenSet = new Set(arguments[0]); var seenSet = new Set(arguments[0]);
var results = []; var results = [];
var cards = document.querySelectorAll('div.jftiEf[data-review-id]'); var processedIds = new Set();
// ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]');
for (var i = 0; i < cards.length; i++) { for (var i = 0; i < cards.length; i++) {
var card = cards[i]; var card = cards[i];
var rid = card.getAttribute('data-review-id'); var rid = card.getAttribute('data-review-id');
if (!rid || seenSet.has(rid)) continue;
// Parse review data // Skip duplicates and already-seen
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
// Only process top-level review cards (have aria-label with author name)
if (!card.getAttribute('aria-label')) continue;
processedIds.add(rid);
var author = '', text = '', rating = 0, timestamp = ''; var author = '', text = '', rating = 0, timestamp = '';
// Author name // AUTHOR: Extract from "Photo of {Name}" button aria-label
var authorEl = card.querySelector('.d4r55'); var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
if (authorEl) author = authorEl.textContent.trim(); if (photoBtn) {
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
}
// Fallback: card's own aria-label is the author name
if (!author) {
author = card.getAttribute('aria-label') || '';
}
// Rating from aria-label (e.g., "5 stars") // RATING: span with role="img" and aria-label containing "star"
var ratingEl = card.querySelector('[aria-label*="star"]'); var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
if (ratingEl) { if (ratingEl) {
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/); var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
if (match) rating = parseInt(match[1]); if (match) rating = parseInt(match[1]);
} }
// Review text (check for expanded version first) // TIMESTAMP: Find span with "X time ago" pattern
var textEl = card.querySelector('.wiI7pd'); var spans = card.querySelectorAll('span');
if (textEl) text = textEl.textContent.trim(); for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
timestamp = spanText;
break;
}
}
// Timestamp // TEXT: Find longest text span (not timestamp/UI elements)
var timeEl = card.querySelector('.rsqaWe'); var longestText = '';
if (timeEl) timestamp = timeEl.textContent.trim(); for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText === timestamp) continue;
if (spanText.match(/^\\d+ stars?$/i)) continue;
if (spanText === 'More' || spanText === 'Less') continue;
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
if (spanText.length > longestText.length && spanText.length > 10) {
longestText = spanText;
}
}
text = longestText;
if (author && rating >= 1 && rating <= 5) { if (author && rating >= 1 && rating <= 5) {
results.push({ results.push({
@@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
timestamp: timestamp, timestamp: timestamp,
source: 'dom' source: 'dom'
}); });
// Just hide the card (faster than remove, less disruptive) // Hide processed card to keep DOM light
card.style.display = 'none'; card.style.display = 'none';
card.innerHTML = ''; card.innerHTML = '';
} }