Robust selectors: Replace CSS class names with data/aria attributes

- Use [data-review-id] + aria-label check for review cards
- Extract author from button[aria-label^="Photo of"]
- Use span[role="img"][aria-label*="star"] for rating
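- Match the timestamp by text pattern ("X time ago") instead of by class name
- Pick the longest text span as the review text, skipping the timestamp, star-rating label, and UI labels (More/Less/Like/Share)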

A/B tested against the old class-based selectors: 100% match.
The new selectors survive Google's CSS class name changes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-22 10:20:51 +00:00
parent d989178119
commit cbc2e9c617

View File

@@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
seen_ids.add(key)
api_time = time.time() - t1
# Parse reviews in real-time using JavaScript (FAST - single browser call)
# This replaces slow Python loop with Selenium round-trips
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
# This survives Google's CSS class name changes
t2 = time.time()
dom_cards = 0
try:
# Pass seen_ids to JS so it can skip already-processed reviews
seen_list = list(seen_ids)
parsed_reviews = driver.execute_script("""
var seenSet = new Set(arguments[0]);
var results = [];
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
var processedIds = new Set();
// ROBUST: Find cards by data attribute only (not class names)
var cards = document.querySelectorAll('[data-review-id]');
for (var i = 0; i < cards.length; i++) {
var card = cards[i];
var rid = card.getAttribute('data-review-id');
if (!rid || seenSet.has(rid)) continue;
// Parse review data
// Skip duplicates and already-seen
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
// Only process top-level review cards (have aria-label with author name)
if (!card.getAttribute('aria-label')) continue;
processedIds.add(rid);
var author = '', text = '', rating = 0, timestamp = '';
// Author name
var authorEl = card.querySelector('.d4r55');
if (authorEl) author = authorEl.textContent.trim();
// AUTHOR: Extract from "Photo of {Name}" button aria-label
var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
if (photoBtn) {
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
}
// Fallback: card's own aria-label is the author name
if (!author) {
author = card.getAttribute('aria-label') || '';
}
// Rating from aria-label (e.g., "5 stars")
var ratingEl = card.querySelector('[aria-label*="star"]');
// RATING: span with role="img" and aria-label containing "star"
var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
if (ratingEl) {
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
if (match) rating = parseInt(match[1]);
}
// Review text (check for expanded version first)
var textEl = card.querySelector('.wiI7pd');
if (textEl) text = textEl.textContent.trim();
// TIMESTAMP: Find span with "X time ago" pattern
var spans = card.querySelectorAll('span');
for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
timestamp = spanText;
break;
}
}
// Timestamp
var timeEl = card.querySelector('.rsqaWe');
if (timeEl) timestamp = timeEl.textContent.trim();
// TEXT: Find longest text span (not timestamp/UI elements)
var longestText = '';
for (var j = 0; j < spans.length; j++) {
var spanText = spans[j].textContent.trim();
if (spanText === timestamp) continue;
if (spanText.match(/^\\d+ stars?$/i)) continue;
if (spanText === 'More' || spanText === 'Less') continue;
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
if (spanText.length > longestText.length && spanText.length > 10) {
longestText = spanText;
}
}
text = longestText;
if (author && rating >= 1 && rating <= 5) {
results.push({
@@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
timestamp: timestamp,
source: 'dom'
});
// Just hide the card (faster than remove, less disruptive)
// Hide processed card to keep DOM light
card.style.display = 'none';
card.innerHTML = '';
}