Robust selectors: Replace CSS class names with data/aria attributes
- Use [data-review-id] + aria-label check for review cards
- Extract author from button[aria-label^="Photo of"]
- Use span[role="img"][aria-label*="star"] for rating
- Match the timestamp by pattern ("X time ago", e.g. "3 weeks ago")
- Use a longest-text-span heuristic for the review text
A/B tested: 100% match with old class-based selectors.
Survives Google's CSS class name changes.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
seen_ids.add(key)
|
||||
api_time = time.time() - t1
|
||||
|
||||
# Parse reviews in real-time using JavaScript (FAST - single browser call)
|
||||
# This replaces slow Python loop with Selenium round-trips
|
||||
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||
# This survives Google's CSS class name changes
|
||||
t2 = time.time()
|
||||
dom_cards = 0
|
||||
try:
|
||||
# Pass seen_ids to JS so it can skip already-processed reviews
|
||||
seen_list = list(seen_ids)
|
||||
parsed_reviews = driver.execute_script("""
|
||||
var seenSet = new Set(arguments[0]);
|
||||
var results = [];
|
||||
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
||||
var processedIds = new Set();
|
||||
|
||||
// ROBUST: Find cards by data attribute only (not class names)
|
||||
var cards = document.querySelectorAll('[data-review-id]');
|
||||
|
||||
for (var i = 0; i < cards.length; i++) {
|
||||
var card = cards[i];
|
||||
var rid = card.getAttribute('data-review-id');
|
||||
if (!rid || seenSet.has(rid)) continue;
|
||||
|
||||
// Parse review data
|
||||
// Skip duplicates and already-seen
|
||||
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
|
||||
|
||||
// Only process top-level review cards (have aria-label with author name)
|
||||
if (!card.getAttribute('aria-label')) continue;
|
||||
processedIds.add(rid);
|
||||
|
||||
var author = '', text = '', rating = 0, timestamp = '';
|
||||
|
||||
// Author name
|
||||
var authorEl = card.querySelector('.d4r55');
|
||||
if (authorEl) author = authorEl.textContent.trim();
|
||||
// AUTHOR: Extract from "Photo of {Name}" button aria-label
|
||||
var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
|
||||
if (photoBtn) {
|
||||
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
|
||||
}
|
||||
// Fallback: card's own aria-label is the author name
|
||||
if (!author) {
|
||||
author = card.getAttribute('aria-label') || '';
|
||||
}
|
||||
|
||||
// Rating from aria-label (e.g., "5 stars")
|
||||
var ratingEl = card.querySelector('[aria-label*="star"]');
|
||||
// RATING: span with role="img" and aria-label containing "star"
|
||||
var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
|
||||
if (ratingEl) {
|
||||
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
||||
if (match) rating = parseInt(match[1]);
|
||||
}
|
||||
|
||||
// Review text (check for expanded version first)
|
||||
var textEl = card.querySelector('.wiI7pd');
|
||||
if (textEl) text = textEl.textContent.trim();
|
||||
// TIMESTAMP: Find span with "X time ago" pattern
|
||||
var spans = card.querySelectorAll('span');
|
||||
for (var j = 0; j < spans.length; j++) {
|
||||
var spanText = spans[j].textContent.trim();
|
||||
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
|
||||
timestamp = spanText;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Timestamp
|
||||
var timeEl = card.querySelector('.rsqaWe');
|
||||
if (timeEl) timestamp = timeEl.textContent.trim();
|
||||
// TEXT: Find longest text span (not timestamp/UI elements)
|
||||
var longestText = '';
|
||||
for (var j = 0; j < spans.length; j++) {
|
||||
var spanText = spans[j].textContent.trim();
|
||||
if (spanText === timestamp) continue;
|
||||
if (spanText.match(/^\\d+ stars?$/i)) continue;
|
||||
if (spanText === 'More' || spanText === 'Less') continue;
|
||||
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
|
||||
if (spanText.length > longestText.length && spanText.length > 10) {
|
||||
longestText = spanText;
|
||||
}
|
||||
}
|
||||
text = longestText;
|
||||
|
||||
if (author && rating >= 1 && rating <= 5) {
|
||||
results.push({
|
||||
@@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
timestamp: timestamp,
|
||||
source: 'dom'
|
||||
});
|
||||
// Just hide the card (faster than remove, less disruptive)
|
||||
// Hide processed card to keep DOM light
|
||||
card.style.display = 'none';
|
||||
card.innerHTML = '';
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user