Robust selectors: Replace CSS class names with data/aria attributes
- Use [data-review-id] + aria-label check for review cards
- Extract author from button[aria-label^="Photo of"]
- Use span[role="img"][aria-label*="star"] for rating
- Use pattern matching for the timestamp ("X time ago", e.g. "3 weeks ago" or "a month ago")
- Use a longest-text-span heuristic for the review text
A/B tested: results match the old class-based selectors 100%.
The new selectors survive Google's CSS class name changes.
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -531,44 +531,73 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
seen_ids.add(key)
|
seen_ids.add(key)
|
||||||
api_time = time.time() - t1
|
api_time = time.time() - t1
|
||||||
|
|
||||||
# Parse reviews in real-time using JavaScript (FAST - single browser call)
|
# Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
|
||||||
# This replaces slow Python loop with Selenium round-trips
|
# This survives Google's CSS class name changes
|
||||||
t2 = time.time()
|
t2 = time.time()
|
||||||
dom_cards = 0
|
dom_cards = 0
|
||||||
try:
|
try:
|
||||||
# Pass seen_ids to JS so it can skip already-processed reviews
|
|
||||||
seen_list = list(seen_ids)
|
seen_list = list(seen_ids)
|
||||||
parsed_reviews = driver.execute_script("""
|
parsed_reviews = driver.execute_script("""
|
||||||
var seenSet = new Set(arguments[0]);
|
var seenSet = new Set(arguments[0]);
|
||||||
var results = [];
|
var results = [];
|
||||||
var cards = document.querySelectorAll('div.jftiEf[data-review-id]');
|
var processedIds = new Set();
|
||||||
|
|
||||||
|
// ROBUST: Find cards by data attribute only (not class names)
|
||||||
|
var cards = document.querySelectorAll('[data-review-id]');
|
||||||
|
|
||||||
for (var i = 0; i < cards.length; i++) {
|
for (var i = 0; i < cards.length; i++) {
|
||||||
var card = cards[i];
|
var card = cards[i];
|
||||||
var rid = card.getAttribute('data-review-id');
|
var rid = card.getAttribute('data-review-id');
|
||||||
if (!rid || seenSet.has(rid)) continue;
|
|
||||||
|
|
||||||
// Parse review data
|
// Skip duplicates and already-seen
|
||||||
|
if (!rid || seenSet.has(rid) || processedIds.has(rid)) continue;
|
||||||
|
|
||||||
|
// Only process top-level review cards (have aria-label with author name)
|
||||||
|
if (!card.getAttribute('aria-label')) continue;
|
||||||
|
processedIds.add(rid);
|
||||||
|
|
||||||
var author = '', text = '', rating = 0, timestamp = '';
|
var author = '', text = '', rating = 0, timestamp = '';
|
||||||
|
|
||||||
// Author name
|
// AUTHOR: Extract from "Photo of {Name}" button aria-label
|
||||||
var authorEl = card.querySelector('.d4r55');
|
var photoBtn = card.querySelector('button[aria-label^="Photo of"]');
|
||||||
if (authorEl) author = authorEl.textContent.trim();
|
if (photoBtn) {
|
||||||
|
author = photoBtn.getAttribute('aria-label').replace('Photo of ', '').trim();
|
||||||
|
}
|
||||||
|
// Fallback: card's own aria-label is the author name
|
||||||
|
if (!author) {
|
||||||
|
author = card.getAttribute('aria-label') || '';
|
||||||
|
}
|
||||||
|
|
||||||
// Rating from aria-label (e.g., "5 stars")
|
// RATING: span with role="img" and aria-label containing "star"
|
||||||
var ratingEl = card.querySelector('[aria-label*="star"]');
|
var ratingEl = card.querySelector('span[role="img"][aria-label*="star"]');
|
||||||
if (ratingEl) {
|
if (ratingEl) {
|
||||||
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
var match = ratingEl.getAttribute('aria-label').match(/(\\d)/);
|
||||||
if (match) rating = parseInt(match[1]);
|
if (match) rating = parseInt(match[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Review text (check for expanded version first)
|
// TIMESTAMP: Find span with "X time ago" pattern
|
||||||
var textEl = card.querySelector('.wiI7pd');
|
var spans = card.querySelectorAll('span');
|
||||||
if (textEl) text = textEl.textContent.trim();
|
for (var j = 0; j < spans.length; j++) {
|
||||||
|
var spanText = spans[j].textContent.trim();
|
||||||
|
if (spanText.match(/^(\\d+|a|an)\\s+(second|minute|hour|day|week|month|year)s?\\s+ago$/i)) {
|
||||||
|
timestamp = spanText;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Timestamp
|
// TEXT: Find longest text span (not timestamp/UI elements)
|
||||||
var timeEl = card.querySelector('.rsqaWe');
|
var longestText = '';
|
||||||
if (timeEl) timestamp = timeEl.textContent.trim();
|
for (var j = 0; j < spans.length; j++) {
|
||||||
|
var spanText = spans[j].textContent.trim();
|
||||||
|
if (spanText === timestamp) continue;
|
||||||
|
if (spanText.match(/^\\d+ stars?$/i)) continue;
|
||||||
|
if (spanText === 'More' || spanText === 'Less') continue;
|
||||||
|
if (spanText.match(/^(Like\\d*|Share)$/)) continue;
|
||||||
|
if (spanText.length > longestText.length && spanText.length > 10) {
|
||||||
|
longestText = spanText;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
text = longestText;
|
||||||
|
|
||||||
if (author && rating >= 1 && rating <= 5) {
|
if (author && rating >= 1 && rating <= 5) {
|
||||||
results.push({
|
results.push({
|
||||||
@@ -579,7 +608,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
timestamp: timestamp,
|
timestamp: timestamp,
|
||||||
source: 'dom'
|
source: 'dom'
|
||||||
});
|
});
|
||||||
// Just hide the card (faster than remove, less disruptive)
|
// Hide processed card to keep DOM light
|
||||||
card.style.display = 'none';
|
card.style.display = 'none';
|
||||||
card.innerHTML = '';
|
card.innerHTML = '';
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user