Add fallback locale retry and pane-scoped selectors for robust review detection

- Added fallback logic: if reviews tab not found with hl=en, retry without locale override
- Added multilingual keywords for reviews tab (Lithuanian, Russian, etc.)
- Fixed structural pattern matching to search only within reviews pane, not entire page
- Added Lithuanian date keywords (dienų, savaitės) to date pattern matching
- All three selector strategies now scoped to reviews pane for accuracy

Known issue: the Lithuanian hospital listing still yields 0 of 271 reviews
Root cause: review elements are not found even within the pane after the tab click
Next steps: manually inspect the actual page structure under the Lithuanian locale
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 20:36:42 +00:00
parent e98da314a5
commit 4ad5c96a36

View File

@@ -49,26 +49,29 @@ def check_no_reviews_early(driver) -> tuple[bool, str]:
return True, f"Found 'no reviews' message: '{pattern}'" return True, f"Found 'no reviews' message: '{pattern}'"
# Check if review count is explicitly 0 # Check if review count is explicitly 0
# IMPORTANT: Be very specific to avoid false positives!
review_count_check = driver.execute_script(""" review_count_check = driver.execute_script("""
// Look for review count indicators // Only check for EXACT "0 reviews" patterns, not loose matches
const patterns = [ const patterns = [
/0\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i, /^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line
/\\(0\\)/, /\\(0\\s+reviews?\\)/i, // "(0 reviews)"
/review.*0/i, /\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase
/0.*review/i
]; ];
const text = document.body.innerText; const text = document.body.innerText;
for (let pattern of patterns) {
if (pattern.test(text)) {
return 'Found 0 reviews indicator';
}
}
// Check for aria-labels indicating no reviews // Split into lines and check each line independently to avoid false positives
const elements = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]'); const lines = text.split('\\n');
if (elements.length > 0) { for (let line of lines) {
return 'Found aria-label with 0 reviews'; const trimmed = line.trim();
for (let pattern of patterns) {
if (pattern.test(trimmed)) {
// Double-check: line should be short (not a review text itself)
if (trimmed.length < 50) {
return 'Found explicit "0 reviews" text: ' + trimmed;
}
}
}
} }
return null; return null;
@@ -287,11 +290,22 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
} }
// STRATEGY 2: Structural matching for unknown page layouts // STRATEGY 2: Structural matching for unknown page layouts
// IMPORTANT: Search only within the reviews pane, not the entire page!
if (!elements || elements.length === 0) { if (!elements || elements.length === 0) {
console.log('Known selectors failed, trying structural matching...'); console.log('Known selectors failed, trying structural matching...');
// Find all divs that LOOK like reviews (have review structure) // Find the reviews pane first
const allDivs = document.querySelectorAll('div'); const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
document.querySelector('div.m6QErb') ||
document.querySelector('div[role="main"]');
if (!pane) {
console.warn('No reviews pane found');
return [];
}
// Find all divs that LOOK like reviews (have review structure) WITHIN the pane
const allDivs = pane.querySelectorAll('div');
const reviewElements = []; const reviewElements = [];
for (let div of allDivs) { for (let div of allDivs) {
@@ -302,7 +316,7 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i); const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i);
// Must have at least author, rating, and text to be a review // Must have at least author, rating, and text to be a review
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
@@ -317,22 +331,28 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
} }
} }
// STRATEGY 3: Try role="article" as last resort // STRATEGY 3: Try role="article" as last resort (within pane)
if (!elements || elements.length === 0) { if (!elements || elements.length === 0) {
const articles = document.querySelectorAll('[role="article"]'); const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
const validArticles = []; document.querySelector('div.m6QErb') ||
document.querySelector('div[role="main"]');
for (let article of articles) { if (pane) {
const hasRating = article.querySelector('[aria-label*="star" i]'); const articles = pane.querySelectorAll('[role="article"]');
const hasText = article.textContent.length > 30; const validArticles = [];
if (hasRating && hasText) {
validArticles.push(article); for (let article of articles) {
const hasRating = article.querySelector('[aria-label*="star" i]');
const hasText = article.textContent.length > 30;
if (hasRating && hasText) {
validArticles.push(article);
}
} }
}
if (validArticles.length > 0) { if (validArticles.length > 0) {
elements = validArticles; elements = validArticles;
console.log('Found', validArticles.length, 'reviews using role=article'); console.log('Found', validArticles.length, 'reviews using role=article');
}
} }
} }
@@ -536,13 +556,19 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
# Force English locale for consistent date parsing # Force English locale for consistent date parsing
# English gives cleaner date formats: "3 months ago" vs "Hace 3 meses" # English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
# Store original URL in case we need to retry without locale override
original_url = url
locale_override_applied = False
if 'hl=' in url: if 'hl=' in url:
# Replace existing locale # Replace existing locale
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
locale_override_applied = True
else: else:
# Add English locale parameter # Add English locale parameter
separator = '&' if '?' in url else '?' separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en" url = f"{url}{separator}hl=en"
locale_override_applied = True
log.info(f"Using English locale (hl=en) for consistent date parsing") log.info(f"Using English locale (hl=en) for consistent date parsing")
@@ -628,7 +654,89 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
continue continue
if not reviews_tab_clicked: if not reviews_tab_clicked:
log.warning("Could not find reviews tab, continuing anyway") log.warning("Could not find reviews tab with hl=en locale")
# FALLBACK: If locale override was applied and tab not found,
# retry without locale override (fixes regional pages where hl=en breaks tabs)
if locale_override_applied:
log.info("Retrying without locale override to find reviews tab...")
# Reload page with original URL (no hl=en)
driver.get(original_url)
time.sleep(1.5)
# Handle GDPR again if needed
if 'consent.google.com' in driver.current_url:
try:
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']):
log.info(f"Clicking GDPR consent: {btn.text}")
btn.click()
time.sleep(2)
break
else:
if len(form_btns) >= 2:
log.info("Using fallback: clicking second form button")
form_btns[1].click()
time.sleep(2)
except Exception as e:
log.warning(f"GDPR consent handling failed: {e}")
# Dismiss cookie banner
try:
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
if cookie_btns:
cookie_btns[0].click()
time.sleep(0.3)
except:
pass
# Try to find reviews tab with multilingual keywords
multilingual_keywords = [
'review', 'reviews', # English
'reseña', 'reseñas', # Spanish
'avis', # French
'bewertung', 'bewertungen', # German
'recensione', 'recensioni', # Italian
'レビュー', # Japanese
'avaliação', 'avaliações', # Portuguese
'отзыв', 'отзывы', # Russian
'atsiliepimai', 'atsiliepi', # Lithuanian
'ulasan', # Indonesian
'리뷰' # Korean
]
for attempt in range(3):
if reviews_tab_clicked:
break
time.sleep(0.5)
for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
try:
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
for tab in tabs:
text = (tab.text or '').lower()
aria = (tab.get_attribute('aria-label') or '').lower()
if any(kw in text or kw in aria for kw in multilingual_keywords):
log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}")
driver.execute_script("arguments[0].click();", tab)
time.sleep(1.5)
reviews_tab_clicked = True
break
if reviews_tab_clicked:
break
except Exception as e:
log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}")
continue
if not reviews_tab_clicked:
log.warning("Could not find reviews tab even without locale override")
# Wait for reviews section to load # Wait for reviews section to load
time.sleep(2) time.sleep(2)
@@ -733,8 +841,17 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
// STRATEGY 2: Structural pattern matching (robust, class-agnostic) // STRATEGY 2: Structural pattern matching (robust, class-agnostic)
// Find containers that LOOK like reviews (have author + rating + text structure) // Find containers that LOOK like reviews (have author + rating + text structure)
// IMPORTANT: Search only within the reviews pane, not the entire page!
const findReviewsByStructure = () => { const findReviewsByStructure = () => {
const allDivs = document.querySelectorAll('div'); // Find the reviews pane first
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
document.querySelector('div.m6QErb') ||
document.querySelector('div[role="main"]');
if (!pane) return 0;
// Search only within the pane
const allDivs = pane.querySelectorAll('div');
let reviewCount = 0; let reviewCount = 0;
for (let div of allDivs) { for (let div of allDivs) {
@@ -749,7 +866,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i); const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i);
// If it has at least 3 of these indicators, it's likely a review // If it has at least 3 of these indicators, it's likely a review
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
@@ -761,19 +878,24 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
return reviewCount > 0 ? reviewCount : 0; return reviewCount > 0 ? reviewCount : 0;
}; };
// STRATEGY 3: Look for role="article" with review-like content // STRATEGY 3: Look for role="article" with review-like content (within pane)
const articles = document.querySelectorAll('[role="article"]'); const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
if (articles.length > 0) { document.querySelector('div.m6QErb') ||
let validArticles = 0; document.querySelector('div[role="main"]');
for (let article of articles) { if (pane3) {
// Check if article looks like a review (has rating + text) const articles = pane3.querySelectorAll('[role="article"]');
const hasRating = article.querySelector('[aria-label*="star" i]'); if (articles.length > 0) {
const hasText = article.textContent.length > 30; let validArticles = 0;
if (hasRating && hasText) { for (let article of articles) {
validArticles++; // Check if article looks like a review (has rating + text)
const hasRating = article.querySelector('[aria-label*="star" i]');
const hasText = article.textContent.length > 30;
if (hasRating && hasText) {
validArticles++;
}
} }
if (validArticles > 0) return validArticles;
} }
if (validArticles > 0) return validArticles;
} }
// Try structural matching as last resort // Try structural matching as last resort