Add fallback locale retry and pane-scoped selectors for robust review detection
- Added fallback logic: if the reviews tab is not found with hl=en, retry without the locale override
- Added multilingual keywords for the reviews tab (Lithuanian, Russian, etc.)
- Fixed structural pattern matching to search only within the reviews pane, not the entire page
- Added Lithuanian date keywords (dienų, savaitės) to date pattern matching
- All three selector strategies are now scoped to the reviews pane for accuracy

Issue: the Lithuanian hospital page still extracts 0/271 reviews.
Root cause: review elements are not found even within the pane after the tab click.
Next steps: manually inspect the actual page structure under the Lithuanian locale.
This commit is contained in:
@@ -49,26 +49,29 @@ def check_no_reviews_early(driver) -> tuple[bool, str]:
|
||||
return True, f"Found 'no reviews' message: '{pattern}'"
|
||||
|
||||
# Check if review count is explicitly 0
|
||||
# IMPORTANT: Be very specific to avoid false positives!
|
||||
review_count_check = driver.execute_script("""
|
||||
// Look for review count indicators
|
||||
// Only check for EXACT "0 reviews" patterns, not loose matches
|
||||
const patterns = [
|
||||
/0\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i,
|
||||
/\\(0\\)/,
|
||||
/review.*0/i,
|
||||
/0.*review/i
|
||||
/^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line
|
||||
/\\(0\\s+reviews?\\)/i, // "(0 reviews)"
|
||||
/\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase
|
||||
];
|
||||
|
||||
const text = document.body.innerText;
|
||||
for (let pattern of patterns) {
|
||||
if (pattern.test(text)) {
|
||||
return 'Found 0 reviews indicator';
|
||||
}
|
||||
}
|
||||
|
||||
// Check for aria-labels indicating no reviews
|
||||
const elements = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]');
|
||||
if (elements.length > 0) {
|
||||
return 'Found aria-label with 0 reviews';
|
||||
// Split into lines and check each line independently to avoid false positives
|
||||
const lines = text.split('\\n');
|
||||
for (let line of lines) {
|
||||
const trimmed = line.trim();
|
||||
for (let pattern of patterns) {
|
||||
if (pattern.test(trimmed)) {
|
||||
// Double-check: line should be short (not a review text itself)
|
||||
if (trimmed.length < 50) {
|
||||
return 'Found explicit "0 reviews" text: ' + trimmed;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
@@ -287,11 +290,22 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
||||
}
|
||||
|
||||
// STRATEGY 2: Structural matching for unknown page layouts
|
||||
// IMPORTANT: Search only within the reviews pane, not the entire page!
|
||||
if (!elements || elements.length === 0) {
|
||||
console.log('Known selectors failed, trying structural matching...');
|
||||
|
||||
// Find all divs that LOOK like reviews (have review structure)
|
||||
const allDivs = document.querySelectorAll('div');
|
||||
// Find the reviews pane first
|
||||
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||
document.querySelector('div.m6QErb') ||
|
||||
document.querySelector('div[role="main"]');
|
||||
|
||||
if (!pane) {
|
||||
console.warn('No reviews pane found');
|
||||
return [];
|
||||
}
|
||||
|
||||
// Find all divs that LOOK like reviews (have review structure) WITHIN the pane
|
||||
const allDivs = pane.querySelectorAll('div');
|
||||
const reviewElements = [];
|
||||
|
||||
for (let div of allDivs) {
|
||||
@@ -302,7 +316,7 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i);
|
||||
|
||||
// Must have at least author, rating, and text to be a review
|
||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||
@@ -317,22 +331,28 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
||||
}
|
||||
}
|
||||
|
||||
// STRATEGY 3: Try role="article" as last resort
|
||||
// STRATEGY 3: Try role="article" as last resort (within pane)
|
||||
if (!elements || elements.length === 0) {
|
||||
const articles = document.querySelectorAll('[role="article"]');
|
||||
const validArticles = [];
|
||||
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||
document.querySelector('div.m6QErb') ||
|
||||
document.querySelector('div[role="main"]');
|
||||
|
||||
for (let article of articles) {
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles.push(article);
|
||||
if (pane) {
|
||||
const articles = pane.querySelectorAll('[role="article"]');
|
||||
const validArticles = [];
|
||||
|
||||
for (let article of articles) {
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles.push(article);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (validArticles.length > 0) {
|
||||
elements = validArticles;
|
||||
console.log('Found', validArticles.length, 'reviews using role=article');
|
||||
if (validArticles.length > 0) {
|
||||
elements = validArticles;
|
||||
console.log('Found', validArticles.length, 'reviews using role=article');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -536,13 +556,19 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
|
||||
# Force English locale for consistent date parsing
|
||||
# English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
|
||||
# Store original URL in case we need to retry without locale override
|
||||
original_url = url
|
||||
locale_override_applied = False
|
||||
|
||||
if 'hl=' in url:
|
||||
# Replace existing locale
|
||||
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
||||
locale_override_applied = True
|
||||
else:
|
||||
# Add English locale parameter
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
locale_override_applied = True
|
||||
|
||||
log.info(f"Using English locale (hl=en) for consistent date parsing")
|
||||
|
||||
@@ -628,7 +654,89 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
continue
|
||||
|
||||
if not reviews_tab_clicked:
|
||||
log.warning("Could not find reviews tab, continuing anyway")
|
||||
log.warning("Could not find reviews tab with hl=en locale")
|
||||
|
||||
# FALLBACK: If locale override was applied and tab not found,
|
||||
# retry without locale override (fixes regional pages where hl=en breaks tabs)
|
||||
if locale_override_applied:
|
||||
log.info("Retrying without locale override to find reviews tab...")
|
||||
|
||||
# Reload page with original URL (no hl=en)
|
||||
driver.get(original_url)
|
||||
time.sleep(1.5)
|
||||
|
||||
# Handle GDPR again if needed
|
||||
if 'consent.google.com' in driver.current_url:
|
||||
try:
|
||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
||||
for btn in form_btns:
|
||||
btn_text = (btn.text or '').lower()
|
||||
if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']):
|
||||
log.info(f"Clicking GDPR consent: {btn.text}")
|
||||
btn.click()
|
||||
time.sleep(2)
|
||||
break
|
||||
else:
|
||||
if len(form_btns) >= 2:
|
||||
log.info("Using fallback: clicking second form button")
|
||||
form_btns[1].click()
|
||||
time.sleep(2)
|
||||
except Exception as e:
|
||||
log.warning(f"GDPR consent handling failed: {e}")
|
||||
|
||||
# Dismiss cookie banner
|
||||
try:
|
||||
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
|
||||
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
|
||||
if cookie_btns:
|
||||
cookie_btns[0].click()
|
||||
time.sleep(0.3)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Try to find reviews tab with multilingual keywords
|
||||
multilingual_keywords = [
|
||||
'review', 'reviews', # English
|
||||
'reseña', 'reseñas', # Spanish
|
||||
'avis', # French
|
||||
'bewertung', 'bewertungen', # German
|
||||
'recensione', 'recensioni', # Italian
|
||||
'レビュー', # Japanese
|
||||
'avaliação', 'avaliações', # Portuguese
|
||||
'отзыв', 'отзывы', # Russian
|
||||
'atsiliepimai', 'atsiliepi', # Lithuanian
|
||||
'ulasan', # Indonesian
|
||||
'리뷰' # Korean
|
||||
]
|
||||
|
||||
for attempt in range(3):
|
||||
if reviews_tab_clicked:
|
||||
break
|
||||
|
||||
time.sleep(0.5)
|
||||
|
||||
for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||
for tab in tabs:
|
||||
text = (tab.text or '').lower()
|
||||
aria = (tab.get_attribute('aria-label') or '').lower()
|
||||
|
||||
if any(kw in text or kw in aria for kw in multilingual_keywords):
|
||||
log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}")
|
||||
driver.execute_script("arguments[0].click();", tab)
|
||||
time.sleep(1.5)
|
||||
reviews_tab_clicked = True
|
||||
break
|
||||
|
||||
if reviews_tab_clicked:
|
||||
break
|
||||
except Exception as e:
|
||||
log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}")
|
||||
continue
|
||||
|
||||
if not reviews_tab_clicked:
|
||||
log.warning("Could not find reviews tab even without locale override")
|
||||
|
||||
# Wait for reviews section to load
|
||||
time.sleep(2)
|
||||
@@ -733,8 +841,17 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
|
||||
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
|
||||
// Find containers that LOOK like reviews (have author + rating + text structure)
|
||||
// IMPORTANT: Search only within the reviews pane, not the entire page!
|
||||
const findReviewsByStructure = () => {
|
||||
const allDivs = document.querySelectorAll('div');
|
||||
// Find the reviews pane first
|
||||
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||
document.querySelector('div.m6QErb') ||
|
||||
document.querySelector('div[role="main"]');
|
||||
|
||||
if (!pane) return 0;
|
||||
|
||||
// Search only within the pane
|
||||
const allDivs = pane.querySelectorAll('div');
|
||||
let reviewCount = 0;
|
||||
|
||||
for (let div of allDivs) {
|
||||
@@ -749,7 +866,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i);
|
||||
|
||||
// If it has at least 3 of these indicators, it's likely a review
|
||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||
@@ -761,19 +878,24 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
return reviewCount > 0 ? reviewCount : 0;
|
||||
};
|
||||
|
||||
// STRATEGY 3: Look for role="article" with review-like content
|
||||
const articles = document.querySelectorAll('[role="article"]');
|
||||
if (articles.length > 0) {
|
||||
let validArticles = 0;
|
||||
for (let article of articles) {
|
||||
// Check if article looks like a review (has rating + text)
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles++;
|
||||
// STRATEGY 3: Look for role="article" with review-like content (within pane)
|
||||
const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||
document.querySelector('div.m6QErb') ||
|
||||
document.querySelector('div[role="main"]');
|
||||
if (pane3) {
|
||||
const articles = pane3.querySelectorAll('[role="article"]');
|
||||
if (articles.length > 0) {
|
||||
let validArticles = 0;
|
||||
for (let article of articles) {
|
||||
// Check if article looks like a review (has rating + text)
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles++;
|
||||
}
|
||||
}
|
||||
if (validArticles > 0) return validArticles;
|
||||
}
|
||||
if (validArticles > 0) return validArticles;
|
||||
}
|
||||
|
||||
// Try structural matching as last resort
|
||||
|
||||
Reference in New Issue
Block a user