Add fallback locale retry and pane-scoped selectors for robust review detection
- Added fallback logic: if the reviews tab is not found with hl=en, retry without the locale override
- Added multilingual keywords for the reviews tab (Lithuanian, Russian, etc.)
- Fixed structural pattern matching to search only within the reviews pane, not the entire page
- Added Lithuanian date keywords (dienų, savaitės) to date pattern matching
- All three selector strategies are now scoped to the reviews pane for accuracy

Issue: Lithuanian hospital is still extracting 0/271 reviews
Root cause: Review elements are not found even within the pane after the tab click
Next steps: Manually inspect the actual page structure on the Lithuanian locale
This commit is contained in:
@@ -49,26 +49,29 @@ def check_no_reviews_early(driver) -> tuple[bool, str]:
|
|||||||
return True, f"Found 'no reviews' message: '{pattern}'"
|
return True, f"Found 'no reviews' message: '{pattern}'"
|
||||||
|
|
||||||
# Check if review count is explicitly 0
|
# Check if review count is explicitly 0
|
||||||
|
# IMPORTANT: Be very specific to avoid false positives!
|
||||||
review_count_check = driver.execute_script("""
|
review_count_check = driver.execute_script("""
|
||||||
// Look for review count indicators
|
// Only check for EXACT "0 reviews" patterns, not loose matches
|
||||||
const patterns = [
|
const patterns = [
|
||||||
/0\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i,
|
/^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line
|
||||||
/\\(0\\)/,
|
/\\(0\\s+reviews?\\)/i, // "(0 reviews)"
|
||||||
/review.*0/i,
|
/\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase
|
||||||
/0.*review/i
|
|
||||||
];
|
];
|
||||||
|
|
||||||
const text = document.body.innerText;
|
const text = document.body.innerText;
|
||||||
for (let pattern of patterns) {
|
|
||||||
if (pattern.test(text)) {
|
|
||||||
return 'Found 0 reviews indicator';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for aria-labels indicating no reviews
|
// Split into lines and check each line independently to avoid false positives
|
||||||
const elements = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]');
|
const lines = text.split('\\n');
|
||||||
if (elements.length > 0) {
|
for (let line of lines) {
|
||||||
return 'Found aria-label with 0 reviews';
|
const trimmed = line.trim();
|
||||||
|
for (let pattern of patterns) {
|
||||||
|
if (pattern.test(trimmed)) {
|
||||||
|
// Double-check: line should be short (not a review text itself)
|
||||||
|
if (trimmed.length < 50) {
|
||||||
|
return 'Found explicit "0 reviews" text: ' + trimmed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
@@ -287,11 +290,22 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// STRATEGY 2: Structural matching for unknown page layouts
|
// STRATEGY 2: Structural matching for unknown page layouts
|
||||||
|
// IMPORTANT: Search only within the reviews pane, not the entire page!
|
||||||
if (!elements || elements.length === 0) {
|
if (!elements || elements.length === 0) {
|
||||||
console.log('Known selectors failed, trying structural matching...');
|
console.log('Known selectors failed, trying structural matching...');
|
||||||
|
|
||||||
// Find all divs that LOOK like reviews (have review structure)
|
// Find the reviews pane first
|
||||||
const allDivs = document.querySelectorAll('div');
|
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||||
|
document.querySelector('div.m6QErb') ||
|
||||||
|
document.querySelector('div[role="main"]');
|
||||||
|
|
||||||
|
if (!pane) {
|
||||||
|
console.warn('No reviews pane found');
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find all divs that LOOK like reviews (have review structure) WITHIN the pane
|
||||||
|
const allDivs = pane.querySelectorAll('div');
|
||||||
const reviewElements = [];
|
const reviewElements = [];
|
||||||
|
|
||||||
for (let div of allDivs) {
|
for (let div of allDivs) {
|
||||||
@@ -302,7 +316,7 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
|||||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i);
|
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i);
|
||||||
|
|
||||||
// Must have at least author, rating, and text to be a review
|
// Must have at least author, rating, and text to be a review
|
||||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||||
@@ -317,22 +331,28 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// STRATEGY 3: Try role="article" as last resort
|
// STRATEGY 3: Try role="article" as last resort (within pane)
|
||||||
if (!elements || elements.length === 0) {
|
if (!elements || elements.length === 0) {
|
||||||
const articles = document.querySelectorAll('[role="article"]');
|
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||||
const validArticles = [];
|
document.querySelector('div.m6QErb') ||
|
||||||
|
document.querySelector('div[role="main"]');
|
||||||
|
|
||||||
for (let article of articles) {
|
if (pane) {
|
||||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
const articles = pane.querySelectorAll('[role="article"]');
|
||||||
const hasText = article.textContent.length > 30;
|
const validArticles = [];
|
||||||
if (hasRating && hasText) {
|
|
||||||
validArticles.push(article);
|
for (let article of articles) {
|
||||||
|
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||||
|
const hasText = article.textContent.length > 30;
|
||||||
|
if (hasRating && hasText) {
|
||||||
|
validArticles.push(article);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (validArticles.length > 0) {
|
if (validArticles.length > 0) {
|
||||||
elements = validArticles;
|
elements = validArticles;
|
||||||
console.log('Found', validArticles.length, 'reviews using role=article');
|
console.log('Found', validArticles.length, 'reviews using role=article');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -536,13 +556,19 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
|
|
||||||
# Force English locale for consistent date parsing
|
# Force English locale for consistent date parsing
|
||||||
# English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
|
# English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
|
||||||
|
# Store original URL in case we need to retry without locale override
|
||||||
|
original_url = url
|
||||||
|
locale_override_applied = False
|
||||||
|
|
||||||
if 'hl=' in url:
|
if 'hl=' in url:
|
||||||
# Replace existing locale
|
# Replace existing locale
|
||||||
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
||||||
|
locale_override_applied = True
|
||||||
else:
|
else:
|
||||||
# Add English locale parameter
|
# Add English locale parameter
|
||||||
separator = '&' if '?' in url else '?'
|
separator = '&' if '?' in url else '?'
|
||||||
url = f"{url}{separator}hl=en"
|
url = f"{url}{separator}hl=en"
|
||||||
|
locale_override_applied = True
|
||||||
|
|
||||||
log.info(f"Using English locale (hl=en) for consistent date parsing")
|
log.info(f"Using English locale (hl=en) for consistent date parsing")
|
||||||
|
|
||||||
@@ -628,7 +654,89 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if not reviews_tab_clicked:
|
if not reviews_tab_clicked:
|
||||||
log.warning("Could not find reviews tab, continuing anyway")
|
log.warning("Could not find reviews tab with hl=en locale")
|
||||||
|
|
||||||
|
# FALLBACK: If locale override was applied and tab not found,
|
||||||
|
# retry without locale override (fixes regional pages where hl=en breaks tabs)
|
||||||
|
if locale_override_applied:
|
||||||
|
log.info("Retrying without locale override to find reviews tab...")
|
||||||
|
|
||||||
|
# Reload page with original URL (no hl=en)
|
||||||
|
driver.get(original_url)
|
||||||
|
time.sleep(1.5)
|
||||||
|
|
||||||
|
# Handle GDPR again if needed
|
||||||
|
if 'consent.google.com' in driver.current_url:
|
||||||
|
try:
|
||||||
|
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
||||||
|
for btn in form_btns:
|
||||||
|
btn_text = (btn.text or '').lower()
|
||||||
|
if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']):
|
||||||
|
log.info(f"Clicking GDPR consent: {btn.text}")
|
||||||
|
btn.click()
|
||||||
|
time.sleep(2)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
if len(form_btns) >= 2:
|
||||||
|
log.info("Using fallback: clicking second form button")
|
||||||
|
form_btns[1].click()
|
||||||
|
time.sleep(2)
|
||||||
|
except Exception as e:
|
||||||
|
log.warning(f"GDPR consent handling failed: {e}")
|
||||||
|
|
||||||
|
# Dismiss cookie banner
|
||||||
|
try:
|
||||||
|
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
|
||||||
|
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
|
||||||
|
if cookie_btns:
|
||||||
|
cookie_btns[0].click()
|
||||||
|
time.sleep(0.3)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Try to find reviews tab with multilingual keywords
|
||||||
|
multilingual_keywords = [
|
||||||
|
'review', 'reviews', # English
|
||||||
|
'reseña', 'reseñas', # Spanish
|
||||||
|
'avis', # French
|
||||||
|
'bewertung', 'bewertungen', # German
|
||||||
|
'recensione', 'recensioni', # Italian
|
||||||
|
'レビュー', # Japanese
|
||||||
|
'avaliação', 'avaliações', # Portuguese
|
||||||
|
'отзыв', 'отзывы', # Russian
|
||||||
|
'atsiliepimai', 'atsiliepi', # Lithuanian
|
||||||
|
'ulasan', # Indonesian
|
||||||
|
'리뷰' # Korean
|
||||||
|
]
|
||||||
|
|
||||||
|
for attempt in range(3):
|
||||||
|
if reviews_tab_clicked:
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
|
||||||
|
try:
|
||||||
|
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||||
|
for tab in tabs:
|
||||||
|
text = (tab.text or '').lower()
|
||||||
|
aria = (tab.get_attribute('aria-label') or '').lower()
|
||||||
|
|
||||||
|
if any(kw in text or kw in aria for kw in multilingual_keywords):
|
||||||
|
log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}")
|
||||||
|
driver.execute_script("arguments[0].click();", tab)
|
||||||
|
time.sleep(1.5)
|
||||||
|
reviews_tab_clicked = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if reviews_tab_clicked:
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not reviews_tab_clicked:
|
||||||
|
log.warning("Could not find reviews tab even without locale override")
|
||||||
|
|
||||||
# Wait for reviews section to load
|
# Wait for reviews section to load
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
@@ -733,8 +841,17 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
|
|
||||||
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
|
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
|
||||||
// Find containers that LOOK like reviews (have author + rating + text structure)
|
// Find containers that LOOK like reviews (have author + rating + text structure)
|
||||||
|
// IMPORTANT: Search only within the reviews pane, not the entire page!
|
||||||
const findReviewsByStructure = () => {
|
const findReviewsByStructure = () => {
|
||||||
const allDivs = document.querySelectorAll('div');
|
// Find the reviews pane first
|
||||||
|
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||||
|
document.querySelector('div.m6QErb') ||
|
||||||
|
document.querySelector('div[role="main"]');
|
||||||
|
|
||||||
|
if (!pane) return 0;
|
||||||
|
|
||||||
|
// Search only within the pane
|
||||||
|
const allDivs = pane.querySelectorAll('div');
|
||||||
let reviewCount = 0;
|
let reviewCount = 0;
|
||||||
|
|
||||||
for (let div of allDivs) {
|
for (let div of allDivs) {
|
||||||
@@ -749,7 +866,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i);
|
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i);
|
||||||
|
|
||||||
// If it has at least 3 of these indicators, it's likely a review
|
// If it has at least 3 of these indicators, it's likely a review
|
||||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||||
@@ -761,19 +878,24 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
|||||||
return reviewCount > 0 ? reviewCount : 0;
|
return reviewCount > 0 ? reviewCount : 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
// STRATEGY 3: Look for role="article" with review-like content
|
// STRATEGY 3: Look for role="article" with review-like content (within pane)
|
||||||
const articles = document.querySelectorAll('[role="article"]');
|
const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
||||||
if (articles.length > 0) {
|
document.querySelector('div.m6QErb') ||
|
||||||
let validArticles = 0;
|
document.querySelector('div[role="main"]');
|
||||||
for (let article of articles) {
|
if (pane3) {
|
||||||
// Check if article looks like a review (has rating + text)
|
const articles = pane3.querySelectorAll('[role="article"]');
|
||||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
if (articles.length > 0) {
|
||||||
const hasText = article.textContent.length > 30;
|
let validArticles = 0;
|
||||||
if (hasRating && hasText) {
|
for (let article of articles) {
|
||||||
validArticles++;
|
// Check if article looks like a review (has rating + text)
|
||||||
|
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||||
|
const hasText = article.textContent.length > 30;
|
||||||
|
if (hasRating && hasText) {
|
||||||
|
validArticles++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if (validArticles > 0) return validArticles;
|
||||||
}
|
}
|
||||||
if (validArticles > 0) return validArticles;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try structural matching as last resort
|
// Try structural matching as last resort
|
||||||
|
|||||||
Reference in New Issue
Block a user