- Added fallback logic: if the reviews tab is not found with hl=en, retry without the locale override
- Added multilingual keywords for the reviews tab (Lithuanian, Russian, etc.)
- Fixed structural pattern matching to search only within the reviews pane, not the entire page
- Added Lithuanian date keywords (dienų, savaitės) to date pattern matching
- All three selector strategies are now scoped to the reviews pane for accuracy

Issue: Lithuanian hospital still extracting 0/271 reviews
Root cause: Review elements are not found even within the pane after the tab click
Next steps: Manual inspection of the actual page structure on the Lithuanian locale is needed
1615 lines
63 KiB
Python
1615 lines
63 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fast DOM-only scraper module for API integration.
|
|
Based on start_dom_only_fast.py - achieves ~18.9s for all reviews.
|
|
|
|
This module provides a reusable function for the API server.
|
|
"""
|
|
import logging
|
|
import time
|
|
from typing import List, Dict, Any, Optional
|
|
from seleniumbase import Driver
|
|
from selenium.webdriver.common.by import By
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
from selenium.common.exceptions import TimeoutException
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def check_no_reviews_early(driver) -> tuple[bool, str]:
    """
    Decide, before any scrolling starts, whether the page has no reviews.

    Three probes run against the live page through ``driver.execute_script``,
    in this order; the first probe that fires ends the check:

    1. a known localized "no reviews" phrase occurs in the lower-cased page text,
    2. a short line of page text (under 50 characters) matches a "0 reviews" pattern,
    3. a tab button mentioning "review" is disabled.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        ``(True, reason)`` when a probe fires, otherwise ``(False, "")``.
        Any exception raised along the way is logged as a warning and
        reported as ``(False, "")``.
    """
    # Localized empty-state phrases; all lower-case because the page text is
    # lower-cased in the browser before it is returned.
    empty_state_phrases = (
        'no reviews yet',
        'be the first to review',
        "there aren't any reviews",
        'no hay reseñas',
        'sin reseñas',
        "pas encore d'avis",
        'noch keine bewertungen',
        'nessuna recensione',
        'まだレビューがありません',
        'sem avaliações',
        'belum ada ulasan',
    )

    # Probe 2: explicit zero review count.
    # IMPORTANT: Be very specific to avoid false positives!
    zero_count_probe = """
        // Only check for EXACT "0 reviews" patterns, not loose matches
        const patterns = [
            /^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line
            /\\(0\\s+reviews?\\)/i, // "(0 reviews)"
            /\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase
        ];

        const text = document.body.innerText;

        // Split into lines and check each line independently to avoid false positives
        const lines = text.split('\\n');
        for (let line of lines) {
            const trimmed = line.trim();
            for (let pattern of patterns) {
                if (pattern.test(trimmed)) {
                    // Double-check: line should be short (not a review text itself)
                    if (trimmed.length < 50) {
                        return 'Found explicit "0 reviews" text: ' + trimmed;
                    }
                }
            }
        }

        return null;
    """

    # Probe 3: reviews tab present but disabled / not clickable.
    disabled_tab_probe = """
        const tabs = document.querySelectorAll('button[role="tab"]');
        for (let tab of tabs) {
            const text = (tab.textContent || '').toLowerCase();
            const aria = (tab.getAttribute('aria-label') || '').toLowerCase();

            if (text.includes('review') || aria.includes('review')) {
                if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') {
                    return 'Reviews tab is disabled';
                }
            }
        }
        return null;
    """

    try:
        # Probe 1: look for an empty-state phrase anywhere in the page text.
        lowered_body = driver.execute_script("return document.body.innerText.toLowerCase();")
        matched_phrase = next(
            (phrase for phrase in empty_state_phrases if phrase in lowered_body),
            None,
        )
        if matched_phrase is not None:
            return True, f"Found 'no reviews' message: '{matched_phrase}'"

        # Probes 2 and 3 each return a reason string, or null when they
        # find nothing; they run lazily so a hit skips the remaining probe.
        for probe in (zero_count_probe, disabled_tab_probe):
            reason = driver.execute_script(probe)
            if reason:
                return True, reason

        return False, ""

    except Exception as exc:
        log.warning(f"Error in early no-reviews detection: {exc}")
        return False, ""
|
|
|
|
|
|
def extract_total_review_count(driver) -> Optional[int]:
    """
    Extract the total number of reviews advertised on the Google Maps page.

    One JavaScript snippet tries four sources in priority order and returns
    the first number it finds:

    1. search-result elements (review links, article spans/links, ...);
       only the first 20 matches per selector are inspected,
    2. tab buttons (business detail page),
    3. ``aria-label`` attributes anywhere in the document,
    4. the first parenthesized number in the whole page text.

    A second, diagnostics-only snippet is then run and logged. It is
    best-effort: if it fails, the count that was already extracted is still
    returned, because the diagnostics run in their own ``try`` block.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        The positive review count, or ``None`` when no count was found,
        the count was not positive, or the extraction script itself failed.
    """
    extract_script = """
        // Optimized review count extraction - removed verbose logging for speed
        let total = null;

        const parenthesesPattern = /\\((\\d[\\d,\\.\\s]*)\\)/;
        const numberPattern = /(\\d[\\d,\\.\\s]*)\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i;

        // PRIORITY 1: Search results page
        const searchResultsSelectors = [
            'a[href*="reviews"]',
            '[role="article"] span',
            '[role="article"] a',
            'div.fontBodyMedium',
            'span.UY7F9',
        ];

        for (const selector of searchResultsSelectors) {
            const elements = document.querySelectorAll(selector);
            for (let i = 0; i < Math.min(elements.length, 20); i++) {
                const elem = elements[i];
                const text = elem.textContent || '';
                const href = elem.getAttribute('href') || '';

                let match = text.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    if (num > 0 && num < 1000000) {
                        total = num;
                        break;
                    }
                }

                if (href.includes('reviews')) {
                    match = text.match(/(\\d[\\d,\\.\\s]*)/);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            total = num;
                            break;
                        }
                    }
                }
            }
            if (total) break;
        }

        // PRIORITY 2: Tab buttons (business detail page)
        if (!total) {
            const buttons = document.querySelectorAll('button[role="tab"]');
            for (let i = 0; i < buttons.length; i++) {
                const text = buttons[i].textContent || '';
                let match = text.match(parenthesesPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    total = num;
                    break;
                }
                match = text.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    total = num;
                    break;
                }
            }
        }

        // PRIORITY 3: Aria-labels
        if (!total) {
            const elements = document.querySelectorAll('[aria-label]');
            for (let elem of elements) {
                const ariaLabel = elem.getAttribute('aria-label') || '';
                let match = ariaLabel.match(parenthesesPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    total = num;
                    break;
                }
                match = ariaLabel.match(numberPattern);
                if (match) {
                    const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                    total = num;
                    break;
                }
            }
        }

        // PRIORITY 4: Fallback - entire page text
        if (!total) {
            const match = document.body.innerText.match(parenthesesPattern);
            if (match) {
                const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                if (num > 0 && num < 1000000) {
                    total = num;
                }
            }
        }

        return total;
    """

    # Diagnostics only: nothing returned by this script influences the result.
    debug_script = """
        const info = {
            search_results_count: document.querySelectorAll('[role="article"]').length,
            links_with_reviews: document.querySelectorAll('a[href*="reviews"]').length,
            page_url: window.location.href,
            page_title: document.title,
            sample_texts: []
        };

        // Get sample text from links that might contain reviews
        const reviewLinks = document.querySelectorAll('a[href*="reviews"]');
        for (let i = 0; i < Math.min(5, reviewLinks.length); i++) {
            info.sample_texts.push(reviewLinks[i].textContent.substring(0, 100));
        }

        // Also check for text containing "review" keyword
        const allText = document.body.innerText.substring(0, 2000);
        const reviewMatches = allText.match(/\\d+[\\s,\\.]*(?:review|reseña|avis)/gi);
        if (reviewMatches) {
            info.review_patterns_found = reviewMatches.slice(0, 5);
        }

        return info;
    """

    try:
        total = driver.execute_script(extract_script)

        # Collect diagnostics in an isolated try block so that a failing
        # debug probe can never discard a count that was already extracted.
        debug_info: Dict[str, Any] = {}
        try:
            debug_info = driver.execute_script(debug_script) or {}
            log.info(f"Page debug: URL={debug_info.get('page_url')}")
            log.info(f"Page debug: Found {debug_info.get('search_results_count')} search result articles")
            log.info(f"Page debug: Found {debug_info.get('links_with_reviews')} links containing 'reviews'")
            if debug_info.get('review_patterns_found'):
                log.info(f"Page debug: Review patterns in text: {debug_info.get('review_patterns_found')}")
            if debug_info.get('sample_texts'):
                log.info(f"Page debug: Sample link texts: {debug_info.get('sample_texts')}")
        except Exception as debug_error:
            log.debug(f"Could not collect page debug info: {debug_error}")

        if total and total > 0:
            log.info(f"Extracted total review count: {total}")
            return total

        log.warning(f"Could not extract total review count from page. Debug: {debug_info}")
        return None

    except Exception as e:
        log.error(f"Error extracting total review count: {e}")
        return None
|
|
|
|
|
|
def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
    """
    Extract ALL currently loaded reviews with a single JavaScript call.

    The in-page script locates review containers with three strategies
    (known CSS selectors, structural matching inside the reviews pane,
    ``role="article"`` inside the pane) and reads author, rating, text,
    date text, avatar URL and a ``profile_url`` field from each container,
    plus a ``debug_date_info`` dict of raw attributes. Only entries that
    have both an author and a date text are returned by the script.

    Each review then receives a ``review_id`` of the form
    ``review_<16 hex chars>``: a SHA-1 digest of ``"<author>|<date_text>"``.
    The built-in ``hash()`` is deliberately not used because string hashes
    are salted per process, which made the IDs differ between runs and
    workers. Two reviews sharing author and date text still share an ID.

    A second, debug-only script inspects global page state; its result is
    logged and attached to the first review as ``_google_state_debug``.
    That probe is best-effort: if it fails the value is ``None`` and the
    extracted reviews are still returned.

    NOTE(review): ``profile_url`` is populated from the ``data-review-id``
    attribute of ``button.WEBjve`` - looks like a review id rather than a
    URL; confirm against the live markup before relying on it.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        List of review dicts (possibly empty). Returns ``[]`` if the
        extraction itself raises.
    """
    # Local import keeps this block self-contained; hashlib is stdlib.
    import hashlib

    extract_script = """
        const reviews = [];

        // ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching
        let elements = null;

        // STRATEGY 1: Try known CSS selectors (fast path)
        const knownSelectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div[data-review-id]',
            'div[jsaction*="review"]'
        ];

        for (let selector of knownSelectors) {
            const found = document.querySelectorAll(selector);
            if (found.length > 0) {
                elements = found;
                console.log('Found', found.length, 'reviews using known selector:', selector);
                break;
            }
        }

        // STRATEGY 2: Structural matching for unknown page layouts
        // IMPORTANT: Search only within the reviews pane, not the entire page!
        if (!elements || elements.length === 0) {
            console.log('Known selectors failed, trying structural matching...');

            // Find the reviews pane first
            const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
                         document.querySelector('div.m6QErb') ||
                         document.querySelector('div[role="main"]');

            if (!pane) {
                console.warn('No reviews pane found');
                return [];
            }

            // Find all divs that LOOK like reviews (have review structure) WITHIN the pane
            const allDivs = pane.querySelectorAll('div');
            const reviewElements = [];

            for (let div of allDivs) {
                // Skip if too small
                if (div.children.length < 2) continue;

                // Check for review indicators
                const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
                const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
                const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
                const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i);

                // Must have at least author, rating, and text to be a review
                const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
                if (indicators >= 3) {
                    reviewElements.push(div);
                }
            }

            if (reviewElements.length > 0) {
                elements = reviewElements;
                console.log('Found', reviewElements.length, 'reviews using structural matching');
            }
        }

        // STRATEGY 3: Try role="article" as last resort (within pane)
        if (!elements || elements.length === 0) {
            const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
                         document.querySelector('div.m6QErb') ||
                         document.querySelector('div[role="main"]');

            if (pane) {
                const articles = pane.querySelectorAll('[role="article"]');
                const validArticles = [];

                for (let article of articles) {
                    const hasRating = article.querySelector('[aria-label*="star" i]');
                    const hasText = article.textContent.length > 30;
                    if (hasRating && hasText) {
                        validArticles.push(article);
                    }
                }

                if (validArticles.length > 0) {
                    elements = validArticles;
                    console.log('Found', validArticles.length, 'reviews using role=article');
                }
            }
        }

        if (!elements || elements.length === 0) {
            console.warn('No review elements found with any strategy');
            return [];
        }

        for (let i = 0; i < elements.length; i++) {
            const elem = elements[i];
            const review = {};

            try {
                // Author
                const authorElem = elem.querySelector('div.d4r55');
                review.author = authorElem ? authorElem.textContent.trim() : null;

                // Rating
                const ratingElem = elem.querySelector('span.kvMYJc');
                if (ratingElem) {
                    const ariaLabel = ratingElem.getAttribute('aria-label');
                    if (ariaLabel) {
                        const match = ariaLabel.match(/\\d+/);
                        review.rating = match ? parseFloat(match[0]) : null;
                    }
                }

                // Text
                const textElem = elem.querySelector('span.wiI7pd');
                review.text = textElem ? textElem.textContent.trim() : null;

                // Date
                const dateElem = elem.querySelector('span.rsqaWe');
                review.date_text = dateElem ? dateElem.textContent.trim() : null;

                // DEEP DIVE: Find where Google stores the actual timestamp
                review.timestamp = null;
                review.debug_date_info = {};

                if (dateElem) {
                    // 1. Check all attributes on date element
                    const allAttrs = {};
                    for (let attr of dateElem.attributes) {
                        allAttrs[attr.name] = attr.value;
                    }
                    review.debug_date_info.date_elem_attrs = allAttrs;

                    // 2. Check parent elements for data
                    let parent = dateElem.parentElement;
                    let parentLevel = 0;
                    while (parent && parentLevel < 3) {
                        const parentAttrs = {};
                        for (let attr of parent.attributes) {
                            if (attr.name.includes('data') || attr.name.includes('time') || attr.name.includes('date')) {
                                parentAttrs[attr.name] = attr.value;
                            }
                        }
                        if (Object.keys(parentAttrs).length > 0) {
                            review.debug_date_info[`parent_${parentLevel}_attrs`] = parentAttrs;
                        }
                        parent = parent.parentElement;
                        parentLevel++;
                    }

                    // 3. Check the entire review container for hidden data
                    const reviewContainer = elem;
                    const containerAttrs = {};
                    for (let attr of reviewContainer.attributes) {
                        containerAttrs[attr.name] = attr.value;
                    }
                    review.debug_date_info.container_attrs = containerAttrs;

                    // 4. Look for script tags or JSON data near the date
                    const nearbyScripts = elem.querySelectorAll('script');
                    if (nearbyScripts.length > 0) {
                        review.debug_date_info.has_nearby_scripts = nearbyScripts.length;
                    }

                    // 5. Check for any element with 'time' in class or data
                    const timeElements = elem.querySelectorAll('[class*="time"], [data-timestamp], [datetime]');
                    if (timeElements.length > 0) {
                        const timeData = [];
                        timeElements.forEach(el => {
                            timeData.push({
                                tag: el.tagName,
                                classes: el.className,
                                datetime: el.getAttribute('datetime'),
                                timestamp: el.getAttribute('data-timestamp'),
                                text: el.textContent.substring(0, 50)
                            });
                        });
                        review.debug_date_info.time_elements = timeData;
                    }
                }

                // Avatar
                const avatarElem = elem.querySelector('img.NBa7we');
                review.avatar_url = avatarElem ? avatarElem.src : null;

                // Profile URL
                const profileElem = elem.querySelector('button.WEBjve');
                review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

                if (review.author && review.date_text) {
                    reviews.push(review);
                }
            } catch (e) {
                // Skip this review
            }
        }

        return reviews;
    """

    # ADDITIONAL: Check for Google's internal state/data objects (debug only)
    check_state_script = """
        // Look for Google Maps' internal data stores
        const debugInfo = {
            global_keys: [],
            app_data: null,
            window_data: null
        };

        // Check window object for Google Maps data
        for (let key in window) {
            if (key.includes('google') || key.includes('maps') || key.includes('APP') || key.includes('_')) {
                debugInfo.global_keys.push(key);
            }
        }

        // Check for common React/Angular state keys
        const stateKeys = ['__INITIAL_STATE__', '__NEXT_DATA__', '__APP_STATE__', 'APP_INITIALIZATION_STATE'];
        for (let key of stateKeys) {
            if (window[key]) {
                debugInfo.app_data = key;
            }
        }

        // Check for embedded JSON in script tags
        const scriptTags = document.querySelectorAll('script[type="application/json"], script[type="application/ld+json"]');
        debugInfo.json_scripts_count = scriptTags.length;
        if (scriptTags.length > 0) {
            debugInfo.json_scripts_sample = Array.from(scriptTags).slice(0, 2).map(s => s.textContent.substring(0, 200));
        }

        return debugInfo;
    """

    try:
        # A null script result is treated as "no reviews" instead of an error.
        reviews_data = driver.execute_script(extract_script) or []

        # The state probe is purely diagnostic, so it is isolated: a failure
        # here must not throw away the reviews that were just extracted.
        state_debug = None
        try:
            state_debug = driver.execute_script(check_state_script)
            log.info(f"Google Maps state debug: {state_debug}")
        except Exception as debug_error:
            log.warning(f"Could not collect Google Maps state debug info: {debug_error}")

        reviews = []
        for i, review_data in enumerate(reviews_data):
            # Stable across processes, unlike hash() on str (salted per run).
            identity = f"{review_data.get('author') or ''}|{review_data.get('date_text') or ''}"
            digest = hashlib.sha1(identity.encode("utf-8")).hexdigest()[:16]
            review_data['review_id'] = f"review_{digest}"

            # Add global state debug to first review only
            if i == 0:
                review_data['_google_state_debug'] = state_debug

            reviews.append(review_data)

        return reviews

    except Exception as e:
        log.error(f"Error in JavaScript extraction: {e}")
        return []
|
|
|
|
|
|
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False) -> Dict[str, Any]:
|
|
"""
|
|
Ultra-fast DOM-only scraping with JavaScript extraction.
|
|
|
|
Args:
|
|
url: Google Maps URL to scrape
|
|
headless: Run Chrome in headless mode (default: True)
|
|
max_scrolls: Maximum scrolls safety limit (default: 999999 - effectively unlimited)
|
|
The scraper stops automatically via idle detection when no new reviews load.
|
|
progress_callback: Optional callback function(current_count, total_count) for progress updates
|
|
driver: Existing driver instance to reuse (from worker pool)
|
|
return_driver: If True, don't close driver and return it in result
|
|
|
|
Returns:
|
|
Dictionary with:
|
|
- reviews: List of review dictionaries
|
|
- count: Total number of reviews scraped
|
|
- total_reviews: Total reviews available (from page counter)
|
|
- time: Time taken in seconds
|
|
- success: True if successful, False otherwise
|
|
- error: Error message if failed
|
|
- driver: Driver instance (if return_driver=True)
|
|
"""
|
|
start_time = time.time()
|
|
|
|
log.info(f"Starting fast scrape for URL: {url[:80]}...")
|
|
|
|
# Force English locale for consistent date parsing
|
|
# English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
|
|
# Store original URL in case we need to retry without locale override
|
|
original_url = url
|
|
locale_override_applied = False
|
|
|
|
if 'hl=' in url:
|
|
# Replace existing locale
|
|
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
|
locale_override_applied = True
|
|
else:
|
|
# Add English locale parameter
|
|
separator = '&' if '?' in url else '?'
|
|
url = f"{url}{separator}hl=en"
|
|
locale_override_applied = True
|
|
|
|
log.info(f"Using English locale (hl=en) for consistent date parsing")
|
|
|
|
# Track if driver was provided or created
|
|
driver_provided = driver is not None
|
|
should_close_driver = not return_driver and not driver_provided
|
|
|
|
# Initialize driver with custom user agent to avoid headless detection
|
|
# Even with headless=False + Xvfb, Chromium still reports as HeadlessChrome
|
|
if not driver:
|
|
driver = Driver(
|
|
uc=True,
|
|
headless=headless,
|
|
page_load_strategy="normal",
|
|
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
)
|
|
|
|
try:
|
|
# Navigate
|
|
driver.get(url)
|
|
time.sleep(1.5)
|
|
|
|
# Handle GDPR consent page (CRITICAL FIX for headless mode!)
|
|
if 'consent.google.com' in driver.current_url:
|
|
try:
|
|
# Find all form buttons and click "Accept all" / "Aceptar todo"
|
|
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
|
for btn in form_btns:
|
|
btn_text = (btn.text or '').lower()
|
|
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
|
|
log.info(f"Clicking GDPR consent: {btn.text}")
|
|
btn.click()
|
|
time.sleep(2)
|
|
break
|
|
else:
|
|
# Fallback: click second button (usually "Accept all")
|
|
if len(form_btns) >= 2:
|
|
log.info("Using fallback: clicking second form button")
|
|
form_btns[1].click()
|
|
time.sleep(2)
|
|
except Exception as e:
|
|
log.warning(f"GDPR consent handling failed: {e}")
|
|
|
|
# Dismiss cookie banner on Maps page
|
|
try:
|
|
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
|
|
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
|
|
if cookie_btns:
|
|
cookie_btns[0].click()
|
|
time.sleep(0.3)
|
|
except:
|
|
pass
|
|
|
|
# Click reviews tab with retry logic (important for containers)
|
|
review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
|
|
reviews_tab_clicked = False
|
|
|
|
# Try multiple times to find and click reviews tab
|
|
for attempt in range(3):
|
|
if reviews_tab_clicked:
|
|
break
|
|
|
|
time.sleep(0.5) # Wait between attempts
|
|
|
|
for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
|
|
try:
|
|
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
|
|
for tab in tabs:
|
|
text = (tab.text or '').lower()
|
|
aria = (tab.get_attribute('aria-label') or '').lower()
|
|
|
|
if any(kw in text or kw in aria for kw in review_keywords):
|
|
log.info(f"Clicking reviews tab: {tab.text or aria[:30]}")
|
|
driver.execute_script("arguments[0].click();", tab)
|
|
time.sleep(1.5) # Wait for tab to load
|
|
reviews_tab_clicked = True
|
|
break
|
|
|
|
if reviews_tab_clicked:
|
|
break
|
|
except Exception as e:
|
|
log.debug(f"Tab search attempt {attempt+1} with {selector}: {e}")
|
|
continue
|
|
|
|
if not reviews_tab_clicked:
|
|
log.warning("Could not find reviews tab with hl=en locale")
|
|
|
|
# FALLBACK: If locale override was applied and tab not found,
|
|
# retry without locale override (fixes regional pages where hl=en breaks tabs)
|
|
if locale_override_applied:
|
|
log.info("Retrying without locale override to find reviews tab...")
|
|
|
|
# Reload page with original URL (no hl=en)
|
|
driver.get(original_url)
|
|
time.sleep(1.5)
|
|
|
|
# Handle GDPR again if needed
|
|
if 'consent.google.com' in driver.current_url:
|
|
try:
|
|
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
|
for btn in form_btns:
|
|
btn_text = (btn.text or '').lower()
|
|
if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']):
|
|
log.info(f"Clicking GDPR consent: {btn.text}")
|
|
btn.click()
|
|
time.sleep(2)
|
|
break
|
|
else:
|
|
if len(form_btns) >= 2:
|
|
log.info("Using fallback: clicking second form button")
|
|
form_btns[1].click()
|
|
time.sleep(2)
|
|
except Exception as e:
|
|
log.warning(f"GDPR consent handling failed: {e}")
|
|
|
|
# Dismiss cookie banner
|
|
try:
|
|
cookie_btns = driver.find_elements(By.CSS_SELECTOR,
|
|
'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
|
|
if cookie_btns:
|
|
cookie_btns[0].click()
|
|
time.sleep(0.3)
|
|
except:
|
|
pass
|
|
|
|
# Try to find reviews tab with multilingual keywords
|
|
multilingual_keywords = [
|
|
'review', 'reviews', # English
|
|
'reseña', 'reseñas', # Spanish
|
|
'avis', # French
|
|
'bewertung', 'bewertungen', # German
|
|
'recensione', 'recensioni', # Italian
|
|
'レビュー', # Japanese
|
|
'avaliação', 'avaliações', # Portuguese
|
|
'отзыв', 'отзывы', # Russian
|
|
'atsiliepimai', 'atsiliepi', # Lithuanian
|
|
'ulasan', # Indonesian
|
|
'리뷰' # Korean
|
|
]
|
|
|
|
for attempt in range(3):
|
|
if reviews_tab_clicked:
|
|
break
|
|
|
|
time.sleep(0.5)
|
|
|
|
for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
|
|
try:
|
|
tabs = driver.find_elements(By.CSS_SELECTOR, selector)
|
|
for tab in tabs:
|
|
text = (tab.text or '').lower()
|
|
aria = (tab.get_attribute('aria-label') or '').lower()
|
|
|
|
if any(kw in text or kw in aria for kw in multilingual_keywords):
|
|
log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}")
|
|
driver.execute_script("arguments[0].click();", tab)
|
|
time.sleep(1.5)
|
|
reviews_tab_clicked = True
|
|
break
|
|
|
|
if reviews_tab_clicked:
|
|
break
|
|
except Exception as e:
|
|
log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}")
|
|
continue
|
|
|
|
if not reviews_tab_clicked:
|
|
log.warning("Could not find reviews tab even without locale override")
|
|
|
|
# Wait for reviews section to load
|
|
time.sleep(2)
|
|
|
|
# EARLY DETECTION: Check if there are no reviews before attempting to scrape
|
|
no_reviews, reason = check_no_reviews_early(driver)
|
|
if no_reviews:
|
|
log.info(f"Early detection: No reviews available. Reason: {reason}")
|
|
return {
|
|
"reviews": [],
|
|
"count": 0,
|
|
"total_reviews": 0,
|
|
"time": time.time() - start_time,
|
|
"success": True,
|
|
"message": f"No reviews available: {reason}"
|
|
}
|
|
|
|
# Extract total review count from the page
|
|
total_reviews = extract_total_review_count(driver)
|
|
|
|
# Double-check: If extracted count is 0, return early
|
|
if total_reviews == 0:
|
|
log.info("Total review count is 0, skipping scraping")
|
|
return {
|
|
"reviews": [],
|
|
"count": 0,
|
|
"total_reviews": 0,
|
|
"time": time.time() - start_time,
|
|
"success": True,
|
|
"message": "Business has 0 reviews"
|
|
}
|
|
|
|
# Report initial progress with total count
|
|
if progress_callback and total_reviews:
|
|
try:
|
|
progress_callback(0, total_reviews)
|
|
except Exception as e:
|
|
log.warning(f"Progress callback failed: {e}")
|
|
|
|
# Find scrollable pane - try multiple selectors (container-friendly)
|
|
pane = None
|
|
pane_selectors = [
|
|
'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
|
'div.m6QErb.WNBkOb.XiKgde',
|
|
'div.m6QErb', # Fallback to more general selector
|
|
'div[role="main"]',
|
|
]
|
|
|
|
wait = WebDriverWait(driver, 5)
|
|
for selector in pane_selectors:
|
|
try:
|
|
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
|
|
log.info(f"Found pane with selector: {selector}")
|
|
break
|
|
except TimeoutException:
|
|
continue
|
|
|
|
if not pane:
|
|
error_msg = "Could not find scrollable pane after trying all selectors"
|
|
log.error(error_msg)
|
|
return {
|
|
"reviews": [],
|
|
"count": 0,
|
|
"total_reviews": total_reviews,
|
|
"time": time.time() - start_time,
|
|
"success": False,
|
|
"error": error_msg
|
|
}
|
|
|
|
# Wait longer for initial reviews to load (containers can be slower)
|
|
time.sleep(2)
|
|
|
|
# Setup scroll
|
|
driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
|
scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
|
|
|
|
# Trigger initial scroll and verify reviews are loading
|
|
driver.execute_script(scroll_script)
|
|
time.sleep(0.8)
|
|
|
|
# Also scroll the main window (helps in some cases, especially containers)
|
|
driver.execute_script("window.scrollBy(0, 500);")
|
|
time.sleep(0.5)
|
|
|
|
# JavaScript function to count reviews using ROBUST structural patterns
|
|
# Instead of relying on CSS classes, we look for containers with review-like structure
|
|
count_reviews_script = """
|
|
// STRATEGY 1: Try known selectors first (fast path)
|
|
const knownSelectors = [
|
|
'div.jftiEf.fontBodyMedium',
|
|
'div.jftiEf',
|
|
'div[data-review-id]',
|
|
'div[jsaction*="review"]'
|
|
];
|
|
|
|
for (let selector of knownSelectors) {
|
|
const found = document.querySelectorAll(selector);
|
|
if (found.length > 0) {
|
|
return found.length;
|
|
}
|
|
}
|
|
|
|
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
|
|
// Find containers that LOOK like reviews (have author + rating + text structure)
|
|
// IMPORTANT: Search only within the reviews pane, not the entire page!
|
|
const findReviewsByStructure = () => {
|
|
// Find the reviews pane first
|
|
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
|
document.querySelector('div.m6QErb') ||
|
|
document.querySelector('div[role="main"]');
|
|
|
|
if (!pane) return 0;
|
|
|
|
// Search only within the pane
|
|
const allDivs = pane.querySelectorAll('div');
|
|
let reviewCount = 0;
|
|
|
|
for (let div of allDivs) {
|
|
// Skip if too small (reviews have substantial content)
|
|
if (div.children.length < 2) continue;
|
|
|
|
// Look for review indicators:
|
|
// - Has an author name (usually in a span/div with small text)
|
|
// - Has a rating (span with aria-label containing "star" or "rating")
|
|
// - Has review text (span/div with longer text content)
|
|
|
|
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
|
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
|
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
|
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i);
|
|
|
|
// If it has at least 3 of these indicators, it's likely a review
|
|
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
|
if (indicators >= 3) {
|
|
reviewCount++;
|
|
}
|
|
}
|
|
|
|
return reviewCount > 0 ? reviewCount : 0;
|
|
};
|
|
|
|
// STRATEGY 3: Look for role="article" with review-like content (within pane)
|
|
const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') ||
|
|
document.querySelector('div.m6QErb') ||
|
|
document.querySelector('div[role="main"]');
|
|
if (pane3) {
|
|
const articles = pane3.querySelectorAll('[role="article"]');
|
|
if (articles.length > 0) {
|
|
let validArticles = 0;
|
|
for (let article of articles) {
|
|
// Check if article looks like a review (has rating + text)
|
|
const hasRating = article.querySelector('[aria-label*="star" i]');
|
|
const hasText = article.textContent.length > 30;
|
|
if (hasRating && hasText) {
|
|
validArticles++;
|
|
}
|
|
}
|
|
if (validArticles > 0) return validArticles;
|
|
}
|
|
}
|
|
|
|
// Try structural matching as last resort
|
|
const structuralCount = findReviewsByStructure();
|
|
return structuralCount;
|
|
"""
|
|
|
|
# Check if reviews are actually loading
|
|
initial_count = driver.execute_script(count_reviews_script)
|
|
|
|
if initial_count < 5:
|
|
# Reviews not loaded yet, wait more and try alternative scrolling
|
|
log.info(f"Waiting for reviews to load (found {initial_count})...")
|
|
|
|
# Try clicking on the pane to focus it
|
|
try:
|
|
driver.execute_script("arguments[0].click();", pane)
|
|
time.sleep(0.5)
|
|
except:
|
|
pass
|
|
|
|
# Scroll both pane and window
|
|
driver.execute_script(scroll_script)
|
|
driver.execute_script("window.scrollBy(0, 500);")
|
|
time.sleep(1.5)
|
|
|
|
initial_count = driver.execute_script(count_reviews_script)
|
|
|
|
log.info(f"After extra waiting: {initial_count} reviews")
|
|
|
|
log.info(f"Scrolling to load all reviews (starting with {initial_count})...")
|
|
|
|
# Fast scrolling to load all DOM elements
|
|
last_count = 0
|
|
idle_count = 0
|
|
|
|
for i in range(max_scrolls):
|
|
# Scroll to load more
|
|
prev_count = driver.execute_script(count_reviews_script)
|
|
driver.execute_script(scroll_script)
|
|
|
|
# SMART WAIT: Wait until new reviews actually load
|
|
max_wait = 1.2
|
|
wait_step = 0.05
|
|
waited = 0
|
|
|
|
while waited < max_wait:
|
|
time.sleep(wait_step)
|
|
waited += wait_step
|
|
|
|
current_count = driver.execute_script(count_reviews_script)
|
|
|
|
# If reviews loaded, continue immediately!
|
|
if current_count > prev_count:
|
|
idle_count = 0 # Reset idle counter
|
|
break
|
|
|
|
# Give Google Maps more time to lazy-load (0.6s instead of 0.3s)
|
|
# Only exit early if we're confident nothing is loading
|
|
if waited >= 0.6 and current_count == prev_count:
|
|
break
|
|
|
|
# Track consecutive idle scrolls
|
|
if current_count == prev_count:
|
|
idle_count += 1
|
|
# Be VERY patient: wait for 12 consecutive idle scrolls to ensure we get ALL reviews
|
|
# (each with up to 1.2s wait = ~14.4s total idle time before giving up)
|
|
# This ensures Google Maps has plenty of time to lazy-load all content
|
|
if idle_count >= 12:
|
|
log.info(f"Reached end at {current_count} reviews (12 consecutive idle scrolls)")
|
|
# Double-check we got all reviews if we know the total
|
|
if total_reviews and current_count < total_reviews:
|
|
log.warning(f"Only got {current_count}/{total_reviews} reviews ({current_count/total_reviews*100:.1f}%). Some may be hidden or loading slowly.")
|
|
break
|
|
|
|
# Progress logging and callback every 5 scrolls
|
|
if (i + 1) % 5 == 0:
|
|
log.info(f"{current_count} review elements loaded...")
|
|
if progress_callback and total_reviews:
|
|
try:
|
|
progress_callback(current_count, total_reviews)
|
|
except Exception as e:
|
|
log.warning(f"Progress callback failed: {e}")
|
|
|
|
# Aggressive memory management every 20 scrolls
|
|
if (i + 1) % 20 == 0:
|
|
try:
|
|
# Clear console logs to prevent buildup
|
|
driver.execute_script("console.clear();")
|
|
|
|
# Force garbage collection in browser
|
|
driver.execute_script("""
|
|
if (window.gc) { window.gc(); }
|
|
// Remove image srcs to free memory (images reload on demand)
|
|
document.querySelectorAll('img').forEach(img => {
|
|
if (img.complete && !img.classList.contains('needed')) {
|
|
img.removeAttribute('src');
|
|
}
|
|
});
|
|
""")
|
|
|
|
# Brief pause to let Chrome breathe
|
|
time.sleep(0.1)
|
|
except Exception:
|
|
pass # Ignore if fails
|
|
|
|
last_count = current_count
|
|
|
|
# Shorter final scroll
|
|
for _ in range(2):
|
|
driver.execute_script(scroll_script)
|
|
time.sleep(0.3)
|
|
|
|
scroll_time = time.time() - start_time
|
|
log.info(f"Scrolling complete in {scroll_time:.2f}s")
|
|
|
|
# Update progress: scrolling done, starting extraction
|
|
if progress_callback and total_reviews:
|
|
try:
|
|
progress_callback(current_count, total_reviews)
|
|
except Exception as e:
|
|
log.warning(f"Progress callback failed: {e}")
|
|
|
|
# Extract ALL reviews using JavaScript (fast!)
|
|
log.info("Extracting reviews with JavaScript...")
|
|
extract_start = time.time()
|
|
|
|
all_reviews = extract_all_reviews_js(driver)
|
|
|
|
extract_time = time.time() - extract_start
|
|
log.info(f"Extraction complete in {extract_time:.2f}s")
|
|
|
|
# Final progress update with actual extracted count
|
|
if progress_callback and total_reviews:
|
|
try:
|
|
progress_callback(len(all_reviews), total_reviews)
|
|
except Exception as e:
|
|
log.warning(f"Progress callback failed: {e}")
|
|
|
|
elapsed = time.time() - start_time
|
|
|
|
log.info(f"Fast scrape completed: {len(all_reviews)} reviews in {elapsed:.2f}s")
|
|
|
|
result = {
|
|
"reviews": all_reviews,
|
|
"count": len(all_reviews),
|
|
"total_reviews": total_reviews,
|
|
"time": elapsed,
|
|
"scroll_time": scroll_time,
|
|
"extract_time": extract_time,
|
|
"success": True,
|
|
"error": None
|
|
}
|
|
|
|
if return_driver:
|
|
result["driver"] = driver
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
elapsed = time.time() - start_time
|
|
error_msg = f"Fast scrape failed: {str(e)}"
|
|
log.error(error_msg)
|
|
|
|
# Check if this is a tab crash - try to extract what we have
|
|
partial_reviews = []
|
|
is_tab_crash = "tab crashed" in str(e).lower() or "session deleted" in str(e).lower()
|
|
|
|
if is_tab_crash and driver:
|
|
log.warning("Detected tab crash - attempting to extract partial reviews from DOM before crash...")
|
|
try:
|
|
# Try to extract reviews that were loaded before crash
|
|
partial_reviews = extract_all_reviews_js(driver)
|
|
log.info(f"Recovered {len(partial_reviews)} reviews from crashed session")
|
|
except Exception as recovery_error:
|
|
log.error(f"Could not recover reviews: {recovery_error}")
|
|
|
|
# Return partial results if we got any
|
|
if partial_reviews:
|
|
result = {
|
|
"reviews": partial_reviews,
|
|
"count": len(partial_reviews),
|
|
"total_reviews": None,
|
|
"time": elapsed,
|
|
"success": False, # Mark as failed but with partial data
|
|
"error": f"{error_msg} (recovered {len(partial_reviews)} reviews)",
|
|
"partial": True
|
|
}
|
|
else:
|
|
result = {
|
|
"reviews": [],
|
|
"count": 0,
|
|
"total_reviews": None,
|
|
"time": elapsed,
|
|
"success": False,
|
|
"error": error_msg
|
|
}
|
|
|
|
if return_driver:
|
|
result["driver"] = driver
|
|
|
|
return result
|
|
|
|
finally:
|
|
if should_close_driver and driver:
|
|
try:
|
|
driver.quit()
|
|
except:
|
|
pass
|
|
|
|
|
|
def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]:
    """
    Extract business card information from Google Maps.

    Loads the given URL with the UI language forced to English, accepts the
    GDPR consent page / cookie banner when present, waits for the page
    content to appear and then reads the business card via one JavaScript
    call.

    Args:
        url: Google Maps URL to inspect. Any existing ``hl`` query parameter
            is rewritten to ``hl=en``; if there is none, ``hl=en`` is appended.
        headless: Run the browser headless when a new driver has to be created.
        driver: Existing driver instance to reuse. When provided it is never
            closed by this function.
        return_driver: If True, the driver is left open and returned in the
            result under the ``driver`` key.

    Returns:
        Dict containing:
        - name: business name, or None if not found
        - address: address text, or None if not found
        - rating: rating as float, or None if not found
        - total_reviews: review count as int (0 when it could not be found)
        - has_reviews: True when total_reviews > 0
        - success: whether extraction ran without raising
        - error: error message (None on success)
        - driver: driver instance (only if return_driver=True)
    """
    # Function-scope import: only the locale rewrite below needs it.
    import re

    start_time = time.time()
    log.info(f"[PROFILE] Getting business card info for: {url}")

    driver_provided = driver is not None
    # Only close drivers created here, and only if the caller does not want it back.
    should_close_driver = not return_driver and not driver_provided

    try:
        # Initialize driver if not provided
        t0 = time.time()
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            log.info(f"[PROFILE] Driver initialization: {time.time() - t0:.2f}s")
        else:
            log.info("[PROFILE] Using pooled driver (0.00s)")

        # Force English locale for consistent parsing. The selectors in the
        # extraction script rely on English text ("star", "review", "Address"),
        # so every locale value has to be overridden, not just a few known ones.
        hl_param = re.compile(r'([?&])hl=[^&#]*')
        if hl_param.search(url):
            url = hl_param.sub(r'\1hl=en', url)
        else:
            separator = '&' if '?' in url else '?'
            url = f"{url}{separator}hl=en"

        log.info("Loading Google Maps page...")
        t0 = time.time()
        driver.get(url)
        log.info(f"[PROFILE] Page load (driver.get): {time.time() - t0:.2f}s")

        t0 = time.time()
        time.sleep(0.5)  # Initial wait - reduced from 2s
        log.info(f"[PROFILE] Initial sleep: {time.time() - t0:.2f}s")

        # Handle GDPR consent page
        t0 = time.time()
        if 'consent.google.com' in driver.current_url:
            log.info("Detected GDPR consent page, accepting...")
            try:
                form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
                for btn in form_btns:
                    btn_text = (btn.text or '').lower()
                    if 'aceptar todo' in btn_text or 'accept all' in btn_text:
                        log.info(f"Clicking GDPR consent: {btn.text}")
                        btn.click()
                        time.sleep(1)  # Reduced from 2s
                        break
                else:
                    # No button matched by text: fall back to the second form button.
                    if len(form_btns) >= 2:
                        log.info("Using fallback: clicking second form button")
                        form_btns[1].click()
                        time.sleep(1)  # Reduced from 2s
            except Exception as e:
                log.warning(f"GDPR consent handling failed: {e}")
            log.info(f"[PROFILE] GDPR consent handling: {time.time() - t0:.2f}s")
        else:
            log.info("[PROFILE] No GDPR consent page (0.00s)")

        # Dismiss cookie banner (best effort)
        t0 = time.time()  # reset so the profile line below measures only this step
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]'
            )
            if cookie_btns:
                log.info("Dismissing cookie banner...")
                cookie_btns[0].click()
                time.sleep(0.3)  # Reduced from 0.5s
        except Exception as e:
            # Deliberately non-fatal: the banner is optional.
            log.debug(f"Cookie banner dismissal skipped: {e}")
        log.info(f"[PROFILE] Cookie banner dismissal: {time.time() - t0:.2f}s")

        # Wait for page to load - use smart waits
        t0 = time.time()
        try:
            log.info("Waiting for Google Maps content to load...")
            wait = WebDriverWait(driver, 10)
            # find_elements returns a list; a non-empty list ends the wait.
            wait.until(
                lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
            )
            log.info("Google Maps content loaded successfully")
        except Exception as e:
            log.warning(f"Timeout waiting for Maps content: {e}")
            time.sleep(0.5)  # Minimal fallback wait
        log.info(f"[PROFILE] Smart wait for content: {time.time() - t0:.2f}s")

        # Extract business card information using JavaScript
        t0 = time.time()
        extract_script = """
            const info = {
                name: null,
                address: null,
                rating: null,
                total_reviews: null
            };

            // Extract business name
            const nameSelectors = [
                'h1.DUwDvf',
                '[role="main"] h1',
                'h1.fontHeadlineLarge'
            ];

            for (const selector of nameSelectors) {
                const elem = document.querySelector(selector);
                if (elem && elem.textContent) {
                    info.name = elem.textContent.trim();
                    break;
                }
            }

            // Extract address
            const addressSelectors = [
                'button[data-item-id*="address"]',
                '[data-item-id*="address"]',
                'div[aria-label*="Address"]'
            ];

            for (const selector of addressSelectors) {
                const elem = document.querySelector(selector);
                if (elem && elem.textContent) {
                    info.address = elem.textContent.trim();
                    break;
                }
            }

            // Extract rating (look for aria-label like "4.2 stars")
            const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                const match = ariaLabel.match(/([0-9.]+)/);
                if (match) {
                    info.rating = parseFloat(match[1]);
                }
            }

            // Extract total review count
            const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
            const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;

            // PRIORITY 1: Look for review count in search results sidebar/panel
            // This is where "152 reviews" appears on search results
            const searchPanelSelectors = [
                'a[href*="reviews"]',  // Link with "reviews" in href
                'button[jsaction*="reviews"]',  // Button related to reviews
                'div[role="link"]',  // Clickable divs that might contain review info
            ];

            for (const selector of searchPanelSelectors) {
                const elements = document.querySelectorAll(selector);
                for (let elem of elements) {
                    const text = elem.textContent || '';
                    const match = text.match(numberPattern);
                    if (match) {
                        const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        if (num > 0 && num < 1000000) {
                            info.total_reviews = num;
                            break;
                        }
                    }
                }
                if (info.total_reviews) break;
            }

            // PRIORITY 2: Look in any span/div that contains the word "review"
            if (!info.total_reviews) {
                const allElements = document.querySelectorAll('span, div, a');
                for (let elem of allElements) {
                    const text = elem.textContent || '';
                    if (text.length < 100) {  // Skip very long text blocks
                        const match = text.match(numberPattern);
                        if (match) {
                            const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                            if (num > 0 && num < 1000000) {
                                info.total_reviews = num;
                                break;
                            }
                        }
                    }
                }
            }

            // PRIORITY 3: Try tabs (for business detail pages)
            if (!info.total_reviews) {
                const tabs = document.querySelectorAll('button[role="tab"]');
                for (let tab of tabs) {
                    const text = tab.textContent || '';
                    let match = text.match(reviewPattern);
                    if (match) {
                        info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        break;
                    }
                    match = text.match(numberPattern);
                    if (match) {
                        info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        break;
                    }
                }
            }

            // PRIORITY 4: Try aria-labels
            if (!info.total_reviews) {
                const elements = document.querySelectorAll('[aria-label]');
                for (let elem of elements) {
                    const ariaLabel = elem.getAttribute('aria-label') || '';
                    let match = ariaLabel.match(reviewPattern);
                    if (match) {
                        info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        break;
                    }
                    match = ariaLabel.match(numberPattern);
                    if (match) {
                        info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
                        break;
                    }
                }
            }

            return info;
        """

        business_info = driver.execute_script(extract_script)
        log.info(f"[PROFILE] Business card extraction: {time.time() - t0:.2f}s")

        total_time = time.time() - start_time
        log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
        log.info(f"Business card extracted: name={business_info.get('name')}, "
                 f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")

        result = {
            "name": business_info.get('name'),
            "address": business_info.get('address'),
            "rating": business_info.get('rating'),
            # A missing count (null from the script) is reported as 0.
            "total_reviews": business_info.get('total_reviews') or 0,
            "has_reviews": (business_info.get('total_reviews') or 0) > 0,
            "success": True,
            "error": None
        }

        if return_driver:
            result["driver"] = driver
        return result

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"Failed to get business card info: {str(e)}"
        log.error(error_msg)
        log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME (FAILED): {total_time:.2f}s ***")
        result = {
            "name": None,
            "address": None,
            "rating": None,
            "total_reviews": 0,
            "has_reviews": False,
            "success": False,
            "error": error_msg
        }
        if return_driver:
            result["driver"] = driver
        return result

    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception as e:
                # Best-effort cleanup: a failing quit must not mask the result.
                log.debug(f"Ignoring error while closing driver: {e}")
|
|
|
|
|
|
def check_reviews_available(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]:
    """
    Lightweight check to see if a business has reviews available.

    This function just loads the page and checks for review count without
    doing the full scraping. Used to enable/disable scrape button in UI.

    Args:
        url: Google Maps URL to check
        headless: Run in headless mode (default True)
        driver: Existing driver instance to reuse (from worker pool)
        return_driver: If True, don't close driver and return it in result

    Returns:
        Dict containing:
        - has_reviews: bool - whether reviews exist
        - review_count: int - number of reviews (0 if none)
        - business_name: str - name of business (if found)
        - success: bool - whether check succeeded
        - error: str - error message (if failed)
        - driver: Driver instance (if return_driver=True)

        Note that when the page loads but no review count can be found, the
        result has success=True together with a non-empty error message.
    """
    start_time = time.time()
    log.info(f"[PROFILE] Starting validation for: {url}")

    driver_provided = driver is not None
    # Only close drivers created here, and only if the caller does not want it back.
    should_close_driver = not return_driver and not driver_provided

    try:
        # Initialize driver if not provided
        t0 = time.time()
        if not driver:
            driver = Driver(uc=True, headless=headless)
            driver.maximize_window()
            log.info(f"[PROFILE] Driver initialization: {time.time() - t0:.2f}s")
        else:
            log.info("[PROFILE] Using pooled driver (0.00s)")

        # Navigate to the URL
        t0 = time.time()
        log.info("Loading Google Maps page...")
        driver.get(url)
        log.info(f"[PROFILE] Page load (driver.get): {time.time() - t0:.2f}s")

        t0 = time.time()
        time.sleep(0.5)  # Initial wait - reduced from 2s
        log.info(f"[PROFILE] Initial sleep: {time.time() - t0:.2f}s")

        # Handle GDPR consent page (CRITICAL for validation to work!)
        t0 = time.time()
        if 'consent.google.com' in driver.current_url:
            log.info("Detected GDPR consent page, accepting...")
            try:
                # Find all form buttons and click "Accept all" / "Aceptar todo"
                form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
                for btn in form_btns:
                    btn_text = (btn.text or '').lower()
                    if 'aceptar todo' in btn_text or 'accept all' in btn_text:
                        log.info(f"Clicking GDPR consent: {btn.text}")
                        btn.click()
                        time.sleep(1)  # Reduced from 2s
                        break
                else:
                    # Fallback: click second button (usually "Accept all")
                    if len(form_btns) >= 2:
                        log.info("Using fallback: clicking second form button")
                        form_btns[1].click()
                        time.sleep(1)  # Reduced from 2s
            except Exception as e:
                log.warning(f"GDPR consent handling failed: {e}")
            log.info(f"[PROFILE] GDPR consent handling: {time.time() - t0:.2f}s")
        else:
            log.info("[PROFILE] No GDPR consent page (0.00s)")

        # Dismiss cookie banner on Maps page (best effort)
        t0 = time.time()
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]'
            )
            if cookie_btns:
                log.info("Dismissing cookie banner...")
                cookie_btns[0].click()
                time.sleep(0.3)  # Reduced from 0.5s
        except Exception as e:
            # Deliberately non-fatal: the banner is optional.
            log.debug(f"Cookie banner dismissal skipped: {e}")
        log.info(f"[PROFILE] Cookie banner dismissal: {time.time() - t0:.2f}s")

        # Wait for page to fully load after consent - use smart waits
        t0 = time.time()
        try:
            # Wait for either business card OR search results to appear
            log.info("Waiting for Google Maps content to load...")
            wait = WebDriverWait(driver, 10)
            # find_elements returns a list; a non-empty list ends the wait.
            wait.until(
                lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
            )
            log.info("Google Maps content loaded successfully")
        except Exception as e:
            log.warning(f"Timeout waiting for Maps content: {e}")
            time.sleep(0.5)  # Minimal fallback wait
        log.info(f"[PROFILE] Smart wait for content: {time.time() - t0:.2f}s")

        # Try to extract business name
        t0 = time.time()
        business_name = None
        try:
            # NOTE(review): Method 3 in the script skips any title containing
            # "Google Maps"; if place-page titles end with " - Google Maps"
            # that fallback would rarely fire - confirm against a live page.
            business_name_script = """
                // Try to find business name from various locations
                let name = null;

                // Method 1: Look for business name in the main panel (most reliable)
                // This is where the actual business info appears
                const businessPanelSelectors = [
                    'h1.DUwDvf',  // Main business name heading
                    '[role="main"] h1',  // H1 in main content
                    'h1.fontHeadlineLarge',  // Large headline font
                    'button[jsaction*="pane.header.rating"] h1',  // Near rating button
                ];

                for (const selector of businessPanelSelectors) {
                    const element = document.querySelector(selector);
                    if (element && element.textContent) {
                        const text = element.textContent.trim();
                        // Filter out Google's placeholder/suggestion text
                        if (text &&
                            !text.toLowerCase().includes('antes de ir') &&
                            !text.toLowerCase().includes('before going') &&
                            !text.toLowerCase().includes('google maps') &&
                            text.length < 100) {  // Business names shouldn't be super long
                            name = text;
                            break;
                        }
                    }
                }

                // Method 2: h1 tag (fallback)
                if (!name) {
                    const h1 = document.querySelector('h1');
                    if (h1 && h1.textContent) {
                        const text = h1.textContent.trim();
                        if (!text.toLowerCase().includes('antes de ir') &&
                            !text.toLowerCase().includes('before going')) {
                            name = text;
                        }
                    }
                }

                // Method 3: Title attribute (last resort)
                if (!name) {
                    const title = document.title;
                    if (title && !title.includes('Google Maps')) {
                        name = title.split('-')[0].trim();
                    }
                }

                return name;
            """
            business_name = driver.execute_script(business_name_script)
            if business_name:
                log.info(f"Found business name: {business_name}")
        except Exception as e:
            # The name is optional for this check; continue without it.
            log.debug(f"Could not extract business name: {e}")
        log.info(f"[PROFILE] Business name extraction: {time.time() - t0:.2f}s")

        # Extract total review count
        t0 = time.time()
        review_count = extract_total_review_count(driver)
        log.info(f"[PROFILE] Review count extraction: {time.time() - t0:.2f}s")

        if review_count is None:
            log.warning("Could not determine review count")
            total_time = time.time() - start_time
            log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***")
            # The check itself ran, so success stays True; the missing count
            # is reported through "error" and a zero review_count.
            result = {
                "has_reviews": False,
                "review_count": 0,
                "business_name": business_name,
                "success": True,
                "error": "Could not find review count on page"
            }
            if return_driver:
                result["driver"] = driver
            return result

        log.info(f"Found {review_count} reviews available")

        total_time = time.time() - start_time
        log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***")

        result = {
            "has_reviews": review_count > 0,
            "review_count": review_count,
            "business_name": business_name,
            "success": True,
            "error": None
        }
        if return_driver:
            result["driver"] = driver
        return result

    except Exception as e:
        total_time = time.time() - start_time
        error_msg = f"Failed to check reviews: {str(e)}"
        log.error(error_msg)
        log.info(f"[PROFILE] *** TOTAL VALIDATION TIME (FAILED): {total_time:.2f}s ***")
        result = {
            "has_reviews": False,
            "review_count": 0,
            "business_name": None,
            "success": False,
            "error": error_msg
        }
        if return_driver:
            result["driver"] = driver
        return result

    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception as e:
                # Best-effort cleanup: a failing quit must not mask the result.
                log.debug(f"Ignoring error while closing driver: {e}")
|