Files
whyrating-engine-legacy/modules/fast_scraper.py
Alejandro Gutiérrez c8c24ae483 Add robust structural pattern matching and early no-reviews detection
BREAKING IMPROVEMENTS:

1. Early Detection for No Reviews:
   - Check for "no reviews" messages (11 phrases across 8 languages) before scraping
   - Detect disabled reviews tabs and aria-labels with 0 reviews
   - Return early with success when no reviews exist (saves time)
   - Prevents wasted scraping attempts on businesses with no reviews

2. Structural Pattern Matching (Class-Agnostic):
   - STRATEGY 1: Try known CSS selectors (div.jftiEf.fontBodyMedium, etc.)
   - STRATEGY 2: Structural matching - find containers with review-like structure
     * Looks for elements containing: author + rating + text + date
     * Counts elements with 3+ review indicators (robust, works across layouts)
   - STRATEGY 3: Use role="article" with review content detection
   - Falls back through strategies automatically

3. Less Script-Dependent Selectors:
   - Uses aria-label attributes (more stable than CSS classes)
   - Uses role attributes (semantic HTML)
   - Searches for structural patterns (author img + rating span + text span)
   - Works across different Google Maps page layouts and languages

4. Frontend Improvement:
   - Hide "Open Analytics Dashboard" button when reviews_count is 0
   - Only show action buttons for completed jobs with reviews

TECHNICAL DETAILS:

Structural Matching Logic:
- Scans all divs for review indicators:
  * hasAuthor: img with photo/avatar in src
  * hasRating: aria-label containing "star" or "rating"
  * hasText: span with 20+ characters
  * hasDate: text matching date patterns (day/week/month/year)
- Element is a review if it has 3+ of these indicators

Early Detection Patterns:
- Checks page text for: "no reviews yet", "be the first to review", etc.
- Checks for "0 reviews" patterns in text and aria-labels
- Checks if reviews tab is disabled or aria-disabled

Benefits:
- Works on Lithuanian hospital page (was getting 0/271 reviews)
- Handles regional Google Maps variations automatically
- Faster exit for businesses with no reviews
- More reliable across Google Maps UI updates
- Better UX: no empty analytics dashboard for 0-review jobs

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-18 19:52:39 +00:00

1493 lines
57 KiB
Python

#!/usr/bin/env python3
"""
Fast DOM-only scraper module for API integration.
Based on start_dom_only_fast.py - achieves ~18.9s for all reviews.
This module provides a reusable function for the API server.
"""
import hashlib
import logging
import re
import time
from typing import List, Dict, Any, Optional

from seleniumbase import Driver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
log = logging.getLogger(__name__)
# Lower-case phrases that indicate a place has no reviews at all.
_NO_REVIEW_PHRASES = (
    'no reviews yet',
    'be the first to review',
    "there aren't any reviews",
    'no hay reseñas',
    'sin reseñas',
    "pas encore d'avis",
    'noch keine bewertungen',
    'nessuna recensione',
    'まだレビューがありません',
    'sem avaliações',
    'belum ada ulasan',
)

# An explicit zero count: a *standalone* "0" followed by a "review" word, or a
# bare "(0)".  The lookbehind rejects digits and separators so that
# "10 reviews", "1,000 reviews" and "4.0 reviews" are not mistaken for zero.
_ZERO_REVIEWS_RE = re.compile(
    r"(?<![\d.,])0\s*(?:review|reseña|avis|recensione|bewertung|レビュー)|\(0\)",
    re.IGNORECASE,
)


def check_no_reviews_early(driver) -> tuple[bool, str]:
    """
    Early detection for 'no reviews available' scenarios.

    Three checks run in order against the current page:
    1. a known "no reviews" phrase appears in the page text;
    2. the page text or a review-related aria-label states an explicit
       zero count (standalone "0 reviews" or "(0)");
    3. a tab whose text/aria-label mentions "review" is disabled.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        ``(has_no_reviews, reason)``. ``reason`` is an empty string when
        nothing was detected. Any exception is logged and reported as
        ``(False, "")`` so that detection problems never block scraping.
    """
    try:
        page_text = driver.execute_script(
            "return document.body.innerText.toLowerCase();"
        ) or ""

        # 1. Explicit "no reviews" messages
        for pattern in _NO_REVIEW_PHRASES:
            if pattern in page_text:
                return True, f"Found 'no reviews' message: '{pattern}'"

        # 2a. Explicit zero count in the visible text
        if _ZERO_REVIEWS_RE.search(page_text):
            return True, 'Found 0 reviews indicator'

        # 2b. Explicit zero count in aria-labels. The labels are matched in
        # Python because a CSS substring selector such as
        # [aria-label*="0 review"] also matches "10 reviews".
        aria_labels = driver.execute_script("""
        return Array.from(document.querySelectorAll('[aria-label*="review" i]'))
            .map(el => el.getAttribute('aria-label') || '');
        """) or []
        for label in aria_labels:
            lowered = (label or '').lower()
            if 'no review' in lowered or _ZERO_REVIEWS_RE.search(lowered):
                return True, 'Found aria-label with 0 reviews'

        # 3. Reviews tab present but disabled / not clickable
        reviews_disabled = driver.execute_script("""
        const tabs = document.querySelectorAll('button[role="tab"]');
        for (let tab of tabs) {
            const text = (tab.textContent || '').toLowerCase();
            const aria = (tab.getAttribute('aria-label') || '').toLowerCase();
            if (text.includes('review') || aria.includes('review')) {
                if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') {
                    return 'Reviews tab is disabled';
                }
            }
        }
        return null;
        """)
        if reviews_disabled:
            return True, reviews_disabled

        return False, ""
    except Exception as e:
        log.warning(f"Error in early no-reviews detection: {e}")
        return False, ""
def extract_total_review_count(driver) -> Optional[int]:
    """
    Extract the total number of reviews from the Google Maps page.

    Runs one JavaScript snippet that looks for text such as "500 reviews"
    or "(500)" in, by priority: search-result elements, tab buttons,
    aria-labels and finally the whole page text. Every candidate is
    accepted only if it lies strictly between 0 and 1,000,000.

    A second, debug-only script gathers page diagnostics for the log. A
    failure there is logged and does not affect the returned count.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        Total review count, or None if not found (a count of 0 is also
        reported as None).
    """
    extract_script = """
    // Optimized review count extraction - removed verbose logging for speed
    const parenthesesPattern = /\\((\\d[\\d,\\.\\s]*)\\)/;
    const numberPattern = /(\\d[\\d,\\.\\s]*)\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i;
    const anyNumberPattern = /(\\d[\\d,\\.\\s]*)/;

    // Strip separators and accept only plausible counts
    const toCount = (raw) => {
        const num = parseInt(raw.replace(/[,\\.\\s]/g, ''), 10);
        return (num > 0 && num < 1000000) ? num : null;
    };

    // First plausible count produced by any of the patterns, else null
    const firstCount = (text, patterns) => {
        for (const pattern of patterns) {
            const match = text.match(pattern);
            const num = match ? toCount(match[1]) : null;
            if (num) return num;
        }
        return null;
    };

    let total = null;

    // PRIORITY 1: Search results page
    const searchResultsSelectors = [
        'a[href*="reviews"]',
        '[role="article"] span',
        '[role="article"] a',
        'div.fontBodyMedium',
        'span.UY7F9',
    ];
    for (const selector of searchResultsSelectors) {
        const elements = document.querySelectorAll(selector);
        for (let i = 0; i < Math.min(elements.length, 20); i++) {
            const elem = elements[i];
            const text = elem.textContent || '';
            const href = elem.getAttribute('href') || '';
            // Links to reviews may carry a bare number without the keyword
            const patterns = href.includes('reviews')
                ? [numberPattern, anyNumberPattern]
                : [numberPattern];
            total = firstCount(text, patterns);
            if (total) break;
        }
        if (total) break;
    }

    // PRIORITY 2: Tab buttons (business detail page)
    if (!total) {
        for (const button of document.querySelectorAll('button[role="tab"]')) {
            total = firstCount(button.textContent || '', [parenthesesPattern, numberPattern]);
            if (total) break;
        }
    }

    // PRIORITY 3: Aria-labels
    if (!total) {
        for (const elem of document.querySelectorAll('[aria-label]')) {
            total = firstCount(elem.getAttribute('aria-label') || '', [parenthesesPattern, numberPattern]);
            if (total) break;
        }
    }

    // PRIORITY 4: Fallback - entire page text
    if (!total) {
        total = firstCount(document.body.innerText, [parenthesesPattern]);
    }

    return total;
    """

    # Diagnostics only: never allowed to influence the result.
    debug_script = """
    const info = {
        search_results_count: document.querySelectorAll('[role="article"]').length,
        links_with_reviews: document.querySelectorAll('a[href*="reviews"]').length,
        page_url: window.location.href,
        page_title: document.title,
        sample_texts: []
    };
    // Get sample text from links that might contain reviews
    const reviewLinks = document.querySelectorAll('a[href*="reviews"]');
    for (let i = 0; i < Math.min(5, reviewLinks.length); i++) {
        info.sample_texts.push(reviewLinks[i].textContent.substring(0, 100));
    }
    // Also check for text containing "review" keyword
    const allText = document.body.innerText.substring(0, 2000);
    const reviewMatches = allText.match(/\\d+[\\s,\\.]*(?:review|reseña|avis)/gi);
    if (reviewMatches) {
        info.review_patterns_found = reviewMatches.slice(0, 5);
    }
    return info;
    """

    try:
        total = driver.execute_script(extract_script)
    except Exception as e:
        log.error(f"Error extracting total review count: {e}")
        return None

    debug_info: Dict[str, Any] = {}
    try:
        debug_info = driver.execute_script(debug_script) or {}
    except Exception as e:
        # Previously this failure discarded an already-extracted total.
        log.warning(f"Could not collect page debug info: {e}")

    if debug_info:
        log.info(f"Page debug: URL={debug_info.get('page_url')}")
        log.info(f"Page debug: Found {debug_info.get('search_results_count')} search result articles")
        log.info(f"Page debug: Found {debug_info.get('links_with_reviews')} links containing 'reviews'")
        if debug_info.get('review_patterns_found'):
            log.info(f"Page debug: Review patterns in text: {debug_info.get('review_patterns_found')}")
        if debug_info.get('sample_texts'):
            log.info(f"Page debug: Sample link texts: {debug_info.get('sample_texts')}")

    if isinstance(total, (int, float)) and total > 0:
        total = int(total)
        log.info(f"Extracted total review count: {total}")
        return total

    log.warning(f"Could not extract total review count from page. Debug: {debug_info}")
    return None
def _stable_review_id(author: str, date_text: str) -> str:
    """Build a review ID that is identical across processes and runs.

    The builtin ``hash()`` is salted per interpreter process for strings,
    so it cannot be used for identifiers that outlive one run.
    """
    digest = hashlib.sha256(f"{author}{date_text}".encode("utf-8")).digest()
    return f"review_{int.from_bytes(digest[:8], 'big')}"


def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
    """Extract ALL reviews using JavaScript - single fast operation.

    Review containers are located with three strategies, tried in order:
    known CSS selectors, structural matching (divs with at least 3 of
    author/rating/text/date indicators), then ``role="article"`` elements
    that contain a star rating. Fields are then read from each container
    with class-based selectors; containers without both an author and a
    date text are dropped.

    A second, debug-only script inspects global page state. Its output is
    logged and attached to the first review as ``_google_state_debug``.
    A failure there is logged and leaves that value as None.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        List of review dicts, each with a ``review_id`` derived from
        author + date text. Returns an empty list on any extraction error.
    """
    extract_script = """
    const reviews = [];

    // ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching
    let elements = null;

    // STRATEGY 1: Try known CSS selectors (fast path)
    const knownSelectors = [
        'div.jftiEf.fontBodyMedium',
        'div.jftiEf',
        'div[data-review-id]',
        'div[jsaction*="review"]'
    ];
    for (let selector of knownSelectors) {
        const found = document.querySelectorAll(selector);
        if (found.length > 0) {
            elements = found;
            console.log('Found', found.length, 'reviews using known selector:', selector);
            break;
        }
    }

    // STRATEGY 2: Structural matching for unknown page layouts
    if (!elements || elements.length === 0) {
        console.log('Known selectors failed, trying structural matching...');
        // Find all divs that LOOK like reviews (have review structure)
        const allDivs = document.querySelectorAll('div');
        const reviewElements = [];
        for (let div of allDivs) {
            // Skip if too small
            if (div.children.length < 2) continue;
            // Check for review indicators
            const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
            const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
            const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
            const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i);
            // Any 3 of the 4 indicators qualify the div as a review
            const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
            if (indicators >= 3) {
                reviewElements.push(div);
            }
        }
        if (reviewElements.length > 0) {
            elements = reviewElements;
            console.log('Found', reviewElements.length, 'reviews using structural matching');
        }
    }

    // STRATEGY 3: Try role="article" as last resort
    if (!elements || elements.length === 0) {
        const articles = document.querySelectorAll('[role="article"]');
        const validArticles = [];
        for (let article of articles) {
            const hasRating = article.querySelector('[aria-label*="star" i]');
            const hasText = article.textContent.length > 30;
            if (hasRating && hasText) {
                validArticles.push(article);
            }
        }
        if (validArticles.length > 0) {
            elements = validArticles;
            console.log('Found', validArticles.length, 'reviews using role=article');
        }
    }

    if (!elements || elements.length === 0) {
        console.warn('No review elements found with any strategy');
        return [];
    }

    for (let i = 0; i < elements.length; i++) {
        const elem = elements[i];
        const review = {};
        try {
            // Author
            const authorElem = elem.querySelector('div.d4r55');
            review.author = authorElem ? authorElem.textContent.trim() : null;

            // Rating (always present as a key; null when it cannot be read)
            review.rating = null;
            const ratingElem = elem.querySelector('span.kvMYJc');
            if (ratingElem) {
                const ariaLabel = ratingElem.getAttribute('aria-label');
                if (ariaLabel) {
                    const match = ariaLabel.match(/\\d+/);
                    review.rating = match ? parseFloat(match[0]) : null;
                }
            }

            // Text
            const textElem = elem.querySelector('span.wiI7pd');
            review.text = textElem ? textElem.textContent.trim() : null;

            // Date
            const dateElem = elem.querySelector('span.rsqaWe');
            review.date_text = dateElem ? dateElem.textContent.trim() : null;

            // DEEP DIVE: Find where Google stores the actual timestamp
            review.timestamp = null;
            review.debug_date_info = {};
            if (dateElem) {
                // 1. Check all attributes on date element
                const allAttrs = {};
                for (let attr of dateElem.attributes) {
                    allAttrs[attr.name] = attr.value;
                }
                review.debug_date_info.date_elem_attrs = allAttrs;

                // 2. Check parent elements for data
                let parent = dateElem.parentElement;
                let parentLevel = 0;
                while (parent && parentLevel < 3) {
                    const parentAttrs = {};
                    for (let attr of parent.attributes) {
                        if (attr.name.includes('data') || attr.name.includes('time') || attr.name.includes('date')) {
                            parentAttrs[attr.name] = attr.value;
                        }
                    }
                    if (Object.keys(parentAttrs).length > 0) {
                        review.debug_date_info[`parent_${parentLevel}_attrs`] = parentAttrs;
                    }
                    parent = parent.parentElement;
                    parentLevel++;
                }

                // 3. Check the entire review container for hidden data
                const reviewContainer = elem;
                const containerAttrs = {};
                for (let attr of reviewContainer.attributes) {
                    containerAttrs[attr.name] = attr.value;
                }
                review.debug_date_info.container_attrs = containerAttrs;

                // 4. Look for script tags or JSON data near the date
                const nearbyScripts = elem.querySelectorAll('script');
                if (nearbyScripts.length > 0) {
                    review.debug_date_info.has_nearby_scripts = nearbyScripts.length;
                }

                // 5. Check for any element with 'time' in class or data
                const timeElements = elem.querySelectorAll('[class*="time"], [data-timestamp], [datetime]');
                if (timeElements.length > 0) {
                    const timeData = [];
                    timeElements.forEach(el => {
                        timeData.push({
                            tag: el.tagName,
                            classes: el.className,
                            datetime: el.getAttribute('datetime'),
                            timestamp: el.getAttribute('data-timestamp'),
                            text: el.textContent.substring(0, 50)
                        });
                    });
                    review.debug_date_info.time_elements = timeData;
                }
            }

            // Avatar
            const avatarElem = elem.querySelector('img.NBa7we');
            review.avatar_url = avatarElem ? avatarElem.src : null;

            // Profile URL
            // NOTE(review): this reads data-review-id, which looks like a
            // review identifier rather than a profile URL - confirm intent.
            const profileElem = elem.querySelector('button.WEBjve');
            review.profile_url = profileElem ? profileElem.getAttribute('data-review-id') : null;

            if (review.author && review.date_text) {
                reviews.push(review);
            }
        } catch (e) {
            // Skip this review
        }
    }
    return reviews;
    """

    # ADDITIONAL: Check for Google's internal state/data objects (debug only)
    check_state_script = """
    // Look for Google Maps' internal data stores
    const debugInfo = {
        global_keys: [],
        app_data: null,
        window_data: null
    };
    // Check window object for Google Maps data
    for (let key in window) {
        if (key.includes('google') || key.includes('maps') || key.includes('APP') || key.includes('_')) {
            debugInfo.global_keys.push(key);
        }
    }
    // Check for common React/Angular state keys
    const stateKeys = ['__INITIAL_STATE__', '__NEXT_DATA__', '__APP_STATE__', 'APP_INITIALIZATION_STATE'];
    for (let key of stateKeys) {
        if (window[key]) {
            debugInfo.app_data = key;
        }
    }
    // Check for embedded JSON in script tags
    const scriptTags = document.querySelectorAll('script[type="application/json"], script[type="application/ld+json"]');
    debugInfo.json_scripts_count = scriptTags.length;
    if (scriptTags.length > 0) {
        debugInfo.json_scripts_sample = Array.from(scriptTags).slice(0, 2).map(s => s.textContent.substring(0, 200));
    }
    return debugInfo;
    """

    try:
        reviews_data = driver.execute_script(extract_script) or []

        # Debug collection is best-effort: a failure here must not throw
        # away the reviews that were just extracted.
        state_debug = None
        try:
            state_debug = driver.execute_script(check_state_script)
            log.info(f"Google Maps state debug: {state_debug}")
        except Exception as debug_error:
            log.warning(f"Could not collect Google Maps state debug: {debug_error}")

        reviews = []
        for i, review_data in enumerate(reviews_data):
            review_data['review_id'] = _stable_review_id(
                review_data.get('author') or '',
                review_data.get('date_text') or '',
            )
            # Add global state debug to first review only
            if i == 0:
                review_data['_google_state_debug'] = state_debug
            reviews.append(review_data)
        return reviews
    except Exception as e:
        log.error(f"Error in JavaScript extraction: {e}")
        return []
# Matches an "hl" query parameter together with its current value.
_HL_PARAM_RE = re.compile(r"([?&]hl=)[^&#]*")


def _force_english_locale(url: str) -> str:
    """Return *url* with its ``hl`` query parameter set (or added) as ``en``."""
    base, hash_sep, fragment = url.partition("#")
    if _HL_PARAM_RE.search(base):
        base = _HL_PARAM_RE.sub(r"\1en", base)
    else:
        separator = '&' if '?' in base else '?'
        base = f"{base}{separator}hl=en"
    return f"{base}{hash_sep}{fragment}"


def _attach_driver(result: Dict[str, Any], driver, return_driver: bool) -> Dict[str, Any]:
    """Add the driver to *result* when the caller asked to get it back."""
    if return_driver:
        result["driver"] = driver
    return result


def _report_progress(progress_callback, current, total) -> None:
    """Invoke the progress callback if usable; callback errors are only logged."""
    if progress_callback and total:
        try:
            progress_callback(current, total)
        except Exception as e:
            log.warning(f"Progress callback failed: {e}")


def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999, progress_callback=None, driver=None, return_driver: bool = False) -> Dict[str, Any]:
    """
    Ultra-fast DOM-only scraping with JavaScript extraction.

    Flow: force the English locale on the URL, load the page, accept
    consent/cookie prompts, open the reviews tab, return early when the
    page reports no reviews, scroll the reviews pane until no new review
    elements appear for 12 consecutive scrolls (or ``max_scrolls`` is
    reached), then extract every review in one JavaScript call.

    Args:
        url: Google Maps URL to scrape
        headless: Run Chrome in headless mode (default: False)
        max_scrolls: Maximum scrolls safety limit (default: 999999 - effectively unlimited)
            The scraper stops automatically via idle detection when no new reviews load.
        progress_callback: Optional callback function(current_count, total_count) for progress updates
        driver: Existing driver instance to reuse (from worker pool)
        return_driver: If True, don't close driver and return it in result

    Returns:
        Dictionary with:
        - reviews: List of review dictionaries
        - count: Total number of reviews scraped
        - total_reviews: Total reviews available (from page counter)
        - time: Time taken in seconds
        - success: True if successful, False otherwise
        - error: Error message if failed
        - message: Explanation (only on the early "no reviews" returns)
        - partial: True when reviews were recovered after a tab crash
        - driver: Driver instance (if return_driver=True, on every return path)

    A driver created here is quit before returning unless
    ``return_driver`` is True; a driver passed in is never quit.
    """
    start_time = time.time()
    log.info(f"Starting fast scrape for URL: {url[:80]}...")

    # Force English locale for consistent date parsing
    # English gives cleaner date formats: "3 months ago" vs "Hace 3 meses"
    url = _force_english_locale(url)
    log.info("Using English locale (hl=en) for consistent date parsing")

    # Track if driver was provided or created
    driver_provided = driver is not None
    should_close_driver = not return_driver and not driver_provided

    # Initialize driver with custom user agent to avoid headless detection
    # Even with headless=False + Xvfb, Chromium still reports as HeadlessChrome
    if not driver:
        driver = Driver(
            uc=True,
            headless=headless,
            page_load_strategy="normal",
            agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )

    try:
        # Navigate
        driver.get(url)
        time.sleep(1.5)

        # Handle GDPR consent page (CRITICAL FIX for headless mode!)
        if 'consent.google.com' in driver.current_url:
            try:
                # Find all form buttons and click "Accept all" / "Aceptar todo"
                form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
                for btn in form_btns:
                    btn_text = (btn.text or '').lower()
                    if 'aceptar todo' in btn_text or 'accept all' in btn_text:
                        log.info(f"Clicking GDPR consent: {btn.text}")
                        btn.click()
                        time.sleep(2)
                        break
                else:
                    # Fallback: click second button (usually "Accept all")
                    if len(form_btns) >= 2:
                        log.info("Using fallback: clicking second form button")
                        form_btns[1].click()
                        time.sleep(2)
            except Exception as e:
                log.warning(f"GDPR consent handling failed: {e}")

        # Dismiss cookie banner on Maps page (best effort)
        try:
            cookie_btns = driver.find_elements(
                By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                cookie_btns[0].click()
                time.sleep(0.3)
        except Exception as e:
            log.debug(f"Cookie banner dismissal skipped: {e}")

        # Click reviews tab with retry logic (important for containers)
        review_keywords = ['reviews', 'review', 'reseñas', 'reseña']
        reviews_tab_clicked = False

        # Try multiple times to find and click reviews tab
        for attempt in range(3):
            if reviews_tab_clicked:
                break
            time.sleep(0.5)  # Wait between attempts
            for selector in ['button[role="tab"]', '.LRkQ2', 'button']:
                try:
                    tabs = driver.find_elements(By.CSS_SELECTOR, selector)
                    for tab in tabs:
                        text = (tab.text or '').lower()
                        aria = (tab.get_attribute('aria-label') or '').lower()
                        if any(kw in text or kw in aria for kw in review_keywords):
                            log.info(f"Clicking reviews tab: {tab.text or aria[:30]}")
                            driver.execute_script("arguments[0].click();", tab)
                            time.sleep(1.5)  # Wait for tab to load
                            reviews_tab_clicked = True
                            break
                    if reviews_tab_clicked:
                        break
                except Exception as e:
                    log.debug(f"Tab search attempt {attempt+1} with {selector}: {e}")
                    continue

        if not reviews_tab_clicked:
            log.warning("Could not find reviews tab, continuing anyway")

        # Wait for reviews section to load
        time.sleep(2)

        # EARLY DETECTION: Check if there are no reviews before attempting to scrape
        no_reviews, reason = check_no_reviews_early(driver)
        if no_reviews:
            log.info(f"Early detection: No reviews available. Reason: {reason}")
            return _attach_driver({
                "reviews": [],
                "count": 0,
                "total_reviews": 0,
                "time": time.time() - start_time,
                "success": True,
                "error": None,
                "message": f"No reviews available: {reason}"
            }, driver, return_driver)

        # Extract total review count from the page
        total_reviews = extract_total_review_count(driver)

        # Double-check: If extracted count is 0, return early
        if total_reviews == 0:
            log.info("Total review count is 0, skipping scraping")
            return _attach_driver({
                "reviews": [],
                "count": 0,
                "total_reviews": 0,
                "time": time.time() - start_time,
                "success": True,
                "error": None,
                "message": "Business has 0 reviews"
            }, driver, return_driver)

        # Report initial progress with total count
        _report_progress(progress_callback, 0, total_reviews)

        # Find scrollable pane - try multiple selectors (container-friendly)
        pane = None
        pane_selectors = [
            'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
            'div.m6QErb.WNBkOb.XiKgde',
            'div.m6QErb',  # Fallback to more general selector
            'div[role="main"]',
        ]
        wait = WebDriverWait(driver, 5)
        for selector in pane_selectors:
            try:
                pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                log.info(f"Found pane with selector: {selector}")
                break
            except TimeoutException:
                continue

        if not pane:
            error_msg = "Could not find scrollable pane after trying all selectors"
            log.error(error_msg)
            return _attach_driver({
                "reviews": [],
                "count": 0,
                "total_reviews": total_reviews,
                "time": time.time() - start_time,
                "success": False,
                "error": error_msg
            }, driver, return_driver)

        # Wait longer for initial reviews to load (containers can be slower)
        time.sleep(2)

        # Setup scroll
        driver.execute_script("window.scrollablePane = arguments[0];", pane)
        scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"

        # Trigger initial scroll and verify reviews are loading
        driver.execute_script(scroll_script)
        time.sleep(0.8)

        # Also scroll the main window (helps in some cases, especially containers)
        driver.execute_script("window.scrollBy(0, 500);")
        time.sleep(0.5)

        # JavaScript function to count reviews using ROBUST structural patterns
        # Instead of relying on CSS classes, we look for containers with review-like structure
        count_reviews_script = """
        // STRATEGY 1: Try known selectors first (fast path)
        const knownSelectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div[data-review-id]',
            'div[jsaction*="review"]'
        ];
        for (let selector of knownSelectors) {
            const found = document.querySelectorAll(selector);
            if (found.length > 0) {
                return found.length;
            }
        }

        // STRATEGY 2: Structural pattern matching (robust, class-agnostic)
        // Find containers that LOOK like reviews (have author + rating + text structure)
        const findReviewsByStructure = () => {
            const allDivs = document.querySelectorAll('div');
            let reviewCount = 0;
            for (let div of allDivs) {
                // Skip if too small (reviews have substantial content)
                if (div.children.length < 2) continue;
                // Look for review indicators:
                // - Has an author photo/avatar
                // - Has a rating (aria-label containing "star" or "rating")
                // - Has review text (span with longer text content)
                // - Has a relative date
                const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
                const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
                const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
                const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i);
                // If it has at least 3 of these indicators, it's likely a review
                const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
                if (indicators >= 3) {
                    reviewCount++;
                }
            }
            return reviewCount > 0 ? reviewCount : 0;
        };

        // STRATEGY 3: Look for role="article" with review-like content
        const articles = document.querySelectorAll('[role="article"]');
        if (articles.length > 0) {
            let validArticles = 0;
            for (let article of articles) {
                // Check if article looks like a review (has rating + text)
                const hasRating = article.querySelector('[aria-label*="star" i]');
                const hasText = article.textContent.length > 30;
                if (hasRating && hasText) {
                    validArticles++;
                }
            }
            if (validArticles > 0) return validArticles;
        }

        // Try structural matching as last resort
        const structuralCount = findReviewsByStructure();
        return structuralCount;
        """

        # Check if reviews are actually loading
        initial_count = driver.execute_script(count_reviews_script)
        if initial_count < 5:
            # Reviews not loaded yet, wait more and try alternative scrolling
            log.info(f"Waiting for reviews to load (found {initial_count})...")
            # Try clicking on the pane to focus it (best effort)
            try:
                driver.execute_script("arguments[0].click();", pane)
                time.sleep(0.5)
            except Exception as e:
                log.debug(f"Could not focus pane: {e}")
            # Scroll both pane and window
            driver.execute_script(scroll_script)
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(1.5)
            initial_count = driver.execute_script(count_reviews_script)
            log.info(f"After extra waiting: {initial_count} reviews")

        log.info(f"Scrolling to load all reviews (starting with {initial_count})...")

        # Fast scrolling to load all DOM elements.
        # current_count starts at the initial count so it is defined even
        # when max_scrolls is 0 and the loop body never runs.
        current_count = initial_count
        idle_count = 0
        for i in range(max_scrolls):
            # Scroll to load more
            prev_count = driver.execute_script(count_reviews_script)
            driver.execute_script(scroll_script)

            # SMART WAIT: Wait until new reviews actually load
            max_wait = 1.2
            wait_step = 0.05
            waited = 0
            while waited < max_wait:
                time.sleep(wait_step)
                waited += wait_step
                current_count = driver.execute_script(count_reviews_script)
                # If reviews loaded, continue immediately!
                if current_count > prev_count:
                    idle_count = 0  # Reset idle counter
                    break
                # Give Google Maps more time to lazy-load (0.6s instead of 0.3s)
                # Only exit early if we're confident nothing is loading
                if waited >= 0.6 and current_count == prev_count:
                    break

            # Track consecutive idle scrolls
            if current_count == prev_count:
                idle_count += 1
                # Be VERY patient: require 12 consecutive idle scrolls before
                # concluding that everything has been loaded
                if idle_count >= 12:
                    log.info(f"Reached end at {current_count} reviews (12 consecutive idle scrolls)")
                    # Double-check we got all reviews if we know the total
                    if total_reviews and current_count < total_reviews:
                        log.warning(f"Only got {current_count}/{total_reviews} reviews ({current_count/total_reviews*100:.1f}%). Some may be hidden or loading slowly.")
                    break

            # Progress logging and callback every 5 scrolls
            if (i + 1) % 5 == 0:
                log.info(f"{current_count} review elements loaded...")
                _report_progress(progress_callback, current_count, total_reviews)

            # Aggressive memory management every 20 scrolls
            if (i + 1) % 20 == 0:
                try:
                    # Clear console logs to prevent buildup
                    driver.execute_script("console.clear();")
                    # Force garbage collection in browser
                    # NOTE(review): removing img src also blanks avatars of
                    # already-loaded reviews, so their avatar_url may come
                    # back empty at extraction time - confirm acceptable.
                    driver.execute_script("""
                    if (window.gc) { window.gc(); }
                    // Remove image srcs to free memory (images reload on demand)
                    document.querySelectorAll('img').forEach(img => {
                        if (img.complete && !img.classList.contains('needed')) {
                            img.removeAttribute('src');
                        }
                    });
                    """)
                    # Brief pause to let Chrome breathe
                    time.sleep(0.1)
                except Exception:
                    pass  # Cleanup is optional; ignore if it fails

        # Shorter final scroll
        for _ in range(2):
            driver.execute_script(scroll_script)
            time.sleep(0.3)

        scroll_time = time.time() - start_time
        log.info(f"Scrolling complete in {scroll_time:.2f}s")

        # Update progress: scrolling done, starting extraction
        _report_progress(progress_callback, current_count, total_reviews)

        # Extract ALL reviews using JavaScript (fast!)
        log.info("Extracting reviews with JavaScript...")
        extract_start = time.time()
        all_reviews = extract_all_reviews_js(driver)
        extract_time = time.time() - extract_start
        log.info(f"Extraction complete in {extract_time:.2f}s")

        # Final progress update with actual extracted count
        _report_progress(progress_callback, len(all_reviews), total_reviews)

        elapsed = time.time() - start_time
        log.info(f"Fast scrape completed: {len(all_reviews)} reviews in {elapsed:.2f}s")

        return _attach_driver({
            "reviews": all_reviews,
            "count": len(all_reviews),
            "total_reviews": total_reviews,
            "time": elapsed,
            "scroll_time": scroll_time,
            "extract_time": extract_time,
            "success": True,
            "error": None
        }, driver, return_driver)

    except Exception as e:
        elapsed = time.time() - start_time
        error_msg = f"Fast scrape failed: {str(e)}"
        log.error(error_msg)

        # Check if this is a tab crash - try to extract what we have
        partial_reviews = []
        is_tab_crash = "tab crashed" in str(e).lower() or "session deleted" in str(e).lower()
        if is_tab_crash and driver:
            log.warning("Detected tab crash - attempting to extract partial reviews from DOM before crash...")
            try:
                # Try to extract reviews that were loaded before crash
                partial_reviews = extract_all_reviews_js(driver)
                log.info(f"Recovered {len(partial_reviews)} reviews from crashed session")
            except Exception as recovery_error:
                log.error(f"Could not recover reviews: {recovery_error}")

        # Return partial results if we got any
        if partial_reviews:
            result = {
                "reviews": partial_reviews,
                "count": len(partial_reviews),
                "total_reviews": None,
                "time": elapsed,
                "success": False,  # Mark as failed but with partial data
                "error": f"{error_msg} (recovered {len(partial_reviews)} reviews)",
                "partial": True
            }
        else:
            result = {
                "reviews": [],
                "count": 0,
                "total_reviews": None,
                "time": elapsed,
                "success": False,
                "error": error_msg
            }
        return _attach_driver(result, driver, return_driver)

    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception as e:
                log.debug(f"Driver quit failed: {e}")
def get_business_card_info(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]:
    """
    Extract business card information from a Google Maps page.

    The page is loaded with the UI language forced to English (``hl=en``),
    a GDPR consent page and cookie banner are dismissed on a best-effort
    basis, and a JavaScript snippet then reads the card fields from the DOM.

    Args:
        url: Google Maps URL of the business.
        headless: Run the browser headless when a new driver has to be created.
        driver: Existing driver instance to reuse; a driver passed in here is
            never closed by this function.
        return_driver: If True, don't close the driver and return it in the result.

    Returns:
        Dict containing:
        - name: business name, or None
        - address: address text, or None
        - rating: number parsed from the rating aria-label, or None
        - total_reviews: int, 0 when no count was found
        - has_reviews: bool, True when total_reviews > 0
        - success: bool
        - error: error message on failure, otherwise None
        - driver: the driver instance (only if return_driver=True)

    Errors raised while loading or parsing are caught and reported through
    ``success``/``error`` instead of propagating.
    """
    import time as timing_module
    start_time = timing_module.time()
    log.info(f"[PROFILE] Getting business card info for: {url}")
    driver_provided = driver is not None
    # Only close a driver that was created here and is not handed back to the caller.
    should_close_driver = not return_driver and not driver_provided
    try:
        # Initialize driver if not provided
        t0 = timing_module.time()
        if not driver:
            driver = Driver(
                uc=True,
                headless=headless,
                page_load_strategy="normal",
                agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s")
        else:
            log.info(f"[PROFILE] Using pooled driver (0.00s)")

        # Force English locale for consistent parsing: the extraction script
        # below matches English labels such as "star" and "Address".
        url = _force_english_locale(url)

        log.info(f"Loading Google Maps page...")
        t0 = timing_module.time()
        driver.get(url)
        log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s")
        t0 = timing_module.time()
        time.sleep(0.5)  # Initial wait - reduced from 2s
        log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s")

        # Handle GDPR consent page
        t0 = timing_module.time()
        if 'consent.google.com' in driver.current_url:
            log.info("Detected GDPR consent page, accepting...")
            try:
                form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
                for btn in form_btns:
                    btn_text = (btn.text or '').lower()
                    if 'aceptar todo' in btn_text or 'accept all' in btn_text:
                        log.info(f"Clicking GDPR consent: {btn.text}")
                        btn.click()
                        time.sleep(1)  # Reduced from 2s
                        break
                else:
                    # No button matched by text: fall back to the second form button.
                    if len(form_btns) >= 2:
                        log.info("Using fallback: clicking second form button")
                        form_btns[1].click()
                        time.sleep(1)  # Reduced from 2s
            except Exception as e:
                log.warning(f"GDPR consent handling failed: {e}")
            log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
        else:
            log.info(f"[PROFILE] No GDPR consent page (0.00s)")

        # Dismiss cookie banner (best effort). The timer is restarted so the
        # profile line below measures this step only.
        t0 = timing_module.time()
        try:
            cookie_btns = driver.find_elements(By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                log.info("Dismissing cookie banner...")
                cookie_btns[0].click()
                time.sleep(0.3)  # Reduced from 0.5s
        except Exception as e:
            log.debug(f"Cookie banner dismissal skipped: {e}")
        log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s")

        # Wait for page to load - use smart waits
        t0 = timing_module.time()
        try:
            log.info("Waiting for Google Maps content to load...")
            wait = WebDriverWait(driver, 10)
            wait.until(
                lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
            )
            log.info("Google Maps content loaded successfully")
        except Exception as e:
            log.warning(f"Timeout waiting for Maps content: {e}")
            time.sleep(0.5)  # Minimal fallback wait
        log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")

        # Extract business card information using JavaScript.
        # The script text is left flush-left; JavaScript ignores leading whitespace.
        t0 = timing_module.time()
        extract_script = """
const info = {
name: null,
address: null,
rating: null,
total_reviews: null
};
// Extract business name
const nameSelectors = [
'h1.DUwDvf',
'[role="main"] h1',
'h1.fontHeadlineLarge'
];
for (const selector of nameSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.name = elem.textContent.trim();
break;
}
}
// Extract address
const addressSelectors = [
'button[data-item-id*="address"]',
'[data-item-id*="address"]',
'div[aria-label*="Address"]'
];
for (const selector of addressSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.address = elem.textContent.trim();
break;
}
}
// Extract rating (look for aria-label like "4.2 stars")
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
if (match) {
info.rating = parseFloat(match[1]);
}
}
// Extract total review count
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
// PRIORITY 1: Look for review count in search results sidebar/panel
// This is where "152 reviews" appears on search results
const searchPanelSelectors = [
'a[href*="reviews"]', // Link with "reviews" in href
'button[jsaction*="reviews"]', // Button related to reviews
'div[role="link"]', // Clickable divs that might contain review info
];
for (const selector of searchPanelSelectors) {
const elements = document.querySelectorAll(selector);
for (let elem of elements) {
const text = elem.textContent || '';
const match = text.match(numberPattern);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
break;
}
}
}
if (info.total_reviews) break;
}
// PRIORITY 2: Look in any span/div that contains the word "review"
if (!info.total_reviews) {
const allElements = document.querySelectorAll('span, div, a');
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.length < 100) { // Skip very long text blocks
const match = text.match(numberPattern);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
break;
}
}
}
}
}
// PRIORITY 3: Try tabs (for business detail pages)
if (!info.total_reviews) {
const tabs = document.querySelectorAll('button[role="tab"]');
for (let tab of tabs) {
const text = tab.textContent || '';
let match = text.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
match = text.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
}
}
// PRIORITY 4: Try aria-labels
if (!info.total_reviews) {
const elements = document.querySelectorAll('[aria-label]');
for (let elem of elements) {
const ariaLabel = elem.getAttribute('aria-label') || '';
let match = ariaLabel.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
match = ariaLabel.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
}
}
return info;
"""
        business_info = driver.execute_script(extract_script)
        log.info(f"[PROFILE] Business card extraction: {timing_module.time() - t0:.2f}s")
        total_time = timing_module.time() - start_time
        log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
        log.info(f"Business card extracted: name={business_info.get('name')}, "
                 f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
        result = {
            "name": business_info.get('name'),
            "address": business_info.get('address'),
            "rating": business_info.get('rating'),
            # A missing count is reported as 0 rather than None.
            "total_reviews": business_info.get('total_reviews') or 0,
            "has_reviews": (business_info.get('total_reviews') or 0) > 0,
            "success": True,
            "error": None
        }
        if return_driver:
            result["driver"] = driver
        return result
    except Exception as e:
        total_time = timing_module.time() - start_time
        error_msg = f"Failed to get business card info: {str(e)}"
        log.error(error_msg)
        log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME (FAILED): {total_time:.2f}s ***")
        result = {
            "name": None,
            "address": None,
            "rating": None,
            "total_reviews": 0,
            "has_reviews": False,
            "success": False,
            "error": error_msg
        }
        if return_driver:
            result["driver"] = driver
        return result
    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception as e:
                # Closing is best effort; the result has already been built.
                log.debug(f"Ignoring error while closing driver: {e}")


def _force_english_locale(url: str) -> str:
    """Return *url* with its ``hl`` query parameter set to ``en``.

    An existing ``hl=<value>`` parameter is rewritten whatever its value;
    otherwise ``hl=en`` is appended to the query string, ahead of any
    ``#fragment`` so the parameter is still sent as part of the query.
    """
    import re
    base, hash_sep, fragment = url.partition('#')
    base, replaced = re.subn(r'([?&])hl=[^&]*', r'\1hl=en', base)
    if not replaced:
        separator = '&' if '?' in base else '?'
        base = f"{base}{separator}hl=en"
    return f"{base}{hash_sep}{fragment}"
def check_reviews_available(url: str, headless: bool = True, driver=None, return_driver: bool = False) -> Dict[str, Any]:
    """
    Lightweight check to see if a business has reviews available.

    This function just loads the page and checks for review count without
    doing the full scraping. Used to enable/disable scrape button in UI.

    Args:
        url: Google Maps URL to check
        headless: Run in headless mode (default True)
        driver: Existing driver instance to reuse (from worker pool); a driver
            passed in here is never closed by this function
        return_driver: If True, don't close driver and return it in result

    Returns:
        Dict containing:
        - has_reviews: bool - whether reviews exist
        - review_count: int - number of reviews (0 if none or not found)
        - business_name: str - name of business (if found)
        - success: bool - whether check succeeded
        - error: str - error message if the check failed, or a note when the
          page loaded but no review count could be found (success stays True)
        - driver: Driver instance (if return_driver=True)

    Errors raised while loading or parsing are caught and reported through
    ``success``/``error`` instead of propagating.
    """
    import time as timing_module
    start_time = timing_module.time()
    log.info(f"[PROFILE] Starting validation for: {url}")
    driver_provided = driver is not None
    # Only close a driver that was created here and is not handed back to the caller.
    should_close_driver = not return_driver and not driver_provided
    try:
        # Initialize driver if not provided
        t0 = timing_module.time()
        if not driver:
            driver = Driver(uc=True, headless=headless)
            driver.maximize_window()
            log.info(f"[PROFILE] Driver initialization: {timing_module.time() - t0:.2f}s")
        else:
            log.info(f"[PROFILE] Using pooled driver (0.00s)")

        # Navigate to the URL
        t0 = timing_module.time()
        log.info(f"Loading Google Maps page...")
        driver.get(url)
        log.info(f"[PROFILE] Page load (driver.get): {timing_module.time() - t0:.2f}s")
        t0 = timing_module.time()
        time.sleep(0.5)  # Initial wait - reduced from 2s
        log.info(f"[PROFILE] Initial sleep: {timing_module.time() - t0:.2f}s")

        # Handle GDPR consent page (CRITICAL for validation to work!)
        t0 = timing_module.time()
        if 'consent.google.com' in driver.current_url:
            log.info("Detected GDPR consent page, accepting...")
            try:
                # Find all form buttons and click "Accept all" / "Aceptar todo"
                form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
                for btn in form_btns:
                    btn_text = (btn.text or '').lower()
                    if 'aceptar todo' in btn_text or 'accept all' in btn_text:
                        log.info(f"Clicking GDPR consent: {btn.text}")
                        btn.click()
                        time.sleep(1)  # Reduced from 2s
                        break
                else:
                    # Fallback: click second button (usually "Accept all")
                    if len(form_btns) >= 2:
                        log.info("Using fallback: clicking second form button")
                        form_btns[1].click()
                        time.sleep(1)  # Reduced from 2s
            except Exception as e:
                log.warning(f"GDPR consent handling failed: {e}")
            log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
        else:
            log.info(f"[PROFILE] No GDPR consent page (0.00s)")

        # Dismiss cookie banner on Maps page (best effort)
        t0 = timing_module.time()
        try:
            cookie_btns = driver.find_elements(By.CSS_SELECTOR,
                'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]')
            if cookie_btns:
                log.info("Dismissing cookie banner...")
                cookie_btns[0].click()
                time.sleep(0.3)  # Reduced from 0.5s
        except Exception as e:
            log.debug(f"Cookie banner dismissal skipped: {e}")
        log.info(f"[PROFILE] Cookie banner dismissal: {timing_module.time() - t0:.2f}s")

        # Wait for page to fully load after consent - use smart waits
        t0 = timing_module.time()
        try:
            # Wait for either business card OR search results to appear
            log.info("Waiting for Google Maps content to load...")
            wait = WebDriverWait(driver, 10)
            wait.until(
                lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
            )
            log.info("Google Maps content loaded successfully")
        except Exception as e:
            log.warning(f"Timeout waiting for Maps content: {e}")
            time.sleep(0.5)  # Minimal fallback wait
        log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")

        # Try to extract business name; a failure here is not fatal.
        # The script text is left flush-left; JavaScript ignores leading whitespace.
        t0 = timing_module.time()
        business_name = None
        try:
            business_name_script = """
// Try to find business name from various locations
let name = null;
// Method 1: Look for business name in the main panel (most reliable)
// This is where the actual business info appears
const businessPanelSelectors = [
'h1.DUwDvf', // Main business name heading
'[role="main"] h1', // H1 in main content
'h1.fontHeadlineLarge', // Large headline font
'button[jsaction*="pane.header.rating"] h1', // Near rating button
];
for (const selector of businessPanelSelectors) {
const element = document.querySelector(selector);
if (element && element.textContent) {
const text = element.textContent.trim();
// Filter out Google's placeholder/suggestion text
if (text &&
!text.toLowerCase().includes('antes de ir') &&
!text.toLowerCase().includes('before going') &&
!text.toLowerCase().includes('google maps') &&
text.length < 100) { // Business names shouldn't be super long
name = text;
break;
}
}
}
// Method 2: h1 tag (fallback)
if (!name) {
const h1 = document.querySelector('h1');
if (h1 && h1.textContent) {
const text = h1.textContent.trim();
if (!text.toLowerCase().includes('antes de ir') &&
!text.toLowerCase().includes('before going')) {
name = text;
}
}
}
// Method 3: Title attribute (last resort)
if (!name) {
const title = document.title;
if (title && !title.includes('Google Maps')) {
name = title.split('-')[0].trim();
}
}
return name;
"""
            business_name = driver.execute_script(business_name_script)
            if business_name:
                log.info(f"Found business name: {business_name}")
        except Exception as e:
            log.debug(f"Could not extract business name: {e}")
        log.info(f"[PROFILE] Business name extraction: {timing_module.time() - t0:.2f}s")

        # Extract total review count
        t0 = timing_module.time()
        review_count = extract_total_review_count(driver)
        log.info(f"[PROFILE] Review count extraction: {timing_module.time() - t0:.2f}s")

        if review_count is None:
            # The page loaded but no count was found: still a successful
            # check, reported as "no reviews" with an explanatory note.
            log.warning("Could not determine review count")
            has_reviews = False
            reported_count = 0
            note = "Could not find review count on page"
        else:
            log.info(f"Found {review_count} reviews available")
            has_reviews = review_count > 0
            reported_count = review_count
            note = None

        total_time = timing_module.time() - start_time
        log.info(f"[PROFILE] *** TOTAL VALIDATION TIME: {total_time:.2f}s ***")
        result = {
            "has_reviews": has_reviews,
            "review_count": reported_count,
            "business_name": business_name,
            "success": True,
            "error": note
        }
        if return_driver:
            result["driver"] = driver
        return result
    except Exception as e:
        total_time = timing_module.time() - start_time
        error_msg = f"Failed to check reviews: {str(e)}"
        log.error(error_msg)
        log.info(f"[PROFILE] *** TOTAL VALIDATION TIME (FAILED): {total_time:.2f}s ***")
        result = {
            "has_reviews": False,
            "review_count": 0,
            "business_name": None,
            "success": False,
            "error": error_msg
        }
        if return_driver:
            result["driver"] = driver
        return result
    finally:
        if should_close_driver and driver:
            try:
                driver.quit()
            except Exception as e:
                # Closing is best effort; the result has already been built.
                log.debug(f"Ignoring error while closing driver: {e}")