Add browser fingerprint support and analytics metadata display
- Transfer user's browser fingerprint (user-agent, viewport, timezone, language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
|
||||
|
||||
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
|
||||
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
|
||||
progress_callback=None) -> dict:
|
||||
progress_callback=None, validation_only: bool = False) -> dict:
|
||||
"""
|
||||
Scrape Google Maps reviews.
|
||||
|
||||
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Track total reviews (persists across refreshes)
|
||||
total_reviews = [None] # Use list for closure mutation
|
||||
|
||||
# Store business info extracted from overview (before clicking reviews tab)
|
||||
business_info_cache = [None]
|
||||
|
||||
# Hard refresh counter
|
||||
hard_refresh_count = [0]
|
||||
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
|
||||
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
pass
|
||||
return None
|
||||
|
||||
def setup_reviews_page(is_refresh=False):
|
||||
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
|
||||
"""
|
||||
Setup the reviews page for scraping.
|
||||
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
|
||||
Can be called after initial load or after a hard refresh.
|
||||
|
||||
If validation_only_mode=True, returns early after extracting business info
|
||||
without clicking reviews tab or finding scroll container.
|
||||
"""
|
||||
nonlocal total_reviews
|
||||
|
||||
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
# Navigate to URL (only on initial load or refresh)
|
||||
if not is_refresh:
|
||||
# Reset browser state by navigating to blank page first
|
||||
# This clears any stale state from pooled browser sessions
|
||||
try:
|
||||
driver.get("about:blank")
|
||||
time.sleep(0.1)
|
||||
except:
|
||||
pass
|
||||
log.info(f"🌐 Loading: {url[:80]}...")
|
||||
else:
|
||||
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
|
||||
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
# Reload original URL after consent
|
||||
log.info(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
# Wait for page to settle after consent reload
|
||||
time.sleep(1)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
|
||||
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
|
||||
# This captures name, rating, category, address while they're visible
|
||||
# Only on first load (don't overwrite if we already have it)
|
||||
if total_reviews[0] is None:
|
||||
if total_reviews[0] is None or business_info_cache[0] is None:
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
try:
|
||||
count = driver.execute_script("""
|
||||
var reviewSpans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < reviewSpans.length; i++) {
|
||||
var label = reviewSpans[i].getAttribute('aria-label') || '';
|
||||
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,\\.]/g, ''));
|
||||
info = driver.execute_script("""
|
||||
var result = {
|
||||
total_reviews: null,
|
||||
name: null,
|
||||
rating: null,
|
||||
category: null,
|
||||
address: null
|
||||
};
|
||||
|
||||
// Business name from h1
|
||||
var h1 = document.querySelector('h1');
|
||||
if (h1) result.name = h1.textContent.trim();
|
||||
|
||||
// Category - use jsaction attribute (robust selector)
|
||||
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||
if (catBtn) result.category = catBtn.textContent.trim();
|
||||
|
||||
// Rating and review count from span[role="img"] aria-labels
|
||||
var spans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < spans.length; i++) {
|
||||
var label = spans[i].getAttribute('aria-label') || '';
|
||||
|
||||
// Rating: "4.8 stars"
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||
if (rMatch && !result.rating) {
|
||||
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||
}
|
||||
|
||||
// Reviews: "79 reviews"
|
||||
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
|
||||
if (revMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
||||
// Address from button
|
||||
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||
if (addrBtn) {
|
||||
var label = addrBtn.getAttribute('aria-label');
|
||||
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||
}
|
||||
|
||||
return result;
|
||||
""")
|
||||
if count:
|
||||
total_reviews[0] = count
|
||||
log.info(f"📊 Total reviews on page: {count}")
|
||||
break
|
||||
|
||||
if info:
|
||||
if info.get('total_reviews') and total_reviews[0] is None:
|
||||
total_reviews[0] = info['total_reviews']
|
||||
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
|
||||
if info.get('name') and business_info_cache[0] is None:
|
||||
business_info_cache[0] = info
|
||||
log.info(f"📍 Business: {info.get('name')}")
|
||||
if total_reviews[0] and business_info_cache[0]:
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1)
|
||||
|
||||
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
|
||||
if validation_only_mode:
|
||||
log.info("📋 Validation mode: returning early (skipping reviews tab)")
|
||||
return ("validation_done", None)
|
||||
|
||||
# Click reviews tab - poll until found
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
start = time.time()
|
||||
tab_clicked = False
|
||||
tabs_logged = False
|
||||
while time.time() - start < 5: # Max 5s for tabs
|
||||
try:
|
||||
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
|
||||
# Log available tabs once for debugging
|
||||
if not tabs_logged and tabs:
|
||||
tabs_logged = True
|
||||
tab_texts = [t.text for t in tabs]
|
||||
log.info(f" Available tabs: {tab_texts}")
|
||||
for tab in tabs:
|
||||
tab_text = tab.text.lower()
|
||||
if any(kw in tab_text for kw in review_keywords):
|
||||
if not is_refresh:
|
||||
log.info(f" Clicking reviews tab: '{tab.text}'")
|
||||
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
|
||||
if total_reviews[0] is None:
|
||||
import re
|
||||
# Try pattern with parentheses: "Reviews (79)"
|
||||
match = re.search(r'\((\d+)\)', tab.text)
|
||||
if match:
|
||||
total_reviews[0] = int(match.group(1))
|
||||
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||
else:
|
||||
# Try pattern with newline: "Reviews\n79"
|
||||
match = re.search(r'(\d+)', tab.text)
|
||||
if match:
|
||||
total_reviews[0] = int(match.group(1))
|
||||
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
|
||||
tab.click()
|
||||
tab_clicked = True
|
||||
break
|
||||
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
|
||||
return scroll_container, stop_scrolling
|
||||
|
||||
# Initial page setup
|
||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
|
||||
# Helper to extract review topics from the reviews tab
|
||||
def extract_review_topics():
    """Extract review topic filters from the page's radiogroup.

    Runs a JavaScript snippet through ``driver.execute_script`` (``driver`` is
    taken from the enclosing scope). The script looks for a
    ``div[role="radiogroup"]`` whose aria-label contains "Refine"/"refine",
    falling back to the first radiogroup on the page, and then reads every
    ``button[role="radio"]`` inside it:

    * aria-labels of the form "<topic>, mentioned in <N> ..." are parsed
      directly into a topic name and count;
    * otherwise (and unless the label contains "all review") the name and
      count are read from child spans, with the count defaulting to 0.

    Returns:
        A list of ``{"topic": str, "count": int}`` dicts. An empty list is
        returned when the script yields nothing or when it raises.
    """
    try:
        topics = driver.execute_script("""
            var topics = [];

            // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
            var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');

            if (!container) {
                // Fallback: any radiogroup in the reviews area
                container = document.querySelector('div[role="radiogroup"]');
            }

            if (container) {
                var buttons = container.querySelectorAll('button[role="radio"]');
                for (var btn of buttons) {
                    var label = btn.getAttribute('aria-label') || '';
                    // Parse "hair salon, mentioned in 4 reviews" format
                    var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
                    if (match) {
                        topics.push({
                            topic: match[1].trim(),
                            count: parseInt(match[2])
                        });
                    } else if (label && !label.toLowerCase().includes('all review')) {
                        // Fallback: try to extract from child spans
                        var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                        var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                        if (nameSpan) {
                            var name = nameSpan.textContent.trim();
                            var count = countSpan ? parseInt(countSpan.textContent) : 0;
                            if (name && name.toLowerCase() !== 'all') {
                                topics.push({topic: name, count: count || 0});
                            }
                        }
                    }
                }
            }

            return topics;
        """)
        return topics or []
    except Exception:
        # Topic extraction is best-effort metadata: a script/driver failure
        # must not abort the scrape. Catch Exception (not a bare except) so
        # KeyboardInterrupt and SystemExit still propagate.
        return []
|
||||
|
||||
# Initial page setup (pass validation_only to skip unnecessary steps)
|
||||
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
|
||||
|
||||
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
|
||||
# setup_reviews_page returns ("validation_done", None) in this case
|
||||
if validation_only or scroll_container == "validation_done":
|
||||
# Use the business info captured from Overview (before clicking reviews tab)
|
||||
business_info = business_info_cache[0] or {}
|
||||
|
||||
return {
|
||||
"reviews": [],
|
||||
"total": total_reviews[0] or 0,
|
||||
"scrolls": 0,
|
||||
"error": None,
|
||||
"validation_info": {
|
||||
"name": business_info.get("name"),
|
||||
"rating": business_info.get("rating"),
|
||||
"category": business_info.get("category"),
|
||||
"address": business_info.get("address"),
|
||||
"total_reviews": total_reviews[0]
|
||||
}
|
||||
}
|
||||
|
||||
if not scroll_container:
|
||||
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
|
||||
|
||||
# Extract review topics after reviews tab is loaded (before scrolling begins)
|
||||
time.sleep(0.5) # Brief wait for topic filters to render
|
||||
review_topics = extract_review_topics()
|
||||
if review_topics:
|
||||
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
|
||||
|
||||
def get_api_reviews():
|
||||
"""Get reviews from intercepted API responses."""
|
||||
api_revs = []
|
||||
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
"total_flushed": total_flushed[0],
|
||||
"checks": check_num,
|
||||
"url": url,
|
||||
"logs": log.get_logs()
|
||||
"logs": log.get_logs(),
|
||||
"review_topics": review_topics # Topic filters with mention counts
|
||||
}
|
||||
|
||||
|
||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||
progress_callback=None, driver=None, return_driver: bool = False,
|
||||
log_capture: LogCapture = None):
|
||||
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
||||
browser_fingerprint: dict = None):
|
||||
"""
|
||||
Production-compatible wrapper for scrape_reviews.
|
||||
Matches the API expected by job_manager.py.
|
||||
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
driver: Existing driver instance to reuse
|
||||
return_driver: If True, return driver in result
|
||||
log_capture: Optional LogCapture instance for real-time log access
|
||||
browser_fingerprint: Optional dict with user's browser fingerprint:
|
||||
- geolocation: {lat, lng}
|
||||
- userAgent: string
|
||||
- viewport: {width, height}
|
||||
- timezone: string (e.g., "Europe/Madrid")
|
||||
- language: string (e.g., "en-US")
|
||||
- platform: string (e.g., "MacIntel")
|
||||
|
||||
Returns:
|
||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
log_capture = log_capture or LogCapture()
|
||||
|
||||
try:
|
||||
# Extract fingerprint settings
|
||||
fp = browser_fingerprint or {}
|
||||
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
|
||||
geolocation = fp.get('geolocation')
|
||||
timezone = fp.get('timezone')
|
||||
language = fp.get('language', 'en-US')
|
||||
|
||||
# Create driver if not provided
|
||||
if not driver:
|
||||
driver = Driver(
|
||||
uc=True,
|
||||
headless=headless,
|
||||
page_load_strategy="normal",
|
||||
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
agent=user_agent # Use user's actual user agent
|
||||
)
|
||||
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
|
||||
# Set viewport to match user's screen
|
||||
driver.set_window_size(viewport['width'], viewport['height'])
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
# Apply browser fingerprint settings via CDP
|
||||
try:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log_capture.info("Set geolocation to US (Boston, MA)")
|
||||
# Set timezone if provided
|
||||
if timezone:
|
||||
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
|
||||
log_capture.info(f"Set timezone to {timezone}")
|
||||
|
||||
# Set locale/language
|
||||
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
|
||||
|
||||
# Set geolocation
|
||||
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': geolocation['lat'],
|
||||
'longitude': geolocation['lng'],
|
||||
'accuracy': 1000 # ~1km accuracy for IP-based location
|
||||
})
|
||||
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
|
||||
else:
|
||||
# Default to US (Boston, MA) if no geolocation provided
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log_capture.info("Set geolocation to US (Boston, MA) [default]")
|
||||
|
||||
if fp:
|
||||
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
|
||||
except Exception as e:
|
||||
log_capture.warning(f"Could not set geolocation: {e}")
|
||||
log_capture.warning(f"Could not apply fingerprint settings: {e}")
|
||||
|
||||
# Add URL parameters for consistent results
|
||||
if 'hl=' not in url:
|
||||
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Create progress wrapper if callback provided
|
||||
flush_callback = None
|
||||
if progress_callback:
|
||||
# Create combined flush callback for progress + external handler
|
||||
external_flush = flush_callback # Save external callback
|
||||
internal_flush = None
|
||||
if progress_callback or external_flush:
|
||||
collected = [0]
|
||||
def flush_with_progress(reviews_batch):
|
||||
collected[0] += len(reviews_batch)
|
||||
progress_callback(collected[0], None)
|
||||
flush_callback = flush_with_progress
|
||||
def combined_flush(reviews_batch):
|
||||
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
|
||||
if progress_callback:
|
||||
progress_callback(collected[0], None)
|
||||
if external_flush:
|
||||
external_flush(reviews_batch) # Pass reviews to external handler
|
||||
internal_flush = combined_flush
|
||||
|
||||
# Run the scraper with progress callback for real-time updates
|
||||
result = scrape_reviews(
|
||||
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
url=url,
|
||||
max_reviews=999999, # Effectively unlimited
|
||||
timeout_no_new=15,
|
||||
flush_callback=flush_callback,
|
||||
flush_callback=internal_flush,
|
||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||
log_capture=log_capture,
|
||||
progress_callback=progress_callback # Pass through for real-time log updates
|
||||
progress_callback=progress_callback, # Pass through for real-time log updates
|
||||
validation_only=validation_only # Return early if just validating
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"time": elapsed,
|
||||
"success": True,
|
||||
"error": None,
|
||||
"logs": result.get("logs", [])
|
||||
"logs": result.get("logs", []),
|
||||
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
|
||||
}
|
||||
|
||||
# Include validation_info if in validation_only mode
|
||||
if validation_only and "validation_info" in result:
|
||||
response["validation_info"] = result["validation_info"]
|
||||
|
||||
if return_driver:
|
||||
response["driver"] = driver
|
||||
elif should_close_driver:
|
||||
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
}
|
||||
|
||||
|
||||
def extract_about_info(driver, url: str = None, *, load_wait: float = 1.0,
                       tab_wait: float = 1.5) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).

    This function should be called AFTER reviews are scraped if about info is needed,
    as it navigates to a different tab.

    Args:
        driver: Selenium WebDriver instance (already on the business page).
            Only ``get`` and ``execute_script`` are used here.
        url: Optional URL to navigate to first (if not already on the page).
            ``hl=en`` and ``gl=us`` are appended when the substrings ``hl=`` /
            ``gl=`` are not already present in the URL.
        load_wait: Seconds to sleep after navigating to ``url``.
        tab_wait: Seconds to sleep after clicking the About tab, before the
            sections are read.

    Returns:
        dict with section names (h2 text) as keys, each containing a list of
        ``{"feature": str, "available": bool}`` items. Returns ``{}`` when the
        About tab could not be clicked or nothing was extracted, and
        ``{"error": "<message>"}`` when any exception is raised.
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English / US results. The separator is recomputed for each
            # parameter so a URL without a query string never ends up with a
            # dangling "&gl=us".
            for key, value in (("hl", "en"), ("gl", "us")):
                if f"{key}=" not in url:
                    separator = '&' if '?' in url else '?'
                    url = f"{url}{separator}{key}={value}"
            driver.get(url)
            time.sleep(load_wait)

        # Click About tab using robust selectors
        clicked = driver.execute_script("""
            // Try multiple selectors for about tab
            var selectors = [
                'button[aria-label*="About"]',
                'button[data-tab-index="2"]',
                'div[role="tablist"] button:nth-child(3)',
                'button[jsaction*="about"]'
            ];

            for (var sel of selectors) {
                var btn = document.querySelector(sel);
                if (btn && btn.textContent.toLowerCase().includes('about')) {
                    btn.click();
                    return true;
                }
            }

            // Fallback: find by text content
            var buttons = document.querySelectorAll('button');
            for (var btn of buttons) {
                if (btn.textContent.trim().toLowerCase() === 'about') {
                    btn.click();
                    return true;
                }
            }
            return false;
        """)

        if not clicked:
            return {}

        time.sleep(tab_wait)  # Wait for about tab to load

        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
            var about = {};

            // Find the about region by aria-label or role
            var container = document.querySelector('div[role="region"][aria-label*="About"]');

            if (!container) {
                // Fallback: look for the scrollable area with sections
                container = document.querySelector('.m6QErb[aria-label*="About"]');
            }

            if (!container) {
                // Last resort: find sections by h2 headers
                container = document;
            }

            // Find all section headers (h2 elements)
            var sections = container.querySelectorAll('h2');

            for (var h2 of sections) {
                var sectionName = h2.textContent.trim();
                var items = [];

                // Find the ul list following this h2
                var parent = h2.closest('.iP2t7d, div');
                if (parent) {
                    var listItems = parent.querySelectorAll('li span[aria-label]');
                    for (var li of listItems) {
                        var label = li.getAttribute('aria-label');
                        if (label) {
                            // Parse "Has toilet" or "No wheelchair-accessible car park"
                            var hasFeature = !label.toLowerCase().startsWith('no ');
                            var featureName = label.replace(/^(Has |No )/i, '');
                            items.push({
                                feature: featureName,
                                available: hasFeature
                            });
                        }
                    }
                }

                if (sectionName && items.length > 0) {
                    about[sectionName] = items;
                }
            }

            return about;
        """)

        return about or {}

    except Exception as e:
        # Best-effort: report the failure in-band instead of raising.
        return {"error": str(e)}
|
||||
|
||||
|
||||
# Test function
|
||||
if __name__ == "__main__":
|
||||
from seleniumbase import Driver
|
||||
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
dict with: name, address, rating, total_reviews, success, error, time
|
||||
"""
|
||||
from seleniumbase import Driver
|
||||
import logging
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
start_time = time.time()
|
||||
driver_provided = driver is not None
|
||||
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clear state if reusing a pooled driver (ensures clean page load)
|
||||
if driver_provided:
|
||||
try:
|
||||
driver.delete_all_cookies()
|
||||
driver.get("about:blank")
|
||||
except:
|
||||
pass
|
||||
# Don't clear state - Google may serve different content based on session history
|
||||
# The scraper doesn't reset state, so validation shouldn't either
|
||||
|
||||
# Force English interface for consistent parsing
|
||||
if 'hl=' not in url:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Navigate to URL
|
||||
driver.get(url)
|
||||
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
while time.time() - start < 5:
|
||||
if "consent.google" in driver.current_url:
|
||||
try:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
driver.get(url)
|
||||
break
|
||||
except:
|
||||
# Try multiple approaches to find and click accept button
|
||||
clicked = False
|
||||
|
||||
# Method 1: Find by aria-label (most reliable for Google consent)
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
|
||||
btn.click()
|
||||
clicked = True
|
||||
break
|
||||
|
||||
# Method 2: Find by text content
|
||||
if not clicked:
|
||||
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
|
||||
txt = btn.text.lower()
|
||||
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
|
||||
btn.click()
|
||||
clicked = True
|
||||
break
|
||||
|
||||
if clicked:
|
||||
time.sleep(0.5) # Brief wait for consent to process
|
||||
driver.get(url) # Reload the target URL
|
||||
time.sleep(0.5) # Wait for reload
|
||||
except Exception as e:
|
||||
pass
|
||||
break
|
||||
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
|
||||
break
|
||||
time.sleep(0.01) # 10ms - responsive but low CPU
|
||||
|
||||
# Log current URL after consent handling
|
||||
try:
|
||||
current_url = driver.current_url
|
||||
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
|
||||
except:
|
||||
pass
|
||||
|
||||
# Wait for page to fully render before polling (tabs may load dynamically)
|
||||
time.sleep(2)
|
||||
|
||||
# Poll for business info (same pattern as total_reviews extraction)
|
||||
info = {"name": None, "rating": None, "total_reviews": None, "address": None}
|
||||
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
|
||||
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
|
||||
start = time.time()
|
||||
while time.time() - start < 5:
|
||||
debug_logged = False
|
||||
while time.time() - start < 10:
|
||||
try:
|
||||
info = driver.execute_script("""
|
||||
var result = {name: null, rating: null, total_reviews: null, address: null};
|
||||
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
|
||||
|
||||
// Business name from h1
|
||||
var h1 = document.querySelector('h1');
|
||||
if (h1) result.name = h1.textContent.trim();
|
||||
|
||||
// Rating and reviews from span[role="img"] aria-labels
|
||||
// Same pattern as scrape_reviews for consistency
|
||||
// Category - use jsaction attribute (robust, survives class changes)
|
||||
var catBtn = document.querySelector('button[jsaction*="category"]');
|
||||
if (catBtn) result.category = catBtn.textContent.trim();
|
||||
|
||||
// Fallback: look for button after rating that's not a link
|
||||
if (!result.category) {
|
||||
var buttons = document.querySelectorAll('button');
|
||||
for (var btn of buttons) {
|
||||
var text = btn.textContent.trim();
|
||||
// Categories are short words, no numbers, not navigation
|
||||
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
|
||||
!text.match(/review|star|direction|save|share|photo/i)) {
|
||||
// Check if it's near the rating area
|
||||
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
|
||||
if (parent) {
|
||||
result.category = text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rating from span[role="img"] aria-labels
|
||||
var spans = document.querySelectorAll('span[role="img"]');
|
||||
for (var i = 0; i < spans.length; i++) {
|
||||
var label = spans[i].getAttribute('aria-label') || '';
|
||||
|
||||
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
|
||||
// Collect debug info for all aria-labels
|
||||
if (label) {
|
||||
result.debug.push('img-aria: ' + label);
|
||||
}
|
||||
|
||||
// Rating: "4.8 stars" (English forced via hl=en)
|
||||
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
|
||||
if (rMatch && !result.rating) {
|
||||
result.rating = parseFloat(rMatch[1].replace(',', '.'));
|
||||
}
|
||||
|
||||
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
|
||||
// Plus Spanish "reseña" which doesn't contain "review"
|
||||
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
|
||||
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
|
||||
// Try direct format first: "79 reviews"
|
||||
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
|
||||
if (revMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
|
||||
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
|
||||
}
|
||||
|
||||
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
|
||||
if (!result.total_reviews) {
|
||||
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
|
||||
if (combinedMatch) {
|
||||
var countStr = combinedMatch[1].replace(/,/g, '');
|
||||
if (countStr.includes('k')) {
|
||||
// Handle "9k+" format
|
||||
result.total_reviews = parseInt(countStr) * 1000;
|
||||
} else {
|
||||
result.total_reviews = parseInt(countStr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also collect tab button texts for debugging (include full text including numbers)
|
||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (var j = 0; j < tabs.length; j++) {
|
||||
var tabText = tabs[j].textContent.trim();
|
||||
result.debug.push('tab: ' + tabText);
|
||||
// Also try to extract review count from tab text like "Reviews (79)"
|
||||
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
|
||||
var tabMatch = tabText.match(/\\((\\d+)\\)/);
|
||||
if (tabMatch) {
|
||||
result.total_reviews = parseInt(tabMatch[1]);
|
||||
result.debug.push('Found reviews in tab: ' + tabText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also check ALL buttons for reviews count
|
||||
var allButtons = document.querySelectorAll('button');
|
||||
for (var b = 0; b < allButtons.length; b++) {
|
||||
var btnText = allButtons[b].textContent || '';
|
||||
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
|
||||
var numMatch = btnText.match(/\\((\\d+)\\)/);
|
||||
if (numMatch && !result.total_reviews) {
|
||||
result.total_reviews = parseInt(numMatch[1]);
|
||||
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we're on search results vs place page
|
||||
result.debug.push('title: ' + document.title);
|
||||
result.debug.push('url: ' + window.location.href.substring(0, 80));
|
||||
|
||||
// Check for search results list
|
||||
var searchResults = document.querySelectorAll('div[role="feed"] > div');
|
||||
result.debug.push('search_results_count: ' + searchResults.length);
|
||||
|
||||
// Fallback: Get review count from Reviews tab button "Reviews (79)"
|
||||
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
|
||||
if (!result.total_reviews) {
|
||||
var tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (var tab of tabs) {
|
||||
var text = tab.textContent.toLowerCase();
|
||||
if (text.includes('review')) {
|
||||
var match = tab.textContent.match(/\\((\\d+)\\)/);
|
||||
if (match) {
|
||||
result.total_reviews = parseInt(match[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback 2: Look for any button with "Reviews" and a number
|
||||
if (!result.total_reviews) {
|
||||
var buttons = document.querySelectorAll('button');
|
||||
for (var btn of buttons) {
|
||||
var text = btn.textContent;
|
||||
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
|
||||
var numMatch = text.match(/\\((\\d+)\\)/);
|
||||
if (numMatch) {
|
||||
result.total_reviews = parseInt(numMatch[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
var addrBtn = document.querySelector('button[data-item-id="address"]');
|
||||
if (addrBtn) {
|
||||
var label = addrBtn.getAttribute('aria-label');
|
||||
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
|
||||
if (label) result.address = label.replace(/^Address:\\s*/i, '');
|
||||
}
|
||||
|
||||
return result;
|
||||
""")
|
||||
# Exit early if we have the essentials
|
||||
if info.get("name") and info.get("total_reviews") is not None:
|
||||
# Exit early if we have the essentials (name found AND reviews count > 0)
|
||||
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
|
||||
break
|
||||
|
||||
# Log debug info once after 3 seconds
|
||||
if not debug_logged and time.time() - start > 3:
|
||||
debug_logged = True
|
||||
debug_info = info.get("debug", [])
|
||||
if debug_info:
|
||||
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
|
||||
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
|
||||
for d in debug_info[:10]: # First 10 debug items
|
||||
log.info(f" {d}")
|
||||
except:
|
||||
pass
|
||||
time.sleep(0.1) # 100ms between polls
|
||||
|
||||
# Final debug log if still no reviews
|
||||
if not info.get("total_reviews"):
|
||||
debug_info = info.get("debug", [])
|
||||
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
|
||||
if debug_info:
|
||||
log.warning(f" Debug items: {debug_info[:10]}")
|
||||
|
||||
return {
|
||||
"name": info.get("name"),
|
||||
"address": info.get("address"),
|
||||
"rating": info.get("rating"),
|
||||
"total_reviews": info.get("total_reviews"),
|
||||
"category": info.get("category"),
|
||||
"success": bool(info.get("name")),
|
||||
"error": None,
|
||||
"time": time.time() - start_time
|
||||
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
"address": None,
|
||||
"rating": None,
|
||||
"total_reviews": None,
|
||||
"category": None,
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"time": time.time() - start_time
|
||||
|
||||
Reference in New Issue
Block a user