Add browser fingerprint support and analytics metadata display

- Transfer user's browser fingerprint (user-agent, viewport, timezone,
  language, geolocation) to Chrome for more authentic scraping
- Display review topics from Google Maps in analytics dashboard
- Show business category badge in analytics header
- Fix date_text null handling in analytics (handle undefined/timestamp fields)
- Add review_topics and business_category to JobStatus interface

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 10:36:06 +00:00
parent 1bd30c0789
commit a540ab97b1
9 changed files with 1214 additions and 231 deletions

View File

@@ -21,6 +21,7 @@ class JobStatus(str, Enum):
COMPLETED = "completed"
FAILED = "failed"
CANCELLED = "cancelled"
PARTIAL = "partial" # Job crashed but has partial reviews saved
class DatabaseManager:
@@ -69,6 +70,7 @@ class DatabaseManager:
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
started_at TIMESTAMP,
completed_at TIMESTAMP,
updated_at TIMESTAMP,
reviews_count INTEGER,
total_reviews INTEGER,
@@ -79,7 +81,7 @@ class DatabaseManager:
metadata JSONB,
scrape_logs JSONB,
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled'))
CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'))
);
""")
@@ -88,6 +90,24 @@ class DatabaseManager:
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS scrape_logs JSONB;
""")
# Add updated_at column if it doesn't exist (for incremental progress tracking)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS updated_at TIMESTAMP;
""")
# Add review_topics column if it doesn't exist (extracted topic filters with mention counts)
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS review_topics JSONB;
""")
# Update constraint to include 'partial' status (for existing databases)
await conn.execute("""
ALTER TABLE jobs DROP CONSTRAINT IF EXISTS valid_status;
""")
await conn.execute("""
ALTER TABLE jobs ADD CONSTRAINT valid_status CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled', 'partial'));
""")
# Create indexes
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_jobs_status ON jobs(status);
@@ -187,13 +207,15 @@ class DatabaseManager:
created_at,
started_at,
completed_at,
updated_at,
reviews_count,
total_reviews,
reviews_data,
scrape_time,
error_message,
metadata,
scrape_logs
scrape_logs,
review_topics
FROM jobs
WHERE job_id = $1
""", job_id)
@@ -203,22 +225,32 @@ class DatabaseManager:
return dict(row)
async def get_job_reviews(self, job_id: UUID) -> Optional[List[Dict[str, Any]]]:
async def get_job_reviews(self, job_id: UUID, include_partial: bool = True) -> Optional[List[Dict[str, Any]]]:
"""
Get reviews for a specific job.
Args:
job_id: Job UUID
include_partial: If True, also return reviews for running and partial jobs
Returns:
List of reviews or None if not found/not completed
List of reviews or None if not found/no reviews
"""
async with self.pool.acquire() as conn:
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if include_partial:
# Return reviews for completed, running, or partial jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status IN ('completed', 'running', 'partial')
""", job_id)
else:
# Only return reviews for completed jobs
reviews_data = await conn.fetchval("""
SELECT reviews_data
FROM jobs
WHERE job_id = $1 AND status = 'completed'
""", job_id)
if not reviews_data:
return None
@@ -278,7 +310,8 @@ class DatabaseManager:
reviews: List[Dict[str, Any]],
scrape_time: float,
total_reviews: Optional[int] = None,
scrape_logs: Optional[List[Dict[str, Any]]] = None
scrape_logs: Optional[List[Dict[str, Any]]] = None,
review_topics: Optional[List[Dict[str, Any]]] = None
):
"""
Save scraping results to database.
@@ -289,8 +322,33 @@ class DatabaseManager:
scrape_time: Time taken to scrape in seconds
total_reviews: Total reviews available (from page counter)
scrape_logs: List of log entries from the scraper
review_topics: List of topic filter dictionaries with topic and count
"""
async with self.pool.acquire() as conn:
# If reviews list is empty, check if job already has reviews from incremental saves
# This happens when flush_callback was used during scraping
if not reviews:
existing = await conn.fetchval(
"SELECT reviews_count FROM jobs WHERE job_id = $1", job_id
)
if existing and existing > 0:
# Job has reviews from incremental saves, don't overwrite reviews_data
await conn.execute("""
UPDATE jobs
SET
status = 'completed',
completed_at = NOW(),
total_reviews = COALESCE($2, total_reviews),
scrape_time = $3,
scrape_logs = $4::jsonb,
review_topics = $5::jsonb
WHERE job_id = $1
""", job_id, total_reviews, scrape_time,
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Completed job {job_id} with {existing} reviews (from incremental saves)")
return
await conn.execute("""
UPDATE jobs
SET
@@ -300,13 +358,70 @@ class DatabaseManager:
total_reviews = $3,
reviews_data = $4::jsonb,
scrape_time = $5,
scrape_logs = $6::jsonb
scrape_logs = $6::jsonb,
review_topics = $7::jsonb
WHERE job_id = $1
""", job_id, len(reviews), total_reviews, json.dumps(reviews), scrape_time,
json.dumps(scrape_logs) if scrape_logs else None)
json.dumps(scrape_logs) if scrape_logs else None,
json.dumps(review_topics) if review_topics else None)
log.info(f"Saved {len(reviews)} reviews for job {job_id}")
async def save_reviews_incremental(
    self,
    job_id: UUID,
    reviews: List[Dict[str, Any]],
    total_reviews: Optional[int] = None
):
    """
    Persist a snapshot of the reviews gathered so far for a running job.

    Intended to be invoked on every scraper flush so that progress
    survives a crash. The row is only touched while the job's status
    is 'running'.

    Args:
        job_id: Job UUID
        reviews: The full set of reviews collected up to now (a complete
            snapshot, not a delta) - it replaces the stored reviews_data
        total_reviews: Total reviews available (from page counter); when
            None the previously stored value is kept
    """
    # COALESCE keeps the stored total when the caller has no fresh count.
    snapshot_sql = """
        UPDATE jobs
        SET
            reviews_count = $2,
            total_reviews = COALESCE($3, total_reviews),
            reviews_data = $4::jsonb,
            updated_at = NOW()
        WHERE job_id = $1 AND status = 'running'
    """
    async with self.pool.acquire() as conn:
        await conn.execute(
            snapshot_sql,
            job_id,
            len(reviews),
            total_reviews,
            json.dumps(reviews),
        )
        log.debug(f"Incremental save: {len(reviews)} reviews for job {job_id}")
async def mark_job_partial(
    self,
    job_id: UUID,
    error_message: str,
    scrape_logs: Optional[List[Dict[str, Any]]] = None
):
    """
    Flag a job as 'partial': it crashed, but some reviews were saved.

    Sets the status to 'partial', stamps completed_at with the current
    time and records the error message and scraper logs on the job row.

    Args:
        job_id: Job UUID
        error_message: Error that caused the crash
        scrape_logs: Log entries from the scraper; a missing or empty
            list is stored as NULL
    """
    partial_sql = """
        UPDATE jobs
        SET
            status = 'partial',
            completed_at = NOW(),
            error_message = $2,
            scrape_logs = $3::jsonb
        WHERE job_id = $1
    """
    async with self.pool.acquire() as conn:
        # Falsy logs (None or []) are written as SQL NULL rather than '[]'.
        if scrape_logs:
            logs_payload = json.dumps(scrape_logs)
        else:
            logs_payload = None
        await conn.execute(partial_sql, job_id, error_message, logs_payload)
        log.info(f"Marked job {job_id} as partial due to: {error_message}")
async def list_jobs(
self,
status: Optional[JobStatus] = None,
@@ -337,7 +452,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
WHERE status = $1
ORDER BY created_at DESC
@@ -355,7 +471,8 @@ class DatabaseManager:
total_reviews,
scrape_time,
error_message,
metadata
metadata,
review_topics
FROM jobs
ORDER BY created_at DESC
LIMIT $1 OFFSET $2

View File

@@ -268,7 +268,7 @@ def parse_dom_review(card) -> dict:
def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: int = 15,
flush_callback=None, flush_batch_size: int = 500, log_capture: LogCapture = None,
progress_callback=None) -> dict:
progress_callback=None, validation_only: bool = False) -> dict:
"""
Scrape Google Maps reviews.
@@ -299,6 +299,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Track total reviews (persists across refreshes)
total_reviews = [None] # Use list for closure mutation
# Store business info extracted from overview (before clicking reviews tab)
business_info_cache = [None]
# Hard refresh counter
hard_refresh_count = [0]
max_hard_refreshes = 3 # Max number of hard refreshes before giving up
@@ -323,11 +326,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
pass
return None
def setup_reviews_page(is_refresh=False):
def setup_reviews_page(is_refresh=False, validation_only_mode=False):
"""
Setup the reviews page for scraping.
Returns (scroll_container, stop_scrolling_event) or (None, None) on failure.
Can be called after initial load or after a hard refresh.
If validation_only_mode=True, returns early after extracting business info
without clicking reviews tab or finding scroll container.
"""
nonlocal total_reviews
@@ -335,6 +341,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Navigate to URL (only on initial load or refresh)
if not is_refresh:
# Reset browser state by navigating to blank page first
# This clears any stale state from pooled browser sessions
try:
driver.get("about:blank")
time.sleep(0.1)
except:
pass
log.info(f"🌐 Loading: {url[:80]}...")
else:
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...")
@@ -353,6 +366,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Reload original URL after consent
log.info(" Reloading after consent...")
driver.get(url)
# Wait for page to settle after consent reload
time.sleep(1)
break
except:
pass
@@ -362,43 +377,108 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Extract total review count BEFORE clicking reviews tab (it's on Overview)
# Extract business info and total review count BEFORE clicking reviews tab (on Overview)
# This captures name, rating, category, address while they're visible
# Only on first load (don't overwrite if we already have it)
if total_reviews[0] is None:
if total_reviews[0] is None or business_info_cache[0] is None:
start = time.time()
while time.time() - start < 5:
try:
count = driver.execute_script("""
var reviewSpans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < reviewSpans.length; i++) {
var label = reviewSpans[i].getAttribute('aria-label') || '';
var match = label.match(/^([\\d,\\.]+)\\s*review/i);
if (match) {
return parseInt(match[1].replace(/[,\\.]/g, ''));
info = driver.execute_script("""
var result = {
total_reviews: null,
name: null,
rating: null,
category: null,
address: null
};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Category - use jsaction attribute (robust selector)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Rating and review count from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars"
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: "79 reviews"
var revMatch = label.match(/^([\\d,\\.]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
}
}
return null;
// Address from button
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
if count:
total_reviews[0] = count
log.info(f"📊 Total reviews on page: {count}")
break
if info:
if info.get('total_reviews') and total_reviews[0] is None:
total_reviews[0] = info['total_reviews']
log.info(f"📊 Total reviews on page: {total_reviews[0]}")
if info.get('name') and business_info_cache[0] is None:
business_info_cache[0] = info
log.info(f"📍 Business: {info.get('name')}")
if total_reviews[0] and business_info_cache[0]:
break
except:
pass
time.sleep(0.1)
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
if validation_only_mode:
log.info("📋 Validation mode: returning early (skipping reviews tab)")
return ("validation_done", None)
# Click reviews tab - poll until found
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
start = time.time()
tab_clicked = False
tabs_logged = False
while time.time() - start < 5: # Max 5s for tabs
try:
tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
# Log available tabs once for debugging
if not tabs_logged and tabs:
tabs_logged = True
tab_texts = [t.text for t in tabs]
log.info(f" Available tabs: {tab_texts}")
for tab in tabs:
tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords):
if not is_refresh:
log.info(f" Clicking reviews tab: '{tab.text}'")
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
if total_reviews[0] is None:
import re
# Try pattern with parentheses: "Reviews (79)"
match = re.search(r'\((\d+)\)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
else:
# Try pattern with newline: "Reviews\n79"
match = re.search(r'(\d+)', tab.text)
if match:
total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}")
tab.click()
tab_clicked = True
break
@@ -569,11 +649,85 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return scroll_container, stop_scrolling
# Initial page setup
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False)
# Helper to extract review topics from the reviews tab
def extract_review_topics():
    """
    Extract review topic filters from the reviews radiogroup.

    Runs a script in the page that looks for a radiogroup whose
    aria-label mentions "Refine" (falling back to the first radiogroup
    on the page) and reads each radio button in it. Labels of the form
    "<topic>, mentioned in <N> reviews" are parsed directly; other
    labels (except the "all reviews" entry) fall back to the button's
    child spans.

    This is best-effort: any failure is logged and an empty list is
    returned so that topic extraction never aborts the scrape.

    Returns:
        List of {"topic": str, "count": int} dictionaries, or [] when
        nothing was found or the script failed.
    """
    try:
        topics = driver.execute_script("""
            var topics = [];
            // Primary: use role="radiogroup" with aria-label="Refine reviews" (robust)
            var container = document.querySelector('div[role="radiogroup"][aria-label*="Refine"], div[role="radiogroup"][aria-label*="refine"]');
            if (!container) {
                // Fallback: any radiogroup in the reviews area
                container = document.querySelector('div[role="radiogroup"]');
            }
            if (container) {
                var buttons = container.querySelectorAll('button[role="radio"]');
                for (var btn of buttons) {
                    var label = btn.getAttribute('aria-label') || '';
                    // Parse "hair salon, mentioned in 4 reviews" format
                    var match = label.match(/^([^,]+),\\s*mentioned in (\\d+)/i);
                    if (match) {
                        topics.push({
                            topic: match[1].trim(),
                            count: parseInt(match[2])
                        });
                    } else if (label && !label.toLowerCase().includes('all review')) {
                        // Fallback: try to extract from child spans
                        var countSpan = btn.querySelector('.bC3Nkc, .fontBodySmall');
                        var nameSpan = btn.querySelector('.uEubGf, span:first-child');
                        if (nameSpan) {
                            var name = nameSpan.textContent.trim();
                            var count = countSpan ? parseInt(countSpan.textContent) : 0;
                            if (name && name.toLowerCase() !== 'all') {
                                topics.push({topic: name, count: count || 0});
                            }
                        }
                    }
                }
            }
            return topics;
        """)
        return topics or []
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still propagate; record the failure
        # instead of hiding it.
        log.warning(f"Could not extract review topics: {exc}")
        return []
# Initial page setup (pass validation_only to skip unnecessary steps)
scroll_container, stop_scrolling = setup_reviews_page(is_refresh=False, validation_only_mode=validation_only)
# VALIDATION_ONLY MODE: Return early with just total_reviews and business info
# setup_reviews_page returns ("validation_done", None) in this case
if validation_only or scroll_container == "validation_done":
# Use the business info captured from Overview (before clicking reviews tab)
business_info = business_info_cache[0] or {}
return {
"reviews": [],
"total": total_reviews[0] or 0,
"scrolls": 0,
"error": None,
"validation_info": {
"name": business_info.get("name"),
"rating": business_info.get("rating"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"total_reviews": total_reviews[0]
}
}
if not scroll_container:
return {"reviews": [], "total": 0, "scrolls": 0, "error": "No scroll container found"}
# Extract review topics after reviews tab is loaded (before scrolling begins)
time.sleep(0.5) # Brief wait for topic filters to render
review_topics = extract_review_topics()
if review_topics:
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...")
def get_api_reviews():
"""Get reviews from intercepted API responses."""
api_revs = []
@@ -990,13 +1144,15 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"total_flushed": total_flushed[0],
"checks": check_num,
"url": url,
"logs": log.get_logs()
"logs": log.get_logs(),
"review_topics": review_topics # Topic filters with mention counts
}
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None):
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
browser_fingerprint: dict = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -1009,6 +1165,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver: Existing driver instance to reuse
return_driver: If True, return driver in result
log_capture: Optional LogCapture instance for real-time log access
browser_fingerprint: Optional dict with user's browser fingerprint:
- geolocation: {lat, lng}
- userAgent: string
- viewport: {width, height}
- timezone: string (e.g., "Europe/Madrid")
- language: string (e.g., "en-US")
- platform: string (e.g., "MacIntel")
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -1023,27 +1186,56 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
log_capture = log_capture or LogCapture()
try:
# Extract fingerprint settings
fp = browser_fingerprint or {}
user_agent = fp.get('userAgent') or "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
viewport = fp.get('viewport') or {'width': 1200, 'height': 900}
geolocation = fp.get('geolocation')
timezone = fp.get('timezone')
language = fp.get('language', 'en-US')
# Create driver if not provided
if not driver:
driver = Driver(
uc=True,
headless=headless,
page_load_strategy="normal",
agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
agent=user_agent # Use user's actual user agent
)
driver.set_window_size(1200, 900) # Proper viewport for Google Maps
# Set viewport to match user's screen
driver.set_window_size(viewport['width'], viewport['height'])
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
# Apply browser fingerprint settings via CDP
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA)")
# Set timezone if provided
if timezone:
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
log_capture.info(f"Set timezone to {timezone}")
# Set locale/language
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
# Set geolocation
if geolocation and 'lat' in geolocation and 'lng' in geolocation:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': geolocation['lat'],
'longitude': geolocation['lng'],
'accuracy': 1000 # ~1km accuracy for IP-based location
})
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})")
else:
# Default to US (Boston, MA) if no geolocation provided
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log_capture.info("Set geolocation to US (Boston, MA) [default]")
if fp:
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}")
except Exception as e:
log_capture.warning(f"Could not set geolocation: {e}")
log_capture.warning(f"Could not apply fingerprint settings: {e}")
# Add URL parameters for consistent results
if 'hl=' not in url:
@@ -1052,14 +1244,18 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
if 'gl=' not in url:
url = f"{url}&gl=us"
# Create progress wrapper if callback provided
flush_callback = None
if progress_callback:
# Create combined flush callback for progress + external handler
external_flush = flush_callback # Save external callback
internal_flush = None
if progress_callback or external_flush:
collected = [0]
def flush_with_progress(reviews_batch):
collected[0] += len(reviews_batch)
progress_callback(collected[0], None)
flush_callback = flush_with_progress
def combined_flush(reviews_batch):
collected[0] = len(reviews_batch) # reviews_batch is ALL reviews so far
if progress_callback:
progress_callback(collected[0], None)
if external_flush:
external_flush(reviews_batch) # Pass reviews to external handler
internal_flush = combined_flush
# Run the scraper with progress callback for real-time updates
result = scrape_reviews(
@@ -1067,10 +1263,11 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
url=url,
max_reviews=999999, # Effectively unlimited
timeout_no_new=15,
flush_callback=flush_callback,
flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback # Pass through for real-time log updates
progress_callback=progress_callback, # Pass through for real-time log updates
validation_only=validation_only # Return early if just validating
)
elapsed = time.time() - start_time
@@ -1083,9 +1280,14 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"time": elapsed,
"success": True,
"error": None,
"logs": result.get("logs", [])
"logs": result.get("logs", []),
"review_topics": result.get("review_topics", []) # Topic filters with mention counts
}
# Include validation_info if in validation_only mode
if validation_only and "validation_info" in result:
response["validation_info"] = result["validation_info"]
if return_driver:
response["driver"] = driver
elif should_close_driver:
@@ -1120,6 +1322,122 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
}
def extract_about_info(driver, url: str = None) -> dict:
    """
    Extract About section info from Google Maps (Accessibility, Amenities, etc.).

    This function should be called AFTER reviews are scraped if about info is
    needed, as it navigates to a different tab.

    Args:
        driver: Selenium WebDriver instance (already on the business page)
        url: Optional URL to navigate to first (if not already on the page).
            "hl=en" and "gl=us" are appended when the URL does not already
            contain them.

    Returns:
        dict mapping each section name to a list of
        {"feature": str, "available": bool} entries.
        Returns {} when the About tab could not be clicked or no sections
        were found, and {"error": "<message>"} when an exception was raised.
    """
    try:
        # Navigate if URL provided
        if url:
            # Force English. The separator is recomputed for every appended
            # parameter so the first one always gets '?' and later ones '&'.
            if 'hl=' not in url:
                separator = '&' if '?' in url else '?'
                url = f"{url}{separator}hl=en"
            if 'gl=' not in url:
                separator = '&' if '?' in url else '?'
                url = f"{url}{separator}gl=us"
            driver.get(url)
            time.sleep(1)

        # Click About tab using robust selectors
        clicked = driver.execute_script("""
            // Try multiple selectors for about tab
            var selectors = [
                'button[aria-label*="About"]',
                'button[data-tab-index="2"]',
                'div[role="tablist"] button:nth-child(3)',
                'button[jsaction*="about"]'
            ];
            for (var sel of selectors) {
                var btn = document.querySelector(sel);
                if (btn && btn.textContent.toLowerCase().includes('about')) {
                    btn.click();
                    return true;
                }
            }
            // Fallback: find by text content
            var buttons = document.querySelectorAll('button');
            for (var btn of buttons) {
                if (btn.textContent.trim().toLowerCase() === 'about') {
                    btn.click();
                    return true;
                }
            }
            return false;
        """)
        if not clicked:
            return {}

        time.sleep(1.5)  # Wait for about tab to load

        # Extract about sections using aria-labels (robust)
        about = driver.execute_script("""
            var about = {};
            // Find the about region by aria-label or role
            var container = document.querySelector('div[role="region"][aria-label*="About"]');
            if (!container) {
                // Fallback: look for the scrollable area with sections
                container = document.querySelector('.m6QErb[aria-label*="About"]');
            }
            if (!container) {
                // Last resort: find sections by h2 headers
                container = document;
            }
            // Find all section headers (h2 elements)
            var sections = container.querySelectorAll('h2');
            for (var h2 of sections) {
                var sectionName = h2.textContent.trim();
                var items = [];
                // Find the ul list following this h2
                var parent = h2.closest('.iP2t7d, div');
                if (parent) {
                    var listItems = parent.querySelectorAll('li span[aria-label]');
                    for (var li of listItems) {
                        var label = li.getAttribute('aria-label');
                        if (label) {
                            // Parse "Has toilet" or "No wheelchair-accessible car park"
                            var hasFeature = !label.toLowerCase().startsWith('no ');
                            var featureName = label.replace(/^(Has |No )/i, '');
                            items.push({
                                feature: featureName,
                                available: hasFeature
                            });
                        }
                    }
                }
                if (sectionName && items.length > 0) {
                    about[sectionName] = items;
                }
            }
            return about;
        """)
        return about or {}
    except Exception as e:
        # Best-effort: report the failure to the caller instead of raising.
        return {"error": str(e)}
# Test function
if __name__ == "__main__":
from seleniumbase import Driver
@@ -1159,6 +1477,8 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
dict with: name, address, rating, total_reviews, success, error, time
"""
from seleniumbase import Driver
import logging
log = logging.getLogger(__name__)
start_time = time.time()
driver_provided = driver is not None
@@ -1177,13 +1497,15 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
except:
pass
# Clear state if reusing a pooled driver (ensures clean page load)
if driver_provided:
try:
driver.delete_all_cookies()
driver.get("about:blank")
except:
pass
# Don't clear state - Google may serve different content based on session history
# The scraper doesn't reset state, so validation shouldn't either
# Force English interface for consistent parsing
if 'hl=' not in url:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
if 'gl=' not in url:
url = f"{url}&gl=us"
# Navigate to URL
driver.get(url)
@@ -1193,48 +1515,183 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
while time.time() - start < 5:
if "consent.google" in driver.current_url:
try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
driver.get(url)
break
except:
# Try multiple approaches to find and click accept button
clicked = False
# Method 1: Find by aria-label (most reliable for Google consent)
for btn in driver.find_elements(By.CSS_SELECTOR, "button[aria-label*='Accept']"):
btn.click()
clicked = True
break
# Method 2: Find by text content
if not clicked:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower()
if "accept all" in txt or "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click()
clicked = True
break
if clicked:
time.sleep(0.5) # Brief wait for consent to process
driver.get(url) # Reload the target URL
time.sleep(0.5) # Wait for reload
except Exception as e:
pass
break
if "maps/place" in driver.current_url or ("maps" in driver.current_url and "consent" not in driver.current_url):
break
time.sleep(0.01) # 10ms - responsive but low CPU
# Log current URL after consent handling
try:
current_url = driver.current_url
log.info(f"🔍 Validation: Current URL after load: {current_url[:80]}...")
except:
pass
# Wait for page to fully render before polling (tabs may load dynamically)
time.sleep(2)
# Poll for business info (same pattern as total_reviews extraction)
info = {"name": None, "rating": None, "total_reviews": None, "address": None}
# Timeout increased to 10s because Reviews tab can take 6+ seconds to appear after consent
info = {"name": None, "rating": None, "total_reviews": None, "address": None, "category": None}
start = time.time()
while time.time() - start < 5:
debug_logged = False
while time.time() - start < 10:
try:
info = driver.execute_script("""
var result = {name: null, rating: null, total_reviews: null, address: null};
var result = {name: null, rating: null, total_reviews: null, address: null, category: null, debug: []};
// Business name from h1
var h1 = document.querySelector('h1');
if (h1) result.name = h1.textContent.trim();
// Rating and reviews from span[role="img"] aria-labels
// Same pattern as scrape_reviews for consistency
// Category - use jsaction attribute (robust, survives class changes)
var catBtn = document.querySelector('button[jsaction*="category"]');
if (catBtn) result.category = catBtn.textContent.trim();
// Fallback: look for button after rating that's not a link
if (!result.category) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent.trim();
// Categories are short words, no numbers, not navigation
if (text && text.length < 50 && !text.match(/^[0-9]/) &&
!text.match(/review|star|direction|save|share|photo/i)) {
// Check if it's near the rating area
var parent = btn.closest('.LBgpqf, .skqShb, .fontBodyMedium');
if (parent) {
result.category = text;
break;
}
}
}
}
// Rating from span[role="img"] aria-labels
var spans = document.querySelectorAll('span[role="img"]');
for (var i = 0; i < spans.length; i++) {
var label = spans[i].getAttribute('aria-label') || '';
// Rating: "4.8 stars", "4,8 estrellas", etc (partial match)
var rMatch = label.match(/^([\\d,.]+)\\s*(star|estrella|étoile|stern|stell)/i);
// Collect debug info for all aria-labels
if (label) {
result.debug.push('img-aria: ' + label);
}
// Rating: "4.8 stars" (English forced via hl=en)
var rMatch = label.match(/^([\\d,.]+)\\s*star/i);
if (rMatch && !result.rating) {
result.rating = parseFloat(rMatch[1].replace(',', '.'));
}
// Reviews: same as scrape_reviews - /^([\d,.]+)\s*review/i
// Plus Spanish "reseña" which doesn't contain "review"
var revMatch = label.match(/^([\\d,\\.]+)\\s*(review|reseña|avis|bewertung|recension)/i);
// Reviews: "79 reviews" or "4.8 stars 79 reviews" (English forced via hl=en)
// Try direct format first: "79 reviews"
var revMatch = label.match(/^([\\d,]+)\\s*review/i);
if (revMatch && !result.total_reviews) {
result.total_reviews = parseInt(revMatch[1].replace(/[,\\.]/g, ''));
result.total_reviews = parseInt(revMatch[1].replace(/,/g, ''));
}
// Try combined format: "4.8 stars 79 reviews" or "4.8 stars 79k+ reviews"
if (!result.total_reviews) {
var combinedMatch = label.match(/stars?\\s+([\\d,]+k?\\+?)\\s*review/i);
if (combinedMatch) {
var countStr = combinedMatch[1].replace(/,/g, '');
if (countStr.includes('k')) {
// Handle "9k+" format
result.total_reviews = parseInt(countStr) * 1000;
} else {
result.total_reviews = parseInt(countStr);
}
}
}
}
// Also collect tab button texts for debugging (include full text including numbers)
var tabs = document.querySelectorAll('button[role="tab"]');
for (var j = 0; j < tabs.length; j++) {
var tabText = tabs[j].textContent.trim();
result.debug.push('tab: ' + tabText);
// Also try to extract review count from tab text like "Reviews (79)"
if (tabText.toLowerCase().includes('review') && !result.total_reviews) {
var tabMatch = tabText.match(/\\((\\d+)\\)/);
if (tabMatch) {
result.total_reviews = parseInt(tabMatch[1]);
result.debug.push('Found reviews in tab: ' + tabText);
}
}
}
// Also check ALL buttons for reviews count
var allButtons = document.querySelectorAll('button');
for (var b = 0; b < allButtons.length; b++) {
var btnText = allButtons[b].textContent || '';
if (btnText.toLowerCase().includes('review') && !btnText.toLowerCase().includes('write')) {
var numMatch = btnText.match(/\\((\\d+)\\)/);
if (numMatch && !result.total_reviews) {
result.total_reviews = parseInt(numMatch[1]);
result.debug.push('Found reviews in button: ' + btnText.substring(0, 50));
}
}
}
// Check if we're on search results vs place page
result.debug.push('title: ' + document.title);
result.debug.push('url: ' + window.location.href.substring(0, 80));
// Check for search results list
var searchResults = document.querySelectorAll('div[role="feed"] > div');
result.debug.push('search_results_count: ' + searchResults.length);
// Fallback: Get review count from Reviews tab button "Reviews (79)"
// Search ALL tab buttons for one containing "review" text (same as scrape_reviews)
if (!result.total_reviews) {
var tabs = document.querySelectorAll('button[role="tab"]');
for (var tab of tabs) {
var text = tab.textContent.toLowerCase();
if (text.includes('review')) {
var match = tab.textContent.match(/\\((\\d+)\\)/);
if (match) {
result.total_reviews = parseInt(match[1]);
break;
}
}
}
}
// Fallback 2: Look for any button with "Reviews" and a number
if (!result.total_reviews) {
var buttons = document.querySelectorAll('button');
for (var btn of buttons) {
var text = btn.textContent;
if (text.toLowerCase().includes('review') && !text.toLowerCase().includes('write')) {
var numMatch = text.match(/\\((\\d+)\\)/);
if (numMatch) {
result.total_reviews = parseInt(numMatch[1]);
break;
}
}
}
}
@@ -1242,23 +1699,41 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
var addrBtn = document.querySelector('button[data-item-id="address"]');
if (addrBtn) {
var label = addrBtn.getAttribute('aria-label');
if (label) result.address = label.replace(/^(Address|Dirección|Adresse):\\s*/i, '');
if (label) result.address = label.replace(/^Address:\\s*/i, '');
}
return result;
""")
# Exit early if we have the essentials
if info.get("name") and info.get("total_reviews") is not None:
# Exit early if we have the essentials (name found AND reviews count > 0)
if info.get("name") and info.get("total_reviews") and info.get("total_reviews") > 0:
break
# Log debug info once after 3 seconds
if not debug_logged and time.time() - start > 3:
debug_logged = True
debug_info = info.get("debug", [])
if debug_info:
log.info(f"🔍 Validation debug - URL: {url[:50]}...")
log.info(f" Name: {info.get('name')}, Rating: {info.get('rating')}, Reviews: {info.get('total_reviews')}")
for d in debug_info[:10]: # First 10 debug items
log.info(f" {d}")
except:
pass
time.sleep(0.1) # 100ms between polls
# Final debug log if still no reviews
if not info.get("total_reviews"):
debug_info = info.get("debug", [])
log.warning(f"⚠️ Validation: No reviews found for '{info.get('name')}' after 10s polling")
if debug_info:
log.warning(f" Debug items: {debug_info[:10]}")
return {
"name": info.get("name"),
"address": info.get("address"),
"rating": info.get("rating"),
"total_reviews": info.get("total_reviews"),
"category": info.get("category"),
"success": bool(info.get("name")),
"error": None,
"time": time.time() - start_time
@@ -1270,6 +1745,7 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
"address": None,
"rating": None,
"total_reviews": None,
"category": None,
"success": False,
"error": str(e),
"time": time.time() - start_time