Remove old scraper files - consolidate to scraper_clean
Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deleted 33 unused Python files including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Saves ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
else:
|
||||
log.info(f"[PROFILE] Using pooled driver (0.00s)")
|
||||
|
||||
# Force English locale for consistent parsing
|
||||
# Force English locale AND US region for consistent parsing/results
|
||||
# This helps avoid geolocation-based variations in Google Maps results
|
||||
if 'hl=' in url:
|
||||
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
|
||||
else:
|
||||
separator = '&' if '?' in url else '?'
|
||||
url = f"{url}{separator}hl=en"
|
||||
|
||||
# Add US region parameter if not present
|
||||
if 'gl=' not in url:
|
||||
url = f"{url}&gl=us"
|
||||
|
||||
# Set Chrome geolocation to US (Boston, MA) using CDP
|
||||
# This ensures Google Maps shows US results regardless of server location
|
||||
try:
|
||||
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
|
||||
'latitude': 42.3601,
|
||||
'longitude': -71.0589,
|
||||
'accuracy': 100
|
||||
})
|
||||
log.info("Set geolocation to US (Boston, MA)")
|
||||
except Exception as e:
|
||||
log.warning(f"Could not set geolocation: {e}")
|
||||
|
||||
log.info(f"Loading Google Maps page...")
|
||||
t0 = timing_module.time()
|
||||
driver.get(url)
|
||||
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
|
||||
for btn in form_btns:
|
||||
btn_text = (btn.text or '').lower()
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
|
||||
if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
|
||||
log.info(f"Clicking GDPR consent: {btn.text}")
|
||||
btn.click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
break
|
||||
else:
|
||||
if len(form_btns) >= 2:
|
||||
log.info("Using fallback: clicking second form button")
|
||||
form_btns[1].click()
|
||||
time.sleep(1) # Reduced from 2s
|
||||
time.sleep(1)
|
||||
except Exception as e:
|
||||
log.warning(f"GDPR consent handling failed: {e}")
|
||||
|
||||
# After GDPR consent, reload the original URL to ensure proper page state
|
||||
log.info(f"Reloading original URL after GDPR consent...")
|
||||
driver.get(url)
|
||||
time.sleep(1)
|
||||
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
|
||||
else:
|
||||
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
|
||||
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
try:
|
||||
log.info("Waiting for Google Maps content to load...")
|
||||
wait = WebDriverWait(driver, 10)
|
||||
# Wait for basic page structure (h1 or heading)
|
||||
wait.until(
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
|
||||
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
|
||||
)
|
||||
log.info("Google Maps content loaded successfully")
|
||||
log.info("Basic page structure loaded")
|
||||
|
||||
# Wait for page to settle - search URLs redirect to place URLs
|
||||
# which triggers additional content loading
|
||||
time.sleep(2)
|
||||
|
||||
# Wait specifically for review count element (aria-label ending with "reviews")
|
||||
# This is the most reliable indicator that the business detail is loaded
|
||||
try:
|
||||
WebDriverWait(driver, 5).until(
|
||||
lambda d: d.execute_script("""
|
||||
var elems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < elems.length; i++) {
|
||||
var label = elems[i].getAttribute('aria-label') || '';
|
||||
if (/^[0-9]+ reviews?$/.test(label)) return true;
|
||||
}
|
||||
return false;
|
||||
""")
|
||||
)
|
||||
log.info("Review count element loaded")
|
||||
except:
|
||||
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
|
||||
log.info("Review count wait timeout, trying to click Reviews/rating...")
|
||||
try:
|
||||
# Try 1: Click Reviews tab (if exists)
|
||||
clicked = driver.execute_script("""
|
||||
var tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (var i = 0; i < tabs.length; i++) {
|
||||
var txt = (tabs[i].textContent || '').toLowerCase();
|
||||
if (txt.includes('review')) {
|
||||
tabs[i].click();
|
||||
return 'tab';
|
||||
}
|
||||
}
|
||||
// Try 2: Click the rating stars element (often links to reviews)
|
||||
var stars = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (stars) {
|
||||
var parent = stars.parentElement;
|
||||
if (parent && parent.tagName.toLowerCase() === 'button') {
|
||||
parent.click();
|
||||
return 'stars_button';
|
||||
}
|
||||
stars.click();
|
||||
return 'stars';
|
||||
}
|
||||
// Try 3: Click "Write a review" or any review-related button
|
||||
var btns = document.querySelectorAll('button[aria-label*="review" i]');
|
||||
for (var b = 0; b < btns.length; b++) {
|
||||
var label = btns[b].getAttribute('aria-label') || '';
|
||||
if (!/write/i.test(label) && /review/i.test(label)) {
|
||||
btns[b].click();
|
||||
return 'review_btn: ' + label;
|
||||
}
|
||||
}
|
||||
return 'none';
|
||||
""")
|
||||
log.info(f"Clicked: {clicked}")
|
||||
time.sleep(2) # Wait for reviews panel to load
|
||||
except Exception as e:
|
||||
log.warning(f"Click attempt failed: {e}")
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Timeout waiting for Maps content: {e}")
|
||||
time.sleep(0.5) # Minimal fallback wait
|
||||
time.sleep(2) # Fallback wait
|
||||
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
|
||||
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
|
||||
log.info(f"DEBUG: Page title: {driver.title}")
|
||||
|
||||
# Extract business card information using JavaScript
|
||||
t0 = timing_module.time()
|
||||
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
total_reviews: null
|
||||
};
|
||||
|
||||
// Extract business name
|
||||
const nameSelectors = [
|
||||
'h1.DUwDvf',
|
||||
'[role="main"] h1',
|
||||
'h1.fontHeadlineLarge'
|
||||
];
|
||||
// ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
|
||||
|
||||
for (const selector of nameSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.name = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Helper: Parse review count from text, handling multiple formats
|
||||
function parseReviewCount(text) {
|
||||
if (!text) return null;
|
||||
|
||||
// Extract address
|
||||
const addressSelectors = [
|
||||
'button[data-item-id*="address"]',
|
||||
'[data-item-id*="address"]',
|
||||
'div[aria-label*="Address"]'
|
||||
];
|
||||
|
||||
for (const selector of addressSelectors) {
|
||||
const elem = document.querySelector(selector);
|
||||
if (elem && elem.textContent) {
|
||||
info.address = elem.textContent.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Extract rating (look for aria-label like "4.2 stars")
|
||||
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
|
||||
if (ratingElem) {
|
||||
const ariaLabel = ratingElem.getAttribute('aria-label');
|
||||
const match = ariaLabel.match(/([0-9.]+)/);
|
||||
// Pattern 1: Exact "N reviews" format (aria-labels, clean text)
|
||||
// Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
|
||||
var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1]);
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// Extract total review count
|
||||
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
|
||||
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
|
||||
// Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
|
||||
match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
|
||||
if (match) {
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
|
||||
// PRIORITY 1: Look for review count in search results sidebar/panel
|
||||
// This is where "152 reviews" appears on search results
|
||||
const searchPanelSelectors = [
|
||||
'a[href*="reviews"]', // Link with "reviews" in href
|
||||
'button[jsaction*="reviews"]', // Button related to reviews
|
||||
'div[role="link"]', // Clickable divs that might contain review info
|
||||
];
|
||||
|
||||
for (const selector of searchPanelSelectors) {
|
||||
const elements = document.querySelectorAll(selector);
|
||||
for (let elem of elements) {
|
||||
const text = elem.textContent || '';
|
||||
const match = text.match(numberPattern);
|
||||
// Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
|
||||
if (text.length < 30) {
|
||||
match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
break;
|
||||
}
|
||||
return parseInt(match[1].replace(/[,. ]/g, ''));
|
||||
}
|
||||
}
|
||||
if (info.total_reviews) break;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// PRIORITY 2: Look in any span/div that contains the word "review"
|
||||
// ============ EXTRACT BUSINESS NAME ============
|
||||
// Priority: h1 (semantic), then role="heading"
|
||||
const h1 = document.querySelector('h1');
|
||||
if (h1 && h1.textContent) {
|
||||
info.name = h1.textContent.trim();
|
||||
}
|
||||
if (!info.name) {
|
||||
const heading = document.querySelector('[role="heading"][aria-level="1"]');
|
||||
if (heading && heading.textContent) {
|
||||
info.name = heading.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT ADDRESS ============
|
||||
// Priority: data-item-id (semantic), then aria-label containing "address"
|
||||
const addressElem = document.querySelector('[data-item-id*="address"]');
|
||||
if (addressElem && addressElem.textContent) {
|
||||
info.address = addressElem.textContent.trim();
|
||||
}
|
||||
if (!info.address) {
|
||||
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
|
||||
if (ariaAddress && ariaAddress.textContent) {
|
||||
info.address = ariaAddress.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT RATING ============
|
||||
// Priority: aria-label containing "star" on role="img" elements
|
||||
info._debug_rating_context = [];
|
||||
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
|
||||
for (let elem of ratingElems) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
// Match "4.9 stars" or "4,9 stars" (European format)
|
||||
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
|
||||
if (match) {
|
||||
info.rating = parseFloat(match[1].replace(',', '.'));
|
||||
// DEBUG: Capture parent/sibling context to find review count
|
||||
var parent = elem.parentElement;
|
||||
if (parent) {
|
||||
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
|
||||
var grandparent = parent.parentElement;
|
||||
if (grandparent) {
|
||||
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
|
||||
// Check all children of grandparent for review count
|
||||
var gpChildren = grandparent.querySelectorAll('*');
|
||||
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
|
||||
var childText = (gpChildren[c].textContent || '').trim();
|
||||
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
|
||||
info._debug_rating_context.push('GP_CHILD: ' + childText);
|
||||
}
|
||||
}
|
||||
// Also check great-grandparent
|
||||
var ggp = grandparent.parentElement;
|
||||
if (ggp) {
|
||||
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
|
||||
}
|
||||
}
|
||||
// Check siblings
|
||||
var nextSib = parent.nextElementSibling;
|
||||
if (nextSib) {
|
||||
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
|
||||
|
||||
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
|
||||
// Google Maps uses aria-label="27 reviews" for accessibility
|
||||
info._debug_aria = [];
|
||||
info._debug_all_numeric = [];
|
||||
if (!info.total_reviews) {
|
||||
const allElements = document.querySelectorAll('span, div, a');
|
||||
for (let elem of allElements) {
|
||||
const text = elem.textContent || '';
|
||||
if (text.length < 100) { // Skip very long text blocks
|
||||
const match = text.match(numberPattern);
|
||||
var ariaElems = document.querySelectorAll('[aria-label]');
|
||||
for (var i = 0; i < ariaElems.length; i++) {
|
||||
var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
|
||||
// Collect all labels containing "review"
|
||||
if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
|
||||
info._debug_aria.push(ariaLabel);
|
||||
}
|
||||
// Collect all labels starting with a digit
|
||||
if (/^[0-9]/.test(ariaLabel)) {
|
||||
info._debug_all_numeric.push(ariaLabel);
|
||||
}
|
||||
var count = parseReviewCount(ariaLabel);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = ariaLabel;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DEBUG: Find all text with parenthetical numbers like "(27)"
|
||||
info._debug_parens = [];
|
||||
info._debug_short_text = []; // All short text with numbers
|
||||
var allSpans = document.querySelectorAll('span, div, a, button');
|
||||
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
|
||||
var spanText = allSpans[j].textContent || '';
|
||||
// Capture parenthetical numbers
|
||||
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
|
||||
info._debug_parens.push(spanText.trim());
|
||||
}
|
||||
// Capture ALL short text containing numbers (for debugging)
|
||||
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
|
||||
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
|
||||
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
|
||||
info._debug_short_text.push(cleaned);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
|
||||
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
|
||||
if (!info.total_reviews) {
|
||||
var allElems = document.querySelectorAll('*');
|
||||
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
|
||||
var elem = allElems[k];
|
||||
// Skip if has children (we want leaf nodes only)
|
||||
if (elem.children.length > 0) continue;
|
||||
var txt = (elem.textContent || '').trim();
|
||||
// Look for short text with both numbers and "review" word
|
||||
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
|
||||
var match = txt.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
if (num > 0 && num < 1000000) {
|
||||
info.total_reviews = num;
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'LEAF: ' + txt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Try tabs (for business detail pages)
|
||||
// DEBUG: Collect all tab names
|
||||
info._debug_tabs = [];
|
||||
const tabs = document.querySelectorAll('[role="tab"]');
|
||||
for (let t = 0; t < tabs.length; t++) {
|
||||
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
|
||||
}
|
||||
|
||||
// DEBUG: Collect all buttons with text (might contain review count)
|
||||
info._debug_buttons = [];
|
||||
const buttons = document.querySelectorAll('button');
|
||||
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
|
||||
var btnText = (buttons[b].textContent || '').trim();
|
||||
if (btnText && btnText.length < 40) {
|
||||
info._debug_buttons.push(btnText.substring(0, 40));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
|
||||
if (!info.total_reviews) {
|
||||
const tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (let tab of tabs) {
|
||||
const text = tab.textContent || '';
|
||||
let match = text.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
const text = (tab.textContent || '').trim();
|
||||
// Look for "Reviews" tab with count
|
||||
if (text.toLowerCase().includes('review')) {
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'TAB: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
match = text.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
|
||||
// Google Maps shows "27 reviews" as heading text in the reviews panel
|
||||
if (!info.total_reviews) {
|
||||
// Look for headings containing review count
|
||||
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
|
||||
for (var h = 0; h < headings.length; h++) {
|
||||
var hText = (headings[h].textContent || '').trim();
|
||||
if (/review/i.test(hText)) {
|
||||
var match = hText.match(/([0-9][0-9,]*)/);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'HEADING: ' + hText;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 2.4: Look for sort button area which often has total count
|
||||
// The sort dropdown area displays "Sort: Newest" and total reviews
|
||||
if (!info.total_reviews) {
|
||||
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
|
||||
for (var s = 0; s < sortBtns.length; s++) {
|
||||
var parent = sortBtns[s].parentElement;
|
||||
if (parent) {
|
||||
var pText = (parent.textContent || '').trim();
|
||||
if (/review/i.test(pText)) {
|
||||
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
|
||||
if (match) {
|
||||
var count = parseInt(match[1].replace(/,/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 3: Elements with semantic review-related attributes
|
||||
if (!info.total_reviews) {
|
||||
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
|
||||
for (let elem of reviewLinks) {
|
||||
const text = (elem.textContent || '').trim();
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 4: Try aria-labels
|
||||
// PRIORITY 4: Look for standalone review count text near rating
|
||||
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
|
||||
if (!info.total_reviews) {
|
||||
const elements = document.querySelectorAll('[aria-label]');
|
||||
for (let elem of elements) {
|
||||
const ariaLabel = elem.getAttribute('aria-label') || '';
|
||||
let match = ariaLabel.match(reviewPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
const allElements = document.querySelectorAll('span, a');
|
||||
for (let elem of allElements) {
|
||||
// Get direct text content only (not nested children)
|
||||
const text = (elem.textContent || '').trim();
|
||||
// Skip if too long (likely contains other content)
|
||||
if (text.length > 50) continue;
|
||||
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
|
||||
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
|
||||
|
||||
const count = parseReviewCount(text);
|
||||
if (count && count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
break;
|
||||
}
|
||||
match = ariaLabel.match(numberPattern);
|
||||
if (match) {
|
||||
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 5: Parse from visible page text using regex on short text blocks
|
||||
if (!info.total_reviews) {
|
||||
const walker = document.createTreeWalker(
|
||||
document.body,
|
||||
NodeFilter.SHOW_TEXT,
|
||||
null,
|
||||
false
|
||||
);
|
||||
while (walker.nextNode()) {
|
||||
const text = walker.currentNode.textContent.trim();
|
||||
if (text.length >= 5 && text.length <= 30) {
|
||||
// Match "27 reviews" but not "4.927 reviews"
|
||||
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
|
||||
if (match) {
|
||||
const count = parseInt(match[1].replace(/[,]/g, ''));
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'WALKER: ' + text;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
|
||||
if (!info.total_reviews) {
|
||||
var scripts = document.querySelectorAll('script');
|
||||
for (var sc = 0; sc < scripts.length; sc++) {
|
||||
var scriptText = scripts[sc].textContent || '';
|
||||
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
|
||||
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
|
||||
if (jsonMatch) {
|
||||
var count = parseInt(jsonMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_SCRIPT';
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Also look for review count in Google's data format like [\"27 reviews\"]
|
||||
if (!info.total_reviews) {
|
||||
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
|
||||
if (dataMatch) {
|
||||
var count = parseInt(dataMatch[1]);
|
||||
if (count > 0 && count < 100000) {
|
||||
info.total_reviews = count;
|
||||
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
|
||||
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
|
||||
log.info(f"Business card extracted: name={business_info.get('name')}, "
|
||||
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
|
||||
# Debug: log what aria-labels were found
|
||||
if business_info.get('_debug_aria'):
|
||||
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
|
||||
if business_info.get('_debug_matched'):
|
||||
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
|
||||
# Also log all numeric aria-labels (potential review counts)
|
||||
if business_info.get('_debug_all_numeric'):
|
||||
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
|
||||
# Log any text with parenthetical numbers like "(27)"
|
||||
if business_info.get('_debug_parens'):
|
||||
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
|
||||
# Log all short text containing numbers (for debugging review count detection)
|
||||
if business_info.get('_debug_short_text'):
|
||||
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
|
||||
# Log the context around the rating element
|
||||
if business_info.get('_debug_rating_context'):
|
||||
for ctx in business_info.get('_debug_rating_context', []):
|
||||
log.info(f"DEBUG: Rating context: {ctx}")
|
||||
# Log what tabs exist on the page
|
||||
if business_info.get('_debug_tabs'):
|
||||
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
|
||||
else:
|
||||
log.info(f"DEBUG: No tabs found on page")
|
||||
# Log buttons (might contain review count)
|
||||
if business_info.get('_debug_buttons'):
|
||||
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
|
||||
|
||||
result = {
|
||||
"name": business_info.get('name'),
|
||||
|
||||
Reference in New Issue
Block a user