Remove old scraper files - consolidate to scraper_clean

Production (api_server_production.py) only uses:
- modules/scraper_clean.py - main scraping logic
- modules/fast_scraper.py - validation helpers
- modules/database.py, webhooks.py, health_checks.py, chrome_pool.py

Deletes 33 unused Python files, including:
- Old API server (api_server.py)
- 14 start*.py experimental scrapers
- 7 *_scraper.py variants
- Old modules: scraper.py, api_interceptor.py, job_manager.py, cli.py
- Various debug/test/utility scripts

Removes ~11,000 lines of unmaintained code.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-23 17:25:00 +00:00
parent 80e7771c00
commit 8ccf72a489
37 changed files with 859 additions and 11116 deletions

View File

@@ -1140,13 +1140,30 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
else:
log.info(f"[PROFILE] Using pooled driver (0.00s)")
# Force English locale for consistent parsing
# Force English locale AND US region for consistent parsing/results
# This helps avoid geolocation-based variations in Google Maps results
if 'hl=' in url:
url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en')
else:
separator = '&' if '?' in url else '?'
url = f"{url}{separator}hl=en"
# Add US region parameter if not present
if 'gl=' not in url:
url = f"{url}&gl=us"
# Set Chrome geolocation to US (Boston, MA) using CDP
# This ensures Google Maps shows US results regardless of server location
try:
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
'latitude': 42.3601,
'longitude': -71.0589,
'accuracy': 100
})
log.info("Set geolocation to US (Boston, MA)")
except Exception as e:
log.warning(f"Could not set geolocation: {e}")
log.info(f"Loading Google Maps page...")
t0 = timing_module.time()
driver.get(url)
@@ -1164,18 +1181,23 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if 'aceptar todo' in btn_text or 'accept all' in btn_text:
if 'aceptar todo' in btn_text or 'accept all' in btn_text or 'reject all' in btn_text:
log.info(f"Clicking GDPR consent: {btn.text}")
btn.click()
time.sleep(1) # Reduced from 2s
time.sleep(1)
break
else:
if len(form_btns) >= 2:
log.info("Using fallback: clicking second form button")
form_btns[1].click()
time.sleep(1) # Reduced from 2s
time.sleep(1)
except Exception as e:
log.warning(f"GDPR consent handling failed: {e}")
# After GDPR consent, reload the original URL to ensure proper page state
log.info(f"Reloading original URL after GDPR consent...")
driver.get(url)
time.sleep(1)
log.info(f"[PROFILE] GDPR consent handling: {timing_module.time() - t0:.2f}s")
else:
log.info(f"[PROFILE] No GDPR consent page (0.00s)")
@@ -1197,14 +1219,77 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
try:
log.info("Waiting for Google Maps content to load...")
wait = WebDriverWait(driver, 10)
# Wait for basic page structure (h1 or heading)
wait.until(
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1.DUwDvf, h1, [role="article"], [data-review-id]')
lambda d: d.find_elements(By.CSS_SELECTOR, 'h1, [role="heading"]')
)
log.info("Google Maps content loaded successfully")
log.info("Basic page structure loaded")
# Wait for page to settle - search URLs redirect to place URLs
# which triggers additional content loading
time.sleep(2)
# Wait specifically for review count element (aria-label ending with "reviews")
# This is the most reliable indicator that the business detail is loaded
try:
WebDriverWait(driver, 5).until(
lambda d: d.execute_script("""
var elems = document.querySelectorAll('[aria-label]');
for (var i = 0; i < elems.length; i++) {
var label = elems[i].getAttribute('aria-label') || '';
if (/^[0-9]+ reviews?$/.test(label)) return true;
}
return false;
""")
)
log.info("Review count element loaded")
except:
# Fallback: Try clicking Reviews tab or rating stars to expose the review count
log.info("Review count wait timeout, trying to click Reviews/rating...")
try:
# Try 1: Click Reviews tab (if exists)
clicked = driver.execute_script("""
var tabs = document.querySelectorAll('[role="tab"]');
for (var i = 0; i < tabs.length; i++) {
var txt = (tabs[i].textContent || '').toLowerCase();
if (txt.includes('review')) {
tabs[i].click();
return 'tab';
}
}
// Try 2: Click the rating stars element (often links to reviews)
var stars = document.querySelector('[role="img"][aria-label*="star"]');
if (stars) {
var parent = stars.parentElement;
if (parent && parent.tagName.toLowerCase() === 'button') {
parent.click();
return 'stars_button';
}
stars.click();
return 'stars';
}
// Try 3: Click "Write a review" or any review-related button
var btns = document.querySelectorAll('button[aria-label*="review" i]');
for (var b = 0; b < btns.length; b++) {
var label = btns[b].getAttribute('aria-label') || '';
if (!/write/i.test(label) && /review/i.test(label)) {
btns[b].click();
return 'review_btn: ' + label;
}
}
return 'none';
""")
log.info(f"Clicked: {clicked}")
time.sleep(2) # Wait for reviews panel to load
except Exception as e:
log.warning(f"Click attempt failed: {e}")
except Exception as e:
log.warning(f"Timeout waiting for Maps content: {e}")
time.sleep(0.5) # Minimal fallback wait
time.sleep(2) # Fallback wait
log.info(f"[PROFILE] Smart wait for content: {timing_module.time() - t0:.2f}s")
log.info(f"DEBUG: Current URL: {driver.current_url[:100]}...")
log.info(f"DEBUG: Page title: {driver.title}")
# Extract business card information using JavaScript
t0 = timing_module.time()
@@ -1216,85 +1301,166 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
total_reviews: null
};
// Extract business name
const nameSelectors = [
'h1.DUwDvf',
'[role="main"] h1',
'h1.fontHeadlineLarge'
];
// ============ ROBUST EXTRACTION (no class names, aria/data attributes preferred) ============
for (const selector of nameSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.name = elem.textContent.trim();
break;
}
}
// Helper: Parse review count from text, handling multiple formats
function parseReviewCount(text) {
if (!text) return null;
// Extract address
const addressSelectors = [
'button[data-item-id*="address"]',
'[data-item-id*="address"]',
'div[aria-label*="Address"]'
];
for (const selector of addressSelectors) {
const elem = document.querySelector(selector);
if (elem && elem.textContent) {
info.address = elem.textContent.trim();
break;
}
}
// Extract rating (look for aria-label like "4.2 stars")
const ratingElem = document.querySelector('[role="img"][aria-label*="star"]');
if (ratingElem) {
const ariaLabel = ratingElem.getAttribute('aria-label');
const match = ariaLabel.match(/([0-9.]+)/);
// Pattern 1: Exact "N reviews" format (aria-labels, clean text)
// Matches: "27 reviews", "1,234 reviews", "27 reseñas", "27 avis"
var match = text.match(/^([0-9][0-9,.]*)[ ]*(?:reviews?|reseñas?|avis|bewertungen?|recensioni?)$/i);
if (match) {
info.rating = parseFloat(match[1]);
return parseInt(match[1].replace(/[,. ]/g, ''));
}
}
// Extract total review count
const reviewPattern = /\\((\\d[\\d,\\.]*)\\)/;
const numberPattern = /(\\d[\\d,\\.]*)\\s*(?:review|reseña|avis)/i;
// Pattern 2: "(N)" format often used in tabs like "Reviews (27)"
match = text.match(/[(]([0-9][0-9,.]*)[)]$/);
if (match) {
return parseInt(match[1].replace(/[,. ]/g, ''));
}
// PRIORITY 1: Look for review count in search results sidebar/panel
// This is where "152 reviews" appears on search results
const searchPanelSelectors = [
'a[href*="reviews"]', // Link with "reviews" in href
'button[jsaction*="reviews"]', // Button related to reviews
'div[role="link"]', // Clickable divs that might contain review info
];
for (const selector of searchPanelSelectors) {
const elements = document.querySelectorAll(selector);
for (let elem of elements) {
const text = elem.textContent || '';
const match = text.match(numberPattern);
// Pattern 3: "N reviews" anywhere in short text (< 30 chars to avoid false positives)
if (text.length < 30) {
match = text.match(/([0-9][0-9,]*)[ ]+(?:reviews?|reseñas?|avis)/i);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
break;
}
return parseInt(match[1].replace(/[,. ]/g, ''));
}
}
if (info.total_reviews) break;
return null;
}
// PRIORITY 2: Look in any span/div that contains the word "review"
// ============ EXTRACT BUSINESS NAME ============
// Priority: h1 (semantic), then role="heading"
const h1 = document.querySelector('h1');
if (h1 && h1.textContent) {
info.name = h1.textContent.trim();
}
if (!info.name) {
const heading = document.querySelector('[role="heading"][aria-level="1"]');
if (heading && heading.textContent) {
info.name = heading.textContent.trim();
}
}
// ============ EXTRACT ADDRESS ============
// Priority: data-item-id (semantic), then aria-label containing "address"
const addressElem = document.querySelector('[data-item-id*="address"]');
if (addressElem && addressElem.textContent) {
info.address = addressElem.textContent.trim();
}
if (!info.address) {
const ariaAddress = document.querySelector('[aria-label*="ddress"]');
if (ariaAddress && ariaAddress.textContent) {
info.address = ariaAddress.textContent.trim();
}
}
// ============ EXTRACT RATING ============
// Priority: aria-label containing "star" on role="img" elements
info._debug_rating_context = [];
const ratingElems = document.querySelectorAll('[role="img"][aria-label*="star"]');
for (let elem of ratingElems) {
const ariaLabel = elem.getAttribute('aria-label') || '';
// Match "4.9 stars" or "4,9 stars" (European format)
const match = ariaLabel.match(/([0-9][.,]?[0-9]?)\\s*star/i);
if (match) {
info.rating = parseFloat(match[1].replace(',', '.'));
// DEBUG: Capture parent/sibling context to find review count
var parent = elem.parentElement;
if (parent) {
info._debug_rating_context.push('PARENT: ' + (parent.textContent || '').trim().substring(0, 100));
var grandparent = parent.parentElement;
if (grandparent) {
info._debug_rating_context.push('GRANDPARENT: ' + (grandparent.textContent || '').trim().substring(0, 100));
// Check all children of grandparent for review count
var gpChildren = grandparent.querySelectorAll('*');
for (var c = 0; c < Math.min(gpChildren.length, 30); c++) {
var childText = (gpChildren[c].textContent || '').trim();
if (childText.length > 0 && childText.length < 20 && /[0-9]/.test(childText)) {
info._debug_rating_context.push('GP_CHILD: ' + childText);
}
}
// Also check great-grandparent
var ggp = grandparent.parentElement;
if (ggp) {
info._debug_rating_context.push('GREAT_GP: ' + (ggp.textContent || '').trim().substring(0, 150));
}
}
// Check siblings
var nextSib = parent.nextElementSibling;
if (nextSib) {
info._debug_rating_context.push('NEXT_SIB: ' + (nextSib.textContent || '').trim().substring(0, 100));
}
}
break;
}
}
// ============ EXTRACT TOTAL REVIEWS (ROBUST, ARIA-FIRST) ============
// PRIORITY 1: aria-label with exact "N reviews" format (most reliable)
// Google Maps uses aria-label="27 reviews" for accessibility
info._debug_aria = [];
info._debug_all_numeric = [];
if (!info.total_reviews) {
const allElements = document.querySelectorAll('span, div, a');
for (let elem of allElements) {
const text = elem.textContent || '';
if (text.length < 100) { // Skip very long text blocks
const match = text.match(numberPattern);
var ariaElems = document.querySelectorAll('[aria-label]');
for (var i = 0; i < ariaElems.length; i++) {
var ariaLabel = ariaElems[i].getAttribute('aria-label') || '';
// Collect all labels containing "review"
if (ariaLabel.toLowerCase().indexOf('review') >= 0) {
info._debug_aria.push(ariaLabel);
}
// Collect all labels starting with a digit
if (/^[0-9]/.test(ariaLabel)) {
info._debug_all_numeric.push(ariaLabel);
}
var count = parseReviewCount(ariaLabel);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = ariaLabel;
break;
}
}
}
// DEBUG: Find all text with parenthetical numbers like "(27)"
info._debug_parens = [];
info._debug_short_text = []; // All short text with numbers
var allSpans = document.querySelectorAll('span, div, a, button');
for (var j = 0; j < Math.min(allSpans.length, 500); j++) {
var spanText = allSpans[j].textContent || '';
// Capture parenthetical numbers
if (spanText.length < 20 && /[(][0-9]+[)]/.test(spanText)) {
info._debug_parens.push(spanText.trim());
}
// Capture ALL short text containing numbers (for debugging)
if (spanText.length > 0 && spanText.length < 30 && /[0-9]+/.test(spanText)) {
var cleaned = spanText.trim().replace(/\\s+/g, ' ');
if (cleaned && info._debug_short_text.indexOf(cleaned) < 0) {
info._debug_short_text.push(cleaned);
}
}
}
// PRIORITY 2.5: Look for text containing numbers near "review" word anywhere on page
// This catches formats like "27 reviews", "reviews: 27", etc. that aren't in aria-labels
if (!info.total_reviews) {
var allElems = document.querySelectorAll('*');
for (var k = 0; k < Math.min(allElems.length, 1000); k++) {
var elem = allElems[k];
// Skip if has children (we want leaf nodes only)
if (elem.children.length > 0) continue;
var txt = (elem.textContent || '').trim();
// Look for short text with both numbers and "review" word
if (txt.length >= 3 && txt.length < 30 && /review/i.test(txt)) {
var match = txt.match(/([0-9][0-9,]*)/);
if (match) {
const num = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
if (num > 0 && num < 1000000) {
info.total_reviews = num;
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'LEAF: ' + txt;
break;
}
}
@@ -1302,38 +1468,167 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
}
}
// PRIORITY 3: Try tabs (for business detail pages)
// DEBUG: Collect all tab names
info._debug_tabs = [];
const tabs = document.querySelectorAll('[role="tab"]');
for (let t = 0; t < tabs.length; t++) {
info._debug_tabs.push((tabs[t].textContent || '').trim().substring(0, 30));
}
// DEBUG: Collect all buttons with text (might contain review count)
info._debug_buttons = [];
const buttons = document.querySelectorAll('button');
for (let b = 0; b < Math.min(buttons.length, 20); b++) {
var btnText = (buttons[b].textContent || '').trim();
if (btnText && btnText.length < 40) {
info._debug_buttons.push(btnText.substring(0, 40));
}
}
// PRIORITY 2: Tabs with role="tab" (Reviews tab often shows count)
if (!info.total_reviews) {
const tabs = document.querySelectorAll('button[role="tab"]');
for (let tab of tabs) {
const text = tab.textContent || '';
let match = text.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
const text = (tab.textContent || '').trim();
// Look for "Reviews" tab with count
if (text.toLowerCase().includes('review')) {
const count = parseReviewCount(text);
if (count && count > 0) {
info.total_reviews = count;
info._debug_matched = 'TAB: ' + text;
break;
}
}
match = text.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
}
}
// PRIORITY 2.3: Reviews panel header (after clicking Reviews tab)
// Google Maps shows "27 reviews" as heading text in the reviews panel
if (!info.total_reviews) {
// Look for headings containing review count
var headings = document.querySelectorAll('h1, h2, [role="heading"]');
for (var h = 0; h < headings.length; h++) {
var hText = (headings[h].textContent || '').trim();
if (/review/i.test(hText)) {
var match = hText.match(/([0-9][0-9,]*)/);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'HEADING: ' + hText;
break;
}
}
}
}
}
// PRIORITY 2.4: Look for sort button area which often has total count
// The sort dropdown area displays "Sort: Newest" and total reviews
if (!info.total_reviews) {
var sortBtns = document.querySelectorAll('button[data-value="sort"], [aria-label*="Sort"]');
for (var s = 0; s < sortBtns.length; s++) {
var parent = sortBtns[s].parentElement;
if (parent) {
var pText = (parent.textContent || '').trim();
if (/review/i.test(pText)) {
var match = pText.match(/([0-9][0-9,]*)\\s*review/i);
if (match) {
var count = parseInt(match[1].replace(/,/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'SORT_AREA: ' + pText.substring(0, 50);
break;
}
}
}
}
}
}
// PRIORITY 3: Elements with semantic review-related attributes
if (!info.total_reviews) {
const reviewLinks = document.querySelectorAll('a[href*="review"], button[aria-label*="review" i]');
for (let elem of reviewLinks) {
const text = (elem.textContent || '').trim();
const count = parseReviewCount(text);
if (count && count > 0) {
info.total_reviews = count;
break;
}
}
}
// PRIORITY 4: Try aria-labels
// PRIORITY 4: Look for standalone review count text near rating
// Find elements that contain ONLY "N reviews" pattern (not concatenated with rating)
if (!info.total_reviews) {
const elements = document.querySelectorAll('[aria-label]');
for (let elem of elements) {
const ariaLabel = elem.getAttribute('aria-label') || '';
let match = ariaLabel.match(reviewPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
const allElements = document.querySelectorAll('span, a');
for (let elem of allElements) {
// Get direct text content only (not nested children)
const text = (elem.textContent || '').trim();
// Skip if too long (likely contains other content)
if (text.length > 50) continue;
// Skip if it looks like rating+reviews concatenated (e.g., "4.927 reviews")
if (/^[0-9]\\.[0-9]+[0-9]/.test(text)) continue;
const count = parseReviewCount(text);
if (count && count > 0 && count < 100000) {
info.total_reviews = count;
break;
}
match = ariaLabel.match(numberPattern);
if (match) {
info.total_reviews = parseInt(match[1].replace(/[,\\.\\s]/g, ''));
break;
}
}
// PRIORITY 5: Parse from visible page text using regex on short text blocks
if (!info.total_reviews) {
const walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
null,
false
);
while (walker.nextNode()) {
const text = walker.currentNode.textContent.trim();
if (text.length >= 5 && text.length <= 30) {
// Match "27 reviews" but not "4.927 reviews"
const match = text.match(/(?:^|[^0-9.,])([0-9,]+)\\s+(?:reviews?|reseñas?)/i);
if (match) {
const count = parseInt(match[1].replace(/[,]/g, ''));
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'WALKER: ' + text;
break;
}
}
}
}
}
// PRIORITY 6: Extract from embedded JSON in page source (Google embeds data in scripts)
if (!info.total_reviews) {
var scripts = document.querySelectorAll('script');
for (var sc = 0; sc < scripts.length; sc++) {
var scriptText = scripts[sc].textContent || '';
// Look for patterns like "user_reviews":{"count":27} or reviews_count":27
var jsonMatch = scriptText.match(/"(?:user_reviews|reviews?)(?:_count)?"\s*[:\{]\s*"?(\d+)"?/i);
if (jsonMatch) {
var count = parseInt(jsonMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_SCRIPT';
break;
}
}
// Also look for review count in Google's data format like [\"27 reviews\"]
if (!info.total_reviews) {
var dataMatch = scriptText.match(/"(\d+)\s+reviews?"/i);
if (dataMatch) {
var count = parseInt(dataMatch[1]);
if (count > 0 && count < 100000) {
info.total_reviews = count;
info._debug_matched = 'JSON_DATA: ' + dataMatch[0];
break;
}
}
}
}
}
@@ -1348,6 +1643,32 @@ def get_business_card_info(url: str, headless: bool = True, driver=None, return_
log.info(f"[PROFILE] *** TOTAL GET_BUSINESS_CARD TIME: {total_time:.2f}s ***")
log.info(f"Business card extracted: name={business_info.get('name')}, "
f"rating={business_info.get('rating')}, reviews={business_info.get('total_reviews')}")
# Debug: log what aria-labels were found
if business_info.get('_debug_aria'):
log.info(f"DEBUG: Found {len(business_info.get('_debug_aria'))} aria-labels with 'review': {business_info.get('_debug_aria')[:5]}")
if business_info.get('_debug_matched'):
log.info(f"DEBUG: Matched aria-label: {business_info.get('_debug_matched')}")
# Also log all numeric aria-labels (potential review counts)
if business_info.get('_debug_all_numeric'):
log.info(f"DEBUG: Numeric aria-labels: {business_info.get('_debug_all_numeric')[:10]}")
# Log any text with parenthetical numbers like "(27)"
if business_info.get('_debug_parens'):
log.info(f"DEBUG: Parenthetical text: {business_info.get('_debug_parens')[:5]}")
# Log all short text containing numbers (for debugging review count detection)
if business_info.get('_debug_short_text'):
log.info(f"DEBUG: Short text with numbers: {business_info.get('_debug_short_text')[:15]}")
# Log the context around the rating element
if business_info.get('_debug_rating_context'):
for ctx in business_info.get('_debug_rating_context', []):
log.info(f"DEBUG: Rating context: {ctx}")
# Log what tabs exist on the page
if business_info.get('_debug_tabs'):
log.info(f"DEBUG: Page tabs: {business_info.get('_debug_tabs')}")
else:
log.info(f"DEBUG: No tabs found on page")
# Log buttons (might contain review count)
if business_info.get('_debug_buttons'):
log.info(f"DEBUG: Buttons: {business_info.get('_debug_buttons')[:10]}")
result = {
"name": business_info.get('name'),