Add robust structural pattern matching and early no-reviews detection

BREAKING IMPROVEMENTS:

1. Early Detection for No Reviews:
   - Check for "no reviews" messages (11 phrases covering 8 languages) before scraping
   - Detect disabled reviews tabs and aria-labels with 0 reviews
   - Return early with success when no reviews exist (saves time)
   - Prevents wasted scraping attempts on businesses with no reviews

2. Structural Pattern Matching (Class-Agnostic):
   - STRATEGY 1: Try known CSS selectors (div.jftiEf.fontBodyMedium, etc.)
   - STRATEGY 2: Structural matching - find containers with review-like structure
     * Looks for elements containing: author + rating + text + date
     * Counts elements with 3+ review indicators (robust, works across layouts)
   - STRATEGY 3: Use role="article" with review content detection
   - Falls back through strategies automatically

3. Less Script-Dependent Selectors:
   - Uses aria-label attributes (more stable than CSS classes)
   - Uses role attributes (semantic HTML)
   - Searches for structural patterns (author img + rating span + text span)
   - Works across different Google Maps page layouts and languages

4. Frontend Improvement:
   - Hide "Open Analytics Dashboard" button when reviews_count is 0
   - Only show action buttons for completed jobs with reviews

TECHNICAL DETAILS:

Structural Matching Logic:
- Scans all divs for review indicators:
  * hasAuthor: aria-label containing "photo", or img with photo/avatar in src
  * hasRating: aria-label containing "star" or "rating", or span with role="img"
  * hasText: span with more than 20 characters
  * hasDate: text matching date patterns (day/week/month/year)
- Element is a review if it has 3+ of these indicators

Early Detection Patterns:
- Checks page text for: "no reviews yet", "be the first to review", etc.
- Checks for "0 reviews" patterns in text and aria-labels
- Checks if reviews tab is disabled or aria-disabled

Benefits:
- Works on Lithuanian hospital page (was getting 0/271 reviews)
- Handles regional Google Maps variations automatically
- Faster exit for businesses with no reviews
- More reliable across Google Maps UI updates
- Better UX: no empty analytics dashboard for 0-review jobs

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:52:39 +00:00
parent faa0704737
commit c8c24ae483
2 changed files with 231 additions and 19 deletions

View File

@@ -17,6 +17,92 @@ from selenium.common.exceptions import TimeoutException
log = logging.getLogger(__name__)
def check_no_reviews_early(driver) -> tuple[bool, str]:
    """
    Early detection for 'no reviews available' scenarios.

    Runs three checks against the currently loaded page, in order:
      1. Known "no reviews" phrases (several languages) in the page text.
      2. An explicit zero review count, either in the page text or in an
         aria-label.
      3. A reviews tab that is disabled / aria-disabled.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in the caller shown in this file).

    Returns:
        ``(has_no_reviews, reason)``. ``reason`` is a human-readable string
        when ``has_no_reviews`` is True and ``""`` otherwise.

    Any exception raised while probing the page is logged and reported as
    ``(False, "")`` so that the caller falls through to normal scraping.
    """
    # Local import so this function does not depend on the module's import list.
    import re

    # Common "no reviews" messages in multiple languages (compared lower-cased).
    no_review_patterns = [
        'no reviews yet',
        'be the first to review',
        "there aren't any reviews",
        'no hay reseñas',
        'sin reseñas',
        "pas encore d'avis",
        'noch keine bewertungen',
        'nessuna recensione',
        'まだレビューがありません',
        'sem avaliações',
        'belum ada ulasan',
    ]

    # A standalone "0" followed by a word for "review". The lookbehind rejects
    # a zero that is merely the tail of a larger number or of a rating
    # ("10 reviews", "1,000 reviews", "4.0\nReviews"), which previously made
    # businesses WITH reviews look like they had none.
    zero_count_patterns = [
        re.compile(
            r"(?<![\d.,])0\s*(?:review|reseña|avis|recension|bewertung|レビュー)",
            re.IGNORECASE,
        ),
        re.compile(r"\(0\)"),
    ]

    # Same idea for aria-labels: "0 review..." not preceded by a digit, or the
    # word "no" (not e.g. "casino") followed by "review".
    zero_label_pattern = re.compile(
        r"(?<![\d.,])0\s*review|(?<!\w)no\s+review",
        re.IGNORECASE,
    )

    try:
        # Get page text
        page_text = driver.execute_script(
            "return document.body.innerText.toLowerCase();"
        ) or ""

        # Check for "no reviews" messages
        for pattern in no_review_patterns:
            if pattern in page_text:
                return True, f"Found 'no reviews' message: '{pattern}'"

        # Check if review count is explicitly 0 in the visible text
        if any(rx.search(page_text) for rx in zero_count_patterns):
            return True, 'Found 0 reviews indicator'

        # Check for aria-labels indicating no reviews. The CSS substring
        # selector is only a coarse pre-filter (it also hits "10 reviews"),
        # so every candidate label is re-validated with the anchored regex.
        candidate_labels = driver.execute_script("""
            const nodes = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]');
            return Array.from(nodes, (node) => node.getAttribute('aria-label') || '');
        """) or []
        for label in candidate_labels:
            if isinstance(label, str) and zero_label_pattern.search(label):
                return True, 'Found aria-label with 0 reviews'

        # Check if reviews tab is disabled or not clickable
        reviews_disabled = driver.execute_script("""
            const tabs = document.querySelectorAll('button[role="tab"]');
            for (let tab of tabs) {
                const text = (tab.textContent || '').toLowerCase();
                const aria = (tab.getAttribute('aria-label') || '').toLowerCase();
                if (text.includes('review') || aria.includes('review')) {
                    if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') {
                        return 'Reviews tab is disabled';
                    }
                }
            }
            return null;
        """)
        if reviews_disabled:
            return True, reviews_disabled

        return False, ""
    except Exception as e:
        # Best-effort probe: never let detection failures abort the scrape.
        log.warning(f"Error in early no-reviews detection: {e}")
        return False, ""
def extract_total_review_count(driver) -> Optional[int]:
"""
Extract the total number of reviews from the Google Maps page.
@@ -180,27 +266,78 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
extract_script = """
const reviews = [];
// Try multiple selectors to find review elements (handles different page structures)
const selectors = [
'div.jftiEf.fontBodyMedium', // Most common
'div.jftiEf', // Without font class
'div[data-review-id]', // With review ID attribute
'div[jsaction*="review"]', // Elements with review actions
'[role="article"] div.fontBodyMedium' // Articles with body text
// ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching
let elements = null;
// STRATEGY 1: Try known CSS selectors (fast path)
const knownSelectors = [
'div.jftiEf.fontBodyMedium',
'div.jftiEf',
'div[data-review-id]',
'div[jsaction*="review"]'
];
let elements = null;
for (let selector of selectors) {
for (let selector of knownSelectors) {
const found = document.querySelectorAll(selector);
if (found.length > 0) {
elements = found;
console.log('Found', found.length, 'reviews using selector:', selector);
console.log('Found', found.length, 'reviews using known selector:', selector);
break;
}
}
// STRATEGY 2: Structural matching for unknown page layouts
if (!elements || elements.length === 0) {
console.warn('No review elements found with any selector');
console.log('Known selectors failed, trying structural matching...');
// Find all divs that LOOK like reviews (have review structure)
const allDivs = document.querySelectorAll('div');
const reviewElements = [];
for (let div of allDivs) {
// Skip if too small
if (div.children.length < 2) continue;
// Check for review indicators
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i);
// Must have at least author, rating, and text to be a review
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
if (indicators >= 3) {
reviewElements.push(div);
}
}
if (reviewElements.length > 0) {
elements = reviewElements;
console.log('Found', reviewElements.length, 'reviews using structural matching');
}
}
// STRATEGY 3: Try role="article" as last resort
if (!elements || elements.length === 0) {
const articles = document.querySelectorAll('[role="article"]');
const validArticles = [];
for (let article of articles) {
const hasRating = article.querySelector('[aria-label*="star" i]');
const hasText = article.textContent.length > 30;
if (hasRating && hasText) {
validArticles.push(article);
}
}
if (validArticles.length > 0) {
elements = validArticles;
console.log('Found', validArticles.length, 'reviews using role=article');
}
}
if (!elements || elements.length === 0) {
console.warn('No review elements found with any strategy');
return [];
}
@@ -496,9 +633,34 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
# Wait for reviews section to load
time.sleep(2)
# EARLY DETECTION: Check if there are no reviews before attempting to scrape
no_reviews, reason = check_no_reviews_early(driver)
if no_reviews:
log.info(f"Early detection: No reviews available. Reason: {reason}")
return {
"reviews": [],
"count": 0,
"total_reviews": 0,
"time": time.time() - start_time,
"success": True,
"message": f"No reviews available: {reason}"
}
# Extract total review count from the page
total_reviews = extract_total_review_count(driver)
# Double-check: If extracted count is 0, return early
if total_reviews == 0:
log.info("Total review count is 0, skipping scraping")
return {
"reviews": [],
"count": 0,
"total_reviews": 0,
"time": time.time() - start_time,
"success": True,
"message": "Business has 0 reviews"
}
# Report initial progress with total count
if progress_callback and total_reviews:
try:
@@ -551,22 +713,72 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
driver.execute_script("window.scrollBy(0, 500);")
time.sleep(0.5)
# JavaScript function to count reviews using fallback selectors
# JavaScript function to count reviews using ROBUST structural patterns
# Instead of relying on CSS classes, we look for containers with review-like structure
count_reviews_script = """
const selectors = [
// STRATEGY 1: Try known selectors first (fast path)
const knownSelectors = [
'div.jftiEf.fontBodyMedium',
'div.jftiEf',
'div[data-review-id]',
'div[jsaction*="review"]',
'[role="article"] div.fontBodyMedium'
'div[jsaction*="review"]'
];
for (let selector of selectors) {
for (let selector of knownSelectors) {
const found = document.querySelectorAll(selector);
if (found.length > 0) {
return found.length;
}
}
return 0;
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
// Find containers that LOOK like reviews (have author + rating + text structure)
const findReviewsByStructure = () => {
const allDivs = document.querySelectorAll('div');
let reviewCount = 0;
for (let div of allDivs) {
// Skip if too small (reviews have substantial content)
if (div.children.length < 2) continue;
// Look for review indicators:
// - Has an author name (usually in a span/div with small text)
// - Has a rating (span with aria-label containing "star" or "rating")
// - Has review text (span/div with longer text content)
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i);
// If it has at least 3 of these indicators, it's likely a review
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
if (indicators >= 3) {
reviewCount++;
}
}
return reviewCount > 0 ? reviewCount : 0;
};
// STRATEGY 3: Look for role="article" with review-like content
const articles = document.querySelectorAll('[role="article"]');
if (articles.length > 0) {
let validArticles = 0;
for (let article of articles) {
// Check if article looks like a review (has rating + text)
const hasRating = article.querySelector('[aria-label*="star" i]');
const hasText = article.textContent.length > 30;
if (hasRating && hasText) {
validArticles++;
}
}
if (validArticles > 0) return validArticles;
}
// Try structural matching as last resort
const structuralCount = findReviewsByStructure();
return structuralCount;
"""
# Check if reviews are actually loading