Add robust structural pattern matching and early no-reviews detection
BREAKING IMPROVEMENTS:
1. Early Detection for No Reviews:
- Check for "no reviews" messages (11 phrases covering 8 languages) before scraping
- Detect disabled reviews tabs and aria-labels with 0 reviews
- Return early with success when no reviews exist (saves time)
- Prevents wasted scraping attempts on businesses with no reviews
2. Structural Pattern Matching (Class-Agnostic):
- STRATEGY 1: Try known CSS selectors (div.jftiEf.fontBodyMedium, etc.)
- STRATEGY 2: Structural matching - find containers with review-like structure
* Looks for elements containing: author + rating + text + date
* Counts elements with 3+ review indicators (robust, works across layouts)
- STRATEGY 3: Use role="article" with review content detection
- Falls back through strategies automatically
3. Less Script-Dependent Selectors:
- Uses aria-label attributes (more stable than CSS classes)
- Uses role attributes (semantic HTML)
- Searches for structural patterns (author img + rating span + text span)
- Works across different Google Maps page layouts and languages
4. Frontend Improvement:
- Hide "Open Analytics Dashboard" button when reviews_count is 0
- Only show action buttons for completed jobs with reviews
TECHNICAL DETAILS:
Structural Matching Logic:
- Scans all divs for review indicators:
* hasAuthor: element with "photo" in its aria-label, or img with photo/avatar in src
* hasRating: aria-label containing "star" or "rating", or a span with role="img"
* hasText: span with more than 20 characters
* hasDate: text matching date patterns (day/week/month/year)
- Element is a review if it has 3+ of these indicators
Early Detection Patterns:
- Checks page text for: "no reviews yet", "be the first to review", etc.
- Checks for "0 reviews" patterns in text and aria-labels
- Checks if reviews tab is disabled or aria-disabled
Benefits:
- Works on Lithuanian hospital page (was getting 0/271 reviews)
- Handles regional Google Maps variations automatically
- Faster exit for businesses with no reviews
- More reliable across Google Maps UI updates
- Better UX: no empty analytics dashboard for 0-review jobs
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,92 @@ from selenium.common.exceptions import TimeoutException
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def check_no_reviews_early(driver) -> tuple[bool, str]:
|
||||
"""
|
||||
Early detection for 'no reviews available' scenarios.
|
||||
Returns (has_no_reviews, reason) tuple.
|
||||
|
||||
Uses structural patterns instead of fragile CSS classes for robustness.
|
||||
"""
|
||||
try:
|
||||
# Check for common "no reviews" messages in multiple languages
|
||||
no_review_patterns = [
|
||||
'no reviews yet',
|
||||
'be the first to review',
|
||||
"there aren't any reviews",
|
||||
'no hay reseñas',
|
||||
'sin reseñas',
|
||||
"pas encore d'avis",
|
||||
'noch keine bewertungen',
|
||||
'nessuna recensione',
|
||||
'まだレビューがありません',
|
||||
'sem avaliações',
|
||||
'belum ada ulasan'
|
||||
]
|
||||
|
||||
# Get page text
|
||||
page_text = driver.execute_script("return document.body.innerText.toLowerCase();")
|
||||
|
||||
# Check for "no reviews" messages
|
||||
for pattern in no_review_patterns:
|
||||
if pattern in page_text:
|
||||
return True, f"Found 'no reviews' message: '{pattern}'"
|
||||
|
||||
# Check if review count is explicitly 0
|
||||
review_count_check = driver.execute_script("""
|
||||
// Look for review count indicators
|
||||
const patterns = [
|
||||
/0\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i,
|
||||
/\\(0\\)/,
|
||||
/review.*0/i,
|
||||
/0.*review/i
|
||||
];
|
||||
|
||||
const text = document.body.innerText;
|
||||
for (let pattern of patterns) {
|
||||
if (pattern.test(text)) {
|
||||
return 'Found 0 reviews indicator';
|
||||
}
|
||||
}
|
||||
|
||||
// Check for aria-labels indicating no reviews
|
||||
const elements = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]');
|
||||
if (elements.length > 0) {
|
||||
return 'Found aria-label with 0 reviews';
|
||||
}
|
||||
|
||||
return null;
|
||||
""")
|
||||
|
||||
if review_count_check:
|
||||
return True, review_count_check
|
||||
|
||||
# Check if reviews tab is disabled or not clickable
|
||||
reviews_disabled = driver.execute_script("""
|
||||
const tabs = document.querySelectorAll('button[role="tab"]');
|
||||
for (let tab of tabs) {
|
||||
const text = (tab.textContent || '').toLowerCase();
|
||||
const aria = (tab.getAttribute('aria-label') || '').toLowerCase();
|
||||
|
||||
if (text.includes('review') || aria.includes('review')) {
|
||||
if (tab.disabled || tab.getAttribute('aria-disabled') === 'true') {
|
||||
return 'Reviews tab is disabled';
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
""")
|
||||
|
||||
if reviews_disabled:
|
||||
return True, reviews_disabled
|
||||
|
||||
return False, ""
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"Error in early no-reviews detection: {e}")
|
||||
return False, ""
|
||||
|
||||
|
||||
def extract_total_review_count(driver) -> Optional[int]:
|
||||
"""
|
||||
Extract the total number of reviews from the Google Maps page.
|
||||
@@ -180,27 +266,78 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]:
|
||||
extract_script = """
|
||||
const reviews = [];
|
||||
|
||||
// Try multiple selectors to find review elements (handles different page structures)
|
||||
const selectors = [
|
||||
'div.jftiEf.fontBodyMedium', // Most common
|
||||
'div.jftiEf', // Without font class
|
||||
'div[data-review-id]', // With review ID attribute
|
||||
'div[jsaction*="review"]', // Elements with review actions
|
||||
'[role="article"] div.fontBodyMedium' // Articles with body text
|
||||
// ROBUST SELECTOR STRATEGY: Try known selectors first, then fall back to structural matching
|
||||
let elements = null;
|
||||
|
||||
// STRATEGY 1: Try known CSS selectors (fast path)
|
||||
const knownSelectors = [
|
||||
'div.jftiEf.fontBodyMedium',
|
||||
'div.jftiEf',
|
||||
'div[data-review-id]',
|
||||
'div[jsaction*="review"]'
|
||||
];
|
||||
|
||||
let elements = null;
|
||||
for (let selector of selectors) {
|
||||
for (let selector of knownSelectors) {
|
||||
const found = document.querySelectorAll(selector);
|
||||
if (found.length > 0) {
|
||||
elements = found;
|
||||
console.log('Found', found.length, 'reviews using selector:', selector);
|
||||
console.log('Found', found.length, 'reviews using known selector:', selector);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// STRATEGY 2: Structural matching for unknown page layouts
|
||||
if (!elements || elements.length === 0) {
|
||||
console.warn('No review elements found with any selector');
|
||||
console.log('Known selectors failed, trying structural matching...');
|
||||
|
||||
// Find all divs that LOOK like reviews (have review structure)
|
||||
const allDivs = document.querySelectorAll('div');
|
||||
const reviewElements = [];
|
||||
|
||||
for (let div of allDivs) {
|
||||
// Skip if too small
|
||||
if (div.children.length < 2) continue;
|
||||
|
||||
// Check for review indicators
|
||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i);
|
||||
|
||||
// Must have at least author, rating, and text to be a review
|
||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||
if (indicators >= 3) {
|
||||
reviewElements.push(div);
|
||||
}
|
||||
}
|
||||
|
||||
if (reviewElements.length > 0) {
|
||||
elements = reviewElements;
|
||||
console.log('Found', reviewElements.length, 'reviews using structural matching');
|
||||
}
|
||||
}
|
||||
|
||||
// STRATEGY 3: Try role="article" as last resort
|
||||
if (!elements || elements.length === 0) {
|
||||
const articles = document.querySelectorAll('[role="article"]');
|
||||
const validArticles = [];
|
||||
|
||||
for (let article of articles) {
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles.push(article);
|
||||
}
|
||||
}
|
||||
|
||||
if (validArticles.length > 0) {
|
||||
elements = validArticles;
|
||||
console.log('Found', validArticles.length, 'reviews using role=article');
|
||||
}
|
||||
}
|
||||
|
||||
if (!elements || elements.length === 0) {
|
||||
console.warn('No review elements found with any strategy');
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -496,9 +633,34 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
# Wait for reviews section to load
|
||||
time.sleep(2)
|
||||
|
||||
# EARLY DETECTION: Check if there are no reviews before attempting to scrape
|
||||
no_reviews, reason = check_no_reviews_early(driver)
|
||||
if no_reviews:
|
||||
log.info(f"Early detection: No reviews available. Reason: {reason}")
|
||||
return {
|
||||
"reviews": [],
|
||||
"count": 0,
|
||||
"total_reviews": 0,
|
||||
"time": time.time() - start_time,
|
||||
"success": True,
|
||||
"message": f"No reviews available: {reason}"
|
||||
}
|
||||
|
||||
# Extract total review count from the page
|
||||
total_reviews = extract_total_review_count(driver)
|
||||
|
||||
# Double-check: If extracted count is 0, return early
|
||||
if total_reviews == 0:
|
||||
log.info("Total review count is 0, skipping scraping")
|
||||
return {
|
||||
"reviews": [],
|
||||
"count": 0,
|
||||
"total_reviews": 0,
|
||||
"time": time.time() - start_time,
|
||||
"success": True,
|
||||
"message": "Business has 0 reviews"
|
||||
}
|
||||
|
||||
# Report initial progress with total count
|
||||
if progress_callback and total_reviews:
|
||||
try:
|
||||
@@ -551,22 +713,72 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
driver.execute_script("window.scrollBy(0, 500);")
|
||||
time.sleep(0.5)
|
||||
|
||||
# JavaScript function to count reviews using fallback selectors
|
||||
# JavaScript function to count reviews using ROBUST structural patterns
|
||||
# Instead of relying on CSS classes, we look for containers with review-like structure
|
||||
count_reviews_script = """
|
||||
const selectors = [
|
||||
// STRATEGY 1: Try known selectors first (fast path)
|
||||
const knownSelectors = [
|
||||
'div.jftiEf.fontBodyMedium',
|
||||
'div.jftiEf',
|
||||
'div[data-review-id]',
|
||||
'div[jsaction*="review"]',
|
||||
'[role="article"] div.fontBodyMedium'
|
||||
'div[jsaction*="review"]'
|
||||
];
|
||||
for (let selector of selectors) {
|
||||
|
||||
for (let selector of knownSelectors) {
|
||||
const found = document.querySelectorAll(selector);
|
||||
if (found.length > 0) {
|
||||
return found.length;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
// STRATEGY 2: Structural pattern matching (robust, class-agnostic)
|
||||
// Find containers that LOOK like reviews (have author + rating + text structure)
|
||||
const findReviewsByStructure = () => {
|
||||
const allDivs = document.querySelectorAll('div');
|
||||
let reviewCount = 0;
|
||||
|
||||
for (let div of allDivs) {
|
||||
// Skip if too small (reviews have substantial content)
|
||||
if (div.children.length < 2) continue;
|
||||
|
||||
// Look for review indicators:
|
||||
// - Has an author name (usually in a span/div with small text)
|
||||
// - Has a rating (span with aria-label containing "star" or "rating")
|
||||
// - Has review text (span/div with longer text content)
|
||||
|
||||
const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]');
|
||||
const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]');
|
||||
const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20);
|
||||
const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i);
|
||||
|
||||
// If it has at least 3 of these indicators, it's likely a review
|
||||
const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length;
|
||||
if (indicators >= 3) {
|
||||
reviewCount++;
|
||||
}
|
||||
}
|
||||
|
||||
return reviewCount > 0 ? reviewCount : 0;
|
||||
};
|
||||
|
||||
// STRATEGY 3: Look for role="article" with review-like content
|
||||
const articles = document.querySelectorAll('[role="article"]');
|
||||
if (articles.length > 0) {
|
||||
let validArticles = 0;
|
||||
for (let article of articles) {
|
||||
// Check if article looks like a review (has rating + text)
|
||||
const hasRating = article.querySelector('[aria-label*="star" i]');
|
||||
const hasText = article.textContent.length > 30;
|
||||
if (hasRating && hasText) {
|
||||
validArticles++;
|
||||
}
|
||||
}
|
||||
if (validArticles > 0) return validArticles;
|
||||
}
|
||||
|
||||
// Try structural matching as last resort
|
||||
const structuralCount = findReviewsByStructure();
|
||||
return structuralCount;
|
||||
"""
|
||||
|
||||
# Check if reviews are actually loading
|
||||
|
||||
Reference in New Issue
Block a user