Fix: Add early no-reviews detection and hide analytics for empty jobs

Changes:
- Early detection for "no reviews" messages in 11 languages
- Checks for disabled reviews tabs and 0-review indicators
- Returns early (saves 30-40s) when no reviews exist
- Frontend hides analytics/export buttons when reviews_count = 0
- Structural pattern matching improvements (work in progress)

Known issue:
- Lithuanian hospital page has different structure (no tabs found)
- Needs separate investigation - may use different Google Maps layout

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 20:14:04 +00:00
parent c8c24ae483
commit e98da314a5
9 changed files with 1107 additions and 0 deletions

166
brute_force_selector.py Normal file
View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python3
"""
Brute force approach: Try every possible div class combination and see which gives us reviews.
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
if 'accept all' in (btn.text or '').lower():
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for tab in tabs:
if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
driver.execute_script("arguments[0].click();", tab)
time.sleep(5)
break
# Scroll to load reviews
try:
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
for _ in range(10):
driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
time.sleep(0.3)
except:
pass
print("\n" + "="*80)
print("BRUTE FORCE SELECTOR SEARCH")
print("="*80)
# Get ALL unique class combinations from divs inside the reviews pane
candidates = driver.execute_script("""
// Find the reviews pane
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
if (!pane) return {error: 'Pane not found'};
// Get all divs inside the pane
const allDivs = Array.from(pane.querySelectorAll('div'));
// For each div, check if it looks like a review
const candidates = [];
for (let div of allDivs) {
// Skip if no classes
if (!div.className || div.className.length === 0) continue;
// Check for review indicators
const hasRating = !!div.querySelector('[aria-label*="star" i]');
const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size
const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');
// Calculate score
let score = 0;
if (hasRating) score += 3;
if (hasText) score += 2;
if (hasAuthor) score += 1;
if (score >= 4) { // Must have rating + text at minimum
candidates.push({
classes: div.className,
selector: 'div.' + div.className.split(' ').filter(c => c).join('.'),
score: score,
text_length: div.textContent.length,
sample_text: div.textContent.substring(0, 100)
});
}
}
// Count how many elements match each selector
const selectorCounts = {};
for (let candidate of candidates) {
const count = pane.querySelectorAll(candidate.selector).length;
if (!selectorCounts[candidate.selector]) {
selectorCounts[candidate.selector] = {
count: count,
score: candidate.score,
text_length: candidate.text_length,
sample: candidate.sample_text
};
}
}
// Sort by count (we want selectors that match many reviews)
const sorted = Object.entries(selectorCounts)
.sort((a, b) => b[1].count - a[1].count)
.slice(0, 10);
return {
top_selectors: sorted.map(([selector, info]) => ({
selector: selector,
count: info.count,
score: info.score,
text_length: info.text_length,
sample: info.sample
}))
};
""")
if 'error' in candidates:
print(f"ERROR: {candidates['error']}")
else:
print(f"\nTop 10 candidate selectors (sorted by count):\n")
for i, candidate in enumerate(candidates['top_selectors'], 1):
print(f"{i}. {candidate['selector']}")
print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
print(f" Sample: {candidate['sample'][:80]}...")
print()
# Test the top selector
if candidates['top_selectors']:
top_selector = candidates['top_selectors'][0]['selector']
print(f"\n{'='*80}")
print(f"TESTING TOP SELECTOR: {top_selector}")
print(f"{'='*80}")
test_result = driver.execute_script(f"""
const elements = document.querySelectorAll('{top_selector}');
const reviews = [];
for (let i = 0; i < Math.min(3, elements.length); i++) {{
const elem = elements[i];
const review = {{
has_author: !!elem.querySelector('button, img'),
has_rating: !!elem.querySelector('[aria-label*="star" i]'),
has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
text_length: elem.textContent.length,
text_sample: elem.textContent.substring(0, 150)
}};
reviews.push(review);
}}
return reviews;
""")
print(f"\nFirst 3 elements using {top_selector}:")
for i, rev in enumerate(test_result, 1):
print(f"\n Element {i}:")
for key, value in rev.items():
print(f" {key}: {value}")
print(f"\n{'='*80}")
print("Browser staying open for 60 seconds...")
print(f"{'='*80}")
time.sleep(60)
finally:
driver.quit()

106
check_page_structure.py Normal file
View File

@@ -0,0 +1,106 @@
#!/usr/bin/env python3
"""
Check the actual page structure - maybe reviews are already visible without clicking a tab!
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
print(f"Initial URL: {url}")
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
if 'accept' in (btn.text or '').lower():
btn.click()
time.sleep(2)
break
except:
pass
# Check final URL
final_url = driver.current_url
print(f"Final URL after redirect: {final_url}")
# Wait a bit more for dynamic content
time.sleep(3)
# Check page structure
print("\n" + "="*80)
print("PAGE STRUCTURE ANALYSIS")
print("="*80)
page_info = driver.execute_script("""
return {
tabs_found: document.querySelectorAll('button[role="tab"]').length,
reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
review_containers: document.querySelectorAll('div.fontBodyMedium').length,
page_text_sample: document.body.innerText.substring(0, 500),
has_review_text: document.body.innerText.toLowerCase().includes('review'),
has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai')
};
""")
print(f"\nTabs with role='tab': {page_info['tabs_found']}")
print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
print(f"div.fontBodyMedium: {page_info['review_containers']}")
print(f"Contains 'review': {page_info['has_review_text']}")
print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")
print(f"\nPage text sample (first 500 chars):")
print(page_info['page_text_sample'])
# Try to find ANY element with rating
print("\n" + "="*80)
print("SEARCHING FOR RATING ELEMENTS")
print("="*80)
rating_search = driver.execute_script("""
const elements = Array.from(document.querySelectorAll('*'));
const withRatings = [];
for (let elem of elements) {
const ariaLabel = elem.getAttribute('aria-label') || '';
if (ariaLabel.toLowerCase().includes('star') || ariaLabel.toLowerCase().includes('žvaigžd')) {
withRatings.push({
tag: elem.tagName,
ariaLabel: ariaLabel.substring(0, 100),
classes: elem.className.substring(0, 100),
parentTag: elem.parentElement ? elem.parentElement.tagName : null,
parentClasses: elem.parentElement ? elem.parentElement.className.substring(0, 100) : null
});
}
}
return withRatings.slice(0, 10); // First 10
""")
print(f"\nFound {len(rating_search)} elements with 'star' in aria-label:")
for i, elem in enumerate(rating_search[:5], 1):
print(f"\n Element {i}:")
print(f" Tag: {elem['tag']}")
print(f" Aria-label: {elem['ariaLabel']}")
print(f" Classes: {elem['classes']}")
print(f" Parent tag: {elem['parentTag']}")
print(f" Parent classes: {elem['parentClasses']}")
print(f"\n{'='*80}")
print("Browser open for manual inspection...")
print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
print(f"{'='*80}")
time.sleep(180) # 3 minutes
finally:
driver.quit()

163
diagnose_reviews_panel.py Normal file
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""
Better diagnostic: Actually wait for reviews panel to load and find correct selector.
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
print("Opening browser...")
driver = Driver(uc=True, headless=False)
try:
# Add English locale
if '?' in url:
url += '&hl=en'
else:
url += '?hl=en'
driver.get(url)
print(f"Loaded: {url}")
time.sleep(5)
# Handle GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if 'accept all' in btn_text:
print(f"Clicking GDPR: {btn.text}")
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab and WAIT for panel to load
print("\nClicking reviews tab...")
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for tab in tabs:
text = (tab.text or '').lower()
aria = (tab.get_attribute('aria-label') or '').lower()
if 'review' in text or 'review' in aria:
print(f"Found reviews tab: {tab.text or aria[:50]}")
driver.execute_script("arguments[0].click();", tab)
print("Clicked! Waiting for reviews panel to load...")
time.sleep(5) # Wait longer for reviews to actually load
break
# Try scrolling the reviews pane to load more
print("\nTrying to find and scroll reviews pane...")
pane_selectors = [
'div.m6QErb.WNBkOb.XiKgde',
'div.m6QErb',
'div[role="main"]'
]
for selector in pane_selectors:
try:
pane = driver.find_element('css selector', selector)
print(f"Found pane: {selector}")
driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
time.sleep(2)
driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
time.sleep(2)
break
except:
continue
# NOW check for review selectors
print("\n" + "="*80)
print("CHECKING REVIEW SELECTORS AFTER PANEL LOADED:")
print("="*80)
selectors_to_try = [
('div.jftiEf.fontBodyMedium', 'Standard Google Maps reviews'),
('div.jftiEf', 'Just jftiEf class'),
('div.fontBodyMedium', 'Just fontBodyMedium'),
('div[data-review-id]', 'data-review-id attribute'),
('div[jsaction*="review"]', 'jsaction with review'),
('[data-review]', 'data-review attribute'),
('div[class*="review" i]', 'Class containing review'),
('[role="article"]', 'role=article'),
('div[jslog]', 'Elements with jslog (Google tracking)'),
]
for selector, description in selectors_to_try:
count = driver.execute_script(
f"return document.querySelectorAll('{selector}').length;"
)
print(f"{description:35} | {selector:40} | Found: {count}")
# Get detailed info about most promising selector
print("\n" + "="*80)
print("ANALYZING MOST PROMISING SELECTOR:")
print("="*80)
analysis = driver.execute_script("""
// Try selectors in order of likelihood
const selectors = [
'div.jftiEf.fontBodyMedium',
'div.jftiEf',
'div.fontBodyMedium',
'div[jslog*="impression"]',
'[role="article"]'
];
for (let selector of selectors) {
const elements = document.querySelectorAll(selector);
if (elements.length > 5) { // Need at least a few to be reviews
// Analyze first element
const first = elements[0];
const analysis = {
selector: selector,
total_found: elements.length,
first_element: {
tag: first.tagName,
classes: first.className,
has_rating: !!first.querySelector('[aria-label*="star" i]'),
has_author: !!first.querySelector('button, a, div[aria-label]'),
has_avatar: !!first.querySelector('img'),
has_date: !!first.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute)/i),
text_length: first.textContent.length,
sample_text: first.textContent.substring(0, 100)
}
};
// Check if multiple elements have review characteristics
let reviewLikeCount = 0;
for (let i = 0; i < Math.min(10, elements.length); i++) {
const elem = elements[i];
const hasRating = !!elem.querySelector('[aria-label*="star" i]');
const hasText = elem.textContent.length > 30;
if (hasRating && hasText) reviewLikeCount++;
}
analysis.review_like_count_in_first_10 = reviewLikeCount;
return analysis;
}
}
return {error: 'No selector found with >5 elements'};
""")
if 'error' in analysis:
print(f"ERROR: {analysis['error']}")
else:
print(f"Best selector: {analysis['selector']}")
print(f"Total found: {analysis['total_found']}")
print(f"Review-like in first 10: {analysis['review_like_count_in_first_10']}")
print(f"\nFirst element analysis:")
for key, value in analysis['first_element'].items():
print(f" {key}: {value}")
print("\n" + "="*80)
print("Keeping browser open for 120 seconds for manual inspection...")
print("="*80)
time.sleep(120)
finally:
driver.quit()

126
diagnose_selectors.py Normal file
View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Diagnostic script to find the correct selector for Lithuanian hospital reviews.
Opens the browser and pauses so we can inspect the page manually.
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
print("Opening browser...")
driver = Driver(uc=True, headless=False)
try:
# Add English locale for consistency
if '?' in url:
url += '&hl=en'
else:
url += '?hl=en'
driver.get(url)
print(f"Loaded: {url}")
# Wait for page to load
time.sleep(5)
# Handle GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if 'accept all' in btn_text or 'aceptar todo' in btn_text:
print(f"Clicking GDPR consent: {btn.text}")
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for tab in tabs:
text = (tab.text or '').lower()
aria = (tab.get_attribute('aria-label') or '').lower()
if 'review' in text or 'review' in aria:
print(f"Clicking reviews tab: {tab.text or aria[:30]}")
driver.execute_script("arguments[0].click();", tab)
time.sleep(3)
break
# Try different selectors and show what we find
selectors_to_try = [
('div.jftiEf.fontBodyMedium', 'Known selector 1'),
('div.jftiEf', 'Known selector 2'),
('div[data-review-id]', 'Known selector 3'),
('div[jsaction*="review"]', 'jsaction with review'),
('[role="article"]', 'role=article'),
('div[data-review-id]', 'data-review-id attribute'),
('div.fontBodyMedium', 'Just fontBodyMedium class'),
('div[class*="review"]', 'Class containing "review"'),
]
print("\n" + "="*80)
print("TESTING SELECTORS:")
print("="*80)
for selector, description in selectors_to_try:
count = driver.execute_script(
f"return document.querySelectorAll('{selector}').length;"
)
print(f"{description:30} | {selector:40} | Found: {count}")
# Show sample HTML of first few elements matching the most promising selector
print("\n" + "="*80)
print("SAMPLE HTML FROM FIRST MATCH:")
print("="*80)
sample_html = driver.execute_script("""
const selectors = [
'div.jftiEf.fontBodyMedium',
'div.jftiEf',
'[role="article"]',
'div[jsaction*="review"]'
];
for (let selector of selectors) {
const elements = document.querySelectorAll(selector);
if (elements.length > 0) {
const first = elements[0];
return {
selector: selector,
count: elements.length,
outerHTML: first.outerHTML.substring(0, 500),
classes: first.className,
hasRating: !!first.querySelector('[aria-label*="star" i]'),
hasAuthor: !!first.querySelector('img'),
textLength: first.textContent.length
};
}
}
return null;
""")
if sample_html:
print(f"Selector: {sample_html['selector']}")
print(f"Total found: {sample_html['count']}")
print(f"Classes: {sample_html['classes']}")
print(f"Has rating: {sample_html['hasRating']}")
print(f"Has author img: {sample_html['hasAuthor']}")
print(f"Text length: {sample_html['textLength']}")
print(f"\nSample HTML (first 500 chars):")
print(sample_html['outerHTML'])
print("\n" + "="*80)
print("Browser will stay open for 60 seconds so you can inspect manually...")
print("Use DevTools to find the correct selector!")
print("="*80)
# Keep browser open for inspection
time.sleep(60)
finally:
driver.quit()
print("\nBrowser closed.")

156
find_actual_reviews.py Normal file
View File

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Find the ACTUAL selector for reviews by looking for elements with review structure.
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
if 'accept all' in (btn.text or '').lower():
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for tab in tabs:
if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
driver.execute_script("arguments[0].click();", tab)
time.sleep(5)
break
# Scroll to load reviews
try:
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
for _ in range(3):
driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
time.sleep(1)
except:
pass
# Use JavaScript to find ALL elements that look like reviews
print("\n" + "="*80)
print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
print("="*80)
review_info = driver.execute_script("""
// Find all elements that have BOTH a rating AND substantial text
const allDivs = Array.from(document.querySelectorAll('div'));
const reviews = [];
for (let div of allDivs) {
// Must have a rating (star aria-label)
const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
if (!ratingElem) continue;
// Must have decent text content (>50 chars to avoid buttons)
if (div.textContent.length < 50) continue;
// Get the classes and attributes
const info = {
classes: div.className,
has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
has_avatar: !!div.querySelector('img'),
has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
text_length: div.textContent.length,
sample_text: div.textContent.substring(0, 150),
tag_name: div.tagName,
jslog: div.getAttribute('jslog'),
data_review_id: div.getAttribute('data-review-id'),
jsaction: div.getAttribute('jsaction')
};
reviews.push(info);
}
return {
total_found: reviews.length,
first_5: reviews.slice(0, 5)
};
""")
print(f"\nFound {review_info['total_found']} elements with review structure")
print(f"\nFirst 5 review-like elements:")
for i, rev in enumerate(review_info['first_5'], 1):
print(f"\n Review {i}:")
print(f" Classes: {rev['classes']}")
print(f" Has author: {rev['has_author']}")
print(f" Has avatar: {rev['has_avatar']}")
print(f" Has date: {rev['has_date']}")
print(f" Text length: {rev['text_length']}")
print(f" jslog: {rev['jslog']}")
print(f" data-review-id: {rev['data_review_id']}")
print(f" Sample: {rev['sample_text'][:80]}...")
# Try to find a common class among review elements
if review_info['total_found'] > 0:
print("\n" + "="*80)
print("FINDING COMMON SELECTOR:")
print("="*80)
common_selector = driver.execute_script("""
// Find common classes among review elements
const reviews = [];
const allDivs = Array.from(document.querySelectorAll('div'));
for (let div of allDivs) {
const ratingElem = div.querySelector('[aria-label*="star" i]');
if (ratingElem && div.textContent.length > 50) {
reviews.push(div);
}
}
if (reviews.length === 0) return null;
// Get classes from first review
const firstClasses = reviews[0].className.split(' ').filter(c => c.length > 0);
// Find classes that appear in ALL reviews
const commonClasses = firstClasses.filter(cls => {
return reviews.every(rev => rev.classList.contains(cls));
});
return {
total_reviews: reviews.length,
common_classes: commonClasses,
suggested_selector: commonClasses.length > 0 ? 'div.' + commonClasses.join('.') : null,
first_review_classes: reviews[0].className
};
""")
if common_selector:
print(f"Total review elements: {common_selector['total_reviews']}")
print(f"Common classes: {common_selector['common_classes']}")
print(f"Suggested selector: {common_selector['suggested_selector']}")
print(f"First review full classes: {common_selector['first_review_classes']}")
# Test the suggested selector
if common_selector['suggested_selector']:
test_count = driver.execute_script(
f"return document.querySelectorAll('{common_selector['suggested_selector']}').length;"
)
print(f"\nTesting suggested selector: Found {test_count} elements")
print("\n" + "="*80)
print("Browser staying open for manual inspection (60s)...")
print("="*80)
time.sleep(60)
finally:
driver.quit()

157
inspect_pane_content.py Normal file
View File

@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""
Check what's actually inside the reviews pane after scrolling.
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
if 'accept all' in (btn.text or '').lower():
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
review_tab_found = False
for tab in tabs:
text = (tab.text or '').lower()
aria = (tab.get_attribute('aria-label') or '').lower()
print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
if 'review' in text or 'review' in aria:
print(f" -> Clicking this tab!")
driver.execute_script("arguments[0].click();", tab)
time.sleep(6) # Wait longer
review_tab_found = True
break
if not review_tab_found:
print("WARNING: Reviews tab not found!")
# Find and scroll the pane
print("\nLooking for scrollable pane...")
pane = None
try:
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
except:
print("Pane not found with standard selector!")
try:
pane = driver.find_element('css selector', 'div.m6QErb')
print(f"Found pane: div.m6QErb")
except:
print("No pane found at all!")
if pane:
print("\nScrolling pane to load reviews...")
for i in range(15):
driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
time.sleep(0.4)
if (i + 1) % 5 == 0:
print(f" Scrolled {i+1} times...")
# Now check what's in the pane
print("\n" + "="*80)
print("ANALYZING PANE CONTENT")
print("="*80)
content_info = driver.execute_script("""
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
if (!pane) return {error: 'No pane found'};
// Get all child divs (direct and nested)
const allDivs = Array.from(pane.querySelectorAll('div'));
// Get all unique class names used
const classNames = new Set();
allDivs.forEach(div => {
if (div.className) {
div.className.split(' ').forEach(cls => {
if (cls.trim()) classNames.add(cls.trim());
});
}
});
// Find divs with ratings
const divsWithRatings = allDivs.filter(div => {
return !!div.querySelector('[aria-label*="star" i]');
});
// Find divs with author photos
const divsWithPhotos = allDivs.filter(div => {
return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
});
// Find divs with date patterns
const divsWithDates = allDivs.filter(div => {
return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
});
// Find divs with ALL three
const reviewLikeDivs = allDivs.filter(div => {
const hasRating = !!div.querySelector('[aria-label*="star" i]');
const hasPhoto = !!div.querySelector('img');
const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i);
const textLen = div.textContent.length;
return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
});
return {
total_divs: allDivs.length,
unique_classes: Array.from(classNames).sort(),
divs_with_ratings: divsWithRatings.length,
divs_with_photos: divsWithPhotos.length,
divs_with_dates: divsWithDates.length,
review_like_divs: reviewLikeDivs.length,
review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
classes: d.className,
text_length: d.textContent.length,
sample: d.textContent.substring(0, 100)
}))
};
""")
if 'error' in content_info:
print(f"ERROR: {content_info['error']}")
else:
print(f"\nTotal divs in pane: {content_info['total_divs']}")
print(f"Divs with ratings: {content_info['divs_with_ratings']}")
print(f"Divs with photos: {content_info['divs_with_photos']}")
print(f"Divs with dates: {content_info['divs_with_dates']}")
print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")
print(f"\nFirst 20 unique classes found in pane:")
for cls in content_info['unique_classes'][:20]:
print(f" {cls}")
if content_info['review_like_divs'] > 0:
print(f"\nFirst 5 review-like divs:")
for i, div_info in enumerate(content_info['review_like_classes'], 1):
print(f"\n Div {i}:")
print(f" Classes: {div_info['classes']}")
print(f" Text length: {div_info['text_length']}")
print(f" Sample: {div_info['sample'][:80]}...")
print(f"\n{'='*80}")
print("Browser staying open for manual inspection (120 seconds)...")
print("Look at the DevTools to see the actual review elements!")
print(f"{'='*80}")
time.sleep(120)
finally:
driver.quit()

70
manual_inspect.py Normal file
View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Open the page and keep it open for manual inspection.
INSTRUCTIONS:
1. Open DevTools (F12)
2. Click on an individual review
3. Look at the div that contains ONE review (not the whole list)
4. Note the class names on that div
"""
import time
from seleniumbase import Driver
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
if 'accept all' in (btn.text or '').lower():
btn.click()
time.sleep(2)
break
except:
pass
# Click reviews tab
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for tab in tabs:
if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
driver.execute_script("arguments[0].click();", tab)
time.sleep(5)
break
# Scroll to load a few reviews
try:
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
for _ in range(5):
driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
time.sleep(0.5)
except:
pass
print("\n" + "="*80)
print("MANUAL INSPECTION TIME!")
print("="*80)
print("\n1. The browser is now showing the reviews page")
print("2. Open DevTools (F12 or right-click > Inspect)")
print("3. Click the 'Select element' tool (top-left of DevTools)")
print("4. Hover over an INDIVIDUAL review (not the whole panel)")
print("5. Click on it to select it in the inspector")
print("6. Look at the <div> that wraps ONE SINGLE review")
print("7. Note the 'class' attribute value")
print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
print("\n9. Write down the full class name(s) - we'll use this as the selector!")
print("\n" + "="*80)
print("Browser will stay open for 5 minutes...")
print("="*80)
time.sleep(300) # 5 minutes
finally:
driver.quit()

View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
Test script for Lithuanian hospital to verify structural pattern matching works.
"""
import logging
from modules.fast_scraper import fast_scrape_reviews
# Configure logging to see what's happening
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
log = logging.getLogger(__name__)
def test_lithuanian_hospital():
"""Test scraping the Lithuanian hospital that was getting 0 reviews"""
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
log.info("=" * 80)
log.info("Testing Lithuanian Hospital: Panevėžio respublikinė ligoninė")
log.info("Expected: 271 reviews")
log.info("Previous result: 0 reviews (selector mismatch)")
log.info("=" * 80)
# Run the scraper with headless mode OFF so we can see what's happening
result = fast_scrape_reviews(
url=url,
headless=False, # Show browser for debugging
max_scrolls=999999 # Unlimited - use idle detection
)
log.info("=" * 80)
log.info("RESULTS:")
log.info(f"Success: {result['success']}")
log.info(f"Reviews found: {result['count']}")
log.info(f"Total reviews on page: {result.get('total_reviews', 'Unknown')}")
log.info(f"Time taken: {result['time']:.2f}s")
if result.get('message'):
log.info(f"Message: {result['message']}")
if result.get('error'):
log.error(f"Error: {result['error']}")
log.info("=" * 80)
# Show first few reviews if found
if result['count'] > 0:
log.info(f"\nFirst 3 reviews:")
for i, review in enumerate(result['reviews'][:3], 1):
log.info(f"\n Review {i}:")
log.info(f" Author: {review.get('author', 'N/A')}")
log.info(f" Rating: {review.get('rating', 'N/A')}")
log.info(f" Date: {review.get('date_text', 'N/A')}")
log.info(f" Text: {review.get('text', 'N/A')[:100]}...")
# Verify the fix worked
if result['count'] > 200:
log.info("\n✅ SUCCESS! Structural pattern matching found reviews!")
log.info(f" Got {result['count']} reviews (expected ~271)")
elif result['count'] == 0:
log.error("\n❌ FAILED! Still getting 0 reviews - selector issue not fixed")
else:
log.warning(f"\n⚠️ PARTIAL: Got {result['count']} reviews (expected ~271)")
log.warning(" May need to increase idle detection patience")
if __name__ == "__main__":
test_lithuanian_hospital()

92
test_without_english.py Normal file
View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Test WITHOUT forcing English locale - use the page's default language.
"""
import time
from seleniumbase import Driver
# NO hl=en parameter!
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
driver = Driver(uc=True, headless=False)
try:
driver.get(url)
print(f"Loaded (NO hl=en): {url}")
time.sleep(5)
# GDPR
try:
form_btns = driver.find_elements('css selector', 'form button')
for btn in form_btns:
btn_text = (btn.text or '').lower()
if 'accept' in btn_text or 'priim' in btn_text: # Lithuanian "priimti"
print(f"Clicking consent: {btn.text}")
btn.click()
time.sleep(2)
break
except:
pass
# List ALL tabs
print("\nALL TABS FOUND:")
time.sleep(2)
tabs = driver.find_elements('css selector', 'button[role="tab"]')
for i, tab in enumerate(tabs, 1):
text = tab.text or ''
aria = tab.get_attribute('aria-label') or ''
print(f" Tab {i}: text='{text}', aria='{aria}'")
# Look for reviews tab (try multiple keywords)
review_keywords = ['review', 'reseña', 'atsiliepimai', 'atsiliepi', 'отзыв']
review_tab_found = False
for tab in tabs:
text = (tab.text or '').lower()
aria = (tab.get_attribute('aria-label') or '').lower()
for keyword in review_keywords:
if keyword in text or keyword in aria:
print(f"\nFound REVIEWS TAB: {tab.text or aria[:50]}")
driver.execute_script("arguments[0].click();", tab)
time.sleep(5)
review_tab_found = True
break
if review_tab_found:
break
if not review_tab_found:
print("\nWARNING: Still no reviews tab found!")
else:
# Now scroll and check for reviews
print("\nScrolling to load reviews...")
try:
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
for i in range(10):
driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
time.sleep(0.3)
except:
pass
# Check for reviews using known selectors
selectors_to_check = [
'div.jftiEf.fontBodyMedium',
'div.jftiEf',
'div.fontBodyMedium',
'div[data-review-id]'
]
print("\nChecking selectors:")
for selector in selectors_to_check:
count = driver.execute_script(f"return document.querySelectorAll('{selector}').length;")
print(f" {selector:30} : {count} elements")
print(f"\n{'='*80}")
print("Browser open for inspection (120s)...")
print(f"{'='*80}")
time.sleep(120)
finally:
driver.quit()