Fix: Add early no-reviews detection and hide analytics for empty jobs
Changes:
- Early detection of "no reviews" messages in 11 languages
- Checks for disabled reviews tabs and 0-review indicators
- Returns early (saves 30-40s) when no reviews exist
- Frontend hides analytics/export buttons when reviews_count = 0
- Structural pattern matching improvements (work in progress)

Known issue:
- Lithuanian hospital page has a different structure (no tabs found)
- Needs separate investigation; it may use a different Google Maps layout

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
166
brute_force_selector.py
Normal file
166
brute_force_selector.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
"""
Brute force approach: score every classed div inside the reviews pane and
report which class-based selectors most plausibly match individual reviews.

Diagnostic script only: it opens a visible browser, prints its findings and
keeps the window open for 60 seconds before quitting.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab (matched on visible text or aria-label)
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            # JS click avoids "element not interactable" on overlapped tabs.
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews (best effort: the pane may not exist)
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(10):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.3)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    print("\n" + "="*80)
    print("BRUTE FORCE SELECTOR SEARCH")
    print("="*80)

    # Get ALL unique class combinations from divs inside the reviews pane
    candidates = driver.execute_script("""
        // Find the reviews pane
        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
        if (!pane) return {error: 'Pane not found'};

        // Get all divs inside the pane
        const allDivs = Array.from(pane.querySelectorAll('div'));

        // For each div, check if it looks like a review
        const candidates = [];

        for (const div of allDivs) {
            // Skip divs without a real class token. classList also ignores
            // whitespace-only class attributes, which would otherwise
            // produce the invalid selector 'div.'.
            if (div.classList.length === 0) continue;

            // Check for review indicators
            const hasRating = !!div.querySelector('[aria-label*="star" i]');
            const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size
            const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');

            // Calculate score (reported for information only)
            let score = 0;
            if (hasRating) score += 3;
            if (hasText) score += 2;
            if (hasAuthor) score += 1;

            // Must have rating + text at minimum. Testing the flags directly
            // (instead of score >= 4) rejects rating + author without text.
            if (hasRating && hasText) {
                candidates.push({
                    classes: div.className,
                    // Escape each token so unusual class names stay valid CSS.
                    selector: 'div.' + Array.from(div.classList, c => CSS.escape(c)).join('.'),
                    score: score,
                    text_length: div.textContent.length,
                    sample_text: div.textContent.substring(0, 100)
                });
            }
        }

        // Count how many elements match each selector (first occurrence wins)
        const selectorCounts = {};
        for (const candidate of candidates) {
            if (selectorCounts[candidate.selector]) continue;
            selectorCounts[candidate.selector] = {
                count: pane.querySelectorAll(candidate.selector).length,
                score: candidate.score,
                text_length: candidate.text_length,
                sample: candidate.sample_text
            };
        }

        // Sort by count (we want selectors that match many reviews)
        const sorted = Object.entries(selectorCounts)
            .sort((a, b) => b[1].count - a[1].count)
            .slice(0, 10);

        return {
            top_selectors: sorted.map(([selector, info]) => ({
                selector: selector,
                count: info.count,
                score: info.score,
                text_length: info.text_length,
                sample: info.sample
            }))
        };
    """)

    if 'error' in candidates:
        print(f"ERROR: {candidates['error']}")
    else:
        print(f"\nTop 10 candidate selectors (sorted by count):\n")
        for i, candidate in enumerate(candidates['top_selectors'], 1):
            print(f"{i}. {candidate['selector']}")
            print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
            print(f" Sample: {candidate['sample'][:80]}...")
            print()

        # Test the top selector
        if candidates['top_selectors']:
            top_selector = candidates['top_selectors'][0]['selector']
            print(f"\n{'='*80}")
            print(f"TESTING TOP SELECTOR: {top_selector}")
            print(f"{'='*80}")

            # The selector is passed as a script argument rather than spliced
            # into the JS source, so quotes/backslashes in it cannot break
            # the script.
            test_result = driver.execute_script("""
                const elements = document.querySelectorAll(arguments[0]);
                const reviews = [];

                for (let i = 0; i < Math.min(3, elements.length); i++) {
                    const elem = elements[i];
                    const review = {
                        has_author: !!elem.querySelector('button, img'),
                        has_rating: !!elem.querySelector('[aria-label*="star" i]'),
                        has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
                        text_length: elem.textContent.length,
                        text_sample: elem.textContent.substring(0, 150)
                    };
                    reviews.push(review);
                }

                return reviews;
            """, top_selector)

            print(f"\nFirst 3 elements using {top_selector}:")
            for i, rev in enumerate(test_result, 1):
                print(f"\n Element {i}:")
                for key, value in rev.items():
                    print(f" {key}: {value}")

    print(f"\n{'='*80}")
    print("Browser staying open for 60 seconds...")
    print(f"{'='*80}")
    time.sleep(60)

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
|
||||
106
check_page_structure.py
Normal file
106
check_page_structure.py
Normal file
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
"""
Check the actual page structure - maybe reviews are already visible without clicking a tab!

Diagnostic script only: it loads the search URL (no forced locale), prints
counts for a handful of selectors, lists elements whose aria-label mentions
stars, and keeps the browser open for 3 minutes for manual inspection.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Initial URL: {url}")
    time.sleep(5)

    # GDPR consent dialog: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Check final URL
    final_url = driver.current_url
    print(f"Final URL after redirect: {final_url}")

    # Wait a bit more for dynamic content
    time.sleep(3)

    # Check page structure
    print("\n" + "="*80)
    print("PAGE STRUCTURE ANALYSIS")
    print("="*80)

    page_info = driver.execute_script("""
        const bodyText = document.body.innerText;
        const lowered = bodyText.toLowerCase();
        return {
            tabs_found: document.querySelectorAll('button[role="tab"]').length,
            reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
            reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
            divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
            review_containers: document.querySelectorAll('div.fontBodyMedium').length,
            page_text_sample: bodyText.substring(0, 500),
            has_review_text: lowered.includes('review'),
            has_atsiliepimai_text: lowered.includes('atsiliepimai')
        };
    """)

    print(f"\nTabs with role='tab': {page_info['tabs_found']}")
    print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
    print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
    print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
    print(f"div.fontBodyMedium: {page_info['review_containers']}")
    print(f"Contains 'review': {page_info['has_review_text']}")
    print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")

    print(f"\nPage text sample (first 500 chars):")
    print(page_info['page_text_sample'])

    # Try to find ANY element with rating
    print("\n" + "="*80)
    print("SEARCHING FOR RATING ELEMENTS")
    print("="*80)

    rating_search = driver.execute_script("""
        // Read classes via the attribute: on SVG elements `className` is an
        // SVGAnimatedString, so className.substring() would throw.
        const classesOf = (el) => (el.getAttribute('class') || '').substring(0, 100);

        const withRatings = [];
        let total = 0;

        for (const elem of document.querySelectorAll('*')) {
            const ariaLabel = elem.getAttribute('aria-label') || '';
            const lowered = ariaLabel.toLowerCase();
            if (!lowered.includes('star') && !lowered.includes('žvaigžd')) continue;

            total += 1;
            if (withRatings.length >= 10) continue; // keep only the first 10 details

            const parent = elem.parentElement;
            withRatings.push({
                tag: elem.tagName,
                ariaLabel: ariaLabel.substring(0, 100),
                classes: classesOf(elem),
                parentTag: parent ? parent.tagName : null,
                parentClasses: parent ? classesOf(parent) : null
            });
        }

        // Return the real total separately so the printed count is not
        // capped at the size of the sample.
        return {total: total, first: withRatings};
    """)

    print(f"\nFound {rating_search['total']} elements with 'star' in aria-label:")
    for i, elem in enumerate(rating_search['first'][:5], 1):
        print(f"\n Element {i}:")
        print(f" Tag: {elem['tag']}")
        print(f" Aria-label: {elem['ariaLabel']}")
        print(f" Classes: {elem['classes']}")
        print(f" Parent tag: {elem['parentTag']}")
        print(f" Parent classes: {elem['parentClasses']}")

    print(f"\n{'='*80}")
    print("Browser open for manual inspection...")
    print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
    print(f"{'='*80}")
    time.sleep(180)  # 3 minutes

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
|
||||
163
diagnose_reviews_panel.py
Normal file
163
diagnose_reviews_panel.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
"""
Better diagnostic: Actually wait for reviews panel to load and find correct selector.

Diagnostic script only: it opens a visible browser, clicks the reviews tab,
scrolls the first pane it can find, prints match counts for a list of
candidate selectors, analyses the first selector with more than 5 matches,
then keeps the browser open for 120 seconds.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale
    if '?' in url:
        url += '&hl=en'
    else:
        url += '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")
    time.sleep(5)

    # Handle GDPR: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept all' in btn_text:
                print(f"Clicking GDPR: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab and WAIT for panel to load
    print("\nClicking reviews tab...")
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Found reviews tab: {tab.text or aria[:50]}")
            driver.execute_script("arguments[0].click();", tab)
            print("Clicked! Waiting for reviews panel to load...")
            time.sleep(5)  # Wait longer for reviews to actually load
            break

    # Try scrolling the reviews pane to load more
    print("\nTrying to find and scroll reviews pane...")
    pane_selectors = [
        'div.m6QErb.WNBkOb.XiKgde',
        'div.m6QErb',
        'div[role="main"]'
    ]

    # Selectors are tried from most to least specific; the first one that
    # resolves is scrolled twice and the rest are ignored.
    for selector in pane_selectors:
        try:
            pane = driver.find_element('css selector', selector)
            print(f"Found pane: {selector}")
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(2)
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(2)
            break
        except Exception:
            # Selector missing (or scroll failed): fall through to the next one.
            continue

    # NOW check for review selectors
    print("\n" + "="*80)
    print("CHECKING REVIEW SELECTORS AFTER PANEL LOADED:")
    print("="*80)

    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Standard Google Maps reviews'),
        ('div.jftiEf', 'Just jftiEf class'),
        ('div.fontBodyMedium', 'Just fontBodyMedium'),
        ('div[data-review-id]', 'data-review-id attribute'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[data-review]', 'data-review attribute'),
        ('div[class*="review" i]', 'Class containing review'),
        ('[role="article"]', 'role=article'),
        ('div[jslog]', 'Elements with jslog (Google tracking)'),
    ]

    for selector, description in selectors_to_try:
        # Pass the selector as a script argument instead of interpolating it
        # into the JS source, so its quoting can never break the script.
        count = driver.execute_script(
            "return document.querySelectorAll(arguments[0]).length;", selector
        )
        print(f"{description:35} | {selector:40} | Found: {count}")

    # Get detailed info about most promising selector
    print("\n" + "="*80)
    print("ANALYZING MOST PROMISING SELECTOR:")
    print("="*80)

    analysis = driver.execute_script("""
        // Try selectors in order of likelihood
        const selectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div.fontBodyMedium',
            'div[jslog*="impression"]',
            '[role="article"]'
        ];

        for (const selector of selectors) {
            const elements = document.querySelectorAll(selector);
            if (elements.length <= 5) continue; // Need at least a few to be reviews

            // Analyze first element
            const first = elements[0];
            const analysis = {
                selector: selector,
                total_found: elements.length,
                first_element: {
                    tag: first.tagName,
                    classes: first.className,
                    has_rating: !!first.querySelector('[aria-label*="star" i]'),
                    has_author: !!first.querySelector('button, a, div[aria-label]'),
                    has_avatar: !!first.querySelector('img'),
                    has_date: !!first.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute)/i),
                    text_length: first.textContent.length,
                    sample_text: first.textContent.substring(0, 100)
                }
            };

            // Check if multiple elements have review characteristics
            let reviewLikeCount = 0;
            for (let i = 0; i < Math.min(10, elements.length); i++) {
                const elem = elements[i];
                const hasRating = !!elem.querySelector('[aria-label*="star" i]');
                const hasText = elem.textContent.length > 30;
                if (hasRating && hasText) reviewLikeCount++;
            }
            analysis.review_like_count_in_first_10 = reviewLikeCount;

            return analysis;
        }

        return {error: 'No selector found with >5 elements'};
    """)

    if 'error' in analysis:
        print(f"ERROR: {analysis['error']}")
    else:
        print(f"Best selector: {analysis['selector']}")
        print(f"Total found: {analysis['total_found']}")
        print(f"Review-like in first 10: {analysis['review_like_count_in_first_10']}")
        print(f"\nFirst element analysis:")
        for key, value in analysis['first_element'].items():
            print(f" {key}: {value}")

    print("\n" + "="*80)
    print("Keeping browser open for 120 seconds for manual inspection...")
    print("="*80)
    time.sleep(120)

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
|
||||
126
diagnose_selectors.py
Normal file
126
diagnose_selectors.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
"""
Diagnostic script to find the correct selector for Lithuanian hospital reviews.
Opens the browser and pauses so we can inspect the page manually.

It prints how many elements each candidate selector matches, shows a sample
of the first element matched by the first selector that hits, then keeps the
browser open for 60 seconds.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale for consistency
    if '?' in url:
        url += '&hl=en'
    else:
        url += '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")

    # Wait for page to load
    time.sleep(5)

    # Handle GDPR: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                print(f"Clicking GDPR consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Clicking reviews tab: {tab.text or aria[:30]}")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(3)
            break

    # Try different selectors and show what we find.
    # ('div[data-review-id]' used to be listed twice; it is probed once now.)
    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Known selector 1'),
        ('div.jftiEf', 'Known selector 2'),
        ('div[data-review-id]', 'Known selector 3'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[role="article"]', 'role=article'),
        ('div.fontBodyMedium', 'Just fontBodyMedium class'),
        ('div[class*="review"]', 'Class containing "review"'),
    ]

    print("\n" + "="*80)
    print("TESTING SELECTORS:")
    print("="*80)

    for selector, description in selectors_to_try:
        # Pass the selector as a script argument instead of interpolating it
        # into the JS source, so its quoting can never break the script.
        count = driver.execute_script(
            "return document.querySelectorAll(arguments[0]).length;", selector
        )
        print(f"{description:30} | {selector:40} | Found: {count}")

    # Show sample HTML of first few elements matching the most promising selector
    print("\n" + "="*80)
    print("SAMPLE HTML FROM FIRST MATCH:")
    print("="*80)

    sample_html = driver.execute_script("""
        const selectors = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            '[role="article"]',
            'div[jsaction*="review"]'
        ];

        // Report on the first selector (in priority order) that matches anything.
        for (const selector of selectors) {
            const elements = document.querySelectorAll(selector);
            if (elements.length === 0) continue;

            const first = elements[0];
            return {
                selector: selector,
                count: elements.length,
                outerHTML: first.outerHTML.substring(0, 500),
                classes: first.className,
                hasRating: !!first.querySelector('[aria-label*="star" i]'),
                hasAuthor: !!first.querySelector('img'),
                textLength: first.textContent.length
            };
        }
        return null;
    """)

    if sample_html:
        print(f"Selector: {sample_html['selector']}")
        print(f"Total found: {sample_html['count']}")
        print(f"Classes: {sample_html['classes']}")
        print(f"Has rating: {sample_html['hasRating']}")
        print(f"Has author img: {sample_html['hasAuthor']}")
        print(f"Text length: {sample_html['textLength']}")
        print(f"\nSample HTML (first 500 chars):")
        print(sample_html['outerHTML'])
    else:
        # Make the negative result explicit instead of printing nothing.
        print("No known selector matched any element.")

    print("\n" + "="*80)
    print("Browser will stay open for 60 seconds so you can inspect manually...")
    print("Use DevTools to find the correct selector!")
    print("="*80)

    # Keep browser open for inspection
    time.sleep(60)

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
    print("\nBrowser closed.")
|
||||
156
find_actual_reviews.py
Normal file
156
find_actual_reviews.py
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
"""
Find the ACTUAL selector for reviews by looking for elements with review structure.

Diagnostic script only: it lists divs that contain a star/rating aria-label
plus some text, then tries to derive a class selector shared by the innermost
such divs, and keeps the browser open for 60 seconds.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews (best effort: the pane may not exist)
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(3):
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(1)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    # Use JavaScript to find ALL elements that look like reviews
    print("\n" + "="*80)
    print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
    print("="*80)

    review_info = driver.execute_script("""
        // Find all elements that have BOTH a rating AND substantial text.
        // Note: this also matches every ancestor div of a rating element.
        const allDivs = Array.from(document.querySelectorAll('div'));

        const reviews = [];

        for (const div of allDivs) {
            // Must have a rating (star aria-label)
            const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
            if (!ratingElem) continue;

            // Must have decent text content (>50 chars to avoid buttons)
            if (div.textContent.length < 50) continue;

            // Get the classes and attributes
            reviews.push({
                classes: div.className,
                has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
                has_avatar: !!div.querySelector('img'),
                has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
                text_length: div.textContent.length,
                sample_text: div.textContent.substring(0, 150),
                tag_name: div.tagName,
                jslog: div.getAttribute('jslog'),
                data_review_id: div.getAttribute('data-review-id'),
                jsaction: div.getAttribute('jsaction')
            });
        }

        return {
            total_found: reviews.length,
            first_5: reviews.slice(0, 5)
        };
    """)

    print(f"\nFound {review_info['total_found']} elements with review structure")
    print(f"\nFirst 5 review-like elements:")
    for i, rev in enumerate(review_info['first_5'], 1):
        print(f"\n Review {i}:")
        print(f" Classes: {rev['classes']}")
        print(f" Has author: {rev['has_author']}")
        print(f" Has avatar: {rev['has_avatar']}")
        print(f" Has date: {rev['has_date']}")
        print(f" Text length: {rev['text_length']}")
        print(f" jslog: {rev['jslog']}")
        print(f" data-review-id: {rev['data_review_id']}")
        print(f" Sample: {rev['sample_text'][:80]}...")

    # Try to find a common class among review elements
    if review_info['total_found'] > 0:
        print("\n" + "="*80)
        print("FINDING COMMON SELECTOR:")
        print("="*80)

        common_selector = driver.execute_script("""
            // Find common classes among review elements
            const allDivs = Array.from(document.querySelectorAll('div'));
            const looksLikeReview = (div) =>
                !!div.querySelector('[aria-label*="star" i]') && div.textContent.length > 50;

            const matching = new Set(allDivs.filter(looksLikeReview));

            // Keep only the innermost matches. Every ancestor of a rating
            // element also satisfies the test, and intersecting classes with
            // those page-level wrappers leaves nothing in common.
            // Heuristic: the innermost match may still be a sub-part of a
            // review card - confirm the suggestion in DevTools.
            const reviews = Array.from(matching).filter(div =>
                !Array.from(div.querySelectorAll('div')).some(child => matching.has(child))
            );

            if (reviews.length === 0) return null;

            // Get classes from first review
            const firstClasses = Array.from(reviews[0].classList);

            // Find classes that appear in ALL reviews
            const commonClasses = firstClasses.filter(cls =>
                reviews.every(rev => rev.classList.contains(cls))
            );

            return {
                total_reviews: reviews.length,
                common_classes: commonClasses,
                // Escape tokens so the suggestion is always a valid selector.
                suggested_selector: commonClasses.length > 0
                    ? 'div.' + commonClasses.map(c => CSS.escape(c)).join('.')
                    : null,
                first_review_classes: reviews[0].className
            };
        """)

        if common_selector:
            print(f"Total review elements: {common_selector['total_reviews']}")
            print(f"Common classes: {common_selector['common_classes']}")
            print(f"Suggested selector: {common_selector['suggested_selector']}")
            print(f"First review full classes: {common_selector['first_review_classes']}")

            # Test the suggested selector
            if common_selector['suggested_selector']:
                # Passed as a script argument, not interpolated into JS source.
                test_count = driver.execute_script(
                    "return document.querySelectorAll(arguments[0]).length;",
                    common_selector['suggested_selector'],
                )
                print(f"\nTesting suggested selector: Found {test_count} elements")

    print("\n" + "="*80)
    print("Browser staying open for manual inspection (60s)...")
    print("="*80)
    time.sleep(60)

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
|
||||
157
inspect_pane_content.py
Normal file
157
inspect_pane_content.py
Normal file
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python3
"""
Check what's actually inside the reviews pane after scrolling.

Diagnostic script only: it lists every tab it sees, clicks the reviews tab,
scrolls the pane 15 times, then prints counts of divs carrying ratings,
photos and dates, and keeps the browser open for 120 seconds.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR consent dialog: best effort, the dialog is not always shown.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Not a bare except: Ctrl+C during the sleep must still abort the run.
        print(f"GDPR handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    review_tab_found = False
    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
        if 'review' in text or 'review' in aria:
            print(f" -> Clicking this tab!")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(6)  # Wait longer
            review_tab_found = True
            break

    if not review_tab_found:
        print("WARNING: Reviews tab not found!")

    # Find and scroll the pane: specific selector first, generic fallback second.
    print("\nLooking for scrollable pane...")
    pane = None
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
    except Exception:
        print("Pane not found with standard selector!")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb')
            print(f"Found pane: div.m6QErb")
        except Exception:
            print("No pane found at all!")

    if pane:
        print("\nScrolling pane to load reviews...")
        for i in range(15):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.4)
            if (i + 1) % 5 == 0:
                print(f" Scrolled {i+1} times...")

    # Now check what's in the pane
    print("\n" + "="*80)
    print("ANALYZING PANE CONTENT")
    print("="*80)

    content_info = driver.execute_script("""
        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
        if (!pane) return {error: 'No pane found'};

        // One shared date pattern so the "dates" count and the combined
        // "review-like" filter agree (the latter used to omit "minute").
        // No g/y flag, so .test() carries no state between calls.
        const datePattern = /\\d+\\s*(day|week|month|year|hour|minute|ago)/i;
        const hasRating = (div) => !!div.querySelector('[aria-label*="star" i]');
        const hasDate = (div) => datePattern.test(div.textContent);

        // Get all child divs (direct and nested)
        const allDivs = Array.from(pane.querySelectorAll('div'));

        // Get all unique class names used
        const classNames = new Set();
        for (const div of allDivs) {
            for (const cls of div.classList) classNames.add(cls);
        }

        // Find divs with ratings
        const divsWithRatings = allDivs.filter(hasRating);

        // Find divs with author photos
        const divsWithPhotos = allDivs.filter(div =>
            !!div.querySelector('img[src*="photo"], img[src*="avatar"]')
        );

        // Find divs with date patterns
        const divsWithDates = allDivs.filter(hasDate);

        // Find divs with ALL three (any <img> counts as a photo here),
        // bounded in size so whole-list wrappers are excluded.
        const reviewLikeDivs = allDivs.filter(div => {
            const textLen = div.textContent.length;
            return hasRating(div) && !!div.querySelector('img') && hasDate(div)
                && textLen > 50 && textLen < 2000;
        });

        return {
            total_divs: allDivs.length,
            unique_classes: Array.from(classNames).sort(),
            divs_with_ratings: divsWithRatings.length,
            divs_with_photos: divsWithPhotos.length,
            divs_with_dates: divsWithDates.length,
            review_like_divs: reviewLikeDivs.length,
            review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
                classes: d.className,
                text_length: d.textContent.length,
                sample: d.textContent.substring(0, 100)
            }))
        };
    """)

    if 'error' in content_info:
        print(f"ERROR: {content_info['error']}")
    else:
        print(f"\nTotal divs in pane: {content_info['total_divs']}")
        print(f"Divs with ratings: {content_info['divs_with_ratings']}")
        print(f"Divs with photos: {content_info['divs_with_photos']}")
        print(f"Divs with dates: {content_info['divs_with_dates']}")
        print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")

        print(f"\nFirst 20 unique classes found in pane:")
        for cls in content_info['unique_classes'][:20]:
            print(f" {cls}")

        if content_info['review_like_divs'] > 0:
            print(f"\nFirst 5 review-like divs:")
            for i, div_info in enumerate(content_info['review_like_classes'], 1):
                print(f"\n Div {i}:")
                print(f" Classes: {div_info['classes']}")
                print(f" Text length: {div_info['text_length']}")
                print(f" Sample: {div_info['sample'][:80]}...")

    print(f"\n{'='*80}")
    print("Browser staying open for manual inspection (120 seconds)...")
    print("Look at the DevTools to see the actual review elements!")
    print(f"{'='*80}")
    time.sleep(120)

finally:
    # Always release the browser, even after an error or Ctrl+C.
    driver.quit()
|
||||
70
manual_inspect.py
Normal file
70
manual_inspect.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Open the page and keep it open for manual inspection.
|
||||
INSTRUCTIONS:
|
||||
1. Open DevTools (F12)
|
||||
2. Click on an individual review
|
||||
3. Look at the div that contains ONE review (not the whole list)
|
||||
4. Note the class names on that div
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# Google Maps search URL for the hospital page under investigation (English UI forced via hl=en).
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)  # fixed wait for the initial page render

    # GDPR consent dialog: best-effort, the script continues whether or not a button is found.
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit still stop the script,
        # and the reason for skipping is visible instead of silently dropped.
        print(f"Consent step skipped: {exc}")

    # Click the reviews tab (matched on visible text or aria-label).
    time.sleep(2)
    review_tab_found = False
    for tab in driver.find_elements('css selector', 'button[role="tab"]'):
        label = (tab.text or '').lower()
        aria_label = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in label or 'review' in aria_label:
            # JS click rather than tab.click(), as in the original script.
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            review_tab_found = True
            break

    if not review_tab_found:
        # Without this the instructions below claim the reviews page is showing
        # even when no tab was clicked.
        print("\nWARNING: No reviews tab found - the page may not be showing reviews.")

    # Scroll the pane a little so a few reviews are loaded before inspecting.
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(5):
            driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
            time.sleep(0.5)
    except Exception as exc:
        # Best-effort: the pane selector may not match on this page layout.
        print(f"Scrolling skipped: {exc}")

    print("\n" + "="*80)
    print("MANUAL INSPECTION TIME!")
    print("="*80)
    print("\n1. The browser is now showing the reviews page")
    print("2. Open DevTools (F12 or right-click > Inspect)")
    print("3. Click the 'Select element' tool (top-left of DevTools)")
    print("4. Hover over an INDIVIDUAL review (not the whole panel)")
    print("5. Click on it to select it in the inspector")
    print("6. Look at the <div> that wraps ONE SINGLE review")
    print("7. Note the 'class' attribute value")
    print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
    print("\n9. Write down the full class name(s) - we'll use this as the selector!")
    print("\n" + "="*80)
    print("Browser will stay open for 5 minutes...")
    print("="*80)

    time.sleep(300)  # 5 minutes

finally:
    # Always close the browser, even if navigation or inspection setup failed.
    driver.quit()
|
||||
71
test_lithuanian_hospital.py
Normal file
71
test_lithuanian_hospital.py
Normal file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for Lithuanian hospital to verify structural pattern matching works.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from modules.fast_scraper import fast_scrape_reviews
|
||||
|
||||
# Logging setup: INFO level, each record formatted as
# "<timestamp> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the test function in this file.
log = logging.getLogger(__name__)
|
||||
|
||||
def test_lithuanian_hospital():
    """Scrape the Lithuanian hospital page that previously yielded 0 reviews and log the outcome.

    Calls ``fast_scrape_reviews`` with a visible browser and an effectively
    unlimited scroll budget, then logs the summary fields, up to three sample
    reviews, and a SUCCESS / FAILED / PARTIAL verdict based on the review count.

    NOTE(review): this function only logs; it never asserts, so a test runner
    that collects it by name would report a pass even on the FAILED branch.
    NOTE(review): assumes the result dict always carries 'success', 'count'
    and 'time' - confirm against ``fast_scrape_reviews``.
    """
    url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
    separator = "=" * 80

    log.info(separator)
    log.info("Testing Lithuanian Hospital: Panevėžio respublikinė ligoninė")
    log.info("Expected: 271 reviews")
    log.info("Previous result: 0 reviews (selector mismatch)")
    log.info(separator)

    # Run the scraper with headless mode OFF so we can see what's happening
    result = fast_scrape_reviews(
        url=url,
        headless=False,  # Show browser for debugging
        max_scrolls=999999  # Unlimited - use idle detection
    )

    log.info(separator)
    log.info("RESULTS:")
    log.info(f"Success: {result['success']}")
    log.info(f"Reviews found: {result['count']}")
    log.info(f"Total reviews on page: {result.get('total_reviews', 'Unknown')}")
    log.info(f"Time taken: {result['time']:.2f}s")

    if result.get('message'):
        log.info(f"Message: {result['message']}")

    if result.get('error'):
        log.error(f"Error: {result['error']}")

    log.info(separator)

    # Show first few reviews if found
    if result['count'] > 0:
        log.info("\nFirst 3 reviews:")
        for i, review in enumerate(result['reviews'][:3], 1):
            # ``dict.get(key, default)`` still returns None when the key exists
            # with a None value, which would make the slice below raise
            # TypeError; fall back to 'N/A' for any missing or empty text.
            text = review.get('text') or 'N/A'
            log.info(f"\n Review {i}:")
            log.info(f" Author: {review.get('author', 'N/A')}")
            log.info(f" Rating: {review.get('rating', 'N/A')}")
            log.info(f" Date: {review.get('date_text', 'N/A')}")
            log.info(f" Text: {text[:100]}...")

    # Verify the fix worked
    if result['count'] > 200:
        log.info("\n✅ SUCCESS! Structural pattern matching found reviews!")
        log.info(f" Got {result['count']} reviews (expected ~271)")
    elif result['count'] == 0:
        log.error("\n❌ FAILED! Still getting 0 reviews - selector issue not fixed")
    else:
        log.warning(f"\n⚠️ PARTIAL: Got {result['count']} reviews (expected ~271)")
        log.warning(" May need to increase idle detection patience")


if __name__ == "__main__":
    test_lithuanian_hospital()
|
||||
92
test_without_english.py
Normal file
92
test_without_english.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test WITHOUT forcing English locale - use the page's default language.
|
||||
"""
|
||||
|
||||
import time
|
||||
from seleniumbase import Driver
|
||||
|
||||
# NO hl=en parameter!
|
||||
# Same search URL as the other debug scripts, but with no hl=en so the page
# uses its default language.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Loaded (NO hl=en): {url}")
    time.sleep(5)  # fixed wait for the initial page render

    # GDPR consent dialog: best-effort, the script continues whether or not a button is found.
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            btn_text = (btn.text or '').lower()
            if 'accept' in btn_text or 'priim' in btn_text:  # Lithuanian "priimti"
                print(f"Clicking consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit still stop the script,
        # and the reason for skipping is visible instead of silently dropped.
        print(f"Consent step skipped: {exc}")

    # List ALL tabs
    print("\nALL TABS FOUND:")
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for i, tab in enumerate(tabs, 1):
        text = tab.text or ''
        aria = tab.get_attribute('aria-label') or ''
        print(f" Tab {i}: text='{text}', aria='{aria}'")

    # Look for reviews tab (try multiple keywords)
    review_keywords = ['review', 'reseña', 'atsiliepimai', 'atsiliepi', 'отзыв']
    review_tab_found = False

    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()

        # Click the first tab whose text or aria-label contains any keyword.
        if any(keyword in text or keyword in aria for keyword in review_keywords):
            print(f"\nFound REVIEWS TAB: {tab.text or aria[:50]}")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            review_tab_found = True
            break

    if not review_tab_found:
        print("\nWARNING: Still no reviews tab found!")
    else:
        # Now scroll and check for reviews
        print("\nScrolling to load reviews...")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
            for _ in range(10):
                driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
                time.sleep(0.3)
        except Exception as exc:
            # Best-effort: the pane selector may not match on this page layout.
            print(f"Scrolling skipped: {exc}")

        # Check for reviews using known selectors
        selectors_to_check = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div.fontBodyMedium',
            'div[data-review-id]'
        ]

        print("\nChecking selectors:")
        for selector in selectors_to_check:
            # Pass the selector as a script argument instead of interpolating it
            # into the JS source, so selectors containing quotes cannot break
            # the script text.
            count = driver.execute_script(
                "return document.querySelectorAll(arguments[0]).length;", selector
            )
            print(f" {selector:30} : {count} elements")

    print(f"\n{'='*80}")
    print("Browser open for inspection (120s)...")
    print(f"{'='*80}")
    time.sleep(120)

finally:
    # Always close the browser, even if an earlier step raised.
    driver.quit()
|
||||
Reference in New Issue
Block a user