Fix: Add early no-reviews detection and hide analytics for empty jobs
Changes:
- Early detection of "no reviews" messages in 11 languages
- Checks for disabled reviews tabs and 0-review indicators
- Returns early (saves 30-40s) when no reviews exist
- Frontend hides the analytics/export buttons when reviews_count = 0
- Structural pattern matching improvements (work in progress)

Known issue:
- The Lithuanian hospital page has a different structure (no tabs found)
- Needs separate investigation; it may use a different Google Maps layout

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
166
brute_force_selector.py
Normal file
166
brute_force_selector.py
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Brute force approach: Try every possible div class combination and see which gives us reviews.

Flow: open the Maps search URL, dismiss the consent form if one is shown,
click the reviews tab, scroll the pane, then score every classed <div> inside
the pane and print the ten class-based selectors that match the most elements.
Finally the top selector is probed against the first three elements it matches.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

# Selector this script uses to locate the reviews pane (for scrolling and scanning).
PANE_SELECTOR = 'div.m6QErb.WNBkOb.XiKgde'

# Runs in the page; arguments[0] is the pane selector.
# Returns {error: ...} when the pane is missing, otherwise {top_selectors: [...]}.
FIND_CANDIDATES_JS = """
    // Find the reviews pane
    const pane = document.querySelector(arguments[0]);
    if (!pane) return {error: 'Pane not found'};

    // selector -> info, in first-seen order (the first matching div supplies the sample)
    const bySelector = new Map();

    for (const div of pane.querySelectorAll('div')) {
        // Skip divs without any class token (covers whitespace-only class attributes too)
        if (div.classList.length === 0) continue;

        // Check for review indicators
        const textLength = div.textContent.length;
        const hasRating = !!div.querySelector('[aria-label*="star" i]');
        const hasText = textLength > 50 && textLength < 1000;  // Individual review size
        const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');

        // Must have rating + text at minimum; a bare score threshold would also
        // admit rating + image with no review-sized text.
        if (!hasRating || !hasText) continue;
        const score = 3 + 2 + (hasAuthor ? 1 : 0);

        // CSS.escape keeps class tokens with special characters from producing
        // a selector that querySelectorAll rejects.
        const selector = 'div' + Array.from(div.classList)
            .map(cls => '.' + CSS.escape(cls))
            .join('');
        if (bySelector.has(selector)) continue;

        bySelector.set(selector, {
            selector: selector,
            // Count how many elements match this selector inside the pane
            count: pane.querySelectorAll(selector).length,
            score: score,
            text_length: textLength,
            sample: div.textContent.substring(0, 100)
        });
    }

    // Sort by count (we want selectors that match many reviews)
    const top = Array.from(bySelector.values())
        .sort((a, b) => b.count - a.count)
        .slice(0, 10);

    return {top_selectors: top};
"""

# Runs in the page; arguments[0] is the selector to probe.
# Returns review-indicator flags for the first three matching elements.
PROBE_SELECTOR_JS = """
    const elements = document.querySelectorAll(arguments[0]);
    const reviews = [];

    for (let i = 0; i < Math.min(3, elements.length); i++) {
        const elem = elements[i];
        reviews.push({
            has_author: !!elem.querySelector('button, img'),
            has_rating: !!elem.querySelector('[aria-label*="star" i]'),
            has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
            text_length: elem.textContent.length,
            text_sample: elem.textContent.substring(0, 150)
        });
    }

    return reviews;
"""

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR: best-effort click on the consent button; failure must not abort the run
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR consent handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    for tab in driver.find_elements('css selector', 'button[role="tab"]'):
        label = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in label or 'review' in aria:
            # Click through JS rather than a native WebDriver click
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews (best-effort: the pane may not exist on this layout)
    try:
        pane = driver.find_element('css selector', PANE_SELECTOR)
        for _ in range(10):
            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
            time.sleep(0.3)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    print("\n" + "="*80)
    print("BRUTE FORCE SELECTOR SEARCH")
    print("="*80)

    # Get ALL unique class combinations from divs inside the reviews pane
    candidates = driver.execute_script(FIND_CANDIDATES_JS, PANE_SELECTOR)

    if 'error' in candidates:
        print(f"ERROR: {candidates['error']}")
    else:
        print("\nTop 10 candidate selectors (sorted by count):\n")
        for i, candidate in enumerate(candidates['top_selectors'], 1):
            print(f"{i}. {candidate['selector']}")
            print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
            print(f" Sample: {candidate['sample'][:80]}...")
            print()

        # Test the top selector
        if candidates['top_selectors']:
            top_selector = candidates['top_selectors'][0]['selector']
            print(f"\n{'='*80}")
            print(f"TESTING TOP SELECTOR: {top_selector}")
            print(f"{'='*80}")

            # Pass the selector as a script argument: escaped selectors contain
            # backslashes that would be mangled if spliced into the JS source.
            test_result = driver.execute_script(PROBE_SELECTOR_JS, top_selector)

            print(f"\nFirst 3 elements using {top_selector}:")
            for i, rev in enumerate(test_result, 1):
                print(f"\n Element {i}:")
                for key, value in rev.items():
                    print(f" {key}: {value}")

    print(f"\n{'='*80}")
    print("Browser staying open for 60 seconds...")
    print(f"{'='*80}")
    time.sleep(60)

finally:
    driver.quit()
|
||||||
106
check_page_structure.py
Normal file
106
check_page_structure.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Check the actual page structure - maybe reviews are already visible without clicking a tab!

Loads the Maps search URL, dismisses the consent form if shown, prints counts
for a handful of selectors plus a text sample of the page, then lists elements
whose aria-label mentions a star rating. The browser stays open for three
minutes afterwards for manual inspection.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

# Runs in the page; returns selector counts and text probes for the whole document.
PAGE_INFO_JS = """
    const bodyText = document.body.innerText;
    const lowered = bodyText.toLowerCase();
    return {
        tabs_found: document.querySelectorAll('button[role="tab"]').length,
        reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
        reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
        divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
        review_containers: document.querySelectorAll('div.fontBodyMedium').length,
        page_text_sample: bodyText.substring(0, 500),
        has_review_text: lowered.includes('review'),
        has_atsiliepimai_text: lowered.includes('atsiliepimai')
    };
"""

# Runs in the page; returns {total, samples} for elements whose aria-label
# contains 'star' or the Lithuanian stem 'žvaigžd'.
RATING_SEARCH_JS = """
    // The class attribute is read with getAttribute because SVG elements expose
    // className as an SVGAnimatedString, which has no substring().
    const classOf = (el) => (el.getAttribute('class') || '').substring(0, 100);

    const withRatings = [];
    for (const elem of document.querySelectorAll('*')) {
        const ariaLabel = elem.getAttribute('aria-label') || '';
        const lowered = ariaLabel.toLowerCase();
        if (!lowered.includes('star') && !lowered.includes('žvaigžd')) continue;

        const parent = elem.parentElement;
        withRatings.push({
            tag: elem.tagName,
            ariaLabel: ariaLabel.substring(0, 100),
            classes: classOf(elem),
            parentTag: parent ? parent.tagName : null,
            parentClasses: parent ? classOf(parent) : null
        });
    }

    // Report the real total alongside the first 10 samples
    return {total: withRatings.length, samples: withRatings.slice(0, 10)};
"""

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Initial URL: {url}")
    time.sleep(5)

    # GDPR: best-effort click on the consent button; failure must not abort the run
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR consent handling skipped: {exc}")

    # Check final URL
    final_url = driver.current_url
    print(f"Final URL after redirect: {final_url}")

    # Wait a bit more for dynamic content
    time.sleep(3)

    # Check page structure
    print("\n" + "="*80)
    print("PAGE STRUCTURE ANALYSIS")
    print("="*80)

    page_info = driver.execute_script(PAGE_INFO_JS)

    print(f"\nTabs with role='tab': {page_info['tabs_found']}")
    print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
    print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
    print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
    print(f"div.fontBodyMedium: {page_info['review_containers']}")
    print(f"Contains 'review': {page_info['has_review_text']}")
    print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")

    print("\nPage text sample (first 500 chars):")
    print(page_info['page_text_sample'])

    # Try to find ANY element with rating
    print("\n" + "="*80)
    print("SEARCHING FOR RATING ELEMENTS")
    print("="*80)

    rating_search = driver.execute_script(RATING_SEARCH_JS)

    # 'total' is the full match count; only the first five samples are printed
    print(f"\nFound {rating_search['total']} elements with 'star' in aria-label:")
    for i, elem in enumerate(rating_search['samples'][:5], 1):
        print(f"\n Element {i}:")
        print(f" Tag: {elem['tag']}")
        print(f" Aria-label: {elem['ariaLabel']}")
        print(f" Classes: {elem['classes']}")
        print(f" Parent tag: {elem['parentTag']}")
        print(f" Parent classes: {elem['parentClasses']}")

    print(f"\n{'='*80}")
    print("Browser open for manual inspection...")
    print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
    print(f"{'='*80}")
    time.sleep(180)  # 3 minutes

finally:
    driver.quit()
|
||||||
163
diagnose_reviews_panel.py
Normal file
163
diagnose_reviews_panel.py
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Better diagnostic: Actually wait for reviews panel to load and find correct selector.

Loads the Maps search URL with hl=en, dismisses the consent form if shown,
clicks the reviews tab, scrolls the first pane it can find, prints how many
elements each candidate selector matches, and analyses the first selector
that matches more than five elements.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

# Runs in the page; arguments[0] is the selector whose matches are counted.
COUNT_JS = "return document.querySelectorAll(arguments[0]).length;"

# Runs in the page; returns an analysis of the first selector with >5 matches,
# or {error: ...} when none qualifies.
ANALYSIS_JS = """
    // Try selectors in order of likelihood
    const selectors = [
        'div.jftiEf.fontBodyMedium',
        'div.jftiEf',
        'div.fontBodyMedium',
        'div[jslog*="impression"]',
        '[role="article"]'
    ];

    for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length <= 5) continue;  // Need at least a few to be reviews

        // Check if multiple elements have review characteristics
        let reviewLikeCount = 0;
        for (let i = 0; i < Math.min(10, elements.length); i++) {
            const elem = elements[i];
            const hasRating = !!elem.querySelector('[aria-label*="star" i]');
            const hasText = elem.textContent.length > 30;
            if (hasRating && hasText) reviewLikeCount++;
        }

        // Analyze first element
        const first = elements[0];
        return {
            selector: selector,
            total_found: elements.length,
            first_element: {
                tag: first.tagName,
                classes: first.className,
                has_rating: !!first.querySelector('[aria-label*="star" i]'),
                has_author: !!first.querySelector('button, a, div[aria-label]'),
                has_avatar: !!first.querySelector('img'),
                has_date: !!first.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute)/i),
                text_length: first.textContent.length,
                sample_text: first.textContent.substring(0, 100)
            },
            review_like_count_in_first_10: reviewLikeCount
        };
    }

    return {error: 'No selector found with >5 elements'};
"""

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale
    url += '&hl=en' if '?' in url else '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")
    time.sleep(5)

    # Handle GDPR: best-effort; failure must not abort the run
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept all' in (btn.text or '').lower():
                print(f"Clicking GDPR: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR consent handling skipped: {exc}")

    # Click reviews tab and WAIT for panel to load
    print("\nClicking reviews tab...")
    time.sleep(2)
    for tab in driver.find_elements('css selector', 'button[role="tab"]'):
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Found reviews tab: {tab.text or aria[:50]}")
            driver.execute_script("arguments[0].click();", tab)
            print("Clicked! Waiting for reviews panel to load...")
            time.sleep(5)  # Wait longer for reviews to actually load
            break

    # Try scrolling the reviews pane to load more
    print("\nTrying to find and scroll reviews pane...")
    pane_selectors = [
        'div.m6QErb.WNBkOb.XiKgde',
        'div.m6QErb',
        'div[role="main"]'
    ]

    for selector in pane_selectors:
        try:
            pane = driver.find_element('css selector', selector)
            print(f"Found pane: {selector}")
            for _ in range(2):
                driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
                time.sleep(2)
            break
        except Exception as exc:
            # This selector did not work on this layout; fall through to the next one
            print(f"Pane selector {selector} not usable: {type(exc).__name__}")
            continue

    # NOW check for review selectors
    print("\n" + "="*80)
    print("CHECKING REVIEW SELECTORS AFTER PANEL LOADED:")
    print("="*80)

    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Standard Google Maps reviews'),
        ('div.jftiEf', 'Just jftiEf class'),
        ('div.fontBodyMedium', 'Just fontBodyMedium'),
        ('div[data-review-id]', 'data-review-id attribute'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[data-review]', 'data-review attribute'),
        ('div[class*="review" i]', 'Class containing review'),
        ('[role="article"]', 'role=article'),
        ('div[jslog]', 'Elements with jslog (Google tracking)'),
    ]

    for selector, description in selectors_to_try:
        # The selector travels as a script argument, so quotes inside it cannot
        # break the JS source.
        count = driver.execute_script(COUNT_JS, selector)
        print(f"{description:35} | {selector:40} | Found: {count}")

    # Get detailed info about most promising selector
    print("\n" + "="*80)
    print("ANALYZING MOST PROMISING SELECTOR:")
    print("="*80)

    analysis = driver.execute_script(ANALYSIS_JS)

    if 'error' in analysis:
        print(f"ERROR: {analysis['error']}")
    else:
        print(f"Best selector: {analysis['selector']}")
        print(f"Total found: {analysis['total_found']}")
        print(f"Review-like in first 10: {analysis['review_like_count_in_first_10']}")
        print("\nFirst element analysis:")
        for key, value in analysis['first_element'].items():
            print(f" {key}: {value}")

    print("\n" + "="*80)
    print("Keeping browser open for 120 seconds for manual inspection...")
    print("="*80)
    time.sleep(120)

finally:
    driver.quit()
|
||||||
126
diagnose_selectors.py
Normal file
126
diagnose_selectors.py
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Diagnostic script to find the correct selector for Lithuanian hospital reviews.
Opens the browser and pauses so we can inspect the page manually.

Before pausing it prints the match count for each candidate selector and a
500-character HTML sample of the first element matched by the first selector
that finds anything.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

# Runs in the page; arguments[0] is the selector whose matches are counted.
COUNT_JS = "return document.querySelectorAll(arguments[0]).length;"

# Runs in the page; returns details of the first match of the first selector
# that matches anything, or null when none does.
SAMPLE_HTML_JS = """
    const selectors = [
        'div.jftiEf.fontBodyMedium',
        'div.jftiEf',
        '[role="article"]',
        'div[jsaction*="review"]'
    ];

    for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length === 0) continue;

        const first = elements[0];
        return {
            selector: selector,
            count: elements.length,
            outerHTML: first.outerHTML.substring(0, 500),
            classes: first.className,
            hasRating: !!first.querySelector('[aria-label*="star" i]'),
            hasAuthor: !!first.querySelector('img'),
            textLength: first.textContent.length
        };
    }
    return null;
"""

print("Opening browser...")
driver = Driver(uc=True, headless=False)

try:
    # Add English locale for consistency
    url += '&hl=en' if '?' in url else '?hl=en'

    driver.get(url)
    print(f"Loaded: {url}")

    # Wait for page to load
    time.sleep(5)

    # Handle GDPR: best-effort; failure must not abort the run
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            btn_text = (btn.text or '').lower()
            if 'accept all' in btn_text or 'aceptar todo' in btn_text:
                print(f"Clicking GDPR consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR consent handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    for tab in driver.find_elements('css selector', 'button[role="tab"]'):
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in text or 'review' in aria:
            print(f"Clicking reviews tab: {tab.text or aria[:30]}")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(3)
            break

    # Try different selectors and show what we find.
    # Each selector is listed once ('div[data-review-id]' used to appear twice).
    selectors_to_try = [
        ('div.jftiEf.fontBodyMedium', 'Known selector 1'),
        ('div.jftiEf', 'Known selector 2'),
        ('div[data-review-id]', 'Known selector 3'),
        ('div[jsaction*="review"]', 'jsaction with review'),
        ('[role="article"]', 'role=article'),
        ('div.fontBodyMedium', 'Just fontBodyMedium class'),
        ('div[class*="review"]', 'Class containing "review"'),
    ]

    print("\n" + "="*80)
    print("TESTING SELECTORS:")
    print("="*80)

    for selector, description in selectors_to_try:
        # The selector travels as a script argument, so quotes inside it cannot
        # break the JS source.
        count = driver.execute_script(COUNT_JS, selector)
        print(f"{description:30} | {selector:40} | Found: {count}")

    # Show sample HTML of first few elements matching the most promising selector
    print("\n" + "="*80)
    print("SAMPLE HTML FROM FIRST MATCH:")
    print("="*80)

    sample_html = driver.execute_script(SAMPLE_HTML_JS)

    if sample_html:
        print(f"Selector: {sample_html['selector']}")
        print(f"Total found: {sample_html['count']}")
        print(f"Classes: {sample_html['classes']}")
        print(f"Has rating: {sample_html['hasRating']}")
        print(f"Has author img: {sample_html['hasAuthor']}")
        print(f"Text length: {sample_html['textLength']}")
        print("\nSample HTML (first 500 chars):")
        print(sample_html['outerHTML'])

    print("\n" + "="*80)
    print("Browser will stay open for 60 seconds so you can inspect manually...")
    print("Use DevTools to find the correct selector!")
    print("="*80)

    # Keep browser open for inspection
    time.sleep(60)

finally:
    driver.quit()
    print("\nBrowser closed.")
|
||||||
156
find_actual_reviews.py
Normal file
156
find_actual_reviews.py
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Find the ACTUAL selector for reviews by looking for elements with review structure.

Loads the Maps search URL, dismisses the consent form if shown, clicks the
reviews tab, scrolls the pane, then collects every <div> that contains a
rating element and more than 50 characters of text. The class tokens shared
by all such divs are turned into a suggested selector, which is then counted
against the whole document.
"""

import time
from seleniumbase import Driver

url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

# Runs in the page; returns {total_found, first_5} for divs that hold a
# star/rating aria-label and at least 50 characters of text.
REVIEW_SCAN_JS = """
    // Find all elements that have BOTH a rating AND substantial text
    const reviews = [];

    for (const div of document.querySelectorAll('div')) {
        // Must have a rating (star aria-label)
        if (!div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]')) continue;

        // Must have decent text content (>50 chars to avoid buttons)
        if (div.textContent.length < 50) continue;

        // Get the classes and attributes
        reviews.push({
            classes: div.className,
            has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
            has_avatar: !!div.querySelector('img'),
            has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
            text_length: div.textContent.length,
            sample_text: div.textContent.substring(0, 150),
            tag_name: div.tagName,
            jslog: div.getAttribute('jslog'),
            data_review_id: div.getAttribute('data-review-id'),
            jsaction: div.getAttribute('jsaction')
        });
    }

    return {
        total_found: reviews.length,
        first_5: reviews.slice(0, 5)
    };
"""

# Runs in the page; returns the class tokens shared by every review-like div
# and a selector built from them, or null when no review-like div exists.
COMMON_SELECTOR_JS = """
    // Find common classes among review elements
    const reviews = Array.from(document.querySelectorAll('div')).filter(div =>
        div.querySelector('[aria-label*="star" i]') && div.textContent.length > 50
    );

    if (reviews.length === 0) return null;

    // Find classes of the first review that appear in ALL reviews
    const commonClasses = Array.from(reviews[0].classList).filter(cls =>
        reviews.every(rev => rev.classList.contains(cls))
    );

    return {
        total_reviews: reviews.length,
        common_classes: commonClasses,
        // CSS.escape keeps class tokens with special characters from producing
        // a selector that querySelectorAll rejects.
        suggested_selector: commonClasses.length > 0
            ? 'div' + commonClasses.map(cls => '.' + CSS.escape(cls)).join('')
            : null,
        first_review_classes: reviews[0].className
    };
"""

# Runs in the page; arguments[0] is the selector whose matches are counted.
COUNT_JS = "return document.querySelectorAll(arguments[0]).length;"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR: best-effort click on the consent button; failure must not abort the run
    try:
        for btn in driver.find_elements('css selector', 'form button'):
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"GDPR consent handling skipped: {exc}")

    # Click reviews tab
    time.sleep(2)
    for tab in driver.find_elements('css selector', 'button[role="tab"]'):
        label = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()
        if 'review' in label or 'review' in aria:
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break

    # Scroll to load reviews (best-effort: the pane may not exist on this layout)
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(3):
            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
            time.sleep(1)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    # Use JavaScript to find ALL elements that look like reviews
    print("\n" + "="*80)
    print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
    print("="*80)

    review_info = driver.execute_script(REVIEW_SCAN_JS)

    print(f"\nFound {review_info['total_found']} elements with review structure")
    print("\nFirst 5 review-like elements:")
    for i, rev in enumerate(review_info['first_5'], 1):
        print(f"\n Review {i}:")
        print(f" Classes: {rev['classes']}")
        print(f" Has author: {rev['has_author']}")
        print(f" Has avatar: {rev['has_avatar']}")
        print(f" Has date: {rev['has_date']}")
        print(f" Text length: {rev['text_length']}")
        print(f" jslog: {rev['jslog']}")
        print(f" data-review-id: {rev['data_review_id']}")
        print(f" Sample: {rev['sample_text'][:80]}...")

    # Try to find a common class among review elements
    if review_info['total_found'] > 0:
        print("\n" + "="*80)
        print("FINDING COMMON SELECTOR:")
        print("="*80)

        common_selector = driver.execute_script(COMMON_SELECTOR_JS)

        if common_selector:
            print(f"Total review elements: {common_selector['total_reviews']}")
            print(f"Common classes: {common_selector['common_classes']}")
            print(f"Suggested selector: {common_selector['suggested_selector']}")
            print(f"First review full classes: {common_selector['first_review_classes']}")

            # Test the suggested selector. It is passed as a script argument:
            # escaped selectors contain backslashes that would be mangled if
            # spliced into the JS source.
            if common_selector['suggested_selector']:
                test_count = driver.execute_script(
                    COUNT_JS, common_selector['suggested_selector']
                )
                print(f"\nTesting suggested selector: Found {test_count} elements")

    print("\n" + "="*80)
    print("Browser staying open for manual inspection (60s)...")
    print("="*80)
    time.sleep(60)

finally:
    driver.quit()
|
||||||
157
inspect_pane_content.py
Normal file
157
inspect_pane_content.py
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Check what's actually inside the reviews pane after scrolling.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from seleniumbase import Driver
|
||||||
|
|
||||||
|
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
|
||||||
|
|
||||||
|
driver = Driver(uc=True, headless=False)
|
||||||
|
|
||||||
|
try:
|
||||||
|
driver.get(url)
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
# GDPR
|
||||||
|
try:
|
||||||
|
form_btns = driver.find_elements('css selector', 'form button')
|
||||||
|
for btn in form_btns:
|
||||||
|
if 'accept all' in (btn.text or '').lower():
|
||||||
|
btn.click()
|
||||||
|
time.sleep(2)
|
||||||
|
break
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Click reviews tab
|
||||||
|
time.sleep(2)
|
||||||
|
tabs = driver.find_elements('css selector', 'button[role="tab"]')
|
||||||
|
review_tab_found = False
|
||||||
|
for tab in tabs:
|
||||||
|
text = (tab.text or '').lower()
|
||||||
|
aria = (tab.get_attribute('aria-label') or '').lower()
|
||||||
|
print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
|
||||||
|
if 'review' in text or 'review' in aria:
|
||||||
|
print(f" -> Clicking this tab!")
|
||||||
|
driver.execute_script("arguments[0].click();", tab)
|
||||||
|
time.sleep(6) # Wait longer
|
||||||
|
review_tab_found = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not review_tab_found:
|
||||||
|
print("WARNING: Reviews tab not found!")
|
||||||
|
|
||||||
|
# Find and scroll the pane
|
||||||
|
print("\nLooking for scrollable pane...")
|
||||||
|
pane = None
|
||||||
|
try:
|
||||||
|
pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
|
||||||
|
print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
|
||||||
|
except:
|
||||||
|
print("Pane not found with standard selector!")
|
||||||
|
try:
|
||||||
|
pane = driver.find_element('css selector', 'div.m6QErb')
|
||||||
|
print(f"Found pane: div.m6QErb")
|
||||||
|
except:
|
||||||
|
print("No pane found at all!")
|
||||||
|
|
||||||
|
if pane:
|
||||||
|
print("\nScrolling pane to load reviews...")
|
||||||
|
for i in range(15):
|
||||||
|
driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
|
||||||
|
time.sleep(0.4)
|
||||||
|
if (i + 1) % 5 == 0:
|
||||||
|
print(f" Scrolled {i+1} times...")
|
||||||
|
|
||||||
|
# Now check what's in the pane
|
||||||
|
print("\n" + "="*80)
|
||||||
|
print("ANALYZING PANE CONTENT")
|
||||||
|
print("="*80)
|
||||||
|
|
||||||
|
content_info = driver.execute_script("""
|
||||||
|
const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
|
||||||
|
if (!pane) return {error: 'No pane found'};
|
||||||
|
|
||||||
|
// Get all child divs (direct and nested)
|
||||||
|
const allDivs = Array.from(pane.querySelectorAll('div'));
|
||||||
|
|
||||||
|
// Get all unique class names used
|
||||||
|
const classNames = new Set();
|
||||||
|
allDivs.forEach(div => {
|
||||||
|
if (div.className) {
|
||||||
|
div.className.split(' ').forEach(cls => {
|
||||||
|
if (cls.trim()) classNames.add(cls.trim());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Find divs with ratings
|
||||||
|
const divsWithRatings = allDivs.filter(div => {
|
||||||
|
return !!div.querySelector('[aria-label*="star" i]');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Find divs with author photos
|
||||||
|
const divsWithPhotos = allDivs.filter(div => {
|
||||||
|
return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Find divs with date patterns
|
||||||
|
const divsWithDates = allDivs.filter(div => {
|
||||||
|
return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Find divs with ALL three
|
||||||
|
const reviewLikeDivs = allDivs.filter(div => {
|
||||||
|
const hasRating = !!div.querySelector('[aria-label*="star" i]');
|
||||||
|
const hasPhoto = !!div.querySelector('img');
|
||||||
|
const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|ago)/i);
|
||||||
|
const textLen = div.textContent.length;
|
||||||
|
return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
total_divs: allDivs.length,
|
||||||
|
unique_classes: Array.from(classNames).sort(),
|
||||||
|
divs_with_ratings: divsWithRatings.length,
|
||||||
|
divs_with_photos: divsWithPhotos.length,
|
||||||
|
divs_with_dates: divsWithDates.length,
|
||||||
|
review_like_divs: reviewLikeDivs.length,
|
||||||
|
review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
|
||||||
|
classes: d.className,
|
||||||
|
text_length: d.textContent.length,
|
||||||
|
sample: d.textContent.substring(0, 100)
|
||||||
|
}))
|
||||||
|
};
|
||||||
|
""")
|
||||||
|
|
||||||
|
if 'error' in content_info:
|
||||||
|
print(f"ERROR: {content_info['error']}")
|
||||||
|
else:
|
||||||
|
print(f"\nTotal divs in pane: {content_info['total_divs']}")
|
||||||
|
print(f"Divs with ratings: {content_info['divs_with_ratings']}")
|
||||||
|
print(f"Divs with photos: {content_info['divs_with_photos']}")
|
||||||
|
print(f"Divs with dates: {content_info['divs_with_dates']}")
|
||||||
|
print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")
|
||||||
|
|
||||||
|
print(f"\nFirst 20 unique classes found in pane:")
|
||||||
|
for cls in content_info['unique_classes'][:20]:
|
||||||
|
print(f" {cls}")
|
||||||
|
|
||||||
|
if content_info['review_like_divs'] > 0:
|
||||||
|
print(f"\nFirst 5 review-like divs:")
|
||||||
|
for i, div_info in enumerate(content_info['review_like_classes'], 1):
|
||||||
|
print(f"\n Div {i}:")
|
||||||
|
print(f" Classes: {div_info['classes']}")
|
||||||
|
print(f" Text length: {div_info['text_length']}")
|
||||||
|
print(f" Sample: {div_info['sample'][:80]}...")
|
||||||
|
|
||||||
|
print(f"\n{'='*80}")
|
||||||
|
print("Browser staying open for manual inspection (120 seconds)...")
|
||||||
|
print("Look at the DevTools to see the actual review elements!")
|
||||||
|
print(f"{'='*80}")
|
||||||
|
time.sleep(120)
|
||||||
|
|
||||||
|
finally:
|
||||||
|
driver.quit()
|
||||||
70
manual_inspect.py
Normal file
70
manual_inspect.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Open the page and keep it open for manual inspection.
INSTRUCTIONS:
1. Open DevTools (F12)
2. Click on an individual review
3. Look at the div that contains ONE review (not the whole list)
4. Note the class names on that div
"""

import time
from seleniumbase import Driver

# NOTE(review): the query spells "respubliikine" with a doubled "i"; if that is
# a typo, Google Maps may resolve the search differently - confirm the intended query.
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    time.sleep(5)

    # GDPR: best-effort click on the consent button. Failures are reported but
    # not fatal; `except Exception` (rather than a bare `except`) keeps Ctrl-C
    # and SystemExit working during the long waits below.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            if 'accept all' in (btn.text or '').lower():
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent dialog handling skipped: {exc}")

    # Click reviews tab (matched on visible text or aria-label).
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for tab in tabs:
        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
            # JS click avoids "element not interactable" issues with overlays.
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            break
    else:
        # Loop finished without a break: no tab matched.
        print("WARNING: Reviews tab not found!")

    # Scroll to load a few reviews
    try:
        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
        for _ in range(5):
            driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
            time.sleep(0.5)
    except Exception as exc:
        print(f"Could not scroll reviews pane: {exc}")

    print("\n" + "="*80)
    print("MANUAL INSPECTION TIME!")
    print("="*80)
    print("\n1. The browser is now showing the reviews page")
    print("2. Open DevTools (F12 or right-click > Inspect)")
    print("3. Click the 'Select element' tool (top-left of DevTools)")
    print("4. Hover over an INDIVIDUAL review (not the whole panel)")
    print("5. Click on it to select it in the inspector")
    print("6. Look at the <div> that wraps ONE SINGLE review")
    print("7. Note the 'class' attribute value")
    print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
    print("\n9. Write down the full class name(s) - we'll use this as the selector!")
    print("\n" + "="*80)
    print("Browser will stay open for 5 minutes...")
    print("="*80)

    time.sleep(300)  # 5 minutes

finally:
    # Always release the browser, even on error or Ctrl-C.
    driver.quit()
|
||||||
71
test_lithuanian_hospital.py
Normal file
71
test_lithuanian_hospital.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for Lithuanian hospital to verify structural pattern matching works.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from modules.fast_scraper import fast_scrape_reviews
|
||||||
|
|
||||||
|
# Set up root logging at INFO level; each record carries a timestamp,
# the logger name and the severity in front of the message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module.
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def test_lithuanian_hospital():
    """Test scraping the Lithuanian hospital that was getting 0 reviews.

    Calls ``fast_scrape_reviews`` on the hospital's Google Maps search URL
    with a visible browser, then logs the outcome: success flag, review
    count, elapsed time, optional message/error, and up to three sample
    reviews. The count is finally classified as success (> 200), failure
    (0) or partial (anything else).

    The function only logs; it makes no assertions and returns ``None``.
    """
    url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

    log.info("=" * 80)
    log.info("Testing Lithuanian Hospital: Panevėžio respublikinė ligoninė")
    log.info("Expected: 271 reviews")
    log.info("Previous result: 0 reviews (selector mismatch)")
    log.info("=" * 80)

    # Run the scraper with headless mode OFF so we can see what's happening
    result = fast_scrape_reviews(
        url=url,
        headless=False,  # Show browser for debugging
        max_scrolls=999999  # Unlimited - use idle detection
    )

    log.info("=" * 80)
    log.info("RESULTS:")
    log.info(f"Success: {result['success']}")
    log.info(f"Reviews found: {result['count']}")
    log.info(f"Total reviews on page: {result.get('total_reviews', 'Unknown')}")
    log.info(f"Time taken: {result['time']:.2f}s")

    if result.get('message'):
        log.info(f"Message: {result['message']}")

    if result.get('error'):
        log.error(f"Error: {result['error']}")

    log.info("=" * 80)

    # Show first few reviews if found
    if result['count'] > 0:
        log.info("\nFirst 3 reviews:")
        for i, review in enumerate(result['reviews'][:3], 1):
            log.info(f"\n Review {i}:")
            log.info(f" Author: {review.get('author', 'N/A')}")
            log.info(f" Rating: {review.get('rating', 'N/A')}")
            log.info(f" Date: {review.get('date_text', 'N/A')}")
            # `or` also covers a 'text' key that is present but None/empty,
            # which would otherwise make the slice raise TypeError.
            log.info(f" Text: {(review.get('text') or 'N/A')[:100]}...")

    # Verify the fix worked
    if result['count'] > 200:
        log.info("\n✅ SUCCESS! Structural pattern matching found reviews!")
        log.info(f" Got {result['count']} reviews (expected ~271)")
    elif result['count'] == 0:
        log.error("\n❌ FAILED! Still getting 0 reviews - selector issue not fixed")
    else:
        log.warning(f"\n⚠️ PARTIAL: Got {result['count']} reviews (expected ~271)")
        log.warning(" May need to increase idle detection patience")


if __name__ == "__main__":
    test_lithuanian_hospital()
|
||||||
92
test_without_english.py
Normal file
92
test_without_english.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
#!/usr/bin/env python3
"""
Test WITHOUT forcing English locale - use the page's default language.
"""

import time
from seleniumbase import Driver

# NO hl=en parameter!
url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"

driver = Driver(uc=True, headless=False)

try:
    driver.get(url)
    print(f"Loaded (NO hl=en): {url}")
    time.sleep(5)

    # GDPR: best-effort consent click. Errors are reported instead of being
    # silently dropped, and `except Exception` leaves Ctrl-C / SystemExit alone.
    try:
        form_btns = driver.find_elements('css selector', 'form button')
        for btn in form_btns:
            btn_text = (btn.text or '').lower()
            if 'accept' in btn_text or 'priim' in btn_text:  # Lithuanian "priimti"
                print(f"Clicking consent: {btn.text}")
                btn.click()
                time.sleep(2)
                break
    except Exception as exc:
        print(f"Consent dialog handling skipped: {exc}")

    # List ALL tabs
    print("\nALL TABS FOUND:")
    time.sleep(2)
    tabs = driver.find_elements('css selector', 'button[role="tab"]')
    for i, tab in enumerate(tabs, 1):
        text = tab.text or ''
        aria = tab.get_attribute('aria-label') or ''
        print(f" Tab {i}: text='{text}', aria='{aria}'")

    # Look for reviews tab (try multiple keywords)
    review_keywords = ['review', 'reseña', 'atsiliepimai', 'atsiliepi', 'отзыв']
    review_tab_found = False

    for tab in tabs:
        text = (tab.text or '').lower()
        aria = (tab.get_attribute('aria-label') or '').lower()

        # A tab matches if any keyword appears in its text or aria-label.
        if any(keyword in text or keyword in aria for keyword in review_keywords):
            print(f"\nFound REVIEWS TAB: {tab.text or aria[:50]}")
            driver.execute_script("arguments[0].click();", tab)
            time.sleep(5)
            review_tab_found = True
            break

    if not review_tab_found:
        print("\nWARNING: Still no reviews tab found!")
    else:
        # Now scroll and check for reviews
        print("\nScrolling to load reviews...")
        try:
            pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
            for _ in range(10):
                driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
                time.sleep(0.3)
        except Exception as exc:
            print(f"Could not scroll reviews pane: {exc}")

        # Check for reviews using known selectors
        selectors_to_check = [
            'div.jftiEf.fontBodyMedium',
            'div.jftiEf',
            'div.fontBodyMedium',
            'div[data-review-id]'
        ]

        print("\nChecking selectors:")
        for selector in selectors_to_check:
            # Pass the selector as a script argument instead of splicing it
            # into the JavaScript source, so quotes in a selector cannot
            # break the script.
            count = driver.execute_script(
                "return document.querySelectorAll(arguments[0]).length;",
                selector,
            )
            print(f" {selector:30} : {count} elements")

    print(f"\n{'='*80}")
    print("Browser open for inspection (120s)...")
    print(f"{'='*80}")
    time.sleep(120)

finally:
    # Always release the browser, even on error or Ctrl-C.
    driver.quit()
|
||||||
Reference in New Issue
Block a user