diff --git a/brute_force_selector.py b/brute_force_selector.py
new file mode 100644
index 0000000..21ac024
--- /dev/null
+++ b/brute_force_selector.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Brute force approach: score every classed div in the reviews pane and rank the resulting selectors.
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Click the first tab whose text or aria-label mentions "review"
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for tab in tabs:
+        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
+            driver.execute_script("arguments[0].click();", tab)
+            time.sleep(5)
+            break
+
+    # Scroll the pane so more reviews get loaded (best effort)
+    try:
+        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
+        for _ in range(10):
+            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
+            time.sleep(0.3)
+    except Exception:  # pane selector did not match: analysis below reports it
+        pass
+
+    print("\n" + "="*80)
+    print("BRUTE FORCE SELECTOR SEARCH")
+    print("="*80)
+
+    # Score every classed div in the pane; returns {error} or {top_selectors: [...]}
+    candidates = driver.execute_script("""
+        // Find the reviews pane
+        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde');
+        if (!pane) return {error: 'Pane not found'};
+
+        // Get all divs inside the pane
+        const allDivs = Array.from(pane.querySelectorAll('div'));
+
+        // For each div, check if it looks like a review
+        const candidates = [];
+
+        for (let div of allDivs) {
+            // Skip divs without any class token (a bare 'div.' selector is invalid)
+            if (div.classList.length === 0) continue;
+
+            // Check for review indicators
+            const hasRating = !!div.querySelector('[aria-label*="star" i]');
+            const hasText = div.textContent.length > 50 && div.textContent.length < 1000; // Individual review size
+            const hasAuthor = !!div.querySelector('button[aria-label*="photo" i], img');
+
+            // Calculate score (rating 3 + text 2 + author 1)
+            let score = 0;
+            if (hasRating) score += 3;
+            if (hasText) score += 2;
+            if (hasAuthor) score += 1;
+
+            if (score >= 4) { // Must have rating + text or rating + author
+                candidates.push({
+                    classes: div.className,
+                    selector: 'div.' + Array.from(div.classList).map(c => CSS.escape(c)).join('.'),
+                    score: score,
+                    text_length: div.textContent.length,
+                    sample_text: div.textContent.substring(0, 100)
+                });
+            }
+        }
+
+        // Count how many elements match each selector (first candidate per selector wins)
+        const selectorCounts = {};
+        for (let candidate of candidates) {
+            const count = pane.querySelectorAll(candidate.selector).length;
+            if (!selectorCounts[candidate.selector]) {
+                selectorCounts[candidate.selector] = {
+                    count: count,
+                    score: candidate.score,
+                    text_length: candidate.text_length,
+                    sample: candidate.sample_text
+                };
+            }
+        }
+
+        // Sort by count (we want selectors that match many reviews)
+        const sorted = Object.entries(selectorCounts)
+            .sort((a, b) => b[1].count - a[1].count)
+            .slice(0, 10);
+
+        return {
+            top_selectors: sorted.map(([selector, info]) => ({
+                selector: selector,
+                count: info.count,
+                score: info.score,
+                text_length: info.text_length,
+                sample: info.sample
+            }))
+        };
+    """)
+
+    if 'error' in candidates:
+        print(f"ERROR: {candidates['error']}")
+    else:
+        print(f"\nTop 10 candidate selectors (sorted by count):\n")
+        for i, candidate in enumerate(candidates['top_selectors'], 1):
+            print(f"{i}. {candidate['selector']}")
+            print(f" Count: {candidate['count']} | Score: {candidate['score']} | Text length: {candidate['text_length']}")
+            print(f" Sample: {candidate['sample'][:80]}...")
+            print()
+
+        # Test the top selector (passed as a script argument, never spliced into JS)
+        if candidates['top_selectors']:
+            top_selector = candidates['top_selectors'][0]['selector']
+            print(f"\n{'='*80}")
+            print(f"TESTING TOP SELECTOR: {top_selector}")
+            print(f"{'='*80}")
+
+            test_result = driver.execute_script("""
+                const elements = document.querySelectorAll(arguments[0]);
+                const reviews = [];
+
+                for (let i = 0; i < Math.min(3, elements.length); i++) {
+                    const elem = elements[i];
+                    const review = {
+                        has_author: !!elem.querySelector('button, img'),
+                        has_rating: !!elem.querySelector('[aria-label*="star" i]'),
+                        has_date: !!elem.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
+                        text_length: elem.textContent.length,
+                        text_sample: elem.textContent.substring(0, 150)
+                    };
+                    reviews.push(review);
+                }
+
+                return reviews;
+            """, top_selector)
+
+            print(f"\nFirst 3 elements using {top_selector}:")
+            for i, rev in enumerate(test_result, 1):
+                print(f"\n Element {i}:")
+                for key, value in rev.items():
+                    print(f" {key}: {value}")
+
+    print(f"\n{'='*80}")
+    print("Browser staying open for 60 seconds...")
+    print(f"{'='*80}")
+    time.sleep(60)
+
+finally:
+    driver.quit()
diff --git a/check_page_structure.py b/check_page_structure.py
new file mode 100644
index 0000000..b85f6fa
--- /dev/null
+++ b/check_page_structure.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""
+Check the actual page structure - maybe reviews are already visible without clicking a tab!
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    print(f"Initial URL: {url}")
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            if 'accept' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Report where the search URL actually landed
+    final_url = driver.current_url
+    print(f"Final URL after redirect: {final_url}")
+
+    # Wait a bit more for dynamic content
+    time.sleep(3)
+
+    # Count tabs and known review containers on the landing page
+    print("\n" + "="*80)
+    print("PAGE STRUCTURE ANALYSIS")
+    print("="*80)
+
+    page_info = driver.execute_script("""
+        return {
+            tabs_found: document.querySelectorAll('button[role="tab"]').length,
+            reviews_with_standard_selector: document.querySelectorAll('div.jftiEf.fontBodyMedium').length,
+            reviews_with_jftiEf: document.querySelectorAll('div.jftiEf').length,
+            divs_with_ratings: document.querySelectorAll('[aria-label*="star" i]').length,
+            review_containers: document.querySelectorAll('div.fontBodyMedium').length,
+            page_text_sample: document.body.innerText.substring(0, 500),
+            has_review_text: document.body.innerText.toLowerCase().includes('review'),
+            has_atsiliepimai_text: document.body.innerText.toLowerCase().includes('atsiliepimai')
+        };
+    """)
+
+    print(f"\nTabs with role='tab': {page_info['tabs_found']}")
+    print(f"div.jftiEf.fontBodyMedium: {page_info['reviews_with_standard_selector']}")
+    print(f"div.jftiEf: {page_info['reviews_with_jftiEf']}")
+    print(f"Elements with star ratings: {page_info['divs_with_ratings']}")
+    print(f"div.fontBodyMedium: {page_info['review_containers']}")
+    print(f"Contains 'review': {page_info['has_review_text']}")
+    print(f"Contains 'atsiliepimai' (Lithuanian): {page_info['has_atsiliepimai_text']}")
+
+    print(f"\nPage text sample (first 500 chars):")
+    print(page_info['page_text_sample'])
+
+    # Scan every element for a star-rating aria-label (English or Lithuanian)
+    print("\n" + "="*80)
+    print("SEARCHING FOR RATING ELEMENTS")
+    print("="*80)
+
+    rating_search = driver.execute_script("""
+        const elements = Array.from(document.querySelectorAll('*'));
+        const withRatings = [];
+
+        for (let elem of elements) {
+            const ariaLabel = elem.getAttribute('aria-label') || '';
+            if (ariaLabel.toLowerCase().includes('star') || ariaLabel.toLowerCase().includes('žvaigžd')) {
+                withRatings.push({
+                    tag: elem.tagName,
+                    ariaLabel: ariaLabel.substring(0, 100),
+                    classes: (elem.getAttribute('class') || '').substring(0, 100),
+                    parentTag: elem.parentElement ? elem.parentElement.tagName : null,
+                    parentClasses: elem.parentElement ? (elem.parentElement.getAttribute('class') || '').substring(0, 100) : null
+                });
+            }
+        }
+
+        return withRatings.slice(0, 10); // First 10 (className is not a string on SVG nodes, hence getAttribute)
+    """)
+
+    print(f"\nFound {len(rating_search)} elements (capped at 10) with 'star' in aria-label:")
+    for i, elem in enumerate(rating_search[:5], 1):
+        print(f"\n Element {i}:")
+        print(f" Tag: {elem['tag']}")
+        print(f" Aria-label: {elem['ariaLabel']}")
+        print(f" Classes: {elem['classes']}")
+        print(f" Parent tag: {elem['parentTag']}")
+        print(f" Parent classes: {elem['parentClasses']}")
+
+    print(f"\n{'='*80}")
+    print("Browser open for manual inspection...")
+    print("LOOK AT THE PAGE - Are reviews visible? What's their structure?")
+    print(f"{'='*80}")
+    time.sleep(180)  # 3 minutes
+
+finally:
+    driver.quit()
diff --git a/diagnose_reviews_panel.py b/diagnose_reviews_panel.py
new file mode 100644
index 0000000..8c63541
--- /dev/null
+++ b/diagnose_reviews_panel.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+Better diagnostic: Actually wait for reviews panel to load and find correct selector.
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
+
+print("Opening browser...")
+driver = Driver(uc=True, headless=False)
+
+try:
+    # Force the English UI so the text matching below works
+    if '?' in url:
+        url += '&hl=en'
+    else:
+        url += '?hl=en'
+
+    driver.get(url)
+    print(f"Loaded: {url}")
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            btn_text = (btn.text or '').lower()
+            if 'accept all' in btn_text:
+                print(f"Clicking GDPR: {btn.text}")
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Click reviews tab and WAIT for panel to load
+    print("\nClicking reviews tab...")
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for tab in tabs:
+        text = (tab.text or '').lower()
+        aria = (tab.get_attribute('aria-label') or '').lower()
+        if 'review' in text or 'review' in aria:
+            print(f"Found reviews tab: {tab.text or aria[:50]}")
+            driver.execute_script("arguments[0].click();", tab)
+            print("Clicked! Waiting for reviews panel to load...")
+            time.sleep(5)  # Wait longer for reviews to actually load
+            break
+
+    # Scroll the first pane selector that matches, most specific first
+    print("\nTrying to find and scroll reviews pane...")
+    pane_selectors = [
+        'div.m6QErb.WNBkOb.XiKgde',
+        'div.m6QErb',
+        'div[role="main"]'
+    ]
+
+    for selector in pane_selectors:
+        try:
+            pane = driver.find_element('css selector', selector)
+            print(f"Found pane: {selector}")
+            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
+            time.sleep(2)
+            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
+            time.sleep(2)
+            break
+        except Exception:  # selector did not match or scroll failed: try the next one
+            continue
+
+    # NOW check for review selectors
+    print("\n" + "="*80)
+    print("CHECKING REVIEW SELECTORS AFTER PANEL LOADED:")
+    print("="*80)
+
+    selectors_to_try = [
+        ('div.jftiEf.fontBodyMedium', 'Standard Google Maps reviews'),
+        ('div.jftiEf', 'Just jftiEf class'),
+        ('div.fontBodyMedium', 'Just fontBodyMedium'),
+        ('div[data-review-id]', 'data-review-id attribute'),
+        ('div[jsaction*="review"]', 'jsaction with review'),
+        ('[data-review]', 'data-review attribute'),
+        ('div[class*="review" i]', 'Class containing review'),
+        ('[role="article"]', 'role=article'),
+        ('div[jslog]', 'Elements with jslog (Google tracking)'),
+    ]
+
+    for selector, description in selectors_to_try:
+        count = driver.execute_script(
+            "return document.querySelectorAll(arguments[0]).length;", selector
+        )
+        print(f"{description:35} | {selector:40} | Found: {count}")
+
+    # Describe the first selector that matches more than 5 elements
+    print("\n" + "="*80)
+    print("ANALYZING MOST PROMISING SELECTOR:")
+    print("="*80)
+
+    analysis = driver.execute_script("""
+        // Try selectors in order of likelihood
+        const selectors = [
+            'div.jftiEf.fontBodyMedium',
+            'div.jftiEf',
+            'div.fontBodyMedium',
+            'div[jslog*="impression"]',
+            '[role="article"]'
+        ];
+
+        for (let selector of selectors) {
+            const elements = document.querySelectorAll(selector);
+            if (elements.length > 5) { // Need at least a few to be reviews
+                // Analyze first element
+                const first = elements[0];
+                const analysis = {
+                    selector: selector,
+                    total_found: elements.length,
+                    first_element: {
+                        tag: first.tagName,
+                        classes: first.className,
+                        has_rating: !!first.querySelector('[aria-label*="star" i]'),
+                        has_author: !!first.querySelector('button, a, div[aria-label]'),
+                        has_avatar: !!first.querySelector('img'),
+                        has_date: !!first.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute)/i),
+                        text_length: first.textContent.length,
+                        sample_text: first.textContent.substring(0, 100)
+                    }
+                };
+
+                // Count how many of the first 10 matches have a rating and some text
+                let reviewLikeCount = 0;
+                for (let i = 0; i < Math.min(10, elements.length); i++) {
+                    const elem = elements[i];
+                    const hasRating = !!elem.querySelector('[aria-label*="star" i]');
+                    const hasText = elem.textContent.length > 30;
+                    if (hasRating && hasText) reviewLikeCount++;
+                }
+                analysis.review_like_count_in_first_10 = reviewLikeCount;
+
+                return analysis;
+            }
+        }
+
+        return {error: 'No selector found with >5 elements'};
+    """)
+
+    if 'error' in analysis:
+        print(f"ERROR: {analysis['error']}")
+    else:
+        print(f"Best selector: {analysis['selector']}")
+        print(f"Total found: {analysis['total_found']}")
+        print(f"Review-like in first 10: {analysis['review_like_count_in_first_10']}")
+        print(f"\nFirst element analysis:")
+        for key, value in analysis['first_element'].items():
+            print(f" {key}: {value}")
+
+    print("\n" + "="*80)
+    print("Keeping browser open for 120 seconds for manual inspection...")
+    print("="*80)
+    time.sleep(120)
+
+finally:
+    driver.quit()
diff --git a/diagnose_selectors.py b/diagnose_selectors.py
new file mode 100644
index 0000000..d32f7a9
--- /dev/null
+++ b/diagnose_selectors.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+Diagnostic script to find the correct selector for Lithuanian hospital reviews.
+Opens the browser and pauses so we can inspect the page manually.
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
+
+print("Opening browser...")
+driver = Driver(uc=True, headless=False)
+
+try:
+    # Add English locale for consistency
+    if '?' in url:
+        url += '&hl=en'
+    else:
+        url += '?hl=en'
+
+    driver.get(url)
+    print(f"Loaded: {url}")
+
+    # Wait for page to load
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (English or Spanish label)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            btn_text = (btn.text or '').lower()
+            if 'accept all' in btn_text or 'aceptar todo' in btn_text:
+                print(f"Clicking GDPR consent: {btn.text}")
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Click the first tab whose text or aria-label mentions "review"
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for tab in tabs:
+        text = (tab.text or '').lower()
+        aria = (tab.get_attribute('aria-label') or '').lower()
+        if 'review' in text or 'review' in aria:
+            print(f"Clicking reviews tab: {tab.text or aria[:30]}")
+            driver.execute_script("arguments[0].click();", tab)
+            time.sleep(3)
+            break
+
+    # Try different selectors and show what we find
+    selectors_to_try = [
+        ('div.jftiEf.fontBodyMedium', 'Known selector 1'),
+        ('div.jftiEf', 'Known selector 2'),
+        ('div[data-review-id]', 'Known selector 3'),
+        ('div[jsaction*="review"]', 'jsaction with review'),
+        ('[role="article"]', 'role=article'),
+        ('[data-review-id]', 'data-review-id on any tag'),
+        ('div.fontBodyMedium', 'Just fontBodyMedium class'),
+        ('div[class*="review"]', 'Class containing "review"'),
+    ]
+
+    print("\n" + "="*80)
+    print("TESTING SELECTORS:")
+    print("="*80)
+
+    for selector, description in selectors_to_try:
+        count = driver.execute_script(
+            "return document.querySelectorAll(arguments[0]).length;", selector
+        )
+        print(f"{description:30} | {selector:40} | Found: {count}")
+
+    # Show a sample of the first element matched by the first selector that hits
+    print("\n" + "="*80)
+    print("SAMPLE HTML FROM FIRST MATCH:")
+    print("="*80)
+
+    sample_html = driver.execute_script("""
+        const selectors = [
+            'div.jftiEf.fontBodyMedium',
+            'div.jftiEf',
+            '[role="article"]',
+            'div[jsaction*="review"]'
+        ];
+
+        for (let selector of selectors) {
+            const elements = document.querySelectorAll(selector);
+            if (elements.length > 0) {
+                const first = elements[0];
+                return {
+                    selector: selector,
+                    count: elements.length,
+                    outerHTML: first.outerHTML.substring(0, 500),
+                    classes: first.className,
+                    hasRating: !!first.querySelector('[aria-label*="star" i]'),
+                    hasAuthor: !!first.querySelector('img'),
+                    textLength: first.textContent.length
+                };
+            }
+        }
+        return null;
+    """)
+
+    if sample_html:
+        print(f"Selector: {sample_html['selector']}")
+        print(f"Total found: {sample_html['count']}")
+        print(f"Classes: {sample_html['classes']}")
+        print(f"Has rating: {sample_html['hasRating']}")
+        print(f"Has author img: {sample_html['hasAuthor']}")
+        print(f"Text length: {sample_html['textLength']}")
+        print(f"\nSample HTML (first 500 chars):")
+        print(sample_html['outerHTML'])
+
+    print("\n" + "="*80)
+    print("Browser will stay open for 60 seconds so you can inspect manually...")
+    print("Use DevTools to find the correct selector!")
+    print("="*80)
+
+    # Keep browser open for inspection
+    time.sleep(60)
+
+finally:
+    driver.quit()
+    print("\nBrowser closed.")
diff --git a/find_actual_reviews.py b/find_actual_reviews.py
new file mode 100644
index 0000000..948e0cc
--- /dev/null
+++ b/find_actual_reviews.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""
+Find the ACTUAL selector for reviews by looking for elements with review structure.
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Click the first tab whose text or aria-label mentions "review"
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for tab in tabs:
+        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
+            driver.execute_script("arguments[0].click();", tab)
+            time.sleep(5)
+            break
+
+    # Scroll the pane so more reviews get loaded (best effort)
+    try:
+        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
+        for _ in range(3):
+            driver.execute_script("arguments[0].scrollBy(0, 500);", pane)
+            time.sleep(1)
+    except Exception:  # pane selector did not match: the document-wide scan still runs
+        pass
+
+    # Use JavaScript to find ALL elements that look like reviews
+    print("\n" + "="*80)
+    print("FINDING ACTUAL REVIEW ELEMENTS BY STRUCTURE:")
+    print("="*80)
+
+    review_info = driver.execute_script("""
+        // Find all elements that have BOTH a rating AND substantial text
+        const allDivs = Array.from(document.querySelectorAll('div'));
+
+        const reviews = [];
+
+        for (let div of allDivs) {
+            // Must have a rating (star or rating aria-label)
+            const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
+            if (!ratingElem) continue;
+
+            // Must have decent text content (>= 50 chars to avoid buttons)
+            if (div.textContent.length < 50) continue;
+
+            // Get the classes and attributes
+            const info = {
+                classes: div.className,
+                has_author: !!div.querySelector('button, [aria-label*="photo" i]'),
+                has_avatar: !!div.querySelector('img'),
+                has_date: !!div.textContent.match(/\\d+\\s*(day|week|month|year|ago)/i),
+                text_length: div.textContent.length,
+                sample_text: div.textContent.substring(0, 150),
+                tag_name: div.tagName,
+                jslog: div.getAttribute('jslog'),
+                data_review_id: div.getAttribute('data-review-id'),
+                jsaction: div.getAttribute('jsaction')
+            };
+
+            reviews.push(info);
+        }
+
+        return {
+            total_found: reviews.length,
+            first_5: reviews.slice(0, 5)
+        };
+    """)
+
+    print(f"\nFound {review_info['total_found']} elements with review structure")
+    print(f"\nFirst 5 review-like elements:")
+    for i, rev in enumerate(review_info['first_5'], 1):
+        print(f"\n Review {i}:")
+        print(f" Classes: {rev['classes']}")
+        print(f" Has author: {rev['has_author']}")
+        print(f" Has avatar: {rev['has_avatar']}")
+        print(f" Has date: {rev['has_date']}")
+        print(f" Text length: {rev['text_length']}")
+        print(f" jslog: {rev['jslog']}")
+        print(f" data-review-id: {rev['data_review_id']}")
+        print(f" Sample: {rev['sample_text'][:80]}...")
+
+    # Try to find a common class among review elements
+    if review_info['total_found'] > 0:
+        print("\n" + "="*80)
+        print("FINDING COMMON SELECTOR:")
+        print("="*80)
+
+        common_selector = driver.execute_script("""
+            // Re-collect review-like divs with the same criteria as the first pass
+            const reviews = [];
+            const allDivs = Array.from(document.querySelectorAll('div'));
+
+            for (let div of allDivs) {
+                const ratingElem = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i]');
+                if (ratingElem && div.textContent.length >= 50) {
+                    reviews.push(div);
+                }
+            }
+
+            if (reviews.length === 0) return null;
+
+            // Get classes from first review
+            const firstClasses = Array.from(reviews[0].classList);
+
+            // Find classes that appear in ALL reviews
+            const commonClasses = firstClasses.filter(cls => {
+                return reviews.every(rev => rev.classList.contains(cls));
+            });
+
+            return {
+                total_reviews: reviews.length,
+                common_classes: commonClasses,
+                suggested_selector: commonClasses.length > 0 ? 'div.' + commonClasses.map(c => CSS.escape(c)).join('.') : null,
+                first_review_classes: reviews[0].className
+            };
+        """)
+
+        if common_selector:
+            print(f"Total review elements: {common_selector['total_reviews']}")
+            print(f"Common classes: {common_selector['common_classes']}")
+            print(f"Suggested selector: {common_selector['suggested_selector']}")
+            print(f"First review full classes: {common_selector['first_review_classes']}")
+
+            # Test the suggested selector (passed as a script argument, not spliced into JS)
+            if common_selector['suggested_selector']:
+                test_count = driver.execute_script(
+                    "return document.querySelectorAll(arguments[0]).length;", common_selector['suggested_selector']
+                )
+                print(f"\nTesting suggested selector: Found {test_count} elements")
+
+    print("\n" + "="*80)
+    print("Browser staying open for manual inspection (60s)...")
+    print("="*80)
+    time.sleep(60)
+
+finally:
+    driver.quit()
diff --git a/inspect_pane_content.py b/inspect_pane_content.py
new file mode 100644
index 0000000..fb95a94
--- /dev/null
+++ b/inspect_pane_content.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+"""
+Check what's actually inside the reviews pane after scrolling.
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Print every tab and click the first one that mentions "review"
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    review_tab_found = False
+    for tab in tabs:
+        text = (tab.text or '').lower()
+        aria = (tab.get_attribute('aria-label') or '').lower()
+        print(f"Tab: text='{tab.text}', aria='{tab.get_attribute('aria-label')}'")
+        if 'review' in text or 'review' in aria:
+            print(f" -> Clicking this tab!")
+            driver.execute_script("arguments[0].click();", tab)
+            time.sleep(6)  # Wait longer
+            review_tab_found = True
+            break
+
+    if not review_tab_found:
+        print("WARNING: Reviews tab not found!")
+
+    # Find the scrollable pane: specific selector first, then the looser fallback
+    print("\nLooking for scrollable pane...")
+    pane = None
+    try:
+        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
+        print(f"Found pane: div.m6QErb.WNBkOb.XiKgde")
+    except Exception:
+        print("Pane not found with standard selector!")
+        try:
+            pane = driver.find_element('css selector', 'div.m6QErb')
+            print(f"Found pane: div.m6QErb")
+        except Exception:
+            print("No pane found at all!")
+
+    if pane:
+        print("\nScrolling pane to load reviews...")
+        for i in range(15):
+            driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
+            time.sleep(0.4)
+            if (i + 1) % 5 == 0:
+                print(f" Scrolled {i+1} times...")
+
+    # Now check what's in the pane
+    print("\n" + "="*80)
+    print("ANALYZING PANE CONTENT")
+    print("="*80)
+
+    content_info = driver.execute_script("""
+        const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || document.querySelector('div.m6QErb');
+        if (!pane) return {error: 'No pane found'};
+
+        // Get all child divs (direct and nested)
+        const allDivs = Array.from(pane.querySelectorAll('div'));
+
+        // Get all unique class names used
+        const classNames = new Set();
+        allDivs.forEach(div => {
+            if (div.className) {
+                div.className.split(' ').forEach(cls => {
+                    if (cls.trim()) classNames.add(cls.trim());
+                });
+            }
+        });
+
+        // Find divs with ratings
+        const divsWithRatings = allDivs.filter(div => {
+            return !!div.querySelector('[aria-label*="star" i]');
+        });
+
+        // Find divs with author photos
+        const divsWithPhotos = allDivs.filter(div => {
+            return !!div.querySelector('img[src*="photo"], img[src*="avatar"]');
+        });
+
+        // Find divs with date patterns
+        const divsWithDates = allDivs.filter(div => {
+            return !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
+        });
+
+        // Find divs with a rating, any image, a date (same pattern as above) and review-sized text
+        const reviewLikeDivs = allDivs.filter(div => {
+            const hasRating = !!div.querySelector('[aria-label*="star" i]');
+            const hasPhoto = !!div.querySelector('img');
+            const hasDate = !!div.textContent.match(/\\d+\\s*(day|week|month|year|hour|minute|ago)/i);
+            const textLen = div.textContent.length;
+            return hasRating && hasPhoto && hasDate && textLen > 50 && textLen < 2000;
+        });
+
+        return {
+            total_divs: allDivs.length,
+            unique_classes: Array.from(classNames).sort(),
+            divs_with_ratings: divsWithRatings.length,
+            divs_with_photos: divsWithPhotos.length,
+            divs_with_dates: divsWithDates.length,
+            review_like_divs: reviewLikeDivs.length,
+            review_like_classes: reviewLikeDivs.slice(0, 5).map(d => ({
+                classes: d.className,
+                text_length: d.textContent.length,
+                sample: d.textContent.substring(0, 100)
+            }))
+        };
+    """)
+
+    if 'error' in content_info:
+        print(f"ERROR: {content_info['error']}")
+    else:
+        print(f"\nTotal divs in pane: {content_info['total_divs']}")
+        print(f"Divs with ratings: {content_info['divs_with_ratings']}")
+        print(f"Divs with photos: {content_info['divs_with_photos']}")
+        print(f"Divs with dates: {content_info['divs_with_dates']}")
+        print(f"Divs matching ALL criteria (review-like): {content_info['review_like_divs']}")
+
+        print(f"\nFirst 20 unique classes found in pane:")
+        for cls in content_info['unique_classes'][:20]:
+            print(f" {cls}")
+
+        if content_info['review_like_divs'] > 0:
+            print(f"\nFirst 5 review-like divs:")
+            for i, div_info in enumerate(content_info['review_like_classes'], 1):
+                print(f"\n Div {i}:")
+                print(f" Classes: {div_info['classes']}")
+                print(f" Text length: {div_info['text_length']}")
+                print(f" Sample: {div_info['sample'][:80]}...")
+
+    print(f"\n{'='*80}")
+    print("Browser staying open for manual inspection (120 seconds)...")
+    print("Look at the DevTools to see the actual review elements!")
+    print(f"{'='*80}")
+    time.sleep(120)
+
+finally:
+    driver.quit()
diff --git a/manual_inspect.py b/manual_inspect.py
new file mode 100644
index 0000000..6b48232
--- /dev/null
+++ b/manual_inspect.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""
+Open the page and keep it open for manual inspection.
+INSTRUCTIONS:
+1. Open DevTools (F12)
+2. Click on an individual review
+3. Look at the div that contains ONE review (not the whole list)
+4. Note the class names on that div
+"""
+
+import time
+from seleniumbase import Driver
+
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine&hl=en"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    time.sleep(5)
+
+    # Dismiss the GDPR consent dialog if one is shown (best effort)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            if 'accept all' in (btn.text or '').lower():
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # Click the first tab whose text or aria-label mentions "review"
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for tab in tabs:
+        if 'review' in (tab.text or '').lower() or 'review' in (tab.get_attribute('aria-label') or '').lower():
+            driver.execute_script("arguments[0].click();", tab)
+            time.sleep(5)
+            break
+
+    # Scroll to load a few reviews (best effort)
+    try:
+        pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
+        for _ in range(5):
+            driver.execute_script("arguments[0].scrollBy(0, 300);", pane)
+            time.sleep(0.5)
+    except Exception:  # pane selector did not match: leave the page as it is
+        pass
+
+    print("\n" + "="*80)
+    print("MANUAL INSPECTION TIME!")
+    print("="*80)
+    print("\n1. The browser is now showing the reviews page")
+    print("2. Open DevTools (F12 or right-click > Inspect)")
+    print("3. Click the 'Select element' tool (top-left of DevTools)")
+    print("4. Hover over an INDIVIDUAL review (not the whole panel)")
+    print("5. Click on it to select it in the inspector")
+    print("6. Look at the <div> that wraps ONE SINGLE review")
+    print("7. Note the 'class' attribute value")
+    print("\n8. The class might look like: class=\"MyWpvb fontBodyMedium\" or similar")
+    print("\n9. Write down the full class name(s) - we'll use this as the selector!")
+    print("\n" + "="*80)
+    print("Browser will stay open for 5 minutes...")
+    print("="*80)
+
+    time.sleep(300)  # 5 minutes
+
+finally:
+    driver.quit()
diff --git a/test_lithuanian_hospital.py b/test_lithuanian_hospital.py
new file mode 100644
index 0000000..ae4ff17
--- /dev/null
+++ b/test_lithuanian_hospital.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Test script for Lithuanian hospital to verify structural pattern matching works.
+"""
+
+import logging
+from modules.fast_scraper import fast_scrape_reviews
+
+# Configure logging to see what's happening
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+
+log = logging.getLogger(__name__)
+
+def test_lithuanian_hospital():
+    """Scrape the Lithuanian hospital that was getting 0 reviews and log the outcome (no assertions)."""
+
+    url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
+
+    log.info("=" * 80)
+    log.info("Testing Lithuanian Hospital: Panevėžio respublikinė ligoninė")
+    log.info("Expected: 271 reviews")
+    log.info("Previous result: 0 reviews (selector mismatch)")
+    log.info("=" * 80)
+
+    # Run the scraper with headless mode OFF so we can see what's happening
+    result = fast_scrape_reviews(
+        url=url,
+        headless=False,  # Show browser for debugging
+        max_scrolls=999999  # Unlimited - use idle detection
+    )
+
+    log.info("=" * 80)
+    log.info("RESULTS:")
+    log.info(f"Success: {result['success']}")
+    log.info(f"Reviews found: {result['count']}")
+    log.info(f"Total reviews on page: {result.get('total_reviews', 'Unknown')}")
+    log.info(f"Time taken: {result['time']:.2f}s")
+
+    if result.get('message'):
+        log.info(f"Message: {result['message']}")
+
+    if result.get('error'):
+        log.error(f"Error: {result['error']}")
+
+    log.info("=" * 80)
+
+    # Show first few reviews if found ("or" guards a 'text' key that is present but None)
+    if result['count'] > 0:
+        log.info(f"\nFirst 3 reviews:")
+        for i, review in enumerate(result['reviews'][:3], 1):
+            log.info(f"\n Review {i}:")
+            log.info(f" Author: {review.get('author', 'N/A')}")
+            log.info(f" Rating: {review.get('rating', 'N/A')}")
+            log.info(f" Date: {review.get('date_text', 'N/A')}")
+            log.info(f" Text: {(review.get('text') or 'N/A')[:100]}...")
+
+    # Classify the outcome against the expected ~271 reviews
+    if result['count'] > 200:
+        log.info("\n✅ SUCCESS! Structural pattern matching found reviews!")
+        log.info(f" Got {result['count']} reviews (expected ~271)")
+    elif result['count'] == 0:
+        log.error("\n❌ FAILED! Still getting 0 reviews - selector issue not fixed")
+    else:
+        log.warning(f"\n⚠️ PARTIAL: Got {result['count']} reviews (expected ~271)")
+        log.warning(" May need to increase idle detection patience")
+
+if __name__ == "__main__":
+    test_lithuanian_hospital()
diff --git a/test_without_english.py b/test_without_english.py
new file mode 100644
index 0000000..25fb0e1
--- /dev/null
+++ b/test_without_english.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Test WITHOUT forcing English locale - use the page's default language.
+"""
+
+import time
+from seleniumbase import Driver
+
+# NO hl=en parameter!
+url = "https://www.google.com/maps/search/?api=1&query=panevezio%20respubliikine%20ligonine"
+
+driver = Driver(uc=True, headless=False)
+
+try:
+    driver.get(url)
+    print(f"Loaded (NO hl=en): {url}")
+    time.sleep(5)
+
+    # Dismiss the consent dialog if one is shown (English or Lithuanian label)
+    try:
+        form_btns = driver.find_elements('css selector', 'form button')
+        for btn in form_btns:
+            btn_text = (btn.text or '').lower()
+            if 'accept' in btn_text or 'priim' in btn_text:  # Lithuanian "priimti"
+                print(f"Clicking consent: {btn.text}")
+                btn.click()
+                time.sleep(2)
+                break
+    except Exception:  # no dialog / stale button: continue, but let Ctrl-C through
+        pass
+
+    # List ALL tabs
+    print("\nALL TABS FOUND:")
+    time.sleep(2)
+    tabs = driver.find_elements('css selector', 'button[role="tab"]')
+    for i, tab in enumerate(tabs, 1):
+        text = tab.text or ''
+        aria = tab.get_attribute('aria-label') or ''
+        print(f" Tab {i}: text='{text}', aria='{aria}'")
+
+    # Look for reviews tab (keywords in English, Spanish, Lithuanian, Russian)
+    review_keywords = ['review', 'reseña', 'atsiliepimai', 'atsiliepi', 'отзыв']
+    review_tab_found = False
+
+    for tab in tabs:
+        text = (tab.text or '').lower()
+        aria = (tab.get_attribute('aria-label') or '').lower()
+
+        for keyword in review_keywords:
+            if keyword in text or keyword in aria:
+                print(f"\nFound REVIEWS TAB: {tab.text or aria[:50]}")
+                driver.execute_script("arguments[0].click();", tab)
+                time.sleep(5)
+                review_tab_found = True
+                break
+
+        if review_tab_found:
+            break
+
+    if not review_tab_found:
+        print("\nWARNING: Still no reviews tab found!")
+    else:
+        # Now scroll and check for reviews
+        print("\nScrolling to load reviews...")
+        try:
+            pane = driver.find_element('css selector', 'div.m6QErb.WNBkOb.XiKgde')
+            for _ in range(10):
+                driver.execute_script("arguments[0].scrollBy(0, 400);", pane)
+                time.sleep(0.3)
+        except Exception:  # pane selector did not match: still run the selector counts
+            pass
+
+        # Check for reviews using known selectors
+        selectors_to_check = [
+            'div.jftiEf.fontBodyMedium',
+            'div.jftiEf',
+            'div.fontBodyMedium',
+            'div[data-review-id]'
+        ]
+
+        print("\nChecking selectors:")
+        for selector in selectors_to_check:
+            count = driver.execute_script("return document.querySelectorAll(arguments[0]).length;", selector)
+            print(f" {selector:30} : {count} elements")
+
+    print(f"\n{'='*80}")
+    print("Browser open for inspection (120s)...")
+    print(f"{'='*80}")
+    time.sleep(120)
+
+finally:
+    driver.quit()