- Add new api_interceptor.py module for CDP network interception - Capture Google Maps internal API responses during scrolling - Parse protobuf-like JSON responses to extract review data - Merge API-captured reviews with DOM-scraped data - Update CSS selectors for January 2026 Google Maps structure - Add cookie consent dismissal for multiple languages - Add --api-intercept CLI flag and config option - Fix review card and pane selectors (.jftiEf, .XiKgde) - Improve review ID extraction from card elements Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2219 lines
99 KiB
Python
2219 lines
99 KiB
Python
"""
|
||
Selenium scraping logic for Google Maps Reviews.
|
||
Uses SeleniumBase UC Mode for enhanced anti-detection and better Chrome version management.
|
||
"""
|
||
|
||
import logging
|
||
import os
|
||
import platform
|
||
import re
|
||
import time
|
||
import traceback
|
||
from typing import Dict, Any, List
|
||
|
||
from seleniumbase import Driver
|
||
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
|
||
from selenium.webdriver import Chrome
|
||
from selenium.webdriver.common.action_chains import ActionChains
|
||
from selenium.webdriver.common.by import By
|
||
from selenium.webdriver.common.keys import Keys
|
||
from selenium.webdriver.remote.webelement import WebElement
|
||
from selenium.webdriver.support import expected_conditions as EC
|
||
from selenium.webdriver.support.ui import WebDriverWait
|
||
from tqdm import tqdm
|
||
|
||
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
||
from modules.models import RawReview
|
||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||
|
||
# Logger
|
||
log = logging.getLogger("scraper")
|
||
|
||
# CSS Selectors (Updated January 2026 for current Google Maps structure)
|
||
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
|
||
CARD_SEL = "div.jftiEf" # Review card container
|
||
# Cookie/consent dialog selectors (Updated January 2026)
|
||
COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
||
'button[aria-label*="Aceptar" i],'
|
||
'button[aria-label*="Akzeptieren" i],'
|
||
'button[aria-label*="Aceitar" i],'
|
||
'button[jsname="higCR"],' # Google's "Accept all" button
|
||
'button[jsname="hZCF7e"],'
|
||
'button[data-mdc-dialog-action="accept"],'
|
||
'form[action*="consent"] button,'
|
||
'div[role="dialog"] button[jsname],'
|
||
'.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]')
|
||
SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
||
MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
||
|
||
SORT_OPTIONS = {
|
||
"newest": (
|
||
"Newest", "החדשות ביותר", "ใหม่ที่สุด", "最新", "Más recientes", "最近",
|
||
"Mais recentes", "Neueste", "Plus récent", "Più recenti", "Nyeste",
|
||
"Новые", "Nieuwste", "جديد", "Nyeste", "Uusimmat", "Najnowsze",
|
||
"Senaste", "Terbaru", "Yakın zamanlı", "Mới nhất", "नवीनतम"
|
||
),
|
||
"highest": (
|
||
"Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด", "最高評価",
|
||
"Calificación más alta", "最高评分", "Melhor avaliação", "Höchste Bewertung",
|
||
"Note la plus élevée", "Valutazione più alta", "Høyeste vurdering",
|
||
"Наивысший рейтинг", "Hoogste waardering", "أعلى تقييم", "Højeste vurdering",
|
||
"Korkein arvostelu", "Najwyższa ocena", "Högsta betyg", "Peringkat tertinggi",
|
||
"En yüksek puan", "Đánh giá cao nhất", "उच्चतम रेटिंग", "Top rating"
|
||
),
|
||
"lowest": (
|
||
"Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด", "最低評価",
|
||
"Calificación más baja", "最低评分", "Pior avaliação", "Niedrigste Bewertung",
|
||
"Note la plus basse", "Valutazione più bassa", "Laveste vurdering",
|
||
"Наименьший рейтинг", "Laagste waardering", "أقل تقييم", "Laveste vurdering",
|
||
"Alhaisin arvostelu", "Najniższa ocena", "Lägsta betyg", "Peringkat terendah",
|
||
"En düşük puan", "Đánh giá thấp nhất", "निम्नतम रेटिंग", "Worst rating"
|
||
),
|
||
"relevance": (
|
||
"Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด", "関連性",
|
||
"Más relevantes", "最相关", "Mais relevantes", "Relevanteste",
|
||
"Plus pertinents", "Più pertinenti", "Mest relevante",
|
||
"Наиболее релевантные", "Meest relevant", "الأكثر صلة", "Mest relevante",
|
||
"Olennaisimmat", "Najbardziej trafne", "Mest relevanta", "Paling relevan",
|
||
"En alakalı", "Liên quan nhất", "सबसे प्रासंगिक", "Relevance"
|
||
)
|
||
}
|
||
|
||
# Comprehensive multi-language review keywords
|
||
REVIEW_WORDS = {
|
||
# English
|
||
"reviews", "review", "ratings", "rating",
|
||
|
||
# Hebrew
|
||
"ביקורות", "ביקורת", "ביקורות על", "דירוגים", "דירוג",
|
||
|
||
# Thai
|
||
"รีวิว", "บทวิจารณ์", "คะแนน", "ความคิดเห็น",
|
||
|
||
# Spanish
|
||
"reseñas", "opiniones", "valoraciones", "críticas", "calificaciones",
|
||
|
||
# French
|
||
"avis", "commentaires", "évaluations", "critiques", "notes",
|
||
|
||
# German
|
||
"bewertungen", "rezensionen", "beurteilungen", "meinungen", "kritiken",
|
||
|
||
# Italian
|
||
"recensioni", "valutazioni", "opinioni", "giudizi", "commenti",
|
||
|
||
# Portuguese
|
||
"avaliações", "comentários", "opiniões", "análises", "críticas",
|
||
|
||
# Russian
|
||
"отзывы", "рецензии", "обзоры", "оценки", "комментарии",
|
||
|
||
# Japanese
|
||
"レビュー", "口コミ", "評価", "批評", "感想",
|
||
|
||
# Korean
|
||
"리뷰", "평가", "후기", "댓글", "의견",
|
||
|
||
# Chinese (Simplified and Traditional)
|
||
"评论", "評論", "点评", "點評", "评价", "評價", "意见", "意見", "回顾", "回顧",
|
||
|
||
# Arabic
|
||
"مراجعات", "تقييمات", "آراء", "تعليقات", "نقد",
|
||
|
||
# Hindi
|
||
"समीक्षा", "रिव्यू", "राय", "मूल्यांकन", "प्रतिक्रिया",
|
||
|
||
# Turkish
|
||
"yorumlar", "değerlendirmeler", "incelemeler", "görüşler", "puanlar",
|
||
|
||
# Dutch
|
||
"beoordelingen", "recensies", "meningen", "opmerkingen", "waarderingen",
|
||
|
||
# Polish
|
||
"recenzje", "opinie", "oceny", "komentarze", "uwagi",
|
||
|
||
# Vietnamese
|
||
"đánh giá", "nhận xét", "bình luận", "phản hồi", "bài đánh giá",
|
||
|
||
# Indonesian
|
||
"ulasan", "tinjauan", "komentar", "penilaian", "pendapat",
|
||
|
||
# Swedish
|
||
"recensioner", "betyg", "omdömen", "åsikter", "kommentarer",
|
||
|
||
# Norwegian
|
||
"anmeldelser", "vurderinger", "omtaler", "meninger", "tilbakemeldinger",
|
||
|
||
# Danish
|
||
"anmeldelser", "bedømmelser", "vurderinger", "meninger", "kommentarer",
|
||
|
||
# Finnish
|
||
"arvostelut", "arviot", "kommentit", "mielipiteet", "palautteet",
|
||
|
||
# Greek
|
||
"κριτικές", "αξιολογήσεις", "σχόλια", "απόψεις", "βαθμολογίες",
|
||
|
||
# Czech
|
||
"recenze", "hodnocení", "názory", "komentáře", "posudky",
|
||
|
||
# Romanian
|
||
"recenzii", "evaluări", "opinii", "comentarii", "note",
|
||
|
||
# Hungarian
|
||
"vélemények", "értékelések", "kritikák", "hozzászólások", "megjegyzések",
|
||
|
||
# Bulgarian
|
||
"отзиви", "ревюта", "мнения", "коментари", "оценки"
|
||
}
|
||
|
||
|
||
class GoogleReviewsScraper:
|
||
"""Main scraper class for Google Maps reviews"""
|
||
|
||
def __init__(self, config: Dict[str, Any]):
|
||
"""Initialize scraper with configuration"""
|
||
self.config = config
|
||
self.use_mongodb = config.get("use_mongodb", True)
|
||
self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
|
||
self.json_storage = JSONStorage(config)
|
||
self.backup_to_json = config.get("backup_to_json", True)
|
||
self.overwrite_existing = config.get("overwrite_existing", False)
|
||
self.enable_api_intercept = config.get("enable_api_intercept", False)
|
||
self.api_interceptor = None # Will be initialized when driver is ready
|
||
|
||
def setup_driver(self, headless: bool):
|
||
"""
|
||
Set up and configure Chrome driver using SeleniumBase UC Mode.
|
||
SeleniumBase provides enhanced anti-detection and automatic Chrome/ChromeDriver version management.
|
||
Works in both Docker containers and on regular OS installations (Windows, Mac, Linux).
|
||
"""
|
||
# Log platform information for debugging
|
||
log.info(f"Platform: {platform.platform()}")
|
||
log.info(f"Python version: {platform.python_version()}")
|
||
log.info("Using SeleniumBase UC Mode for enhanced anti-detection")
|
||
|
||
# Determine if we're running in a container
|
||
in_container = os.environ.get('CHROME_BIN') is not None
|
||
|
||
if in_container:
|
||
chrome_binary = os.environ.get('CHROME_BIN')
|
||
log.info(f"Container environment detected")
|
||
log.info(f"Chrome binary: {chrome_binary}")
|
||
|
||
# Create driver with custom binary location for containers
|
||
if chrome_binary and os.path.exists(chrome_binary):
|
||
try:
|
||
driver = Driver(
|
||
uc=True,
|
||
headless=headless,
|
||
binary_location=chrome_binary,
|
||
page_load_strategy="normal"
|
||
)
|
||
log.info("Successfully created SeleniumBase UC driver with custom binary")
|
||
except Exception as e:
|
||
log.warning(f"Failed to create driver with custom binary: {e}")
|
||
# Fall back to default
|
||
driver = Driver(
|
||
uc=True,
|
||
headless=headless,
|
||
page_load_strategy="normal"
|
||
)
|
||
log.info("Successfully created SeleniumBase UC driver with defaults")
|
||
else:
|
||
driver = Driver(
|
||
uc=True,
|
||
headless=headless,
|
||
page_load_strategy="normal"
|
||
)
|
||
log.info("Successfully created SeleniumBase UC driver")
|
||
else:
|
||
# Regular OS environment - SeleniumBase handles version matching automatically
|
||
log.info("Creating SeleniumBase UC Mode driver")
|
||
try:
|
||
driver = Driver(
|
||
uc=True,
|
||
headless=headless,
|
||
page_load_strategy="normal",
|
||
incognito=True # Use incognito mode for better stealth
|
||
)
|
||
log.info("Successfully created SeleniumBase UC driver")
|
||
except Exception as e:
|
||
log.error(f"Failed to create SeleniumBase driver: {e}")
|
||
raise
|
||
|
||
# Set page load timeout to avoid hanging
|
||
driver.set_page_load_timeout(30)
|
||
|
||
# Set window size
|
||
driver.set_window_size(1400, 900)
|
||
|
||
# Add additional stealth settings
|
||
try:
|
||
# Disable automation flags
|
||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||
'source': '''
|
||
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
|
||
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
|
||
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});
|
||
'''
|
||
})
|
||
log.info("Additional stealth settings applied")
|
||
except Exception as e:
|
||
log.debug(f"Could not apply additional stealth settings: {e}")
|
||
|
||
log.info("SeleniumBase UC driver setup completed successfully")
|
||
return driver
|
||
|
||
def dismiss_cookies(self, driver: Chrome):
|
||
"""
|
||
Dismiss cookie consent dialogs if present.
|
||
Handles stale element references by re-finding elements if needed.
|
||
Updated January 2026 to handle current Google consent dialogs.
|
||
"""
|
||
dismissed = False
|
||
|
||
# Try multiple approaches to dismiss consent dialogs
|
||
consent_selectors = [
|
||
COOKIE_BTN,
|
||
# Additional Google consent selectors
|
||
'button[aria-label*="Accept all" i]',
|
||
'button[aria-label*="Aceptar todo" i]',
|
||
'button[aria-label*="Reject all" i]', # Sometimes we need to reject
|
||
'button:has-text("Accept")',
|
||
'button:has-text("Aceptar")',
|
||
'[role="dialog"] button:first-of-type',
|
||
'form[action*="consent"] button:first-of-type',
|
||
]
|
||
|
||
for selector in consent_selectors:
|
||
try:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
for elem in elements:
|
||
try:
|
||
if elem.is_displayed() and elem.is_enabled():
|
||
# Try JavaScript click first (more reliable)
|
||
driver.execute_script("arguments[0].click();", elem)
|
||
log.info(f"Cookie/consent dialog dismissed with selector: {selector}")
|
||
time.sleep(1) # Wait for dialog to close
|
||
dismissed = True
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Error clicking consent button: {e}")
|
||
continue
|
||
if dismissed:
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Error finding consent elements with {selector}: {e}")
|
||
continue
|
||
|
||
# Also try to find and click any visible modal close buttons
|
||
if not dismissed:
|
||
try:
|
||
close_btns = driver.find_elements(By.CSS_SELECTOR,
|
||
'[role="dialog"] button[aria-label*="close" i], '
|
||
'[role="dialog"] button[aria-label*="cerrar" i], '
|
||
'.modal-close, .dialog-close')
|
||
for btn in close_btns:
|
||
if btn.is_displayed():
|
||
driver.execute_script("arguments[0].click();", btn)
|
||
log.info("Closed modal dialog")
|
||
dismissed = True
|
||
break
|
||
except Exception:
|
||
pass
|
||
|
||
return dismissed
|
||
|
||
def is_reviews_tab(self, tab: WebElement) -> bool:
|
||
"""
|
||
Dynamically detect if an element is the reviews tab across multiple languages and layouts.
|
||
Uses multiple detection approaches for maximum reliability.
|
||
"""
|
||
try:
|
||
# Strategy 1: Data attribute detection (most reliable across languages)
|
||
tab_index = tab.get_attribute("data-tab-index")
|
||
if tab_index == "1" or tab_index == "reviews":
|
||
return True
|
||
|
||
# Strategy 2: Role and aria attributes (accessibility detection)
|
||
role = tab.get_attribute("role")
|
||
aria_selected = tab.get_attribute("aria-selected")
|
||
aria_label = (tab.get_attribute("aria-label") or "").lower()
|
||
|
||
# Many review tabs have role="tab" and data attributes
|
||
if role == "tab" and any(word in aria_label for word in REVIEW_WORDS):
|
||
return True
|
||
|
||
# Strategy 3: Text content detection (multiple sources)
|
||
sources = [
|
||
tab.text.lower() if tab.text else "", # Direct text
|
||
aria_label, # ARIA label
|
||
tab.get_attribute("innerHTML").lower() or "", # Inner HTML
|
||
tab.get_attribute("textContent").lower() or "" # Text content
|
||
]
|
||
|
||
# Check all sources against our comprehensive keyword list
|
||
for source in sources:
|
||
if any(word in source for word in REVIEW_WORDS):
|
||
return True
|
||
|
||
# Strategy 4: Nested element detection
|
||
try:
|
||
# Check text in all child elements
|
||
for child in tab.find_elements(By.CSS_SELECTOR, "*"):
|
||
try:
|
||
child_text = child.text.lower() if child.text else ""
|
||
child_content = child.get_attribute("textContent").lower() or ""
|
||
|
||
if any(word in child_text for word in REVIEW_WORDS) or any(
|
||
word in child_content for word in REVIEW_WORDS):
|
||
return True
|
||
except:
|
||
continue
|
||
except:
|
||
pass
|
||
|
||
# Strategy 5: URL detection (some tabs have hrefs or data-hrefs with tell-tale values)
|
||
for attr in ["href", "data-href", "data-url", "data-target"]:
|
||
attr_value = (tab.get_attribute(attr) or "").lower()
|
||
if attr_value and ("review" in attr_value or "rating" in attr_value):
|
||
return True
|
||
|
||
# Strategy 6: Class detection (some review tabs have specific classes)
|
||
tab_class = tab.get_attribute("class") or ""
|
||
review_classes = ["review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve"]
|
||
if any(cls in tab_class for cls in review_classes):
|
||
return True
|
||
|
||
return False
|
||
|
||
except StaleElementReferenceException:
|
||
return False
|
||
except Exception as e:
|
||
log.debug(f"Error in is_reviews_tab: {e}")
|
||
return False
|
||
|
||
def click_reviews_tab(self, driver: Chrome):
|
||
"""
|
||
Highly dynamic reviews tab detection and clicking with multiple fallback strategies.
|
||
Works across different languages, layouts, and browser environments.
|
||
"""
|
||
max_timeout = 25 # Maximum seconds to try
|
||
end_time = time.time() + max_timeout
|
||
attempts = 0
|
||
|
||
# Define different selectors to try in order of reliability
|
||
tab_selectors = [
|
||
# Current Google Maps tab selectors (January 2026)
|
||
'.LRkQ2', # Main tab button class in current Google Maps
|
||
'.hh2c6', # Alternative tab button class
|
||
|
||
# Direct tab selectors
|
||
'[data-tab-index="1"]', # Most common tab index
|
||
'[role="tab"][data-tab-index]', # Any tab with index
|
||
'button[role="tab"]', # Button tabs
|
||
'div[role="tab"]', # Div tabs
|
||
'a[role="tab"]', # Link tabs
|
||
|
||
# Common Google Maps review tab selectors
|
||
'.fontTitleSmall[role="tab"]', # Google Maps title font tabs
|
||
'.m6QErb [role="tab"]', # Maps container tabs
|
||
|
||
# Text-based selectors for various languages
|
||
'button:contains("reviews")', # Button containing "reviews"
|
||
'div[role="tablist"] > *', # Any tab in a tab list
|
||
'div.m6QErb div[role="tablist"] > *', # Google Maps specific tablist
|
||
]
|
||
|
||
# Record successful clicks for debugging
|
||
successful_method = None
|
||
successful_selector = None
|
||
|
||
# Try each selector in turn
|
||
for selector in tab_selectors:
|
||
if time.time() > end_time:
|
||
break
|
||
|
||
try:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
if not elements:
|
||
continue
|
||
|
||
# Try each element found with this selector
|
||
for element in elements:
|
||
attempts += 1
|
||
|
||
# First check if this is actually a reviews tab
|
||
if not self.is_reviews_tab(element):
|
||
continue
|
||
|
||
# Found a reviews tab, attempt to click it with multiple methods
|
||
log.info(f"Found potential reviews tab ({selector}): '{element.text}', attempting to click")
|
||
|
||
# Ensure visibility
|
||
driver.execute_script("arguments[0].scrollIntoView({block:'center', behavior:'smooth'});", element)
|
||
time.sleep(0.7) # Wait for scroll
|
||
|
||
# Try different click methods in order of reliability
|
||
click_methods = [
|
||
# Method 1: JavaScript click (most reliable)
|
||
lambda: driver.execute_script("arguments[0].click();", element),
|
||
|
||
# Method 2: Direct click
|
||
lambda: element.click(),
|
||
|
||
# Method 3: ActionChains click
|
||
lambda: ActionChains(driver).move_to_element(element).click().perform(),
|
||
|
||
# Method 4: Send RETURN key
|
||
lambda: element.send_keys(Keys.RETURN),
|
||
|
||
# Method 5: Center click with ActionChains
|
||
lambda: ActionChains(driver).move_to_element_with_offset(
|
||
element, element.size['width'] // 2, element.size['height'] // 2).click().perform(),
|
||
]
|
||
|
||
# Try each click method
|
||
for i, click_method in enumerate(click_methods):
|
||
try:
|
||
click_method()
|
||
time.sleep(1.5) # Wait for click to take effect
|
||
|
||
# Verify if click worked (check for new content)
|
||
if self.verify_reviews_tab_clicked(driver):
|
||
successful_method = i + 1
|
||
successful_selector = selector
|
||
log.info(
|
||
f"Successfully clicked reviews tab using method {i + 1} and selector '{selector}'")
|
||
return True
|
||
except Exception as click_error:
|
||
log.debug(f"Click method {i + 1} failed: {click_error}")
|
||
continue
|
||
|
||
except Exception as selector_error:
|
||
log.debug(f"Error with selector '{selector}': {selector_error}")
|
||
continue
|
||
|
||
# If we reach here, try XPath as a last resort
|
||
if time.time() <= end_time:
|
||
for language_keyword in REVIEW_WORDS:
|
||
try:
|
||
# Try XPath contains text
|
||
xpath = f"//*[contains(text(), '{language_keyword}')]"
|
||
elements = driver.find_elements(By.XPATH, xpath)
|
||
|
||
for element in elements:
|
||
try:
|
||
log.info(f"Trying XPath with keyword '{language_keyword}'")
|
||
driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element)
|
||
time.sleep(0.7)
|
||
driver.execute_script("arguments[0].click();", element)
|
||
time.sleep(1.5)
|
||
|
||
if self.verify_reviews_tab_clicked(driver):
|
||
log.info(f"Successfully clicked element with keyword '{language_keyword}'")
|
||
return True
|
||
except:
|
||
continue
|
||
except:
|
||
continue
|
||
|
||
# Final attempt: try to navigate directly to reviews by URL
|
||
try:
|
||
current_url = driver.current_url
|
||
if "?hl=" in current_url: # Preserve language setting if present
|
||
lang_param = re.search(r'\?hl=([^&]*)', current_url)
|
||
if lang_param:
|
||
lang_code = lang_param.group(1)
|
||
# Try to replace the current part with 'reviews' or append it
|
||
if '/place/' in current_url:
|
||
parts = current_url.split('/place/')
|
||
new_url = f"{parts[0]}/place/{parts[1].split('/')[0]}/reviews?hl={lang_code}"
|
||
driver.get(new_url)
|
||
time.sleep(3) # Increased wait time for page load
|
||
if "review" in driver.current_url.lower():
|
||
log.info("Navigated directly to reviews page via URL")
|
||
# Extra wait for reviews to render after URL navigation
|
||
time.sleep(2)
|
||
return True
|
||
|
||
# Try to identify reviews link in URL
|
||
if '/place/' in current_url and '/reviews' not in current_url:
|
||
parts = current_url.split('/place/')
|
||
new_url = f"{parts[0]}/place/{parts[1].split('/')[0]}/reviews"
|
||
driver.get(new_url)
|
||
time.sleep(3) # Increased wait time for page load
|
||
if "review" in driver.current_url.lower():
|
||
log.info("Navigated directly to reviews page via URL")
|
||
# Extra wait for reviews to render after URL navigation
|
||
time.sleep(2)
|
||
return True
|
||
except Exception as url_error:
|
||
log.warning(f"Failed to navigate to reviews via URL: {url_error}")
|
||
|
||
log.warning(f"Failed to find/click reviews tab after {attempts} attempts")
|
||
raise TimeoutException("Reviews tab not found or could not be clicked")
|
||
|
||
def verify_reviews_tab_clicked(self, driver: Chrome) -> bool:
|
||
"""
|
||
Verify that the reviews tab was successfully clicked by checking for
|
||
characteristic elements that appear on the reviews page.
|
||
"""
|
||
try:
|
||
# Common elements that appear when reviews tab is active (Updated January 2026)
|
||
verification_selectors = [
|
||
# Reviews container (current)
|
||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
||
'div.m6QErb.WNBkOb.XiKgde',
|
||
|
||
# Review cards (current)
|
||
'div.jftiEf',
|
||
'div[data-review-id]',
|
||
|
||
# Sort button (usually appears with reviews)
|
||
'button[aria-label*="Sort" i]',
|
||
|
||
# Review rating elements
|
||
'span[role="img"][aria-label*="star" i]',
|
||
|
||
# Other indicators
|
||
'div.m6QErb div.jftiEf',
|
||
'.HlvSq'
|
||
]
|
||
|
||
# Check if any verification selector is present
|
||
for selector in verification_selectors:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
if elements and len(elements) > 0:
|
||
return True
|
||
|
||
# URL check - if "review" appears in the URL
|
||
if "review" in driver.current_url.lower():
|
||
return True
|
||
|
||
return False
|
||
except Exception as e:
|
||
log.debug(f"Error verifying reviews tab click: {e}")
|
||
return False
|
||
|
||
def set_sort(self, driver: Chrome, method: str):
|
||
"""
|
||
Set the sorting method for reviews with enhanced detection for the latest Google Maps UI.
|
||
Works across different languages and UI variations, with robust error handling.
|
||
"""
|
||
if method == "relevance":
|
||
log.info("Using default 'relevance' sort - no need to change sort order")
|
||
return True # Default order, no need to change
|
||
|
||
log.info(f"Attempting to set sort order to '{method}'")
|
||
|
||
try:
|
||
# 1. Find and click the sort button
|
||
sort_button_selectors = [
|
||
# Exact selectors based on recent HTML structure
|
||
'button.HQzyZ[aria-haspopup="true"]',
|
||
'div.m6QErb button.HQzyZ',
|
||
'button[jsaction*="pane.wfvdle84"]',
|
||
'div.fontBodyLarge.k5lwKb', # The text element inside sort button
|
||
|
||
# Common attribute-based selectors
|
||
'button[aria-label*="Sort" i]',
|
||
'button[aria-label*="sort" i]',
|
||
'button[aria-expanded="false"][aria-haspopup="true"]',
|
||
|
||
# Multilingual selectors
|
||
'button[aria-label*="סדר" i]', # Hebrew
|
||
'button[aria-label*="เรียง" i]', # Thai
|
||
'button[aria-label*="排序" i]', # Chinese
|
||
'button[aria-label*="Trier" i]', # French
|
||
'button[aria-label*="Ordenar" i]', # Spanish/Portuguese
|
||
'button[aria-label*="Sortieren" i]', # German
|
||
|
||
# Parent container-based selectors
|
||
'div.m6QErb.Hk4XGb.XiKgde.tLjsW button',
|
||
'div.m6QErb div.XiKgde button'
|
||
]
|
||
|
||
# Attempt to find the sort button
|
||
sort_button = None
|
||
|
||
# Try each selector
|
||
for selector in sort_button_selectors:
|
||
try:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
for element in elements:
|
||
try:
|
||
# Skip invisible/disabled elements
|
||
if not element.is_displayed() or not element.is_enabled():
|
||
continue
|
||
|
||
# Get button text and attributes for verification
|
||
button_text = element.text.strip() if element.text else ""
|
||
button_aria = element.get_attribute("aria-label") or ""
|
||
button_class = element.get_attribute("class") or ""
|
||
|
||
# Skip buttons that are clearly not sort buttons
|
||
negative_keywords = ["back", "next", "previous", "close", "cancel", "חזרה", "סגור", "ปิด"]
|
||
if any(keyword in button_text.lower() or keyword in button_aria.lower()
|
||
for keyword in negative_keywords):
|
||
continue
|
||
|
||
# Positive detection for sort buttons
|
||
sort_keywords = ["sort", "Sort", "SORT", "סידור", "เรียง", "排序", "trier", "ordenar", "sortieren"]
|
||
has_sort_keyword = any(keyword in button_text or keyword in button_aria
|
||
for keyword in sort_keywords)
|
||
|
||
# Check for common sort button classes
|
||
has_sort_class = "HQzyZ" in button_class or "sort" in button_class.lower()
|
||
|
||
# Check for aria attributes that indicate a dropdown
|
||
has_dropdown_attrs = (element.get_attribute("aria-haspopup") == "true" or
|
||
element.get_attribute("aria-expanded") is not None)
|
||
|
||
if has_sort_keyword or has_sort_class or has_dropdown_attrs:
|
||
# Found a potential sort button
|
||
sort_button = element
|
||
log.info(f"Found sort button with selector: {selector}")
|
||
log.info(f"Button text: '{button_text}', aria-label: '{button_aria}'")
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Error checking element: {e}")
|
||
continue
|
||
|
||
if sort_button:
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Error with selector '{selector}': {e}")
|
||
continue
|
||
|
||
# If no button found with CSS selectors, try finding it from its container
|
||
if not sort_button:
|
||
try:
|
||
# Look for the sort container by its distinctive classes
|
||
containers = driver.find_elements(By.CSS_SELECTOR, 'div.m6QErb.Hk4XGb, div.XiKgde.tLjsW')
|
||
for container in containers:
|
||
try:
|
||
# Find buttons within this container
|
||
buttons = container.find_elements(By.TAG_NAME, 'button')
|
||
for button in buttons:
|
||
if button.is_displayed() and button.is_enabled():
|
||
sort_button = button
|
||
log.info("Found sort button through container element")
|
||
break
|
||
except:
|
||
continue
|
||
if sort_button:
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Error finding button via container: {e}")
|
||
|
||
# If still no button found, try XPath approach with keywords
|
||
if not sort_button:
|
||
xpath_terms = ["sort", "Sort", "סדר", "סידור", "เรียง", "排序", "Trier", "Ordenar", "Sortieren"]
|
||
for term in xpath_terms:
|
||
try:
|
||
xpath = f"//*[contains(text(), '{term}') or contains(@aria-label, '{term}')]"
|
||
elements = driver.find_elements(By.XPATH, xpath)
|
||
for element in elements:
|
||
try:
|
||
if element.is_displayed() and element.is_enabled():
|
||
sort_button = element
|
||
log.info(f"Found sort button with XPath term: '{term}'")
|
||
break
|
||
except:
|
||
continue
|
||
if sort_button:
|
||
break
|
||
except:
|
||
continue
|
||
|
||
# Final fallback: look for any button in the reviews area that might open a dropdown
|
||
if not sort_button:
|
||
try:
|
||
# Look specifically in the reviews container area
|
||
reviews_container = driver.find_elements(By.CSS_SELECTOR, 'div.m6QErb, div.DxyBCb')
|
||
for container in reviews_container:
|
||
try:
|
||
# Find all buttons in this container
|
||
buttons = container.find_elements(By.TAG_NAME, 'button')
|
||
for button in buttons:
|
||
try:
|
||
if (button.is_displayed() and button.is_enabled() and
|
||
(button.get_attribute("aria-haspopup") == "true" or
|
||
"dropdown" in (button.get_attribute("class") or "").lower())):
|
||
sort_button = button
|
||
log.info("Found potential sort button via fallback dropdown detection")
|
||
break
|
||
except:
|
||
continue
|
||
if sort_button:
|
||
break
|
||
except:
|
||
continue
|
||
except Exception as e:
|
||
log.debug(f"Error in fallback sort button detection: {e}")
|
||
|
||
# Final check - do we have a sort button?
|
||
if not sort_button:
|
||
log.warning("No sort button found with any method - keeping default sort order")
|
||
return False
|
||
|
||
# 2. Click the sort button to open dropdown menu
|
||
|
||
# First ensure the button is in view
|
||
driver.execute_script("arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});", sort_button)
|
||
time.sleep(0.8) # Wait for scroll
|
||
|
||
# Try multiple click methods
|
||
click_methods = [
|
||
# Method 1: JavaScript click
|
||
lambda: driver.execute_script("arguments[0].click();", sort_button),
|
||
|
||
# Method 2: Direct click
|
||
lambda: sort_button.click(),
|
||
|
||
# Method 3: ActionChains click with move first
|
||
lambda: ActionChains(driver).move_to_element(sort_button).pause(0.3).click().perform(),
|
||
|
||
# Method 4: Click on center of element
|
||
lambda: ActionChains(driver).move_to_element_with_offset(
|
||
sort_button, sort_button.size['width'] // 2, sort_button.size['height'] // 2
|
||
).click().perform(),
|
||
|
||
# Method 5: JavaScript focus and click
|
||
lambda: driver.execute_script(
|
||
"arguments[0].focus(); setTimeout(function() { arguments[0].click(); }, 100);", sort_button
|
||
),
|
||
|
||
# Method 6: Send RETURN key after focusing
|
||
lambda: ActionChains(driver).move_to_element(sort_button).click().send_keys(Keys.RETURN).perform()
|
||
]
|
||
|
||
# Try each click method
|
||
menu_opened = False
|
||
|
||
for i, click_method in enumerate(click_methods):
|
||
try:
|
||
log.info(f"Trying click method {i + 1} for sort button...")
|
||
click_method()
|
||
time.sleep(1) # Wait for menu to appear
|
||
|
||
# Check if menu opened
|
||
menu_opened = self.check_if_menu_opened(driver)
|
||
|
||
if menu_opened:
|
||
log.info(f"Sort menu opened with click method {i + 1}")
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Click method {i + 1} failed: {e}")
|
||
continue
|
||
|
||
# If menu not opened, abort
|
||
if not menu_opened:
|
||
log.warning("Failed to open sort menu - keeping default sort order")
|
||
# Try to reset state by clicking elsewhere
|
||
try:
|
||
ActionChains(driver).move_by_offset(50, 50).click().perform()
|
||
except:
|
||
pass
|
||
return False
|
||
|
||
# 3. Find and click the desired sort option in the menu
|
||
|
||
# Selectors for menu items with focus on the exact HTML structure
|
||
menu_item_selectors = [
|
||
# Exact Google Maps menu item selectors
|
||
'div[role="menuitemradio"]',
|
||
'div.fxNQSd[role="menuitemradio"]',
|
||
'div[role="menuitemradio"] div.mLuXec', # Inner text container
|
||
|
||
# Generic menu item selectors (fallback)
|
||
'[role="menuitemradio"]',
|
||
'[role="menuitem"]',
|
||
'div[role="menu"] > div'
|
||
]
|
||
|
||
# Combined selector for efficiency
|
||
combined_selector = ", ".join(menu_item_selectors)
|
||
|
||
try:
|
||
# Wait for menu items to appear
|
||
menu_items = WebDriverWait(driver, 5).until(
|
||
EC.presence_of_all_elements_located((By.CSS_SELECTOR, combined_selector))
|
||
)
|
||
|
||
# Process menu items to find matches
|
||
visible_items = []
|
||
|
||
for item in menu_items:
|
||
try:
|
||
# Skip invisible items
|
||
if not item.is_displayed():
|
||
continue
|
||
|
||
# Handle different element types
|
||
if item.get_attribute('role') == 'menuitemradio':
|
||
# This is a top-level menu item
|
||
try:
|
||
# Try to find text in the inner div.mLuXec element first
|
||
text_elements = item.find_elements(By.CSS_SELECTOR, 'div.mLuXec')
|
||
if text_elements and text_elements[0].is_displayed():
|
||
text = text_elements[0].text.strip()
|
||
visible_items.append((item, text))
|
||
else:
|
||
# Fall back to the item's own text
|
||
text = item.text.strip()
|
||
visible_items.append((item, text))
|
||
except:
|
||
# Last resort - use the item's own text
|
||
text = item.text.strip()
|
||
visible_items.append((item, text))
|
||
elif 'mLuXec' in (item.get_attribute('class') or ''):
|
||
# This is the text container element - get its parent menuitemradio
|
||
try:
|
||
text = item.text.strip()
|
||
parent = driver.execute_script(
|
||
"return arguments[0].closest('[role=\"menuitemradio\"]');",
|
||
item
|
||
)
|
||
if parent:
|
||
visible_items.append((parent, text))
|
||
except:
|
||
continue
|
||
else:
|
||
# Generic menu item handling
|
||
text = item.text.strip()
|
||
visible_items.append((item, text))
|
||
except Exception as e:
|
||
log.debug(f"Error processing menu item: {e}")
|
||
continue
|
||
|
||
log.info(f"Found {len(visible_items)} visible menu items")
|
||
for i, (_, text) in enumerate(visible_items):
|
||
log.debug(f" Menu item {i + 1}: '{text}'")
|
||
|
||
# Determine the target menu item based on sort method
|
||
target_item = None
|
||
matched_text = None
|
||
|
||
# Log all available menu items for debugging
|
||
log.info(f"Available menu items: {[text for _, text in visible_items]}")
|
||
|
||
# Use position-based selection (most reliable for Google Maps)
|
||
position_map = {
|
||
"relevance": 0, # Usually the first option
|
||
"newest": 1, # Usually the second option
|
||
"highest": 2, # Usually the third option
|
||
"lowest": 3 # Usually the fourth option
|
||
}
|
||
|
||
pos = position_map.get(method, -1)
|
||
if pos >= 0 and pos < len(visible_items):
|
||
target_item, matched_text = visible_items[pos]
|
||
log.info(f"Selected menu item at position {pos + 1}: '{matched_text}' for sort method '{method}'")
|
||
|
||
# Validate the selection makes sense
|
||
wanted_labels = SORT_OPTIONS.get(method, [])
|
||
text_clean = matched_text.lower()
|
||
|
||
# Check if selected text contains any of the expected keywords
|
||
valid_selection = False
|
||
for label in wanted_labels:
|
||
if label.lower() in text_clean or text_clean in label.lower():
|
||
valid_selection = True
|
||
break
|
||
|
||
if not valid_selection:
|
||
log.warning(f"WARNING: Selected '{matched_text}' doesn't match expected '{method}' - might be wrong sort!")
|
||
else:
|
||
log.warning(f"Position {pos} not available in menu (only {len(visible_items)} items)")
|
||
|
||
# 3. If target found, click it
|
||
if target_item:
|
||
# Ensure item is in view
|
||
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", target_item)
|
||
time.sleep(0.3)
|
||
|
||
# Try multiple click methods
|
||
click_success = False
|
||
click_methods = [
|
||
# Method 1: JavaScript click
|
||
lambda: driver.execute_script("arguments[0].click();", target_item),
|
||
|
||
# Method 2: Direct click
|
||
lambda: target_item.click(),
|
||
|
||
# Method 3: ActionChains click
|
||
lambda: ActionChains(driver).move_to_element(target_item).click().perform(),
|
||
|
||
# Method 4: Center click
|
||
lambda: ActionChains(driver).move_to_element_with_offset(
|
||
target_item, target_item.size['width'] // 2, target_item.size['height'] // 2
|
||
).click().perform(),
|
||
|
||
# Method 5: JavaScript click with custom event
|
||
lambda: driver.execute_script("""
|
||
var el = arguments[0];
|
||
var evt = new MouseEvent('click', {
|
||
bubbles: true,
|
||
cancelable: true,
|
||
view: window
|
||
});
|
||
el.dispatchEvent(evt);
|
||
""", target_item)
|
||
]
|
||
|
||
for i, click_method in enumerate(click_methods):
|
||
try:
|
||
click_method()
|
||
time.sleep(1.5) # Wait for sort to take effect
|
||
|
||
# Try to verify sort happened by checking if menu closed
|
||
still_open = self.check_if_menu_opened(driver)
|
||
if not still_open:
|
||
click_success = True
|
||
log.info(f"Successfully clicked menu item with method {i + 1}")
|
||
break
|
||
except Exception as e:
|
||
log.debug(f"Menu item click method {i + 1} failed: {e}")
|
||
continue
|
||
|
||
if click_success:
|
||
log.info(f"Successfully set sort order to '{method}'")
|
||
return True
|
||
else:
|
||
log.warning(f"Failed to click menu item - keeping default sort order")
|
||
else:
|
||
log.warning(f"No matching menu item found for '{method}'")
|
||
|
||
# If we get here, we failed - try to close the menu by clicking elsewhere
|
||
try:
|
||
ActionChains(driver).move_by_offset(50, 50).click().perform()
|
||
except:
|
||
pass
|
||
|
||
return False
|
||
|
||
except TimeoutException:
|
||
log.warning("Timeout waiting for menu items")
|
||
return False
|
||
except Exception as e:
|
||
log.warning(f"Error in menu item selection: {e}")
|
||
return False
|
||
|
||
except Exception as e:
|
||
log.warning(f"Error in set_sort method: {e}")
|
||
return False
|
||
|
||
def check_if_menu_opened(self, driver):
|
||
"""
|
||
Check if a sort menu has been opened after clicking the sort button.
|
||
Uses multiple detection strategies optimized for Google Maps dropdowns.
|
||
Returns True if menu is detected, False otherwise.
|
||
"""
|
||
try:
|
||
# 1. First check for exact menu container selectors from the latest Google Maps UI
|
||
specific_menu_selectors = [
|
||
'div[role="menu"][id="action-menu"]', # Exact match from provided HTML
|
||
'div.fontBodyLarge.yu5kgd[role="menu"]', # Classes from provided HTML
|
||
'div.fxNQSd[role="menuitemradio"]', # Menu item class
|
||
'div.yu5kgd[role="menu"]' # Alternate class
|
||
]
|
||
|
||
for selector in specific_menu_selectors:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
for element in elements:
|
||
try:
|
||
if element.is_displayed():
|
||
return True
|
||
except:
|
||
continue
|
||
|
||
# 2. Check for generic menu containers
|
||
generic_menu_selectors = [
|
||
'div[role="menu"]',
|
||
'ul[role="menu"]',
|
||
'[role="listbox"]'
|
||
]
|
||
|
||
for selector in generic_menu_selectors:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
for element in elements:
|
||
try:
|
||
if element.is_displayed():
|
||
return True
|
||
except:
|
||
continue
|
||
|
||
# 3. Look for menu items
|
||
menu_item_selectors = [
|
||
'div[role="menuitemradio"]', # Google Maps specific
|
||
'div.fxNQSd', # Class-based detection
|
||
'div.mLuXec', # Text container class
|
||
'[role="menuitem"]', # Generic menu items
|
||
'[role="option"]' # Alternative role
|
||
]
|
||
|
||
visible_items = 0
|
||
for selector in menu_item_selectors:
|
||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||
for element in elements:
|
||
try:
|
||
if element.is_displayed():
|
||
visible_items += 1
|
||
if visible_items >= 2: # At least 2 menu items should be visible
|
||
return True
|
||
except:
|
||
continue
|
||
|
||
# 4. Advanced detection with JavaScript
|
||
# Checks if there are newly visible elements with menu-related roles or classes
|
||
try:
|
||
js_detection = """
|
||
return (function() {
|
||
// Check for visible menu elements
|
||
var menuElements = document.querySelectorAll('div[role="menu"], div[role="menuitemradio"], div.fxNQSd');
|
||
for (var i = 0; i < menuElements.length; i++) {
|
||
var style = window.getComputedStyle(menuElements[i]);
|
||
if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
// Check for any recently appeared elements that might be a menu
|
||
var possibleMenus = document.querySelectorAll('div.yu5kgd, div.fontBodyLarge');
|
||
for (var i = 0; i < possibleMenus.length; i++) {
|
||
var style = window.getComputedStyle(possibleMenus[i]);
|
||
var rect = possibleMenus[i].getBoundingClientRect();
|
||
// Check if element is visible and has a meaningful size
|
||
if (style.display !== 'none' && style.visibility !== 'hidden' &&
|
||
rect.width > 50 && rect.height > 50) {
|
||
return true;
|
||
}
|
||
}
|
||
|
||
return false;
|
||
})();
|
||
"""
|
||
menu_detected = driver.execute_script(js_detection)
|
||
if menu_detected:
|
||
return True
|
||
except Exception as js_error:
|
||
log.debug(f"Error in JavaScript menu detection: {js_error}")
|
||
|
||
# 5. Last resort: check if any positioning styles were applied to elements
|
||
# This can detect menu containers that have been positioned absolutely
|
||
try:
|
||
position_check = """
|
||
return (function() {
|
||
// Look for absolutely positioned elements that appeared recently
|
||
var elements = document.querySelectorAll('div[style*="position: absolute"]');
|
||
for (var i = 0; i < elements.length; i++) {
|
||
var el = elements[i];
|
||
var style = window.getComputedStyle(el);
|
||
var hasMenuItems = el.querySelectorAll('div[role="menuitemradio"], div.fxNQSd').length > 0;
|
||
|
||
if (style.display !== 'none' && style.visibility !== 'hidden' && hasMenuItems) {
|
||
return true;
|
||
}
|
||
}
|
||
return false;
|
||
})();
|
||
"""
|
||
position_detected = driver.execute_script(position_check)
|
||
if position_detected:
|
||
return True
|
||
except:
|
||
pass
|
||
|
||
return False
|
||
|
||
except Exception as e:
|
||
log.debug(f"Error checking menu state: {e}")
|
||
return False
|
||
|
||
def scrape(self):
|
||
"""Main scraper method"""
|
||
start_time = time.time()
|
||
|
||
url = self.config.get("url")
|
||
headless = self.config.get("headless", True)
|
||
sort_by = self.config.get("sort_by", "relevance")
|
||
stop_on_match = self.config.get("stop_on_match", False)
|
||
|
||
log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
|
||
log.info(f"URL: {url}")
|
||
|
||
# Initialize storage
|
||
# If not overwriting, load existing data
|
||
if self.overwrite_existing:
|
||
docs = {}
|
||
seen = set()
|
||
else:
|
||
# Try to get from MongoDB first if enabled
|
||
docs = {}
|
||
if self.use_mongodb and self.mongodb:
|
||
docs = self.mongodb.fetch_existing_reviews()
|
||
|
||
# If backup_to_json is enabled, also load from JSON for merging
|
||
if self.backup_to_json:
|
||
json_docs = self.json_storage.load_json_docs()
|
||
# Merge JSON docs with MongoDB docs
|
||
for review_id, review in json_docs.items():
|
||
if review_id not in docs:
|
||
docs[review_id] = review
|
||
|
||
# Load seen IDs from file
|
||
seen = self.json_storage.load_seen()
|
||
|
||
driver = None
|
||
api_reviews = {} # Store reviews captured from API
|
||
try:
|
||
driver = self.setup_driver(headless)
|
||
wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
||
|
||
driver.get(url)
|
||
wait.until(lambda d: "google.com/maps" in d.current_url)
|
||
|
||
# Wait for page to load and consent dialogs to appear
|
||
time.sleep(3)
|
||
|
||
# Try to dismiss any consent/cookie dialogs
|
||
if not self.dismiss_cookies(driver):
|
||
# Wait a bit more and try again
|
||
time.sleep(2)
|
||
self.dismiss_cookies(driver)
|
||
|
||
self.click_reviews_tab(driver)
|
||
|
||
# Extra wait after clicking reviews tab to ensure page loads
|
||
log.info("Waiting for reviews page to fully load...")
|
||
time.sleep(3)
|
||
|
||
# Wait for page to be fully interactive
|
||
try:
|
||
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
|
||
log.info("Page DOM is ready")
|
||
except:
|
||
log.debug("Could not verify page ready state")
|
||
|
||
# Verify we're on a reviews page before proceeding
|
||
if "review" not in driver.current_url.lower():
|
||
log.warning("URL doesn't contain 'review' - might not be on reviews page")
|
||
|
||
# Try to set sort - but don't fail if it doesn't work
|
||
try:
|
||
self.set_sort(driver, sort_by)
|
||
except Exception as sort_error:
|
||
log.warning(f"Sort failed but continuing: {sort_error}")
|
||
|
||
# Add a longer wait after setting sort to allow results to load
|
||
log.info("Waiting for reviews to render...")
|
||
time.sleep(3)
|
||
|
||
# Use try-except to handle cases where the pane is not found
|
||
# Try multiple selectors for the reviews pane (Updated January 2026)
|
||
pane = None
|
||
pane_selectors = [
|
||
PANE_SEL, # Primary selector with XiKgde
|
||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main" prefix
|
||
'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination
|
||
'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde
|
||
'div.m6QErb.DxyBCb.XiKgde', # Another variant
|
||
'div[role="main"] div.m6QErb', # Simplified version
|
||
'div.m6QErb.DxyBCb', # Even more simplified
|
||
'div[role="main"]' # Most generic
|
||
]
|
||
|
||
for selector in pane_selectors:
|
||
try:
|
||
log.info(f"Trying to find reviews pane with selector: {selector}")
|
||
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
|
||
if pane:
|
||
log.info(f"Found reviews pane with selector: {selector}")
|
||
break
|
||
except TimeoutException:
|
||
log.debug(f"Pane not found with selector: {selector}")
|
||
continue
|
||
|
||
if not pane:
|
||
log.warning("Could not find reviews pane with any selector. Page structure might have changed.")
|
||
return False
|
||
|
||
# Initialize API interceptor AFTER reviews page is loaded (if enabled)
|
||
# This prevents CDP interception from affecting initial page load and tab detection
|
||
if self.enable_api_intercept:
|
||
log.info("Setting up API interception for reviews capture")
|
||
self.api_interceptor = GoogleMapsAPIInterceptor(driver)
|
||
self.api_interceptor.setup_interception()
|
||
self.api_interceptor.inject_response_interceptor()
|
||
log.info("API interceptor ready - capturing network responses")
|
||
|
||
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||
idle = 0
|
||
processed_ids = set() # Track processed IDs in current session
|
||
|
||
# Prefetch selector to avoid repeated lookups
|
||
try:
|
||
driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
||
scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
|
||
except Exception as e:
|
||
log.warning(f"Error setting up scroll script: {e}")
|
||
scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling
|
||
|
||
max_attempts = 50 # Increased from 10 to 50 for very patient scrolling
|
||
attempts = 0
|
||
max_idle = 15 # Increased from 3 to 15 - much more patience for lazy-loaded reviews
|
||
consecutive_no_cards = 0 # Track how many times we find zero cards
|
||
last_scroll_position = 0
|
||
scroll_stuck_count = 0
|
||
|
||
# Card selectors to try (Updated January 2026)
|
||
card_selectors = [
|
||
CARD_SEL, # Primary: div.jftiEf
|
||
"div[data-review-id]", # Alternative: direct data-review-id
|
||
".jftiEf", # Without div prefix
|
||
"div.WMbnJf", # Another common review card class
|
||
"[data-review-id]", # Any element with review ID
|
||
]
|
||
|
||
while attempts < max_attempts:
|
||
try:
|
||
# Try multiple card selectors within the pane
|
||
cards = []
|
||
for card_sel in card_selectors:
|
||
cards = pane.find_elements(By.CSS_SELECTOR, card_sel)
|
||
if cards:
|
||
if attempts == 0: # Only log once
|
||
log.info(f"Found {len(cards)} cards with selector: {card_sel}")
|
||
break
|
||
|
||
# If no cards found in pane, try searching the entire document
|
||
if not cards:
|
||
for card_sel in card_selectors:
|
||
cards = driver.find_elements(By.CSS_SELECTOR, card_sel)
|
||
if cards:
|
||
if attempts == 0:
|
||
log.info(f"Found {len(cards)} cards in document with selector: {card_sel}")
|
||
break
|
||
|
||
fresh_cards: List[WebElement] = []
|
||
|
||
# Check for valid cards
|
||
if len(cards) == 0:
|
||
consecutive_no_cards += 1
|
||
log.info(f"No review cards found in this iteration (consecutive: {consecutive_no_cards})")
|
||
|
||
# If we keep finding no cards, might have hit the end
|
||
if consecutive_no_cards > 5:
|
||
log.warning("No cards found for 5+ iterations - might be at end of reviews")
|
||
break
|
||
|
||
attempts += 1
|
||
# Try aggressive scrolling
|
||
driver.execute_script(scroll_script)
|
||
time.sleep(1)
|
||
driver.execute_script("window.scrollBy(0, 1000);") # Extra scroll
|
||
time.sleep(1.5)
|
||
continue
|
||
else:
|
||
consecutive_no_cards = 0 # Reset counter when we find cards
|
||
|
||
for c in cards:
|
||
try:
|
||
# Try to get data-review-id from the card itself
|
||
cid = c.get_attribute("data-review-id")
|
||
# If not found on card, try to find it in a child element
|
||
if not cid:
|
||
try:
|
||
review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]")
|
||
cid = review_id_elem.get_attribute("data-review-id")
|
||
except:
|
||
pass
|
||
if not cid or cid in seen or cid in processed_ids:
|
||
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
||
idle = 999
|
||
break
|
||
continue
|
||
fresh_cards.append(c)
|
||
except StaleElementReferenceException:
|
||
continue
|
||
except Exception as e:
|
||
log.debug(f"Error getting review ID: {e}")
|
||
continue
|
||
|
||
for card in fresh_cards:
|
||
try:
|
||
raw = RawReview.from_card(card)
|
||
processed_ids.add(raw.id) # Track this ID to avoid re-processing
|
||
except StaleElementReferenceException:
|
||
continue
|
||
except Exception:
|
||
log.warning("⚠️ parse error – storing stub\n%s",
|
||
traceback.format_exc(limit=1).strip())
|
||
try:
|
||
raw_id = card.get_attribute("data-review-id") or ""
|
||
raw = RawReview(id=raw_id, text="", lang="und")
|
||
processed_ids.add(raw_id)
|
||
except StaleElementReferenceException:
|
||
continue
|
||
|
||
docs[raw.id] = merge_review(docs.get(raw.id), raw)
|
||
seen.add(raw.id)
|
||
pbar.update(1)
|
||
idle = 0
|
||
attempts = 0 # Reset attempts counter when we successfully process a review
|
||
|
||
if idle >= max_idle:
|
||
log.info(f"Stopping: No new reviews found after {max_idle} scroll attempts")
|
||
break
|
||
|
||
if not fresh_cards:
|
||
idle += 1
|
||
attempts += 1
|
||
log.info(f"No new reviews in this iteration (idle: {idle}/{max_idle}, attempts: {attempts}/{max_attempts}, total seen: {len(seen)})")
|
||
|
||
# When no new reviews, scroll more aggressively
|
||
try:
|
||
# Try multiple scroll methods
|
||
driver.execute_script(scroll_script)
|
||
time.sleep(0.5)
|
||
driver.execute_script("window.scrollBy(0, 500);") # Extra scroll
|
||
time.sleep(0.5)
|
||
except Exception as e:
|
||
log.warning(f"Error scrolling: {e}")
|
||
else:
|
||
log.info(f"Found {len(fresh_cards)} new reviews in this iteration")
|
||
|
||
# Check if we're actually scrolling or stuck
|
||
try:
|
||
current_scroll = driver.execute_script("return arguments[0].scrollTop;", pane)
|
||
if current_scroll == last_scroll_position and len(fresh_cards) == 0:
|
||
scroll_stuck_count += 1
|
||
log.warning(f"Scroll position hasn't changed (stuck at {current_scroll}px, stuck count: {scroll_stuck_count})")
|
||
|
||
if scroll_stuck_count > 5:
|
||
log.warning("Scroll is stuck - trying alternative scroll method")
|
||
# Try clicking the last visible review to force loading
|
||
try:
|
||
driver.execute_script("arguments[0].lastElementChild.scrollIntoView();", pane)
|
||
time.sleep(2)
|
||
except:
|
||
pass
|
||
scroll_stuck_count = 0
|
||
else:
|
||
scroll_stuck_count = 0
|
||
last_scroll_position = current_scroll
|
||
except:
|
||
pass
|
||
|
||
# Use JavaScript for smoother scrolling
|
||
try:
|
||
driver.execute_script(scroll_script)
|
||
except Exception as e:
|
||
log.warning(f"Error scrolling: {e}")
|
||
# Try a simpler scroll method
|
||
driver.execute_script("window.scrollBy(0, 300);")
|
||
|
||
# Collect API responses if interception is enabled
|
||
if self.enable_api_intercept and self.api_interceptor:
|
||
try:
|
||
responses = self.api_interceptor.get_intercepted_responses()
|
||
if responses:
|
||
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
|
||
for intercepted in parsed:
|
||
if intercepted.review_id and intercepted.review_id not in api_reviews:
|
||
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
|
||
if parsed:
|
||
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
|
||
except Exception as api_err:
|
||
log.debug(f"API interception error: {api_err}")
|
||
|
||
# Dynamic sleep: sleep less when processing many reviews, more when finding none
|
||
if len(fresh_cards) > 5:
|
||
sleep_time = 0.7
|
||
elif len(fresh_cards) == 0:
|
||
sleep_time = 2.0 # Wait longer when finding nothing (let page load)
|
||
else:
|
||
sleep_time = 1.0
|
||
time.sleep(sleep_time)
|
||
|
||
except StaleElementReferenceException:
|
||
# The pane or other element went stale, try to re-find
|
||
log.debug("Stale element encountered, re-finding elements")
|
||
try:
|
||
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
|
||
driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
||
except Exception:
|
||
log.warning("Could not re-find reviews pane after stale element")
|
||
break
|
||
except Exception as e:
|
||
log.warning(f"Error during review processing: {e}")
|
||
attempts += 1
|
||
time.sleep(1)
|
||
|
||
pbar.close()
|
||
|
||
# Merge API-captured reviews if any
|
||
if self.enable_api_intercept and api_reviews:
|
||
log.info(f"Merging {len(api_reviews)} reviews captured via API interception")
|
||
for review_id, api_review in api_reviews.items():
|
||
if review_id not in docs:
|
||
# New review from API only
|
||
docs[review_id] = api_review
|
||
seen.add(review_id)
|
||
else:
|
||
# Merge API data with existing DOM data (API might have more details)
|
||
existing = docs[review_id]
|
||
# Only update fields that are missing or empty
|
||
for key, value in api_review.items():
|
||
if key not in existing or not existing.get(key):
|
||
existing[key] = value
|
||
log.info(f"After merge: {len(docs)} total reviews")
|
||
|
||
# Save to MongoDB if enabled
|
||
if self.use_mongodb and self.mongodb:
|
||
log.info("Saving reviews to MongoDB...")
|
||
self.mongodb.save_reviews(docs)
|
||
|
||
# Backup to JSON if enabled
|
||
if self.backup_to_json:
|
||
log.info("Backing up to JSON...")
|
||
self.json_storage.save_json_docs(docs)
|
||
self.json_storage.save_seen(seen)
|
||
|
||
log.info("✅ Finished – total unique reviews: %s", len(docs))
|
||
|
||
end_time = time.time()
|
||
elapsed_time = end_time - start_time
|
||
log.info(f"Execution completed in {elapsed_time:.2f} seconds")
|
||
|
||
return True
|
||
|
||
except Exception as e:
|
||
log.error(f"Error during scraping: {e}")
|
||
log.error(traceback.format_exc())
|
||
return False
|
||
|
||
finally:
|
||
# Cleanup API interceptor
|
||
if self.api_interceptor:
|
||
try:
|
||
self.api_interceptor.cleanup()
|
||
except Exception:
|
||
pass
|
||
|
||
if driver is not None:
|
||
try:
|
||
driver.quit()
|
||
except Exception:
|
||
pass
|
||
|
||
if self.mongodb:
|
||
try:
|
||
self.mongodb.close()
|
||
except Exception:
|
||
pass
|
||
|
||
# """
|
||
# Selenium scraping logic for Google Maps Reviews.
|
||
# """
|
||
#
|
||
# import os
|
||
# import time
|
||
# import logging
|
||
# import traceback
|
||
# import platform
|
||
# from typing import Dict, Any, List
|
||
#
|
||
# import undetected_chromedriver as uc
|
||
# from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
|
||
# from selenium.webdriver import Chrome
|
||
# from selenium.webdriver.common.by import By
|
||
# from selenium.webdriver.remote.webelement import WebElement
|
||
# from selenium.webdriver.support import expected_conditions as EC
|
||
# from selenium.webdriver.support.ui import WebDriverWait
|
||
# from tqdm import tqdm
|
||
#
|
||
# from modules.models import RawReview
|
||
# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
||
#
|
||
# # Logger
|
||
# log = logging.getLogger("scraper")
|
||
#
|
||
# # CSS Selectors
|
||
# PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
|
||
# CARD_SEL = "div[data-review-id]"
|
||
# COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
||
# 'button[jsname="hZCF7e"],'
|
||
# 'button[data-mdc-dialog-action="accept"]')
|
||
# SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
||
# MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
||
#
|
||
# SORT_LABELS = { # text shown in Google Maps' menu
|
||
# "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"),
|
||
# "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"),
|
||
# "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"),
|
||
# "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"),
|
||
# }
|
||
#
|
||
# REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas",
|
||
# "recensioni", "bewertungen", "口コミ", "レビュー",
|
||
# "리뷰", "評論", "评论", "рецензии", "ביקורת"}
|
||
#
|
||
#
|
||
# class GoogleReviewsScraper:
|
||
# """Main scraper class for Google Maps reviews"""
|
||
#
|
||
# def __init__(self, config: Dict[str, Any]):
|
||
# """Initialize scraper with configuration"""
|
||
# self.config = config
|
||
# self.use_mongodb = config.get("use_mongodb", True)
|
||
# self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
|
||
# self.json_storage = JSONStorage(config)
|
||
# self.backup_to_json = config.get("backup_to_json", True)
|
||
# self.overwrite_existing = config.get("overwrite_existing", False)
|
||
#
|
||
# def setup_driver(self, headless: bool) -> Chrome:
|
||
# """
|
||
# Set up and configure Chrome driver with flexibility for different environments.
|
||
# Works in both Docker containers and on regular OS installations (Windows, Mac, Linux).
|
||
# """
|
||
# # Determine if we're running in a container
|
||
# in_container = os.environ.get('CHROME_BIN') is not None
|
||
#
|
||
# # Create Chrome options
|
||
# opts = uc.ChromeOptions()
|
||
# opts.add_argument("--window-size=1400,900")
|
||
# opts.add_argument("--ignore-certificate-errors")
|
||
# opts.add_argument("--disable-gpu") # Improves performance
|
||
# opts.add_argument("--disable-dev-shm-usage") # Helps with stability
|
||
# opts.add_argument("--no-sandbox") # More stable in some environments
|
||
#
|
||
# # Use headless mode if requested
|
||
# if headless:
|
||
# opts.add_argument("--headless=new")
|
||
#
|
||
# # Log platform information for debugging
|
||
# log.info(f"Platform: {platform.platform()}")
|
||
# log.info(f"Python version: {platform.python_version()}")
|
||
#
|
||
# # If in container, use environment-provided binaries
|
||
# if in_container:
|
||
# chrome_binary = os.environ.get('CHROME_BIN')
|
||
# chromedriver_path = os.environ.get('CHROMEDRIVER_PATH')
|
||
#
|
||
# log.info(f"Container environment detected")
|
||
# log.info(f"Chrome binary: {chrome_binary}")
|
||
# log.info(f"ChromeDriver path: {chromedriver_path}")
|
||
#
|
||
# if chrome_binary and os.path.exists(chrome_binary):
|
||
# log.info(f"Using Chrome binary from environment: {chrome_binary}")
|
||
# opts.binary_location = chrome_binary
|
||
#
|
||
# try:
|
||
# # Try creating Chrome driver with undetected_chromedriver
|
||
# log.info("Attempting to create undetected_chromedriver instance")
|
||
# driver = uc.Chrome(options=opts)
|
||
# log.info("Successfully created undetected_chromedriver instance")
|
||
# except Exception as e:
|
||
# # Fall back to regular Selenium if undetected_chromedriver fails
|
||
# log.warning(f"Failed to create undetected_chromedriver instance: {e}")
|
||
# log.info("Falling back to regular Selenium Chrome")
|
||
#
|
||
# # Import Selenium webdriver here to avoid potential import issues
|
||
# from selenium import webdriver
|
||
# from selenium.webdriver.chrome.service import Service
|
||
#
|
||
# if chromedriver_path and os.path.exists(chromedriver_path):
|
||
# log.info(f"Using ChromeDriver from path: {chromedriver_path}")
|
||
# service = Service(executable_path=chromedriver_path)
|
||
# driver = webdriver.Chrome(service=service, options=opts)
|
||
# else:
|
||
# log.info("Using default ChromeDriver")
|
||
# driver = webdriver.Chrome(options=opts)
|
||
# else:
|
||
# # On regular OS, use default undetected_chromedriver
|
||
# log.info("Using standard undetected_chromedriver setup")
|
||
# driver = uc.Chrome(options=opts)
|
||
#
|
||
# # Set page load timeout to avoid hanging
|
||
# driver.set_page_load_timeout(30)
|
||
# log.info("Chrome driver setup completed successfully")
|
||
# return driver
|
||
#
|
||
# def dismiss_cookies(self, driver: Chrome):
|
||
# """
|
||
# Dismiss cookie consent dialogs if present.
|
||
# Handles stale element references by re-finding elements if needed.
|
||
# """
|
||
# try:
|
||
# # Use WebDriverWait with expected_conditions to handle stale elements
|
||
# WebDriverWait(driver, 3).until(
|
||
# EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN))
|
||
# )
|
||
# log.info("Cookie consent dialog found, attempting to dismiss")
|
||
#
|
||
# # Get elements again after waiting to avoid stale references
|
||
# elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN)
|
||
# for elem in elements:
|
||
# try:
|
||
# if elem.is_displayed():
|
||
# elem.click()
|
||
# log.info("Cookie dialog dismissed")
|
||
# return True
|
||
# except Exception as e:
|
||
# log.debug(f"Error clicking cookie button: {e}")
|
||
# continue
|
||
# except TimeoutException:
|
||
# # This is expected if no cookie dialog is present
|
||
# log.debug("No cookie consent dialog detected")
|
||
# except Exception as e:
|
||
# log.debug(f"Error handling cookie dialog: {e}")
|
||
#
|
||
# return False
|
||
#
|
||
# def is_reviews_tab(self, tab: WebElement) -> bool:
|
||
# """Check if a tab is the reviews tab"""
|
||
# try:
|
||
# label = (tab.get_attribute("aria-label") or tab.text or "").lower()
|
||
# return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS)
|
||
# except StaleElementReferenceException:
|
||
# return False
|
||
# except Exception as e:
|
||
# log.debug(f"Error checking if tab is reviews tab: {e}")
|
||
# return False
|
||
#
|
||
# def click_reviews_tab(self, driver: Chrome):
|
||
# """
|
||
# Click on the reviews tab in Google Maps with improved stale element handling.
|
||
# """
|
||
# end = time.time() + 15 # Timeout after 15 seconds
|
||
# while time.time() < end:
|
||
# try:
|
||
# # Find all tab elements
|
||
# tabs = driver.find_elements(By.CSS_SELECTOR, '[role="tab"], button[aria-label]')
|
||
#
|
||
# for tab in tabs:
|
||
# try:
|
||
# # Check if this is the reviews tab
|
||
# label = (tab.get_attribute("aria-label") or tab.text or "").lower()
|
||
# is_review_tab = tab.get_attribute("data-tab-index") == "1" or any(
|
||
# w in label for w in REVIEW_WORDS)
|
||
#
|
||
# if is_review_tab:
|
||
# # Scroll the tab into view
|
||
# driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab)
|
||
# time.sleep(0.2) # Small wait after scrolling
|
||
#
|
||
# # Try to click the tab
|
||
# log.info("Found reviews tab, attempting to click")
|
||
# tab.click()
|
||
# log.info("Successfully clicked reviews tab")
|
||
# return True
|
||
# except Exception as e:
|
||
# # Element might be stale or not clickable, try the next one
|
||
# log.debug(f"Error with tab element: {str(e)}")
|
||
# continue
|
||
#
|
||
# # If we get here, we didn't find a suitable tab in this iteration
|
||
# log.debug("No reviews tab found in this iteration, waiting...")
|
||
# time.sleep(0.5) # Wait before next attempt
|
||
#
|
||
# except Exception as e:
|
||
# # General exception handling
|
||
# log.debug(f"Exception while looking for reviews tab: {str(e)}")
|
||
# time.sleep(0.5)
|
||
#
|
||
# # If we exit the loop, we've timed out
|
||
# log.warning("Timeout while looking for reviews tab")
|
||
# raise TimeoutException("Reviews tab not found")
|
||
#
|
||
# def set_sort(self, driver: Chrome, method: str):
|
||
# """
|
||
# Set the sorting method for reviews with improved error handling.
|
||
# """
|
||
# if method == "relevance":
|
||
# return True # Default order, no need to change
|
||
#
|
||
# log.info(f"Attempting to set sort order to '{method}'")
|
||
#
|
||
# try:
|
||
# # First try to find and click the sort button
|
||
# sort_buttons = driver.find_elements(By.CSS_SELECTOR, SORT_BTN)
|
||
# if not sort_buttons:
|
||
# log.warning(f"Sort button not found - keeping default sort order")
|
||
# return False
|
||
#
|
||
# # Try to click the first visible sort button
|
||
# for sort_button in sort_buttons:
|
||
# try:
|
||
# if sort_button.is_displayed() and sort_button.is_enabled():
|
||
# sort_button.click()
|
||
# log.info("Clicked sort button")
|
||
# time.sleep(0.5) # Wait for menu to appear
|
||
# break
|
||
# except Exception as e:
|
||
# log.debug(f"Error clicking sort button: {e}")
|
||
# continue
|
||
# else:
|
||
# log.warning("No clickable sort button found")
|
||
# return False
|
||
#
|
||
# # Now find and click the menu item for the desired sort method
|
||
# wanted = SORT_LABELS[method]
|
||
# menu_items = WebDriverWait(driver, 3).until(
|
||
# EC.presence_of_all_elements_located((By.CSS_SELECTOR, MENU_ITEMS))
|
||
# )
|
||
#
|
||
# for item in menu_items:
|
||
# try:
|
||
# label = item.text.strip()
|
||
# if label in wanted:
|
||
# item.click()
|
||
# log.info(f"Selected sort option: {label}")
|
||
# time.sleep(0.5) # Wait for sorting to take effect
|
||
# return True
|
||
# except Exception as e:
|
||
# log.debug(f"Error clicking menu item: {e}")
|
||
# continue
|
||
#
|
||
# log.warning(f"Sort option '{method}' not found in menu - keeping default")
|
||
# return False
|
||
#
|
||
# except Exception as e:
|
||
# log.warning(f"Error setting sort order: {e}")
|
||
# return False
|
||
#
|
||
# def scrape(self):
|
||
# """Main scraper method"""
|
||
# start_time = time.time()
|
||
#
|
||
# url = self.config.get("url")
|
||
# headless = self.config.get("headless", True)
|
||
# sort_by = self.config.get("sort_by", "relevance")
|
||
# stop_on_match = self.config.get("stop_on_match", False)
|
||
#
|
||
# log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
|
||
# log.info(f"URL: {url}")
|
||
#
|
||
# # Initialize storage
|
||
# # If not overwriting, load existing data
|
||
# if self.overwrite_existing:
|
||
# docs = {}
|
||
# seen = set()
|
||
# else:
|
||
# # Try to get from MongoDB first if enabled
|
||
# docs = {}
|
||
# if self.use_mongodb and self.mongodb:
|
||
# docs = self.mongodb.fetch_existing_reviews()
|
||
#
|
||
# # If backup_to_json is enabled, also load from JSON for merging
|
||
# if self.backup_to_json:
|
||
# json_docs = self.json_storage.load_json_docs()
|
||
# # Merge JSON docs with MongoDB docs
|
||
# for review_id, review in json_docs.items():
|
||
# if review_id not in docs:
|
||
# docs[review_id] = review
|
||
#
|
||
# # Load seen IDs from file
|
||
# seen = self.json_storage.load_seen()
|
||
#
|
||
# driver = None
|
||
# try:
|
||
# driver = self.setup_driver(headless)
|
||
# wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
||
#
|
||
# driver.get(url)
|
||
# wait.until(lambda d: "google.com/maps" in d.current_url)
|
||
#
|
||
# self.dismiss_cookies(driver)
|
||
# self.click_reviews_tab(driver)
|
||
# self.set_sort(driver, sort_by)
|
||
#
|
||
# # Add a wait after setting sort to allow results to load
|
||
# time.sleep(1)
|
||
#
|
||
# # Use try-except to handle cases where the pane is not found
|
||
# try:
|
||
# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
|
||
# except TimeoutException:
|
||
# log.warning("Could not find reviews pane. Page structure might have changed.")
|
||
# return False
|
||
#
|
||
# pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||
# idle = 0
|
||
# processed_ids = set() # Track processed IDs in current session
|
||
#
|
||
# # Prefetch selector to avoid repeated lookups
|
||
# try:
|
||
# driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
||
# scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
|
||
# except Exception as e:
|
||
# log.warning(f"Error setting up scroll script: {e}")
|
||
# scroll_script = "window.scrollBy(0, 300);" # Fallback to simple scrolling
|
||
#
|
||
# max_attempts = 10 # Limit the number of attempts to find reviews
|
||
# attempts = 0
|
||
#
|
||
# while attempts < max_attempts:
|
||
# try:
|
||
# cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
||
# fresh_cards: List[WebElement] = []
|
||
#
|
||
# # Check for valid cards
|
||
# if len(cards) == 0:
|
||
# log.debug("No review cards found in this iteration")
|
||
# attempts += 1
|
||
# # Try scrolling anyway
|
||
# driver.execute_script(scroll_script)
|
||
# time.sleep(1)
|
||
# continue
|
||
#
|
||
# for c in cards:
|
||
# try:
|
||
# cid = c.get_attribute("data-review-id")
|
||
# if not cid or cid in seen or cid in processed_ids:
|
||
# if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
||
# idle = 999
|
||
# break
|
||
# continue
|
||
# fresh_cards.append(c)
|
||
# except StaleElementReferenceException:
|
||
# continue
|
||
# except Exception as e:
|
||
# log.debug(f"Error getting review ID: {e}")
|
||
# continue
|
||
#
|
||
# for card in fresh_cards:
|
||
# try:
|
||
# raw = RawReview.from_card(card)
|
||
# processed_ids.add(raw.id) # Track this ID to avoid re-processing
|
||
# except StaleElementReferenceException:
|
||
# continue
|
||
# except Exception:
|
||
# log.warning("⚠️ parse error – storing stub\n%s",
|
||
# traceback.format_exc(limit=1).strip())
|
||
# try:
|
||
# raw_id = card.get_attribute("data-review-id") or ""
|
||
# raw = RawReview(id=raw_id, text="", lang="und")
|
||
# processed_ids.add(raw_id)
|
||
# except StaleElementReferenceException:
|
||
# continue
|
||
#
|
||
# docs[raw.id] = merge_review(docs.get(raw.id), raw)
|
||
# seen.add(raw.id)
|
||
# pbar.update(1)
|
||
# idle = 0
|
||
# attempts = 0 # Reset attempts counter when we successfully process a review
|
||
#
|
||
# if idle >= 3:
|
||
# break
|
||
#
|
||
# if not fresh_cards:
|
||
# idle += 1
|
||
# attempts += 1
|
||
#
|
||
# # Use JavaScript for smoother scrolling
|
||
# try:
|
||
# driver.execute_script(scroll_script)
|
||
# except Exception as e:
|
||
# log.warning(f"Error scrolling: {e}")
|
||
# # Try a simpler scroll method
|
||
# driver.execute_script("window.scrollBy(0, 300);")
|
||
#
|
||
# # Dynamic sleep: sleep less when processing many reviews
|
||
# sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0
|
||
# time.sleep(sleep_time)
|
||
#
|
||
# except StaleElementReferenceException:
|
||
# # The pane or other element went stale, try to re-find
|
||
# log.debug("Stale element encountered, re-finding elements")
|
||
# try:
|
||
# pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
|
||
# driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
||
# except Exception:
|
||
# log.warning("Could not re-find reviews pane after stale element")
|
||
# break
|
||
# except Exception as e:
|
||
# log.warning(f"Error during review processing: {e}")
|
||
# attempts += 1
|
||
# time.sleep(1)
|
||
#
|
||
# pbar.close()
|
||
#
|
||
# # Save to MongoDB if enabled
|
||
# if self.use_mongodb and self.mongodb:
|
||
# log.info("Saving reviews to MongoDB...")
|
||
# self.mongodb.save_reviews(docs)
|
||
#
|
||
# # Backup to JSON if enabled
|
||
# if self.backup_to_json:
|
||
# log.info("Backing up to JSON...")
|
||
# self.json_storage.save_json_docs(docs)
|
||
# self.json_storage.save_seen(seen)
|
||
#
|
||
# log.info("✅ Finished – total unique reviews: %s", len(docs))
|
||
#
|
||
# end_time = time.time()
|
||
# elapsed_time = end_time - start_time
|
||
# log.info(f"Execution completed in {elapsed_time:.2f} seconds")
|
||
#
|
||
# return True
|
||
#
|
||
# except Exception as e:
|
||
# log.error(f"Error during scraping: {e}")
|
||
# log.error(traceback.format_exc())
|
||
# return False
|
||
#
|
||
# finally:
|
||
# if driver is not None:
|
||
# try:
|
||
# driver.quit()
|
||
# except Exception:
|
||
# pass
|
||
#
|
||
# if self.mongodb:
|
||
# try:
|
||
# self.mongodb.close()
|
||
# except Exception:
|
||
# pass
|
||
#
|
||
# # """
|
||
# # Selenium scraping logic for Google Maps Reviews.
|
||
# # """
|
||
# #
|
||
# # import re
|
||
# # import time
|
||
# # import logging
|
||
# # import traceback
|
||
# # from typing import Dict, Any, Set, List
|
||
# #
|
||
# # import undetected_chromedriver as uc
|
||
# # from selenium.common.exceptions import TimeoutException
|
||
# # from selenium.webdriver import Chrome
|
||
# # from selenium.webdriver.common.by import By
|
||
# # from selenium.webdriver.remote.webelement import WebElement
|
||
# # from selenium.webdriver.support import expected_conditions as EC
|
||
# # from selenium.webdriver.support.ui import WebDriverWait
|
||
# # from tqdm import tqdm
|
||
# #
|
||
# # from modules.models import RawReview
|
||
# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
||
# # from modules.utils import click_if
|
||
# #
|
||
# # # Logger
|
||
# # log = logging.getLogger("scraper")
|
||
# #
|
||
# # # CSS Selectors
|
||
# # PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
|
||
# # CARD_SEL = "div[data-review-id]"
|
||
# # COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
||
# # 'button[jsname="hZCF7e"],'
|
||
# # 'button[data-mdc-dialog-action="accept"]')
|
||
# # SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
||
# # MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
||
# #
|
||
# # SORT_LABELS = { # text shown in Google Maps' menu
|
||
# # "newest": ("Newest", "החדשות ביותר", "ใหม่ที่สุด"),
|
||
# # "highest": ("Highest rating", "הדירוג הגבוה ביותר", "คะแนนสูงสุด"),
|
||
# # "lowest": ("Lowest rating", "הדירוג הנמוך ביותר", "คะแนนต่ำสุด"),
|
||
# # "relevance": ("Most relevant", "רלוונטיות ביותר", "เกี่ยวข้องมากที่สุด"),
|
||
# # }
|
||
# #
|
||
# # REVIEW_WORDS = {"reviews", "review", "ביקורות", "รีวิว", "avis", "reseñas",
|
||
# # "recensioni", "bewertungen", "口コミ", "レビュー",
|
||
# # "리뷰", "評論", "评论", "рецензии"}
|
||
# #
|
||
# #
|
||
# # class GoogleReviewsScraper:
|
||
# # """Main scraper class for Google Maps reviews"""
|
||
# #
|
||
# # def __init__(self, config: Dict[str, Any]):
|
||
# # """Initialize scraper with configuration"""
|
||
# # self.config = config
|
||
# # self.use_mongodb = config.get("use_mongodb", True)
|
||
# # self.mongodb = MongoDBStorage(config) if self.use_mongodb else None
|
||
# # self.json_storage = JSONStorage(config)
|
||
# # self.backup_to_json = config.get("backup_to_json", True)
|
||
# # self.overwrite_existing = config.get("overwrite_existing", False)
|
||
# #
|
||
# # def setup_driver(self, headless: bool) -> Chrome:
|
||
# # """Set up and configure Chrome driver"""
|
||
# # opts = uc.ChromeOptions()
|
||
# # opts.add_argument("--window-size=1400,900")
|
||
# # opts.add_argument("--ignore-certificate-errors")
|
||
# # opts.add_argument("--disable-gpu") # Improves performance
|
||
# # opts.add_argument("--disable-dev-shm-usage") # Helps with stability
|
||
# # opts.add_argument("--no-sandbox") # More stable in some environments
|
||
# #
|
||
# # if headless:
|
||
# # opts.add_argument("--headless=new")
|
||
# #
|
||
# # driver = uc.Chrome(options=opts)
|
||
# # # Set page load timeout to avoid hanging
|
||
# # driver.set_page_load_timeout(30)
|
||
# # return driver
|
||
# #
|
||
# # def dismiss_cookies(self, driver: Chrome):
|
||
# # """Dismiss cookie consent dialogs"""
|
||
# # click_if(driver, COOKIE_BTN, timeout=3.0) # Reduced timeout for faster operation
|
||
# #
|
||
# # def is_reviews_tab(self, tab: WebElement) -> bool:
|
||
# # """Check if a tab is the reviews tab"""
|
||
# # label = (tab.get_attribute("aria-label") or tab.text or "").lower()
|
||
# # return tab.get_attribute("data-tab-index") == "1" or any(w in label for w in REVIEW_WORDS)
|
||
# #
|
||
# # def click_reviews_tab(self, driver: Chrome):
|
||
# # """Click on the reviews tab in Google Maps"""
|
||
# # end = time.time() + 15 # Reduced timeout from 30 to 15 seconds
|
||
# # while time.time() < end:
|
||
# # for tab in driver.find_elements(By.CSS_SELECTOR,
|
||
# # '[role="tab"], button[aria-label]'):
|
||
# # if self.is_reviews_tab(tab):
|
||
# # driver.execute_script("arguments[0].scrollIntoView({block:\"center\"});", tab)
|
||
# # try:
|
||
# # tab.click()
|
||
# # return
|
||
# # except Exception:
|
||
# # continue
|
||
# # time.sleep(.2) # Reduced sleep time from 0.4 to 0.2
|
||
# # raise TimeoutException("Reviews tab not found")
|
||
# #
|
||
# # def set_sort(self, driver: Chrome, method: str):
|
||
# # """Set the sorting method for reviews"""
|
||
# # if method == "relevance":
|
||
# # return # default order
|
||
# # if not click_if(driver, SORT_BTN):
|
||
# # return
|
||
# #
|
||
# # wanted = SORT_LABELS[method]
|
||
# #
|
||
# # for item in driver.find_elements(By.CSS_SELECTOR, MENU_ITEMS):
|
||
# # label = item.text.strip()
|
||
# # if label in wanted:
|
||
# # item.click()
|
||
# # time.sleep(0.5) # Reduced wait time from 1.0 to 0.5
|
||
# # return
|
||
# # log.warning("⚠️ sort option %s not found – keeping default", method)
|
||
# #
|
||
# # def scrape(self):
|
||
# # """Main scraper method"""
|
||
# # start_time = time.time()
|
||
# #
|
||
# # url = self.config.get("url")
|
||
# # headless = self.config.get("headless", True)
|
||
# # sort_by = self.config.get("sort_by", "relevance")
|
||
# # stop_on_match = self.config.get("stop_on_match", False)
|
||
# #
|
||
# # log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
|
||
# # log.info(f"URL: {url}")
|
||
# #
|
||
# # # Initialize storage
|
||
# # # If not overwriting, load existing data
|
||
# # if self.overwrite_existing:
|
||
# # docs = {}
|
||
# # seen = set()
|
||
# # else:
|
||
# # # Try to get from MongoDB first if enabled
|
||
# # docs = {}
|
||
# # if self.use_mongodb and self.mongodb:
|
||
# # docs = self.mongodb.fetch_existing_reviews()
|
||
# #
|
||
# # # If backup_to_json is enabled, also load from JSON for merging
|
||
# # if self.backup_to_json:
|
||
# # json_docs = self.json_storage.load_json_docs()
|
||
# # # Merge JSON docs with MongoDB docs
|
||
# # for review_id, review in json_docs.items():
|
||
# # if review_id not in docs:
|
||
# # docs[review_id] = review
|
||
# #
|
||
# # # Load seen IDs from file
|
||
# # seen = self.json_storage.load_seen()
|
||
# #
|
||
# # driver = self.setup_driver(headless)
|
||
# # wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
||
# #
|
||
# # try:
|
||
# # driver.get(url)
|
||
# # wait.until(lambda d: "google.com/maps" in d.current_url)
|
||
# #
|
||
# # self.dismiss_cookies(driver)
|
||
# # self.click_reviews_tab(driver)
|
||
# # self.set_sort(driver, sort_by)
|
||
# #
|
||
# # pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
|
||
# # pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||
# # idle = 0
|
||
# # processed_ids = set() # Track processed IDs in current session
|
||
# #
|
||
# # # Prefetch selector to avoid repeated lookups
|
||
# # driver.execute_script("window.scrollablePane = arguments[0];", pane)
|
||
# # scroll_script = "window.scrollablePane.scrollBy(0, window.scrollablePane.scrollHeight);"
|
||
# #
|
||
# # while True:
|
||
# # cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
||
# # fresh_cards: List[WebElement] = []
|
||
# #
|
||
# # for c in cards:
|
||
# # cid = c.get_attribute("data-review-id")
|
||
# # if cid in seen or cid in processed_ids:
|
||
# # if stop_on_match:
|
||
# # idle = 999
|
||
# # break
|
||
# # continue
|
||
# # fresh_cards.append(c)
|
||
# #
|
||
# # for card in fresh_cards:
|
||
# # try:
|
||
# # raw = RawReview.from_card(card)
|
||
# # processed_ids.add(raw.id) # Track this ID to avoid re-processing
|
||
# # except Exception:
|
||
# # log.warning("⚠️ parse error – storing stub\n%s",
|
||
# # traceback.format_exc(limit=1).strip())
|
||
# # raw_id = card.get_attribute("data-review-id") or ""
|
||
# # raw = RawReview(id=raw_id, text="", lang="und")
|
||
# # processed_ids.add(raw_id)
|
||
# #
|
||
# # docs[raw.id] = merge_review(docs.get(raw.id), raw)
|
||
# # seen.add(raw.id)
|
||
# # pbar.update(1)
|
||
# # idle = 0
|
||
# #
|
||
# # if idle >= 3:
|
||
# # break
|
||
# #
|
||
# # if not fresh_cards:
|
||
# # idle += 1
|
||
# #
|
||
# # # Use JavaScript for smoother scrolling
|
||
# # driver.execute_script(scroll_script)
|
||
# #
|
||
# # # Dynamic sleep: sleep less when processing many reviews
|
||
# # sleep_time = 0.7 if len(fresh_cards) > 5 else 1.0
|
||
# # time.sleep(sleep_time)
|
||
# #
|
||
# # pbar.close()
|
||
# #
|
||
# # # Save to MongoDB if enabled
|
||
# # if self.use_mongodb and self.mongodb:
|
||
# # log.info("Saving reviews to MongoDB...")
|
||
# # self.mongodb.save_reviews(docs)
|
||
# #
|
||
# # # Backup to JSON if enabled
|
||
# # if self.backup_to_json:
|
||
# # log.info("Backing up to JSON...")
|
||
# # self.json_storage.save_json_docs(docs)
|
||
# # self.json_storage.save_seen(seen)
|
||
# #
|
||
# # log.info("✅ Finished – total unique reviews: %s", len(docs))
|
||
# #
|
||
# # end_time = time.time()
|
||
# # elapsed_time = end_time - start_time
|
||
# # log.info(f"Execution completed in {elapsed_time:.2f} seconds")
|
||
# #
|
||
# # finally:
|
||
# # driver.quit()
|
||
# # if self.mongodb:
|
||
# # self.mongodb.close()
|