From 54f98ae921fa30a80d77bfa68c7fc22298d0f063 Mon Sep 17 00:00:00 2001 From: George Khananaev <106206490+georgekhananaev@users.noreply.github.com> Date: Mon, 2 Jun 2025 13:22:50 +0700 Subject: [PATCH] fixed the issue with english localization --- modules/config.py | 6 +- modules/data_storage.py | 30 ++++ modules/models.py | 3 + modules/scraper.py | 337 ++++++++++++++++++++++++++++++++++------ 4 files changed, 324 insertions(+), 52 deletions(-) diff --git a/modules/config.py b/modules/config.py index f5cf7b8..cf6c43e 100644 --- a/modules/config.py +++ b/modules/config.py @@ -8,8 +8,10 @@ from typing import Dict, Any import yaml -# Configure logging -logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(message)s") +# Configure logging - can be overridden by environment variable +import os +log_level = getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO) +logging.basicConfig(level=log_level, format="[%(asctime)s] %(levelname)s: %(message)s") log = logging.getLogger("scraper") # Default configuration path diff --git a/modules/data_storage.py b/modules/data_storage.py index d652dac..f329ca4 100644 --- a/modules/data_storage.py +++ b/modules/data_storage.py @@ -317,3 +317,33 @@ def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, A existing["last_modified_date"] = get_current_iso_date() return existing + + +def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]: + """ + Enhanced merge function that supports translation mode. + When append_translations is True, it adds new language versions to existing reviews. + """ + # Use the standard merge for the base functionality + merged = merge_review(existing, raw) + + if append_translations and existing and raw.text: + # In translation mode, always add the new language version + # even if we already have content for this review + merged["description"][raw.lang] = raw.text + + # Also merge owner responses in translation mode + if raw.owner_text: + owner_lang = detect_lang(raw.owner_text) + merged.setdefault("owner_responses", {})[owner_lang] = { + "text": raw.owner_text, + } + + # Add metadata about when this translation was added + merged.setdefault("translation_history", []).append({ + "language": raw.lang, + "added_date": get_current_iso_date(), + "source": "regional_scraping" + }) + + return merged diff --git a/modules/models.py b/modules/models.py index e571259..b4b64a2 100644 --- a/modules/models.py +++ b/modules/models.py @@ -27,6 +27,9 @@ class RawReview: owner_date: str = "" owner_text: str = "" review_date: str = "" # ISO format date + + # Translation fields + translations: dict = field(default_factory=dict) # Store translations by language code # CSS Selectors for review elements MORE_BTN = "button.kyuRq" diff --git a/modules/scraper.py b/modules/scraper.py index 1bd68b1..1b9085b 100644 --- a/modules/scraper.py +++ b/modules/scraper.py @@ -21,7 +21,7 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait from tqdm import tqdm -from modules.data_storage import MongoDBStorage, JSONStorage, merge_review +from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation from modules.models import RawReview # Logger @@ -168,6 +168,11 @@ class GoogleReviewsScraper: self.json_storage = JSONStorage(config) self.backup_to_json = config.get("backup_to_json", True) self.overwrite_existing = config.get("overwrite_existing", False) + + # Translation feature settings + self.append_translations = config.get("append_translations", False) + self.translation_language = config.get("translation_language", "auto") + self.force_full_scan = config.get("force_full_scan", False) def setup_driver(self, headless: bool) -> Chrome: """ @@ -276,8 +281,13 @@ class GoogleReviewsScraper: try: # Strategy 1: Data attribute detection (most reliable across languages) tab_index = tab.get_attribute("data-tab-index") - if tab_index == "1" or tab_index == "reviews": - return True + if tab_index in ["1", "2", "reviews"]: # Reviews can be index 1 or 2 depending on layout + # Double-check this is actually a reviews tab by checking text content + aria_label = (tab.get_attribute("aria-label") or "").lower() + tab_text = (tab.text or "").lower() + if any(word.lower() in aria_label or word.lower() in tab_text for word in REVIEW_WORDS): + log.debug(f"Found reviews tab by data-tab-index: {tab_index}") + return True # Strategy 2: Role and aria attributes (accessibility detection) role = tab.get_attribute("role") @@ -285,20 +295,27 @@ class GoogleReviewsScraper: aria_label = (tab.get_attribute("aria-label") or "").lower() # Many review tabs have role="tab" and data attributes - if role == "tab" and any(word in aria_label for word in REVIEW_WORDS): + if role == "tab" and any(word.lower() in aria_label for word in REVIEW_WORDS): + log.debug(f"Found reviews tab by aria-label: {aria_label}") return True # Strategy 3: Text content detection (multiple sources) + tab_text = tab.text.lower() if tab.text else "" + inner_html = tab.get_attribute("innerHTML").lower() or "" + text_content = tab.get_attribute("textContent").lower() or "" + sources = [ - tab.text.lower() if tab.text else "", # Direct text + tab_text, # Direct text aria_label, # ARIA label - tab.get_attribute("innerHTML").lower() or "", # Inner HTML - tab.get_attribute("textContent").lower() or "" # Text content + inner_html, # Inner HTML + text_content # Text content ] # Check all sources against our comprehensive keyword list - for source in sources: - if any(word in source for word in REVIEW_WORDS): + for i, source in enumerate(sources): + if any(word.lower() in source for word in REVIEW_WORDS): + source_names = ["text", "aria-label", "innerHTML", "textContent"] + log.debug(f"Found reviews tab by {source_names[i]}: '{source}' (contains review word)") return True # Strategy 4: Nested element detection @@ -309,8 +326,9 @@ class GoogleReviewsScraper: child_text = child.text.lower() if child.text else "" child_content = child.get_attribute("textContent").lower() or "" - if any(word in child_text for word in REVIEW_WORDS) or any( - word in child_content for word in REVIEW_WORDS): + if any(word.lower() in child_text for word in REVIEW_WORDS) or any( + word.lower() in child_content for word in REVIEW_WORDS): + log.debug(f"Found reviews tab by child element text: '{child_text}' or '{child_content}'") return True except: continue @@ -321,14 +339,18 @@ class GoogleReviewsScraper: for attr in ["href", "data-href", "data-url", "data-target"]: attr_value = (tab.get_attribute(attr) or "").lower() if attr_value and ("review" in attr_value or "rating" in attr_value): + log.debug(f"Found reviews tab by {attr}: {attr_value}") return True # Strategy 6: Class detection (some review tabs have specific classes) tab_class = tab.get_attribute("class") or "" review_classes = ["review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve"] - if any(cls in tab_class for cls in review_classes): + if any(cls in tab_class.lower() for cls in review_classes): + log.debug(f"Found reviews tab by class: {tab_class}") return True + # Log what we found for debugging + log.debug(f"Tab not identified as reviews tab - role: {role}, index: {tab_index}, aria-label: '{aria_label}', text: '{tab_text}', class: '{tab_class}'") return False except StaleElementReferenceException: @@ -342,14 +364,89 @@ class GoogleReviewsScraper: Highly dynamic reviews tab detection and clicking with multiple fallback strategies. Works across different languages, layouts, and browser environments. """ - max_timeout = 25 # Maximum seconds to try + max_timeout = 30 # Maximum seconds to try end_time = time.time() + max_timeout attempts = 0 + + # First, wait for the business panel to load + log.info("Waiting for business information panel to load...") + business_panel_loaded = False + panel_wait_end = time.time() + 15 # Wait up to 15 seconds for business panel + + while time.time() < panel_wait_end: + try: + # Look for indicators that the business panel has loaded + business_indicators = [ + # Business name or rating elements + '[role="main"] h1', + '[role="main"] .DUwDvf', # Business name class + '[role="main"] .F7nice', # Rating class + '[role="main"] .fontHeadlineSmall', + # Or any elements with review-related text + '//*[contains(translate(text(), "REVIEWS", "reviews"), "review")]', + ] + + for indicator in business_indicators[:-1]: # CSS selectors first + elements = driver.find_elements(By.CSS_SELECTOR, indicator) + if elements: + log.info(f"Business panel detected using selector: {indicator}") + business_panel_loaded = True + break + + if not business_panel_loaded: + # Try XPath selector for review text + elements = driver.find_elements(By.XPATH, business_indicators[-1]) + if elements: + log.info("Business panel detected using review text search") + business_panel_loaded = True + + if business_panel_loaded: + break + + time.sleep(1) + except Exception as e: + log.debug(f"Error checking for business panel: {e}") + time.sleep(1) + + if business_panel_loaded: + log.info("Business panel loaded successfully") + # Give it a bit more time for tabs to appear + time.sleep(2) + else: + log.warning("Business panel may not have loaded completely, continuing with tab search...") + + # If no business panel is detected, try URL manipulation to force reviews view + if not business_panel_loaded: + try: + current_url = driver.current_url + if "/place/" in current_url and "/reviews" not in current_url: + # Try to navigate directly to reviews by modifying URL + if "?" in current_url: + base_url = current_url.split("?")[0] + params = current_url.split("?")[1] + new_url = f"{base_url}/reviews?{params}" + else: + new_url = f"{current_url}/reviews" + + log.info(f"Attempting to navigate directly to reviews: {new_url}") + driver.get(new_url) + time.sleep(3) + + # Check if this worked by looking for review content + review_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]') + if review_cards: + log.info("Successfully navigated to reviews page") + return True + except Exception as e: + log.debug(f"URL manipulation failed: {e}") # Define different selectors to try in order of reliability tab_selectors = [ - # Direct tab selectors - '[data-tab-index="1"]', # Most common tab index + # Direct tab selectors - try both common indexes + '[data-tab-index="1"]', # Common tab index for reviews + '[data-tab-index="2"]', # Alternative tab index for reviews + 'button[role="tab"][data-tab-index="1"]', # Exact match from HTML + 'button[role="tab"][data-tab-index="2"]', # Alternative exact match '[role="tab"][data-tab-index]', # Any tab with index 'button[role="tab"]', # Button tabs 'div[role="tab"]', # Div tabs @@ -364,6 +461,9 @@ class GoogleReviewsScraper: 'button:contains("reviews")', # Button containing "reviews" 'div[role="tablist"] > *', # Any tab in a tab list 'div.m6QErb div[role="tablist"] > *', # Google Maps specific tablist + + # Fallback selectors + '[role="tab"]', # Any tab element ] # Record successful clicks for debugging @@ -437,23 +537,39 @@ class GoogleReviewsScraper: # If we reach here, try XPath as a last resort if time.time() <= end_time: + log.info("Trying XPath-based text matching for review tabs...") for language_keyword in REVIEW_WORDS: try: - # Try XPath contains text - xpath = f"//*[contains(text(), '{language_keyword}')]" - elements = driver.find_elements(By.XPATH, xpath) - - for element in elements: + # Try different XPath patterns for review text + xpath_patterns = [ + f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]", + f"//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]", + f"//*[@role='tab' and contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]", + f"//*[contains(translate(@aria-label, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]", + ] + + for xpath in xpath_patterns: try: - log.info(f"Trying XPath with keyword '{language_keyword}'") - driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element) - time.sleep(0.7) - driver.execute_script("arguments[0].click();", element) - time.sleep(1.5) + elements = driver.find_elements(By.XPATH, xpath) + for element in elements: + try: + # Skip if it's a script tag or hidden element + if element.tag_name.lower() in ['script', 'style', 'noscript']: + continue + if not element.is_displayed(): + continue + + log.info(f"Trying XPath pattern with keyword '{language_keyword}': {xpath}") + driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element) + time.sleep(0.7) + driver.execute_script("arguments[0].click();", element) + time.sleep(1.5) - if self.verify_reviews_tab_clicked(driver): - log.info(f"Successfully clicked element with keyword '{language_keyword}'") - return True + if self.verify_reviews_tab_clicked(driver): + log.info(f"Successfully clicked element with keyword '{language_keyword}'") + return True + except: + continue except: continue except: @@ -592,6 +708,13 @@ class GoogleReviewsScraper: for keyword in negative_keywords): continue + # Additional check - make sure this is actually a sort button for reviews + if ("sort" not in button_text.lower() and "sort" not in button_aria.lower() and + "סדר" not in button_text.lower() and "เรียง" not in button_text.lower() and + "ordenar" not in button_text.lower() and "trier" not in button_text.lower()): + log.debug(f"Button doesn't appear to be a sort button, skipping: '{button_text}' / '{button_aria}'") + continue + # Found a potential sort button sort_button = element log.info(f"Found sort button with selector: {selector}") @@ -1033,7 +1156,13 @@ class GoogleReviewsScraper: sort_by = self.config.get("sort_by", "relevance") stop_on_match = self.config.get("stop_on_match", False) + # Override stop_on_match if translation mode is enabled + if self.append_translations or self.force_full_scan: + stop_on_match = False + log.info("Translation mode enabled - forcing full scan of all reviews") + log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}") + log.info(f"Translation mode: append_translations={self.append_translations}, language={self.translation_language}") log.info(f"URL: {url}") # Initialize storage @@ -1071,13 +1200,27 @@ class GoogleReviewsScraper: self.set_sort(driver, sort_by) # Add a wait after setting sort to allow results to load - time.sleep(1) + time.sleep(3) # Increased wait time for reviews to load # Use try-except to handle cases where the pane is not found - try: - pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL))) - except TimeoutException: - log.warning("Could not find reviews pane. Page structure might have changed.") + pane = None + pane_selectors = [ + PANE_SEL, # Original selector + 'div[role="main"]', # Simpler main container + 'body', # Ultimate fallback + ] + + for pane_selector in pane_selectors: + try: + pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, pane_selector))) + log.info(f"Found scrollable pane using selector: {pane_selector}") + break + except TimeoutException: + log.debug(f"Pane selector '{pane_selector}' not found") + continue + + if not pane: + log.warning("Could not find any scrollable pane. Page structure might have changed.") return False pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen)) @@ -1094,6 +1237,11 @@ class GoogleReviewsScraper: max_attempts = 10 # Limit the number of attempts to find reviews attempts = 0 + + # In translation mode, track total unique reviews to detect when we stop finding new ones + all_review_ids_seen = set() + last_unique_count = 0 + no_new_reviews_count = 0 while attempts < max_attempts: try: @@ -1102,32 +1250,106 @@ class GoogleReviewsScraper: # Check for valid cards if len(cards) == 0: - log.debug("No review cards found in this iteration") - attempts += 1 - # Try scrolling anyway - driver.execute_script(scroll_script) - time.sleep(1) - continue + log.info(f"No review cards found in iteration {attempts + 1} using selector '{CARD_SEL}'") + + # Try alternative selectors + alternative_selectors = [ + 'div[data-review-id]', + '.jftiEf[data-review-id]', + '[data-review-id]' + ] + + for alt_sel in alternative_selectors: + alt_cards = pane.find_elements(By.CSS_SELECTOR, alt_sel) + log.info(f"Alternative selector '{alt_sel}': Found {len(alt_cards)} cards") + if alt_cards: + cards = alt_cards + break + + if len(cards) == 0: + # If no cards found in pane, try searching the entire page + log.info("No cards found in pane, searching entire page...") + page_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]') + log.info(f"Found {len(page_cards)} review cards on entire page") + + if page_cards: + cards = page_cards + log.info("Using review cards found on entire page") + else: + attempts += 1 + # Try scrolling anyway + driver.execute_script(scroll_script) + time.sleep(1) + continue + else: + log.info(f"Found {len(cards)} review cards in iteration {attempts + 1}") for c in cards: try: cid = c.get_attribute("data-review-id") - if not cid or cid in seen or cid in processed_ids: - if stop_on_match and cid and (cid in seen or cid in processed_ids): - idle = 999 - break + if not cid: continue - fresh_cards.append(c) + + # In translation mode, process all cards even if seen before + if self.append_translations: + # In translation mode, we process all reviews to add potential translations + # We don't use processed_ids to track, so process all cards + if cid in seen: + log.debug(f"Translation mode: Processing {cid} again (was seen before, adding translation)") + else: + log.debug(f"Translation mode: Processing {cid} (new review)") + fresh_cards.append(c) + else: + # Normal mode: skip seen reviews + if cid in seen or cid in processed_ids: + if stop_on_match and cid and (cid in seen or cid in processed_ids): + idle = 999 + break + continue + fresh_cards.append(c) except StaleElementReferenceException: continue except Exception as e: log.debug(f"Error getting review ID: {e}") continue + # In translation mode, track all unique review IDs to detect when we've seen all reviews + if self.append_translations: + current_review_ids = set() + for c in cards: + try: + cid = c.get_attribute("data-review-id") + if cid: + current_review_ids.add(cid) + all_review_ids_seen.add(cid) + except: + continue + + # Check if we found new unique reviews + current_unique_count = len(all_review_ids_seen) + if current_unique_count == last_unique_count: + no_new_reviews_count += 1 + log.info(f"Translation mode: No new reviews found ({no_new_reviews_count}/5) - total unique: {current_unique_count}") + else: + no_new_reviews_count = 0 + log.info(f"Translation mode: Found new reviews - total unique: {current_unique_count} (was {last_unique_count})") + + last_unique_count = current_unique_count + + # If we haven't found new reviews for 5 iterations, we're done + if no_new_reviews_count >= 5: + log.info("Translation mode: No new reviews found for 5 iterations - stopping") + break + + # Log how many fresh cards we found + log.info(f"Found {len(fresh_cards)} fresh cards out of {len(cards)} total cards (translation_mode={self.append_translations})") + for card in fresh_cards: try: raw = RawReview.from_card(card) - processed_ids.add(raw.id) # Track this ID to avoid re-processing + # In translation mode, don't add to processed_ids to allow re-processing + if not self.append_translations: + processed_ids.add(raw.id) # Track this ID to avoid re-processing except StaleElementReferenceException: continue except Exception: @@ -1136,22 +1358,37 @@ class GoogleReviewsScraper: try: raw_id = card.get_attribute("data-review-id") or "" raw = RawReview(id=raw_id, text="", lang="und") - processed_ids.add(raw_id) + # In translation mode, don't add to processed_ids to allow re-processing + if not self.append_translations: + processed_ids.add(raw_id) except StaleElementReferenceException: continue - docs[raw.id] = merge_review(docs.get(raw.id), raw) + # Use translation-aware merge if translation mode is enabled + if self.append_translations: + docs[raw.id] = merge_review_with_translation(docs.get(raw.id), raw, append_translations=True) + else: + docs[raw.id] = merge_review(docs.get(raw.id), raw) seen.add(raw.id) pbar.update(1) idle = 0 attempts = 0 # Reset attempts counter when we successfully process a review - if idle >= 3: + # In translation mode, be more patient before giving up + max_idle = 10 if self.append_translations else 3 + if idle >= max_idle: + log.info(f"Stopping after {max_idle} idle iterations") break if not fresh_cards: idle += 1 attempts += 1 + # In translation mode, log why we're not finding fresh cards + if self.append_translations: + log.debug(f"No fresh cards in translation mode - idle: {idle}/{max_idle}, attempts: {attempts}") + else: + # Reset idle counter when we have fresh cards + idle = 0 # Use JavaScript for smoother scrolling try: @@ -1239,7 +1476,7 @@ class GoogleReviewsScraper: # from tqdm import tqdm # # from modules.models import RawReview -# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review +# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation # # # Logger # log = logging.getLogger("scraper") @@ -1702,7 +1939,7 @@ class GoogleReviewsScraper: # # from tqdm import tqdm # # # # from modules.models import RawReview -# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review +# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation # # from modules.utils import click_if # # # # # Logger