Fixed the issue with English localization
This commit is contained in:
@@ -8,8 +8,10 @@ from typing import Dict, Any
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging - can be overridden by environment variable
|
||||||
logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(message)s")
|
import os
|
||||||
|
log_level = getattr(logging, os.environ.get('LOG_LEVEL', 'INFO').upper(), logging.INFO)
|
||||||
|
logging.basicConfig(level=log_level, format="[%(asctime)s] %(levelname)s: %(message)s")
|
||||||
log = logging.getLogger("scraper")
|
log = logging.getLogger("scraper")
|
||||||
|
|
||||||
# Default configuration path
|
# Default configuration path
|
||||||
|
|||||||
@@ -317,3 +317,33 @@ def merge_review(existing: Dict[str, Any] | None, raw: RawReview) -> Dict[str, A
|
|||||||
existing["last_modified_date"] = get_current_iso_date()
|
existing["last_modified_date"] = get_current_iso_date()
|
||||||
|
|
||||||
return existing
|
return existing
|
||||||
|
|
||||||
|
|
||||||
|
def merge_review_with_translation(existing: Dict[str, Any] | None, raw: RawReview, append_translations: bool = False) -> Dict[str, Any]:
    """
    Merge a scraped review into its stored document, optionally adding a translation.

    The base merge is always delegated to ``merge_review``. When
    ``append_translations`` is true, ``existing`` is a non-empty document and
    ``raw.text`` is non-empty, the review text is additionally stored under
    ``description[raw.lang]``, the owner response (if any) is stored under
    ``owner_responses[<detected language>]``, and a ``translation_history``
    entry is recorded.

    A history entry is only recorded when the text stored for ``raw.lang``
    was absent or different before this call. The scraper re-processes
    already-seen cards in translation mode, so appending unconditionally
    would add an identical entry for the same review on every pass.

    Args:
        existing: Previously stored review document, or ``None`` for a new review.
        raw: Freshly scraped review.
        append_translations: Enable the translation-specific handling above.

    Returns:
        The merged review document returned by ``merge_review``, with the
        translation fields applied when translation mode is active.
    """
    # Snapshot what is stored for this language *before* delegating:
    # merge_review may update ``existing`` in place, after which a new
    # translation could no longer be told apart from a re-scrape.
    previous_text = None
    if existing:
        previous_text = (existing.get("description") or {}).get(raw.lang)

    # Use the standard merge for the base functionality
    merged = merge_review(existing, raw)

    if not (append_translations and existing and raw.text):
        return merged

    # In translation mode, always store the scraped language version,
    # even if we already have content for this review. setdefault guards
    # against documents that lack a "description" mapping.
    merged.setdefault("description", {})[raw.lang] = raw.text

    # Also merge owner responses in translation mode
    if raw.owner_text:
        owner_lang = detect_lang(raw.owner_text)
        merged.setdefault("owner_responses", {})[owner_lang] = {
            "text": raw.owner_text,
        }

    # Record when this translation was added, but only if it is new or its
    # text changed, so repeated passes over the same card stay idempotent.
    if previous_text != raw.text:
        merged.setdefault("translation_history", []).append({
            "language": raw.lang,
            "added_date": get_current_iso_date(),
            "source": "regional_scraping"
        })

    return merged
|
||||||
|
|||||||
@@ -28,6 +28,9 @@ class RawReview:
|
|||||||
owner_text: str = ""
|
owner_text: str = ""
|
||||||
review_date: str = "" # ISO format date
|
review_date: str = "" # ISO format date
|
||||||
|
|
||||||
|
# Translation fields
|
||||||
|
translations: dict = field(default_factory=dict) # Store translations by language code
|
||||||
|
|
||||||
# CSS Selectors for review elements
|
# CSS Selectors for review elements
|
||||||
MORE_BTN = "button.kyuRq"
|
MORE_BTN = "button.kyuRq"
|
||||||
LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
|
LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ from selenium.webdriver.support import expected_conditions as EC
|
|||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation
|
||||||
from modules.models import RawReview
|
from modules.models import RawReview
|
||||||
|
|
||||||
# Logger
|
# Logger
|
||||||
@@ -169,6 +169,11 @@ class GoogleReviewsScraper:
|
|||||||
self.backup_to_json = config.get("backup_to_json", True)
|
self.backup_to_json = config.get("backup_to_json", True)
|
||||||
self.overwrite_existing = config.get("overwrite_existing", False)
|
self.overwrite_existing = config.get("overwrite_existing", False)
|
||||||
|
|
||||||
|
# Translation feature settings
|
||||||
|
self.append_translations = config.get("append_translations", False)
|
||||||
|
self.translation_language = config.get("translation_language", "auto")
|
||||||
|
self.force_full_scan = config.get("force_full_scan", False)
|
||||||
|
|
||||||
def setup_driver(self, headless: bool) -> Chrome:
|
def setup_driver(self, headless: bool) -> Chrome:
|
||||||
"""
|
"""
|
||||||
Set up and configure Chrome driver with flexibility for different environments.
|
Set up and configure Chrome driver with flexibility for different environments.
|
||||||
@@ -276,8 +281,13 @@ class GoogleReviewsScraper:
|
|||||||
try:
|
try:
|
||||||
# Strategy 1: Data attribute detection (most reliable across languages)
|
# Strategy 1: Data attribute detection (most reliable across languages)
|
||||||
tab_index = tab.get_attribute("data-tab-index")
|
tab_index = tab.get_attribute("data-tab-index")
|
||||||
if tab_index == "1" or tab_index == "reviews":
|
if tab_index in ["1", "2", "reviews"]: # Reviews can be index 1 or 2 depending on layout
|
||||||
return True
|
# Double-check this is actually a reviews tab by checking text content
|
||||||
|
aria_label = (tab.get_attribute("aria-label") or "").lower()
|
||||||
|
tab_text = (tab.text or "").lower()
|
||||||
|
if any(word.lower() in aria_label or word.lower() in tab_text for word in REVIEW_WORDS):
|
||||||
|
log.debug(f"Found reviews tab by data-tab-index: {tab_index}")
|
||||||
|
return True
|
||||||
|
|
||||||
# Strategy 2: Role and aria attributes (accessibility detection)
|
# Strategy 2: Role and aria attributes (accessibility detection)
|
||||||
role = tab.get_attribute("role")
|
role = tab.get_attribute("role")
|
||||||
@@ -285,20 +295,27 @@ class GoogleReviewsScraper:
|
|||||||
aria_label = (tab.get_attribute("aria-label") or "").lower()
|
aria_label = (tab.get_attribute("aria-label") or "").lower()
|
||||||
|
|
||||||
# Many review tabs have role="tab" and data attributes
|
# Many review tabs have role="tab" and data attributes
|
||||||
if role == "tab" and any(word in aria_label for word in REVIEW_WORDS):
|
if role == "tab" and any(word.lower() in aria_label for word in REVIEW_WORDS):
|
||||||
|
log.debug(f"Found reviews tab by aria-label: {aria_label}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Strategy 3: Text content detection (multiple sources)
|
# Strategy 3: Text content detection (multiple sources)
|
||||||
|
tab_text = tab.text.lower() if tab.text else ""
|
||||||
|
inner_html = tab.get_attribute("innerHTML").lower() or ""
|
||||||
|
text_content = tab.get_attribute("textContent").lower() or ""
|
||||||
|
|
||||||
sources = [
|
sources = [
|
||||||
tab.text.lower() if tab.text else "", # Direct text
|
tab_text, # Direct text
|
||||||
aria_label, # ARIA label
|
aria_label, # ARIA label
|
||||||
tab.get_attribute("innerHTML").lower() or "", # Inner HTML
|
inner_html, # Inner HTML
|
||||||
tab.get_attribute("textContent").lower() or "" # Text content
|
text_content # Text content
|
||||||
]
|
]
|
||||||
|
|
||||||
# Check all sources against our comprehensive keyword list
|
# Check all sources against our comprehensive keyword list
|
||||||
for source in sources:
|
for i, source in enumerate(sources):
|
||||||
if any(word in source for word in REVIEW_WORDS):
|
if any(word.lower() in source for word in REVIEW_WORDS):
|
||||||
|
source_names = ["text", "aria-label", "innerHTML", "textContent"]
|
||||||
|
log.debug(f"Found reviews tab by {source_names[i]}: '{source}' (contains review word)")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Strategy 4: Nested element detection
|
# Strategy 4: Nested element detection
|
||||||
@@ -309,8 +326,9 @@ class GoogleReviewsScraper:
|
|||||||
child_text = child.text.lower() if child.text else ""
|
child_text = child.text.lower() if child.text else ""
|
||||||
child_content = child.get_attribute("textContent").lower() or ""
|
child_content = child.get_attribute("textContent").lower() or ""
|
||||||
|
|
||||||
if any(word in child_text for word in REVIEW_WORDS) or any(
|
if any(word.lower() in child_text for word in REVIEW_WORDS) or any(
|
||||||
word in child_content for word in REVIEW_WORDS):
|
word.lower() in child_content for word in REVIEW_WORDS):
|
||||||
|
log.debug(f"Found reviews tab by child element text: '{child_text}' or '{child_content}'")
|
||||||
return True
|
return True
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
@@ -321,14 +339,18 @@ class GoogleReviewsScraper:
|
|||||||
for attr in ["href", "data-href", "data-url", "data-target"]:
|
for attr in ["href", "data-href", "data-url", "data-target"]:
|
||||||
attr_value = (tab.get_attribute(attr) or "").lower()
|
attr_value = (tab.get_attribute(attr) or "").lower()
|
||||||
if attr_value and ("review" in attr_value or "rating" in attr_value):
|
if attr_value and ("review" in attr_value or "rating" in attr_value):
|
||||||
|
log.debug(f"Found reviews tab by {attr}: {attr_value}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Strategy 6: Class detection (some review tabs have specific classes)
|
# Strategy 6: Class detection (some review tabs have specific classes)
|
||||||
tab_class = tab.get_attribute("class") or ""
|
tab_class = tab.get_attribute("class") or ""
|
||||||
review_classes = ["review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve"]
|
review_classes = ["review", "reviews", "rating", "ratings", "comments", "feedback", "g4jrve"]
|
||||||
if any(cls in tab_class for cls in review_classes):
|
if any(cls in tab_class.lower() for cls in review_classes):
|
||||||
|
log.debug(f"Found reviews tab by class: {tab_class}")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# Log what we found for debugging
|
||||||
|
log.debug(f"Tab not identified as reviews tab - role: {role}, index: {tab_index}, aria-label: '{aria_label}', text: '{tab_text}', class: '{tab_class}'")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except StaleElementReferenceException:
|
except StaleElementReferenceException:
|
||||||
@@ -342,14 +364,89 @@ class GoogleReviewsScraper:
|
|||||||
Highly dynamic reviews tab detection and clicking with multiple fallback strategies.
|
Highly dynamic reviews tab detection and clicking with multiple fallback strategies.
|
||||||
Works across different languages, layouts, and browser environments.
|
Works across different languages, layouts, and browser environments.
|
||||||
"""
|
"""
|
||||||
max_timeout = 25 # Maximum seconds to try
|
max_timeout = 30 # Maximum seconds to try
|
||||||
end_time = time.time() + max_timeout
|
end_time = time.time() + max_timeout
|
||||||
attempts = 0
|
attempts = 0
|
||||||
|
|
||||||
|
# First, wait for the business panel to load
|
||||||
|
log.info("Waiting for business information panel to load...")
|
||||||
|
business_panel_loaded = False
|
||||||
|
panel_wait_end = time.time() + 15 # Wait up to 15 seconds for business panel
|
||||||
|
|
||||||
|
while time.time() < panel_wait_end:
|
||||||
|
try:
|
||||||
|
# Look for indicators that the business panel has loaded
|
||||||
|
business_indicators = [
|
||||||
|
# Business name or rating elements
|
||||||
|
'[role="main"] h1',
|
||||||
|
'[role="main"] .DUwDvf', # Business name class
|
||||||
|
'[role="main"] .F7nice', # Rating class
|
||||||
|
'[role="main"] .fontHeadlineSmall',
|
||||||
|
# Or any elements with review-related text
|
||||||
|
'//*[contains(translate(text(), "REVIEWS", "reviews"), "review")]',
|
||||||
|
]
|
||||||
|
|
||||||
|
for indicator in business_indicators[:-1]: # CSS selectors first
|
||||||
|
elements = driver.find_elements(By.CSS_SELECTOR, indicator)
|
||||||
|
if elements:
|
||||||
|
log.info(f"Business panel detected using selector: {indicator}")
|
||||||
|
business_panel_loaded = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not business_panel_loaded:
|
||||||
|
# Try XPath selector for review text
|
||||||
|
elements = driver.find_elements(By.XPATH, business_indicators[-1])
|
||||||
|
if elements:
|
||||||
|
log.info("Business panel detected using review text search")
|
||||||
|
business_panel_loaded = True
|
||||||
|
|
||||||
|
if business_panel_loaded:
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"Error checking for business panel: {e}")
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if business_panel_loaded:
|
||||||
|
log.info("Business panel loaded successfully")
|
||||||
|
# Give it a bit more time for tabs to appear
|
||||||
|
time.sleep(2)
|
||||||
|
else:
|
||||||
|
log.warning("Business panel may not have loaded completely, continuing with tab search...")
|
||||||
|
|
||||||
|
# If no business panel is detected, try URL manipulation to force reviews view
|
||||||
|
if not business_panel_loaded:
|
||||||
|
try:
|
||||||
|
current_url = driver.current_url
|
||||||
|
if "/place/" in current_url and "/reviews" not in current_url:
|
||||||
|
# Try to navigate directly to reviews by modifying URL
|
||||||
|
if "?" in current_url:
|
||||||
|
base_url = current_url.split("?")[0]
|
||||||
|
params = current_url.split("?")[1]
|
||||||
|
new_url = f"{base_url}/reviews?{params}"
|
||||||
|
else:
|
||||||
|
new_url = f"{current_url}/reviews"
|
||||||
|
|
||||||
|
log.info(f"Attempting to navigate directly to reviews: {new_url}")
|
||||||
|
driver.get(new_url)
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
# Check if this worked by looking for review content
|
||||||
|
review_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]')
|
||||||
|
if review_cards:
|
||||||
|
log.info("Successfully navigated to reviews page")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
log.debug(f"URL manipulation failed: {e}")
|
||||||
|
|
||||||
# Define different selectors to try in order of reliability
|
# Define different selectors to try in order of reliability
|
||||||
tab_selectors = [
|
tab_selectors = [
|
||||||
# Direct tab selectors
|
# Direct tab selectors - try both common indexes
|
||||||
'[data-tab-index="1"]', # Most common tab index
|
'[data-tab-index="1"]', # Common tab index for reviews
|
||||||
|
'[data-tab-index="2"]', # Alternative tab index for reviews
|
||||||
|
'button[role="tab"][data-tab-index="1"]', # Exact match from HTML
|
||||||
|
'button[role="tab"][data-tab-index="2"]', # Alternative exact match
|
||||||
'[role="tab"][data-tab-index]', # Any tab with index
|
'[role="tab"][data-tab-index]', # Any tab with index
|
||||||
'button[role="tab"]', # Button tabs
|
'button[role="tab"]', # Button tabs
|
||||||
'div[role="tab"]', # Div tabs
|
'div[role="tab"]', # Div tabs
|
||||||
@@ -364,6 +461,9 @@ class GoogleReviewsScraper:
|
|||||||
'button:contains("reviews")', # Button containing "reviews"
|
'button:contains("reviews")', # Button containing "reviews"
|
||||||
'div[role="tablist"] > *', # Any tab in a tab list
|
'div[role="tablist"] > *', # Any tab in a tab list
|
||||||
'div.m6QErb div[role="tablist"] > *', # Google Maps specific tablist
|
'div.m6QErb div[role="tablist"] > *', # Google Maps specific tablist
|
||||||
|
|
||||||
|
# Fallback selectors
|
||||||
|
'[role="tab"]', # Any tab element
|
||||||
]
|
]
|
||||||
|
|
||||||
# Record successful clicks for debugging
|
# Record successful clicks for debugging
|
||||||
@@ -437,23 +537,39 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
# If we reach here, try XPath as a last resort
|
# If we reach here, try XPath as a last resort
|
||||||
if time.time() <= end_time:
|
if time.time() <= end_time:
|
||||||
|
log.info("Trying XPath-based text matching for review tabs...")
|
||||||
for language_keyword in REVIEW_WORDS:
|
for language_keyword in REVIEW_WORDS:
|
||||||
try:
|
try:
|
||||||
# Try XPath contains text
|
# Try different XPath patterns for review text
|
||||||
xpath = f"//*[contains(text(), '{language_keyword}')]"
|
xpath_patterns = [
|
||||||
elements = driver.find_elements(By.XPATH, xpath)
|
f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]",
|
||||||
|
f"//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]",
|
||||||
|
f"//*[@role='tab' and contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]",
|
||||||
|
f"//*[contains(translate(@aria-label, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{language_keyword.lower()}')]",
|
||||||
|
]
|
||||||
|
|
||||||
for element in elements:
|
for xpath in xpath_patterns:
|
||||||
try:
|
try:
|
||||||
log.info(f"Trying XPath with keyword '{language_keyword}'")
|
elements = driver.find_elements(By.XPATH, xpath)
|
||||||
driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element)
|
for element in elements:
|
||||||
time.sleep(0.7)
|
try:
|
||||||
driver.execute_script("arguments[0].click();", element)
|
# Skip if it's a script tag or hidden element
|
||||||
time.sleep(1.5)
|
if element.tag_name.lower() in ['script', 'style', 'noscript']:
|
||||||
|
continue
|
||||||
|
if not element.is_displayed():
|
||||||
|
continue
|
||||||
|
|
||||||
if self.verify_reviews_tab_clicked(driver):
|
log.info(f"Trying XPath pattern with keyword '{language_keyword}': {xpath}")
|
||||||
log.info(f"Successfully clicked element with keyword '{language_keyword}'")
|
driver.execute_script("arguments[0].scrollIntoView({block:'center'});", element)
|
||||||
return True
|
time.sleep(0.7)
|
||||||
|
driver.execute_script("arguments[0].click();", element)
|
||||||
|
time.sleep(1.5)
|
||||||
|
|
||||||
|
if self.verify_reviews_tab_clicked(driver):
|
||||||
|
log.info(f"Successfully clicked element with keyword '{language_keyword}'")
|
||||||
|
return True
|
||||||
|
except:
|
||||||
|
continue
|
||||||
except:
|
except:
|
||||||
continue
|
continue
|
||||||
except:
|
except:
|
||||||
@@ -592,6 +708,13 @@ class GoogleReviewsScraper:
|
|||||||
for keyword in negative_keywords):
|
for keyword in negative_keywords):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Additional check - make sure this is actually a sort button for reviews
|
||||||
|
if ("sort" not in button_text.lower() and "sort" not in button_aria.lower() and
|
||||||
|
"סדר" not in button_text.lower() and "เรียง" not in button_text.lower() and
|
||||||
|
"ordenar" not in button_text.lower() and "trier" not in button_text.lower()):
|
||||||
|
log.debug(f"Button doesn't appear to be a sort button, skipping: '{button_text}' / '{button_aria}'")
|
||||||
|
continue
|
||||||
|
|
||||||
# Found a potential sort button
|
# Found a potential sort button
|
||||||
sort_button = element
|
sort_button = element
|
||||||
log.info(f"Found sort button with selector: {selector}")
|
log.info(f"Found sort button with selector: {selector}")
|
||||||
@@ -1033,7 +1156,13 @@ class GoogleReviewsScraper:
|
|||||||
sort_by = self.config.get("sort_by", "relevance")
|
sort_by = self.config.get("sort_by", "relevance")
|
||||||
stop_on_match = self.config.get("stop_on_match", False)
|
stop_on_match = self.config.get("stop_on_match", False)
|
||||||
|
|
||||||
|
# Override stop_on_match if translation mode is enabled
|
||||||
|
if self.append_translations or self.force_full_scan:
|
||||||
|
stop_on_match = False
|
||||||
|
log.info("Translation mode enabled - forcing full scan of all reviews")
|
||||||
|
|
||||||
log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
|
log.info(f"Starting scraper with settings: headless={headless}, sort_by={sort_by}")
|
||||||
|
log.info(f"Translation mode: append_translations={self.append_translations}, language={self.translation_language}")
|
||||||
log.info(f"URL: {url}")
|
log.info(f"URL: {url}")
|
||||||
|
|
||||||
# Initialize storage
|
# Initialize storage
|
||||||
@@ -1071,13 +1200,27 @@ class GoogleReviewsScraper:
|
|||||||
self.set_sort(driver, sort_by)
|
self.set_sort(driver, sort_by)
|
||||||
|
|
||||||
# Add a wait after setting sort to allow results to load
|
# Add a wait after setting sort to allow results to load
|
||||||
time.sleep(1)
|
time.sleep(3) # Increased wait time for reviews to load
|
||||||
|
|
||||||
# Use try-except to handle cases where the pane is not found
|
# Use try-except to handle cases where the pane is not found
|
||||||
try:
|
pane = None
|
||||||
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, PANE_SEL)))
|
pane_selectors = [
|
||||||
except TimeoutException:
|
PANE_SEL, # Original selector
|
||||||
log.warning("Could not find reviews pane. Page structure might have changed.")
|
'div[role="main"]', # Simpler main container
|
||||||
|
'body', # Ultimate fallback
|
||||||
|
]
|
||||||
|
|
||||||
|
for pane_selector in pane_selectors:
|
||||||
|
try:
|
||||||
|
pane = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, pane_selector)))
|
||||||
|
log.info(f"Found scrollable pane using selector: {pane_selector}")
|
||||||
|
break
|
||||||
|
except TimeoutException:
|
||||||
|
log.debug(f"Pane selector '{pane_selector}' not found")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not pane:
|
||||||
|
log.warning("Could not find any scrollable pane. Page structure might have changed.")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||||||
@@ -1095,6 +1238,11 @@ class GoogleReviewsScraper:
|
|||||||
max_attempts = 10 # Limit the number of attempts to find reviews
|
max_attempts = 10 # Limit the number of attempts to find reviews
|
||||||
attempts = 0
|
attempts = 0
|
||||||
|
|
||||||
|
# In translation mode, track total unique reviews to detect when we stop finding new ones
|
||||||
|
all_review_ids_seen = set()
|
||||||
|
last_unique_count = 0
|
||||||
|
no_new_reviews_count = 0
|
||||||
|
|
||||||
while attempts < max_attempts:
|
while attempts < max_attempts:
|
||||||
try:
|
try:
|
||||||
cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
||||||
@@ -1102,32 +1250,106 @@ class GoogleReviewsScraper:
|
|||||||
|
|
||||||
# Check for valid cards
|
# Check for valid cards
|
||||||
if len(cards) == 0:
|
if len(cards) == 0:
|
||||||
log.debug("No review cards found in this iteration")
|
log.info(f"No review cards found in iteration {attempts + 1} using selector '{CARD_SEL}'")
|
||||||
attempts += 1
|
|
||||||
# Try scrolling anyway
|
# Try alternative selectors
|
||||||
driver.execute_script(scroll_script)
|
alternative_selectors = [
|
||||||
time.sleep(1)
|
'div[data-review-id]',
|
||||||
continue
|
'.jftiEf[data-review-id]',
|
||||||
|
'[data-review-id]'
|
||||||
|
]
|
||||||
|
|
||||||
|
for alt_sel in alternative_selectors:
|
||||||
|
alt_cards = pane.find_elements(By.CSS_SELECTOR, alt_sel)
|
||||||
|
log.info(f"Alternative selector '{alt_sel}': Found {len(alt_cards)} cards")
|
||||||
|
if alt_cards:
|
||||||
|
cards = alt_cards
|
||||||
|
break
|
||||||
|
|
||||||
|
if len(cards) == 0:
|
||||||
|
# If no cards found in pane, try searching the entire page
|
||||||
|
log.info("No cards found in pane, searching entire page...")
|
||||||
|
page_cards = driver.find_elements(By.CSS_SELECTOR, 'div[data-review-id]')
|
||||||
|
log.info(f"Found {len(page_cards)} review cards on entire page")
|
||||||
|
|
||||||
|
if page_cards:
|
||||||
|
cards = page_cards
|
||||||
|
log.info("Using review cards found on entire page")
|
||||||
|
else:
|
||||||
|
attempts += 1
|
||||||
|
# Try scrolling anyway
|
||||||
|
driver.execute_script(scroll_script)
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
log.info(f"Found {len(cards)} review cards in iteration {attempts + 1}")
|
||||||
|
|
||||||
for c in cards:
|
for c in cards:
|
||||||
try:
|
try:
|
||||||
cid = c.get_attribute("data-review-id")
|
cid = c.get_attribute("data-review-id")
|
||||||
if not cid or cid in seen or cid in processed_ids:
|
if not cid:
|
||||||
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
|
||||||
idle = 999
|
|
||||||
break
|
|
||||||
continue
|
continue
|
||||||
fresh_cards.append(c)
|
|
||||||
|
# In translation mode, process all cards even if seen before
|
||||||
|
if self.append_translations:
|
||||||
|
# In translation mode, we process all reviews to add potential translations
|
||||||
|
# We don't use processed_ids to track, so process all cards
|
||||||
|
if cid in seen:
|
||||||
|
log.debug(f"Translation mode: Processing {cid} again (was seen before, adding translation)")
|
||||||
|
else:
|
||||||
|
log.debug(f"Translation mode: Processing {cid} (new review)")
|
||||||
|
fresh_cards.append(c)
|
||||||
|
else:
|
||||||
|
# Normal mode: skip seen reviews
|
||||||
|
if cid in seen or cid in processed_ids:
|
||||||
|
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
||||||
|
idle = 999
|
||||||
|
break
|
||||||
|
continue
|
||||||
|
fresh_cards.append(c)
|
||||||
except StaleElementReferenceException:
|
except StaleElementReferenceException:
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.debug(f"Error getting review ID: {e}")
|
log.debug(f"Error getting review ID: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# In translation mode, track all unique review IDs to detect when we've seen all reviews
|
||||||
|
if self.append_translations:
|
||||||
|
current_review_ids = set()
|
||||||
|
for c in cards:
|
||||||
|
try:
|
||||||
|
cid = c.get_attribute("data-review-id")
|
||||||
|
if cid:
|
||||||
|
current_review_ids.add(cid)
|
||||||
|
all_review_ids_seen.add(cid)
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check if we found new unique reviews
|
||||||
|
current_unique_count = len(all_review_ids_seen)
|
||||||
|
if current_unique_count == last_unique_count:
|
||||||
|
no_new_reviews_count += 1
|
||||||
|
log.info(f"Translation mode: No new reviews found ({no_new_reviews_count}/5) - total unique: {current_unique_count}")
|
||||||
|
else:
|
||||||
|
no_new_reviews_count = 0
|
||||||
|
log.info(f"Translation mode: Found new reviews - total unique: {current_unique_count} (was {last_unique_count})")
|
||||||
|
|
||||||
|
last_unique_count = current_unique_count
|
||||||
|
|
||||||
|
# If we haven't found new reviews for 5 iterations, we're done
|
||||||
|
if no_new_reviews_count >= 5:
|
||||||
|
log.info("Translation mode: No new reviews found for 5 iterations - stopping")
|
||||||
|
break
|
||||||
|
|
||||||
|
# Log how many fresh cards we found
|
||||||
|
log.info(f"Found {len(fresh_cards)} fresh cards out of {len(cards)} total cards (translation_mode={self.append_translations})")
|
||||||
|
|
||||||
for card in fresh_cards:
|
for card in fresh_cards:
|
||||||
try:
|
try:
|
||||||
raw = RawReview.from_card(card)
|
raw = RawReview.from_card(card)
|
||||||
processed_ids.add(raw.id) # Track this ID to avoid re-processing
|
# In translation mode, don't add to processed_ids to allow re-processing
|
||||||
|
if not self.append_translations:
|
||||||
|
processed_ids.add(raw.id) # Track this ID to avoid re-processing
|
||||||
except StaleElementReferenceException:
|
except StaleElementReferenceException:
|
||||||
continue
|
continue
|
||||||
except Exception:
|
except Exception:
|
||||||
@@ -1136,22 +1358,37 @@ class GoogleReviewsScraper:
|
|||||||
try:
|
try:
|
||||||
raw_id = card.get_attribute("data-review-id") or ""
|
raw_id = card.get_attribute("data-review-id") or ""
|
||||||
raw = RawReview(id=raw_id, text="", lang="und")
|
raw = RawReview(id=raw_id, text="", lang="und")
|
||||||
processed_ids.add(raw_id)
|
# In translation mode, don't add to processed_ids to allow re-processing
|
||||||
|
if not self.append_translations:
|
||||||
|
processed_ids.add(raw_id)
|
||||||
except StaleElementReferenceException:
|
except StaleElementReferenceException:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
docs[raw.id] = merge_review(docs.get(raw.id), raw)
|
# Use translation-aware merge if translation mode is enabled
|
||||||
|
if self.append_translations:
|
||||||
|
docs[raw.id] = merge_review_with_translation(docs.get(raw.id), raw, append_translations=True)
|
||||||
|
else:
|
||||||
|
docs[raw.id] = merge_review(docs.get(raw.id), raw)
|
||||||
seen.add(raw.id)
|
seen.add(raw.id)
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
idle = 0
|
idle = 0
|
||||||
attempts = 0 # Reset attempts counter when we successfully process a review
|
attempts = 0 # Reset attempts counter when we successfully process a review
|
||||||
|
|
||||||
if idle >= 3:
|
# In translation mode, be more patient before giving up
|
||||||
|
max_idle = 10 if self.append_translations else 3
|
||||||
|
if idle >= max_idle:
|
||||||
|
log.info(f"Stopping after {max_idle} idle iterations")
|
||||||
break
|
break
|
||||||
|
|
||||||
if not fresh_cards:
|
if not fresh_cards:
|
||||||
idle += 1
|
idle += 1
|
||||||
attempts += 1
|
attempts += 1
|
||||||
|
# In translation mode, log why we're not finding fresh cards
|
||||||
|
if self.append_translations:
|
||||||
|
log.debug(f"No fresh cards in translation mode - idle: {idle}/{max_idle}, attempts: {attempts}")
|
||||||
|
else:
|
||||||
|
# Reset idle counter when we have fresh cards
|
||||||
|
idle = 0
|
||||||
|
|
||||||
# Use JavaScript for smoother scrolling
|
# Use JavaScript for smoother scrolling
|
||||||
try:
|
try:
|
||||||
@@ -1239,7 +1476,7 @@ class GoogleReviewsScraper:
|
|||||||
# from tqdm import tqdm
|
# from tqdm import tqdm
|
||||||
#
|
#
|
||||||
# from modules.models import RawReview
|
# from modules.models import RawReview
|
||||||
# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
# from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation
|
||||||
#
|
#
|
||||||
# # Logger
|
# # Logger
|
||||||
# log = logging.getLogger("scraper")
|
# log = logging.getLogger("scraper")
|
||||||
@@ -1702,7 +1939,7 @@ class GoogleReviewsScraper:
|
|||||||
# # from tqdm import tqdm
|
# # from tqdm import tqdm
|
||||||
# #
|
# #
|
||||||
# # from modules.models import RawReview
|
# # from modules.models import RawReview
|
||||||
# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
# # from modules.data_storage import MongoDBStorage, JSONStorage, merge_review, merge_review_with_translation
|
||||||
# # from modules.utils import click_if
|
# # from modules.utils import click_if
|
||||||
# #
|
# #
|
||||||
# # # Logger
|
# # # Logger
|
||||||
|
|||||||
Reference in New Issue
Block a user