diff --git a/modules/fast_scraper.py b/modules/fast_scraper.py index 002e14b..cdb7cb4 100644 --- a/modules/fast_scraper.py +++ b/modules/fast_scraper.py @@ -49,26 +49,29 @@ def check_no_reviews_early(driver) -> tuple[bool, str]: return True, f"Found 'no reviews' message: '{pattern}'" # Check if review count is explicitly 0 + # IMPORTANT: Be very specific to avoid false positives! review_count_check = driver.execute_script(""" - // Look for review count indicators + // Only check for EXACT "0 reviews" patterns, not loose matches const patterns = [ - /0\\s*(?:review|reseña|avis|recensione|Bewertung|レビュー)/i, - /\\(0\\)/, - /review.*0/i, - /0.*review/i + /^0\\s+reviews?$/im, // Exactly "0 reviews" on its own line + /\\(0\\s+reviews?\\)/i, // "(0 reviews)" + /\\b0\\s+reviews?\\b/i // "0 reviews" as a complete phrase ]; const text = document.body.innerText; - for (let pattern of patterns) { - if (pattern.test(text)) { - return 'Found 0 reviews indicator'; - } - } - // Check for aria-labels indicating no reviews - const elements = document.querySelectorAll('[aria-label*="0 review" i], [aria-label*="no review" i]'); - if (elements.length > 0) { - return 'Found aria-label with 0 reviews'; + // Split into lines and check each line independently to avoid false positives + const lines = text.split('\\n'); + for (let line of lines) { + const trimmed = line.trim(); + for (let pattern of patterns) { + if (pattern.test(trimmed)) { + // Double-check: line should be short (not a review text itself) + if (trimmed.length < 50) { + return 'Found explicit "0 reviews" text: ' + trimmed; + } + } + } } return null; @@ -287,11 +290,22 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]: } // STRATEGY 2: Structural matching for unknown page layouts + // IMPORTANT: Search only within the reviews pane, not the entire page! if (!elements || elements.length === 0) { console.log('Known selectors failed, trying structural matching...'); - // Find all divs that LOOK like reviews (have review structure) - const allDivs = document.querySelectorAll('div'); + // Find the reviews pane first + const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || + document.querySelector('div.m6QErb') || + document.querySelector('div[role="main"]'); + + if (!pane) { + console.warn('No reviews pane found'); + return []; + } + + // Find all divs that LOOK like reviews (have review structure) WITHIN the pane + const allDivs = pane.querySelectorAll('div'); const reviewElements = []; for (let div of allDivs) { @@ -302,7 +316,7 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]: const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); - const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año)/i); + const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|dienų|savaitės)/i); // Must have at least author, rating, and text to be a review const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; @@ -317,22 +331,28 @@ def extract_all_reviews_js(driver) -> List[Dict[str, Any]]: } } - // STRATEGY 3: Try role="article" as last resort + // STRATEGY 3: Try role="article" as last resort (within pane) if (!elements || elements.length === 0) { - const articles = document.querySelectorAll('[role="article"]'); - const validArticles = []; + const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || + document.querySelector('div.m6QErb') || + document.querySelector('div[role="main"]'); - for (let article of articles) { - const hasRating = article.querySelector('[aria-label*="star" i]'); - const hasText = article.textContent.length > 30; - if (hasRating && hasText) { - validArticles.push(article); + if (pane) { + const articles = pane.querySelectorAll('[role="article"]'); + const validArticles = []; + + for (let article of articles) { + const hasRating = article.querySelector('[aria-label*="star" i]'); + const hasText = article.textContent.length > 30; + if (hasRating && hasText) { + validArticles.push(article); + } } - } - if (validArticles.length > 0) { - elements = validArticles; - console.log('Found', validArticles.length, 'reviews using role=article'); + if (validArticles.length > 0) { + elements = validArticles; + console.log('Found', validArticles.length, 'reviews using role=article'); + } } } @@ -536,13 +556,19 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 # Force English locale for consistent date parsing # English gives cleaner date formats: "3 months ago" vs "Hace 3 meses" + # Store original URL in case we need to retry without locale override + original_url = url + locale_override_applied = False + if 'hl=' in url: # Replace existing locale url = url.replace('hl=es', 'hl=en').replace('hl=pt', 'hl=en').replace('hl=fr', 'hl=en') + locale_override_applied = True else: # Add English locale parameter separator = '&' if '?' in url else '?' url = f"{url}{separator}hl=en" + locale_override_applied = True log.info(f"Using English locale (hl=en) for consistent date parsing") @@ -628,7 +654,89 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 continue if not reviews_tab_clicked: - log.warning("Could not find reviews tab, continuing anyway") + log.warning("Could not find reviews tab with hl=en locale") + + # FALLBACK: If locale override was applied and tab not found, + # retry without locale override (fixes regional pages where hl=en breaks tabs) + if locale_override_applied: + log.info("Retrying without locale override to find reviews tab...") + + # Reload page with original URL (no hl=en) + driver.get(original_url) + time.sleep(1.5) + + # Handle GDPR again if needed + if 'consent.google.com' in driver.current_url: + try: + form_btns = driver.find_elements(By.CSS_SELECTOR, 'form button') + for btn in form_btns: + btn_text = (btn.text or '').lower() + if any(keyword in btn_text for keyword in ['accept', 'aceptar', 'priim', 'принять', 'accepter']): + log.info(f"Clicking GDPR consent: {btn.text}") + btn.click() + time.sleep(2) + break + else: + if len(form_btns) >= 2: + log.info("Using fallback: clicking second form button") + form_btns[1].click() + time.sleep(2) + except Exception as e: + log.warning(f"GDPR consent handling failed: {e}") + + # Dismiss cookie banner + try: + cookie_btns = driver.find_elements(By.CSS_SELECTOR, + 'button[aria-label*="Accept" i],button[aria-label*="Aceptar" i]') + if cookie_btns: + cookie_btns[0].click() + time.sleep(0.3) + except: + pass + + # Try to find reviews tab with multilingual keywords + multilingual_keywords = [ + 'review', 'reviews', # English + 'reseña', 'reseñas', # Spanish + 'avis', # French + 'bewertung', 'bewertungen', # German + 'recensione', 'recensioni', # Italian + 'レビュー', # Japanese + 'avaliação', 'avaliações', # Portuguese + 'отзыв', 'отзывы', # Russian + 'atsiliepimai', 'atsiliepi', # Lithuanian + 'ulasan', # Indonesian + '리뷰' # Korean + ] + + for attempt in range(3): + if reviews_tab_clicked: + break + + time.sleep(0.5) + + for selector in ['button[role="tab"]', '.LRkQ2', 'button']: + try: + tabs = driver.find_elements(By.CSS_SELECTOR, selector) + for tab in tabs: + text = (tab.text or '').lower() + aria = (tab.get_attribute('aria-label') or '').lower() + + if any(kw in text or kw in aria for kw in multilingual_keywords): + log.info(f"Clicking reviews tab (native locale): {tab.text or aria[:30]}") + driver.execute_script("arguments[0].click();", tab) + time.sleep(1.5) + reviews_tab_clicked = True + break + + if reviews_tab_clicked: + break + except Exception as e: + log.debug(f"Native locale tab search attempt {attempt+1} with {selector}: {e}") + continue + + if not reviews_tab_clicked: + log.warning("Could not find reviews tab even without locale override") # Wait for reviews section to load time.sleep(2) @@ -733,8 +841,17 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 // STRATEGY 2: Structural pattern matching (robust, class-agnostic) // Find containers that LOOK like reviews (have author + rating + text structure) + // IMPORTANT: Search only within the reviews pane, not the entire page! const findReviewsByStructure = () => { - const allDivs = document.querySelectorAll('div'); + // Find the reviews pane first + const pane = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || + document.querySelector('div.m6QErb') || + document.querySelector('div[role="main"]'); + + if (!pane) return 0; + + // Search only within the pane + const allDivs = pane.querySelectorAll('div'); let reviewCount = 0; for (let div of allDivs) { @@ -749,7 +866,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 const hasAuthor = div.querySelector('[aria-label*="photo" i], img[src*="photo"], img[src*="avatar"]'); const hasRating = div.querySelector('[aria-label*="star" i], [aria-label*="rating" i], span[role="img"]'); const hasText = Array.from(div.querySelectorAll('span')).some(s => s.textContent.length > 20); - const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année)/i); + const hasDate = div.textContent.match(/\\d+\\s*(day|week|month|year|día|semana|mes|año|jour|mois|année|dienų|savaitės)/i); // If it has at least 3 of these indicators, it's likely a review const indicators = [hasAuthor, hasRating, hasText, hasDate].filter(Boolean).length; @@ -761,19 +878,24 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999 return reviewCount > 0 ? reviewCount : 0; }; - // STRATEGY 3: Look for role="article" with review-like content - const articles = document.querySelectorAll('[role="article"]'); - if (articles.length > 0) { - let validArticles = 0; - for (let article of articles) { - // Check if article looks like a review (has rating + text) - const hasRating = article.querySelector('[aria-label*="star" i]'); - const hasText = article.textContent.length > 30; - if (hasRating && hasText) { - validArticles++; + // STRATEGY 3: Look for role="article" with review-like content (within pane) + const pane3 = document.querySelector('div.m6QErb.WNBkOb.XiKgde') || + document.querySelector('div.m6QErb') || + document.querySelector('div[role="main"]'); + if (pane3) { + const articles = pane3.querySelectorAll('[role="article"]'); + if (articles.length > 0) { + let validArticles = 0; + for (let article of articles) { + // Check if article looks like a review (has rating + text) + const hasRating = article.querySelector('[aria-label*="star" i]'); + const hasText = article.textContent.length > 30; + if (hasRating && hasText) { + validArticles++; + } } + if (validArticles > 0) return validArticles; } - if (validArticles > 0) return validArticles; } // Try structural matching as last resort