Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -0,0 +1,266 @@
"""
Session Manager for Google Reviews Scraper
Manages browser sessions between validation and scraping phases.
Allows reusing the same browser instance to avoid duplicate navigation.
Usage:
    # During validation
    session_id = session_manager.create_session(driver, url, business_info, total_reviews)
    return {"session_id": session_id, "business_info": business_info}
    # During scraping (with session_id from validation)
    session = session_manager.get_session(session_id)
    if session:
        driver = session.driver
        # Continue from where validation left off
"""
import uuid
import time
import threading
from typing import Optional, Dict, Any
from dataclasses import dataclass, field
from datetime import datetime
@dataclass
class BrowserSession:
    """
    A browser session that passed validation and is waiting to be scraped.

    Instances are plain records: the manager that owns them reads and
    rewrites ``expires_at`` and ``state`` in place.
    """

    # Identifier handed back to the caller after validation.
    session_id: str
    # WebDriver instance kept open between validation and scraping.
    driver: Any
    # The validated Google Maps URL.
    url: str
    # Business information extracted during validation.
    business_info: Dict[str, Any]
    # Total review count read from the page.
    total_reviews: int
    # Creation time, as a time.time() timestamp.
    created_at: float
    # Time after which the session is considered expired (time.time() scale).
    expires_at: float
    # Browser fingerprint settings used for this browser, if any.
    browser_fingerprint: Optional[Dict[str, Any]] = None
    # LogCapture instance associated with the session, if any.
    log_capture: Any = None
    # Lifecycle marker: validated -> scraping -> completed/expired.
    state: str = "validated"
class SessionManager:
"""
Manages browser sessions between validation and scraping.
Sessions have a TTL (default 5 minutes) after which they're automatically
cleaned up and the browser is closed.
"""
DEFAULT_TTL_SECONDS = 300 # 5 minutes
CLEANUP_INTERVAL_SECONDS = 30 # Check for expired sessions every 30s
def __init__(self, ttl_seconds: int = None):
self.ttl_seconds = ttl_seconds or self.DEFAULT_TTL_SECONDS
self._sessions: Dict[str, BrowserSession] = {}
self._lock = threading.RLock()
self._cleanup_thread: Optional[threading.Thread] = None
self._running = False
def start(self):
"""Start the background cleanup thread."""
if self._running:
return
self._running = True
self._cleanup_thread = threading.Thread(target=self._cleanup_loop, daemon=True)
self._cleanup_thread.start()
def stop(self):
"""Stop the background cleanup thread."""
self._running = False
if self._cleanup_thread:
self._cleanup_thread.join(timeout=5)
def _cleanup_loop(self):
"""Background loop to clean up expired sessions."""
while self._running:
try:
self._cleanup_expired()
except Exception as e:
print(f"[SessionManager] Cleanup error: {e}")
time.sleep(self.CLEANUP_INTERVAL_SECONDS)
def _cleanup_expired(self):
"""Remove expired sessions and close their browsers."""
now = time.time()
expired_ids = []
with self._lock:
for session_id, session in self._sessions.items():
if now > session.expires_at:
expired_ids.append(session_id)
for session_id in expired_ids:
self.release_session(session_id, reason="expired")
def create_session(
self,
driver: Any,
url: str,
business_info: Dict[str, Any],
total_reviews: int,
browser_fingerprint: Optional[Dict[str, Any]] = None,
log_capture: Any = None,
ttl_seconds: Optional[int] = None
) -> str:
"""
Create a new browser session after validation.
Args:
driver: WebDriver instance (positioned on Google Maps page)
url: The validated Google Maps URL
business_info: Extracted business information
total_reviews: Total review count from page
browser_fingerprint: Browser fingerprint settings used
log_capture: LogCapture instance for logging
ttl_seconds: Custom TTL for this session (default: 5 min)
Returns:
session_id: Unique identifier for this session
"""
session_id = str(uuid.uuid4())[:8] # Short ID for easier use
ttl = ttl_seconds or self.ttl_seconds
now = time.time()
session = BrowserSession(
session_id=session_id,
driver=driver,
url=url,
business_info=business_info,
total_reviews=total_reviews,
created_at=now,
expires_at=now + ttl,
browser_fingerprint=browser_fingerprint,
log_capture=log_capture,
state="validated"
)
with self._lock:
self._sessions[session_id] = session
print(f"[SessionManager] Created session {session_id} for {business_info.get('name', 'unknown')} (TTL: {ttl}s)")
return session_id
def get_session(self, session_id: str) -> Optional[BrowserSession]:
"""
Retrieve a session by ID.
Args:
session_id: The session identifier
Returns:
BrowserSession if found and not expired, None otherwise
"""
with self._lock:
session = self._sessions.get(session_id)
if not session:
print(f"[SessionManager] Session {session_id} not found")
return None
# Check if expired
if time.time() > session.expires_at:
print(f"[SessionManager] Session {session_id} expired")
self.release_session(session_id, reason="expired")
return None
return session
def claim_session(self, session_id: str) -> Optional[BrowserSession]:
"""
Claim a session for scraping (marks it as in-use).
Args:
session_id: The session identifier
Returns:
BrowserSession if successfully claimed, None otherwise
"""
with self._lock:
session = self.get_session(session_id)
if not session:
return None
if session.state != "validated":
print(f"[SessionManager] Session {session_id} already in state: {session.state}")
return None
session.state = "scraping"
# Extend TTL during scraping (1 hour max)
session.expires_at = time.time() + 3600
print(f"[SessionManager] Claimed session {session_id} for scraping")
return session
def release_session(self, session_id: str, reason: str = "completed"):
"""
Release a session and close the browser.
Args:
session_id: The session identifier
reason: Why the session is being released
"""
with self._lock:
session = self._sessions.pop(session_id, None)
if session:
print(f"[SessionManager] Releasing session {session_id} ({reason})")
try:
if session.driver:
session.driver.quit()
except Exception as e:
print(f"[SessionManager] Error closing driver for {session_id}: {e}")
def extend_session(self, session_id: str, additional_seconds: int = 300) -> bool:
"""
Extend a session's TTL.
Args:
session_id: The session identifier
additional_seconds: Seconds to add to TTL
Returns:
True if extended, False if session not found
"""
with self._lock:
session = self._sessions.get(session_id)
if not session:
return False
session.expires_at = time.time() + additional_seconds
return True
def get_stats(self) -> Dict[str, Any]:
"""Get session manager statistics."""
with self._lock:
now = time.time()
sessions = []
for sid, s in self._sessions.items():
sessions.append({
"session_id": sid,
"business": s.business_info.get("name", "unknown"),
"state": s.state,
"age_seconds": int(now - s.created_at),
"ttl_remaining": int(s.expires_at - now)
})
return {
"total_sessions": len(self._sessions),
"sessions": sessions
}
def list_sessions(self) -> list:
"""List all active sessions."""
with self._lock:
return list(self._sessions.keys())
# Global singleton instance
_session_manager: Optional[SessionManager] = None
# Serializes first-time creation so concurrent callers cannot each build a
# manager (and each start a cleanup thread).
_session_manager_lock = threading.Lock()


def get_session_manager() -> SessionManager:
    """
    Get or create the global session manager instance.

    The manager is created and its cleanup thread started on the first call;
    later calls return the same object.
    """
    global _session_manager
    if _session_manager is None:
        with _session_manager_lock:
            # Re-check under the lock: another thread may have finished
            # creating the manager while this one was waiting.
            if _session_manager is None:
                manager = SessionManager()
                manager.start()
                # Publish only once the manager is fully started.
                _session_manager = manager
    return _session_manager

View File

@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
progress_callback=None, validation_only: bool = False,
sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
close_enough_pct: float = 95.0) -> dict:
close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
"""
Scrape Google Maps reviews with optional multi-sort strategy.
@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
initial_sort: Initial sort order to use (default: newest). Used for retry with different sort
Returns:
dict with reviews list and metadata
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
log.info('browser', "Sort button found")
break
# Track bot detection - if sort button hidden, Google likely detected bot
bot_detected = not sort_found
if not sort_found:
log.warn('browser', "Sort button not found after waiting, continuing without sorting")
log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")
# Sort by specified order (default: newest)
target_sort = initial_sort or SORT_NEWEST
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}
text = longestText;
// OWNER RESPONSE: Find by "Response from the owner" text anchor
var ownerResponse = null;
var ownerSpan = null;
var cardSpans = card.querySelectorAll('span');
for (var k = 0; k < cardSpans.length; k++) {
if (cardSpans[k].textContent.trim() === 'Response from the owner') {
ownerSpan = cardSpans[k];
break;
}
}
if (ownerSpan) {
// Navigate: span -> header div -> container div
var headerDiv = ownerSpan.closest('div');
var respContainer = headerDiv ? headerDiv.parentElement : null;
if (respContainer) {
// Click expand button if exists and not expanded
var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
expandBtn.click();
}
// Get timestamp from header spans
var respTimestamp = '';
var headerSpans = headerDiv.querySelectorAll('span');
for (var m = 0; m < headerSpans.length; m++) {
var spanTxt = headerSpans[m].textContent.trim();
if (spanTxt.match(/ago$/i)) {
respTimestamp = spanTxt;
break;
}
}
// Get response text from direct child div[lang]
var respText = '';
var langDivs = respContainer.children;
for (var m = 0; m < langDivs.length; m++) {
if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
respText = langDivs[m].textContent.trim();
respText = respText.replace(/(More|Less)$/, '').trim();
break;
}
}
// Fallback: find longest text div that's not the header
if (!respText) {
for (var m = 0; m < langDivs.length; m++) {
if (langDivs[m].tagName === 'DIV') {
var divTxt = langDivs[m].textContent.trim();
if (divTxt.includes('Response from the owner')) continue;
divTxt = divTxt.replace(/(More|Less)$/, '').trim();
if (divTxt.length > respText.length) {
respText = divTxt;
}
}
}
}
if (respText) {
ownerResponse = {text: respText, timestamp: respTimestamp};
}
}
}
if (author && rating >= 1 && rating <= 5) {
results.push({
id: rid,
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
text: text,
rating: rating,
timestamp: timestamp,
owner_response: ownerResponse,
source: 'dom'
});
}
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
topics_inferred_count += 1
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
# Include business info captured from Overview page
business_info = business_info_cache[0] or {}
return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"metrics_history": metrics_history, # For crash detection
"start_time": start_time, # For crash report elapsed time
"session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis
"bot_detected": bot_detected if 'bot_detected' in dir() else False, # True if sort button was hidden
"initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST, # Sort order used for first pass
"multi_sort": {
"enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
"completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
"first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
},
# Business info captured from Google Maps page
"business_info": {
"name": business_info.get("name"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"rating": business_info.get("rating")
}
}
@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
browser_fingerprint: dict = None):
browser_fingerprint: dict = None, initial_sort: str = None,
sort_strategy: str = SORT_AUTO, max_reviews: int = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
- timezone: string (e.g., "Europe/Madrid")
- language: string (e.g., "en-US")
- platform: string (e.g., "MacIntel")
initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
Used for retry with different sort strategy
sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
max_reviews: Maximum reviews to collect (for testing). None = unlimited (default: 5000)
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
result = scrape_reviews(
driver=driver,
url=url,
max_reviews=999999, # Effectively unlimited
max_reviews=max_reviews if max_reviews else 999999, # Unlimited by default, or custom limit for testing
timeout_no_new=15,
flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback, # Pass through for real-time log updates
validation_only=validation_only # Return early if just validating
validation_only=validation_only, # Return early if just validating
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
initial_sort=initial_sort # Initial sort order for retry with different sort
)
elapsed = time.time() - start_time
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"error": None,
"logs": result.get("logs", []),
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
"session_fingerprint": result.get("session_fingerprint"), # Browser fingerprint for bot detection
# Tracking info for retry strategy
"bot_detected": result.get("bot_detected", False), # True if sort button was hidden by Google
"initial_sort_used": result.get("initial_sort_used", "newest"), # Sort order used
"multi_sort": result.get("multi_sort", {}), # Multi-sort completion info
# Business info captured from Google Maps page
"business_info": result.get("business_info", {})
}
# Include validation_info if in validation_only mode

File diff suppressed because it is too large Load Diff