Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
266
scrapers/google_reviews/session_manager.py
Normal file
266
scrapers/google_reviews/session_manager.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""
|
||||
Session Manager for Google Reviews Scraper
|
||||
|
||||
Manages browser sessions between validation and scraping phases.
|
||||
Allows reusing the same browser instance to avoid duplicate navigation.
|
||||
|
||||
Usage:
|
||||
# During validation
|
||||
session_id = session_manager.create_session(driver, url, business_info, total_reviews)
|
||||
return {"session_id": session_id, "business_info": business_info}
|
||||
|
||||
# During scraping (with session_id from validation)
|
||||
session = session_manager.get_session(session_id)
|
||||
if session:
|
||||
driver = session['driver']
|
||||
# Continue from where validation left off
|
||||
"""
|
||||
|
||||
import uuid
|
||||
import time
|
||||
import threading
|
||||
from typing import Optional, Dict, Any
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@dataclass
class BrowserSession:
    """A browser instance kept alive after validation so scraping can reuse it.

    Attributes:
        session_id: Identifier under which the session is registered.
        driver: The WebDriver instance owned by this session.
        url: The URL associated with the session.
        business_info: Business details gathered for the session.
        total_reviews: Review count recorded for the session.
        created_at: Creation time as a ``time.time()`` timestamp.
        expires_at: Expiry time as a ``time.time()`` timestamp.
        browser_fingerprint: Optional browser fingerprint settings.
        log_capture: Optional LogCapture instance.
        state: Lifecycle marker; starts as ``"validated"``.
    """

    # Required fields (positional order is part of the constructor contract).
    session_id: str
    driver: Any  # WebDriver instance
    url: str
    business_info: Dict[str, Any]
    total_reviews: int
    created_at: float
    expires_at: float

    # Optional extras.
    browser_fingerprint: Optional[Dict[str, Any]] = None
    log_capture: Any = None  # LogCapture instance

    # Lifecycle: validated -> scraping -> completed/expired
    state: str = "validated"
|
||||
|
||||
|
||||
class SessionManager:
    """
    Manages browser sessions between validation and scraping.

    Sessions have a TTL (default 5 minutes) after which they're automatically
    cleaned up and the browser is closed.

    All access to the session registry is guarded by a re-entrant lock.
    Browsers are always closed *outside* that lock so a slow ``driver.quit()``
    cannot stall other session operations.
    """

    DEFAULT_TTL_SECONDS = 300  # 5 minutes
    CLEANUP_INTERVAL_SECONDS = 30  # Check for expired sessions every 30s
    SCRAPING_TTL_SECONDS = 3600  # Lifetime granted to a session once claimed

    def __init__(self, ttl_seconds: Optional[int] = None):
        """
        Args:
            ttl_seconds: Default TTL for new sessions. ``None`` (or any falsy
                value) selects ``DEFAULT_TTL_SECONDS``.
        """
        self.ttl_seconds = ttl_seconds or self.DEFAULT_TTL_SECONDS
        self._sessions: Dict[str, "BrowserSession"] = {}
        self._lock = threading.RLock()
        self._cleanup_thread: Optional[threading.Thread] = None
        # Signals the current cleanup thread to exit; replaced on every start()
        # so a lingering old thread can never be revived by a restart.
        self._stop_event = threading.Event()
        self._running = False

    def start(self):
        """Start the background cleanup thread (no-op if already running)."""
        with self._lock:
            if self._running:
                return
            self._running = True
            stop_event = threading.Event()
            self._stop_event = stop_event
            self._cleanup_thread = threading.Thread(
                target=self._cleanup_loop, args=(stop_event,), daemon=True
            )
            self._cleanup_thread.start()

    def stop(self):
        """Stop the background cleanup thread and wait briefly for it to exit."""
        with self._lock:
            self._running = False
            self._stop_event.set()  # wakes the loop immediately
            thread = self._cleanup_thread
        # Join without holding the lock: the cleanup thread needs it to finish.
        if thread and thread is not threading.current_thread():
            thread.join(timeout=5)

    def _cleanup_loop(self, stop_event: Optional[threading.Event] = None):
        """Background loop to clean up expired sessions.

        Args:
            stop_event: Event that terminates this loop when set. Defaults to
                the manager's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event
        while self._running and not stop_event.is_set():
            try:
                self._cleanup_expired()
            except Exception as e:
                # Best-effort housekeeping: report and keep the loop alive.
                print(f"[SessionManager] Cleanup error: {e}")
            # Interruptible sleep so stop() takes effect right away.
            stop_event.wait(self.CLEANUP_INTERVAL_SECONDS)

    def _close_session(self, session_id: str, session: Any, reason: str):
        """Log the release of an already-unregistered session and quit its driver."""
        print(f"[SessionManager] Releasing session {session_id} ({reason})")
        try:
            if session.driver:
                session.driver.quit()
        except Exception as e:
            print(f"[SessionManager] Error closing driver for {session_id}: {e}")

    def _lookup_locked(self, session_id: str):
        """Find a session; the caller must hold ``self._lock``.

        Returns:
            ``(live, expired)``. At most one is not ``None``. An expired
            session has already been removed from the registry and must be
            closed by the caller after releasing the lock.
        """
        session = self._sessions.get(session_id)
        if not session:
            print(f"[SessionManager] Session {session_id} not found")
            return None, None
        if time.time() > session.expires_at:
            print(f"[SessionManager] Session {session_id} expired")
            del self._sessions[session_id]
            return None, session
        return session, None

    def _cleanup_expired(self):
        """Remove expired sessions and close their browsers."""
        now = time.time()
        with self._lock:
            # Select and unregister in one critical section so a session that
            # gets claimed/extended concurrently can never be torn down.
            expired_ids = [
                sid for sid, s in self._sessions.items() if now > s.expires_at
            ]
            expired = [(sid, self._sessions.pop(sid)) for sid in expired_ids]

        for session_id, session in expired:
            self._close_session(session_id, session, "expired")

    def create_session(
        self,
        driver: Any,
        url: str,
        business_info: Dict[str, Any],
        total_reviews: int,
        browser_fingerprint: Optional[Dict[str, Any]] = None,
        log_capture: Any = None,
        ttl_seconds: Optional[int] = None
    ) -> str:
        """
        Create a new browser session after validation.

        Args:
            driver: WebDriver instance (positioned on Google Maps page)
            url: The validated Google Maps URL
            business_info: Extracted business information
            total_reviews: Total review count from page
            browser_fingerprint: Browser fingerprint settings used
            log_capture: LogCapture instance for logging
            ttl_seconds: Custom TTL for this session (default: manager TTL)

        Returns:
            session_id: Unique identifier for this session
        """
        ttl = ttl_seconds or self.ttl_seconds
        now = time.time()

        with self._lock:
            # Short ID for easier use. With only 32 bits a collision is
            # possible, and overwriting an entry would leak its browser,
            # so retry until the ID is unused.
            session_id = uuid.uuid4().hex[:8]
            while session_id in self._sessions:
                session_id = uuid.uuid4().hex[:8]

            self._sessions[session_id] = BrowserSession(
                session_id=session_id,
                driver=driver,
                url=url,
                business_info=business_info,
                total_reviews=total_reviews,
                created_at=now,
                expires_at=now + ttl,
                browser_fingerprint=browser_fingerprint,
                log_capture=log_capture,
                state="validated"
            )

        business_name = (business_info or {}).get('name', 'unknown')
        print(f"[SessionManager] Created session {session_id} for {business_name} (TTL: {ttl}s)")
        return session_id

    def get_session(self, session_id: str) -> Optional["BrowserSession"]:
        """
        Retrieve a session by ID.

        An expired session is unregistered and its browser closed.

        Args:
            session_id: The session identifier

        Returns:
            BrowserSession if found and not expired, None otherwise
        """
        with self._lock:
            session, expired = self._lookup_locked(session_id)
        if expired is not None:
            self._close_session(session_id, expired, "expired")
        return session

    def claim_session(self, session_id: str) -> Optional["BrowserSession"]:
        """
        Claim a session for scraping (marks it as in-use).

        Only a session in state ``"validated"`` can be claimed; claiming moves
        it to ``"scraping"`` and resets its expiry to ``SCRAPING_TTL_SECONDS``
        from now.

        Args:
            session_id: The session identifier

        Returns:
            BrowserSession if successfully claimed, None otherwise
        """
        claimed = None
        with self._lock:
            session, expired = self._lookup_locked(session_id)
            if session is not None:
                if session.state != "validated":
                    print(f"[SessionManager] Session {session_id} already in state: {session.state}")
                else:
                    session.state = "scraping"
                    # Extend TTL during scraping (1 hour max by default)
                    session.expires_at = time.time() + self.SCRAPING_TTL_SECONDS
                    print(f"[SessionManager] Claimed session {session_id} for scraping")
                    claimed = session
        if expired is not None:
            self._close_session(session_id, expired, "expired")
        return claimed

    def release_session(self, session_id: str, reason: str = "completed"):
        """
        Release a session and close the browser.

        Unknown session IDs are ignored.

        Args:
            session_id: The session identifier
            reason: Why the session is being released
        """
        with self._lock:
            session = self._sessions.pop(session_id, None)

        if session:
            self._close_session(session_id, session, reason)

    def extend_session(self, session_id: str, additional_seconds: int = 300) -> bool:
        """
        Extend a session's TTL.

        Guarantees the session lives at least ``additional_seconds`` from now.
        The expiry is never moved earlier, so calling this on a session that
        already has more time left (e.g. one claimed for scraping) is harmless.

        Args:
            session_id: The session identifier
            additional_seconds: Minimum seconds of lifetime to guarantee from now

        Returns:
            True if extended, False if session not found
        """
        with self._lock:
            session = self._sessions.get(session_id)
            if not session:
                return False
            session.expires_at = max(
                session.expires_at, time.time() + additional_seconds
            )
            return True

    def get_stats(self) -> Dict[str, Any]:
        """Get session manager statistics.

        Returns:
            Dict with ``total_sessions`` and a ``sessions`` list holding, per
            session, its id, business name, state, age and remaining TTL
            (both in whole seconds).
        """
        with self._lock:
            now = time.time()
            sessions = [
                {
                    "session_id": sid,
                    "business": (s.business_info or {}).get("name", "unknown"),
                    "state": s.state,
                    "age_seconds": int(now - s.created_at),
                    "ttl_remaining": int(s.expires_at - now)
                }
                for sid, s in self._sessions.items()
            ]
            return {
                "total_sessions": len(self._sessions),
                "sessions": sessions
            }

    def list_sessions(self) -> list:
        """List the IDs of all registered sessions."""
        with self._lock:
            return list(self._sessions)
|
||||
|
||||
|
||||
# Global singleton instance, created lazily by get_session_manager()
_session_manager: Optional[SessionManager] = None

# Serializes first-time construction so concurrent callers cannot each build
# a manager (and each start a separate cleanup thread).
_session_manager_lock = threading.Lock()


def get_session_manager() -> SessionManager:
    """Get or create the global session manager instance.

    The manager is constructed and started on first use. The instance is only
    published after ``start()`` has been called, so every caller receives a
    manager whose cleanup thread is already running.

    Returns:
        The process-wide SessionManager.
    """
    global _session_manager
    if _session_manager is None:
        with _session_manager_lock:
            # Re-check under the lock: another thread may have won the race.
            if _session_manager is None:
                manager = SessionManager()
                manager.start()
                _session_manager = manager
    return _session_manager
|
||||
Reference in New Issue
Block a user