Add API interception for hybrid scraping and update selectors
- Add new api_interceptor.py module for CDP network interception
- Capture Google Maps internal API responses during scrolling
- Parse protobuf-like JSON responses to extract review data
- Merge API-captured reviews with DOM-scraped data
- Update CSS selectors for January 2026 Google Maps structure
- Add cookie consent dismissal for multiple languages
- Add --api-intercept CLI flag and config option
- Fix review card and pane selectors (.jftiEf, .XiKgde)
- Improve review ID extraction from card elements

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -24,16 +24,25 @@ from tqdm import tqdm
|
||||
|
||||
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
|
||||
from modules.models import RawReview
|
||||
from modules.api_interceptor import GoogleMapsAPIInterceptor
|
||||
|
||||
# Logger
|
||||
log = logging.getLogger("scraper")
|
||||
|
||||
# CSS Selectors
|
||||
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
|
||||
CARD_SEL = "div[data-review-id]"
|
||||
# CSS Selectors (Updated January 2026 for current Google Maps structure)
|
||||
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
|
||||
CARD_SEL = "div.jftiEf" # Review card container
|
||||
# Cookie/consent dialog selectors (Updated January 2026)
|
||||
COOKIE_BTN = ('button[aria-label*="Accept" i],'
|
||||
'button[aria-label*="Aceptar" i],'
|
||||
'button[aria-label*="Akzeptieren" i],'
|
||||
'button[aria-label*="Aceitar" i],'
|
||||
'button[jsname="higCR"],' # Google's "Accept all" button
|
||||
'button[jsname="hZCF7e"],'
|
||||
'button[data-mdc-dialog-action="accept"]')
|
||||
'button[data-mdc-dialog-action="accept"],'
|
||||
'form[action*="consent"] button,'
|
||||
'div[role="dialog"] button[jsname],'
|
||||
'.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]')
|
||||
SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
|
||||
MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
|
||||
|
||||
@@ -169,6 +178,8 @@ class GoogleReviewsScraper:
|
||||
self.json_storage = JSONStorage(config)
|
||||
self.backup_to_json = config.get("backup_to_json", True)
|
||||
self.overwrite_existing = config.get("overwrite_existing", False)
|
||||
self.enable_api_intercept = config.get("enable_api_intercept", False)
|
||||
self.api_interceptor = None # Will be initialized when driver is ready
|
||||
|
||||
def setup_driver(self, headless: bool):
|
||||
"""
|
||||
@@ -257,32 +268,61 @@ class GoogleReviewsScraper:
|
||||
"""
|
||||
Dismiss cookie consent dialogs if present.
|
||||
Handles stale element references by re-finding elements if needed.
|
||||
Updated January 2026 to handle current Google consent dialogs.
|
||||
"""
|
||||
try:
|
||||
# Use WebDriverWait with expected_conditions to handle stale elements
|
||||
WebDriverWait(driver, 3).until(
|
||||
EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN))
|
||||
)
|
||||
log.info("Cookie consent dialog found, attempting to dismiss")
|
||||
dismissed = False
|
||||
|
||||
# Get elements again after waiting to avoid stale references
|
||||
elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN)
|
||||
for elem in elements:
|
||||
try:
|
||||
if elem.is_displayed():
|
||||
elem.click()
|
||||
log.info("Cookie dialog dismissed")
|
||||
return True
|
||||
except Exception as e:
|
||||
log.debug(f"Error clicking cookie button: {e}")
|
||||
continue
|
||||
except TimeoutException:
|
||||
# This is expected if no cookie dialog is present
|
||||
log.debug("No cookie consent dialog detected")
|
||||
except Exception as e:
|
||||
log.debug(f"Error handling cookie dialog: {e}")
|
||||
# Try multiple approaches to dismiss consent dialogs
|
||||
consent_selectors = [
|
||||
COOKIE_BTN,
|
||||
# Additional Google consent selectors
|
||||
'button[aria-label*="Accept all" i]',
|
||||
'button[aria-label*="Aceptar todo" i]',
|
||||
'button[aria-label*="Reject all" i]', # Sometimes we need to reject
|
||||
'button:has-text("Accept")',
|
||||
'button:has-text("Aceptar")',
|
||||
'[role="dialog"] button:first-of-type',
|
||||
'form[action*="consent"] button:first-of-type',
|
||||
]
|
||||
|
||||
return False
|
||||
for selector in consent_selectors:
|
||||
try:
|
||||
elements = driver.find_elements(By.CSS_SELECTOR, selector)
|
||||
for elem in elements:
|
||||
try:
|
||||
if elem.is_displayed() and elem.is_enabled():
|
||||
# Try JavaScript click first (more reliable)
|
||||
driver.execute_script("arguments[0].click();", elem)
|
||||
log.info(f"Cookie/consent dialog dismissed with selector: {selector}")
|
||||
time.sleep(1) # Wait for dialog to close
|
||||
dismissed = True
|
||||
break
|
||||
except Exception as e:
|
||||
log.debug(f"Error clicking consent button: {e}")
|
||||
continue
|
||||
if dismissed:
|
||||
break
|
||||
except Exception as e:
|
||||
log.debug(f"Error finding consent elements with {selector}: {e}")
|
||||
continue
|
||||
|
||||
# Also try to find and click any visible modal close buttons
|
||||
if not dismissed:
|
||||
try:
|
||||
close_btns = driver.find_elements(By.CSS_SELECTOR,
|
||||
'[role="dialog"] button[aria-label*="close" i], '
|
||||
'[role="dialog"] button[aria-label*="cerrar" i], '
|
||||
'.modal-close, .dialog-close')
|
||||
for btn in close_btns:
|
||||
if btn.is_displayed():
|
||||
driver.execute_script("arguments[0].click();", btn)
|
||||
log.info("Closed modal dialog")
|
||||
dismissed = True
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return dismissed
|
||||
|
||||
def is_reviews_tab(self, tab: WebElement) -> bool:
|
||||
"""
|
||||
@@ -364,6 +404,10 @@ class GoogleReviewsScraper:
|
||||
|
||||
# Define different selectors to try in order of reliability
|
||||
tab_selectors = [
|
||||
# Current Google Maps tab selectors (January 2026)
|
||||
'.LRkQ2', # Main tab button class in current Google Maps
|
||||
'.hh2c6', # Alternative tab button class
|
||||
|
||||
# Direct tab selectors
|
||||
'[data-tab-index="1"]', # Most common tab index
|
||||
'[role="tab"][data-tab-index]', # Any tab with index
|
||||
@@ -373,7 +417,6 @@ class GoogleReviewsScraper:
|
||||
|
||||
# Common Google Maps review tab selectors
|
||||
'.fontTitleSmall[role="tab"]', # Google Maps title font tabs
|
||||
'.hh2c6[role="tab"]', # Common Google Maps class
|
||||
'.m6QErb [role="tab"]', # Maps container tabs
|
||||
|
||||
# Text-based selectors for various languages
|
||||
@@ -517,12 +560,14 @@ class GoogleReviewsScraper:
|
||||
characteristic elements that appear on the reviews page.
|
||||
"""
|
||||
try:
|
||||
# Common elements that appear when reviews tab is active
|
||||
# Common elements that appear when reviews tab is active (Updated January 2026)
|
||||
verification_selectors = [
|
||||
# Reviews container
|
||||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf',
|
||||
# Reviews container (current)
|
||||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
|
||||
'div.m6QErb.WNBkOb.XiKgde',
|
||||
|
||||
# Review cards
|
||||
# Review cards (current)
|
||||
'div.jftiEf',
|
||||
'div[data-review-id]',
|
||||
|
||||
# Sort button (usually appears with reviews)
|
||||
@@ -1122,6 +1167,7 @@ class GoogleReviewsScraper:
|
||||
seen = self.json_storage.load_seen()
|
||||
|
||||
driver = None
|
||||
api_reviews = {} # Store reviews captured from API
|
||||
try:
|
||||
driver = self.setup_driver(headless)
|
||||
wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
|
||||
@@ -1129,7 +1175,15 @@ class GoogleReviewsScraper:
|
||||
driver.get(url)
|
||||
wait.until(lambda d: "google.com/maps" in d.current_url)
|
||||
|
||||
self.dismiss_cookies(driver)
|
||||
# Wait for page to load and consent dialogs to appear
|
||||
time.sleep(3)
|
||||
|
||||
# Try to dismiss any consent/cookie dialogs
|
||||
if not self.dismiss_cookies(driver):
|
||||
# Wait a bit more and try again
|
||||
time.sleep(2)
|
||||
self.dismiss_cookies(driver)
|
||||
|
||||
self.click_reviews_tab(driver)
|
||||
|
||||
# Extra wait after clicking reviews tab to ensure page loads
|
||||
@@ -1158,10 +1212,14 @@ class GoogleReviewsScraper:
|
||||
time.sleep(3)
|
||||
|
||||
# Use try-except to handle cases where the pane is not found
|
||||
# Try multiple selectors for the reviews pane
|
||||
# Try multiple selectors for the reviews pane (Updated January 2026)
|
||||
pane = None
|
||||
pane_selectors = [
|
||||
PANE_SEL, # Primary selector
|
||||
PANE_SEL, # Primary selector with XiKgde
|
||||
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main" prefix
|
||||
'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination
|
||||
'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde
|
||||
'div.m6QErb.DxyBCb.XiKgde', # Another variant
|
||||
'div[role="main"] div.m6QErb', # Simplified version
|
||||
'div.m6QErb.DxyBCb', # Even more simplified
|
||||
'div[role="main"]' # Most generic
|
||||
@@ -1182,6 +1240,15 @@ class GoogleReviewsScraper:
|
||||
log.warning("Could not find reviews pane with any selector. Page structure might have changed.")
|
||||
return False
|
||||
|
||||
# Initialize API interceptor AFTER reviews page is loaded (if enabled)
|
||||
# This prevents CDP interception from affecting initial page load and tab detection
|
||||
if self.enable_api_intercept:
|
||||
log.info("Setting up API interception for reviews capture")
|
||||
self.api_interceptor = GoogleMapsAPIInterceptor(driver)
|
||||
self.api_interceptor.setup_interception()
|
||||
self.api_interceptor.inject_response_interceptor()
|
||||
log.info("API interceptor ready - capturing network responses")
|
||||
|
||||
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
|
||||
idle = 0
|
||||
processed_ids = set() # Track processed IDs in current session
|
||||
@@ -1201,9 +1268,35 @@ class GoogleReviewsScraper:
|
||||
last_scroll_position = 0
|
||||
scroll_stuck_count = 0
|
||||
|
||||
# Card selectors to try (Updated January 2026)
|
||||
card_selectors = [
|
||||
CARD_SEL, # Primary: div.jftiEf
|
||||
"div[data-review-id]", # Alternative: direct data-review-id
|
||||
".jftiEf", # Without div prefix
|
||||
"div.WMbnJf", # Another common review card class
|
||||
"[data-review-id]", # Any element with review ID
|
||||
]
|
||||
|
||||
while attempts < max_attempts:
|
||||
try:
|
||||
cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
|
||||
# Try multiple card selectors within the pane
|
||||
cards = []
|
||||
for card_sel in card_selectors:
|
||||
cards = pane.find_elements(By.CSS_SELECTOR, card_sel)
|
||||
if cards:
|
||||
if attempts == 0: # Only log once
|
||||
log.info(f"Found {len(cards)} cards with selector: {card_sel}")
|
||||
break
|
||||
|
||||
# If no cards found in pane, try searching the entire document
|
||||
if not cards:
|
||||
for card_sel in card_selectors:
|
||||
cards = driver.find_elements(By.CSS_SELECTOR, card_sel)
|
||||
if cards:
|
||||
if attempts == 0:
|
||||
log.info(f"Found {len(cards)} cards in document with selector: {card_sel}")
|
||||
break
|
||||
|
||||
fresh_cards: List[WebElement] = []
|
||||
|
||||
# Check for valid cards
|
||||
@@ -1228,7 +1321,15 @@ class GoogleReviewsScraper:
|
||||
|
||||
for c in cards:
|
||||
try:
|
||||
# Try to get data-review-id from the card itself
|
||||
cid = c.get_attribute("data-review-id")
|
||||
# If not found on card, try to find it in a child element
|
||||
if not cid:
|
||||
try:
|
||||
review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]")
|
||||
cid = review_id_elem.get_attribute("data-review-id")
|
||||
except:
|
||||
pass
|
||||
if not cid or cid in seen or cid in processed_ids:
|
||||
if stop_on_match and cid and (cid in seen or cid in processed_ids):
|
||||
idle = 999
|
||||
@@ -1314,6 +1415,20 @@ class GoogleReviewsScraper:
|
||||
# Try a simpler scroll method
|
||||
driver.execute_script("window.scrollBy(0, 300);")
|
||||
|
||||
# Collect API responses if interception is enabled
|
||||
if self.enable_api_intercept and self.api_interceptor:
|
||||
try:
|
||||
responses = self.api_interceptor.get_intercepted_responses()
|
||||
if responses:
|
||||
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
|
||||
for intercepted in parsed:
|
||||
if intercepted.review_id and intercepted.review_id not in api_reviews:
|
||||
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
|
||||
if parsed:
|
||||
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
|
||||
except Exception as api_err:
|
||||
log.debug(f"API interception error: {api_err}")
|
||||
|
||||
# Dynamic sleep: sleep less when processing many reviews, more when finding none
|
||||
if len(fresh_cards) > 5:
|
||||
sleep_time = 0.7
|
||||
@@ -1339,6 +1454,23 @@ class GoogleReviewsScraper:
|
||||
|
||||
pbar.close()
|
||||
|
||||
# Merge API-captured reviews if any
|
||||
if self.enable_api_intercept and api_reviews:
|
||||
log.info(f"Merging {len(api_reviews)} reviews captured via API interception")
|
||||
for review_id, api_review in api_reviews.items():
|
||||
if review_id not in docs:
|
||||
# New review from API only
|
||||
docs[review_id] = api_review
|
||||
seen.add(review_id)
|
||||
else:
|
||||
# Merge API data with existing DOM data (API might have more details)
|
||||
existing = docs[review_id]
|
||||
# Only update fields that are missing or empty
|
||||
for key, value in api_review.items():
|
||||
if key not in existing or not existing.get(key):
|
||||
existing[key] = value
|
||||
log.info(f"After merge: {len(docs)} total reviews")
|
||||
|
||||
# Save to MongoDB if enabled
|
||||
if self.use_mongodb and self.mongodb:
|
||||
log.info("Saving reviews to MongoDB...")
|
||||
@@ -1364,6 +1496,13 @@ class GoogleReviewsScraper:
|
||||
return False
|
||||
|
||||
finally:
|
||||
# Cleanup API interceptor
|
||||
if self.api_interceptor:
|
||||
try:
|
||||
self.api_interceptor.cleanup()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if driver is not None:
|
||||
try:
|
||||
driver.quit()
|
||||
|
||||
Reference in New Issue
Block a user