Add API interception for hybrid scraping and update selectors

- Add new api_interceptor.py module for CDP network interception
- Capture Google Maps internal API responses during scrolling
- Parse protobuf-like JSON responses to extract review data
- Merge API-captured reviews with DOM-scraped data
- Update CSS selectors for January 2026 Google Maps structure
- Add cookie consent dismissal for multiple languages
- Add --api-intercept CLI flag and config option
- Fix review card and pane selectors (card: .jftiEf; pane: adds .XiKgde)
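- Improve review ID extraction from card elements (fall back to a child [data-review-id] element when the card itself has no data-review-id attribute)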

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-17 21:51:10 +00:00
parent 262f0c0be7
commit bdffb5eaac
5 changed files with 782 additions and 36 deletions

View File

@@ -24,16 +24,25 @@ from tqdm import tqdm
from modules.data_storage import MongoDBStorage, JSONStorage, merge_review
from modules.models import RawReview
from modules.api_interceptor import GoogleMapsAPIInterceptor
# Logger
log = logging.getLogger("scraper")
# CSS Selectors
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf'
CARD_SEL = "div[data-review-id]"
# CSS Selectors (Updated January 2026 for current Google Maps structure)
PANE_SEL = 'div[role="main"] div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde'
CARD_SEL = "div.jftiEf" # Review card container
# Cookie/consent dialog selectors (Updated January 2026)
COOKIE_BTN = ('button[aria-label*="Accept" i],'
'button[aria-label*="Aceptar" i],'
'button[aria-label*="Akzeptieren" i],'
'button[aria-label*="Aceitar" i],'
'button[jsname="higCR"],' # Google's "Accept all" button
'button[jsname="hZCF7e"],'
'button[data-mdc-dialog-action="accept"]')
'button[data-mdc-dialog-action="accept"],'
'form[action*="consent"] button,'
'div[role="dialog"] button[jsname],'
'.VfPpkd-LgbsSe[data-mdc-dialog-action="accept"]')
SORT_BTN = 'button[aria-label="Sort reviews" i], button[aria-label="Sort" i]'
MENU_ITEMS = 'div[role="menu"] [role="menuitem"], li[role="menuitem"]'
@@ -169,6 +178,8 @@ class GoogleReviewsScraper:
self.json_storage = JSONStorage(config)
self.backup_to_json = config.get("backup_to_json", True)
self.overwrite_existing = config.get("overwrite_existing", False)
self.enable_api_intercept = config.get("enable_api_intercept", False)
self.api_interceptor = None # Will be initialized when driver is ready
def setup_driver(self, headless: bool):
"""
@@ -257,32 +268,61 @@ class GoogleReviewsScraper:
"""
Dismiss cookie consent dialogs if present.
Handles stale element references by re-finding elements if needed.
Updated January 2026 to handle current Google consent dialogs.
"""
try:
# Use WebDriverWait with expected_conditions to handle stale elements
WebDriverWait(driver, 3).until(
EC.presence_of_element_located((By.CSS_SELECTOR, COOKIE_BTN))
)
log.info("Cookie consent dialog found, attempting to dismiss")
dismissed = False
# Get elements again after waiting to avoid stale references
elements = driver.find_elements(By.CSS_SELECTOR, COOKIE_BTN)
for elem in elements:
try:
if elem.is_displayed():
elem.click()
log.info("Cookie dialog dismissed")
return True
except Exception as e:
log.debug(f"Error clicking cookie button: {e}")
continue
except TimeoutException:
# This is expected if no cookie dialog is present
log.debug("No cookie consent dialog detected")
except Exception as e:
log.debug(f"Error handling cookie dialog: {e}")
# Try multiple approaches to dismiss consent dialogs
consent_selectors = [
COOKIE_BTN,
# Additional Google consent selectors
'button[aria-label*="Accept all" i]',
'button[aria-label*="Aceptar todo" i]',
'button[aria-label*="Reject all" i]', # Sometimes we need to reject
'button:has-text("Accept")',
'button:has-text("Aceptar")',
'[role="dialog"] button:first-of-type',
'form[action*="consent"] button:first-of-type',
]
return False
for selector in consent_selectors:
try:
elements = driver.find_elements(By.CSS_SELECTOR, selector)
for elem in elements:
try:
if elem.is_displayed() and elem.is_enabled():
# Try JavaScript click first (more reliable)
driver.execute_script("arguments[0].click();", elem)
log.info(f"Cookie/consent dialog dismissed with selector: {selector}")
time.sleep(1) # Wait for dialog to close
dismissed = True
break
except Exception as e:
log.debug(f"Error clicking consent button: {e}")
continue
if dismissed:
break
except Exception as e:
log.debug(f"Error finding consent elements with {selector}: {e}")
continue
# Also try to find and click any visible modal close buttons
if not dismissed:
try:
close_btns = driver.find_elements(By.CSS_SELECTOR,
'[role="dialog"] button[aria-label*="close" i], '
'[role="dialog"] button[aria-label*="cerrar" i], '
'.modal-close, .dialog-close')
for btn in close_btns:
if btn.is_displayed():
driver.execute_script("arguments[0].click();", btn)
log.info("Closed modal dialog")
dismissed = True
break
except Exception:
pass
return dismissed
def is_reviews_tab(self, tab: WebElement) -> bool:
"""
@@ -364,6 +404,10 @@ class GoogleReviewsScraper:
# Define different selectors to try in order of reliability
tab_selectors = [
# Current Google Maps tab selectors (January 2026)
'.LRkQ2', # Main tab button class in current Google Maps
'.hh2c6', # Alternative tab button class
# Direct tab selectors
'[data-tab-index="1"]', # Most common tab index
'[role="tab"][data-tab-index]', # Any tab with index
@@ -373,7 +417,6 @@ class GoogleReviewsScraper:
# Common Google Maps review tab selectors
'.fontTitleSmall[role="tab"]', # Google Maps title font tabs
'.hh2c6[role="tab"]', # Common Google Maps class
'.m6QErb [role="tab"]', # Maps container tabs
# Text-based selectors for various languages
@@ -517,12 +560,14 @@ class GoogleReviewsScraper:
characteristic elements that appear on the reviews page.
"""
try:
# Common elements that appear when reviews tab is active
# Common elements that appear when reviews tab is active (Updated January 2026)
verification_selectors = [
# Reviews container
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf',
# Reviews container (current)
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde',
'div.m6QErb.WNBkOb.XiKgde',
# Review cards
# Review cards (current)
'div.jftiEf',
'div[data-review-id]',
# Sort button (usually appears with reviews)
@@ -1122,6 +1167,7 @@ class GoogleReviewsScraper:
seen = self.json_storage.load_seen()
driver = None
api_reviews = {} # Store reviews captured from API
try:
driver = self.setup_driver(headless)
wait = WebDriverWait(driver, 20) # Reduced from 40 to 20 for faster timeout
@@ -1129,7 +1175,15 @@ class GoogleReviewsScraper:
driver.get(url)
wait.until(lambda d: "google.com/maps" in d.current_url)
self.dismiss_cookies(driver)
# Wait for page to load and consent dialogs to appear
time.sleep(3)
# Try to dismiss any consent/cookie dialogs
if not self.dismiss_cookies(driver):
# Wait a bit more and try again
time.sleep(2)
self.dismiss_cookies(driver)
self.click_reviews_tab(driver)
# Extra wait after clicking reviews tab to ensure page loads
@@ -1158,10 +1212,14 @@ class GoogleReviewsScraper:
time.sleep(3)
# Use try-except to handle cases where the pane is not found
# Try multiple selectors for the reviews pane
# Try multiple selectors for the reviews pane (Updated January 2026)
pane = None
pane_selectors = [
PANE_SEL, # Primary selector
PANE_SEL, # Primary selector with XiKgde
'div.m6QErb.DxyBCb.kA9KIf.dS8AEf.XiKgde', # Without role="main" prefix
'div.m6QErb.WNBkOb.XiKgde', # Alternative class combination
'div[role="main"] div.m6QErb.XiKgde', # Simplified with XiKgde
'div.m6QErb.DxyBCb.XiKgde', # Another variant
'div[role="main"] div.m6QErb', # Simplified version
'div.m6QErb.DxyBCb', # Even more simplified
'div[role="main"]' # Most generic
@@ -1182,6 +1240,15 @@ class GoogleReviewsScraper:
log.warning("Could not find reviews pane with any selector. Page structure might have changed.")
return False
# Initialize API interceptor AFTER reviews page is loaded (if enabled)
# This prevents CDP interception from affecting initial page load and tab detection
if self.enable_api_intercept:
log.info("Setting up API interception for reviews capture")
self.api_interceptor = GoogleMapsAPIInterceptor(driver)
self.api_interceptor.setup_interception()
self.api_interceptor.inject_response_interceptor()
log.info("API interceptor ready - capturing network responses")
pbar = tqdm(desc="Scraped", ncols=80, initial=len(seen))
idle = 0
processed_ids = set() # Track processed IDs in current session
@@ -1201,9 +1268,35 @@ class GoogleReviewsScraper:
last_scroll_position = 0
scroll_stuck_count = 0
# Card selectors to try (Updated January 2026)
card_selectors = [
CARD_SEL, # Primary: div.jftiEf
"div[data-review-id]", # Alternative: direct data-review-id
".jftiEf", # Without div prefix
"div.WMbnJf", # Another common review card class
"[data-review-id]", # Any element with review ID
]
while attempts < max_attempts:
try:
cards = pane.find_elements(By.CSS_SELECTOR, CARD_SEL)
# Try multiple card selectors within the pane
cards = []
for card_sel in card_selectors:
cards = pane.find_elements(By.CSS_SELECTOR, card_sel)
if cards:
if attempts == 0: # Only log once
log.info(f"Found {len(cards)} cards with selector: {card_sel}")
break
# If no cards found in pane, try searching the entire document
if not cards:
for card_sel in card_selectors:
cards = driver.find_elements(By.CSS_SELECTOR, card_sel)
if cards:
if attempts == 0:
log.info(f"Found {len(cards)} cards in document with selector: {card_sel}")
break
fresh_cards: List[WebElement] = []
# Check for valid cards
@@ -1228,7 +1321,15 @@ class GoogleReviewsScraper:
for c in cards:
try:
# Try to get data-review-id from the card itself
cid = c.get_attribute("data-review-id")
# If not found on card, try to find it in a child element
if not cid:
try:
review_id_elem = c.find_element(By.CSS_SELECTOR, "[data-review-id]")
cid = review_id_elem.get_attribute("data-review-id")
except:
pass
if not cid or cid in seen or cid in processed_ids:
if stop_on_match and cid and (cid in seen or cid in processed_ids):
idle = 999
@@ -1314,6 +1415,20 @@ class GoogleReviewsScraper:
# Try a simpler scroll method
driver.execute_script("window.scrollBy(0, 300);")
# Collect API responses if interception is enabled
if self.enable_api_intercept and self.api_interceptor:
try:
responses = self.api_interceptor.get_intercepted_responses()
if responses:
parsed = self.api_interceptor.parse_reviews_from_responses(responses)
for intercepted in parsed:
if intercepted.review_id and intercepted.review_id not in api_reviews:
api_reviews[intercepted.review_id] = self.api_interceptor.convert_to_raw_review_format(intercepted)
if parsed:
log.debug(f"API interceptor captured {len(parsed)} reviews (total unique: {len(api_reviews)})")
except Exception as api_err:
log.debug(f"API interception error: {api_err}")
# Dynamic sleep: sleep less when processing many reviews, more when finding none
if len(fresh_cards) > 5:
sleep_time = 0.7
@@ -1339,6 +1454,23 @@ class GoogleReviewsScraper:
pbar.close()
# Merge API-captured reviews if any
if self.enable_api_intercept and api_reviews:
log.info(f"Merging {len(api_reviews)} reviews captured via API interception")
for review_id, api_review in api_reviews.items():
if review_id not in docs:
# New review from API only
docs[review_id] = api_review
seen.add(review_id)
else:
# Merge API data with existing DOM data (API might have more details)
existing = docs[review_id]
# Only update fields that are missing or empty
for key, value in api_review.items():
if key not in existing or not existing.get(key):
existing[key] = value
log.info(f"After merge: {len(docs)} total reviews")
# Save to MongoDB if enabled
if self.use_mongodb and self.mongodb:
log.info("Saving reviews to MongoDB...")
@@ -1364,6 +1496,13 @@ class GoogleReviewsScraper:
return False
finally:
# Cleanup API interceptor
if self.api_interceptor:
try:
self.api_interceptor.cleanup()
except Exception:
pass
if driver is not None:
try:
driver.quit()