- Add new api_interceptor.py module for CDP network interception - Capture Google Maps internal API responses during scrolling - Parse protobuf-like JSON responses to extract review data - Merge API-captured reviews with DOM-scraped data - Update CSS selectors for January 2026 Google Maps structure - Add cookie consent dismissal for multiple languages - Add --api-intercept CLI flag and config option - Fix review card and pane selectors (.jftiEf, .XiKgde) - Improve review ID extraction from card elements Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
78 lines
2.6 KiB
Python
78 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Google‑Maps review scraper with MongoDB integration
|
||
=================================================
|
||
|
||
Main entry point for the scraper.
|
||
"""
|
||
|
||
from modules.cli import parse_arguments
|
||
from modules.config import load_config
|
||
from modules.scraper import GoogleReviewsScraper
|
||
|
||
|
||
def main():
    """Build the effective configuration and run the scraper.

    Command-line arguments are parsed, the configuration is loaded with
    ``load_config(args.config)``, and options supplied on the command line
    are layered over the loaded values before the scraper is started.
    """
    cli = parse_arguments()
    settings = load_config(cli.config)

    # Overrides are applied in exactly this order. ``True`` marks a switch
    # that can only turn its option on (stored as ``True`` when the argument
    # is truthy); ``False`` marks an option whose value is copied verbatim
    # whenever the argument is not ``None``.
    overrides = (
        ("headless", True),
        ("sort_by", False),
        ("stop_on_match", True),
        ("url", False),
        ("overwrite_existing", True),
        ("use_mongodb", False),
        # Date conversion and image downloading
        ("convert_dates", False),
        ("download_images", False),
        ("image_dir", False),
        ("download_threads", False),
        # Local image paths and URL replacement
        ("store_local_paths", False),
        ("replace_urls", False),
        ("custom_url_base", False),
        ("custom_url_profiles", False),
        ("custom_url_reviews", False),
        ("preserve_original_urls", False),
    )
    for option, is_switch in overrides:
        supplied = getattr(cli, option)
        if is_switch:
            if supplied:
                settings[option] = True
        elif supplied is not None:
            settings[option] = supplied

    # Custom parameters are merged into any existing mapping rather than
    # replacing it, so entries already in the configuration survive.
    extra_params = cli.custom_params
    if extra_params is not None:
        settings.setdefault("custom_params", {}).update(extra_params)

    # API interception is opt-in from the command line.
    if cli.enable_api_intercept:
        settings["enable_api_intercept"] = True

    GoogleReviewsScraper(settings).scrape()
|
||
|
||
|
||
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|