Files
whyrating-engine-legacy/start.py
Alejandro Gutiérrez bdffb5eaac Add API interception for hybrid scraping and update selectors
- Add new api_interceptor.py module for CDP network interception
- Capture Google Maps internal API responses during scrolling
- Parse protobuf-like JSON responses to extract review data
- Merge API-captured reviews with DOM-scraped data
- Update CSS selectors for January 2026 Google Maps structure
- Add cookie consent dismissal for multiple languages
- Add --api-intercept CLI flag and config option
- Fix review card and pane selectors (.jftiEf, .XiKgde)
- Improve review ID extraction from card elements

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 21:51:10 +00:00

78 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
GoogleMaps review scraper with MongoDB integration
=================================================
Main entry point for the scraper.
"""
from modules.cli import parse_arguments
from modules.config import load_config
from modules.scraper import GoogleReviewsScraper
def main():
    """Run the scraper with the config file plus command-line overrides.

    Parses the command line, loads the configuration named by
    ``args.config``, layers the command-line overrides on top of it, then
    builds a ``GoogleReviewsScraper`` from the result and calls ``scrape()``.
    """
    args = parse_arguments()
    config = load_config(args.config)

    # Ordered (option name, is_switch) pairs. The config key is always the
    # same as the attribute name on ``args``.
    #   - switch (True): the key is forced to True only when the option is
    #     truthy, so an absent switch never disturbs the loaded value.
    #   - value (False): anything other than None is copied verbatim, so an
    #     explicit False / 0 / "" still overrides the loaded value.
    overrides = (
        ("headless", True),
        ("sort_by", False),
        ("stop_on_match", True),
        ("url", False),
        ("overwrite_existing", True),
        ("use_mongodb", False),
        # Date conversion and image downloading
        ("convert_dates", False),
        ("download_images", False),
        ("image_dir", False),
        ("download_threads", False),
        # Local image paths and URL replacement
        ("store_local_paths", False),
        ("replace_urls", False),
        ("custom_url_base", False),
        ("custom_url_profiles", False),
        ("custom_url_reviews", False),
        ("preserve_original_urls", False),
    )
    for option, is_switch in overrides:
        supplied = getattr(args, option)
        if is_switch:
            if supplied:
                config[option] = True
        elif supplied is not None:
            config[option] = supplied

    # Custom parameters are merged into whatever the config already holds
    # under "custom_params" rather than replacing it.
    extra_params = args.custom_params
    if extra_params is not None:
        config.setdefault("custom_params", {}).update(extra_params)

    # API interception is a switch as well; it is applied after the
    # custom-parameter merge.
    if args.enable_api_intercept:
        config["enable_api_intercept"] = True

    GoogleReviewsScraper(config).scrape()
# Run the scraper only when this file is executed as a script; importing the
# module must not start a scrape.
if __name__ == "__main__":
    main()