Files
whyrating-engine-legacy/modules/cli.py
Alejandro Gutiérrez bdffb5eaac Add API interception for hybrid scraping and update selectors
- Add new api_interceptor.py module for CDP network interception
- Capture Google Maps internal API responses during scrolling
- Parse protobuf-like JSON responses to extract review data
- Merge API-captured reviews with DOM-scraped data
- Update CSS selectors for January 2026 Google Maps structure
- Add cookie consent dismissal for multiple languages
- Add --api-intercept CLI flag and config option
- Fix review card and pane selectors (.jftiEf, .XiKgde)
- Improve review ID extraction from card elements

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 21:51:10 +00:00

81 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Command line interface handling for Google Maps Reviews Scraper.
"""
import argparse
import json
from pathlib import Path
from modules.config import DEFAULT_CONFIG_PATH
# Accepted spellings for boolean-valued options (compared case-insensitively).
_TRUE_STRINGS = frozenset({"1", "true", "t", "yes", "y", "on"})
# The empty string stays falsy because the previous ``type=bool`` converter
# mapped "" to False; every other formerly "truthy" spelling is now parsed.
_FALSE_STRINGS = frozenset({"0", "false", "f", "no", "n", "off", ""})


def _str_to_bool(value):
    """Convert a command-line string such as "true" or "no" to a bool.

    ``type=bool`` must not be used with argparse: it calls ``bool()`` on the
    raw string, so any non-empty value (including "false" and "0") becomes
    ``True``.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised boolean
            spelling; argparse reports this as a normal usage error.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in _TRUE_STRINGS:
        return True
    if normalized in _FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def parse_arguments(argv=None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument call.

    Returns:
        The parsed ``argparse.Namespace`` with two post-processing steps
        applied:

        * ``config`` is a ``Path`` built from ``--config`` when given,
          otherwise ``DEFAULT_CONFIG_PATH``.
        * ``custom_params`` is the decoded JSON value of ``--custom-params``,
          or ``None`` when the option is absent or is not valid JSON (a
          warning is printed in the latter case).

        Options that were not supplied keep a ``None`` default (``False``
        for the ``store_true`` flags).
    """
    ap = argparse.ArgumentParser(description="GoogleMaps review scraper with MongoDB integration")
    ap.add_argument("-q", "--headless", action="store_true",
                    help="run Chrome in the background")
    ap.add_argument("-s", "--sort", dest="sort_by",
                    choices=("newest", "highest", "lowest", "relevance"),
                    default=None, help="sorting order for reviews")
    ap.add_argument("--stop-on-match", action="store_true",
                    help="stop scrolling when first alreadyseen id is met "
                         "(useful with --sort newest)")
    ap.add_argument("--url", type=str, default=None,
                    help="custom Google Maps URL to scrape")
    ap.add_argument("--overwrite", action="store_true", dest="overwrite_existing",
                    help="overwrite existing reviews instead of appending")
    ap.add_argument("--config", type=str, default=None,
                    help="path to custom configuration file")
    # Boolean-valued options use _str_to_bool so that "false"/"0"/"no"
    # actually turn the feature off.
    ap.add_argument("--use-mongodb", type=_str_to_bool, default=None,
                    help="whether to use MongoDB for storage")

    # Arguments for date conversion and image downloading
    ap.add_argument("--convert-dates", type=_str_to_bool, default=None,
                    help="convert string dates to MongoDB Date objects")
    ap.add_argument("--download-images", type=_str_to_bool, default=None,
                    help="download images from reviews")
    ap.add_argument("--image-dir", type=str, default=None,
                    help="directory to store downloaded images")
    ap.add_argument("--download-threads", type=int, default=None,
                    help="number of threads for downloading images")

    # Arguments for local image paths and URL replacement
    ap.add_argument("--store-local-paths", type=_str_to_bool, default=None,
                    help="whether to store local image paths in documents")
    ap.add_argument("--replace-urls", type=_str_to_bool, default=None,
                    help="whether to replace original URLs with custom ones")
    ap.add_argument("--custom-url-base", type=str, default=None,
                    help="base URL for replacement")
    ap.add_argument("--custom-url-profiles", type=str, default=None,
                    help="path for profile images")
    ap.add_argument("--custom-url-reviews", type=str, default=None,
                    help="path for review images")
    ap.add_argument("--preserve-original-urls", type=_str_to_bool, default=None,
                    help="whether to preserve original URLs in original_* fields")

    # Arguments for custom parameters
    ap.add_argument("--custom-params", type=str, default=None,
                    help="JSON string with custom parameters to add to each document (e.g. '{\"company\":\"Thaitours\"}')")

    # API interception option
    ap.add_argument("--api-intercept", action="store_true", dest="enable_api_intercept",
                    help="enable API response interception for faster data capture (experimental)")

    args = ap.parse_args(argv)

    # Handle config path
    if args.config is not None:
        args.config = Path(args.config)
    else:
        args.config = DEFAULT_CONFIG_PATH

    # Process custom params if provided
    if args.custom_params:
        try:
            args.custom_params = json.loads(args.custom_params)
        except json.JSONDecodeError:
            # Best effort: a malformed value is reported and dropped rather
            # than aborting the run.
            print(f"Warning: Could not parse custom params JSON: {args.custom_params}")
            args.custom_params = None

    return args