Initial commit - WhyRating Engine (Google Reviews Scraper)

This commit is contained in:
Alejandro Gutiérrez
2026-02-02 18:19:00 +00:00
parent 0543a08242
commit 2206ddeff2
136 changed files with 51138 additions and 855 deletions

View File

@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
progress_callback=None, validation_only: bool = False,
sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
close_enough_pct: float = 95.0) -> dict:
close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
"""
Scrape Google Maps reviews with optional multi-sort strategy.
@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
initial_sort: Sort order to apply on the first pass; falls back to newest when not given. Used when retrying with a different sort order
Returns:
dict with reviews list and metadata
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
log.info('browser', "Sort button found")
break
# Track bot detection - if sort button hidden, Google likely detected bot
bot_detected = not sort_found
if not sort_found:
log.warn('browser', "Sort button not found after waiting, continuing without sorting")
log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")
# Sort by specified order (default: newest)
target_sort = initial_sort or SORT_NEWEST
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}
text = longestText;
// OWNER RESPONSE: Find by "Response from the owner" text anchor
var ownerResponse = null;
var ownerSpan = null;
var cardSpans = card.querySelectorAll('span');
for (var k = 0; k < cardSpans.length; k++) {
if (cardSpans[k].textContent.trim() === 'Response from the owner') {
ownerSpan = cardSpans[k];
break;
}
}
if (ownerSpan) {
// Navigate: span -> header div -> container div
var headerDiv = ownerSpan.closest('div');
var respContainer = headerDiv ? headerDiv.parentElement : null;
if (respContainer) {
// Click expand button if exists and not expanded
var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
expandBtn.click();
}
// Get timestamp from header spans
var respTimestamp = '';
var headerSpans = headerDiv.querySelectorAll('span');
for (var m = 0; m < headerSpans.length; m++) {
var spanTxt = headerSpans[m].textContent.trim();
if (spanTxt.match(/ago$/i)) {
respTimestamp = spanTxt;
break;
}
}
// Get response text from direct child div[lang]
var respText = '';
var langDivs = respContainer.children;
for (var m = 0; m < langDivs.length; m++) {
if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
respText = langDivs[m].textContent.trim();
respText = respText.replace(/(More|Less)$/, '').trim();
break;
}
}
// Fallback: find longest text div that's not the header
if (!respText) {
for (var m = 0; m < langDivs.length; m++) {
if (langDivs[m].tagName === 'DIV') {
var divTxt = langDivs[m].textContent.trim();
if (divTxt.includes('Response from the owner')) continue;
divTxt = divTxt.replace(/(More|Less)$/, '').trim();
if (divTxt.length > respText.length) {
respText = divTxt;
}
}
}
}
if (respText) {
ownerResponse = {text: respText, timestamp: respTimestamp};
}
}
}
if (author && rating >= 1 && rating <= 5) {
results.push({
id: rid,
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
text: text,
rating: rating,
timestamp: timestamp,
owner_response: ownerResponse,
source: 'dom'
});
}
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
topics_inferred_count += 1
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
# Include business info captured from Overview page
business_info = business_info_cache[0] or {}
return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"metrics_history": metrics_history, # For crash detection
"start_time": start_time, # For crash report elapsed time
"session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis
"bot_detected": bot_detected if 'bot_detected' in dir() else False, # True if sort button was hidden
"initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST, # Sort order used for first pass
"multi_sort": {
"enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
"completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
"first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
},
# Business info captured from Google Maps page
"business_info": {
"name": business_info.get("name"),
"category": business_info.get("category"),
"address": business_info.get("address"),
"rating": business_info.get("rating")
}
}
@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
progress_callback=None, driver=None, return_driver: bool = False,
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
browser_fingerprint: dict = None):
browser_fingerprint: dict = None, initial_sort: str = None,
sort_strategy: str = SORT_AUTO, max_reviews: int = None):
"""
Production-compatible wrapper for scrape_reviews.
Matches the API expected by job_manager.py.
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
- timezone: string (e.g., "Europe/Madrid")
- language: string (e.g., "en-US")
- platform: string (e.g., "MacIntel")
initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
Used for retry with different sort strategy
sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
max_reviews: Maximum reviews to collect (for testing). None (default) = effectively unlimited (999999 is passed to scrape_reviews)
Returns:
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
result = scrape_reviews(
driver=driver,
url=url,
max_reviews=999999, # Effectively unlimited
max_reviews=max_reviews if max_reviews else 999999, # Unlimited by default, or custom limit for testing
timeout_no_new=15,
flush_callback=internal_flush,
flush_batch_size=100, # Smaller batches for more frequent progress
log_capture=log_capture,
progress_callback=progress_callback, # Pass through for real-time log updates
validation_only=validation_only # Return early if just validating
validation_only=validation_only, # Return early if just validating
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
initial_sort=initial_sort # Initial sort order for retry with different sort
)
elapsed = time.time() - start_time
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"error": None,
"logs": result.get("logs", []),
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
"session_fingerprint": result.get("session_fingerprint"), # Browser fingerprint for bot detection
# Tracking info for retry strategy
"bot_detected": result.get("bot_detected", False), # True if sort button was hidden by Google
"initial_sort_used": result.get("initial_sort_used", "newest"), # Sort order used
"multi_sort": result.get("multi_sort", {}), # Multi-sort completion info
# Business info captured from Google Maps page
"business_info": result.get("business_info", {})
}
# Include validation_info if in validation_only mode