Initial commit - WhyRating Engine (Google Reviews Scraper)
This commit is contained in:
@@ -732,7 +732,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
progress_callback=None, validation_only: bool = False,
|
||||
sort_strategy: str = SORT_AUTO, sort_order: List[str] = None,
|
||||
multi_sort_threshold: int = MULTI_SORT_THRESHOLD,
|
||||
close_enough_pct: float = 95.0) -> dict:
|
||||
close_enough_pct: float = 95.0, initial_sort: str = None) -> dict:
|
||||
"""
|
||||
Scrape Google Maps reviews with optional multi-sort strategy.
|
||||
|
||||
@@ -754,6 +754,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
sort_order: Custom sort order for multi-sort (default: newest, lowest, highest, relevant)
|
||||
multi_sort_threshold: Auto-enable multi-sort if total reviews > this (default: 1000)
|
||||
close_enough_pct: Stop retrying if we have this % of total reviews (default: 95.0)
|
||||
initial_sort: Initial sort order to use (default: newest). Used when retrying with a different sort order.
|
||||
|
||||
Returns:
|
||||
dict with reviews list and metadata
|
||||
@@ -1381,8 +1382,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
log.info('browser', "Sort button found")
|
||||
break
|
||||
|
||||
# Track bot detection - if sort button hidden, Google likely detected bot
|
||||
bot_detected = not sort_found
|
||||
if not sort_found:
|
||||
log.warn('browser', "Sort button not found after waiting, continuing without sorting")
|
||||
log.warn('browser', "Sort button not found after waiting, continuing without sorting (bot detection likely)")
|
||||
|
||||
# Sort by specified order (default: newest)
|
||||
target_sort = initial_sort or SORT_NEWEST
|
||||
@@ -1815,6 +1818,71 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
}
|
||||
text = longestText;
|
||||
|
||||
// OWNER RESPONSE: Find by "Response from the owner" text anchor
|
||||
var ownerResponse = null;
|
||||
var ownerSpan = null;
|
||||
var cardSpans = card.querySelectorAll('span');
|
||||
for (var k = 0; k < cardSpans.length; k++) {
|
||||
if (cardSpans[k].textContent.trim() === 'Response from the owner') {
|
||||
ownerSpan = cardSpans[k];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ownerSpan) {
|
||||
// Navigate: span -> header div -> container div
|
||||
var headerDiv = ownerSpan.closest('div');
|
||||
var respContainer = headerDiv ? headerDiv.parentElement : null;
|
||||
|
||||
if (respContainer) {
|
||||
// Click expand button if exists and not expanded
|
||||
var expandBtn = respContainer.querySelector('button[aria-label="See more"]');
|
||||
if (expandBtn && expandBtn.getAttribute('aria-expanded') !== 'true') {
|
||||
expandBtn.click();
|
||||
}
|
||||
|
||||
// Get timestamp from header spans
|
||||
var respTimestamp = '';
|
||||
var headerSpans = headerDiv.querySelectorAll('span');
|
||||
for (var m = 0; m < headerSpans.length; m++) {
|
||||
var spanTxt = headerSpans[m].textContent.trim();
|
||||
if (spanTxt.match(/ago$/i)) {
|
||||
respTimestamp = spanTxt;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Get response text from direct child div[lang]
|
||||
var respText = '';
|
||||
var langDivs = respContainer.children;
|
||||
for (var m = 0; m < langDivs.length; m++) {
|
||||
if (langDivs[m].tagName === 'DIV' && langDivs[m].hasAttribute('lang')) {
|
||||
respText = langDivs[m].textContent.trim();
|
||||
respText = respText.replace(/(More|Less)$/, '').trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: find longest text div that's not the header
|
||||
if (!respText) {
|
||||
for (var m = 0; m < langDivs.length; m++) {
|
||||
if (langDivs[m].tagName === 'DIV') {
|
||||
var divTxt = langDivs[m].textContent.trim();
|
||||
if (divTxt.includes('Response from the owner')) continue;
|
||||
divTxt = divTxt.replace(/(More|Less)$/, '').trim();
|
||||
if (divTxt.length > respText.length) {
|
||||
respText = divTxt;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (respText) {
|
||||
ownerResponse = {text: respText, timestamp: respTimestamp};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (author && rating >= 1 && rating <= 5) {
|
||||
results.push({
|
||||
id: rid,
|
||||
@@ -1823,6 +1891,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
text: text,
|
||||
rating: rating,
|
||||
timestamp: timestamp,
|
||||
owner_response: ownerResponse,
|
||||
source: 'dom'
|
||||
});
|
||||
}
|
||||
@@ -2198,6 +2267,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
topics_inferred_count += 1
|
||||
log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
|
||||
|
||||
# Include business info captured from Overview page
|
||||
business_info = business_info_cache[0] or {}
|
||||
|
||||
return {
|
||||
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
|
||||
"total": grand_total,
|
||||
@@ -2209,10 +2281,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
"metrics_history": metrics_history, # For crash detection
|
||||
"start_time": start_time, # For crash report elapsed time
|
||||
"session_fingerprint": session_fingerprint, # Browser fingerprint for bot detection analysis
|
||||
"bot_detected": bot_detected if 'bot_detected' in dir() else False, # True if sort button was hidden
|
||||
"initial_sort_used": target_sort if 'target_sort' in dir() else SORT_NEWEST, # Sort order used for first pass
|
||||
"multi_sort": {
|
||||
"enabled": should_multi_sort if 'should_multi_sort' in dir() else False,
|
||||
"completed_sorts": completed_sorts if 'completed_sorts' in dir() else [SORT_NEWEST],
|
||||
"first_pass_count": first_pass_count if 'first_pass_count' in dir() else grand_total
|
||||
},
|
||||
# Business info captured from Google Maps page
|
||||
"business_info": {
|
||||
"name": business_info.get("name"),
|
||||
"category": business_info.get("category"),
|
||||
"address": business_info.get("address"),
|
||||
"rating": business_info.get("rating")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2220,7 +2301,8 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999999,
|
||||
progress_callback=None, driver=None, return_driver: bool = False,
|
||||
log_capture: LogCapture = None, flush_callback=None, validation_only: bool = False,
|
||||
browser_fingerprint: dict = None):
|
||||
browser_fingerprint: dict = None, initial_sort: str = None,
|
||||
sort_strategy: str = SORT_AUTO, max_reviews: int = None):
|
||||
"""
|
||||
Production-compatible wrapper for scrape_reviews.
|
||||
Matches the API expected by job_manager.py.
|
||||
@@ -2240,6 +2322,10 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
- timezone: string (e.g., "Europe/Madrid")
|
||||
- language: string (e.g., "en-US")
|
||||
- platform: string (e.g., "MacIntel")
|
||||
initial_sort: Initial sort order to use ("newest", "lowest", "highest", "relevant")
|
||||
Used when retrying with a different sort strategy
|
||||
sort_strategy: Sort strategy ("auto", "multi", "single", or specific sort)
|
||||
max_reviews: Maximum reviews to collect (for testing). None (the default) = effectively unlimited
|
||||
|
||||
Returns:
|
||||
Dictionary with: reviews, count, total_reviews, time, success, error, driver, logs
|
||||
@@ -2329,13 +2415,15 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
result = scrape_reviews(
|
||||
driver=driver,
|
||||
url=url,
|
||||
max_reviews=999999, # Effectively unlimited
|
||||
max_reviews=max_reviews if max_reviews else 999999, # Unlimited by default, or custom limit for testing
|
||||
timeout_no_new=15,
|
||||
flush_callback=internal_flush,
|
||||
flush_batch_size=100, # Smaller batches for more frequent progress
|
||||
log_capture=log_capture,
|
||||
progress_callback=progress_callback, # Pass through for real-time log updates
|
||||
validation_only=validation_only # Return early if just validating
|
||||
validation_only=validation_only, # Return early if just validating
|
||||
sort_strategy=sort_strategy, # Sort strategy (auto, multi, single)
|
||||
initial_sort=initial_sort # Initial sort order for retry with different sort
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
@@ -2350,7 +2438,13 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
|
||||
"error": None,
|
||||
"logs": result.get("logs", []),
|
||||
"review_topics": result.get("review_topics", []), # Topic filters with mention counts
|
||||
"session_fingerprint": result.get("session_fingerprint") # Browser fingerprint for bot detection
|
||||
"session_fingerprint": result.get("session_fingerprint"), # Browser fingerprint for bot detection
|
||||
# Tracking info for retry strategy
|
||||
"bot_detected": result.get("bot_detected", False), # True if sort button was hidden by Google
|
||||
"initial_sort_used": result.get("initial_sort_used", "newest"), # Sort order used
|
||||
"multi_sort": result.get("multi_sort", {}), # Multi-sort completion info
|
||||
# Business info captured from Google Maps page
|
||||
"business_info": result.get("business_info", {})
|
||||
}
|
||||
|
||||
# Include validation_info if in validation_only mode
|
||||
|
||||
Reference in New Issue
Block a user