Fix clean scraper: specific selectors, consent reload, DOM parsing
- Use div.jftiEf[data-review-id] selector to exclude button elements
- Reload original URL after consent (prevents URL corruption)
- Parse full DOM data after scrolling stops
- Deduplicate API reviews by author match
- Remove slow "More" button clicking for speed

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -125,23 +125,8 @@ def parse_dom_review(card) -> dict:
|
||||
except:
|
||||
pass
|
||||
|
||||
# Click "More" button to expand text if truncated
|
||||
try:
|
||||
more_btn = card.find_element(By.CSS_SELECTOR, "button.kyuRq")
|
||||
if more_btn.is_displayed():
|
||||
more_btn.click()
|
||||
# Re-read text after expanding
|
||||
for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', '.wiI7pd']:
|
||||
try:
|
||||
text_el = card.find_element(By.CSS_SELECTOR, sel)
|
||||
expanded = text_el.text.strip()
|
||||
if expanded and len(expanded) > len(text):
|
||||
text = expanded
|
||||
break
|
||||
except:
|
||||
pass
|
||||
except:
|
||||
pass
|
||||
# Note: "More" button clicking removed for speed
|
||||
# Full text can be expanded later if needed
|
||||
|
||||
# Timestamp
|
||||
timestamp = ""
|
||||
@@ -228,7 +213,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
break
|
||||
except:
|
||||
pass
|
||||
time.sleep(2)
|
||||
# Reload original URL after consent (redirect can corrupt URL)
|
||||
print(" Reloading after consent...")
|
||||
driver.get(url)
|
||||
time.sleep(3)
|
||||
|
||||
# Click reviews tab if present (multilingual support)
|
||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||
@@ -411,17 +399,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
reviews[key] = rev
|
||||
|
||||
# Collect review IDs via JavaScript (doesn't affect scroll position!)
|
||||
# Use specific selector to only get actual review cards, not buttons
|
||||
try:
|
||||
review_ids = driver.execute_script("""
|
||||
var ids = [];
|
||||
document.querySelectorAll('[data-review-id]').forEach(function(el) {
|
||||
document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
|
||||
ids.push(el.getAttribute('data-review-id'));
|
||||
});
|
||||
return ids;
|
||||
""")
|
||||
for rid in (review_ids or []):
|
||||
if rid and rid not in reviews:
|
||||
reviews[rid] = {"id": rid, "source": "dom"}
|
||||
reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
|
||||
except:
|
||||
pass
|
||||
|
||||
@@ -456,11 +445,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
||||
stop_scrolling.set()
|
||||
break
|
||||
|
||||
# FINAL PHASE: Parse full review data from DOM (scroll is stopped)
|
||||
print("📝 Parsing full review data...")
|
||||
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
||||
reviews.clear()
|
||||
|
||||
# Parse all DOM cards now that scrolling is done
|
||||
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
|
||||
try:
|
||||
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||
for card in cards:
|
||||
review = parse_dom_review(card)
|
||||
if review and review.get("id"):
|
||||
reviews[review["id"]] = review
|
||||
except Exception as e:
|
||||
print(f" Warning: DOM parse error: {e}")
|
||||
|
||||
# Merge API reviews (only add if not already in DOM)
|
||||
api_added = 0
|
||||
for key, api_rev in api_reviews_collected.items():
|
||||
# Check if this author already exists in DOM reviews
|
||||
author = api_rev.get("author", "")
|
||||
if author and not any(r.get("author") == author for r in reviews.values()):
|
||||
reviews[f"api_{key}"] = api_rev
|
||||
api_added += 1
|
||||
|
||||
# Final results
|
||||
review_list = list(reviews.values())
|
||||
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||
print(f"\n📋 Total: {len(review_list)} reviews (DOM: {dom_count}, API: {api_count})")
|
||||
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||
|
||||
return {
|
||||
"reviews": review_list,
|
||||
|
||||
Reference in New Issue
Block a user