Fix clean scraper: specific selectors, consent reload, DOM parsing
- Use div.jftiEf[data-review-id] selector to exclude button elements
- Reload original URL after consent (prevents URL corruption)
- Parse full DOM data after scrolling stops
- Deduplicate API reviews by author match
- Remove slow "More" button clicking for speed

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -125,23 +125,8 @@ def parse_dom_review(card) -> dict:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Click "More" button to expand text if truncated
|
# Note: "More" button clicking removed for speed
|
||||||
try:
|
# Full text can be expanded later if needed
|
||||||
more_btn = card.find_element(By.CSS_SELECTOR, "button.kyuRq")
|
|
||||||
if more_btn.is_displayed():
|
|
||||||
more_btn.click()
|
|
||||||
# Re-read text after expanding
|
|
||||||
for sel in ['span[jsname="bN97Pc"]', 'span[jsname="fbQN7e"]', '.wiI7pd']:
|
|
||||||
try:
|
|
||||||
text_el = card.find_element(By.CSS_SELECTOR, sel)
|
|
||||||
expanded = text_el.text.strip()
|
|
||||||
if expanded and len(expanded) > len(text):
|
|
||||||
text = expanded
|
|
||||||
break
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Timestamp
|
# Timestamp
|
||||||
timestamp = ""
|
timestamp = ""
|
||||||
@@ -228,7 +213,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
time.sleep(2)
|
# Reload original URL after consent (redirect can corrupt URL)
|
||||||
|
print(" Reloading after consent...")
|
||||||
|
driver.get(url)
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
# Click reviews tab if present (multilingual support)
|
# Click reviews tab if present (multilingual support)
|
||||||
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
review_keywords = ["review", "reseña", "avis", "bewertung", "recensione", "opiniones"]
|
||||||
@@ -411,17 +399,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
reviews[key] = rev
|
reviews[key] = rev
|
||||||
|
|
||||||
# Collect review IDs via JavaScript (doesn't affect scroll position!)
|
# Collect review IDs via JavaScript (doesn't affect scroll position!)
|
||||||
|
# Use specific selector to only get actual review cards, not buttons
|
||||||
try:
|
try:
|
||||||
review_ids = driver.execute_script("""
|
review_ids = driver.execute_script("""
|
||||||
var ids = [];
|
var ids = [];
|
||||||
document.querySelectorAll('[data-review-id]').forEach(function(el) {
|
document.querySelectorAll('div.jftiEf[data-review-id]').forEach(function(el) {
|
||||||
ids.push(el.getAttribute('data-review-id'));
|
ids.push(el.getAttribute('data-review-id'));
|
||||||
});
|
});
|
||||||
return ids;
|
return ids;
|
||||||
""")
|
""")
|
||||||
for rid in (review_ids or []):
|
for rid in (review_ids or []):
|
||||||
if rid and rid not in reviews:
|
if rid and rid not in reviews:
|
||||||
reviews[rid] = {"id": rid, "source": "dom"}
|
reviews[rid] = {"id": rid, "source": "dom", "_needs_parse": True}
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -456,11 +445,36 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
|
|||||||
stop_scrolling.set()
|
stop_scrolling.set()
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# FINAL PHASE: Parse full review data from DOM (scroll is stopped)
|
||||||
|
print("📝 Parsing full review data...")
|
||||||
|
api_reviews_collected = {k: v for k, v in reviews.items() if v.get("source") == "api"}
|
||||||
|
reviews.clear()
|
||||||
|
|
||||||
|
# Parse all DOM cards now that scrolling is done
|
||||||
|
# Use specific selector to only get actual review cards (div.jftiEf), not buttons
|
||||||
|
try:
|
||||||
|
cards = driver.find_elements(By.CSS_SELECTOR, "div.jftiEf[data-review-id]")
|
||||||
|
for card in cards:
|
||||||
|
review = parse_dom_review(card)
|
||||||
|
if review and review.get("id"):
|
||||||
|
reviews[review["id"]] = review
|
||||||
|
except Exception as e:
|
||||||
|
print(f" Warning: DOM parse error: {e}")
|
||||||
|
|
||||||
|
# Merge API reviews (only add if not already in DOM)
|
||||||
|
api_added = 0
|
||||||
|
for key, api_rev in api_reviews_collected.items():
|
||||||
|
# Check if this author already exists in DOM reviews
|
||||||
|
author = api_rev.get("author", "")
|
||||||
|
if author and not any(r.get("author") == author for r in reviews.values()):
|
||||||
|
reviews[f"api_{key}"] = api_rev
|
||||||
|
api_added += 1
|
||||||
|
|
||||||
# Final results
|
# Final results
|
||||||
review_list = list(reviews.values())
|
review_list = list(reviews.values())
|
||||||
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
dom_count = sum(1 for r in review_list if r.get("source") == "dom")
|
||||||
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
api_count = sum(1 for r in review_list if r.get("source") == "api")
|
||||||
print(f"\n📋 Total: {len(review_list)} reviews (DOM: {dom_count}, API: {api_count})")
|
print(f"\n📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"reviews": review_list,
|
"reviews": review_list,
|
||||||
|
|||||||
Reference in New Issue
Block a user