Full text + deduplication: API parser + More button expansion

- Fix API parser to use the correct Google Maps response structure:
  - Review ID at [0], Author at [1][4][5][0], Rating at [2][0][0]
  - Text at [2][15][0][0], Timestamp at [1][6]
- Use review_id as the key for both API and DOM reviews to avoid duplicates
- Prefer API data (original language, full text)
- Expand "More" buttons before sorting and during the scroll loop
- Results: 246/247 reviews with full text (99.6%), up from 36/247 before

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 13:09:40 +00:00
parent b4fae38027
commit 7abff25dc6
1 changed files with 108 additions and 24 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:
 def extract_reviews_from_api_body(body: str) -> list:
-    """Extract reviews from API response body."""
+    """Extract reviews from API response body using correct Google Maps structure."""
    reviews = []
    try:
        # Remove )]}' prefix
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:
        data = json.loads(body)
-        # Recursively find review arrays
+        # Google Maps API structure: data[2] contains review arrays
-        def find_reviews(obj, depth=0):
+        # Each review: data[2][X][0] where:
-            if depth > 12:
+        #   Author: [1][4][5][0]
-                return
+        #   Rating: [2][0][0]
-            if isinstance(obj, list):
+        #   Text:   [2][15][0][0]
-                # Check if this looks like a review
+        #   Time:   [1][6]
-                if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int):
+        if not isinstance(data, list) or len(data) < 3:
-                    if 1 <= obj[4] <= 5:
+            return reviews
                        rev = parse_api_review(obj)
                        if rev and rev["author"]:
                            reviews.append(rev)
                        return
                for item in obj:
                    find_reviews(item, depth + 1)
            elif isinstance(obj, dict):
                for v in obj.values():
                    find_reviews(v, depth + 1)
-        find_reviews(data)
+        reviews_area = data[2]
        if not isinstance(reviews_area, list):
            return reviews
        for item in reviews_area:
            try:
                if not isinstance(item, list) or len(item) < 1:
                    continue
                review_data = item[0]
                if not isinstance(review_data, list) or len(review_data) < 3:
                    continue
                # Extract fields using correct paths
                review_id = ""
                author = ""
                rating = 0
                text = ""
                timestamp = ""
                # Review ID: [0] - same format as DOM's data-review-id
                try:
                    review_id = review_data[0]
                except (IndexError, TypeError):
                    pass
                # Author: [1][4][5][0]
                try:
                    author = review_data[1][4][5][0]
                except (IndexError, TypeError):
                    pass
                # Rating: [2][0][0]
                try:
                    rating = review_data[2][0][0]
                except (IndexError, TypeError):
                    pass
                # Text: [2][15][0][0]
                try:
                    text = review_data[2][15][0][0]
                except (IndexError, TypeError):
                    pass
                # Timestamp: [1][6]
                try:
                    timestamp = review_data[1][6]
                except (IndexError, TypeError):
                    pass
                # Validate and add (include review_id for deduplication)
                if author and isinstance(rating, int) and 1 <= rating <= 5:
                    reviews.append({
                        "review_id": review_id,
                        "author": author,
                        "text": text or "",
                        "rating": rating,
                        "timestamp": timestamp or "",
                        "source": "api"
                    })
            except:
                continue
    except:
        pass
    return reviews
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    except:
        pass
    # EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
    # This batch-clicks all "More" buttons at once (fast, no waiting per button)
    try:
        expanded = driver.execute_script("""
            var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
            var count = 0;
            for (var i = 0; i < buttons.length; i++) {
                if (buttons[i].textContent.trim() === 'More') {
                    buttons[i].click();
                    count++;
                }
            }
            return count;
        """)
        if expanded > 0:
            print(f"  📝 Expanded {expanded} truncated reviews")
    except:
        pass
    # Block images to speed up scrolling (use CDP)
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        cycle_delta = t0 - cycle_start
        cycle_start = t0
-        # Collect from API (doesn't affect scroll)
+        # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
        # Use review_id as key to avoid duplicates with DOM
        t1 = time.time()
        for rev in get_api_reviews():
-            key = f"api_{rev['author'][:20]}_{rev['rating']}"
+            rid = rev.get('review_id', '')
-            if key not in seen_ids:
+            if rid and rid not in seen_ids:
-                reviews[key] = rev
+                reviews[rid] = rev
-                seen_ids.add(key)
+                seen_ids.add(rid)
        api_time = time.time() - t1
        # Expand any new "More" buttons for full text (batch click, fast)
        try:
            driver.execute_script("""
                var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
                for (var i = 0; i < buttons.length; i++) {
                    if (buttons[i].textContent.trim() === 'More') {
                        buttons[i].click();
                    }
                }
            """)
        except:
            pass
        # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
        # This survives Google's CSS class name changes
        # Also removes separators from previously-hidden cards to keep DOM light