Full text + deduplication: API parser + More button expansion

- Fix API parser to use correct Google Maps response structure
- Review ID at [0], Author at [1][4][5][0], Rating at [2][0][0]
- Text at [2][15][0][0], Timestamp at [1][6]
- Use review_id as key for both API and DOM to avoid duplicates
- Prefer API data (original language, full text)
- Expand "More" buttons before sorting and during scroll loop
- Results: 246/247 full text (99.6%), up from 36/247 before

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 13:09:40 +00:00
parent b4fae38027
commit 7abff25dc6
1 changed files with 108 additions and 24 deletions
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:


 def extract_reviews_from_api_body(body: str) -> list:
-    """Extract reviews from API response body."""
+    """Extract reviews from API response body using correct Google Maps structure."""
    reviews = []
    try:
        # Remove )]}' prefix
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:

        data = json.loads(body)

-        # Recursively find review arrays
-        def find_reviews(obj, depth=0):
-            if depth > 12:
-                return
-            if isinstance(obj, list):
-                # Check if this looks like a review
-                if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int):
-                    if 1 <= obj[4] <= 5:
-                        rev = parse_api_review(obj)
-                        if rev and rev["author"]:
-                            reviews.append(rev)
-                        return
-                for item in obj:
-                    find_reviews(item, depth + 1)
-            elif isinstance(obj, dict):
-                for v in obj.values():
-                    find_reviews(v, depth + 1)
+        # Google Maps API structure: data[2] contains review arrays
+        # Each review: data[2][X][0] where:
+        #   Author: [1][4][5][0]
+        #   Rating: [2][0][0]
+        #   Text:   [2][15][0][0]
+        #   Time:   [1][6]
+        if not isinstance(data, list) or len(data) < 3:
+            return reviews

-        find_reviews(data)
+        reviews_area = data[2]
+        if not isinstance(reviews_area, list):
+            return reviews
+
+        for item in reviews_area:
+            try:
+                if not isinstance(item, list) or len(item) < 1:
+                    continue
+                review_data = item[0]
+                if not isinstance(review_data, list) or len(review_data) < 3:
+                    continue
+
+                # Extract fields using correct paths
+                review_id = ""
+                author = ""
+                rating = 0
+                text = ""
+                timestamp = ""
+
+                # Review ID: [0] - same format as DOM's data-review-id
+                try:
+                    review_id = review_data[0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Author: [1][4][5][0]
+                try:
+                    author = review_data[1][4][5][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Rating: [2][0][0]
+                try:
+                    rating = review_data[2][0][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Text: [2][15][0][0]
+                try:
+                    text = review_data[2][15][0][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Timestamp: [1][6]
+                try:
+                    timestamp = review_data[1][6]
+                except (IndexError, TypeError):
+                    pass
+
+                # Validate and add (include review_id for deduplication)
+                if author and isinstance(rating, int) and 1 <= rating <= 5:
+                    reviews.append({
+                        "review_id": review_id,
+                        "author": author,
+                        "text": text or "",
+                        "rating": rating,
+                        "timestamp": timestamp or "",
+                        "source": "api"
+                    })
+            except:
+                continue
    except:
        pass
    return reviews
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
    except:
        pass

+    # EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
+    # This batch-clicks all "More" buttons at once (fast, no waiting per button)
+    try:
+        expanded = driver.execute_script("""
+            var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
+            var count = 0;
+            for (var i = 0; i < buttons.length; i++) {
+                if (buttons[i].textContent.trim() === 'More') {
+                    buttons[i].click();
+                    count++;
+                }
+            }
+            return count;
+        """)
+        if expanded > 0:
+            print(f"  📝 Expanded {expanded} truncated reviews")
+    except:
+        pass
+
    # Block images to speed up scrolling (use CDP)
    try:
        driver.execute_cdp_cmd('Network.setBlockedURLs', {
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
        cycle_delta = t0 - cycle_start
        cycle_start = t0

-        # Collect from API (doesn't affect scroll)
+        # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
+        # Use review_id as key to avoid duplicates with DOM
        t1 = time.time()
        for rev in get_api_reviews():
-            key = f"api_{rev['author'][:20]}_{rev['rating']}"
-            if key not in seen_ids:
-                reviews[key] = rev
-                seen_ids.add(key)
+            rid = rev.get('review_id', '')
+            if rid and rid not in seen_ids:
+                reviews[rid] = rev
+                seen_ids.add(rid)
        api_time = time.time() - t1

+        # Expand any new "More" buttons for full text (batch click, fast)
+        try:
+            driver.execute_script("""
+                var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
+                for (var i = 0; i < buttons.length; i++) {
+                    if (buttons[i].textContent.trim() === 'More') {
+                        buttons[i].click();
+                    }
+                }
+            """)
+        except:
+            pass
+
        # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
        # This survives Google's CSS class name changes
        # Also removes separators from previously-hidden cards to keep DOM light