diff --git a/modules/scraper_clean.py b/modules/scraper_clean.py
index 4abef09..65ef441 100644
--- a/modules/scraper_clean.py
+++ b/modules/scraper_clean.py
@@ -56,7 +56,7 @@ def parse_api_review(raw: list) -> dict:
 
 
 def extract_reviews_from_api_body(body: str) -> list:
-    """Extract reviews from API response body."""
+    """Extract review dicts from a Google Maps API response body via fixed index paths."""
     reviews = []
     try:
         # Remove )]}' prefix
@@ -65,25 +65,76 @@ def extract_reviews_from_api_body(body: str) -> list:
 
         data = json.loads(body)
 
-        # Recursively find review arrays
-        def find_reviews(obj, depth=0):
-            if depth > 12:
-                return
-            if isinstance(obj, list):
-                # Check if this looks like a review
-                if len(obj) > 4 and isinstance(obj[0], str) and isinstance(obj[4], int):
-                    if 1 <= obj[4] <= 5:
-                        rev = parse_api_review(obj)
-                        if rev and rev["author"]:
-                            reviews.append(rev)
-                        return
-                for item in obj:
-                    find_reviews(item, depth + 1)
-            elif isinstance(obj, dict):
-                for v in obj.values():
-                    find_reviews(v, depth + 1)
+        # Google Maps API structure: data[2] contains the review arrays
+        # Each review is data[2][X][0]; paths below are relative to it ([0] is the review ID):
+        #   Author: [1][4][5][0]
+        #   Rating: [2][0][0]
+        #   Text: [2][15][0][0]
+        #   Timestamp: [1][6]
+        if not isinstance(data, list) or len(data) < 3:
+            return reviews
 
-        find_reviews(data)
+        reviews_area = data[2]
+        if not isinstance(reviews_area, list):
+            return reviews
+
+        for item in reviews_area:
+            try:
+                if not isinstance(item, list) or len(item) < 1:
+                    continue
+                review_data = item[0]
+                if not isinstance(review_data, list) or len(review_data) < 3:
+                    continue
+
+                # Defaults used when a path below is missing from this entry
+                review_id = ""
+                author = ""
+                rating = 0
+                text = ""
+                timestamp = ""
+
+                # Review ID: [0] - same format as DOM's data-review-id
+                try:
+                    review_id = review_data[0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Author: [1][4][5][0]
+                try:
+                    author = review_data[1][4][5][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Rating: [2][0][0]
+                try:
+                    rating = review_data[2][0][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Text: [2][15][0][0]
+                try:
+                    text = review_data[2][15][0][0]
+                except (IndexError, TypeError):
+                    pass
+
+                # Timestamp: [1][6]
+                try:
+                    timestamp = review_data[1][6]
+                except (IndexError, TypeError):
+                    pass
+
+                # Validate and add; review_id is kept a str because callers use it as a dedup key
+                if isinstance(author, str) and author and isinstance(rating, int) and 1 <= rating <= 5:
+                    reviews.append({
+                        "review_id": review_id if isinstance(review_id, str) else "",
+                        "author": author,
+                        "text": text if isinstance(text, str) else "",
+                        "rating": rating,
+                        "timestamp": timestamp or "",
+                        "source": "api",
+                    })
+            except Exception:  # skip a malformed entry, keep parsing the rest
+                continue
     except:
         pass
     return reviews
@@ -416,6 +467,25 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
     except:
         pass
 
+    # EXPAND ALL "MORE" BUTTONS for full text on pre-rendered reviews
+    # One script call clicks every button labelled exactly 'More' (English UI only; class-name selector is fragile)
+    try:
+        expanded = driver.execute_script("""
+            var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
+            var count = 0;
+            for (var i = 0; i < buttons.length; i++) {
+                if (buttons[i].textContent.trim() === 'More') {
+                    buttons[i].click();
+                    count++;
+                }
+            }
+            return count;
+        """)
+        if expanded:
+            print(f" 📝 Expanded {expanded} truncated reviews")
+    except Exception:  # best-effort: expansion failure must not abort the scrape
+        pass
+
     # Block images to speed up scrolling (use CDP)
     try:
         driver.execute_cdp_cmd('Network.setBlockedURLs', {
@@ -507,15 +577,29 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
         cycle_delta = t0 - cycle_start
         cycle_start = t0
 
-        # Collect from API (doesn't affect scroll)
+        # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
+        # Key by review_id to avoid duplicates with DOM entries; API reviews without a string ID are skipped
         t1 = time.time()
         for rev in get_api_reviews():
-            key = f"api_{rev['author'][:20]}_{rev['rating']}"
-            if key not in seen_ids:
-                reviews[key] = rev
-                seen_ids.add(key)
+            rid = rev.get('review_id', '')
+            if isinstance(rid, str) and rid and rid not in seen_ids:
+                reviews[rid] = rev
+                seen_ids.add(rid)
         api_time = time.time() - t1
 
+        # Expand any new "More" buttons for full text (batch click; relies on obfuscated class names)
+        try:
+            driver.execute_script("""
+                var buttons = document.querySelectorAll('button.w8nwRe.kyuRq');
+                for (var i = 0; i < buttons.length; i++) {
+                    if (buttons[i].textContent.trim() === 'More') {
+                        buttons[i].click();
+                    }
+                }
+            """)
+        except Exception:  # best-effort: never let expansion break the scroll loop
+            pass
+
         # Parse reviews using ROBUST selectors (no class names - uses data/aria attributes)
         # This survives Google's CSS class name changes
         # Also removes separators from previously-hidden cards to keep DOM light