Release Google Reviews Scraper Pro v1.0.0 (2025)

Initial release with multi-language support, MongoDB integration, image handling, URL replacement, and robust error handling. Includes detailed documentation, usage examples, and recommended usage guidelines. Built to effectively handle Google's 2025 interface changes.
This commit is contained in:
George Khananaev
2025-04-24 22:12:07 +07:00
commit 5bbaf455d8
14 changed files with 4032 additions and 0 deletions

84
modules/models.py Normal file
View File

@@ -0,0 +1,84 @@
"""
Data models for Google Maps Reviews Scraper.
"""
import re
from dataclasses import dataclass, field
from selenium.webdriver.remote.webelement import WebElement
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
@dataclass
class RawReview:
"""
Data class representing a raw review extracted from Google Maps.
"""
id: str = ""
author: str = ""
rating: float = 0.0
date: str = ""
lang: str = "und"
text: str = ""
likes: int = 0
photos: list[str] = field(default_factory=list)
profile: str = ""
avatar: str = "" # URL to profile picture
owner_date: str = ""
owner_text: str = ""
review_date: str = "" # ISO format date
# CSS Selectors for review elements
MORE_BTN = "button.kyuRq"
LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
PHOTO_BTN = "button.Tya61d"
OWNER_RESP = "div.CDe7pd"
@classmethod
def from_card(cls, card: WebElement) -> "RawReview":
"""Factory method to create a RawReview from a WebElement"""
# expand "More" - non-blocking approach
for b in try_find(card, cls.MORE_BTN, all=True):
try:
b.click()
except Exception:
pass
rid = card.get_attribute("data-review-id") or ""
author = first_text(card, 'div[class*="d4r55"]')
profile = first_attr(card, 'button[data-review-id]', "data-href")
avatar = first_attr(card, 'button[data-review-id] img', "src")
label = first_attr(card, 'span[role="img"]', "aria-label")
num = re.search(r"[\d\.]+", label.replace(",", ".")) if label else None
rating = float(num.group()) if num else 0.0
date = first_text(card, 'span[class*="rsqaWe"]')
# Parse the date string to ISO format
review_date = parse_date_to_iso(date)
text = ""
for sel in ('span[jsname="bN97Pc"]',
'span[jsname="fbQN7e"]',
'div.MyEned span.wiI7pd'):
text = first_text(card, sel)
if text: break
lang = detect_lang(text)
likes = 0
if (btn := try_find(card, cls.LIKE_BTN)):
likes = safe_int(btn[0].text or btn[0].get_attribute("aria-label"))
photos: list[str] = []
for btn in try_find(card, cls.PHOTO_BTN, all=True):
if (m := re.search(r'url\("([^"]+)"', btn.get_attribute("style") or "")):
photos.append(m.group(1))
owner_date = owner_text = ""
if (box := try_find(card, cls.OWNER_RESP)):
box = box[0]
owner_date = first_text(box, "span.DZSIDd")
owner_text = first_text(box, "div.wiI7pd")
return cls(rid, author, rating, date, lang, text, likes,
photos, profile, avatar, owner_date, owner_text, review_date)