Files
whyrating-engine-legacy/modules/models.py
George Khananaev 5bbaf455d8 Release Google Reviews Scraper Pro v1.0.0 (2025)
Initial release with multi-language support, MongoDB integration, image handling, URL replacement, and robust error handling. Includes detailed documentation, usage examples, and recommended usage guidelines. Built to effectively handle Google's 2025 interface changes.
2025-04-24 22:12:07 +07:00

85 lines
2.8 KiB
Python

"""
Data models for Google Maps Reviews Scraper.
"""
import re
from dataclasses import dataclass, field
from selenium.webdriver.remote.webelement import WebElement
from modules.utils import (try_find, first_text, first_attr, safe_int, detect_lang, parse_date_to_iso)
@dataclass
class RawReview:
    """
    Data class representing a raw review extracted from Google Maps.

    Instances are normally built with :meth:`from_card`, which reads every
    field out of a single review card element. All fields have defaults, so
    ``RawReview()`` yields an empty record.
    """
    id: str = ""  # value of the card's "data-review-id" attribute
    author: str = ""
    rating: float = 0.0  # 0.0 when no number could be read from the label
    date: str = ""  # date text as read from the card
    lang: str = "und"
    text: str = ""
    likes: int = 0
    photos: list[str] = field(default_factory=list)
    profile: str = ""
    avatar: str = ""  # URL to profile picture
    owner_date: str = ""
    owner_text: str = ""
    review_date: str = ""  # ISO format date

    # CSS Selectors for review elements
    MORE_BTN = "button.kyuRq"
    LIKE_BTN = 'button[jsaction*="toggleThumbsUp" i]'
    PHOTO_BTN = "button.Tya61d"
    OWNER_RESP = "div.CDe7pd"

    # A number must start with a digit; a bare "." or a trailing "." is
    # never part of the match, so float() cannot fail on what it returns.
    _RATING_RE = re.compile(r"\d+(?:\.\d+)?")

    @classmethod
    def _parse_rating(cls, label: "str | None") -> float:
        """
        Return the first number found in a rating label.

        Commas are treated as decimal separators ("4,5" reads as 4.5).
        Returns 0.0 when the label is empty/None or contains no digits.
        """
        if not label:
            return 0.0
        found = cls._RATING_RE.search(label.replace(",", "."))
        return float(found.group()) if found else 0.0

    @classmethod
    def from_card(cls, card: "WebElement") -> "RawReview":
        """
        Factory method to create a RawReview from a WebElement.

        Args:
            card: The review card element. Its "data-review-id" attribute
                becomes the review id; the remaining fields are looked up
                inside the card with CSS selectors.

        Returns:
            A RawReview populated from the card. The rating is 0.0 when no
            number can be read from the rating label, and likes is 0 when
            no like button is found.
        """
        # expand "More" - non-blocking approach: a button that cannot be
        # clicked is skipped on purpose so the rest of the card still parses.
        for more_btn in try_find(card, cls.MORE_BTN, all=True):
            try:
                more_btn.click()
            except Exception:
                pass

        rid = card.get_attribute("data-review-id") or ""
        author = first_text(card, 'div[class*="d4r55"]')
        profile = first_attr(card, 'button[data-review-id]', "data-href")
        avatar = first_attr(card, 'button[data-review-id] img', "src")

        label = first_attr(card, 'span[role="img"]', "aria-label")
        rating = cls._parse_rating(label)

        date = first_text(card, 'span[class*="rsqaWe"]')
        # Parse the date string to ISO format
        review_date = parse_date_to_iso(date)

        # Try each known text selector in order; stop at the first that
        # yields non-empty text.
        text = ""
        for sel in ('span[jsname="bN97Pc"]',
                    'span[jsname="fbQN7e"]',
                    'div.MyEned span.wiI7pd'):
            text = first_text(card, sel)
            if text:
                break
        lang = detect_lang(text)

        likes = 0
        if (like_btns := try_find(card, cls.LIKE_BTN)):
            # Prefer the visible text; fall back to the aria-label.
            likes = safe_int(
                like_btns[0].text or like_btns[0].get_attribute("aria-label")
            )

        # Photo URLs are read from the url("...") part of each photo
        # button's inline style attribute.
        photos: list[str] = []
        for photo_btn in try_find(card, cls.PHOTO_BTN, all=True):
            style = photo_btn.get_attribute("style") or ""
            if (m := re.search(r'url\("([^"]+)"', style)):
                photos.append(m.group(1))

        owner_date = owner_text = ""
        if (boxes := try_find(card, cls.OWNER_RESP)):
            box = boxes[0]
            owner_date = first_text(box, "span.DZSIDd")
            owner_text = first_text(box, "div.wiI7pd")

        # Keyword arguments keep each value tied to its field even if the
        # field order of the dataclass changes.
        return cls(
            id=rid,
            author=author,
            rating=rating,
            date=date,
            lang=lang,
            text=text,
            likes=likes,
            photos=photos,
            profile=profile,
            avatar=avatar,
            owner_date=owner_date,
            owner_text=owner_text,
            review_date=review_date,
        )