Wave 1: Add StructuredLogger and review topics inference

Task #1: StructuredLogger class (modules/structured_logger.py)
- LogEntry dataclass with timestamp, level, category, metrics, network
- Thread-safe storage with automatic pruning at 10k entries
- Level methods: debug(), info(), warn(), error(), fatal()
- Backward-compatible log() method for migration
- Filter methods: get_logs_by_category(), get_logs_by_level()

Task #16: Review topics inference (modules/scraper_clean.py)
- get_topic_variants(): Generate word variants (plural, -ing, -ed forms)
- infer_review_topics(): Match review text to topic keywords
- Word boundary matching to avoid false positives
- Integrated into scrape_reviews() to add 'topics' field to reviews

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 11:27:32 +00:00
parent 3da243be79
commit 313e32f358
2 changed files with 384 additions and 0 deletions

View File

@@ -9,9 +9,131 @@ import json
import time
import threading
from datetime import datetime
from typing import List
from selenium.webdriver.common.by import By
def get_topic_variants(topic: str) -> List[str]:
    """
    Generate common spelling variants of a topic word for matching.

    The rules are purely suffix-based (no dictionary lookup), so the result
    deliberately over-generates: some variants are not real words
    (e.g. "cutt" for "cutting", "serviceing" for "service").

    Handles:
    - Singular/plural forms (-s, -es)
    - Verb forms (-ing, -ed)
    - Doubled final consonants (cutting -> cut, cut -> cutting)
    - Silent "e" (services -> service, service -> servicing)

    Args:
        topic: The topic word/phrase to generate variants for

    Returns:
        List of unique, lower-cased variant strings. The normalized topic
        itself comes first; the remaining variants follow in alphabetical
        order. An empty or whitespace-only topic yields an empty list.

    Example:
        >>> get_topic_variants("cutting")
        ['cutting', 'cut', 'cuts', 'cutt', 'cutts']
        >>> get_topic_variants("service")
        ['service', 'serviceing', 'services', 'servicing']
        >>> get_topic_variants("services")
        ['services', 'servic', 'service', 'servicing']
    """
    if not topic:
        return []

    topic = topic.lower().strip()
    # Re-check after stripping: a whitespace-only topic would otherwise
    # produce the bogus variants "", "s" and "ing".
    if not topic:
        return []

    variants = {topic}  # Use set to avoid duplicates

    # Handle -ing forms (cutting -> cutt, cutts)
    if topic.endswith("ing"):
        base = topic[:-3]  # Remove -ing
        if base:
            variants.update((base, base + "s"))
            # Undo consonant doubling (cutt -> cut, cuts)
            if len(base) >= 2 and base[-1] == base[-2]:
                undoubled = base[:-1]
                variants.update((undoubled, undoubled + "s"))

    # Handle -s/-es plural forms
    if topic.endswith("es") and len(topic) > 2:
        stem = topic[:-2]  # Remove -es (boxes -> box)
        variants.update((stem, stem + "ing"))
        # Many -es words are an "-e" stem plus "s" (services -> service);
        # stripping both letters alone would only give "servic".
        variants.add(topic[:-1])
    elif topic.endswith("s") and len(topic) > 1 and not topic.endswith("ss"):
        stem = topic[:-1]  # Remove -s
        variants.update((stem, stem + "ing"))

    # Handle -ed forms (colored -> color)
    if topic.endswith("ed") and len(topic) > 2:
        base = topic[:-2]
        if base:
            variants.update((base, base + "s", base + "ing"))
            # Undo consonant doubling (stopped -> stopp -> stop)
            if len(base) >= 2 and base[-1] == base[-2]:
                variants.add(base[:-1])

    # Add common forms if base word (no suffix detected)
    if not topic.endswith(("ing", "ed", "s")):
        variants.update((topic + "s", topic + "ing"))
        # Handle consonant doubling for -ing (cut -> cutting)
        if len(topic) >= 2 and topic[-1] not in "aeiouwy":
            variants.add(topic + topic[-1] + "ing")
        # Silent "e" is dropped before -ing (service -> servicing). Words
        # ending in ee/ie/oe/ye keep or change the vowel, so skip those.
        if (
            len(topic) > 2
            and topic.endswith("e")
            and not topic.endswith(("ee", "ie", "oe", "ye"))
        ):
            variants.add(topic[:-1] + "ing")

    # Deterministic order: set iteration order varies between runs.
    variants.discard(topic)
    return [topic] + sorted(variants)
def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]:
    """
    Match review text against extracted topic keywords.

    Each topic is expanded with get_topic_variants() and counts as matched
    when any variant occurs in the lower-cased review text as a whole
    word/phrase, i.e. not directly preceded or followed by a word character.

    Args:
        review_text: The review text to analyze
        topics: List of topic dicts, e.g., [{"topic": "cutting", "count": 3}].
            Entries with a missing or empty "topic" value are ignored.

    Returns:
        List of matched topic names, spelled as they appear in ``topics``
        and in the same order. Empty list when either input is empty/None.

    Example:
        >>> topics = [{"topic": "hair salon", "count": 4}, {"topic": "cutting", "count": 3}]
        >>> text = "Great haircut! The cutting was professional."
        >>> infer_review_topics(text, topics)
        ['cutting']
    """
    # Handle empty/None inputs gracefully
    if not review_text or not topics:
        return []

    review_text_lower = review_text.lower()
    matched_topics = []

    for topic_dict in topics:
        topic = topic_dict.get("topic", "")
        if not topic:
            continue

        # Expand the normalized topic and drop any empty variant, which
        # would otherwise match everywhere.
        variants = [v for v in get_topic_variants(topic.lower().strip()) if v]
        if not variants:
            continue

        # One alternation per topic instead of one search per variant.
        # Lookarounds are used rather than \b: they behave the same for
        # variants that start and end with word characters ("cut" still
        # won't match inside "execute" or "cutlery"), but unlike \b they
        # also allow a variant that starts or ends with punctuation
        # (e.g. "c++") to match.
        alternation = "|".join(re.escape(v) for v in variants)
        pattern = r"(?<!\w)(?:" + alternation + r")(?!\w)"
        if re.search(pattern, review_text_lower):
            matched_topics.append(topic)  # Use original topic name

    return matched_topics
class LogCapture:
"""Captures scraper logs for storage and viewing."""
@@ -1138,6 +1260,18 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
else:
log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})")
# Infer topics for each review if review_topics is available
if review_topics:
log.info(f"🏷️ Inferring topics for {len(review_list)} reviews...")
topics_inferred_count = 0
for review in review_list:
review_text = review.get("text", "")
matched = infer_review_topics(review_text, review_topics)
review["topics"] = matched
if matched:
topics_inferred_count += 1
log.info(f"🏷️ Topics inferred for {topics_inferred_count}/{len(review_list)} reviews")
return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
"total": grand_total,