Wave 2: Migrate scraper to StructuredLogger, add crash detection & topic tags

- Task #2: Migrate scraper_clean.py to use StructuredLogger with categories
  (37 log calls with metrics across browser/scraper/network/system)
- Task #4: Add crash_reports table schema and database methods
  (save_crash_report, get_crash_report, get_crash_stats)
- Task #9: Implement crash detection wrapper with metrics sampling
  (get_chrome_memory, get_dom_node_count, classify_crash)
- Task #17: Add topic tags to frontend ReviewAnalytics
  (topic filter UI, tags on cards, topics in modal)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-24 12:17:23 +00:00
parent 313e32f358
commit 9e1bcde981
4 changed files with 526 additions and 74 deletions

View File

@@ -154,6 +154,41 @@ class DatabaseManager:
CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id); CREATE INDEX IF NOT EXISTS idx_webhook_job_id ON webhook_attempts(job_id);
""") """)
# Add session_fingerprint and metrics_history columns to jobs table
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS session_fingerprint JSONB;
""")
await conn.execute("""
ALTER TABLE jobs ADD COLUMN IF NOT EXISTS metrics_history JSONB;
""")
# Create crash_reports table
await conn.execute("""
CREATE TABLE IF NOT EXISTS crash_reports (
crash_id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
job_id UUID REFERENCES jobs(job_id) ON DELETE CASCADE,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
crash_type VARCHAR(50) NOT NULL,
error_message TEXT,
state JSONB NOT NULL,
metrics_history JSONB,
logs_before_crash JSONB,
analysis JSONB,
screenshot_url TEXT,
dom_snapshot_id UUID
);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_job ON crash_reports(job_id);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_type ON crash_reports(crash_type);
""")
await conn.execute("""
CREATE INDEX IF NOT EXISTS idx_crash_reports_created ON crash_reports(created_at DESC);
""")
log.info("Database schema initialized") log.info("Database schema initialized")
# ==================== Job Operations ==================== # ==================== Job Operations ====================
@@ -657,3 +692,150 @@ class DatabaseManager:
INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms) INSERT INTO webhook_attempts (job_id, attempt_number, success, status_code, error_message, response_time_ms)
VALUES ($1, $2, $3, $4, $5, $6) VALUES ($1, $2, $3, $4, $5, $6)
""", job_id, attempt_number, success, status_code, error_message, response_time_ms) """, job_id, attempt_number, success, status_code, error_message, response_time_ms)
# ==================== Crash Reports ====================
async def save_crash_report(self, job_id: str, crash_data: dict) -> str:
    """
    Save a crash report and return the crash_id.

    Args:
        job_id: Job UUID, as a string or a UUID object.
        crash_data: Dictionary containing crash report data:
            - crash_type: Type of crash (required; the column is NOT NULL)
            - error_message: Error message (optional)
            - state: Current state at crash time (stored as ``{}`` when absent)
            - metrics_history: Historical metrics (optional)
            - logs_before_crash: Log entries before crash (optional)
            - analysis: Crash analysis data (optional)
            - screenshot_url: URL to screenshot (optional)
            - dom_snapshot_id: UUID of DOM snapshot, string or UUID (optional)

    Returns:
        UUID of created crash report as string
    """
    def _to_uuid(value):
        # Strings are parsed; anything else (typically an existing UUID
        # object) is handed to the driver untouched. Calling UUID() on a
        # UUID instance raises, so both id fields share this one rule.
        return UUID(value) if isinstance(value, str) else value

    def _to_jsonb(value):
        # Missing or empty optional payloads are stored as SQL NULL.
        return json.dumps(value) if value else None

    async with self.pool.acquire() as conn:
        dom_snapshot_id = crash_data.get('dom_snapshot_id')
        crash_id = await conn.fetchval(
            """
            INSERT INTO crash_reports (
                job_id,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            )
            VALUES ($1, $2, $3, $4::jsonb, $5::jsonb, $6::jsonb, $7::jsonb, $8, $9)
            RETURNING crash_id
            """,
            _to_uuid(job_id),
            crash_data.get('crash_type'),
            crash_data.get('error_message'),
            # state is NOT NULL in the schema, so always send a JSON document.
            json.dumps(crash_data.get('state', {})),
            _to_jsonb(crash_data.get('metrics_history')),
            _to_jsonb(crash_data.get('logs_before_crash')),
            _to_jsonb(crash_data.get('analysis')),
            crash_data.get('screenshot_url'),
            _to_uuid(dom_snapshot_id) if dom_snapshot_id else None,
        )
        log.info(f"Saved crash report {crash_id} for job {job_id}, type: {crash_data.get('crash_type')}")
        return str(crash_id)
async def get_crash_report(self, job_id: str) -> Optional[dict]:
    """
    Fetch the most recent crash report recorded for a job.

    Args:
        job_id: Job UUID as string (non-string values are passed through as-is)

    Returns:
        The newest crash report row as a dictionary, with ``crash_id``,
        ``job_id`` and (when set) ``dom_snapshot_id`` rendered as strings,
        or None when the job has no crash report.
    """
    async with self.pool.acquire() as conn:
        lookup_id = job_id
        if isinstance(job_id, str):
            lookup_id = UUID(job_id)

        record = await conn.fetchrow("""
            SELECT
                crash_id,
                job_id,
                created_at,
                crash_type,
                error_message,
                state,
                metrics_history,
                logs_before_crash,
                analysis,
                screenshot_url,
                dom_snapshot_id
            FROM crash_reports
            WHERE job_id = $1
            ORDER BY created_at DESC
            LIMIT 1
        """, lookup_id)

        if record:
            report = dict(record)
            # UUID objects are not JSON serializable; hand back plain strings.
            for key in ('crash_id', 'job_id'):
                report[key] = str(report[key])
            snapshot_id = report.get('dom_snapshot_id')
            if snapshot_id:
                report['dom_snapshot_id'] = str(snapshot_id)
            return report

        return None
async def get_crash_stats(self, days: int = 7) -> dict:
    """
    Get crash statistics for the last N days.

    Args:
        days: Number of days to look back (default: 7)

    Returns:
        Dictionary with:
            - total: Total number of crashes
            - by_type: Dict mapping crash type to count
            - by_day: List of dicts with date (as string) and count
    """
    # asyncpg binds positional arguments through $n placeholders only. A
    # '%s' inside a quoted literal is never substituted, which leaves the
    # statement with zero parameters while one argument is supplied. Bind
    # the day count as $1 and scale a one-day interval instead.
    window = "created_at >= NOW() - ($1::int * INTERVAL '1 day')"

    async with self.pool.acquire() as conn:
        # Get total count
        total = await conn.fetchval(f"""
            SELECT COUNT(*)
            FROM crash_reports
            WHERE {window}
        """, days)

        # Get counts by type
        type_rows = await conn.fetch(f"""
            SELECT crash_type, COUNT(*) as count
            FROM crash_reports
            WHERE {window}
            GROUP BY crash_type
            ORDER BY count DESC
        """, days)
        by_type = {row['crash_type']: row['count'] for row in type_rows}

        # Get counts by day
        day_rows = await conn.fetch(f"""
            SELECT DATE(created_at) as date, COUNT(*) as count
            FROM crash_reports
            WHERE {window}
            GROUP BY DATE(created_at)
            ORDER BY date DESC
        """, days)
        by_day = [{'date': str(row['date']), 'count': row['count']} for row in day_rows]

        return {
            'total': total or 0,
            'by_type': by_type,
            'by_day': by_day
        }

View File

@@ -9,9 +9,58 @@ import json
import time import time
import threading import threading
from datetime import datetime from datetime import datetime
from typing import List from typing import List, Optional
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from modules.structured_logger import StructuredLogger
def get_chrome_memory(driver) -> Optional[int]:
    """Return the page's used JS heap size in whole MB, read via CDP.

    Calls ``Performance.getMetrics`` through ``driver.execute_cdp_cmd`` and
    reports the ``JSHeapUsedSize`` metric. This is the JavaScript heap of the
    inspected page, not the memory of the whole browser process.

    Args:
        driver: Object exposing ``execute_cdp_cmd(cmd, params)``.

    Returns:
        Used JS heap in MB (truncated to an int), or None when the metric is
        absent or the CDP call fails.
    """
    try:
        result = driver.execute_cdp_cmd('Performance.getMetrics', {})
        for metric in result.get('metrics', []):
            if metric['name'] == 'JSHeapUsedSize':
                return int(metric['value'] / 1024 / 1024)
    except Exception:
        # Best-effort probe: a dead or busy browser must not break the caller.
        # Only Exception is caught so Ctrl-C / SystemExit still propagate.
        pass
    return None
def get_dom_node_count(driver) -> Optional[int]:
    """Return the number of elements in the current document.

    Runs ``document.getElementsByTagName('*').length`` through
    ``driver.execute_script``.

    Args:
        driver: Object exposing ``execute_script(script)``.

    Returns:
        Whatever the script evaluates to (the element count), or None when
        the script call fails.
    """
    try:
        return driver.execute_script("return document.getElementsByTagName('*').length")
    except Exception:
        # Best-effort probe; Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return None
def classify_crash(exception: Exception, metrics_history: list, memory_threshold_mb: int = 400) -> str:
    """Classify crash type based on exception and metrics.

    Rules are evaluated in order and the first match wins:
    tab crash text, timeout text, last memory sample above the threshold,
    missing element text, rate limiting text, network text.

    Args:
        exception: The exception that ended the scrape; only ``str(exception)``
            is inspected, case-insensitively.
        metrics_history: Metric samples (dicts); only the last one's
            ``memory_mb`` is read. May be empty or None.
        memory_threshold_mb: ``memory_mb`` value above which the crash is
            attributed to memory exhaustion (default: 400).

    Returns:
        One of 'tab_crash', 'timeout', 'memory_exhaustion',
        'element_not_found', 'rate_limited', 'network_failure', 'unknown'.
    """
    error_str = str(exception).lower()

    if 'aw, snap' in error_str or 'status_access_violation' in error_str:
        return 'tab_crash'
    if 'timeout' in error_str:
        return 'timeout'

    if metrics_history:
        # A sample stores memory_mb=None when the memory probe failed;
        # treat that as "no reading" instead of comparing None with an int.
        last_memory_mb = metrics_history[-1].get('memory_mb') or 0
        if last_memory_mb > memory_threshold_mb:
            return 'memory_exhaustion'

    if 'no such element' in error_str:
        return 'element_not_found'
    # NOTE(review): 'rate' is a plain substring test, so it also matches
    # words such as "generate" - confirm this breadth is intended.
    if '429' in error_str or 'rate' in error_str:
        return 'rate_limited'
    if 'network' in error_str or 'connection' in error_str:
        return 'network_failure'
    return 'unknown'
class ScraperCrashException(Exception):
    """Wraps the exception that ended a scrape together with its crash report.

    Attributes are set in ``__init__``:
        original_exception: the exception object that was wrapped.
        crash_report: the crash report data supplied by the raiser.

    The exception message is ``str(original_exception)``.
    """

    def __init__(self, original_exception, crash_report):
        message = str(original_exception)
        super().__init__(message)
        self.crash_report = crash_report
        self.original_exception = original_exception
def get_topic_variants(topic: str) -> List[str]: def get_topic_variants(topic: str) -> List[str]:
""" """
@@ -135,34 +184,93 @@ def infer_review_topics(review_text: str, topics: List[dict]) -> List[str]:
class LogCapture:
    """
    Backward-compatible wrapper around StructuredLogger.

    Maintains the original LogCapture API while using StructuredLogger internally.
    This allows existing code to continue working while gaining structured logging benefits.

    info(), warning() and error() accept three call shapes:
        - legacy one-argument:  info(message)
        - legacy two-argument:  info(message, source)
        - categorised:          info(category, message, metrics={...})

    Every logging method also prints the message to stdout.
    """

    # The categories _source_to_category can produce.
    _CATEGORIES = ('browser', 'scraper', 'network', 'system')
    # Every legacy ``source`` value _source_to_category recognises, plus the
    # old default 'scraper'. Used to spot a legacy (message, source) call.
    _LEGACY_SOURCES = frozenset((
        'scraper', 'browser', 'navigation', 'page',
        'network', 'api', 'system', 'memory', 'chrome',
    ))

    def __init__(self):
        self._logger = StructuredLogger()

    def log(self, message: str, level: str = "INFO", source: str = "scraper"):
        """Add a log entry at the given level (backward compatible).

        Args:
            message: Text to log.
            level: "ERROR", "WARNING"/"WARN" or "DEBUG" (case-insensitive);
                anything else is logged as INFO.
            source: Legacy source name, mapped to a category.
        """
        category = self._source_to_category(source)
        level_upper = level.upper()

        if level_upper == "ERROR":
            self._logger.error(category, message)
        elif level_upper in ("WARNING", "WARN"):
            self._logger.warn(category, message)
        elif level_upper == "DEBUG":
            self._logger.debug(category, message)
        else:
            self._logger.info(category, message)

        # Also print for console visibility
        print(message, flush=True)

    def info(self, category_or_msg, message: Optional[str] = None, *, metrics: Optional[dict] = None):
        """
        Log an INFO message.

        Supports both old API: info(message) / info(message, source)
        And new API: info(category, message, metrics={...})
        """
        category, text = self._resolve_call(category_or_msg, message)
        self._logger.info(category, text, metrics=metrics)
        print(text, flush=True)

    def warning(self, category_or_msg, message: Optional[str] = None, *, metrics: Optional[dict] = None):
        """Log a WARNING message (supports both old and new API)."""
        category, text = self._resolve_call(category_or_msg, message)
        self._logger.warn(category, text, metrics=metrics)
        print(text, flush=True)

    def warn(self, category, message: str, *, metrics: Optional[dict] = None):
        """Log a WARN message with category (new API)."""
        self._logger.warn(category, message, metrics=metrics)
        print(message, flush=True)

    def error(self, category_or_msg, message: Optional[str] = None, *, metrics: Optional[dict] = None):
        """Log an ERROR message (supports both old and new API)."""
        category, text = self._resolve_call(category_or_msg, message)
        self._logger.error(category, text, metrics=metrics)
        print(text, flush=True)

    def debug(self, category, message: str, *, metrics: Optional[dict] = None):
        """Log a DEBUG message with category (new API)."""
        self._logger.debug(category, message, metrics=metrics)
        print(message, flush=True)

    def get_logs(self):
        """Return whatever the wrapped StructuredLogger's get_logs() returns."""
        return self._logger.get_logs()

    def _resolve_call(self, category_or_msg, message):
        """Work out (category, text) for the dual old/new call convention.

        - One argument: legacy ``(message)``, logged under 'scraper'.
        - Two arguments where the first is not a known category and the
          second is a known legacy source name: legacy ``(message, source)``.
          Without this check the source name would be logged as the message.
        - Anything else: new ``(category, message)``.
        """
        if message is None:
            return 'scraper', category_or_msg

        looks_legacy = (
            category_or_msg not in self._CATEGORIES
            and isinstance(message, str)
            and message.lower() in self._LEGACY_SOURCES
        )
        if looks_legacy:
            return self._source_to_category(message), category_or_msg
        return category_or_msg, message

    def _source_to_category(self, source: str) -> str:
        """Map legacy source names to StructuredLogger categories."""
        source_lower = source.lower() if source else 'scraper'
        if source_lower in ('browser', 'navigation', 'page'):
            return 'browser'
        elif source_lower in ('network', 'api'):
            return 'network'
        elif source_lower in ('system', 'memory', 'chrome'):
            return 'system'
        else:
            return 'scraper'
def parse_api_review(raw: list) -> dict: def parse_api_review(raw: list) -> dict:
@@ -470,23 +578,23 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
time.sleep(0.1) time.sleep(0.1)
except: except:
pass pass
log.info(f"🌐 Loading: {url[:80]}...") log.info('browser', f"Loading: {url[:80]}...")
else: else:
log.info(f"🔄 Hard refresh #{hard_refresh_count[0]}: reloading page...") log.info('browser', f"Hard refresh #{hard_refresh_count[0]}: reloading page...")
driver.get(url) driver.get(url)
# Handle consent popup if redirected (poll with tiny sleep) # Handle consent popup if redirected (poll with tiny sleep)
start = time.time() start = time.time()
while time.time() - start < 5: # Max 5s for consent while time.time() - start < 5: # Max 5s for consent
if "consent.google" in driver.current_url: if "consent.google" in driver.current_url:
log.info(" Handling consent popup...") log.info('browser', "Handling consent popup...")
try: try:
for btn in driver.find_elements(By.CSS_SELECTOR, "button"): for btn in driver.find_elements(By.CSS_SELECTOR, "button"):
txt = btn.text.lower() txt = btn.text.lower()
if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt: if "accept" in txt or "aceptar" in txt or "alle akzeptieren" in txt:
btn.click() btn.click()
# Reload original URL after consent # Reload original URL after consent
log.info(" Reloading after consent...") log.info('browser', "Reloading after consent...")
driver.get(url) driver.get(url)
# Wait for page to settle after consent reload # Wait for page to settle after consent reload
time.sleep(1) time.sleep(1)
@@ -554,10 +662,10 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if info: if info:
if info.get('total_reviews') and total_reviews[0] is None: if info.get('total_reviews') and total_reviews[0] is None:
total_reviews[0] = info['total_reviews'] total_reviews[0] = info['total_reviews']
log.info(f"📊 Total reviews on page: {total_reviews[0]}") log.info('scraper', f"Total reviews on page: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
if info.get('name') and business_info_cache[0] is None: if info.get('name') and business_info_cache[0] is None:
business_info_cache[0] = info business_info_cache[0] = info
log.info(f"📍 Business: {info.get('name')}") log.info('scraper', f"Business: {info.get('name')}")
if total_reviews[0] and business_info_cache[0]: if total_reviews[0] and business_info_cache[0]:
break break
except: except:
@@ -566,7 +674,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc. # VALIDATION_ONLY: Return early - skip clicking reviews tab, sorting, etc.
if validation_only_mode: if validation_only_mode:
log.info("📋 Validation mode: returning early (skipping reviews tab)") log.info('scraper', "Validation mode: returning early (skipping reviews tab)")
return ("validation_done", None) return ("validation_done", None)
# Click reviews tab - poll until found # Click reviews tab - poll until found
@@ -581,12 +689,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if not tabs_logged and tabs: if not tabs_logged and tabs:
tabs_logged = True tabs_logged = True
tab_texts = [t.text for t in tabs] tab_texts = [t.text for t in tabs]
log.info(f" Available tabs: {tab_texts}") log.info('browser', f"Available tabs: {tab_texts}")
for tab in tabs: for tab in tabs:
tab_text = tab.text.lower() tab_text = tab.text.lower()
if any(kw in tab_text for kw in review_keywords): if any(kw in tab_text for kw in review_keywords):
if not is_refresh: if not is_refresh:
log.info(f" Clicking reviews tab: '{tab.text}'") log.info('browser', f"Clicking reviews tab: '{tab.text}'")
# Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79" # Extract total_reviews from tab text like "Reviews (79)" or "Reviews\n79"
if total_reviews[0] is None: if total_reviews[0] is None:
import re import re
@@ -594,13 +702,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
match = re.search(r'\((\d+)\)', tab.text) match = re.search(r'\((\d+)\)', tab.text)
if match: if match:
total_reviews[0] = int(match.group(1)) total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}") log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
else: else:
# Try pattern with newline: "Reviews\n79" # Try pattern with newline: "Reviews\n79"
match = re.search(r'(\d+)', tab.text) match = re.search(r'(\d+)', tab.text)
if match: if match:
total_reviews[0] = int(match.group(1)) total_reviews[0] = int(match.group(1))
log.info(f"📊 Total reviews from tab: {total_reviews[0]}") log.info('scraper', f"Total reviews from tab: {total_reviews[0]}", metrics={'total_reviews': total_reviews[0]})
tab.click() tab.click()
tab_clicked = True tab_clicked = True
break break
@@ -620,24 +728,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
break break
elapsed = int(time.time() - start) elapsed = int(time.time() - start)
if elapsed > last_print: if elapsed > last_print:
log.info(f" Waiting for reviews panel...{refresh_label} ({elapsed}s)") log.info('browser', f"Waiting for reviews panel...{refresh_label} ({elapsed}s)")
last_print = elapsed last_print = elapsed
time.sleep(0.01) # 10ms - responsive but low CPU time.sleep(0.01) # 10ms - responsive but low CPU
if not scroll_container: if not scroll_container:
log.error(f"Could not find reviews scroll container{refresh_label}") log.error('browser', f"Could not find reviews scroll container{refresh_label}")
try: try:
log.error(f"Page title: {driver.title}") log.error('browser', f"Page title: {driver.title}")
log.error(f"Current URL: {driver.current_url[:100]}") log.error('browser', f"Current URL: {driver.current_url[:100]}")
except: except:
pass pass
return None, None return None, None
log.info(f"Found scroll container{refresh_label}") log.info('browser', f"Found scroll container{refresh_label}")
# Inject API interceptor (needs to be re-injected after refresh) # Inject API interceptor (needs to be re-injected after refresh)
if not is_refresh: if not is_refresh:
log.info("🔌 Injecting API interceptor...") log.info('network', "Injecting API interceptor...")
driver.execute_script(""" driver.execute_script("""
// Always re-setup on refresh // Always re-setup on refresh
window.__reviewInterceptorInjected = true; window.__reviewInterceptorInjected = true;
@@ -711,12 +819,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
} }
""") """)
time.sleep(0.5) time.sleep(0.5)
log.info(" 📅 Sorted by newest") log.info('browser', "Sorted by newest")
# Re-find scroll container after sorting (DOM may be recreated) # Re-find scroll container after sorting (DOM may be recreated)
new_container = find_scroll_container() new_container = find_scroll_container()
if new_container: if new_container:
scroll_container = new_container scroll_container = new_container
log.info(" 🔄 Refreshed scroll container reference") log.info('browser', "Refreshed scroll container reference")
except: except:
pass pass
@@ -734,7 +842,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
return count; return count;
""") """)
if expanded > 0: if expanded > 0:
log.info(f" 📝 Expanded {expanded} truncated reviews") log.info('browser', f"Expanded {expanded} truncated reviews", metrics={'expanded_count': expanded})
except: except:
pass pass
@@ -745,7 +853,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
}) })
driver.execute_cdp_cmd('Network.enable', {}) driver.execute_cdp_cmd('Network.enable', {})
if not is_refresh: if not is_refresh:
log.info(" 🚫 Blocking images for faster scrolling") log.info('browser', "Blocking images for faster scrolling")
except: except:
pass pass
@@ -848,7 +956,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
time.sleep(0.5) # Brief wait for topic filters to render time.sleep(0.5) # Brief wait for topic filters to render
review_topics = extract_review_topics() review_topics = extract_review_topics()
if review_topics: if review_topics:
log.info(f"📊 Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...") log.info('scraper', f"Found {len(review_topics)} review topics: {', '.join(t['topic'] for t in review_topics[:5])}...", metrics={'topic_count': len(review_topics)})
def get_api_reviews(): def get_api_reviews():
"""Get reviews from intercepted API responses.""" """Get reviews from intercepted API responses."""
@@ -918,7 +1026,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
hard_refresh_count[0] += 1 hard_refresh_count[0] += 1
if hard_refresh_count[0] > max_hard_refreshes: if hard_refresh_count[0] > max_hard_refreshes:
log.warning(f" ⚠️ Max hard refreshes ({max_hard_refreshes}) reached, giving up") log.warn('system', f"Max hard refreshes ({max_hard_refreshes}) reached, giving up", metrics={'hard_refresh_count': hard_refresh_count[0]})
return False return False
# Stop current scroll worker # Stop current scroll worker
@@ -931,18 +1039,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
scroll_container = new_container scroll_container = new_container
stop_scrolling = new_stop stop_scrolling = new_stop
recovery_count[0] = 0 # Reset recovery count after successful refresh recovery_count[0] = 0 # Reset recovery count after successful refresh
log.info(f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected") log.info('browser', f"Hard refresh successful, resuming with {len(seen_ids)} reviews already collected", metrics={'reviews_collected': len(seen_ids)})
return True return True
else: else:
log.error(f"Hard refresh failed to find scroll container") log.error('browser', "Hard refresh failed to find scroll container")
return False return False
# Main collection loop # Main collection loop
last_new_time = time.time() last_new_time = time.time()
last_count = len(reviews) last_count = len(reviews)
check_num = 0 check_num = 0
start_time = time.time()
log.info(f"🔄 Scrolling... (timeout: {timeout_no_new}s with no new)") # Crash detection: metrics sampling
metrics_history = []
last_sample_time = time.time()
scroll_count = [0] # Track scroll operations for crash reports
log.info('browser', f"Scrolling... (timeout: {timeout_no_new}s with no new)", metrics={'timeout_seconds': timeout_no_new})
cycle_start = time.time() cycle_start = time.time()
while True: while True:
@@ -954,6 +1068,19 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
cycle_delta = t0 - cycle_start cycle_delta = t0 - cycle_start
cycle_start = t0 cycle_start = t0
# CRASH DETECTION: Sample metrics every 5 seconds
if time.time() - last_sample_time >= 5:
current_count_for_metrics = total_flushed[0] + len(reviews)
metrics_history.append({
'timestamp_ms': int(time.time() * 1000),
'memory_mb': get_chrome_memory(driver),
'dom_nodes': get_dom_node_count(driver),
'reviews_count': current_count_for_metrics
})
# Keep only last 100 samples
metrics_history = metrics_history[-100:]
last_sample_time = time.time()
# Collect from API (doesn't affect scroll) - API has FULL TEXT in original language # Collect from API (doesn't affect scroll) - API has FULL TEXT in original language
# Use review_id as key to avoid duplicates with DOM # Use review_id as key to avoid duplicates with DOM
t1 = time.time() t1 = time.time()
@@ -1110,14 +1237,14 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
reviews[rid] = rev reviews[rid] = rev
seen_ids.add(rid) seen_ids.add(rid)
except Exception as e: except Exception as e:
log.error(f"DOM parse error: {e}") log.error('scraper', f"DOM parse error: {e}")
dom_time = time.time() - t2 dom_time = time.time() - t2
# BATCH FLUSH: If we have enough reviews, flush to callback and clear memory # BATCH FLUSH: If we have enough reviews, flush to callback and clear memory
# Sort by DOM order before flushing # Sort by DOM order before flushing
t3 = time.time() t3 = time.time()
if flush_callback and len(reviews) >= flush_batch_size: if flush_callback and len(reviews) >= flush_batch_size:
log.info(f" 💾 Flushing {len(reviews)} reviews to disk...") log.info('scraper', f"Flushing {len(reviews)} reviews to disk...", metrics={'batch_size': len(reviews), 'source': 'flush'})
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews]) flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews) total_flushed[0] += len(reviews)
@@ -1128,7 +1255,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# TIMING: Print if cycle is slow (>2s) # TIMING: Print if cycle is slow (>2s)
if cycle_delta > 2.0: if cycle_delta > 2.0:
log.warning(f" ⚠️ SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})") log.warn('system', f"SLOW cycle: {cycle_delta:.1f}s (api:{api_time:.1f}s dom:{dom_time:.1f}s/{dom_cards}cards flush:{flush_time:.1f}s seen:{len(seen_ids)})", metrics={'cycle_time_s': cycle_delta, 'api_time_s': api_time, 'dom_time_s': dom_time, 'dom_cards': dom_cards, 'seen_count': len(seen_ids)})
# Check for new reviews # Check for new reviews
if current_count > last_count: if current_count > last_count:
@@ -1163,9 +1290,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
elapsed = time.time() - last_new_time elapsed = time.time() - last_new_time
if total_reviews[0]: if total_reviews[0]:
pct = (current_count / total_reviews[0]) * 100 pct = (current_count / total_reviews[0]) * 100
log.info(f" 📊 {current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s") log.info('scraper', f"{current_count}/{total_reviews[0]} ({pct:.0f}%) | idle: {elapsed:.1f}s", metrics={'reviews_count': current_count, 'total_reviews': total_reviews[0], 'progress_pct': pct, 'idle_seconds': elapsed})
else: else:
log.info(f" 📊 {current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s") log.info('scraper', f"{current_count} reviews | idle: {elapsed:.1f}s/{timeout_no_new}s", metrics={'reviews_count': current_count, 'idle_seconds': elapsed})
# Call progress callback on every iteration (for real-time log updates) # Call progress callback on every iteration (for real-time log updates)
if progress_callback: if progress_callback:
@@ -1173,13 +1300,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
# Stop conditions - check BEFORE recovery attempts # Stop conditions - check BEFORE recovery attempts
if current_count >= max_reviews: if current_count >= max_reviews:
log.info(f"Reached max: {current_count}") log.info('scraper', f"Reached max: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set() stop_scrolling.set()
break break
# Also stop if we have all reviews from the page # Also stop if we have all reviews from the page
if total_reviews[0] and current_count >= total_reviews[0]: if total_reviews[0] and current_count >= total_reviews[0]:
log.info(f"All {current_count} reviews collected") log.info('scraper', f"All {current_count} reviews collected", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set() stop_scrolling.set()
break break
@@ -1188,12 +1315,12 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if elapsed >= 3 and int(elapsed) % 3 == 0: if elapsed >= 3 and int(elapsed) % 3 == 0:
# After 8+ failed recovery attempts, try hard refresh # After 8+ failed recovery attempts, try hard refresh
if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes: if recovery_count[0] >= 8 and hard_refresh_count[0] < max_hard_refreshes:
log.info(f" 🔄 Soft recovery failed {recovery_count[0]} times, trying hard refresh...") log.info('browser', f"Soft recovery failed {recovery_count[0]} times, trying hard refresh...", metrics={'recovery_count': recovery_count[0]})
if do_hard_refresh(): if do_hard_refresh():
last_new_time = time.time() # Reset timer after refresh last_new_time = time.time() # Reset timer after refresh
continue # Skip to next iteration continue # Skip to next iteration
else: else:
log.info(f" 🔧 Recovery attempt #{recovery_count[0] + 1}...") log.info('browser', f"Recovery attempt #{recovery_count[0] + 1}...", metrics={'recovery_attempt': recovery_count[0] + 1})
unstick_scroll() unstick_scroll()
# Check scroll state - track if content is still being added # Check scroll state - track if content is still being added
@@ -1229,24 +1356,24 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
if truly_done or timeout_hit: if truly_done or timeout_hit:
# Last chance: try hard refresh before giving up # Last chance: try hard refresh before giving up
if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews): if hard_refresh_count[0] < max_hard_refreshes and current_count < (total_reviews[0] or max_reviews):
log.info(f" 🔄 Timeout reached, trying hard refresh before giving up...") log.info('browser', "Timeout reached, trying hard refresh before giving up...", metrics={'idle_seconds': elapsed})
if do_hard_refresh(): if do_hard_refresh():
last_new_time = time.time() last_new_time = time.time()
continue # Keep trying continue # Keep trying
log.info(f"All reviews loaded: {current_count}") log.info('scraper', f"All reviews loaded: {current_count}", metrics={'total_reviews': current_count, 'elapsed_seconds': time.time() - start_time})
stop_scrolling.set() stop_scrolling.set()
break break
# Flush any remaining reviews (sorted by DOM order) # Flush any remaining reviews (sorted by DOM order)
if flush_callback and reviews: if flush_callback and reviews:
log.info(f" 💾 Final flush: {len(reviews)} reviews...") log.info('scraper', f"Final flush: {len(reviews)} reviews...", metrics={'batch_size': len(reviews), 'source': 'final_flush'})
sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_reviews = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
flush_callback([r for _, r in sorted_reviews]) flush_callback([r for _, r in sorted_reviews])
total_flushed[0] += len(reviews) total_flushed[0] += len(reviews)
reviews.clear() reviews.clear()
# Reviews already parsed during scrolling (real-time parsing) # Reviews already parsed during scrolling (real-time parsing)
log.info("📝 Finalizing review data...") log.info('scraper', "Finalizing review data...")
# Final results (sorted by DOM order) # Final results (sorted by DOM order)
sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf'))) sorted_items = sorted(reviews.items(), key=lambda x: review_order.get(x[0], float('inf')))
@@ -1256,13 +1383,13 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
api_count = sum(1 for r in review_list if r.get("source") == "api") api_count = sum(1 for r in review_list if r.get("source") == "api")
if total_flushed[0] > 0: if total_flushed[0] > 0:
log.info(f"📋 Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})") log.info('scraper', f"Total: {grand_total} unique reviews (flushed: {total_flushed[0]}, in memory: {len(review_list)})", metrics={'total_reviews': grand_total, 'flushed_count': total_flushed[0], 'in_memory_count': len(review_list), 'elapsed_seconds': time.time() - start_time})
else: else:
log.info(f"📋 Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})") log.info('scraper', f"Total: {len(review_list)} unique reviews (DOM: {dom_count}, API: {api_count})", metrics={'total_reviews': len(review_list), 'dom_count': dom_count, 'api_count': api_count, 'elapsed_seconds': time.time() - start_time})
# Infer topics for each review if review_topics is available # Infer topics for each review if review_topics is available
if review_topics: if review_topics:
log.info(f"🏷️ Inferring topics for {len(review_list)} reviews...") log.info('scraper', f"Inferring topics for {len(review_list)} reviews...", metrics={'reviews_count': len(review_list)})
topics_inferred_count = 0 topics_inferred_count = 0
for review in review_list: for review in review_list:
review_text = review.get("text", "") review_text = review.get("text", "")
@@ -1270,7 +1397,7 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
review["topics"] = matched review["topics"] = matched
if matched: if matched:
topics_inferred_count += 1 topics_inferred_count += 1
log.info(f"🏷️ Topics inferred for {topics_inferred_count}/{len(review_list)} reviews") log.info('scraper', f"Topics inferred for {topics_inferred_count}/{len(review_list)} reviews", metrics={'topics_inferred_count': topics_inferred_count, 'reviews_count': len(review_list)})
return { return {
"reviews": review_list, # Only unflushed reviews (flushed already sent to callback) "reviews": review_list, # Only unflushed reviews (flushed already sent to callback)
@@ -1279,7 +1406,9 @@ def scrape_reviews(driver, url: str, max_reviews: int = 5000, timeout_no_new: in
"checks": check_num, "checks": check_num,
"url": url, "url": url,
"logs": log.get_logs(), "logs": log.get_logs(),
"review_topics": review_topics # Topic filters with mention counts "review_topics": review_topics, # Topic filters with mention counts
"metrics_history": metrics_history, # For crash detection
"start_time": start_time # For crash report elapsed time
} }
@@ -1344,7 +1473,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
# Set timezone if provided # Set timezone if provided
if timezone: if timezone:
driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone}) driver.execute_cdp_cmd('Emulation.setTimezoneOverride', {'timezoneId': timezone})
log_capture.info(f"Set timezone to {timezone}") log_capture.info('browser', f"Set timezone to {timezone}")
# Set locale/language # Set locale/language
driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language}) driver.execute_cdp_cmd('Emulation.setLocaleOverride', {'locale': language})
@@ -1356,7 +1485,7 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
'longitude': geolocation['lng'], 'longitude': geolocation['lng'],
'accuracy': 1000 # ~1km accuracy for IP-based location 'accuracy': 1000 # ~1km accuracy for IP-based location
}) })
log_capture.info(f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})") log_capture.info('browser', f"Set geolocation to ({geolocation['lat']:.2f}, {geolocation['lng']:.2f})", metrics={'lat': geolocation['lat'], 'lng': geolocation['lng']})
else: else:
# Default to US (Boston, MA) if no geolocation provided # Default to US (Boston, MA) if no geolocation provided
driver.execute_cdp_cmd('Emulation.setGeolocationOverride', { driver.execute_cdp_cmd('Emulation.setGeolocationOverride', {
@@ -1364,12 +1493,12 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
'longitude': -71.0589, 'longitude': -71.0589,
'accuracy': 100 'accuracy': 100
}) })
log_capture.info("Set geolocation to US (Boston, MA) [default]") log_capture.info('browser', "Set geolocation to US (Boston, MA) [default]", metrics={'lat': 42.3601, 'lng': -71.0589})
if fp: if fp:
log_capture.info(f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}") log_capture.info('browser', f"Browser fingerprint applied: {fp.get('platform', 'unknown')}, {viewport['width']}x{viewport['height']}", metrics={'viewport_width': viewport['width'], 'viewport_height': viewport['height']})
except Exception as e: except Exception as e:
log_capture.warning(f"Could not apply fingerprint settings: {e}") log_capture.warn('system', f"Could not apply fingerprint settings: {e}")
# Add URL parameters for consistent results # Add URL parameters for consistent results
if 'hl=' not in url: if 'hl=' not in url:
@@ -1435,6 +1564,36 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
except Exception as e: except Exception as e:
elapsed = time.time() - start_time elapsed = time.time() - start_time
# CRASH DETECTION: build a best-effort crash report before the driver is closed.
# This runs inside the handler for the scraper exception `e`; a failure while
# sampling metrics or assembling the report must never mask that original
# error, so it is caught below and the basic error handling continues.
crash_report = None
try:
    if driver:
        # Sample final metrics from the browser while the session still exists.
        final_metrics = {
            'timestamp_ms': int(time.time() * 1000),
            'memory_mb': get_chrome_memory(driver),
            'dom_nodes': get_dom_node_count(driver)
        }

        # Build crash report with the information available at this level.
        crash_report = {
            'crash_type': classify_crash(e, [final_metrics]),
            'error_message': str(e),
            'state': {
                'reviews_extracted': 0,  # Unknown at crash time
                'total_expected': None,
                'scroll_count': 0,
                'elapsed_seconds': elapsed
            },
            'metrics_history': [final_metrics],
            'logs_before_crash': log_capture.get_logs()[-20:] if log_capture else [],
            'last_successful_review_id': None
        }

        log_capture.error('system', f"Crash detected: {crash_report['crash_type']}",
                          metrics={'error': str(e), 'elapsed_seconds': elapsed})
except Exception as report_err:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
    # propagate. The failure is bound to its own name on purpose: reusing `e`
    # here would unbind the outer exception when this handler exits.
    # Still best-effort: record why the report is missing and carry on.
    if log_capture:
        log_capture.warn('system', f"Could not build crash report: {report_err}")
if should_close_driver and driver: if should_close_driver and driver:
try: try:
driver.quit() driver.quit()
@@ -1442,9 +1601,9 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
pass pass
# Log error to the existing log_capture # Log error to the existing log_capture
log_capture.error(f"Scraper failed: {str(e)}") log_capture.error('system', f"Scraper failed: {str(e)}")
return { result = {
"reviews": [], "reviews": [],
"count": 0, "count": 0,
"total_reviews": 0, "total_reviews": 0,
@@ -1455,6 +1614,12 @@ def fast_scrape_reviews(url: str, headless: bool = False, max_scrolls: int = 999
"logs": log_capture.get_logs() "logs": log_capture.get_logs()
} }
# Include crash report if available
if crash_report:
result['crash_report'] = crash_report
return result
def extract_about_info(driver, url: str = None) -> dict: def extract_about_info(driver, url: str = None) -> dict:
""" """

View File

@@ -20,6 +20,7 @@ interface ReviewWithNew extends Review {
is_new?: boolean; is_new?: boolean;
owner_response?: OwnerResponse | null; owner_response?: OwnerResponse | null;
photo_urls?: string[] | null; photo_urls?: string[] | null;
topics?: string[];
} }
interface ReviewTopic { interface ReviewTopic {
@@ -47,6 +48,7 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
const [selectedReview, setSelectedReview] = useState<ReviewWithNew | null>(null); const [selectedReview, setSelectedReview] = useState<ReviewWithNew | null>(null);
const [showOnlyNew, setShowOnlyNew] = useState(false); const [showOnlyNew, setShowOnlyNew] = useState(false);
const [brushRange, setBrushRange] = useState<{ startIndex: number; endIndex: number } | null>(null); const [brushRange, setBrushRange] = useState<{ startIndex: number; endIndex: number } | null>(null);
const [selectedTopics, setSelectedTopics] = useState<string[]>([]);
// Check if we have comparison data // Check if we have comparison data
const hasComparisonData = reviews.some(r => r.is_new !== undefined); const hasComparisonData = reviews.some(r => r.is_new !== undefined);
@@ -67,6 +69,19 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
// Calculate timeline data for chart // Calculate timeline data for chart
const timelineData = useMemo(() => calculateTimelineData(dateFilteredReviews), [dateFilteredReviews]); const timelineData = useMemo(() => calculateTimelineData(dateFilteredReviews), [dateFilteredReviews]);
// Tally topic occurrences across the full `reviews` prop and expose them as
// { topic, count } pairs ordered from most to least frequent. Reviews without
// a `topics` array contribute nothing.
const availableTopics = useMemo(() => {
  const tally = new Map<string, number>();

  for (const review of reviews) {
    for (const topic of review.topics ?? []) {
      const seenSoFar = tally.get(topic) || 0;
      tally.set(topic, seenSoFar + 1);
    }
  }

  const ranked = Array.from(tally, ([topic, count]) => ({ topic, count }));
  ranked.sort((left, right) => right.count - left.count);
  return ranked;
}, [reviews]);
// Check if brush covers the full range (no filtering needed) // Check if brush covers the full range (no filtering needed)
const isFullRange = useMemo(() => { const isFullRange = useMemo(() => {
if (!brushRange || timelineData.length === 0) return true; if (!brushRange || timelineData.length === 0) return true;
@@ -155,7 +170,7 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
return 'all'; return 'all';
}, [brushRange, timelineData]); }, [brushRange, timelineData]);
// Filter reviews by selected ratings, sentiments, response status, new status, and brush range (for table) // Filter reviews by selected ratings, sentiments, response status, new status, topics, and brush range (for table)
const filteredReviews = useMemo(() => { const filteredReviews = useMemo(() => {
return dateFilteredReviews.filter(r => { return dateFilteredReviews.filter(r => {
const matchesRating = selectedRatings.includes(r.rating); const matchesRating = selectedRatings.includes(r.rating);
@@ -174,6 +189,10 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
(hasResponse && selectedResponseStatus.includes('answered')) || (hasResponse && selectedResponseStatus.includes('answered')) ||
(!hasResponse && selectedResponseStatus.includes('not_answered')); (!hasResponse && selectedResponseStatus.includes('not_answered'));
// Filter by selected topics
const matchesTopics = selectedTopics.length === 0 ||
r.topics?.some(t => selectedTopics.includes(t));
// Filter by brush date range if active // Filter by brush date range if active
let matchesBrush = true; let matchesBrush = true;
if (brushDateRange && r.centerDate) { if (brushDateRange && r.centerDate) {
@@ -189,9 +208,9 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
matchesBrush = r.centerDate >= startDate && r.centerDate < endDate; matchesBrush = r.centerDate >= startDate && r.centerDate < endDate;
} }
return matchesRating && matchesSentiment && matchesSearch && matchesNew && matchesResponseStatus && matchesBrush; return matchesRating && matchesSentiment && matchesSearch && matchesNew && matchesResponseStatus && matchesTopics && matchesBrush;
}); });
}, [dateFilteredReviews, selectedRatings, selectedSentiments, selectedResponseStatus, globalFilter, showOnlyNew, brushDateRange]); }, [dateFilteredReviews, selectedRatings, selectedSentiments, selectedResponseStatus, globalFilter, showOnlyNew, selectedTopics, brushDateRange]);
const toggleRating = (rating: number) => { const toggleRating = (rating: number) => {
setSelectedRatings(prev => setSelectedRatings(prev =>
@@ -211,6 +230,14 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
); );
}; };
// Add `topic` to the active topic filter, or remove it when already selected.
// NOTE(review): this closure is recreated on every render and appears in a
// memo dependency array further down, so that memo likely recomputes each
// render; consider wrapping it in useCallback — confirm against the imports.
const toggleTopicFilter = (topic: string) => {
  setSelectedTopics(current => {
    if (current.includes(topic)) {
      return current.filter(existing => existing !== topic);
    }
    return [...current, topic];
  });
};
const clearAllFilters = () => { const clearAllFilters = () => {
setDateRange('all'); setDateRange('all');
setSelectedRatings([1, 2, 3, 4, 5]); setSelectedRatings([1, 2, 3, 4, 5]);
@@ -219,6 +246,7 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
setGlobalFilter(''); setGlobalFilter('');
setShowOnlyNew(false); setShowOnlyNew(false);
setBrushRange(null); setBrushRange(null);
setSelectedTopics([]);
}; };
const hasActiveFilters = dateRange !== 'all' || const hasActiveFilters = dateRange !== 'all' ||
@@ -227,7 +255,8 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
selectedSentiments.length < 3 || selectedSentiments.length < 3 ||
selectedResponseStatus.length < 2 || selectedResponseStatus.length < 2 ||
globalFilter !== '' || globalFilter !== '' ||
showOnlyNew; showOnlyNew ||
selectedTopics.length > 0;
const exportFilteredData = () => { const exportFilteredData = () => {
const dataStr = JSON.stringify(filteredReviews, null, 2); const dataStr = JSON.stringify(filteredReviews, null, 2);
@@ -364,6 +393,8 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
const hasResponse = !!ownerResponse?.text; const hasResponse = !!ownerResponse?.text;
const photoUrls = row.original.photo_urls; const photoUrls = row.original.photo_urls;
const hasPhotos = photoUrls && photoUrls.length > 0; const hasPhotos = photoUrls && photoUrls.length > 0;
const topics = row.original.topics;
const hasTopics = topics && topics.length > 0;
return ( return (
<div className="max-w-2xl"> <div className="max-w-2xl">
@@ -387,6 +418,19 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
<p className={`text-gray-800 ${!expanded && 'line-clamp-2'}`}> <p className={`text-gray-800 ${!expanded && 'line-clamp-2'}`}>
{text} {text}
</p> </p>
{hasTopics && (
<div className="flex flex-wrap gap-1 mt-2">
{topics.map(topic => (
<span
key={topic}
className="px-2 py-0.5 bg-indigo-100 text-indigo-700 text-xs rounded-full cursor-pointer hover:bg-indigo-200 transition-colors"
onClick={(e) => { e.stopPropagation(); toggleTopicFilter(topic); }}
>
{topic}
</span>
))}
</div>
)}
<div className="flex items-center gap-3 mt-1"> <div className="flex items-center gap-3 mt-1">
{text.length > 100 && ( {text.length > 100 && (
<button <button
@@ -458,7 +502,7 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
}, },
}, },
], ],
[] [toggleTopicFilter]
); );
const table = useReactTable({ const table = useReactTable({
@@ -603,6 +647,35 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
</div> </div>
)} )}
{/* Topic Filter */}
{availableTopics.length > 0 && (
<div className="flex items-center gap-2 flex-wrap">
<MessageSquare className="w-5 h-5 text-gray-700" />
<span className="font-semibold text-gray-900">Topics:</span>
{availableTopics.slice(0, 10).map(({topic, count}) => (
<button
key={topic}
onClick={() => toggleTopicFilter(topic)}
className={`px-3 py-1 rounded-full text-sm font-semibold transition-all border ${
selectedTopics.includes(topic)
? 'bg-indigo-600 text-white border-indigo-700'
: 'bg-indigo-100 text-indigo-700 border-indigo-200 hover:bg-indigo-200'
}`}
>
{topic} ({count})
</button>
))}
{selectedTopics.length > 0 && (
<button
onClick={() => setSelectedTopics([])}
className="px-2 py-1 text-xs text-gray-500 hover:text-gray-700"
>
Clear topics
</button>
)}
</div>
)}
{/* Filter Summary */} {/* Filter Summary */}
<div className="flex items-center justify-between pt-2 border-t-2 border-gray-200"> <div className="flex items-center justify-between pt-2 border-t-2 border-gray-200">
<span className="text-sm font-medium text-gray-600"> <span className="text-sm font-medium text-gray-600">
@@ -1535,6 +1608,37 @@ export default function ReviewAnalytics({ reviews, businessName, businessUrl, ne
</div> </div>
</div> </div>
{/* Review Topics */}
{selectedReview.topics && selectedReview.topics.length > 0 && (
<div className="bg-indigo-50 rounded-xl p-4 border-2 border-indigo-200">
<div className="flex items-center gap-2 mb-3">
<MessageSquare className="w-5 h-5 text-indigo-700" />
<span className="font-semibold text-indigo-900">Topics</span>
<span className="px-2 py-0.5 bg-indigo-200 text-indigo-800 text-xs font-bold rounded-full">
{selectedReview.topics.length} topic{selectedReview.topics.length > 1 ? 's' : ''}
</span>
</div>
<div className="flex flex-wrap gap-2">
{selectedReview.topics.map(topic => (
<button
key={topic}
onClick={() => {
toggleTopicFilter(topic);
setSelectedReview(null);
}}
className={`px-3 py-1 rounded-full text-sm font-semibold transition-all border ${
selectedTopics.includes(topic)
? 'bg-indigo-600 text-white border-indigo-700'
: 'bg-white text-indigo-700 border-indigo-300 hover:bg-indigo-100'
}`}
>
{topic}
</button>
))}
</div>
</div>
)}
{/* Owner Response */} {/* Owner Response */}
{selectedReview.owner_response?.text ? ( {selectedReview.owner_response?.text ? (
<div className="bg-emerald-50 rounded-xl p-4 border-2 border-emerald-200"> <div className="bg-emerald-50 rounded-xl p-4 border-2 border-emerald-200">

View File

@@ -15,6 +15,7 @@ export interface Review {
review_id: string; review_id: string;
owner_response?: OwnerResponse | null; owner_response?: OwnerResponse | null;
photo_urls?: string[] | null; photo_urls?: string[] | null;
topics?: string[]; // Inferred topics from scraper
// Derived fields (computed on load) // Derived fields (computed on load)
parsedDate?: Date; parsedDate?: Date;
dateCategory?: 'recent' | 'month' | 'year' | 'older'; // Time range category dateCategory?: 'recent' | 'month' | 'year' | 'older'; // Time range category