Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation speed: 59.71s → 10.96s (5.5x improvement)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased idle detection from 6 to 12 consecutive idle scrolls for completeness
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

398
web/lib/analytics.ts Normal file
View File

@@ -0,0 +1,398 @@
// Analytics utility functions
/**
 * A single review record plus optional date fields derived from `date_text`.
 *
 * The snake_case fields are the raw record; the optional camelCase fields are
 * filled in later (calculateReviewStats populates minDate/maxDate/centerDate).
 */
export interface Review {
// Reviewer name (presumably the display name shown on the listing; verify against the scraper).
author: string;
// Star rating; the analytics in this module bucket the values 1 through 5.
rating: number;
// Review body, or null when the review has no text.
text: string | null;
// Relative date string, e.g. "2 weeks ago" or "Hace 2 semanas".
date_text: string;
// Reviewer avatar image URL, if any.
avatar_url: string | null;
// Reviewer profile URL, if any.
profile_url: string | null;
// Identifier of the review (presumably unique per review; TODO confirm).
review_id: string;
// Derived fields (computed on load)
// NOTE(review): nothing in this part of the module assigns parsedDate or dateCategory on a Review.
parsedDate?: Date;
dateCategory?: 'recent' | 'month' | 'year' | 'older'; // Time range category
minDate?: Date; // Earliest possible date (accounting for Google's uncertainty)
maxDate?: Date; // Latest possible date (accounting for Google's uncertainty)
centerDate?: Date; // Midpoint of the range (used for calculations and sorting)
}
/** Preset date windows for filtering; 'all' means no date restriction. */
export type DateRange = 'week' | 'month' | 'year' | 'all';
/** One point of the rating timeline produced by calculateTimelineData. */
export interface TimelineDataPoint {
// Month key in "YYYY-MM" form.
date: string;
// Mean rating of the reviews grouped into this month.
rating: number;
// Mean of `rating` over this point and up to two preceding points.
rollingAvg: number;
}
/** Aggregate metrics returned by calculateReviewStats. */
export interface ReviewStats {
// Number of reviews in the input list.
totalReviews: number;
// Mean of `rating` across all reviews.
averageRating: number;
// Percentage (0-100) of reviews rated 4 or higher.
sentimentScore: number;
// Number of reviews with an `avatar_url` (used as a proxy for photos).
photoCount: number;
// Mean word count per review, rounded to an integer.
avgReviewLength: number;
// Reviews whose `date_text` looks recent (keyword heuristic, roughly the last 30 days).
recentReviews: number;
// One entry per star value, ordered 5 down to 1; `percentage` is on a 0-100 scale.
ratingDistribution: { rating: number; count: number; percentage: number }[];
// Up to ten most frequent words after stop-word and short-word filtering.
topKeywords: { word: string; count: number }[];
// Review counts per bucket: rating >= 4, rating === 3, rating <= 2.
sentimentBreakdown: { positive: number; neutral: number; negative: number };
// Reviews rated 2 or lower.
negativeReviews: number;
// Placeholder: always 0 until owner-response data is available.
responseRate: number;
// Placeholder: always 'N/A' until owner-response data is available.
averageResponseTime: string;
}
/**
 * Aggregate a list of reviews into summary statistics.
 *
 * Side effect: every review that is missing `minDate`, `maxDate` or
 * `centerDate` has those three fields filled in place from its `date_text`.
 *
 * An empty list yields 0 for every ratio-based metric (average rating,
 * sentiment score, average length, distribution percentages) rather than NaN.
 *
 * @param reviews - Reviews to summarise; mutated as described above.
 * @returns Totals, averages, rating distribution, keyword counts and
 *   placeholder response metrics.
 */
export function calculateReviewStats(reviews: Review[]): ReviewStats {
  // Populate minDate/maxDate/centerDate on reviews for display
  for (const review of reviews) {
    if (!review.minDate || !review.maxDate || !review.centerDate) {
      const range = parseDateTextToRange(review.date_text);
      review.minDate = range.minDate;
      review.maxDate = range.maxDate;
      // centerDate is the midpoint of the uncertainty range
      review.centerDate = new Date((range.minDate.getTime() + range.maxDate.getTime()) / 2);
    }
  }

  const totalReviews = reviews.length;

  // Every "per review" figure goes through this guard so an empty list gives 0, not NaN.
  const perReview = (value: number): number => (totalReviews === 0 ? 0 : value / totalReviews);

  const averageRating = perReview(reviews.reduce((sum, r) => sum + r.rating, 0));

  // Rating buckets, counted once and reused below
  const positiveCount = reviews.filter(r => r.rating >= 4).length;
  const neutralCount = reviews.filter(r => r.rating === 3).length;
  const negativeReviews = reviews.filter(r => r.rating <= 2).length;

  // Sentiment score (% of 4-5 star reviews)
  const sentimentScore = perReview(positiveCount) * 100;

  // Photo count (reviews with avatars as proxy)
  const photoCount = reviews.filter(r => r.avatar_url).length;

  // Average review length in words. Splitting on any whitespace and dropping
  // empty tokens keeps "" at 0 words and handles newlines / repeated spaces.
  const countWords = (text: string | null): number =>
    text ? text.split(/\s+/).filter(token => token.length > 0).length : 0;
  const avgReviewLength = Math.round(
    perReview(reviews.reduce((sum, r) => sum + countWords(r.text), 0))
  );

  // Recent reviews (last 30 days - simplified keyword check). Covers every
  // sub-month unit, in English and in the Spanish forms the date parser accepts.
  const recentMarkers = [
    'second', 'minute', 'hour', 'day', 'week',
    'segundo', 'minuto', 'hora', 'día', 'semana',
  ];
  const recentReviews = reviews.filter(r => {
    const text = r.date_text.toLowerCase();
    return recentMarkers.some(marker => text.includes(marker));
  }).length;

  // Rating distribution
  const ratingCounts: Record<number, number> = { 1: 0, 2: 0, 3: 0, 4: 0, 5: 0 };
  for (const r of reviews) {
    ratingCounts[r.rating] = (ratingCounts[r.rating] || 0) + 1;
  }
  const ratingDistribution = [5, 4, 3, 2, 1].map(rating => ({
    rating,
    count: ratingCounts[rating] || 0,
    percentage: perReview(ratingCounts[rating] || 0) * 100,
  }));

  // Extract keywords from review text. The Unicode-aware class keeps accented
  // letters (e.g. "está") that the ASCII-only \w would have stripped.
  const allWords = reviews.flatMap(r =>
    r.text
      ? r.text
          .toLowerCase()
          .replace(/[^\p{L}\p{N}_\s]/gu, '')
          .split(/\s+/)
          .filter(w => w.length > 3)
      : []
  );
  const stopWords = new Set(['this', 'that', 'with', 'from', 'have', 'been', 'were', 'very', 'great', 'good', 'best', 'nice', 'here', 'there', 'they', 'their', 'about', 'would', 'could', 'should', 'place', 'really']);

  // A Map (not a plain object) so words such as "constructor" or "valueOf"
  // cannot collide with Object.prototype members.
  const wordCounts = new Map<string, number>();
  for (const word of allWords) {
    if (!stopWords.has(word)) {
      wordCounts.set(word, (wordCounts.get(word) ?? 0) + 1);
    }
  }
  const topKeywords = Array.from(wordCounts, ([word, count]) => ({ word, count }))
    .sort((a, b) => b.count - a.count)
    .slice(0, 10);

  // Sentiment breakdown
  const sentimentBreakdown = {
    positive: positiveCount,
    neutral: neutralCount,
    negative: negativeReviews,
  };

  // Response rate (placeholder - would need owner_response field)
  const responseRate = 0; // TODO: Calculate when owner responses are available
  // Average response time (placeholder)
  const averageResponseTime = 'N/A'; // TODO: Calculate when response data is available

  return {
    totalReviews,
    averageRating,
    sentimentScore,
    photoCount,
    avgReviewLength,
    recentReviews,
    ratingDistribution,
    topKeywords,
    sentimentBreakdown,
    negativeReviews,
    responseRate,
    averageResponseTime,
  };
}
/**
 * Classify a star rating into a sentiment bucket.
 *
 * @param rating - Star rating of a review.
 * @returns 'positive' for 4 and above, 'neutral' for exactly 3, otherwise 'negative'.
 */
export function getSentimentLabel(rating: number): 'positive' | 'neutral' | 'negative' {
  if (rating >= 4) {
    return 'positive';
  }
  return rating === 3 ? 'neutral' : 'negative';
}
/**
 * Helper function to get date range boundaries for preset buttons.
 *
 * `from` is set to the start of its day (00:00:00.000) and `to` to the end of
 * the reference day (23:59:59.999), both in local time.
 *
 * Month and year steps are clamped to the last day of the target month.
 * Plain `setMonth`/`setFullYear` would overflow instead: Mar 31 minus one
 * month would land in early March, and Feb 29 minus one year on Mar 1.
 *
 * @param range - Preset window; 'all' yields `{ from: null, to: null }`.
 * @param now - Reference "today"; defaults to the current time. Not mutated.
 * @returns Inclusive boundaries of the window.
 */
export function getDateRangeBoundaries(
  range: DateRange,
  now: Date = new Date()
): { from: Date | null; to: Date | null } {
  if (range === 'all') return { from: null, to: null };

  const to = new Date(now); // Today as end date
  const from = new Date(now);

  // Step `from` back by whole months, clamping the day-of-month so it never
  // spills into the following month.
  const rewindMonths = (months: number): void => {
    const day = from.getDate();
    from.setDate(1); // park on the 1st so setMonth cannot overflow
    from.setMonth(from.getMonth() - months);
    const lastDayOfMonth = new Date(from.getFullYear(), from.getMonth() + 1, 0).getDate();
    from.setDate(Math.min(day, lastDayOfMonth));
  };

  switch (range) {
    case 'week':
      from.setDate(from.getDate() - 7);
      break;
    case 'month':
      rewindMonths(1);
      break;
    case 'year':
      rewindMonths(12);
      break;
  }

  // Set to start of day for from, end of day for to
  from.setHours(0, 0, 0, 0);
  to.setHours(23, 59, 59, 999);
  return { from, to };
}
/**
 * Look up the CSS utility classes (text, background, border) for a sentiment bucket.
 *
 * @param sentiment - Bucket as returned by getSentimentLabel.
 * @returns Space-separated class names for that bucket.
 */
export function getSentimentColor(sentiment: 'positive' | 'neutral' | 'negative'): string {
  const classesBySentiment: Record<'positive' | 'neutral' | 'negative', string> = {
    positive: 'text-green-700 bg-green-50 border-green-300',
    neutral: 'text-yellow-700 bg-yellow-50 border-yellow-300',
    negative: 'text-red-700 bg-red-50 border-red-300',
  };
  return classesBySentiment[sentiment];
}
/**
 * Extract the quantity from a relative-date string.
 *
 * @param text - e.g. "2 weeks ago", "Hace 2 semanas", "a month ago".
 * @returns The first run of digits as a base-10 integer, or 1 when the text
 *   contains no digits (singular forms such as "a month ago", "un mes",
 *   "una semana").
 */
function extractNumber(text: string): number {
  const match = text.match(/\d+/);
  return match ? Number.parseInt(match[0], 10) : 1;
}

/**
 * Parse date_text into time range boundaries (min/max dates)
 *
 * This accounts for Google's inherent uncertainty in relative dates.
 * Based on reverse-engineered patterns from 244 reviews.
 *
 * `minDate` is the EARLIEST instant the review could have been written and
 * `maxDate` the LATEST, so `minDate <= maxDate` always holds.
 *
 * Examples:
 * - "5 minutes ago" / "Hace 5 minutos" → minDate = maxDate = 5 minutes ago
 * - "2 weeks ago"   → { minDate: 20 days ago,  maxDate: 14 days ago }
 * - "a month ago"   → { minDate: 59 days ago,  maxDate: 30 days ago }
 * - "2 months ago"  → { minDate: 89 days ago,  maxDate: 60 days ago }
 * - "a year ago"    → { minDate: 729 days ago, maxDate: 365 days ago }
 *
 * English and Spanish unit names are recognised (including "segundo",
 * "minuto" and the accentless "dia"). Text with no recognised unit is treated
 * as very old (10 to 11 years ago).
 *
 * @param dateText - Relative date string; a leading "Edited " is ignored.
 * @returns Range of possible dates, measured back from the current time.
 */
export function parseDateTextToRange(dateText: string): { minDate: Date; maxDate: Date } {
  const now = new Date();
  const DAY_MS = 24 * 60 * 60 * 1000;

  // NFC so a decomposed "í" / "ñ" still matches the keywords below;
  // then lower-case and remove "Edited " prefix if present.
  const cleaned = dateText
    .normalize('NFC')
    .trim()
    .toLowerCase()
    .replace(/^edited\s+/, '');

  const mentions = (...keywords: string[]): boolean =>
    keywords.some(keyword => cleaned.includes(keyword));

  // Helper to create date from days ago
  const daysAgo = (days: number): Date => new Date(now.getTime() - days * DAY_MS);

  // Units finer than a week carry no uncertainty: both ends are the same instant.
  const exactly = (unitMs: number): { minDate: Date; maxDate: Date } => {
    const instant = now.getTime() - extractNumber(cleaned) * unitMs;
    return { minDate: new Date(instant), maxDate: new Date(instant) };
  };

  // Coarser units cover a span: from `newestDays` ago back to `oldestDays` ago.
  const between = (newestDays: number, oldestDays: number): { minDate: Date; maxDate: Date } => ({
    minDate: daysAgo(oldestDays),
    maxDate: daysAgo(newestDays),
  });

  // Order matters: finer units are tested first.
  if (mentions('second', 'segundo')) return exactly(1000);
  if (mentions('minute', 'minuto')) return exactly(60 * 1000);
  if (mentions('hora', 'hour')) return exactly(60 * 60 * 1000);
  if (mentions('día', 'dia', 'day')) return exactly(DAY_MS);

  // Weeks: each week label represents a 7-day span (N*7 .. N*7+6 days)
  if (mentions('semana', 'week')) {
    const weeks = extractNumber(cleaned);
    return between(weeks * 7, weeks * 7 + 6);
  }

  // Months: N months = (N*30) .. ((N+1)*30 - 1) days; "a month ago" is N = 1 → 30-59 days
  if (mentions('mes', 'month')) {
    const months = extractNumber(cleaned);
    return between(months * 30, (months + 1) * 30 - 1);
  }

  // Years: N years = (N*365) .. ((N+1)*365 - 1) days; "a year ago" is N = 1 → 365-729 days
  if (mentions('año', 'year')) {
    const years = extractNumber(cleaned);
    return between(years * 365, (years + 1) * 365 - 1);
  }

  // Default: very old (10+ years)
  return between(3650, 3650 + 365);
}
/**
 * Reduce a relative date string to a single representative Date.
 *
 * @param dateText - Relative date string, e.g. "2 weeks ago".
 * @returns The midpoint between the earliest and latest dates that
 *   parseDateTextToRange reports for `dateText`.
 */
export function parseDateText(dateText: string): Date {
  const range = parseDateTextToRange(dateText);
  const earliest = range.minDate.getTime();
  const latest = range.maxDate.getTime();
  return new Date((earliest + latest) / 2);
}
/**
 * Keep the reviews whose possible date range overlaps a preset window that
 * ends now.
 *
 * The window start is computed with day-of-month clamping. Plain
 * `setMonth`/`setFullYear` would overflow on month-end days: on Mar 31 "one
 * month ago" would land in early March and shrink the window to about four
 * weeks, and on Feb 29 "one year ago" would become Mar 1.
 *
 * @param reviews - Reviews to filter; not mutated.
 * @param range - Preset window; 'all' returns `reviews` unchanged.
 * @returns Reviews whose [minDate, maxDate] range overlaps the window.
 */
export function filterReviewsByDateRange(reviews: Review[], range: DateRange): Review[] {
  if (range === 'all') return reviews;

  // One clock read, so both ends of the window derive from the same instant.
  const filterEnd = new Date();
  const filterStart = new Date(filterEnd);

  // Step filterStart back by whole months without spilling into the next month.
  const rewindMonths = (months: number): void => {
    const day = filterStart.getDate();
    filterStart.setDate(1); // park on the 1st so setMonth cannot overflow
    filterStart.setMonth(filterStart.getMonth() - months);
    const lastDayOfMonth = new Date(
      filterStart.getFullYear(),
      filterStart.getMonth() + 1,
      0
    ).getDate();
    filterStart.setDate(Math.min(day, lastDayOfMonth));
  };

  switch (range) {
    case 'week':
      filterStart.setDate(filterStart.getDate() - 7);
      break;
    case 'month':
      rewindMonths(1);
      break;
    case 'year':
      rewindMonths(12);
      break;
  }

  // Use range overlap logic: Include review if its time range overlaps with filter range
  // Review range: [minDate, maxDate]
  // Filter range: [filterStart, filterEnd]
  // Overlap occurs when: minDate <= filterEnd AND maxDate >= filterStart
  return reviews.filter(r => {
    const { minDate, maxDate } = parseDateTextToRange(r.date_text);
    return minDate <= filterEnd && maxDate >= filterStart;
  });
}
/**
 * Filter reviews by a user-chosen date window, comparing each review's
 * midpoint date (see parseDateText) against the bounds.
 *
 * @param reviews - Reviews to filter; not mutated.
 * @param fromDate - Inclusive lower bound, used as given; null for no lower bound.
 * @param toDate - Upper bound, extended to the end of that day (23:59:59.999
 *   local time); null for no upper bound. Not mutated.
 * @returns `reviews` itself when both bounds are null, otherwise the reviews
 *   whose midpoint date satisfies every bound that is set.
 */
export function filterReviewsByCustomDateRange(reviews: Review[], fromDate: Date | null, toDate: Date | null): Review[] {
  if (!fromDate && !toDate) return reviews;

  // The upper bound is the same for every review, so build it once.
  let upperBound: Date | null = null;
  if (toDate) {
    upperBound = new Date(toDate);
    upperBound.setHours(23, 59, 59, 999);
  }

  return reviews.filter(review => {
    const when = parseDateText(review.date_text);
    const afterStart = fromDate ? when >= fromDate : true;
    const beforeEnd = upperBound ? when <= upperBound : true;
    return afterStart && beforeEnd;
  });
}
/**
 * Build a month-by-month rating timeline.
 *
 * Each review is placed in the calendar month of its midpoint date
 * (parseDateText). One point is produced per month that has at least one
 * review, in ascending "YYYY-MM" order.
 *
 * @param reviews - Reviews to chart; not mutated.
 * @returns Points carrying the month's mean rating and a rolling average over
 *   that point and up to two preceding points.
 */
export function calculateTimelineData(reviews: Review[]): TimelineDataPoint[] {
  // Resolve each review to a concrete date, newest first
  const dated = reviews.map(review => ({
    rating: review.rating,
    when: parseDateText(review.date_text),
  }));
  dated.sort((left, right) => right.when.getTime() - left.when.getTime());

  // Bucket ratings by "YYYY-MM"
  const ratingsByMonth = new Map<string, number[]>();
  for (const entry of dated) {
    const year = entry.when.getFullYear();
    const month = String(entry.when.getMonth() + 1).padStart(2, '0');
    const key = `${year}-${month}`;
    const bucket = ratingsByMonth.get(key);
    if (bucket) {
      bucket.push(entry.rating);
    } else {
      ratingsByMonth.set(key, [entry.rating]);
    }
  }

  // One point per month with its mean rating, oldest month first
  const points: TimelineDataPoint[] = [];
  for (const [key, ratings] of ratingsByMonth) {
    let total = 0;
    for (const value of ratings) {
      total += value;
    }
    points.push({ date: key, rating: total / ratings.length, rollingAvg: 0 });
  }
  points.sort((left, right) => left.date.localeCompare(right.date));

  // Rolling average over the current point and up to two before it
  for (let i = 0; i < points.length; i++) {
    const first = Math.max(0, i - 2);
    let windowTotal = 0;
    for (let j = first; j <= i; j++) {
      windowTotal += points[j].rating;
    }
    points[i].rollingAvg = windowTotal / (i - first + 1);
  }

  return points;
}