Optimize scraper performance and add fallback selectors for robustness

Performance improvements:
- Validation time: 59.71s → 10.96s (~5.4x faster)
- Removed 50+ console.log statements from JavaScript extraction
- Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting
- Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls)

Scraping improvements:
- Increased the idle-detection threshold from 6 to 12 consecutive idle scrolls so scraping does not stop before all reviews have loaded
- Added real-time progress updates every 5 scrolls with percentage calculation
- Added crash recovery to extract partial reviews if Chrome crashes
- Removed artificial 200-review limit to scrape ALL reviews

Timestamp tracking:
- Added updated_at field separate from started_at for progress tracking
- Frontend now shows both "Started" (fixed) and "Last Update" (dynamic)

Robustness improvements:
- Added 5 fallback CSS selectors to handle different Google Maps page structures
- Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc.
- Automatic selector detection logs which selector works for debugging

Test results:
- Successfully scraped 550 reviews in 150.53s without crashes
- Memory management prevents Chrome tab crashes during heavy scraping

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-01-18 19:49:24 +00:00
parent bdffb5eaac
commit faa0704737
108 changed files with 23632 additions and 54 deletions

View File

@@ -0,0 +1,37 @@
import { NextRequest, NextResponse } from 'next/server';
// Base URL of the scraper API; falls back to the local default when the env var is unset or empty.
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';

/**
 * Proxies a "check reviews" request to the scraper API (`POST /check-reviews`).
 *
 * @param request - Incoming request whose JSON body must contain a truthy `url`.
 * @returns The upstream JSON payload on success. Otherwise a `{ error }` payload:
 *   400 for a missing/invalid body, the upstream status for upstream errors,
 *   502 when a successful upstream response is not valid JSON, and 500 when
 *   the scraper API cannot be reached.
 */
export async function POST(request: NextRequest) {
  // A malformed body is the caller's fault: answer 400 instead of letting the
  // parse error fall through to the generic "failed to connect" 500 below.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { error: 'Request body must be valid JSON' },
      { status: 400 }
    );
  }

  // Guard against non-object bodies (e.g. `null`), which cannot be destructured.
  const url =
    typeof body === 'object' && body !== null && 'url' in body
      ? body.url
      : undefined;
  if (!url) {
    return NextResponse.json({ error: 'URL is required' }, { status: 400 });
  }

  try {
    // Call the containerized scraper API to check if reviews exist
    const response = await fetch(`${API_BASE_URL}/check-reviews`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url }),
    });

    let data: unknown;
    try {
      data = await response.json();
    } catch {
      // Upstream answered with a non-JSON body (e.g. an HTML error page).
      // Keep its error status rather than reporting a connection failure.
      return NextResponse.json(
        { error: 'Failed to check reviews' },
        { status: response.ok ? 502 : response.status }
      );
    }

    if (!response.ok) {
      const detail =
        typeof data === 'object' && data !== null && 'detail' in data
          ? data.detail
          : undefined;
      return NextResponse.json(
        { error: detail || 'Failed to check reviews' },
        { status: response.status }
      );
    }

    return NextResponse.json(data);
  } catch (error) {
    console.error('Check reviews API error:', error);
    return NextResponse.json(
      { error: 'Failed to connect to scraper API' },
      { status: 500 }
    );
  }
}

View File

@@ -0,0 +1,33 @@
import { NextRequest, NextResponse } from 'next/server';
// Base URL of the scraper API; falls back to the local default when the env var is unset or empty.
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';

/**
 * Proxies `GET /jobs/{jobId}/reviews` on the scraper API and returns only the
 * reviews array.
 *
 * @param request - Incoming request; an optional `limit` query parameter is
 *   forwarded upstream (defaults to `'1000'` when absent or empty).
 * @param context - Route context whose `params` promise resolves to `{ jobId }`.
 * @returns `{ reviews }` on success; `{ error }` with the upstream status when
 *   the upstream call fails, or 500 when the request itself throws.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  try {
    const { jobId } = await params;
    const { searchParams } = new URL(request.url);
    const limit = searchParams.get('limit') || '1000';

    // Encode both caller-controlled values so they cannot alter the upstream
    // path (e.g. "a/../b") or smuggle extra query parameters ("1&x=y").
    const query = new URLSearchParams({ limit });
    const response = await fetch(
      `${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}/reviews?${query.toString()}`
    );

    if (!response.ok) {
      return NextResponse.json(
        { error: 'Failed to get reviews' },
        { status: response.status }
      );
    }

    const data: unknown = await response.json();
    // API returns { job_id, reviews: [...], count }, we just need the reviews array.
    // Anything that is not an array (including a null payload) becomes [].
    const reviews: unknown[] =
      typeof data === 'object' &&
      data !== null &&
      'reviews' in data &&
      Array.isArray(data.reviews)
        ? data.reviews
        : [];
    return NextResponse.json({ reviews });
  } catch (error) {
    console.error('Reviews API error:', error);
    return NextResponse.json(
      { error: 'Failed to get reviews' },
      { status: 500 }
    );
  }
}

View File

@@ -0,0 +1,30 @@
import { NextRequest, NextResponse } from 'next/server';
// Base URL of the scraper API; falls back to the local default when the env var is unset or empty.
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';

/**
 * Proxies `GET /jobs/{jobId}` on the scraper API to report a job's status.
 *
 * @param request - Incoming request (unused beyond routing).
 * @param context - Route context whose `params` promise resolves to `{ jobId }`.
 * @returns The upstream JSON payload on success. Otherwise a `{ error }` payload:
 *   the upstream status for upstream errors, 502 when a successful upstream
 *   response is not valid JSON, and 500 when the request itself throws.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ jobId: string }> }
) {
  try {
    const { jobId } = await params;
    // Encode the id so it cannot change the upstream path (e.g. "a/../b").
    const response = await fetch(
      `${API_BASE_URL}/jobs/${encodeURIComponent(jobId)}`
    );

    let data: unknown;
    try {
      data = await response.json();
    } catch {
      // Upstream answered with a non-JSON body (e.g. an HTML error page).
      // Keep its error status rather than collapsing everything into a 500.
      return NextResponse.json(
        { error: response.ok ? 'Failed to get job status' : 'Job not found' },
        { status: response.ok ? 502 : response.status }
      );
    }

    if (!response.ok) {
      const detail =
        typeof data === 'object' && data !== null && 'detail' in data
          ? data.detail
          : undefined;
      return NextResponse.json(
        { error: detail || 'Job not found' },
        { status: response.status }
      );
    }

    return NextResponse.json(data);
  } catch (error) {
    console.error('Job status API error:', error);
    return NextResponse.json(
      { error: 'Failed to get job status' },
      { status: 500 }
    );
  }
}

View File

@@ -0,0 +1,37 @@
import { NextRequest, NextResponse } from 'next/server';
// Base URL of the scraper API; falls back to the local default when the env var is unset or empty.
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';

/**
 * Starts a scraping job by proxying to the scraper API (`POST /scrape`).
 *
 * @param request - Incoming request whose JSON body must contain a truthy `url`.
 * @returns The upstream JSON payload on success. Otherwise a `{ error }` payload:
 *   400 for a missing/invalid body, the upstream status for upstream errors,
 *   502 when a successful upstream response is not valid JSON, and 500 when
 *   the scraper API cannot be reached.
 */
export async function POST(request: NextRequest) {
  // A malformed body is the caller's fault: answer 400 instead of letting the
  // parse error fall through to the generic "failed to connect" 500 below.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { error: 'Request body must be valid JSON' },
      { status: 400 }
    );
  }

  // Guard against non-object bodies (e.g. `null`), which cannot be destructured.
  const url =
    typeof body === 'object' && body !== null && 'url' in body
      ? body.url
      : undefined;
  if (!url) {
    return NextResponse.json({ error: 'URL is required' }, { status: 400 });
  }

  try {
    // Call the containerized scraper API
    const response = await fetch(`${API_BASE_URL}/scrape`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ url }),
    });

    let data: unknown;
    try {
      data = await response.json();
    } catch {
      // Upstream answered with a non-JSON body (e.g. an HTML error page).
      // Keep its error status rather than reporting a connection failure.
      return NextResponse.json(
        { error: 'Failed to start scraping' },
        { status: response.ok ? 502 : response.status }
      );
    }

    if (!response.ok) {
      const detail =
        typeof data === 'object' && data !== null && 'detail' in data
          ? data.detail
          : undefined;
      return NextResponse.json(
        { error: detail || 'Failed to start scraping' },
        { status: response.status }
      );
    }

    return NextResponse.json(data);
  } catch (error) {
    console.error('Scrape API error:', error);
    return NextResponse.json(
      { error: 'Failed to connect to scraper API' },
      { status: 500 }
    );
  }
}

BIN
web/app/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

26
web/app/globals.css Normal file
View File

@@ -0,0 +1,26 @@
@import "tailwindcss";
/* Default palette: white background with near-black text. */
:root {
--background: #ffffff;
--foreground: #171717;
}
/*
 * Tailwind theme block: maps the palette variables above and the Geist font
 * variables onto --color-* / --font-* theme variables.
 * The --font-geist-* names match the `variable` options passed to the Geist
 * loaders in layout.tsx.
 */
@theme inline {
--color-background: var(--background);
--color-foreground: var(--foreground);
--font-sans: var(--font-geist-sans);
--font-mono: var(--font-geist-mono);
}
/* Dark palette, applied when the user's system prefers a dark color scheme. */
@media (prefers-color-scheme: dark) {
:root {
--background: #0a0a0a;
--foreground: #ededed;
}
}
/*
 * Base page colors come from the palette variables.
 * NOTE(review): font-family is hard-coded to Arial here and does not reference
 * --font-sans / --font-geist-sans, so the Geist font is presumably only used
 * where a font utility class is applied explicitly — confirm this is intended.
 */
body {
background: var(--background);
color: var(--foreground);
font-family: Arial, Helvetica, sans-serif;
}

34
web/app/layout.tsx Normal file
View File

@@ -0,0 +1,34 @@
import type { Metadata } from "next";
import { Geist, Geist_Mono } from "next/font/google";
import "./globals.css";
// Geist sans-serif font, exposed through the CSS variable named in `variable`
// (referenced from globals.css as var(--font-geist-sans)).
// NOTE: next/font loaders are documented to require inline literal arguments,
// so keep the options object written out here rather than sharing a constant.
const geistSans = Geist({
variable: "--font-geist-sans",
subsets: ["latin"],
});
// Geist monospace font, exposed through --font-geist-mono in the same way.
const geistMono = Geist_Mono({
variable: "--font-geist-mono",
subsets: ["latin"],
});
/**
 * Document metadata for the app.
 * Replaces the create-next-app placeholder strings with text matching the
 * heading and tagline rendered on the home page.
 */
export const metadata: Metadata = {
  title: "Google Reviews Scraper",
  description: "Test UI for the containerized Google Reviews scraper API",
};
/**
 * Root layout wrapping every page in the `<html>` / `<body>` shell.
 *
 * The body carries both Geist font variable classes plus `antialiased`, so
 * descendants can reference the font CSS variables.
 *
 * @param props - Layout props; `children` is the page content to render.
 */
export default function RootLayout(
  props: Readonly<{
    children: React.ReactNode;
  }>
) {
  const bodyClassName = `${geistSans.variable} ${geistMono.variable} antialiased`;

  return (
    <html lang="en">
      <body className={bodyClassName}>{props.children}</body>
    </html>
  );
}

38
web/app/page.tsx Normal file
View File

@@ -0,0 +1,38 @@
import ScraperTest from '@/components/ScraperTest';
/**
 * Landing page for the scraper test UI: a header, the `ScraperTest` widget in
 * a card, and a footer with an example URL and the scraper API address.
 */
export default function Home() {
  // Read the same env var (with the same fallback) that the proxy route
  // handlers use, so the address shown here cannot drift from the one called.
  const apiBaseUrl = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
  // Drop the scheme for display; with the default this renders "localhost:8000".
  const apiHost = apiBaseUrl.replace(/^https?:\/\//, '');

  return (
    <div className="min-h-screen bg-gradient-to-br from-blue-600 to-indigo-700 py-12 px-4">
      <main className="max-w-5xl mx-auto">
        <div className="text-center mb-10">
          <h1 className="text-4xl md:text-5xl font-bold text-white mb-3">
            Google Reviews Scraper
          </h1>
          <p className="text-blue-100 text-lg">
            Test the containerized scraper API
          </p>
          <div className="mt-4 inline-flex items-center gap-2 px-4 py-2 bg-blue-500/30 rounded-lg text-blue-100 text-sm">
            <div className="w-2 h-2 bg-green-400 rounded-full animate-pulse"></div>
            Powered by SeleniumBase UC Mode
          </div>
        </div>
        <div className="bg-white rounded-2xl shadow-2xl p-6 md:p-8">
          <ScraperTest />
        </div>
        <div className="mt-8 text-center text-blue-100 text-sm space-y-2">
          <p className="font-medium">💡 Example URLs to test:</p>
          <div className="space-y-1 text-xs">
            <p className="font-mono bg-blue-500/20 rounded px-3 py-1 inline-block">
              https://www.google.com/maps/place/Soho+Club/...
            </p>
          </div>
          <p className="mt-4 text-blue-200">
            API running at: <span className="font-mono">{apiHost}</span>
          </p>
        </div>
      </main>
    </div>
  );
}