Optimize scraper performance and add fallback selectors for robustness
Performance improvements: - Validation speed: 59.71s → 10.96s (5.5x improvement) - Removed 50+ console.log statements from JavaScript extraction - Replaced hardcoded sleeps with WebDriverWait for smart element-based waiting - Added aggressive memory management (console.clear, GC, image unloading every 20 scrolls) Scraping improvements: - Increased idle detection from 6 to 12 consecutive idle scrolls for completeness - Added real-time progress updates every 5 scrolls with percentage calculation - Added crash recovery to extract partial reviews if Chrome crashes - Removed artificial 200-review limit to scrape ALL reviews Timestamp tracking: - Added updated_at field separate from started_at for progress tracking - Frontend now shows both "Started" (fixed) and "Last Update" (dynamic) Robustness improvements: - Added 5 fallback CSS selectors to handle different Google Maps page structures - Now tries: div.jftiEf.fontBodyMedium, div.jftiEf, div[data-review-id], etc. - Automatic selector detection logs which selector works for debugging Test results: - Successfully scraped 550 reviews in 150.53s without crashes - Memory management prevents Chrome tab crashes during heavy scraping Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
37
web/app/api/check-reviews/route.ts
Normal file
37
web/app/api/check-reviews/route.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
|
||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const { url } = await request.json();
|
||||
|
||||
if (!url) {
|
||||
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
||||
}
|
||||
|
||||
// Call the containerized scraper API to check if reviews exist
|
||||
const response = await fetch(`${API_BASE_URL}/check-reviews`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ url }),
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!response.ok) {
|
||||
return NextResponse.json(
|
||||
{ error: data.detail || 'Failed to check reviews' },
|
||||
{ status: response.status }
|
||||
);
|
||||
}
|
||||
|
||||
return NextResponse.json(data);
|
||||
} catch (error) {
|
||||
console.error('Check reviews API error:', error);
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to connect to scraper API' },
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
33
web/app/api/jobs/[jobId]/reviews/route.ts
Normal file
33
web/app/api/jobs/[jobId]/reviews/route.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
|
||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||
|
||||
export async function GET(
|
||||
request: NextRequest,
|
||||
{ params }: { params: Promise<{ jobId: string }> }
|
||||
) {
|
||||
try {
|
||||
const { jobId } = await params;
|
||||
const { searchParams } = new URL(request.url);
|
||||
const limit = searchParams.get('limit') || '1000';
|
||||
|
||||
const response = await fetch(`${API_BASE_URL}/jobs/${jobId}/reviews?limit=${limit}`);
|
||||
|
||||
if (!response.ok) {
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to get reviews' },
|
||||
{ status: response.status }
|
||||
);
|
||||
}
|
||||
|
||||
const data = await response.json();
|
||||
// API returns { job_id, reviews: [...], count }, we just need the reviews array
|
||||
return NextResponse.json({ reviews: data.reviews || [] });
|
||||
} catch (error) {
|
||||
console.error('Reviews API error:', error);
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to get reviews' },
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
30
web/app/api/jobs/[jobId]/route.ts
Normal file
30
web/app/api/jobs/[jobId]/route.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
|
||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||
|
||||
export async function GET(
|
||||
request: NextRequest,
|
||||
{ params }: { params: Promise<{ jobId: string }> }
|
||||
) {
|
||||
try {
|
||||
const { jobId } = await params;
|
||||
|
||||
const response = await fetch(`${API_BASE_URL}/jobs/${jobId}`);
|
||||
const data = await response.json();
|
||||
|
||||
if (!response.ok) {
|
||||
return NextResponse.json(
|
||||
{ error: data.detail || 'Job not found' },
|
||||
{ status: response.status }
|
||||
);
|
||||
}
|
||||
|
||||
return NextResponse.json(data);
|
||||
} catch (error) {
|
||||
console.error('Job status API error:', error);
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to get job status' },
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
37
web/app/api/scrape/route.ts
Normal file
37
web/app/api/scrape/route.ts
Normal file
@@ -0,0 +1,37 @@
|
||||
import { NextRequest, NextResponse } from 'next/server';
|
||||
|
||||
const API_BASE_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000';
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
try {
|
||||
const { url } = await request.json();
|
||||
|
||||
if (!url) {
|
||||
return NextResponse.json({ error: 'URL is required' }, { status: 400 });
|
||||
}
|
||||
|
||||
// Call the containerized scraper API
|
||||
const response = await fetch(`${API_BASE_URL}/scrape`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ url }),
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!response.ok) {
|
||||
return NextResponse.json(
|
||||
{ error: data.detail || 'Failed to start scraping' },
|
||||
{ status: response.status }
|
||||
);
|
||||
}
|
||||
|
||||
return NextResponse.json(data);
|
||||
} catch (error) {
|
||||
console.error('Scrape API error:', error);
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to connect to scraper API' },
|
||||
{ status: 500 }
|
||||
);
|
||||
}
|
||||
}
|
||||
BIN
web/app/favicon.ico
Normal file
BIN
web/app/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
26
web/app/globals.css
Normal file
26
web/app/globals.css
Normal file
@@ -0,0 +1,26 @@
|
||||
@import "tailwindcss";
|
||||
|
||||
:root {
|
||||
--background: #ffffff;
|
||||
--foreground: #171717;
|
||||
}
|
||||
|
||||
@theme inline {
|
||||
--color-background: var(--background);
|
||||
--color-foreground: var(--foreground);
|
||||
--font-sans: var(--font-geist-sans);
|
||||
--font-mono: var(--font-geist-mono);
|
||||
}
|
||||
|
||||
@media (prefers-color-scheme: dark) {
|
||||
:root {
|
||||
--background: #0a0a0a;
|
||||
--foreground: #ededed;
|
||||
}
|
||||
}
|
||||
|
||||
body {
|
||||
background: var(--background);
|
||||
color: var(--foreground);
|
||||
font-family: Arial, Helvetica, sans-serif;
|
||||
}
|
||||
34
web/app/layout.tsx
Normal file
34
web/app/layout.tsx
Normal file
@@ -0,0 +1,34 @@
|
||||
import type { Metadata } from "next";
|
||||
import { Geist, Geist_Mono } from "next/font/google";
|
||||
import "./globals.css";
|
||||
|
||||
const geistSans = Geist({
|
||||
variable: "--font-geist-sans",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
const geistMono = Geist_Mono({
|
||||
variable: "--font-geist-mono",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Create Next App",
|
||||
description: "Generated by create next app",
|
||||
};
|
||||
|
||||
export default function RootLayout({
|
||||
children,
|
||||
}: Readonly<{
|
||||
children: React.ReactNode;
|
||||
}>) {
|
||||
return (
|
||||
<html lang="en">
|
||||
<body
|
||||
className={`${geistSans.variable} ${geistMono.variable} antialiased`}
|
||||
>
|
||||
{children}
|
||||
</body>
|
||||
</html>
|
||||
);
|
||||
}
|
||||
38
web/app/page.tsx
Normal file
38
web/app/page.tsx
Normal file
@@ -0,0 +1,38 @@
|
||||
import ScraperTest from '@/components/ScraperTest';
|
||||
|
||||
export default function Home() {
|
||||
return (
|
||||
<div className="min-h-screen bg-gradient-to-br from-blue-600 to-indigo-700 py-12 px-4">
|
||||
<main className="max-w-5xl mx-auto">
|
||||
<div className="text-center mb-10">
|
||||
<h1 className="text-4xl md:text-5xl font-bold text-white mb-3">
|
||||
Google Reviews Scraper
|
||||
</h1>
|
||||
<p className="text-blue-100 text-lg">
|
||||
Test the containerized scraper API
|
||||
</p>
|
||||
<div className="mt-4 inline-flex items-center gap-2 px-4 py-2 bg-blue-500/30 rounded-lg text-blue-100 text-sm">
|
||||
<div className="w-2 h-2 bg-green-400 rounded-full animate-pulse"></div>
|
||||
Powered by SeleniumBase UC Mode
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="bg-white rounded-2xl shadow-2xl p-6 md:p-8">
|
||||
<ScraperTest />
|
||||
</div>
|
||||
|
||||
<div className="mt-8 text-center text-blue-100 text-sm space-y-2">
|
||||
<p className="font-medium">💡 Example URLs to test:</p>
|
||||
<div className="space-y-1 text-xs">
|
||||
<p className="font-mono bg-blue-500/20 rounded px-3 py-1 inline-block">
|
||||
https://www.google.com/maps/place/Soho+Club/...
|
||||
</p>
|
||||
</div>
|
||||
<p className="mt-4 text-blue-200">
|
||||
API running at: <span className="font-mono">localhost:8000</span>
|
||||
</p>
|
||||
</div>
|
||||
</main>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user