Files
whyrating/packages/api/tests/fix-document-embeddings.ts
2026-02-04 01:55:00 +01:00

100 lines
3.0 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Script to fix embeddings for a specific document with corrupted embeddings
* Run: pnpm with-env npx tsx packages/api/tests/fix-document-embeddings.ts
*/
import { generateDocumentEmbeddings } from "@turbostarter/ai/pdf/embeddings";
import { sql } from "@turbostarter/db";
import { pdfEmbedding } from "@turbostarter/db/schema/pdf";
import { db } from "@turbostarter/db/server";
const DOC_ID = "JnOv9Z7JK2ZWU92OrFFFuuCY1TSksaWv";
async function fix() {
console.log(`\n=== Fixing embeddings for document: ${DOC_ID} ===\n`);
// 1. Get the document path
const docResult = await db.execute<{
id: string;
path: string;
name: string | null;
}>(sql`
SELECT id, path, name
FROM pdf.document
WHERE id = ${DOC_ID}
`);
const docs = Array.isArray(docResult) ? docResult : [];
if (docs.length === 0) {
console.log("❌ Document not found!");
process.exit(1);
}
const doc = docs[0]!;
console.log(`📄 Document: ${doc.name ?? doc.id}`);
console.log(`📁 Path: ${doc.path}`);
// 2. Delete existing (corrupted) embeddings
console.log("\n🗑 Deleting corrupted embeddings...");
await db.execute(sql`
DELETE FROM pdf.embedding
WHERE document_id = ${DOC_ID}
`);
console.log(" Done.");
// 3. Regenerate embeddings from the actual PDF
console.log("\n📊 Generating new embeddings from PDF...");
try {
const generated = await generateDocumentEmbeddings(doc.path);
console.log(` Generated ${generated.length} chunks`);
if (generated.length === 0) {
console.log("⚠️ No chunks generated - PDF may be empty or unreadable");
process.exit(1);
}
// Show sample of new embeddings
console.log("\n Sample content:");
for (let i = 0; i < Math.min(3, generated.length); i++) {
const chunk = generated[i]!;
console.log(` Page ${chunk.metadata.pageNumber}: "${chunk.content.substring(0, 60)}..."`);
}
// 4. Insert new embeddings
console.log("\n💾 Inserting new embeddings...");
await db
.insert(pdfEmbedding)
.values(
generated.map((chunk) => ({
content: chunk.content,
documentId: DOC_ID,
embedding: chunk.embedding,
pageNumber: chunk.metadata.pageNumber,
charStart: chunk.metadata.charStart,
charEnd: chunk.metadata.charEnd,
sectionTitle: chunk.metadata.sectionTitle,
})),
);
console.log(` ✅ Inserted ${generated.length} embeddings`);
// 5. Verify
const countResult = await db.execute<{ count: number }>(sql`
SELECT COUNT(*)::int as count
FROM pdf.embedding
WHERE document_id = ${DOC_ID}
`);
const count = (countResult as { count: number }[])[0]?.count ?? 0;
console.log(`\n✅ Verification: Document now has ${count} embeddings`);
} catch (error) {
console.error("\n❌ Error generating embeddings:", error);
process.exit(1);
}
process.exit(0);
}
fix().catch((e) => {
console.error("Fatal error:", e);
process.exit(1);
});