feat: whyrating - initial project from turbostarter boilerplate
This commit is contained in:
3
packages/ai/eslint.config.js
Normal file
3
packages/ai/eslint.config.js
Normal file
@@ -0,0 +1,3 @@
|
||||
import baseConfig from "@turbostarter/eslint-config/base";
|
||||
|
||||
export default baseConfig;
|
||||
56
packages/ai/package.json
Normal file
56
packages/ai/package.json
Normal file
@@ -0,0 +1,56 @@
|
||||
{
|
||||
"name": "@turbostarter/ai",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"type": "module",
|
||||
"exports": {
|
||||
".": "./src/index.ts",
|
||||
"./env": "./src/env.ts",
|
||||
"./chat/*": "./src/modules/chat/*.ts",
|
||||
"./image/*": "./src/modules/image/*.ts",
|
||||
"./pdf/*": "./src/modules/pdf/*.ts",
|
||||
"./tts/*": "./src/modules/tts/*.ts",
|
||||
"./stt/*": "./src/modules/stt/*.ts",
|
||||
"./credits/*": "./src/modules/credits/*.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "git clean -xdf .cache .turbo dist node_modules",
|
||||
"format": "prettier --check . --ignore-path ../../.gitignore",
|
||||
"lint": "eslint",
|
||||
"test": "vitest run",
|
||||
"test:watch": "vitest --watch",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@turbostarter/eslint-config": "workspace:*",
|
||||
"@turbostarter/prettier-config": "workspace:*",
|
||||
"@turbostarter/tsconfig": "workspace:*",
|
||||
"@turbostarter/vitest-config": "workspace:*",
|
||||
"eslint": "catalog:",
|
||||
"prettier": "catalog:",
|
||||
"typescript": "catalog:",
|
||||
"vitest": "catalog:"
|
||||
},
|
||||
"prettier": "@turbostarter/prettier-config",
|
||||
"dependencies": {
|
||||
"@ai-sdk/anthropic": "2.0.41",
|
||||
"@ai-sdk/deepseek": "1.0.27",
|
||||
"@ai-sdk/fireworks": "1.0.27",
|
||||
"@ai-sdk/google": "2.0.28",
|
||||
"@ai-sdk/openai": "2.0.68",
|
||||
"@ai-sdk/replicate": "1.0.17",
|
||||
"@ai-sdk/xai": "2.0.31",
|
||||
"@anthropic-ai/sdk": "0.71.2",
|
||||
"@elevenlabs/elevenlabs-js": "2.9.0",
|
||||
"@langchain/community": "1.0.0",
|
||||
"@langchain/core": "1.0.3",
|
||||
"@tavily/core": "0.5.12",
|
||||
"@turbostarter/db": "workspace:*",
|
||||
"@turbostarter/shared": "workspace:*",
|
||||
"@turbostarter/storage": "workspace:*",
|
||||
"ai": "catalog:",
|
||||
"openai": "4.103.0",
|
||||
"pdf-parse": "1.1.1",
|
||||
"zod": "catalog:"
|
||||
}
|
||||
}
|
||||
25
packages/ai/src/env.ts
Normal file
25
packages/ai/src/env.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig, NodeEnv } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Server-side variables read by the AI package. Both keys are optional at
 * validation time. Defined once so `preset` and `env` cannot drift apart.
 */
const serverSchema = {
  ELEVENLABS_API_KEY: z.string().optional(),
  TAVILY_API_KEY: z.string().optional(),
};

/** Preset describing this package's variables under the id `"ai"`. */
export const preset = {
  id: "ai",
  server: serverSchema,
} as const satisfies Preset;

/**
 * Validated environment for the AI package: the shared `envConfig` plus
 * `NODE_ENV` (defaults to development) and the server keys above.
 */
export const env = defineEnv({
  ...envConfig,
  shared: {
    NODE_ENV: z.enum(NodeEnv).default(NodeEnv.DEVELOPMENT),
  },
  server: { ...serverSchema },
});
|
||||
2
packages/ai/src/index.ts
Normal file
2
packages/ai/src/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export * from "./types";
|
||||
export * from "./utils/common";
|
||||
278
packages/ai/src/modules/chat/api.ts
Normal file
278
packages/ai/src/modules/chat/api.ts
Normal file
@@ -0,0 +1,278 @@
|
||||
import {
|
||||
convertToModelMessages,
|
||||
createUIMessageStream,
|
||||
createUIMessageStreamResponse,
|
||||
smoothStream,
|
||||
stepCountIs,
|
||||
streamText,
|
||||
} from "ai";
|
||||
import * as z from "zod";
|
||||
|
||||
import { and, eq } from "@turbostarter/db";
|
||||
import { chat, message, part } from "@turbostarter/db/schema/chat";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { omitBy } from "@turbostarter/shared/utils";
|
||||
import { getDeleteUrl, getSignedUrl } from "@turbostarter/storage/server";
|
||||
|
||||
import { repairToolCall } from "../../utils/llm";
|
||||
|
||||
import { MODELS, PROMPTS } from "./constants";
|
||||
import { modelStrategies } from "./strategies";
|
||||
import { toolStrategies } from "./tools";
|
||||
import { Role, Tool } from "./types";
|
||||
import { generateChatName, getProviderOptions, toChatMessage } from "./utils";
|
||||
|
||||
import type { ChatMessagePayload } from "./schema";
|
||||
import type {
|
||||
InsertChat,
|
||||
InsertMessage,
|
||||
InsertPart,
|
||||
} from "@turbostarter/db/schema/chat";
|
||||
|
||||
/** Shape of part details that carry a `path` string. */
const pathDetailsSchema = z.object({ path: z.string() });

/**
 * Type guard: true when `details` parses as an object with a string `path`.
 */
const hasPath = (details: unknown): details is { path: string } => {
  const parsed = pathDetailsSchema.safeParse(details);
  return parsed.success;
};
|
||||
|
||||
/**
 * Inserts a chat row; if a row with the same `id` already exists it is
 * updated with `data` instead. Resolves to the affected rows.
 *
 * NOTE(review): on conflict every column present in `data` is overwritten,
 * including `userId` when supplied. Confirm callers verify chat ownership
 * before calling this with a client-provided id.
 */
export const createChat = async (data: InsertChat) => {
  const upsert = db
    .insert(chat)
    .values(data)
    .onConflictDoUpdate({ target: chat.id, set: data });

  return upsert.returning();
};
|
||||
|
||||
/** Applies a partial update to the chat with the given `id`. */
export const updateChat = async (id: string, data: Partial<InsertChat>) => {
  const matchesId = eq(chat.id, id);
  return db.update(chat).set(data).where(matchesId);
};
|
||||
|
||||
/** Looks up a single chat by `id`; resolves to `undefined` when none matches. */
export const getChat = async (id: string) => {
  const matchesId = eq(chat.id, id);
  return db.query["chat.chat"].findFirst({ where: matchesId });
};
|
||||
|
||||
/**
 * Removes a stored attachment: obtains a delete URL for `path` from storage
 * and issues an HTTP DELETE against it. The response is not inspected.
 */
const deleteAttachment = async (path: string) => {
  const signed = await getDeleteUrl({ path });
  await fetch(signed.url, { method: "DELETE" });
};
|
||||
|
||||
/**
 * Deletes a chat and, in the background, the files attached to it.
 *
 * @returns the deleted chat row, or `undefined` when no chat had that `id`.
 */
export const deleteChat = async (id: string) => {
  // File parts are collected before the chat row is removed.
  const attachments = await getFileParts(id);

  const removed = await db.delete(chat).where(eq(chat.id, id)).returning();
  const deleted = removed[0];

  if (!deleted) {
    return;
  }

  const cleanups: Promise<void>[] = [];
  for (const { details } of attachments) {
    if (hasPath(details)) {
      cleanups.push(deleteAttachment(details.path));
    }
  }

  // Best effort: storage cleanup is not awaited and failures are ignored.
  void Promise.allSettled(cleanups);

  return deleted;
};
|
||||
|
||||
/** Lists the chats owned by `userId`, newest first. */
export const getUserChats = async (userId: string) => {
  const ownedByUser = eq(chat.userId, userId);

  return db.query["chat.chat"].findMany({
    where: ownedByUser,
    orderBy: (columns, { desc }) => [desc(columns.createdAt)],
  });
};
|
||||
|
||||
/**
 * Inserts a message row; when a row with the same `id` exists it is updated
 * with `data` instead.
 */
export const createMessage = async (data: InsertMessage) => {
  const insertion = db.insert(message).values(data);
  return insertion.onConflictDoUpdate({ target: message.id, set: data });
};
|
||||
|
||||
/**
 * Inserts message parts, silently skipping rows that conflict with existing
 * ones.
 *
 * An empty `data` array is a no-op: the insert builder rejects an empty
 * `values()` call, and a message may legitimately have no parts.
 */
export const createParts = async (data: InsertPart[]) => {
  if (data.length === 0) {
    return;
  }

  return db.insert(part).values(data).onConflictDoNothing();
};
|
||||
|
||||
/** Returns every part of type `"file"` belonging to messages of `chatId`. */
export const getFileParts = async (chatId: string) => {
  const isFileOfChat = and(eq(message.chatId, chatId), eq(part.type, "file"));

  const rows = await db
    .select()
    .from(part)
    .innerJoin(message, eq(part.messageId, message.id))
    .where(isFileOfChat);

  // Each joined row carries both tables; only the part is of interest.
  return rows.map((row) => row.part);
};
|
||||
|
||||
/**
 * Loads the messages of chat `id` in creation order, each with its parts
 * sorted by `order`.
 */
export const getChatMessages = async (id: string) => {
  const inChat = eq(message.chatId, id);

  return db.query["chat.message"].findMany({
    where: inChat,
    orderBy: (columns, { asc }) => [asc(columns.createdAt)],
    with: {
      part: {
        orderBy: (columns, { asc }) => [asc(columns.order)],
      },
    },
  });
};
|
||||
|
||||
/**
 * Loads a chat's messages and adds a `parts` array to each one in which file
 * parts carry a freshly signed `url` derived from their stored `path`.
 *
 * File parts whose details have no `path` end up with empty `details`;
 * non-file parts are passed through unchanged.
 */
export const getChatMessagesWithAttachments = async (id: string) => {
  const messages = await getChatMessages(id);

  type StoredPart = (typeof messages)[number]["part"][number];

  const resolvePart = async (stored: StoredPart) => {
    if (stored.type !== "file") {
      return stored;
    }

    if (!hasPath(stored.details)) {
      return { ...stored, details: {} };
    }

    const { url } = await getSignedUrl({ path: stored.details.path });
    return { ...stored, details: { ...stored.details, url } };
  };

  return Promise.all(
    messages.map(async (stored) => ({
      ...stored,
      parts: await Promise.all(
        stored.part.map((entry) => resolvePart(entry)),
      ),
    })),
  );
};
|
||||
|
||||
/**
 * Ensures a chat row exists for `id` and, when it has no name yet, kicks off
 * title generation from `content` in the background.
 *
 * The title is generated without blocking the caller. Failures there are
 * logged rather than left as an unhandled promise rejection, so a model or
 * database error cannot take the process down.
 *
 * NOTE(review): `createChat` overwrites `userId` when the id already exists.
 * If `id` can come from the client, confirm ownership is verified upstream.
 *
 * @returns the upserted chat row (as returned by `createChat`).
 */
const upsertChat = async ({
  id,
  content,
  userId,
}: {
  id: string;
  content: string;
  userId: string;
}) => {
  const [chat] = await createChat({ id, userId });

  if (!chat?.name) {
    void generateChatName(content)
      .then((name) => updateChat(id, { name }))
      .catch((error: unknown) => {
        console.error(error);
      });
  }

  return chat;
};
|
||||
|
||||
/**
 * Handles one chat turn: makes sure the chat exists, stores the incoming
 * message and its parts, streams the model's reply, and stores the assistant
 * message once the stream finishes.
 *
 * @param chatId - chat the message belongs to
 * @param userId - user on whose behalf the chat is upserted
 * @param signal - abort signal forwarded to the model call
 * @returns a streaming HTTP response carrying the UI message stream
 * @throws Error("Model not found!") when the requested model is not in `MODELS`
 */
export const streamChat = async ({
  chatId,
  userId,
  signal,
  ...message
}: ChatMessagePayload & { signal: AbortSignal; userId: string }) => {
  const { options } = message.metadata;

  // Only the text parts are handed to the chat-title generator.
  const textContent = message.parts
    .filter((part) => part.type === "text")
    .map((part) => part.text)
    .join("\n");

  await upsertChat({ id: chatId, content: textContent, userId });

  // History is read before the new message is stored; the new message is
  // appended explicitly when the prompt is built below.
  const messages = await getChatMessagesWithAttachments(chatId);

  await createMessage({ ...message, chatId });
  await createParts(
    message.parts.map(({ type, ...details }, order) => ({
      type,
      order,
      // The `url` of a file part is not stored; it is re-signed from `path`
      // when the history is read.
      details:
        type === "file" ? omitBy(details, (_, key) => key === "url") : details,
      messageId: message.id,
    })),
  );

  const providerOptions = getProviderOptions(options);

  const model = MODELS.find(({ id }) => id === options.model);
  if (!model) {
    throw new Error("Model not found!");
  }

  const stream = createUIMessageStream({
    execute: ({ writer }) => {
      const result = streamText({
        model: modelStrategies.languageModel(model.id),
        messages: convertToModelMessages([
          ...messages.map(toChatMessage),
          message,
        ]),
        system: PROMPTS.SYSTEM,
        stopWhen: stepCountIs(5),
        abortSignal: signal,
        // Tools are only wired up for models flagged as tool-capable; web
        // search is activated only when the client asked for it.
        ...(model.tools && {
          tools: toolStrategies(writer),
          activeTools: [...(options.search ? [Tool.WEB_SEARCH] : [])],
          experimental_repairToolCall: repairToolCall,
        }),
        providerOptions,
        experimental_transform: smoothStream({
          chunking: "word",
          delayInMs: 15,
        }),
        onError: (error) => {
          console.error(error);
        },
      });

      // Started without awaiting.
      void result.consumeStream();

      writer.merge(
        result.toUIMessageStream({
          originalMessages: messages.map(toChatMessage),
          // Options are attached to the reply once, on the "start" part.
          messageMetadata: ({ part }) =>
            part.type === "start" ? { options } : undefined,
          sendReasoning: options.reason,
        }),
      );
    },
    onFinish: async ({ responseMessage }) => {
      const { id: messageId, parts } = responseMessage;

      await createMessage({ id: messageId, chatId, role: Role.ASSISTANT });
      await createParts(
        parts.map(({ type, ...details }, order) => ({
          type,
          details,
          messageId,
          order,
        })),
      );
    },
  });

  return createUIMessageStreamResponse({
    stream,
    headers: {
      "Content-Type": "application/octet-stream",
      "Content-Encoding": "none",
    },
  });
};
|
||||
119
packages/ai/src/modules/chat/constants.ts
Normal file
119
packages/ai/src/modules/chat/constants.ts
Normal file
@@ -0,0 +1,119 @@
|
||||
import { Provider } from "../../types";
|
||||
|
||||
import { Model } from "./types";
|
||||
|
||||
/**
 * Catalogue of selectable chat models.
 *
 * - `id` / `provider` / `name`: internal id, vendor and display label.
 * - `reason`: the model may be run with reasoning enabled.
 * - `tools`: tool calling is wired up for the model.
 * - `attachments`: presumably whether file attachments are offered for the
 *   model — not read in this module; confirm against the UI.
 */
export const MODELS = [
  // OpenAI
  {
    id: Model.GPT_5_1,
    provider: Provider.OPENAI,
    name: "GPT-5.1",
    reason: false,
    tools: true,
    attachments: true,
  },
  {
    id: Model.GPT_4O,
    provider: Provider.OPENAI,
    name: "GPT-4o",
    reason: false,
    tools: true,
    attachments: true,
  },
  {
    id: Model.O4_MINI,
    provider: Provider.OPENAI,
    name: "o4-mini",
    reason: true,
    tools: true,
    attachments: true,
  },
  {
    id: Model.O3,
    provider: Provider.OPENAI,
    name: "o3",
    reason: true,
    tools: true,
    attachments: false,
  },
  // Google
  {
    id: Model.GEMINI_2_5_PRO,
    provider: Provider.GEMINI,
    name: "Gemini 2.5 Pro",
    reason: false,
    tools: true,
    attachments: true,
  },
  {
    id: Model.GEMINI_2_5_FLASH,
    provider: Provider.GEMINI,
    name: "Gemini 2.5 Flash",
    reason: false,
    tools: true,
    attachments: true,
  },
  // Anthropic
  {
    id: Model.CLAUDE_4_SONNET,
    provider: Provider.CLAUDE,
    name: "Claude 4 Sonnet",
    reason: false,
    tools: true,
    attachments: true,
  },
  {
    id: Model.CLAUDE_3_7_SONNET,
    provider: Provider.CLAUDE,
    name: "Claude 3.7 Sonnet",
    reason: true,
    tools: true,
    attachments: true,
  },
  // xAI
  {
    id: Model.GROK_4,
    provider: Provider.GROK,
    name: "Grok 4",
    reason: false,
    tools: true,
    attachments: false,
  },
  {
    id: Model.GROK_3,
    provider: Provider.GROK,
    name: "Grok 3",
    reason: true,
    tools: true,
    attachments: false,
  },
  // DeepSeek
  {
    id: Model.DEEPSEEK_V3,
    provider: Provider.DEEPSEEK,
    name: "DeepSeek V3",
    reason: false,
    tools: true,
    attachments: false,
  },
  {
    id: Model.DEEPSEEK_R1,
    provider: Provider.DEEPSEEK,
    name: "DeepSeek R1",
    reason: true,
    tools: false,
    attachments: false,
  },
] as const;
|
||||
|
||||
/**
 * Prompt templates used by the chat module.
 *
 * - `CHAT_NAME`: system prompt for generating a chat title.
 * - `SYSTEM`: system prompt for the conversation itself. It is a getter so
 *   the embedded date is computed on every read; a plain property would
 *   freeze the date at module load and go stale in a long-running process.
 */
export const PROMPTS = {
  CHAT_NAME: `- you will generate a short title based on the first message a user begins a conversation with
- ensure it is not more than 80 characters long
- the title should be a summary of the user's message
- the title should creative and unique
- do not use quotes or colons`,
  get SYSTEM(): string {
    const today = new Date().toLocaleDateString("en-US", {
      year: "numeric",
      month: "short",
      day: "2-digit",
      weekday: "short",
    });

    return `- You are a digital friend that helps users with fun and engaging conversations sometimes likes to be funny but serious at the same time.
- Today's date is ${today}.
- You can use markdown formatting with tables too when needed.
- You can use latex formtting:
- Use $ for inline equations
- Use $$ for block equations
- Use "USD" for currency (not $)
- No need to use bold or italic formatting in tables.
- don't use the h1 heading in the markdown response.`;
  },
};
|
||||
55
packages/ai/src/modules/chat/schema.ts
Normal file
55
packages/ai/src/modules/chat/schema.ts
Normal file
@@ -0,0 +1,55 @@
|
||||
import * as z from "zod";
|
||||
|
||||
import { Model, Role } from "./types";
|
||||
|
||||
/** A boolean flag that may be omitted and then defaults to `false`. */
const optionalFlag = () => z.boolean().optional().default(false);

/** Per-message generation options chosen by the client. */
export const chatMessageOptionsSchema = z.object({
  reason: optionalFlag(),
  search: optionalFlag(),
  model: z.enum(Model),
});

/** Metadata attached to a chat message; currently just the options. */
export const chatMessageMetadataSchema = z.object({
  options: chatMessageOptionsSchema,
});

/** Text part; unknown extra keys are accepted and kept. */
const textPartSchema = z
  .object({
    type: z.literal("text"),
    text: z.string(),
  })
  .catchall(z.unknown());

/** File part; `path` is optional. */
const filePartSchema = z.object({
  type: z.literal("file"),
  filename: z.string(),
  mediaType: z.string(),
  url: z.string(),
  path: z.string().optional(),
});

/** A message part, discriminated on `type` (`"text"` or `"file"`). */
export const chatMessagePartSchema = z.discriminatedUnion("type", [
  textPartSchema,
  filePartSchema,
]);

/** Incoming chat message payload; `role` defaults to the user role. */
export const chatMessageSchema = z.object({
  id: z.string(),
  chatId: z.string(),
  parts: z.array(chatMessagePartSchema),
  role: z.enum(Role).optional().default(Role.USER),
  metadata: chatMessageMetadataSchema,
});
|
||||
|
||||
/** Parsed output of `chatMessageSchema`. */
export type ChatMessagePayload = z.infer<typeof chatMessageSchema>;

/** Parsed output of `chatMessagePartSchema`. */
export type ChatMessagePartPayload = z.infer<typeof chatMessagePartSchema>;

/** Parsed output of `chatMessageOptionsSchema`. */
export type ChatMessageOptionsPayload = z.infer<typeof chatMessageOptionsSchema>;

/** Parsed output of `chatMessageMetadataSchema`. */
export type ChatMessageMetadataPayload = z.infer<typeof chatMessageMetadataSchema>;

// API input type aliases
/** Alias of `ChatMessagePayload` for API input positions. */
export type ChatMessageInput = ChatMessagePayload;
|
||||
|
||||
export {
|
||||
selectChatSchema as chatSchema,
|
||||
selectMessageSchema as messageSchema,
|
||||
selectPartSchema as partSchema,
|
||||
} from "@turbostarter/db/schema/chat";
|
||||
27
packages/ai/src/modules/chat/strategies.ts
Normal file
27
packages/ai/src/modules/chat/strategies.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
import { anthropic } from "@ai-sdk/anthropic";
|
||||
import { deepseek } from "@ai-sdk/deepseek";
|
||||
import { google } from "@ai-sdk/google";
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { xai } from "@ai-sdk/xai";
|
||||
import { customProvider } from "ai";
|
||||
|
||||
import { cached } from "../../utils/llm";
|
||||
|
||||
import { Model } from "./types";
|
||||
|
||||
/**
 * Maps each internal `Model` id to the provider model that serves it. Every
 * model is wrapped with `cached`.
 */
const languageModels = {
  [Model.GPT_5_1]: cached(openai.responses("gpt-5.1-chat-latest")),
  [Model.GPT_4O]: cached(openai.responses("gpt-4o")),
  // NOTE(review): the "o3" id is served by "o3-mini" — confirm this is intended.
  [Model.O3]: cached(openai.responses("o3-mini")),
  [Model.O4_MINI]: cached(openai.responses("o4-mini")),
  [Model.GEMINI_2_5_PRO]: cached(google("gemini-2.5-pro")),
  [Model.GEMINI_2_5_FLASH]: cached(google("gemini-2.5-flash")),
  // NOTE(review): the "claude-4-sonnet" id is served by "claude-sonnet-4-5".
  [Model.CLAUDE_4_SONNET]: cached(anthropic("claude-sonnet-4-5")),
  [Model.CLAUDE_3_7_SONNET]: cached(anthropic("claude-3-7-sonnet-latest")),
  [Model.GROK_4]: cached(xai("grok-4")),
  // NOTE(review): the "grok-3" id is served by "grok-3-mini-fast".
  [Model.GROK_3]: cached(xai("grok-3-mini-fast")),
  [Model.DEEPSEEK_V3]: cached(deepseek("deepseek-chat")),
  [Model.DEEPSEEK_R1]: cached(deepseek("deepseek-reasoner")),
};

/** Custom provider resolving internal model ids via `languageModel(id)`. */
export const modelStrategies = customProvider({ languageModels });
|
||||
11
packages/ai/src/modules/chat/tools/index.ts
Normal file
11
packages/ai/src/modules/chat/tools/index.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import { Tool } from "../types";
|
||||
|
||||
import { webSearch } from "./search";
|
||||
|
||||
import type { InferUITools, UIMessageStreamWriter } from "ai";
|
||||
|
||||
/**
 * Builds the tool set for one chat stream. Tools receive the stream `writer`
 * so they can emit data parts while running.
 */
export const toolStrategies = (writer: UIMessageStreamWriter) => {
  const search = webSearch(writer);

  return { [Tool.WEB_SEARCH]: search };
};

/** UI-facing tool types inferred from `toolStrategies`. */
export type ChatTools = InferUITools<ReturnType<typeof toolStrategies>>;
|
||||
233
packages/ai/src/modules/chat/tools/search.ts
Normal file
233
packages/ai/src/modules/chat/tools/search.ts
Normal file
@@ -0,0 +1,233 @@
|
||||
import { tavily } from "@tavily/core";
|
||||
import { tool } from "ai";
|
||||
import * as z from "zod";
|
||||
|
||||
import { env } from "../../../env";
|
||||
|
||||
import type { TavilyClient } from "@tavily/core";
|
||||
import type { InferUITool, UIMessageStreamWriter } from "ai";
|
||||
|
||||
// Lazy initialization to avoid throwing at module load time
|
||||
let _client: TavilyClient | null = null;

/**
 * Returns the shared Tavily client, creating it on first use.
 *
 * @throws Error when `TAVILY_API_KEY` is not set.
 */
const getClient = () => {
  if (_client) {
    return _client;
  }

  const apiKey = env.TAVILY_API_KEY;
  if (!apiKey) {
    throw new Error("TAVILY_API_KEY is required for web search");
  }

  _client = tavily({ apiKey });
  return _client;
};
|
||||
|
||||
/** Replaces every run of whitespace in `url` with a single `%20`. */
const sanitizeUrl = (url: string): string => {
  const segments = url.split(/\s+/);
  return segments.join("%20");
};
|
||||
|
||||
/**
 * Checks whether `url` answers a HEAD request with a successful status and an
 * `image/*` content type. The request is aborted after 5 seconds.
 *
 * @returns `false` on any failure (network error, invalid URL, timeout).
 */
const isValidImageUrl = async (url: string) => {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch(url, {
      method: "HEAD",
      signal: controller.signal,
    });

    return (
      response.ok &&
      (response.headers.get("content-type")?.startsWith("image/") ?? false)
    );
  } catch {
    return false;
  } finally {
    // Always release the timer, including when fetch rejects, so it does not
    // linger for the full timeout.
    clearTimeout(timeout);
  }
};
|
||||
|
||||
/**
 * Returns the host part of an http(s) URL, or the input unchanged when it
 * does not look like one (for example a bare domain).
 */
const extractDomain = (url: string): string => {
  const urlPattern = /^https?:\/\/([^/?#]+)(?:[/?#]|$)/i;
  return urlPattern.exec(url)?.[1] ?? url;
};

/**
 * Normalizes a list of domains or URLs to bare hosts.
 *
 * Entries are trimmed and blank ones are dropped, so no empty string is
 * forwarded to the search API.
 *
 * @returns the cleaned list, or `undefined` when nothing usable remains.
 */
const processDomains = (domains?: string[]): string[] | undefined => {
  if (!domains || domains.length === 0) return undefined;

  const processedDomains = domains
    .map((domain) => extractDomain(domain.trim()).trim())
    .filter((domain) => domain !== "");

  return processedDomains.length > 0 ? processedDomains : undefined;
};

/**
 * Keeps the first item seen for each domain, preserving input order.
 *
 * Tracking domains alone is enough: a repeated URL always has an
 * already-seen domain, because the domain is derived from the URL.
 */
const deduplicateByDomainAndUrl = <T extends { url: string }>(
  items: T[],
): T[] => {
  const seenDomains = new Set<string>();

  return items.filter((item) => {
    const domain = extractDomain(item.url);

    if (seenDomains.has(domain)) {
      return false;
    }

    seenDomains.add(domain);
    return true;
  });
};
|
||||
|
||||
/**
 * Creates the web-search tool for one chat stream.
 *
 * Each query is sent to Tavily concurrently. Progress is reported to the
 * client through `data-query_completion` parts ("started", then "completed"
 * or "error"). A failing query yields empty results instead of failing the
 * whole tool call; an unexpected failure of the batch yields no searches.
 */
export const webSearch = (writer: UIMessageStreamWriter) => {
  const querySchema = z.object({
    q: z
      .string()
      .describe(
        "Search query to look up on the web. At least 5 characters length.",
      ),
    topic: z.enum(["general", "news"]).describe("Topic type to search for."),
    maxResults: z
      .number()
      .describe("Maximum number of results to return. Up to 10, 3 by default."),
  });

  const inputSchema = z.object({
    queries: z
      .array(querySchema)
      .describe(
        "Array of search queries to look up on the web. At least 2 items, at most 5.",
      ),
    excludeDomains: z
      .array(z.string())
      .describe(
        "A list of domains to exclude from all search results. Default is [] (empty array).",
      ),
    timeRange: z
      .enum(["year", "month", "week", "day", "y", "m", "w", "d"])
      .describe(
        "The time range to search for. Defaults to undefined - all time.",
      ),
  });

  return tool({
    description:
      "Search the web for information with multiple queries, max results and time range.",
    inputSchema,
    execute: async ({ queries, excludeDomains, timeRange }) => {
      type Query = (typeof queries)[number];

      // Emits one progress part for a query.
      const report = (
        query: Query,
        index: number,
        status: "started" | "completed" | "error",
        resultsCount = 0,
        imagesCount = 0,
      ) => {
        writer.write({
          type: "data-query_completion",
          data: {
            query,
            index,
            total: queries.length,
            status,
            resultsCount,
            imagesCount,
          },
        });
      };

      const runQuery = async (query: Query, index: number) => {
        try {
          report(query, index, "started");

          const data = await getClient().search(query.q, {
            topic: query.topic,
            // News searches are limited to the last 7 days.
            days: query.topic === "news" ? 7 : undefined,
            maxResults: query.maxResults,
            searchDepth: "basic",
            includeAnswer: true,
            includeImages: true,
            includeImageDescriptions: true,
            excludeDomains: processDomains(excludeDomains),
            timeRange,
          });

          // Counts are reported before de-duplication and image validation.
          report(
            query,
            index,
            "completed",
            data.results.length,
            data.images.length,
          );

          const results = deduplicateByDomainAndUrl(data.results).map(
            (result) => ({
              url: result.url,
              title: result.title,
              content: result.content,
              rawContent: result.rawContent,
              publishedDate:
                query.topic === "news" ? result.publishedDate : undefined,
            }),
          );

          const checkedImages = await Promise.all(
            deduplicateByDomainAndUrl(data.images).map(async (image) => {
              const sanitizedUrl = sanitizeUrl(image.url);
              const reachable = await isValidImageUrl(sanitizedUrl);

              return reachable
                ? { url: sanitizedUrl, description: image.description ?? "" }
                : null;
            }),
          );

          // Only reachable images that have a description are returned.
          const images = checkedImages.filter(
            (img): img is { url: string; description: string } =>
              img !== null && img.description !== "",
          );

          return { query, results, images };
        } catch (error) {
          console.error(error);
          report(query, index, "error");

          return { query, results: [], images: [] };
        }
      };

      try {
        const searchPromises = queries.map((query, index) =>
          runQuery(query, index),
        );

        return { searches: await Promise.all(searchPromises) };
      } catch (error) {
        console.error(error);
        return { searches: [] };
      }
    },
  });
};
|
||||
|
||||
/** UI-facing type of the web-search tool. */
export type WebSearchTool = InferUITool<ReturnType<typeof webSearch>>;

/**
 * Payload of a `data-query_completion` stream part: progress of one search
 * query within a web-search tool call.
 */
export interface DataQueryCompletionPart {
  /** The query as requested by the model. */
  query: {
    q: string;
    topic: string;
    maxResults: number;
  };
  /** Position of the query within the tool call. */
  index: number;
  /** Number of queries in the tool call. */
  total: number;
  status: "started" | "completed" | "error";
  /** Result count reported for the query; 0 unless `status` is "completed". */
  resultsCount: number;
  /** Image count reported for the query; 0 unless `status` is "completed". */
  imagesCount: number;
}
|
||||
59
packages/ai/src/modules/chat/types.ts
Normal file
59
packages/ai/src/modules/chat/types.ts
Normal file
@@ -0,0 +1,59 @@
|
||||
export type {
|
||||
SelectChat as Chat,
|
||||
SelectMessage as Message,
|
||||
SelectPart as Part,
|
||||
} from "@turbostarter/db/schema/chat";
|
||||
|
||||
import { messageRoleEnum } from "@turbostarter/db/schema/chat";
|
||||
|
||||
import type { ChatMessageMetadataPayload } from "./schema";
|
||||
import type { ChatTools } from "./tools";
|
||||
import type { DataQueryCompletionPart } from "./tools/search";
|
||||
import type { EnumToConstant } from "@turbostarter/shared/types";
|
||||
import type { UIMessage } from "ai";
|
||||
|
||||
/** Turns an enum value into a constant-style key: `-` → `_`, upper-cased. */
const toConstantKey = (value: string) => value.replace(/-/g, "_").toUpperCase();

/**
 * Message roles as a constant object, derived from the database role enum so
 * the two cannot drift apart.
 */
export const Role = Object.fromEntries(
  messageRoleEnum.enumValues.map((role) => [toConstantKey(role), role]),
) as EnumToConstant<typeof messageRoleEnum.enumValues>;

/** Union of the role values. */
export type Role = (typeof Role)[keyof typeof Role];
|
||||
|
||||
/**
 * Internal model ids. These are the application's own identifiers; the
 * provider model names are mapped separately.
 */
export const Model = {
  // OpenAI
  O3: "o3",
  O4_MINI: "o4-mini",
  GPT_5_1: "gpt-5-1",
  GPT_4O: "gpt-4o",
  // Google
  GEMINI_2_5_PRO: "gemini-2-5-pro",
  GEMINI_2_5_FLASH: "gemini-2-5-flash",
  // Anthropic
  CLAUDE_4_SONNET: "claude-4-sonnet",
  CLAUDE_3_7_SONNET: "claude-3-7-sonnet",
  // xAI
  GROK_4: "grok-4",
  GROK_3: "grok-3",
  // DeepSeek
  DEEPSEEK_V3: "deepseek-v3",
  DEEPSEEK_R1: "deepseek-r1",
} as const;

/** Union of the internal model ids. */
export type Model = (typeof Model)[keyof typeof Model];

/** Names of the tools the chat model can call. */
export const Tool = {
  WEB_SEARCH: "web-search",
} as const;

/** Union of the tool names. */
export type Tool = (typeof Tool)[keyof typeof Tool];
|
||||
|
||||
/** Custom data parts a chat stream can carry, keyed by part name. */
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
export type ChatDataParts = {
  query_completion: DataQueryCompletionPart;
};

/** UI message specialised with this module's metadata, data parts and tools. */
export type ChatMessage = UIMessage<
  ChatMessageMetadataPayload,
  ChatDataParts,
  ChatTools
>;

/** A single part of a generic (unspecialised) UI message. */
export type ChatMessagePart = UIMessage["parts"][number];
|
||||
|
||||
export type { ChatTools };
|
||||
100
packages/ai/src/modules/chat/utils.ts
Normal file
100
packages/ai/src/modules/chat/utils.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { generateObject } from "ai";
|
||||
import * as z from "zod";
|
||||
|
||||
import { Credits } from "../credits/utils";
|
||||
|
||||
import { MODELS, PROMPTS } from "./constants";
|
||||
|
||||
import type {
|
||||
ChatMessagePartPayload,
|
||||
ChatMessageOptionsPayload,
|
||||
} from "./schema";
|
||||
import type { Message, Part, ChatMessage, ChatMessagePart } from "./types";
|
||||
import type { AnthropicProviderOptions } from "@ai-sdk/anthropic";
|
||||
import type { OpenAIResponsesProviderOptions } from "@ai-sdk/openai";
|
||||
import type { XaiProviderOptions } from "@ai-sdk/xai";
|
||||
|
||||
/**
 * Asks the model for a chat title based on the user's message.
 *
 * @param content - text of the user's message
 * @returns the generated, non-empty title
 */
export const generateChatName = async (content: string) => {
  const titleSchema = z.object({
    name: z.string().min(1),
  });

  const result = await generateObject({
    model: openai.responses("gpt-4.1-mini"),
    schema: titleSchema,
    system: PROMPTS.CHAT_NAME,
    prompt: `User message: ${content}`,
  });

  return result.object.name;
};
|
||||
|
||||
/**
 * Builds the per-provider options for a request.
 *
 * Reasoning is turned on only when the client asked for it AND the selected
 * model is flagged as reasoning-capable in `MODELS`.
 */
export const getProviderOptions = (options: ChatMessageOptionsPayload) => {
  const selected = MODELS.find(({ id }) => id === options.model);
  const reasoning = Boolean(selected?.reason) && Boolean(options.reason);

  const anthropicOptions = {
    thinking: {
      type: reasoning ? "enabled" : "disabled",
      budgetTokens: 1200,
    },
  } satisfies AnthropicProviderOptions;

  const openaiOptions = {
    ...(reasoning
      ? { reasoningEffort: "medium", reasoningSummary: "detailed" }
      : {}),
    textVerbosity: "medium",
  } satisfies OpenAIResponsesProviderOptions;

  const xaiOptions = {
    ...(reasoning ? { reasoningEffort: "low" } : {}),
  } satisfies XaiProviderOptions;

  return {
    anthropic: anthropicOptions,
    openai: openaiOptions,
    xai: xaiOptions,
  };
};
|
||||
|
||||
/**
 * Computes the credit cost of one chat message: a base charge, plus one
 * default charge each for web search, for reasoning (only on a
 * reasoning-capable model), and for every file attachment.
 */
export const getCreditsDeduction = (
  options: ChatMessageOptionsPayload,
  parts?: ChatMessagePartPayload[],
) => {
  const selected = MODELS.find(({ id }) => id === options.model);
  const { DEFAULT, FREE } = Credits.COST;

  const searchDeduction = options.search ? DEFAULT : FREE;
  const reasoningDeduction = options.reason && selected?.reason ? DEFAULT : FREE;

  const attachmentCount =
    parts?.filter((part) => part.type === "file").length ?? 0;
  const attachmentDeduction = attachmentCount * DEFAULT;

  return DEFAULT + searchDeduction + reasoningDeduction + attachmentDeduction;
};
|
||||
|
||||
/**
 * Converts a stored part back into a UI message part by merging its `details`
 * next to `type`.
 *
 * @returns `null` when `details` is not an object.
 */
export const toChatMessagePart = ({
  type,
  details,
}: Part): ChatMessagePart | null => {
  if (details !== null && typeof details === "object") {
    return { type, ...details } as ChatMessagePart;
  }

  return null;
};
|
||||
|
||||
/**
 * Converts a stored message (optionally with its stored parts) into a UI
 * chat message. Parts that cannot be converted are dropped; a message without
 * parts gets an empty list.
 */
export const toChatMessage = (
  message: Message & {
    parts?: Part[];
  },
): ChatMessage => {
  const stored = message.parts;
  const parts = stored ? stored.map(toChatMessagePart).filter(Boolean) : [];

  return { ...message, parts };
};
|
||||
45
packages/ai/src/modules/credits/config.ts
Normal file
45
packages/ai/src/modules/credits/config.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
import { NodeEnv } from "@turbostarter/shared/constants";
|
||||
|
||||
import { env } from "../../env";
|
||||
|
||||
const nodeEnv = env.NODE_ENV;

/** True when running with `NODE_ENV` set to development. */
const isDevelopment = nodeEnv === NodeEnv.DEVELOPMENT;

/**
 * Single source of truth for credit amounts and prices.
 * Some defaults differ between development and other environments.
 */
export const CreditsConfig = {
  /** Starting balance for a new free-tier user (larger in development). */
  FREE_TIER: isDevelopment ? 10000 : 100,

  /** Balance given to seed/dev users. */
  DEV_SEED: 10000,

  /** Price tiers by operation complexity. */
  COST: {
    FREE: 0,
    LOW: 1,
    MEDIUM: 5,
    HIGH: 10,
    PREMIUM: 25,
  },

  /** Price per feature (for audit logging). */
  FEATURE_COST: {
    chat: 5,
    "text-to-speech": 10,
    "speech-to-text": 5,
    "image-generation": 25,
    "pdf-chat": 10,
  },
} as const;
|
||||
|
||||
/** Name of a price tier in `CreditsConfig.COST`. */
export type CostLevel = keyof typeof CreditsConfig.COST;

/** Name of a priced feature in `CreditsConfig.FEATURE_COST`. */
export type FeatureName = keyof typeof CreditsConfig.FEATURE_COST;

/**
 * Looks up the credit price of a feature.
 *
 * @param feature - one of the keys of `CreditsConfig.FEATURE_COST`
 */
export const getFeatureCost = (feature: FeatureName): number =>
  CreditsConfig.FEATURE_COST[feature];
|
||||
90
packages/ai/src/modules/credits/server.ts
Normal file
90
packages/ai/src/modules/credits/server.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
import { eq, sql } from "@turbostarter/db";
|
||||
import {
|
||||
creditTransaction,
|
||||
customer,
|
||||
} from "@turbostarter/db/schema";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { generateId } from "@turbostarter/shared/utils";
|
||||
|
||||
import { CreditsConfig } from "./config";
|
||||
import { Credits } from "./utils";
|
||||
|
||||
/**
 * Reads the credit balance stored on the customer row of `userId`.
 *
 * @returns The stored `credits`, or `Credits.BALANCE` when there is no
 * customer row or its `credits` value is nullish.
 */
export const getUserCredits = async (userId: string) => {
  const record = await db.query.customer.findFirst({
    where: eq(customer.userId, userId),
  });
  const balance = record?.credits;

  return balance ?? Credits.BALANCE;
};
|
||||
|
||||
/**
 * Fetches the first customer row whose `userId` matches.
 *
 * @returns The customer row, or `undefined` when none exists.
 */
export const getCustomerByUserId = async (userId: string) =>
  db.query.customer.findFirst({ where: eq(customer.userId, userId) });
|
||||
|
||||
/**
 * Builds the update that subtracts `amount` from the user's credit balance.
 * The arithmetic happens in SQL against the current column value.
 *
 * NOTE(review): nothing here stops the balance from going below zero or
 * rejects a negative `amount` — presumably callers check first; confirm.
 */
export const deductUserCredits = (userId: string, amount: number) => {
  const loweredBalance = sql`${customer.credits} - ${amount}`;

  return db
    .update(customer)
    .set({ credits: loweredBalance })
    .where(eq(customer.userId, userId));
};
|
||||
|
||||
/**
 * Builds the update that adds `amount` to the user's credit balance.
 * The arithmetic happens in SQL against the current column value.
 */
export const addUserCredits = (userId: string, amount: number) => {
  const raisedBalance = sql`${customer.credits} + ${amount}`;

  return db
    .update(customer)
    .set({ credits: raisedBalance })
    .where(eq(customer.userId, userId));
};
|
||||
|
||||
/**
 * Inserts a free-plan customer row for `userId` together with the matching
 * "signup" credit transaction. Both inserts run inside one transaction.
 *
 * @param userId - Owner of the new customer record.
 * @returns The generated customer id and the number of credits granted
 * (`CreditsConfig.FREE_TIER`).
 */
export const createFreeCustomer = async (userId: string) => {
  const credits = CreditsConfig.FREE_TIER;
  const id = generateId();

  await db.transaction(async (tx) => {
    // 1. The customer row itself; free customers get a synthetic customerId.
    await tx.insert(customer).values({
      id,
      userId,
      customerId: `free_${userId}`,
      status: "active",
      plan: "free",
      credits,
    });

    // 2. Audit entry for the welcome grant.
    await tx.insert(creditTransaction).values({
      id: generateId(),
      customerId: id,
      amount: credits,
      type: "signup",
      reason: "Welcome credits for new user",
      balanceAfter: credits,
    });
  });

  return { id, credits };
};
|
||||
|
||||
/**
 * Returns the customer record of `userId`, creating a free one when missing.
 *
 * When a record is created, the returned object is assembled locally from the
 * values passed to `createFreeCustomer` (timestamps are taken from the local
 * clock) rather than re-read from the database.
 */
export const ensureCustomerExists = async (userId: string) => {
  const found = await getCustomerByUserId(userId);
  if (found) {
    return found;
  }

  const created = await createFreeCustomer(userId);

  return {
    id: created.id,
    userId,
    customerId: `free_${userId}`,
    status: "active" as const,
    plan: "free" as const,
    credits: created.credits,
    createdAt: new Date(),
    updatedAt: new Date(),
  };
};
|
||||
32
packages/ai/src/modules/credits/utils.ts
Normal file
32
packages/ai/src/modules/credits/utils.ts
Normal file
@@ -0,0 +1,32 @@
|
||||
/**
 * Baseline credit values shared by the credit helpers.
 * `BALANCE` is the default maximum/fallback balance; `COST` holds price tiers.
 */
export const Credits = {
  BALANCE: 100,
  COST: { FREE: 0, DEFAULT: 5, HIGH: 10 },
};

/** Coarse bucket describing how much of the balance is left. */
export type CreditsLevel = "high" | "medium" | "low";
|
||||
|
||||
/**
 * Tells whether a balance covers a price.
 *
 * @param available - Credits currently available.
 * @param required - Credits the operation costs.
 * @returns `true` when `available` is at least `required`.
 */
export const hasEnoughCredits = (available: number, required: number) =>
  available >= required;
|
||||
|
||||
/**
 * Buckets a balance by the share of `max` it represents:
 * above 50% is "high", above 15% is "medium", anything else is "low".
 *
 * @param credits - Current balance.
 * @param max - Balance regarded as 100%; defaults to `Credits.BALANCE`.
 */
export const getCreditsLevel = (
  credits: number,
  max = Credits.BALANCE,
): CreditsLevel => {
  const percent = getCreditsProgress(credits, max) * 100;

  if (percent > 50) return "high";
  return percent > 15 ? "medium" : "low";
};
|
||||
|
||||
/**
 * Expresses `credits` as a fraction of `max` (plain division, not clamped).
 *
 * @param credits - Current balance.
 * @param max - Balance regarded as 1.0; defaults to `Credits.BALANCE`.
 */
export const getCreditsProgress = (credits: number, max = Credits.BALANCE) => {
  return credits / max;
};
|
||||
182
packages/ai/src/modules/image/api.ts
Normal file
182
packages/ai/src/modules/image/api.ts
Normal file
@@ -0,0 +1,182 @@
|
||||
import { generateId, experimental_generateImage as generateImage } from "ai";
|
||||
|
||||
import { and, desc, eq, inArray, lt } from "@turbostarter/db";
|
||||
import { generation, image } from "@turbostarter/db/schema/image";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { HttpStatusCode } from "@turbostarter/shared/constants";
|
||||
import { HttpException } from "@turbostarter/shared/utils";
|
||||
import { getPublicUrl, getUploadUrl } from "@turbostarter/storage/server";
|
||||
|
||||
import { MODELS } from "./constants";
|
||||
import { modelStrategies } from "./strategies";
|
||||
|
||||
import type {
|
||||
InsertGeneration,
|
||||
InsertImage,
|
||||
} from "@turbostarter/db/schema/image";
|
||||
|
||||
/** Inserts a generation row and resolves to the inserted row(s). */
export const createGeneration = async (data: InsertGeneration) => {
  return db.insert(generation).values(data).returning();
};
|
||||
|
||||
/** Fetches one generation by id, or `undefined` when it does not exist. */
export const getGeneration = async (id: string) => {
  const byId = eq(generation.id, id);

  return db.query["image.generation"].findFirst({ where: byId });
};
|
||||
|
||||
/** Fetches one generation by id with its `image` relation loaded. */
export const getGenerationWithImages = async (id: string) => {
  const byId = eq(generation.id, id);

  return db.query["image.generation"].findFirst({
    where: byId,
    with: { image: true },
  });
};
|
||||
|
||||
/** Applies a partial update to the generation with the given id. */
export const updateGeneration = async (
  id: string,
  data: Partial<InsertGeneration>,
) => {
  return db.update(generation).set(data).where(eq(generation.id, id));
};
|
||||
|
||||
/** Lists every image row that belongs to the given generation. */
export const getGenerationImages = async (id: string) => {
  const ofGeneration = eq(image.generationId, id);

  return db.query["image.image"].findMany({ where: ofGeneration });
};
|
||||
|
||||
/** Deletes every image row that belongs to the given generation. */
export const deleteGenerationImages = async (id: string) => {
  return db.delete(image).where(eq(image.generationId, id));
};
|
||||
|
||||
/**
 * Inserts image rows and resolves to the inserted rows.
 *
 * An empty `data` array resolves to `[]` without touching the database:
 * Drizzle's `values()` throws when it is given no rows.
 */
export const createImages = async (data: InsertImage[]) => {
  if (data.length === 0) {
    return [];
  }

  return db.insert(image).values(data).returning();
};
|
||||
|
||||
/**
 * Lists a user's images, newest first, with their generation loaded.
 *
 * @param userId - Owner of the generations whose images are returned.
 * @param limit - Page size (default 10).
 * @param cursor - When given, only images created strictly before it are returned.
 */
export const getImages = async ({
  userId,
  limit = 10,
  cursor,
}: {
  userId: string;
  limit?: number;
  cursor?: Date;
}) => {
  // Ids of the user's generations. Joining `image` here is unnecessary: an
  // image can only match a generation id it already references, so the join
  // merely duplicated ids inside the IN (...) list.
  const ownedGenerationIds = db
    .select({ id: generation.id })
    .from(generation)
    .where(eq(generation.userId, userId));

  return db.query["image.image"].findMany({
    orderBy: (t) => desc(t.createdAt),
    with: {
      generation: true,
    },
    limit,
    where: and(
      inArray(image.generationId, ownedGenerationIds),
      // `and` ignores undefined conditions, so the cursor filter is optional.
      cursor ? lt(image.createdAt, cursor) : undefined,
    ),
  });
};
|
||||
|
||||
/**
 * Prepares a generation for a re-run: removes its image rows, then restamps
 * `createdAt` and clears `completedAt`.
 */
const resetGeneration = async (id: string) => {
  await deleteGenerationImages(id);

  const restarted = { createdAt: new Date(), completedAt: null };
  await updateGeneration(id, restarted);
};
|
||||
|
||||
/**
 * Uploads base64-encoded images to storage and records the ones that were
 * stored successfully as image rows of `generationId`.
 *
 * Uploads run in parallel and independently: a failed upload is logged and
 * skipped, it does not abort the others. A non-2xx upload response counts as a
 * failure, so no row is created for a file that was never stored.
 *
 * @param images - Base64-encoded image payloads.
 * @param generationId - Generation the stored images belong to.
 */
const saveImages = async ({
  images,
  generationId,
}: {
  images: string[];
  generationId: string;
}) => {
  const uploads = await Promise.allSettled(
    images.map(async (base64) => {
      const path = `images/${generateId()}.png`;
      const { url: uploadUrl } = await getUploadUrl({
        path,
      });

      const response = await fetch(uploadUrl, {
        method: "PUT",
        body: Buffer.from(base64, "base64"),
      });

      // fetch only rejects on network errors; HTTP failures must be checked.
      if (!response.ok) {
        throw new Error(`Image upload failed with status ${response.status}`);
      }

      const { url } = await getPublicUrl({
        path,
      });

      return url;
    }),
  );

  for (const upload of uploads) {
    if (upload.status === "rejected") {
      console.error(
        `Failed to store image for generation ${generationId}:`,
        upload.reason,
      );
    }
  }

  const storedUrls = uploads.flatMap((upload) =>
    upload.status === "fulfilled" ? [upload.value] : [],
  );

  // Inserting zero rows would throw, so stop here when nothing was stored.
  if (storedUrls.length === 0) {
    return;
  }

  await createImages(storedUrls.map((url) => ({ url, generationId })));
};
|
||||
|
||||
/**
 * Runs the image model for a stored generation and returns the results as
 * base64 strings.
 *
 * Steps: load the generation, resolve its model and dimension from `MODELS`,
 * reset it if it already has images, call the model, persist the images in
 * the background, and stamp `completedAt`.
 *
 * @param id - Id of the generation to run.
 * @param abortSignal - Optional signal; on abort the generation is stamped as
 * completed and the model call is cancelled.
 * @returns Base64-encoded image data, one entry per generated image.
 * @throws HttpException (NOT_FOUND) when the generation, its model, or its
 * aspect ratio cannot be resolved.
 *
 * NOTE(review): models are resolved through `modelStrategies`, which in
 * strategies.ts registers only the OpenAI models, while `MODELS` also lists
 * Replicate-backed ones — confirm how those are meant to be reached.
 */
export const generateImages = async ({
  id,
  abortSignal,
}: {
  id: string;
  abortSignal?: AbortSignal;
}) => {
  const generation = await getGenerationWithImages(id);
  const model = MODELS.find((m) => m.id === generation?.model);
  const dimension = model?.dimensions.find(
    (d) => d.id === generation?.aspectRatio,
  );

  if (!generation || !model || !dimension) {
    throw new HttpException(HttpStatusCode.NOT_FOUND);
  }

  // A generation that already has images is being re-run: start clean.
  if (generation.image.length) {
    await resetGeneration(generation.id);
  }

  // addEventListener keeps any handler the caller attached to the signal, and
  // the explicit catch avoids an unhandled rejection if the update fails.
  abortSignal?.addEventListener(
    "abort",
    () => {
      void updateGeneration(generation.id, {
        completedAt: new Date(),
      }).catch((error: unknown) => {
        console.error(
          `Failed to mark aborted generation ${generation.id} as completed:`,
          error,
        );
      });
    },
    { once: true },
  );

  const { images, warnings } = await generateImage({
    model: modelStrategies.imageModel(generation.model),
    prompt: generation.prompt,
    ...(model.dimensionFormat === "size"
      ? { size: dimension.value as `${number}x${number}` }
      : { aspectRatio: dimension.value as `${number}:${number}` }),
    // A random seed is sent to every provider except OpenAI.
    ...(model.provider !== "openai" && {
      seed: Math.floor(Math.random() * 1000000),
    }),
    n: generation.count,
    abortSignal,
  });

  if (warnings.length) {
    console.warn(warnings);
  }

  const encodedImages = images.map((image) => image.base64);

  // Persisting is deliberately not awaited so the caller gets the images
  // immediately; failures are logged instead of surfacing as unhandled
  // rejections.
  void saveImages({
    images: encodedImages,
    generationId: generation.id,
  }).catch((error: unknown) => {
    console.error(
      `Failed to save images for generation ${generation.id}:`,
      error,
    );
  });

  await updateGeneration(generation.id, {
    completedAt: new Date(),
  });

  // Use the public `base64` accessor: the private `base64Data` field is only
  // populated when the provider returned base64 rather than binary data.
  return encodedImages;
};
|
||||
138
packages/ai/src/modules/image/constants.ts
Normal file
138
packages/ai/src/modules/image/constants.ts
Normal file
@@ -0,0 +1,138 @@
|
||||
import { Provider } from "../../types";
|
||||
|
||||
import { AspectRatio, Model } from "./types";
|
||||
|
||||
// Dimension table shared by the OpenAI models: a single square pixel size.
const SQUARE_SIZE_ONLY = [
  { id: AspectRatio.SQUARE, value: "1024x1024" },
] as const;

// Dimension table shared by the models that take an aspect ratio.
const COMMON_ASPECT_RATIOS = [
  { id: AspectRatio.SQUARE, value: "1:1" },
  { id: AspectRatio.STANDARD, value: "4:3" },
  { id: AspectRatio.LANDSCAPE, value: "16:9" },
  { id: AspectRatio.PORTRAIT, value: "9:16" },
] as const;

/**
 * Catalogue of selectable image models.
 *
 * `dimensionFormat` says how `dimensions[].value` must be read: "size" is a
 * `WIDTHxHEIGHT` pixel size, "aspectRatio" is a `W:H` ratio.
 */
export const MODELS = [
  {
    id: Model.GPT_IMAGE_1,
    provider: Provider.OPENAI,
    name: "GPT Image 1",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_ONLY,
  },
  {
    id: Model.DALL_E_2,
    provider: Provider.OPENAI,
    name: "DALL-E 2",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_ONLY,
  },
  {
    id: Model.DALL_E_3,
    provider: Provider.OPENAI,
    name: "DALL-E 3",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_ONLY,
  },
  {
    id: Model.RECRAFT_V3,
    provider: Provider.RECRAFT,
    name: "Recraft v3",
    dimensionFormat: "aspectRatio",
    dimensions: COMMON_ASPECT_RATIOS,
  },
  {
    id: Model.PHOTON,
    provider: Provider.LUMA,
    name: "Photon",
    dimensionFormat: "aspectRatio",
    dimensions: COMMON_ASPECT_RATIOS,
  },
  {
    id: Model.STABLE_DIFFUSION_3_5_LARGE,
    provider: Provider.STABILITY_AI,
    name: "Stable Diffusion 3.5 Large",
    dimensionFormat: "aspectRatio",
    dimensions: COMMON_ASPECT_RATIOS,
  },
  {
    id: Model.STABLE_DIFFUSION_3_5_MEDIUM,
    provider: Provider.STABILITY_AI,
    name: "Stable Diffusion 3.5 Medium",
    dimensionFormat: "aspectRatio",
    dimensions: COMMON_ASPECT_RATIOS,
  },
] as const;
|
||||
28
packages/ai/src/modules/image/schema.ts
Normal file
28
packages/ai/src/modules/image/schema.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import * as z from "zod";
|
||||
|
||||
import { AspectRatio } from "./types";
|
||||
|
||||
/**
 * Validates the options of an image generation request.
 *
 * `count` is the number of images to generate, so it must be a whole number
 * between 1 and 5; fractional values such as 2.5 are rejected.
 */
export const imageGenerationOptionsSchema = z.object({
  aspectRatio: z.enum(AspectRatio),
  model: z.string(),
  count: z.number().int().min(1).max(5),
});
|
||||
|
||||
/**
 * Validates a full image generation request: an optional id, a prompt of
 * 1 to 5000 characters, and the generation options.
 */
export const imageGenerationSchema = z.object({
  id: z.string().optional(),
  prompt: z.string().min(1).max(5000),
  options: imageGenerationOptionsSchema,
});
|
||||
|
||||
/** Parsed shape of `imageGenerationOptionsSchema`. */
export type ImageGenerationOptionsPayload = z.infer<
  typeof imageGenerationOptionsSchema
>;
/** Parsed shape of `imageGenerationSchema`. */
export type ImageGenerationPayload = z.infer<typeof imageGenerationSchema>;

/** Alias of `ImageGenerationPayload` used for API inputs. */
export type ImageGenerationInput = ImageGenerationPayload;
|
||||
|
||||
export {
|
||||
selectGenerationSchema as generationSchema,
|
||||
selectImageSchema as imageSchema,
|
||||
} from "@turbostarter/db/schema/image";
|
||||
46
packages/ai/src/modules/image/strategies.ts
Normal file
46
packages/ai/src/modules/image/strategies.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { customProvider } from "ai";
|
||||
|
||||
import { Model } from "./types";
|
||||
|
||||
import type { ImageModel } from "ai";
|
||||
|
||||
/**
 * Resolves a Replicate image model by name, loading `@ai-sdk/replicate` only
 * when called so the package is not touched while REPLICATE_API_TOKEN is unset.
 *
 * NOTE(review): this relies on `require`, while the package is declared as
 * `"type": "module"` — presumably the bundler/runtime provides it; confirm.
 */
const getReplicateModel = (model: string): ImageModel => {
  // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/consistent-type-imports
  const replicateSdk = require("@ai-sdk/replicate") as typeof import("@ai-sdk/replicate");

  return replicateSdk.replicate.image(model);
};
|
||||
|
||||
/**
 * Reports whether a non-empty REPLICATE_API_TOKEN is present in the process
 * environment. Returns `false` when there is no `process` global or when
 * reading the environment throws.
 */
const hasReplicateToken = (): boolean => {
  try {
    if (typeof globalThis.process === "undefined") {
      return false;
    }

    return Boolean(globalThis.process.env.REPLICATE_API_TOKEN);
  } catch {
    return false;
  }
};
|
||||
|
||||
// OpenAI-backed image models, keyed by the app-level model id.
// NOTE(review): the GPT_IMAGE_1 id is wired to "gpt-image-1-mini" — confirm
// this is intentional.
const openaiImageModels = {
  [Model.GPT_IMAGE_1]: openai.image("gpt-image-1-mini"),
  [Model.DALL_E_2]: openai.image("dall-e-2"),
  [Model.DALL_E_3]: openai.image("dall-e-3"),
};

/** Provider exposing the OpenAI image models under the app's model ids. */
export const modelStrategies = customProvider({
  imageModels: openaiImageModels,
});
|
||||
|
||||
/**
 * Provider exposing the Replicate-backed image models, or `null` when
 * REPLICATE_API_TOKEN is not set. Evaluated once at module load.
 */
export const replicateModelStrategies = (() => {
  if (!hasReplicateToken()) {
    return null;
  }

  return customProvider({
    imageModels: {
      [Model.RECRAFT_V3]: getReplicateModel("recraft-ai/recraft-v3"),
      [Model.PHOTON]: getReplicateModel("luma/photon"),
      [Model.STABLE_DIFFUSION_3_5_LARGE]: getReplicateModel(
        "stability-ai/stable-diffusion-3.5-large",
      ),
      [Model.STABLE_DIFFUSION_3_5_MEDIUM]: getReplicateModel(
        "stability-ai/stable-diffusion-3.5-medium",
      ),
    },
  });
})();
|
||||
29
packages/ai/src/modules/image/types.ts
Normal file
29
packages/ai/src/modules/image/types.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
export {
|
||||
type SelectGeneration as Generation,
|
||||
type SelectImage as Image,
|
||||
} from "@turbostarter/db/schema/image";
|
||||
|
||||
import { aspectRatioEnum } from "@turbostarter/db/schema/image";
|
||||
|
||||
import type { EnumToConstant } from "@turbostarter/shared/types";
|
||||
|
||||
/** App-level identifiers of the supported image models. */
export const Model = {
  // OpenAI
  GPT_IMAGE_1: "gpt-image-1",
  DALL_E_2: "dall-e-2",
  DALL_E_3: "dall-e-3",
  // Other providers
  RECRAFT_V3: "recraft-v3",
  PHOTON: "photon",
  STABLE_DIFFUSION_3_5_LARGE: "stable-diffusion-3-5-large",
  STABLE_DIFFUSION_3_5_MEDIUM: "stable-diffusion-3-5-medium",
} as const;

/** Union of all model id strings in `Model`. */
export type Model = (typeof Model)[keyof typeof Model];
|
||||
|
||||
/**
 * Constant object derived from the database aspect-ratio enum: each enum value
 * is exposed under its upper-cased name with dashes turned into underscores.
 */
export const AspectRatio = Object.fromEntries(
  aspectRatioEnum.enumValues.map((value) => [
    value.split("-").join("_").toUpperCase(),
    value,
  ]),
) as EnumToConstant<typeof aspectRatioEnum.enumValues>;

/** Union of all aspect-ratio values in `AspectRatio`. */
export type AspectRatio = (typeof AspectRatio)[keyof typeof AspectRatio];
|
||||
564
packages/ai/src/modules/pdf/api.ts
Normal file
564
packages/ai/src/modules/pdf/api.ts
Normal file
@@ -0,0 +1,564 @@
|
||||
import {
|
||||
convertToModelMessages,
|
||||
generateId,
|
||||
smoothStream,
|
||||
stepCountIs,
|
||||
streamText,
|
||||
tool,
|
||||
} from "ai";
|
||||
import * as z from "zod";
|
||||
|
||||
import { eq, sql } from "@turbostarter/db";
|
||||
import {
|
||||
pdfChat,
|
||||
pdfDocument,
|
||||
pdfEmbedding,
|
||||
pdfMessage,
|
||||
} from "@turbostarter/db/schema/pdf";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { generateId as generateCitationId } from "@turbostarter/shared/utils";
|
||||
import { getDeleteUrl } from "@turbostarter/storage/server";
|
||||
|
||||
import { repairToolCall } from "../../utils/llm";
|
||||
|
||||
import { PROMPTS } from "./constants";
|
||||
import { findRelevantContent, generateDocumentEmbeddings } from "./embeddings";
|
||||
import { modelStrategies } from "./strategies";
|
||||
import { Role } from "./types";
|
||||
|
||||
import type { PdfMessagePayload } from "./schema";
|
||||
import type { Citation, CitationResponse, PreciseCitation } from "./types";
|
||||
import type {
|
||||
InsertPdfChat,
|
||||
InsertPdfDocument,
|
||||
InsertPdfMessage,
|
||||
} from "@turbostarter/db/schema/pdf";
|
||||
|
||||
/**
 * Writes a document's processing status.
 *
 * @param documentId - Document to update.
 * @param status - New processing status.
 * @param error - Optional error text; stored as `null` when omitted, which
 * clears any previously stored error.
 */
const updateDocumentStatus = async (
  documentId: string,
  status: "pending" | "processing" | "ready" | "failed",
  error?: string,
) => {
  const changes = {
    processingStatus: status,
    processingError: error ?? null,
  };

  await db.update(pdfDocument).set(changes).where(eq(pdfDocument.id, documentId));
};
|
||||
|
||||
/**
 * Inserts a PDF document row and starts embedding generation in the background.
 *
 * The returned promise resolves as soon as the row is inserted; embedding runs
 * detached and reports its progress through the document's processing status
 * ("processing", then "ready" or "failed").
 *
 * @param data - Document row to insert.
 * @returns The inserted document, or `null` when the insert returned no row.
 */
const createDocument = async (data: InsertPdfDocument) => {
  const [documentData] = await db.insert(pdfDocument).values(data).returning();

  if (!documentData) {
    return null;
  }

  // Process with legacy embeddings (simple, reliable, production-ready).
  // Detached on purpose: nothing awaits this task, so it must never reject.
  void (async () => {
    try {
      await updateDocumentStatus(documentData.id, "processing");

      console.log(`[api] Generating embeddings for document ${documentData.id}`);
      const chunks = await generateDocumentEmbeddings(documentData.path);
      console.log(`[api] Generated ${chunks.length} embedding chunks`);

      // Skipped for zero chunks: there is nothing to insert.
      if (chunks.length > 0) {
        await db.insert(pdfEmbedding).values(
          chunks.map((chunk) => ({
            content: chunk.content,
            documentId: documentData.id,
            embedding: chunk.embedding,
            pageNumber: chunk.metadata.pageNumber,
            charStart: chunk.metadata.charStart,
            charEnd: chunk.metadata.charEnd,
            sectionTitle: chunk.metadata.sectionTitle,
          })),
        );
      }

      console.log(`[api] Embedding processing complete: ${chunks.length} chunks stored`);

      await updateDocumentStatus(documentData.id, "ready");
    } catch (error) {
      console.error(`[api] Failed to process PDF:`, error);

      // Recording the failure can fail too (e.g. the database is what broke);
      // guard it so the detached task does not end in an unhandled rejection.
      try {
        await updateDocumentStatus(
          documentData.id,
          "failed",
          error instanceof Error ? error.message : "Unknown error",
        );
      } catch (statusError) {
        console.error(
          `[api] Failed to mark document ${documentData.id} as failed:`,
          statusError,
        );
      }
    }
  })();

  return documentData;
};
|
||||
|
||||
/**
 * Removes a stored file: obtains a delete URL for `path` and issues a DELETE
 * request against it. The response is not inspected.
 */
const deleteDocument = async (path: string) => {
  const signed = await getDeleteUrl({ path });

  await fetch(signed.url, { method: "DELETE" });
};
|
||||
|
||||
/**
 * Upserts a PDF chat (conflict target: chat id) and creates its document.
 *
 * `data` carries both the chat and the document fields; the document is
 * created with the resulting chat id.
 *
 * NOTE(review): `createDocument` runs on every call, including when the chat
 * already existed and was only updated — confirm that is intended.
 *
 * @returns The chat row, or `null` when the upsert returned no row.
 */
export const createChat = async (
  data: InsertPdfChat & Omit<InsertPdfDocument, "chatId">,
) => {
  const upsert = db
    .insert(pdfChat)
    .values(data)
    .returning()
    .onConflictDoUpdate({ target: pdfChat.id, set: data });
  const [chatData] = await upsert;

  if (!chatData) {
    return null;
  }

  await createDocument({ ...data, chatId: chatData.id });

  return chatData;
};
|
||||
|
||||
/** Upserts a single PDF chat message (conflict target: message id). */
export const createMessage = async (data: InsertPdfMessage) => {
  return db
    .insert(pdfMessage)
    .values(data)
    .onConflictDoUpdate({ target: pdfMessage.id, set: data });
};
|
||||
|
||||
/**
 * Inserts several PDF chat messages, skipping rows that conflict with
 * existing ones.
 *
 * NOTE(review): `data` is passed straight to `values()`; presumably callers
 * never pass an empty array — confirm.
 */
export const createMessages = async (data: InsertPdfMessage[]) => {
  return db.insert(pdfMessage).values(data).onConflictDoNothing();
};
|
||||
|
||||
/** Fetches one PDF chat by id, or `undefined` when it does not exist. */
export const getChat = async (id: string) => {
  const byId = eq(pdfChat.id, id);

  return db.query["pdf.pdfChat"].findFirst({ where: byId });
};
|
||||
|
||||
/**
 * Deletes a PDF chat and, in the background, the stored files of its documents.
 *
 * The documents are read before the chat row is deleted. File removal is
 * fire-and-forget: it is not awaited and individual failures are ignored.
 *
 * @returns The deleted chat row, or `undefined` when no chat had that id.
 */
export const deleteChat = async (id: string) => {
  const documents = await getChatDocuments(id);
  const [deleted] = await db.delete(pdfChat).where(eq(pdfChat.id, id)).returning();

  if (!deleted) {
    return;
  }

  const removals = documents.map((document) => deleteDocument(document.path));
  void Promise.allSettled(removals);

  return deleted;
};
|
||||
|
||||
/** Lists a user's PDF chats, newest first. */
export const getUserChats = async (userId: string) => {
  return db.query["pdf.pdfChat"].findMany({
    where: eq(pdfChat.userId, userId),
    orderBy: (row, operators) => [operators.desc(row.createdAt)],
  });
};
|
||||
|
||||
/** Lists the messages of a PDF chat, oldest first. */
export const getChatMessages = async (id: string) =>
  db.query["pdf.pdfMessage"].findMany({
    where: eq(pdfMessage.chatId, id),
    orderBy: (row, operators) => [operators.asc(row.createdAt)],
  });
|
||||
|
||||
/** Lists the documents attached to a PDF chat, oldest first. */
export const getChatDocuments = async (id: string) => {
  return db.query["pdf.pdfDocument"].findMany({
    where: eq(pdfDocument.chatId, id),
    orderBy: (row, operators) => [operators.asc(row.createdAt)],
  });
};
|
||||
|
||||
/** Fetches one PDF document by id, or `undefined` when it does not exist. */
export const getDocument = async (id: string) => {
  const byId = eq(pdfDocument.id, id);

  return db.query["pdf.pdfDocument"].findFirst({ where: byId });
};
|
||||
|
||||
// ============================================================================
|
||||
// Hybrid Search (legacy embeddings + keyword fallback)
|
||||
// ============================================================================
|
||||
|
||||
/**
 * One search hit in the shape returned to the model by the search tool,
 * regardless of which search strategy produced it.
 */
interface UnifiedSearchResult {
  /** Id of the matched embedding row; used as the citation id. */
  id: string;
  /** Text of the matched chunk. */
  content: string;
  /** Page the chunk belongs to. */
  pageNumber: number;
  /** Relevance score of the hit. */
  similarity: number;
  /** Origin of the hit: "legacy" for embedding search, "keyword" for text search. */
  source: "legacy" | "keyword";
}
|
||||
|
||||
/**
 * Pulls exact identifiers out of a query so they can be matched literally;
 * embeddings are weak for legal references, codes, and specific numbers.
 *
 * Two shapes are recognised: `digits/4-digits` (e.g. 35/2024) and two or more
 * capital letters followed by digits with an optional `-` or `/` in between
 * (e.g. TDF/379).
 *
 * @returns The distinct matches, in first-seen order; empty when none.
 */
function extractSearchKeywords(query: string): string[] {
  const legalReference = /\d+\/\d{4}/g; // Legal references like 35/2024
  const alphanumericCode = /\b[A-Z]{2,}[-/]?\d+/g; // Codes like TDF/379

  const found = [legalReference, alphanumericCode].flatMap(
    (pattern) => query.match(pattern) ?? [],
  );

  return Array.from(new Set(found));
}
|
||||
|
||||
/**
 * Keyword search fallback for specific identifiers that embeddings miss.
 *
 * Extracts identifiers from `query` and returns chunks of the document whose
 * content contains ANY of them (case-insensitive). Previously the keywords
 * were glued into a single LIKE pattern, which only matched chunks containing
 * all keywords in the order they were extracted.
 *
 * @param query - User query to extract identifiers from.
 * @param documentId - Document whose chunks are searched.
 * @param limit - Maximum number of rows to return (default 4).
 * @returns Matching chunks tagged as "keyword" hits; empty when the query
 * contains no identifiers.
 */
async function keywordSearchFallback(
  query: string,
  documentId: string,
  limit = 4,
): Promise<UnifiedSearchResult[]> {
  const keywords = extractSearchKeywords(query);
  if (keywords.length === 0) return [];

  console.log(`[hybridSearch] Running keyword fallback for: ${keywords.join(", ")}`);

  // One parameterized ILIKE per keyword, OR-ed together. Extracted keywords
  // consist of letters, digits, "-" and "/" only, so they cannot contain LIKE
  // wildcards.
  const matchesAnyKeyword = sql.join(
    keywords.map((keyword) => sql`content ILIKE ${`%${keyword}%`}`),
    sql` OR `,
  );

  const results = await db.execute<{
    id: string;
    content: string;
    page_number: number | null;
  }>(sql`
    SELECT id, content, page_number
    FROM pdf.embedding
    WHERE document_id = ${documentId}
      AND (${matchesAnyKeyword})
    LIMIT ${limit}
  `);

  // Anything that is not a plain array of rows is treated as "no matches".
  const rows = Array.isArray(results) ? results : [];
  console.log(`[hybridSearch] Keyword fallback found ${rows.length} matches`);

  return rows.map((row) => ({
    id: row.id,
    content: row.content,
    pageNumber: row.page_number ?? 1,
    similarity: 0.95, // High score for exact keyword matches
    source: "keyword" as const,
  }));
}
|
||||
|
||||
/**
 * Searches one document semantically and, when the query contains exact
 * identifiers, adds literal keyword matches in front of the semantic hits.
 *
 * Keyword hits already present among the semantic hits are not duplicated.
 * The result is cut to `limit` entries.
 *
 * @param query - User query.
 * @param documentId - Document to search.
 * @param limit - Maximum number of results (default 6).
 */
async function hybridSearch(
  query: string,
  documentId: string,
  limit = 6,
): Promise<UnifiedSearchResult[]> {
  console.log(`[hybridSearch] Searching for: "${query}" in document ${documentId}`);

  // Semantic pass over the legacy embeddings.
  const embeddingHits = await findRelevantContent(query, documentId);
  const semanticResults: UnifiedSearchResult[] = embeddingHits
    .slice(0, limit)
    .map((hit) => ({
      id: hit.id,
      content: hit.name,
      pageNumber: hit.pageNumber,
      similarity: hit.similarity,
      source: "legacy" as const,
    }));

  console.log(`[hybridSearch] Semantic search found ${semanticResults.length} results`);

  // Without identifiers (legal refs, codes) there is nothing to match literally.
  if (extractSearchKeywords(query).length === 0) {
    return semanticResults;
  }

  const keywordResults = await keywordSearchFallback(query, documentId, 4);
  if (keywordResults.length === 0) {
    return semanticResults;
  }

  // Exact matches go first; skip the ones the semantic pass already returned.
  const semanticIds = new Set(semanticResults.map((hit) => hit.id));
  const freshKeywordResults = keywordResults.filter((hit) => !semanticIds.has(hit.id));
  const merged = [...freshKeywordResults, ...semanticResults].slice(0, limit);

  console.log(`[hybridSearch] Added ${freshKeywordResults.length} keyword matches, total: ${merged.length}`);

  return merged;
}
|
||||
|
||||
/**
 * Builds the `highlightText` tool, which lets the model attach an exact quote
 * from the document to its answer. Each call yields a `PreciseCitation` with a
 * freshly generated id and the current timestamp.
 */
const createHighlightTool = () => ({
  highlightText: tool({
    description: `Highlight a specific phrase from the PDF document to support your answer.
Use this tool for EACH fact you cite. The text must be an EXACT quote from the document.
Keep highlights short (10-100 characters) - single sentences or key phrases only.`,
    inputSchema: z.object({
      text: z.string().min(10).max(200).describe("Exact phrase from the document to highlight"),
      page: z.number().int().positive().describe("Page number where text appears (1-indexed)"),
      relevance: z.string().optional().describe("Brief note on why this supports your answer"),
    }),
    execute: ({ text, page, relevance }): PreciseCitation => ({
      citationId: generateCitationId(),
      text,
      page,
      relevance: relevance ?? null,
      timestamp: Date.now(),
    }),
  }),
});
|
||||
|
||||
/**
 * Builds the tool set handed to the model: `findRelevantContent` (document
 * search) plus `highlightText`.
 *
 * @param documentIds - When given and non-empty, search is restricted to these
 * documents; otherwise search runs across all content.
 *
 * With several documents, hits are ranked across all of them by similarity
 * before the top 6 are kept. Previously the per-document lists were simply
 * concatenated and cut, so a first document with 6 hits hid every other
 * document. With a single document the order from `hybridSearch` is kept.
 */
const createTools = (documentIds?: string[]) => {
  console.log(`🛠️ createTools called with documentIds:`, documentIds);

  const citationInstructions =
    "IMPORTANT: Cite each source using [[cite:ID:PAGE]] format where ID is the source's id and PAGE is pageNumber.";

  const searchTool = {
    findRelevantContent: tool({
      description: `Get information from the PDF document to answer questions. Returns sources with IDs and page numbers that you MUST cite using [[cite:ID:PAGE]] format.`,
      inputSchema: z.object({
        query: z
          .string()
          .describe("The user's query to find relevant information for."),
      }),
      execute: async ({ query }) => {
        console.log(`🛠️ Tool execute called with query: "${query}"`);

        // Specific documents: search each one, then combine.
        if (documentIds && documentIds.length > 0) {
          console.log(`🛠️ Searching in ${documentIds.length} documents:`, documentIds);
          const perDocument = await Promise.all(
            documentIds.map((docId) => hybridSearch(query, docId, 6)),
          );
          const allHits = perDocument.flat();

          // Array.prototype.sort is stable, so equally scored hits keep their
          // per-document order.
          const ranked =
            perDocument.length > 1
              ? [...allHits].sort((a, b) => b.similarity - a.similarity)
              : allHits;
          const combined = ranked.slice(0, 6);
          console.log(`🛠️ Combined results:`, combined.length);

          return {
            results: combined,
            citationInstructions,
          };
        }

        // No specific documents - search across all (legacy behavior)
        const results = await findRelevantContent(query);
        return {
          results: results.map((r) => ({
            id: r.id,
            content: r.name,
            pageNumber: r.pageNumber,
            similarity: r.similarity,
            source: "legacy" as const,
          })),
          citationInstructions,
        };
      },
    }),
  };

  const highlightTool = createHighlightTool();
  return { ...searchTool, ...highlightTool };
};

// Legacy export for backwards compatibility: tools without a document filter.
export const tools = createTools();
|
||||
|
||||
// ============================================================================
|
||||
// Citation Parsing
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Matches citation markers of the form `[[cite:<id>:<page>]]`, globally.
 * Group 1 is the id (letters and digits only), group 2 the page number.
 */
const CITATION_REGEX = /\[\[cite:([a-zA-Z0-9]+):(\d+)\]\]/g;
|
||||
|
||||
/**
 * Minimal search-result shape needed to build citations.
 * Satisfied by both result formats: the legacy one carries its text in `name`,
 * the unified one in `content`.
 */
interface CitableSearchResult {
  id: string;
  /** Chunk text (new format). */
  content?: string;
  /** Chunk text (legacy format). */
  name?: string;
  similarity: number;
  pageNumber: number;
}
|
||||
|
||||
/**
 * Rewrites `[[cite:id:page]]` markers in an AI response into numbered
 * references (`[1]`, `[2]`, ...) and collects the matching citation records.
 *
 * Numbers are assigned in order of first appearance; a repeated id reuses the
 * number it was given the first time. Details come from the search result with
 * the same id when there is one; otherwise relevance is 0, the page comes from
 * the marker, and the excerpt is a placeholder.
 *
 * @param content - Raw AI response with [[cite:id:page]] markers
 * @param searchResults - Search results (legacy or unified format) to look ids up in
 * @returns The rewritten content and the citations, in numbering order
 *
 * @example
 * ```typescript
 * const response = parseCitations(
 *   "The document states X [[cite:abc123:5]] and Y [[cite:def456:8]].",
 *   searchResults
 * );
 * // response.content = "The document states X [1] and Y [2]."
 * // response.citations = [{ index: 1, embeddingId: "abc123", ... }, ...]
 * ```
 */
export function parseCitations(
  content: string,
  searchResults: CitableSearchResult[]
): CitationResponse {
  const resultsById = new Map(searchResults.map((r) => [r.id, r]));
  const indexById = new Map<string, number>();
  const citations: Citation[] = [];

  const rewritten = content.replace(
    CITATION_REGEX,
    (_marker, id: string, pageText: string) => {
      // Same source cited again: reuse its number.
      const knownIndex = indexById.get(id);
      if (knownIndex !== undefined) {
        return `[${knownIndex}]`;
      }

      const index = citations.length + 1;
      indexById.set(id, index);

      const markerPage = parseInt(pageText, 10);
      const source = resultsById.get(id);
      const text = source?.content ?? source?.name ?? "";

      // Excerpt: at most 150 characters of the source text, with an ellipsis
      // when it was cut.
      let excerpt = `[Content from page ${markerPage}]`;
      if (text) {
        excerpt = text.length > 150 ? `${text.substring(0, 150)}...` : text;
      }

      citations.push({
        index,
        embeddingId: id, // Keep field name for API compatibility
        relevance: source?.similarity ?? 0,
        pageNumber: source?.pageNumber ?? markerPage,
        excerpt,
      });

      return `[${index}]`;
    },
  );

  return { content: rewritten, citations };
}
|
||||
|
||||
/**
 * Renders search hits as a plain-text context block for the model, one
 * section per hit, each ending with the exact marker to use when citing it.
 * Accepts both legacy `EmbeddingSearchResult` and `UnifiedSearchResult` hits.
 *
 * @param results - Search hits (legacy or unified format)
 * @returns The sections joined by blank lines, or a fixed "nothing found"
 *          sentence when `results` is empty
 */
export function formatEmbeddingsForCitation(results: CitableSearchResult[]): string {
  if (results.length === 0) {
    return "No relevant content found in the document.";
  }

  const sections: string[] = [];
  for (const [position, hit] of results.entries()) {
    const body = hit.content ?? hit.name ?? "[No content]";
    const relevancePercent = (hit.similarity * 100).toFixed(1);

    const lines = [
      `[Source ${position + 1}]`,
      `ID: ${hit.id}`,
      `Page: ${hit.pageNumber}`,
      `Relevance: ${relevancePercent}%`,
      `Content: ${body}`,
      "---",
      `To cite this source, use: [[cite:${hit.id}:${hit.pageNumber}]]`,
    ];
    sections.push(lines.join("\n"));
  }

  return sections.join("\n\n");
}
|
||||
|
||||
/**
 * Stores the incoming chat message, streams a tool-assisted model answer for
 * the chat, and stores the assistant's text once the stream finishes.
 *
 * @param chatId - Chat the message belongs to
 * @param signal - Abort signal forwarded to the model call
 * @param documentIds - Optional ids handed to `createTools`
 * @returns The UI message stream response produced by `toUIMessageStreamResponse`
 */
export const streamChatWithDocuments = async ({
  chatId,
  signal,
  documentIds,
  ...message
}: PdfMessagePayload & { signal: AbortSignal; chatId: string; documentIds?: string[] }) => {
  console.log(`📨 streamChatWithDocuments - chatId: ${chatId}, documentIds:`, documentIds);

  await createMessage({ ...message, chatId });
  const history = await getChatMessages(chatId);

  // NOTE(review): the message is persisted before the history is loaded and is
  // then appended again below; if getChatMessages already returns it, the model
  // receives it twice. Confirm against getChatMessages.
  const uiMessages = [...history, message].map((entry) => ({
    ...entry,
    parts: [{ type: "text" as const, text: entry.content }],
  }));

  const result = streamText({
    // Use uncached model - tools need fresh execution, not cached responses
    model: modelStrategies.languageModel("uncached"),
    messages: convertToModelMessages(uiMessages),
    system: PROMPTS.SYSTEM,
    stopWhen: stepCountIs(6),
    abortSignal: signal,
    tools: createTools(documentIds),
    experimental_transform: smoothStream({ chunking: "word", delayInMs: 15 }),
    experimental_repairToolCall: repairToolCall,
    onError: (error) => {
      console.error(error);
    },
  });

  // Start draining the stream without awaiting it here.
  void result.consumeStream();

  return result.toUIMessageStreamResponse({
    onFinish: async ({ responseMessage }) => {
      // Only text parts are persisted, one per line.
      const assistantText = responseMessage.parts
        .flatMap((part) => (part.type === "text" ? [part.text] : []))
        .join("\n");

      await createMessage({
        id: responseMessage.id || generateId(),
        chatId,
        content: assistantText,
        role: Role.ASSISTANT,
      });
    },
    headers: {
      "Content-Type": "application/octet-stream",
      "Content-Encoding": "none",
    },
  });
};
|
||||
|
||||
// Re-export PreciseCitation type for consumers
|
||||
export type { PreciseCitation } from "./types";
|
||||
335
packages/ai/src/modules/pdf/chunking.test.ts
Normal file
335
packages/ai/src/modules/pdf/chunking.test.ts
Normal file
@@ -0,0 +1,335 @@
|
||||
/**
|
||||
* Unit tests for dual-resolution chunking (WF-0028)
|
||||
*/
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import {
|
||||
createDualResolutionChunks,
|
||||
getChunkingStats,
|
||||
validateChunks,
|
||||
DEFAULT_CHUNKING_CONFIG,
|
||||
} from "./chunking";
|
||||
|
||||
import type { LayoutElement, DualResolutionChunks } from "./chunking";
|
||||
|
||||
// ============================================================================
|
||||
// Test Fixtures
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Builds a single prose layout element on page 1 for tests; any field can be
 * replaced through `overrides`.
 */
function createTestElement(overrides: Partial<LayoutElement> = {}): LayoutElement {
  const defaults: LayoutElement = {
    content: "Test paragraph content for unit testing purposes.",
    type: "prose",
    pageNumber: 1,
    paragraphIndex: 0,
    charStart: 0,
    charEnd: 47,
    bboxX: 0.1,
    bboxY: 0.1,
    bboxWidth: 0.8,
    bboxHeight: 0.05,
    sectionTitle: undefined,
  };

  return { ...defaults, ...overrides };
}
|
||||
|
||||
/**
 * Builds `count` consecutive prose elements: paragraph indices run from 0,
 * each element covers a 60-character range and sits 0.05 lower than the
 * previous one.
 */
function createTestElements(count: number): LayoutElement[] {
  const elements: LayoutElement[] = [];

  for (let position = 0; position < count; position++) {
    elements.push(
      createTestElement({
        content: `Paragraph ${position + 1} content with some reasonable text length.`,
        paragraphIndex: position,
        charStart: position * 60,
        charEnd: (position + 1) * 60,
        bboxY: 0.1 + position * 0.05,
      }),
    );
  }

  return elements;
}
|
||||
|
||||
// ============================================================================
|
||||
// createDualResolutionChunks Tests
|
||||
// ============================================================================
|
||||
|
||||
describe("createDualResolutionChunks", () => {
  it("should return empty arrays for empty input", () => {
    const { citationUnits, retrievalChunks } = createDualResolutionChunks([]);

    expect(citationUnits).toHaveLength(0);
    expect(retrievalChunks).toHaveLength(0);
  });

  it("should create citation units for each layout element", () => {
    const elements = createTestElements(5);
    const { citationUnits } = createDualResolutionChunks(elements);
    const [firstUnit] = citationUnits;

    expect(citationUnits).toHaveLength(5);
    expect(firstUnit?.content).toBe(elements[0]?.content);
    expect(firstUnit?.pageNumber).toBe(1);
    expect(firstUnit?.unitType).toBe("prose");
  });

  it("should preserve bounding box coordinates in citation units", () => {
    const bbox = { bboxX: 0.15, bboxY: 0.25, bboxWidth: 0.7, bboxHeight: 0.08 };
    const { citationUnits } = createDualResolutionChunks([createTestElement(bbox)]);
    const [unit] = citationUnits;

    expect(unit?.bboxX).toBe(0.15);
    expect(unit?.bboxY).toBe(0.25);
    expect(unit?.bboxWidth).toBe(0.7);
    expect(unit?.bboxHeight).toBe(0.08);
  });

  it("should group citation units into retrieval chunks", () => {
    const { retrievalChunks } = createDualResolutionChunks(createTestElements(10));

    // Ten elements must be split, into no more than four chunks (~10/3 to 10/5)
    expect(retrievalChunks.length).toBeGreaterThan(1);
    expect(retrievalChunks.length).toBeLessThanOrEqual(4);

    // Every chunk points at existing citation units
    for (const { citationUnitIndices } of retrievalChunks) {
      expect(citationUnitIndices.length).toBeGreaterThanOrEqual(1);
      citationUnitIndices.forEach((unitIndex) => {
        expect(unitIndex).toBeGreaterThanOrEqual(0);
        expect(unitIndex).toBeLessThan(10);
      });
    }
  });

  it("should concatenate content in retrieval chunks", () => {
    const { retrievalChunks } = createDualResolutionChunks(createTestElements(3), {
      minUnitsPerChunk: 3,
      maxUnitsPerChunk: 5,
    });

    // Three elements with min=3 fit in exactly one chunk
    expect(retrievalChunks).toHaveLength(1);
    for (const fragment of ["Paragraph 1", "Paragraph 2", "Paragraph 3"]) {
      expect(retrievalChunks[0]?.content).toContain(fragment);
    }
  });

  it("should calculate correct page boundaries for retrieval chunks", () => {
    const elements = [1, 2, 3].map((pageNumber, paragraphIndex) =>
      createTestElement({ pageNumber, paragraphIndex }),
    );
    const { retrievalChunks } = createDualResolutionChunks(elements, {
      minUnitsPerChunk: 3,
      maxUnitsPerChunk: 5,
    });
    const [onlyChunk] = retrievalChunks;

    expect(onlyChunk?.pageStart).toBe(1);
    expect(onlyChunk?.pageEnd).toBe(3);
  });

  it("should break on section headings when enabled", () => {
    const outline: [string, LayoutElement["type"]][] = [
      ["Intro paragraph", "prose"],
      ["Chapter 1", "heading"],
      ["Chapter content", "prose"],
      ["More content", "prose"],
      ["Even more", "prose"],
    ];
    const elements = outline.map(([content, type], paragraphIndex) =>
      createTestElement({ content, type, paragraphIndex }),
    );

    const { retrievalChunks } = createDualResolutionChunks(elements, { breakOnSections: true });

    // The heading must open a new chunk
    expect(retrievalChunks.length).toBeGreaterThanOrEqual(2);
  });

  it("should respect maxUnitsPerChunk configuration", () => {
    const { retrievalChunks } = createDualResolutionChunks(createTestElements(15), {
      maxUnitsPerChunk: 3,
    });

    // The final chunk is excluded: it may have absorbed a short tail
    for (const chunk of retrievalChunks.slice(0, -1)) {
      expect(chunk.citationUnitIndices.length).toBeLessThanOrEqual(3);
    }
  });

  it("should handle different element types", () => {
    const samples: [LayoutElement["type"], string][] = [
      ["heading", "Title"],
      ["prose", "Body text"],
      ["list", "- Item 1\n- Item 2"],
      ["code", "const x = 1;"],
      ["table", "| A | B |"],
    ];
    const elements = samples.map(([type, content]) => createTestElement({ type, content }));

    const { citationUnits } = createDualResolutionChunks(elements);

    expect(citationUnits[0]?.unitType).toBe("heading");
    expect(citationUnits[1]?.unitType).toBe("prose");
    expect(citationUnits[2]?.unitType).toBe("list");
    expect(citationUnits[3]?.unitType).toBe("code");
    expect(citationUnits[4]?.unitType).toBe("table");
  });
});
|
||||
|
||||
// ============================================================================
|
||||
// getChunkingStats Tests
|
||||
// ============================================================================
|
||||
|
||||
describe("getChunkingStats", () => {
  it("should return zeros for empty input", () => {
    const empty: DualResolutionChunks = { citationUnits: [], retrievalChunks: [] };
    const stats = getChunkingStats(empty);

    expect(stats.totalCitationUnits).toBe(0);
    expect(stats.totalRetrievalChunks).toBe(0);
    expect(stats.avgUnitsPerChunk).toBe(0);
    expect(stats.avgTokensPerChunk).toBe(0);
  });

  it("should calculate correct statistics", () => {
    const stats = getChunkingStats(createDualResolutionChunks(createTestElements(10)));

    expect(stats.totalCitationUnits).toBe(10);
    expect(stats.totalRetrievalChunks).toBeGreaterThan(0);
    expect(stats.avgUnitsPerChunk).toBeGreaterThan(0);
    expect(stats.avgTokensPerChunk).toBeGreaterThan(0);
    // Every generated element sits on page 1
    expect(stats.pageRange.start).toBe(1);
    expect(stats.pageRange.end).toBe(1);
  });

  it("should calculate correct page range", () => {
    const elements = [5, 3, 10].map((pageNumber) => createTestElement({ pageNumber }));
    const { pageRange } = getChunkingStats(createDualResolutionChunks(elements));

    expect(pageRange.start).toBe(3);
    expect(pageRange.end).toBe(10);
  });
});
|
||||
|
||||
// ============================================================================
|
||||
// validateChunks Tests
|
||||
// ============================================================================
|
||||
|
||||
describe("validateChunks", () => {
  type Unit = DualResolutionChunks["citationUnits"][number];
  type Chunk = DualResolutionChunks["retrievalChunks"][number];

  // Hand-built fixtures: a prose unit on page 1 and a prose chunk covering page 1.
  const proseUnit = (overrides: Partial<Unit> = {}): Unit => ({
    content: "Test",
    pageNumber: 1,
    paragraphIndex: 0,
    charStart: 0,
    charEnd: 4,
    unitType: "prose",
    ...overrides,
  });

  const proseChunk = (overrides: Partial<Chunk> = {}): Chunk => ({
    content: "Test",
    pageStart: 1,
    pageEnd: 1,
    sectionHierarchy: [],
    chunkType: "prose",
    citationUnitIndices: [0],
    ...overrides,
  });

  it("should return no errors for valid chunks", () => {
    const errors = validateChunks(createDualResolutionChunks(createTestElements(10)));

    expect(errors).toHaveLength(0);
  });

  it("should detect invalid citation unit index references", () => {
    const chunks: DualResolutionChunks = {
      citationUnits: [proseUnit()],
      // Index 5 does not exist
      retrievalChunks: [proseChunk({ citationUnitIndices: [0, 5] })],
    };

    const errors = validateChunks(chunks);

    expect(errors.length).toBeGreaterThan(0);
    expect(errors.some((e) => e.includes("invalid citation unit index 5"))).toBe(true);
  });

  it("should detect unreferenced citation units", () => {
    const chunks: DualResolutionChunks = {
      citationUnits: [
        proseUnit({ content: "Test 1", charEnd: 6 }),
        proseUnit({ content: "Test 2", paragraphIndex: 1, charStart: 7, charEnd: 13 }),
      ],
      // Unit 1 is left out on purpose
      retrievalChunks: [proseChunk({ content: "Test 1", citationUnitIndices: [0] })],
    };

    const errors = validateChunks(chunks);

    expect(errors.length).toBeGreaterThan(0);
    expect(errors.some((e) => e.includes("Citation unit 1 is not referenced"))).toBe(true);
  });

  it("should detect page boundary inconsistencies", () => {
    const chunks: DualResolutionChunks = {
      citationUnits: [proseUnit({ pageNumber: 5 })],
      // Both boundaries say page 1 although the only unit is on page 5
      retrievalChunks: [proseChunk({ pageStart: 1, pageEnd: 1 })],
    };

    const errors = validateChunks(chunks);

    expect(errors.length).toBeGreaterThan(0);
    expect(errors.some((e) => e.includes("pageStart") || e.includes("pageEnd"))).toBe(true);
  });
});
|
||||
|
||||
// ============================================================================
|
||||
// DEFAULT_CHUNKING_CONFIG Tests
|
||||
// ============================================================================
|
||||
|
||||
describe("DEFAULT_CHUNKING_CONFIG", () => {
  it("should have sensible defaults", () => {
    const { minUnitsPerChunk, maxUnitsPerChunk, maxChunkTokens, breakOnSections } =
      DEFAULT_CHUNKING_CONFIG;

    expect(minUnitsPerChunk).toBe(3);
    expect(maxUnitsPerChunk).toBe(5);
    expect(maxChunkTokens).toBe(800);
    expect(breakOnSections).toBe(true);
  });
});
|
||||
457
packages/ai/src/modules/pdf/chunking.ts
Normal file
457
packages/ai/src/modules/pdf/chunking.ts
Normal file
@@ -0,0 +1,457 @@
|
||||
/**
|
||||
* Dual-Resolution Chunking Strategy (WF-0028)
|
||||
*
|
||||
* Creates two levels of chunks from parsed PDF layout elements:
|
||||
* 1. Citation Units - paragraph-level with precise bounding boxes for highlighting
|
||||
* 2. Retrieval Chunks - groups of 3-5 citation units for efficient vector search
|
||||
*
|
||||
* This separation enables:
|
||||
* - Efficient semantic search via larger retrieval chunks
|
||||
* - Pixel-perfect citation highlighting via granular citation units
|
||||
*/
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/**
 * One layout element produced by the PDF layout parser; the input unit of
 * the chunking functions in this module.
 */
export interface LayoutElement {
  /** Text of the element. */
  content: string;
  /** Kind of element; copied to the citation unit's `unitType`. */
  type: "prose" | "heading" | "list" | "table" | "code";
  /** Page the element appears on. */
  pageNumber: number;
  /** Paragraph index supplied by the parser. */
  paragraphIndex: number;
  /** Start character offset supplied by the parser. */
  charStart: number;
  /** End character offset supplied by the parser. */
  charEnd: number;
  /** Optional bounding box; the coordinate system is defined by the parser, not here. */
  bboxX?: number;
  bboxY?: number;
  bboxWidth?: number;
  bboxHeight?: number;
  /** Explicit section title; preferred over heading text when building a section hierarchy. */
  sectionTitle?: string;
}
|
||||
|
||||
/**
 * Tunable limits for grouping citation units into retrieval chunks.
 */
export interface ChunkingConfig {
  /**
   * Minimum units per retrieval chunk (default: 3). The token limit is only
   * applied once a chunk holds this many units, and a trailing chunk below
   * this size is a candidate for merging into the previous chunk.
   */
  minUnitsPerChunk: number;
  /** Maximum units per retrieval chunk (default: 5). */
  maxUnitsPerChunk: number;
  /** Maximum estimated tokens per retrieval chunk (default: 800). */
  maxChunkTokens: number;
  /** Whether a major section heading starts a new chunk (default: true). */
  breakOnSections: boolean;
}
|
||||
|
||||
/**
 * Configuration used for every option the caller does not override:
 * chunks of 3-5 units, at most 800 estimated tokens, split at section headings.
 */
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  minUnitsPerChunk: 3,
  maxUnitsPerChunk: 5,
  maxChunkTokens: 800,
  breakOnSections: true,
};
|
||||
|
||||
/**
 * Paragraph-level citation unit, ready for database insertion. It carries no
 * id; the id is generated on insert.
 */
export interface CitationUnitData {
  /** Text of the unit. */
  content: string;
  /** Page the unit appears on. */
  pageNumber: number;
  /** Paragraph index taken from the source layout element. */
  paragraphIndex: number;
  /** Start character offset taken from the source layout element. */
  charStart: number;
  /** End character offset taken from the source layout element. */
  charEnd: number;
  /** Optional bounding box, used for highlighting the cited passage. */
  bboxX?: number;
  bboxY?: number;
  bboxWidth?: number;
  bboxHeight?: number;
  /** Explicit section title, when the source element had one. */
  sectionTitle?: string;
  /** Kind of unit; mirrors the layout element's `type`. */
  unitType: "prose" | "heading" | "list" | "table" | "code";
}
|
||||
|
||||
/**
 * Retrieval chunk built from several adjacent citation units, ready for
 * database insertion. It carries no id; the id is generated on insert.
 */
export interface RetrievalChunkData {
  /** Contents of the grouped units, joined with blank lines. */
  content: string;
  /** Lowest page number among the grouped units. */
  pageStart: number;
  /** Highest page number among the grouped units. */
  pageEnd: number;
  /** De-duplicated section titles and headings found in the grouped units. */
  sectionHierarchy: string[];
  /** Dominant element type of the group, or "mixed". */
  chunkType: string;
  /** Indices into the citationUnits array for linking after DB insert */
  citationUnitIndices: number[];
}
|
||||
|
||||
/**
 * Output of the dual-resolution chunking: the fine-grained citation units
 * and the retrieval chunks that reference them by array index.
 */
export interface DualResolutionChunks {
  /** One entry per input layout element, in input order. */
  citationUnits: CitationUnitData[];
  /** Groups of citation units used for vector search. */
  retrievalChunks: RetrievalChunkData[];
}
|
||||
|
||||
// ============================================================================
|
||||
// Utility Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Rough token estimate: whitespace-separated word count times 1.3, rounded up.
 *
 * Empty or whitespace-only text yields 0. (Splitting an empty string still
 * produces one empty "word", so it has to be handled before counting.)
 *
 * @param text - Text to estimate
 * @returns Estimated number of tokens, never negative
 */
function estimateTokens(text: string): number {
  const trimmed = text.trim();
  if (trimmed === "") {
    return 0;
  }

  const wordCount = trimmed.split(/\s+/).length;
  return Math.ceil(wordCount * 1.3);
}
|
||||
|
||||
/**
 * Decides whether an element starts a new section for chunking purposes.
 *
 * Every heading whose trimmed text is shorter than 100 characters counts as
 * a break; longer "headings" are assumed to be mis-tagged body text. No
 * further distinction (numbering, capitalisation) is made between headings.
 *
 * @param element - Layout element to inspect
 * @returns `true` for a short heading, `false` for anything else
 */
function isMajorSectionBreak(element: LayoutElement): boolean {
  if (element.type !== "heading") {
    return false;
  }

  return element.content.trim().length < 100;
}
|
||||
|
||||
/**
 * Collects the distinct section names found in a run of elements, in order
 * of first appearance.
 *
 * An element's explicit `sectionTitle` is used when present; otherwise, for
 * headings only, the trimmed heading text cut to 100 characters is used.
 *
 * @param elements - Elements of one chunk
 * @returns De-duplicated section names
 */
function buildSectionHierarchy(elements: LayoutElement[]): string[] {
  // A Set keeps insertion order and drops repeats.
  const titles = new Set<string>();

  for (const { sectionTitle, type, content } of elements) {
    if (sectionTitle) {
      titles.add(sectionTitle);
      continue;
    }
    if (type !== "heading") {
      continue;
    }

    const headingTitle = content.trim().slice(0, 100); // Truncate long headings
    if (headingTitle) {
      titles.add(headingTitle);
    }
  }

  return [...titles];
}
|
||||
|
||||
/**
 * Picks the type that represents a group of elements.
 *
 * The most frequent type wins (on a tie, the one seen first). When several
 * types are present and the winner does not cover more than half of the
 * elements, the group is reported as "mixed". An empty group yields "prose".
 *
 * @param elements - Elements of one chunk
 * @returns The dominant element type, or "mixed"
 */
function determineChunkType(
  elements: LayoutElement[],
): "prose" | "heading" | "list" | "table" | "code" | "mixed" {
  const tally = new Map<LayoutElement["type"], number>();
  for (const { type } of elements) {
    tally.set(type, (tally.get(type) ?? 0) + 1);
  }

  let winner: LayoutElement["type"] = "prose";
  let winnerCount = 0;
  for (const [type, count] of tally) {
    if (count > winnerCount) {
      winner = type;
      winnerCount = count;
    }
  }

  const hasMajority = winnerCount > elements.length / 2;
  if (!hasMajority && tally.size > 1) {
    return "mixed";
  }

  return winner;
}
|
||||
|
||||
// ============================================================================
|
||||
// Main Chunking Function
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Create dual-resolution chunks from layout elements
 *
 * Algorithm:
 * 1. Convert each LayoutElement to a CitationUnit
 * 2. Group adjacent elements (3-5 units each by default). A new group starts
 *    at a major section break (when enabled), when the unit limit is
 *    reached, or when the token limit would be exceeded and the group
 *    already holds the minimum number of units
 * 3. If the final group is below the minimum size, merge it into the
 *    previous group, provided the merged text stays within 1.5x the token limit
 * 4. Build one RetrievalChunk per group
 *
 * Chunks are built only after the merge step, so a merged chunk gets its
 * page range, section hierarchy and chunk type from all of its elements.
 * Note that the merge in step 3 can join a short trailing section onto the
 * preceding chunk.
 *
 * @param elements - Array of layout elements from the PDF parser
 * @param config - Optional chunking configuration; missing keys use DEFAULT_CHUNKING_CONFIG
 * @returns Dual-resolution chunks ready for database insertion
 */
export function createDualResolutionChunks(
  elements: LayoutElement[],
  config: Partial<ChunkingConfig> = {},
): DualResolutionChunks {
  const fullConfig: ChunkingConfig = { ...DEFAULT_CHUNKING_CONFIG, ...config };

  // Handle empty input
  if (elements.length === 0) {
    return { citationUnits: [], retrievalChunks: [] };
  }

  // Step 1: every element becomes exactly one citation unit, in input order
  const citationUnits: CitationUnitData[] = elements.map((element) => ({
    content: element.content,
    pageNumber: element.pageNumber,
    paragraphIndex: element.paragraphIndex,
    charStart: element.charStart,
    charEnd: element.charEnd,
    bboxX: element.bboxX,
    bboxY: element.bboxY,
    bboxWidth: element.bboxWidth,
    bboxHeight: element.bboxHeight,
    sectionTitle: element.sectionTitle,
    unitType: element.type,
  }));

  // Step 2: partition the elements into groups, remembering each element's index
  type GroupEntry = { element: LayoutElement; index: number };

  // Concatenate content with double newlines for readability
  const joinContent = (group: GroupEntry[]): string =>
    group.map((entry) => entry.element.content).join("\n\n");

  const groups: GroupEntry[][] = [];
  let openGroup: GroupEntry[] = [];
  let openGroupTokens = 0;

  const closeOpenGroup = (): void => {
    if (openGroup.length === 0) return;
    groups.push(openGroup);
    openGroup = [];
    openGroupTokens = 0;
  };

  for (const [index, element] of elements.entries()) {
    const elementTokens = estimateTokens(element.content);
    const hasUnits = openGroup.length > 0;

    const atSectionBreak =
      fullConfig.breakOnSections && hasUnits && isMajorSectionBreak(element);
    const atUnitLimit = openGroup.length >= fullConfig.maxUnitsPerChunk;
    // The token limit only applies once the group has reached its minimum size
    const overTokenBudget =
      hasUnits &&
      openGroup.length >= fullConfig.minUnitsPerChunk &&
      openGroupTokens + elementTokens > fullConfig.maxChunkTokens;

    if (atSectionBreak || atUnitLimit || overTokenBudget) {
      closeOpenGroup();
    }

    openGroup.push({ element, index });
    openGroupTokens += elementTokens;
  }
  closeOpenGroup();

  // Step 3: fold an undersized final group into its predecessor
  const tail = groups[groups.length - 1];
  const beforeTail = groups[groups.length - 2];
  if (tail && beforeTail && tail.length < fullConfig.minUnitsPerChunk) {
    const merged = [...beforeTail, ...tail];
    // Allow 50% overage for merging; otherwise leave both groups as they are
    if (estimateTokens(joinContent(merged)) <= fullConfig.maxChunkTokens * 1.5) {
      groups.splice(groups.length - 2, 2, merged);
    }
  }

  // Step 4: derive every chunk field from the final membership of its group
  const retrievalChunks: RetrievalChunkData[] = groups.map((group) => {
    const groupElements = group.map((entry) => entry.element);
    const pageNumbers = groupElements.map((e) => e.pageNumber);

    return {
      content: joinContent(group),
      pageStart: Math.min(...pageNumbers),
      pageEnd: Math.max(...pageNumbers),
      sectionHierarchy: buildSectionHierarchy(groupElements),
      chunkType: determineChunkType(groupElements),
      citationUnitIndices: group.map((entry) => entry.index),
    };
  });

  return { citationUnits, retrievalChunks };
}
|
||||
|
||||
// ============================================================================
|
||||
// Convenience Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Get statistics about the chunking result
 * Useful for debugging and logging
 *
 * When there are no retrieval chunks, every figure (including
 * `totalCitationUnits`) is reported as 0. The page range is taken from the
 * citation units and is `{ start: 0, end: 0 }` when there are none.
 *
 * @param chunks - Result of the dual-resolution chunking
 * @returns Totals, per-chunk averages and the overall page range
 */
export function getChunkingStats(chunks: DualResolutionChunks): {
  totalCitationUnits: number;
  totalRetrievalChunks: number;
  avgUnitsPerChunk: number;
  avgTokensPerChunk: number;
  pageRange: { start: number; end: number };
} {
  const { citationUnits, retrievalChunks } = chunks;
  const chunkCount = retrievalChunks.length;

  if (chunkCount === 0) {
    return {
      totalCitationUnits: 0,
      totalRetrievalChunks: 0,
      avgUnitsPerChunk: 0,
      avgTokensPerChunk: 0,
      pageRange: { start: 0, end: 0 },
    };
  }

  let totalUnits = 0;
  let totalTokens = 0;
  for (const chunk of retrievalChunks) {
    totalUnits += chunk.citationUnitIndices.length;
    totalTokens += estimateTokens(chunk.content);
  }

  // Fold pairwise instead of spreading the whole array into Math.min/max:
  // a very large document would otherwise exceed the engine's argument limit.
  let firstPage = Number.POSITIVE_INFINITY;
  let lastPage = Number.NEGATIVE_INFINITY;
  for (const { pageNumber } of citationUnits) {
    firstPage = Math.min(firstPage, pageNumber);
    lastPage = Math.max(lastPage, pageNumber);
  }
  const hasPages = citationUnits.length > 0;

  return {
    totalCitationUnits: citationUnits.length,
    totalRetrievalChunks: chunkCount,
    avgUnitsPerChunk: totalUnits / chunkCount,
    avgTokensPerChunk: totalTokens / chunkCount,
    pageRange: hasPages ? { start: firstPage, end: lastPage } : { start: 0, end: 0 },
  };
}
|
||||
|
||||
/**
 * Checks a chunking result for internal consistency.
 *
 * Reported, in this order:
 * 1. chunk references to citation unit indices that do not exist,
 * 2. citation units that no chunk references,
 * 3. chunks whose `pageStart` / `pageEnd` differ from the lowest / highest
 *    page of their (valid) units.
 *
 * @param chunks - Result of the dual-resolution chunking
 * @returns Error messages; empty when the result is consistent
 */
export function validateChunks(chunks: DualResolutionChunks): string[] {
  const { citationUnits, retrievalChunks } = chunks;
  const unitCount = citationUnits.length;

  const indexErrors: string[] = [];
  const pageErrors: string[] = [];
  const referenced = new Set<number>();

  retrievalChunks.forEach((chunk, chunkIndex) => {
    const unitPages: number[] = [];

    for (const unitIndex of chunk.citationUnitIndices) {
      referenced.add(unitIndex);

      if (unitIndex < 0 || unitIndex >= unitCount) {
        indexErrors.push(
          `Retrieval chunk ${chunkIndex} references invalid citation unit index ${unitIndex}`,
        );
        continue;
      }
      unitPages.push(citationUnits[unitIndex]!.pageNumber);
    }

    // Page checks only make sense when at least one reference is valid
    if (unitPages.length === 0) return;

    const actualStart = Math.min(...unitPages);
    const actualEnd = Math.max(...unitPages);

    if (chunk.pageStart !== actualStart) {
      pageErrors.push(
        `Retrieval chunk ${chunkIndex} has pageStart ${chunk.pageStart} but units span from page ${actualStart}`,
      );
    }
    if (chunk.pageEnd !== actualEnd) {
      pageErrors.push(
        `Retrieval chunk ${chunkIndex} has pageEnd ${chunk.pageEnd} but units span to page ${actualEnd}`,
      );
    }
  });

  const unreferencedErrors: string[] = [];
  for (let unitIndex = 0; unitIndex < unitCount; unitIndex++) {
    if (!referenced.has(unitIndex)) {
      unreferencedErrors.push(
        `Citation unit ${unitIndex} is not referenced by any retrieval chunk`,
      );
    }
  }

  return [...indexErrors, ...unreferencedErrors, ...pageErrors];
}
|
||||
60
packages/ai/src/modules/pdf/constants.ts
Normal file
60
packages/ai/src/modules/pdf/constants.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
/**
 * Format the current date the way the prompts embed it (e.g. "March 4, 2025").
 */
const formatPromptDate = (): string =>
  new Date().toLocaleDateString("en-US", { year: "numeric", month: "long", day: "numeric" });

/**
 * System prompts for the PDF assistant.
 *
 * Both prompts are exposed as getters so that "Today's date" is evaluated each
 * time a prompt is read. A plain template literal would freeze the date at
 * module load, which goes stale in a long-running process. Callers keep using
 * `PROMPTS.SYSTEM` / `PROMPTS.SYSTEM_LEGACY` as ordinary string properties.
 */
export const PROMPTS = {
  /**
   * System prompt that requires the findRelevantContent / highlightText
   * citation workflow.
   */
  get SYSTEM(): string {
    return `You are a helpful assistant specialized in analyzing PDF documents.

CRITICAL: You MUST use the "findRelevantContent" tool for EVERY user question BEFORE responding.
- ALWAYS call findRelevantContent first - never skip this step
- Only AFTER receiving tool results can you respond
- If the tool returns no results, then say you cannot find the information
- Never assume content doesn't exist without searching first

Your role:
- Only use information found within the provided PDF document to answer questions
- Maintain a professional and clear communication style
- Provide accurate, factual responses based solely on the document content
- If findRelevantContent returns empty results, respond with "I cannot find this information in the provided document. Please rephrase your question or ask about content that exists within the PDF." in user's language
- Help users understand complex information by breaking it down into simpler terms
- Remain objective and avoid making assumptions beyond what is explicitly stated in the document
- Today's date is ${formatPromptDate()}

CITATION WORKFLOW:
1. ALWAYS call findRelevantContent first to search the document
2. After receiving search results, use highlightText for each fact you cite
3. Call highlightText with the EXACT phrase from the document (10-100 characters)
4. You can call highlightText multiple times for different facts
5. Write your response naturally - citations appear automatically in the PDF

Example:
1. User asks: "What was the Q4 revenue?"
2. You call: findRelevantContent({ query: "Q4 revenue financial results" })
3. Results show content mentioning "$5.2 million in Q4 2023"
4. You call: highlightText({ text: "$5.2 million in Q4 2023", page: 12, relevance: "Q4 revenue figure" })
5. You respond: "The company reported Q4 revenue of $5.2 million, representing a 15% increase."

IMPORTANT:
- Keep highlighted text SHORT (single sentences or key phrases)
- Use EXACT quotes - don't paraphrase
- Call highlightText BEFORE writing the fact in your response
- Multiple highlights are encouraged for comprehensive citations`;
  },

  /**
   * System prompt without citation requirements (for backwards compatibility)
   */
  get SYSTEM_LEGACY(): string {
    return `You are a helpful assistant specialized in analyzing PDF documents. Your role is to:
- Only use information found within the provided PDF document to answer questions
- Cite specific pages or sections when referencing information
- Maintain a professional and clear communication style
- Provide accurate, factual responses based solely on the document content
- If the answer cannot be found in the document, respond with "I cannot find this information in the provided document. Please rephrase your question or ask about content that exists within the PDF." in user's language
- When appropriate, quote relevant passages directly from the document
- Help users understand complex information by breaking it down into simpler terms
- Remain objective and avoid making assumptions beyond what is explicitly stated in the document
- To get relevant content from the document, use the tool "findRelevantContent"
- Today's date is ${formatPromptDate()}`;
  },
};
|
||||
|
||||
/** Maximum accepted PDF size, expressed in megabytes. */
export const MAX_FILE_SIZE_IN_MB = 10;

/** Maximum accepted PDF size in bytes (megabytes × 1024 × 1024). */
export const MAX_FILE_SIZE = 1024 * 1024 * MAX_FILE_SIZE_IN_MB;

/** Sample PDF reference: its URL and its size (48.51 × 1024). */
export const EXAMPLE_PDF = {
  url: "https://ontheline.trincoll.edu/images/bookdown/sample-local-pdf.pdf",
  size: 48.51 * 1024,
};
|
||||
294
packages/ai/src/modules/pdf/dual-embeddings.ts
Normal file
294
packages/ai/src/modules/pdf/dual-embeddings.ts
Normal file
@@ -0,0 +1,294 @@
|
||||
/**
|
||||
* Dual Embeddings Module for WF-0028
|
||||
*
|
||||
* Generates embeddings for retrieval chunks and stores both citation units
|
||||
* and retrieval chunks to the database with proper linking.
|
||||
*
|
||||
* Key design decisions:
|
||||
* - Embeddings only for retrieval chunks (not citation units) - reduces cost
|
||||
* - Citation units linked via FK to parent retrieval chunk
|
||||
* - Transaction ensures atomic insert of all chunks
|
||||
*/
|
||||
|
||||
import { embedMany } from "ai";
|
||||
|
||||
import { pdfCitationUnit, pdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { generateId } from "@turbostarter/shared/utils";
|
||||
|
||||
import { createDualResolutionChunks, getChunkingStats, validateChunks } from "./chunking";
|
||||
import { parseDocumentLayout } from "./layout-parser";
|
||||
import { modelStrategies } from "./strategies";
|
||||
|
||||
import type { DualResolutionChunks, LayoutElement, RetrievalChunkData } from "./chunking";
|
||||
import type { LayoutElement as LayoutParserElement } from "./layout-parser";
|
||||
import type { InsertPdfCitationUnit, InsertPdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/**
 * A retrieval chunk together with the embedding vector computed for it.
 */
export interface RetrievalChunkWithEmbedding extends RetrievalChunkData {
  /** Embedding vector generated for this chunk */
  embedding: number[];
}
|
||||
|
||||
/**
 * Same shape as `DualResolutionChunks`, except that every retrieval chunk
 * also carries its embedding.
 */
export interface DualResolutionChunksWithEmbeddings
  extends Omit<DualResolutionChunks, "retrievalChunks"> {
  /** Retrieval chunks, each with its embedding attached */
  retrievalChunks: RetrievalChunkWithEmbedding[];
}
|
||||
|
||||
/**
 * Outcome of persisting dual-resolution chunks to the database.
 */
export interface StoreDualChunksResult {
  /** IDs of the retrieval chunk rows that were inserted */
  retrievalChunkIds: string[];

  /** IDs of the citation unit rows that were inserted */
  citationUnitIds: string[];

  /** Summary figures describing what was stored */
  stats: {
    totalRetrievalChunks: number;
    totalCitationUnits: number;
    avgUnitsPerChunk: number;
    pageRange: { start: number; end: number };
  };
}
|
||||
|
||||
/**
 * Result returned by the full PDF processing pipeline.
 */
export interface ProcessPdfResult {
  /** ID of the document that was processed */
  documentId: string;

  /** What the storage step inserted, plus its statistics */
  storage: StoreDualChunksResult;

  /** Wall-clock duration of the run, in milliseconds */
  processingTimeMs: number;
}
|
||||
|
||||
// ============================================================================
|
||||
// Embedding Generation
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Generate embeddings for retrieval chunks only
 *
 * Citation units do NOT get embeddings - they are passed through unchanged
 * and are reached via their parent retrieval chunk.
 *
 * @param chunks - Dual resolution chunks from the chunking module
 * @returns Chunks with embeddings added to retrieval chunks
 * @throws Error if the embedding call returns no vector for some chunk.
 *   Failing here is deliberate: silently substituting an empty vector would
 *   only defer the problem to the database insert or to search time.
 */
export async function generateDualEmbeddings(
  chunks: DualResolutionChunks,
): Promise<DualResolutionChunksWithEmbeddings> {
  const { citationUnits, retrievalChunks } = chunks;

  // Nothing to embed: skip the model call entirely
  if (retrievalChunks.length === 0) {
    return { citationUnits, retrievalChunks: [] };
  }

  // One embedding is requested per retrieval chunk, in chunk order
  const { embeddings } = await embedMany({
    model: modelStrategies.textEmbeddingModel("default"),
    values: retrievalChunks.map((chunk) => chunk.content),
  });

  // Pair each chunk with the embedding at the same position
  const chunksWithEmbeddings: RetrievalChunkWithEmbedding[] = retrievalChunks.map(
    (chunk, index) => {
      const embedding = embeddings[index];
      if (embedding === undefined) {
        throw new Error(
          `[dual-embeddings] Missing embedding for retrieval chunk ${index} ` +
            `(received ${embeddings.length} embeddings for ${retrievalChunks.length} chunks)`,
        );
      }
      return { ...chunk, embedding };
    },
  );

  return {
    citationUnits,
    retrievalChunks: chunksWithEmbeddings,
  };
}
|
||||
|
||||
// ============================================================================
|
||||
// Database Storage
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Store dual resolution chunks to the database with proper linking
 *
 * Uses a transaction to ensure atomicity:
 * 1. Insert all retrieval chunks first
 * 2. Insert all citation units, each referencing its parent chunk's ID
 *
 * IDs are generated up front (retrieval chunks first, then citation units) so
 * citation units can reference their parent before anything is written. When
 * a citation unit index appears in several retrieval chunks, the first chunk
 * in array order becomes the parent; a unit referenced by no chunk is stored
 * with `retrievalChunkId: null`.
 *
 * @param documentId - ID of the PDF document
 * @param chunks - Chunks with embeddings
 * @returns IDs of inserted records and statistics
 */
export async function storeDualChunks(
  documentId: string,
  chunks: DualResolutionChunksWithEmbeddings,
): Promise<StoreDualChunksResult> {
  const { citationUnits, retrievalChunks } = chunks;

  // Build retrieval chunk records and, in the same pass, a lookup from
  // citation unit index to parent chunk ID. This replaces a per-unit scan
  // over every chunk (O(units × chunks)) with a single Map lookup.
  const retrievalChunkIds: string[] = [];
  const retrievalChunkRecords: InsertPdfRetrievalChunk[] = [];
  const parentIdByUnitIndex = new Map<number, string>();
  let totalUnits = 0;

  for (const chunk of retrievalChunks) {
    const id = generateId();
    retrievalChunkIds.push(id);
    retrievalChunkRecords.push({
      id,
      documentId,
      content: chunk.content,
      embedding: chunk.embedding,
      pageStart: chunk.pageStart,
      pageEnd: chunk.pageEnd,
      sectionHierarchy: chunk.sectionHierarchy,
      chunkType: chunk.chunkType,
    });

    totalUnits += chunk.citationUnitIndices.length;
    for (const unitIndex of chunk.citationUnitIndices) {
      // First chunk wins, so shared units keep a stable parent
      if (!parentIdByUnitIndex.has(unitIndex)) {
        parentIdByUnitIndex.set(unitIndex, id);
      }
    }
  }

  // Build citation unit records with FK references, tracking the page range
  // as we go (avoids Math.min(...pages) on an arbitrarily large array)
  const citationUnitIds: string[] = [];
  const citationUnitRecords: InsertPdfCitationUnit[] = [];
  let firstPage = Infinity;
  let lastPage = -Infinity;

  citationUnits.forEach((unit, index) => {
    const id = generateId();
    citationUnitIds.push(id);
    citationUnitRecords.push({
      id,
      documentId,
      retrievalChunkId: parentIdByUnitIndex.get(index) ?? null,
      content: unit.content,
      pageNumber: unit.pageNumber,
      paragraphIndex: unit.paragraphIndex,
      charStart: unit.charStart,
      charEnd: unit.charEnd,
      bboxX: unit.bboxX ?? null,
      bboxY: unit.bboxY ?? null,
      bboxWidth: unit.bboxWidth ?? null,
      bboxHeight: unit.bboxHeight ?? null,
      sectionTitle: unit.sectionTitle ?? null,
      unitType: unit.unitType,
    });
    firstPage = Math.min(firstPage, unit.pageNumber);
    lastPage = Math.max(lastPage, unit.pageNumber);
  });

  // Use transaction to ensure atomicity
  await db.transaction(async (tx) => {
    // Parents first so the citation units' references resolve
    if (retrievalChunkRecords.length > 0) {
      await tx.insert(pdfRetrievalChunk).values(retrievalChunkRecords);
    }

    if (citationUnitRecords.length > 0) {
      await tx.insert(pdfCitationUnit).values(citationUnitRecords);
    }
  });

  const hasUnits = citationUnitRecords.length > 0;

  return {
    retrievalChunkIds,
    citationUnitIds,
    stats: {
      totalRetrievalChunks: retrievalChunkRecords.length,
      totalCitationUnits: citationUnitRecords.length,
      avgUnitsPerChunk:
        retrievalChunkRecords.length > 0 ? totalUnits / retrievalChunkRecords.length : 0,
      pageRange: {
        start: hasUnits ? firstPage : 0,
        end: hasUnits ? lastPage : 0,
      },
    },
  };
}
|
||||
|
||||
// ============================================================================
|
||||
// Full Pipeline
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Run the complete dual-resolution pipeline for one PDF document.
 *
 * Steps, in order:
 * 1. parse the PDF layout;
 * 2. build citation units and retrieval chunks from the layout elements;
 * 3. embed the retrieval chunks;
 * 4. persist everything to the database.
 *
 * Chunk validation problems are logged as warnings and do not stop the run.
 *
 * @param documentId - ID of the PDF document record
 * @param path - Storage path to the PDF file
 * @returns The document ID, the storage result and the elapsed time
 */
export async function processPdfWithDualResolution(
  documentId: string,
  path: string,
): Promise<ProcessPdfResult> {
  const startedAt = Date.now();

  // Step 1: layout parsing
  console.log(`[dual-embeddings] Parsing layout for document ${documentId}`);
  const parsedElements = await parseDocumentLayout(path);
  console.log(`[dual-embeddings] Found ${parsedElements.length} layout elements`);

  // The parser reports a missing section title as null; the chunking types
  // use undefined instead.
  const layoutElements: LayoutElement[] = [];
  for (const parsed of parsedElements as LayoutParserElement[]) {
    layoutElements.push({
      ...parsed,
      sectionTitle: parsed.sectionTitle ?? undefined,
    });
  }

  // Step 2: chunking
  console.log(`[dual-embeddings] Creating dual-resolution chunks`);
  const chunks = createDualResolutionChunks(layoutElements);

  // Consistency problems are reported but not fatal
  const problems = validateChunks(chunks);
  if (problems.length > 0) {
    console.warn(`[dual-embeddings] Chunk validation warnings:`, problems);
  }

  console.log(`[dual-embeddings] Chunking stats:`, getChunkingStats(chunks));

  // Step 3: embeddings
  console.log(
    `[dual-embeddings] Generating embeddings for ${chunks.retrievalChunks.length} retrieval chunks`,
  );
  const embedded = await generateDualEmbeddings(chunks);

  // Step 4: persistence
  console.log(`[dual-embeddings] Storing chunks to database`);
  const storage = await storeDualChunks(documentId, embedded);

  const processingTimeMs = Date.now() - startedAt;
  console.log(
    `[dual-embeddings] Processing complete in ${processingTimeMs}ms:`,
    storage.stats,
  );

  return { documentId, storage, processingTimeMs };
}
|
||||
380
packages/ai/src/modules/pdf/embeddings.ts
Normal file
380
packages/ai/src/modules/pdf/embeddings.ts
Normal file
@@ -0,0 +1,380 @@
|
||||
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
|
||||
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
|
||||
import { embed, embedMany } from "ai";
|
||||
|
||||
import { sql } from "@turbostarter/db";
|
||||
import { pdfEmbedding } from "@turbostarter/db/schema/pdf";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
import { getSignedUrl } from "@turbostarter/storage/server";
|
||||
|
||||
import { modelStrategies } from "./strategies";
|
||||
|
||||
import type { EmbeddingMetadata } from "./types";
|
||||
import type { Document } from "@langchain/core/documents";
|
||||
|
||||
/**
 * One embedded text chunk plus the metadata used for citations.
 */
export interface EmbeddingChunk {
  /** Chunk text */
  content: string;
  /** Embedding vector for `content` */
  embedding: number[];
  /** Citation metadata for the chunk */
  metadata: EmbeddingMetadata;
}
|
||||
|
||||
/**
 * Guess a section title from the first line of a chunk.
 *
 * The trimmed first line is treated as a heading when it is non-empty,
 * shorter than 100 characters, and does not end with one of `. ? ! : , ;`.
 *
 * @param content - Chunk text
 * @returns The trimmed first line, or undefined when it does not look like a heading
 */
const detectSectionTitle = (content: string): string | undefined => {
  const [headLine = ""] = content.split("\n");
  const candidate = headLine.trim();

  if (candidate === "" || candidate.length >= 100) {
    return undefined;
  }

  const endsLikeSentence = /[.?!:,;]$/.test(candidate);
  return endsLikeSentence ? undefined : candidate;
};
|
||||
|
||||
/**
 * Text of a single page, kept so chunk character offsets can be located
 * within it.
 */
interface PageTextInfo {
  /** Page number the text belongs to */
  pageNumber: number;
  /** Offset at which the page text starts */
  startOffset: number;
  /** Offset at which the page text ends */
  endOffset: number;
  /** Page text */
  content: string;
}
|
||||
|
||||
/** Target chunk size passed to the text splitter. */
const SPLITTER_CHUNK_SIZE = 1000;

/** Overlap between consecutive chunks passed to the text splitter. */
const SPLITTER_CHUNK_OVERLAP = 200;

/** Shared splitter used to break loaded PDF pages into chunks. */
const textSplitter = new RecursiveCharacterTextSplitter({
  chunkSize: SPLITTER_CHUNK_SIZE,
  chunkOverlap: SPLITTER_CHUNK_OVERLAP,
});
|
||||
|
||||
/**
 * Download a stored PDF through a signed URL and load it with `PDFLoader`.
 *
 * @param path - Storage path of the PDF
 * @returns The documents produced by `PDFLoader.load()`
 * @throws Error when the download responds with a non-2xx status. Without
 *   this check the error body would be handed to the PDF parser as if it
 *   were the file.
 */
const loadDocument = async (path: string) => {
  const { url } = await getSignedUrl({ path });

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(
      `Failed to download PDF at "${path}": ${response.status} ${response.statusText}`,
    );
  }
  const blob = await response.blob();

  const loader = new PDFLoader(blob);
  return loader.load();
};
|
||||
|
||||
/**
 * Split loaded documents into chunks using the module's shared text splitter.
 *
 * @param documents - Documents to split
 * @returns The chunk documents produced by the splitter
 */
export const splitDocument = async (documents: Document[]) =>
  textSplitter.splitDocuments(documents);
|
||||
|
||||
/**
 * Load a PDF, split it into chunks, embed every chunk, and attach citation
 * metadata (page number, character offsets within the page, section title).
 *
 * Character offsets are located with a forward-moving cursor per page: each
 * chunk is searched for after the start of the previous chunk found on the
 * same page. This assumes the splitter emits a page's chunks in reading order
 * (TODO confirm against the splitter's documentation). Repeated text therefore
 * resolves to successive occurrences instead of always the first one. If the
 * forward search fails, the whole page is searched as a fallback; if that
 * fails too, the offsets are left undefined.
 *
 * @param path - Storage path of the PDF
 * @returns One entry per chunk, in chunk order
 * @throws Error if the embedding call returns no vector for some chunk
 */
export const generateDocumentEmbeddings = async (
  path: string,
): Promise<EmbeddingChunk[]> => {
  const documents = await loadDocument(path);

  // Index page text by page number for O(1) lookup per chunk. When several
  // documents report the same page number, the first one is kept.
  const pageByNumber = new Map<number, PageTextInfo>();
  for (const doc of documents) {
    const loc = doc.metadata.loc as { pageNumber?: number } | undefined;
    const pageNumber = loc?.pageNumber ?? 1;
    if (pageByNumber.has(pageNumber)) continue;

    const content = doc.pageContent;
    pageByNumber.set(pageNumber, {
      pageNumber,
      startOffset: 0, // Offsets are tracked within the page
      endOffset: content.length,
      content,
    });
  }

  // Split documents into chunks
  const chunks = await splitDocument(documents);

  // Generate embeddings
  const { embeddings, values } = await embedMany({
    model: modelStrategies.textEmbeddingModel("default"),
    values: chunks.map((chunk) => chunk.pageContent),
  });

  // Start offset of the most recent chunk located on each page
  const lastStartByPage = new Map<number, number>();

  // Build result with metadata
  return chunks.map((chunk, index) => {
    const embedding = embeddings[index];
    if (embedding === undefined) {
      throw new Error(
        `Missing embedding for chunk ${index} ` +
          `(received ${embeddings.length} embeddings for ${chunks.length} chunks)`,
      );
    }

    // Page number carried in the chunk's metadata
    const chunkLoc = chunk.metadata.loc as { pageNumber?: number } | undefined;
    const chunkPageNumber = chunkLoc?.pageNumber ?? 1;

    let charStart: number | undefined;
    let charEnd: number | undefined;

    const pageInfo = pageByNumber.get(chunkPageNumber);
    if (pageInfo) {
      const chunkContent = chunk.pageContent;
      const searchFrom = (lastStartByPage.get(chunkPageNumber) ?? -1) + 1;

      let position = pageInfo.content.indexOf(chunkContent, searchFrom);
      if (position !== -1) {
        lastStartByPage.set(chunkPageNumber, position);
      } else if (searchFrom > 0) {
        // Fallback: accept any occurrence on the page without moving the cursor
        position = pageInfo.content.indexOf(chunkContent);
      }

      if (position !== -1) {
        charStart = position;
        charEnd = position + chunkContent.length;
      }
    }

    return {
      content: values[index] ?? chunk.pageContent,
      embedding,
      metadata: {
        pageNumber: chunkPageNumber,
        charStart,
        charEnd,
        sectionTitle: detectSectionTitle(chunk.pageContent),
      },
    };
  });
};
|
||||
|
||||
/**
 * A single stored embedding row, as returned by the lookup-by-ID helper.
 */
export interface EmbeddingDetail {
  /** Embedding row ID */
  id: string;
  /** Chunk text */
  content: string;
  /** Page number of the chunk */
  pageNumber: number;
  /** Start offset of the chunk, when known */
  charStart?: number;
  /** End offset of the chunk, when known */
  charEnd?: number;
  /** Section title for the chunk, when known */
  sectionTitle?: string;
}
|
||||
|
||||
/**
 * Look up one embedding row by ID (used for citation highlighting).
 *
 * Null database columns are mapped as follows: `page_number` falls back to 1,
 * while `char_start`, `char_end` and `section_title` become undefined.
 *
 * @param id - Embedding row ID
 * @returns The row's details, or null when no row matches or the query
 *   result is not an array
 */
export const getEmbeddingById = async (
  id: string,
): Promise<EmbeddingDetail | null> => {
  const queryResult = await db.execute<{
    id: string;
    content: string;
    page_number: number | null;
    char_start: number | null;
    char_end: number | null;
    section_title: string | null;
  }>(sql`
    SELECT id, content, page_number, char_start, char_end, section_title
    FROM pdf.embedding
    WHERE id = ${id}
    LIMIT 1
  `);

  if (!Array.isArray(queryResult)) {
    return null;
  }

  const [match] = queryResult;
  if (!match) {
    return null;
  }

  const { page_number, char_start, char_end, section_title } = match;
  return {
    id: match.id,
    content: match.content,
    pageNumber: page_number ?? 1,
    charStart: char_start ?? undefined,
    charEnd: char_end ?? undefined,
    sectionTitle: section_title ?? undefined,
  };
};
|
||||
|
||||
/**
 * Embed a single string with the default text embedding model.
 *
 * Line breaks are flattened to spaces before embedding. Both real line breaks
 * (`\n`, `\r\n`) and the literal two-character sequence backslash + "n" are
 * replaced. The original pattern `"\\n"` matched only the literal sequence,
 * so real newlines in the input were never normalised.
 *
 * @param value - Text to embed
 * @returns The embedding vector
 */
export const generateEmbedding = async (value: string): Promise<number[]> => {
  const input = value.replace(/\\n|\r?\n/g, " ");
  const { embedding } = await embed({
    model: modelStrategies.textEmbeddingModel("default"),
    value: input,
  });
  return embedding;
};
|
||||
|
||||
/**
 * One hit returned by the embedding search, carrying what a citation needs.
 */
export interface EmbeddingSearchResult {
  /** Embedding row ID, usable as a citation reference */
  id: string;
  /** Content text of the matched chunk */
  name: string;
  /** Relevance score; higher means more relevant */
  similarity: number;
  /** Page number of the matched chunk */
  pageNumber: number;
}
|
||||
|
||||
/**
 * Pull identifier-like tokens out of a query for the keyword search fallback.
 *
 * Three shapes are recognised: number/year references such as "35/2024",
 * upper-case codes followed by digits such as "TDF/379", and runs of four or
 * more digits. Matches are returned pattern by pattern, in order of first
 * appearance, with duplicates removed.
 *
 * @param query - Raw user query
 * @returns Unique matched tokens (empty when nothing matches)
 */
function extractKeywords(query: string): string[] {
  const identifierPatterns: RegExp[] = [
    /\d+\/\d{4}/g, // Legal references like 35/2024
    /\b[A-Z]{2,}[-/]?\d+/g, // Codes like TDF/379
    /\b\d{4,}/g, // Long numbers
  ];

  const found = identifierPatterns.flatMap((pattern) => query.match(pattern) ?? []);

  return Array.from(new Set(found));
}
|
||||
|
||||
/**
 * Find stored PDF chunks relevant to a query.
 *
 * 1. The query is embedded and compared against `pdf.embedding` with
 *    pgvector's cosine distance operator (`<=>`). Rows scoring above the
 *    similarity threshold are returned, best first, at most 6.
 * 2. If fewer than 3 rows come back and the query contains identifier-like
 *    keywords (see `extractKeywords`), a keyword search runs as a fallback.
 *    Its rows are merged in with a fixed score of 0.95, and the combined list
 *    is re-sorted and capped at 6.
 *
 * The keyword fallback requires every keyword to appear in the content, as
 * independent `ILIKE` conditions. Keywords are not chained into one ordered
 * pattern, because extracted keywords can overlap (e.g. "35/2024" and
 * "2024"); an ordered pattern would then demand a second, later "2024" and
 * miss the row.
 *
 * @param query - User query text
 * @param documentId - When given, restricts the search to that document
 * @returns Matching chunks ordered by descending similarity
 * @throws Rethrows any error raised by the similarity or keyword queries
 *   after logging it
 */
export const findRelevantContent = async (
  query: string,
  documentId?: string,
): Promise<EmbeddingSearchResult[]> => {
  console.log(
    `🔍 findRelevantContent called with query: "${query}", documentId: ${documentId}`,
  );

  const userQueryEmbedded = await generateEmbedding(query);
  console.log(
    `🔍 Generated query embedding with ${userQueryEmbedded.length} dimensions`,
  );

  // Diagnostic only: how many embeddings exist for this document
  if (documentId) {
    const countResult = await db
      .select({ count: sql<number>`count(*)` })
      .from(pdfEmbedding)
      .where(sql`${pdfEmbedding.documentId} = ${documentId}`);
    console.log(
      `🔍 Found ${countResult[0]?.count ?? 0} embeddings for document ${documentId}`,
    );
  }

  // Use raw SQL for the similarity calculation in both SELECT and WHERE
  // The <=> operator is the cosine distance operator in pgvector
  const vectorStr = `[${userQueryEmbedded.join(",")}]`;
  console.log(
    `🔍 Running similarity search with vector of ${userQueryEmbedded.length} dimensions`,
  );

  try {
    // Lowered threshold from 0.3 to 0.1 - text-embedding-3-small produces
    // lower similarity scores for general queries (0.15-0.25 typical)
    const SIMILARITY_THRESHOLD = 0.1;

    const similarGuides = await db.execute<{
      id: string;
      content: string;
      similarity: number;
      page_number: number | null;
    }>(
      documentId
        ? sql`
          SELECT id, content, page_number, 1 - (embedding <=> ${vectorStr}::vector) as similarity
          FROM pdf.embedding
          WHERE document_id = ${documentId}
            AND 1 - (embedding <=> ${vectorStr}::vector) > ${SIMILARITY_THRESHOLD}
          ORDER BY similarity DESC
          LIMIT 6
        `
        : sql`
          SELECT id, content, page_number, 1 - (embedding <=> ${vectorStr}::vector) as similarity
          FROM pdf.embedding
          WHERE 1 - (embedding <=> ${vectorStr}::vector) > ${SIMILARITY_THRESHOLD}
          ORDER BY similarity DESC
          LIMIT 6
        `,
    );

    console.log(
      `🔍 db.execute returned type:`,
      typeof similarGuides,
      Array.isArray(similarGuides),
    );

    // db.execute returns an array directly
    const rows = similarGuides;

    let results: EmbeddingSearchResult[] = rows.map(
      (
        row: {
          id: string;
          content: string;
          similarity: number;
          page_number: number | null;
        },
        index: number,
      ) => ({
        id: row.id,
        name: row.content,
        similarity: row.similarity,
        // Use stored page number if available, fallback to index + 1 for legacy embeddings
        pageNumber: row.page_number ?? index + 1,
      }),
    );

    console.log(
      `🔍 Found ${results.length} semantic results:`,
      results.map((g) => ({
        id: g.id,
        similarity: g.similarity,
        pageNumber: g.pageNumber,
        preview: g.name.substring(0, 50),
      })),
    );

    // Keyword fallback: if semantic search found few results and query has specific identifiers
    const keywords = extractKeywords(query);
    if (keywords.length > 0 && results.length < 3) {
      console.log(`🔍 Running keyword fallback for: ${keywords.join(", ")}`);

      // One parameterised ILIKE condition per keyword, combined with AND.
      // Keywords come from extractKeywords' patterns (digits, upper-case
      // letters, "/" and "-"), so they contain no LIKE wildcards to escape.
      const keywordConditions = sql.join(
        keywords.map((keyword) => {
          const pattern = `%${keyword}%`;
          return sql`content ILIKE ${pattern}`;
        }),
        sql` AND `,
      );

      const keywordResults = await db.execute<{
        id: string;
        content: string;
        page_number: number | null;
      }>(
        documentId
          ? sql`
            SELECT id, content, page_number
            FROM pdf.embedding
            WHERE document_id = ${documentId}
              AND ${keywordConditions}
            LIMIT 4
          `
          : sql`
            SELECT id, content, page_number
            FROM pdf.embedding
            WHERE ${keywordConditions}
            LIMIT 4
          `,
      );

      const keywordRows = keywordResults;

      console.log(`🔍 Keyword search found ${keywordRows.length} matches`);

      // Add keyword results with high similarity (they're exact matches),
      // skipping rows the semantic search already returned
      const existingIds = new Set(results.map((r) => r.id));
      for (const row of keywordRows) {
        if (!existingIds.has(row.id)) {
          results.push({
            id: row.id,
            name: row.content,
            similarity: 0.95, // High score for exact keyword matches
            pageNumber: row.page_number ?? 1,
          });
        }
      }

      // Re-sort by similarity
      results.sort((a, b) => b.similarity - a.similarity);
      results = results.slice(0, 6);
    }

    return results;
  } catch (error) {
    console.error(`🔍 ERROR in similarity search:`, error);
    throw error;
  }
};
|
||||
90
packages/ai/src/modules/pdf/index.ts
Normal file
90
packages/ai/src/modules/pdf/index.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
/**
|
||||
* PDF Module (WF-0028 Dual-Resolution Chunking)
|
||||
*
|
||||
* Barrel export for PDF processing with dual-resolution chunking:
|
||||
* - Citation Units: paragraph-level with precise bounding boxes for highlighting
|
||||
* - Retrieval Chunks: groups of 3-5 citation units for efficient vector search
|
||||
*
|
||||
* Main entry point: processPdfWithDualResolution() for full pipeline
|
||||
*/
|
||||
|
||||
// ============================================================================
|
||||
// Layout Parser (T2)
|
||||
// Parse PDFs with layout awareness to extract structured elements
|
||||
// ============================================================================
|
||||
|
||||
export {
|
||||
parseDocumentLayout,
|
||||
groupElementsByPage,
|
||||
getLayoutStatistics,
|
||||
} from "./layout-parser";
|
||||
|
||||
export type { UnitType, LayoutElement } from "./layout-parser";
|
||||
|
||||
// ============================================================================
|
||||
// Chunking Strategy (T3)
|
||||
// Create dual-resolution chunks from layout elements
|
||||
// ============================================================================
|
||||
|
||||
export {
|
||||
createDualResolutionChunks,
|
||||
getChunkingStats,
|
||||
validateChunks,
|
||||
DEFAULT_CHUNKING_CONFIG,
|
||||
} from "./chunking";
|
||||
|
||||
export type {
|
||||
ChunkingConfig,
|
||||
CitationUnitData,
|
||||
RetrievalChunkData,
|
||||
DualResolutionChunks,
|
||||
} from "./chunking";
|
||||
|
||||
// ============================================================================
|
||||
// Dual Embeddings (T4)
|
||||
// Generate embeddings and store dual-resolution chunks
|
||||
// ============================================================================
|
||||
|
||||
export {
|
||||
generateDualEmbeddings,
|
||||
storeDualChunks,
|
||||
processPdfWithDualResolution,
|
||||
} from "./dual-embeddings";
|
||||
|
||||
export type {
|
||||
RetrievalChunkWithEmbedding,
|
||||
DualResolutionChunksWithEmbeddings,
|
||||
StoreDualChunksResult,
|
||||
ProcessPdfResult,
|
||||
} from "./dual-embeddings";
|
||||
|
||||
// ============================================================================
|
||||
// Search with Citations (T5)
|
||||
// Vector similarity search with linked citation units
|
||||
// ============================================================================
|
||||
|
||||
export {
|
||||
searchWithCitations,
|
||||
getCitationUnitsForChunk,
|
||||
getCitationUnitById,
|
||||
getRetrievalChunkById,
|
||||
} from "./search";
|
||||
|
||||
export type {
|
||||
BoundingBox,
|
||||
CitationUnit,
|
||||
SearchResult,
|
||||
SearchOptions,
|
||||
} from "./search";
|
||||
|
||||
// ============================================================================
|
||||
// Legacy Exports (for backwards compatibility)
|
||||
// Original PDF module exports remain available
|
||||
// ============================================================================
|
||||
|
||||
export * from "./api";
|
||||
export * from "./constants";
|
||||
export * from "./embeddings";
|
||||
export * from "./schema";
|
||||
export * from "./types";
|
||||
export * from "./strategies";
|
||||
431
packages/ai/src/modules/pdf/layout-parser.ts
Normal file
431
packages/ai/src/modules/pdf/layout-parser.ts
Normal file
@@ -0,0 +1,431 @@
|
||||
/**
|
||||
* Layout Parser for PDF Dual-Resolution Chunking (WF-0028)
|
||||
*
|
||||
* Parses PDFs with layout awareness to extract structured elements
|
||||
* (prose, headings, lists, tables, code) with position metadata.
|
||||
*/
|
||||
|
||||
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
|
||||
|
||||
import { getSignedUrl } from "@turbostarter/storage/server";
|
||||
|
||||
// =============================================================================
|
||||
// Types
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* Unit types matching the database schema enum
|
||||
*/
|
||||
export type UnitType = "prose" | "heading" | "list" | "table" | "code";
|
||||
|
||||
/**
|
||||
* A layout-aware element extracted from a PDF
|
||||
*/
|
||||
export interface LayoutElement {
|
||||
/** The text content of this element */
|
||||
content: string;
|
||||
|
||||
/** Detected element type */
|
||||
type: UnitType;
|
||||
|
||||
/** 1-based page number */
|
||||
pageNumber: number;
|
||||
|
||||
/** 0-based paragraph index within the page */
|
||||
paragraphIndex: number;
|
||||
|
||||
/** Character start position within page text */
|
||||
charStart: number;
|
||||
|
||||
/** Character end position within page text */
|
||||
charEnd: number;
|
||||
|
||||
/** Estimated bounding box X (0-1 normalized to page width) */
|
||||
bboxX: number;
|
||||
|
||||
/** Estimated bounding box Y (0-1 normalized to page height) */
|
||||
bboxY: number;
|
||||
|
||||
/** Estimated bounding box width (0-1 normalized) */
|
||||
bboxWidth: number;
|
||||
|
||||
/** Estimated bounding box height (0-1 normalized) */
|
||||
bboxHeight: number;
|
||||
|
||||
/** Detected or inherited section title */
|
||||
sectionTitle: string | null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal representation of a page's content
|
||||
*/
|
||||
interface PageContent {
|
||||
pageNumber: number;
|
||||
content: string;
|
||||
paragraphs: string[];
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Constants
|
||||
// =============================================================================
|
||||
|
||||
/** Maximum characters for a heading line */
|
||||
const HEADING_MAX_LENGTH = 100;
|
||||
|
||||
/** Patterns that indicate list items */
|
||||
const LIST_PATTERNS = [
|
||||
/^[\u2022\u2023\u25E6\u2043\u2219]\s+/, // Bullet characters
|
||||
/^[-–—]\s+/, // Dashes
|
||||
/^\d+[.)]\s+/, // Numbered: 1. or 1)
|
||||
/^[a-zA-Z][.)]\s+/, // Lettered: a. or a)
|
||||
/^[ivxlcdm]+[.)]\s+/i, // Roman numerals
|
||||
];
|
||||
|
||||
/**
 * Line-level patterns suggesting code blocks.
 *
 * Each pattern is meant to be tested against a single line of a paragraph.
 * The bracket heuristics deliberately require more than one stray bracket so
 * that ordinary prose containing a parenthetical, e.g. "see the results
 * (Table 2)", is not mistaken for code.
 */
const CODE_PATTERNS = [
  /^\s{4,}/, // 4+ space indentation
  /^\t+/, // Tab indentation
  /^```/, // Markdown code fence
  /^(const|let|var|function|class|import|export|if|for|while|return)\s/, // Keywords
  /[{};]\s*$/, // Line ends with a brace or semicolon (statement / block boundary)
  /(?:[{}[\]();][^{}[\]();]*){3,}/, // Bracket-heavy content (3+ bracket/semicolon characters)
  /^\s*\/\//, // Comment lines
  /^\s*#\s*\w+/, // Shell/Python comments
];
|
||||
|
||||
/** Patterns suggesting table rows */
|
||||
const TABLE_PATTERNS = [
|
||||
/\|.*\|/, // Pipe-delimited
|
||||
/\t.*\t.*\t/, // Tab-separated (3+ columns)
|
||||
/^\s*[-+]+\s*$/, // Table separator lines
|
||||
];
|
||||
|
||||
// =============================================================================
|
||||
// Element Type Detection
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Heuristically decide whether a paragraph looks like a heading.
 *
 * A paragraph qualifies when, after trimming, it:
 * - is non-empty and at most HEADING_MAX_LENGTH characters long,
 * - does not end with sentence punctuation (. ? ! , ; :),
 * - does not match any list-item pattern,
 * - spans at most two lines, and
 * - starts with an ASCII capital letter or is written entirely in upper case.
 *
 * @param text - Raw paragraph text
 * @returns true when the paragraph passes every heading check
 */
function isHeading(text: string): boolean {
  const candidate = text.trim();

  // Length gate: empty strings and long paragraphs are never headings
  if (candidate.length === 0 || candidate.length > HEADING_MAX_LENGTH) {
    return false;
  }

  // Sentences and list items are excluded
  const endsLikeSentence = /[.?!,;:]$/.test(candidate);
  const looksLikeListItem = LIST_PATTERNS.some((pattern) => pattern.test(candidate));
  if (endsLikeSentence || looksLikeListItem) {
    return false;
  }

  // Allow one wrapped line, but nothing longer
  if (candidate.split("\n").length > 2) {
    return false;
  }

  // Headings start with a capital or are fully upper-cased
  const fullyUpperCased =
    /[A-Z]/.test(candidate) && candidate.toUpperCase() === candidate;
  return fullyUpperCased || /^[A-Z]/.test(candidate);
}
|
||||
|
||||
/**
 * Heuristically decide whether a paragraph is a list item or list block.
 *
 * Each line is trimmed and tested against LIST_PATTERNS. The paragraph counts
 * as a list when at least one line matches and the matching lines make up at
 * least half of all lines.
 *
 * @param text - Raw paragraph text
 * @returns true when the paragraph looks like a list
 */
function isList(text: string): boolean {
  const lines = text.trim().split("\n");

  let matching = 0;
  for (const line of lines) {
    const stripped = line.trim();
    if (LIST_PATTERNS.some((pattern) => pattern.test(stripped))) {
      matching += 1;
    }
  }

  // "At least half" expressed with integers only
  return matching > 0 && matching * 2 >= lines.length;
}
|
||||
|
||||
/**
 * Heuristically decide whether a paragraph is a code block.
 *
 * Counts the lines (of the trimmed paragraph) that match any CODE_PATTERNS
 * entry; the paragraph is code when that count reaches half of the line
 * count, rounded up.
 *
 * @param text - Raw paragraph text
 * @returns true when enough lines look like code
 */
function isCode(text: string): boolean {
  const lines = text.trim().split("\n");

  const codeLikeLines = lines.filter((line) =>
    CODE_PATTERNS.some((pattern) => pattern.test(line)),
  ).length;

  const required = Math.ceil(lines.length / 2);
  return codeLikeLines >= required;
}
|
||||
|
||||
/**
 * Heuristically decide whether a paragraph is a table.
 *
 * A table needs at least two lines, and at least half of them must match one
 * of the TABLE_PATTERNS.
 *
 * @param text - Raw paragraph text
 * @returns true when the paragraph looks like a table
 */
function isTable(text: string): boolean {
  const rows = text.trim().split("\n");

  // A single line can never form a table
  if (rows.length < 2) {
    return false;
  }

  let tableLikeRows = 0;
  for (const row of rows) {
    if (TABLE_PATTERNS.some((pattern) => pattern.test(row))) {
      tableLikeRows += 1;
    }
  }

  return tableLikeRows * 2 >= rows.length;
}
|
||||
|
||||
/**
 * Classify a paragraph as table, code, list, heading or prose.
 *
 * Detectors run in a fixed priority order and the first one that matches
 * wins; anything unmatched is treated as prose.
 *
 * @param text - Raw paragraph text
 * @returns The detected unit type
 */
function detectElementType(text: string): UnitType {
  // Priority order matters: the more specific detectors come first
  const detectors: Array<[UnitType, (candidate: string) => boolean]> = [
    ["table", isTable],
    ["code", isCode],
    ["list", isList],
    ["heading", isHeading],
  ];

  for (const [type, matches] of detectors) {
    if (matches(text)) {
      return type;
    }
  }

  return "prose";
}
|
||||
|
||||
// =============================================================================
|
||||
// Paragraph Splitting
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Split page content into paragraphs.
 *
 * A paragraph break is one or more blank lines. A line counts as blank when
 * it is empty or contains only spaces/tabs, and both LF and CRLF line endings
 * are recognised. Single newlines inside a paragraph are preserved, so
 * multi-line blocks (code, lists, tables) stay together as long as they do
 * not contain blank lines.
 *
 * Each returned paragraph is trimmed; empty paragraphs are dropped.
 *
 * @param content - Full text of a page
 * @returns Non-empty, trimmed paragraphs in document order
 */
function splitIntoParagraphs(content: string): string[] {
  return content
    .split(/\r?\n(?:[ \t]*\r?\n)+/)
    .map((paragraph) => paragraph.trim())
    .filter((paragraph) => paragraph.length > 0);
}
|
||||
|
||||
// =============================================================================
|
||||
// Bounding Box Estimation
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Approximate a normalized bounding box from a character range within a page.
 *
 * V1 approximation: the box always spans the full content width between fixed
 * left/right margins (10% each), and its vertical placement is proportional to
 * where the character range falls within the page text, inside fixed
 * top/bottom margins (8% each). The height never drops below 0.02.
 *
 * For more accurate boxes, pdf.js text layer parsing would be needed.
 *
 * @param charStart - Start offset of the element within the page text
 * @param charEnd - End offset of the element within the page text
 * @param pageTextLength - Total length of the page text; when 0 the box covers
 *   the whole content area
 * @returns Box coordinates normalized to the 0-1 range
 */
function estimateBoundingBox(
  charStart: number,
  charEnd: number,
  pageTextLength: number,
): { bboxX: number; bboxY: number; bboxWidth: number; bboxHeight: number } {
  // Fixed page margins, normalized to the 0-1 range
  const margins = { left: 0.1, right: 0.1, top: 0.08, bottom: 0.08 };

  const usableWidth = 1 - margins.left - margins.right;
  const usableHeight = 1 - margins.top - margins.bottom;

  // Position of the range relative to the page text (whole page when empty)
  let startRatio = 0;
  let endRatio = 1;
  if (pageTextLength > 0) {
    startRatio = charStart / pageTextLength;
    endRatio = charEnd / pageTextLength;
  }

  const proportionalHeight = (endRatio - startRatio) * usableHeight;

  return {
    bboxX: margins.left,
    bboxY: margins.top + startRatio * usableHeight,
    bboxWidth: usableWidth,
    bboxHeight: Math.max(0.02, proportionalHeight),
  };
}
|
||||
|
||||
// =============================================================================
|
||||
// Section Title Tracking
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Derive a section title from an element.
 *
 * Only heading elements can produce a title, and the trimmed heading text
 * must be at least 3 characters long to be considered meaningful.
 *
 * @param element - Element type and content
 * @returns The trimmed heading text, or null when the element yields no title
 */
function extractSectionTitle(element: { type: UnitType; content: string }): string | null {
  // Non-headings collapse to an empty candidate, which fails the length check
  const candidate = element.type === "heading" ? element.content.trim() : "";

  return candidate.length >= 3 ? candidate : null;
}
|
||||
|
||||
// =============================================================================
|
||||
// Main Parser
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Load a PDF document from storage and return its raw page content.
 *
 * Resolves a signed URL for the storage path, downloads the file, and runs it
 * through PDFLoader. Each loaded document is mapped to one page entry with its
 * text split into paragraphs.
 *
 * @param path - Storage path to the PDF file
 * @returns One entry per loaded document, in loader order
 * @throws Error when the download responds with a non-2xx status
 */
async function loadPdfPages(path: string): Promise<PageContent[]> {
  const { url } = await getSignedUrl({ path });

  const response = await fetch(url);

  // Fail fast: otherwise an HTTP error body would be handed to the PDF parser
  if (!response.ok) {
    throw new Error(
      `Failed to download PDF "${path}": ${response.status} ${response.statusText}`,
    );
  }

  const blob = await response.blob();

  const loader = new PDFLoader(blob);
  const documents = await loader.load();

  // PDFLoader returns one Document per page with metadata.loc.pageNumber
  return documents.map((doc, index) => {
    const loc = doc.metadata.loc as { pageNumber?: number } | undefined;
    const content = doc.pageContent;

    return {
      // Fall back to the document's position so pages without location
      // metadata do not all collapse onto page 1
      pageNumber: loc?.pageNumber ?? index + 1,
      content,
      paragraphs: splitIntoParagraphs(content),
    };
  });
}
|
||||
|
||||
/**
 * Parse a PDF document into layout-aware elements.
 *
 * Every paragraph of every page becomes one element carrying its detected
 * type, its character range within the page text, an estimated bounding box
 * and the section title in effect. A heading that yields a title becomes the
 * section title for itself and for all following elements (including those on
 * later pages) until the next such heading.
 *
 * @param path - Storage path to the PDF file
 * @returns Array of layout-aware elements with position metadata
 */
export async function parseDocumentLayout(path: string): Promise<LayoutElement[]> {
  const pages = await loadPdfPages(path);

  const result: LayoutElement[] = [];
  let activeSection: string | null = null;

  for (const { pageNumber, content: pageText, paragraphs } of pages) {
    // Search cursor: each paragraph is looked up after the previous one
    let searchFrom = 0;

    for (const [paragraphIndex, paragraph] of paragraphs.entries()) {
      // Locate the paragraph in the page text; fall back to the cursor
      const found = pageText.indexOf(paragraph, searchFrom);
      const charStart = found === -1 ? searchFrom : found;
      const charEnd = charStart + paragraph.length;
      searchFrom = charEnd;

      const type = detectElementType(paragraph);

      // A heading with a usable title opens a new section (itself included)
      const headingTitle = extractSectionTitle({ type, content: paragraph });
      if (headingTitle) {
        activeSection = headingTitle;
      }

      result.push({
        content: paragraph,
        type,
        pageNumber,
        paragraphIndex,
        charStart,
        charEnd,
        ...estimateBoundingBox(charStart, charEnd, pageText.length),
        sectionTitle: activeSection,
      });
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Bucket elements by their page number.
 *
 * Pages appear in the map in the order they are first encountered, and the
 * elements within each page keep their input order.
 *
 * @param elements - Elements to group
 * @returns Map from page number to the elements on that page
 */
export function groupElementsByPage(
  elements: LayoutElement[],
): Map<number, LayoutElement[]> {
  return elements.reduce((byPage, element) => {
    const bucket = byPage.get(element.pageNumber);

    if (bucket === undefined) {
      byPage.set(element.pageNumber, [element]);
    } else {
      bucket.push(element);
    }

    return byPage;
  }, new Map<number, LayoutElement[]>());
}
|
||||
|
||||
/**
 * Summarize a set of layout elements.
 *
 * @param elements - Elements to count
 * @returns The total element count, a per-type tally (every unit type is
 *   present, starting at 0) and a per-page tally keyed by page number
 */
export function getLayoutStatistics(elements: LayoutElement[]): {
  total: number;
  byType: Record<UnitType, number>;
  byPage: Map<number, number>;
} {
  const byType: Record<UnitType, number> = {
    prose: 0,
    heading: 0,
    list: 0,
    table: 0,
    code: 0,
  };
  const byPage = new Map<number, number>();

  elements.forEach(({ type, pageNumber }) => {
    byType[type] += 1;
    byPage.set(pageNumber, (byPage.get(pageNumber) ?? 0) + 1);
  });

  return { total: elements.length, byType, byPage };
}
|
||||
74
packages/ai/src/modules/pdf/schema.ts
Normal file
74
packages/ai/src/modules/pdf/schema.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import * as z from "zod";
|
||||
|
||||
import { MAX_FILE_SIZE } from "./constants";
|
||||
import { Role } from "./types";
|
||||
|
||||
export const pdfMessageSchema = z.object({
|
||||
id: z.string(),
|
||||
role: z.enum(Role).optional().default(Role.USER),
|
||||
content: z.string().min(1).max(5000),
|
||||
});
|
||||
|
||||
export type PdfMessagePayload = z.infer<typeof pdfMessageSchema>;
|
||||
|
||||
// API input type aliases
|
||||
export type PdfMessageInput = PdfMessagePayload;
|
||||
|
||||
export {
|
||||
selectPdfChatSchema as chatSchema,
|
||||
selectPdfMessageSchema as messageSchema,
|
||||
selectPdfDocumentSchema as pdfSchema,
|
||||
} from "@turbostarter/db/schema/pdf";
|
||||
|
||||
/**
 * Form schema for submitting a remote PDF by URL.
 *
 * The value must be a valid URL whose *path* ends in ".pdf"
 * (case-insensitive). Only the path is inspected, so links carrying a query
 * string or fragment (e.g. signed URLs such as ".../file.pdf?token=...") are
 * accepted.
 */
export const pdfUrlFormSchema = z.object({
  url: z
    .string()
    .url()
    .refine((url) => {
      try {
        return new URL(url).pathname.toLowerCase().endsWith(".pdf");
      } catch {
        // Unparseable input is never a PDF URL; .url() reports the format issue
        return false;
      }
    }),
});
|
||||
|
||||
export type PdfUrlFormPayload = z.infer<typeof pdfUrlFormSchema>;
|
||||
|
||||
interface ValidateOptions {
  /** Use server proxy to avoid CSP/CORS issues on client-side */
  useProxy?: boolean;
}

/**
 * Check that a remote URL points at a PDF of acceptable size.
 *
 * By default the request goes through the storage proxy endpoint (GET with
 * `validate=true`); with `useProxy: false` a HEAD request is sent to the URL
 * directly. Validation is based on the response status and its
 * `content-type` / `content-length` headers.
 *
 * @param url - Remote URL to validate
 * @param options - Validation options (proxy usage)
 * @returns `{ url, size }` on success (size is 0 when the length header is
 *   missing or malformed), otherwise a translation key describing the failure
 */
export const validateRemotePdfUrl = async (
  url: string,
  options: ValidateOptions = {},
) => {
  try {
    const { useProxy = true } = options;

    // Use proxy endpoint to avoid CSP/CORS blocking on client-side
    // The proxy does HEAD internally and validates the PDF content-type
    const fetchUrl = useProxy
      ? `/api/storage/proxy?url=${encodeURIComponent(url)}&validate=true`
      : url;

    const response = await fetch(fetchUrl, {
      method: useProxy ? "GET" : "HEAD",
    });

    if (!response.ok) {
      return "ai:pdf.upload.error.notFound" as const;
    }

    // Media types are case-insensitive, so normalize before comparing
    const contentType = response.headers.get("content-type");
    if (!contentType?.toLowerCase().includes("application/pdf")) {
      return "validation:error.file.type" as const;
    }

    // Parse the length once; a missing or malformed header counts as unknown (0)
    const parsedLength = Number.parseInt(
      response.headers.get("content-length") ?? "",
      10,
    );
    const size =
      Number.isFinite(parsedLength) && parsedLength >= 0 ? parsedLength : 0;

    if (size > MAX_FILE_SIZE) {
      return "validation:error.tooBig.file.notInclusive" as const;
    }

    return { url, size };
  } catch (error) {
    console.error(error);

    return "ai:pdf.upload.error.notFound" as const;
  }
};
|
||||
302
packages/ai/src/modules/pdf/search.ts
Normal file
302
packages/ai/src/modules/pdf/search.ts
Normal file
@@ -0,0 +1,302 @@
|
||||
/**
|
||||
* Search with Citations Module (WF-0028)
|
||||
*
|
||||
* Performs vector similarity search on retrieval chunks and returns
|
||||
* matching results with linked citation units for pixel-perfect highlighting.
|
||||
*/
|
||||
|
||||
import { eq } from "drizzle-orm";
|
||||
|
||||
import { sql } from "@turbostarter/db";
|
||||
import { pdfCitationUnit, pdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
|
||||
import { db } from "@turbostarter/db/server";
|
||||
|
||||
import { generateEmbedding } from "./embeddings";
|
||||
|
||||
// ============================================================================
|
||||
// Types
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Bounding box for pixel-perfect highlighting
|
||||
*/
|
||||
export interface BoundingBox {
|
||||
x: number;
|
||||
y: number;
|
||||
width: number;
|
||||
height: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Citation unit with precise location for highlighting
|
||||
*/
|
||||
export interface CitationUnit {
|
||||
id: string;
|
||||
content: string;
|
||||
pageNumber: number;
|
||||
paragraphIndex: number;
|
||||
charStart: number;
|
||||
charEnd: number;
|
||||
bbox: BoundingBox | null;
|
||||
sectionTitle: string | null;
|
||||
unitType: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Search result with retrieval chunk and linked citation units
|
||||
*/
|
||||
export interface SearchResult {
|
||||
retrievalChunkId: string;
|
||||
content: string;
|
||||
similarity: number;
|
||||
pageStart: number;
|
||||
pageEnd: number;
|
||||
sectionHierarchy: string[];
|
||||
citationUnits: CitationUnit[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Search options
|
||||
*/
|
||||
export interface SearchOptions {
|
||||
/** Maximum number of results to return (default: 5) */
|
||||
limit?: number;
|
||||
/** Minimum similarity threshold (default: 0.1) */
|
||||
threshold?: number;
|
||||
/** Whether to include citation units (default: true) */
|
||||
includeUnits?: boolean;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Helper Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Convert a raw citation-unit row into the public CitationUnit shape.
 *
 * The four flat bbox columns are folded into a single `bbox` object, which is
 * only produced when every coordinate is present (non-null); otherwise `bbox`
 * is null. A missing unit type defaults to "prose".
 *
 * @param row - Citation unit columns as selected from the database
 * @returns The citation unit
 */
function transformCitationUnit(row: {
  id: string;
  content: string;
  pageNumber: number;
  paragraphIndex: number;
  charStart: number;
  charEnd: number;
  bboxX: number | null;
  bboxY: number | null;
  bboxWidth: number | null;
  bboxHeight: number | null;
  sectionTitle: string | null;
  unitType: string | null;
}): CitationUnit {
  // A partial box is useless for highlighting, so it is all-or-nothing
  let bbox: BoundingBox | null = null;
  if (
    row.bboxX != null &&
    row.bboxY != null &&
    row.bboxWidth != null &&
    row.bboxHeight != null
  ) {
    bbox = {
      x: row.bboxX,
      y: row.bboxY,
      width: row.bboxWidth,
      height: row.bboxHeight,
    };
  }

  const { id, content, pageNumber, paragraphIndex, charStart, charEnd, sectionTitle } = row;

  return {
    id,
    content,
    pageNumber,
    paragraphIndex,
    charStart,
    charEnd,
    bbox,
    sectionTitle,
    unitType: row.unitType ?? "prose",
  };
}
|
||||
|
||||
// ============================================================================
|
||||
// Main Search Functions
|
||||
// ============================================================================
|
||||
|
||||
/**
 * Search for relevant content with citation support
 *
 * Embeds the query, ranks the document's retrieval chunks by vector
 * similarity (computed in SQL as `1 - (embedding <=> query)`), and returns the
 * best matches above the threshold. When `includeUnits` is enabled, the
 * citation units linked to each matching chunk are loaded concurrently.
 *
 * @param query - Natural language query to search for
 * @param documentId - Document ID to search within
 * @param options - Search options (limit, threshold, includeUnits)
 * @returns Array of search results with citation units, most similar first
 */
export async function searchWithCitations(
  query: string,
  documentId: string,
  options: SearchOptions = {},
): Promise<SearchResult[]> {
  const { limit = 5, threshold = 0.1, includeUnits = true } = options;

  // Generate embedding for the query
  const queryEmbedding = await generateEmbedding(query);
  const vectorStr = `[${queryEmbedding.join(",")}]`;

  // Perform vector similarity search on retrieval chunks
  const chunkResults = await db.execute<{
    id: string;
    content: string;
    similarity: number;
    page_start: number;
    page_end: number;
    section_hierarchy: string[] | null;
    chunk_type: string | null;
  }>(sql`
    SELECT
      id,
      content,
      1 - (embedding <=> ${vectorStr}::vector) as similarity,
      page_start,
      page_end,
      section_hierarchy,
      chunk_type
    FROM pdf.retrieval_chunk
    WHERE document_id = ${documentId}
      AND embedding IS NOT NULL
      AND 1 - (embedding <=> ${vectorStr}::vector) > ${threshold}
    ORDER BY similarity DESC
    LIMIT ${limit}
  `);

  // Handle result format (db.execute returns array directly)
  const rows = Array.isArray(chunkResults) ? chunkResults : [];

  // Fetch citation units for all chunks in parallel rather than one round
  // trip at a time; Promise.all keeps the results in similarity order.
  return Promise.all(
    rows.map(
      async (row): Promise<SearchResult> => ({
        retrievalChunkId: row.id,
        content: row.content,
        similarity: row.similarity,
        pageStart: row.page_start,
        pageEnd: row.page_end,
        sectionHierarchy: row.section_hierarchy ?? [],
        citationUnits: includeUnits
          ? await getCitationUnitsForChunk(row.id)
          : [],
      }),
    ),
  );
}
|
||||
|
||||
/**
 * Load every citation unit linked to a retrieval chunk.
 *
 * @param chunkId - Retrieval chunk ID
 * @returns The chunk's citation units, ordered by page number and then by
 *   paragraph index
 */
export async function getCitationUnitsForChunk(
  chunkId: string,
): Promise<CitationUnit[]> {
  // Columns required by transformCitationUnit
  const columns = {
    id: pdfCitationUnit.id,
    content: pdfCitationUnit.content,
    pageNumber: pdfCitationUnit.pageNumber,
    paragraphIndex: pdfCitationUnit.paragraphIndex,
    charStart: pdfCitationUnit.charStart,
    charEnd: pdfCitationUnit.charEnd,
    bboxX: pdfCitationUnit.bboxX,
    bboxY: pdfCitationUnit.bboxY,
    bboxWidth: pdfCitationUnit.bboxWidth,
    bboxHeight: pdfCitationUnit.bboxHeight,
    sectionTitle: pdfCitationUnit.sectionTitle,
    unitType: pdfCitationUnit.unitType,
  };

  const linkedUnits = await db
    .select(columns)
    .from(pdfCitationUnit)
    .where(eq(pdfCitationUnit.retrievalChunkId, chunkId))
    .orderBy(pdfCitationUnit.pageNumber, pdfCitationUnit.paragraphIndex);

  return linkedUnits.map((unit) => transformCitationUnit(unit));
}
|
||||
|
||||
/**
 * Look up a single citation unit by its ID.
 *
 * @param unitId - Citation unit ID
 * @returns The citation unit, or null when no row has that ID
 */
export async function getCitationUnitById(
  unitId: string,
): Promise<CitationUnit | null> {
  // Columns required by transformCitationUnit
  const columns = {
    id: pdfCitationUnit.id,
    content: pdfCitationUnit.content,
    pageNumber: pdfCitationUnit.pageNumber,
    paragraphIndex: pdfCitationUnit.paragraphIndex,
    charStart: pdfCitationUnit.charStart,
    charEnd: pdfCitationUnit.charEnd,
    bboxX: pdfCitationUnit.bboxX,
    bboxY: pdfCitationUnit.bboxY,
    bboxWidth: pdfCitationUnit.bboxWidth,
    bboxHeight: pdfCitationUnit.bboxHeight,
    sectionTitle: pdfCitationUnit.sectionTitle,
    unitType: pdfCitationUnit.unitType,
  };

  const [match] = await db
    .select(columns)
    .from(pdfCitationUnit)
    .where(eq(pdfCitationUnit.id, unitId))
    .limit(1);

  return match ? transformCitationUnit(match) : null;
}
|
||||
|
||||
/**
 * Look up a retrieval chunk by ID, without its citation units.
 *
 * A null section hierarchy is returned as an empty array and a null chunk
 * type as "prose".
 *
 * @param chunkId - Retrieval chunk ID
 * @returns Retrieval chunk data, or null when no row has that ID
 */
export async function getRetrievalChunkById(chunkId: string): Promise<{
  id: string;
  content: string;
  pageStart: number;
  pageEnd: number;
  sectionHierarchy: string[];
  chunkType: string;
} | null> {
  const [chunk] = await db
    .select({
      id: pdfRetrievalChunk.id,
      content: pdfRetrievalChunk.content,
      pageStart: pdfRetrievalChunk.pageStart,
      pageEnd: pdfRetrievalChunk.pageEnd,
      sectionHierarchy: pdfRetrievalChunk.sectionHierarchy,
      chunkType: pdfRetrievalChunk.chunkType,
    })
    .from(pdfRetrievalChunk)
    .where(eq(pdfRetrievalChunk.id, chunkId))
    .limit(1);

  if (!chunk) {
    return null;
  }

  const { id, content, pageStart, pageEnd } = chunk;

  return {
    id,
    content,
    pageStart,
    pageEnd,
    sectionHierarchy: chunk.sectionHierarchy ?? [],
    chunkType: chunk.chunkType ?? "prose",
  };
}
|
||||
15
packages/ai/src/modules/pdf/strategies.ts
Normal file
15
packages/ai/src/modules/pdf/strategies.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import { customProvider } from "ai";
|
||||
|
||||
import { cached } from "../../utils/llm";
|
||||
|
||||
/** Model IDs used by the PDF module */
const PDF_LANGUAGE_MODEL_ID = "gpt-4o-mini";
const PDF_EMBEDDING_MODEL_ID = "text-embedding-3-small";

/**
 * Model provider for the PDF module.
 *
 * Exposes the same language model twice, once wrapped in `cached` and once
 * unwrapped, plus the default text embedding model.
 */
export const modelStrategies = customProvider({
  languageModels: {
    default: cached(openai.responses(PDF_LANGUAGE_MODEL_ID)),
    // Uncached for tool-using flows (PDF chat) - tools need fresh execution
    uncached: openai.responses(PDF_LANGUAGE_MODEL_ID),
  },
  textEmbeddingModels: {
    default: openai.textEmbedding(PDF_EMBEDDING_MODEL_ID),
  },
});
|
||||
170
packages/ai/src/modules/pdf/types.ts
Normal file
170
packages/ai/src/modules/pdf/types.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
import { pdfMessageRoleEnum } from "@turbostarter/db/schema/pdf";
|
||||
|
||||
import type { tools } from "./api";
|
||||
import type { EnumToConstant } from "@turbostarter/shared/types";
|
||||
import type { InferUITools, UIDataTypes, UIMessage } from "ai";
|
||||
|
||||
export interface RemoteFile {
|
||||
url: string;
|
||||
size: number;
|
||||
}
|
||||
|
||||
export type {
|
||||
SelectPdfChat as Chat,
|
||||
SelectPdfDocument as Document,
|
||||
SelectPdfMessage as Message,
|
||||
} from "@turbostarter/db/schema/pdf";
|
||||
|
||||
/**
 * Constant-style lookup for PDF message roles.
 *
 * Each enum value is keyed by its upper-cased form with dashes replaced by
 * underscores, and maps to the original enum value.
 */
export const Role = pdfMessageRoleEnum.enumValues.reduce<Record<string, string>>(
  (constants, role) => {
    constants[role.replace(/-/g, "_").toUpperCase()] = role;
    return constants;
  },
  {},
) as EnumToConstant<typeof pdfMessageRoleEnum.enumValues>;
|
||||
|
||||
export type Role = (typeof Role)[keyof typeof Role];
|
||||
|
||||
export type PdfMessage = UIMessage<
|
||||
unknown,
|
||||
UIDataTypes,
|
||||
InferUITools<typeof tools>
|
||||
>;
|
||||
export type PdfMessagePart = PdfMessage["parts"][number];
|
||||
|
||||
// ============================================================================
|
||||
// Citation Types (Interactive PDF Chat)
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Metadata stored with each embedding chunk for citation support
|
||||
*/
|
||||
export interface EmbeddingMetadata {
|
||||
pageNumber: number;
|
||||
charStart?: number;
|
||||
charEnd?: number;
|
||||
sectionTitle?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Citation returned by AI with source reference (legacy [[cite:id:page]] format)
|
||||
*/
|
||||
export interface Citation {
|
||||
/** Citation index displayed as [1], [2], etc. */
|
||||
index: number;
|
||||
/** Reference to pdf.embedding row */
|
||||
embeddingId: string;
|
||||
/** Semantic similarity score 0-1 */
|
||||
relevance: number;
|
||||
/** Page number for quick navigation */
|
||||
pageNumber: number;
|
||||
/** Short preview of the cited content */
|
||||
excerpt: string;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Precise Citation Types (Tool-based highlighting - WF-0032)
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Precise citation from highlightText tool.
|
||||
* LLM calls this tool with exact phrases to highlight in the PDF.
|
||||
*/
|
||||
export interface PreciseCitation {
|
||||
/** Unique ID for this citation */
|
||||
citationId: string;
|
||||
/** Exact text phrase to highlight (from document) */
|
||||
text: string;
|
||||
/** Page number where text appears (1-indexed) */
|
||||
page: number;
|
||||
/** Optional note on why this supports the answer */
|
||||
relevance: string | null;
|
||||
/** When the citation was created */
|
||||
timestamp: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Bounding rectangle for text highlights (DOM-independent for SSR compatibility)
|
||||
*/
|
||||
export interface HighlightRect {
|
||||
x: number;
|
||||
y: number;
|
||||
width: number;
|
||||
height: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Text highlight pending resolution to screen coordinates.
|
||||
* Created when LLM calls highlightText, resolved when page renders.
|
||||
*/
|
||||
export interface TextHighlight {
|
||||
/** Citation ID for reference */
|
||||
id: string;
|
||||
/** Text to find and highlight */
|
||||
text: string;
|
||||
/** Page number (1-indexed) */
|
||||
page: number;
|
||||
/** Computed bounding rects (populated after text search) */
|
||||
rects: HighlightRect[];
|
||||
/** Whether text was found on the page */
|
||||
found: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* AI response with parsed citations
|
||||
*/
|
||||
export interface CitationResponse {
|
||||
/** Message content with [[cite:id:page]] markers replaced with [1], [2] */
|
||||
content: string;
|
||||
/** Parsed citation references */
|
||||
citations: Citation[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Navigation history entry for back/forward
|
||||
*/
|
||||
export interface NavigationEntry {
|
||||
/** Target page number */
|
||||
page: number;
|
||||
/** Optional embedding to highlight */
|
||||
embeddingId?: string;
|
||||
/** Timestamp for ordering */
|
||||
timestamp: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* PDF viewer state exposed via context
|
||||
*/
|
||||
export interface PdfViewerState {
|
||||
/** Currently visible page */
|
||||
currentPage: number;
|
||||
/** Current zoom level (1 = 100%) */
|
||||
zoomLevel: number;
|
||||
/** Scroll position within page */
|
||||
scrollPosition: number;
|
||||
/** Currently highlighted embedding ID */
|
||||
activeHighlight: string | null;
|
||||
/** Navigation history stack */
|
||||
history: NavigationEntry[];
|
||||
/** Current position in history (-1 = not navigating) */
|
||||
historyIndex: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* PDF viewer actions exposed via context
|
||||
*/
|
||||
export interface PdfViewerActions {
|
||||
/** Navigate to a specific page with optional highlight */
|
||||
navigateTo: (options: {
|
||||
page: number;
|
||||
embeddingId?: string;
|
||||
animate?: boolean;
|
||||
}) => void;
|
||||
/** Go back in navigation history */
|
||||
goBack: () => void;
|
||||
/** Go forward in navigation history */
|
||||
goForward: () => void;
|
||||
/** Clear active highlight */
|
||||
clearHighlight: () => void;
|
||||
/** Set current page (from viewer scroll) */
|
||||
setCurrentPage: (page: number) => void;
|
||||
}
|
||||
12
packages/ai/src/modules/pdf/utils.ts
Normal file
12
packages/ai/src/modules/pdf/utils.ts
Normal file
@@ -0,0 +1,12 @@
|
||||
/**
 * Formats a byte count as a human-readable string using 1024-based units.
 *
 * The value is always rendered with two decimals followed by the unit, e.g.
 * `1536` becomes `"1.50 kB"`. Values beyond the largest known unit stay in
 * `TB` (e.g. `"1024.00 TB"`).
 *
 * @param size - Number of bytes. Fractional values below one byte are shown
 *   in `B`; negative values keep their sign; `0` and `NaN` yield `"0 B"`.
 * @returns The formatted size, such as `"0 B"`, `"500.00 B"` or `"2.00 MB"`.
 */
export const formatFileSize = (size: number): string => {
  if (size === 0 || Number.isNaN(size)) return "0 B";

  const units = ["B", "kB", "MB", "GB", "TB"];

  // Pick the unit from the magnitude so negative sizes do not produce NaN,
  // and clamp to the valid index range so sub-byte values (negative
  // logarithm) fall back to "B" instead of indexing units[-1].
  const rawExponent = Math.floor(Math.log(Math.abs(size)) / Math.log(1024));
  const exponent = Math.min(Math.max(rawExponent, 0), units.length - 1);
  const value = size / Math.pow(1024, exponent);

  return `${value.toFixed(2)} ${units[exponent]}`;
};
|
||||
52
packages/ai/src/modules/stt/api.ts
Normal file
52
packages/ai/src/modules/stt/api.ts
Normal file
@@ -0,0 +1,52 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
import type { TranscriptionOptions, TranscriptionResult } from "./types";
|
||||
|
||||
// The OpenAI client is built on first use instead of at import time.
let _openai: OpenAI | null = null;

/** Returns the module-wide OpenAI client, constructing it on the first call. */
const getOpenAI = () => {
  if (_openai == null) {
    _openai = new OpenAI();
  }

  return _openai;
};
|
||||
|
||||
/**
 * Transcribes audio to text with the OpenAI `whisper-1` model.
 *
 * @param audioFile - Audio as a `File`, or a `Blob` which is wrapped in a
 *   `File` named `audio.webm` (keeping the blob's MIME type, or `audio/webm`
 *   when the blob has none).
 * @param options - Optional `language` and `prompt` forwarded to the request.
 * @returns The `text`, `language` and `duration` fields of the verbose JSON
 *   response.
 */
export async function transcribe(
  audioFile: File | Blob,
  options: TranscriptionOptions = {},
): Promise<TranscriptionResult> {
  const client = getOpenAI();

  // The SDK call below is given a File, so wrap bare blobs first.
  let file: File;
  if (audioFile instanceof File) {
    file = audioFile;
  } else {
    const mimeType = audioFile.type || "audio/webm";
    file = new File([audioFile], "audio.webm", { type: mimeType });
  }

  const { language, prompt } = options;
  const transcription = await client.audio.transcriptions.create({
    file,
    model: "whisper-1",
    language,
    prompt,
    response_format: "verbose_json",
  });

  return {
    text: transcription.text,
    language: transcription.language,
    duration: transcription.duration,
  };
}
|
||||
|
||||
/**
 * Downloads audio from a URL and transcribes it via {@link transcribe}.
 *
 * @param audioUrl - Location of the audio resource to fetch.
 * @param options - Optional `language` and `prompt` forwarded to `transcribe`.
 * @returns The transcription of the downloaded audio.
 * @throws Error when the download answers with a non-2xx HTTP status, so an
 *   error page is never submitted for transcription as if it were audio.
 */
export async function transcribeFromUrl(
  audioUrl: string,
  options: TranscriptionOptions = {},
): Promise<TranscriptionResult> {
  const response = await fetch(audioUrl);

  if (!response.ok) {
    throw new Error(
      `Failed to download audio from ${audioUrl}: ${response.status} ${response.statusText}`,
    );
  }

  const blob = await response.blob();
  return transcribe(blob, options);
}
|
||||
3
packages/ai/src/modules/stt/index.ts
Normal file
3
packages/ai/src/modules/stt/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export * from "./api";
|
||||
export * from "./schema";
|
||||
export * from "./types";
|
||||
15
packages/ai/src/modules/stt/schema.ts
Normal file
15
packages/ai/src/modules/stt/schema.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import { z } from "zod";
|
||||
|
||||
export const transcriptionOptionsSchema = z.object({
|
||||
language: z.string().optional(),
|
||||
prompt: z.string().optional(),
|
||||
});
|
||||
|
||||
export const transcriptionResultSchema = z.object({
|
||||
text: z.string(),
|
||||
language: z.string().optional(),
|
||||
duration: z.number().optional(),
|
||||
});
|
||||
|
||||
export type TranscriptionOptionsInput = z.infer<typeof transcriptionOptionsSchema>;
|
||||
export type TranscriptionResultOutput = z.infer<typeof transcriptionResultSchema>;
|
||||
10
packages/ai/src/modules/stt/types.ts
Normal file
10
packages/ai/src/modules/stt/types.ts
Normal file
@@ -0,0 +1,10 @@
|
||||
/** Result of a speech-to-text request. */
export interface TranscriptionResult {
|
||||
/** Transcribed text. */
text: string;
|
||||
/** Language reported for the audio, when available. */
language?: string;
|
||||
/** Duration of the audio, when available. NOTE(review): presumably seconds; confirm against the provider response. */
duration?: number;
|
||||
}
|
||||
|
||||
/** Optional settings for a speech-to-text request. */
export interface TranscriptionOptions {
|
||||
/** Language of the audio. NOTE(review): expected format (e.g. ISO code) is not enforced here; confirm against the provider. */
language?: string;
|
||||
/** Free-form prompt passed along with the audio. */
prompt?: string;
|
||||
}
|
||||
24
packages/ai/src/modules/tts/api.ts
Normal file
24
packages/ai/src/modules/tts/api.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
import { client } from "./client";
|
||||
import { toVoice } from "./utils";
|
||||
|
||||
import type { TtsPayload } from "./schema";
|
||||
|
||||
/**
 * Starts a streaming text-to-speech request for the given payload.
 *
 * @param payload - Text to speak plus the model and voice settings to use.
 * @returns Whatever `client.textToSpeech.stream` resolves to.
 */
export const textToSpeech = async (payload: TtsPayload) => {
  const { text, options } = payload;
  const { id: voiceId, stability, similarity, boost, speed } = options.voice;

  // Translate our voice option names to the ones the SDK call uses.
  const voiceSettings = {
    stability,
    similarityBoost: similarity,
    useSpeakerBoost: boost,
    speed,
  };

  return client.textToSpeech.stream(voiceId, {
    modelId: options.model,
    text,
    voiceSettings,
  });
};
|
||||
|
||||
/** Fetches all voices from the TTS client and maps each one with `toVoice`. */
export const getVoices = async () => {
  const response = await client.voices.getAll();

  return response.voices.map((voice) => toVoice(voice));
};
|
||||
25
packages/ai/src/modules/tts/client.ts
Normal file
25
packages/ai/src/modules/tts/client.ts
Normal file
@@ -0,0 +1,25 @@
|
||||
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
|
||||
|
||||
import { env } from "../../env";
|
||||
|
||||
// Built on first use so that importing this module never throws.
let _client: ElevenLabsClient | null = null;

/**
 * Returns the shared ElevenLabs client, creating it on the first call.
 *
 * @throws Error when `ELEVENLABS_API_KEY` is not configured.
 */
export const getClient = () => {
  if (_client) {
    return _client;
  }

  const apiKey = env.ELEVENLABS_API_KEY;
  if (!apiKey) {
    throw new Error("ELEVENLABS_API_KEY is required for TTS");
  }

  _client = new ElevenLabsClient({ apiKey });
  return _client;
};
|
||||
|
||||
// Backward-compatible facade: each property access resolves the lazily
// created client, so it throws only when used without an API key.
export const client = {
  /** Text-to-speech API of the underlying client. */
  get textToSpeech() {
    const { textToSpeech } = getClient();
    return textToSpeech;
  },
  /** Voices API of the underlying client. */
  get voices() {
    const { voices } = getClient();
    return voices;
  },
};
|
||||
36
packages/ai/src/modules/tts/constants.ts
Normal file
36
packages/ai/src/modules/tts/constants.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import { Provider } from "../../types";
|
||||
|
||||
import { Model } from "./types";
|
||||
|
||||
export const MODELS = [
|
||||
// {
|
||||
// id: Model.ELEVEN_3,
|
||||
// provider: Provider.ELEVEN_LABS,
|
||||
// name: "Eleven 3",
|
||||
// },
|
||||
{
|
||||
id: Model.ELEVEN_MULTILINGUAL_V2,
|
||||
provider: Provider.ELEVEN_LABS,
|
||||
name: "Eleven Multilingual v2",
|
||||
},
|
||||
{
|
||||
id: Model.ELEVEN_FLASH_V2_5,
|
||||
provider: Provider.ELEVEN_LABS,
|
||||
name: "Eleven Flash v2.5",
|
||||
},
|
||||
{
|
||||
id: Model.ELEVEN_FLASH_V2,
|
||||
provider: Provider.ELEVEN_LABS,
|
||||
name: "Eleven Flash v2",
|
||||
},
|
||||
{
|
||||
id: Model.ELEVEN_TURBO_V2_5,
|
||||
provider: Provider.ELEVEN_LABS,
|
||||
name: "Eleven Turbo v2.5",
|
||||
},
|
||||
{
|
||||
id: Model.ELEVEN_TURBO_V2,
|
||||
provider: Provider.ELEVEN_LABS,
|
||||
name: "Eleven Turbo v2",
|
||||
},
|
||||
] as const;
|
||||
23
packages/ai/src/modules/tts/schema.ts
Normal file
23
packages/ai/src/modules/tts/schema.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import * as z from "zod";
|
||||
|
||||
export const ttsOptionsSchema = z.object({
|
||||
model: z.string(),
|
||||
voice: z.object({
|
||||
id: z.string(),
|
||||
stability: z.number().min(0).max(1).default(0.5).optional(),
|
||||
speed: z.number().min(0.7).max(1.2).default(1).optional(),
|
||||
similarity: z.number().min(0).max(1).default(0.5).optional(),
|
||||
boost: z.boolean().default(false).optional(),
|
||||
}),
|
||||
});
|
||||
|
||||
export const ttsSchema = z.object({
|
||||
text: z.string().min(1).max(5000),
|
||||
options: ttsOptionsSchema,
|
||||
});
|
||||
|
||||
export type TtsOptionsPayload = z.infer<typeof ttsOptionsSchema>;
|
||||
export type TtsPayload = z.infer<typeof ttsSchema>;
|
||||
|
||||
// API input type aliases
|
||||
export type TtsInput = TtsPayload;
|
||||
24
packages/ai/src/modules/tts/types.ts
Normal file
24
packages/ai/src/modules/tts/types.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
/** Provider-independent description of a text-to-speech voice. */
export interface Voice {
|
||||
/** Identifier of the voice. */
id: string;
|
||||
/** Display name of the voice. */
name: string;
|
||||
/** Optional free-form description. */
description?: string;
|
||||
/** Optional category label. */
category?: string;
|
||||
/** Additional descriptive strings about the voice. */
details: string[];
|
||||
/** Creation time as a string. NOTE(review): presumably an ISO 8601 timestamp; confirm against the mapper that builds this value. */
createdAt: string;
|
||||
/** Usage counters for the voice. */
usage: {
|
||||
/** NOTE(review): meaning not visible here; presumably how often the voice was cloned. */
cloned: number;
|
||||
/** NOTE(review): meaning not visible here; presumably a count of generated characters. */
character: number;
|
||||
};
|
||||
/** Optional URL of an audio preview. */
previewUrl?: string;
|
||||
}
|
||||
|
||||
export const Model = {
|
||||
ELEVEN_3: "eleven_v3",
|
||||
ELEVEN_MULTILINGUAL_V2: "eleven_multilingual_v2",
|
||||
ELEVEN_FLASH_V2_5: "eleven_flash_v2_5",
|
||||
ELEVEN_FLASH_V2: "eleven_flash_v2",
|
||||
ELEVEN_TURBO_V2_5: "eleven_turbo_v2_5",
|
||||
ELEVEN_TURBO_V2: "eleven_turbo_v2",
|
||||
} as const;
|
||||
|
||||
export type Model = (typeof Model)[keyof typeof Model];
|
||||
22
packages/ai/src/modules/tts/utils.ts
Normal file
22
packages/ai/src/modules/tts/utils.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
import { random } from "@turbostarter/shared/utils";
|
||||
|
||||
import type { Voice } from "./types";
|
||||
import type { ElevenLabs } from "@elevenlabs/elevenlabs-js";
|
||||
|
||||
/**
 * Maps an ElevenLabs voice onto the package's own `Voice` shape.
 *
 * - `name` falls back to an empty string.
 * - `details` are the truthy values of the voice's `labels`.
 * - `createdAt` is an ISO string built from `createdAtUnix` (seconds), or the
 *   current time when that field is missing or zero.
 * - `usage` is filled by `random(...)` rather than read from the voice.
 */
export const toVoice = (voice: ElevenLabs.Voice): Voice => {
  const {
    voiceId,
    name,
    description,
    category,
    labels,
    createdAtUnix,
    previewUrl,
  } = voice;

  const details = Object.values(labels ?? {}).filter(Boolean);
  const createdOn = createdAtUnix ? new Date(createdAtUnix * 1000) : new Date();

  return {
    id: voiceId,
    name: name ?? "",
    description,
    category,
    details,
    createdAt: createdOn.toISOString(),
    usage: {
      cloned: random(25000, 1000000),
      character: random(100000, 10000000),
    },
    previewUrl,
  };
};
|
||||
15
packages/ai/src/types/index.ts
Normal file
15
packages/ai/src/types/index.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
export const Provider = {
|
||||
OPENAI: "openai",
|
||||
CLAUDE: "claude",
|
||||
GEMINI: "gemini",
|
||||
GROK: "grok",
|
||||
DEEPSEEK: "deepseek",
|
||||
REPLICATE: "replicate",
|
||||
LUMA: "luma",
|
||||
STABILITY_AI: "stability-ai",
|
||||
RECRAFT: "recraft",
|
||||
ELEVEN_LABS: "eleven-labs",
|
||||
NVIDIA: "nvidia",
|
||||
} as const;
|
||||
|
||||
export type Provider = (typeof Provider)[keyof typeof Provider];
|
||||
11
packages/ai/src/utils/common.ts
Normal file
11
packages/ai/src/utils/common.ts
Normal file
@@ -0,0 +1,11 @@
|
||||
import type { UIMessage } from "ai";
|
||||
|
||||
/**
 * Concatenates the text of every `text` part of a message.
 *
 * @param message - Message to read; may be omitted.
 * @returns The joined text with surrounding whitespace trimmed, or an empty
 *   string when no message is given.
 */
export const getMessageTextContent = <T extends UIMessage>(message?: T) => {
  if (message == null) {
    return "";
  }

  const textParts = message.parts.filter((part) => part.type === "text");
  const combined = textParts.map((part) => part.text).join("");

  return combined.trim();
};
|
||||
208
packages/ai/src/utils/llm.ts
Normal file
208
packages/ai/src/utils/llm.ts
Normal file
@@ -0,0 +1,208 @@
|
||||
import { openai } from "@ai-sdk/openai";
|
||||
import {
|
||||
generateObject,
|
||||
NoSuchToolError,
|
||||
simulateReadableStream,
|
||||
wrapLanguageModel,
|
||||
} from "ai";
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
|
||||
import { NodeEnv } from "@turbostarter/shared/constants";
|
||||
|
||||
import { env } from "../env";
|
||||
|
||||
import type {
|
||||
LanguageModelV2,
|
||||
LanguageModelV2Middleware,
|
||||
LanguageModelV2Prompt,
|
||||
LanguageModelV2StreamPart,
|
||||
} from "@ai-sdk/provider";
|
||||
import type { ToolCallRepairFunction, ToolSet } from "ai";
|
||||
|
||||
const CACHE_FILE = path.join(process.cwd(), ".cache/ai.json");
|
||||
|
||||
/**
 * Wraps a model with the file-backed cache middleware in development.
 *
 * @param model - Language model to wrap.
 * @returns The wrapped model when `NODE_ENV` is development, otherwise the
 *   model unchanged.
 */
export const cached = (model: LanguageModelV2) => {
  if (env.NODE_ENV !== NodeEnv.DEVELOPMENT) {
    return model;
  }

  return wrapLanguageModel({
    middleware: cacheMiddleware,
    model,
  });
};
|
||||
|
||||
/** Creates the cache directory and an empty JSON cache file when missing. */
const ensureCacheFile = () => {
  const directory = path.dirname(CACHE_FILE);

  const directoryMissing = !fs.existsSync(directory);
  if (directoryMissing) {
    fs.mkdirSync(directory, { recursive: true });
  }

  const fileMissing = !fs.existsSync(CACHE_FILE);
  if (fileMissing) {
    // Seed with an empty object so later reads always find valid JSON.
    fs.writeFileSync(CACHE_FILE, "{}");
  }
};
|
||||
|
||||
/**
 * Looks up an entry in the JSON cache file.
 *
 * @param key - Cache key; objects are serialised with `JSON.stringify`.
 * @returns The stored value, or `null` when the entry is absent or the cache
 *   file cannot be read or parsed.
 */
const getCachedResult = (key: string | object) => {
  ensureCacheFile();

  const lookupKey = typeof key !== "object" ? key : JSON.stringify(key);

  try {
    const raw = fs.readFileSync(CACHE_FILE, "utf-8");
    const entries = JSON.parse(raw) as Record<string, unknown>;

    return entries[lookupKey] ?? null;
  } catch {
    // An unreadable cache is treated as a miss.
    return null;
  }
};
|
||||
|
||||
/**
 * Stores a value in the JSON cache file under the given key.
 *
 * The whole file is read, merged with the new entry and written back
 * pretty-printed. If the existing file cannot be read or does not contain a
 * JSON object, it is replaced by a cache holding only the new entry rather
 * than throwing.
 *
 * @param key - Cache key to write.
 * @param value - JSON-serialisable value to store.
 */
const updateCache = (key: string, value: unknown) => {
  ensureCacheFile();

  let cache: Record<string, unknown> = {};
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(CACHE_FILE, "utf-8"));
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      cache = parsed as Record<string, unknown>;
    }
  } catch {
    // Corrupt or unreadable cache: start over with an empty one.
  }

  const updatedCache = { ...cache, [key]: value };
  fs.writeFileSync(CACHE_FILE, JSON.stringify(updatedCache, null, 2));
};
|
||||
|
||||
/**
 * Normalises a prompt so it can serve as a stable cache key.
 *
 * - Assistant messages are replaced by their content parts, with every
 *   `tool-call` part's `toolCallId` set to `"cached"`.
 * - Tool messages are replaced by their content parts, each with
 *   `toolCallId: "cached"` and `result: {}`.
 * - All other messages are returned unchanged.
 */
const cleanPrompt = (prompt: LanguageModelV2Prompt) => {
  return prompt.map((message) => {
    switch (message.role) {
      case "assistant":
        return message.content.map((part) => {
          if (part.type !== "tool-call") {
            return part;
          }
          return { ...part, toolCallId: "cached" };
        });
      case "tool":
        return message.content.map((toolPart) => ({
          ...toolPart,
          toolCallId: "cached",
          result: {},
        }));
      default:
        return message;
    }
  });
};
|
||||
|
||||
/**
 * Language-model middleware that replays results from the JSON cache file.
 *
 * Cache keys are the JSON form of the cleaned prompt plus a `_function` tag
 * (`"generate"` or `"stream"`). Timestamps come back from JSON as strings,
 * so they are turned into `Date` objects again on a cache hit.
 */
export const cacheMiddleware: LanguageModelV2Middleware = {
  wrapGenerate: async ({ doGenerate, params }) => {
    const cacheKey = JSON.stringify({
      ...cleanPrompt(params.prompt),
      _function: "generate",
    });

    const hit = getCachedResult(cacheKey) as Awaited<
      ReturnType<LanguageModelV2["doGenerate"]>
    > | null;

    if (!hit) {
      console.log("🔍 Cache MISS");
      const fresh = await doGenerate();

      updateCache(cacheKey, fresh);

      return fresh;
    }

    console.log("🎯 Cache HIT");
    const storedTimestamp = hit.response?.timestamp;

    return {
      ...hit,
      response: {
        ...hit.response,
        timestamp: storedTimestamp ? new Date(storedTimestamp) : undefined,
      },
    };
  },
  wrapStream: async ({ doStream, params }) => {
    const cacheKey = JSON.stringify({
      ...cleanPrompt(params.prompt),
      _function: "stream",
    });

    const hit = getCachedResult(cacheKey);

    // Cache hit: replay the recorded chunks through a simulated stream.
    if (hit) {
      console.log("🎯 Cache HIT");
      const chunks = (hit as LanguageModelV2StreamPart[]).map((part) =>
        part.type === "response-metadata" && part.timestamp
          ? { ...part, timestamp: new Date(part.timestamp) }
          : part,
      );

      return {
        stream: simulateReadableStream({
          initialDelayInMs: 0,
          chunkDelayInMs: 10,
          chunks,
        }),
        rawCall: { rawPrompt: null, rawSettings: {} },
      };
    }

    console.log("🔍 Cache MISS");
    const { stream, ...rest } = await doStream();

    // Record every chunk while passing it through untouched.
    const recorded: LanguageModelV2StreamPart[] = [];
    const recorder = new TransformStream<
      LanguageModelV2StreamPart,
      LanguageModelV2StreamPart
    >({
      transform(chunk, controller) {
        recorded.push(chunk);
        controller.enqueue(chunk);
      },
      flush() {
        // The stream has ended, so persist everything that was seen.
        updateCache(cacheKey, recorded);
      },
    });

    return {
      stream: stream.pipeThrough(recorder),
      ...rest,
    };
  },
};
|
||||
|
||||
/**
 * Attempts to repair a tool call whose arguments failed validation.
 *
 * The invalid arguments and the tool's schema are sent to `gpt-4o`, which is
 * asked to produce arguments matching the tool's `inputSchema`.
 *
 * @returns `null` when the tool does not exist or has no input schema;
 *   otherwise a copy of the tool call whose `input` holds the repaired
 *   arguments as a JSON string.
 */
export const repairToolCall: ToolCallRepairFunction<ToolSet> = async ({
  toolCall,
  tools,
  inputSchema,
  error,
}) => {
  // A call to an unknown tool cannot be fixed by rewriting its arguments.
  if (NoSuchToolError.isInstance(error)) {
    return null;
  }

  const tool = tools[toolCall.toolName];

  if (!tool?.inputSchema) {
    return null;
  }

  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
  const { object: repairedArgs } = await generateObject({
    model: openai.responses("gpt-4o"),
    schema: tool.inputSchema,
    prompt: [
      `The model tried to call the tool "${toolCall.toolName}"` +
        ` with the following arguments:`,
      JSON.stringify(toolCall.input),
      `The tool accepts the following schema:`,
      JSON.stringify(inputSchema(toolCall)),
      "Please fix the arguments.",
      `Today's date is ${new Date().toLocaleDateString("en-US", { year: "numeric", month: "long", day: "numeric" })}`,
    ].join("\n"),
  });

  // The tool call carries its arguments in `input` (read above), so the
  // repaired payload must replace that field to take effect.
  return { ...toolCall, input: JSON.stringify(repairedArgs) };
};
|
||||
6
packages/ai/tsconfig.json
Normal file
6
packages/ai/tsconfig.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"extends": "@turbostarter/tsconfig/internal.json",
|
||||
"compilerOptions": {},
|
||||
"include": ["*.ts", "src/**/*"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
3
packages/ai/vitest.config.ts
Normal file
3
packages/ai/vitest.config.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
import baseConfig from "@turbostarter/vitest-config/base";
|
||||
|
||||
export default baseConfig;
|
||||
4
packages/analytics/mobile/eslint.config.js
Normal file
4
packages/analytics/mobile/eslint.config.js
Normal file
@@ -0,0 +1,4 @@
|
||||
import baseConfig from "@turbostarter/eslint-config/base";
|
||||
import reactConfig from "@turbostarter/eslint-config/react";
|
||||
|
||||
export default [...baseConfig, ...reactConfig];
|
||||
34
packages/analytics/mobile/package.json
Normal file
34
packages/analytics/mobile/package.json
Normal file
@@ -0,0 +1,34 @@
|
||||
{
|
||||
"name": "@turbostarter/analytics-mobile",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"type": "module",
|
||||
"exports": {
|
||||
".": "./src/index.ts",
|
||||
"./env": "./src/env.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "git clean -xdf .cache .turbo dist node_modules",
|
||||
"format": "prettier --check . --ignore-path ../../../.gitignore",
|
||||
"lint": "eslint",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@turbostarter/eslint-config": "workspace:*",
|
||||
"@turbostarter/prettier-config": "workspace:*",
|
||||
"@turbostarter/tsconfig": "workspace:*",
|
||||
"eslint": "catalog:",
|
||||
"prettier": "catalog:",
|
||||
"typescript": "catalog:"
|
||||
},
|
||||
"prettier": "@turbostarter/prettier-config",
|
||||
"dependencies": {
|
||||
"@turbostarter/analytics": "workspace:*",
|
||||
"@turbostarter/shared": "workspace:*",
|
||||
"envin": "catalog:",
|
||||
"mixpanel-react-native": "3.1.2",
|
||||
"posthog-react-native": "4.14.3",
|
||||
"react-native": "catalog:",
|
||||
"zod": "catalog:"
|
||||
}
|
||||
}
|
||||
1
packages/analytics/mobile/src/env.ts
Normal file
1
packages/analytics/mobile/src/env.ts
Normal file
@@ -0,0 +1 @@
|
||||
export { env, preset } from "./providers";
|
||||
1
packages/analytics/mobile/src/hooks/index.ts
Normal file
1
packages/analytics/mobile/src/hooks/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./use-tracking-permissions";
|
||||
@@ -0,0 +1,32 @@
|
||||
import { requestTrackingPermissionsAsync } from "expo-tracking-transparency";
|
||||
import { useEffect, useState, useCallback } from "react";
|
||||
import { AppState } from "react-native";
|
||||
|
||||
/**
 * React hook reporting whether tracking permission has been granted.
 *
 * Permission is requested on mount and again every time the app state
 * changes to `"active"`.
 *
 * @returns `true` once permission is granted, `false` otherwise.
 */
export const useTrackingPermissions = () => {
  const [granted, setGranted] = useState(false);

  const checkPermission = useCallback(async () => {
    const result = await requestTrackingPermissionsAsync();
    setGranted(result.granted);
  }, []);

  // Initial check.
  useEffect(() => {
    void checkPermission();
  }, [checkPermission]);

  // Re-check whenever the app returns to the foreground.
  useEffect(() => {
    const subscription = AppState.addEventListener("change", (status) => {
      if (status === "active") {
        void checkPermission();
      }
    });

    return () => subscription.remove();
  }, [checkPermission]);

  return granted;
};
|
||||
3
packages/analytics/mobile/src/index.ts
Normal file
3
packages/analytics/mobile/src/index.ts
Normal file
@@ -0,0 +1,3 @@
|
||||
export { Provider, track, identify, reset } from "./providers";
|
||||
|
||||
export * from "./hooks";
|
||||
@@ -0,0 +1,16 @@
|
||||
import { defineEnv } from "envin";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
export const preset = {
|
||||
id: "google-analytics",
|
||||
clientPrefix: "EXPO_PUBLIC_",
|
||||
client: {},
|
||||
} as const satisfies Preset;
|
||||
|
||||
export const env = defineEnv({
|
||||
...envConfig,
|
||||
...preset,
|
||||
});
|
||||
@@ -0,0 +1,69 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
||||
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
|
||||
// @ts-nocheck
|
||||
import analytics from "@react-native-firebase/analytics";
|
||||
import { useGlobalSearchParams, usePathname } from "expo-router";
|
||||
import { useEffect } from "react";
|
||||
|
||||
import { useTrackingPermissions } from "../../hooks";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/** Enables analytics collection and sets every consent flag to `true`. */
const setup = async () => {
  const fullConsent = {
    analytics_storage: true,
    ad_storage: true,
    ad_user_data: true,
    ad_personalization: true,
  };

  await analytics().setAnalyticsCollectionEnabled(true);
  await analytics().setConsent(fullConsent);
};
|
||||
|
||||
/**
 * Hook that wires analytics to tracking permission and navigation.
 *
 * Once permission is granted it runs `setup()`, and it logs a screen view
 * whenever the pathname, search params or permission change.
 */
const useSetup = () => {
  const granted = useTrackingPermissions();
  const pathname = usePathname();
  const params = useGlobalSearchParams();

  useEffect(() => {
    if (granted) {
      void setup();
    }
  }, [granted]);

  useEffect(() => {
    if (granted) {
      void analytics().logScreenView({
        screen_name: pathname,
        screen_class: pathname,
        params,
      });
    }
  }, [pathname, params, granted]);
};
|
||||
|
||||
// Analytics strategy backed by the `analytics()` instance imported above.
export const { Provider, track, identify, reset } = {
  // Runs the setup hook and renders children unchanged.
  Provider: ({ children }) => {
    useSetup();

    return children;
  },
  // Logs a named event with optional parameters.
  track: (name, params) => {
    void analytics().logEvent(name, params);
  },
  // Associates subsequent events with a user and, when given, their traits.
  identify: (userId, traits) => {
    void analytics().setUserId(userId);

    if (!traits) {
      return;
    }

    void analytics().setUserProperties(traits);
  },
  // Clears the user id and sends an empty set of user properties.
  reset: () => {
    void analytics().setUserId(null);
    void analytics().setUserProperties({});
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
2
packages/analytics/mobile/src/providers/index.ts
Normal file
2
packages/analytics/mobile/src/providers/index.ts
Normal file
@@ -0,0 +1,2 @@
|
||||
export * from "./mixpanel";
|
||||
export * from "./mixpanel/env";
|
||||
24
packages/analytics/mobile/src/providers/mixpanel/env.ts
Normal file
24
packages/analytics/mobile/src/providers/mixpanel/env.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
export const preset = {
|
||||
id: "mixpanel",
|
||||
clientPrefix: "EXPO_PUBLIC_",
|
||||
client: {
|
||||
EXPO_PUBLIC_MIXPANEL_TOKEN: z.string(),
|
||||
},
|
||||
} as const satisfies Preset;
|
||||
|
||||
export const env = defineEnv({
|
||||
...envConfig,
|
||||
...preset,
|
||||
env: {
|
||||
EXPO_PUBLIC_MIXPANEL_TOKEN: process.env.EXPO_PUBLIC_MIXPANEL_TOKEN,
|
||||
},
|
||||
});
|
||||
47
packages/analytics/mobile/src/providers/mixpanel/index.tsx
Normal file
47
packages/analytics/mobile/src/providers/mixpanel/index.tsx
Normal file
@@ -0,0 +1,47 @@
|
||||
import { Mixpanel } from "mixpanel-react-native";
|
||||
import { useEffect } from "react";
|
||||
|
||||
import { useTrackingPermissions } from "../../hooks";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
// Automatic event tracking is off and the instance starts opted out; the
// provider below opts in once tracking permission has been granted.
const trackAutomaticEvents = false;
const optOutTracking = true;

const mixpanel = new Mixpanel(
  env.EXPO_PUBLIC_MIXPANEL_TOKEN,
  trackAutomaticEvents,
  optOutTracking,
);

// Fire-and-forget initialisation at module load.
void mixpanel.init();
|
||||
|
||||
// Analytics strategy backed by the module-level Mixpanel instance.
export const { Provider, track, identify, reset } = {
  // Opts the user in to tracking once permission is granted while the
  // instance is still opted out.
  Provider: ({ children }) => {
    const granted = useTrackingPermissions();

    useEffect(() => {
      const syncOptIn = async () => {
        const optedOut = await mixpanel.hasOptedOutTracking();
        if (!granted || !optedOut) {
          return;
        }
        void mixpanel.optInTracking();
      };

      void syncOptIn();
    }, [granted]);

    return <>{children}</>;
  },
  // Records a named event with optional properties.
  track: (name, params) => {
    mixpanel.track(name, params);
  },
  // Identifies the user and, when given, stores traits on their profile.
  identify: (userId, traits) => {
    void mixpanel.identify(userId);
    if (!traits) {
      return;
    }
    void mixpanel.getPeople().set(traits);
  },
  // Resets the Mixpanel instance state.
  reset: () => {
    mixpanel.reset();
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
29
packages/analytics/mobile/src/providers/posthog/env.ts
Normal file
29
packages/analytics/mobile/src/providers/posthog/env.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
export const preset = {
|
||||
id: "posthog",
|
||||
clientPrefix: "EXPO_PUBLIC_",
|
||||
client: {
|
||||
EXPO_PUBLIC_POSTHOG_KEY: z.string(),
|
||||
EXPO_PUBLIC_POSTHOG_HOST: z
|
||||
.string()
|
||||
.optional()
|
||||
.default("https://us.i.posthog.com"),
|
||||
},
|
||||
} as const satisfies Preset;
|
||||
|
||||
export const env = defineEnv({
|
||||
...envConfig,
|
||||
...preset,
|
||||
env: {
|
||||
EXPO_PUBLIC_POSTHOG_KEY: process.env.EXPO_PUBLIC_POSTHOG_KEY,
|
||||
EXPO_PUBLIC_POSTHOG_HOST: process.env.EXPO_PUBLIC_POSTHOG_HOST,
|
||||
},
|
||||
});
|
||||
73
packages/analytics/mobile/src/providers/posthog/index.tsx
Normal file
73
packages/analytics/mobile/src/providers/posthog/index.tsx
Normal file
@@ -0,0 +1,73 @@
|
||||
import PostHog, { PostHogProvider } from "posthog-react-native";
|
||||
import { useEffect } from "react";
|
||||
|
||||
import { useTrackingPermissions } from "../../hooks";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
let client: PostHog | null = null;

/**
 * Returns the shared PostHog client, creating it on first use with the
 * configured key and host and with opt-in disabled by default.
 */
const getClient = () => {
  client ??= new PostHog(env.EXPO_PUBLIC_POSTHOG_KEY, {
    host: env.EXPO_PUBLIC_POSTHOG_HOST,
    defaultOptIn: false,
  });

  return client;
};
|
||||
|
||||
/** Wraps children in a `PostHogProvider` bound to the shared client. */
const Wrapper = ({ children }: { children: React.ReactNode }) => (
  <PostHogProvider client={getClient()} autocapture>
    {children}
  </PostHogProvider>
);
|
||||
|
||||
/**
 * Renderless component that opts the PostHog client in or out whenever the
 * tracking permission changes.
 */
const Setup = () => {
  const posthog = getClient();
  const granted = useTrackingPermissions();

  useEffect(() => {
    void (granted ? posthog.optIn() : posthog.optOut());
  }, [granted, posthog]);

  return null;
};
|
||||
|
||||
/** Provider that mounts the permission sync next to the wrapped children. */
const ProviderComponent = ({ children }: { children: React.ReactNode }) => (
  <Wrapper>
    <Setup />
    {children}
  </Wrapper>
);
|
||||
|
||||
// Analytics strategy backed by the lazily created PostHog client.
export const { Provider, track, identify, reset } = {
  Provider: ProviderComponent,
  // Captures a named event with optional properties.
  track: (name, params) => {
    getClient().capture(name, params);
  },
  // Identifies the current user with optional traits.
  identify: (userId, traits) => {
    getClient().identify(userId, traits);
  },
  // Resets the client state.
  reset: () => {
    getClient().reset();
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
9
packages/analytics/mobile/tsconfig.json
Normal file
9
packages/analytics/mobile/tsconfig.json
Normal file
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"extends": "@turbostarter/tsconfig/internal.json",
|
||||
"compilerOptions": {
|
||||
"lib": ["dom"],
|
||||
"jsx": "preserve"
|
||||
},
|
||||
"include": ["*.ts", "src/**/*"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
3
packages/analytics/shared/eslint.config.js
Normal file
3
packages/analytics/shared/eslint.config.js
Normal file
@@ -0,0 +1,3 @@
|
||||
import baseConfig from "@turbostarter/eslint-config/base";
|
||||
|
||||
export default baseConfig;
|
||||
24
packages/analytics/shared/package.json
Normal file
24
packages/analytics/shared/package.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"name": "@turbostarter/analytics",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"exports": {
|
||||
".": "./src/index.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "git clean -xdf .cache .turbo dist node_modules",
|
||||
"format": "prettier --check . --ignore-path ../../.gitignore",
|
||||
"lint": "eslint",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"prettier": "@turbostarter/prettier-config",
|
||||
"devDependencies": {
|
||||
"@turbostarter/eslint-config": "workspace:*",
|
||||
"@turbostarter/prettier-config": "workspace:*",
|
||||
"@turbostarter/tsconfig": "workspace:*",
|
||||
"eslint": "catalog:",
|
||||
"prettier": "catalog:",
|
||||
"typescript": "catalog:"
|
||||
}
|
||||
}
|
||||
1
packages/analytics/shared/src/index.ts
Normal file
1
packages/analytics/shared/src/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./types";
|
||||
22
packages/analytics/shared/src/types.ts
Normal file
22
packages/analytics/shared/src/types.ts
Normal file
@@ -0,0 +1,22 @@
|
||||
export type AllowedPropertyValues = string | number | boolean;
|
||||
|
||||
type TrackFunction = (
|
||||
event: string,
|
||||
data?: Record<string, AllowedPropertyValues>,
|
||||
) => void;
|
||||
|
||||
type IdentifyFunction = (
|
||||
userId: string,
|
||||
traits?: Record<string, AllowedPropertyValues>,
|
||||
) => void;
|
||||
|
||||
/** Contract every client-side analytics provider implementation must satisfy. */
export interface AnalyticsProviderClientStrategy {
|
||||
/** Component that receives the app's children and returns what to render. */
Provider: ({ children }: { children: React.ReactNode }) => React.ReactNode;
|
||||
/** Records a named event with optional primitive-valued data. */
track: TrackFunction;
|
||||
/** Associates activity with a user id and optional primitive-valued traits. */
identify: IdentifyFunction;
|
||||
/** Takes no arguments and returns nothing; NOTE(review): presumably clears the identified user; confirm per provider. */
reset: () => void;
|
||||
}
|
||||
|
||||
/** Contract for server-side analytics providers; only event tracking is required. */
export interface AnalyticsProviderServerStrategy {
|
||||
/** Records a named event with optional primitive-valued data. */
track: TrackFunction;
|
||||
}
|
||||
6
packages/analytics/shared/tsconfig.json
Normal file
6
packages/analytics/shared/tsconfig.json
Normal file
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"extends": "@turbostarter/tsconfig/internal.json",
|
||||
"compilerOptions": {},
|
||||
"include": ["*.ts", "src/**/*"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
||||
4
packages/analytics/web/eslint.config.js
Normal file
4
packages/analytics/web/eslint.config.js
Normal file
@@ -0,0 +1,4 @@
|
||||
import baseConfig from "@turbostarter/eslint-config/base";
|
||||
import reactConfig from "@turbostarter/eslint-config/react";
|
||||
|
||||
export default [...baseConfig, ...reactConfig];
|
||||
39
packages/analytics/web/package.json
Normal file
39
packages/analytics/web/package.json
Normal file
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"name": "@turbostarter/analytics-web",
|
||||
"private": true,
|
||||
"version": "0.1.0",
|
||||
"type": "module",
|
||||
"exports": {
|
||||
".": "./src/index.tsx",
|
||||
"./env": "./src/env.ts",
|
||||
"./server": "./src/server.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"clean": "git clean -xdf .cache .turbo dist node_modules",
|
||||
"format": "prettier --check . --ignore-path ../../../.gitignore",
|
||||
"lint": "eslint",
|
||||
"typecheck": "tsc --noEmit"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@turbostarter/eslint-config": "workspace:*",
|
||||
"@turbostarter/prettier-config": "workspace:*",
|
||||
"@turbostarter/tsconfig": "workspace:*",
|
||||
"eslint": "catalog:",
|
||||
"prettier": "catalog:",
|
||||
"typescript": "catalog:"
|
||||
},
|
||||
"prettier": "@turbostarter/prettier-config",
|
||||
"dependencies": {
|
||||
"@openpanel/nextjs": "1.0.9",
|
||||
"@turbostarter/analytics": "workspace:*",
|
||||
"@turbostarter/shared": "workspace:*",
|
||||
"@vemetric/node": "0.2.0",
|
||||
"@vemetric/react": "0.6.1",
|
||||
"@vercel/analytics": "1.5.0",
|
||||
"mixpanel": "0.18.1",
|
||||
"mixpanel-browser": "2.71.1",
|
||||
"posthog-js": "1.283.0",
|
||||
"posthog-node": "5.11.0",
|
||||
"zod": "catalog:"
|
||||
}
|
||||
}
|
||||
1
packages/analytics/web/src/env.ts
Normal file
1
packages/analytics/web/src/env.ts
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./providers/env";
|
||||
1
packages/analytics/web/src/index.tsx
Normal file
1
packages/analytics/web/src/index.tsx
Normal file
@@ -0,0 +1 @@
|
||||
export { Provider, track, identify, reset } from "./providers";
|
||||
1
packages/analytics/web/src/providers/env.ts
Normal file
1
packages/analytics/web/src/providers/env.ts
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./posthog/env";
|
||||
27
packages/analytics/web/src/providers/google-analytics/env.ts
Normal file
27
packages/analytics/web/src/providers/google-analytics/env.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Environment schema for the Google Analytics provider: a public measurement
 * id plus a server-only secret.
 */
export const preset = {
  id: "google-analytics",
  client: {
    NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID: z.string(),
  },
  server: {
    GOOGLE_ANALYTICS_SECRET: z.string(),
  },
} as const satisfies Preset;

// Raw values handed to the validator. The public variable is listed by its
// full name on top of the `process.env` spread — presumably so the bundler can
// inline it into client code (TODO confirm).
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID:
    process.env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID,
};

/** Validated environment for the Google Analytics provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
@@ -0,0 +1,69 @@
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
declare global {
  interface Window {
    /** Command queue read by gtag.js. */
    dataLayer?: unknown[];
    /** Queues a gtag command; assigned by `Provider` once the script has loaded. */
    gtag?: (...args: unknown[]) => void;
  }
}

/**
 * Google Analytics (gtag.js) client strategy.
 *
 * - `Provider` renders the children followed by the gtag.js script tag and, in
 *   the script's `onLoad`, installs `window.gtag` and sends the initial config.
 * - `track`, `identify` and `reset` are no-ops on the server and until
 *   `window.gtag` exists.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    return (
      <>
        {children}
        <script
          async
          src={`https://www.googletagmanager.com/gtag/js?id=${env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID}`}
          onLoad={() => {
            if (typeof window === "undefined") {
              return;
            }

            window.dataLayer = window.dataLayer ?? [];

            // gtag.js expects every queued command to be the function's
            // `arguments` object, exactly as in Google's install snippet;
            // commands pushed as plain arrays are not processed. Hence no
            // rest parameter here.
            function gtag() {
              // eslint-disable-next-line prefer-rest-params
              window.dataLayer?.push(arguments);
            }

            window.gtag = gtag;

            window.gtag("js", new Date());
            window.gtag(
              "config",
              env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID,
            );
          }}
        />
      </>
    );
  },
  /** Sends `event` with optional `data` as a gtag "event" command. */
  track: (event, data) => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("event", event, data);
  },
  /** Re-issues the "config" command with `user_id` and any traits. */
  identify: (userId, traits) => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("config", env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID, {
      user_id: userId,
      ...traits,
    });
  },
  /** Re-issues the "config" command with `user_id` set to null. */
  reset: () => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("config", env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID, {
      user_id: null,
    });
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
@@ -0,0 +1,36 @@
|
||||
import { randomUUID } from "crypto";
|
||||
|
||||
import { logger } from "@turbostarter/shared/logger";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type {
|
||||
AllowedPropertyValues,
|
||||
AnalyticsProviderServerStrategy,
|
||||
} from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Posts a single event to the Google Analytics Measurement Protocol endpoint.
 *
 * Never rejects: callers fire this with `void`, so both non-2xx responses and
 * network failures are logged instead of surfacing as unhandled rejections.
 *
 * @param event - Event name.
 * @param data - Optional event params; `data.clientId`, when present, is used
 *   as the `client_id`, otherwise a random UUID is generated for this request.
 */
const postEvent = async (
  event: string,
  data?: Record<string, AllowedPropertyValues>,
) => {
  // URLSearchParams percent-encodes the values, so an unusual character in the
  // id or secret cannot corrupt the query string.
  const query = new URLSearchParams({
    measurement_id: env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID,
    api_secret: env.GOOGLE_ANALYTICS_SECRET,
  });

  try {
    const response = await fetch(
      `https://www.google-analytics.com/mp/collect?${query.toString()}`,
      {
        method: "POST",
        body: JSON.stringify({
          client_id: data?.clientId ?? randomUUID(),
          events: [{ name: event, params: data }],
        }),
      },
    );

    if (!response.ok) {
      logger.error("Failed to post event to Google Analytics: ", response);
    }
  } catch (error) {
    // fetch rejects on network-level failures (DNS, connection reset, ...).
    logger.error("Failed to post event to Google Analytics: ", error);
  }
};
|
||||
|
||||
/** Google Analytics server strategy: fire-and-forget wrapper around `postEvent`. */
export const { track } = {
  // `void` discards the promise; the caller is not made to wait for the request.
  track: (name, payload) => void postEvent(name, payload),
} satisfies AnalyticsProviderServerStrategy;
|
||||
1
packages/analytics/web/src/providers/index.tsx
Normal file
1
packages/analytics/web/src/providers/index.tsx
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./posthog";
|
||||
26
packages/analytics/web/src/providers/mixpanel/env.ts
Normal file
26
packages/analytics/web/src/providers/mixpanel/env.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig, NodeEnv } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/** Environment schema for the Mixpanel provider: a single public project token. */
export const preset = {
  id: "mixpanel",
  client: {
    NEXT_PUBLIC_MIXPANEL_TOKEN: z.string(),
  },
} as const satisfies Preset;

// Raw values handed to the validator; the public token is listed by its full
// name on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_MIXPANEL_TOKEN: process.env.NEXT_PUBLIC_MIXPANEL_TOKEN,
};

/**
 * Validated environment for the Mixpanel provider. `NODE_ENV` falls back to
 * the development value when unset.
 */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  shared: {
    NODE_ENV: z.enum(NodeEnv).default(NodeEnv.DEVELOPMENT),
  },
  env: runtimeEnv,
});
|
||||
51
packages/analytics/web/src/providers/mixpanel/index.tsx
Normal file
51
packages/analytics/web/src/providers/mixpanel/index.tsx
Normal file
@@ -0,0 +1,51 @@
|
||||
"use client";
|
||||
|
||||
import mixpanel from "mixpanel-browser";
|
||||
import { useEffect } from "react";
|
||||
|
||||
import { NodeEnv } from "@turbostarter/shared/constants";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Initialises the browser Mixpanel SDK with the configured token, autocapture
 * enabled and localStorage persistence. Debug output is on only when
 * `NODE_ENV` is the development value.
 */
function init(): void {
  const isDevelopment = env.NODE_ENV === NodeEnv.DEVELOPMENT;

  mixpanel.init(env.NEXT_PUBLIC_MIXPANEL_TOKEN, {
    debug: isDevelopment,
    autocapture: true,
    persistence: "localStorage",
  });
}
|
||||
|
||||
/**
 * Mixpanel client strategy. `Provider` initialises the SDK once after mount;
 * the other members forward to the SDK and do nothing outside a browser.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    // Empty dependency list: initialise once per mount.
    useEffect(() => {
      init();
    }, []);

    return children;
  },
  track: (event, properties) => {
    if (typeof window !== "undefined") {
      mixpanel.track(event, properties);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      mixpanel.identify(userId);

      // Traits, when given, are written to the identified user's profile.
      if (traits) {
        mixpanel.people.set(traits);
      }
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      mixpanel.reset();
    }
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
33
packages/analytics/web/src/providers/mixpanel/server.ts
Normal file
33
packages/analytics/web/src/providers/mixpanel/server.ts
Normal file
@@ -0,0 +1,33 @@
|
||||
import Mixpanel from "mixpanel";
|
||||
|
||||
import { NodeEnv } from "@turbostarter/shared/constants";
|
||||
import { logger } from "@turbostarter/shared/logger";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
// Lazily created, module-wide Mixpanel client.
let client: Mixpanel.Mixpanel | null = null;

/**
 * Returns the shared Mixpanel client, creating it on first use. Debug output
 * is on only when `NODE_ENV` is the development value.
 */
const getClient = () => {
  client ??= Mixpanel.init(env.NEXT_PUBLIC_MIXPANEL_TOKEN, {
    debug: env.NODE_ENV === NodeEnv.DEVELOPMENT,
  });

  return client;
};
|
||||
|
||||
/**
 * Mixpanel server strategy. Errors thrown while obtaining the client or
 * tracking are logged as warnings and never reach the caller.
 */
export const { track } = {
  track: (event, properties) => {
    try {
      // An empty object is sent when no properties are given.
      getClient().track(event, properties ?? {});
    } catch (cause) {
      logger.warn("Failed to track Mixpanel event: ", cause);
    }
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
27
packages/analytics/web/src/providers/open-panel/env.ts
Normal file
27
packages/analytics/web/src/providers/open-panel/env.ts
Normal file
@@ -0,0 +1,27 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Environment schema for the OpenPanel provider: a public client id plus a
 * server-only secret.
 */
export const preset = {
  id: "open-panel",
  client: {
    NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID: z.string(),
  },
  server: {
    OPEN_PANEL_SECRET: z.string(),
  },
} as const satisfies Preset;

// Raw values handed to the validator; the public id is listed by its full
// name on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID:
    process.env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID,
};

/** Validated environment for the OpenPanel provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
45
packages/analytics/web/src/providers/open-panel/index.tsx
Normal file
45
packages/analytics/web/src/providers/open-panel/index.tsx
Normal file
@@ -0,0 +1,45 @@
|
||||
import { OpenPanelComponent } from "@openpanel/nextjs";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * OpenPanel client strategy. `Provider` renders the children followed by
 * `OpenPanelComponent`; the other members forward to the global `window.op`
 * command function and do nothing outside a browser.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <OpenPanelComponent
        clientId={env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID}
        trackScreenViews
        trackAttributes
        trackOutgoingLinks
      />
    </>
  ),
  track: (event, data) => {
    if (typeof window !== "undefined") {
      window.op("track", event, data);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      // Traits are spread after `profileId`, alongside it in the same payload.
      window.op("identify", {
        profileId: userId,
        ...traits,
      });
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      window.op("clear");
    }
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
28
packages/analytics/web/src/providers/open-panel/server.ts
Normal file
28
packages/analytics/web/src/providers/open-panel/server.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
import { OpenPanel } from "@openpanel/nextjs";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
// Lazily created, module-wide OpenPanel client.
let client: OpenPanel | null = null;

/** Returns the shared OpenPanel client, creating it on first use. */
const getClient = () => {
  client ??= new OpenPanel({
    clientId: env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID,
    clientSecret: env.OPEN_PANEL_SECRET,
  });

  return client;
};
|
||||
|
||||
/** OpenPanel server strategy: forwards the event without awaiting the result. */
export const { track } = {
  track: (event, data) => {
    void getClient().track(event, data);
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
26
packages/analytics/web/src/providers/plausible/env.ts
Normal file
26
packages/analytics/web/src/providers/plausible/env.ts
Normal file
@@ -0,0 +1,26 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Environment schema for the Plausible provider: the tracked domain and the
 * host the script and events API are served from (both public).
 */
export const preset = {
  id: "plausible",
  clientPrefix: "NEXT_PUBLIC_",
  client: {
    NEXT_PUBLIC_PLAUSIBLE_DOMAIN: z.string(),
    NEXT_PUBLIC_PLAUSIBLE_HOST: z.string(),
  },
} as const satisfies Preset;

// Raw values handed to the validator; public variables are listed by their
// full names on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_PLAUSIBLE_DOMAIN: process.env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN,
  NEXT_PUBLIC_PLAUSIBLE_HOST: process.env.NEXT_PUBLIC_PLAUSIBLE_HOST,
};

/** Validated environment for the Plausible provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
109
packages/analytics/web/src/providers/plausible/index.tsx
Normal file
109
packages/analytics/web/src/providers/plausible/index.tsx
Normal file
@@ -0,0 +1,109 @@
|
||||
import { z } from "zod";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type {
|
||||
AllowedPropertyValues,
|
||||
AnalyticsProviderClientStrategy,
|
||||
} from "@turbostarter/analytics";
|
||||
|
||||
/** Options accepted by the global `plausible` function as typed here. */
type PlausibleOptions = { props?: Record<string, unknown> };

declare global {
  interface Window {
    /** Plausible's event function. Optional: callers check for it before use. */
    plausible?: (event: string, options?: PlausibleOptions) => void;
  }
}

/** localStorage keys under which `identify` persists the current user. */
const STORAGE_KEYS = {
  USER_ID: "plausible_user_id",
  USER_TRAITS: "plausible_user_traits",
} as const;

/** A single trait value: string, number or boolean. */
const ValueSchema = z.union([z.string(), z.number(), z.boolean()]);

/** Shape that persisted traits must have to be accepted when read back. */
const TraitsSchema = z.record(z.string(), ValueSchema);
|
||||
|
||||
/**
 * Parses persisted traits. Returns `undefined` when the string is not valid
 * JSON or does not match `TraitsSchema`.
 */
const parseStoredTraits = (
  raw: string,
): Record<string, AllowedPropertyValues> | undefined => {
  try {
    const parsed = TraitsSchema.safeParse(JSON.parse(raw));
    return parsed.success ? parsed.data : undefined;
  } catch {
    // Malformed JSON: treat as "no traits" rather than failing the lookup.
    return undefined;
  }
};

/**
 * Reads the identity persisted by `identify` from localStorage.
 *
 * Returns `{ userId: undefined, traits: undefined }` on the server or when
 * storage cannot be read. Corrupt traits are dropped on their own, so a valid
 * stored user id is still returned (previously a JSON parse error discarded
 * both).
 */
const getStoredIdentity = () => {
  if (typeof window === "undefined") {
    return { userId: undefined, traits: undefined };
  }

  try {
    const userId = localStorage.getItem(STORAGE_KEYS.USER_ID) ?? undefined;
    const traitsStr = localStorage.getItem(STORAGE_KEYS.USER_TRAITS);
    const traits = traitsStr ? parseStoredTraits(traitsStr) : undefined;

    return { userId, traits };
  } catch {
    // localStorage access itself can throw; fall back to an anonymous identity.
    return { userId: undefined, traits: undefined };
  }
};
|
||||
|
||||
/**
 * Plausible client strategy.
 *
 * `identify` and `reset` only write to / clear localStorage; `track` merges
 * the stored identity into the props of each event it sends.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <script
        defer
        data-domain={env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN}
        src={`${env.NEXT_PUBLIC_PLAUSIBLE_HOST}/js/script.js`}
      />
    </>
  ),
  track: (event, data) => {
    if (typeof window === "undefined" || !window.plausible) {
      return;
    }

    const stored = getStoredIdentity();

    // Precedence, lowest to highest: stored traits, event data, stored user id.
    const props: Record<string, unknown> = {
      ...stored.traits,
      ...data,
      ...(stored.userId ? { userId: stored.userId } : {}),
    };

    window.plausible(event, { props });
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      try {
        localStorage.setItem(STORAGE_KEYS.USER_ID, userId);

        if (traits) {
          const serialized = JSON.stringify(traits);
          localStorage.setItem(STORAGE_KEYS.USER_TRAITS, serialized);
        }
      } catch {
        // Ignore storage errors
      }
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      try {
        localStorage.removeItem(STORAGE_KEYS.USER_ID);
        localStorage.removeItem(STORAGE_KEYS.USER_TRAITS);
      } catch {
        // Ignore storage errors
      }
    }
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
42
packages/analytics/web/src/providers/plausible/server.ts
Normal file
42
packages/analytics/web/src/providers/plausible/server.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import { logger } from "@turbostarter/shared/logger";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Plausible server strategy: posts the event to `${host}/api/event`.
 *
 * `url`, `referrer` and `ip` are lifted out of `data` when they are strings
 * (`ip` is sent as the `X-Forwarded-For` header); every other entry is sent
 * as custom props. The request is fire-and-forget; non-2xx responses and
 * network failures are logged and never reach the caller.
 */
export const { track } = {
  track: (event, data) => {
    const url = typeof data?.url === "string" ? data.url : "app://server-side";
    const referrer =
      typeof data?.referrer === "string" ? data.referrer : undefined;
    const ip = typeof data?.ip === "string" ? data.ip : undefined;

    // Everything except the reserved keys becomes a custom prop.
    const props = data
      ? Object.fromEntries(
          Object.entries(data).filter(
            ([key]) => !["url", "referrer", "ip"].includes(key),
          ),
        )
      : undefined;

    void fetch(`${env.NEXT_PUBLIC_PLAUSIBLE_HOST}/api/event`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TurboStarter-Server/1.0 (Server-side tracking)",
        ...(ip && { "X-Forwarded-For": ip }),
      },
      body: JSON.stringify({
        domain: env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN,
        name: event,
        url: url,
        ...(referrer && { referrer }),
        ...(props && Object.keys(props).length > 0 && { props }),
      }),
    })
      .then((res) => {
        if (!res.ok) {
          logger.error("Failed to post event to Plausible: ", res);
        }
      })
      .catch((error: unknown) => {
        // fetch rejects on network-level failures; without this handler the
        // discarded promise would surface as an unhandled rejection.
        logger.error("Failed to post event to Plausible: ", error);
      });
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
29
packages/analytics/web/src/providers/posthog/env.ts
Normal file
29
packages/analytics/web/src/providers/posthog/env.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Environment schema for the PostHog provider. Both variables are optional;
 * the host defaults to "https://us.i.posthog.com".
 */
export const preset = {
  id: "posthog",
  clientPrefix: "NEXT_PUBLIC_",
  client: {
    NEXT_PUBLIC_POSTHOG_KEY: z.string().optional(),
    NEXT_PUBLIC_POSTHOG_HOST: z
      .string()
      .optional()
      .default("https://us.i.posthog.com"),
  },
} as const satisfies Preset;

// Raw values handed to the validator; public variables are listed by their
// full names on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_POSTHOG_KEY: process.env.NEXT_PUBLIC_POSTHOG_KEY,
  NEXT_PUBLIC_POSTHOG_HOST: process.env.NEXT_PUBLIC_POSTHOG_HOST,
};

/** Validated environment for the PostHog provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
71
packages/analytics/web/src/providers/posthog/index.tsx
Normal file
71
packages/analytics/web/src/providers/posthog/index.tsx
Normal file
@@ -0,0 +1,71 @@
|
||||
"use client";
|
||||
|
||||
import dynamic from "next/dynamic";
|
||||
import posthog from "posthog-js";
|
||||
import { PostHogProvider } from "posthog-js/react";
|
||||
import { Suspense } from "react";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/** Page-view tracker, loaded lazily with server-side rendering turned off. */
const PageView = dynamic(
  async () => {
    const pageViewModule = await import("./page-view");
    return pageViewModule.PageView;
  },
  { ssr: false },
);
|
||||
|
||||
// Held in a local const so the compiler can narrow it to `string` below
// without a non-null assertion.
const posthogKey = env.NEXT_PUBLIC_POSTHOG_KEY;

/**
 * Truthy only when a key is set, the key is not the "notyet" placeholder and
 * the host looks like an http(s) URL.
 */
const isValidPosthogConfig =
  posthogKey &&
  posthogKey !== "notyet" &&
  env.NEXT_PUBLIC_POSTHOG_HOST.startsWith("http");

// Runs once at module load, in the browser only. `posthogKey` is re-checked
// here purely so it is narrowed to `string` for the call.
if (typeof window !== "undefined" && posthogKey && isValidPosthogConfig) {
  posthog.init(posthogKey, {
    api_host: env.NEXT_PUBLIC_POSTHOG_HOST,
    person_profiles: "always",
    // Automatic page-view capture is off; the PageView component rendered by
    // Provider is present in the tree instead.
    capture_pageview: false,
    disable_external_dependency_loading: true,
    disable_session_recording: true,
  });
}
|
||||
|
||||
/**
 * PostHog client strategy.
 *
 * When the configuration is not valid, PostHog is never initialised. In that
 * case `Provider` renders the children untouched and `track`, `identify` and
 * `reset` are no-ops, matching the server strategy, which also skips work
 * when unconfigured. All three are also no-ops outside a browser.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    // Skip PostHog wrapper entirely when not configured
    if (!isValidPosthogConfig) {
      return <>{children}</>;
    }

    return (
      <PostHogProvider client={posthog}>
        {children}
        <Suspense fallback={null}>
          <PageView />
        </Suspense>
      </PostHogProvider>
    );
  },
  track: (event, properties) => {
    if (typeof window === "undefined" || !isValidPosthogConfig) {
      return;
    }

    posthog.capture(event, properties);
  },
  identify: (userId, traits) => {
    if (typeof window === "undefined" || !isValidPosthogConfig) {
      return;
    }

    posthog.identify(userId, traits);
  },
  reset: () => {
    if (typeof window === "undefined" || !isValidPosthogConfig) {
      return;
    }

    posthog.reset();
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
25
packages/analytics/web/src/providers/posthog/page-view.tsx
Normal file
25
packages/analytics/web/src/providers/posthog/page-view.tsx
Normal file
@@ -0,0 +1,25 @@
|
||||
"use client";
|
||||
|
||||
import { usePathname, useSearchParams } from "next/navigation";
|
||||
import { usePostHog } from "posthog-js/react";
|
||||
import { useEffect } from "react";
|
||||
|
||||
/**
 * Renders nothing; captures a `$pageview` event with the full current URL
 * whenever the pathname, the search params or the PostHog client change.
 */
export const PageView = () => {
  const pathname = usePathname();
  const searchParams = useSearchParams();
  const posthog = usePostHog();

  useEffect(() => {
    if (!pathname) {
      return;
    }

    const base = window.origin + pathname;
    const query = searchParams.toString();
    // The "?" is appended only when there is a non-empty query string.
    const currentUrl = query ? `${base}?${query}` : base;

    posthog.capture("$pageview", { $current_url: currentUrl });
  }, [pathname, searchParams, posthog]);

  return null;
};
|
||||
41
packages/analytics/web/src/providers/posthog/server.ts
Normal file
41
packages/analytics/web/src/providers/posthog/server.ts
Normal file
@@ -0,0 +1,41 @@
|
||||
import { PostHog } from "posthog-node";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
// Held in a local const so it can be narrowed to `string` before it is passed
// to the PostHog constructor (the schema declares the key as optional).
const posthogKey = env.NEXT_PUBLIC_POSTHOG_KEY;

/**
 * Truthy only when a key is set, the key is not the "notyet" placeholder and
 * the host looks like an http(s) URL.
 */
const isValidPosthogConfig =
  posthogKey &&
  posthogKey !== "notyet" &&
  env.NEXT_PUBLIC_POSTHOG_HOST.startsWith("http");

// Lazily created, module-wide PostHog client.
let client: PostHog | null = null;

/**
 * Returns the shared PostHog client, creating it on first use, or `null` when
 * PostHog is not configured.
 */
const getClient = () => {
  // `!posthogKey` is implied by the validity flag; it is spelled out so the
  // key is narrowed to `string` below.
  if (!posthogKey || !isValidPosthogConfig) {
    return null;
  }

  client ??= new PostHog(posthogKey, {
    host: env.NEXT_PUBLIC_POSTHOG_HOST,
  });

  return client;
};
|
||||
|
||||
/**
 * PostHog server strategy: captures the event through the shared client and
 * does nothing when no client is available.
 */
export const { track } = {
  track: (event, data) => {
    const posthogClient = getClient();
    if (posthogClient === null) {
      return;
    }

    // NOTE(review): without a string `data.distinctId` the event is sent with
    // an empty distinct id — confirm that is what PostHog should receive.
    const distinctId =
      typeof data?.distinctId === "string" ? data.distinctId : "";

    posthogClient.capture({ event, distinctId, properties: data });
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
1
packages/analytics/web/src/providers/server.ts
Normal file
1
packages/analytics/web/src/providers/server.ts
Normal file
@@ -0,0 +1 @@
|
||||
export * from "./posthog/server";
|
||||
29
packages/analytics/web/src/providers/umami/env.ts
Normal file
29
packages/analytics/web/src/providers/umami/env.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/**
 * Environment schema for the Umami provider: public script host and website
 * id, plus a server-side API host and an optional API key.
 */
export const preset = {
  id: "umami",
  client: {
    NEXT_PUBLIC_UMAMI_HOST: z.string(),
    NEXT_PUBLIC_UMAMI_WEBSITE_ID: z.string(),
  },
  server: {
    UMAMI_API_HOST: z.string(),
    UMAMI_API_KEY: z.string().optional(),
  },
} as const satisfies Preset;

// Raw values handed to the validator; public variables are listed by their
// full names on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_UMAMI_HOST: process.env.NEXT_PUBLIC_UMAMI_HOST,
  NEXT_PUBLIC_UMAMI_WEBSITE_ID: process.env.NEXT_PUBLIC_UMAMI_WEBSITE_ID,
};

/** Validated environment for the Umami provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
47
packages/analytics/web/src/providers/umami/index.tsx
Normal file
47
packages/analytics/web/src/providers/umami/index.tsx
Normal file
@@ -0,0 +1,47 @@
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/** Loose key/value payload accepted by the Umami global as typed here. */
type UmamiPayload = Record<string, unknown>;

declare global {
  interface Window {
    /** Umami tracker global. Optional: callers check for it before use. */
    umami?: {
      track: (event: string, data?: UmamiPayload) => void;
      identify: (userId?: string | UmamiPayload, traits?: UmamiPayload) => void;
    };
  }
}

/**
 * Umami client strategy. `Provider` renders the children followed by the
 * tracker script tag; `track` and `identify` forward to `window.umami` when it
 * exists; `reset` does nothing.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <script
        async
        src={`${env.NEXT_PUBLIC_UMAMI_HOST}/script.js`}
        data-website-id={env.NEXT_PUBLIC_UMAMI_WEBSITE_ID}
      ></script>
    </>
  ),
  track: (event, data) => {
    if (typeof window !== "undefined" && window.umami) {
      window.umami.track(event, data);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined" && window.umami) {
      window.umami.identify(userId, traits);
    }
  },
  reset: () => {
    // Umami does not explicitly support resetting the session via the client-side API
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
45
packages/analytics/web/src/providers/umami/server.ts
Normal file
45
packages/analytics/web/src/providers/umami/server.ts
Normal file
@@ -0,0 +1,45 @@
|
||||
import { logger } from "@turbostarter/shared/logger";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Umami server strategy: posts the event to `${UMAMI_API_HOST}/api/send`.
 *
 * `hostname`, `language`, `referrer`, `screen`, `title` and `url` are copied
 * to the top level of the payload when they are strings; the complete `data`
 * object is also sent as `data`. The request is fire-and-forget; non-2xx
 * responses and network failures are logged and never reach the caller.
 */
export const { track } = {
  track: (event, data) => {
    const hostname =
      typeof data?.hostname === "string" ? data.hostname : undefined;
    const language =
      typeof data?.language === "string" ? data.language : undefined;
    const referrer =
      typeof data?.referrer === "string" ? data.referrer : undefined;
    const screen = typeof data?.screen === "string" ? data.screen : undefined;
    const title = typeof data?.title === "string" ? data.title : undefined;
    const url = typeof data?.url === "string" ? data.url : "app://server-side";

    void fetch(`${env.UMAMI_API_HOST}/api/send`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-umami-api-key": env.UMAMI_API_KEY ?? "",
      },
      body: JSON.stringify({
        type: "event",
        payload: {
          website: env.NEXT_PUBLIC_UMAMI_WEBSITE_ID,
          name: event,
          url: url,
          ...(hostname && { hostname }),
          ...(language && { language }),
          ...(referrer && { referrer }),
          ...(screen && { screen }),
          ...(title && { title }),
          data,
        },
      }),
    })
      .then((res) => {
        if (!res.ok) {
          logger.error("Failed to post event to Umami: ", res);
        }
      })
      .catch((error: unknown) => {
        // fetch rejects on network-level failures; without this handler the
        // discarded promise would surface as an unhandled rejection.
        logger.error("Failed to post event to Umami: ", error);
      });
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
24
packages/analytics/web/src/providers/vemetric/env.ts
Normal file
24
packages/analytics/web/src/providers/vemetric/env.ts
Normal file
@@ -0,0 +1,24 @@
|
||||
/* eslint-disable turbo/no-undeclared-env-vars */
|
||||
import { defineEnv } from "envin";
|
||||
import * as z from "zod";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/** Environment schema for the Vemetric provider: a single public project token. */
export const preset = {
  id: "vemetric",
  client: {
    NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN: z.string(),
  },
} as const satisfies Preset;

// Raw values handed to the validator; the public token is listed by its full
// name on top of the `process.env` spread.
const runtimeEnv = {
  ...process.env,
  NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN:
    process.env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN,
};

/** Validated environment for the Vemetric provider. */
export const env = defineEnv({
  ...envConfig,
  ...preset,
  env: runtimeEnv,
});
|
||||
49
packages/analytics/web/src/providers/vemetric/index.tsx
Normal file
49
packages/analytics/web/src/providers/vemetric/index.tsx
Normal file
@@ -0,0 +1,49 @@
|
||||
import { VemetricScript, vemetric } from "@vemetric/react";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Vemetric client strategy. `Provider` renders `VemetricScript` ahead of the
 * children; the other members forward to the `vemetric` SDK without awaiting
 * the result and do nothing outside a browser.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      <VemetricScript
        token={env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN}
        trackPageViews
        trackOutboundLinks
        trackDataAttributes
      />
      {children}
    </>
  ),
  track: (event, data) => {
    if (typeof window !== "undefined") {
      void vemetric.trackEvent(event, { eventData: data });
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      // Traits are passed as the `set` part of the identify payload.
      void vemetric.identify({
        identifier: userId,
        data: { set: traits },
      });
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      void vemetric.resetUser();
    }
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
30
packages/analytics/web/src/providers/vemetric/server.ts
Normal file
30
packages/analytics/web/src/providers/vemetric/server.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
import { Vemetric } from "@vemetric/node";
|
||||
|
||||
import { env } from "./env";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
// Lazily created, module-wide Vemetric client.
let client: Vemetric | null = null;

/** Returns the shared Vemetric client, creating it on first use. */
const getClient = () => {
  client ??= new Vemetric({
    token: env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN,
  });

  return client;
};
|
||||
|
||||
/**
 * Vemetric server strategy: forwards the event without awaiting the result.
 * The user identifier is `data.distinctId` converted to a string, or
 * "anonymous" when it is absent.
 */
export const { track } = {
  track: (event, data) => {
    const userIdentifier = data?.distinctId?.toString() ?? "anonymous";

    void getClient().trackEvent(event, {
      userIdentifier,
      eventData: data,
    });
  },
} satisfies AnalyticsProviderServerStrategy;
|
||||
15
packages/analytics/web/src/providers/vercel/env.ts
Normal file
15
packages/analytics/web/src/providers/vercel/env.ts
Normal file
@@ -0,0 +1,15 @@
|
||||
import { defineEnv } from "envin";
|
||||
|
||||
import { envConfig } from "@turbostarter/shared/constants";
|
||||
|
||||
import type { Preset } from "envin/types";
|
||||
|
||||
/** Environment schema for the Vercel provider; it declares no variables. */
export const preset = {
  id: "vercel",
  server: {},
} as const satisfies Preset;

// Shared base config with this provider's preset layered on top.
const config = { ...envConfig, ...preset };

/** Validated environment for the Vercel provider. */
export const env = defineEnv(config);
|
||||
22
packages/analytics/web/src/providers/vercel/index.tsx
Normal file
22
packages/analytics/web/src/providers/vercel/index.tsx
Normal file
@@ -0,0 +1,22 @@
|
||||
import { track as trackEvent } from "@vercel/analytics";
|
||||
import { Analytics } from "@vercel/analytics/react";
|
||||
|
||||
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/**
 * Vercel Web Analytics client strategy. `Provider` renders the children
 * followed by `<Analytics />`; `track` is the SDK's own `track` function;
 * `identify` and `reset` do nothing.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <Analytics />
    </>
  ),
  track: trackEvent,
  identify: () => {
    // Vercel Web Analytics doesn't expose identify() on the client
  },
  reset: () => {
    // Vercel Web Analytics doesn't expose reset() on the client
  },
} satisfies AnalyticsProviderClientStrategy;
|
||||
9
packages/analytics/web/src/providers/vercel/server.ts
Normal file
9
packages/analytics/web/src/providers/vercel/server.ts
Normal file
@@ -0,0 +1,9 @@
|
||||
import { track as vercelTrack } from "@vercel/analytics/server";
|
||||
|
||||
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
|
||||
|
||||
/** Vercel Web Analytics server strategy: forwards the event without awaiting it. */
export const { track } = {
  // `void` discards the promise; the caller is not made to wait.
  track: (name, properties) => void vercelTrack(name, properties),
} satisfies AnalyticsProviderServerStrategy;
|
||||
1
packages/analytics/web/src/server.ts
Normal file
1
packages/analytics/web/src/server.ts
Normal file
@@ -0,0 +1 @@
|
||||
export { track } from "./providers/server";
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user