feat: whyrating - initial project from turbostarter boilerplate

This commit is contained in:
Alejandro Gutiérrez
2026-02-04 01:54:52 +01:00
commit 5cdc07cd39
1618 changed files with 338230 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
// Re-exports the shared base ESLint config as this file's default export, without local overrides.
import baseConfig from "@turbostarter/eslint-config/base";
export default baseConfig;

56
packages/ai/package.json Normal file
View File

@@ -0,0 +1,56 @@
{
"name": "@turbostarter/ai",
"private": true,
"version": "0.1.0",
"type": "module",
"exports": {
".": "./src/index.ts",
"./env": "./src/env.ts",
"./chat/*": "./src/modules/chat/*.ts",
"./image/*": "./src/modules/image/*.ts",
"./pdf/*": "./src/modules/pdf/*.ts",
"./tts/*": "./src/modules/tts/*.ts",
"./stt/*": "./src/modules/stt/*.ts",
"./credits/*": "./src/modules/credits/*.ts"
},
"scripts": {
"clean": "git clean -xdf .cache .turbo dist node_modules",
"format": "prettier --check . --ignore-path ../../.gitignore",
"lint": "eslint",
"test": "vitest run",
"test:watch": "vitest --watch",
"typecheck": "tsc --noEmit"
},
"devDependencies": {
"@turbostarter/eslint-config": "workspace:*",
"@turbostarter/prettier-config": "workspace:*",
"@turbostarter/tsconfig": "workspace:*",
"@turbostarter/vitest-config": "workspace:*",
"eslint": "catalog:",
"prettier": "catalog:",
"typescript": "catalog:",
"vitest": "catalog:"
},
"prettier": "@turbostarter/prettier-config",
"dependencies": {
"@ai-sdk/anthropic": "2.0.41",
"@ai-sdk/deepseek": "1.0.27",
"@ai-sdk/fireworks": "1.0.27",
"@ai-sdk/google": "2.0.28",
"@ai-sdk/openai": "2.0.68",
"@ai-sdk/replicate": "1.0.17",
"@ai-sdk/xai": "2.0.31",
"@anthropic-ai/sdk": "0.71.2",
"@elevenlabs/elevenlabs-js": "2.9.0",
"@langchain/community": "1.0.0",
"@langchain/core": "1.0.3",
"@tavily/core": "0.5.12",
"@turbostarter/db": "workspace:*",
"@turbostarter/shared": "workspace:*",
"@turbostarter/storage": "workspace:*",
"ai": "catalog:",
"openai": "4.103.0",
"pdf-parse": "1.1.1",
"zod": "catalog:"
}
}

25
packages/ai/src/env.ts Normal file
View File

@@ -0,0 +1,25 @@
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig, NodeEnv } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
/**
 * Env preset with the id "ai": declares the two server-side API keys
 * (ElevenLabs and Tavily) as optional zod strings.
 */
export const preset = {
id: "ai",
server: {
ELEVENLABS_API_KEY: z.string().optional(),
TAVILY_API_KEY: z.string().optional(),
},
} as const satisfies Preset;
/**
 * Environment object for this package, built by `defineEnv` on top of the
 * shared `envConfig`.
 *
 * - `NODE_ENV` must be one of `NodeEnv` and defaults to `NodeEnv.DEVELOPMENT`.
 * - Both API keys are optional strings.
 *
 * NOTE(review): the `server` schema repeats the one declared in `preset`;
 * the two have to be kept in sync by hand.
 */
export const env = defineEnv({
...envConfig,
shared: {
NODE_ENV: z.enum(NodeEnv).default(NodeEnv.DEVELOPMENT),
},
server: {
ELEVENLABS_API_KEY: z.string().optional(),
TAVILY_API_KEY: z.string().optional(),
},
});

2
packages/ai/src/index.ts Normal file
View File

@@ -0,0 +1,2 @@
export * from "./types";
export * from "./utils/common";

View File

@@ -0,0 +1,278 @@
import {
convertToModelMessages,
createUIMessageStream,
createUIMessageStreamResponse,
smoothStream,
stepCountIs,
streamText,
} from "ai";
import * as z from "zod";
import { and, eq } from "@turbostarter/db";
import { chat, message, part } from "@turbostarter/db/schema/chat";
import { db } from "@turbostarter/db/server";
import { omitBy } from "@turbostarter/shared/utils";
import { getDeleteUrl, getSignedUrl } from "@turbostarter/storage/server";
import { repairToolCall } from "../../utils/llm";
import { MODELS, PROMPTS } from "./constants";
import { modelStrategies } from "./strategies";
import { toolStrategies } from "./tools";
import { Role, Tool } from "./types";
import { generateChatName, getProviderOptions, toChatMessage } from "./utils";
import type { ChatMessagePayload } from "./schema";
import type {
InsertChat,
InsertMessage,
InsertPart,
} from "@turbostarter/db/schema/chat";
/** Schema matching any object that carries a string `path` property. */
const detailsWithPathSchema = z.object({ path: z.string() });

/**
 * Type guard for part details: true when `details` parses as an object with
 * a string `path`.
 */
const hasPath = (details: unknown): details is { path: string } => {
  const { success } = detailsWithPathSchema.safeParse(details);
  return success;
};
/**
 * Inserts a chat row, or overwrites the existing row with the same id using
 * the very same `data`, and resolves to the returned rows.
 *
 * NOTE(review): on conflict every column present in `data` (including
 * `userId`) replaces the stored value — confirm callers verify ownership of
 * an existing chat id before calling this.
 */
export const createChat = async (data: InsertChat) => {
  const upsert = db.insert(chat).values(data).onConflictDoUpdate({
    target: chat.id,
    set: data,
  });

  return upsert.returning();
};
/** Applies the partial `data` to the chat row whose id equals `id`. */
export const updateChat = async (id: string, data: Partial<InsertChat>) => {
  const matchesId = eq(chat.id, id);

  return db.update(chat).set(data).where(matchesId);
};
/** Resolves to the first chat row whose id equals `id`, if there is one. */
export const getChat = async (id: string) => {
  const matchesId = eq(chat.id, id);

  return db.query["chat.chat"].findFirst({ where: matchesId });
};
/**
 * Deletes a stored attachment by requesting a delete URL for `path` and
 * issuing an HTTP DELETE against it.
 *
 * @param path Storage path of the attachment.
 * @throws Error when the storage endpoint answers with a non-2xx status.
 *   `fetch` only rejects on network failures, so the status has to be
 *   checked explicitly or failed deletions would look like successes.
 */
const deleteAttachment = async (path: string) => {
  const { url } = await getDeleteUrl({ path });
  const response = await fetch(url, {
    method: "DELETE",
  });

  if (!response.ok) {
    throw new Error(
      `Failed to delete attachment "${path}": ${response.status} ${response.statusText}`,
    );
  }
};
/**
 * Deletes a chat and, in the background, the stored files referenced by its
 * file parts.
 *
 * The file parts are read before the chat row is removed. Attachment cleanup
 * is best-effort: it is not awaited and never fails the call, but every
 * deletion that rejects is logged instead of being dropped silently.
 *
 * @param id Id of the chat to delete.
 * @returns The deleted chat row, or `undefined` when no row matched.
 */
export const deleteChat = async (id: string) => {
  const attachments = await getFileParts(id);
  const [deleted] = await db.delete(chat).where(eq(chat.id, id)).returning();

  if (!deleted) {
    return;
  }

  const paths = attachments
    .map((part) => part.details)
    .filter(hasPath)
    .map(({ path }) => path);

  // Fire-and-forget: the caller does not wait for storage cleanup.
  void Promise.allSettled(paths.map((path) => deleteAttachment(path))).then(
    (results) => {
      results.forEach((result, index) => {
        if (result.status === "rejected") {
          console.error(
            `Failed to delete attachment "${paths[index]}" of chat ${id}:`,
            result.reason,
          );
        }
      });
    },
  );

  return deleted;
};
/** Lists the chats whose `userId` equals the given one, newest `createdAt` first. */
export const getUserChats = async (userId: string) => {
  const ownedByUser = eq(chat.userId, userId);

  return db.query["chat.chat"].findMany({
    where: ownedByUser,
    orderBy: (table, { desc }) => [desc(table.createdAt)],
  });
};
/**
 * Inserts a message row; when a row with the same id already exists it is
 * updated with the same `data`.
 */
export const createMessage = async (data: InsertMessage) => {
  const insert = db.insert(message).values(data);

  return insert.onConflictDoUpdate({
    target: message.id,
    set: data,
  });
};
/**
 * Inserts message parts, skipping rows that conflict with existing ones.
 *
 * An empty `data` array is a no-op: the insert builder's `values()` throws
 * when given zero rows, and callers can legitimately pass none (a message
 * with no parts, or an assistant response that produced nothing).
 *
 * @param data Parts to insert.
 * @returns The insert result, or `undefined` when there was nothing to insert.
 */
export const createParts = async (data: InsertPart[]) => {
  if (data.length === 0) {
    return undefined;
  }

  return db.insert(part).values(data).onConflictDoNothing();
};
/**
 * Returns every part of type "file" that belongs to a message of the given
 * chat (parts joined to their messages on `messageId`).
 */
export const getFileParts = async (chatId: string) => {
  const isFilePartOfChat = and(
    eq(message.chatId, chatId),
    eq(part.type, "file"),
  );

  const rows = await db
    .select()
    .from(part)
    .innerJoin(message, eq(part.messageId, message.id))
    .where(isFilePartOfChat);

  // Each joined row carries both tables; only the part side is wanted.
  return rows.map(({ part: filePart }) => filePart);
};
/**
 * Loads the messages of a chat ordered by ascending `createdAt`, each with
 * its parts (under `part`) ordered by ascending `order`.
 */
export const getChatMessages = async (id: string) => {
  const inChat = eq(message.chatId, id);

  return db.query["chat.message"].findMany({
    where: inChat,
    orderBy: (table, { asc }) => [asc(table.createdAt)],
    with: {
      part: {
        orderBy: (table, { asc }) => [asc(table.order)],
      },
    },
  });
};
/**
 * Loads a chat's messages and adds a `parts` array to each one in which
 * every file part has a freshly signed `url` merged into its details.
 *
 * Non-file parts are passed through untouched. A file part whose details do
 * not contain a string `path` ends up with empty details (`{}`).
 */
export const getChatMessagesWithAttachments = async (id: string) => {
  const messages = await getChatMessages(id);

  return Promise.all(
    messages.map(async (message) => {
      const parts = await Promise.all(
        message.part.map(async (part) => {
          if (part.type !== "file") {
            return part;
          }

          if (!hasPath(part.details)) {
            return { ...part, details: {} };
          }

          const { url } = await getSignedUrl({ path: part.details.path });

          return { ...part, details: { ...part.details, url } };
        }),
      );

      return { ...message, parts };
    }),
  );
};
/**
 * Makes sure the chat row exists for the given id and user, and starts
 * generating a name for it when it has none yet.
 *
 * Name generation runs in the background so it does not delay the caller.
 * Failures there (model call or DB update) are caught and logged; without
 * the handler they would surface as unhandled promise rejections.
 *
 * NOTE(review): `createChat` overwrites the row on id conflict, so an id
 * that already belongs to another user would be reassigned — confirm that
 * ownership is checked before this is reached.
 *
 * @param id Chat id.
 * @param content Text of the user's message, used as input for the name.
 * @param userId Owner of the chat.
 * @returns The upserted chat row (possibly `undefined` if nothing was returned).
 */
const upsertChat = async ({
  id,
  content,
  userId,
}: {
  id: string;
  content: string;
  userId: string;
}) => {
  const [record] = await createChat({ id, userId });

  if (!record?.name) {
    void generateChatName(content)
      .then((name) => updateChat(id, { name }))
      .catch((error: unknown) => {
        console.error(`Failed to generate a name for chat ${id}:`, error);
      });
  }

  return record;
};
/**
 * Handles one user turn of a chat: persists the incoming message, streams
 * the model's answer back as a UI message stream response, and stores the
 * assistant message once the stream finishes.
 *
 * Throws `Error("Model not found!")` when the requested model id is not in
 * `MODELS`.
 * NOTE(review): that check runs after the chat/message/parts have already
 * been written — consider validating first.
 */
export const streamChat = async ({
chatId,
userId,
signal,
...message
}: ChatMessagePayload & { signal: AbortSignal; userId: string }) => {
// Create/refresh the chat row; the joined text parts feed chat-name generation.
await upsertChat({
id: chatId,
content: message.parts
.filter((part) => part.type === "text")
.map((part) => part.text)
.join("\n"),
userId,
});
// History is loaded BEFORE the new message is inserted, so `messages`
// holds only the previous turns.
const messages = await getChatMessagesWithAttachments(chatId);
await createMessage({ ...message, chatId });
await createParts(
message.parts.map(({ type, ...details }, order) => ({
type,
order,
// File parts are stored without their `url` key.
details:
type === "file" ? omitBy(details, (_, key) => key === "url") : details,
messageId: message.id,
})),
);
const providerOptions = getProviderOptions(message.metadata.options);
const model = MODELS.find(
(model) => model.id === message.metadata.options.model,
);
if (!model) {
throw new Error("Model not found!");
}
const stream = createUIMessageStream({
execute: ({ writer }) => {
const result = streamText({
model: modelStrategies.languageModel(model.id),
// Prior turns followed by the new user message.
messages: convertToModelMessages([
...messages.map(toChatMessage),
message,
]),
system: PROMPTS.SYSTEM,
stopWhen: stepCountIs(5),
abortSignal: signal,
// Tools are only wired up for models flagged with `tools`; web search
// is the only tool activated, and only when the `search` option is set.
...(model.tools && {
tools: toolStrategies(writer),
activeTools: [
...(message.metadata.options.search ? [Tool.WEB_SEARCH] : []),
],
experimental_repairToolCall: repairToolCall,
}),
providerOptions,
experimental_transform: smoothStream({
chunking: "word",
delayInMs: 15,
}),
onError: (error) => {
console.error(error);
},
});
// Not awaited. Presumably drains the stream independently of the client
// so `onFinish` still runs on disconnect — TODO confirm against AI SDK docs.
void result.consumeStream();
writer.merge(
result.toUIMessageStream({
originalMessages: messages.map(toChatMessage),
// Attach the request options as metadata on the "start" part only.
messageMetadata: ({ part }) => {
if (part.type === "start") {
return {
options: message.metadata.options,
};
}
},
sendReasoning: message.metadata.options.reason,
}),
);
},
// Persist the assistant message and its parts after the stream ends.
onFinish: async ({ responseMessage }) => {
await createMessage({
id: responseMessage.id,
chatId,
role: Role.ASSISTANT,
});
await createParts(
responseMessage.parts.map(({ type, ...details }, order) => ({
type,
details,
messageId: responseMessage.id,
order,
})),
);
},
});
return createUIMessageStreamResponse({
stream,
headers: {
"Content-Type": "application/octet-stream",
"Content-Encoding": "none",
},
});
};

View File

@@ -0,0 +1,119 @@
import { Provider } from "../../types";
import { Model } from "./types";
/**
 * Catalogue of the chat models, one entry per `Model` id.
 *
 * Each entry carries the provider, a display name and three capability flags:
 * - `reason`: reasoning options are only applied when this is true
 *   (see `getProviderOptions`).
 * - `tools`: tools are only passed to the model when this is true
 *   (see `streamChat`).
 * - `attachments`: presumably whether file attachments may be sent to the
 *   model — not read in this module, verify against the callers.
 */
export const MODELS = [
{
id: Model.GPT_5_1,
provider: Provider.OPENAI,
name: "GPT-5.1",
reason: false,
tools: true,
attachments: true,
},
{
id: Model.GPT_4O,
provider: Provider.OPENAI,
name: "GPT-4o",
reason: false,
tools: true,
attachments: true,
},
{
id: Model.O4_MINI,
provider: Provider.OPENAI,
name: "o4-mini",
reason: true,
tools: true,
attachments: true,
},
{
id: Model.O3,
provider: Provider.OPENAI,
name: "o3",
reason: true,
tools: true,
attachments: false,
},
{
id: Model.GEMINI_2_5_PRO,
provider: Provider.GEMINI,
name: "Gemini 2.5 Pro",
reason: false,
tools: true,
attachments: true,
},
{
id: Model.GEMINI_2_5_FLASH,
provider: Provider.GEMINI,
name: "Gemini 2.5 Flash",
reason: false,
tools: true,
attachments: true,
},
{
id: Model.CLAUDE_4_SONNET,
provider: Provider.CLAUDE,
name: "Claude 4 Sonnet",
reason: false,
tools: true,
attachments: true,
},
{
id: Model.CLAUDE_3_7_SONNET,
provider: Provider.CLAUDE,
name: "Claude 3.7 Sonnet",
reason: true,
tools: true,
attachments: true,
},
{
id: Model.GROK_4,
provider: Provider.GROK,
name: "Grok 4",
reason: false,
tools: true,
attachments: false,
},
{
id: Model.GROK_3,
provider: Provider.GROK,
name: "Grok 3",
reason: true,
tools: true,
attachments: false,
},
{
id: Model.DEEPSEEK_V3,
provider: Provider.DEEPSEEK,
name: "DeepSeek V3",
reason: false,
tools: true,
attachments: false,
},
{
id: Model.DEEPSEEK_R1,
provider: Provider.DEEPSEEK,
name: "DeepSeek R1",
reason: true,
tools: false,
attachments: false,
},
] as const;
/**
 * Prompt texts used by the chat module.
 *
 * - `CHAT_NAME`: instructions for generating a chat title from the first
 *   user message.
 * - `SYSTEM`: system prompt for conversations. It is a getter so that
 *   "Today's date" is computed each time the prompt is read; as a plain
 *   string it was evaluated once at module load and went stale in a
 *   long-running process.
 */
export const PROMPTS = {
  CHAT_NAME: `- you will generate a short title based on the first message a user begins a conversation with
- ensure it is not more than 80 characters long
- the title should be a summary of the user's message
- the title should creative and unique
- do not use quotes or colons`,
  get SYSTEM(): string {
    const today = new Date().toLocaleDateString("en-US", {
      year: "numeric",
      month: "short",
      day: "2-digit",
      weekday: "short",
    });

    return `- You are a digital friend that helps users with fun and engaging conversations sometimes likes to be funny but serious at the same time.
- Today's date is ${today}.
- You can use markdown formatting with tables too when needed.
- You can use latex formtting:
- Use $ for inline equations
- Use $$ for block equations
- Use "USD" for currency (not $)
- No need to use bold or italic formatting in tables.
- don't use the h1 heading in the markdown response.`;
  },
};

View File

@@ -0,0 +1,55 @@
import * as z from "zod";
import { Model, Role } from "./types";
/**
 * Per-message generation options: `reason` and `search` are optional
 * booleans defaulting to false; `model` must be one of the `Model` values.
 */
export const chatMessageOptionsSchema = z.object({
reason: z.boolean().optional().default(false),
search: z.boolean().optional().default(false),
model: z.enum(Model),
});
/** Message metadata: currently just the generation `options`. */
export const chatMessageMetadataSchema = z.object({
options: chatMessageOptionsSchema,
});
/**
 * A message part, discriminated by `type`:
 * - "text": requires `text`; any additional keys are accepted (catchall).
 * - "file": requires `filename`, `mediaType` and `url`; `path` is optional.
 */
export const chatMessagePartSchema = z.discriminatedUnion("type", [
z
.object({
type: z.literal("text"),
text: z.string(),
})
.catchall(z.unknown()),
z.object({
type: z.literal("file"),
filename: z.string(),
mediaType: z.string(),
url: z.string(),
path: z.string().optional(),
}),
]);
/**
 * Payload of an incoming chat message: its id, the chat it belongs to, its
 * parts, a role (defaults to `Role.USER`) and the metadata with options.
 * NOTE(review): `parts` has no minimum length, so an empty array validates.
 */
export const chatMessageSchema = z.object({
id: z.string(),
chatId: z.string(),
parts: z.array(chatMessagePartSchema),
role: z.enum(Role).optional().default(Role.USER),
metadata: chatMessageMetadataSchema,
});
// Types inferred from the schemas above (post-parse shapes, i.e. with defaults applied).
export type ChatMessagePayload = z.infer<typeof chatMessageSchema>;
export type ChatMessagePartPayload = z.infer<typeof chatMessagePartSchema>;
export type ChatMessageOptionsPayload = z.infer<
typeof chatMessageOptionsSchema
>;
export type ChatMessageMetadataPayload = z.infer<
typeof chatMessageMetadataSchema
>;
// API input type aliases
export type ChatMessageInput = ChatMessagePayload;
// Re-export the DB select schemas under shorter names.
export {
selectChatSchema as chatSchema,
selectMessageSchema as messageSchema,
selectPartSchema as partSchema,
} from "@turbostarter/db/schema/chat";

View File

@@ -0,0 +1,27 @@
import { anthropic } from "@ai-sdk/anthropic";
import { deepseek } from "@ai-sdk/deepseek";
import { google } from "@ai-sdk/google";
import { openai } from "@ai-sdk/openai";
import { xai } from "@ai-sdk/xai";
import { customProvider } from "ai";
import { cached } from "../../utils/llm";
import { Model } from "./types";
/**
 * Custom provider that resolves each chat `Model` id to a concrete provider
 * model, each wrapped with `cached`.
 *
 * NOTE(review): some ids resolve to a differently named provider model —
 * `O3` -> "o3-mini", `GROK_3` -> "grok-3-mini-fast",
 * `CLAUDE_4_SONNET` -> "claude-sonnet-4-5", `GPT_5_1` -> "gpt-5.1-chat-latest".
 * Confirm these substitutions are intentional, since the UI names in
 * `MODELS` do not mention them.
 */
export const modelStrategies = customProvider({
languageModels: {
[Model.GPT_5_1]: cached(openai.responses("gpt-5.1-chat-latest")),
[Model.GPT_4O]: cached(openai.responses("gpt-4o")),
[Model.O3]: cached(openai.responses("o3-mini")),
[Model.O4_MINI]: cached(openai.responses("o4-mini")),
[Model.GEMINI_2_5_PRO]: cached(google("gemini-2.5-pro")),
[Model.GEMINI_2_5_FLASH]: cached(google("gemini-2.5-flash")),
[Model.CLAUDE_4_SONNET]: cached(anthropic("claude-sonnet-4-5")),
[Model.CLAUDE_3_7_SONNET]: cached(anthropic("claude-3-7-sonnet-latest")),
[Model.GROK_4]: cached(xai("grok-4")),
[Model.GROK_3]: cached(xai("grok-3-mini-fast")),
[Model.DEEPSEEK_V3]: cached(deepseek("deepseek-chat")),
[Model.DEEPSEEK_R1]: cached(deepseek("deepseek-reasoner")),
},
});

View File

@@ -0,0 +1,11 @@
import { Tool } from "../types";
import { webSearch } from "./search";
import type { InferUITools, UIMessageStreamWriter } from "ai";
/**
 * Builds the tool set for a chat stream. Every tool receives the stream
 * `writer`; the only tool currently registered is web search.
 */
export const toolStrategies = (writer: UIMessageStreamWriter) => {
  const search = webSearch(writer);

  return { [Tool.WEB_SEARCH]: search };
};

/** UI-facing tool types inferred from the tool set above. */
export type ChatTools = InferUITools<ReturnType<typeof toolStrategies>>;

View File

@@ -0,0 +1,233 @@
import { tavily } from "@tavily/core";
import { tool } from "ai";
import * as z from "zod";
import { env } from "../../../env";
import type { TavilyClient } from "@tavily/core";
import type { InferUITool, UIMessageStreamWriter } from "ai";
// The Tavily client is created on first use rather than at import time, so
// a missing API key only fails when a search is actually attempted.
let _client: TavilyClient | null = null;

/**
 * Returns the shared Tavily client, creating it on the first call.
 * Throws when `TAVILY_API_KEY` is not configured.
 */
const getClient = () => {
  if (_client) {
    return _client;
  }

  const apiKey = env.TAVILY_API_KEY;
  if (!apiKey) {
    throw new Error("TAVILY_API_KEY is required for web search");
  }

  _client = tavily({ apiKey });
  return _client;
};
/** Replaces every run of whitespace in `url` with a single "%20". */
const sanitizeUrl = (url: string): string => {
  const segments = url.split(/\s+/);
  return segments.join("%20");
};
/**
 * Checks whether `url` points at an image by sending a HEAD request and
 * inspecting the response.
 *
 * The request is aborted after 5 seconds. The timer is cleared in `finally`
 * so it is also released when `fetch` throws; previously a failed request
 * left the timer pending for the remaining time.
 *
 * @param url URL to probe.
 * @returns true when the response is OK and its content type starts with
 *   "image/"; false otherwise, including on any request error or timeout.
 */
const isValidImageUrl = async (url: string) => {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 5000);

  try {
    const response = await fetch(url, {
      method: "HEAD",
      signal: controller.signal,
    });

    return (
      response.ok &&
      (response.headers.get("content-type")?.startsWith("image/") ?? false)
    );
  } catch {
    return false;
  } finally {
    clearTimeout(timeout);
  }
};
/**
 * Returns the host portion of an http(s) URL — everything between the
 * scheme and the first "/", "?" or "#". Input that does not look like an
 * http(s) URL is returned unchanged.
 */
const extractDomain = (url: string): string => {
  const match = /^https?:\/\/([^/?#]+)(?:[/?#]|$)/i.exec(url);

  if (match === null) {
    return url;
  }

  return match[1] ?? url;
};
/**
 * Normalizes a list of domains or URLs into bare domains for the search
 * request.
 *
 * Each entry is reduced to its domain and trimmed; blank entries are
 * dropped. Previously blank entries were forwarded whenever at least one
 * other entry was non-blank.
 *
 * @param domains Domains or full URLs; may be omitted.
 * @returns The cleaned list, or `undefined` when nothing usable remains.
 */
const processDomains = (domains?: string[]): string[] | undefined => {
  if (!domains || domains.length === 0) return undefined;

  const processedDomains = domains
    .map((domain) => extractDomain(domain).trim())
    .filter((domain) => domain !== "");

  return processedDomains.length > 0 ? processedDomains : undefined;
};
/**
 * Keeps an item only when neither its URL nor its domain has been seen on
 * an earlier kept item, preserving input order. In effect at most one item
 * per domain survives.
 */
const deduplicateByDomainAndUrl = <T extends { url: string }>(
  items: T[],
): T[] => {
  const knownDomains = new Set<string>();
  const knownUrls = new Set<string>();
  const unique: T[] = [];

  for (const item of items) {
    const domain = extractDomain(item.url);

    if (knownUrls.has(item.url) || knownDomains.has(domain)) {
      continue;
    }

    knownUrls.add(item.url);
    knownDomains.add(domain);
    unique.push(item);
  }

  return unique;
};
/**
 * Builds the web-search tool for a chat stream.
 *
 * The tool runs all requested queries in parallel through the Tavily
 * client. For each query it writes `data-query_completion` events to
 * `writer` ("started", then "completed" or "error") and returns the
 * de-duplicated results plus the images that passed validation.
 *
 * NOTE(review): the limits mentioned in the `.describe()` texts (query
 * length, result count, number of queries, defaults) are only hints for the
 * model — none of them is enforced by the zod validators, and `timeRange`
 * and `excludeDomains` are required by the schema despite the "default"
 * wording.
 */
export const webSearch = (writer: UIMessageStreamWriter) =>
tool({
description:
"Search the web for information with multiple queries, max results and time range.",
inputSchema: z.object({
queries: z
.array(
z.object({
q: z
.string()
.describe(
"Search query to look up on the web. At least 5 characters length.",
),
topic: z
.enum(["general", "news"])
.describe("Topic type to search for."),
maxResults: z
.number()
.describe(
"Maximum number of results to return. Up to 10, 3 by default.",
),
}),
)
.describe(
"Array of search queries to look up on the web. At least 2 items, at most 5.",
),
excludeDomains: z
.array(z.string())
.describe(
"A list of domains to exclude from all search results. Default is [] (empty array).",
),
timeRange: z
.enum(["year", "month", "week", "day", "y", "m", "w", "d"])
.describe(
"The time range to search for. Defaults to undefined - all time.",
),
}),
execute: async ({ queries, excludeDomains, timeRange }) => {
try {
const searchPromises = queries.map(async (query, index) => {
try {
// Announce the query before hitting the search API.
writer.write({
type: "data-query_completion",
data: {
query,
index,
total: queries.length,
status: "started",
resultsCount: 0,
imagesCount: 0,
},
});
const data = await getClient().search(query.q, {
topic: query.topic,
// News searches are limited to the last 7 days.
days: query.topic === "news" ? 7 : undefined,
maxResults: query.maxResults,
searchDepth: "basic",
includeAnswer: true,
includeImages: true,
includeImageDescriptions: true,
excludeDomains: processDomains(excludeDomains),
timeRange,
});
// Counts reported here are the raw API counts, taken before
// de-duplication and image validation below.
writer.write({
type: "data-query_completion",
data: {
query,
index,
total: queries.length,
status: "completed",
resultsCount: data.results.length,
imagesCount: data.images.length,
},
});
const results = deduplicateByDomainAndUrl(data.results).map(
(result) => ({
url: result.url,
title: result.title,
content: result.content,
rawContent: result.rawContent,
// The publish date is only kept for news queries.
publishedDate:
query.topic === "news" ? result.publishedDate : undefined,
}),
);
// Probe every candidate image; entries that fail become null.
const images = await Promise.all(
deduplicateByDomainAndUrl(data.images).map(
async ({ url, description }) => {
const sanitizedUrl = sanitizeUrl(url);
return (await isValidImageUrl(sanitizedUrl))
? { url: sanitizedUrl, description: description ?? "" }
: null;
},
),
);
return {
query,
results,
// Drop failed probes and images without a description.
images: images.filter(
(img): img is { url: string; description: string } =>
img !== null && img.description !== "",
),
};
} catch (error) {
// A failing query does not fail the tool call: it is reported as
// "error" and contributes empty results.
console.error(error);
writer.write({
type: "data-query_completion",
data: {
query,
index,
total: queries.length,
status: "error",
resultsCount: 0,
imagesCount: 0,
},
});
return {
query,
results: [],
images: [],
};
}
});
return {
searches: await Promise.all(searchPromises),
};
} catch (error) {
// Fallback for anything thrown outside the per-query handlers.
console.error(error);
return {
searches: [],
};
}
},
});
/** UI tool type inferred from the tool returned by `webSearch`. */
export type WebSearchTool = InferUITool<ReturnType<typeof webSearch>>;
/**
 * Payload of the `data-query_completion` events written by the web-search
 * tool: the query being processed, its position among `total` queries, the
 * current status and the result/image counts.
 */
export interface DataQueryCompletionPart {
query: {
q: string;
topic: string;
maxResults: number;
};
index: number;
total: number;
status: "started" | "completed" | "error";
resultsCount: number;
imagesCount: number;
}

View File

@@ -0,0 +1,59 @@
export type {
SelectChat as Chat,
SelectMessage as Message,
SelectPart as Part,
} from "@turbostarter/db/schema/chat";
import { messageRoleEnum } from "@turbostarter/db/schema/chat";
import type { ChatMessageMetadataPayload } from "./schema";
import type { ChatTools } from "./tools";
import type { DataQueryCompletionPart } from "./tools/search";
import type { EnumToConstant } from "@turbostarter/shared/types";
import type { UIMessage } from "ai";
/**
 * Constant object derived from the DB `messageRoleEnum`: each enum value is
 * exposed under a key made by replacing "-" with "_" and upper-casing it.
 * The cast supplies the precise key/value types that `Object.fromEntries`
 * cannot infer.
 */
export const Role = Object.fromEntries(
messageRoleEnum.enumValues.map((role) => [
role.replace(/-/g, "_").toUpperCase(),
role,
]),
) as EnumToConstant<typeof messageRoleEnum.enumValues>;
/** Union of the role values. */
export type Role = (typeof Role)[keyof typeof Role];
/**
 * Application-level ids of the supported chat models. These are the ids
 * validated in the message options schema and used as keys when resolving
 * the concrete provider model.
 */
export const Model = {
O3: "o3",
O4_MINI: "o4-mini",
GPT_5_1: "gpt-5-1",
GPT_4O: "gpt-4o",
GEMINI_2_5_PRO: "gemini-2-5-pro",
GEMINI_2_5_FLASH: "gemini-2-5-flash",
CLAUDE_4_SONNET: "claude-4-sonnet",
CLAUDE_3_7_SONNET: "claude-3-7-sonnet",
GROK_4: "grok-4",
GROK_3: "grok-3",
DEEPSEEK_V3: "deepseek-v3",
DEEPSEEK_R1: "deepseek-r1",
} as const;
/** Union of the model id strings. */
export type Model = (typeof Model)[keyof typeof Model];
/** Names of the tools that can be offered to a chat model. */
export const Tool = {
WEB_SEARCH: "web-search",
} as const;
/** Union of the tool name strings. */
export type Tool = (typeof Tool)[keyof typeof Tool];
// Custom data parts of a chat message, keyed by part name.
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
export type ChatDataParts = {
query_completion: DataQueryCompletionPart;
};
/** UI message specialised with this module's metadata, data parts and tools. */
export type ChatMessage = UIMessage<
ChatMessageMetadataPayload,
ChatDataParts,
ChatTools
>;
/** A single part of a generic (unspecialised) UI message. */
export type ChatMessagePart = UIMessage["parts"][number];
export type { ChatTools };

View File

@@ -0,0 +1,100 @@
import { openai } from "@ai-sdk/openai";
import { generateObject } from "ai";
import * as z from "zod";
import { Credits } from "../credits/utils";
import { MODELS, PROMPTS } from "./constants";
import type {
ChatMessagePartPayload,
ChatMessageOptionsPayload,
} from "./schema";
import type { Message, Part, ChatMessage, ChatMessagePart } from "./types";
import type { AnthropicProviderOptions } from "@ai-sdk/anthropic";
import type { OpenAIResponsesProviderOptions } from "@ai-sdk/openai";
import type { XaiProviderOptions } from "@ai-sdk/xai";
/**
 * Asks the "gpt-4.1-mini" model for a chat title based on the user's
 * message, using the `CHAT_NAME` prompt, and returns the non-empty name.
 */
export const generateChatName = async (content: string) => {
  const nameSchema = z.object({
    name: z.string().min(1),
  });

  const result = await generateObject({
    model: openai.responses("gpt-4.1-mini"),
    schema: nameSchema,
    system: PROMPTS.CHAT_NAME,
    prompt: `User message: ${content}`,
  });

  return result.object.name;
};
/**
 * Builds the per-provider options for a request.
 *
 * Reasoning is switched on only when the requested model is flagged with
 * `reason` in `MODELS` AND the caller asked for it; each provider block
 * then carries its own reasoning settings.
 */
export const getProviderOptions = (options: ChatMessageOptionsPayload) => {
  const requested = MODELS.find((candidate) => candidate.id === options.model);
  const reasoning = !!requested?.reason && !!options.reason;

  const anthropic = {
    thinking: {
      type: reasoning ? "enabled" : "disabled",
      budgetTokens: 1200,
    },
  } satisfies AnthropicProviderOptions;

  const openai = {
    ...(reasoning
      ? { reasoningEffort: "medium", reasoningSummary: "detailed" }
      : {}),
    textVerbosity: "medium",
  } satisfies OpenAIResponsesProviderOptions;

  const xai = {
    ...(reasoning ? { reasoningEffort: "low" } : {}),
  } satisfies XaiProviderOptions;

  return { anthropic, openai, xai };
};
/**
 * Computes the credit cost of one chat message: a base `DEFAULT` cost, plus
 * `DEFAULT` for web search, plus `DEFAULT` for reasoning (only when the
 * model supports it), plus `DEFAULT` per file attachment.
 */
export const getCreditsDeduction = (
  options: ChatMessageOptionsPayload,
  parts?: ChatMessagePartPayload[],
) => {
  const { DEFAULT, FREE } = Credits.COST;
  const requested = MODELS.find((candidate) => candidate.id === options.model);

  const searchDeduction = options.search ? DEFAULT : FREE;
  const reasoningDeduction =
    options.reason && requested?.reason ? DEFAULT : FREE;

  const attachmentCount =
    parts?.filter((part) => part.type === "file").length ?? 0;
  const attachmentDeduction = attachmentCount * DEFAULT;

  return DEFAULT + searchDeduction + reasoningDeduction + attachmentDeduction;
};
/**
 * Converts a stored part into a UI message part by spreading its `details`
 * next to its `type`. Returns null when `details` is not an object.
 */
export const toChatMessagePart = ({
  type,
  details,
}: Part): ChatMessagePart | null =>
  typeof details === "object" && details !== null
    ? ({ type, ...details } as ChatMessagePart)
    : null;
/**
 * Converts a stored message (optionally carrying its parts) into a
 * `ChatMessage`; parts that cannot be converted are dropped, and a message
 * without parts gets an empty list.
 */
export const toChatMessage = (
  message: Message & {
    parts?: Part[];
  },
): ChatMessage => ({
  ...message,
  parts: (message.parts ?? []).map(toChatMessagePart).filter(Boolean),
});

View File

@@ -0,0 +1,45 @@
import { NodeEnv } from "@turbostarter/shared/constants";
import { env } from "../../env";
// Read once at module load; decides the free-tier allowance below.
const nodeEnv = env.NODE_ENV;
/**
 * Centralized credits configuration.
 * Environment-aware defaults for development vs production.
 *
 * NOTE(review): `COST` here uses different levels/values than `Credits.COST`
 * in `./utils` (imported by the chat module as `../credits/utils`) — confirm
 * which of the two is the source of truth.
 */
export const CreditsConfig = {
/** Credits for new free-tier users */
FREE_TIER: nodeEnv === NodeEnv.DEVELOPMENT ? 10000 : 100,
/** Credits for seed/dev users */
DEV_SEED: 10000,
/** Cost by operation complexity */
COST: {
FREE: 0,
LOW: 1,
MEDIUM: 5,
HIGH: 10,
PREMIUM: 25,
},
/** Feature-specific costs (for audit logging) */
FEATURE_COST: {
chat: 5,
"text-to-speech": 10,
"speech-to-text": 5,
"image-generation": 25,
"pdf-chat": 10,
},
} as const;
/** Names of the cost levels in `CreditsConfig.COST`. */
export type CostLevel = keyof typeof CreditsConfig.COST;
/** Names of the features priced in `CreditsConfig.FEATURE_COST`. */
export type FeatureName = keyof typeof CreditsConfig.FEATURE_COST;
/**
 * Looks up the configured credit cost of a feature.
 */
export const getFeatureCost = (feature: FeatureName): number => {
  const { FEATURE_COST } = CreditsConfig;

  return FEATURE_COST[feature];
};

View File

@@ -0,0 +1,90 @@
import { eq, sql } from "@turbostarter/db";
import {
creditTransaction,
customer,
} from "@turbostarter/db/schema";
import { db } from "@turbostarter/db/server";
import { generateId } from "@turbostarter/shared/utils";
import { CreditsConfig } from "./config";
import { Credits } from "./utils";
/**
 * Returns the credit balance stored on the user's customer record, falling
 * back to `Credits.BALANCE` when there is no record or no stored balance.
 */
export const getUserCredits = async (userId: string) => {
  const record = await db.query.customer.findFirst({
    where: eq(customer.userId, userId),
  });

  if (record?.credits == null) {
    return Credits.BALANCE;
  }

  return record.credits;
};
/** Resolves to the first customer record whose `userId` matches, if any. */
export const getCustomerByUserId = async (userId: string) => {
  const belongsToUser = eq(customer.userId, userId);

  return db.query.customer.findFirst({ where: belongsToUser });
};
/**
 * Subtracts `amount` from the user's credits with a single in-database
 * update (`credits = credits - amount`).
 *
 * NOTE(review): nothing here prevents the balance from going negative —
 * confirm callers check the balance first, or that a DB constraint exists.
 */
export const deductUserCredits = (userId: string, amount: number) => {
  const remaining = sql`${customer.credits} - ${amount}`;

  return db
    .update(customer)
    .set({ credits: remaining })
    .where(eq(customer.userId, userId));
};
/**
 * Adds `amount` to the user's credits with a single in-database update
 * (`credits = credits + amount`).
 */
export const addUserCredits = (userId: string, amount: number) => {
  const increased = sql`${customer.credits} + ${amount}`;

  return db
    .update(customer)
    .set({ credits: increased })
    .where(eq(customer.userId, userId));
};
/**
 * Creates the free-plan customer record for a user and logs the matching
 * "signup" credit transaction, both inside one database transaction.
 * The starting balance is `CreditsConfig.FREE_TIER`.
 *
 * @returns The new customer id and the credits granted.
 */
export const createFreeCustomer = async (userId: string) => {
  const customerRowId = generateId();
  const welcomeCredits = CreditsConfig.FREE_TIER;

  await db.transaction(async (tx) => {
    // 1. The customer row itself.
    await tx.insert(customer).values({
      id: customerRowId,
      userId,
      customerId: `free_${userId}`,
      status: "active",
      plan: "free",
      credits: welcomeCredits,
    });

    // 2. The ledger entry recording the initial grant.
    await tx.insert(creditTransaction).values({
      id: generateId(),
      customerId: customerRowId,
      amount: welcomeCredits,
      type: "signup",
      reason: "Welcome credits for new user",
      balanceAfter: welcomeCredits,
    });
  });

  return { id: customerRowId, credits: welcomeCredits };
};
/**
 * Returns the user's customer record, creating a free one first when none
 * exists.
 *
 * For a newly created customer the returned object is assembled locally
 * (with fresh timestamps) rather than re-read from the database.
 *
 * NOTE(review): the lookup and the insert are separate steps, so two
 * concurrent calls for the same new user could both attempt the insert —
 * confirm whether a unique constraint / caller serialisation covers this.
 */
export const ensureCustomerExists = async (userId: string) => {
  const found = await getCustomerByUserId(userId);

  if (!found) {
    const created = await createFreeCustomer(userId);

    return {
      id: created.id,
      userId,
      customerId: `free_${userId}`,
      status: "active" as const,
      plan: "free" as const,
      credits: created.credits,
      createdAt: new Date(),
      updatedAt: new Date(),
    };
  }

  return found;
};

View File

@@ -0,0 +1,32 @@
/**
 * Credit constants: `BALANCE` is the fallback balance and the default
 * maximum used by the level/progress helpers in this file; `COST` holds the
 * per-operation prices.
 */
export const Credits = {
BALANCE: 100,
COST: {
FREE: 0,
DEFAULT: 5,
HIGH: 10,
},
};
/** Coarse classification of a credit balance. */
export type CreditsLevel = "high" | "medium" | "low";
/** True when the available balance covers the required amount. */
export const hasEnoughCredits = (available: number, required: number) =>
  required <= available;
/**
 * Classifies a balance relative to `max`: above 50% is "high", above 15%
 * is "medium", anything else is "low".
 */
export const getCreditsLevel = (
  credits: number,
  max = Credits.BALANCE,
): CreditsLevel => {
  const percentage = getCreditsProgress(credits, max) * 100;

  if (percentage > 50) return "high";
  if (percentage > 15) return "medium";
  return "low";
};
/** Ratio of the balance to `max` (not clamped). */
export const getCreditsProgress = (credits: number, max = Credits.BALANCE) => {
  return credits / max;
};

View File

@@ -0,0 +1,182 @@
import { generateId, experimental_generateImage as generateImage } from "ai";
import { and, desc, eq, inArray, lt } from "@turbostarter/db";
import { generation, image } from "@turbostarter/db/schema/image";
import { db } from "@turbostarter/db/server";
import { HttpStatusCode } from "@turbostarter/shared/constants";
import { HttpException } from "@turbostarter/shared/utils";
import { getPublicUrl, getUploadUrl } from "@turbostarter/storage/server";
import { MODELS } from "./constants";
import { modelStrategies } from "./strategies";
import type {
InsertGeneration,
InsertImage,
} from "@turbostarter/db/schema/image";
/** Inserts a generation row and resolves to the returned rows. */
export const createGeneration = async (data: InsertGeneration) => {
  const insert = db.insert(generation).values(data);

  return insert.returning();
};
/** Resolves to the first generation whose id equals `id`, if any. */
export const getGeneration = async (id: string) => {
  const matchesId = eq(generation.id, id);

  return db.query["image.generation"].findFirst({ where: matchesId });
};
/** Like `getGeneration`, but also loads the related `image` rows. */
export const getGenerationWithImages = async (id: string) => {
  const matchesId = eq(generation.id, id);

  return db.query["image.generation"].findFirst({
    where: matchesId,
    with: { image: true },
  });
};
/** Applies the partial `data` to the generation whose id equals `id`. */
export const updateGeneration = async (
  id: string,
  data: Partial<InsertGeneration>,
) => {
  const matchesId = eq(generation.id, id);

  return db.update(generation).set(data).where(matchesId);
};
/** Lists the images that belong to the given generation. */
export const getGenerationImages = async (id: string) => {
  const ofGeneration = eq(image.generationId, id);

  return db.query["image.image"].findMany({ where: ofGeneration });
};
/** Deletes every image row that belongs to the given generation. */
export const deleteGenerationImages = async (id: string) => {
  const ofGeneration = eq(image.generationId, id);

  return db.delete(image).where(ofGeneration);
};
/**
 * Inserts image rows and resolves to the inserted rows.
 *
 * An empty `data` array resolves to an empty list without touching the
 * database: the insert builder's `values()` throws when given zero rows,
 * which happens when every upload of a generation failed.
 *
 * @param data Image rows to insert.
 * @returns The inserted rows (empty when `data` is empty).
 */
export const createImages = async (data: InsertImage[]) => {
  if (data.length === 0) {
    return [];
  }

  return db.insert(image).values(data).returning();
};
/**
 * Returns a page of the user's images, newest first, each with its
 * generation.
 *
 * @param userId Owner of the generations whose images are listed.
 * @param limit Page size (default 10).
 * @param cursor When given, only images created strictly before it are
 *   returned.
 */
export const getImages = async ({
  userId,
  limit = 10,
  cursor,
}: {
  userId: string;
  limit?: number;
  cursor?: Date;
}) => {
  // Ids of the user's generations that have at least one image.
  const ownedGenerationIds = db
    .select({ id: generation.id })
    .from(generation)
    .innerJoin(image, eq(generation.id, image.generationId))
    .where(eq(generation.userId, userId));

  const filters = [inArray(image.generationId, ownedGenerationIds)];
  if (cursor) {
    filters.push(lt(image.createdAt, cursor));
  }

  return db.query["image.image"].findMany({
    where: and(...filters),
    orderBy: (t) => desc(t.createdAt),
    with: {
      generation: true,
    },
    limit,
  });
};
/**
 * Prepares a generation for a re-run: removes its images, then stamps a new
 * `createdAt` and clears `completedAt`.
 */
const resetGeneration = async (id: string) => {
  await deleteGenerationImages(id);

  const restarted = { createdAt: new Date(), completedAt: null };
  await updateGeneration(id, restarted);
};
/**
 * Uploads base64-encoded images to storage and records the public URL of
 * every successful upload for the generation.
 *
 * Uploads run in parallel and independently of each other. Compared with
 * the previous version:
 * - the upload response status is checked (`fetch` does not reject on HTTP
 *   errors, so a failed PUT used to be recorded as a valid image);
 * - failed uploads are logged instead of being dropped silently;
 * - the insert is skipped when nothing was uploaded.
 *
 * @param images Base64-encoded image payloads.
 * @param generationId Generation the images belong to.
 */
const saveImages = async ({
  images,
  generationId,
}: {
  images: string[];
  generationId: string;
}) => {
  const results = await Promise.allSettled(
    images.map(async (image) => {
      const path = `images/${generateId()}.png`;
      const { url: uploadUrl } = await getUploadUrl({
        path,
      });

      const response = await fetch(uploadUrl, {
        method: "PUT",
        body: Buffer.from(image, "base64"),
      });
      if (!response.ok) {
        throw new Error(
          `Upload of "${path}" failed: ${response.status} ${response.statusText}`,
        );
      }

      const { url } = await getPublicUrl({
        path,
      });
      return url;
    }),
  );

  const uploaded: InsertImage[] = [];
  for (const result of results) {
    if (result.status === "fulfilled") {
      uploaded.push({ url: result.value, generationId });
    } else {
      console.error(
        `Failed to store an image for generation ${generationId}:`,
        result.reason,
      );
    }
  }

  if (uploaded.length === 0) {
    return;
  }

  await createImages(uploaded);
};
/**
 * Runs the image generation described by the stored generation record and
 * returns the generated images as base64 strings.
 *
 * Steps: load the generation (with images), resolve its model and
 * dimension, reset it when it already has images, call the image model,
 * start persisting the images in the background and mark the generation as
 * completed.
 *
 * Changes from the previous version:
 * - the result is read through the public `base64` accessor of the
 *   generated files instead of a private field reached via a double cast;
 * - background persistence failures are caught and logged (they used to be
 *   unhandled rejections);
 * - the abort handler is registered with `addEventListener` so it no longer
 *   overwrites an `onabort` handler set by the caller, and its own failures
 *   are logged.
 *
 * @param id Id of the generation to run.
 * @param abortSignal Optional signal; on abort the generation is marked as
 *   completed.
 * @returns Base64 payloads of the generated images.
 * @throws HttpException (NOT_FOUND) when the generation, its model or its
 *   aspect ratio cannot be resolved.
 */
export const generateImages = async ({
  id,
  abortSignal,
}: {
  id: string;
  abortSignal?: AbortSignal;
}) => {
  const generation = await getGenerationWithImages(id);
  const model = MODELS.find((m) => m.id === generation?.model);
  const dimension = model?.dimensions.find(
    (d) => d.id === generation?.aspectRatio,
  );

  if (!generation || !model || !dimension) {
    throw new HttpException(HttpStatusCode.NOT_FOUND);
  }

  // A generation that already has images is being re-run.
  if (generation.image.length) {
    await resetGeneration(generation.id);
  }

  if (abortSignal) {
    abortSignal.addEventListener(
      "abort",
      () => {
        void updateGeneration(generation.id, {
          completedAt: new Date(),
        }).catch((error: unknown) => {
          console.error(
            `Failed to mark aborted generation ${generation.id} as completed:`,
            error,
          );
        });
      },
      { once: true },
    );
  }

  const { images, warnings } = await generateImage({
    model: modelStrategies.imageModel(generation.model),
    prompt: generation.prompt,
    ...(model.dimensionFormat === "size"
      ? { size: dimension.value as `${number}x${number}` }
      : { aspectRatio: dimension.value as `${number}:${number}` }),
    // A random seed is sent to every provider except OpenAI.
    ...(model.provider !== "openai" && {
      seed: Math.floor(Math.random() * 1000000),
    }),
    n: generation.count,
    abortSignal,
  });

  if (warnings.length) {
    console.warn(warnings);
  }

  const encoded = images.map((image) => image.base64);

  // Persisting is deliberately not awaited so the caller gets the images
  // right away.
  void saveImages({
    images: encoded,
    generationId: generation.id,
  }).catch((error: unknown) => {
    console.error(
      `Failed to save images for generation ${generation.id}:`,
      error,
    );
  });

  await updateGeneration(generation.id, {
    completedAt: new Date(),
  });

  return encoded;
};

View File

@@ -0,0 +1,138 @@
import { Provider } from "../../types";
import { AspectRatio, Model } from "./types";
/** The single dimension offered by the models that take a pixel `size`. */
const SQUARE_SIZE_DIMENSIONS = [
  {
    id: AspectRatio.SQUARE,
    value: "1024x1024",
  },
] as const;

/** Dimensions offered by the models that take an `aspectRatio`. */
const ASPECT_RATIO_DIMENSIONS = [
  {
    id: AspectRatio.SQUARE,
    value: "1:1",
  },
  {
    id: AspectRatio.STANDARD,
    value: "4:3",
  },
  {
    id: AspectRatio.LANDSCAPE,
    value: "16:9",
  },
  {
    id: AspectRatio.PORTRAIT,
    value: "9:16",
  },
] as const;

/**
 * Catalogue of the image models. `dimensionFormat` tells how a dimension's
 * `value` has to be passed to the model ("size" as WIDTHxHEIGHT,
 * "aspectRatio" as W:H); `dimensions` lists the choices per model.
 */
export const MODELS = [
  {
    id: Model.GPT_IMAGE_1,
    provider: Provider.OPENAI,
    name: "GPT Image 1",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_DIMENSIONS,
  },
  {
    id: Model.DALL_E_2,
    provider: Provider.OPENAI,
    name: "DALL-E 2",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_DIMENSIONS,
  },
  {
    id: Model.DALL_E_3,
    provider: Provider.OPENAI,
    name: "DALL-E 3",
    dimensionFormat: "size",
    dimensions: SQUARE_SIZE_DIMENSIONS,
  },
  {
    id: Model.RECRAFT_V3,
    provider: Provider.RECRAFT,
    name: "Recraft v3",
    dimensionFormat: "aspectRatio",
    dimensions: ASPECT_RATIO_DIMENSIONS,
  },
  {
    id: Model.PHOTON,
    provider: Provider.LUMA,
    name: "Photon",
    dimensionFormat: "aspectRatio",
    dimensions: ASPECT_RATIO_DIMENSIONS,
  },
  {
    id: Model.STABLE_DIFFUSION_3_5_LARGE,
    provider: Provider.STABILITY_AI,
    name: "Stable Diffusion 3.5 Large",
    dimensionFormat: "aspectRatio",
    dimensions: ASPECT_RATIO_DIMENSIONS,
  },
  {
    id: Model.STABLE_DIFFUSION_3_5_MEDIUM,
    provider: Provider.STABILITY_AI,
    name: "Stable Diffusion 3.5 Medium",
    dimensionFormat: "aspectRatio",
    dimensions: ASPECT_RATIO_DIMENSIONS,
  },
] as const;

View File

@@ -0,0 +1,28 @@
import * as z from "zod";
import { AspectRatio } from "./types";
/**
 * Validation schema for the tunable options of an image generation request.
 *
 * - `aspectRatio`: one of the `AspectRatio` constant values.
 * - `model`: model identifier (free-form string at this layer).
 * - `count`: number of images to generate; must be a whole number between
 *   1 and 5 inclusive. `.int()` rejects fractional counts such as 2.5,
 *   which cannot be honoured by an image API.
 */
export const imageGenerationOptionsSchema = z.object({
  aspectRatio: z.enum(AspectRatio),
  model: z.string(),
  count: z.number().int().min(1).max(5),
});
/**
 * Validation schema for a full image generation request:
 * an optional `id`, a non-empty `prompt` of at most 5000 characters,
 * and the generation `options`.
 */
export const imageGenerationSchema = z.object({
id: z.string().optional(),
prompt: z.string().min(1).max(5000),
options: imageGenerationOptionsSchema,
});
export type ImageGenerationOptionsPayload = z.infer<
typeof imageGenerationOptionsSchema
>;
export type ImageGenerationPayload = z.infer<typeof imageGenerationSchema>;
// API input type aliases
export type ImageGenerationInput = ImageGenerationPayload;
export {
selectGenerationSchema as generationSchema,
selectImageSchema as imageSchema,
} from "@turbostarter/db/schema/image";

View File

@@ -0,0 +1,46 @@
import { openai } from "@ai-sdk/openai";
import { customProvider } from "ai";
import { Model } from "./types";
import type { ImageModel } from "ai";
/**
 * Resolves a Replicate image model by its identifier.
 *
 * The `@ai-sdk/replicate` package is loaded on demand (at call time rather
 * than at module import) so that merely importing this file does not fail
 * when REPLICATE_API_TOKEN is not set.
 */
const getReplicateModel = (model: string): ImageModel => {
  // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/consistent-type-imports
  const replicateModule = require("@ai-sdk/replicate") as typeof import("@ai-sdk/replicate");
  return replicateModule.replicate.image(model);
};
/**
 * Reports whether a non-empty REPLICATE_API_TOKEN is present in the
 * process environment. Returns false when no `process` global exists
 * or when reading the environment throws.
 */
const hasReplicateToken = (): boolean => {
  try {
    if (typeof globalThis.process === "undefined") {
      return false;
    }
    return Boolean(globalThis.process.env.REPLICATE_API_TOKEN);
  } catch {
    return false;
  }
};
/**
 * Provider registry mapping the internal OpenAI `Model` ids to concrete
 * OpenAI image models.
 */
export const modelStrategies = customProvider({
imageModels: {
// NOTE(review): the internal id is "gpt-image-1" but the backing model is
// "gpt-image-1-mini" — confirm this substitution is intentional.
[Model.GPT_IMAGE_1]: openai.image("gpt-image-1-mini"),
[Model.DALL_E_2]: openai.image("dall-e-2"),
[Model.DALL_E_3]: openai.image("dall-e-3"),
},
});
/** Builds the provider registry for every Replicate-hosted image model. */
const createReplicateProvider = () =>
  customProvider({
    imageModels: {
      [Model.RECRAFT_V3]: getReplicateModel("recraft-ai/recraft-v3"),
      [Model.PHOTON]: getReplicateModel("luma/photon"),
      [Model.STABLE_DIFFUSION_3_5_LARGE]: getReplicateModel(
        "stability-ai/stable-diffusion-3.5-large",
      ),
      [Model.STABLE_DIFFUSION_3_5_MEDIUM]: getReplicateModel(
        "stability-ai/stable-diffusion-3.5-medium",
      ),
    },
  });

/**
 * Replicate-backed image models. Evaluated once at module load:
 * a provider when REPLICATE_API_TOKEN is set, otherwise `null`.
 */
export const replicateModelStrategies = hasReplicateToken()
  ? createReplicateProvider()
  : null;

View File

@@ -0,0 +1,29 @@
export {
type SelectGeneration as Generation,
type SelectImage as Image,
} from "@turbostarter/db/schema/image";
import { aspectRatioEnum } from "@turbostarter/db/schema/image";
import type { EnumToConstant } from "@turbostarter/shared/types";
/**
 * Internal identifiers of the supported image models, as a const object
 * (used in place of a TypeScript `enum`).
 */
export const Model = {
GPT_IMAGE_1: "gpt-image-1",
DALL_E_2: "dall-e-2",
DALL_E_3: "dall-e-3",
RECRAFT_V3: "recraft-v3",
PHOTON: "photon",
STABLE_DIFFUSION_3_5_LARGE: "stable-diffusion-3-5-large",
STABLE_DIFFUSION_3_5_MEDIUM: "stable-diffusion-3-5-medium",
} as const;
/** Union of every model identifier string in `Model`. */
export type Model = (typeof Model)[keyof typeof Model];
/** Converts an enum value to its constant key: dashes become underscores, then upper-cased. */
const toConstantKey = (value: string) => value.replace(/-/g, "_").toUpperCase();

/**
 * Constant object derived from the database `aspectRatioEnum`: every enum
 * value is exposed under its upper-cased, underscore-separated key.
 */
export const AspectRatio = Object.fromEntries(
  aspectRatioEnum.enumValues.map((enumValue) => [
    toConstantKey(enumValue),
    enumValue,
  ]),
) as EnumToConstant<typeof aspectRatioEnum.enumValues>;
/** Union of every aspect ratio value in `AspectRatio`. */
export type AspectRatio = (typeof AspectRatio)[keyof typeof AspectRatio];

View File

@@ -0,0 +1,564 @@
import {
convertToModelMessages,
generateId,
smoothStream,
stepCountIs,
streamText,
tool,
} from "ai";
import * as z from "zod";
import { eq, sql } from "@turbostarter/db";
import {
pdfChat,
pdfDocument,
pdfEmbedding,
pdfMessage,
} from "@turbostarter/db/schema/pdf";
import { db } from "@turbostarter/db/server";
import { generateId as generateCitationId } from "@turbostarter/shared/utils";
import { getDeleteUrl } from "@turbostarter/storage/server";
import { repairToolCall } from "../../utils/llm";
import { PROMPTS } from "./constants";
import { findRelevantContent, generateDocumentEmbeddings } from "./embeddings";
import { modelStrategies } from "./strategies";
import { Role } from "./types";
import type { PdfMessagePayload } from "./schema";
import type { Citation, CitationResponse, PreciseCitation } from "./types";
import type {
InsertPdfChat,
InsertPdfDocument,
InsertPdfMessage,
} from "@turbostarter/db/schema/pdf";
/**
 * Persists the processing state of a PDF document.
 *
 * @param documentId - id of the `pdfDocument` row to update
 * @param status - new processing status
 * @param error - optional error message; stored as `null` when omitted
 */
const updateDocumentStatus = async (
  documentId: string,
  status: "pending" | "processing" | "ready" | "failed",
  error?: string,
) => {
  const statusPatch = {
    processingStatus: status,
    processingError: error ?? null,
  };
  await db
    .update(pdfDocument)
    .set(statusPatch)
    .where(eq(pdfDocument.id, documentId));
};
/**
 * Inserts a PDF document row and starts embedding generation in the background.
 *
 * The returned promise resolves as soon as the row is inserted; embedding
 * generation runs detached and reports its progress through the document's
 * processing status ("processing" → "ready" or "failed").
 *
 * @param data - document row to insert
 * @returns the inserted row, or `null` when the insert returned nothing
 */
const createDocument = async (data: InsertPdfDocument) => {
  const [documentData] = await db.insert(pdfDocument).values(data).returning();
  if (!documentData) {
    return null;
  }
  // Process with legacy embeddings (simple, reliable, production-ready).
  // Detached on purpose: the caller must not wait for embedding generation.
  void (async () => {
    try {
      // Set status to processing
      await updateDocumentStatus(documentData.id, "processing");
      // Generate embeddings for the document
      console.log(`[api] Generating embeddings for document ${documentData.id}`);
      const chunks = await generateDocumentEmbeddings(documentData.path);
      console.log(`[api] Generated ${chunks.length} embedding chunks`);
      // Insert embeddings into database
      if (chunks.length > 0) {
        await db.insert(pdfEmbedding).values(
          chunks.map((chunk) => ({
            content: chunk.content,
            documentId: documentData.id,
            embedding: chunk.embedding,
            pageNumber: chunk.metadata.pageNumber,
            charStart: chunk.metadata.charStart,
            charEnd: chunk.metadata.charEnd,
            sectionTitle: chunk.metadata.sectionTitle,
          })),
        );
      }
      console.log(`[api] Embedding processing complete: ${chunks.length} chunks stored`);
      // Set status to ready
      await updateDocumentStatus(documentData.id, "ready");
    } catch (error) {
      console.error(`[api] Failed to process PDF:`, error);
      // Recording the failure can itself fail (e.g. the database is the
      // thing that is down). Nothing awaits this detached task, so a throw
      // here would surface as an unhandled promise rejection — guard it.
      try {
        await updateDocumentStatus(
          documentData.id,
          "failed",
          error instanceof Error ? error.message : "Unknown error",
        );
      } catch (statusError) {
        console.error(
          `[api] Failed to mark document ${documentData.id} as failed:`,
          statusError,
        );
      }
    }
  })();
  return documentData;
};
/**
 * Removes a stored file: obtains a delete URL for `path` from the storage
 * layer and issues an HTTP DELETE against it. The HTTP response is not inspected.
 */
const deleteDocument = async (path: string) => {
  const deleteTarget = await getDeleteUrl({ path });
  await fetch(deleteTarget.url, { method: "DELETE" });
};
/**
 * Creates (or, on id conflict, updates) a PDF chat and then registers its
 * document via `createDocument`, linking it to the chat id.
 *
 * @param data - combined chat and document fields
 * @returns the chat row, or `null` when the upsert returned nothing
 */
export const createChat = async (
  data: InsertPdfChat & Omit<InsertPdfDocument, "chatId">,
) => {
  const upsertedChats = await db
    .insert(pdfChat)
    .values(data)
    .returning()
    .onConflictDoUpdate({
      target: pdfChat.id,
      set: data,
    });
  const chatData = upsertedChats[0];
  if (!chatData) {
    return null;
  }

  const documentInput = { ...data, chatId: chatData.id };
  await createDocument(documentInput);
  return chatData;
};
/**
 * Upserts a single PDF chat message: inserts `data`, or overwrites the
 * existing row with `data` when a message with the same id already exists.
 */
export const createMessage = async (data: InsertPdfMessage) =>
db.insert(pdfMessage).values(data).onConflictDoUpdate({
target: pdfMessage.id,
set: data,
});
/**
 * Bulk-inserts PDF chat messages; rows that conflict with existing ones are
 * skipped rather than updated.
 */
export const createMessages = async (data: InsertPdfMessage[]) =>
db.insert(pdfMessage).values(data).onConflictDoNothing();
/** Fetches the first PDF chat whose id equals `id`. */
export const getChat = async (id: string) =>
db.query["pdf.pdfChat"].findFirst({
where: eq(pdfChat.id, id),
});
/**
 * Deletes a PDF chat and, in the background, the stored files of its documents.
 *
 * The chat's documents are read before the chat row is deleted so their
 * storage paths are still available afterwards. File removal is
 * fire-and-forget: individual failures are not reported.
 *
 * @returns the deleted chat row, or `undefined` when no chat matched
 */
export const deleteChat = async (id: string) => {
  const documents = await getChatDocuments(id);
  const deletedRows = await db
    .delete(pdfChat)
    .where(eq(pdfChat.id, id))
    .returning();
  const deleted = deletedRows[0];
  if (!deleted) {
    return;
  }

  const fileRemovals = documents.map(({ path }) => deleteDocument(path));
  void Promise.allSettled(fileRemovals);
  return deleted;
};
/** Lists the PDF chats owned by `userId`, newest first (by `createdAt`). */
export const getUserChats = async (userId: string) =>
db.query["pdf.pdfChat"].findMany({
where: eq(pdfChat.userId, userId),
orderBy: (chat, { desc }) => [desc(chat.createdAt)],
});
/** Lists the messages of chat `id` in chronological order (oldest first). */
export const getChatMessages = async (id: string) =>
  db.query["pdf.pdfMessage"].findMany({
    where: eq(pdfMessage.chatId, id),
    orderBy: (message, { asc }) => [asc(message.createdAt)],
  });
/** Lists the documents attached to chat `id`, oldest first (by `createdAt`). */
export const getChatDocuments = async (id: string) =>
db.query["pdf.pdfDocument"].findMany({
where: eq(pdfDocument.chatId, id),
orderBy: (document, { asc }) => [asc(document.createdAt)],
});
/** Fetches the first PDF document whose id equals `id`. */
export const getDocument = async (id: string) =>
db.query["pdf.pdfDocument"].findFirst({
where: eq(pdfDocument.id, id),
});
// ============================================================================
// Hybrid Search (legacy embeddings + keyword fallback)
// ============================================================================
/**
 * Unified search result for tool responses
 */
interface UnifiedSearchResult {
/** Identifier of the matched chunk; this is the ID the model cites. */
id: string;
/** Text of the matched chunk. */
content: string;
/** Page the chunk belongs to (the keyword path falls back to 1 when unknown). */
pageNumber: number;
/** Relevance score; keyword matches are assigned a fixed 0.95. */
similarity: number;
/** Source type: 'legacy' for embeddings, 'keyword' for text search */
source: "legacy" | "keyword";
}
/**
 * Pulls exact-match identifiers out of a query so they can be searched as
 * plain text. Embeddings are weak for legal references, codes, and specific
 * numbers, which is why these are handled separately.
 *
 * @param query - the user's search query
 * @returns the distinct identifiers found, in order of first appearance per pattern
 */
function extractSearchKeywords(query: string): string[] {
  const identifierPatterns = [
    /\d+\/\d{4}/g, // Legal references like 35/2024
    /\b[A-Z]{2,}[-/]?\d+/g, // Codes like TDF/379
  ];
  const allMatches = identifierPatterns.flatMap(
    (pattern) => query.match(pattern) ?? [],
  );
  return Array.from(new Set(allMatches));
}
/**
 * Keyword search fallback for specific identifiers that embeddings miss.
 *
 * Extracts identifiers from `query` and returns chunks of the given document
 * whose content contains ANY of them (case-insensitive substring match).
 *
 * @param query - the user's search query
 * @param documentId - document whose chunks are searched
 * @param limit - maximum number of rows to return (default 4)
 * @returns matching chunks tagged as `keyword` results; empty when the query
 *   contains no identifiers
 */
async function keywordSearchFallback(
  query: string,
  documentId: string,
  limit = 4,
): Promise<UnifiedSearchResult[]> {
  const keywords = extractSearchKeywords(query);
  if (keywords.length === 0) return [];
  console.log(`[hybridSearch] Running keyword fallback for: ${keywords.join(", ")}`);
  // Search for any of the keywords: one ILIKE condition per keyword, OR-ed
  // together. (Concatenating the patterns into a single "%a%%%b%" string would
  // instead require every keyword to appear, in order.)
  // Keywords only contain letters, digits, "-" and "/" (see the extraction
  // patterns), so no LIKE wildcard escaping is needed; values are still bound
  // as parameters.
  const anyKeywordMatches = sql.join(
    keywords.map((keyword) => sql`content ILIKE ${`%${keyword}%`}`),
    sql` OR `,
  );
  const results = await db.execute<{
    id: string;
    content: string;
    page_number: number | null;
  }>(sql`
    SELECT id, content, page_number
    FROM pdf.embedding
    WHERE document_id = ${documentId}
      AND (${anyKeywordMatches})
    LIMIT ${limit}
  `);
  // NOTE(review): assumes the driver returns an array of rows; a driver that
  // returns `{ rows }` would yield no matches here — confirm against the db setup.
  const rows = Array.isArray(results) ? results : [];
  console.log(`[hybridSearch] Keyword fallback found ${rows.length} matches`);
  return rows.map((row) => ({
    id: row.id,
    content: row.content,
    pageNumber: row.page_number ?? 1,
    similarity: 0.95, // High score for exact keyword matches
    source: "keyword" as const,
  }));
}
/**
 * Hybrid search: semantic (embedding) results, plus exact keyword matches
 * whenever the query contains specific identifiers such as legal references
 * or codes.
 *
 * Keyword matches not already present among the semantic results are placed
 * first; the merged list is capped at `limit`.
 *
 * @param query - the user's search query
 * @param documentId - document to search in
 * @param limit - maximum number of results (default 6)
 */
async function hybridSearch(
  query: string,
  documentId: string,
  limit = 6,
): Promise<UnifiedSearchResult[]> {
  console.log(`[hybridSearch] Searching for: "${query}" in document ${documentId}`);

  // Semantic search using legacy embeddings
  const semanticMatches = await findRelevantContent(query, documentId);
  const semanticResults: UnifiedSearchResult[] = semanticMatches
    .slice(0, limit)
    .map((match) => ({
      id: match.id,
      content: match.name,
      pageNumber: match.pageNumber,
      similarity: match.similarity,
      source: "legacy" as const,
    }));
  console.log(`[hybridSearch] Semantic search found ${semanticResults.length} results`);

  // Embeddings are weak for specific identifiers, so the keyword pass always
  // runs when the query contains any.
  if (extractSearchKeywords(query).length === 0) {
    return semanticResults;
  }
  const keywordResults = await keywordSearchFallback(query, documentId, 4);
  if (keywordResults.length === 0) {
    return semanticResults;
  }

  // Keyword matches go first (higher priority), skipping chunks the semantic
  // search already returned.
  const semanticIds = new Set(semanticResults.map((result) => result.id));
  const newKeywordResults = keywordResults.filter(
    (keywordResult) => !semanticIds.has(keywordResult.id),
  );
  const merged = [...newKeywordResults, ...semanticResults].slice(0, limit);
  console.log(`[hybridSearch] Added ${newKeywordResults.length} keyword matches, total: ${merged.length}`);
  return merged;
}
// Create highlight tool for precise text citations.
// Returns an object with a single `highlightText` tool. The tool performs no
// lookup: it wraps the model-supplied quote, page and optional relevance note
// in a PreciseCitation with a freshly generated id and the current timestamp.
const createHighlightTool = () => ({
highlightText: tool({
description: `Highlight a specific phrase from the PDF document to support your answer.
Use this tool for EACH fact you cite. The text must be an EXACT quote from the document.
Keep highlights short (10-100 characters) - single sentences or key phrases only.`,
inputSchema: z.object({
// NOTE(review): the description asks for 10-100 characters while the schema
// accepts up to 200 — confirm the extra headroom is intentional.
text: z.string().min(10).max(200).describe("Exact phrase from the document to highlight"),
page: z.number().int().positive().describe("Page number where text appears (1-indexed)"),
relevance: z.string().optional().describe("Brief note on why this supports your answer"),
}),
execute: ({ text, page, relevance }) => {
const citationId = generateCitationId();
const citation: PreciseCitation = {
citationId,
text,
page,
// A missing relevance note is normalised to null.
relevance: relevance ?? null,
timestamp: Date.now(),
};
return citation;
},
}),
});
/**
 * Builds the tool set offered to the model: `findRelevantContent` (document
 * search) and `highlightText` (precise citations).
 *
 * @param documentIds - when provided and non-empty, search is restricted to
 *   these documents; otherwise the search runs across all documents
 *   (legacy behavior)
 */
const createTools = (documentIds?: string[]) => {
  console.log(`🛠️ createTools called with documentIds:`, documentIds);

  // Maximum number of sources handed back to the model per search.
  const maxResults = 6;
  const citationInstructions =
    "IMPORTANT: Cite each source using [[cite:ID:PAGE]] format where ID is the source's id and PAGE is pageNumber.";

  const searchTool = {
    findRelevantContent: tool({
      description: `Get information from the PDF document to answer questions. Returns sources with IDs and page numbers that you MUST cite using [[cite:ID:PAGE]] format.`,
      inputSchema: z.object({
        query: z
          .string()
          .describe("The user's query to find relevant information for."),
      }),
      execute: async ({ query }) => {
        console.log(`🛠️ Tool execute called with query: "${query}"`);
        // If we have specific documents, search in each and combine results
        if (documentIds && documentIds.length > 0) {
          console.log(`🛠️ Searching in ${documentIds.length} documents:`, documentIds);
          const perDocument = await Promise.all(
            documentIds.map((docId) => hybridSearch(query, docId, maxResults)),
          );
          const merged = perDocument.flat();
          // With several documents, rank across all of them before truncating;
          // otherwise the first document alone fills every slot and later
          // documents can never contribute. A single document keeps the order
          // produced by hybridSearch. (Array#sort is stable, so ties keep
          // their per-document order.)
          if (documentIds.length > 1) {
            merged.sort((a, b) => b.similarity - a.similarity);
          }
          const combined = merged.slice(0, maxResults);
          console.log(`🛠️ Combined results:`, combined.length);
          // Return formatted results with citation instructions
          return {
            results: combined,
            citationInstructions,
          };
        }
        // No specific documents - search across all (legacy behavior)
        const results = await findRelevantContent(query);
        return {
          results: results.map((r) => ({
            id: r.id,
            content: r.name,
            pageNumber: r.pageNumber,
            similarity: r.similarity,
            source: "legacy" as const,
          })),
          citationInstructions,
        };
      },
    }),
  };
  const highlightTool = createHighlightTool();
  return { ...searchTool, ...highlightTool };
};
// Legacy export for backwards compatibility.
// Built without document ids, so its search tool runs across all documents.
export const tools = createTools();
// ============================================================================
// Citation Parsing
// ============================================================================
/**
 * Regular expression to match citation markers: [[cite:embeddingId:pageNum]]
 * Captures: embeddingId, pageNum
 *
 * The id may contain letters, digits, "-" and "_" so that UUID- and
 * nanoid-style identifiers are recognised as well as plain alphanumeric ones.
 */
const CITATION_REGEX = /\[\[cite:([a-zA-Z0-9_-]+):(\d+)\]\]/g;
/**
* Common search result interface for citation parsing
* Works with both legacy EmbeddingSearchResult and new UnifiedSearchResult
*/
interface CitableSearchResult {
id: string;
content?: string; // New format
name?: string; // Legacy format
similarity: number;
pageNumber: number;
}
/**
 * Rewrites `[[cite:id:page]]` markers in an AI response as numbered
 * references (`[1]`, `[2]`, …) and collects one citation entry per distinct id.
 *
 * Numbers are assigned in order of first appearance; repeating an id reuses
 * its number. Details for each citation come from the matching search result
 * when there is one; otherwise the page parsed from the marker is used and
 * the excerpt is a page placeholder.
 *
 * @param content - Raw AI response with [[cite:id:page]] markers
 * @param searchResults - Array of search results (legacy or unified format)
 * @returns CitationResponse with parsed content and citation array
 *
 * @example
 * ```typescript
 * const response = parseCitations(
 *   "The document states X [[cite:abc123:5]] and Y [[cite:def456:8]].",
 *   searchResults
 * );
 * // response.content = "The document states X [1] and Y [2]."
 * // response.citations = [{ index: 1, embeddingId: "abc123", ... }, ...]
 * ```
 */
export function parseCitations(
  content: string,
  searchResults: CitableSearchResult[]
): CitationResponse {
  const resultsById = new Map(
    searchResults.map((r) => [r.id, r])
  );
  const indexById = new Map<string, number>(); // id -> citation number
  const citations: Citation[] = [];

  const parsedContent = content.replace(
    CITATION_REGEX,
    (_marker, resultId: string, pageNumStr: string) => {
      // A repeated id keeps the number it was first given.
      const knownIndex = indexById.get(resultId);
      if (knownIndex !== undefined) {
        return `[${knownIndex}]`;
      }

      const index = citations.length + 1;
      indexById.set(resultId, index);

      const pageNumber = parseInt(pageNumStr, 10);
      const match = resultsById.get(resultId);
      const sourceText = match?.content ?? match?.name ?? "";

      // Excerpt: first 150 characters, with an ellipsis when truncated.
      let excerpt = `[Content from page ${pageNumber}]`;
      if (sourceText) {
        const ellipsis = sourceText.length > 150 ? "..." : "";
        excerpt = sourceText.substring(0, 150) + ellipsis;
      }

      citations.push({
        index,
        embeddingId: resultId, // Keep field name for API compatibility
        relevance: match?.similarity ?? 0,
        pageNumber: match?.pageNumber ?? pageNumber,
        excerpt,
      });
      return `[${index}]`;
    },
  );

  return { content: parsedContent, citations };
}
/**
 * Renders search results as a context block for the AI, one numbered section
 * per result, each ending with the exact marker to use when citing it.
 * Accepts both legacy results (`name`) and unified results (`content`).
 *
 * @param results - Array of search results (legacy or unified format)
 * @returns Formatted string with citation instructions per result, or a
 *   fixed "no content" message when `results` is empty
 */
export function formatEmbeddingsForCitation(results: CitableSearchResult[]): string {
  if (results.length === 0) {
    return "No relevant content found in the document.";
  }

  const sections = results.map((result, position) => {
    const body = result.content ?? result.name ?? "[No content]";
    const relevancePercent = (result.similarity * 100).toFixed(1);
    const lines = [
      `[Source ${position + 1}]`,
      `ID: ${result.id}`,
      `Page: ${result.pageNumber}`,
      `Relevance: ${relevancePercent}%`,
      `Content: ${body}`,
      "---",
      `To cite this source, use: [[cite:${result.id}:${result.pageNumber}]]`,
    ];
    return lines.join("\n");
  });

  return sections.join("\n\n");
}
/**
 * Persists the incoming user message, streams the model's answer with the
 * document search and highlight tools enabled, and stores the assistant's
 * text once the stream finishes.
 *
 * @param chatId - chat the message belongs to
 * @param signal - aborts the model call when triggered
 * @param documentIds - optional documents to restrict the search to
 * @param message - remaining payload fields: the user message itself
 * @returns a streaming HTTP response in UI-message-stream format
 */
export const streamChatWithDocuments = async ({
  chatId,
  signal,
  documentIds,
  ...message
}: PdfMessagePayload & { signal: AbortSignal; chatId: string; documentIds?: string[] }) => {
  console.log(`📨 streamChatWithDocuments - chatId: ${chatId}, documentIds:`, documentIds);
  await createMessage({ ...message, chatId });
  const messages = await getChatMessages(chatId);

  const toTextParts = (text: string) => [{ type: "text" as const, text }];
  const history = messages.map((m) => ({
    ...m,
    parts: toTextParts(m.content),
  }));

  // The message was persisted just above, so the stored history normally
  // already ends with it. Appending it unconditionally would send the latest
  // user turn to the model twice; only append when it is not already last.
  const lastStored = messages[messages.length - 1];
  const incomingAlreadyLast =
    lastStored !== undefined &&
    lastStored.role === message.role &&
    lastStored.content === message.content;
  const conversation = incomingAlreadyLast
    ? history
    : [...history, { ...message, parts: toTextParts(message.content) }];

  const result = streamText({
    // Use uncached model - tools need fresh execution, not cached responses
    model: modelStrategies.languageModel("uncached"),
    messages: convertToModelMessages(conversation),
    system: PROMPTS.SYSTEM,
    stopWhen: stepCountIs(6),
    abortSignal: signal,
    tools: createTools(documentIds),
    experimental_transform: smoothStream({
      chunking: "word",
      delayInMs: 15,
    }),
    experimental_repairToolCall: repairToolCall,
    onError: (error) => {
      console.error(error);
    },
  });
  // Drain the stream independently of the HTTP response so that onFinish
  // still runs (and the answer is saved) if the client disconnects.
  void result.consumeStream();
  return result.toUIMessageStreamResponse({
    onFinish: async ({ responseMessage }) => {
      await createMessage({
        id: responseMessage.id || generateId(),
        chatId,
        // Only text parts are stored; tool calls/results are dropped.
        content: responseMessage.parts
          .filter((part) => part.type === "text")
          .map((part) => part.text)
          .join("\n"),
        role: Role.ASSISTANT,
      });
    },
    headers: {
      "Content-Type": "application/octet-stream",
      "Content-Encoding": "none",
    },
  });
};
// Re-export PreciseCitation type for consumers
export type { PreciseCitation } from "./types";

View File

@@ -0,0 +1,335 @@
/**
* Unit tests for dual-resolution chunking (WF-0028)
*/
import { describe, expect, it } from "vitest";
import {
createDualResolutionChunks,
getChunkingStats,
validateChunks,
DEFAULT_CHUNKING_CONFIG,
} from "./chunking";
import type { LayoutElement, DualResolutionChunks } from "./chunking";
// ============================================================================
// Test Fixtures
// ============================================================================
/**
 * Builds a LayoutElement fixture: a prose paragraph on page 1 with default
 * offsets and bounding box, with any field replaceable through `overrides`.
 */
function createTestElement(overrides: Partial<LayoutElement> = {}): LayoutElement {
  const defaults: LayoutElement = {
    content: "Test paragraph content for unit testing purposes.",
    type: "prose",
    pageNumber: 1,
    paragraphIndex: 0,
    charStart: 0,
    charEnd: 47,
    bboxX: 0.1,
    bboxY: 0.1,
    bboxWidth: 0.8,
    bboxHeight: 0.05,
    sectionTitle: undefined,
  };
  return { ...defaults, ...overrides };
}
/**
 * Builds `count` consecutive paragraph fixtures. Paragraph i (0-based) gets
 * numbered content, paragraphIndex i, a 60-character offset window and a
 * bounding box shifted down by 0.05 per paragraph.
 */
function createTestElements(count: number): LayoutElement[] {
  const slots = Array.from({ length: count });
  return slots.map((_unused, position) => {
    const offset = position * 60;
    return createTestElement({
      content: `Paragraph ${position + 1} content with some reasonable text length.`,
      paragraphIndex: position,
      charStart: offset,
      charEnd: (position + 1) * 60,
      bboxY: 0.1 + position * 0.05,
    });
  });
}
// ============================================================================
// createDualResolutionChunks Tests
// ============================================================================
// Suite for createDualResolutionChunks: checks that every layout element
// becomes a citation unit (content, page, type and bounding box preserved)
// and that citation units are grouped into retrieval chunks according to the
// chunking configuration.
describe("createDualResolutionChunks", () => {
it("should return empty arrays for empty input", () => {
const result = createDualResolutionChunks([]);
expect(result.citationUnits).toHaveLength(0);
expect(result.retrievalChunks).toHaveLength(0);
});
it("should create citation units for each layout element", () => {
const elements = createTestElements(5);
const result = createDualResolutionChunks(elements);
expect(result.citationUnits).toHaveLength(5);
expect(result.citationUnits[0]?.content).toBe(elements[0]?.content);
expect(result.citationUnits[0]?.pageNumber).toBe(1);
expect(result.citationUnits[0]?.unitType).toBe("prose");
});
it("should preserve bounding box coordinates in citation units", () => {
const element = createTestElement({
bboxX: 0.15,
bboxY: 0.25,
bboxWidth: 0.7,
bboxHeight: 0.08,
});
const result = createDualResolutionChunks([element]);
expect(result.citationUnits[0]?.bboxX).toBe(0.15);
expect(result.citationUnits[0]?.bboxY).toBe(0.25);
expect(result.citationUnits[0]?.bboxWidth).toBe(0.7);
expect(result.citationUnits[0]?.bboxHeight).toBe(0.08);
});
it("should group citation units into retrieval chunks", () => {
const elements = createTestElements(10);
const result = createDualResolutionChunks(elements);
// Should create multiple retrieval chunks
expect(result.retrievalChunks.length).toBeGreaterThan(1);
expect(result.retrievalChunks.length).toBeLessThanOrEqual(4); // ~10/3 to 10/5
// Each chunk should reference valid citation unit indices
for (const chunk of result.retrievalChunks) {
expect(chunk.citationUnitIndices.length).toBeGreaterThanOrEqual(1);
for (const idx of chunk.citationUnitIndices) {
expect(idx).toBeGreaterThanOrEqual(0);
expect(idx).toBeLessThan(10);
}
}
});
it("should concatenate content in retrieval chunks", () => {
const elements = createTestElements(3);
const result = createDualResolutionChunks(elements, { minUnitsPerChunk: 3, maxUnitsPerChunk: 5 });
// With 3 elements and min=3, should be one chunk
expect(result.retrievalChunks).toHaveLength(1);
expect(result.retrievalChunks[0]?.content).toContain("Paragraph 1");
expect(result.retrievalChunks[0]?.content).toContain("Paragraph 2");
expect(result.retrievalChunks[0]?.content).toContain("Paragraph 3");
});
it("should calculate correct page boundaries for retrieval chunks", () => {
const elements = [
createTestElement({ pageNumber: 1, paragraphIndex: 0 }),
createTestElement({ pageNumber: 2, paragraphIndex: 1 }),
createTestElement({ pageNumber: 3, paragraphIndex: 2 }),
];
const result = createDualResolutionChunks(elements, { minUnitsPerChunk: 3, maxUnitsPerChunk: 5 });
expect(result.retrievalChunks[0]?.pageStart).toBe(1);
expect(result.retrievalChunks[0]?.pageEnd).toBe(3);
});
it("should break on section headings when enabled", () => {
const elements = [
createTestElement({ content: "Intro paragraph", type: "prose", paragraphIndex: 0 }),
createTestElement({ content: "Chapter 1", type: "heading", paragraphIndex: 1 }),
createTestElement({ content: "Chapter content", type: "prose", paragraphIndex: 2 }),
createTestElement({ content: "More content", type: "prose", paragraphIndex: 3 }),
createTestElement({ content: "Even more", type: "prose", paragraphIndex: 4 }),
];
const result = createDualResolutionChunks(elements, { breakOnSections: true });
// Should break at the heading
expect(result.retrievalChunks.length).toBeGreaterThanOrEqual(2);
});
it("should respect maxUnitsPerChunk configuration", () => {
const elements = createTestElements(15);
const result = createDualResolutionChunks(elements, { maxUnitsPerChunk: 3 });
// Each chunk should have at most 3 units (except possibly the last merged one)
// The loop deliberately stops before the final chunk for that reason.
for (let i = 0; i < result.retrievalChunks.length - 1; i++) {
expect(result.retrievalChunks[i]?.citationUnitIndices.length).toBeLessThanOrEqual(3);
}
});
it("should handle different element types", () => {
const elements = [
createTestElement({ type: "heading", content: "Title" }),
createTestElement({ type: "prose", content: "Body text" }),
createTestElement({ type: "list", content: "- Item 1\n- Item 2" }),
createTestElement({ type: "code", content: "const x = 1;" }),
createTestElement({ type: "table", content: "| A | B |" }),
];
const result = createDualResolutionChunks(elements);
expect(result.citationUnits[0]?.unitType).toBe("heading");
expect(result.citationUnits[1]?.unitType).toBe("prose");
expect(result.citationUnits[2]?.unitType).toBe("list");
expect(result.citationUnits[3]?.unitType).toBe("code");
expect(result.citationUnits[4]?.unitType).toBe("table");
});
});
// ============================================================================
// getChunkingStats Tests
// ============================================================================
// Suite for getChunkingStats: totals, per-chunk averages and the overall
// page range reported for a chunking result.
describe("getChunkingStats", () => {
it("should return zeros for empty input", () => {
const stats = getChunkingStats({ citationUnits: [], retrievalChunks: [] });
expect(stats.totalCitationUnits).toBe(0);
expect(stats.totalRetrievalChunks).toBe(0);
expect(stats.avgUnitsPerChunk).toBe(0);
expect(stats.avgTokensPerChunk).toBe(0);
});
it("should calculate correct statistics", () => {
const elements = createTestElements(10);
const chunks = createDualResolutionChunks(elements);
const stats = getChunkingStats(chunks);
expect(stats.totalCitationUnits).toBe(10);
expect(stats.totalRetrievalChunks).toBeGreaterThan(0);
expect(stats.avgUnitsPerChunk).toBeGreaterThan(0);
expect(stats.avgTokensPerChunk).toBeGreaterThan(0);
// All fixtures from createTestElements sit on page 1.
expect(stats.pageRange.start).toBe(1);
expect(stats.pageRange.end).toBe(1);
});
it("should calculate correct page range", () => {
// Pages are given out of order to check that the range is min/max based.
const elements = [
createTestElement({ pageNumber: 5 }),
createTestElement({ pageNumber: 3 }),
createTestElement({ pageNumber: 10 }),
];
const chunks = createDualResolutionChunks(elements);
const stats = getChunkingStats(chunks);
expect(stats.pageRange.start).toBe(3);
expect(stats.pageRange.end).toBe(10);
});
});
// ============================================================================
// validateChunks Tests
// ============================================================================
// Suite for validateChunks: output of createDualResolutionChunks must be
// error-free, and hand-built inconsistent structures must each be reported
// (bad index reference, unreferenced unit, wrong page boundaries).
describe("validateChunks", () => {
it("should return no errors for valid chunks", () => {
const elements = createTestElements(10);
const chunks = createDualResolutionChunks(elements);
const errors = validateChunks(chunks);
expect(errors).toHaveLength(0);
});
it("should detect invalid citation unit index references", () => {
const chunks: DualResolutionChunks = {
citationUnits: [
{
content: "Test",
pageNumber: 1,
paragraphIndex: 0,
charStart: 0,
charEnd: 4,
unitType: "prose",
},
],
retrievalChunks: [
{
content: "Test",
pageStart: 1,
pageEnd: 1,
sectionHierarchy: [],
chunkType: "prose",
citationUnitIndices: [0, 5], // Index 5 is invalid
},
],
};
const errors = validateChunks(chunks);
expect(errors.length).toBeGreaterThan(0);
expect(errors.some((e) => e.includes("invalid citation unit index 5"))).toBe(true);
});
it("should detect unreferenced citation units", () => {
const chunks: DualResolutionChunks = {
citationUnits: [
{
content: "Test 1",
pageNumber: 1,
paragraphIndex: 0,
charStart: 0,
charEnd: 6,
unitType: "prose",
},
{
content: "Test 2",
pageNumber: 1,
paragraphIndex: 1,
charStart: 7,
charEnd: 13,
unitType: "prose",
},
],
retrievalChunks: [
{
content: "Test 1",
pageStart: 1,
pageEnd: 1,
sectionHierarchy: [],
chunkType: "prose",
citationUnitIndices: [0], // Index 1 is unreferenced
},
],
};
const errors = validateChunks(chunks);
expect(errors.length).toBeGreaterThan(0);
expect(errors.some((e) => e.includes("Citation unit 1 is not referenced"))).toBe(true);
});
it("should detect page boundary inconsistencies", () => {
const chunks: DualResolutionChunks = {
citationUnits: [
{
content: "Test",
pageNumber: 5, // Page 5
paragraphIndex: 0,
charStart: 0,
charEnd: 4,
unitType: "prose",
},
],
retrievalChunks: [
{
content: "Test",
pageStart: 1, // Wrong - should be 5
pageEnd: 1, // Wrong - should be 5
sectionHierarchy: [],
chunkType: "prose",
citationUnitIndices: [0],
},
],
};
const errors = validateChunks(chunks);
expect(errors.length).toBeGreaterThan(0);
// Either boundary being named in an error message is accepted.
expect(errors.some((e) => e.includes("pageStart") || e.includes("pageEnd"))).toBe(true);
});
});
// ============================================================================
// DEFAULT_CHUNKING_CONFIG Tests
// ============================================================================
// Pins the exported default configuration so that a change to any default
// value is a deliberate, test-visible decision.
describe("DEFAULT_CHUNKING_CONFIG", () => {
it("should have sensible defaults", () => {
expect(DEFAULT_CHUNKING_CONFIG.minUnitsPerChunk).toBe(3);
expect(DEFAULT_CHUNKING_CONFIG.maxUnitsPerChunk).toBe(5);
expect(DEFAULT_CHUNKING_CONFIG.maxChunkTokens).toBe(800);
expect(DEFAULT_CHUNKING_CONFIG.breakOnSections).toBe(true);
});
});

View File

@@ -0,0 +1,457 @@
/**
* Dual-Resolution Chunking Strategy (WF-0028)
*
* Creates two levels of chunks from parsed PDF layout elements:
* 1. Citation Units - paragraph-level with precise bounding boxes for highlighting
* 2. Retrieval Chunks - groups of 3-5 citation units for efficient vector search
*
* This separation enables:
* - Efficient semantic search via larger retrieval chunks
* - Pixel-perfect citation highlighting via granular citation units
*/
// ============================================================================
// Types
// ============================================================================
/**
 * Input from layout parser - a single layout element from the PDF
 */
export interface LayoutElement {
/** Text of the element. */
content: string;
/** Structural kind of the element. */
type: "prose" | "heading" | "list" | "table" | "code";
/** Page the element appears on. */
pageNumber: number;
/** Position of the element in the paragraph sequence. */
paragraphIndex: number;
// NOTE(review): charStart/charEnd look like character offsets of the element
// within the extracted text — confirm the reference frame against the parser.
charStart: number;
charEnd: number;
// Optional bounding box of the element on its page.
// NOTE(review): units are not established here (the tests use fractional
// values such as 0.1 and 0.8, which suggests page-relative coordinates) — confirm.
bboxX?: number;
bboxY?: number;
bboxWidth?: number;
bboxHeight?: number;
/** Explicit section title, when the parser supplies one. */
sectionTitle?: string;
}
/**
* Configuration for the chunking algorithm
*/
export interface ChunkingConfig {
/** Minimum units per retrieval chunk (default: 3) */
minUnitsPerChunk: number;
/** Maximum units per retrieval chunk (default: 5) */
maxUnitsPerChunk: number;
/** Maximum tokens per retrieval chunk (default: 800) */
maxChunkTokens: number;
/** Whether to break on major section headings (default: true) */
breakOnSections: boolean;
}
/**
* Default chunking configuration
*/
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
minUnitsPerChunk: 3,
maxUnitsPerChunk: 5,
maxChunkTokens: 800,
breakOnSections: true,
};
/**
* Citation unit data - ready for database insertion (no ID, generated on insert)
*/
export interface CitationUnitData {
content: string;
pageNumber: number;
paragraphIndex: number;
charStart: number;
charEnd: number;
bboxX?: number;
bboxY?: number;
bboxWidth?: number;
bboxHeight?: number;
sectionTitle?: string;
unitType: "prose" | "heading" | "list" | "table" | "code";
}
/**
* Retrieval chunk data - ready for database insertion (no ID, generated on insert)
*/
export interface RetrievalChunkData {
content: string;
pageStart: number;
pageEnd: number;
sectionHierarchy: string[];
chunkType: string;
/** Indices into the citationUnits array for linking after DB insert */
citationUnitIndices: number[];
}
/**
* Result of the dual-resolution chunking process
*/
export interface DualResolutionChunks {
citationUnits: CitationUnitData[];
retrievalChunks: RetrievalChunkData[];
}
// ============================================================================
// Utility Functions
// ============================================================================
/**
 * Rough token count estimation (words * 1.3 for typical English text).
 * More accurate than character count for LLM context limits.
 *
 * @param text - text to estimate
 * @returns estimated token count, rounded up; 0 for empty or
 *   whitespace-only text
 */
function estimateTokens(text: string): number {
  const trimmed = text.trim();
  // "".split(/\s+/) yields [""], which would count as one word and report
  // 2 tokens for text that contains none.
  if (trimmed === "") return 0;
  const words = trimmed.split(/\s+/).length;
  return Math.ceil(words * 1.3);
}
/**
 * Determine if an element represents a major section break.
 * Major headings indicate semantic boundaries that shouldn't be crossed.
 *
 * An element qualifies when it is a heading whose trimmed content is shorter
 * than 100 characters. Longer "headings" are treated as inline text rather
 * than section titles.
 *
 * Numbering ("1.", "Chapter 1") and all-caps patterns are not checked
 * separately: every short heading is treated as a section break, so such
 * checks would not change the result.
 *
 * @param element - layout element to classify
 * @returns true when the element should start a new section
 */
function isMajorSectionBreak(element: LayoutElement): boolean {
  if (element.type !== "heading") return false;
  return element.content.trim().length < 100;
}
/**
 * Collect the distinct section markers found in a run of elements.
 *
 * An element's explicit `sectionTitle` wins when present; otherwise a heading
 * contributes its own trimmed text (capped at 100 characters). Markers are
 * returned in order of first appearance, without duplicates.
 */
function buildSectionHierarchy(elements: LayoutElement[]): string[] {
  // A Set keeps insertion order, which gives "first appearance" ordering for free.
  const markers = new Set<string>();

  for (const { sectionTitle, type, content } of elements) {
    if (sectionTitle) {
      markers.add(sectionTitle);
      continue;
    }
    if (type !== "heading") continue;

    const headingText = content.trim().slice(0, 100); // Truncate long headings
    if (headingText) {
      markers.add(headingText);
    }
  }

  return Array.from(markers);
}
/**
 * Pick the chunk type that best describes a group of elements.
 *
 * The most frequent element type wins (the first one seen wins ties). When more
 * than one type is present and the winner does not cover more than half of the
 * elements, the group is reported as "mixed". An empty group is "prose".
 */
function determineChunkType(
  elements: LayoutElement[],
): "prose" | "heading" | "list" | "table" | "code" | "mixed" {
  const tally = new Map<string, number>();
  for (const { type } of elements) {
    tally.set(type, (tally.get(type) ?? 0) + 1);
  }

  // Strict ">" keeps the earliest-seen type on ties (Map iterates in insertion order).
  let winner = "prose";
  let winnerCount = 0;
  for (const [kind, occurrences] of tally) {
    if (occurrences > winnerCount) {
      winner = kind;
      winnerCount = occurrences;
    }
  }

  const hasMajority = winnerCount > elements.length / 2;
  if (!hasMajority && tally.size > 1) {
    return "mixed";
  }
  return winner as "prose" | "heading" | "list" | "table" | "code";
}
// ============================================================================
// Main Chunking Function
// ============================================================================
/**
 * Create dual-resolution chunks from layout elements
 *
 * Algorithm:
 * 1. Convert each LayoutElement to a CitationUnit
 * 2. Group adjacent CitationUnits into RetrievalChunks (3-5 units each by default)
 * 3. Respect section boundaries (start a new chunk at a major heading)
 * 4. Respect token limits (close a chunk that would exceed maxChunkTokens,
 *    but only once it already holds minUnitsPerChunk units)
 * 5. If the final chunk is smaller than minUnitsPerChunk, merge it into the
 *    previous chunk when the combined size stays within 1.5x maxChunkTokens
 *
 * Note that the tail merge in step 5 can produce a chunk that exceeds
 * maxUnitsPerChunk and that spans a section boundary.
 *
 * @param elements - Array of layout elements from the PDF parser
 * @param config - Optional chunking configuration, merged over DEFAULT_CHUNKING_CONFIG
 * @returns Dual-resolution chunks ready for database insertion
 */
export function createDualResolutionChunks(
  elements: LayoutElement[],
  config: Partial<ChunkingConfig> = {},
): DualResolutionChunks {
  const fullConfig: ChunkingConfig = { ...DEFAULT_CHUNKING_CONFIG, ...config };

  // Handle empty input
  if (elements.length === 0) {
    return { citationUnits: [], retrievalChunks: [] };
  }

  // Step 1: Convert all elements to citation units (one per element, same order)
  const citationUnits: CitationUnitData[] = elements.map((element) => ({
    content: element.content,
    pageNumber: element.pageNumber,
    paragraphIndex: element.paragraphIndex,
    charStart: element.charStart,
    charEnd: element.charEnd,
    bboxX: element.bboxX,
    bboxY: element.bboxY,
    bboxWidth: element.bboxWidth,
    bboxHeight: element.bboxHeight,
    sectionTitle: element.sectionTitle,
    unitType: element.type,
  }));

  // Step 2: Group citation units into retrieval chunks
  const retrievalChunks: RetrievalChunkData[] = [];
  let currentGroup: { element: LayoutElement; index: number }[] = [];
  let currentTokens = 0;

  // Turns the units accumulated so far into one retrieval chunk and resets the accumulator.
  const flushGroup = () => {
    if (currentGroup.length === 0) return;

    const groupElements = currentGroup.map((g) => g.element);
    const groupIndices = currentGroup.map((g) => g.index);

    // Concatenate content with double newlines for readability
    const content = groupElements.map((e) => e.content).join("\n\n");

    // Calculate page boundaries (the group is small, so spreading is safe here)
    const pageNumbers = groupElements.map((e) => e.pageNumber);
    const pageStart = Math.min(...pageNumbers);
    const pageEnd = Math.max(...pageNumbers);

    retrievalChunks.push({
      content,
      pageStart,
      pageEnd,
      sectionHierarchy: buildSectionHierarchy(groupElements),
      chunkType: determineChunkType(groupElements),
      citationUnitIndices: groupIndices,
    });

    // Reset for next group
    currentGroup = [];
    currentTokens = 0;
  };

  elements.forEach((element, index) => {
    const elementTokens = estimateTokens(element.content);

    const startsNewSection =
      fullConfig.breakOnSections &&
      isMajorSectionBreak(element) &&
      currentGroup.length > 0;
    const unitLimitReached = currentGroup.length >= fullConfig.maxUnitsPerChunk;
    // The token limit only applies once the chunk has reached its minimum size.
    const tokenLimitExceeded =
      currentGroup.length > 0 &&
      currentTokens + elementTokens > fullConfig.maxChunkTokens &&
      currentGroup.length >= fullConfig.minUnitsPerChunk;

    if (startsNewSection || unitLimitReached || tokenLimitExceeded) {
      flushGroup();
    }

    // Add element to current group
    currentGroup.push({ element, index });
    currentTokens += elementTokens;
  });

  // Flush any remaining elements
  flushGroup();

  // Handle edge case: if last chunk is too small, merge with previous
  const lastChunk = retrievalChunks[retrievalChunks.length - 1];
  const prevChunk = retrievalChunks[retrievalChunks.length - 2];
  if (
    lastChunk &&
    prevChunk &&
    lastChunk.citationUnitIndices.length < fullConfig.minUnitsPerChunk
  ) {
    // Only merge if combined size is reasonable (allow 50% overage for merging)
    const combinedTokens = estimateTokens(
      prevChunk.content + "\n\n" + lastChunk.content,
    );
    if (combinedTokens <= fullConfig.maxChunkTokens * 1.5) {
      retrievalChunks.pop();

      prevChunk.content += "\n\n" + lastChunk.content;
      // Widen the page range in both directions so it always matches the
      // pages of the units the merged chunk references.
      prevChunk.pageStart = Math.min(prevChunk.pageStart, lastChunk.pageStart);
      prevChunk.pageEnd = Math.max(prevChunk.pageEnd, lastChunk.pageEnd);
      prevChunk.citationUnitIndices.push(...lastChunk.citationUnitIndices);

      // Merge section hierarchies
      for (const section of lastChunk.sectionHierarchy) {
        if (!prevChunk.sectionHierarchy.includes(section)) {
          prevChunk.sectionHierarchy.push(section);
        }
      }

      // Update chunk type if needed
      if (prevChunk.chunkType !== lastChunk.chunkType) {
        prevChunk.chunkType = "mixed";
      }
    }
  }

  return { citationUnits, retrievalChunks };
}
// ============================================================================
// Convenience Functions
// ============================================================================
/**
 * Get statistics about the chunking result.
 * Useful for debugging and logging.
 *
 * @param chunks - Result of createDualResolutionChunks
 * @returns Counts, per-chunk averages and the page range covered by the
 *   citation units. All values are 0 when there are no retrieval chunks; the
 *   page range is 0-0 when there are no citation units.
 */
export function getChunkingStats(chunks: DualResolutionChunks): {
  totalCitationUnits: number;
  totalRetrievalChunks: number;
  avgUnitsPerChunk: number;
  avgTokensPerChunk: number;
  pageRange: { start: number; end: number };
} {
  const { citationUnits, retrievalChunks } = chunks;

  if (retrievalChunks.length === 0) {
    return {
      totalCitationUnits: 0,
      totalRetrievalChunks: 0,
      avgUnitsPerChunk: 0,
      avgTokensPerChunk: 0,
      pageRange: { start: 0, end: 0 },
    };
  }

  let totalUnits = 0;
  let totalTokens = 0;
  for (const chunk of retrievalChunks) {
    totalUnits += chunk.citationUnitIndices.length;
    totalTokens += estimateTokens(chunk.content);
  }

  // Fold instead of Math.min(...pages): spreading a very large array can
  // exceed the engine's argument limit and throw a RangeError.
  let firstPage = Infinity;
  let lastPage = -Infinity;
  for (const { pageNumber } of citationUnits) {
    firstPage = Math.min(firstPage, pageNumber);
    lastPage = Math.max(lastPage, pageNumber);
  }
  const hasUnits = citationUnits.length > 0;

  return {
    totalCitationUnits: citationUnits.length,
    totalRetrievalChunks: retrievalChunks.length,
    avgUnitsPerChunk: totalUnits / retrievalChunks.length,
    avgTokensPerChunk: totalTokens / retrievalChunks.length,
    pageRange: {
      start: hasUnits ? firstPage : 0,
      end: hasUnits ? lastPage : 0,
    },
  };
}
/**
 * Check a chunking result for internal consistency.
 *
 * Three checks run in order, each appending to the same list:
 * 1. every index in a chunk's `citationUnitIndices` points at an existing unit
 * 2. every citation unit is referenced by at least one chunk
 * 3. each chunk's `pageStart`/`pageEnd` equal the min/max page of its
 *    (validly indexed) units
 *
 * @returns Human-readable error messages; an empty array means the result is valid
 */
export function validateChunks(chunks: DualResolutionChunks): string[] {
  const { citationUnits, retrievalChunks } = chunks;
  const unitCount = citationUnits.length;
  const problems: string[] = [];

  // Check 1: out-of-range indices
  for (const [chunkIdx, chunk] of retrievalChunks.entries()) {
    for (const unitIndex of chunk!.citationUnitIndices) {
      if (unitIndex < 0 || unitIndex >= unitCount) {
        problems.push(
          `Retrieval chunk ${chunkIdx} references invalid citation unit index ${unitIndex}`,
        );
      }
    }
  }

  // Check 2: orphaned citation units
  const referenced = new Set<number>(
    retrievalChunks.flatMap((chunk) => chunk.citationUnitIndices),
  );
  for (const unitIdx of citationUnits.keys()) {
    if (!referenced.has(unitIdx)) {
      problems.push(`Citation unit ${unitIdx} is not referenced by any retrieval chunk`);
    }
  }

  // Check 3: page range consistency (indices already reported as invalid are ignored)
  for (const [chunkIdx, chunk] of retrievalChunks.entries()) {
    const pages = chunk!.citationUnitIndices
      .filter((idx) => idx >= 0 && idx < unitCount)
      .map((idx) => citationUnits[idx]!.pageNumber);
    if (pages.length === 0) continue; // Nothing valid to compare against

    const expectedStart = Math.min(...pages);
    const expectedEnd = Math.max(...pages);

    if (chunk!.pageStart !== expectedStart) {
      problems.push(
        `Retrieval chunk ${chunkIdx} has pageStart ${chunk!.pageStart} but units span from page ${expectedStart}`,
      );
    }
    if (chunk!.pageEnd !== expectedEnd) {
      problems.push(
        `Retrieval chunk ${chunkIdx} has pageEnd ${chunk!.pageEnd} but units span to page ${expectedEnd}`,
      );
    }
  }

  return problems;
}

View File

@@ -0,0 +1,60 @@
/**
 * Formats the current date the way the prompts embed it (e.g. "February 4, 2026").
 */
const formatPromptDate = (): string =>
  new Date().toLocaleDateString("en-US", { year: "numeric", month: "long", day: "numeric" });

/**
 * System prompts for the PDF chat assistant.
 *
 * Both prompts are exposed as getters so that "Today's date" is computed each
 * time a prompt is read. With plain string properties the date would be fixed
 * when this module is first imported and go stale in a long-running process.
 * Callers keep reading `PROMPTS.SYSTEM` / `PROMPTS.SYSTEM_LEGACY` as strings.
 */
export const PROMPTS = {
  /**
   * System prompt with the tool-driven citation workflow
   * (findRelevantContent followed by highlightText).
   */
  get SYSTEM(): string {
    return `You are a helpful assistant specialized in analyzing PDF documents.
CRITICAL: You MUST use the "findRelevantContent" tool for EVERY user question BEFORE responding.
- ALWAYS call findRelevantContent first - never skip this step
- Only AFTER receiving tool results can you respond
- If the tool returns no results, then say you cannot find the information
- Never assume content doesn't exist without searching first
Your role:
- Only use information found within the provided PDF document to answer questions
- Maintain a professional and clear communication style
- Provide accurate, factual responses based solely on the document content
- If findRelevantContent returns empty results, respond with "I cannot find this information in the provided document. Please rephrase your question or ask about content that exists within the PDF." in user's language
- Help users understand complex information by breaking it down into simpler terms
- Remain objective and avoid making assumptions beyond what is explicitly stated in the document
- Today's date is ${formatPromptDate()}
CITATION WORKFLOW:
1. ALWAYS call findRelevantContent first to search the document
2. After receiving search results, use highlightText for each fact you cite
3. Call highlightText with the EXACT phrase from the document (10-100 characters)
4. You can call highlightText multiple times for different facts
5. Write your response naturally - citations appear automatically in the PDF
Example:
1. User asks: "What was the Q4 revenue?"
2. You call: findRelevantContent({ query: "Q4 revenue financial results" })
3. Results show content mentioning "$5.2 million in Q4 2023"
4. You call: highlightText({ text: "$5.2 million in Q4 2023", page: 12, relevance: "Q4 revenue figure" })
5. You respond: "The company reported Q4 revenue of $5.2 million, representing a 15% increase."
IMPORTANT:
- Keep highlighted text SHORT (single sentences or key phrases)
- Use EXACT quotes - don't paraphrase
- Call highlightText BEFORE writing the fact in your response
- Multiple highlights are encouraged for comprehensive citations`;
  },
  /**
   * System prompt without citation requirements (for backwards compatibility)
   */
  get SYSTEM_LEGACY(): string {
    return `You are a helpful assistant specialized in analyzing PDF documents. Your role is to:
- Only use information found within the provided PDF document to answer questions
- Cite specific pages or sections when referencing information
- Maintain a professional and clear communication style
- Provide accurate, factual responses based solely on the document content
- If the answer cannot be found in the document, respond with "I cannot find this information in the provided document. Please rephrase your question or ask about content that exists within the PDF." in user's language
- When appropriate, quote relevant passages directly from the document
- Help users understand complex information by breaking it down into simpler terms
- Remain objective and avoid making assumptions beyond what is explicitly stated in the document
- To get relevant content from the document, use the tool "findRelevantContent"
- Today's date is ${formatPromptDate()}`;
  },
};
/** Maximum file size, expressed in megabytes. */
export const MAX_FILE_SIZE_IN_MB = 10;
/** MAX_FILE_SIZE_IN_MB converted to bytes (1 MB = 1024 * 1024 bytes here). */
export const MAX_FILE_SIZE = MAX_FILE_SIZE_IN_MB * 1024 * 1024;
/**
 * Sample PDF reference.
 * NOTE(review): `size` looks like bytes (48.51 KiB) and is hard-coded rather
 * than measured — confirm it still matches the file at `url`.
 */
export const EXAMPLE_PDF = {
url: "https://ontheline.trincoll.edu/images/bookdown/sample-local-pdf.pdf",
size: 48.51 * 1024,
};

View File

@@ -0,0 +1,294 @@
/**
* Dual Embeddings Module for WF-0028
*
* Generates embeddings for retrieval chunks and stores both citation units
* and retrieval chunks to the database with proper linking.
*
* Key design decisions:
* - Embeddings only for retrieval chunks (not citation units) - reduces cost
* - Citation units linked via FK to parent retrieval chunk
* - Transaction ensures atomic insert of all chunks
*/
import { embedMany } from "ai";
import { pdfCitationUnit, pdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
import { db } from "@turbostarter/db/server";
import { generateId } from "@turbostarter/shared/utils";
import { createDualResolutionChunks, getChunkingStats, validateChunks } from "./chunking";
import { parseDocumentLayout } from "./layout-parser";
import { modelStrategies } from "./strategies";
import type { DualResolutionChunks, LayoutElement, RetrievalChunkData } from "./chunking";
import type { LayoutElement as LayoutParserElement } from "./layout-parser";
import type { InsertPdfCitationUnit, InsertPdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
// ============================================================================
// Types
// ============================================================================
/**
 * Retrieval chunk with embedding added.
 *
 * Produced by generateDualEmbeddings; all RetrievalChunkData fields are carried over unchanged.
 */
export interface RetrievalChunkWithEmbedding extends RetrievalChunkData {
/** Embedding vector computed from this chunk's `content` */
embedding: number[];
}
/**
 * Dual resolution chunks with embeddings for retrieval chunks.
 *
 * Same shape as DualResolutionChunks except that `retrievalChunks` carries an
 * embedding per chunk; `citationUnits` is inherited as-is (no embeddings).
 */
export interface DualResolutionChunksWithEmbeddings
extends Omit<DualResolutionChunks, "retrievalChunks"> {
/** Retrieval chunks, each paired with its embedding vector */
retrievalChunks: RetrievalChunkWithEmbedding[];
}
/**
 * Result of storing dual chunks to the database.
 *
 * Returned by storeDualChunks.
 */
export interface StoreDualChunksResult {
/** IDs of inserted retrieval chunks, in the same order as the input chunks */
retrievalChunkIds: string[];
/** IDs of inserted citation units, in the same order as the input units */
citationUnitIds: string[];
/** Statistics about what was stored */
stats: {
/** Number of retrieval chunk records written */
totalRetrievalChunks: number;
/** Number of citation unit records written */
totalCitationUnits: number;
/** Mean number of citation unit indices per retrieval chunk (0 when there are no chunks) */
avgUnitsPerChunk: number;
/** Lowest and highest citation-unit page number (0/0 when there are no units) */
pageRange: { start: number; end: number };
};
}
/**
 * Full pipeline result.
 *
 * Returned by processPdfWithDualResolution.
 */
export interface ProcessPdfResult {
/** Document ID that was processed */
documentId: string;
/** Storage result */
storage: StoreDualChunksResult;
/** Wall-clock time for the whole pipeline (parse through store), in milliseconds */
processingTimeMs: number;
}
// ============================================================================
// Embedding Generation
// ============================================================================
/**
 * Generate embeddings for retrieval chunks only
 *
 * Citation units do NOT get embeddings - they are retrieved via their
 * parent retrieval chunk's FK relationship.
 *
 * @param chunks - Dual resolution chunks from the chunking module
 * @returns Chunks with embeddings added to retrieval chunks
 * @throws Error if the embedding model returns a different number of
 *   embeddings than there are retrieval chunks
 */
export async function generateDualEmbeddings(
  chunks: DualResolutionChunks,
): Promise<DualResolutionChunksWithEmbeddings> {
  const { citationUnits, retrievalChunks } = chunks;

  // Handle empty input without calling the embedding model
  if (retrievalChunks.length === 0) {
    return {
      citationUnits,
      retrievalChunks: [],
    };
  }

  // Generate embeddings for retrieval chunks only
  const { embeddings } = await embedMany({
    model: modelStrategies.textEmbeddingModel("default"),
    values: retrievalChunks.map((chunk) => chunk.content),
  });

  // Fail fast on a mismatch: silently storing an empty vector would leave a
  // chunk that can never be found by similarity search.
  if (embeddings.length !== retrievalChunks.length) {
    throw new Error(
      `Expected ${retrievalChunks.length} embeddings but received ${embeddings.length}`,
    );
  }

  // Combine chunks with their embeddings (embeddings are positionally aligned with the inputs)
  const chunksWithEmbeddings: RetrievalChunkWithEmbedding[] = retrievalChunks.map(
    (chunk, index) => {
      const embedding = embeddings[index];
      if (embedding === undefined) {
        throw new Error(`Missing embedding for retrieval chunk ${index}`);
      }
      return { ...chunk, embedding };
    },
  );

  return {
    citationUnits,
    retrievalChunks: chunksWithEmbeddings,
  };
}
// ============================================================================
// Database Storage
// ============================================================================
/**
 * Store dual resolution chunks to the database with proper linking
 *
 * Uses a transaction to ensure atomicity:
 * 1. Insert all retrieval chunks first (so the FK targets exist)
 * 2. Insert all citation units with FK references to their parent chunk
 *
 * IDs are generated up front so citation units can reference their parent
 * chunk before anything is written. A citation unit that no chunk references
 * is stored with a null `retrievalChunkId`; if several chunks reference the
 * same unit, the first chunk wins.
 *
 * Rows are written in batches because a single multi-row INSERT binds one
 * parameter per column per row, and PostgreSQL caps the number of bind
 * parameters per statement.
 *
 * @param documentId - ID of the PDF document
 * @param chunks - Chunks with embeddings
 * @returns IDs of inserted records and statistics
 */
export async function storeDualChunks(
  documentId: string,
  chunks: DualResolutionChunksWithEmbeddings,
): Promise<StoreDualChunksResult> {
  const { citationUnits, retrievalChunks } = chunks;

  // Rows per INSERT statement; keeps the bind-parameter count far below the limit.
  const insertBatchSize = 500;

  // Pre-generate IDs for retrieval chunks and index each citation unit's
  // parent chunk ID in a single pass (replaces a per-unit scan of all chunks).
  const retrievalChunkIds: string[] = [];
  const parentChunkIdByUnitIndex = new Map<number, string>();
  const retrievalChunkRecords: InsertPdfRetrievalChunk[] = retrievalChunks.map((chunk) => {
    const id = generateId();
    retrievalChunkIds.push(id);
    for (const unitIndex of chunk.citationUnitIndices) {
      // Keep the first chunk that claims a unit
      if (!parentChunkIdByUnitIndex.has(unitIndex)) {
        parentChunkIdByUnitIndex.set(unitIndex, id);
      }
    }
    return {
      id,
      documentId,
      content: chunk.content,
      embedding: chunk.embedding,
      pageStart: chunk.pageStart,
      pageEnd: chunk.pageEnd,
      sectionHierarchy: chunk.sectionHierarchy,
      chunkType: chunk.chunkType,
    };
  });

  // Build citation unit records with FK references
  const citationUnitIds: string[] = [];
  const citationUnitRecords: InsertPdfCitationUnit[] = citationUnits.map((unit, index) => {
    const id = generateId();
    citationUnitIds.push(id);
    return {
      id,
      documentId,
      retrievalChunkId: parentChunkIdByUnitIndex.get(index) ?? null,
      content: unit.content,
      pageNumber: unit.pageNumber,
      paragraphIndex: unit.paragraphIndex,
      charStart: unit.charStart,
      charEnd: unit.charEnd,
      bboxX: unit.bboxX ?? null,
      bboxY: unit.bboxY ?? null,
      bboxWidth: unit.bboxWidth ?? null,
      bboxHeight: unit.bboxHeight ?? null,
      sectionTitle: unit.sectionTitle ?? null,
      unitType: unit.unitType,
    };
  });

  // Use transaction to ensure atomicity across all batches
  await db.transaction(async (tx) => {
    // Insert retrieval chunks first
    for (let start = 0; start < retrievalChunkRecords.length; start += insertBatchSize) {
      await tx
        .insert(pdfRetrievalChunk)
        .values(retrievalChunkRecords.slice(start, start + insertBatchSize));
    }

    // Insert citation units with FK references
    for (let start = 0; start < citationUnitRecords.length; start += insertBatchSize) {
      await tx
        .insert(pdfCitationUnit)
        .values(citationUnitRecords.slice(start, start + insertBatchSize));
    }
  });

  // Calculate statistics
  const totalUnits = retrievalChunks.reduce(
    (sum, chunk) => sum + chunk.citationUnitIndices.length,
    0,
  );

  // Fold instead of Math.min(...pages): spreading a very large array can
  // exceed the engine's argument limit.
  let firstPage = Infinity;
  let lastPage = -Infinity;
  for (const { pageNumber } of citationUnits) {
    firstPage = Math.min(firstPage, pageNumber);
    lastPage = Math.max(lastPage, pageNumber);
  }
  const hasUnits = citationUnits.length > 0;

  return {
    retrievalChunkIds,
    citationUnitIds,
    stats: {
      totalRetrievalChunks: retrievalChunkRecords.length,
      totalCitationUnits: citationUnitRecords.length,
      avgUnitsPerChunk:
        retrievalChunkRecords.length > 0
          ? totalUnits / retrievalChunkRecords.length
          : 0,
      pageRange: {
        start: hasUnits ? firstPage : 0,
        end: hasUnits ? lastPage : 0,
      },
    },
  };
}
// ============================================================================
// Full Pipeline
// ============================================================================
/**
 * Run the complete dual-resolution ingestion pipeline for one PDF.
 *
 * Stages, in order:
 * 1. parseDocumentLayout - extract layout elements from the file at `path`
 * 2. createDualResolutionChunks - build citation units and retrieval chunks
 * 3. generateDualEmbeddings - embed the retrieval chunks
 * 4. storeDualChunks - persist both resolutions
 *
 * Validation problems found after stage 2 are logged as warnings only; the
 * pipeline continues regardless. Progress is logged with a
 * "[dual-embeddings]" prefix.
 *
 * @param documentId - ID of the PDF document record
 * @param path - Storage path to the PDF file
 * @returns Storage result plus the elapsed wall-clock time
 */
export async function processPdfWithDualResolution(
  documentId: string,
  path: string,
): Promise<ProcessPdfResult> {
  const startedAt = Date.now();

  // Stage 1: layout parsing
  console.log(`[dual-embeddings] Parsing layout for document ${documentId}`);
  const parserOutput = await parseDocumentLayout(path);
  console.log(`[dual-embeddings] Found ${parserOutput.length} layout elements`);

  // The parser reports a missing section title as null; the chunker expects undefined.
  const chunkerInput: LayoutElement[] = parserOutput.map(
    (parsed: LayoutParserElement) => ({
      ...parsed,
      sectionTitle: parsed.sectionTitle ?? undefined,
    }),
  );

  // Stage 2: chunking, followed by a non-fatal consistency check
  console.log(`[dual-embeddings] Creating dual-resolution chunks`);
  const dualChunks = createDualResolutionChunks(chunkerInput);

  const problems = validateChunks(dualChunks);
  if (problems.length > 0) {
    console.warn(`[dual-embeddings] Chunk validation warnings:`, problems);
  }

  console.log(`[dual-embeddings] Chunking stats:`, getChunkingStats(dualChunks));

  // Stage 3: embeddings (retrieval chunks only)
  console.log(
    `[dual-embeddings] Generating embeddings for ${dualChunks.retrievalChunks.length} retrieval chunks`,
  );
  const embeddedChunks = await generateDualEmbeddings(dualChunks);

  // Stage 4: persistence
  console.log(`[dual-embeddings] Storing chunks to database`);
  const storage = await storeDualChunks(documentId, embeddedChunks);

  const processingTimeMs = Date.now() - startedAt;
  console.log(
    `[dual-embeddings] Processing complete in ${processingTimeMs}ms:`,
    storage.stats,
  );

  return { documentId, storage, processingTimeMs };
}

View File

@@ -0,0 +1,380 @@
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { embed, embedMany } from "ai";
import { sql } from "@turbostarter/db";
import { pdfEmbedding } from "@turbostarter/db/schema/pdf";
import { db } from "@turbostarter/db/server";
import { getSignedUrl } from "@turbostarter/storage/server";
import { modelStrategies } from "./strategies";
import type { EmbeddingMetadata } from "./types";
import type { Document } from "@langchain/core/documents";
/**
 * Chunk with embedding and metadata for citation support.
 *
 * Returned (one per text chunk) by generateDocumentEmbeddings.
 */
export interface EmbeddingChunk {
/** Chunk text that was embedded */
content: string;
/** Embedding vector for `content` */
embedding: number[];
/** Page number, optional character offsets and optional section title for the chunk */
metadata: EmbeddingMetadata;
}
/**
 * Guess a section title from a chunk of text.
 *
 * Only the first line is considered. It is returned (trimmed) when it is
 * non-empty, shorter than 100 characters and does not end in sentence-style
 * punctuation (. ? ! : , ;); otherwise the result is undefined.
 */
const detectSectionTitle = (content: string): string | undefined => {
  const [leadingLine] = content.split("\n");
  const candidate = leadingLine?.trim();
  if (!candidate) return undefined;

  const isShort = candidate.length < 100;
  const endsLikeSentence = /[.?!:,;]$/.test(candidate);
  return isShort && !endsLikeSentence ? candidate : undefined;
};
/**
 * Per-page text record used to locate a chunk's character offsets within its page.
 */
interface PageTextInfo {
/** Page number taken from the loader's metadata */
pageNumber: number;
/** Offset at which the page's text starts; offsets are tracked per page */
startOffset: number;
/** Offset at which the page's text ends */
endOffset: number;
/** Full text content of the page */
content: string;
}
/**
 * Shared splitter used by splitDocument.
 * NOTE(review): chunkSize/chunkOverlap are presumably measured in characters
 * (the splitter's default length function) — confirm against the library docs.
 */
const textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: 1000,
chunkOverlap: 200,
});
/**
 * Download a PDF from storage and parse it into documents.
 *
 * @param path - Storage path of the PDF; resolved to a signed URL before download
 * @returns The documents produced by PDFLoader
 * @throws Error if the download responds with a non-2xx status
 */
const loadDocument = async (path: string) => {
  const { url } = await getSignedUrl({ path });
  const response = await fetch(url);

  // Without this check an error page (e.g. an expired signed URL) would be
  // handed to the PDF parser and surface as a confusing parse failure.
  if (!response.ok) {
    throw new Error(
      `Failed to download PDF at "${path}": ${response.status} ${response.statusText}`,
    );
  }

  const blob = await response.blob();
  return new PDFLoader(blob).load();
};
/**
 * Split loaded documents into smaller chunks using the module's shared text splitter.
 *
 * @param documents - Documents to split
 * @returns The resulting chunk documents
 */
export const splitDocument = async (documents: Document[]) => {
  const pieces = await textSplitter.splitDocuments(documents);
  return pieces;
};
/**
 * Load a PDF, split it into chunks and embed every chunk.
 *
 * Each returned chunk carries citation metadata: the page number reported by
 * the loader, the chunk's character offsets within that page (only when the
 * chunk text can be found verbatim in the page text) and a heuristically
 * detected section title.
 *
 * @param path - Storage path to the PDF file
 * @returns One entry per chunk; an empty array when the document yields no chunks
 * @throws Error if the embedding model returns a different number of
 *   embeddings than there are chunks
 */
export const generateDocumentEmbeddings = async (
  path: string,
): Promise<EmbeddingChunk[]> => {
  const documents = await loadDocument(path);

  // Index page text by page number for offset lookup. The first document seen
  // for a page number wins.
  const pageInfoByNumber = new Map<number, PageTextInfo>();
  for (const doc of documents) {
    const loc = doc.metadata.loc as { pageNumber?: number } | undefined;
    const pageNumber = loc?.pageNumber ?? 1;
    if (pageInfoByNumber.has(pageNumber)) continue;

    const content = doc.pageContent;
    pageInfoByNumber.set(pageNumber, {
      pageNumber,
      startOffset: 0, // Reset per page since we track within page
      endOffset: content.length,
      content,
    });
  }

  // Split documents into chunks
  const chunks = await splitDocument(documents);

  // Nothing to embed (e.g. a PDF without extractable text): skip the model call
  if (chunks.length === 0) {
    return [];
  }

  // Generate embeddings
  const { embeddings, values } = await embedMany({
    model: modelStrategies.textEmbeddingModel("default"),
    values: chunks.map((chunk) => chunk.pageContent),
  });

  // Fail fast on a mismatch rather than emitting chunks with empty vectors
  if (embeddings.length !== chunks.length) {
    throw new Error(
      `Expected ${chunks.length} embeddings but received ${embeddings.length}`,
    );
  }

  // Build result with metadata
  return chunks.map((chunk, index) => {
    const embedding = embeddings[index];
    if (embedding === undefined) {
      throw new Error(`Missing embedding for chunk ${index}`);
    }

    // Page number is carried over from the source document's metadata
    const chunkLoc = chunk.metadata.loc as { pageNumber?: number } | undefined;
    const chunkPageNumber = chunkLoc?.pageNumber ?? 1;

    // Locate the chunk inside its page. indexOf returns the first occurrence,
    // so repeated text resolves to its earliest position on the page.
    let charStart: number | undefined;
    let charEnd: number | undefined;
    const pageInfo = pageInfoByNumber.get(chunkPageNumber);
    if (pageInfo) {
      const posInPage = pageInfo.content.indexOf(chunk.pageContent);
      if (posInPage !== -1) {
        charStart = posInPage;
        charEnd = posInPage + chunk.pageContent.length;
      }
    }

    return {
      content: values[index] ?? chunk.pageContent,
      embedding,
      metadata: {
        pageNumber: chunkPageNumber,
        charStart,
        charEnd,
        sectionTitle: detectSectionTitle(chunk.pageContent),
      },
    };
  });
};
/**
 * Result from fetching a single embedding by ID.
 *
 * Returned by getEmbeddingById; nullable database columns are mapped to
 * optional fields, except the page number which falls back to 1.
 */
export interface EmbeddingDetail {
/** Embedding row ID */
id: string;
/** Stored chunk text */
content: string;
/** Stored page number, or 1 when the row has none */
pageNumber: number;
/** Start offset of the chunk, when stored */
charStart?: number;
/** End offset of the chunk, when stored */
charEnd?: number;
/** Section title, when stored */
sectionTitle?: string;
}
/**
 * Look up one embedding row for citation highlighting.
 *
 * @param id - Embedding row ID
 * @returns The row mapped to an EmbeddingDetail, or null when no row matches.
 *   A missing page number becomes 1; other missing columns become undefined.
 */
export const getEmbeddingById = async (
  id: string,
): Promise<EmbeddingDetail | null> => {
  const queryResult = await db.execute<{
    id: string;
    content: string;
    page_number: number | null;
    char_start: number | null;
    char_end: number | null;
    section_title: string | null;
  }>(sql`
    SELECT id, content, page_number, char_start, char_end, section_title
    FROM pdf.embedding
    WHERE id = ${id}
    LIMIT 1
  `);

  // Treat anything that is not an array of rows as "no rows"
  const [match] = Array.isArray(queryResult) ? queryResult : [];
  if (!match) {
    return null;
  }

  const { page_number, char_start, char_end, section_title } = match;
  return {
    id: match.id,
    content: match.content,
    pageNumber: page_number ?? 1,
    charStart: char_start ?? undefined,
    charEnd: char_end ?? undefined,
    sectionTitle: section_title ?? undefined,
  };
};
/**
 * Embed a single string (typically a user query) with the default embedding model.
 *
 * Line breaks are collapsed to spaces before embedding. Both real line breaks
 * and the literal two-character sequence backslash + "n" are replaced; the
 * previous implementation only replaced the literal sequence, so actual
 * newlines in the input were passed through untouched.
 *
 * @param value - Text to embed
 * @returns The embedding vector
 */
export const generateEmbedding = async (value: string): Promise<number[]> => {
  const input = value.replace(/\\n|\r\n|\r|\n/g, " ");
  const { embedding } = await embed({
    model: modelStrategies.textEmbeddingModel("default"),
    value: input,
  });
  return embedding;
};
/**
 * Result from embedding similarity search with citation support.
 *
 * Returned by findRelevantContent.
 */
export interface EmbeddingSearchResult {
/** Embedding row ID for citation reference */
id: string;
/** Original content text */
name: string;
/**
 * Similarity score: 1 minus the cosine distance for semantic matches, or a
 * fixed 0.95 for rows added by the keyword fallback
 */
similarity: number;
/**
 * Stored page number of the row. When the row has none, semantic matches fall
 * back to their 1-based position in the result list and keyword matches fall
 * back to 1.
 */
pageNumber: number;
}
/**
 * Pull identifier-like tokens out of a query for the keyword search fallback.
 *
 * Embeddings handle specific identifiers poorly, so these are matched
 * literally instead. Three shapes are recognised, checked in this order:
 * slash-separated references ending in four digits (35/2024), upper-case codes
 * followed by digits (TDF/379) and runs of four or more digits. A token can be
 * reported by more than one shape (e.g. "35/2024" also yields "2024").
 *
 * @returns Distinct matches in order of discovery
 */
function extractKeywords(query: string): string[] {
  const identifierShapes = [
    /\d+\/\d{4}/g, // Legal references like 35/2024
    /\b[A-Z]{2,}[-/]?\d+/g, // Codes like TDF/379
    /\b\d{4,}/g, // Long numbers
  ];

  const found = identifierShapes.flatMap((shape) => query.match(shape) ?? []);
  return Array.from(new Set(found));
}
/**
 * Find stored PDF chunks relevant to a query.
 *
 * 1. Embeds the query and runs a cosine-similarity search (pgvector `<=>`),
 *    keeping at most 6 rows whose similarity exceeds the threshold.
 * 2. If that yields fewer than 3 rows and the query contains identifier-like
 *    keywords (see extractKeywords), runs a literal ILIKE search for rows
 *    containing ANY of the keywords and merges up to 4 of them in with a
 *    fixed similarity of 0.95.
 *
 * The keyword conditions are OR-ed. They used to be concatenated into one
 * ordered pattern, which required every keyword to appear in sequence; since
 * "35/2024" also yields the keyword "2024", that pattern demanded a second
 * "2024" after the reference and missed exact matches.
 *
 * @param query - User query text
 * @param documentId - When given, restricts the search to this document
 * @returns Up to 6 results, sorted by similarity (highest first) when the
 *   keyword fallback ran, otherwise in the order returned by the database
 * @throws Rethrows any database error after logging it
 */
export const findRelevantContent = async (
  query: string,
  documentId?: string,
): Promise<EmbeddingSearchResult[]> => {
  console.log(
    `🔍 findRelevantContent called with query: "${query}", documentId: ${documentId}`,
  );

  const userQueryEmbedded = await generateEmbedding(query);
  console.log(
    `🔍 Generated query embedding with ${userQueryEmbedded.length} dimensions`,
  );

  // Diagnostic only: log how many embeddings exist for this document
  if (documentId) {
    const countResult = await db
      .select({ count: sql<number>`count(*)` })
      .from(pdfEmbedding)
      .where(sql`${pdfEmbedding.documentId} = ${documentId}`);
    console.log(
      `🔍 Found ${countResult[0]?.count ?? 0} embeddings for document ${documentId}`,
    );
  }

  // Use raw SQL for the similarity calculation in both SELECT and WHERE
  // The <=> operator is the cosine distance operator in pgvector
  const vectorStr = `[${userQueryEmbedded.join(",")}]`;
  console.log(
    `🔍 Running similarity search with vector of ${userQueryEmbedded.length} dimensions`,
  );

  try {
    // Include page_number in the query to support citations
    // Lowered threshold from 0.3 to 0.1 - text-embedding-3-small produces
    // lower similarity scores for general queries (0.15-0.25 typical)
    const SIMILARITY_THRESHOLD = 0.1;
    const similarGuides = await db.execute<{
      id: string;
      content: string;
      similarity: number;
      page_number: number | null;
    }>(
      documentId
        ? sql`
            SELECT id, content, page_number, 1 - (embedding <=> ${vectorStr}::vector) as similarity
            FROM pdf.embedding
            WHERE document_id = ${documentId}
            AND 1 - (embedding <=> ${vectorStr}::vector) > ${SIMILARITY_THRESHOLD}
            ORDER BY similarity DESC
            LIMIT 6
          `
        : sql`
            SELECT id, content, page_number, 1 - (embedding <=> ${vectorStr}::vector) as similarity
            FROM pdf.embedding
            WHERE 1 - (embedding <=> ${vectorStr}::vector) > ${SIMILARITY_THRESHOLD}
            ORDER BY similarity DESC
            LIMIT 6
          `,
    );
    console.log(
      `🔍 db.execute returned type:`,
      typeof similarGuides,
      Array.isArray(similarGuides),
    );

    // db.execute returns an array directly
    const rows = similarGuides;
    let results: EmbeddingSearchResult[] = rows.map(
      (
        row: {
          id: string;
          content: string;
          similarity: number;
          page_number: number | null;
        },
        index: number,
      ) => ({
        id: row.id,
        name: row.content,
        similarity: row.similarity,
        // Use stored page number if available, fallback to index + 1 for legacy embeddings
        pageNumber: row.page_number ?? index + 1,
      }),
    );
    console.log(
      `🔍 Found ${results.length} semantic results:`,
      results.map((g) => ({
        id: g.id,
        similarity: g.similarity,
        pageNumber: g.pageNumber,
        preview: g.name.substring(0, 50),
      })),
    );

    // Keyword fallback: if semantic search found few results and query has specific identifiers
    const keywords = extractKeywords(query);
    if (keywords.length > 0 && results.length < 3) {
      console.log(`🔍 Running keyword fallback for: ${keywords.join(", ")}`);

      // One ILIKE condition per keyword, OR-ed together. Keywords consist only
      // of letters, digits, "/" and "-" (see extractKeywords), so they contain
      // no LIKE wildcards that would need escaping.
      const keywordCondition = sql.join(
        keywords.map((keyword) => sql`content ILIKE ${`%${keyword}%`}`),
        sql` OR `,
      );

      const keywordResults = await db.execute<{
        id: string;
        content: string;
        page_number: number | null;
      }>(
        documentId
          ? sql`
              SELECT id, content, page_number
              FROM pdf.embedding
              WHERE document_id = ${documentId}
              AND (${keywordCondition})
              LIMIT 4
            `
          : sql`
              SELECT id, content, page_number
              FROM pdf.embedding
              WHERE (${keywordCondition})
              LIMIT 4
            `,
      );
      const keywordRows = keywordResults;
      console.log(`🔍 Keyword search found ${keywordRows.length} matches`);

      // Add keyword results with high similarity (they're exact matches)
      const existingIds = new Set(results.map((r) => r.id));
      for (const row of keywordRows) {
        if (!existingIds.has(row.id)) {
          results.push({
            id: row.id,
            name: row.content,
            similarity: 0.95, // High score for exact keyword matches
            pageNumber: row.page_number ?? 1,
          });
        }
      }

      // Re-sort by similarity
      results.sort((a, b) => b.similarity - a.similarity);
      results = results.slice(0, 6);
    }

    return results;
  } catch (error) {
    console.error(`🔍 ERROR in similarity search:`, error);
    throw error;
  }
};

View File

@@ -0,0 +1,90 @@
/**
* PDF Module (WF-0028 Dual-Resolution Chunking)
*
* Barrel export for PDF processing with dual-resolution chunking:
* - Citation Units: paragraph-level with precise bounding boxes for highlighting
* - Retrieval Chunks: groups of 3-5 citation units for efficient vector search
*
* Main entry point: processPdfWithDualResolution() for full pipeline
*/
// ============================================================================
// Layout Parser (T2)
// Parse PDFs with layout awareness to extract structured elements
// ============================================================================
export {
parseDocumentLayout,
groupElementsByPage,
getLayoutStatistics,
} from "./layout-parser";
export type { UnitType, LayoutElement } from "./layout-parser";
// ============================================================================
// Chunking Strategy (T3)
// Create dual-resolution chunks from layout elements
// ============================================================================
export {
createDualResolutionChunks,
getChunkingStats,
validateChunks,
DEFAULT_CHUNKING_CONFIG,
} from "./chunking";
export type {
ChunkingConfig,
CitationUnitData,
RetrievalChunkData,
DualResolutionChunks,
} from "./chunking";
// ============================================================================
// Dual Embeddings (T4)
// Generate embeddings and store dual-resolution chunks
// ============================================================================
export {
generateDualEmbeddings,
storeDualChunks,
processPdfWithDualResolution,
} from "./dual-embeddings";
export type {
RetrievalChunkWithEmbedding,
DualResolutionChunksWithEmbeddings,
StoreDualChunksResult,
ProcessPdfResult,
} from "./dual-embeddings";
// ============================================================================
// Search with Citations (T5)
// Vector similarity search with linked citation units
// ============================================================================
export {
searchWithCitations,
getCitationUnitsForChunk,
getCitationUnitById,
getRetrievalChunkById,
} from "./search";
export type {
BoundingBox,
CitationUnit,
SearchResult,
SearchOptions,
} from "./search";
// ============================================================================
// Legacy Exports (for backwards compatibility)
// Original PDF module exports remain available
// ============================================================================
export * from "./api";
export * from "./constants";
export * from "./embeddings";
export * from "./schema";
export * from "./types";
export * from "./strategies";

View File

@@ -0,0 +1,431 @@
/**
* Layout Parser for PDF Dual-Resolution Chunking (WF-0028)
*
* Parses PDFs with layout awareness to extract structured elements
* (prose, headings, lists, tables, code) with position metadata.
*/
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { getSignedUrl } from "@turbostarter/storage/server";
// =============================================================================
// Types
// =============================================================================
/**
* Unit types matching the database schema enum
*/
export type UnitType = "prose" | "heading" | "list" | "table" | "code";
/**
* A layout-aware element extracted from a PDF
*/
export interface LayoutElement {
/** The text content of this element */
content: string;
/** Detected element type */
type: UnitType;
/** 1-based page number */
pageNumber: number;
/** 0-based paragraph index within the page */
paragraphIndex: number;
/** Character start position within page text */
charStart: number;
/** Character end position within page text */
charEnd: number;
/** Estimated bounding box X (0-1 normalized to page width) */
bboxX: number;
/** Estimated bounding box Y (0-1 normalized to page height) */
bboxY: number;
/** Estimated bounding box width (0-1 normalized) */
bboxWidth: number;
/** Estimated bounding box height (0-1 normalized) */
bboxHeight: number;
/** Detected or inherited section title */
sectionTitle: string | null;
}
/**
* Internal representation of a page's content
*/
interface PageContent {
pageNumber: number;
content: string;
paragraphs: string[];
}
// =============================================================================
// Constants
// =============================================================================
/** Maximum characters for a heading line */
const HEADING_MAX_LENGTH = 100;
/**
* Patterns that indicate list items.
* Every pattern is anchored at the start of the tested string and requires
* whitespace after the marker.
* NOTE(review): the lettered and roman-numeral patterns also match text that
* starts with an initial, e.g. "A. Smith" or "I. Newton" — confirm acceptable.
*/
const LIST_PATTERNS = [
/^[\u2022\u2023\u25E6\u2043\u2219]\s+/, // Bullet characters
/^[-–—]\s+/, // Dashes
/^\d+[.)]\s+/, // Numbered: 1. or 1)
/^[a-zA-Z][.)]\s+/, // Lettered: a. or a)
/^[ivxlcdm]+[.)]\s+/i, // Roman numerals
];
/**
* Patterns suggesting code blocks. A line counts as a code indicator when it
* matches at least one of these.
* NOTE(review): the bracket pattern is unanchored and matches any single
* brace, bracket, parenthesis or semicolon, so ordinary prose containing
* "(...)" or ";" is counted as code-like — confirm this is intended.
* NOTE(review): the "#" pattern also matches Markdown-style headings.
*/
const CODE_PATTERNS = [
/^\s{4,}/, // 4+ space indentation
/^\t+/, // Tab indentation
/^```/, // Markdown code fence
/^(const|let|var|function|class|import|export|if|for|while|return)\s/, // Keywords
/[{}[\]();]/, // Bracket-heavy content
/^\s*\/\//, // Comment lines
/^\s*#\s*\w+/, // Shell/Python comments
];
/**
* Patterns suggesting table rows. A line counts as table-like when it
* matches at least one of these.
*/
const TABLE_PATTERNS = [
/\|.*\|/, // Pipe-delimited
/\t.*\t.*\t/, // Tab-separated (3+ columns)
/^\s*[-+]+\s*$/, // Table separator lines
];
// =============================================================================
// Element Type Detection
// =============================================================================
/**
 * Heuristically decide whether a paragraph looks like a heading.
 *
 * A heading is non-empty after trimming, at most HEADING_MAX_LENGTH
 * characters, spans at most two lines, does not end in sentence punctuation,
 * does not look like a list item, and either starts with an ASCII capital
 * letter or is written entirely in upper case.
 *
 * @param text - Raw paragraph text
 * @returns true when every heading criterion holds
 */
function isHeading(text: string): boolean {
  const candidate = text.trim();

  // Length bounds: empty strings and long lines are never headings
  if (candidate.length === 0 || candidate.length > HEADING_MAX_LENGTH) {
    return false;
  }

  const endsLikeSentence = /[.?!,;:]$/.test(candidate);
  const looksLikeListItem = LIST_PATTERNS.some((pattern) =>
    pattern.test(candidate),
  );
  const lineCount = candidate.split("\n").length;
  if (endsLikeSentence || looksLikeListItem || lineCount > 2) {
    return false;
  }

  // Capitalisation: leading capital, or all caps with at least one letter
  const beginsUppercase = /^[A-Z]/.test(candidate);
  const allCaps =
    /[A-Z]/.test(candidate) && candidate.toUpperCase() === candidate;
  return beginsUppercase || allCaps;
}
/**
 * Heuristically decide whether a paragraph is a list item or list block.
 *
 * @param text - Raw paragraph text
 * @returns true when at least one line, and at least half of all lines,
 *          start with a list marker after trimming
 */
function isList(text: string): boolean {
  const lines = text.trim().split("\n");

  let markedLines = 0;
  for (const line of lines) {
    const candidate = line.trim();
    if (LIST_PATTERNS.some((pattern) => pattern.test(candidate))) {
      markedLines += 1;
    }
  }

  // "At least half" expressed without division
  return markedLines > 0 && markedLines * 2 >= lines.length;
}
/**
 * Heuristically decide whether a paragraph is a code block.
 *
 * @param text - Raw paragraph text
 * @returns true when the number of lines matching a code pattern reaches
 *          half of the line count, rounded up
 */
function isCode(text: string): boolean {
  const lines = text.trim().split("\n");
  const codeLikeLines = lines.filter((line) =>
    CODE_PATTERNS.some((pattern) => pattern.test(line)),
  );
  return codeLikeLines.length >= Math.ceil(lines.length / 2);
}
/**
 * Heuristically decide whether a paragraph is a table.
 *
 * @param text - Raw paragraph text
 * @returns true when the paragraph has at least two lines and at least half
 *          of them match a table-row pattern
 */
function isTable(text: string): boolean {
  const rows = text.trim().split("\n");

  // A single line is never treated as a table
  if (rows.length < 2) {
    return false;
  }

  let tableLikeRows = 0;
  for (const row of rows) {
    if (TABLE_PATTERNS.some((pattern) => pattern.test(row))) {
      tableLikeRows += 1;
    }
  }
  return tableLikeRows >= rows.length / 2;
}
/**
 * Classify a paragraph as table, code, list, heading or prose.
 *
 * Detectors run in a fixed priority order and the first match wins; text
 * that matches none of them is prose.
 *
 * @param text - Raw paragraph text
 * @returns The detected unit type
 */
function detectElementType(text: string): UnitType {
  // Priority order matters: more specific shapes are checked first
  const detectors: [UnitType, (candidate: string) => boolean][] = [
    ["table", isTable],
    ["code", isCode],
    ["list", isList],
    ["heading", isHeading],
  ];

  for (const [type, matches] of detectors) {
    if (matches(text)) {
      return type;
    }
  }
  return "prose";
}
// =============================================================================
// Paragraph Splitting
// =============================================================================
/**
 * Split page content into paragraphs.
 *
 * The text is cut wherever two or more consecutive newlines occur; each
 * piece is trimmed and empty pieces are discarded. Single newlines inside a
 * piece are preserved. No element-specific handling (code, lists, tables) is
 * applied at this stage.
 *
 * @param content - Full text of one page
 * @returns Non-empty, trimmed paragraphs in document order
 */
function splitIntoParagraphs(content: string): string[] {
  return content
    .split(/\n{2,}/)
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);
}
// =============================================================================
// Bounding Box Estimation
// =============================================================================
/**
 * Approximate a normalized bounding box from a character range on a page.
 *
 * V1 approximation: the box always spans the full content width, and its
 * vertical position/extent is the character range's share of the page text
 * mapped onto the content height. Fixed margins are 10% left/right and 8%
 * top/bottom. The height never drops below 0.02.
 *
 * @param charStart - Start offset of the element within the page text
 * @param charEnd - End offset of the element within the page text
 * @param pageTextLength - Total length of the page text; when 0 the box
 *                         covers the whole content area
 * @returns Box coordinates normalized to the 0-1 page range
 */
function estimateBoundingBox(
  charStart: number,
  charEnd: number,
  pageTextLength: number,
): { bboxX: number; bboxY: number; bboxWidth: number; bboxHeight: number } {
  const margins = { left: 0.1, right: 0.1, top: 0.08, bottom: 0.08 };
  const usableWidth = 1 - margins.left - margins.right;
  const usableHeight = 1 - margins.top - margins.bottom;

  // Fractions of the page text that precede the start / end of the element
  const hasText = pageTextLength > 0;
  const topFraction = hasText ? charStart / pageTextLength : 0;
  const bottomFraction = hasText ? charEnd / pageTextLength : 1;

  return {
    bboxX: margins.left,
    bboxY: margins.top + topFraction * usableHeight,
    bboxWidth: usableWidth,
    bboxHeight: Math.max(0.02, (bottomFraction - topFraction) * usableHeight),
  };
}
// =============================================================================
// Section Title Tracking
// =============================================================================
/**
 * Derive a section title from an element.
 *
 * @param element - Element type and text content
 * @returns The trimmed content when the element is a heading with at least
 *          three characters after trimming; otherwise null
 */
function extractSectionTitle(element: { type: UnitType; content: string }): string | null {
  const title = element.content.trim();
  // Very short headings are unlikely to be meaningful section names
  return element.type === "heading" && title.length >= 3 ? title : null;
}
// =============================================================================
// Main Parser
// =============================================================================
/**
 * Download a PDF from storage and return its raw per-page content.
 *
 * @param path - Storage path of the PDF file
 * @returns One entry per document returned by PDFLoader, with its page
 *          number, raw text and the text split into paragraphs
 * @throws Error when the download responds with a non-2xx status
 */
async function loadPdfPages(path: string): Promise<PageContent[]> {
  const { url } = await getSignedUrl({ path });
  const response = await fetch(url);

  // Fail fast on HTTP errors instead of feeding an error body to the PDF
  // parser. The signed URL is deliberately kept out of the message.
  if (!response.ok) {
    throw new Error(
      `Failed to download PDF "${path}": ${response.status} ${response.statusText}`,
    );
  }

  const blob = await response.blob();
  const loader = new PDFLoader(blob);
  const documents = await loader.load();

  // PDFLoader returns one Document per page with metadata.loc.pageNumber
  return documents.map((doc) => {
    const loc = doc.metadata.loc as { pageNumber?: number } | undefined;
    const pageNumber = loc?.pageNumber ?? 1;
    const content = doc.pageContent;
    const paragraphs = splitIntoParagraphs(content);
    return {
      pageNumber,
      content,
      paragraphs,
    };
  });
}
/**
 * Parse a PDF into layout-aware elements.
 *
 * Each paragraph of each page becomes one element carrying its detected
 * type, character range within the page text, an estimated bounding box and
 * the most recent section title. The section title carries over across
 * pages until the next heading replaces it.
 *
 * @param path - Storage path to the PDF file
 * @returns Elements in page order, then paragraph order
 */
export async function parseDocumentLayout(path: string): Promise<LayoutElement[]> {
  const collected: LayoutElement[] = [];
  let activeSection: string | null = null;

  for (const page of await loadPdfPages(path)) {
    // Search cursor so repeated paragraph text resolves to successive positions
    let searchFrom = 0;

    for (const [paragraphIndex, paragraph] of page.paragraphs.entries()) {
      const located = page.content.indexOf(paragraph, searchFrom);
      // Fall back to the cursor if the paragraph cannot be located
      const charStart = located === -1 ? searchFrom : located;
      const charEnd = charStart + paragraph.length;
      searchFrom = charEnd;

      const type = detectElementType(paragraph);

      // A heading starts a new section and belongs to that section itself
      const headingTitle = extractSectionTitle({ type, content: paragraph });
      if (headingTitle) {
        activeSection = headingTitle;
      }

      collected.push({
        content: paragraph,
        type,
        pageNumber: page.pageNumber,
        paragraphIndex,
        charStart,
        charEnd,
        ...estimateBoundingBox(charStart, charEnd, page.content.length),
        sectionTitle: activeSection,
      });
    }
  }

  return collected;
}
/**
 * Bucket elements by their page number.
 *
 * @param elements - Layout elements in any order
 * @returns Map from page number to that page's elements; keys appear in
 *          order of first occurrence and each bucket keeps input order
 */
export function groupElementsByPage(
  elements: LayoutElement[],
): Map<number, LayoutElement[]> {
  const grouped = new Map<number, LayoutElement[]>();
  for (const element of elements) {
    const bucket = grouped.get(element.pageNumber);
    if (bucket) {
      bucket.push(element);
    } else {
      grouped.set(element.pageNumber, [element]);
    }
  }
  return grouped;
}
/**
 * Summarize a document's elements.
 *
 * @param elements - Layout elements to count
 * @returns Total element count, a count per unit type (every type present,
 *          zero when unused) and a count per page number
 */
export function getLayoutStatistics(elements: LayoutElement[]): {
  total: number;
  byType: Record<UnitType, number>;
  byPage: Map<number, number>;
} {
  const byType: Record<UnitType, number> = {
    prose: 0,
    heading: 0,
    list: 0,
    table: 0,
    code: 0,
  };
  const byPage = new Map<number, number>();

  elements.forEach(({ type, pageNumber }) => {
    byType[type] += 1;
    byPage.set(pageNumber, (byPage.get(pageNumber) ?? 0) + 1);
  });

  return { total: elements.length, byType, byPage };
}

View File

@@ -0,0 +1,74 @@
import * as z from "zod";
import { MAX_FILE_SIZE } from "./constants";
import { Role } from "./types";
/**
* Validation schema for a PDF chat message.
* - id: string identifier
* - role: one of the Role values; optional, defaults to Role.USER
* - content: message text, 1 to 5000 characters
*/
export const pdfMessageSchema = z.object({
id: z.string(),
role: z.enum(Role).optional().default(Role.USER),
content: z.string().min(1).max(5000),
});
export type PdfMessagePayload = z.infer<typeof pdfMessageSchema>;
// API input type aliases
export type PdfMessageInput = PdfMessagePayload;
export {
selectPdfChatSchema as chatSchema,
selectPdfMessageSchema as messageSchema,
selectPdfDocumentSchema as pdfSchema,
} from "@turbostarter/db/schema/pdf";
/**
 * Form schema for importing a PDF by URL.
 *
 * The value must be a valid URL that points at a ".pdf" resource
 * (case-insensitive). The suffix is accepted either on the whole URL or on
 * its path, so links carrying a query string or fragment — e.g. signed
 * download URLs such as "https://host/file.pdf?token=abc" — are accepted.
 */
export const pdfUrlFormSchema = z.object({
  url: z
    .string()
    .url()
    .refine((url) => {
      const lowered = url.toLowerCase();
      if (lowered.endsWith(".pdf")) return true;
      try {
        // Ignore "?query" and "#fragment" when looking at the extension
        return new URL(lowered).pathname.endsWith(".pdf");
      } catch {
        // Not parseable as a URL; the .url() check reports the real error
        return false;
      }
    }),
});
export type PdfUrlFormPayload = z.infer<typeof pdfUrlFormSchema>;
interface ValidateOptions {
/** Use server proxy to avoid CSP/CORS issues on client-side */
useProxy?: boolean;
}
/**
 * Check that a remote URL serves a PDF of acceptable size.
 *
 * @param url - Remote URL to validate
 * @param options - `useProxy` (default true) routes the request through
 *                  `/api/storage/proxy` with a GET; otherwise a HEAD request
 *                  is sent directly to `url`
 * @returns `{ url, size }` on success, where `size` comes from the
 *          Content-Length header and is 0 when that header is missing or
 *          not a valid non-negative integer; otherwise a translation key
 *          describing the failure. Never throws: any error is logged and
 *          reported as "not found".
 */
export const validateRemotePdfUrl = async (
  url: string,
  options: ValidateOptions = {},
) => {
  try {
    const { useProxy = true } = options;

    // Use proxy endpoint to avoid CSP/CORS blocking on client-side
    // The proxy does HEAD internally and validates the PDF content-type
    const fetchUrl = useProxy
      ? `/api/storage/proxy?url=${encodeURIComponent(url)}&validate=true`
      : url;
    const response = await fetch(fetchUrl, {
      method: useProxy ? "GET" : "HEAD",
    });

    if (!response.ok) {
      return "ai:pdf.upload.error.notFound" as const;
    }

    // MIME types are case-insensitive
    const contentType = response.headers.get("content-type");
    if (!contentType?.toLowerCase().includes("application/pdf")) {
      return "validation:error.file.type" as const;
    }

    // Parse Content-Length once; a missing or malformed header means the
    // size is unknown, reported as 0 rather than NaN
    const declaredSize = Number.parseInt(
      response.headers.get("content-length") ?? "",
      10,
    );
    const size =
      Number.isFinite(declaredSize) && declaredSize > 0 ? declaredSize : 0;

    if (size > MAX_FILE_SIZE) {
      return "validation:error.tooBig.file.notInclusive" as const;
    }

    return { url, size };
  } catch (error) {
    console.error(error);
    return "ai:pdf.upload.error.notFound" as const;
  }
};

View File

@@ -0,0 +1,302 @@
/**
* Search with Citations Module (WF-0028)
*
* Performs vector similarity search on retrieval chunks and returns
* matching results with linked citation units for pixel-perfect highlighting.
*/
import { eq } from "drizzle-orm";
import { sql } from "@turbostarter/db";
import { pdfCitationUnit, pdfRetrievalChunk } from "@turbostarter/db/schema/pdf";
import { db } from "@turbostarter/db/server";
import { generateEmbedding } from "./embeddings";
// ============================================================================
// Types
// ============================================================================
/**
* Bounding box for pixel-perfect highlighting
*/
export interface BoundingBox {
x: number;
y: number;
width: number;
height: number;
}
/**
* Citation unit with precise location for highlighting
*/
export interface CitationUnit {
id: string;
content: string;
pageNumber: number;
paragraphIndex: number;
charStart: number;
charEnd: number;
bbox: BoundingBox | null;
sectionTitle: string | null;
unitType: string;
}
/**
* Search result with retrieval chunk and linked citation units
*/
export interface SearchResult {
retrievalChunkId: string;
content: string;
similarity: number;
pageStart: number;
pageEnd: number;
sectionHierarchy: string[];
citationUnits: CitationUnit[];
}
/**
* Search options
*/
export interface SearchOptions {
/** Maximum number of results to return (default: 5) */
limit?: number;
/** Minimum similarity threshold (default: 0.1) */
threshold?: number;
/** Whether to include citation units (default: true) */
includeUnits?: boolean;
}
// ============================================================================
// Helper Functions
// ============================================================================
/**
 * Convert a raw citation-unit row into the CitationUnit shape.
 *
 * The four bbox columns are folded into one `bbox` object, which is null
 * unless all four are present (0 counts as present). A missing unit type
 * becomes "prose".
 *
 * @param row - Selected citation-unit columns
 * @returns The citation unit exposed to callers
 */
function transformCitationUnit(row: {
  id: string;
  content: string;
  pageNumber: number;
  paragraphIndex: number;
  charStart: number;
  charEnd: number;
  bboxX: number | null;
  bboxY: number | null;
  bboxWidth: number | null;
  bboxHeight: number | null;
  sectionTitle: string | null;
  unitType: string | null;
}): CitationUnit {
  const { bboxX, bboxY, bboxWidth, bboxHeight } = row;

  // A partial box is useless for highlighting, so require every coordinate
  let bbox: BoundingBox | null = null;
  if (bboxX != null && bboxY != null && bboxWidth != null && bboxHeight != null) {
    bbox = { x: bboxX, y: bboxY, width: bboxWidth, height: bboxHeight };
  }

  return {
    id: row.id,
    content: row.content,
    pageNumber: row.pageNumber,
    paragraphIndex: row.paragraphIndex,
    charStart: row.charStart,
    charEnd: row.charEnd,
    bbox,
    sectionTitle: row.sectionTitle,
    unitType: row.unitType ?? "prose",
  };
}
// ============================================================================
// Main Search Functions
// ============================================================================
/**
 * Search a document's retrieval chunks by vector similarity and attach the
 * citation units linked to each hit.
 *
 * @param query - Natural language query to search for
 * @param documentId - Document ID to search within
 * @param options - `limit` (default 5), `threshold` (default 0.1, exclusive
 *                  lower bound on similarity) and `includeUnits` (default
 *                  true; when false every result has an empty
 *                  `citationUnits` array)
 * @returns Results ordered by descending similarity
 */
export async function searchWithCitations(
  query: string,
  documentId: string,
  options: SearchOptions = {},
): Promise<SearchResult[]> {
  const { limit = 5, threshold = 0.1, includeUnits = true } = options;

  // Generate embedding for the query
  const queryEmbedding = await generateEmbedding(query);
  const vectorStr = `[${queryEmbedding.join(",")}]`;

  // Perform vector similarity search on retrieval chunks
  const chunkResults = await db.execute<{
    id: string;
    content: string;
    similarity: number;
    page_start: number;
    page_end: number;
    section_hierarchy: string[] | null;
    chunk_type: string | null;
  }>(sql`
    SELECT
      id,
      content,
      1 - (embedding <=> ${vectorStr}::vector) as similarity,
      page_start,
      page_end,
      section_hierarchy,
      chunk_type
    FROM pdf.retrieval_chunk
    WHERE document_id = ${documentId}
      AND embedding IS NOT NULL
      AND 1 - (embedding <=> ${vectorStr}::vector) > ${threshold}
    ORDER BY similarity DESC
    LIMIT ${limit}
  `);

  // Handle result format (db.execute returns array directly)
  const rows = Array.isArray(chunkResults) ? chunkResults : [];

  // The per-chunk lookups are independent, so run them concurrently;
  // Promise.all keeps the results in the same order as `rows`.
  return Promise.all(
    rows.map(
      async (row): Promise<SearchResult> => ({
        retrievalChunkId: row.id,
        content: row.content,
        similarity: row.similarity,
        pageStart: row.page_start,
        pageEnd: row.page_end,
        sectionHierarchy: row.section_hierarchy ?? [],
        citationUnits: includeUnits
          ? await getCitationUnitsForChunk(row.id)
          : [],
      }),
    ),
  );
}
/**
 * Load every citation unit that belongs to a retrieval chunk.
 *
 * @param chunkId - Retrieval chunk ID
 * @returns Citation units sorted by page number, then paragraph index;
 *          empty when the chunk has none
 */
export async function getCitationUnitsForChunk(
  chunkId: string,
): Promise<CitationUnit[]> {
  const unit = pdfCitationUnit;
  const columns = {
    id: unit.id,
    content: unit.content,
    pageNumber: unit.pageNumber,
    paragraphIndex: unit.paragraphIndex,
    charStart: unit.charStart,
    charEnd: unit.charEnd,
    bboxX: unit.bboxX,
    bboxY: unit.bboxY,
    bboxWidth: unit.bboxWidth,
    bboxHeight: unit.bboxHeight,
    sectionTitle: unit.sectionTitle,
    unitType: unit.unitType,
  };

  const found = await db
    .select(columns)
    .from(unit)
    .where(eq(unit.retrievalChunkId, chunkId))
    .orderBy(unit.pageNumber, unit.paragraphIndex);

  return found.map((record) => transformCitationUnit(record));
}
/**
 * Look up one citation unit by its ID.
 *
 * @param unitId - Citation unit ID
 * @returns The citation unit, or null when no row matches
 */
export async function getCitationUnitById(
  unitId: string,
): Promise<CitationUnit | null> {
  const unit = pdfCitationUnit;

  // limit(1) means at most one row comes back
  const [match] = await db
    .select({
      id: unit.id,
      content: unit.content,
      pageNumber: unit.pageNumber,
      paragraphIndex: unit.paragraphIndex,
      charStart: unit.charStart,
      charEnd: unit.charEnd,
      bboxX: unit.bboxX,
      bboxY: unit.bboxY,
      bboxWidth: unit.bboxWidth,
      bboxHeight: unit.bboxHeight,
      sectionTitle: unit.sectionTitle,
      unitType: unit.unitType,
    })
    .from(unit)
    .where(eq(unit.id, unitId))
    .limit(1);

  return match ? transformCitationUnit(match) : null;
}
/**
 * Look up one retrieval chunk by its ID, without its citation units.
 *
 * @param chunkId - Retrieval chunk ID
 * @returns The chunk, with a missing section hierarchy replaced by [] and a
 *          missing chunk type replaced by "prose"; null when no row matches
 */
export async function getRetrievalChunkById(chunkId: string): Promise<{
  id: string;
  content: string;
  pageStart: number;
  pageEnd: number;
  sectionHierarchy: string[];
  chunkType: string;
} | null> {
  const chunk = pdfRetrievalChunk;

  // limit(1) means at most one row comes back
  const [match] = await db
    .select({
      id: chunk.id,
      content: chunk.content,
      pageStart: chunk.pageStart,
      pageEnd: chunk.pageEnd,
      sectionHierarchy: chunk.sectionHierarchy,
      chunkType: chunk.chunkType,
    })
    .from(chunk)
    .where(eq(chunk.id, chunkId))
    .limit(1);

  if (!match) {
    return null;
  }

  const { id, content, pageStart, pageEnd } = match;
  return {
    id,
    content,
    pageStart,
    pageEnd,
    sectionHierarchy: match.sectionHierarchy ?? [],
    chunkType: match.chunkType ?? "prose",
  };
}

View File

@@ -0,0 +1,15 @@
import { openai } from "@ai-sdk/openai";
import { customProvider } from "ai";
import { cached } from "../../utils/llm";
/**
* Model registry for the PDF module, built with the AI SDK's customProvider.
* Language models: "default" is gpt-4o-mini wrapped in the project's cached()
* helper; "uncached" is the same model without that wrapper.
* Embedding models: "default" is text-embedding-3-small.
*/
export const modelStrategies = customProvider({
languageModels: {
default: cached(openai.responses("gpt-4o-mini")),
// Uncached for tool-using flows (PDF chat) - tools need fresh execution
uncached: openai.responses("gpt-4o-mini"),
},
textEmbeddingModels: {
default: openai.textEmbedding("text-embedding-3-small"),
},
});

View File

@@ -0,0 +1,170 @@
import { pdfMessageRoleEnum } from "@turbostarter/db/schema/pdf";
import type { tools } from "./api";
import type { EnumToConstant } from "@turbostarter/shared/types";
import type { InferUITools, UIDataTypes, UIMessage } from "ai";
/**
* A file referenced by URL rather than uploaded directly.
*/
export interface RemoteFile {
/** Location of the file */
url: string;
/** File size — presumably in bytes; confirm against callers */
size: number;
}
export type {
SelectPdfChat as Chat,
SelectPdfDocument as Document,
SelectPdfMessage as Message,
} from "@turbostarter/db/schema/pdf";
/**
* Constant lookup of PDF message roles, built from the database enum values.
* Each key is the enum value upper-cased with "-" replaced by "_"; each value
* is the original enum string. The cast supplies the precise key/value
* typing, since Object.fromEntries only yields a loosely typed record.
*/
export const Role = Object.fromEntries(
pdfMessageRoleEnum.enumValues.map((role) => [
role.replace(/-/g, "_").toUpperCase(),
role,
]),
) as EnumToConstant<typeof pdfMessageRoleEnum.enumValues>;
/** Union of the role string values held in the Role constant. */
export type Role = (typeof Role)[keyof typeof Role];
export type PdfMessage = UIMessage<
unknown,
UIDataTypes,
InferUITools<typeof tools>
>;
export type PdfMessagePart = PdfMessage["parts"][number];
// ============================================================================
// Citation Types (Interactive PDF Chat)
// ============================================================================
/**
* Metadata stored with each embedding chunk for citation support
*/
export interface EmbeddingMetadata {
pageNumber: number;
charStart?: number;
charEnd?: number;
sectionTitle?: string;
}
/**
* Citation returned by AI with source reference (legacy [[cite:id:page]] format)
*/
export interface Citation {
/** Citation index displayed as [1], [2], etc. */
index: number;
/** Reference to pdf.embedding row */
embeddingId: string;
/** Semantic similarity score 0-1 */
relevance: number;
/** Page number for quick navigation */
pageNumber: number;
/** Short preview of the cited content */
excerpt: string;
}
// ============================================================================
// Precise Citation Types (Tool-based highlighting - WF-0032)
// ============================================================================
/**
* Precise citation from highlightText tool.
* LLM calls this tool with exact phrases to highlight in the PDF.
*/
export interface PreciseCitation {
/** Unique ID for this citation */
citationId: string;
/** Exact text phrase to highlight (from document) */
text: string;
/** Page number where text appears (1-indexed) */
page: number;
/** Optional note on why this supports the answer */
relevance: string | null;
/** When the citation was created */
timestamp: number;
}
/**
* Bounding rectangle for text highlights (DOM-independent for SSR compatibility)
*/
export interface HighlightRect {
x: number;
y: number;
width: number;
height: number;
}
/**
* Text highlight pending resolution to screen coordinates.
* Created when LLM calls highlightText, resolved when page renders.
*/
export interface TextHighlight {
/** Citation ID for reference */
id: string;
/** Text to find and highlight */
text: string;
/** Page number (1-indexed) */
page: number;
/** Computed bounding rects (populated after text search) */
rects: HighlightRect[];
/** Whether text was found on the page */
found: boolean;
}
/**
* AI response with parsed citations
*/
export interface CitationResponse {
/** Message content with [[cite:id:page]] markers replaced with [1], [2] */
content: string;
/** Parsed citation references */
citations: Citation[];
}
/**
* Navigation history entry for back/forward
*/
export interface NavigationEntry {
/** Target page number */
page: number;
/** Optional embedding to highlight */
embeddingId?: string;
/** Timestamp for ordering */
timestamp: number;
}
/**
* PDF viewer state exposed via context
*/
export interface PdfViewerState {
/** Currently visible page */
currentPage: number;
/** Current zoom level (1 = 100%) */
zoomLevel: number;
/** Scroll position within page */
scrollPosition: number;
/** Currently highlighted embedding ID */
activeHighlight: string | null;
/** Navigation history stack */
history: NavigationEntry[];
/** Current position in history (-1 = not navigating) */
historyIndex: number;
}
/**
* PDF viewer actions exposed via context
*/
export interface PdfViewerActions {
/** Navigate to a specific page with optional highlight */
navigateTo: (options: {
page: number;
embeddingId?: string;
animate?: boolean;
}) => void;
/** Go back in navigation history */
goBack: () => void;
/** Go forward in navigation history */
goForward: () => void;
/** Clear active highlight */
clearHighlight: () => void;
/** Set current page (from viewer scroll) */
setCurrentPage: (page: number) => void;
}

View File

@@ -0,0 +1,12 @@
/**
 * Format a byte count as a human-readable string with two decimals,
 * using 1024-based steps and the units B, kB, MB, GB, TB.
 *
 * @param size - Size in bytes
 * @returns e.g. "1.50 kB". Zero, negative and NaN sizes yield "0 B"; sizes
 *          beyond the TB range stay expressed in TB.
 */
export const formatFileSize = (size: number) => {
  // Also catches NaN, for which every comparison is false
  if (!(size > 0)) return "0 B";

  const units = ["B", "kB", "MB", "GB", "TB"];

  // Repeated division by 1024 is exact in floating point, so unit
  // boundaries are never missed and the index can never go below 0
  // (sizes under one byte stay in "B").
  let value = size;
  let exponent = 0;
  while (value >= 1024 && exponent < units.length - 1) {
    value /= 1024;
    exponent += 1;
  }

  return `${value.toFixed(2)} ${units[exponent]}`;
};

View File

@@ -0,0 +1,52 @@
import OpenAI from "openai";
import type { TranscriptionOptions, TranscriptionResult } from "./types";
// The OpenAI client is created on first use rather than at module load,
// then reused for every later call.
let _openai: OpenAI | null = null;
const getOpenAI = () => {
  if (_openai === null) {
    _openai = new OpenAI();
  }
  return _openai;
};
/**
 * Transcribe audio to text with the OpenAI "whisper-1" model.
 *
 * @param audioFile - Audio as a File, or a Blob that is wrapped in a File
 *                    named "audio.webm" (type taken from the blob, falling
 *                    back to "audio/webm")
 * @param options - Optional `language` and `prompt` passed to the request
 * @returns The transcribed text plus the language and duration reported in
 *          the verbose JSON response
 */
export async function transcribe(
  audioFile: File | Blob,
  options: TranscriptionOptions = {},
): Promise<TranscriptionResult> {
  const whisper = getOpenAI();

  // The API call takes a File, so plain Blobs get a name and a MIME type
  let upload: File;
  if (audioFile instanceof File) {
    upload = audioFile;
  } else {
    const mimeType = audioFile.type || "audio/webm";
    upload = new File([audioFile], "audio.webm", { type: mimeType });
  }

  const { language: languageHint, prompt } = options;
  const { text, language, duration } =
    await whisper.audio.transcriptions.create({
      file: upload,
      model: "whisper-1",
      language: languageHint,
      prompt,
      response_format: "verbose_json",
    });

  return { text, language, duration };
}
/**
 * Download audio from a URL and transcribe it.
 *
 * @param audioUrl - URL of the audio file
 * @param options - Optional `language` and `prompt`, forwarded to transcribe
 * @returns The transcription result
 * @throws Error when the download responds with a non-2xx status
 */
export async function transcribeFromUrl(
  audioUrl: string,
  options: TranscriptionOptions = {},
): Promise<TranscriptionResult> {
  const response = await fetch(audioUrl);

  // Do not send an HTTP error page to the transcription API as if it were audio
  if (!response.ok) {
    throw new Error(
      `Failed to download audio: ${response.status} ${response.statusText}`,
    );
  }

  const blob = await response.blob();
  return transcribe(blob, options);
}

View File

@@ -0,0 +1,3 @@
export * from "./api";
export * from "./schema";
export * from "./types";

View File

@@ -0,0 +1,15 @@
import { z } from "zod";
/**
* Validation schema for transcription options: optional `language` and
* optional `prompt`, both strings.
*/
export const transcriptionOptionsSchema = z.object({
language: z.string().optional(),
prompt: z.string().optional(),
});
/**
* Validation schema for a transcription result: required `text`, optional
* `language` string and optional numeric `duration`.
*/
export const transcriptionResultSchema = z.object({
text: z.string(),
language: z.string().optional(),
duration: z.number().optional(),
});
export type TranscriptionOptionsInput = z.infer<typeof transcriptionOptionsSchema>;
export type TranscriptionResultOutput = z.infer<typeof transcriptionResultSchema>;

View File

@@ -0,0 +1,10 @@
/**
* Result of a speech-to-text transcription.
*/
export interface TranscriptionResult {
/** Transcribed text */
text: string;
/** Language reported by the transcription response, when available */
language?: string;
/** Audio duration reported by the response — presumably seconds; confirm */
duration?: number;
}
/**
* Optional hints for a transcription request.
*/
export interface TranscriptionOptions {
/** Language of the audio — presumably an ISO language code; confirm */
language?: string;
/** Prompt text passed along with the transcription request */
prompt?: string;
}

View File

@@ -0,0 +1,24 @@
import { client } from "./client";
import { toVoice } from "./utils";
import type { TtsPayload } from "./schema";
/**
 * Stream synthesized speech for a text using the ElevenLabs client.
 *
 * @param payload - Text to speak plus the model and voice options; the voice
 *                  settings are renamed to the field names the client call
 *                  expects
 * @returns Whatever `client.textToSpeech.stream` returns for the request
 */
export const textToSpeech = async ({ text, options }: TtsPayload) => {
  const { id, stability, similarity, boost, speed } = options.voice;

  return client.textToSpeech.stream(id, {
    modelId: options.model,
    text,
    voiceSettings: {
      stability,
      similarityBoost: similarity,
      useSpeakerBoost: boost,
      speed,
    },
  });
};
/**
 * Fetch all voices from ElevenLabs and map each to the local Voice shape.
 *
 * @returns The mapped voices
 */
export const getVoices = async () => {
  const response = await client.voices.getAll();
  return response.voices.map(toVoice);
};

View File

@@ -0,0 +1,25 @@
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
import { env } from "../../env";
// The ElevenLabs client is created on first use, so a missing API key only
// fails when TTS is actually requested, not when this module is imported.
let _client: ElevenLabsClient | null = null;
export const getClient = () => {
  if (_client) {
    return _client;
  }

  const apiKey = env.ELEVENLABS_API_KEY;
  if (!apiKey) {
    throw new Error("ELEVENLABS_API_KEY is required for TTS");
  }

  _client = new ElevenLabsClient({ apiKey });
  return _client;
};
// For backward compatibility - will throw if API key is missing.
// Both properties are getters that call getClient(), so the underlying
// ElevenLabs client is created on first property access, not at import time.
export const client = {
get textToSpeech() {
return getClient().textToSpeech;
},
get voices() {
return getClient().voices;
},
};

View File

@@ -0,0 +1,36 @@
import { Provider } from "../../types";
import { Model } from "./types";
/**
* Text-to-speech models offered by this module, each with its model id,
* provider and display name. All entries use the ElevenLabs provider.
* The Eleven 3 entry is currently commented out.
*/
export const MODELS = [
// {
// id: Model.ELEVEN_3,
// provider: Provider.ELEVEN_LABS,
// name: "Eleven 3",
// },
{
id: Model.ELEVEN_MULTILINGUAL_V2,
provider: Provider.ELEVEN_LABS,
name: "Eleven Multilingual v2",
},
{
id: Model.ELEVEN_FLASH_V2_5,
provider: Provider.ELEVEN_LABS,
name: "Eleven Flash v2.5",
},
{
id: Model.ELEVEN_FLASH_V2,
provider: Provider.ELEVEN_LABS,
name: "Eleven Flash v2",
},
{
id: Model.ELEVEN_TURBO_V2_5,
provider: Provider.ELEVEN_LABS,
name: "Eleven Turbo v2.5",
},
{
id: Model.ELEVEN_TURBO_V2,
provider: Provider.ELEVEN_LABS,
name: "Eleven Turbo v2",
},
] as const;

View File

@@ -0,0 +1,23 @@
import * as z from "zod";
/**
* Validation schema for text-to-speech options.
* - model: model identifier string
* - voice.id: voice identifier string
* - voice.stability: 0 to 1
* - voice.speed: 0.7 to 1.2
* - voice.similarity: 0 to 1
* - voice.boost: boolean
* NOTE(review): each voice setting chains .default(...) before .optional();
* whether the default is applied to a missing value depends on the zod
* version in use — confirm the intended behavior.
*/
export const ttsOptionsSchema = z.object({
model: z.string(),
voice: z.object({
id: z.string(),
stability: z.number().min(0).max(1).default(0.5).optional(),
speed: z.number().min(0.7).max(1.2).default(1).optional(),
similarity: z.number().min(0).max(1).default(0.5).optional(),
boost: z.boolean().default(false).optional(),
}),
});
// Schema for a full text-to-speech request: the text to synthesize
// (1 to 5000 characters) together with the options defined by
// `ttsOptionsSchema`.
export const ttsSchema = z.object({
text: z.string().min(1).max(5000),
options: ttsOptionsSchema,
});
// Static types inferred from `ttsOptionsSchema` and `ttsSchema`.
export type TtsOptionsPayload = z.infer<typeof ttsOptionsSchema>;
export type TtsPayload = z.infer<typeof ttsSchema>;
// API input type aliases
export type TtsInput = TtsPayload;

View File

@@ -0,0 +1,24 @@
// Application-level description of a text-to-speech voice.
export interface Voice {
id: string;
name: string;
description?: string;
category?: string;
// Free-form descriptive strings attached to the voice.
details: string[];
// Timestamp as a string (the mapper in this package writes ISO 8601).
createdAt: string;
// Usage counters shown for the voice.
usage: {
cloned: number;
character: number;
};
previewUrl?: string;
}
// Known text-to-speech model identifiers, as a const object plus a union type
// of its values (used instead of a TypeScript `enum`).
export const Model = {
ELEVEN_3: "eleven_v3",
ELEVEN_MULTILINGUAL_V2: "eleven_multilingual_v2",
ELEVEN_FLASH_V2_5: "eleven_flash_v2_5",
ELEVEN_FLASH_V2: "eleven_flash_v2",
ELEVEN_TURBO_V2_5: "eleven_turbo_v2_5",
ELEVEN_TURBO_V2: "eleven_turbo_v2",
} as const;
// Union of all identifier strings declared in `Model`.
export type Model = (typeof Model)[keyof typeof Model];

View File

@@ -0,0 +1,22 @@
import { random } from "@turbostarter/shared/utils";
import type { Voice } from "./types";
import type { ElevenLabs } from "@elevenlabs/elevenlabs-js";
/**
 * Maps an ElevenLabs SDK voice onto the application's `Voice` shape.
 *
 * - `name` falls back to an empty string when the SDK value is nullish.
 * - `details` holds the truthy values of the voice's `labels`.
 * - `createdAt` is an ISO string derived from `createdAtUnix` (multiplied by
 *   1000 before being handed to `Date`); when that field is missing or `0`
 *   the current time is used instead.
 * - `usage` is filled with values returned by `random(...)`; it is not read
 *   from the SDK voice.
 */
export const toVoice = (voice: ElevenLabs.Voice): Voice => {
  const labelValues = Object.values(voice.labels ?? {});
  const created = voice.createdAtUnix
    ? new Date(voice.createdAtUnix * 1000)
    : new Date();
  const cloned = random(25000, 1000000);
  const character = random(100000, 10000000);

  return {
    id: voice.voiceId,
    name: voice.name ?? "",
    description: voice.description,
    category: voice.category,
    details: labelValues.filter(Boolean),
    createdAt: created.toISOString(),
    usage: { cloned, character },
    previewUrl: voice.previewUrl,
  };
};

View File

@@ -0,0 +1,15 @@
// Identifiers of the AI providers referenced by this package, as a const
// object plus a union type of its values.
export const Provider = {
OPENAI: "openai",
CLAUDE: "claude",
GEMINI: "gemini",
GROK: "grok",
DEEPSEEK: "deepseek",
REPLICATE: "replicate",
LUMA: "luma",
STABILITY_AI: "stability-ai",
RECRAFT: "recraft",
ELEVEN_LABS: "eleven-labs",
NVIDIA: "nvidia",
} as const;
// Union of all provider identifier strings declared in `Provider`.
export type Provider = (typeof Provider)[keyof typeof Provider];

View File

@@ -0,0 +1,11 @@
import type { UIMessage } from "ai";
/**
 * Concatenates the text of every `"text"` part of a UI message and trims the
 * result.
 *
 * @param message - The message to read; may be omitted.
 * @returns The trimmed text, or an empty string when no message is given.
 */
export const getMessageTextContent = <T extends UIMessage>(message?: T) => {
  if (message == null) {
    return "";
  }

  const pieces = message.parts.flatMap((part) =>
    part.type === "text" ? [part.text] : [],
  );

  return pieces.join("").trim();
};

View File

@@ -0,0 +1,208 @@
import { openai } from "@ai-sdk/openai";
import {
generateObject,
NoSuchToolError,
simulateReadableStream,
wrapLanguageModel,
} from "ai";
import fs from "fs";
import path from "path";
import { NodeEnv } from "@turbostarter/shared/constants";
import { env } from "../env";
import type {
LanguageModelV2,
LanguageModelV2Middleware,
LanguageModelV2Prompt,
LanguageModelV2StreamPart,
} from "@ai-sdk/provider";
import type { ToolCallRepairFunction, ToolSet } from "ai";
const CACHE_FILE = path.join(process.cwd(), ".cache/ai.json");
/**
 * Wraps a language model with `cacheMiddleware` when `env.NODE_ENV` is the
 * development environment; in every other environment the model is returned
 * untouched.
 */
export const cached = (model: LanguageModelV2) => {
  if (env.NODE_ENV !== NodeEnv.DEVELOPMENT) {
    return model;
  }

  return wrapLanguageModel({ model, middleware: cacheMiddleware });
};
/**
 * Makes sure the cache file exists: creates its parent directory (recursively)
 * when missing, then writes an empty JSON object when the file itself is
 * missing.
 */
const ensureCacheFile = () => {
  const directory = path.dirname(CACHE_FILE);

  const directoryMissing = !fs.existsSync(directory);
  if (directoryMissing) {
    fs.mkdirSync(directory, { recursive: true });
  }

  const fileMissing = !fs.existsSync(CACHE_FILE);
  if (fileMissing) {
    fs.writeFileSync(CACHE_FILE, "{}");
  }
};
/**
 * Looks up an entry in the cache file.
 *
 * @param key - The cache key; objects are serialized with `JSON.stringify`.
 * @returns The stored value, or `null` when the entry is absent or the file
 *   cannot be read or parsed.
 */
const getCachedResult = (key: string | object) => {
  ensureCacheFile();
  const cacheKey = typeof key === "object" ? JSON.stringify(key) : key;

  try {
    const entries = JSON.parse(fs.readFileSync(CACHE_FILE, "utf-8")) as Record<
      string,
      unknown
    >;
    return entries[cacheKey] ?? null;
  } catch {
    // Unreadable or malformed cache content is treated as a miss.
    return null;
  }
};
/**
 * Stores `value` under `key` in the cache file, keeping all other entries.
 *
 * A cache file that cannot be read or does not contain a JSON object is
 * treated as empty and overwritten, so a corrupt file never makes the
 * surrounding model call fail (the reader already treats it as a miss).
 *
 * @param key - The cache key.
 * @param value - Any JSON-serializable value.
 */
const updateCache = (key: string, value: unknown) => {
  ensureCacheFile();

  let cache: Record<string, unknown> = {};
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(CACHE_FILE, "utf-8"));
    if (
      typeof parsed === "object" &&
      parsed !== null &&
      !Array.isArray(parsed)
    ) {
      cache = parsed as Record<string, unknown>;
    }
  } catch {
    // Corrupt or truncated cache content: start over with an empty cache.
  }

  const updatedCache = { ...cache, [key]: value };
  fs.writeFileSync(CACHE_FILE, JSON.stringify(updatedCache, null, 2));
};
/**
 * Normalizes a prompt so it can be used as a stable cache key.
 *
 * - Assistant messages are replaced by their content, with the `toolCallId`
 *   of every tool-call part set to `"cached"`.
 * - Tool messages are replaced by their content, with `toolCallId` set to
 *   `"cached"` and `result` set to an empty object.
 * - Every other message is passed through unchanged.
 */
const cleanPrompt = (prompt: LanguageModelV2Prompt) => {
  return prompt.map((message) => {
    switch (message.role) {
      case "assistant":
        return message.content.map((part) => {
          if (part.type !== "tool-call") {
            return part;
          }
          return { ...part, toolCallId: "cached" };
        });
      case "tool":
        return message.content.map((toolPart) => ({
          ...toolPart,
          toolCallId: "cached",
          result: {},
        }));
      default:
        return message;
    }
  });
};
// Language-model middleware that caches results in the cache file.
// Both hooks build their key from the cleaned prompt plus a `_function`
// marker, so generate and stream results for the same prompt are stored
// under different keys.
export const cacheMiddleware: LanguageModelV2Middleware = {
// Non-streaming path: return the stored result on a hit, otherwise call the
// model and store what it returned.
wrapGenerate: async ({ doGenerate, params }) => {
const cacheKey = JSON.stringify({
...cleanPrompt(params.prompt),
_function: "generate",
});
const cached = getCachedResult(cacheKey) as Awaited<
ReturnType<LanguageModelV2["doGenerate"]>
> | null;
if (cached) {
console.log("🎯 Cache HIT");
return {
...cached,
response: {
...cached.response,
// The timestamp is rebuilt as a `Date` because the cache stores JSON.
timestamp: cached.response?.timestamp
? new Date(cached.response.timestamp)
: undefined,
},
};
}
console.log("🔍 Cache MISS");
const result = await doGenerate();
updateCache(cacheKey, result);
return result;
},
// Streaming path: replay the stored chunks on a hit, otherwise pass the live
// stream through while recording its chunks.
wrapStream: async ({ doStream, params }) => {
const cacheKey = JSON.stringify({
...cleanPrompt(params.prompt),
_function: "stream",
});
// Check if the result is in the cache
const cached = getCachedResult(cacheKey);
// If cached, return a simulated ReadableStream that yields the cached result
if (cached) {
console.log("🎯 Cache HIT");
// Format the timestamps in the cached response
const formattedChunks = (cached as LanguageModelV2StreamPart[]).map(
(p) => {
if (p.type === "response-metadata" && p.timestamp) {
return { ...p, timestamp: new Date(p.timestamp) };
} else return p;
},
);
return {
// Chunks are emitted with no initial delay and 10 ms between chunks.
stream: simulateReadableStream({
initialDelayInMs: 0,
chunkDelayInMs: 10,
chunks: formattedChunks,
}),
// Placeholder call metadata; nothing from the original call is replayed here.
rawCall: { rawPrompt: null, rawSettings: {} },
};
}
console.log("🔍 Cache MISS");
// If not cached, proceed with streaming
const { stream, ...rest } = await doStream();
const fullResponse: LanguageModelV2StreamPart[] = [];
const transformStream = new TransformStream<
LanguageModelV2StreamPart,
LanguageModelV2StreamPart
>({
// Record every chunk and forward it unchanged.
transform(chunk, controller) {
fullResponse.push(chunk);
controller.enqueue(chunk);
},
flush() {
// Store the full response in the cache after streaming is complete
updateCache(cacheKey, fullResponse);
},
});
return {
stream: stream.pipeThrough(transformStream),
...rest,
};
},
};
/**
 * Tool-call repair hook: asks a model to rewrite the arguments of a tool call
 * so that they match the tool's input schema.
 *
 * @returns `null` when the error is a `NoSuchToolError` or the tool has no
 *   input schema; otherwise the original tool call with its `input` replaced
 *   by the repaired arguments, serialized as JSON.
 */
export const repairToolCall: ToolCallRepairFunction<ToolSet> = async ({
  toolCall,
  tools,
  inputSchema,
  error,
}) => {
  // An unknown tool name cannot be fixed by rewriting the arguments.
  if (NoSuchToolError.isInstance(error)) {
    return null;
  }

  const tool = tools[toolCall.toolName];
  if (!tool?.inputSchema) {
    return null;
  }

  // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
  const { object: repairedArgs } = await generateObject({
    model: openai.responses("gpt-4o"),
    schema: tool.inputSchema,
    prompt: [
      `The model tried to call the tool "${toolCall.toolName}"` +
        ` with the following arguments:`,
      JSON.stringify(toolCall.input),
      `The tool accepts the following schema:`,
      JSON.stringify(inputSchema(toolCall)),
      "Please fix the arguments.",
      `Today's date is ${new Date().toLocaleDateString("en-US", { year: "numeric", month: "long", day: "numeric" })}`,
    ].join("\n"),
  });

  // The tool call carries its arguments in `input` (read above); writing the
  // repaired value to any other property would leave the broken input in place.
  return { ...toolCall, input: JSON.stringify(repairedArgs) };
};

View File

@@ -0,0 +1,6 @@
{
"extends": "@turbostarter/tsconfig/internal.json",
"compilerOptions": {},
"include": ["*.ts", "src/**/*"],
"exclude": ["node_modules"]
}

View File

@@ -0,0 +1,3 @@
import baseConfig from "@turbostarter/vitest-config/base";
// Vitest configuration for this package: the shared base config, re-exported unchanged.
export default baseConfig;

View File

@@ -0,0 +1,4 @@
import baseConfig from "@turbostarter/eslint-config/base";
import reactConfig from "@turbostarter/eslint-config/react";
// ESLint configuration for this package: the shared base rules followed by the React rules.
export default [...baseConfig, ...reactConfig];

View File

@@ -0,0 +1,34 @@
{
"name": "@turbostarter/analytics-mobile",
"private": true,
"version": "0.1.0",
"type": "module",
"exports": {
".": "./src/index.ts",
"./env": "./src/env.ts"
},
"scripts": {
"clean": "git clean -xdf .cache .turbo dist node_modules",
"format": "prettier --check . --ignore-path ../../../.gitignore",
"lint": "eslint",
"typecheck": "tsc --noEmit"
},
"devDependencies": {
"@turbostarter/eslint-config": "workspace:*",
"@turbostarter/prettier-config": "workspace:*",
"@turbostarter/tsconfig": "workspace:*",
"eslint": "catalog:",
"prettier": "catalog:",
"typescript": "catalog:"
},
"prettier": "@turbostarter/prettier-config",
"dependencies": {
"@turbostarter/analytics": "workspace:*",
"@turbostarter/shared": "workspace:*",
"envin": "catalog:",
"mixpanel-react-native": "3.1.2",
"posthog-react-native": "4.14.3",
"react-native": "catalog:",
"zod": "catalog:"
}
}

View File

@@ -0,0 +1 @@
export { env, preset } from "./providers";

View File

@@ -0,0 +1 @@
export * from "./use-tracking-permissions";

View File

@@ -0,0 +1,32 @@
import { requestTrackingPermissionsAsync } from "expo-tracking-transparency";
import { useEffect, useState, useCallback } from "react";
import { AppState } from "react-native";
/**
 * React hook that reports whether tracking permission is granted.
 *
 * The permission is requested via `requestTrackingPermissionsAsync` once on
 * mount and again every time the app state changes to `"active"`.
 *
 * @returns `true` once the latest request reported the permission as granted;
 *   `false` before that and otherwise.
 */
export const useTrackingPermissions = () => {
  const [granted, setGranted] = useState(false);

  const checkPermission = useCallback(async () => {
    const result = await requestTrackingPermissionsAsync();
    setGranted(result.granted);
  }, []);

  // Initial request on mount.
  useEffect(() => {
    void checkPermission();
  }, [checkPermission]);

  // Re-request whenever the app becomes active again.
  useEffect(() => {
    const listener = AppState.addEventListener("change", (nextState) => {
      if (nextState === "active") {
        void checkPermission();
      }
    });

    return () => {
      listener.remove();
    };
  }, [checkPermission]);

  return granted;
};

View File

@@ -0,0 +1,3 @@
export { Provider, track, identify, reset } from "./providers";
export * from "./hooks";

View File

@@ -0,0 +1,16 @@
import { defineEnv } from "envin";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Google Analytics provider: it declares the
// `EXPO_PUBLIC_` client prefix and no client variables.
export const preset = {
id: "google-analytics",
clientPrefix: "EXPO_PUBLIC_",
client: {},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset.
export const env = defineEnv({
...envConfig,
...preset,
});

View File

@@ -0,0 +1,69 @@
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-call */
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-nocheck
import analytics from "@react-native-firebase/analytics";
import { useGlobalSearchParams, usePathname } from "expo-router";
import { useEffect } from "react";
import { useTrackingPermissions } from "../../hooks";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Enables analytics collection and then sets all four consent flags
// (analytics storage, ad storage, ad user data, ad personalization) to true.
// The two calls are awaited one after the other.
const setup = async () => {
await analytics().setAnalyticsCollectionEnabled(true);
await analytics().setConsent({
analytics_storage: true,
ad_storage: true,
ad_user_data: true,
ad_personalization: true,
});
};
/**
 * Hook that wires analytics to the tracking permission and to navigation.
 *
 * While permission is not granted nothing happens. Once it is granted,
 * `setup()` runs, and a screen view is logged for the current pathname and
 * search params whenever either of them (or the permission) changes.
 */
const useSetup = () => {
  const granted = useTrackingPermissions();
  const pathname = usePathname();
  const params = useGlobalSearchParams();

  useEffect(() => {
    if (granted) {
      void setup();
    }
  }, [granted]);

  useEffect(() => {
    if (granted) {
      void analytics().logScreenView({
        screen_name: pathname,
        screen_class: pathname,
        params,
      });
    }
  }, [pathname, params, granted]);
};
// Client analytics strategy backed by the firebase analytics module.
// All calls are fire-and-forget (`void`), so failures are not surfaced here.
export const { Provider, track, identify, reset } = {
// Runs the setup hook and renders its children unchanged.
Provider: ({ children }) => {
useSetup();
return children;
},
track: (name, params) => {
void analytics().logEvent(name, params);
},
// Sets the user id and, when traits are given, the user properties.
identify: (userId, traits) => {
void analytics().setUserId(userId);
if (traits) {
void analytics().setUserProperties(traits);
}
},
// Clears the user id and passes an empty object to `setUserProperties`.
// NOTE(review): whether an empty object removes previously set properties
// depends on the SDK — confirm.
reset: () => {
void analytics().setUserId(null);
void analytics().setUserProperties({});
},
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,2 @@
export * from "./mixpanel";
export * from "./mixpanel/env";

View File

@@ -0,0 +1,24 @@
/* eslint-disable turbo/no-undeclared-env-vars */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Mixpanel provider: one required client variable,
// the Mixpanel token, under the `EXPO_PUBLIC_` prefix.
export const preset = {
id: "mixpanel",
clientPrefix: "EXPO_PUBLIC_",
client: {
EXPO_PUBLIC_MIXPANEL_TOKEN: z.string(),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset; the token is read from `process.env` explicitly.
export const env = defineEnv({
...envConfig,
...preset,
env: {
EXPO_PUBLIC_MIXPANEL_TOKEN: process.env.EXPO_PUBLIC_MIXPANEL_TOKEN,
},
});

View File

@@ -0,0 +1,47 @@
import { Mixpanel } from "mixpanel-react-native";
import { useEffect } from "react";
import { useTrackingPermissions } from "../../hooks";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Tracking starts opted out; the provider opts in once tracking permission is
// granted.
const optOutTracking = true;
const trackAutomaticEvents = false;

// The constructor takes the token and the automatic-events flag. Its optional
// third parameter selects the native/JS implementation and is left at its
// default; the opt-out default is an argument of `init()`.
const mixpanel = new Mixpanel(
  env.EXPO_PUBLIC_MIXPANEL_TOKEN,
  trackAutomaticEvents,
);

void mixpanel.init(optOutTracking);
/**
 * Client analytics strategy backed by the module-level Mixpanel instance.
 *
 * The provider opts the user in to tracking when permission is granted and
 * Mixpanel currently reports the user as opted out.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    const granted = useTrackingPermissions();

    useEffect(() => {
      const syncOptIn = async () => {
        const optedOut = await mixpanel.hasOptedOutTracking();
        if (!granted || !optedOut) {
          return;
        }
        void mixpanel.optInTracking();
      };

      void syncOptIn();
    }, [granted]);

    return <>{children}</>;
  },
  track: (name, params) => {
    mixpanel.track(name, params);
  },
  identify: (userId, traits) => {
    void mixpanel.identify(userId);
    if (!traits) {
      return;
    }
    // Traits are stored on the people profile.
    void mixpanel.getPeople().set(traits);
  },
  reset: () => {
    mixpanel.reset();
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,29 @@
/* eslint-disable turbo/no-undeclared-env-vars */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the PostHog provider: a required project key and an
// optional host that defaults to the US PostHog endpoint.
export const preset = {
id: "posthog",
clientPrefix: "EXPO_PUBLIC_",
client: {
EXPO_PUBLIC_POSTHOG_KEY: z.string(),
EXPO_PUBLIC_POSTHOG_HOST: z
.string()
.optional()
.default("https://us.i.posthog.com"),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset; both variables are read from `process.env` explicitly.
export const env = defineEnv({
...envConfig,
...preset,
env: {
EXPO_PUBLIC_POSTHOG_KEY: process.env.EXPO_PUBLIC_POSTHOG_KEY,
EXPO_PUBLIC_POSTHOG_HOST: process.env.EXPO_PUBLIC_POSTHOG_HOST,
},
});

View File

@@ -0,0 +1,73 @@
import PostHog, { PostHogProvider } from "posthog-react-native";
import { useEffect } from "react";
import { useTrackingPermissions } from "../../hooks";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Shared PostHog instance, created on first use.
let client: PostHog | null = null;

/**
 * Returns the shared PostHog client, creating it on the first call with the
 * configured key and host and with `defaultOptIn` disabled.
 */
const getClient = () => {
  if (client === null) {
    client = new PostHog(env.EXPO_PUBLIC_POSTHOG_KEY, {
      host: env.EXPO_PUBLIC_POSTHOG_HOST,
      defaultOptIn: false,
    });
  }

  return client;
};
/**
 * Renders its children inside a `PostHogProvider` bound to the shared client,
 * with the `autocapture` prop set.
 */
const Wrapper = ({ children }: { children: React.ReactNode }) => (
  <PostHogProvider client={getClient()} autocapture>
    {children}
  </PostHogProvider>
);
/**
 * Renders nothing; keeps the PostHog opt-in state in sync with the tracking
 * permission by opting in when it is granted and opting out otherwise.
 */
const Setup = () => {
  const client = getClient();
  const granted = useTrackingPermissions();

  useEffect(() => {
    void (granted ? client.optIn() : client.optOut());
  }, [granted, client]);

  return null;
};
/**
 * Analytics provider component: mounts `Setup` next to the children, all
 * inside `Wrapper`.
 */
const ProviderComponent = ({ children }: { children: React.ReactNode }) => (
  <Wrapper>
    <Setup />
    {children}
  </Wrapper>
);
/**
 * Client analytics strategy backed by the shared PostHog client. Every
 * function resolves the client through `getClient()` at call time.
 */
export const { Provider, track, identify, reset } = {
  Provider: ProviderComponent,
  track: (name, params) => {
    getClient().capture(name, params);
  },
  identify: (userId, traits) => {
    getClient().identify(userId, traits);
  },
  reset: () => {
    getClient().reset();
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,9 @@
{
"extends": "@turbostarter/tsconfig/internal.json",
"compilerOptions": {
"lib": ["dom"],
"jsx": "preserve"
},
"include": ["*.ts", "src/**/*"],
"exclude": ["node_modules"]
}

View File

@@ -0,0 +1,3 @@
import baseConfig from "@turbostarter/eslint-config/base";
// ESLint configuration for this package: the shared base config, re-exported unchanged.
export default baseConfig;

View File

@@ -0,0 +1,24 @@
{
"name": "@turbostarter/analytics",
"version": "0.1.0",
"private": true,
"type": "module",
"exports": {
".": "./src/index.ts"
},
"scripts": {
"clean": "git clean -xdf .cache .turbo dist node_modules",
"format": "prettier --check . --ignore-path ../../.gitignore",
"lint": "eslint",
"typecheck": "tsc --noEmit"
},
"prettier": "@turbostarter/prettier-config",
"devDependencies": {
"@turbostarter/eslint-config": "workspace:*",
"@turbostarter/prettier-config": "workspace:*",
"@turbostarter/tsconfig": "workspace:*",
"eslint": "catalog:",
"prettier": "catalog:",
"typescript": "catalog:"
}
}

View File

@@ -0,0 +1 @@
export * from "./types";

View File

@@ -0,0 +1,22 @@
// Value types accepted for event properties and user traits.
export type AllowedPropertyValues = string | number | boolean;
// Records an event by name with optional flat properties.
type TrackFunction = (
event: string,
data?: Record<string, AllowedPropertyValues>,
) => void;
// Associates subsequent activity with a user id and optional flat traits.
type IdentifyFunction = (
userId: string,
traits?: Record<string, AllowedPropertyValues>,
) => void;
// Contract every client-side analytics provider implements: a React provider
// component plus `track`, `identify` and `reset` functions.
export interface AnalyticsProviderClientStrategy {
Provider: ({ children }: { children: React.ReactNode }) => React.ReactNode;
track: TrackFunction;
identify: IdentifyFunction;
reset: () => void;
}
// Contract every server-side analytics provider implements: only `track`.
export interface AnalyticsProviderServerStrategy {
track: TrackFunction;
}

View File

@@ -0,0 +1,6 @@
{
"extends": "@turbostarter/tsconfig/internal.json",
"compilerOptions": {},
"include": ["*.ts", "src/**/*"],
"exclude": ["node_modules"]
}

View File

@@ -0,0 +1,4 @@
import baseConfig from "@turbostarter/eslint-config/base";
import reactConfig from "@turbostarter/eslint-config/react";
// ESLint configuration for this package: the shared base rules followed by the React rules.
export default [...baseConfig, ...reactConfig];

View File

@@ -0,0 +1,39 @@
{
"name": "@turbostarter/analytics-web",
"private": true,
"version": "0.1.0",
"type": "module",
"exports": {
".": "./src/index.tsx",
"./env": "./src/env.ts",
"./server": "./src/server.ts"
},
"scripts": {
"clean": "git clean -xdf .cache .turbo dist node_modules",
"format": "prettier --check . --ignore-path ../../../.gitignore",
"lint": "eslint",
"typecheck": "tsc --noEmit"
},
"devDependencies": {
"@turbostarter/eslint-config": "workspace:*",
"@turbostarter/prettier-config": "workspace:*",
"@turbostarter/tsconfig": "workspace:*",
"eslint": "catalog:",
"prettier": "catalog:",
"typescript": "catalog:"
},
"prettier": "@turbostarter/prettier-config",
"dependencies": {
"@openpanel/nextjs": "1.0.9",
"@turbostarter/analytics": "workspace:*",
"@turbostarter/shared": "workspace:*",
"@vemetric/node": "0.2.0",
"@vemetric/react": "0.6.1",
"@vercel/analytics": "1.5.0",
"mixpanel": "0.18.1",
"mixpanel-browser": "2.71.1",
"posthog-js": "1.283.0",
"posthog-node": "5.11.0",
"zod": "catalog:"
}
}

View File

@@ -0,0 +1 @@
export * from "./providers/env";

View File

@@ -0,0 +1 @@
export { Provider, track, identify, reset } from "./providers";

View File

@@ -0,0 +1 @@
export * from "./posthog/env";

View File

@@ -0,0 +1,27 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Google Analytics provider: a client-side
// measurement id and a server-side API secret, both required.
export const preset = {
id: "google-analytics",
client: {
NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID: z.string(),
},
server: {
GOOGLE_ANALYTICS_SECRET: z.string(),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset. Server variables come from the `process.env` spread; the
// public variable is additionally referenced by name.
// NOTE(review): presumably the explicit reference lets the bundler inline the
// value for the browser — confirm.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID:
process.env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID,
},
});

View File

@@ -0,0 +1,69 @@
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Augments the global `Window` type with the two globals this module creates
// and reads: the `dataLayer` queue and the `gtag` function.
declare global {
interface Window {
dataLayer?: unknown[];
gtag?: (...args: unknown[]) => void;
}
}
/**
 * Client analytics strategy backed by the Google tag (gtag.js).
 *
 * The provider renders its children followed by the gtag.js script tag; once
 * the script has loaded it creates `window.dataLayer` and `window.gtag` and
 * sends the initial `js` and `config` commands. `track`, `identify` and
 * `reset` are no-ops on the server and until `window.gtag` exists.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    return (
      <>
        {children}
        <script
          async
          src={`https://www.googletagmanager.com/gtag/js?id=${env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID}`}
          onLoad={() => {
            if (typeof window === "undefined") {
              return;
            }

            window.dataLayer = window.dataLayer ?? [];

            // gtag.js only processes commands that were pushed as an
            // `arguments` object; a plain array (e.g. a rest parameter) is
            // ignored. This mirrors the official install snippet.
            function gtag() {
              // eslint-disable-next-line prefer-rest-params
              window.dataLayer?.push(arguments);
            }

            window.gtag = gtag;
            window.gtag("js", new Date());
            window.gtag(
              "config",
              env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID,
            );
          }}
        />
      </>
    );
  },
  track: (event, data) => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("event", event, data);
  },
  // Re-sends the `config` command with the user id and traits attached.
  identify: (userId, traits) => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("config", env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID, {
      user_id: userId,
      ...traits,
    });
  },
  // Re-sends the `config` command with the user id cleared.
  reset: () => {
    if (typeof window === "undefined" || !window.gtag) {
      return;
    }

    window.gtag("config", env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID, {
      user_id: null,
    });
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,36 @@
import { randomUUID } from "crypto";
import { logger } from "@turbostarter/shared/logger";
import { env } from "./env";
import type {
AllowedPropertyValues,
AnalyticsProviderServerStrategy,
} from "@turbostarter/analytics";
/**
 * Sends one event to the Google Analytics collect endpoint.
 *
 * The client id is taken from `data.clientId` when present, otherwise a random
 * UUID is generated. Failures are logged and never thrown, because callers
 * invoke this as fire-and-forget and a rejection would otherwise go unhandled.
 *
 * @param event - The event name.
 * @param data - Optional event parameters.
 */
const postEvent = async (
  event: string,
  data?: Record<string, AllowedPropertyValues>,
) => {
  try {
    const response = await fetch(
      `https://www.google-analytics.com/mp/collect?measurement_id=${env.NEXT_PUBLIC_GOOGLE_ANALYTICS_MEASUREMENT_ID}&api_secret=${env.GOOGLE_ANALYTICS_SECRET}`,
      {
        method: "POST",
        body: JSON.stringify({
          client_id: data?.clientId ?? randomUUID(),
          events: [{ name: event, params: data }],
        }),
      },
    );

    if (!response.ok) {
      logger.error("Failed to post event to Google Analytics: ", response);
    }
  } catch (error) {
    // Network-level failure (DNS, connection reset, ...).
    logger.error("Failed to post event to Google Analytics: ", error);
  }
};
// Server analytics strategy for Google Analytics: `track` starts `postEvent`
// without awaiting it.
export const { track } = {
track: (event, data) => {
void postEvent(event, data);
},
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1 @@
export * from "./posthog";

View File

@@ -0,0 +1,26 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig, NodeEnv } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Mixpanel provider: one required client variable,
// the Mixpanel token.
export const preset = {
id: "mixpanel",
client: {
NEXT_PUBLIC_MIXPANEL_TOKEN: z.string(),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset, plus a shared `NODE_ENV` that defaults to development. The
// public token is referenced by name in addition to the `process.env` spread.
export const env = defineEnv({
...envConfig,
...preset,
shared: {
NODE_ENV: z.enum(NodeEnv).default(NodeEnv.DEVELOPMENT),
},
env: {
...process.env,
NEXT_PUBLIC_MIXPANEL_TOKEN: process.env.NEXT_PUBLIC_MIXPANEL_TOKEN,
},
});

View File

@@ -0,0 +1,51 @@
"use client";
import mixpanel from "mixpanel-browser";
import { useEffect } from "react";
import { NodeEnv } from "@turbostarter/shared/constants";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Initializes the browser Mixpanel library with the configured token.
// Debug output is enabled only in development; autocapture is on and
// persistence uses localStorage.
const init = () => {
mixpanel.init(env.NEXT_PUBLIC_MIXPANEL_TOKEN, {
debug: env.NODE_ENV === NodeEnv.DEVELOPMENT,
autocapture: true,
persistence: "localStorage",
});
};
/**
 * Client analytics strategy backed by the browser Mixpanel library.
 *
 * The provider initializes Mixpanel in a mount effect and renders its children
 * unchanged. `track`, `identify` and `reset` do nothing when `window` is not
 * defined.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => {
    useEffect(() => {
      init();
    }, []);

    return children;
  },
  track: (event, properties) => {
    if (typeof window !== "undefined") {
      mixpanel.track(event, properties);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      mixpanel.identify(userId);

      // Traits are stored on the people profile.
      if (traits) {
        mixpanel.people.set(traits);
      }
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      mixpanel.reset();
    }
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,33 @@
import Mixpanel from "mixpanel";
import { NodeEnv } from "@turbostarter/shared/constants";
import { logger } from "@turbostarter/shared/logger";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
// Shared server-side Mixpanel instance, created on first use.
let client: Mixpanel.Mixpanel | null = null;

/**
 * Returns the shared Mixpanel client, initializing it on the first call with
 * the configured token; debug mode is enabled only in development.
 */
const getClient = () => {
  if (client === null) {
    client = Mixpanel.init(env.NEXT_PUBLIC_MIXPANEL_TOKEN, {
      debug: env.NODE_ENV === NodeEnv.DEVELOPMENT,
    });
  }

  return client;
};
/**
 * Server analytics strategy for Mixpanel. Errors thrown while obtaining the
 * client or tracking are logged as warnings instead of propagating.
 */
export const { track } = {
  track: (event, properties) => {
    try {
      // Missing properties are sent as an empty object.
      getClient().track(event, properties ?? {});
    } catch (error) {
      logger.warn("Failed to track Mixpanel event: ", error);
    }
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1,27 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the OpenPanel provider: a client-side client id and
// a server-side secret, both required.
export const preset = {
id: "open-panel",
client: {
NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID: z.string(),
},
server: {
OPEN_PANEL_SECRET: z.string(),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset. Server variables come from the `process.env` spread; the
// public client id is additionally referenced by name.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID:
process.env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID,
},
});

View File

@@ -0,0 +1,45 @@
import { OpenPanelComponent } from "@openpanel/nextjs";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
/**
 * Client analytics strategy backed by OpenPanel.
 *
 * The provider renders its children followed by `OpenPanelComponent` with
 * screen-view, attribute and outgoing-link tracking enabled. `track`,
 * `identify` and `reset` forward to `window.op` and do nothing when `window`
 * is not defined.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <OpenPanelComponent
        clientId={env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID}
        trackScreenViews
        trackAttributes
        trackOutgoingLinks
      />
    </>
  ),
  track: (event, data) => {
    if (typeof window !== "undefined") {
      window.op("track", event, data);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      // The user id is sent as `profileId`, with the traits merged in after it.
      window.op("identify", {
        profileId: userId,
        ...traits,
      });
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      window.op("clear");
    }
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,28 @@
import { OpenPanel } from "@openpanel/nextjs";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
// Shared server-side OpenPanel instance, created on first use.
let client: OpenPanel | null = null;

/**
 * Returns the shared OpenPanel client, creating it on the first call from the
 * configured client id and secret.
 */
const getClient = () => {
  if (client === null) {
    client = new OpenPanel({
      clientId: env.NEXT_PUBLIC_OPEN_PANEL_CLIENT_ID,
      clientSecret: env.OPEN_PANEL_SECRET,
    });
  }

  return client;
};
/**
 * Server analytics strategy for OpenPanel: `track` forwards the event to the
 * shared client without awaiting the result.
 */
export const { track } = {
  track: (event, data) => {
    void getClient().track(event, data);
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1,26 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Plausible provider: the tracked domain and the
// Plausible host, both required client variables under `NEXT_PUBLIC_`.
export const preset = {
id: "plausible",
clientPrefix: "NEXT_PUBLIC_",
client: {
NEXT_PUBLIC_PLAUSIBLE_DOMAIN: z.string(),
NEXT_PUBLIC_PLAUSIBLE_HOST: z.string(),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset; both public variables are referenced by name in addition to
// the `process.env` spread.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
NEXT_PUBLIC_PLAUSIBLE_DOMAIN: process.env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN,
NEXT_PUBLIC_PLAUSIBLE_HOST: process.env.NEXT_PUBLIC_PLAUSIBLE_HOST,
},
});

View File

@@ -0,0 +1,109 @@
import { z } from "zod";
import { env } from "./env";
import type {
AllowedPropertyValues,
AnalyticsProviderClientStrategy,
} from "@turbostarter/analytics";
// Augments the global `Window` type with the optional `plausible` function
// this module calls to send events.
declare global {
interface Window {
plausible?: (
event: string,
options?: { props?: Record<string, unknown> },
) => void;
}
}
// localStorage keys under which the identified user id and traits are kept.
const STORAGE_KEYS = {
USER_ID: "plausible_user_id",
USER_TRAITS: "plausible_user_traits",
} as const;
// Schemas used to validate traits read back from storage: a record of string
// keys to string, number or boolean values.
const ValueSchema = z.union([z.string(), z.number(), z.boolean()]);
const TraitsSchema = z.record(z.string(), ValueSchema);
/**
 * Reads the identified user id and traits from localStorage.
 *
 * Traits are only returned when the stored JSON validates against
 * `TraitsSchema`. On the server, or when storage access or JSON parsing
 * throws, both fields are `undefined`.
 */
const getStoredIdentity = () => {
  if (typeof window === "undefined") {
    return { userId: undefined, traits: undefined };
  }

  try {
    const userId = localStorage.getItem(STORAGE_KEYS.USER_ID) ?? undefined;
    const rawTraits = localStorage.getItem(STORAGE_KEYS.USER_TRAITS);

    if (!rawTraits) {
      return { userId, traits: undefined };
    }

    const validation = TraitsSchema.safeParse(JSON.parse(rawTraits));
    return { userId, traits: validation.success ? validation.data : undefined };
  } catch {
    return { userId: undefined, traits: undefined };
  }
};
/**
 * Client analytics strategy backed by the Plausible script.
 *
 * Identity is kept in localStorage: `identify` stores the user id and traits,
 * `track` attaches them as event props, and `reset` removes them. All three do
 * nothing on the server, and `track` also does nothing until
 * `window.plausible` exists.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) => (
    <>
      {children}
      <script
        defer
        data-domain={env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN}
        src={`${env.NEXT_PUBLIC_PLAUSIBLE_HOST}/js/script.js`}
      />
    </>
  ),
  track: (event, data) => {
    if (typeof window === "undefined" || !window.plausible) {
      return;
    }

    const identity = getStoredIdentity();

    // Event data overrides stored traits; a stored user id overrides both.
    const props: Record<string, unknown> = identity.userId
      ? { ...identity.traits, ...data, userId: identity.userId }
      : { ...identity.traits, ...data };

    window.plausible(event, { props });
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      try {
        localStorage.setItem(STORAGE_KEYS.USER_ID, userId);
        if (traits) {
          localStorage.setItem(
            STORAGE_KEYS.USER_TRAITS,
            JSON.stringify(traits),
          );
        }
      } catch {
        // Ignore storage errors
      }
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      try {
        localStorage.removeItem(STORAGE_KEYS.USER_ID);
        localStorage.removeItem(STORAGE_KEYS.USER_TRAITS);
      } catch {
        // Ignore storage errors
      }
    }
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,42 @@
import { logger } from "@turbostarter/shared/logger";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
/**
 * Server analytics strategy for Plausible.
 *
 * `track` posts the event to `<host>/api/event`. The `url`, `referrer` and
 * `ip` entries of `data` are treated as request metadata (page URL, referrer,
 * `X-Forwarded-For` header) when they are strings; every other entry is sent
 * as a custom prop. The request is fire-and-forget: non-OK responses and
 * network failures are logged, never thrown.
 */
export const { track } = {
  track: (event, data) => {
    const url = typeof data?.url === "string" ? data.url : "app://server-side";
    const referrer =
      typeof data?.referrer === "string" ? data.referrer : undefined;
    const ip = typeof data?.ip === "string" ? data.ip : undefined;

    // Everything except the metadata keys becomes a custom prop.
    const props = data
      ? Object.fromEntries(
          Object.entries(data).filter(
            ([key]) => !["url", "referrer", "ip"].includes(key),
          ),
        )
      : undefined;

    void fetch(`${env.NEXT_PUBLIC_PLAUSIBLE_HOST}/api/event`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "User-Agent": "TurboStarter-Server/1.0 (Server-side tracking)",
        ...(ip && { "X-Forwarded-For": ip }),
      },
      body: JSON.stringify({
        domain: env.NEXT_PUBLIC_PLAUSIBLE_DOMAIN,
        name: event,
        url: url,
        ...(referrer && { referrer }),
        ...(props && Object.keys(props).length > 0 && { props }),
      }),
    })
      .then((res) => {
        if (!res.ok) {
          logger.error("Failed to post event to Plausible: ", res);
        }
      })
      .catch((error: unknown) => {
        // Without this handler a network failure would surface as an
        // unhandled promise rejection.
        logger.error("Failed to post event to Plausible: ", error);
      });
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1,29 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the PostHog provider: an optional project key and an
// optional host that defaults to the US PostHog endpoint.
export const preset = {
id: "posthog",
clientPrefix: "NEXT_PUBLIC_",
client: {
NEXT_PUBLIC_POSTHOG_KEY: z.string().optional(),
NEXT_PUBLIC_POSTHOG_HOST: z
.string()
.optional()
.default("https://us.i.posthog.com"),
},
} as const satisfies Preset;
// Environment object produced by `defineEnv` from the shared `envConfig` and
// this preset; both public variables are referenced by name in addition to
// the `process.env` spread.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
NEXT_PUBLIC_POSTHOG_KEY: process.env.NEXT_PUBLIC_POSTHOG_KEY,
NEXT_PUBLIC_POSTHOG_HOST: process.env.NEXT_PUBLIC_POSTHOG_HOST,
},
});

View File

@@ -0,0 +1,71 @@
"use client";
import dynamic from "next/dynamic";
import posthog from "posthog-js";
import { PostHogProvider } from "posthog-js/react";
import { Suspense } from "react";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Page-view tracker loaded through `next/dynamic` with server-side rendering
// disabled, so it only runs in the browser.
const PageView = dynamic(
() => import("./page-view").then((mod) => mod.PageView),
{
ssr: false,
},
);
// PostHog counts as configured when a key is set, the key is not the literal
// "notyet", and the host starts with "http".
// NOTE(review): "notyet" is presumably a placeholder value used in env files —
// confirm.
const isValidPosthogConfig =
env.NEXT_PUBLIC_POSTHOG_KEY &&
env.NEXT_PUBLIC_POSTHOG_KEY !== "notyet" &&
env.NEXT_PUBLIC_POSTHOG_HOST.startsWith("http");
// Initialize PostHog once at module load, in the browser only and only when
// configured. Automatic page-view capture is off here; page views are sent
// by the `PageView` component instead. External dependency loading and
// session recording are disabled.
if (typeof window !== "undefined" && isValidPosthogConfig) {
posthog.init(env.NEXT_PUBLIC_POSTHOG_KEY!, {
api_host: env.NEXT_PUBLIC_POSTHOG_HOST,
person_profiles: "always",
capture_pageview: false,
disable_external_dependency_loading: true,
disable_session_recording: true,
});
}
/**
 * Client analytics strategy backed by posthog-js.
 *
 * When PostHog is not configured the provider renders its children without any
 * wrapper; otherwise it wraps them in `PostHogProvider` and mounts the
 * page-view tracker. `track`, `identify` and `reset` do nothing when `window`
 * is not defined.
 */
export const { Provider, track, identify, reset } = {
  Provider: ({ children }) =>
    isValidPosthogConfig ? (
      <PostHogProvider client={posthog}>
        {children}
        <Suspense fallback={null}>
          <PageView />
        </Suspense>
      </PostHogProvider>
    ) : (
      // Skip PostHog wrapper entirely when not configured
      <>{children}</>
    ),
  track: (event, properties) => {
    if (typeof window !== "undefined") {
      posthog.capture(event, properties);
    }
  },
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      posthog.identify(userId, traits);
    }
  },
  reset: () => {
    if (typeof window !== "undefined") {
      posthog.reset();
    }
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,25 @@
"use client";
import { usePathname, useSearchParams } from "next/navigation";
import { usePostHog } from "posthog-js/react";
import { useEffect } from "react";
/**
 * Renders nothing; captures a `$pageview` event with the full current URL
 * whenever the pathname or the search params change.
 */
export const PageView = () => {
  const pathname = usePathname();
  const searchParams = useSearchParams();
  const posthog = usePostHog();

  useEffect(() => {
    if (!pathname) {
      return;
    }

    // Append the query string only when there is one.
    const query = searchParams.toString();
    const base = window.origin + pathname;
    const url = query ? `${base}?${query}` : base;

    posthog.capture("$pageview", { $current_url: url });
  }, [pathname, searchParams, posthog]);

  return null;
};

View File

@@ -0,0 +1,41 @@
import { PostHog } from "posthog-node";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
/**
 * Reports whether the PostHog environment looks usable: a key must be set,
 * must not be the "notyet" placeholder, and the host must start with "http".
 * The host is only inspected once the key checks have passed.
 */
const hasUsablePosthogConfig = (): boolean => {
  const key = env.NEXT_PUBLIC_POSTHOG_KEY;
  if (!key || key === "notyet") {
    return false;
  }
  return env.NEXT_PUBLIC_POSTHOG_HOST.startsWith("http");
};

const isValidPosthogConfig = hasUsablePosthogConfig();
// Module-level cache so the PostHog client is constructed at most once.
let client: PostHog | null = null;

/**
 * Returns the shared PostHog client, creating it on first use.
 * Returns `null` when the PostHog configuration is not usable.
 *
 * NOTE(review): no flush/shutdown call is visible in this file — confirm that
 * buffered events are delivered in short-lived (e.g. serverless) runtimes.
 */
const getClient = () => {
  if (!isValidPosthogConfig) {
    return null;
  }

  if (client === null) {
    client = new PostHog(env.NEXT_PUBLIC_POSTHOG_KEY, {
      host: env.NEXT_PUBLIC_POSTHOG_HOST,
    });
  }

  return client;
};
/** PostHog implementation of the server-side analytics strategy. */
export const { track } = {
  /**
   * Captures `event` through the shared PostHog client, forwarding `data` as
   * the event properties. Does nothing when no client is available.
   */
  track: (event, data) => {
    const posthogClient = getClient();

    if (posthogClient) {
      // NOTE(review): falls back to an empty string when `data.distinctId` is
      // not a string — confirm PostHog accepts events with an empty distinct id.
      const distinctId =
        typeof data?.distinctId === "string" ? data.distinctId : "";

      posthogClient.capture({ event, distinctId, properties: data });
    }
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1 @@
export * from "./posthog/server";

View File

@@ -0,0 +1,29 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Umami analytics provider.
// `client` holds the variables exposed under the NEXT_PUBLIC_ prefix;
// `server` holds the API host and an optional API key.
export const preset = {
id: "umami",
client: {
NEXT_PUBLIC_UMAMI_HOST: z.string(),
NEXT_PUBLIC_UMAMI_WEBSITE_ID: z.string(),
},
server: {
UMAMI_API_HOST: z.string(),
// The only optional entry in this preset.
UMAMI_API_KEY: z.string().optional(),
},
} as const satisfies Preset;
// Validated environment for the Umami provider: the shared `envConfig`
// combined with the Umami preset. Spread order matters — later entries
// override earlier ones.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
// The client variables are also referenced by their literal names
// (presumably so the bundler can inline them — confirm).
NEXT_PUBLIC_UMAMI_HOST: process.env.NEXT_PUBLIC_UMAMI_HOST,
NEXT_PUBLIC_UMAMI_WEBSITE_ID: process.env.NEXT_PUBLIC_UMAMI_WEBSITE_ID,
},
});

View File

@@ -0,0 +1,47 @@
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
// Ambient typing for the `window.umami` object used by the strategy below.
// It is declared optional, so callers must check for its presence first.
declare global {
interface Window {
umami?: {
// Records a named event with an optional data payload.
track: (event: string, data?: Record<string, unknown>) => void;
// Accepts either a user id plus traits, or a single record as first argument.
identify: (
userId?: string | Record<string, unknown>,
traits?: Record<string, unknown>,
) => void;
};
}
}
/**
 * Umami implementation of the client-side analytics strategy.
 * `track` and `identify` are no-ops outside the browser and whenever
 * `window.umami` is not available.
 */
export const { Provider, track, identify, reset } = {
  /** Renders the children followed by the Umami tracking script tag. */
  Provider: ({ children }) => (
    <>
      {children}
      <script
        async
        src={`${env.NEXT_PUBLIC_UMAMI_HOST}/script.js`}
        data-website-id={env.NEXT_PUBLIC_UMAMI_WEBSITE_ID}
      />
    </>
  ),

  /** Forwards a custom event to `window.umami.track` when it is available. */
  track: (event, data) => {
    const umami = typeof window === "undefined" ? undefined : window.umami;

    if (umami) {
      umami.track(event, data);
    }
  },

  /** Forwards the user id and traits to `window.umami.identify` when it is available. */
  identify: (userId, traits) => {
    const umami = typeof window === "undefined" ? undefined : window.umami;

    if (umami) {
      umami.identify(userId, traits);
    }
  },

  reset: () => {
    // Umami does not explicitly support resetting the session via the client-side API
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,45 @@
import { logger } from "@turbostarter/shared/logger";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
/** Returns `value` when it is a string, otherwise `undefined`. */
const asString = (value: unknown): string | undefined =>
  typeof value === "string" ? value : undefined;

/** Umami implementation of the server-side analytics strategy. */
export const { track } = {
  /**
   * Posts a custom event to `${UMAMI_API_HOST}/api/send` without awaiting the
   * result (fire-and-forget).
   *
   * A few well-known string fields (`hostname`, `language`, `referrer`,
   * `screen`, `title`, `url`) are lifted out of `data` into the payload; the
   * full `data` object is still sent alongside them. Failures — both non-OK
   * responses and rejected requests — are logged and never thrown to the caller.
   *
   * @param event - Name of the event.
   * @param data - Optional event properties.
   */
  track: (event, data) => {
    const hostname = asString(data?.hostname);
    const language = asString(data?.language);
    const referrer = asString(data?.referrer);
    const screen = asString(data?.screen);
    const title = asString(data?.title);
    // Fall back to a placeholder URL when the caller does not supply one.
    const url = asString(data?.url) ?? "app://server-side";

    void fetch(`${env.UMAMI_API_HOST}/api/send`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-umami-api-key": env.UMAMI_API_KEY ?? "",
      },
      body: JSON.stringify({
        type: "event",
        payload: {
          website: env.NEXT_PUBLIC_UMAMI_WEBSITE_ID,
          name: event,
          url: url,
          // Optional fields are only included when they are non-empty strings.
          ...(hostname && { hostname }),
          ...(language && { language }),
          ...(referrer && { referrer }),
          ...(screen && { screen }),
          ...(title && { title }),
          data,
        },
      }),
    })
      .then((res) => {
        if (!res.ok) {
          logger.error("Failed to post event to Umami: ", res);
        }
      })
      .catch((error: unknown) => {
        // fetch() rejects on network-level failures; without this handler the
        // rejection would go unhandled because the promise is not awaited.
        logger.error("Failed to post event to Umami: ", error);
      });
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1,24 @@
/* eslint-disable turbo/no-undeclared-env-vars */
import { defineEnv } from "envin";
import * as z from "zod";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Vemetric analytics provider.
// It declares a single client-side variable: the project token.
export const preset = {
id: "vemetric",
client: {
NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN: z.string(),
},
} as const satisfies Preset;
// Validated environment for the Vemetric provider: the shared `envConfig`
// combined with the Vemetric preset. Spread order matters — later entries
// override earlier ones.
export const env = defineEnv({
...envConfig,
...preset,
env: {
...process.env,
// The client variable is also referenced by its literal name
// (presumably so the bundler can inline it — confirm).
NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN:
process.env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN,
},
});

View File

@@ -0,0 +1,49 @@
import { VemetricScript, vemetric } from "@vemetric/react";
import { env } from "./env";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
/**
 * Vemetric implementation of the client-side analytics strategy.
 * `track`, `identify` and `reset` only act when `window` is defined; the
 * promises returned by the SDK are intentionally not awaited.
 */
export const { Provider, track, identify, reset } = {
  /**
   * Renders the Vemetric script (page views, outbound links and data
   * attributes tracking enabled) ahead of the children.
   */
  Provider: ({ children }) => (
    <>
      <VemetricScript
        token={env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN}
        trackPageViews
        trackOutboundLinks
        trackDataAttributes
      />
      {children}
    </>
  ),

  /** Sends a custom event, passing `data` as the event data. */
  track: (event, data) => {
    if (typeof window !== "undefined") {
      void vemetric.trackEvent(event, { eventData: data });
    }
  },

  /** Identifies the current user and sets the given traits on them. */
  identify: (userId, traits) => {
    if (typeof window !== "undefined") {
      void vemetric.identify({
        identifier: userId,
        data: { set: traits },
      });
    }
  },

  /** Resets the current user via `vemetric.resetUser()`. */
  reset: () => {
    if (typeof window !== "undefined") {
      void vemetric.resetUser();
    }
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,30 @@
import { Vemetric } from "@vemetric/node";
import { env } from "./env";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
// Module-level cache so the Vemetric client is constructed at most once.
let client: Vemetric | null = null;

/** Returns the shared Vemetric client, creating it from the project token on first use. */
const getClient = () => {
  if (client === null) {
    client = new Vemetric({
      token: env.NEXT_PUBLIC_VEMETRIC_PROJECT_TOKEN,
    });
  }

  return client;
};
/** Vemetric implementation of the server-side analytics strategy. */
export const { track } = {
  /**
   * Sends `event` through the shared Vemetric client without awaiting the
   * result. The user identifier is `data.distinctId` converted to a string,
   * or "anonymous" when that is missing.
   */
  track: (event, data) => {
    const vemetricClient = getClient();
    const userIdentifier = data?.distinctId?.toString() ?? "anonymous";

    void vemetricClient.trackEvent(event, {
      userIdentifier,
      eventData: data,
    });
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1,15 @@
import { defineEnv } from "envin";
import { envConfig } from "@turbostarter/shared/constants";
import type { Preset } from "envin/types";
// Environment preset for the Vercel analytics provider.
// It declares no variables of its own: the `server` section is empty.
export const preset = {
id: "vercel",
server: {},
} as const satisfies Preset;
// Validated environment for the Vercel provider: the shared `envConfig`
// combined with the (empty) Vercel preset. Spread order matters — later
// entries override earlier ones.
export const env = defineEnv({
...envConfig,
...preset,
});

View File

@@ -0,0 +1,22 @@
import { track as trackEvent } from "@vercel/analytics";
import { Analytics } from "@vercel/analytics/react";
import type { AnalyticsProviderClientStrategy } from "@turbostarter/analytics";
/**
 * Vercel Web Analytics implementation of the client-side analytics strategy.
 * Only `track` does real work; `identify` and `reset` are deliberate no-ops.
 */
export const { Provider, track, identify, reset } = {
  /** Renders the children followed by Vercel's `<Analytics />` component. */
  Provider: ({ children }) => (
    <>
      {children}
      <Analytics />
    </>
  ),

  // Vercel's own `track` function is used directly as the strategy's `track`.
  track: trackEvent,

  identify: () => {
    // Vercel Web Analytics doesn't expose identify() on the client
  },

  reset: () => {
    // Vercel Web Analytics doesn't expose reset() on the client
  },
} satisfies AnalyticsProviderClientStrategy;

View File

@@ -0,0 +1,9 @@
import { track as vercelTrack } from "@vercel/analytics/server";
import type { AnalyticsProviderServerStrategy } from "@turbostarter/analytics";
/** Vercel Web Analytics implementation of the server-side analytics strategy. */
export const { track } = {
  /** Forwards the event to Vercel's server-side `track`; the result is not awaited. */
  track: (event, data) => {
    const pending = vercelTrack(event, data);
    // Explicitly discard the returned value: callers do not wait for delivery.
    void pending;
  },
} satisfies AnalyticsProviderServerStrategy;

View File

@@ -0,0 +1 @@
export { track } from "./providers/server";

Some files were not shown because too many files have changed in this diff Show More