Files
turbostarter/packages/cognitive-context/src/tokens.ts
Alejandro Gutiérrez 3527e732d4 feat: turbostarter boilerplate
Production-ready Next.js boilerplate with:
- Runtime env validation (fail-fast on missing vars)
- Feature-gated config (S3, Stripe, email, OAuth)
- Docker + Coolify deployment pipeline
- PostgreSQL + pgvector, MinIO S3, Better Auth
- TypeScript strict mode (no ignoreBuildErrors)
- i18n (en/es), AI modules, billing, monitoring

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 01:01:55 +01:00

307 lines
8.0 KiB
TypeScript

/**
* Token counting and budget management module
*
* Uses js-tiktoken with cl100k_base encoding (GPT-4 compatible)
* for accurate token counting.
*/
import { readFile } from 'node:fs/promises';
import { getEncoding, Tiktoken } from 'js-tiktoken';
import type { TokenBudget, TokenReport } from './types.js';
// ============================================
// Encoder Instance (cached for performance)
// ============================================
let encoderInstance: Tiktoken | null = null;

/**
 * Return the shared cl100k_base encoder, building it on first use.
 *
 * The instance is stored in the module-level `encoderInstance` slot so that
 * subsequent calls reuse it instead of calling `getEncoding` again.
 *
 * @returns The cached (or freshly created) encoder
 */
function getEncoder(): Tiktoken {
  // Fast path: an encoder has already been created and cached.
  if (encoderInstance !== null) {
    return encoderInstance;
  }

  const created = getEncoding('cl100k_base');
  encoderInstance = created;
  return created;
}
// ============================================
// Core Token Counting
// ============================================
/**
 * Count the number of tokens in a text string.
 *
 * Special-token markers that appear literally in the text (for example
 * `<|endoftext|>`) are encoded as ordinary text. With js-tiktoken's default
 * of `disallowedSpecial = "all"`, such input makes `encode` throw, which
 * would silently downgrade the result to the character-based estimate.
 *
 * @param text - The text to count tokens for
 * @returns The number of tokens; 0 for an empty string. If encoding fails,
 *   a warning is logged and `ceil(text.length / 4)` is returned instead.
 *
 * @example
 * const count = countTokens('Hello, world!');
 * console.log(count); // 4
 */
export function countTokens(text: string): number {
  // A falsy check already covers the empty string.
  if (!text) {
    return 0;
  }

  try {
    // Empty allow/disallow lists: never throw on special-token look-alikes,
    // just tokenize them like any other text.
    return getEncoder().encode(text, [], []).length;
  } catch (error) {
    // Handle encoding errors gracefully - estimate based on characters
    // Rough estimate: ~4 characters per token for English text
    console.warn('Token encoding failed, using character-based estimate:', error);
    return Math.ceil(text.length / 4);
  }
}
/**
 * Count tokens in a file.
 *
 * The file is read as UTF-8 and measured with `countTokens`. If reading
 * fails, a warning is logged and a report with 0 tokens (never over budget)
 * is returned instead of rejecting.
 *
 * @param filePath - Path to the file to count tokens for
 * @param budget - Optional budget to compare against (defaults to Infinity)
 * @returns TokenReport with file statistics. `percentage` is 0 whenever the
 *   budget is not a finite positive number (Infinity, 0, negative, NaN), so
 *   it is never NaN, Infinity or negative.
 *
 * @example
 * const report = await countFileTokens('./README.md', 1000);
 * console.log(report.overBudget); // false if under 1000 tokens
 */
export async function countFileTokens(
  filePath: string,
  budget: number = Infinity
): Promise<TokenReport> {
  try {
    const content = await readFile(filePath, 'utf-8');
    const tokens = countTokens(content);

    // Only a finite, positive budget yields a meaningful ratio. This matches
    // the `budget > 0` guard used by createTokenReport and avoids dividing
    // by zero.
    const hasUsableBudget = Number.isFinite(budget) && budget > 0;

    return {
      file: filePath,
      tokens,
      budget,
      overBudget: tokens > budget,
      percentage: hasUsableBudget ? Math.round((tokens / budget) * 100) : 0,
    };
  } catch (error) {
    // If file can't be read, return a report indicating the error
    // with 0 tokens (can't count what we can't read)
    const message = error instanceof Error ? error.message : 'Unknown error';
    console.warn(`Failed to read file ${filePath}: ${message}`);
    return {
      file: filePath,
      tokens: 0,
      budget,
      overBudget: false,
      percentage: 0,
    };
  }
}
// ============================================
// Budget Validation
// ============================================
/**
 * Validate token reports against a budget configuration.
 *
 * Two checks are performed:
 * 1. every report whose `overBudget` flag is set is collected as a violation
 *    (in input order);
 * 2. the sum of all `tokens` is compared with `budget.total`; if it is
 *    exceeded, a synthetic report named `[TOTAL]` is appended.
 *
 * @param reports - Array of TokenReport objects to validate
 * @param budget - TokenBudget configuration with limits
 * @returns `valid` is true when there are no violations; `violations` lists
 *   the offending reports. The `[TOTAL]` entry's `percentage` is 0 when
 *   `budget.total` is not positive, so it is never Infinity.
 *
 * @example
 * const reports = [
 *   { file: 'SUMMARY.md', tokens: 250, budget: 300, overBudget: false, percentage: 83 },
 *   { file: 'wisdom/guide.md', tokens: 2000, budget: 1500, overBudget: true, percentage: 133 },
 * ];
 * const result = validateBudget(reports, budget);
 * console.log(result.valid); // false
 * console.log(result.violations); // [{ file: 'wisdom/guide.md', ... }]
 */
export function validateBudget(
  reports: TokenReport[],
  budget: TokenBudget
): { valid: boolean; violations: TokenReport[] } {
  // Per-file violations: trust the flag already computed on each report.
  const violations: TokenReport[] = reports.filter((report) => report.overBudget);

  // Aggregate violation across all files.
  const totalTokens = reports.reduce((sum, report) => sum + report.tokens, 0);
  if (totalTokens > budget.total) {
    violations.push({
      file: '[TOTAL]',
      tokens: totalTokens,
      budget: budget.total,
      overBudget: true,
      // Same guard as createTokenReport: avoid dividing by a zero or
      // negative budget.
      percentage:
        budget.total > 0 ? Math.round((totalTokens / budget.total) * 100) : 0,
    });
  }

  return {
    valid: violations.length === 0,
    violations,
  };
}
// ============================================
// Text Truncation
// ============================================
/**
 * Truncate text to fit within a token limit.
 *
 * The longest prefix whose token count is within `maxTokens` is located with
 * a binary search over the prefix length, then shortened to a sentence or
 * word boundary when `findCleanBreakPoint` finds one close to the end.
 *
 * NOTE(review): the binary search treats the token count as non-decreasing
 * in the prefix length; that is an assumption about the tokenizer, not
 * something this module verifies.
 *
 * @param text - The text to truncate
 * @param maxTokens - Maximum number of tokens allowed
 * @returns The original text if it already fits, '' when `text` is empty or
 *   `maxTokens <= 0`, otherwise a prefix of `text`. The prefix never ends in
 *   the middle of a UTF-16 surrogate pair.
 *
 * @example
 * const longText = 'This is a very long text...';
 * const truncated = truncateToTokenLimit(longText, 10);
 * console.log(countTokens(truncated) <= 10); // true
 */
export function truncateToTokenLimit(text: string, maxTokens: number): string {
  if (!text || maxTokens <= 0) {
    return '';
  }

  if (countTokens(text) <= maxTokens) {
    return text;
  }

  // Binary search for the largest prefix length that still fits.
  // Invariant: the prefix of length `low` fits (the empty prefix always does).
  let low = 0;
  let high = text.length;
  while (low < high) {
    // Round up so the loop makes progress when high === low + 1.
    const mid = Math.floor((low + high + 1) / 2);
    if (countTokens(text.slice(0, mid)) <= maxTokens) {
      low = mid;
    } else {
      high = mid - 1;
    }
  }

  let result = text.slice(0, low);

  // slice() works on UTF-16 code units, so the cut may land between the two
  // halves of a surrogate pair. Drop a dangling high surrogate rather than
  // returning a malformed string. (charCodeAt on '' yields NaN, which fails
  // both comparisons.)
  const lastUnit = result.charCodeAt(result.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    result = result.slice(0, -1);
  }

  // Try to find a clean break point (sentence or word boundary)
  const cleanBreak = findCleanBreakPoint(result);
  return cleanBreak || result;
}
/**
 * Find a clean break point in text (sentence or word boundary).
 *
 * Preference order:
 * 1. cut after the last `.`, `!` or `?`, if that keeps more than 70% of the
 *    text;
 * 2. otherwise cut before the last whitespace-delimited word, if that keeps
 *    more than 80% of the text;
 * 3. otherwise return the text unchanged.
 *
 * The patterns use `[\s\S]` rather than `.` so that boundaries are found in
 * multi-line text too; `.` does not match line terminators, which made the
 * search give up after the first line.
 *
 * @param text - Text that has already been cut to size
 * @returns A prefix of `text` ending at a clean boundary, or `text` itself
 */
function findCleanBreakPoint(text: string): string {
  if (!text) return '';

  // Try to find the last sentence boundary
  const sentence = /^([\s\S]+[.!?])\s*[^.!?]*$/.exec(text)?.[1];
  if (sentence !== undefined && sentence.length > text.length * 0.7) {
    return sentence;
  }

  // Fall back to last word boundary
  const words = /^([\s\S]+)\s+\S*$/.exec(text)?.[1];
  if (words !== undefined && words.length > text.length * 0.8) {
    return words;
  }

  // If no good break point, return as-is
  return text;
}
// ============================================
// Utility Functions
// ============================================
/**
 * Build a TokenReport from content that is already in memory.
 *
 * @param file - File path recorded in the report
 * @param content - Text whose tokens are counted
 * @param budget - Token budget the content is measured against
 * @returns Report containing the token count, whether it exceeds `budget`,
 *   and the rounded usage percentage (0 when `budget` is not positive)
 */
export function createTokenReport(
  file: string,
  content: string,
  budget: number
): TokenReport {
  const tokens = countTokens(content);
  const overBudget = tokens > budget;

  // A non-positive budget has no meaningful ratio; report 0 instead.
  let percentage = 0;
  if (budget > 0) {
    percentage = Math.round((tokens / budget) * 100);
  }

  return { file, tokens, budget, overBudget, percentage };
}
/**
 * Pick the budget that applies to a file, based on keywords in its path.
 *
 * The lower-cased path is checked for the substrings `summary`,
 * `capabilities` and `wisdom`, in that order; the first hit decides. Paths
 * matching none of them fall back to the total budget.
 *
 * @param filePath - The file path
 * @param budget - TokenBudget configuration
 * @returns The applicable budget for this file
 */
export function getBudgetForFile(filePath: string, budget: TokenBudget): number {
  const normalized = filePath.toLowerCase();

  // Ordered keyword -> budget field table; earlier rows take precedence.
  const rules = [
    ['summary', 'summary'],
    ['capabilities', 'capabilities'],
    ['wisdom', 'wisdomPerFile'],
  ] as const;

  for (const [keyword, field] of rules) {
    if (normalized.includes(keyword)) {
      return budget[field];
    }
  }

  // Default to total budget if no specific category matches
  return budget.total;
}
/**
 * Render a token count compactly for display.
 *
 * Counts of 1000 or more are shown in thousands with one decimal place and a
 * `k` suffix; smaller counts are printed as-is.
 *
 * @param tokens - Number of tokens
 * @returns Formatted string (e.g., "1.2k" for 1200)
 */
export function formatTokenCount(tokens: number): string {
  return tokens >= 1000 ? `${(tokens / 1000).toFixed(1)}k` : tokens.toString();
}
/**
 * Drop the cached encoder so that the next `getEncoder` call creates a new
 * one. Intended mainly for tests that need a clean module state.
 */
export function resetEncoder(): void {
  encoderInstance = null;
}