feat: turbostarter boilerplate
Production-ready Next.js boilerplate with:
- Runtime env validation (fail-fast on missing vars)
- Feature-gated config (S3, Stripe, email, OAuth)
- Docker + Coolify deployment pipeline
- PostgreSQL + pgvector, MinIO S3, Better Auth
- TypeScript strict mode (no ignoreBuildErrors)
- i18n (en/es), AI modules, billing, monitoring

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
306
packages/cognitive-context/src/tokens.ts
Normal file
306
packages/cognitive-context/src/tokens.ts
Normal file
@@ -0,0 +1,306 @@
|
||||
/**
|
||||
* Token counting and budget management module
|
||||
*
|
||||
* Uses js-tiktoken with cl100k_base encoding (GPT-4 compatible)
|
||||
* for accurate token counting.
|
||||
*/
|
||||
|
||||
import { readFile } from 'node:fs/promises';
|
||||
import { getEncoding, Tiktoken } from 'js-tiktoken';
|
||||
import type { TokenBudget, TokenReport } from './types.js';
|
||||
|
||||
// ============================================
|
||||
// Encoder Instance (cached for performance)
|
||||
// ============================================
|
||||
|
||||
// Module-level cache slot for the tokenizer: null until getEncoder() first
// assigns it, and set back to null by resetEncoder().
let encoderInstance: Tiktoken | null = null;
|
||||
|
||||
/**
 * Return the shared tokenizer, building it on first use.
 *
 * The encoder is created with the `cl100k_base` encoding and stored in the
 * module-level `encoderInstance` slot so that later calls reuse it.
 *
 * @returns The cached `Tiktoken` encoder
 */
function getEncoder(): Tiktoken {
  // Fast path: an encoder was already built by an earlier call.
  if (encoderInstance !== null) {
    return encoderInstance;
  }

  const created = getEncoding('cl100k_base');
  encoderInstance = created;
  return created;
}
|
||||
|
||||
// ============================================
|
||||
// Core Token Counting
|
||||
// ============================================
|
||||
|
||||
/**
 * Count the number of tokens in a text string.
 *
 * The text is encoded with the shared encoder returned by `getEncoder()`.
 * Special-token markers that appear literally in the text (for example
 * `<|endoftext|>`) are encoded as ordinary text, so documents that merely
 * mention them are counted exactly instead of making the encoder throw.
 *
 * If encoding still fails, a warning is logged and a rough estimate of one
 * token per four characters is returned instead of throwing.
 *
 * @param text - The text to count tokens for
 * @returns The number of tokens (0 for empty input)
 *
 * @example
 * const count = countTokens('Hello, world!');
 * console.log(count); // 4
 */
export function countTokens(text: string): number {
  // Covers '' as well as null/undefined slipping in from untyped callers.
  if (!text) {
    return 0;
  }

  try {
    // By default the encoder disallows every special token and throws when
    // one occurs in the input. Passing an empty disallow list makes it treat
    // such markers as plain text, which is what a pure counter needs.
    return getEncoder().encode(text, [], []).length;
  } catch (error) {
    // Handle encoding errors gracefully - estimate based on characters
    // Rough estimate: ~4 characters per token for English text
    console.warn('Token encoding failed, using character-based estimate:', error);
    return Math.ceil(text.length / 4);
  }
}
|
||||
|
||||
/**
 * Count tokens in a file and report them against an optional budget.
 *
 * The file is read as UTF-8 and counted with `countTokens`. If anything in
 * that process throws (typically the read), a warning is logged and a report
 * with 0 tokens, `overBudget: false` and `percentage: 0` is returned rather
 * than rejecting.
 *
 * `percentage` is only computed for a finite, positive budget; for an
 * unlimited (`Infinity`), zero, negative or NaN budget it is reported as 0,
 * matching the convention used by `createTokenReport`.
 *
 * @param filePath - Path to the file to count tokens for
 * @param budget - Optional budget to compare against (defaults to Infinity)
 * @returns TokenReport with file statistics
 *
 * @example
 * const report = await countFileTokens('./README.md', 1000);
 * console.log(report.overBudget); // false if under 1000 tokens
 */
export async function countFileTokens(
  filePath: string,
  budget: number = Infinity
): Promise<TokenReport> {
  try {
    const content = await readFile(filePath, 'utf-8');
    const tokens = countTokens(content);

    // Dividing by 0 would yield Infinity/NaN and a negative budget would
    // yield a negative percentage, so only divide by a usable budget.
    const hasUsableBudget = Number.isFinite(budget) && budget > 0;

    return {
      file: filePath,
      tokens,
      budget,
      overBudget: tokens > budget,
      percentage: hasUsableBudget ? Math.round((tokens / budget) * 100) : 0,
    };
  } catch (error) {
    // If file can't be read, return a report indicating the error
    // with 0 tokens (can't count what we can't read)
    const message = error instanceof Error ? error.message : 'Unknown error';
    console.warn(`Failed to read file ${filePath}: ${message}`);

    return {
      file: filePath,
      tokens: 0,
      budget,
      overBudget: false,
      percentage: 0,
    };
  }
}
|
||||
|
||||
// ============================================
|
||||
// Budget Validation
|
||||
// ============================================
|
||||
|
||||
/**
 * Check a set of token reports against a budget configuration.
 *
 * Every report already flagged `overBudget` is collected as a violation, in
 * input order. The token counts of all reports are then summed; if the sum
 * exceeds `budget.total`, one extra synthetic report with the file name
 * `'[TOTAL]'` is appended to describe the overall overrun.
 *
 * @param reports - Array of TokenReport objects to validate
 * @param budget - TokenBudget configuration with limits
 * @returns `valid` is true only when there are no violations; `violations`
 *          lists the offending reports
 *
 * @example
 * const reports = [
 *   { file: 'SUMMARY.md', tokens: 250, budget: 300, overBudget: false, percentage: 83 },
 *   { file: 'wisdom/guide.md', tokens: 2000, budget: 1500, overBudget: true, percentage: 133 },
 * ];
 * const result = validateBudget(reports, budget);
 * console.log(result.valid); // false
 * console.log(result.violations); // [{ file: 'wisdom/guide.md', ... }]
 */
export function validateBudget(
  reports: TokenReport[],
  budget: TokenBudget
): { valid: boolean; violations: TokenReport[] } {
  // Per-file violations: trust the flag each report already carries.
  const violations: TokenReport[] = reports.filter((entry) => entry.overBudget);

  // Aggregate usage across every report, over budget or not.
  let combined = 0;
  for (const entry of reports) {
    combined += entry.tokens;
  }

  const limit = budget.total;
  if (combined > limit) {
    // Synthetic entry standing in for the whole set.
    violations.push({
      file: '[TOTAL]',
      tokens: combined,
      budget: limit,
      overBudget: true,
      percentage: Math.round((combined / limit) * 100),
    });
  }

  return { valid: violations.length === 0, violations };
}
|
||||
|
||||
// ============================================
|
||||
// Text Truncation
|
||||
// ============================================
|
||||
|
||||
/**
 * Truncate text to fit within a token limit.
 *
 * Text that already fits is returned unchanged. Otherwise the longest prefix
 * whose token count does not exceed `maxTokens` is located by binary search
 * over prefix lengths, and that prefix is then handed to
 * `findCleanBreakPoint` so the result ends on a sentence or word boundary
 * when a suitable one exists near the end.
 *
 * The cut is never placed between the two halves of a UTF-16 surrogate pair,
 * so the result does not end in a broken character.
 *
 * @param text - The text to truncate
 * @param maxTokens - Maximum number of tokens allowed
 * @returns Truncated text; '' when `text` is empty or `maxTokens <= 0`
 *
 * @example
 * const longText = 'This is a very long text...';
 * const truncated = truncateToTokenLimit(longText, 10);
 * console.log(countTokens(truncated) <= 10); // true
 */
export function truncateToTokenLimit(text: string, maxTokens: number): string {
  if (!text || maxTokens <= 0) {
    return '';
  }

  if (countTokens(text) <= maxTokens) {
    return text;
  }

  // Binary search for the largest prefix length that still fits. `low` is
  // always a length known to fit (0 trivially does); `high` is the largest
  // length not yet ruled out. The upper-biased midpoint guarantees progress.
  let low = 0;
  let high = text.length;
  while (low < high) {
    const mid = Math.floor((low + high + 1) / 2);
    if (countTokens(text.slice(0, mid)) <= maxTokens) {
      low = mid;
    } else {
      high = mid - 1;
    }
  }

  // String indices are UTF-16 code units. If the cut landed between a high
  // and a low surrogate, back off one unit so no half character is emitted.
  let end = low;
  if (end > 0 && end < text.length) {
    const before = text.charCodeAt(end - 1);
    const after = text.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      end -= 1;
    }
  }

  const fitted = text.slice(0, end);

  // Try to find a clean break point (sentence or word boundary)
  return findCleanBreakPoint(fitted) || fitted;
}
|
||||
|
||||
/**
 * Trim text back to a clean break point (sentence or word boundary).
 *
 * Preference order:
 *  1. Cut after the last sentence terminator (`.`, `!` or `?`), provided
 *     that keeps more than 70% of the input.
 *  2. Otherwise cut before the last whitespace-delimited fragment, provided
 *     that keeps more than 80% of the input.
 *  3. Otherwise return the input unchanged.
 *
 * The patterns use `[\s\S]` rather than `.` so that line breaks inside the
 * text do not prevent a match; multi-line input is handled the same way as
 * single-line input.
 *
 * @param text - Text that has already been cut to size
 * @returns The text shortened to a boundary, or unchanged if none qualifies
 */
function findCleanBreakPoint(text: string): string {
  if (!text) return '';

  // Greedy head capture ends at the LAST terminator; the tail must contain
  // no further terminators.
  const sentenceMatch = /^([\s\S]+[.!?])\s*[^.!?]*$/.exec(text);
  if (sentenceMatch !== null) {
    const head = sentenceMatch[1];
    if (head !== undefined && head.length > text.length * 0.7) {
      return head;
    }
  }

  // Fall back to the last word boundary: drop the trailing partial word.
  const wordMatch = /^([\s\S]+)\s+\S*$/.exec(text);
  if (wordMatch !== null) {
    const head = wordMatch[1];
    if (head !== undefined && head.length > text.length * 0.8) {
      return head;
    }
  }

  // If no good break point, return as-is
  return text;
}
|
||||
|
||||
// ============================================
|
||||
// Utility Functions
|
||||
// ============================================
|
||||
|
||||
/**
 * Build a TokenReport from content that is already in memory.
 *
 * The content is measured with `countTokens` and compared with `budget`.
 * `percentage` is the rounded share of the budget consumed, or 0 whenever
 * the budget is not greater than zero.
 *
 * @param file - File path recorded in the report
 * @param content - File content to measure
 * @param budget - Token budget for this file
 * @returns TokenReport describing the content's token usage
 */
export function createTokenReport(
  file: string,
  content: string,
  budget: number
): TokenReport {
  const tokens = countTokens(content);

  // Only divide when the budget is positive.
  let percentage = 0;
  if (budget > 0) {
    percentage = Math.round((tokens / budget) * 100);
  }

  const overBudget = tokens > budget;

  return { file, tokens, budget, overBudget, percentage };
}
|
||||
|
||||
/**
 * Pick the budget that applies to a file, based on keywords in its path.
 *
 * The path is lower-cased and checked, in order, for the substrings
 * `summary`, `capabilities` and `wisdom`; the first hit selects
 * `budget.summary`, `budget.capabilities` or `budget.wisdomPerFile`
 * respectively. A path matching none of them gets `budget.total`.
 *
 * @param filePath - The file path
 * @param budget - TokenBudget configuration
 * @returns The applicable budget for this file
 */
export function getBudgetForFile(filePath: string, budget: TokenBudget): number {
  const needle = filePath.toLowerCase();

  // Ordered keyword -> budget field table; earlier rows take precedence.
  const rules = [
    ['summary', 'summary'],
    ['capabilities', 'capabilities'],
    ['wisdom', 'wisdomPerFile'],
  ] as const;

  for (const [keyword, field] of rules) {
    if (needle.includes(keyword)) {
      return budget[field];
    }
  }

  // Default to total budget if no specific category matches
  return budget.total;
}
|
||||
|
||||
/**
 * Render a token count as a short display string.
 *
 * Counts of 1000 or more are shown in thousands with one decimal place and
 * a `k` suffix; smaller counts are printed as-is.
 *
 * @param tokens - Number of tokens
 * @returns Formatted string (e.g., "1.2k" for 1200)
 */
export function formatTokenCount(tokens: number): string {
  return tokens >= 1000 ? `${(tokens / 1000).toFixed(1)}k` : tokens.toString();
}
|
||||
|
||||
/**
 * Drop the cached encoder (useful for testing).
 *
 * Clears the module-level `encoderInstance` slot, so the next call to
 * `getEncoder()` builds a fresh encoder.
 */
export function resetEncoder(): void {
  encoderInstance = null;
}
|
||||
Reference in New Issue
Block a user