import { logger } from '@librechat/data-schemas';

/**
 * Token-counting function; may return the count synchronously or as a Promise.
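 *
 * @example
 * // Illustrative sketches only (hypothetical; not part of this module): a crude
 * // ~4 chars/token heuristic, and the async shape a tokenizer-backed counter might take.
 * const approxCount: TokenCountFn = (text) => Math.ceil(text.length / 4);
 * const asyncCount: TokenCountFn = async (text) => Math.ceil(text.length / 4);
 */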
export type TokenCountFn = (text: string) => number | Promise<number>;

/**
* Safety buffer multiplier applied to character position estimates during truncation.
*
* We use 98% (0.98) rather than 100% to intentionally undershoot the target on the first attempt.
* This is necessary because:
* - Token density varies across text (some regions may have more tokens per character than the average)
* - The ratio-based estimate assumes uniform token distribution, which is rarely true
* - Undershooting is safer than overshooting: exceeding the limit requires another iteration,
* while being slightly under is acceptable
* - In practice, this buffer reduces refinement iterations from 2-3 down to 0-1 in most cases
*
* @example
* // If text has 1000 chars and 250 tokens (4 chars/token average), targeting 100 tokens:
* // Without buffer: estimate = 1000 * (100/250) = 400 chars → might yield 105 tokens (over!)
* // With 0.98 buffer: estimate = 400 * 0.98 = 392 chars → likely yields 97-99 tokens (safe)
*/
const TRUNCATION_SAFETY_BUFFER = 0.98;

/**
* Processes text content by counting tokens and truncating if it exceeds the specified limit.
* Uses ratio-based estimation to minimize expensive tokenCountFn calls.
*
* @param text - The text content to process
* @param tokenLimit - The maximum number of tokens allowed
* @param tokenCountFn - Function to count tokens (can be sync or async)
* @returns Promise resolving to object with processed text, token count, and truncation status
*
* @remarks
* This function uses a ratio-based estimation algorithm instead of binary search.
* Binary search would require O(log n) tokenCountFn calls (~17 for 100k chars),
* while this approach typically requires only 2-3 calls for a 90%+ reduction in CPU usage.
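 *
 * @example
 * // Illustrative call; `inputText` and `countTokens` are hypothetical caller-supplied values
 * const result = await processTextWithTokenLimit({
 *   text: inputText,
 *   tokenLimit: 4096,
 *   tokenCountFn: countTokens,
 * });
 * // result.wasTruncated is true only if inputText exceeded 4096 tokens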
*/
export async function processTextWithTokenLimit({
text,
tokenLimit,
tokenCountFn,
}: {
text: string;
tokenLimit: number;
tokenCountFn: TokenCountFn;
}): Promise<{ text: string; tokenCount: number; wasTruncated: boolean }> {
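  // Count once up front; if the text already fits within the limit, return it unchanged.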
const originalTokenCount = await tokenCountFn(text);
if (originalTokenCount <= tokenLimit) {
return {
text,
tokenCount: originalTokenCount,
wasTruncated: false,
};
}
logger.debug(
`[textTokenLimiter] Text content exceeds token limit: ${originalTokenCount} > ${tokenLimit}, truncating...`,
);
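
  // First pass: assume tokens are roughly uniformly distributed and estimate the
  // character cut point from the target/actual token ratio, shaded by the safety
  // buffer so the first attempt tends to land under the limit rather than over.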
const ratio = tokenLimit / originalTokenCount;
let charPosition = Math.floor(text.length * ratio * TRUNCATION_SAFETY_BUFFER);
let truncatedText = text.substring(0, charPosition);
let tokenCount = await tokenCountFn(truncatedText);
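
  // Refinement: if the first cut still exceeds the limit, shrink the cut point
  // proportionally to the remaining overage (again shaded by the buffer) and
  // re-count, bounded to a few passes.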
const maxIterations = 5;
let iterations = 0;
while (tokenCount > tokenLimit && iterations < maxIterations && charPosition > 0) {
const overageRatio = tokenLimit / tokenCount;
charPosition = Math.floor(charPosition * overageRatio * TRUNCATION_SAFETY_BUFFER);
truncatedText = text.substring(0, charPosition);
tokenCount = await tokenCountFn(truncatedText);
iterations++;
}
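
  // Note: if maxIterations was exhausted, the result may still be marginally over
  // the limit; the returned tokenCount reflects the actual final count.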
logger.warn(
`[textTokenLimiter] Text truncated from ${originalTokenCount} to ${tokenCount} tokens (limit: ${tokenLimit})`,
);
return {
text: truncatedText,
tokenCount,
wasTruncated: true,
};
}