File size: 3,327 Bytes
f0743f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import { logger } from '@librechat/data-schemas';

/**
 * Signature of a tokenizer callback: receives a piece of text and yields how many
 * tokens it contains, either directly or through a promise.
 */
export type TokenCountFn = (text: string) => Promise<number> | number;

/**
 * Multiplier applied to every estimated cut position while truncating.
 *
 * The cut position is derived from the overall tokens-per-character ratio, which
 * assumes tokens are spread evenly through the text. Real text is not uniform, so a
 * cut placed exactly at the estimate can still land above the token limit. Scaling the
 * estimate by 0.98 deliberately aims slightly short:
 * - landing a little under the limit is an acceptable result;
 * - landing over the limit costs another tokenizer call to correct.
 *
 * @example
 * // 1000 chars, 250 tokens (4 chars/token on average), target of 100 tokens:
 * //   raw estimate      = 1000 * (100 / 250) = 400 chars  (may come out slightly over)
 * //   buffered estimate = 400 * 0.98         = 392 chars  (aims just under the target)
 */
const TRUNCATION_SAFETY_BUFFER = 0.98;

/**
 * Counts the tokens in `text` and, when the count exceeds `tokenLimit`, returns a
 * prefix of the text that fits within the limit.
 *
 * The cut position is estimated from the ratio `tokenLimit / tokenCount` (scaled by
 * `TRUNCATION_SAFETY_BUFFER`) and refined for at most five passes, so that the
 * potentially expensive `tokenCountFn` is called only a handful of times in the
 * common case. If the estimate still has not converged after those passes, a
 * bisection over the remaining prefix takes over so the returned text does not
 * silently exceed the limit.
 *
 * Cuts are always placed on a code-point boundary: a UTF-16 surrogate pair is never
 * split, so the result never ends in a lone surrogate.
 *
 * @param text - The text content to process
 * @param tokenLimit - The maximum number of tokens allowed
 * @param tokenCountFn - Function to count tokens (can be sync or async)
 * @returns Promise resolving to the (possibly truncated) text, its token count, and
 *   whether truncation happened. If even the empty string is counted above the limit,
 *   the empty string and its count are returned.
 *
 * @remarks
 * The bisection fallback assumes a longer prefix never counts fewer tokens than a
 * shorter one; it needs O(log n) extra `tokenCountFn` calls and only runs when the
 * ratio-based passes fail.
 */
export async function processTextWithTokenLimit({
  text,
  tokenLimit,
  tokenCountFn,
}: {
  text: string;
  tokenLimit: number;
  tokenCountFn: TokenCountFn;
}): Promise<{ text: string; tokenCount: number; wasTruncated: boolean }> {
  const originalTokenCount = await tokenCountFn(text);

  if (originalTokenCount <= tokenLimit) {
    return {
      text,
      tokenCount: originalTokenCount,
      wasTruncated: false,
    };
  }

  logger.debug(
    `[textTokenLimiter] Text content exceeds token limit: ${originalTokenCount} > ${tokenLimit}, truncating...`,
  );

  // First guess: scale the text length by the fraction of tokens we may keep.
  const ratio = tokenLimit / originalTokenCount;
  let charPosition = Math.floor(text.length * ratio * TRUNCATION_SAFETY_BUFFER);

  let truncatedText = sliceAtCodePointBoundary(text, charPosition);
  let tokenCount = await tokenCountFn(truncatedText);

  const maxIterations = 5;
  let iterations = 0;

  // Refine: each pass shrinks the cut by the observed overage (plus the safety buffer).
  while (tokenCount > tokenLimit && iterations < maxIterations && charPosition > 0) {
    const overageRatio = tokenLimit / tokenCount;
    charPosition = Math.floor(charPosition * overageRatio * TRUNCATION_SAFETY_BUFFER);
    truncatedText = sliceAtCodePointBoundary(text, charPosition);
    tokenCount = await tokenCountFn(truncatedText);
    iterations++;
  }

  // Fallback: the ratio passes ran out while still over the limit. Bisect between the
  // empty prefix and the last (too long) cut. `low` always marks a position accepted as
  // fitting (or 0), `high` a position known to be over the limit.
  if (tokenCount > tokenLimit && charPosition > 0) {
    let low = 0;
    let high = charPosition;
    truncatedText = '';
    tokenCount = await tokenCountFn(truncatedText);

    while (high - low > 1) {
      const mid = low + Math.floor((high - low) / 2);
      const candidate = sliceAtCodePointBoundary(text, mid);
      const candidateCount = await tokenCountFn(candidate);
      if (candidateCount <= tokenLimit) {
        low = mid;
        truncatedText = candidate;
        tokenCount = candidateCount;
      } else {
        high = mid;
      }
    }
  }

  logger.warn(
    `[textTokenLimiter] Text truncated from ${originalTokenCount} to ${tokenCount} tokens (limit: ${tokenLimit})`,
  );

  return {
    text: truncatedText,
    tokenCount,
    wasTruncated: true,
  };
}

/**
 * Returns `text.substring(0, end)`, moving the cut one UTF-16 unit earlier when `end`
 * would fall between the high and low halves of a surrogate pair.
 */
function sliceAtCodePointBoundary(text: string, end: number): string {
  let cut = end;
  if (cut > 0 && cut < text.length) {
    const before = text.charCodeAt(cut - 1);
    const after = text.charCodeAt(cut);
    const isHighSurrogate = before >= 0xd800 && before <= 0xdbff;
    const isLowSurrogate = after >= 0xdc00 && after <= 0xdfff;
    if (isHighSurrogate && isLowSurrogate) {
      cut -= 1;
    }
  }
  return text.substring(0, cut);
}