File size: 2,785 Bytes
f0743f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import { logger } from '@librechat/data-schemas';
import { encoding_for_model as encodingForModel, get_encoding as getEncoding } from 'tiktoken';
import type { Tiktoken, TiktokenModel, TiktokenEncoding } from 'tiktoken';

/** Optional settings read by the Tokenizer class. */
interface TokenizerOptions {
  /** When truthy, the Tokenizer emits a debug log line each time it performs its periodic encoder reset. */
  debug?: boolean;
}

/**
 * Caches tiktoken encoders and uses them to count tokens.
 *
 * Every call to `getTokenCount` bumps a call counter; once the counter reaches 25,
 * all cached encoders are freed and the cache is rebuilt lazily on the next lookup.
 */
class Tokenizer {
  /** Live encoders, keyed by the encoding or model name they were requested with. */
  tokenizersCache: Record<string, Tiktoken>;
  /** Calls counted since the last reset; drives the periodic reset. */
  tokenizerCallsCount: number;
  private options?: TokenizerOptions;

  /**
   * @param options - Optional settings. `debug` enables a log line on periodic resets.
   *   Omitting the argument keeps the previous zero-argument behavior.
   */
  constructor(options?: TokenizerOptions) {
    this.tokenizersCache = {};
    this.tokenizerCallsCount = 0;
    if (options) {
      this.options = options;
    }
  }

  /**
   * Returns the cached encoder for `encoding`, creating and caching it on first use.
   *
   * Note: the cache key is `encoding` alone, so `isModelName` and `extendSpecialTokens`
   * only take effect on the call that creates the encoder.
   *
   * @param encoding - An encoding name, or a model name when `isModelName` is true.
   * @param isModelName - Whether `encoding` should be resolved as a model name.
   * @param extendSpecialTokens - Extra special tokens forwarded to tiktoken on creation.
   * @returns The encoder for `encoding`.
   */
  getTokenizer(
    encoding: TiktokenModel | TiktokenEncoding,
    isModelName = false,
    extendSpecialTokens: Record<string, number> = {},
  ): Tiktoken {
    const cached = this.tokenizersCache[encoding];
    if (cached) {
      return cached;
    }

    const created = isModelName
      ? encodingForModel(encoding as TiktokenModel, extendSpecialTokens)
      : getEncoding(encoding as TiktokenEncoding, extendSpecialTokens);
    this.tokenizersCache[encoding] = created;
    return created;
  }

  /**
   * Frees every cached encoder, empties the cache and resets the call counter to 1.
   *
   * Each encoder is freed independently: a failing `free()` is logged and does not
   * prevent the remaining encoders from being freed, and the failing encoder is still
   * removed from the cache so it can never be handed out again.
   */
  freeAndResetAllEncoders(): void {
    for (const key of Object.keys(this.tokenizersCache)) {
      const encoder = this.tokenizersCache[key];
      // Remove the entry first so an exception from free() cannot leave it cached.
      delete this.tokenizersCache[key];
      if (!encoder) {
        continue;
      }
      try {
        encoder.free();
      } catch (error) {
        logger.error('[Tokenizer] Free and reset encoders error', error);
      }
    }
    this.tokenizerCallsCount = 1;
  }

  /**
   * Frees all encoders once the call counter has reached 25, then counts the current call.
   */
  resetTokenizersIfNecessary(): void {
    if (this.tokenizerCallsCount >= 25) {
      if (this.options?.debug) {
        logger.debug('[Tokenizer] freeAndResetAllEncoders: reached 25 encodings, resetting...');
      }
      this.freeAndResetAllEncoders();
    }
    this.tokenizerCallsCount++;
  }

  /**
   * Counts the tokens in `text`.
   *
   * If the first attempt throws, the error is logged, all encoders are freed, and the
   * count is retried once with a freshly created encoder. An error thrown by the retry
   * propagates to the caller.
   *
   * @param text - The text to tokenize.
   * @param encoding - Encoding to use; defaults to `cl100k_base`.
   * @returns The number of tokens `text` encodes to.
   */
  getTokenCount(text: string, encoding: TiktokenModel | TiktokenEncoding = 'cl100k_base'): number {
    this.resetTokenizersIfNecessary();
    try {
      const tokenizer = this.getTokenizer(encoding);
      return tokenizer.encode(text, 'all').length;
    } catch (error) {
      logger.error('[Tokenizer] Error getting token count:', error);
      this.freeAndResetAllEncoders();
      const tokenizer = this.getTokenizer(encoding);
      return tokenizer.encode(text, 'all').length;
    }
  }
}

/** Module-level Tokenizer instance used by `countTokens` and exposed as the default export. */
const TokenizerSingleton = new Tokenizer();

/**
 * Reports how many tokens a piece of text encodes to.
 * Delegates to the module-level Tokenizer instance with the `cl100k_base` encoding;
 * the function is async only so callers can await it.
 * @param text - Text to tokenize; an omitted argument is treated as the empty string.
 * @returns A promise that resolves to the token count for `text`.
 */
export async function countTokens(text = ''): Promise<number> {
  const tokenCount = TokenizerSingleton.getTokenCount(text, 'cl100k_base');
  return tokenCount;
}

// The default export is the module-level Tokenizer instance created above.
export default TokenizerSingleton;