File size: 4,889 Bytes
f0743f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
 * @file Tokenizer.spec.ts
 *
 * Tests the real TokenizerSingleton (no mocking of `tiktoken`).
 * Make sure to install `tiktoken` and have it configured properly.
 */

import { logger } from '@librechat/data-schemas';
import type { Tiktoken } from 'tiktoken';
import Tokenizer from './tokenizer';

// Replace the shared logger with a stub so tests can assert on error logging
// without emitting real log output. Only `error` is stubbed because that is
// the only logger method these tests exercise (see the free-failure test).
// NOTE: jest hoists this `jest.mock` call above the `import` statements at
// runtime, so the mocked module is in place before `./tokenizer` loads.
jest.mock('@librechat/data-schemas', () => ({
  logger: {
    error: jest.fn(),
  },
}));

describe('Tokenizer', () => {
  it('should be a singleton (same instance)', async () => {
    // A dynamic import of the same module path must resolve to the same
    // default export, proving module-level singleton behavior.
    const AnotherTokenizer = await import('./tokenizer'); // same path
    expect(Tokenizer).toBe(AnotherTokenizer.default);
  });

  describe('getTokenizer', () => {
    it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => {
      // The real `encoding_for_model` will be called internally
      // as soon as we pass isModelName = true.
      const tokenizer = Tokenizer.getTokenizer('gpt-4', true);

      // Basic sanity checks
      expect(tokenizer).toBeDefined();
      // You can optionally check certain properties from `tiktoken` if they exist
      // e.g., expect(typeof tokenizer.encode).toBe('function');
    });

    it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => {
      // The real `get_encoding` will be called internally
      // as soon as we pass isModelName = false.
      const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);

      expect(tokenizer).toBeDefined();
      // e.g., expect(typeof tokenizer.encode).toBe('function');
    });

    it('should return cached tokenizer if previously fetched', () => {
      const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false);
      const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false);
      // Should be the exact same instance from the cache
      expect(tokenizer1).toBe(tokenizer2);
    });
  });

  describe('freeAndResetAllEncoders', () => {
    beforeEach(() => {
      jest.clearAllMocks();
    });

    it('should free all encoders and reset tokenizerCallsCount to 1', () => {
      // By creating two different encodings, we populate the cache
      Tokenizer.getTokenizer('cl100k_base', false);
      Tokenizer.getTokenizer('r50k_base', false);

      // Now free them
      Tokenizer.freeAndResetAllEncoders();

      // The internal cache is cleared
      expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined();
      expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined();

      // tokenizerCallsCount is reset to 1
      expect(Tokenizer.tokenizerCallsCount).toBe(1);
    });

    it('should catch and log errors if freeing fails', () => {
      // Mock logger.error before the test
      const mockLoggerError = jest.spyOn(logger, 'error');

      // Set up a problematic tokenizer in the cache whose `free()` throws,
      // to exercise the error-handling path inside freeAndResetAllEncoders.
      Tokenizer.tokenizersCache['cl100k_base'] = {
        free() {
          throw new Error('Intentional free error');
        },
      } as unknown as Tiktoken;

      // Should not throw uncaught errors
      Tokenizer.freeAndResetAllEncoders();

      // Verify logger.error was called with correct arguments
      expect(mockLoggerError).toHaveBeenCalledWith(
        '[Tokenizer] Free and reset encoders error',
        expect.any(Error),
      );

      // Clean up so the poisoned cache entry cannot affect later tests
      mockLoggerError.mockRestore();
      Tokenizer.tokenizersCache = {};
    });
  });

  describe('getTokenCount', () => {
    beforeEach(() => {
      jest.clearAllMocks();
      // Start each test from a clean cache and a reset call counter so the
      // call-count-based reset logic below is deterministic per test.
      Tokenizer.freeAndResetAllEncoders();
    });

    it('should return the number of tokens in the given text', () => {
      const text = 'Hello, world!';
      const count = Tokenizer.getTokenCount(text, 'cl100k_base');
      expect(count).toBeGreaterThan(0);
    });

    it('should reset encoders if an error is thrown', () => {
      // We can simulate an error by temporarily overriding the selected tokenizer's `encode` method.
      const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
      const originalEncode = tokenizer.encode;
      tokenizer.encode = () => {
        throw new Error('Forced error');
      };

      try {
        // Despite the forced error, the code should catch and reset, then re-encode
        const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base');
        expect(count).toBeGreaterThan(0);
      } finally {
        // FIX: restore inside `finally` so a failing assertion cannot leak
        // the throwing `encode` stub into subsequent tests.
        tokenizer.encode = originalEncode;
      }
    });

    it('should reset tokenizers after 25 calls', () => {
      // Spy on freeAndResetAllEncoders (spyOn calls through by default,
      // so the real reset still happens on the 25th call)
      const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders');

      try {
        // Make 24 calls; should NOT reset yet
        for (let i = 0; i < 24; i++) {
          Tokenizer.getTokenCount('test text', 'cl100k_base');
        }
        expect(resetSpy).not.toHaveBeenCalled();

        // 25th call triggers the reset
        Tokenizer.getTokenCount('the 25th call!', 'cl100k_base');
        expect(resetSpy).toHaveBeenCalledTimes(1);
      } finally {
        // FIX: the spy was previously never restored, leaking a wrapped
        // `freeAndResetAllEncoders` beyond this test.
        resetSpy.mockRestore();
      }
    });
  });
});