File size: 4,889 Bytes
f0743f4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | /**
* @file Tokenizer.spec.cjs
*
* Tests the real TokenizerSingleton (no mocking of `tiktoken`).
* Make sure to install `tiktoken` and have it configured properly.
*/
import { logger } from '@librechat/data-schemas';
import type { Tiktoken } from 'tiktoken';
import Tokenizer from './tokenizer';
jest.mock('@librechat/data-schemas', () => ({
logger: {
error: jest.fn(),
},
}));
describe('Tokenizer', () => {
it('should be a singleton (same instance)', async () => {
const AnotherTokenizer = await import('./tokenizer'); // same path
expect(Tokenizer).toBe(AnotherTokenizer.default);
});
describe('getTokenizer', () => {
it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => {
// The real `encoding_for_model` will be called internally
// as soon as we pass isModelName = true.
const tokenizer = Tokenizer.getTokenizer('gpt-4', true);
// Basic sanity checks
expect(tokenizer).toBeDefined();
// You can optionally check certain properties from `tiktoken` if they exist
// e.g., expect(typeof tokenizer.encode).toBe('function');
});
it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => {
// The real `get_encoding` will be called internally
// as soon as we pass isModelName = false.
const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
expect(tokenizer).toBeDefined();
// e.g., expect(typeof tokenizer.encode).toBe('function');
});
it('should return cached tokenizer if previously fetched', () => {
const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false);
const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false);
// Should be the exact same instance from the cache
expect(tokenizer1).toBe(tokenizer2);
});
});
describe('freeAndResetAllEncoders', () => {
beforeEach(() => {
jest.clearAllMocks();
});
it('should free all encoders and reset tokenizerCallsCount to 1', () => {
// By creating two different encodings, we populate the cache
Tokenizer.getTokenizer('cl100k_base', false);
Tokenizer.getTokenizer('r50k_base', false);
// Now free them
Tokenizer.freeAndResetAllEncoders();
// The internal cache is cleared
expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined();
expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined();
// tokenizerCallsCount is reset to 1
expect(Tokenizer.tokenizerCallsCount).toBe(1);
});
it('should catch and log errors if freeing fails', () => {
// Mock logger.error before the test
const mockLoggerError = jest.spyOn(logger, 'error');
// Set up a problematic tokenizer in the cache
Tokenizer.tokenizersCache['cl100k_base'] = {
free() {
throw new Error('Intentional free error');
},
} as unknown as Tiktoken;
// Should not throw uncaught errors
Tokenizer.freeAndResetAllEncoders();
// Verify logger.error was called with correct arguments
expect(mockLoggerError).toHaveBeenCalledWith(
'[Tokenizer] Free and reset encoders error',
expect.any(Error),
);
// Clean up
mockLoggerError.mockRestore();
Tokenizer.tokenizersCache = {};
});
});
describe('getTokenCount', () => {
beforeEach(() => {
jest.clearAllMocks();
Tokenizer.freeAndResetAllEncoders();
});
it('should return the number of tokens in the given text', () => {
const text = 'Hello, world!';
const count = Tokenizer.getTokenCount(text, 'cl100k_base');
expect(count).toBeGreaterThan(0);
});
it('should reset encoders if an error is thrown', () => {
// We can simulate an error by temporarily overriding the selected tokenizer's `encode` method.
const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
const originalEncode = tokenizer.encode;
tokenizer.encode = () => {
throw new Error('Forced error');
};
// Despite the forced error, the code should catch and reset, then re-encode
const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base');
expect(count).toBeGreaterThan(0);
// Restore the original encode
tokenizer.encode = originalEncode;
});
it('should reset tokenizers after 25 calls', () => {
// Spy on freeAndResetAllEncoders
const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders');
// Make 24 calls; should NOT reset yet
for (let i = 0; i < 24; i++) {
Tokenizer.getTokenCount('test text', 'cl100k_base');
}
expect(resetSpy).not.toHaveBeenCalled();
// 25th call triggers the reset
Tokenizer.getTokenCount('the 25th call!', 'cl100k_base');
expect(resetSpy).toHaveBeenCalledTimes(1);
});
});
});
|