| | import { logger } from '@librechat/data-schemas'; |
| | import type { Tiktoken } from 'tiktoken'; |
| | import Tokenizer from './tokenizer'; |
| |
|
| | jest.mock('@librechat/data-schemas', () => ({ |
| | logger: { |
| | error: jest.fn(), |
| | }, |
| | })); |
| |
|
| | describe('Tokenizer', () => { |
| | it('should be a singleton (same instance)', async () => { |
| | const AnotherTokenizer = await import('./tokenizer'); |
| | expect(Tokenizer).toBe(AnotherTokenizer.default); |
| | }); |
| |
|
| | describe('getTokenizer', () => { |
| | it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => { |
| | |
| | |
| | const tokenizer = Tokenizer.getTokenizer('gpt-4', true); |
| |
|
| | |
| | expect(tokenizer).toBeDefined(); |
| | |
| | |
| | }); |
| |
|
| | it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => { |
| | |
| | |
| | const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
| |
|
| | expect(tokenizer).toBeDefined(); |
| | |
| | }); |
| |
|
| | it('should return cached tokenizer if previously fetched', () => { |
| | const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false); |
| | const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false); |
| | |
| | expect(tokenizer1).toBe(tokenizer2); |
| | }); |
| | }); |
| |
|
| | describe('freeAndResetAllEncoders', () => { |
| | beforeEach(() => { |
| | jest.clearAllMocks(); |
| | }); |
| |
|
| | it('should free all encoders and reset tokenizerCallsCount to 1', () => { |
| | |
| | Tokenizer.getTokenizer('cl100k_base', false); |
| | Tokenizer.getTokenizer('r50k_base', false); |
| |
|
| | |
| | Tokenizer.freeAndResetAllEncoders(); |
| |
|
| | |
| | expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined(); |
| | expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined(); |
| |
|
| | |
| | expect(Tokenizer.tokenizerCallsCount).toBe(1); |
| | }); |
| |
|
| | it('should catch and log errors if freeing fails', () => { |
| | |
| | const mockLoggerError = jest.spyOn(logger, 'error'); |
| |
|
| | |
| | Tokenizer.tokenizersCache['cl100k_base'] = { |
| | free() { |
| | throw new Error('Intentional free error'); |
| | }, |
| | } as unknown as Tiktoken; |
| |
|
| | |
| | Tokenizer.freeAndResetAllEncoders(); |
| |
|
| | |
| | expect(mockLoggerError).toHaveBeenCalledWith( |
| | '[Tokenizer] Free and reset encoders error', |
| | expect.any(Error), |
| | ); |
| |
|
| | |
| | mockLoggerError.mockRestore(); |
| | Tokenizer.tokenizersCache = {}; |
| | }); |
| | }); |
| |
|
| | describe('getTokenCount', () => { |
| | beforeEach(() => { |
| | jest.clearAllMocks(); |
| | Tokenizer.freeAndResetAllEncoders(); |
| | }); |
| |
|
| | it('should return the number of tokens in the given text', () => { |
| | const text = 'Hello, world!'; |
| | const count = Tokenizer.getTokenCount(text, 'cl100k_base'); |
| | expect(count).toBeGreaterThan(0); |
| | }); |
| |
|
| | it('should reset encoders if an error is thrown', () => { |
| | |
| | const tokenizer = Tokenizer.getTokenizer('cl100k_base', false); |
| | const originalEncode = tokenizer.encode; |
| | tokenizer.encode = () => { |
| | throw new Error('Forced error'); |
| | }; |
| |
|
| | |
| | const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base'); |
| | expect(count).toBeGreaterThan(0); |
| |
|
| | |
| | tokenizer.encode = originalEncode; |
| | }); |
| |
|
| | it('should reset tokenizers after 25 calls', () => { |
| | |
| | const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders'); |
| |
|
| | |
| | for (let i = 0; i < 24; i++) { |
| | Tokenizer.getTokenCount('test text', 'cl100k_base'); |
| | } |
| | expect(resetSpy).not.toHaveBeenCalled(); |
| |
|
| | |
| | Tokenizer.getTokenCount('the 25th call!', 'cl100k_base'); |
| | expect(resetSpy).toHaveBeenCalledTimes(1); |
| | }); |
| | }); |
| | }); |
| |
|