import os

from transformers import AutoTokenizer


class Tokenizer:

    def __init__(self, hf_token=None) -> None:
        # Fall back to the standard HF_TOKEN environment variable if no token is passed in
        hf_token = hf_token or os.environ.get("HF_TOKEN")

        if hf_token:
            print("[INFO] Using HF token for model access")
        else:
            print("[INFO] No HF token provided - using public models only")

        # Prefer the gated Llama tokenizer when a token is available; otherwise
        # fall back to the public GPT-2 tokenizer, which requires no authentication.
        self.tokenizer = None
        if hf_token:
            try:
                self.tokenizer = AutoTokenizer.from_pretrained(
                    "meta-llama/Llama-2-7b-hf", token=hf_token
                )
            except Exception as exc:
                print(f"[WARN] Could not load Llama tokenizer ({exc})")

        if self.tokenizer is None:
            print("[INFO] Falling back to public GPT-2 tokenizer")
            self.tokenizer = AutoTokenizer.from_pretrained("gpt2")

        # GPT-2 (and some other tokenizers) ship without a padding token, so add one.
        # Any model paired with this tokenizer must resize its embeddings accordingly.
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def ready_tokenizer(self):
        # Return the configured tokenizer instance
        return self.tokenizer
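

# Illustrative usage sketch (not part of the original file): shows how the
# Tokenizer wrapper might be exercised. The sample text and max_length value
# are assumptions chosen for demonstration only.
if __name__ == "__main__":
    wrapper = Tokenizer()  # picks up HF_TOKEN from the environment if set
    tokenizer = wrapper.ready_tokenizer()

    # Encode a sample sentence with padding and truncation enabled
    batch = tokenizer(
        "Hello, tokenizer!",
        padding="max_length",
        truncation=True,
        max_length=16,
    )
    print(batch["input_ids"])  # list of token ids, padded to length 16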