Keeby-smilyai committed
Commit fea9f26 · verified · 1 Parent(s): 8c7da06

Update app.py

Files changed (1)
  1. app.py +18 -3
app.py CHANGED
@@ -24,14 +24,29 @@ CACHE_DIR = "./model_cache"
 # Download model files
 config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
 model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
-tokenizer_path = hf_hub_download(MODEL_REPO, "tokenizer.json", cache_dir=CACHE_DIR)
 
 # Load config
 with open(config_path, 'r') as f:
     config = json.load(f)
 
-# Load tokenizer
-tokenizer = Tokenizer.from_file(tokenizer_path)
+# Create tokenizer from scratch
+print("📦 Creating tokenizer from GPT-2 base...")
+from transformers import AutoTokenizer
+
+hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+# Add custom tokens
+custom_tokens = ["<|im_start|>", "<|im_end|>"]
+hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
+
+# Save and reload as tokenizers format
+os.makedirs("./temp_tokenizer", exist_ok=True)
+hf_tokenizer.save_pretrained("./temp_tokenizer")
+tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
+
+print(f"✅ Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
+print(f" Custom tokens: {custom_tokens}")
+
 eos_token_id = config.get('eos_token_id', 50256)
 
 # Load model with TF function optimization
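
For context, the added tokenizer path can be exercised on its own. The sketch below is not part of app.py: it reuses the commit's "./temp_tokenizer" scratch directory and real transformers/tokenizers APIs, but the round-trip assertion and the vocab-size check are illustrative additions of mine, assuming transformers and tokenizers are installed.

# Standalone sketch of the tokenizer-creation path added in this commit.
import os
from transformers import AutoTokenizer
from tokenizers import Tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
custom_tokens = ["<|im_start|>", "<|im_end|>"]
hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})

# save_pretrained() writes tokenizer.json only for fast tokenizers;
# AutoTokenizer returns the fast GPT-2 tokenizer by default.
os.makedirs("./temp_tokenizer", exist_ok=True)
hf_tokenizer.save_pretrained("./temp_tokenizer")
tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

# Each custom token should encode to a single new id, not a BPE split.
for tok in custom_tokens:
    assert tokenizer.encode(tok).ids == [tokenizer.token_to_id(tok)]
print(tokenizer.get_vocab_size())  # 50259: GPT-2's 50257 plus the two new tokens

Note that this only verifies the tokenizer itself; the two new ids are meaningful only if model.keras was trained with the same extended vocabulary.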