Keeby-smilyai committed
Commit fea9f26 · verified · 1 Parent(s): 8c7da06

Update app.py

Files changed (1)
  1. app.py +18 -3
app.py CHANGED
@@ -24,14 +24,29 @@ CACHE_DIR = "./model_cache"
 # Download model files
 config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
 model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
-tokenizer_path = hf_hub_download(MODEL_REPO, "tokenizer.json", cache_dir=CACHE_DIR)
 
 # Load config
 with open(config_path, 'r') as f:
     config = json.load(f)
 
-# Load tokenizer
-tokenizer = Tokenizer.from_file(tokenizer_path)
+# Create tokenizer from scratch
+print("📦 Creating tokenizer from GPT-2 base...")
+from transformers import AutoTokenizer
+
+hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+# Add custom tokens
+custom_tokens = ["<|im_start|>", "<|im_end|>"]
+hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
+
+# Save and reload as tokenizers format
+os.makedirs("./temp_tokenizer", exist_ok=True)
+hf_tokenizer.save_pretrained("./temp_tokenizer")
+tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
+
+print(f"✅ Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
+print(f" Custom tokens: {custom_tokens}")
+
 eos_token_id = config.get('eos_token_id', 50256)
 
 # Load model with TF function optimization
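
For context, the added tokenizer path can be exercised on its own. The sketch below is not part of app.py: it reuses the commit's "./temp_tokenizer" scratch directory and real transformers/tokenizers APIs, but the round-trip assertion and the vocab-size check are illustrative additions of mine, assuming transformers and tokenizers are installed.

# Standalone sketch of the tokenizer-creation path added in this commit.
import os
from transformers import AutoTokenizer
from tokenizers import Tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
custom_tokens = ["<|im_start|>", "<|im_end|>"]
hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})

# save_pretrained() writes tokenizer.json only for fast tokenizers;
# AutoTokenizer returns the fast GPT-2 tokenizer by default.
os.makedirs("./temp_tokenizer", exist_ok=True)
hf_tokenizer.save_pretrained("./temp_tokenizer")
tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")

# Each custom token should encode to a single new id, not a BPE split.
for tok in custom_tokens:
    assert tokenizer.encode(tok).ids == [tokenizer.token_to_id(tok)]
print(tokenizer.get_vocab_size())  # 50259: GPT-2's 50257 plus the two new tokens

Note that this only verifies the tokenizer itself; the two new ids are meaningful only if model.keras was trained with the same extended vocabulary.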