Update serenityai.py
serenityai.py  +9 -84  CHANGED
@@ -7,6 +7,7 @@ Original file is located at
 https://colab.research.google.com/drive/1LV3l6IWVK64-7RI2C7wEiW9r7ghx9d-o
 """
 # %% Cell 1 - Model Initialization with Checkpoint Saving
+# %% Cell 1 - Model Initialization with Checkpoint Saving
 import torch
 from unsloth import FastLanguageModel
 import os
@@ -14,24 +15,18 @@ import os
 # Configuration
 model_name = "unsloth/llama-3-8B-bnb-4bit"
 max_seq_length = 2048
-dtype = torch.float16
+dtype = torch.float32  # ✅ Change to float32 for CPU
 checkpoint_dir = "./serenity_checkpoints/initial_checkpoint"
 os.makedirs(checkpoint_dir, exist_ok=True)
 
-#
-print(f"Available GPUs: {torch.cuda.device_count()}")
-print(f"CUDA version: {torch.version.cuda}")
-torch.cuda.empty_cache()
-
-# Load model with optimized configuration
+# Load model with optimized configuration for CPU
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
     max_seq_length=max_seq_length,
     dtype=dtype,
-    load_in_4bit=True,
-    device_map="auto",
-    rope_scaling={"type": "dynamic", "factor": 2.0},
-    attn_implementation="flash_attention_2",
+    load_in_4bit=False,  # ✅ Disable 4-bit quantization for CPU
+    device_map="cpu",  # ✅ Force CPU usage
+    rope_scaling={"type": "dynamic", "factor": 2.0},
 )
 
 # Apply LoRA configuration
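Note: this hunk hardcodes a CPU-only configuration (float32, 4-bit off, device_map="cpu"). A minimal sketch of a device-aware alternative, not part of this commit, assuming the same FastLanguageModel.from_pretrained parameters already used in this file:

import torch
from unsloth import FastLanguageModel

# Pick settings from the hardware that is actually present.
use_cuda = torch.cuda.is_available()
dtype = torch.float16 if use_cuda else torch.float32   # float32 is the safe CPU default
load_in_4bit = use_cuda                                # bitsandbytes 4-bit quantization needs a GPU

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8B-bnb-4bit",
    max_seq_length=2048,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto" if use_cuda else "cpu",
)

On a CPU-only Space this resolves to exactly the values added in the hunk above.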
@@ -91,8 +86,8 @@ def load_from_checkpoint(checkpoint_path):
         model_name=checkpoint_path,
         max_seq_length=max_seq_length,
         dtype=dtype,
-        load_in_4bit=True,
-        device_map="auto",
+        load_in_4bit=False,  # ✅ Ensure 4-bit is off for CPU
+        device_map="cpu",
     )
 
 # Test loading
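The function being patched here is not shown in full; the inference code in the next hunk uses loaded_model and loaded_tokenizer, so load_from_checkpoint presumably returns both. A hypothetical usage sketch under that assumption:

# Assumed return signature; the full load_from_checkpoint body is outside this diff.
loaded_model, loaded_tokenizer = load_from_checkpoint(checkpoint_dir)
print("Checkpoint loaded successfully!")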
@@ -101,79 +96,9 @@ print("Checkpoint loaded successfully!")
 
 # Example inference
 prompt = "User: How can I preserve my mental energy throughout the day?\nAI:"
-inputs = loaded_tokenizer(prompt, return_tensors="pt").to("cuda")
+inputs = loaded_tokenizer(prompt, return_tensors="pt").to("cpu")  # ✅ Move to CPU
 outputs = loaded_model.generate(**inputs, max_new_tokens=100)
 print(loaded_tokenizer.decode(outputs[0], skip_special_tokens=True))
-# %% Cell 6 - Validation and Testing
-# %% Fixing Tokenizer and Special Tokens Handling
-from unsloth import FastLanguageModel
-from transformers import AddedToken, AutoTokenizer
-import torch
-
-
-# Define Llama-3 chat template
-LLAMA3_CHAT_TEMPLATE = """
-{% for message in messages %}
-{% if message['role'] == 'system' %}
-<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% elif message['role'] == 'user' %}
-<|start_header_id|>user<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% elif message['role'] == 'assistant' %}
-<|start_header_id|>assistant<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% endif %}
-{% endfor %}
-<|start_header_id|>assistant<|end_header_id|>\n\n
-"""
-
-# Initialize tokenizer with proper template
-tokenizer = AutoTokenizer.from_pretrained(
-    "unsloth/llama-3-8B-bnb-4bit",
-    padding_side="right",
-    truncation_side="right",
-    pad_token="<|end_of_text|>",
-    additional_special_tokens=[
-        "<|begin_of_text|>",
-        "<|start_header_id|>",
-        "<|end_header_id|>",
-        "<|eot_id|>",
-    ],
-    tokenizer_type="llama",
-    use_fast=True,
-)
-
-# Set the chat template explicitly
-tokenizer.chat_template = LLAMA3_CHAT_TEMPLATE
-
-# Initialize model
-model, _ = FastLanguageModel.from_pretrained(
-    model_name="unsloth/llama-3-8B-bnb-4bit",
-    max_seq_length=2048,
-    dtype=torch.float16,
-    load_in_4bit=True,
-    device_map="auto",
-)
-
-# Align model config with tokenizer
-model.config.pad_token_id = tokenizer.pad_token_id
-model.config.eos_token_id = tokenizer.eos_token_id
-model.config.bos_token_id = tokenizer.bos_token_id
-
-# Verify chat template
-print("Chat template configured:", tokenizer.chat_template is not None)
-
-# Example usage
-messages = [
-    {"role": "system", "content": "You are Serenity AI..."},
-    {"role": "user", "content": "I'm feeling anxious..."}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True
-)
-print("Formatted prompt:\n", formatted_prompt)
-
 
 import os
 import json
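Generating with an 8B model on CPU is slow; a small sketch, not part of this commit, that runs the same generate() call under inference mode and makes the thread count explicit:

import os
import torch

torch.set_num_threads(os.cpu_count() or 1)  # make the CPU thread count explicit; tune for the host

with torch.inference_mode():  # no gradient state is needed for generation
    outputs = loaded_model.generate(**inputs, max_new_tokens=100)
print(loaded_tokenizer.decode(outputs[0], skip_special_tokens=True))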