Wasim0606 committed on
Commit 4b11919 · verified · 1 Parent(s): 5985e62

Update serenityai.py

Files changed (1)
  1. serenityai.py +9 -84
serenityai.py CHANGED
@@ -7,6 +7,7 @@ Original file is located at
     https://colab.research.google.com/drive/1LV3l6IWVK64-7RI2C7wEiW9r7ghx9d-o
 """
 # %% Cell 1 - Model Initialization with Checkpoint Saving
+# %% Cell 1 - Model Initialization with Checkpoint Saving
 import torch
 from unsloth import FastLanguageModel
 import os
@@ -14,24 +15,18 @@ import os
 # Configuration
 model_name = "unsloth/llama-3-8B-bnb-4bit"
 max_seq_length = 2048
-dtype = torch.float16
+dtype = torch.float32  # ✅ Change to float32 for CPU
 checkpoint_dir = "./serenity_checkpoints/initial_checkpoint"
 os.makedirs(checkpoint_dir, exist_ok=True)
 
-# Hardware setup
-print(f"Available GPUs: {torch.cuda.device_count()}")
-print(f"CUDA version: {torch.version.cuda}")
-torch.cuda.empty_cache()
-
-# Load model with optimized configuration
+# Load model with optimized configuration for CPU
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name,
     max_seq_length=max_seq_length,
     dtype=dtype,
-    load_in_4bit=True,
-    device_map="auto",
-    rope_scaling={"type": "dynamic", "factor": 2.0},
-    attn_implementation="flash_attention_2",
+    load_in_4bit=False,  # ✅ Disable 4-bit quantization for CPU
+    device_map="cpu",  # ✅ Force CPU usage
+    rope_scaling={"type": "dynamic", "factor": 2.0},
 )
 
 # Apply LoRA configuration
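Unsloth's FastLanguageModel is built primarily around CUDA GPUs, so a CPU-only load as configured above may not be accepted by every Unsloth release. If it is not, a plain Hugging Face Transformers load is the usual fallback; the sketch below mirrors the dtype and device choices of this hunk, and the unquantized checkpoint name is an assumption for illustration, not something this commit uses.

# Sketch: CPU-only load via plain Transformers, mirroring dtype=float32 and device_map="cpu" above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "meta-llama/Meta-Llama-3-8B"  # hypothetical unquantized checkpoint (assumption)

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float32,  # full precision on CPU, matching the dtype change above
    device_map="cpu",           # keep every layer on CPU
    low_cpu_mem_usage=True,     # stream weights during loading to limit peak RAM
)

For reference, 8B parameters in float32 take roughly 32 GB for the weights alone, so torch.bfloat16 is often the more practical CPU dtype when memory is tight.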
@@ -91,8 +86,8 @@ def load_from_checkpoint(checkpoint_path):
         model_name=checkpoint_path,
         max_seq_length=max_seq_length,
         dtype=dtype,
-        load_in_4bit=True,
-        device_map="auto",
+        load_in_4bit=False,  # ✅ Ensure 4-bit is off for CPU
+        device_map="cpu",
     )
 
 # Test loading
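The reload above assumes a checkpoint already exists under checkpoint_dir; Cell 1 ("Model Initialization with Checkpoint Saving") presumably writes it with save_pretrained. A minimal sketch of that save step follows; the helper name save_checkpoint is illustrative and not taken from the file.

# Sketch: writing the checkpoint that load_from_checkpoint() later reads back.
def save_checkpoint(model, tokenizer, path=checkpoint_dir):
    model.save_pretrained(path)      # writes the (LoRA-adapted) weights and config
    tokenizer.save_pretrained(path)  # writes the tokenizer files alongside them
    return path

save_checkpoint(model, tokenizer)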
@@ -101,79 +96,9 @@ print("Checkpoint loaded successfully!")
 
 # Example inference
 prompt = "User: How can I preserve my mental energy throughout the day?\nAI:"
-inputs = loaded_tokenizer(prompt, return_tensors="pt").to("cuda")
+inputs = loaded_tokenizer(prompt, return_tensors="pt").to("cpu")  # ✅ Move to CPU
 outputs = loaded_model.generate(**inputs, max_new_tokens=100)
 print(loaded_tokenizer.decode(outputs[0], skip_special_tokens=True))
-# %% Cell 6 - Validation and Testing
-# %% Fixing Tokenizer and Special Tokens Handling
-from unsloth import FastLanguageModel
-from transformers import AddedToken, AutoTokenizer
-import torch
-
-
-# Define Llama-3 chat template
-LLAMA3_CHAT_TEMPLATE = """
-{% for message in messages %}
-{% if message['role'] == 'system' %}
-<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% elif message['role'] == 'user' %}
-<|start_header_id|>user<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% elif message['role'] == 'assistant' %}
-<|start_header_id|>assistant<|end_header_id|>\n\n{{ message['content'] }}<|eot_id|>
-{% endif %}
-{% endfor %}
-<|start_header_id|>assistant<|end_header_id|>\n\n
-"""
-
-# Initialize tokenizer with proper template
-tokenizer = AutoTokenizer.from_pretrained(
-    "unsloth/llama-3-8B-bnb-4bit",
-    padding_side="right",
-    truncation_side="right",
-    pad_token="<|end_of_text|>",
-    additional_special_tokens=[
-        "<|begin_of_text|>",
-        "<|start_header_id|>",
-        "<|end_header_id|>",
-        "<|eot_id|>",
-    ],
-    tokenizer_type="llama",
-    use_fast=True,
-)
-
-# Set the chat template explicitly
-tokenizer.chat_template = LLAMA3_CHAT_TEMPLATE
-
-# Initialize model
-model, _ = FastLanguageModel.from_pretrained(
-    model_name="unsloth/llama-3-8B-bnb-4bit",
-    max_seq_length=2048,
-    dtype=torch.float16,
-    load_in_4bit=True,
-    device_map="auto",
-)
-
-# Align model config with tokenizer
-model.config.pad_token_id = tokenizer.pad_token_id
-model.config.eos_token_id = tokenizer.eos_token_id
-model.config.bos_token_id = tokenizer.bos_token_id
-
-# Verify chat template
-print("Chat template configured:", tokenizer.chat_template is not None)
-
-# Example usage
-messages = [
-    {"role": "system", "content": "You are Serenity AI..."},
-    {"role": "user", "content": "I'm feeling anxious..."}
-]
-
-formatted_prompt = tokenizer.apply_chat_template(
-    messages,
-    tokenize=True,
-    add_generation_prompt=True
-)
-print("Formatted prompt:\n", formatted_prompt)
-
 
 import os
 import json
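Most of what this commit removes is a hand-written Llama-3 chat template plus the tokenizer special-token wiring around it. Tokenizers for Llama-3 instruct-style checkpoints typically ship a chat template already; when one is present, the same formatting can be produced without the custom template string. A minimal sketch, assuming the loaded tokenizer exposes such a chat_template:

# Sketch: formatting a conversation with the tokenizer's built-in chat template
# instead of the removed hand-written LLAMA3_CHAT_TEMPLATE string.
messages = [
    {"role": "system", "content": "You are Serenity AI..."},
    {"role": "user", "content": "I'm feeling anxious..."},
]
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return the formatted prompt string, not token ids
    add_generation_prompt=True,  # append the assistant header so the model answers next
)
print(prompt_text)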
 