CreitinGameplays committed on
Commit f1b8d5d · verified · 1 Parent(s): 5b1584f

Update README.md

Files changed (1):
  1. README.md +94 -1
README.md CHANGED
@@ -9,4 +9,97 @@ base_model:
  - mistralai/Mistral-Nemo-Instruct-2407
  pipeline_tag: text-generation
  library_name: transformers
- ---
+ ---
+
+ ## Chat template:
+ ```
+ [SYSTEM]You are an AI focused on providing systematic, well-reasoned responses. Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} - Reasoning: Minimum 6 logical steps only when it required in <think> block - Process: Think first, then answer.[/SYSTEM]
+ [INST]{user_input}[/INST]
+ ```
+
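As an illustration of the format above, here is a minimal sketch that assembles a prompt by hand; `build_prompt` and the sample question are made up for this example, and for real use the tokenizer's own chat template (used via `apply_chat_template` in the next section) should be preferred, since its exact special tokens may differ:

```python
# Sketch only: build a prompt string in the [SYSTEM]/[INST] format shown above.
system_text = (
    "You are an AI focused on providing systematic, well-reasoned responses. "
    "Response Structure: - Format: <think>{{reasoning}}</think>{{answer}} "
    "- Reasoning: Minimum 6 logical steps only when it required in <think> block "
    "- Process: Think first, then answer."
)

def build_prompt(user_input: str) -> str:
    # Wrap the system text in [SYSTEM]...[/SYSTEM] and the user turn in [INST]...[/INST]
    return f"[SYSTEM]{system_text}[/SYSTEM]\n[INST]{user_input}[/INST]"

print(build_prompt("What is 17 * 24?"))
# The model is expected to answer in the form: <think>...reasoning...</think>final answer
```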
+ ## Run the model:
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
+ import bitsandbytes
+ import torch._dynamo
+ import os
+
+ # Suppress/disable torch.compile (dynamo) to avoid graph-compilation errors with quantized inference
+ torch._dynamo.config.suppress_errors = True
+ os.environ["TORCHDYNAMO_DISABLE"] = "1"
+
+ # Load the weights in 8-bit; layers that do not fit on the GPU are offloaded to CPU in fp32
+ quantization_config = BitsAndBytesConfig(
+     load_in_8bit=True,
+     llm_int8_enable_fp32_cpu_offload=True
+ )
+
+ model_id = "CreitinGameplays/Llama-3.1-8B-R1-v0.1"
+
+ # Initialize model and tokenizer with streaming support
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     quantization_config=quantization_config
+ )
+ tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)
+
+ # Custom streamer that collects the output into a string while streaming
+ class CollectingStreamer(TextStreamer):
+     def __init__(self, tokenizer):
+         # skip_prompt=True so only newly generated text (not the prompt) is printed and collected
+         super().__init__(tokenizer, skip_prompt=True)
+         self.output = ""
+     def on_finalized_text(self, text: str, stream_end: bool = False):
+         self.output += text
+         super().on_finalized_text(text, stream_end=stream_end)  # prints the text as it is generated
+
+ print("Chat session started. Type 'exit' to quit.\n")
+
+ # Initialize chat history as a list of messages
+ chat_history = []
+ chat_history.append({"role": "system", "content": "You are an AI assistant made by Mistral AI"})
+
+ while True:
+     user_input = input("You: ")
+     if user_input.strip().lower() == "exit":
+         break
+
+     # Append the user message to the chat history
+     chat_history.append({"role": "user", "content": user_input})
+
+     # Prepare the prompt by formatting the complete chat history
+     inputs = tokenizer.apply_chat_template(
+         chat_history,
+         return_tensors="pt",
+         add_special_tokens=False
+     ).to(model.device)
+
+     # Create a new streamer for the current generation
+     streamer = CollectingStreamer(tokenizer)
+
+     # Generate the streamed response
+     model.generate(
+         inputs,
+         streamer=streamer,
+         temperature=0.3,
+         top_p=0.8,
+         top_k=50,
+         repetition_penalty=1.1,
+         max_new_tokens=4096,
+         do_sample=True
+     )
+
+     # The complete response text is stored in streamer.output
+     response_text = streamer.output
+     print("\nAssistant:", response_text)
+
+     # Append the assistant response to the chat history
+     chat_history.append({"role": "assistant", "content": response_text})
+ ```
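Because responses follow the `<think>{{reasoning}}</think>{{answer}}` structure described in the chat template, the reasoning block can be separated from the final answer after generation. A minimal sketch; the `split_reasoning` helper is illustrative and not part of the model or the transformers API:

```python
import re

def split_reasoning(response_text: str):
    """Split a response of the form <think>reasoning</think>answer into its two parts."""
    match = re.search(r"<think>(.*?)</think>(.*)", response_text, flags=re.DOTALL)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    # No <think> block found: treat the whole text as the answer
    return "", response_text.strip()

# Example usage with the collected streamer output from the loop above:
# reasoning, answer = split_reasoning(streamer.output)
# print("Reasoning:\n", reasoning)
# print("Answer:\n", answer)
```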
+
+ ### Note: This model was fine-tuned for only 2,000 steps.