Update inference_fine_tune.py
inference_fine_tune.py (+23 -27)
@@ -31,41 +31,37 @@ model.eval()
 state = torch.load(model_path,map_location=torch.device('cpu'))
 model.load_state_dict(state['model_state_dict'])

-def generate_response(prompt:str):
-    print("Prompt
-
-    word = ""
+def generate_response(prompt: str):
+    print("Prompt:", prompt)
     input_tokens = tokenizer.encode(prompt).ids
-    input_tokens
+    input_tokens = [user_token_id] + input_tokens + [ai_token_id]
+
     if len(input_tokens) > config['seq_len']:
-        print(f"
-
-
-
-
-    decoder_input = decoder_input.unsqueeze(0)
+        print(f"Exceeding max length of input: {config['seq_len']}")
+        return
+
+    input_tokens = torch.tensor(input_tokens).unsqueeze(0).to(device) # (1, seq_len)
+
     temperature = 0.7
     top_k = 50
     i = 0
-
-    while
-
-
-        # Get model output
-        out = model.decode(decoder_input)
-        logits = model.project(out[:, -1]) # Get logits for last token
+
+    while input_tokens.shape[1] < 2000:
+        out = model.decode(input_tokens)
+        logits = model.project(out[:, -1])
         logits = logits / temperature
         top_k_logits, top_k_indices = torch.topk(logits, top_k)
         probs = torch.softmax(top_k_logits, dim=-1)
         next_token = torch.multinomial(probs, num_samples=1)
         next_token = top_k_indices.gather(-1, next_token)
-
-
-
-
-
-
-
+
+        decoded_word = tokenizer.decode([next_token.item()])
+        yield decoded_word # Streaming output token-by-token
+
+        input_tokens = torch.cat([input_tokens, next_token], dim=1)
+        if input_tokens.shape[1] > config['seq_len']:
+            input_tokens = input_tokens[:, -config['seq_len']:]
+
+        if next_token.item() == eos_token_id or i >= 1024:
             break
-
-    return word
+        i += 1
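
For reference, the top-k/temperature sampling step inside the generation loop can be exercised on its own. The sketch below mirrors those lines with made-up logits; the helper name sample_top_k and the toy vocabulary size are illustrative assumptions, not part of this file:

import torch

def sample_top_k(logits: torch.Tensor, temperature: float = 0.7, top_k: int = 50) -> torch.Tensor:
    # Scale logits by temperature, keep only the top-k candidates, and sample one token id.
    logits = logits / temperature
    top_k_logits, top_k_indices = torch.topk(logits, top_k)
    probs = torch.softmax(top_k_logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)  # position within the top-k set
    return top_k_indices.gather(-1, next_token)           # map back to vocabulary ids

# Toy check: a batch of 1 over a 100-token vocabulary yields one sampled id of shape (1, 1).
print(sample_top_k(torch.randn(1, 100)).shape)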
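Because generate_response now yields decoded tokens instead of building and returning a single string, a caller can stream the reply as it is produced. A minimal consumption sketch, assuming the rest of this script (model, tokenizer, and the special-token ids) is already loaded as above; the prompt text is only an example:

for token in generate_response("Hello, how are you?"):
    print(token, end="", flush=True)  # print each decoded token as soon as it is yielded
print()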