# StoryKimi-Zero / inference.py
# Added by yuvraj-singh-9886 (commit 3b70c60): StoryKimi ZeroGPU implementation.
import torch
import torch.nn.functional as F
from config import ModelArgs
from model import DeepSeekV3
from tokenizer import Tokenizer
def topk_sampling(model, prompt, device, max_length=50, top_k=50, temperature=1.0, tokenizer=None, hf_token=None):
    """Autoregressively generate text from ``prompt`` using top-k sampling.

    Args:
        model: callable model, invoked as ``model(input_ids, inference=True)``;
            expected to return logits of shape (batch, seq, vocab).
        prompt: input text to condition generation on.
        device: torch device (or device string) the input ids are moved to.
        max_length: target TOTAL sequence length (prompt + generated tokens).
        top_k: number of highest-probability tokens to sample from.
        temperature: softmax temperature; lower values are greedier.
        tokenizer: HF-style tokenizer with ``encode``/``decode``; when None,
            the project ``Tokenizer`` is constructed (using ``hf_token``).
        hf_token: HuggingFace token forwarded to the default ``Tokenizer``.

    Returns:
        The decoded string: prompt plus generated continuation.
    """
    if tokenizer is None:
        # Fall back to the project tokenizer when the caller supplies none.
        tokenizer = Tokenizer(hf_token=hf_token).ready_tokenizer()

    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    # Number of NEW tokens to generate so the total reaches max_length.
    # BUGFIX: the previous code generated `len(prompt) - max_length` tokens
    # when the prompt was already longer than max_length; now we generate 0.
    num_new_tokens = max(max_length - input_ids.shape[1], 0)

    # BUGFIX: autocast device type was hard-coded to 'cuda', which is wrong
    # (and warns/disables) when running on CPU; derive it from `device`.
    device_type = 'cuda' if 'cuda' in str(device) else 'cpu'

    # BUGFIX: capture the eos id once and compare with `is not None` so an
    # eos_token_id of 0 still terminates generation (truthiness skipped it).
    eos_id = getattr(tokenizer, 'eos_token_id', None)

    for _ in range(num_new_tokens):
        with torch.no_grad(), torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            # inference=True selects the model's inference path.
            outputs = model(input_ids, inference=True)

        # Only the logits of the last position matter for the next token.
        logits = outputs[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)

        # Top-k filtering; clamp k so torch.topk never exceeds the vocab size.
        k = min(top_k, probs.size(-1))
        top_k_probs, top_k_indices = torch.topk(probs, k, dim=-1)

        # torch.multinomial renormalizes the truncated distribution internally.
        next_token = torch.multinomial(top_k_probs, num_samples=1)
        xcol = torch.gather(top_k_indices, -1, next_token)

        input_ids = torch.cat([input_ids, xcol], dim=1)  # dim 1 is the sequence axis

        if eos_id is not None and xcol.item() == eos_id:
            break

    return tokenizer.decode(input_ids[0])
def save_text(file_path, step, text):
    """Write a single step-stamped line of text to ``file_path``.

    NOTE(review): mode 'w' truncates the file on every call, so only the most
    recent step survives. If a running log is intended this should be mode
    'a' — confirm with callers before changing.

    Args:
        file_path: destination path.
        step: step number recorded in the line.
        text: text to record.
    """
    # Explicit UTF-8 avoids platform-dependent default encodings.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(f"Step {step}: {text}\n")