import torch

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

from .base import BaseModel


class LMDeployModel(BaseModel):

    def __init__(self, device='cuda', cache_size_mb=100, **kwargs):
        assert device == 'cuda', (
            'lmdeploy only supports CUDA devices; change the device or use a '
            'different backend instead.')
        # Convert the requested cache size (MB) into the fraction of GPU memory
        # that TurboMind should reserve for its k/v cache.
        cache_size_ratio = (
            cache_size_mb * 1024**2 /
            torch.cuda.get_device_properties('cuda').total_memory)
        backend_config = TurbomindEngineConfig(
            cache_max_entry_count=cache_size_ratio)
        self.pipeline = pipeline('ekwek/Soprano-80M',
                                 log_level='ERROR',
                                 backend_config=backend_config)

    def _build_gen_config(self, top_p, temperature, repetition_penalty):
        # Sampling config shared by infer() and stream_infer(); also requests
        # the last hidden states of the generated tokens.
        return GenerationConfig(output_last_hidden_state='generation',
                                do_sample=True,
                                top_p=top_p,
                                temperature=temperature,
                                repetition_penalty=repetition_penalty,
                                max_new_tokens=512)

    def infer(self, prompts, top_p=0.95, temperature=0.3, repetition_penalty=1.2):
        gen_config = self._build_gen_config(top_p, temperature, repetition_penalty)
        responses = self.pipeline(prompts, gen_config=gen_config)
        # One result dict per prompt.
        return [{
            'finish_reason': response.finish_reason,
            'hidden_state': response.last_hidden_state,
        } for response in responses]

    def stream_infer(self, prompt, top_p=0.95, temperature=0.3, repetition_penalty=1.2):
        gen_config = self._build_gen_config(top_p, temperature, repetition_penalty)
        # Yield incremental responses for a single prompt.
        for response in self.pipeline.stream_infer([prompt], gen_config=gen_config):
            yield {
                'finish_reason': response.finish_reason,
                'hidden_state': response.last_hidden_state,
            }
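
# Minimal usage sketch (not part of the original module): it assumes a CUDA
# GPU, an installed lmdeploy, and that this file sits in a package exposing
# `.base.BaseModel`, so it must be run as a module (e.g. `python -m
# <your_package>.lmdeploy_model`) for the relative import to resolve.
if __name__ == '__main__':
    model = LMDeployModel(device='cuda', cache_size_mb=100)

    # Batched inference: one dict with 'finish_reason' and 'hidden_state'
    # per prompt.
    outputs = model.infer(['Hello, world!'])
    print(outputs[0]['finish_reason'])

    # Streaming inference: incremental responses for a single prompt.
    for chunk in model.stream_infer('Hello, world!'):
        print(chunk['finish_reason'])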