Spaces: Runtime error
import os
str_cmd1 = 'pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"'
str_cmd2 = 'pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes'
os.system(str_cmd1)
os.system(str_cmd2)
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # or "0,1" for multiple GPUs
from unsloth import FastLanguageModel
import torch
device = torch.device("cpu")  # note: not used anywhere below; Unsloth places the model itself
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
from langchain_community.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFacePipeline
from langchain.callbacks.base import BaseCallbackHandler
from transformers import pipeline
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth
| template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. | |
| ### Instruction: | |
| You are ResVuAssist and You are a helpful bot who reads texts and answers questions about them. | |
| ### Input: | |
| {context} | |
| QUESTION: {question} | |
| ### Response: | |
| """ | |
# Configuration
vector_db_path = "vectorstores/db_faiss"

def initialModelAndTokenizer():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,  # Supports any, but = 0 is optimized
        bias = "none",  # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None,  # And LoftQ
    )
    return model, tokenizer
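# For generation (as opposed to training), Unsloth's FastLanguageModel.for_inference
# switches the model into its faster inference mode. A minimal sketch of how it could
# be wired in; this helper is an illustration, not part of the original app.
def initialModelForInference():
    model, tokenizer = initialModelAndTokenizer()
    FastLanguageModel.for_inference(model)
    return model, tokenizer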
def create_pipeline():
    model, tokenizer = initialModelAndTokenizer()
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,  # sampling must be enabled for temperature/top_p to take effect
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.15,
    )
    return pipe
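# Sanity-check sketch: the pipeline can also be called directly, without LangChain.
# This helper and its prompt string are illustrative, not part of the original app.
def test_pipeline():
    pipe = create_pipeline()
    out = pipe("### Instruction:\nSay hello.\n### Response:\n")
    return out[0]["generated_text"]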
# Create the prompt template
def create_prompt(template):
    prompt = PromptTemplate(template = template, input_variables=["context", "question"])
    return prompt
# Create a simple chain
def create_qa_chain(prompt, llm, db):
    llm_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type = "stuff",
        # retriever = db.as_retriever(search_kwargs = {"k": 8}, max_tokens_limit=1024),
        retriever = db.as_retriever(search_kwargs = {"k": 15}, max_tokens_limit=4096),
        return_source_documents = False,
        chain_type_kwargs = {'prompt': prompt},
    )
    return llm_chain
# Read from the vector DB
def read_vectors_db():
    # Embedding
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
    db = FAISS.load_local(vector_db_path, embedding_model, allow_dangerous_deserialization=True)
    return db
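# The loader above assumes an index already exists at vectorstores/db_faiss. A minimal
# sketch of how such an index could be built; the document path, loader, and splitter
# settings are assumptions, not taken from this app.
def build_vectors_db(pdf_path="data/source.pdf"):
    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    docs = PyPDFLoader(pdf_path).load()
    chunks = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50).split_documents(docs)
    embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
    FAISS.from_documents(chunks, embedding_model).save_local(vector_db_path)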
def get_response_value(text):
    start = text.find('### Response:')
    if start != -1:
        return text[start + len('### Response:'):].strip()
    return None
def llm_chain_response():
    pipe = create_pipeline()
    db = read_vectors_db()
    prompt = create_prompt(template)
    llm = HuggingFacePipeline(pipeline=pipe)
    llm_chain = create_qa_chain(prompt, llm, db)
    return llm_chain
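# End-to-end usage sketch (the helper and the question string are illustrative, not from
# the app): build the chain once, query it, then strip everything before "### Response:".
def example_query(question="What is this document about?"):
    qa_chain = llm_chain_response()
    raw = qa_chain.invoke({"query": question})
    return get_response_value(raw["result"])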