import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import List, Optional, Tuple
import bitsandbytes  # required only for quantized loading on GPU
import accelerate  # required only for device_map="auto" loading on GPU
from my_model.config import LLAMA2_config as config
import warnings

# Suppress only FutureWarning from transformers
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")

class Llama2ModelManager:
    """
    Manages loading and configuring the LLaMA-2 model and tokenizer.

    Attributes:
        device (str): Device to use for the model ('cuda' or 'cpu').
        model_name (str): Name or path of the pre-trained model.
        tokenizer_name (str): Name or path of the tokenizer.
        quantization (str): Quantization level ('4bit', '8bit', or None).
        from_saved (bool): Whether to load the model from a saved path.
        model_path (str or None): Path to the saved model if `from_saved` is True.
        trust_remote (bool): Whether to trust remote code when loading the tokenizer.
        use_fast (bool): Whether to use the fast version of the tokenizer.
        add_eos_token (bool): Whether to add an EOS token to the tokenizer.
        access_token (str): Access token for the Hugging Face Hub.
        model (AutoModelForCausalLM or None): Loaded model, initially None.
        tokenizer (AutoTokenizer or None): Loaded tokenizer, initially None.
    """

    def __init__(self) -> None:
        """
        Initializes the Llama2ModelManager class with configuration settings.
        """
        self.device: str = config.DEVICE
        self.model_name: str = config.MODEL_NAME
        self.tokenizer_name: str = config.TOKENIZER_NAME
        self.quantization: str = config.QUANTIZATION
        self.from_saved: bool = config.FROM_SAVED
        self.model_path: Optional[str] = config.MODEL_PATH
        self.trust_remote: bool = config.TRUST_REMOTE
        self.use_fast: bool = config.USE_FAST
        self.add_eos_token: bool = config.ADD_EOS_TOKEN
        self.access_token: str = config.ACCESS_TOKEN
        self.model: Optional[AutoModelForCausalLM] = None
        self.tokenizer: Optional[AutoTokenizer] = None  # set by load_tokenizer()

    def create_bnb_config(self) -> Optional[BitsAndBytesConfig]:
        """
        Creates a BitsAndBytes configuration based on the quantization setting.

        Returns:
            Optional[BitsAndBytesConfig]: Configuration for a quantized model,
            or None if `quantization` is neither '4bit' nor '8bit'.
        """
        if self.quantization == '4bit':
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
        elif self.quantization == '8bit':
            # BitsAndBytesConfig has no bnb_8bit_* quantization knobs;
            # 8-bit loading only needs the load_in_8bit flag.
            return BitsAndBytesConfig(load_in_8bit=True)
        return None
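
    # Rough memory guide (an assumption for a 7B-parameter checkpoint, not
    # measured here): ~14 GB of weights in float16, ~7 GB with 8-bit loading,
    # and ~4 GB with nf4 4-bit weights, before activations and the KV cache.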

    def load_model(self) -> AutoModelForCausalLM:
        """
        Loads the LLaMA-2 model based on the specified configuration.
        If the model is already loaded, returns the existing model.

        Returns:
            AutoModelForCausalLM: Loaded LLaMA-2 model.
        """
        if self.model is not None:
            print("Model is already loaded.")
            return self.model

        if self.from_saved:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto")
        else:
            bnb_config = None if self.quantization is None else self.create_bnb_config()
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name,
                                                              device_map="auto",
                                                              quantization_config=bnb_config,
                                                              torch_dtype=torch.float16,
                                                              token=self.access_token)

        if self.model is not None:
            print(f"LLAMA2 model loaded successfully (quantization: {self.quantization}).")
        else:
            print("LLAMA2 model failed to load.")
        return self.model
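
    # Note: device_map="auto" hands weight placement to accelerate, which can
    # shard layers across available GPUs and offload to CPU when memory is
    # tight; this is why accelerate appears among the GPU-only imports above.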

    def load_tokenizer(self) -> AutoTokenizer:
        """
        Loads the tokenizer for the LLaMA-2 model with the specified configuration.

        Returns:
            AutoTokenizer: Loaded tokenizer for the LLaMA-2 model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name,
                                                       use_fast=self.use_fast,
                                                       token=self.access_token,
                                                       trust_remote_code=self.trust_remote,
                                                       add_eos_token=self.add_eos_token)

        if self.tokenizer is not None:
            print("LLAMA2 tokenizer loaded successfully.")
        else:
            print("LLAMA2 tokenizer failed to load.")
        return self.tokenizer

    def load_model_and_tokenizer(self, for_fine_tuning: bool) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
        """
        Loads the LLaMA-2 model and tokenizer, and optionally adds special tokens for fine-tuning.

        Args:
            for_fine_tuning (bool): Whether to prepare the model and tokenizer for fine-tuning.

        Returns:
            Tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model and tokenizer.
        """
        self.tokenizer = self.load_tokenizer()
        self.model = self.load_model()
        if for_fine_tuning:
            self.add_special_tokens()
        return self.model, self.tokenizer
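
    # Ordering note: load_model_and_tokenizer() loads the tokenizer and model
    # before add_special_tokens() runs, so resize_token_embeddings() below can
    # grow the embedding matrix to match the enlarged vocabulary.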

    def add_special_tokens(self, tokens: Optional[List[str]] = None) -> None:
        """
        Adds special tokens to the tokenizer and updates the model's token embeddings if the model is loaded.

        Args:
            tokens (Optional[List[str]]): Special tokens to add. Defaults to a predefined set.

        Returns:
            None
        """
        if self.tokenizer is None:
            print("Tokenizer is not loaded. Cannot add special tokens.")
            return

        if tokens is None:
            tokens = ['[CAP]', '[/CAP]', '[QES]', '[/QES]', '[OBJ]', '[/OBJ]']

        # Update the tokenizer with the new tokens
        print(f"Original vocabulary size: {len(self.tokenizer)}")
        print(f"Adding the following tokens: {tokens}")
        self.tokenizer.add_tokens(tokens, special_tokens=True)
        self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
        print(f"Adding padding token: {self.tokenizer.pad_token}")
        self.tokenizer.padding_side = "right"
        print(f"Padding side: {self.tokenizer.padding_side}")

        # Resize the model's token embeddings if the model is loaded
        if self.model is not None:
            self.model.resize_token_embeddings(len(self.tokenizer))
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        print(f"Updated vocabulary size: {len(self.tokenizer)}")
        print(f"Padding token: {self.tokenizer.pad_token}")
        print(f"Special tokens: {self.tokenizer.added_tokens_decoder}")

if __name__ == "__main__":
    pass  # Uncomment the lines below to load the model and tokenizer and add the designed special tokens.
    # LLAMA2_manager = Llama2ModelManager()
    # LLAMA2_model = LLAMA2_manager.load_model()  # First-time model load
    # LLAMA2_tokenizer = LLAMA2_manager.load_tokenizer()
    # LLAMA2_manager.add_special_tokens()  # operates on the manager's own model and tokenizer
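
# Minimal usage sketch (assumptions: config points at a valid LLaMA-2
# checkpoint, there is enough GPU memory for the chosen quantization, and
# the tag-based prompt below is illustrative only):
#
#   manager = Llama2ModelManager()
#   model, tokenizer = manager.load_model_and_tokenizer(for_fine_tuning=True)
#   prompt = "[CAP] a man walking a dog [/CAP] [QES] What animal is shown? [/QES]"
#   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#   output_ids = model.generate(**inputs, max_new_tokens=32)
#   print(tokenizer.decode(output_ids[0], skip_special_tokens=True))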