import itertools
from typing import List


class ResidueLevelTokenizer:
    """Tokenizer that operates at the protein residue (single-character) level."""

    def __init__(self, **kwargs):
        super().__init__()
        # Base vocabulary: '[pad]' followed by the residue alphabet (20 standard
        # amino acids plus ambiguous/rare codes X, B, U, Z, O and the gap
        # symbols '.' and '-'). Copy pad_tok so extending all_toks does not
        # mutate it.
        self.pad_tok = ['[pad]']
        self.all_toks = list(self.pad_tok)
        self._tokens = ['L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K', 'Q',
                        'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-']
        self.all_toks.extend(self._tokens)
        # Special tokens receive ids starting at the current vocabulary size.
        self._special_tokens = ['MASK', 'gMASK', 'sMASK', 'eod', 'sop', 'eop', '</s>', '<M>']
        self.set_special_tokens(self._special_tokens)
        # 'eos' and 'tMASK' are aliases for '</s>' and 'MASK' respectively.
        self.special_tokens['eos'] = self.special_tokens['</s>']
        self.special_tokens['tMASK'] = self.special_tokens['MASK']
        self.all_toks.extend(self._special_tokens)
        self._vocab = {t: i for i, t in enumerate(self.all_toks)}
        # Maps bracketed command forms to the underlying special-token names.
        self.command_token = {'[tMASK]': 'tMASK', '[MASK]': 'MASK',
                              '[gMASK]': 'gMASK', '[sMASK]': 'sMASK'}
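
    # Resulting id layout (derived from the construction above):
    #   0 -> '[pad]', 1-27 -> residue tokens, 28-35 -> special tokens;
    # the 'eos' and 'tMASK' aliases occupy no extra ids.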

    def pad_id(self):
        return self._vocab['[pad]']

    def set_special_tokens(self, special_tokens):
        """Register additional special tokens.

        The tokens are assigned ids starting from the current size of the
        vocabulary, in the order given by `special_tokens`.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = {tok: len(self.all_toks) + i for i, tok in enumerate(special_tokens)}
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
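
    # Illustrative example (values follow from __init__ above): with the 28
    # base tokens already in `all_toks`, set_special_tokens(['MASK', 'gMASK', ...])
    # assigns {'MASK': 28, 'gMASK': 29, ...} and builds the inverse decoder.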

    def __len__(self):
        return len(self._vocab)

    def EncodeAsIds(self, text, process_fn=None):
        """Convert a sequence to a list of token ids, one per character."""
        processed_text = text
        if process_fn is not None:
            processed_text = process_fn(processed_text)
        processed_text = str(processed_text)
        return [self.TokenToId(c) for c in processed_text]

    def IdToToken(self, idx):
        if idx == 0:
            return '[pad]'
        elif idx in self.special_tokens_decoder:
            # Special ids decode to their bracketed command form.
            return f"[{self.special_tokens_decoder[idx]}]"
        else:
            try:
                return self.all_toks[idx]
            except IndexError:
                # Out-of-range ids decode to a placeholder.
                return '*'

    def TokenToId(self, token):
        if token == '[pad]':
            return 0
        elif token in self.special_tokens:
            return self.special_tokens[token]
        else:
            return self._vocab[token]
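
    # Note the asymmetry: special ids decode to the bracketed form, e.g.
    # IdToToken(TokenToId('MASK')) == '[MASK]'; the wrapper's get_command
    # maps the bracketed form back via `command_token`.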

    def DecodeIds(self, Ids):
        return ''.join([self.IdToToken(tok) for tok in Ids])

    def _tokenize(self, text) -> List[str]:
        return text.split()

    def tokenize(self, text, **kwargs) -> List[int]:
        """
        Inspired by https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py
        Converts a string into a sequence of tokens, then into token ids.

        Args:
            text (:obj:`str`):
                The sequence to be encoded.

        Returns:
            :obj:`List[int]`: The list of token ids.
        """

        def split_on_token(tok, text):
            result = []
            split_text = text.split(tok)
            for i, sub_text in enumerate(split_text):
                # Strip the whitespace that bordered the removed token so it
                # does not leak into the neighbouring sub-texts.
                if i < len(split_text) - 1:
                    sub_text = sub_text.rstrip()
                if i > 0:
                    sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result.append(tok)
                elif i == len(split_text) - 1:
                    if sub_text:
                        result.append(sub_text)
                else:
                    if sub_text:
                        result.append(sub_text)
                    result.append(tok)
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    # NOTE: only single-residue tokens are treated as atomic
                    # here; multi-character special tokens can still be split
                    # by earlier tokens in `tok_list`.
                    if sub_text not in self._tokens:
                        tokenized_text.extend(split_on_token(tok, sub_text))
                    else:
                        tokenized_text.append(sub_text)
                text_list = tokenized_text

            # Whitespace-tokenize any remaining chunk that is not a known token.
            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token)
                        if token not in self.all_toks
                        else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.all_toks
        tokenized_text = split_on_tokens(no_split_token, text)
        return self.convert_tokens_to_ids(tokenized_text)
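
    # Example (deterministic given the vocabulary built in __init__):
    #   ResidueLevelTokenizer().tokenize('MAG') -> [17, 2, 3]
    # i.e. the ids of the residues 'M', 'A' and 'G' in `_vocab`.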

    def convert_tokens_to_ids(self, tokens):
        """Converts a sequence of tokens into ids using the vocab."""
        return [self.TokenToId(token) for token in tokens]


class proteinglm_tokenizer:
    """Protein tokenizer based on the residue-level tokenizer."""

    def __init__(self):
        self.name = 'ProteinTokenizer'
        self.tokenizer = ResidueLevelTokenizer()
        self.special_tokens = self.tokenizer.special_tokens

    def IdToToken(self, idx):
        return self.tokenizer.IdToToken(idx)

    def TokenToId(self, token):
        return self.tokenizer.TokenToId(token)

    @property
    def vocab_size(self):
        return len(self.tokenizer)

    def decode(self, token_ids):
        # Decodes a single token id (the argument is wrapped in a list);
        # use `detokenize` for a sequence of ids.
        return self.tokenizer.DecodeIds([token_ids])

    @property
    def eod(self):
        # The underlying tokenizer has no get_special_token method; look the
        # alias up directly in its special-token table.
        return self.tokenizer.special_tokens['eos']

    def detokenize(self, Ids, type_token=False):
        return self.tokenizer.DecodeIds(Ids)

    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

    @property
    def vocab(self):
        return self.tokenizer._vocab

    @property
    def inv_vocab(self):
        return {v: k for k, v in self.tokenizer._vocab.items()}

    @property
    def get_pad_id(self):
        # pad_id is a method on the underlying tokenizer, so call it rather
        # than returning the bound method object.
        return self.tokenizer.pad_id()

    def get_command(self, token):
        # Bracketed command forms such as '[MASK]' are mapped to their
        # special-token names before the id lookup.
        tok = token
        if token in self.tokenizer.command_token:
            tok = self.tokenizer.command_token[token]
        return self.tokenizer.special_tokens[tok]
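

# Minimal usage sketch (not part of the original module; the sequence below is
# an arbitrary example):
if __name__ == '__main__':
    tok = proteinglm_tokenizer()
    ids = tok.tokenize('MKTAYIAKQR')            # one id per residue
    assert tok.detokenize(ids) == 'MKTAYIAKQR'  # round-trips losslessly
    print(tok.vocab_size)                       # 36 = [pad] + 27 residues + 8 specials
    print(tok.get_command('[gMASK]'))           # id of the gMASK special token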