Update README.md
README.md CHANGED

@@ -30,29 +30,39 @@ Details of the dataset will be shared in the supplementary materials of the paper
 ```
 3. **Example Code:**
 ```python
-# Example gene sequence
-seq_list = ["ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"]
-
-# Initialize the tokenizer
-tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=4096)
-tokenized_output = tokenizer.kmer_tokenize(seq_list)
-
-# Convert tokenized output to tensor
-inputs = torch.tensor(tokenized_output)
-
-# Load the pre-trained BigBird model
-model = AutoModel.from_pretrained("MsAlEhR/MetaBERTa-bigbird-gene", output_hidden_states=True)
-
-# Generate hidden states
-hidden_states = model(inputs)[0]
-
-# Compute mean and max pooling of the hidden states
-embedding_mean = torch.mean(hidden_states[-1], dim=1)
-embedding_max = torch.max(hidden_states[-1], dim=1)
+from KmerTokenizer import KmerTokenizer
+from transformers import AutoModel
+import torch
+
+# Example gene sequence
+seq = "ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"
+
+# Initialize the tokenizer
+tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=4096)
+tokenized_output = tokenizer.kmer_tokenize(seq)
+pad_token_id = 2  # Set pad token ID
+
+# Create attention mask (1 for tokens, 0 for padding)
+attention_mask = torch.tensor([1 if token != pad_token_id else 0 for token in tokenized_output], dtype=torch.long).unsqueeze(0)
+
+# Convert tokenized output to LongTensor and add batch dimension
+inputs = torch.tensor([tokenized_output], dtype=torch.long)
+
+# Load the pre-trained BigBird model
+model = AutoModel.from_pretrained("MsAlEhR/MetaBERTa-bigbird-gene", output_hidden_states=True)
+
+# Generate hidden states
+outputs = model(input_ids=inputs, attention_mask=attention_mask)
+
+# Get embeddings from the last hidden state
+embeddings = outputs.hidden_states[-1]
+
+# Expand attention mask to match the embedding dimensions
+expanded_attention_mask = attention_mask.unsqueeze(-1)
+
+# Compute mean sequence embeddings
+mean_sequence_embeddings = torch.sum(expanded_attention_mask * embeddings, dim=1) / torch.sum(expanded_attention_mask, dim=1)
 ```

 **Citation:**
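The updated example stops at the mean-pooled sequence embedding. As a minimal usage sketch (not part of the commit), assuming the snippet above has already been run so that `tokenizer`, `model`, and `pad_token_id` are in scope, the pooled embeddings of two sequences could be compared with cosine similarity; the `embed` helper and the second sequence below are illustrative assumptions, not repository code.

```python
import torch
import torch.nn.functional as F

def embed(seq: str) -> torch.Tensor:
    # Reuse the same tokenize / mask / mean-pool recipe as the README example.
    tokens = tokenizer.kmer_tokenize(seq)
    ids = torch.tensor([tokens], dtype=torch.long)
    mask = torch.tensor([[1 if t != pad_token_id else 0 for t in tokens]], dtype=torch.long)
    hidden = model(input_ids=ids, attention_mask=mask).hidden_states[-1]
    mask = mask.unsqueeze(-1)
    return torch.sum(mask * hidden, dim=1) / torch.sum(mask, dim=1)  # shape: [1, hidden_size]

emb_a = embed("ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC")
emb_b = embed("ATGCGTACGTTAGCATTGCA")  # hypothetical second sequence
print(F.cosine_similarity(emb_a, emb_b, dim=1).item())
```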