MsAlEhR committed
Commit 23072ad · verified · 1 Parent(s): 52359bd

Update README.md

Files changed (1)
  1. README.md +32 -22
README.md CHANGED
@@ -30,29 +30,39 @@ Details of the dataset will be shared in the supplementary materials of the pape
  ```
  3. **Example Code:**
  ```python
- from KmerTokenizer import KmerTokenizer
- from transformers import AutoModel
- import torch
+ from KmerTokenizer import KmerTokenizer
+ from transformers import AutoModel
+ import torch

- # Example gene sequence
- seq_list = ["ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"]
-
- # Initialize the tokenizer
- tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=4096)
- tokenized_output = tokenizer.kmer_tokenize(seq_list)
-
- # Convert tokenized output to tensor
- inputs = torch.tensor(tokenized_output)
-
- # Load the pre-trained BigBird model
- model = AutoModel.from_pretrained("MsAlEhR/MetaBERTa-bigbird-gene", output_hidden_states=True)
-
- # Generate hidden states
- hidden_states = model(inputs)[0]
-
- # Compute mean and max pooling of the hidden states
- embedding_mean = torch.mean(hidden_states[-1], dim=1)
- embedding_max = torch.max(hidden_states[-1], dim=1)
+ # Example gene sequence
+ seq = "ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"
+
+ # Initialize the tokenizer
+ tokenizer = KmerTokenizer(kmerlen=6, overlapping=True, maxlen=4096)
+ tokenized_output = tokenizer.kmer_tokenize(seq)
+ pad_token_id = 2 # Set pad token ID
+
+ # Create attention mask (1 for tokens, 0 for padding)
+ attention_mask = torch.tensor([1 if token != pad_token_id else 0 for token in tokenized_output], dtype=torch.long).unsqueeze(0)
+
+ # Convert tokenized output to LongTensor and add batch dimension
+ inputs = torch.tensor([tokenized_output], dtype=torch.long)
+
+ # Load the pre-trained BigBird model
+ model = AutoModel.from_pretrained("MsAlEhR/MetaBERTa-bigbird-gene", output_hidden_states=True)
+
+ # Generate hidden states
+ outputs = model(input_ids=inputs, attention_mask=attention_mask)
+
+ # Get embeddings from the last hidden state
+ embeddings = outputs.hidden_states[-1]
+
+ # Expand attention mask to match the embedding dimensions
+ expanded_attention_mask = attention_mask.unsqueeze(-1)
+
+ # Compute mean sequence embeddings
+ mean_sequence_embeddings = torch.sum(expanded_attention_mask * embeddings, dim=1) / torch.sum(expanded_attention_mask, dim=1)
+
  ```

  **Citation:**
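
The substance of this commit is swapping unmasked mean/max pooling for attention-mask-aware mean pooling, driven by an explicit pad-token mask. Below is a minimal, self-contained sketch of the two ideas the new snippet relies on; the `overlapping_kmers` helper and the toy tensor shapes are hypothetical stand-ins for illustration, not the actual `KmerTokenizer` internals or real model outputs.

```python
import torch

# Hypothetical stand-in for overlapping k-mer tokenization: slide a
# window of length k one base at a time, so a sequence of length n
# yields n - k + 1 overlapping k-mers.
def overlapping_kmers(seq: str, k: int = 6) -> list:
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]

seq = "ATTTTTTTTTTTCCCCCCCCCCCGGGGGGGGATCGATGC"
print(len(seq), len(overlapping_kmers(seq)))  # 39 -> 34 overlapping 6-mers

# Masked mean pooling on toy data: batch of 1, 8 positions (the last 3
# are padding), hidden size 4; random values stand in for hidden states.
torch.manual_seed(0)
embeddings = torch.randn(1, 8, 4)
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0, 0]])

# Same formula as the updated README code: zero out padded positions,
# then divide by the number of real tokens rather than the padded length.
mask = attention_mask.unsqueeze(-1)
masked_mean = torch.sum(mask * embeddings, dim=1) / torch.sum(mask, dim=1)

# Sanity check: identical to averaging only the 5 real positions.
assert torch.allclose(masked_mean, embeddings[:, :5].mean(dim=1))
print(masked_mean.shape)  # torch.Size([1, 4])
```

Without the mask, any padded positions get averaged into the sequence embedding, pulling short sequences toward the padding representation; the revised snippet avoids exactly that.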