mgelard committed on
Commit
3ac4778
·
verified ·
1 Parent(s): 29c5604

Upload tokenizer

Browse files
Files changed (2) hide show
  1. tokenizer.py +1 -0
  2. tokenizer_config.json +7 -26
tokenizer.py CHANGED
@@ -88,6 +88,7 @@ class BinnedOmicTokenizer(PreTrainedTokenizer):
88
  gene_expr = gene_expr / self.normalization_factor
89
 
90
  token_ids = np.digitize(gene_expr, self.bin_edges).astype(int)
 
91
  token_ids[gene_expr == 0.0] = 0
92
 
93
  if self.prepend_cls_token:
 
88
  gene_expr = gene_expr / self.normalization_factor
89
 
90
  token_ids = np.digitize(gene_expr, self.bin_edges).astype(int)
91
+ token_ids = np.clip(token_ids, 0, self.n_expressions_bins - 1)
92
  token_ids[gene_expr == 0.0] = 0
93
 
94
  if self.prepend_cls_token:
tokenizer_config.json CHANGED
@@ -1,32 +1,13 @@
1
  {
2
- "tokenizer_class": "MOJOTokenizer",
3
- "n_expressions_bins": {
4
- "rnaseq": 64,
5
- "methylation": 64
6
- },
7
- "min_omic_value": {
8
- "rnaseq": 0.0,
9
- "methylation": 0.006863548701544
10
- },
11
- "max_omic_value": {
12
- "rnaseq": 1.0,
13
- "methylation": 0.992831582796998
14
- },
15
- "use_max_normalization": {
16
- "rnaseq": true,
17
- "methylation": false
18
- },
19
- "normalization_factor": {
20
- "rnaseq": 5.52786861525666,
21
- "methylation": 1.0
22
- },
23
- "prepend_cls_token": false,
24
- "fixed_sequence_length": 17152,
25
- "unpadded_length": 17116,
26
  "auto_map": {
27
  "AutoTokenizer": [
28
  "tokenizer.MOJOTokenizer",
29
  null
30
  ]
31
- }
32
- }
 
 
 
 
 
1
  {
2
+ "added_tokens_decoder": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "auto_map": {
4
  "AutoTokenizer": [
5
  "tokenizer.MOJOTokenizer",
6
  null
7
  ]
8
+ },
9
+ "clean_up_tokenization_spaces": false,
10
+ "extra_special_tokens": {},
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "tokenizer_class": "MOJOTokenizer"
13
+ }