Safetensors
File size: 1,455 Bytes
4527b5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env bash
#
# Generate a timestamped tokenization config and run the tokenization
# pipeline inside the project's Poetry environment.
#
# Usage:    bash run_tokenization.sh   (from the repository root)
# Requires: poetry with the project environment installed;
#           data/processed/sample_data.h5ad and its metadata JSON present.

# Fail fast: abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

# Timestamp string (e.g. 20230404123056) so each config file is unique.
ts="$(date '+%Y%m%d%H%M%S')"
readonly ts
readonly CONFIG_FILE="configs/tokenization_config_${ts}.json"

# The redirect below fails if configs/ does not exist — create it first.
mkdir -p configs

# Write the tokenization arguments as JSON.
# The quoted 'EOF' delimiter disables shell expansion: the body is
# literal JSON and must reach the file byte-for-byte.
cat <<'EOF' > "$CONFIG_FILE"
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF

# Run tokenization.py inside the Poetry environment.
# NOTE: the original used 'poetry shell', which spawns an *interactive*
# subshell and blocks the script — the python command would only run after
# that shell exited, and outside the Poetry env. 'poetry run' executes the
# command directly in the project environment.
poetry run python teddy/data_processing/tokenization/tokenization.py \
  --data_path data/processed/sample_data.h5ad \
  --metadata_path data/processed/sample_data_metadata.json \
  --config_path "$CONFIG_FILE"