Safetensors
File size: 1,455 Bytes
4527b5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/env bash
#
# Generate a timestamped tokenization config and run the tokenization
# pipeline inside the project's Poetry environment.
#
# Usage:    bash run_tokenization.sh   (from the repository root)
# Requires: poetry with the project environment installed;
#           data/processed/sample_data.h5ad and its metadata JSON present.

# Fail fast: abort on any error, unset variable, or failed pipeline stage.
set -euo pipefail

# Timestamp string (e.g. 20230404123056) so each config file is unique.
ts="$(date '+%Y%m%d%H%M%S')"
readonly ts
readonly CONFIG_FILE="configs/tokenization_config_${ts}.json"

# The redirect below fails if configs/ does not exist — create it first.
mkdir -p configs

# Write the tokenization arguments as JSON.
# The quoted 'EOF' delimiter disables shell expansion: the body is
# literal JSON and must reach the file byte-for-byte.
cat <<'EOF' > "$CONFIG_FILE"
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF

# Run tokenization.py inside the Poetry environment.
# NOTE: the original used 'poetry shell', which spawns an *interactive*
# subshell and blocks the script — the python command would only run after
# that shell exited, and outside the Poetry env. 'poetry run' executes the
# command directly in the project environment.
poetry run python teddy/data_processing/tokenization/tokenization.py \
  --data_path data/processed/sample_data.h5ad \
  --metadata_path data/processed/sample_data_metadata.json \
  --config_path "$CONFIG_FILE"