#!/bin/bash -l

# Generate a timestamp string (e.g., 20230404123056)
TS=$(date '+%Y%m%d%H%M%S')
CONFIG_FILE="configs/tokenization_config_${TS}.json"

# Ensure the configs directory exists, then write the tokenization
# arguments to a timestamped config file via a heredoc
mkdir -p configs
cat > "$CONFIG_FILE" <<'EOF'
{
  "tokenizer_name_or_path": "teddy/models/teddy_g/400M",
  "gene_id_column": "index",
  "bio_annotations": true,
  "disease_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_disease_mapping.json",
  "tissue_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_tissue_mapping.json",
  "cell_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_cell_mapping.json",
  "sex_mapping": "teddy/data_processing/utils/bio_annotations/data/mappings/all_filtered_sex_mapping.json",
  "max_shard_samples": 500,
  "max_seq_len": 2048,
  "pad_length": 2048,
  "add_cls": false,
  "bins": 0,
  "continuous_rank": true,
  "add_disease_annotation": false,
  "include_zero_genes": false,
  "load_dir": "data/processed",
  "save_dir": "data/tokenized"
}
EOF

# Execute the tokenization.py script with three arguments:
# --data_path, --metadata_path, and --config_path.
# `poetry run` executes the command inside the Poetry environment;
# `poetry shell` would spawn an interactive subshell and stall the script.
poetry run python teddy/data_processing/tokenization/tokenization.py \
    --data_path data/processed/sample_data.h5ad \
    --metadata_path data/processed/sample_data_metadata.json \
    --config_path "$CONFIG_FILE"
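
# --- Optional post-run sanity check (an illustrative addition, not part of
# the original pipeline): confirm that output was written to the save_dir
# declared in the config ("data/tokenized"). The listing only shows what the
# tokenization script produced; the exact shard layout is its own concern.
if [ -d data/tokenized ]; then
    echo "Tokenized output in data/tokenized:"
    ls -lh data/tokenized
else
    echo "Expected output directory data/tokenized was not created" >&2
    exit 1
fi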