{
  "ae_mode": "token",
  "attn_implementation": null,
  "auto_map": {
    "AutoConfig": "modeling_clara.CLaRaConfig",
    "AutoModel": "modeling_clara.CLaRa"
  },
  "compr_base_model_name": "/mnt/ceph_rbd/model/Mistral-7B-Instruct-v0.2",
  "compr_every_n_layer": null,
  "compr_linear_type": "concat",
  "compr_mlp_hidden_dim": 8096,
  "compr_model_name": null,
  "compr_n_layers": 5,
  "compr_rate": 128,
  "compr_rms_norm": false,
  "compr_use_mlp": false,
  "decoder_model_name": "/mnt/conductor_data/data/hf_models/Mistral-7B-Instruct-v0.2",
  "device_map": null,
  "different_mem_tokens": true,
  "doc_max_length": 256,
  "generation_top_k": 5,
  "kbtc_training": false,
  "load_adapters": true,
  "load_pretrained_checkpoint": false,
  "lora": true,
  "lora_compressor": false,
  "lora_r": 16,
  "lora_r_compressor": 16,
  "max_new_tokens": 128,
  "model_type": "CLaRa",
  "optimize_mem_tokens": true,
  "pad_token_id": 2,
  "pure_inference": false,
  "quantization": "no",
  "sep": true,
  "stage2_retrieval_top_n": 1,
  "training_form": "both_separately",
  "training_stage": "stage2",
  "transformers_version": "4.53.3"
}