Update README.md (#2)
Browse files- Update README.md (faabe96800e0c28814a7133d8881d3f9c78f0575)
README.md
CHANGED
|
@@ -594,157 +594,6 @@ You can finetune this model on your own dataset.
|
|
| 594 |
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 595 |
-->
|
| 596 |
|
| 597 |
-
## Training Details
|
| 598 |
-
|
| 599 |
-
### Training Hyperparameters
|
| 600 |
-
#### Non-Default Hyperparameters
|
| 601 |
-
|
| 602 |
-
- `eval_strategy`: steps
|
| 603 |
-
- `per_device_eval_batch_size`: 4
|
| 604 |
-
- `gradient_accumulation_steps`: 4
|
| 605 |
-
- `learning_rate`: 2e-05
|
| 606 |
-
- `lr_scheduler_type`: cosine
|
| 607 |
-
- `warmup_ratio`: 0.1
|
| 608 |
-
- `warmup_steps`: 5
|
| 609 |
-
- `bf16`: True
|
| 610 |
-
- `tf32`: True
|
| 611 |
-
- `optim`: adamw_torch_fused
|
| 612 |
-
- `gradient_checkpointing`: True
|
| 613 |
-
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
| 614 |
-
- `batch_sampler`: no_duplicates
|
| 615 |
-
|
| 616 |
-
#### All Hyperparameters
|
| 617 |
-
<details><summary>Click to expand</summary>
|
| 618 |
-
|
| 619 |
-
- `overwrite_output_dir`: False
|
| 620 |
-
- `do_predict`: False
|
| 621 |
-
- `eval_strategy`: steps
|
| 622 |
-
- `prediction_loss_only`: True
|
| 623 |
-
- `per_device_train_batch_size`: 8
|
| 624 |
-
- `per_device_eval_batch_size`: 4
|
| 625 |
-
- `per_gpu_train_batch_size`: None
|
| 626 |
-
- `per_gpu_eval_batch_size`: None
|
| 627 |
-
- `gradient_accumulation_steps`: 4
|
| 628 |
-
- `eval_accumulation_steps`: None
|
| 629 |
-
- `learning_rate`: 2e-05
|
| 630 |
-
- `weight_decay`: 0.0
|
| 631 |
-
- `adam_beta1`: 0.9
|
| 632 |
-
- `adam_beta2`: 0.999
|
| 633 |
-
- `adam_epsilon`: 1e-08
|
| 634 |
-
- `max_grad_norm`: 1.0
|
| 635 |
-
- `num_train_epochs`: 3
|
| 636 |
-
- `max_steps`: -1
|
| 637 |
-
- `lr_scheduler_type`: cosine
|
| 638 |
-
- `lr_scheduler_kwargs`: {}
|
| 639 |
-
- `warmup_ratio`: 0.1
|
| 640 |
-
- `warmup_steps`: 5
|
| 641 |
-
- `log_level`: passive
|
| 642 |
-
- `log_level_replica`: warning
|
| 643 |
-
- `log_on_each_node`: True
|
| 644 |
-
- `logging_nan_inf_filter`: True
|
| 645 |
-
- `save_safetensors`: True
|
| 646 |
-
- `save_on_each_node`: False
|
| 647 |
-
- `save_only_model`: False
|
| 648 |
-
- `restore_callback_states_from_checkpoint`: False
|
| 649 |
-
- `no_cuda`: False
|
| 650 |
-
- `use_cpu`: False
|
| 651 |
-
- `use_mps_device`: False
|
| 652 |
-
- `seed`: 42
|
| 653 |
-
- `data_seed`: None
|
| 654 |
-
- `jit_mode_eval`: False
|
| 655 |
-
- `use_ipex`: False
|
| 656 |
-
- `bf16`: True
|
| 657 |
-
- `fp16`: False
|
| 658 |
-
- `fp16_opt_level`: O1
|
| 659 |
-
- `half_precision_backend`: auto
|
| 660 |
-
- `bf16_full_eval`: False
|
| 661 |
-
- `fp16_full_eval`: False
|
| 662 |
-
- `tf32`: True
|
| 663 |
-
- `local_rank`: 0
|
| 664 |
-
- `ddp_backend`: None
|
| 665 |
-
- `tpu_num_cores`: None
|
| 666 |
-
- `tpu_metrics_debug`: False
|
| 667 |
-
- `debug`: []
|
| 668 |
-
- `dataloader_drop_last`: True
|
| 669 |
-
- `dataloader_num_workers`: 0
|
| 670 |
-
- `dataloader_prefetch_factor`: None
|
| 671 |
-
- `past_index`: -1
|
| 672 |
-
- `disable_tqdm`: False
|
| 673 |
-
- `remove_unused_columns`: True
|
| 674 |
-
- `label_names`: None
|
| 675 |
-
- `load_best_model_at_end`: False
|
| 676 |
-
- `ignore_data_skip`: False
|
| 677 |
-
- `fsdp`: []
|
| 678 |
-
- `fsdp_min_num_params`: 0
|
| 679 |
-
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
|
| 680 |
-
- `fsdp_transformer_layer_cls_to_wrap`: None
|
| 681 |
-
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
|
| 682 |
-
- `deepspeed`: None
|
| 683 |
-
- `label_smoothing_factor`: 0.0
|
| 684 |
-
- `optim`: adamw_torch_fused
|
| 685 |
-
- `optim_args`: None
|
| 686 |
-
- `adafactor`: False
|
| 687 |
-
- `group_by_length`: False
|
| 688 |
-
- `length_column_name`: length
|
| 689 |
-
- `ddp_find_unused_parameters`: None
|
| 690 |
-
- `ddp_bucket_cap_mb`: None
|
| 691 |
-
- `ddp_broadcast_buffers`: False
|
| 692 |
-
- `dataloader_pin_memory`: True
|
| 693 |
-
- `dataloader_persistent_workers`: False
|
| 694 |
-
- `skip_memory_metrics`: True
|
| 695 |
-
- `use_legacy_prediction_loop`: False
|
| 696 |
-
- `push_to_hub`: False
|
| 697 |
-
- `resume_from_checkpoint`: None
|
| 698 |
-
- `hub_model_id`: None
|
| 699 |
-
- `hub_strategy`: every_save
|
| 700 |
-
- `hub_private_repo`: False
|
| 701 |
-
- `hub_always_push`: False
|
| 702 |
-
- `gradient_checkpointing`: True
|
| 703 |
-
- `gradient_checkpointing_kwargs`: {'use_reentrant': False}
|
| 704 |
-
- `include_inputs_for_metrics`: False
|
| 705 |
-
- `eval_do_concat_batches`: True
|
| 706 |
-
- `fp16_backend`: auto
|
| 707 |
-
- `push_to_hub_model_id`: None
|
| 708 |
-
- `push_to_hub_organization`: None
|
| 709 |
-
- `mp_parameters`:
|
| 710 |
-
- `auto_find_batch_size`: False
|
| 711 |
-
- `full_determinism`: False
|
| 712 |
-
- `torchdynamo`: None
|
| 713 |
-
- `ray_scope`: last
|
| 714 |
-
- `ddp_timeout`: 1800
|
| 715 |
-
- `torch_compile`: False
|
| 716 |
-
- `torch_compile_backend`: None
|
| 717 |
-
- `torch_compile_mode`: None
|
| 718 |
-
- `dispatch_batches`: None
|
| 719 |
-
- `split_batches`: None
|
| 720 |
-
- `include_tokens_per_second`: False
|
| 721 |
-
- `include_num_input_tokens_seen`: False
|
| 722 |
-
- `neftune_noise_alpha`: None
|
| 723 |
-
- `optim_target_modules`: None
|
| 724 |
-
- `batch_eval_metrics`: False
|
| 725 |
-
- `batch_sampler`: no_duplicates
|
| 726 |
-
- `multi_dataset_batch_sampler`: proportional
|
| 727 |
-
|
| 728 |
-
</details>
|
| 729 |
-
|
| 730 |
-
### Training Logs
|
| 731 |
-
| Epoch | Step | Training Loss | retrieval loss | sts loss | reranking loss |
|
| 732 |
-
|:------:|:----:|:-------------:|:-------------:|:--------:|:--------------:|
|
| 733 |
-
| 0.5222 | 500 | 0.7949 | 0.0187 | 2.6522 | 0.2931 |
|
| 734 |
-
| 1.0444 | 1000 | 0.6813 | 0.0139 | 2.5109 | 0.2695 |
|
| 735 |
-
| 1.5666 | 1500 | 0.5148 | 0.0118 | 2.5270 | 0.2807 |
|
| 736 |
-
| 2.0888 | 2000 | 0.48 | 0.0114 | 2.5418 | 0.2791 |
|
| 737 |
-
| 2.6110 | 2500 | 0.3782 | 0.0117 | 2.5740 | 0.2787 |
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
### Framework Versions
|
| 741 |
-
- Python: 3.10.12
|
| 742 |
-
- Sentence Transformers: 3.0.1
|
| 743 |
-
- Transformers: 4.41.2
|
| 744 |
-
- PyTorch: 2.2.0+cu121
|
| 745 |
-
- Accelerate: 0.32.1
|
| 746 |
-
- Datasets: 2.20.0
|
| 747 |
-
- Tokenizers: 0.19.1
|
| 748 |
|
| 749 |
## Citation
|
| 750 |
|
|
|
|
| 594 |
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
| 595 |
-->
|
| 596 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 597 |
|
| 598 |
## Citation
|
| 599 |
|