nguyenhuucongzz01 committed
Commit 8acc7e3 · verified · 1 Parent(s): 4b6fdc3

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 1536,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
2_Dense/config.json ADDED
@@ -0,0 +1 @@
{"in_features": 1536, "out_features": 1024, "bias": true, "activation_function": "torch.nn.modules.linear.Identity"}
2_Dense/model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d0447b2202b919e514a2311e310fbe1ffbb4e92755c9d8ecc927e96d5fae37a
size 3147944
README.md ADDED
@@ -0,0 +1,1136 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:8562
- loss:CachedMultipleNegativesRankingLoss
base_model: NovaSearch/stella_en_1.5B_v5
widget:
- source_sentence: 'Subject: Sharing in a Ratio

    Construct: Given information about one part, work out the whole

    Question: The ratio of cars to vans in a car park is \( 5: 3 \)


    If there are \( 80 \) cars, how many vehicles (cars and vans) are there in total?

    CorrectAnswer: \( 128 \)

    IncorrectAnswer: \( 83 \)

    IncorrectReason: The correct answer is \( 128 \) because the ratio of cars to
    vans is \( 5:3 \). This means for every 5 cars, there are 3 vans. Given there
    are 80 cars, we can determine the number of vans by setting up a proportion. Since
    \( 5 \) parts correspond to \( 80 \) cars, each part corresponds to \( \frac{80}{5}
    = 16 \) cars. Therefore, the number of vans, which is \( 3 \) parts, is \( 3 \times
    16 = 48 \) vans. The total number of vehicles is the sum of cars and vans, which
    is \( 80 + 48 = 128 \).


    The incorrect answer \( 83 \) likely comes from a misunderstanding of how to apply
    the ratio. One might incorrectly assume that the total number of vehicles is simply
    the sum of the given number of cars and the number of vans directly derived from
    the ratio without properly scaling the ratio to match the given number of cars.
    For example, someone might incorrectly add \( 80 \) cars and \( 3 \) vans (thinking
    the ratio directly applies without scaling), leading to \( 83 \). This is a common
    misconception when dealing with ratios and proportions.'
  sentences:
  - Thinks the number in the ratio is the total
  - Increases by the given percentage rather than finding the percentage of an amount
  - Thinks there are 50 weeks in a year
- source_sentence: 'Subject: Indirect (Inverse) Proportion

    Construct: Calculations using inverse proportion

    Question: It takes 4 workers 6 days to paint a house. How many days would it take
    3 workers?

    CorrectAnswer: 8

    IncorrectAnswer: 7

    IncorrectReason: The correct answer is 8 days because the problem involves understanding
    the relationship between the number of workers and the time taken to complete
    a task. If 4 workers take 6 days to paint a house, the total work required can
    be thought of as 4 workers * 6 days = 24 worker-days. This means the total amount
    of work needed to paint the house is 24 worker-days. If you have 3 workers, the
    number of days required to complete the same amount of work would be 24 worker-days
    / 3 workers = 8 days.


    The incorrect answer of 7 days likely stems from a common misconception about
    how work rates scale with the number of workers. Someone might incorrectly assume
    that reducing the number of workers from 4 to 3 would increase the time by a proportional
    amount, such as 6 days * (4/3) = 8 days, but then round down to 7 days due to
    a miscalculation or rounding error. This misunderstanding fails to account for
    the direct proportionality between the number of workers and the time required
    to complete the work.'
  sentences:
  - When working with inverse and direct proportion, forgets to apply the root or
    index to find k, but remembers when doing the second calculation
  - Confuses the radius with the radius squared in the equation of a circle
  - Believes unrelated acute/obtuse angles in a diagram are supplementary
- source_sentence: 'Subject: Experimental Probability and Relative Frequency

    Construct: Compare relative frequencies in order to determine which prediction
    is likely to be more reliable

    Question: A vegetarian restaurant wants to know people''s eating habits

    \( 50 \) people in Town \( A \) are asked if they eat meat. \begin{tabular}{|c|c|}

    \hline \multicolumn{2}{|c|}{ Town A } \\

    \hline Do you eat meat? & Frequency \\

    \hline Yes & \( 39 \) \\

    \hline No & \( 11 \) \\

    \hline

    \end{tabular} \( 200 \) people in Town B are asked if they eat meat. \begin{tabular}{|cc|}

    \hline \multicolumn{2}{|c|}{ Town B } \\

    \hline Do you eat meat? & Frequency \\

    \hline Yes & \( 143 \) \\

    \hline No & \( 57 \) \\

    \hline

    \end{tabular} Which results are more reliable?

    CorrectAnswer: Town B

    IncorrectAnswer: They are the same

    IncorrectReason: The correct answer is Town B because the reliability of survey
    results generally increases with a larger sample size. In this case, Town B has
    a larger sample size of 200 people compared to Town A''s 50 people. Larger samples
    tend to provide more accurate estimates of the population''s eating habits, reducing
    the impact of random variation and providing a more reliable representation of
    the true population proportion.


    The incorrect answer, "They are the same," is a misconception that assumes the
    reliability of survey results does not depend on the sample size. This misunderstanding
    ignores the statistical principle that larger samples generally yield more reliable
    estimates. The smaller sample size in Town A introduces more variability and potential
    bias, making the results less reliable compared to the larger sample in Town B.'
  sentences:
  - Has considered the percentage rather than the percentage of the amount
  - Does not know that sample size affects reliability
  - Believes sets are the same if the elements within them have a shared property
- source_sentence: 'Subject: Square Roots, Cube Roots, etc

    Construct: Recognise other roots of numbers

    Question: \( \sqrt[4]{16}=? \)

    CorrectAnswer: \( 2 \)

    IncorrectAnswer: \( 16 \)

    IncorrectReason: The correct answer to the problem \( \sqrt[4]{16} \) is \( 2
    \). This is because the fourth root of a number is the number that, when raised
    to the fourth power, gives the original number. In this case, \( 2^4 = 2 \times
    2 \times 2 \times 2 = 16 \). Therefore, \( 2 \) is the correct answer as it satisfies
    the equation \( 2^4 = 16 \).


    The incorrect answer \( 16 \) likely stems from a misunderstanding of what the
    fourth root operation means. Someone might have confused the fourth root with
    the fourth power, thinking that \( 16 \) raised to the fourth power equals \(
    16 \), which is not true. The fourth power of \( 16 \) would be \( 16^4 = 65536
    \), not \( 16 \). Thus, the misconception lies in confusing the operation of taking
    a root with raising a number to a power.'
  sentences:
  - 'Confuses the use of LCM and HCF in real life examples '
  - Estimated when not appropriate
  - Does not understand the root power of 4
- source_sentence: 'Subject: Adding and Subtracting with Decimals

    Construct: Subtract decimals where the numbers involved have a different number
    of decimal places

    Question: \( 0.55-0.2= \)

    CorrectAnswer: \( 0.35 \)

    IncorrectAnswer: \( 0.33 \)

    IncorrectReason: The correct answer to the problem \( 0.55 - 0.2 \) is \( 0.35
    \). This is correct because when you subtract \( 0.2 \) from \( 0.55 \), you are
    essentially performing the operation \( 0.55 - 0.20 \). This can be visualized
    as \( 55 \) hundredths minus \( 20 \) hundredths, which equals \( 35 \) hundredths,
    or \( 0.35 \).


    The incorrect answer \( 0.33 \) likely stems from a common misconception or a
    calculation error. One possible reason for this mistake is a misunderstanding
    of decimal subtraction or a misinterpretation of the place values. For example,
    someone might incorrectly think that \( 0.55 - 0.2 \) is the same as \( 0.55 -
    0.22 \), leading to \( 0.33 \). Alternatively, the error could be due to a simple
    arithmetic mistake, such as not properly aligning the decimal points during the
    subtraction process. It''s important to ensure that the decimal points are aligned
    correctly and to understand the value of each digit in the decimal places.'
  sentences:
  - Does not know that 7 and -7 are different
  - When subtracting decimals with a different number of decimals, subtracts one digit
    from more than one column
  - Underestimates the area of shapes when counting squares when some squares are
    neither wholes nor halves
pipeline_tag: sentence-similarity
library_name: sentence-transformers
metrics:
- cosine_accuracy@25
- cosine_precision@50
- cosine_precision@100
- cosine_precision@150
- cosine_precision@200
- cosine_recall@50
- cosine_recall@100
- cosine_recall@150
- cosine_recall@200
- cosine_ndcg@25
- cosine_mrr@25
- cosine_map@25
model-index:
- name: SentenceTransformer based on NovaSearch/stella_en_1.5B_v5
  results:
  - task:
      type: information-retrieval
      name: Information Retrieval
    dataset:
      name: val
      type: val
    metrics:
    - type: cosine_accuracy@25
      value: 0.7408256880733946
      name: Cosine Accuracy@25
    - type: cosine_precision@50
      value: 0.01676605504587156
      name: Cosine Precision@50
    - type: cosine_precision@100
      value: 0.00901376146788991
      name: Cosine Precision@100
    - type: cosine_precision@150
      value: 0.006261467889908257
      name: Cosine Precision@150
    - type: cosine_precision@200
      value: 0.0047993119266055055
      name: Cosine Precision@200
    - type: cosine_recall@50
      value: 0.8371559633027523
      name: Cosine Recall@50
    - type: cosine_recall@100
      value: 0.9002293577981652
      name: Cosine Recall@100
    - type: cosine_recall@150
      value: 0.9380733944954128
      name: Cosine Recall@150
    - type: cosine_recall@200
      value: 0.9587155963302753
      name: Cosine Recall@200
    - type: cosine_ndcg@25
      value: 0.398580566924245
      name: Cosine Ndcg@25
    - type: cosine_mrr@25
      value: 0.3016886474683388
      name: Cosine Mrr@25
    - type: cosine_map@25
      value: 0.3016613429685574
      name: Cosine Map@25
---

# SentenceTransformer based on NovaSearch/stella_en_1.5B_v5

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [NovaSearch/stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [NovaSearch/stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) <!-- at revision b467445fc9c39af69fdb1bda9e18416df4d19f3c -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 1024 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: PeftModelForFeatureExtraction
  (1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1536, 'out_features': 1024, 'bias': True, 'activation_function': 'torch.nn.modules.linear.Identity'})
)
```
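
In other words, module (0) produces 1536-dimensional token embeddings, module (1) mean-pools them over the attention mask, and module (2) applies a bias-enabled linear layer with identity activation to project down to 1024 dimensions. A minimal sketch of what modules (1) and (2) compute — the function name and inputs are illustrative, not part of the library API:

```python
import torch

def pool_and_project(token_embeddings: torch.Tensor,
                     attention_mask: torch.Tensor,
                     dense: torch.nn.Linear) -> torch.Tensor:
    """Mean pooling over non-padding tokens, then the 1536 -> 1024 projection.

    token_embeddings: (batch, seq_len, 1536) hidden states from module (0)
    attention_mask:   (batch, seq_len), 1 for real tokens, 0 for padding
    dense:            torch.nn.Linear(1536, 1024, bias=True), as in 2_Dense/config.json
    """
    mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)  # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)   # sum embeddings of real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)        # number of real tokens per sentence
    return dense(summed / counts)                   # (batch, 1024)
```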

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub ("sentence_transformers_model_id" is a placeholder;
# substitute this repository's ID)
model = SentenceTransformer("sentence_transformers_model_id")
# Run inference
sentences = [
    "Subject: Adding and Subtracting with Decimals\nConstruct: Subtract decimals where the numbers involved have a different number of decimal places\nQuestion: \\( 0.55-0.2= \\)\nCorrectAnswer: \\( 0.35 \\)\nIncorrectAnswer: \\( 0.33 \\)\nIncorrectReason: The correct answer to the problem \\( 0.55 - 0.2 \\) is \\( 0.35 \\). This is correct because when you subtract \\( 0.2 \\) from \\( 0.55 \\), you are essentially performing the operation \\( 0.55 - 0.20 \\). This can be visualized as \\( 55 \\) hundredths minus \\( 20 \\) hundredths, which equals \\( 35 \\) hundredths, or \\( 0.35 \\).\n\nThe incorrect answer \\( 0.33 \\) likely stems from a common misconception or a calculation error. One possible reason for this mistake is a misunderstanding of decimal subtraction or a misinterpretation of the place values. For example, someone might incorrectly think that \\( 0.55 - 0.2 \\) is the same as \\( 0.55 - 0.22 \\), leading to \\( 0.33 \\). Alternatively, the error could be due to a simple arithmetic mistake, such as not properly aligning the decimal points during the subtraction process. It's important to ensure that the decimal points are aligned correctly and to understand the value of each digit in the decimal places.",
    'When subtracting decimals with a different number of decimals, subtracts one digit from more than one column',
    'Does not know that 7 and -7 are different',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1024]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
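
The bundled `config_sentence_transformers.json` also carries two query prompts inherited from the stella base model (`s2p_query` and `s2s_query`). Whether this fine-tune expects them is not stated in this card, but the mechanism looks like this — the query and document strings below are illustrative:

```python
# Encode queries with the "s2p_query" prompt defined in config_sentence_transformers.json;
# documents are typically encoded without a prompt.
query_embeddings = model.encode(
    ["Given a maths question and a wrong answer, which misconception explains it?"],
    prompt_name="s2p_query",
)
doc_embeddings = model.encode([
    "Thinks the number in the ratio is the total",
    "Does not know that sample size affects reliability",
])
print(model.similarity(query_embeddings, doc_embeddings))  # (1, 2) cosine scores
```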

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

## Evaluation

### Metrics

#### Information Retrieval

* Dataset: `val`
* Evaluated with [<code>InformationRetrievalEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.InformationRetrievalEvaluator)

| Metric               | Value      |
|:---------------------|:-----------|
| cosine_accuracy@25   | 0.7408     |
| cosine_precision@50  | 0.0168     |
| cosine_precision@100 | 0.009      |
| cosine_precision@150 | 0.0063     |
| cosine_precision@200 | 0.0048     |
| cosine_recall@50     | 0.8372     |
| cosine_recall@100    | 0.9002     |
| cosine_recall@150    | 0.9381     |
| cosine_recall@200    | 0.9587     |
| **cosine_ndcg@25**   | **0.3986** |
| cosine_mrr@25        | 0.3017     |
| cosine_map@25        | 0.3017     |
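
A sketch of how such numbers can be reproduced with `InformationRetrievalEvaluator`; the queries, corpus, and relevance judgments below are illustrative stand-ins, not the actual `val` split:

```python
from sentence_transformers.evaluation import InformationRetrievalEvaluator

queries = {"q1": "Subject: ... Question: ... IncorrectReason: ..."}   # diagnostic question texts
corpus = {"m1": "Thinks the number in the ratio is the total",        # misconception statements
          "m2": "Thinks there are 50 weeks in a year"}
relevant_docs = {"q1": {"m1"}}                                        # gold misconceptions per query

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name="val",
    accuracy_at_k=[25],
    precision_recall_at_k=[50, 100, 150, 200],
    mrr_at_k=[25],
    ndcg_at_k=[25],
    map_at_k=[25],
)
metrics = evaluator(model)  # returns a dict keyed like the table above
```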

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 8,562 training samples
* Columns: <code>anchor</code>, <code>positive</code>, and <code>negative</code>
* Approximate statistics based on the first 1000 samples:
  |         | anchor                                                                                | positive                                                                           | negative                                                                           |
  |:--------|:--------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------|
  | type    | string                                                                                | string                                                                             | string                                                                             |
  | details | <ul><li>min: 190 tokens</li><li>mean: 331.9 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 13.79 tokens</li><li>max: 42 tokens</li></ul> | <ul><li>min: 4 tokens</li><li>mean: 15.33 tokens</li><li>max: 41 tokens</li></ul> |
* Samples:
  | anchor | positive | negative |
  |:-------|:---------|:---------|
  | <code>Subject: Function Machines<br>Construct: Calculate the square of a number<br>Question: ![A function machine with 3 rectangles in a row, joined by arrows pointing from left to right. The first rectangle on the left is empty and says "input" above it. The middle rectangle has "square root" written inside it and the final rectangle has "output" written above it and "16" written inside it.]() What is the input of this function machine?<br>CorrectAnswer: \( 256 \)<br>IncorrectAnswer: \( 8 \)<br>IncorrectReason: The correct answer is \( 256 \) because the function machine involves taking the square root of the input to produce the output. Given that the output is \( 16 \), we need to find a number whose square root is \( 16 \). Mathematically, this means solving the equation \( \sqrt{x} = 16 \). Squaring both sides, we get \( x = 16^2 = 256 \). Therefore, the input must be \( 256 \).<br><br>The incorrect answer \( 8 \) likely stems from a common misconception. Someone might have seen the output \( 16 \) and thou...</code> | <code>Mixes up squaring and multiplying by 2 or doubling</code> | <code>Confuses written 'teen' numbers with their corresponding single-digit number</code> |
  | <code>Subject: Function Machines<br>Construct: Calculate the square of a number<br>Question: ![A function machine with 3 rectangles in a row, joined by arrows pointing from left to right. The first rectangle on the left is empty and says "input" above it. The middle rectangle has "square root" written inside it and the final rectangle has "output" written above it and "16" written inside it.]() What is the input of this function machine?<br>CorrectAnswer: \( 256 \)<br>IncorrectAnswer: \( 8 \)<br>IncorrectReason: The correct answer is \( 256 \) because the function machine involves taking the square root of the input to produce the output. Given that the output is \( 16 \), we need to find a number whose square root is \( 16 \). Mathematically, this means solving the equation \( \sqrt{x} = 16 \). Squaring both sides, we get \( x = 16^2 = 256 \). Therefore, the input must be \( 256 \).<br><br>The incorrect answer \( 8 \) likely stems from a common misconception. Someone might have seen the output \( 16 \) and thou...</code> | <code>Mixes up squaring and multiplying by 2 or doubling</code> | <code>When multiplying multiples of ten and the answer requires an extra digit, leaves off that extra digit</code> |
  | <code>Subject: Ratio and Proportion<br>Construct: Convert between currencies given an exchange rate<br>Question: Convert 350 Thai baht to Australian Dollars.\n1 Australian dollar = 25 Thai baht<br>CorrectAnswer: 14<br>IncorrectAnswer: 350<br>IncorrectReason: The correct answer is 14 Australian dollars. This is because the conversion rate given is 1 Australian dollar (AUD) equals 25 Thai baht (THB). To convert 350 THB to AUD, you divide 350 by 25, which equals 14 AUD. This calculation correctly reflects the exchange rate and the amount of money being converted.<br><br>The incorrect answer of 350 is likely due to a misunderstanding of the conversion process. Someone might have mistakenly thought that the amount in Thai baht is the same in Australian dollars, not taking into account the exchange rate. This error occurs when the person fails to apply the conversion factor, instead assuming that the currency values are equivalent without adjustment. This misconception can lead to significant errors in financial trans...</code> | <code>Assumes a 1:1 conversion ratio between different currencies</code> | <code>Believes that the larger the divisor, the larger the answer.</code> |
* Loss: [<code>CachedMultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#cachedmultiplenegativesrankingloss) with these parameters:
  ```json
  {
      "scale": 20.0,
      "similarity_fct": "cos_sim"
  }
  ```
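
For reference, a minimal training sketch consistent with this setup — the triples are hypothetical stand-ins, and only the loss class and its parameters are taken from this card:

```python
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses

# trust_remote_code may be needed for the stella base model's custom tokenizer code
model = SentenceTransformer("NovaSearch/stella_en_1.5B_v5", trust_remote_code=True)

# Hypothetical (anchor, positive, negative) triples mirroring the columns above.
train_dataset = Dataset.from_dict({
    "anchor": ["Subject: ... IncorrectAnswer: ... IncorrectReason: ..."],
    "positive": ["Thinks the number in the ratio is the total"],
    "negative": ["Thinks there are 50 weeks in a year"],
})

# The cached variant keeps the effective in-batch-negatives pool large while
# bounding GPU memory; scale=20.0 matches the parameters shown above.
loss = losses.CachedMultipleNegativesRankingLoss(model, scale=20.0)

trainer = SentenceTransformerTrainer(model=model, train_dataset=train_dataset, loss=loss)
trainer.train()
```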

### Training Hyperparameters
#### Non-Default Hyperparameters

- `eval_strategy`: steps
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 4
- `learning_rate`: 0.001
- `num_train_epochs`: 1.0
- `lr_scheduler_type`: cosine
- `save_only_model`: True
- `bf16`: True
- `load_best_model_at_end`: True
- `batch_sampler`: no_duplicates

#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: steps
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 4
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 0.001
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1.0
- `num_train_epochs`: 1.0
- `max_steps`: -1
- `lr_scheduler_type`: cosine
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: True
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: True
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: True
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `tp_size`: 0
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: None
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: no_duplicates
- `multi_dataset_batch_sampler`: proportional

</details>

### Training Logs
<details><summary>Click to expand</summary>

| Epoch | Step | Training Loss | val_cosine_ndcg@25 |
|:----------:|:-------:|:-------------:|:------------------:|
| 0.0019 | 1 | 0.7341 | - |
| 0.0037 | 2 | 1.1246 | - |
| 0.0056 | 3 | 1.1668 | - |
| 0.0075 | 4 | 1.2752 | - |
| 0.0093 | 5 | 1.2428 | - |
| 0.0112 | 6 | 0.8722 | - |
| 0.0131 | 7 | 0.9877 | - |
| 0.0149 | 8 | 0.3914 | - |
| 0.0168 | 9 | 1.6333 | - |
| 0.0187 | 10 | 0.3793 | 0.3873 |
| 0.0205 | 11 | 0.3277 | - |
| 0.0224 | 12 | 0.689 | - |
| 0.0243 | 13 | 1.4066 | - |
| 0.0261 | 14 | 0.7874 | - |
| 0.0280 | 15 | 0.7898 | - |
| 0.0299 | 16 | 1.0844 | - |
| 0.0317 | 17 | 1.0972 | - |
| 0.0336 | 18 | 1.7414 | - |
| 0.0354 | 19 | 0.9649 | - |
| 0.0373 | 20 | 0.9025 | 0.3383 |
| 0.0392 | 21 | 1.0195 | - |
| 0.0410 | 22 | 1.5774 | - |
| 0.0429 | 23 | 2.6835 | - |
| 0.0448 | 24 | 1.9685 | - |
| 0.0466 | 25 | 1.5736 | - |
| 0.0485 | 26 | 0.4385 | - |
| 0.0504 | 27 | 1.5777 | - |
| 0.0522 | 28 | 0.5438 | - |
| 0.0541 | 29 | 1.1351 | - |
| 0.0560 | 30 | 0.4636 | 0.3349 |
| 0.0578 | 31 | 1.749 | - |
| 0.0597 | 32 | 0.6608 | - |
| 0.0616 | 33 | 1.48 | - |
| 0.0634 | 34 | 0.6442 | - |
| 0.0653 | 35 | 1.2882 | - |
| 0.0672 | 36 | 1.5927 | - |
| 0.0690 | 37 | 0.819 | - |
| 0.0709 | 38 | 0.5842 | - |
| 0.0728 | 39 | 0.4818 | - |
| 0.0746 | 40 | 0.5143 | 0.3079 |
| 0.0765 | 41 | 1.4064 | - |
| 0.0784 | 42 | 0.924 | - |
| 0.0802 | 43 | 0.9097 | - |
| 0.0821 | 44 | 0.4214 | - |
| 0.0840 | 45 | 1.2579 | - |
| 0.0858 | 46 | 3.0192 | - |
| 0.0877 | 47 | 0.9019 | - |
| 0.0896 | 48 | 0.8331 | - |
| 0.0914 | 49 | 2.1336 | - |
| 0.0933 | 50 | 0.3793 | 0.3332 |
| 0.0951 | 51 | 0.6568 | - |
| 0.0970 | 52 | 0.7644 | - |
| 0.0989 | 53 | 1.1422 | - |
| 0.1007 | 54 | 1.1733 | - |
| 0.1026 | 55 | 1.1297 | - |
| 0.1045 | 56 | 0.7746 | - |
| 0.1063 | 57 | 1.2374 | - |
| 0.1082 | 58 | 1.0382 | - |
| 0.1101 | 59 | 0.8722 | - |
| 0.1119 | 60 | 1.6862 | 0.2076 |
| 0.1138 | 61 | 0.9489 | - |
| 0.1157 | 62 | 1.6074 | - |
| 0.1175 | 63 | 2.3639 | - |
| 0.1194 | 64 | 1.2994 | - |
| 0.1213 | 65 | 1.3806 | - |
| 0.1231 | 66 | 1.6077 | - |
| 0.125 | 67 | 1.2359 | - |
| 0.1269 | 68 | 1.2202 | - |
| 0.1287 | 69 | 0.8442 | - |
| 0.1306 | 70 | 0.8537 | 0.2768 |
| 0.1325 | 71 | 2.2377 | - |
| 0.1343 | 72 | 1.0657 | - |
| 0.1362 | 73 | 0.6213 | - |
| 0.1381 | 74 | 1.2029 | - |
| 0.1399 | 75 | 1.4392 | - |
| 0.1418 | 76 | 0.7116 | - |
| 0.1437 | 77 | 1.228 | - |
| 0.1455 | 78 | 0.9498 | - |
| 0.1474 | 79 | 1.1289 | - |
| 0.1493 | 80 | 1.6371 | 0.2504 |
| 0.1511 | 81 | 0.438 | - |
| 0.1530 | 82 | 1.0909 | - |
| 0.1549 | 83 | 0.8301 | - |
| 0.1567 | 84 | 0.9003 | - |
| 0.1586 | 85 | 1.8428 | - |
| 0.1604 | 86 | 2.7758 | - |
| 0.1623 | 87 | 3.7156 | - |
| 0.1642 | 88 | 2.6085 | - |
| 0.1660 | 89 | 2.2705 | - |
| 0.1679 | 90 | 1.4518 | 0.2239 |
| 0.1698 | 91 | 1.3423 | - |
| 0.1716 | 92 | 1.4066 | - |
| 0.1735 | 93 | 2.3138 | - |
| 0.1754 | 94 | 2.256 | - |
| 0.1772 | 95 | 1.2564 | - |
| 0.1791 | 96 | 1.477 | - |
| 0.1810 | 97 | 2.8484 | - |
| 0.1828 | 98 | 1.3257 | - |
| 0.1847 | 99 | 1.1516 | - |
| 0.1866 | 100 | 1.2892 | 0.2142 |
| 0.1884 | 101 | 1.7179 | - |
| 0.1903 | 102 | 2.2282 | - |
| 0.1922 | 103 | 0.9497 | - |
| 0.1940 | 104 | 0.9663 | - |
| 0.1959 | 105 | 1.2476 | - |
| 0.1978 | 106 | 1.0585 | - |
| 0.1996 | 107 | 1.565 | - |
| 0.2015 | 108 | 1.4498 | - |
| 0.2034 | 109 | 1.237 | - |
| 0.2052 | 110 | 1.9519 | 0.1239 |
| 0.2071 | 111 | 2.4816 | - |
| 0.2090 | 112 | 2.3602 | - |
| 0.2108 | 113 | 0.5189 | - |
| 0.2127 | 114 | 2.1441 | - |
| 0.2146 | 115 | 1.9018 | - |
| 0.2164 | 116 | 1.1875 | - |
| 0.2183 | 117 | 1.033 | - |
| 0.2201 | 118 | 1.7925 | - |
| 0.2220 | 119 | 1.1472 | - |
| 0.2239 | 120 | 1.0008 | 0.2699 |
| 0.2257 | 121 | 1.4836 | - |
| 0.2276 | 122 | 0.9753 | - |
| 0.2295 | 123 | 0.7691 | - |
| 0.2313 | 124 | 0.9119 | - |
| 0.2332 | 125 | 0.7913 | - |
| 0.2351 | 126 | 1.4574 | - |
| 0.2369 | 127 | 1.3908 | - |
| 0.2388 | 128 | 1.2722 | - |
| 0.2407 | 129 | 0.3513 | - |
| 0.2425 | 130 | 1.2904 | 0.2267 |
| 0.2444 | 131 | 1.1935 | - |
| 0.2463 | 132 | 2.024 | - |
| 0.2481 | 133 | 1.2138 | - |
| 0.25 | 134 | 1.909 | - |
| 0.2519 | 135 | 1.4939 | - |
| 0.2537 | 136 | 2.5559 | - |
| 0.2556 | 137 | 1.1896 | - |
| 0.2575 | 138 | 1.5372 | - |
| 0.2593 | 139 | 1.3159 | - |
| 0.2612 | 140 | 2.8622 | 0.1801 |
| 0.2631 | 141 | 2.2284 | - |
| 0.2649 | 142 | 1.1668 | - |
| 0.2668 | 143 | 1.5383 | - |
| 0.2687 | 144 | 1.6872 | - |
| 0.2705 | 145 | 1.3499 | - |
| 0.2724 | 146 | 1.7111 | - |
| 0.2743 | 147 | 0.8461 | - |
| 0.2761 | 148 | 1.0737 | - |
| 0.2780 | 149 | 1.2229 | - |
| 0.2799 | 150 | 1.4991 | 0.2705 |
| 0.2817 | 151 | 1.2098 | - |
| 0.2836 | 152 | 0.8411 | - |
| 0.2854 | 153 | 0.7454 | - |
| 0.2873 | 154 | 0.5295 | - |
| 0.2892 | 155 | 1.2309 | - |
| 0.2910 | 156 | 1.1437 | - |
| 0.2929 | 157 | 1.3461 | - |
| 0.2948 | 158 | 1.1028 | - |
| 0.2966 | 159 | 1.6687 | - |
| 0.2985 | 160 | 1.1048 | 0.2228 |
| 0.3004 | 161 | 1.4661 | - |
| 0.3022 | 162 | 2.3891 | - |
| 0.3041 | 163 | 2.0019 | - |
| 0.3060 | 164 | 1.9604 | - |
| 0.3078 | 165 | 2.1173 | - |
| 0.3097 | 166 | 1.2352 | - |
| 0.3116 | 167 | 1.0883 | - |
| 0.3134 | 168 | 1.0343 | - |
| 0.3153 | 169 | 0.6048 | - |
| 0.3172 | 170 | 1.2634 | 0.2747 |
| 0.3190 | 171 | 0.724 | - |
| 0.3209 | 172 | 0.5937 | - |
| 0.3228 | 173 | 0.9735 | - |
| 0.3246 | 174 | 1.1059 | - |
| 0.3265 | 175 | 0.5561 | - |
| 0.3284 | 176 | 0.9019 | - |
| 0.3302 | 177 | 0.6012 | - |
| 0.3321 | 178 | 0.6203 | - |
| 0.3340 | 179 | 0.4729 | - |
| 0.3358 | 180 | 0.488 | 0.2880 |
| 0.3377 | 181 | 0.5171 | - |
| 0.3396 | 182 | 1.2202 | - |
| 0.3414 | 183 | 0.4338 | - |
| 0.3433 | 184 | 0.2286 | - |
| 0.3451 | 185 | 1.5921 | - |
| 0.3470 | 186 | 0.9065 | - |
| 0.3489 | 187 | 0.7728 | - |
| 0.3507 | 188 | 0.6743 | - |
| 0.3526 | 189 | 0.6354 | - |
| 0.3545 | 190 | 1.0883 | 0.3092 |
| 0.3563 | 191 | 0.7866 | - |
| 0.3582 | 192 | 0.4465 | - |
| 0.3601 | 193 | 0.9169 | - |
| 0.3619 | 194 | 1.2751 | - |
| 0.3638 | 195 | 0.6479 | - |
| 0.3657 | 196 | 1.0898 | - |
| 0.3675 | 197 | 0.4064 | - |
| 0.3694 | 198 | 1.216 | - |
| 0.3713 | 199 | 0.5892 | - |
| 0.3731 | 200 | 0.9736 | 0.2627 |
| 0.375 | 201 | 1.8989 | - |
| 0.3769 | 202 | 1.4159 | - |
| 0.3787 | 203 | 1.4947 | - |
| 0.3806 | 204 | 1.6758 | - |
| 0.3825 | 205 | 1.1081 | - |
| 0.3843 | 206 | 1.1187 | - |
| 0.3862 | 207 | 1.7538 | - |
| 0.3881 | 208 | 2.3149 | - |
| 0.3899 | 209 | 0.7799 | - |
| 0.3918 | 210 | 0.7268 | 0.2772 |
| 0.3937 | 211 | 0.6603 | - |
| 0.3955 | 212 | 1.034 | - |
| 0.3974 | 213 | 0.765 | - |
| 0.3993 | 214 | 1.8519 | - |
| 0.4011 | 215 | 1.6521 | - |
| 0.4030 | 216 | 1.7584 | - |
| 0.4049 | 217 | 2.2637 | - |
| 0.4067 | 218 | 1.1289 | - |
| 0.4086 | 219 | 1.9741 | - |
| 0.4104 | 220 | 1.8754 | 0.1599 |
| 0.4123 | 221 | 1.8528 | - |
| 0.4142 | 222 | 2.1507 | - |
| 0.4160 | 223 | 2.1293 | - |
| 0.4179 | 224 | 0.9261 | - |
| 0.4198 | 225 | 1.2636 | - |
| 0.4216 | 226 | 1.7696 | - |
| 0.4235 | 227 | 1.0828 | - |
| 0.4254 | 228 | 1.533 | - |
| 0.4272 | 229 | 1.438 | - |
| 0.4291 | 230 | 0.9375 | 0.2517 |
| 0.4310 | 231 | 0.8709 | - |
| 0.4328 | 232 | 1.0026 | - |
| 0.4347 | 233 | 1.0076 | - |
| 0.4366 | 234 | 0.8922 | - |
| 0.4384 | 235 | 0.828 | - |
| 0.4403 | 236 | 1.111 | - |
| 0.4422 | 237 | 1.5364 | - |
| 0.4440 | 238 | 0.9463 | - |
| 0.4459 | 239 | 1.059 | - |
| 0.4478 | 240 | 1.4188 | 0.1832 |
| 0.4496 | 241 | 1.7641 | - |
| 0.4515 | 242 | 1.4712 | - |
| 0.4534 | 243 | 1.2123 | - |
| 0.4552 | 244 | 0.9881 | - |
| 0.4571 | 245 | 2.1159 | - |
| 0.4590 | 246 | 1.073 | - |
| 0.4608 | 247 | 0.3211 | - |
| 0.4627 | 248 | 1.7917 | - |
| 0.4646 | 249 | 0.6342 | - |
| 0.4664 | 250 | 1.3472 | 0.2687 |
| 0.4683 | 251 | 0.492 | - |
| 0.4701 | 252 | 1.0642 | - |
| 0.4720 | 253 | 0.6704 | - |
| 0.4739 | 254 | 0.6744 | - |
| 0.4757 | 255 | 1.7866 | - |
| 0.4776 | 256 | 1.2805 | - |
| 0.4795 | 257 | 1.0666 | - |
| 0.4813 | 258 | 2.4739 | - |
| 0.4832 | 259 | 2.7657 | - |
| 0.4851 | 260 | 2.4601 | 0.1183 |
| 0.4869 | 261 | 2.5174 | - |
| 0.4888 | 262 | 2.7207 | - |
| 0.4907 | 263 | 2.7801 | - |
| 0.4925 | 264 | 1.2408 | - |
| 0.4944 | 265 | 2.3538 | - |
| 0.4963 | 266 | 2.2384 | - |
| 0.4981 | 267 | 1.4689 | - |
| 0.5 | 268 | 1.6905 | - |
| 0.5019 | 269 | 1.4729 | - |
| 0.5037 | 270 | 1.2211 | 0.2667 |
| 0.5056 | 271 | 0.6759 | - |
| 0.5075 | 272 | 0.8592 | - |
| 0.5093 | 273 | 0.4822 | - |
| 0.5112 | 274 | 1.2476 | - |
| 0.5131 | 275 | 0.6806 | - |
| 0.5149 | 276 | 1.3813 | - |
| 0.5168 | 277 | 0.7919 | - |
| 0.5187 | 278 | 0.7511 | - |
| 0.5205 | 279 | 0.6702 | - |
| 0.5224 | 280 | 0.8166 | 0.3069 |
| 0.5243 | 281 | 0.3796 | - |
| 0.5261 | 282 | 0.7048 | - |
| 0.5280 | 283 | 1.2978 | - |
| 0.5299 | 284 | 0.7682 | - |
| 0.5317 | 285 | 0.554 | - |
| 0.5336 | 286 | 1.0344 | - |
| 0.5354 | 287 | 0.8375 | - |
| 0.5373 | 288 | 0.361 | - |
| 0.5392 | 289 | 0.3193 | - |
| 0.5410 | 290 | 0.7264 | 0.2902 |
| 0.5429 | 291 | 1.2829 | - |
| 0.5448 | 292 | 1.6457 | - |
| 0.5466 | 293 | 0.9561 | - |
| 0.5485 | 294 | 1.2187 | - |
| 0.5504 | 295 | 1.5597 | - |
| 0.5522 | 296 | 1.6294 | - |
| 0.5541 | 297 | 0.9754 | - |
| 0.5560 | 298 | 1.121 | - |
| 0.5578 | 299 | 1.0038 | - |
| 0.5597 | 300 | 1.472 | 0.2603 |
| 0.5616 | 301 | 1.1317 | - |
| 0.5634 | 302 | 0.678 | - |
| 0.5653 | 303 | 1.2261 | - |
| 0.5672 | 304 | 1.4552 | - |
| 0.5690 | 305 | 0.7346 | - |
| 0.5709 | 306 | 1.2259 | - |
| 0.5728 | 307 | 0.5651 | - |
| 0.5746 | 308 | 0.5246 | - |
| 0.5765 | 309 | 0.5817 | - |
| 0.5784 | 310 | 1.0662 | 0.2983 |
| 0.5802 | 311 | 1.2422 | - |
| 0.5821 | 312 | 0.9479 | - |
| 0.5840 | 313 | 0.8528 | - |
| 0.5858 | 314 | 0.9502 | - |
| 0.5877 | 315 | 1.0885 | - |
| 0.5896 | 316 | 1.4663 | - |
| 0.5914 | 317 | 0.6274 | - |
| 0.5933 | 318 | 1.0567 | - |
| 0.5951 | 319 | 1.4394 | - |
| 0.5970 | 320 | 0.455 | 0.2463 |
| 0.5989 | 321 | 0.5577 | - |
| 0.6007 | 322 | 0.7305 | - |
| 0.6026 | 323 | 1.3569 | - |
| 0.6045 | 324 | 1.9528 | - |
| 0.6063 | 325 | 0.7332 | - |
| 0.6082 | 326 | 1.6955 | - |
| 0.6101 | 327 | 1.5237 | - |
| 0.6119 | 328 | 2.0396 | - |
| 0.6138 | 329 | 1.913 | - |
| 0.6157 | 330 | 1.8478 | 0.0902 |
| 0.6175 | 331 | 2.7965 | - |
| 0.6194 | 332 | 2.4383 | - |
| 0.6213 | 333 | 3.3085 | - |
| 0.6231 | 334 | 2.4657 | - |
| 0.625 | 335 | 2.3933 | - |
| 0.6269 | 336 | 2.3603 | - |
| 0.6287 | 337 | 1.3248 | - |
| 0.6306 | 338 | 1.568 | - |
| 0.6325 | 339 | 1.6271 | - |
| 0.6343 | 340 | 1.3838 | 0.1664 |
| 0.6362 | 341 | 2.0098 | - |
| 0.6381 | 342 | 1.7105 | - |
| 0.6399 | 343 | 1.2461 | - |
| 0.6418 | 344 | 1.293 | - |
| 0.6437 | 345 | 1.4298 | - |
| 0.6455 | 346 | 1.7789 | - |
| 0.6474 | 347 | 1.0361 | - |
| 0.6493 | 348 | 0.6129 | - |
| 0.6511 | 349 | 1.5476 | - |
| 0.6530 | 350 | 0.8251 | 0.2059 |
| 0.6549 | 351 | 0.9453 | - |
| 0.6567 | 352 | 1.1893 | - |
| 0.6586 | 353 | 0.7976 | - |
| 0.6604 | 354 | 0.5457 | - |
| 0.6623 | 355 | 0.6489 | - |
| 0.6642 | 356 | 1.0474 | - |
| 0.6660 | 357 | 1.0201 | - |
| 0.6679 | 358 | 0.5917 | - |
| 0.6698 | 359 | 1.0068 | - |
| 0.6716 | 360 | 0.5708 | 0.2568 |
| 0.6735 | 361 | 0.6778 | - |
| 0.6754 | 362 | 0.5382 | - |
| 0.6772 | 363 | 0.9939 | - |
| 0.6791 | 364 | 0.7322 | - |
| 0.6810 | 365 | 1.1926 | - |
| 0.6828 | 366 | 1.5369 | - |
| 0.6847 | 367 | 0.9815 | - |
| 0.6866 | 368 | 0.8891 | - |
| 0.6884 | 369 | 1.2503 | - |
| 0.6903 | 370 | 0.9369 | 0.2584 |
| 0.6922 | 371 | 0.538 | - |
| 0.6940 | 372 | 0.7312 | - |
| 0.6959 | 373 | 1.1477 | - |
| 0.6978 | 374 | 1.9885 | - |
| 0.6996 | 375 | 0.9605 | - |
| 0.7015 | 376 | 0.7769 | - |
| 0.7034 | 377 | 0.7701 | - |
| 0.7052 | 378 | 0.7166 | - |
| 0.7071 | 379 | 0.9712 | - |
| 0.7090 | 380 | 0.2171 | 0.3315 |
| 0.7108 | 381 | 1.1501 | - |
| 0.7127 | 382 | 0.9079 | - |
| 0.7146 | 383 | 0.3611 | - |
| 0.7164 | 384 | 0.1937 | - |
| 0.7183 | 385 | 0.5164 | - |
| 0.7201 | 386 | 1.4014 | - |
| 0.7220 | 387 | 0.5033 | - |
| 0.7239 | 388 | 0.7722 | - |
| 0.7257 | 389 | 0.1686 | - |
| 0.7276 | 390 | 0.5965 | 0.3521 |
| 0.7295 | 391 | 0.2465 | - |
| 0.7313 | 392 | 0.2342 | - |
| 0.7332 | 393 | 0.6155 | - |
| 0.7351 | 394 | 0.6689 | - |
| 0.7369 | 395 | 0.4981 | - |
| 0.7388 | 396 | 0.4915 | - |
| 0.7407 | 397 | 0.5064 | - |
| 0.7425 | 398 | 1.244 | - |
| 0.7444 | 399 | 0.8528 | - |
| 0.7463 | 400 | 0.6747 | 0.3463 |
| 0.7481 | 401 | 0.3525 | - |
| 0.75 | 402 | 1.2951 | - |
| 0.7519 | 403 | 0.6925 | - |
| 0.7537 | 404 | 0.7087 | - |
| 0.7556 | 405 | 0.1436 | - |
| 0.7575 | 406 | 0.6327 | - |
| 0.7593 | 407 | 0.3393 | - |
| 0.7612 | 408 | 0.5633 | - |
| 0.7631 | 409 | 0.6249 | - |
| 0.7649 | 410 | 1.5898 | 0.3513 |
| 0.7668 | 411 | 0.6968 | - |
| 0.7687 | 412 | 0.9603 | - |
| 0.7705 | 413 | 0.4476 | - |
| 0.7724 | 414 | 0.9167 | - |
| 0.7743 | 415 | 1.2049 | - |
| 0.7761 | 416 | 0.4518 | - |
| 0.7780 | 417 | 0.6315 | - |
| 0.7799 | 418 | 0.2537 | - |
| 0.7817 | 419 | 0.6812 | - |
| 0.7836 | 420 | 0.6971 | 0.3573 |
| 0.7854 | 421 | 0.6064 | - |
| 0.7873 | 422 | 0.4359 | - |
| 0.7892 | 423 | 0.4889 | - |
| 0.7910 | 424 | 0.7253 | - |
| 0.7929 | 425 | 0.519 | - |
| 0.7948 | 426 | 0.2237 | - |
| 0.7966 | 427 | 0.3144 | - |
| 0.7985 | 428 | 0.7395 | - |
| 0.8004 | 429 | 0.5903 | - |
| 0.8022 | 430 | 1.3353 | 0.3664 |
| 0.8041 | 431 | 0.5381 | - |
| 0.8060 | 432 | 0.5692 | - |
| 0.8078 | 433 | 0.3789 | - |
| 0.8097 | 434 | 0.4091 | - |
| 0.8116 | 435 | 0.4686 | - |
| 0.8134 | 436 | 0.5685 | - |
| 0.8153 | 437 | 0.5923 | - |
| 0.8172 | 438 | 0.2288 | - |
| 0.8190 | 439 | 0.5233 | - |
| 0.8209 | 440 | 0.7775 | 0.3810 |
| 0.8228 | 441 | 1.1349 | - |
| 0.8246 | 442 | 0.3454 | - |
| 0.8265 | 443 | 0.3732 | - |
| 0.8284 | 444 | 0.2545 | - |
| 0.8302 | 445 | 0.6133 | - |
| 0.8321 | 446 | 0.3711 | - |
| 0.8340 | 447 | 0.2668 | - |
| 0.8358 | 448 | 0.9298 | - |
| 0.8377 | 449 | 0.5457 | - |
| 0.8396 | 450 | 0.5153 | 0.3762 |
| 0.8414 | 451 | 0.7944 | - |
| 0.8433 | 452 | 0.274 | - |
| 0.8451 | 453 | 0.1943 | - |
| 0.8470 | 454 | 0.865 | - |
| 0.8489 | 455 | 0.577 | - |
| 0.8507 | 456 | 0.1895 | - |
| 0.8526 | 457 | 0.284 | - |
| 0.8545 | 458 | 0.2472 | - |
| 0.8563 | 459 | 0.3254 | - |
| 0.8582 | 460 | 0.9113 | 0.3778 |
| 0.8601 | 461 | 0.4037 | - |
| 0.8619 | 462 | 0.2395 | - |
| 0.8638 | 463 | 0.9176 | - |
| 0.8657 | 464 | 0.1605 | - |
| 0.8675 | 465 | 0.2563 | - |
| 0.8694 | 466 | 0.403 | - |
| 0.8713 | 467 | 0.6036 | - |
| 0.8731 | 468 | 0.368 | - |
| 0.875 | 469 | 0.3447 | - |
| 0.8769 | 470 | 0.1836 | 0.3848 |
| 0.8787 | 471 | 0.4374 | - |
| 0.8806 | 472 | 0.1704 | - |
| 0.8825 | 473 | 0.326 | - |
| 0.8843 | 474 | 0.3527 | - |
| 0.8862 | 475 | 0.8108 | - |
| 0.8881 | 476 | 0.7219 | - |
| 0.8899 | 477 | 0.2727 | - |
| 0.8918 | 478 | 0.6034 | - |
| 0.8937 | 479 | 0.8513 | - |
| 0.8955 | 480 | 0.2772 | 0.3935 |
| 0.8974 | 481 | 0.4888 | - |
| 0.8993 | 482 | 0.6024 | - |
| 0.9011 | 483 | 1.1502 | - |
| 0.9030 | 484 | 0.5434 | - |
| 0.9049 | 485 | 0.2632 | - |
| 0.9067 | 486 | 0.0767 | - |
| 0.9086 | 487 | 0.5782 | - |
| 0.9104 | 488 | 0.6047 | - |
| 0.9123 | 489 | 0.7541 | - |
| 0.9142 | 490 | 0.2185 | 0.3965 |
| 0.9160 | 491 | 0.1558 | - |
| 0.9179 | 492 | 0.1106 | - |
| 0.9198 | 493 | 0.7286 | - |
| 0.9216 | 494 | 0.1932 | - |
| 0.9235 | 495 | 0.6639 | - |
| 0.9254 | 496 | 0.422 | - |
| 0.9272 | 497 | 0.7506 | - |
| 0.9291 | 498 | 0.1227 | - |
| 0.9310 | 499 | 0.8022 | - |
| **0.9328** | **500** | **0.2475** | **0.3951** |
| 0.9347 | 501 | 0.3068 | - |
| 0.9366 | 502 | 0.9188 | - |
| 0.9384 | 503 | 0.3704 | - |
| 0.9403 | 504 | 0.2393 | - |
| 0.9422 | 505 | 0.7569 | - |
| 0.9440 | 506 | 0.3823 | - |
| 0.9459 | 507 | 0.1712 | - |
| 0.9478 | 508 | 0.3331 | - |
| 0.9496 | 509 | 0.3538 | - |
| 0.9515 | 510 | 0.4431 | 0.3976 |
| 0.9534 | 511 | 0.422 | - |
| 0.9552 | 512 | 0.3282 | - |
| 0.9571 | 513 | 0.5834 | - |
| 0.9590 | 514 | 1.1424 | - |
| 0.9608 | 515 | 0.8699 | - |
| 0.9627 | 516 | 0.2811 | - |
| 0.9646 | 517 | 0.0964 | - |
| 0.9664 | 518 | 0.2971 | - |
| 0.9683 | 519 | 0.2435 | - |
| 0.9701 | 520 | 1.1154 | 0.3987 |
| 0.9720 | 521 | 0.2209 | - |
| 0.9739 | 522 | 0.1551 | - |
| 0.9757 | 523 | 0.3366 | - |
| 0.9776 | 524 | 0.5526 | - |
| 0.9795 | 525 | 0.3624 | - |
| 0.9813 | 526 | 0.3311 | - |
| 0.9832 | 527 | 0.7184 | - |
| 0.9851 | 528 | 0.893 | - |
| 0.9869 | 529 | 0.2642 | - |
| 0.9888 | 530 | 0.4994 | 0.3986 |
| 0.9907 | 531 | 0.6881 | - |
| 0.9925 | 532 | 0.2637 | - |
| 0.9944 | 533 | 0.6997 | - |
| 0.9963 | 534 | 0.3827 | - |
| 0.9981 | 535 | 0.4079 | - |
| 1.0 | 536 | 0.0003 | - |

* The bold row denotes the saved checkpoint.
</details>

### Framework Versions
- Python: 3.11.11
- Sentence Transformers: 3.4.1
- Transformers: 4.51.1
- PyTorch: 2.5.1+cu124
- Accelerate: 1.3.0
- Datasets: 3.5.0
- Tokenizers: 0.21.0

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### CachedMultipleNegativesRankingLoss
```bibtex
@misc{gao2021scaling,
    title={Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
    author={Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
    year={2021},
    eprint={2101.06983},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
adapter_config.json ADDED
@@ -0,0 +1,37 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "dunzhang/stella_en_1.5B_v5",
  "bias": "none",
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 96,
  "lora_bias": false,
  "lora_dropout": 0.01,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 48,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "q_proj",
    "k_proj",
    "up_proj",
    "gate_proj",
    "down_proj",
    "o_proj",
    "v_proj"
  ],
  "task_type": "FEATURE_EXTRACTION",
  "use_dora": false,
  "use_rslora": false
}
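
For readers reconstructing the setup: the adapter above is a LoRA of rank 48 (alpha 96, dropout 0.01) over all attention and MLP projections of the Qwen2 backbone. A hedged sketch using the standard `peft` API — an equivalent configuration, not the author's actual training script:

```python
from peft import LoraConfig, TaskType

# Mirrors adapter_config.json above.
lora_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=48,
    lora_alpha=96,
    lora_dropout=0.01,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
# Typically applied with peft.get_peft_model(base_model, lora_config) before fine-tuning.
```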
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8852785838c6c9efee5560b89cc3948f005cafa3a73fe7846630d0911de16ca9
size 221627416
added_tokens.json ADDED
@@ -0,0 +1,5 @@
{
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644
}
config_sentence_transformers.json ADDED
@@ -0,0 +1,13 @@
{
  "__version__": {
    "sentence_transformers": "3.4.1",
    "transformers": "4.51.1",
    "pytorch": "2.5.1+cu124"
  },
  "prompts": {
    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
  },
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modules.json ADDED
@@ -0,0 +1,20 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Dense",
    "type": "sentence_transformers.models.Dense"
  }
]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f79052deba517b0663d877714e117a31a4a6243cddb85fc4443c80a2fa65a20
size 11419302
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "auto_map": {
    "AutoTokenizer": [
      "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2Tokenizer",
      "dunzhang/stella_en_1.5B_v5--tokenization_qwen.Qwen2TokenizerFast"
    ]
  },
  "bos_token": null,
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 512,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
training_args.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:09e2f89f3fd81076a08f68ebd8d8bcbea9efc13ef33bb2c17fb93326deefd102
size 5624
vocab.json ADDED
The diff for this file is too large to render. See raw diff