jva96160 committed on
Commit 4c1ba5a · verified · 1 Parent(s): a16e4aa

Upload 32 files

ASRDataset.py CHANGED
@@ -427,6 +427,47 @@ class TWCostumData(BaseAudioDataset):
427
  self.instruction,
428
  answer_text
429
  )
430
  def covost_collate_fn(batch):
431
  input_ids_list = []
432
  labels_list = []
@@ -556,11 +597,12 @@ class MultiturnAudioDataset(BaseAudioDataset):
556
  self.text_only=text_only
557
  with open(json_path) as f:
558
  js_data = json.load(f)
 
559
  if split=='train':
560
  self.train=True
561
- js_data = js_data[:int(len(js_data)*0.8)]
562
  else:
563
- js_data = js_data[-int(len(js_data)*0.2):]
564
  for conv in js_data:
565
  for mess in conv['conversations']:
566
  if 'audio_path' in mess:
@@ -570,7 +612,7 @@ class MultiturnAudioDataset(BaseAudioDataset):
570
  format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
571
  format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
572
  format_system=StringFormatter(slots=["{{content}}\n\n"]),
573
- format_function=FunctionFormatter(slots=["{{content}}", {"eos_token"}], tool_format="default"),
574
  format_tools = ToolFormatter(tool_format="default"),
575
  format_observation=StringFormatter(
576
  slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
@@ -583,6 +625,7 @@ class MultiturnAudioDataset(BaseAudioDataset):
583
  format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
584
  stop_words=["<end_of_turn>"],
585
  mm_plugin=get_mm_plugin(name="base"),
 
586
  )
587
 
588
  self.set_dataset_name(f"MultiturnCostumData")
@@ -603,9 +646,10 @@ class MultiturnAudioDataset(BaseAudioDataset):
603
  if mess['from']=='human':
604
  tmp['messages'].append(conv['conversations'][i+1])
605
  d = deepcopy(tmp)
606
- d['audio_array'] = torchaudio.load(mess['audio_path'])[0][0]
607
- self.data.append(d)
608
- if self.text_only:
 
609
  self.text_only_data.append(deepcopy(tmp))
610
  tmp['messages'].pop()
611
  elif mess['from']=='observation':
@@ -632,14 +676,16 @@ class MultiturnAudioDataset(BaseAudioDataset):
632
  if system or tools:
633
  tool_text = self.template.format_tools.apply(content=tools)[0] if tools else ""
634
  system_text = self.template.format_system.apply(content=(system + tool_text))[0]
 
635
 
636
  if message["from"] == "human":
637
  if i==len(messages)-2 and not self.text_only:
638
  user_transcribe = message["value"]
639
- elements += self.template.format_user.apply(content=system_text+'<start_of_audio>')
640
  else:
641
- elements += self.template.format_user.apply(content=system_text + message["value"])
642
- audio_paths.append(message['audio_path'])
 
643
  elif message["from"] == "gpt":
644
  elements += self.template.format_assistant.apply(content=message["value"])
645
  elif message["from"] == "observation":
@@ -713,6 +759,7 @@ class MultiturnAudioDataset(BaseAudioDataset):
713
  'input_audio_embeds': inputs.input_audio_embeds,
714
  'audio_embed_sizes': inputs.audio_embed_sizes,
715
  'input_modes': inputs.input_modes,
 
716
  }
717
  else:
718
  return {
 
427
  self.instruction,
428
  answer_text
429
  )
430
+
431
+ class TWCostumDataTasks(BaseAudioDataset):
432
+
433
+ def __init__(self, processor, split="train", sampling_rate=16000, json_path="", debug=False):
434
+ super().__init__(processor, split, sampling_rate, debug)
435
+ import pandas as pd
436
+ from datasets import Dataset, Audio
437
+
438
+ with open(json_path) as f:
439
+ js_data = json.load(f)
440
+
441
+ raw_data = {
442
+ "audio": [],
443
+ "sentence": []
444
+ }
445
+ for conv in js_data:
446
+ for mess in conv['conversations']:
447
+ if 'audio_path' in mess:
448
+ raw_data['audio'].append(mess['audio_path'])
449
+ raw_data['sentence'].append(mess["value"])
450
+
451
+
452
+ self.set_dataset_name("TWCostumDataTasks"+json_path)
453
+ self.data = Dataset.from_dict(raw_data).cast_column("audio", Audio(sampling_rate=16000))
454
+
455
+ # Instruction Setting
456
+ self.instruction = random.choice(INSTRUCTION["asr"])
457
+
458
+ def __len__(self):
459
+ return len(self.data)
460
+
461
+ def __getitem__(self, idx):
462
+ data = self.data[idx]
463
+
464
+ answer_text = data["sentence"]
465
+ return self.prepare_model_inputs(
466
+ data["audio"]["array"],
467
+ self.instruction,
468
+ answer_text
469
+ )
470
+
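The new `TWCostumDataTasks` class flattens every conversation message that carries an `audio_path` into an (audio, transcript) pair. A minimal usage sketch (the file name, `processor`, and field values below are illustrative assumptions, not part of this commit):

```python
import json

# Hypothetical example record: any message with an 'audio_path' key is
# collected, and its 'value' is treated as the reference transcript.
example = [{
    "conversations": [
        {"from": "human", "value": "transcript text", "audio_path": "clip.wav"},
        {"from": "gpt", "value": "assistant reply"},  # no audio_path -> skipped
    ]
}]

with open("tw_tasks.json", "w") as f:
    json.dump(example, f, ensure_ascii=False)

# dataset = TWCostumDataTasks(processor, split="train", json_path="tw_tasks.json")
# dataset[0] -> prepare_model_inputs(audio_array, asr_instruction, transcript)
```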
471
  def covost_collate_fn(batch):
472
  input_ids_list = []
473
  labels_list = []
 
597
  self.text_only=text_only
598
  with open(json_path) as f:
599
  js_data = json.load(f)
600
+ test_len = min(int(len(js_data)*0.2), 200)
601
  if split=='train':
602
  self.train=True
603
+ js_data = js_data[:int(len(js_data)-test_len)]
604
  else:
605
+ js_data = js_data[-test_len:]
606
  for conv in js_data:
607
  for mess in conv['conversations']:
608
  if 'audio_path' in mess:
 
612
  format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
613
  format_assistant=StringFormatter(slots=["{{content}}<end_of_turn>\n"]),
614
  format_system=StringFormatter(slots=["{{content}}\n\n"]),
615
+ format_function=FunctionFormatter(slots=["{{content}}<end_of_turn>\n"], tool_format="default"),
616
  format_tools = ToolFormatter(tool_format="default"),
617
  format_observation=StringFormatter(
618
  slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
 
625
  format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
626
  stop_words=["<end_of_turn>"],
627
  mm_plugin=get_mm_plugin(name="base"),
628
+ enable_thinking=False
629
  )
630
 
631
  self.set_dataset_name(f"MultiturnCostumData")
 
646
  if mess['from']=='human':
647
  tmp['messages'].append(conv['conversations'][i+1])
648
  d = deepcopy(tmp)
649
+ if not self.text_only and 'audio_path' in mess:
650
+ d['audio_array'] = torchaudio.load(mess['audio_path'])[0][0]
651
+ self.data.append(d)
652
+ else:
653
  self.text_only_data.append(deepcopy(tmp))
654
  tmp['messages'].pop()
655
  elif mess['from']=='observation':
 
676
  if system or tools:
677
  tool_text = self.template.format_tools.apply(content=tools)[0] if tools else ""
678
  system_text = self.template.format_system.apply(content=(system + tool_text))[0]
679
+ elements += system_text
680
 
681
  if message["from"] == "human":
682
  if i==len(messages)-2 and not self.text_only:
683
  user_transcribe = message["value"]
684
+ elements += self.template.format_user.apply(content='<start_of_audio>')
685
  else:
686
+ elements += self.template.format_user.apply(content=message["value"])
687
+ if not self.text_only:
688
+ audio_paths.append(message['audio_path'])
689
  elif message["from"] == "gpt":
690
  elements += self.template.format_assistant.apply(content=message["value"])
691
  elif message["from"] == "observation":
 
759
  'input_audio_embeds': inputs.input_audio_embeds,
760
  'audio_embed_sizes': inputs.audio_embed_sizes,
761
  'input_modes': inputs.input_modes,
762
+
763
  }
764
  else:
765
  return {
README.md ADDED
@@ -0,0 +1,536 @@
1
+ ---
2
+ license: gemma
3
+ library_name: transformers
4
+ pipeline_tag: image-text-to-text
5
+ extra_gated_heading: Access Gemma on Hugging Face
6
+ extra_gated_prompt: To access Gemma on Hugging Face, you’re required to review and
7
+ agree to Google’s usage license. To do this, please ensure you’re logged in to Hugging
8
+ Face and click below. Requests are processed immediately.
9
+ extra_gated_button_content: Acknowledge license
10
+ base_model: google/gemma-3-4b-pt
11
+ ---
12
+
13
+ # Gemma 3 model card
14
+
15
+ **Model Page**: [Gemma](https://ai.google.dev/gemma/docs/core)
16
+
17
+ **Resources and Technical Documentation**:
18
+
19
+ * [Gemma 3 Technical Report][g3-tech-report]
20
+ * [Responsible Generative AI Toolkit][rai-toolkit]
21
+ * [Gemma on Kaggle][kaggle-gemma]
22
+ * [Gemma on Vertex Model Garden][vertex-mg-gemma3]
23
+
24
+ **Terms of Use**: [Terms][terms]
25
+
26
+ **Authors**: Google DeepMind
27
+
28
+ ## Model Information
29
+
30
+ Summary description and brief definition of inputs and outputs.
31
+
32
+ ### Description
33
+
34
+ Gemma is a family of lightweight, state-of-the-art open models from Google,
35
+ built from the same research and technology used to create the Gemini models.
36
+ Gemma 3 models are multimodal, handling text and image input and generating text
37
+ output, with open weights for both pre-trained variants and instruction-tuned
38
+ variants. Gemma 3 has a large, 128K context window, multilingual support in over
39
+ 140 languages, and is available in more sizes than previous versions. Gemma 3
40
+ models are well-suited for a variety of text generation and image understanding
41
+ tasks, including question answering, summarization, and reasoning. Their
42
+ relatively small size makes it possible to deploy them in environments with
43
+ limited resources such as laptops, desktops or your own cloud infrastructure,
44
+ democratizing access to state of the art AI models and helping foster innovation
45
+ for everyone.
46
+
47
+ ### Inputs and outputs
48
+
49
+ - **Input:**
50
+ - Text string, such as a question, a prompt, or a document to be summarized
51
+ - Images, normalized to 896 x 896 resolution and encoded to 256 tokens
52
+ each
53
+ - Total input context of 128K tokens for the 4B, 12B, and 27B sizes, and
54
+ 32K tokens for the 1B size
55
+
56
+ - **Output:**
57
+ - Generated text in response to the input, such as an answer to a
58
+ question, analysis of image content, or a summary of a document
59
+ - Total output context of 8192 tokens
60
+
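Since each image is encoded to a fixed 256 tokens, the text budget of a multimodal prompt is easy to estimate. A rough illustrative sketch (treating 128K as 131,072 tokens; the exact accounting may differ):

```python
# Illustrative context budgeting for the 4B, 12B, and 27B sizes.
CONTEXT_WINDOW = 128 * 1024   # "128K" input context (binary K assumed)
TOKENS_PER_IMAGE = 256        # fixed per-image cost after 896x896 normalization

def text_budget(num_images: int) -> int:
    """Tokens left for text after reserving the per-image cost."""
    return CONTEXT_WINDOW - num_images * TOKENS_PER_IMAGE

print(text_budget(4))  # 130048 tokens remain for text
```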
61
+ ### Usage
62
+
63
+ Below are some code snippets to help you get started quickly with running the model. First, install the Transformers library. Gemma 3 is supported starting from transformers 4.50.0.
64
+
65
+ ```sh
66
+ $ pip install -U transformers
67
+ ```
68
+
69
+ Then, copy the snippet from the section that is relevant for your use case.
70
+
71
+ #### Running with the `pipeline` API
72
+
73
+ You can initialize the model and processor for inference with `pipeline` as follows.
74
+
75
+ ```python
76
+ from transformers import pipeline
77
+ import torch
78
+
79
+ pipe = pipeline(
80
+ "image-text-to-text",
81
+ model="google/gemma-3-4b-it",
82
+ device="cuda",
83
+ torch_dtype=torch.bfloat16
84
+ )
85
+ ```
86
+
87
+ With instruction-tuned models, you need to use chat templates to process your inputs first. Then, you can pass them to the pipeline.
88
+
89
+ ```python
90
+ messages = [
91
+ {
92
+ "role": "system",
93
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
94
+ },
95
+ {
96
+ "role": "user",
97
+ "content": [
98
+ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
99
+ {"type": "text", "text": "What animal is on the candy?"}
100
+ ]
101
+ }
102
+ ]
103
+
104
+ output = pipe(text=messages, max_new_tokens=200)
105
+ print(output[0]["generated_text"][-1]["content"])
106
+ # Okay, let's take a look!
107
+ # Based on the image, the animal on the candy is a **turtle**.
108
+ # You can see the shell shape and the head and legs.
109
+ ```
110
+
111
+ #### Running the model on a single/multi GPU
112
+
113
+ ```python
114
+ # pip install accelerate
115
+
116
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration
117
+ from PIL import Image
118
+ import requests
119
+ import torch
120
+
121
+ model_id = "google/gemma-3-4b-it"
122
+
123
+ model = Gemma3ForConditionalGeneration.from_pretrained(
124
+ model_id, device_map="auto"
125
+ ).eval()
126
+
127
+ processor = AutoProcessor.from_pretrained(model_id)
128
+
129
+ messages = [
130
+ {
131
+ "role": "system",
132
+ "content": [{"type": "text", "text": "You are a helpful assistant."}]
133
+ },
134
+ {
135
+ "role": "user",
136
+ "content": [
137
+ {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
138
+ {"type": "text", "text": "Describe this image in detail."}
139
+ ]
140
+ }
141
+ ]
142
+
143
+ inputs = processor.apply_chat_template(
144
+ messages, add_generation_prompt=True, tokenize=True,
145
+ return_dict=True, return_tensors="pt"
146
+ ).to(model.device, dtype=torch.bfloat16)
147
+
148
+ input_len = inputs["input_ids"].shape[-1]
149
+
150
+ with torch.inference_mode():
151
+ generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
152
+ generation = generation[0][input_len:]
153
+
154
+ decoded = processor.decode(generation, skip_special_tokens=True)
155
+ print(decoded)
156
+
157
+ # **Overall Impression:** The image is a close-up shot of a vibrant garden scene,
158
+ # focusing on a cluster of pink cosmos flowers and a busy bumblebee.
159
+ # It has a slightly soft, natural feel, likely captured in daylight.
160
+ ```
161
+
162
+
163
+ ### Citation
164
+
165
+ ```none
166
+ @article{gemma_2025,
167
+ title={Gemma 3},
168
+ url={https://goo.gle/Gemma3Report},
169
+ publisher={Kaggle},
170
+ author={Gemma Team},
171
+ year={2025}
172
+ }
173
+ ```
174
+
175
+ ## Model Data
176
+
177
+ Data used for model training and how the data was processed.
178
+
179
+ ### Training Dataset
180
+
181
+ These models were trained on a dataset of text data that includes a wide variety
182
+ of sources. The 27B model was trained with 14 trillion tokens, the 12B model was
183
+ trained with 12 trillion tokens, 4B model was trained with 4 trillion tokens and
184
+ 1B with 2 trillion tokens. Here are the key components:
185
+
186
+ - Web Documents: A diverse collection of web text ensures the model is
187
+ exposed to a broad range of linguistic styles, topics, and vocabulary. The
188
+ training dataset includes content in over 140 languages.
189
+ - Code: Exposing the model to code helps it to learn the syntax and
190
+ patterns of programming languages, which improves its ability to generate
191
+ code and understand code-related questions.
192
+ - Mathematics: Training on mathematical text helps the model learn logical
193
+ reasoning, symbolic representation, and to address mathematical queries.
194
+ - Images: A wide range of images enables the model to perform image
195
+ analysis and visual data extraction tasks.
196
+
197
+ The combination of these diverse data sources is crucial for training a powerful
198
+ multimodal model that can handle a wide variety of different tasks and data
199
+ formats.
200
+
201
+ ### Data Preprocessing
202
+
203
+ Here are the key data cleaning and filtering methods applied to the training
204
+ data:
205
+
206
+ - CSAM Filtering: Rigorous CSAM (Child Sexual Abuse Material) filtering
207
+ was applied at multiple stages in the data preparation process to ensure
208
+ the exclusion of harmful and illegal content.
209
+ - Sensitive Data Filtering: As part of making Gemma pre-trained models
210
+ safe and reliable, automated techniques were used to filter out certain
211
+ personal information and other sensitive data from training sets.
212
+ - Additional methods: Filtering based on content quality and safety in
213
+ line with [our policies][safety-policies].
214
+
215
+ ## Implementation Information
216
+
217
+ Details about the model internals.
218
+
219
+ ### Hardware
220
+
221
+ Gemma was trained using [Tensor Processing Unit (TPU)][tpu] hardware (TPUv4p,
222
+ TPUv5p and TPUv5e). Training vision-language models (VLMs) requires significant
223
+ computational power. TPUs, designed specifically for matrix operations common in
224
+ machine learning, offer several advantages in this domain:
225
+
226
+ - Performance: TPUs are specifically designed to handle the massive
227
+ computations involved in training VLMs. They can speed up training
228
+ considerably compared to CPUs.
229
+ - Memory: TPUs often come with large amounts of high-bandwidth memory,
230
+ allowing for the handling of large models and batch sizes during training.
231
+ This can lead to better model quality.
232
+ - Scalability: TPU Pods (large clusters of TPUs) provide a scalable
233
+ solution for handling the growing complexity of large foundation models.
234
+ You can distribute training across multiple TPU devices for faster and more
235
+ efficient processing.
236
+ - Cost-effectiveness: In many scenarios, TPUs can provide a more
237
+ cost-effective solution for training large models compared to CPU-based
238
+ infrastructure, especially when considering the time and resources saved
239
+ due to faster training.
240
+ - These advantages are aligned with
241
+ [Google's commitments to operate sustainably][sustainability].
242
+
243
+ ### Software
244
+
245
+ Training was done using [JAX][jax] and [ML Pathways][ml-pathways].
246
+
247
+ JAX allows researchers to take advantage of the latest generation of hardware,
248
+ including TPUs, for faster and more efficient training of large models. ML
249
+ Pathways is Google's latest effort to build artificially intelligent systems
250
+ capable of generalizing across multiple tasks. This is especially suitable for
251
+ foundation models, including large language models like these ones.
252
+
253
+ Together, JAX and ML Pathways are used as described in the
254
+ [paper about the Gemini family of models][gemini-2-paper]; *"the 'single
255
+ controller' programming model of Jax and Pathways allows a single Python
256
+ process to orchestrate the entire training run, dramatically simplifying the
257
+ development workflow."*
258
+
259
+ ## Evaluation
260
+
261
+ Model evaluation metrics and results.
262
+
263
+ ### Benchmark Results
264
+
265
+ These models were evaluated against a large collection of different datasets and
266
+ metrics to cover different aspects of text generation:
267
+
268
+ #### Reasoning and factuality
269
+
270
+ | Benchmark | Metric | Gemma 3 PT 1B | Gemma 3 PT 4B | Gemma 3 PT 12B | Gemma 3 PT 27B |
271
+ | ------------------------------ |----------------|:--------------:|:-------------:|:--------------:|:--------------:|
272
+ | [HellaSwag][hellaswag] | 10-shot | 62.3 | 77.2 | 84.2 | 85.6 |
273
+ | [BoolQ][boolq] | 0-shot | 63.2 | 72.3 | 78.8 | 82.4 |
274
+ | [PIQA][piqa] | 0-shot | 73.8 | 79.6 | 81.8 | 83.3 |
275
+ | [SocialIQA][socialiqa] | 0-shot | 48.9 | 51.9 | 53.4 | 54.9 |
276
+ | [TriviaQA][triviaqa] | 5-shot | 39.8 | 65.8 | 78.2 | 85.5 |
277
+ | [Natural Questions][naturalq] | 5-shot | 9.48 | 20.0 | 31.4 | 36.1 |
278
+ | [ARC-c][arc] | 25-shot | 38.4 | 56.2 | 68.9 | 70.6 |
279
+ | [ARC-e][arc] | 0-shot | 73.0 | 82.4 | 88.3 | 89.0 |
280
+ | [WinoGrande][winogrande] | 5-shot | 58.2 | 64.7 | 74.3 | 78.8 |
281
+ | [BIG-Bench Hard][bbh] | few-shot | 28.4 | 50.9 | 72.6 | 77.7 |
282
+ | [DROP][drop] | 1-shot | 42.4 | 60.1 | 72.2 | 77.2 |
283
+
284
+ [hellaswag]: https://arxiv.org/abs/1905.07830
285
+ [boolq]: https://arxiv.org/abs/1905.10044
286
+ [piqa]: https://arxiv.org/abs/1911.11641
287
+ [socialiqa]: https://arxiv.org/abs/1904.09728
288
+ [triviaqa]: https://arxiv.org/abs/1705.03551
289
+ [naturalq]: https://github.com/google-research-datasets/natural-questions
290
+ [arc]: https://arxiv.org/abs/1911.01547
291
+ [winogrande]: https://arxiv.org/abs/1907.10641
292
+ [bbh]: https://paperswithcode.com/dataset/bbh
293
+ [drop]: https://arxiv.org/abs/1903.00161
294
+
295
+ #### STEM and code
296
+
297
+ | Benchmark | Metric | Gemma 3 PT 4B | Gemma 3 PT 12B | Gemma 3 PT 27B |
298
+ | ------------------------------ |----------------|:-------------:|:--------------:|:--------------:|
299
+ | [MMLU][mmlu] | 5-shot | 59.6 | 74.5 | 78.6 |
300
+ | [MMLU][mmlu] (Pro COT) | 5-shot | 29.2 | 45.3 | 52.2 |
301
+ | [AGIEval][agieval] | 3-5-shot | 42.1 | 57.4 | 66.2 |
302
+ | [MATH][math] | 4-shot | 24.2 | 43.3 | 50.0 |
303
+ | [GSM8K][gsm8k] | 8-shot | 38.4 | 71.0 | 82.6 |
304
+ | [GPQA][gpqa] | 5-shot | 15.0 | 25.4 | 24.3 |
305
+ | [MBPP][mbpp] | 3-shot | 46.0 | 60.4 | 65.6 |
306
+ | [HumanEval][humaneval] | 0-shot | 36.0 | 45.7 | 48.8 |
307
+
308
+ [mmlu]: https://arxiv.org/abs/2009.03300
309
+ [agieval]: https://arxiv.org/abs/2304.06364
310
+ [math]: https://arxiv.org/abs/2103.03874
311
+ [gsm8k]: https://arxiv.org/abs/2110.14168
312
+ [gpqa]: https://arxiv.org/abs/2311.12022
313
+ [mbpp]: https://arxiv.org/abs/2108.07732
314
+ [humaneval]: https://arxiv.org/abs/2107.03374
315
+
316
+ #### Multilingual
317
+
318
+ | Benchmark | Gemma 3 PT 1B | Gemma 3 PT 4B | Gemma 3 PT 12B | Gemma 3 PT 27B |
319
+ | ------------------------------------ |:-------------:|:-------------:|:--------------:|:--------------:|
320
+ | [MGSM][mgsm] | 2.04 | 34.7 | 64.3 | 74.3 |
321
+ | [Global-MMLU-Lite][global-mmlu-lite] | 24.9 | 57.0 | 69.4 | 75.7 |
322
+ | [WMT24++][wmt24pp] (ChrF) | 36.7 | 48.4 | 53.9 | 55.7 |
323
+ | [FloRes][flores] | 29.5 | 39.2 | 46.0 | 48.8 |
324
+ | [XQuAD][xquad] (all) | 43.9 | 68.0 | 74.5 | 76.8 |
325
+ | [ECLeKTic][eclektic] | 4.69 | 11.0 | 17.2 | 24.4 |
326
+ | [IndicGenBench][indicgenbench] | 41.4 | 57.2 | 61.7 | 63.4 |
327
+
328
+ [mgsm]: https://arxiv.org/abs/2210.03057
329
+ [flores]: https://arxiv.org/abs/2106.03193
330
+ [xquad]: https://arxiv.org/abs/1910.11856v3
331
+ [global-mmlu-lite]: https://huggingface.co/datasets/CohereForAI/Global-MMLU-Lite
332
+ [wmt24pp]: https://arxiv.org/abs/2502.12404v1
333
+ [eclektic]: https://arxiv.org/abs/2502.21228
334
+ [indicgenbench]: https://arxiv.org/abs/2404.16816
335
+
336
+ #### Multimodal
337
+
338
+ | Benchmark | Gemma 3 PT 4B | Gemma 3 PT 12B | Gemma 3 PT 27B |
339
+ | ------------------------------ |:-------------:|:--------------:|:--------------:|
340
+ | [COCOcap][coco-cap] | 102 | 111 | 116 |
341
+ | [DocVQA][docvqa] (val) | 72.8 | 82.3 | 85.6 |
342
+ | [InfoVQA][info-vqa] (val) | 44.1 | 54.8 | 59.4 |
343
+ | [MMMU][mmmu] (pt) | 39.2 | 50.3 | 56.1 |
344
+ | [TextVQA][textvqa] (val) | 58.9 | 66.5 | 68.6 |
345
+ | [RealWorldQA][realworldqa] | 45.5 | 52.2 | 53.9 |
346
+ | [ReMI][remi] | 27.3 | 38.5 | 44.8 |
347
+ | [AI2D][ai2d] | 63.2 | 75.2 | 79.0 |
348
+ | [ChartQA][chartqa] | 63.6 | 74.7 | 76.3 |
349
+ | [VQAv2][vqav2] | 63.9 | 71.2 | 72.9 |
350
+ | [BLINK][blinkvqa] | 38.0 | 35.9 | 39.6 |
351
+ | [OKVQA][okvqa] | 51.0 | 58.7 | 60.2 |
352
+ | [TallyQA][tallyqa] | 42.5 | 51.8 | 54.3 |
353
+ | [SpatialSense VQA][ss-vqa] | 50.9 | 60.0 | 59.4 |
354
+ | [CountBenchQA][countbenchqa] | 26.1 | 17.8 | 68.0 |
355
+
356
+ [coco-cap]: https://cocodataset.org/#home
357
+ [docvqa]: https://www.docvqa.org/
358
+ [info-vqa]: https://arxiv.org/abs/2104.12756
359
+ [mmmu]: https://arxiv.org/abs/2311.16502
360
+ [textvqa]: https://textvqa.org/
361
+ [realworldqa]: https://paperswithcode.com/dataset/realworldqa
362
+ [remi]: https://arxiv.org/html/2406.09175v1
363
+ [ai2d]: https://allenai.org/data/diagrams
364
+ [chartqa]: https://arxiv.org/abs/2203.10244
365
+ [vqav2]: https://visualqa.org/index.html
366
+ [blinkvqa]: https://arxiv.org/abs/2404.12390
367
+ [okvqa]: https://okvqa.allenai.org/
368
+ [tallyqa]: https://arxiv.org/abs/1810.12440
369
+ [ss-vqa]: https://arxiv.org/abs/1908.02660
370
+ [countbenchqa]: https://github.com/google-research/big_vision/blob/main/big_vision/datasets/countbenchqa/
371
+
372
+ ## Ethics and Safety
373
+
374
+ Ethics and safety evaluation approach and results.
375
+
376
+ ### Evaluation Approach
377
+
378
+ Our evaluation methods include structured evaluations and internal red-teaming
379
+ testing of relevant content policies. Red-teaming was conducted by a number of
380
+ different teams, each with different goals and human evaluation metrics. These
381
+ models were evaluated against a number of different categories relevant to
382
+ ethics and safety, including:
383
+
384
+ - **Child Safety**: Evaluation of text-to-text and image to text prompts
385
+ covering child safety policies, including child sexual abuse and
386
+ exploitation.
387
+ - **Content Safety:** Evaluation of text-to-text and image to text prompts
388
+ covering safety policies including, harassment, violence and gore, and hate
389
+ speech.
390
+ - **Representational Harms**: Evaluation of text-to-text and image to text
391
+ prompts covering safety policies including bias, stereotyping, and harmful
392
+ associations or inaccuracies.
393
+
394
+ In addition to development level evaluations, we conduct "assurance
395
+ evaluations" which are our 'arms-length' internal evaluations for responsibility
396
+ governance decision making. They are conducted separately from the model
397
+ development team, to inform decision making about release. High level findings
398
+ are fed back to the model team, but prompt sets are held out to prevent
399
+ overfitting and preserve the results' ability to inform decision making.
400
+ Assurance evaluation results are reported to our Responsibility & Safety Council
401
+ as part of release review.
402
+
403
+ ### Evaluation Results
404
+
405
+ For all areas of safety testing, we saw major improvements in the categories of
406
+ child safety, content safety, and representational harms relative to previous
407
+ Gemma models. All testing was conducted without safety filters to evaluate the
408
+ model capabilities and behaviors. For both text-to-text and image-to-text, and
409
+ across all model sizes, the model produced minimal policy violations, and showed
410
+ significant improvements over previous Gemma models' performance with respect
411
+ to ungrounded inferences. A limitation of our evaluations was that they included only
412
+ English-language prompts.
413
+
414
+ ## Usage and Limitations
415
+
416
+ These models have certain limitations that users should be aware of.
417
+
418
+ ### Intended Usage
419
+
420
+ Open vision-language models (VLMs) have a wide range of applications
421
+ across various industries and domains. The following list of potential uses is
422
+ not comprehensive. The purpose of this list is to provide contextual information
423
+ about the possible use-cases that the model creators considered as part of model
424
+ training and development.
425
+
426
+ - Content Creation and Communication
427
+ - Text Generation: These models can be used to generate creative text
428
+ formats such as poems, scripts, code, marketing copy, and email drafts.
429
+ - Chatbots and Conversational AI: Power conversational interfaces
430
+ for customer service, virtual assistants, or interactive applications.
431
+ - Text Summarization: Generate concise summaries of a text corpus,
432
+ research papers, or reports.
433
+ - Image Data Extraction: These models can be used to extract,
434
+ interpret, and summarize visual data for text communications.
435
+ - Research and Education
436
+ - Natural Language Processing (NLP) and VLM Research: These
437
+ models can serve as a foundation for researchers to experiment with VLM
438
+ and NLP techniques, develop algorithms, and contribute to the
439
+ advancement of the field.
440
+ - Language Learning Tools: Support interactive language learning
441
+ experiences, aiding in grammar correction or providing writing practice.
442
+ - Knowledge Exploration: Assist researchers in exploring large
443
+ bodies of text by generating summaries or answering questions about
444
+ specific topics.
445
+
446
+ ### Limitations
447
+
448
+ - Training Data
449
+ - The quality and diversity of the training data significantly
450
+ influence the model's capabilities. Biases or gaps in the training data
451
+ can lead to limitations in the model's responses.
452
+ - The scope of the training dataset determines the subject areas
453
+ the model can handle effectively.
454
+ - Context and Task Complexity
455
+ - Models are better at tasks that can be framed with clear
456
+ prompts and instructions. Open-ended or highly complex tasks might be
457
+ challenging.
458
+ - A model's performance can be influenced by the amount of context
459
+ provided (longer context generally leads to better outputs, up to a
460
+ certain point).
461
+ - Language Ambiguity and Nuance
462
+ - Natural language is inherently complex. Models might struggle
463
+ to grasp subtle nuances, sarcasm, or figurative language.
464
+ - Factual Accuracy
465
+ - Models generate responses based on information they learned
466
+ from their training datasets, but they are not knowledge bases. They
467
+ may generate incorrect or outdated factual statements.
468
+ - Common Sense
469
+ - Models rely on statistical patterns in language. They might
470
+ lack the ability to apply common sense reasoning in certain situations.
471
+
472
+ ### Ethical Considerations and Risks
473
+
474
+ The development of vision-language models (VLMs) raises several ethical
475
+ concerns. In creating an open model, we have carefully considered the following:
476
+
477
+ - Bias and Fairness
478
+ - VLMs trained on large-scale, real-world text and image data can
479
+ reflect socio-cultural biases embedded in the training material. These
480
+ models underwent careful scrutiny; input data pre-processing is described
481
+ and posterior evaluations are reported in this card.
482
+ - Misinformation and Misuse
483
+ - VLMs can be misused to generate text that is false, misleading,
484
+ or harmful.
485
+ - Guidelines are provided for responsible use with the model, see the
486
+ [Responsible Generative AI Toolkit][rai-toolkit].
487
+ - Transparency and Accountability:
488
+ - This model card summarizes details on the models' architecture,
489
+ capabilities, limitations, and evaluation processes.
490
+ - A responsibly developed open model offers the opportunity to
491
+ share innovation by making VLM technology accessible to developers and
492
+ researchers across the AI ecosystem.
493
+
494
+ Risks identified and mitigations:
495
+
496
+ - **Perpetuation of biases**: It's encouraged to perform continuous
497
+ monitoring (using evaluation metrics, human review) and the exploration of
498
+ de-biasing techniques during model training, fine-tuning, and other use
499
+ cases.
500
+ - **Generation of harmful content**: Mechanisms and guidelines for content
501
+ safety are essential. Developers are encouraged to exercise caution and
502
+ implement appropriate content safety safeguards based on their specific
503
+ product policies and application use cases.
504
+ - **Misuse for malicious purposes**: Technical limitations and developer
505
+ and end-user education can help mitigate against malicious applications of
506
+ VLMs. Educational resources and reporting mechanisms for users to flag
507
+ misuse are provided. Prohibited uses of Gemma models are outlined in the
508
+ [Gemma Prohibited Use Policy][prohibited-use].
509
+ - **Privacy violations**: Models were trained on data filtered for removal
510
+ of certain personal information and other sensitive data. Developers are
511
+ encouraged to adhere to privacy regulations with privacy-preserving
512
+ techniques.
513
+
514
+ ### Benefits
515
+
516
+ At the time of release, this family of models provides high-performance open
517
+ vision-language model implementations designed from the ground up for
518
+ responsible AI development compared to similarly sized models.
519
+
520
+ Using the benchmark evaluation metrics described in this document, these models
521
+ have been shown to provide superior performance to other, comparably-sized open model
522
+ alternatives.
523
+
524
+ [g3-tech-report]: https://goo.gle/Gemma3Report
525
+ [rai-toolkit]: https://ai.google.dev/responsible
526
+ [kaggle-gemma]: https://www.kaggle.com/models/google/gemma-3
527
+ [vertex-mg-gemma3]: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemma3
528
+ [terms]: https://ai.google.dev/gemma/terms
529
+ [safety-policies]: https://ai.google/static/documents/ai-responsibility-update-published-february-2025.pdf
530
+ [prohibited-use]: https://ai.google.dev/gemma/prohibited_use_policy
531
+ [tpu]: https://cloud.google.com/tpu/docs/intro-to-tpu
532
+ [sustainability]: https://sustainability.google/operating-sustainably/
533
+ [jax]: https://github.com/jax-ml/jax
534
+ [ml-pathways]: https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/
536
+ [gemini-2-paper]: https://arxiv.org/abs/2312.11805
chat_template.json CHANGED
@@ -1,3 +1 @@
1
- {
2
- "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'audio' -%}\n {{ '<start_of_audio>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"
3
- }
 
1
+ {"chat_template": "{{ bos_token }}\n{%- if tools %}\n {% set tools_json = tools %}\n You have access to the following tools:\n {% for tool in tools_json %}\n > Tool Name: {{ tool.name }}\n Tool Description: {{ tool.description }}\n Tool Args:\n {% if tool.parameters.properties %}\n {% for prop_name, prop_data in tool.parameters.properties.items() %}\n - {{ prop_name }} ({{ prop_data.type }}, {% if prop_name in tool.parameters.required %}required{% else %}optional{% endif %}): {{ prop_data.description }}\n {% endfor %}\n {% else %}\n {% endif %}\n\n {% endfor %}, where each item should be object\n Use the following format if using a tool:\n ```\n Action: tool name (one of [{{ tools_json | map(attribute='name') | join(', ') }}])\n Action Input: the input to the tool, in a JSON format representing the kwargs (e.g. ```{\"input\": \"hello world\", \"num_beams\": 5}```)\n ```\n {{'\n'}}\n{%- endif -%}\n\n\n{%- for message in messages -%}\n {%- if message['from'] == \"human\" -%}\n {%- if message['value'] is string -%}\n {{'<start_of_turn>user\n' + message['value'] + '<end_of_turn>\n'}}\n {%- elif message['value'] is iterable -%}\n {{'<start_of_turn>user\n'}}\n {%- for item in message['value'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'audio' -%}\n {{ '<start_of_audio>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {{'<end_of_turn>\n'}}\n {%- endfor -%}\n {%- endif -%}\n {%- elif message['from'] == \"gpt\" -%}\n {{'<start_of_turn>model\n' + message['value'] + '<end_of_turn>\n'}} \n {%- elif message['from'] == \"function_call\" -%}\n {%- set func_call = message['value'] -%}\n {{'<start_of_turn>model\n' + 'Action:' + func_call.name + '\n' + 'Action Input:' + func_call.arguments + '<end_of_turn>\n'}} \n {%- elif message['from'] == \"observation\" -%}\n {{'<start_of_turn>tool\n' + message['value'] + '<end_of_turn>\n'}} \n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"}
 
 
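The replacement template consumes ShareGPT-style messages (`from`/`value` keys with roles `human`, `gpt`, `function_call`, and `observation`) plus an optional `tools` list, which is exactly how `deploy/main.py` below invokes it. A minimal rendering sketch (the model path and the tool schema values are illustrative):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/this/repo", trust_remote_code=True)

tools = [{
    "name": "search_and_show_place",  # rendered into the tool preamble by the template
    "description": "Search for a place and show candidates.",
    "parameters": {
        "properties": {"keyword": {"type": "string", "description": "query keyword"}},
        "required": ["keyword"],
    },
}]

messages = [
    {"from": "human", "value": "我在找台中太平逸境"},
    {"from": "function_call", "value": {"name": "search_and_show_place",
                                        "arguments": "{'keyword': '台中太平逸境'}"}},
    {"from": "observation", "value": '{"status": "success", "poi": []}'},
]

# Renders the tool preamble plus <start_of_turn>user/model/tool turns.
text = processor.apply_chat_template(messages, tools=tools,
                                     add_generation_prompt=True, tokenize=False)
print(text)
```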
config.json CHANGED
@@ -70,7 +70,7 @@
70
  "layer": "((layers.*self_attn\\.(q|k|v|o)_proj)|(layers.*mlp\\.(gate|up|down)_proj))",
71
  "lora_alpha": 16,
72
  "r": 8,
73
- "use_rslora": true
74
  },
75
  "text_config": {
76
  "attention_bias": false,
 
70
  "layer": "((layers.*self_attn\\.(q|k|v|o)_proj)|(layers.*mlp\\.(gate|up|down)_proj))",
71
  "lora_alpha": 16,
72
  "r": 8,
73
+ "use_rslora": false
74
  },
75
  "text_config": {
76
  "attention_bias": false,
deploy/__pycache__/main.cpython-310.pyc ADDED
Binary file (6.94 kB).
 
deploy/log.txt ADDED
The diff for this file is too large to render.
 
deploy/main.py ADDED
@@ -0,0 +1,198 @@
1
+ import uvicorn
2
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
3
+ from typing import List, Dict, Any, Optional
4
+ import json
5
+ import os
6
+ from transformers import AutoProcessor, AutoModel
7
+ import torch, torchaudio
8
+ import os
9
+
10
+ import copy
11
+ from rapidfuzz import process, fuzz
12
+ from pypinyin import pinyin, Style
13
+
14
+ def correct_sentence_with_pinyin(user_input_sentence, location_dict, score_cutoff=50):
15
+ pinyin_dict = {}
16
+ for location in location_dict:
17
+ pinyin_name = ''.join([item[0] for item in pinyin(location, style=Style.NORMAL)])
18
+ pinyin_dict[pinyin_name] = location
19
+
20
+ user_pinyin_sentence = ''.join([item[0] for item in pinyin(user_input_sentence, style=Style.NORMAL)])
21
+
22
+ best_match_pinyin = process.extractOne(
23
+ query=user_pinyin_sentence,
24
+ choices=list(pinyin_dict.keys()), # 傳入拼音作為搜尋目標
25
+ scorer=fuzz.token_set_ratio,
26
+ score_cutoff=score_cutoff
27
+ )
28
+
29
+ if best_match_pinyin:
30
+ best_pinyin_name = best_match_pinyin[0]
31
+ corrected_location_name = pinyin_dict[best_pinyin_name]
32
+
33
+ best_user_substring = None
34
+ max_substring_score = 0
35
+
36
+ for i in range(len(user_input_sentence)):
37
+ for j in range(i + 2, min(i + 16, len(user_input_sentence) + 1)):
38
+ substring = user_input_sentence[i:j]
39
+
40
+ score = fuzz.ratio(substring, corrected_location_name)
41
+
42
+ if score > max_substring_score:
43
+ max_substring_score = score
44
+ best_user_substring = substring
45
+
46
+ if best_user_substring and max_substring_score > score_cutoff:
47
+ return user_input_sentence.replace(best_user_substring, corrected_location_name, 1)
48
+ else:
49
+ return user_input_sentence
50
+ return user_input_sentence
51
+
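A small example of what the helper above does, given the definition just shown (hypothetical inputs; requires `pypinyin` and `rapidfuzz`): a near-homophone ASR mistranscription is fuzzily matched by pinyin and swapped for the known POI name.

```python
# Hypothetical demo: "逸" is misheard as "一" (both pinyin "yi"), so the
# garbled substring is matched against the POI dictionary and replaced.
pois = ["台中太平逸境", "咔啦咔啦雞"]
asr_text = "帶我去台中太平一境"

print(correct_sentence_with_pinyin(asr_text, pois))
# Expected: 帶我去台中太平逸境
```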
52
+ class InferenceClass:
53
+ def __init__(self,model_id):
54
+ self.model = AutoModel.from_pretrained(
55
+ model_id, device_map="cuda",
56
+ torch_dtype=torch.bfloat16,
57
+ trust_remote_code=True,
58
+ attn_implementation="eager"
59
+ ).eval()
60
+
61
+ self.processor = AutoProcessor.from_pretrained(
62
+ model_id, trust_remote_code=True
63
+ )
64
+ self.remove_words_signs = lambda x:x.replace('User transcribe is :','').replace('GPT output is :','').replace('\n','').\
65
+ replace(' ','').replace('?','').replace('?','').replace('!','').replace('。','').\
66
+ replace('!','')
67
+ def call_gpt(self,inputs_tensor):
68
+ with torch.inference_mode():
69
+ inputs = {k:inputs_tensor[k].to('cuda') for k in inputs_tensor}
70
+ generate_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)
71
+ generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
72
+ model_output = self.processor.batch_decode(
73
+ generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
74
+ )[0]
75
+ return model_output
76
+ def call_function_fake(self,messages=[],obs=""):
77
+ messages.append({'from': 'observation', 'value': obs})
78
+ return messages
79
+ def generate(self,chat_history,tools="",audio_path=None):
80
+ '''
81
+ input:
82
+ audio_path : str
83
+ chat_history : list of dict
84
+ return:
85
+ model_output : list of dict (the updated chat history)
86
+ '''
87
+ chat_history = copy.deepcopy(chat_history)
88
+ if audio_path is not None:
89
+ chat_history.append({'from': 'human',
90
+ 'value': [{'type': 'audio',
91
+ 'audio': audio_path}]})
92
+ words_from_poi = []
93
+ for hist in chat_history:
94
+ if hist['from']=='observation' and '地點查詢成功' in hist['value'] and 'poi' in hist['value']:
95
+ tmp = json.loads(hist['value'])
96
+ for i,poi in enumerate(tmp['poi']):
97
+ words_from_poi.append(poi['name'])
98
+ for hist in chat_history:
99
+ if hist['from']=='human' and type(hist['value'])==str:
100
+ hist['value'] = correct_sentence_with_pinyin(hist['value'],words_from_poi)
101
+ elif hist['from']=='function_call' and "arguments" in hist['value'] and 'keyword' in hist['value']["arguments"]:
102
+ hist['value']["arguments"] = eval(hist['value']["arguments"])
103
+ if 'keyword' in hist['value']["arguments"]:
104
+ hist['value']["arguments"]['keyword'] = correct_sentence_with_pinyin(hist['value']["arguments"]['keyword'],words_from_poi)
105
+ hist['value']["arguments"] = str(hist['value']["arguments"])
106
+ # model_input_history = copy.deepcopy(chat_history)
107
+ # num2ch = {1:'一',2:'二',3:'三',4:'四',5:'五',6:'六'}
108
+ # for hist in model_input_history:
109
+ # if hist['from']=='observation' and '地點查詢成功' in hist['value'] and 'poi' in hist['value']:
110
+ # tmp = json.loads(hist['value'])
111
+ # new_poi = []
112
+ # for i,poi in enumerate(tmp['poi']):
113
+ # new_poi.append('第{}個 : '.format(num2ch[i+1])+str(poi))
114
+ # tmp['poi'] = new_poi
115
+ # hist['value'] = json.dumps(tmp, ensure_ascii=False)
116
+
117
+ inputs_text = self.processor.apply_chat_template(
118
+ chat_history, add_generation_prompt=True, tokenize=False,
119
+ return_dict=True, return_tensors="pt", tools=json.loads(tools)
120
+ )
121
+ inputs_tensor = self.processor(text=inputs_text,
122
+ audio=[torchaudio.load(audio_path)[0]] if audio_path is not None else None,
123
+ add_special_tokens=False,
124
+ return_tensors='pt'
125
+ )
126
+ model_output = self.call_gpt(inputs_tensor)
127
+ if chat_history[-1]['from']=='observation':
128
+ chat_history.append({'from': 'gpt', 'value': correct_sentence_with_pinyin(model_output,words_from_poi)})
129
+ return chat_history
130
+ if ((not ';\n' in model_output) or (not 'User transcribe is :' in model_output) or (not 'GPT output is :' in model_output)\
131
+ or len(model_output.split(';\n'))<2 ):
132
+ if chat_history[-1]['value']!="抱歉我聽不清楚 能麻煩您再說一次嗎":
133
+ chat_history.append({'from': 'human',
134
+ 'value': 'HUMAN_VOICE_IS_NOT_RECOGNIZED'})
135
+ chat_history.append({'from': 'gpt', 'value': '抱歉我聽不清楚 能麻煩您再說一次嗎'})
136
+ return chat_history
137
+ output_t,output_o = model_output.split(';\n')[:2]
138
+ output_t,output_o = self.remove_words_signs(output_t),self.remove_words_signs(output_o)
139
+ chat_history[-1]['value'] = correct_sentence_with_pinyin(output_t,words_from_poi)
140
+ if 'Action:' in output_o and 'ActionInput:' in output_o: # function calling
141
+ function_name,function_arg = output_o.split('ActionInput:')
142
+ function_name = function_name.replace('Action:','')
143
+ if "keyword" in function_arg:
144
+ function_arg = json.loads(function_arg)
145
+ if "keyword" in function_arg:
146
+ function_arg["keyword"] = correct_sentence_with_pinyin(function_arg["keyword"],words_from_poi)
147
+ chat_history.append({'from': 'function_call', 'value': {"name": function_name, "arguments": str(function_arg)}})
148
+ else: # gpt response
149
+ chat_history.append({'from': 'gpt', 'value': correct_sentence_with_pinyin(output_o,words_from_poi)})
150
+ return chat_history
151
+
152
+
153
+
154
+ model_id = "/home/jeff/jeff/codes/llm/InCar/gemma-3-4b-it-omni"
155
+ pipeline = InferenceClass(model_id)
156
+ app = FastAPI(
157
+ title="Audio LLM API",
158
+ description="An API that accepts an audio file and a list of dictionaries.",
159
+ )
160
+ import json
161
+ dataset = json.load(open('/home/jeff/jeff/codes/llm/InCar/data/test_data/nav_0730_noisy.json'))
162
+ tools = dataset[0]['tools']
163
+
164
+ @app.post("/audio_llm/")
165
+ async def process_audio_and_data(
166
+ audio_file: Optional[UploadFile] = File(None, description="The audio file to be processed."),
167
+ data: str = Form(..., description="A JSON string representing a list of chat history dictionaries.")
168
+ ) -> List[Dict[str, Any]]:
169
+
170
+ try:
171
+ input_data_list = json.loads(data)
172
+ if not isinstance(input_data_list, list) or not all(isinstance(item, dict) for item in input_data_list):
173
+ raise ValueError("The provided data is not a list of dictionaries.")
174
+
175
+ except json.JSONDecodeError:
176
+ raise HTTPException(
177
+ status_code=422,
178
+ detail="Invalid JSON format for 'data' field. Please provide a valid JSON string."
179
+ )
180
+ except ValueError as e:
181
+ raise HTTPException(
182
+ status_code=422,
183
+ detail=str(e)
184
+ )
185
+ temp_file_path=None
186
+ if audio_file:
187
+ temp_file_path = f"./audio_path/temp_{audio_file.filename}"
188
+ with open(temp_file_path, "wb") as buffer:
189
+ buffer.write(await audio_file.read())
190
+ print(f"Audio file saved to {temp_file_path}")
191
+
192
+ output_data = pipeline.generate(input_data_list,tools=tools,audio_path=temp_file_path)
193
+ print(output_data)
194
+ return output_data
195
+
196
+ # uvicorn main:app --host 0.0.0.0 --port 8087 --log-level info --workers 1 >> ./log.txt
197
+ if __name__ == "__main__":
198
+ uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
deploy/testapi.ipynb ADDED
@@ -0,0 +1,172 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 21,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import requests\n",
10
+ "import json\n",
11
+ "import os\n",
12
+ "\n",
13
+ "def callAPI(input_data,audio_file=None,API_URL = \"http://0.0.0.0:8087/audio_llm/\"):\n",
14
+ " json_data_string = json.dumps(input_data)\n",
15
+ "\n",
16
+ " files=None\n",
17
+ " if audio_file:\n",
18
+ " files = {\n",
19
+ " 'audio_file': (os.path.basename(audio_file), open(audio_file, 'rb'), 'audio/wav')\n",
20
+ " }\n",
21
+ " data = {\n",
22
+ " 'data': json_data_string\n",
23
+ " }\n",
24
+ " try:\n",
25
+ " response = requests.post(API_URL, files=files, data=data)\n",
26
+ " if response.status_code == 200:\n",
27
+ " processed_data = response.json()\n",
28
+ " else:\n",
29
+ " print(f\"請求失敗,狀態碼:{response.status_code}\")\n",
30
+ " print(f\"錯誤訊息:{response.text}\")\n",
31
+ "\n",
32
+ " except requests.exceptions.ConnectionError as e:\n",
33
+ " print(f\"無法連線到伺服器。請確認 FastAPI 應用程式正在執行。錯誤:{e}\")\n",
34
+ " return response.json()"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 22,
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "import json\n",
44
+ "data = json.load(open('/home/jeff/jeff/codes/llm/InCar/data/test_data/nav_0730_noisy.json'))"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 23,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "name": "stdout",
54
+ "output_type": "stream",
55
+ "text": [
56
+ "[{'from': 'human', 'value': '我在找台東太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台東太平逸境'}\"}}]\n",
57
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}]\n",
58
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}]\n",
59
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}]\n",
60
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡���?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}]\n",
61
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}]\n",
62
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}]\n",
63
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市��平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '這裡好像不對'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}]\n",
64
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '這裡好像不對'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '我有印象他們有大片玻璃門面'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區大片玻璃門面'}\"}}]\n",
65
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到���台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '這裡好像不對'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '我有印象他們有大片玻璃門面'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區大片玻璃門面'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"雞丁炸舖\", \"address\": \"台東市中華路一段210號\", \"latitude\": 22.7575, \"longitude\": 121.1457, \"fromMemory\": false}, {\"name\": \"即刻炸\", \"address\": \"桃園市桃園區中正路400號\", \"latitude\": 24.9933, \"longitude\": 121.3121, \"fromMemory\": false}, {\"name\": \"BOOM炸雞\", \"address\": \"台南市永康區中華路88號\", \"latitude\": 23.0349, \"longitude\": 120.25, \"fromMemory\": false}, {\"name\": \"雞咔咔速食站\", \"address\": \"台中市南區建成路160號\", \"latitude\": 24.1217, \"longitude\": 120.6852, \"fromMemory\": false}, {\"name\": \"炸雞一下\", \"address\": \"彰化市民生路188號\", \"latitude\": 24.0801, \"longitude\": 120.5429, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到以下與「澳洲大堡道acher韓的區」相關的地點,請問您要導航到哪一個地點?'}]\n",
66
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區��的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '這裡好像不對'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '我有印象他們有大片玻璃門面'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區大片玻璃門面'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"雞丁炸舖\", \"address\": \"台東市中華路一段210號\", \"latitude\": 22.7575, \"longitude\": 121.1457, \"fromMemory\": false}, {\"name\": \"即刻炸\", \"address\": \"桃園市桃園區中正路400號\", \"latitude\": 24.9933, \"longitude\": 121.3121, \"fromMemory\": false}, {\"name\": \"BOOM炸雞\", \"address\": \"台南市永康區中華路88號\", \"latitude\": 23.0349, \"longitude\": 120.25, \"fromMemory\": false}, {\"name\": \"雞咔咔速食站\", \"address\": \"台中市南區建成路160號\", \"latitude\": 24.1217, \"longitude\": 120.6852, \"fromMemory\": false}, {\"name\": \"炸雞一下\", \"address\": \"彰化市民生路188號\", \"latitude\": 24.0801, \"longitude\": 120.5429, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到以下與「澳洲大堡道acher韓的區」相關的地點,請問您要導航到哪一個地點?'}, {'from': 'human', 'value': '還是算了,去比較好'}, {'from': 'gpt', 'value': '已為您取消導航清單'}]\n",
67
+ "[{'from': 'human', 'value': '我在找台中太平逸境'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '台中太平逸境'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'}, {'from': 'human', 'value': '去第一個'}, {'from': 'function_call', 'value': {'name': 'nav_start', 'arguments': \"{'keyword': '台中太平逸境', 'name': '台中太平逸境', 'address': '台中市太平區新仁路一段88號', 'latitude': 24.109912, 'longitude': 120.737201, 'fromMemory': False}\"}}, {'from': 'observation', 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'}, {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'}, {'from': 'human', 'value': '請帶我去澳洲大堡道acher韓的區'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'}, {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'}, {'from': 'human', 'value': '這好像不像吧'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '這裡好像不對'}, {'from': 'gpt', 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道acher韓的區」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'}, {'from': 'human', 'value': '我有印象他們有大片玻璃門面'}, {'from': 'function_call', 'value': {'name': 'search_and_show_place', 'arguments': \"{'keyword': '澳洲大堡道acher韓的區大片玻璃門面'}\"}}, {'from': 'observation', 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"雞丁炸舖\", \"address\": \"台東市中華路一段210號\", \"latitude\": 22.7575, \"longitude\": 121.1457, \"fromMemory\": false}, {\"name\": \"即刻炸\", \"address\": \"桃園市桃園區中正路400號\", \"latitude\": 24.9933, \"longitude\": 121.3121, \"fromMemory\": false}, {\"name\": \"BOOM炸雞\", \"address\": \"台南市永康區中華路88號\", \"latitude\": 23.0349, \"longitude\": 120.25, \"fromMemory\": false}, {\"name\": \"雞咔咔速食站\", \"address\": \"台中市南區建成路160號\", \"latitude\": 24.1217, \"longitude\": 120.6852, \"fromMemory\": false}, {\"name\": \"炸雞一下\", \"address\": \"彰化市民���路188號\", \"latitude\": 24.0801, \"longitude\": 120.5429, \"fromMemory\": false}]}'}, {'from': 'gpt', 'value': '我找到以下與「澳洲大堡道acher韓的區」相關的地點,請問您要導航到哪一個地點?'}, {'from': 'human', 'value': '還是算了,去比較好'}, {'from': 'gpt', 'value': '已為您取消導航清單'}, {'from': 'human', 'value': '還是算了,不去比較好'}, {'from': 'gpt', 'value': '已為您取消導航清單'}]\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "for all_conv in data:\n",
73
+ " history = []\n",
74
+ " tools = all_conv['tools']\n",
75
+ " for idx,conv in enumerate(all_conv['conversations']):\n",
76
+ " if conv['from']=='function_call' or conv['from']=='gpt':continue\n",
77
+ " elif conv['from']=='human':\n",
78
+ " history = callAPI(history,audio_file=conv['audio_path'])\n",
79
+ " elif conv['from']=='observation':\n",
80
+ " history.append(conv)\n",
81
+ " history = callAPI(history)\n",
82
+ " print(history)\n",
83
+ " break"
84
+ ]
85
+ },
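Note: `callAPI` is not defined anywhere in this commit; the driver loop above assumes a helper of roughly the following shape. This is a hypothetical sketch (the `pipeline` object stands in for the `InferenceClass` defined in test.ipynb in this commit; names and defaults here are assumptions, not repository code):

import copy

# Hypothetical driver assumed by the loop above: take the running history
# (a list of {'from': ..., 'value': ...} turns), optionally attach a user
# audio turn, run one model step, and return the extended history.
def callAPI(history, audio_file=None, pipeline=None, tools="[]"):
    history = copy.deepcopy(history)
    # InferenceClass.generate transcribes the audio into the 'human' turn and
    # appends either a 'gpt' reply or a 'function_call' turn.
    return pipeline.generate(history, tools=tools, audio_path=audio_file)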
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 20,
89
+ "metadata": {},
90
+ "outputs": [
91
+ {
92
+ "data": {
93
+ "text/plain": [
94
+ "[{'from': 'human', 'value': '我在找台東太平逸境'},\n",
95
+ " {'from': 'function_call',\n",
96
+ " 'value': {'name': 'search_and_show_place',\n",
97
+ " 'arguments': '{\"keyword\":\"台東太平逸境\"}'}},\n",
98
+ " {'from': 'observation',\n",
99
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'},\n",
100
+ " {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'},\n",
101
+ " {'from': 'human', 'value': '去第一個'},\n",
102
+ " {'from': 'function_call',\n",
103
+ " 'value': {'name': 'nav_start',\n",
104
+ " 'arguments': '{\"keyword\":\"台東太平逸境\",\"name\":\"台中太平逸境\",\"address\":\"台中市太平區新仁路一段88號\",\"latitude\":24.109912,\"longitude\":120.737201,\"fromMemory\":false}'}},\n",
105
+ " {'from': 'observation',\n",
106
+ " 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'},\n",
107
+ " {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'},\n",
108
+ " {'from': 'human', 'value': '請帶我去澳洲大堡道科韓的區政府店'},\n",
109
+ " {'from': 'function_call',\n",
110
+ " 'value': {'name': 'search_and_show_place',\n",
111
+ " 'arguments': '{\"keyword\":\"澳洲大堡道科韓的區政府店\"}'}},\n",
112
+ " {'from': 'observation',\n",
113
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'},\n",
114
+ " {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'},\n",
115
+ " {'from': 'human', 'value': '這好像不像吧'},\n",
116
+ " {'from': 'gpt',\n",
117
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道科韓的區政府店」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
118
+ " {'from': 'human', 'value': '這裡好像不對'},\n",
119
+ " {'from': 'gpt',\n",
120
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道科韓的區政府店」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
121
+ " {'from': 'human', 'value': '我有印象他們有大片玻璃門面'},\n",
122
+ " {'from': 'function_call',\n",
123
+ " 'value': {'name': 'search_and_show_place',\n",
124
+ " 'arguments': '{\"keyword\":\"澳洲大堡道科韓的區政府店大片玻璃門面\"}'}},\n",
125
+ " {'from': 'observation',\n",
126
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"雞丁炸舖\", \"address\": \"台東市中華路一段210號\", \"latitude\": 22.7575, \"longitude\": 121.1457, \"fromMemory\": false}, {\"name\": \"即刻炸\", \"address\": \"桃園市桃園區中正路400號\", \"latitude\": 24.9933, \"longitude\": 121.3121, \"fromMemory\": false}, {\"name\": \"BOOM炸雞\", \"address\": \"台南市永康區中華路88號\", \"latitude\": 23.0349, \"longitude\": 120.25, \"fromMemory\": false}, {\"name\": \"雞咔咔速食站\", \"address\": \"台中市南區建成路160號\", \"latitude\": 24.1217, \"longitude\": 120.6852, \"fromMemory\": false}, {\"name\": \"炸雞一下\", \"address\": \"彰化市民生路188號\", \"latitude\": 24.0801, \"longitude\": 120.5429, \"fromMemory\": false}]}'},\n",
127
+ " {'from': 'gpt', 'value': '我找到以下與「澳洲大堡道科韓的區政府店」相關的地點,請問您要導航到哪一個地點?'},\n",
128
+ " {'from': 'human', 'value': '還是算了,去比較好'},\n",
129
+ " {'from': 'gpt', 'value': '已為您取消導航清單'},\n",
130
+ " {'from': 'human', 'value': '還是算了,不去比較好'},\n",
131
+ " {'from': 'gpt', 'value': '已為您取消導航清單'}]"
132
+ ]
133
+ },
134
+ "execution_count": 20,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "history"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": []
149
+ }
150
+ ],
151
+ "metadata": {
152
+ "kernelspec": {
153
+ "display_name": "py10",
154
+ "language": "python",
155
+ "name": "python3"
156
+ },
157
+ "language_info": {
158
+ "codemirror_mode": {
159
+ "name": "ipython",
160
+ "version": 3
161
+ },
162
+ "file_extension": ".py",
163
+ "mimetype": "text/x-python",
164
+ "name": "python",
165
+ "nbconvert_exporter": "python",
166
+ "pygments_lexer": "ipython3",
167
+ "version": "3.10.18"
168
+ }
169
+ },
170
+ "nbformat": 4,
171
+ "nbformat_minor": 2
172
+ }
eval.py CHANGED
@@ -18,14 +18,13 @@ from torch.utils.data import Dataset, DataLoader
18
  import soundfile as sf
19
  import re
20
  from pathlib import Path
21
- import opencc
22
- converter = opencc.OpenCC('s2tw.json')
23
  normalizer = {
24
  "en_us" : EnglishTextNormalizer(),
25
  "other" : BasicTextNormalizer()
26
  }
27
 
28
- model_id = "/mnt/jeff/gemma_test"
29
  revision = "main" #"v1.0"
30
 
31
  model = AutoModel.from_pretrained(
@@ -146,7 +145,7 @@ class LibriSpeechDataset(BaseAudioDataset):
146
  subset,
147
  split=split,
148
  trust_remote_code=True,
149
- cache_dir=Path("/mnt/jeff/InCar/data")
150
  )
151
 
152
  # (Optional) Audio length Filtering
@@ -185,7 +184,7 @@ class CommonVoiceDataset(BaseAudioDataset):
185
  source_lang,
186
  split=split,
187
  trust_remote_code=True,
188
- cache_dir=Path("/mnt/jeff/InCar/data")
189
  )
190
  def prepare_dataset(batch):
191
  """Function to preprocess the dataset with the .map method"""
@@ -202,7 +201,7 @@ class CommonVoiceDataset(BaseAudioDataset):
202
  batch["sentence"] = transcription
203
 
204
  return batch
205
- self.data=self.data.map(prepare_dataset, desc="preprocess dataset")
206
 
207
  # (Optional) Audio length Filtering
208
  self.data = self.filter_by_audio_length(self.data, "audio")
@@ -249,7 +248,7 @@ class FleursDataset(BaseAudioDataset):
249
  source_lang,
250
  split=split,
251
  trust_remote_code=True,
252
- cache_dir=Path("/mnt/jeff/InCar/data")
253
  )
254
  def prepare_dataset(batch):
255
  import opencc
@@ -263,7 +262,7 @@ class FleursDataset(BaseAudioDataset):
263
 
264
  return batch
265
  if (source_lang=="cmn_hans_cn" and not self.ast) or (self.ast and target_lang=="cmn_hans_cn"):
266
- self.data=self.data.map(prepare_dataset, desc="preprocess dataset")
267
 
268
  # (Optional) Audio length Filtering
269
  self.data = self.filter_by_audio_length(self.data, "audio")
@@ -281,7 +280,7 @@ class FleursDataset(BaseAudioDataset):
281
  target_lang,
282
  split=split,
283
  trust_remote_code=True,
284
- cache_dir=Path("/mnt/jeff/InCar/data")
285
  )
286
 
287
  source_dict = {item['id']: item for item in self.data}
@@ -435,6 +434,8 @@ def save_results(results, dataset_name, task, source_lang, target_lang=None, sam
435
  return filepath
436
 
437
  def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size = 4, is_asr=True):
 
 
438
  task_type = "asr" if is_asr else "translation"
439
  eval_lang = source_lang if is_asr else target_lang
440
  if eval_lang in normalizer:
@@ -482,7 +483,7 @@ def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size
482
  "prediction": converter.convert(prediction)
483
  }
484
  sample_results.append(sample_result)
485
-
486
  if (batch_idx + 1) % 10 == 0:
487
  temp_results = []
488
 
@@ -496,6 +497,7 @@ def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size
496
  else:
497
  temp_item = item.copy()
498
  try:
 
499
  ref = eval_normalizer(item["reference"])
500
  pred = eval_normalizer(item["prediction"])
501
 
@@ -506,7 +508,7 @@ def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size
506
 
507
  metrics = {
508
  "bleu": utt_bleu,
509
- "cer": min(100,utt_cer),
510
  "wer": utt_wer
511
  }
512
 
@@ -544,7 +546,7 @@ def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size
544
 
545
  item.update({
546
  "bleu": utt_bleu,
547
- "cer": min(100,utt_cer),
548
  "wer": utt_wer
549
  })
550
 
@@ -581,7 +583,7 @@ if __name__ == "__main__":
581
  ]
582
 
583
  num_samples = -1
584
- batch_size = 32
585
 
586
  for source_lang, target_lang in zip(source_languages, target_languages):
587
  print(f"\n===== {source_lang[0]} ASR =====")
@@ -590,21 +592,12 @@ if __name__ == "__main__":
590
 
591
  datasets = []
592
 
593
-
594
-
595
  commonvoice_speech_tw = CommonVoiceDataset(
596
  processor=processor,
597
  source_lang="zh-TW",
598
  split=split
599
  )
600
  datasets.append(commonvoice_speech_tw)
601
- fleurs = FleursDataset(
602
- processor=processor,
603
- split=split,
604
- source_lang="en_us", # English
605
- mode="asr"
606
- )
607
- datasets.append(fleurs)
608
 
609
  # Libri Speech Clean ASR mode (English -> English text)
610
  # libri_speech_clean = LibriSpeechDataset(
@@ -623,7 +616,13 @@ if __name__ == "__main__":
623
  # datasets.append(libri_speech_other)
624
 
625
  # Fleurs ASR mode (English -> English text)
626
-
 
 
 
 
 
 
627
 
628
  for dataset in datasets:
629
  # ASR
 
18
  import soundfile as sf
19
  import re
20
  from pathlib import Path
21
+
 
22
  normalizer = {
23
  "en_us" : EnglishTextNormalizer(),
24
  "other" : BasicTextNormalizer()
25
  }
26
 
27
+ model_id = "/home/jeff/codes/llm/InCar/gemma-3-4b-it-omni"
28
  revision = "main" #"v1.0"
29
 
30
  model = AutoModel.from_pretrained(
 
145
  subset,
146
  split=split,
147
  trust_remote_code=True,
148
+ cache_dir=Path("/home/jeff/codes/llm/InCar/data")
149
  )
150
 
151
  # (Optional) Audio length Filtering
 
184
  source_lang,
185
  split=split,
186
  trust_remote_code=True,
187
+ cache_dir=Path("/home/jeff/codes/llm/InCar/data")
188
  )
189
  def prepare_dataset(batch):
190
  """Function to preprocess the dataset with the .map method"""
 
201
  batch["sentence"] = transcription
202
 
203
  return batch
204
+ self.data.map(prepare_dataset, desc="preprocess dataset")
205
 
206
  # (Optional) Audio length Filtering
207
  self.data = self.filter_by_audio_length(self.data, "audio")
 
248
  source_lang,
249
  split=split,
250
  trust_remote_code=True,
251
+ cache_dir=Path("/home/jeff/codes/llm/InCar/data")
252
  )
253
  def prepare_dataset(batch):
254
  import opencc
 
262
 
263
  return batch
264
  if (source_lang=="cmn_hans_cn" and not self.ast) or (self.ast and target_lang=="cmn_hans_cn"):
265
+ self.data.map(prepare_dataset, desc="preprocess dataset")
266
 
267
  # (Optional) Audio length Filtering
268
  self.data = self.filter_by_audio_length(self.data, "audio")
 
280
  target_lang,
281
  split=split,
282
  trust_remote_code=True,
283
+ cache_dir=Path("/home/jeff/codes/llm/InCar/data")
284
  )
285
 
286
  source_dict = {item['id']: item for item in self.data}
 
434
  return filepath
435
 
436
  def evaluate_task(dataset, source_lang, target_lang, num_samples=-1, batch_size = 4, is_asr=True):
437
+ import opencc
438
+ converter = opencc.OpenCC('s2tw.json')
439
  task_type = "asr" if is_asr else "translation"
440
  eval_lang = source_lang if is_asr else target_lang
441
  if eval_lang in normalizer:
 
483
  "prediction": converter.convert(prediction)
484
  }
485
  sample_results.append(sample_result)
486
+
487
  if (batch_idx + 1) % 10 == 0:
488
  temp_results = []
489
 
 
497
  else:
498
  temp_item = item.copy()
499
  try:
500
+
501
  ref = eval_normalizer(item["reference"])
502
  pred = eval_normalizer(item["prediction"])
503
 
 
508
 
509
  metrics = {
510
  "bleu": utt_bleu,
511
+ "cer": utt_cer,
512
  "wer": utt_wer
513
  }
514
 
 
546
 
547
  item.update({
548
  "bleu": utt_bleu,
549
+ "cer": utt_cer,
550
  "wer": utt_wer
551
  })
552
 
 
583
  ]
584
 
585
  num_samples = -1
586
+ batch_size = 2
587
 
588
  for source_lang, target_lang in zip(source_languages, target_languages):
589
  print(f"\n===== {source_lang[0]} ASR =====")
 
592
 
593
  datasets = []
594
 
 
 
595
  commonvoice_speech_tw = CommonVoiceDataset(
596
  processor=processor,
597
  source_lang="zh-TW",
598
  split=split
599
  )
600
  datasets.append(commonvoice_speech_tw)
 
602
  # Libri Speech Clean ASR mode (English -> English text)
603
  # libri_speech_clean = LibriSpeechDataset(
 
616
  # datasets.append(libri_speech_other)
617
 
618
  # Fleurs ASR mode (English -> English text)
619
+ fleurs = FleursDataset(
620
+ processor=processor,
621
+ split=split,
622
+ source_lang="en_us", # English
623
+ mode="asr"
624
+ )
625
+ datasets.append(fleurs)
626
 
627
  for dataset in datasets:
628
  # ASR
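One effect of this change: the OpenCC converter is no longer a module-level global but is built at the top of `evaluate_task` (the added `import opencc` / `converter = opencc.OpenCC('s2tw.json')` lines above). As a reminder of what the s2tw.json profile does to predictions before scoring, a minimal sketch:

import opencc

# s2tw.json maps Simplified Chinese to Taiwan-standard Traditional Chinese so
# predictions and references are compared in the same script,
# e.g. '台湾台中市太平区' -> '台灣台中市太平區'.
converter = opencc.OpenCC('s2tw.json')
print(converter.convert('台湾台中市太平区'))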
eval_multiturn_textonly.py ADDED
@@ -0,0 +1,131 @@
1
+ from io import BytesIO
2
+ from urllib.request import urlopen
3
+ import soundfile
4
+ import torch
5
+ from datasets import load_dataset, Audio
6
+ import numpy as np
7
+ from transformers import AutoModel, AutoProcessor, BatchFeature,Gemma3ForCausalLM,Gemma3Processor
8
+ from tqdm import tqdm
9
+ import json
10
+ import os
11
+ import time
12
+ from datetime import datetime
13
+ from whisper_normalizer.english import EnglishTextNormalizer
14
+ from whisper_normalizer.basic import BasicTextNormalizer
15
+ import sacrebleu
16
+ from jiwer import cer, wer
17
+ from torch.utils.data import Dataset, DataLoader
18
+ import soundfile as sf
19
+ import re
20
+ from pathlib import Path
21
+ import opencc
22
+ from ASRDataset import *
23
+
24
+ # converter = opencc.OpenCC('s2tw.json')
25
+
26
+ model_id = "./"
27
+ revision = "main" #"v1.0"
28
+
29
+ processor = AutoProcessor.from_pretrained(
30
+ model_id, revision = revision, trust_remote_code=True
31
+ )
32
+
33
+ results_dir = f"evaluation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
34
+ # os.makedirs(results_dir, exist_ok=True)
35
+
36
+ def eval_text(model,dataloader,with_input_mode=False,save_path="",start_idx=0):
37
+ res = {'label':[],"pred":[],'cer':[]}
38
+ func_error = 0
39
+ total_func_call = 0
40
+ total_error = 0
41
+ all_output_text = []
42
+ remove_sign = lambda x:x.replace('User transcribe is','').replace('GPT output is','').replace('\n','').\
43
+ replace(' ','').replace('?','').replace('?','').replace('!','').replace('。','').\
44
+ replace('.','').replace('!','')
45
+ for batch_idx, batch in enumerate(tqdm(dataloader)):
46
+ if batch_idx < start_idx: continue  # resume support; '<' so the default start_idx=0 skips nothing
47
+ batch = {k: v.to("cuda") for k, v in batch.items() if type(v)!=type(None)}
48
+ try:
49
+ with torch.inference_mode():
50
+ if not with_input_mode: batch.pop('input_modes')
51
+ generate_ids = model.generate(**batch,
52
+ max_new_tokens=256,
53
+ temperature = 0.001, top_p = 0.95, top_k = 64, do_sample=True
54
+ )
55
+ batch_inputs = processor.batch_decode(
56
+ generate_ids[:, :batch['input_ids'].shape[1]], skip_special_tokens=True,
57
+ clean_up_tokenization_spaces=False
58
+ )
59
+ batch_predictions = processor.batch_decode(
60
+ generate_ids[:, batch['input_ids'].shape[1]:], skip_special_tokens=True,
61
+ clean_up_tokenization_spaces=False
62
+ )
63
+ batch_references = processor.batch_decode(
64
+ batch['labels'], skip_special_tokens=True, clean_up_tokenization_spaces=False
65
+ )
66
+ for inp,label,output in zip(batch_inputs,batch_references,batch_predictions):
67
+
68
+ cer_o = min(100,round(cer(re.sub(r"\s+", "", label), re.sub(r"\s+", "", output)) * 100, 2))
69
+ res['label'].append(label)
70
+ res['pred'].append(output)
71
+ res['cer'].append(cer_o)
72
+ all_output_text.append({
73
+ 'input':inp,
74
+ 'label':label,
75
+ 'output':output,
76
+ 'cer':cer_o,
77
+ })
78
+ if 'Action:' in label:
79
+ func_error+=(remove_sign(label)!=remove_sign(output))
80
+ total_func_call+=1
81
+ if batch_idx%100==0:
82
+ with open(save_path,'w', encoding='utf-8') as f:
83
+ json.dump(all_output_text,f, ensure_ascii=False, indent=4)
84
+ avg_cer = sum(a['cer'] for a in all_output_text)/len(all_output_text)
85
+ total_error = sum(a['cer']!=0 for a in all_output_text)
86
+ print('total',len(all_output_text))
87
+ print('total_error & rate',total_error,total_error/len(all_output_text))
88
+ print('avg_cer',avg_cer)
89
+ print('total_func_call',total_func_call)
90
+ print('func_error & rate',func_error,',',func_error/total_func_call if total_func_call else 0.0)
91
+ except Exception as e:
92
+ print("error at ", batch_idx, ":", e)
93
+ time.sleep(2)
94
+ avg_cer = sum(a['cer'] for a in all_output_text)/len(all_output_text)
95
+ total_error = sum(a['cer']!=0 for a in all_output_text)
96
+ print('total',len(all_output_text))
97
+ print('total_error & rate',total_error,total_error/len(all_output_text))
98
+ print('avg_cer',avg_cer)
99
+ print('total_func_call',total_func_call)
100
+ print('func_error & rate',func_error,',',func_error/total_func_call if total_func_call else 0.0)
101
+ with open(save_path,'w', encoding='utf-8') as f:
102
+ json.dump(all_output_text,f, ensure_ascii=False, indent=4)
103
+ return res,all_output_text
104
+
105
+
106
+ nav_data = MultiturnAudioDataset(split='eval',text_only=True,processor=processor,json_path='/mnt/data-2t/jeff/codes/LLaMA-Factory/data/nav_toolcall_train.json')
107
+ ctrl_data = MultiturnAudioDataset(split='eval',text_only=True,processor=processor,json_path='/mnt/data-2t/jeff/codes/LLaMA-Factory/data/ctrl_toolcall_train.json')
108
+ ctrl_dataloader = DataLoader(ctrl_data, batch_size=1, shuffle=False, collate_fn=covost_collate_fn)
109
+ nav_dataloader = DataLoader(nav_data, batch_size=1, shuffle=False, collate_fn=covost_collate_fn)
110
+
111
+
112
+
113
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration
114
+ from PIL import Image
115
+ import requests
116
+ import torch
117
+
118
+ model_id_org = "google/gemma-3-4b-it"
119
+
120
+ model_org = Gemma3ForConditionalGeneration.from_pretrained(
121
+ model_id_org, device_map="auto",attn_implementation="eager"
122
+ ).eval()
123
+
124
+ from peft import PeftModel
125
+ model_org = PeftModel.from_pretrained(model_org, '/mnt/data-2t/jeff/codes/LLaMA-Factory/saves/Gemma-3-4B-Instruct/lora/train_123/checkpoint-3270')
126
+
127
+
128
+
129
+ res_org_nav,output_org_nav = eval_text(model_org,nav_dataloader,save_path='./output_org_nav_{}.json'.format(str(datetime.now())[:16]))
130
+ res_org_ctrl,output_org_ctrl = eval_text(model_org,ctrl_dataloader,save_path='./output_org_ctrl_{}.json'.format(str(datetime.now())[:16]))
131
+
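The per-sample metric in eval_text strips all whitespace before computing character error rate and clamps the percentage at 100. A self-contained illustration of that scoring step with `jiwer.cer`; the reference/hypothesis strings below are made up:

import re
from jiwer import cer

label = "導航已設定,往「台中太平逸境」出發。"
output = "導航已設定 往「台中太平逸境」出發"

# Whitespace is stripped on both sides so spacing differences are not counted
# as character errors; the result is clamped to 100 exactly as in eval_text.
score = min(100, round(cer(re.sub(r"\s+", "", label), re.sub(r"\s+", "", output)) * 100, 2))
print(score)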
generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "pad_token_id": 0,
10
+ "top_k": 64,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.3"
13
+ }
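These are the sampling defaults `generate()` falls back to when a caller passes no explicit arguments. A short sketch of inspecting them with the standard `transformers` API (the checkpoint directory is assumed to be the repo root, matching `model_id = "./"` in eval_multiturn_textonly.py):

from transformers import GenerationConfig

# generate() uses these values (do_sample=True, top_k=64, top_p=0.95)
# unless they are overridden per call.
gen_cfg = GenerationConfig.from_pretrained("./")
print(gen_cfg.do_sample, gen_cfg.top_k, gen_cfg.top_p, gen_cfg.eos_token_id)

This also explains the UserWarnings captured in test.ipynb below: `InferenceClass.call_gpt` passes `do_sample=False`, which overrides `do_sample` here but leaves `top_p`/`top_k` set.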
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddd3e8916f7ad6ad92651ac288227995c1d34628f0f888eb2dc5b9acb4dc0121
3
- size 4976361384
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c729778bd29ddfd0e5a97b55113d4d0b70ae902159c36601234824502556777e
3
+ size 4983859800
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c343a455ca768923cb3b9ab77cbb91c9cd2526a1bee5740cf9cf86bfa85a0a7b
3
- size 4984907872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:101b7c0d33ed4b221f1c043768fc805b8d183f117746ef782df74018094ce0d9
3
+ size 4997727608
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad04449d015f4efbda75d6cc41e06296b4da996cd84053fa6f9791fe16d55d03
3
- size 732141104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28aa17a269742a7258379fe548d426d2418a98d8535590f2f6cc63767883a0f9
3
+ size 741700120
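The three entries above are Git LFS pointer files, so the weight update is recorded only as a sha256 oid plus a byte size. A standard-library sketch for checking a downloaded shard against its pointer:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file in 1 MiB chunks so multi-GB shards never sit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Should print the oid recorded in the pointer, e.g. c729778b... for shard 1.
print(sha256_of("model-00001-of-00003.safetensors"))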
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessing_gemma3omni.py CHANGED
@@ -381,24 +381,12 @@ class Gemma3OmniProcessor(ProcessorMixin):
381
  if audio is not None:
382
  full_audio_sequences = []
383
  audio_inputs = self.feature_extractor(audio)
384
- def replace_tokens_sequentially(prompt, boa_token, audio_sequences):
385
- parts = prompt.split(boa_token)
386
- result = ""
387
- for i in range(len(parts) - 1):
388
- result += parts[i]
389
- if i < len(audio_sequences):
390
- result += audio_sequences[i]
391
- else:
392
- result += boa_token
393
- result += parts[-1]
394
- return result
395
  for i, embed_size in enumerate(audio_inputs.audio_embed_sizes):
396
  audio_tokens_expanded = "".join([self.audio_token] * embed_size)
397
  full_audio_sequence = f"\n\n{self.boa_token}{audio_tokens_expanded}{self.eoa_token}\n\n"
398
  full_audio_sequences.append(full_audio_sequence)
399
 
400
- text = [replace_tokens_sequentially(prompt, self.boa_token, [audio_sequences]) for (prompt, audio_sequences) in zip(text, full_audio_sequences)]
401
- #text = [prompt.replace(self.boa_token, audio_sequences) for (prompt, audio_sequences) in zip(text, full_audio_sequences)]
402
 
403
  text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
404
 
 
381
  if audio is not None:
382
  full_audio_sequences = []
383
  audio_inputs = self.feature_extractor(audio)
384
  for i, embed_size in enumerate(audio_inputs.audio_embed_sizes):
385
  audio_tokens_expanded = "".join([self.audio_token] * embed_size)
386
  full_audio_sequence = f"\n\n{self.boa_token}{audio_tokens_expanded}{self.eoa_token}\n\n"
387
  full_audio_sequences.append(full_audio_sequence)
388
 
389
+ text = [prompt.replace(self.boa_token, audio_sequences) for (prompt, audio_sequences) in zip(text, full_audio_sequences)]
 
390
 
391
  text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
392
 
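The net behavioural change in this file: `str.replace` now substitutes every `<start_of_audio>` occurrence in a prompt with that prompt's single expanded sequence, whereas the deleted helper paired the i-th occurrence with the i-th sequence. A standalone toy comparison (token strings shortened for readability; an illustration, not the real processor):

boa = "<start_of_audio>"
prompt = f"first {boa} second {boa}"
sequences = ["[A1]", "[A2]"]

# New behaviour: all occurrences get the same expanded sequence.
print(prompt.replace(boa, sequences[0]))  # -> first [A1] second [A1]

# Deleted helper's behaviour: occurrences are filled in order.
parts = prompt.split(boa)
rebuilt = "".join(p + (sequences[i] if i < len(sequences) else boa)
                  for i, p in enumerate(parts[:-1])) + parts[-1]
print(rebuilt)  # -> first [A1] second [A2]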
run.sh ADDED
@@ -0,0 +1,4 @@
1
+ export RDMAV_FORK_SAFE=1
2
+ export NCCL_P2P_DISABLE="1"
3
+ export NCCL_IB_DISABLE="1"
4
+ python training.py
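For context: `NCCL_P2P_DISABLE` and `NCCL_IB_DISABLE` force NCCL off GPU peer-to-peer and InfiniBand transports (a common workaround for multi-GPU hangs), and `RDMAV_FORK_SAFE` makes rdma-core tolerate fork(). A minimal Python equivalent of this launcher, if that is more convenient:

import os
import subprocess

# Mirror run.sh: set the NCCL/RDMA workarounds, then start training.
env = dict(os.environ, RDMAV_FORK_SAFE="1", NCCL_P2P_DISABLE="1", NCCL_IB_DISABLE="1")
subprocess.run(["python", "training.py"], check=True, env=env)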
template.json ADDED
@@ -0,0 +1 @@
1
+ {"chat_template": "{{ bos_token }}\n{%- if tools %}\n {% set tools_json = tools %}\n You have access to the following tools:\n {% for tool in tools_json %}\n > Tool Name: {{ tool.name }}\n Tool Description: {{ tool.description }}\n Tool Args:\n {% if tool.parameters.properties %}\n {% for prop_name, prop_data in tool.parameters.properties.items() %}\n - {{ prop_name }} ({{ prop_data.type }}, {% if prop_name in tool.parameters.required %}required{% else %}optional{% endif %}): {{ prop_data.description }}\n {% endfor %}\n {% else %}\n {% endif %}\n\n {% endfor %}, where each item should be object\n Use the following format if using a tool:\n ```\n Action: tool name (one of [{{ tools_json | map(attribute='name') | join(', ') }}])\n Action Input: the input to the tool, in a JSON format representing the kwargs (e.g. ```{\"input\": \"hello world\", \"num_beams\": 5}```)\n ```\n {{'\n'}}\n{%- endif -%}\n\n\n{%- for message in messages -%}\n {%- if message['from'] == \"human\" -%}\n {%- if message['value'] is string -%}\n {{'<start_of_turn>user\n' + message['value'] + '<end_of_turn>\n'}}\n {%- elif message['value'] is iterable -%}\n {{'<start_of_turn>user\n'}}\n {%- for item in message['value'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'audio' -%}\n {{ '<start_of_audio>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {{'<end_of_turn>\n'}}\n {%- endfor -%}\n {%- endif -%}\n {%- elif message['from'] == \"gpt\" -%}\n {{'<start_of_turn>model\n' + message['value'] + '<end_of_turn>\n'}} \n {%- elif message['from'] == \"function_call\" -%}\n {%- set func_call = message['value'] -%}\n {{'<start_of_turn>model\n' + 'Action:' + func_call.name + '\n' + 'Action Input:' + func_call.arguments + '<end_of_turn>\n'}} \n {%- elif message['from'] == \"observation\" -%}\n {{'<start_of_turn>tool\n' + message['value'] + '<end_of_turn>\n'}} \n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"}
test.ipynb ADDED
@@ -0,0 +1,977 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
13
+ " from .autonotebook import tqdm as notebook_tqdm\n",
14
+ "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n"
15
+ ]
16
+ }
17
+ ],
18
+ "source": [
19
+ "from transformers import AutoProcessor, AutoModel\n",
20
+ "import torch\n",
21
+ "import os\n",
22
+ "\n",
23
+ "os.environ['TORCH_USE_CUDA_DSA']=\"1\"\n",
24
+ "os.environ['CUDA_LAUNCH_BLOCKING']=\"1\"\n",
25
+ "os.environ['TORCH_DISABLE_SDPA'] = '1'\n",
26
+ "\n",
27
+ "model_id = \"/home/jeff/jeff/codes/llm/InCar/gemma-3-4b-it-omni\"\n",
28
+ "revision = \"main\"\n",
29
+ "\n",
30
+ "# model = AutoModel.from_pretrained(\n",
31
+ "# model_id, device_map=\"cuda\", \n",
32
+ "# revision = revision, trust_remote_code=True,\n",
33
+ "# ).eval()\n",
34
+ "\n",
35
+ "processor = AutoProcessor.from_pretrained(\n",
36
+ " model_id, revision = revision, trust_remote_code=True\n",
37
+ ")\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 2,
43
+ "metadata": {},
44
+ "outputs": [
45
+ {
46
+ "name": "stderr",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/torchaudio/_backend/utils.py:213: UserWarning: In 2.9, this function's implementation will be changed to use torchaudio.load_with_torchcodec` under the hood. Some parameters like ``normalize``, ``format``, ``buffer_size``, and ``backend`` will be ignored. We recommend that you port your code to rely directly on TorchCodec's decoder instead: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder.\n",
50
+ " warnings.warn(\n",
51
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/torchaudio/_backend/ffmpeg.py:88: UserWarning: torio.io._streaming_media_decoder.StreamingMediaDecoder has been deprecated. This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. The decoding and encoding capabilities of PyTorch for both audio and video are being consolidated into TorchCodec. Please see https://github.com/pytorch/audio/issues/3902 for more information. It will be removed from the 2.9 release. \n",
52
+ " s = torchaudio.io.StreamReader(src, format, None, buffer_size)\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "from ASRDataset import *\n",
58
+ "datasets = MultiturnAudioDataset(processor=processor,json_path='/home/jeff/jeff/codes/llm/InCar/data/test_data/nav_0730_noisy.json')"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "data": {
68
+ "text/plain": [
69
+ "'我在找台中太平逸境'"
70
+ ]
71
+ },
72
+ "execution_count": 3,
73
+ "metadata": {},
74
+ "output_type": "execute_result"
75
+ }
76
+ ],
77
+ "source": [
78
+ "from rapidfuzz import process, fuzz\n",
79
+ "from pypinyin import pinyin, Style\n",
80
+ "\n",
81
+ "def correct_sentence_with_pinyin(user_input_sentence, location_dict, score_cutoff=50):\n",
82
+ " pinyin_dict = {}\n",
83
+ " for location in location_dict:\n",
84
+ " pinyin_name = ''.join([item[0] for item in pinyin(location, style=Style.NORMAL)])\n",
85
+ " pinyin_dict[pinyin_name] = location\n",
86
+ "\n",
87
+ " user_pinyin_sentence = ''.join([item[0] for item in pinyin(user_input_sentence, style=Style.NORMAL)])\n",
88
+ "\n",
89
+ " best_match_pinyin = process.extractOne(\n",
90
+ " query=user_pinyin_sentence,\n",
91
+ " choices=list(pinyin_dict.keys()), # 傳入拼音作為搜尋目標\n",
92
+ " scorer=fuzz.token_set_ratio,\n",
93
+ " score_cutoff=score_cutoff\n",
94
+ " )\n",
95
+ "\n",
96
+ " if best_match_pinyin:\n",
97
+ " best_pinyin_name = best_match_pinyin[0]\n",
98
+ " corrected_location_name = pinyin_dict[best_pinyin_name]\n",
99
+ "\n",
100
+ " best_user_substring = None\n",
101
+ " max_substring_score = 0\n",
102
+ " \n",
103
+ " for i in range(len(user_input_sentence)):\n",
104
+ " for j in range(i + 2, min(i + 16, len(user_input_sentence) + 1)):\n",
105
+ " substring = user_input_sentence[i:j]\n",
106
+ " \n",
107
+ " score = fuzz.ratio(substring, corrected_location_name)\n",
108
+ " \n",
109
+ " if score > max_substring_score:\n",
110
+ " max_substring_score = score\n",
111
+ " best_user_substring = substring\n",
112
+ " \n",
113
+ " if best_user_substring and max_substring_score > score_cutoff:\n",
114
+ " return user_input_sentence.replace(best_user_substring, corrected_location_name, 1)\n",
115
+ " else:\n",
116
+ " return user_input_sentence\n",
117
+ " return user_input_sentence\n",
118
+ "large_location_dict = [\n",
119
+ " \"台北太平逸\",\n",
120
+ " \"台中太平逸境\",\n",
121
+ " \"台北信義區\",\n",
122
+ " \"高雄駁二藝術特區\",\n",
123
+ " \"台南安平古堡\",\n",
124
+ " \"台中逢甲夜市\",\n",
125
+ " \"台北101\",\n",
126
+ " \"淡水老街\"\n",
127
+ " ]\n",
128
+ "\n",
129
+ "user_input_1 = \"我在找台東太平逸境\"\n",
130
+ "correct_sentence_with_pinyin(user_input_1, large_location_dict, score_cutoff=50)"
131
+ ]
132
+ },
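The correction runs in pinyin space because speech-recognition errors are mostly homophone or near-homophone substitutions; the flattening step is just per-character pinyin joined into one string. A quick illustration with the same `pypinyin` imports:

from pypinyin import pinyin, Style

# '台東' vs '台中' differ in one character, but the flattened pinyin
# ('...taidong...' vs '...taizhong...') stays close enough for
# fuzz.token_set_ratio to match inside the whole sentence.
flatten = lambda s: ''.join(item[0] for item in pinyin(s, style=Style.NORMAL))
print(flatten('我在找台東太平逸境'))  # -> 'wozaizhaotaidongtaipingyijing'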
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 4,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "import json\n",
140
+ "data = json.load(open('/home/jeff/jeff/codes/llm/InCar/data/test_data/nav_0730_noisy.json'))"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 5,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "import copy\n",
150
+ "class InferenceClass:\n",
151
+ " def __init__(self,model_id):\n",
152
+ " self.model = AutoModel.from_pretrained(\n",
153
+ " model_id, device_map=\"cuda\", \n",
154
+ " torch_dtype=torch.bfloat16,\n",
155
+ " trust_remote_code=True,\n",
156
+ " attn_implementation=\"eager\"\n",
157
+ " ).eval()\n",
158
+ "\n",
159
+ " self.processor = AutoProcessor.from_pretrained(\n",
160
+ " model_id, trust_remote_code=True\n",
161
+ " )\n",
162
+ " self.remove_words_signs = lambda x:x.replace('User transcribe is :','').replace('GPT output is :','').replace('\\n','').\\\n",
163
+ " replace(' ','').replace('?','').replace('?','').replace('!','').replace('。','').\\\n",
164
+ " replace('!','')\n",
165
+ " def call_gpt(self,inputs_tensor):\n",
166
+ " with torch.inference_mode():\n",
167
+ " inputs = {k:inputs_tensor[k].to('cuda') for k in inputs_tensor}\n",
168
+ " generate_ids = self.model.generate(**inputs, max_new_tokens=128, do_sample=False)\n",
169
+ " generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]\n",
170
+ " model_output = self.processor.batch_decode(\n",
171
+ " generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
172
+ " )[0]\n",
173
+ " return model_output\n",
174
+ " def call_function_fake(self,messages=[],obs=\"\"):\n",
175
+ " messages.append({'from': 'observation', 'value': obs})\n",
176
+ " return messages\n",
177
+ " def generate(self,chat_history,tools=\"\",audio_path=None):\n",
178
+ " '''\n",
179
+ " input:\n",
180
+ " audio_path : str\n",
181
+ " chat_history : dict\n",
182
+ " return:\n",
183
+ " model_output : dict\n",
184
+ " '''\n",
185
+ " chat_history = copy.deepcopy(chat_history)\n",
186
+ " if type(audio_path)!=type(None):\n",
187
+ " chat_history.append({'from': 'human',\n",
188
+ " 'value': [{'type': 'audio',\n",
189
+ " 'audio': audio_path}]})\n",
190
+ " words_from_poi = []\n",
191
+ " for hist in chat_history:\n",
192
+ " if hist['from']=='observation' and '地點查詢成功' in hist['value'] and 'poi' in hist['value']:\n",
193
+ " tmp = json.loads(hist['value'])\n",
194
+ " for i,poi in enumerate(tmp['poi']):\n",
195
+ " words_from_poi.append(poi['name'])\n",
196
+ " for hist in chat_history:\n",
197
+ " if hist['from']=='human' and type(hist['value'])==str:\n",
198
+ " hist['value'] = correct_sentence_with_pinyin(hist['value'],words_from_poi)\n",
199
+ " elif hist['from']=='function_call' and \"arguments\" in hist['value'] and 'keyword' in hist['value'][\"arguments\"]:\n",
200
+ " hist['value'][\"arguments\"] = eval(hist['value'][\"arguments\"])\n",
201
+ " if 'keyword' in hist['value'][\"arguments\"]:\n",
202
+ " hist['value'][\"arguments\"]['keyword'] = correct_sentence_with_pinyin(hist['value'][\"arguments\"]['keyword'],words_from_poi)\n",
203
+ " hist['value'][\"arguments\"] = str(hist['value'][\"arguments\"])\n",
204
+ " # model_input_history = copy.deepcopy(chat_history)\n",
205
+ " # num2ch = {1:'一',2:'二',3:'三',4:'四',5:'五',6:'六'}\n",
206
+ " # for hist in model_input_history:\n",
207
+ " # if hist['from']=='observation' and '地點查詢成功' in hist['value'] and 'poi' in hist['value']:\n",
208
+ " # tmp = json.loads(hist['value'])\n",
209
+ " # new_poi = []\n",
210
+ " # for i,poi in enumerate(tmp['poi']):\n",
211
+ " # new_poi.append('第{}個 : '.format(num2ch[i+1])+str(poi))\n",
212
+ " # tmp['poi'] = new_poi\n",
213
+ " # hist['value'] = json.dumps(tmp, ensure_ascii=False)\n",
214
+ "\n",
215
+ " inputs_text = self.processor.apply_chat_template(\n",
216
+ " chat_history, add_generation_prompt=True, tokenize=False,\n",
217
+ " return_dict=True, return_tensors=\"pt\", tools=json.loads(tools)\n",
218
+ " )\n",
219
+ " inputs_tensor = self.processor(text=inputs_text, \n",
220
+ " audio=[torchaudio.load(audio_path)[0]] if type(audio_path)!=type(None) else None, \n",
221
+ " add_special_tokens=False, \n",
222
+ " return_tensors='pt'\n",
223
+ " )\n",
224
+ " model_output = self.call_gpt(inputs_tensor)\n",
225
+ " if chat_history[-1]['from']=='observation':\n",
226
+ " chat_history.append({'from': 'gpt', 'value': correct_sentence_with_pinyin(model_output,words_from_poi)})\n",
227
+ " return chat_history\n",
228
+ " if ((not ';\\n' in model_output) or (not 'User transcribe is :' in model_output) or (not 'GPT output is :' in model_output)\\\n",
229
+ " or len(model_output.split(';\\n'))<2 ):\n",
230
+ " if chat_history[-1]['value']!=\"抱歉我聽不清楚 能麻煩您再說一次嗎\":\n",
231
+ " chat_history.append({'from': 'human',\n",
232
+ " 'value': 'HUMAN_VOICE_IS_NOT_RECOGNIZED'}),\n",
233
+ " chat_history.append({'from': 'gpt', 'value': '抱歉我聽不清楚 能麻煩您再說一次嗎'})\n",
234
+ " return chat_history\n",
235
+ " output_t,output_o = model_output.split(';\\n')[:2]\n",
236
+ " output_t,output_o = self.remove_words_signs(output_t),self.remove_words_signs(output_o)\n",
237
+ " chat_history[-1]['value'] = correct_sentence_with_pinyin(output_t,words_from_poi)\n",
238
+ " if 'Action:' in output_o and 'ActionInput:' in output_o: # function calling\n",
239
+ " function_name,function_arg = output_o.split('ActionInput:')\n",
240
+ " function_name = function_name.replace('Action:','')\n",
241
+ " if \"keyword\" in function_arg:\n",
242
+ " function_arg = json.loads(function_arg)\n",
243
+ " if \"keyword\" in function_arg:\n",
244
+ " function_arg[\"keyword\"] = correct_sentence_with_pinyin(function_arg[\"keyword\"],words_from_poi)\n",
245
+ " chat_history.append({'from': 'function_call', 'value': {\"name\": function_name, \"arguments\": str(function_arg)}})\n",
246
+ " else: # gpt response\n",
247
+ " chat_history.append({'from': 'gpt', 'value': correct_sentence_with_pinyin(output_o,words_from_poi)})\n",
248
+ " return chat_history\n"
249
+ ]
250
+ },
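`InferenceClass.generate` assumes the fine-tuned model emits the user transcript and the reply as one string separated by ';\n', each part carrying a marker that `remove_words_signs` strips. A toy walk-through of that parse path (the raw string is a hypothetical example of the trained format, and the lambda is a simplified subset of the one in the class):

# Simplified subset of remove_words_signs: strip the markers and whitespace.
remove_sign = lambda x: (x.replace('User transcribe is :', '')
                          .replace('GPT output is :', '')
                          .replace('\n', '').replace(' ', ''))

raw = 'User transcribe is : 馬上關充電口;\nGPT output is : Action:control_car_properties\nAction Input:{"properties":[]}'
transcript, response = (remove_sign(p) for p in raw.split(';\n')[:2])
print(transcript)  # -> 馬上關充電口
print(response)    # -> Action:control_car_propertiesActionInput:{"properties":[]}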
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": 6,
254
+ "metadata": {},
255
+ "outputs": [
256
+ {
257
+ "name": "stderr",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "/home/jeff/.cache/huggingface/modules/transformers_modules/gemma-3-4b-it-omni/speech_conformer_encoder.py:2775: FutureWarning: Please specify CheckpointImpl.NO_REENTRANT as CheckpointImpl.REENTRANT will soon be removed as the default and eventually deprecated.\n",
261
+ " lambda i: encoder_checkpoint_wrapper(\n"
262
+ ]
263
+ },
264
+ {
265
+ "name": "stdout",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "######################## speech lora #############\n",
269
+ "######################## text lora #############\n"
270
+ ]
271
+ },
272
+ {
273
+ "name": "stderr",
274
+ "output_type": "stream",
275
+ "text": [
276
+ "Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00, 1.22s/it]\n"
277
+ ]
278
+ }
279
+ ],
280
+ "source": [
281
+ "pipeline = InferenceClass(model_id)"
282
+ ]
283
+ },
284
+ {
285
+ "cell_type": "code",
286
+ "execution_count": 9,
287
+ "metadata": {},
288
+ "outputs": [
289
+ {
290
+ "name": "stderr",
291
+ "output_type": "stream",
292
+ "text": [
293
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:636: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.95` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
294
+ " warnings.warn(\n",
295
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:653: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `64` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.\n",
296
+ " warnings.warn(\n",
297
+ "/home/jeff/miniconda3/envs/py10/lib/python3.10/site-packages/torch/utils/checkpoint.py:85: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
298
+ " warnings.warn(\n"
299
+ ]
300
+ },
301
+ {
302
+ "name": "stdout",
303
+ "output_type": "stream",
304
+ "text": [
305
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}]\n",
306
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}]\n",
307
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}]\n",
308
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}]\n",
309
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}]\n",
310
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}]\n",
311
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}]\n",
312
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}]\n",
313
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}]\n",
314
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}]\n",
315
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}]\n",
316
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}]\n",
317
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '請開啟電滑門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_SLIDING_DOOR_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"true\"}]}'}}]\n",
318
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '請開啟電滑門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_SLIDING_DOOR_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"true\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電滑門已開啟'}]\n",
319
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '請開啟電滑門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_SLIDING_DOOR_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"true\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電滑門已開啟'}, {'from': 'human', 'value': '中間這排左邊關空調'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_AC_ON\",\"areaId\":\"SEAT_ROW_2_LEFT\",\"operation\":\"set\",\"value\":\"false\"}]}'}}]\n",
320
+ "[{'from': 'human', 'value': '馬上關充電口'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"EV_CHARGE_PORT_OPEN\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '充電口已關閉'}, {'from': 'human', 'value': '立刻關掉電尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': '請第2排中間靠背向後'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SEAT_BACKREST_ANGLE_1_POS\",\"areaId\":\"SEAT_ROW_2_CENTER\",\"operation\":\"decrease\",\"value\":\"5\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排中座椅背已向後調整'}, {'from': 'human', 'value': '幫我關閉電動尾門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_POWER_TAILGATE_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電動尾門已關閉'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '直接將最後排左邊溫度調弱到31度'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_TEMPERATURE_SET\",\"areaId\":\"SEAT_ROW_3_LEFT\",\"operation\":\"set\",\"value\":\"31\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第三排左座空調已設定為31度'}, {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'}, {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'}, {'from': 'human', 'value': '請開啟電滑門'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"SRDC_SLIDING_DOOR_ON\",\"areaId\":\"GLOBAL\",\"operation\":\"set\",\"value\":\"true\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '電滑門已開啟'}, {'from': 'human', 'value': '中間這排左邊關空調'}, {'from': 'function_call', 'value': {'name': 'control_car_properties', 'arguments': '{\"properties\":[{\"propertyId\":\"HVAC_AC_ON\",\"areaId\":\"SEAT_ROW_2_LEFT\",\"operation\":\"set\",\"value\":\"false\"}]}'}}, {'from': 'observation', 'value': '{\"name\": \"control_car_properties\", \"status\": \"success\", \"message\": \"控制指令執行完成\"}'}, {'from': 'gpt', 'value': '第二排左座已關閉空調'}]\n"
321
+ ]
322
+ }
323
+ ],
324
+ "source": [
325
+ "import json\n",
326
+ "data = json.load(open('/home/jeff/jeff/codes/llm/InCar/data/test_data/ctrl_0730_noisy.json'))\n",
327
+ "for all_conv in data:\n",
328
+ " history = []\n",
329
+ " tools = all_conv['tools']\n",
330
+ " for idx,conv in enumerate(all_conv['conversations']):\n",
331
+ " if conv['from']=='function_call' or conv['from']=='gpt':continue\n",
332
+ " elif conv['from']=='human':\n",
333
+ " history = pipeline.generate(history,tools=tools,audio_path=conv['audio_path'])\n",
334
+ " elif conv['from']=='observation':\n",
335
+ " history = pipeline.call_function_fake(history,conv['value'])\n",
336
+ " history = pipeline.generate(history,tools=tools)\n",
337
+ " print(history)\n",
338
+ " break"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "metadata": {},
345
+ "outputs": [
346
+ {
347
+ "data": {
348
+ "text/plain": [
349
+ "'我在找台中太平逸境'"
350
+ ]
351
+ },
352
+ "execution_count": 25,
353
+ "metadata": {},
354
+ "output_type": "execute_result"
355
+ }
356
+ ],
357
+ "source": []
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": null,
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": []
365
+ },
366
+ {
367
+ "cell_type": "code",
368
+ "execution_count": 17,
369
+ "metadata": {},
370
+ "outputs": [
371
+ {
372
+ "data": {
373
+ "text/plain": [
374
+ "{'keyword': '台東太平逸境'}"
375
+ ]
376
+ },
377
+ "execution_count": 17,
378
+ "metadata": {},
379
+ "output_type": "execute_result"
380
+ }
381
+ ],
382
+ "source": [
383
+ "eval(history[1]['value'][\"arguments\"])"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": 12,
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "data": {
393
+ "text/plain": [
394
+ "'{\"name\": \"search_and_show_place\", \"arguments\": {\"keyword\": \"台中太平逸境\"}}'"
395
+ ]
396
+ },
397
+ "execution_count": 12,
398
+ "metadata": {},
399
+ "output_type": "execute_result"
400
+ }
401
+ ],
402
+ "source": [
403
+ "all_conv['conversations'][1]['value']"
404
+ ]
405
+ },
406
+ {
407
+ "cell_type": "code",
408
+ "execution_count": 7,
409
+ "metadata": {},
410
+ "outputs": [
411
+ {
412
+ "data": {
413
+ "text/plain": [
414
+ "[{'from': 'human', 'value': '我在找台東太平逸境'},\n",
415
+ " {'from': 'function_call',\n",
416
+ " 'value': {'name': 'search_and_show_place',\n",
417
+ " 'arguments': '{\"keyword\":\"台東太平逸境\"}'}},\n",
418
+ " {'from': 'observation',\n",
419
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"台中太平逸境\", \"address\": \"台中市太平區新仁路一段88號\", \"latitude\": 24.109912, \"longitude\": 120.737201, \"fromMemory\": false}]}'},\n",
420
+ " {'from': 'gpt', 'value': '我找到「台中太平逸境」,請問是要到這裡嗎?'},\n",
421
+ " {'from': 'human', 'value': '去第一個'},\n",
422
+ " {'from': 'function_call',\n",
423
+ " 'value': {'name': 'nav_start',\n",
424
+ " 'arguments': '{\"keyword\":\"台東太平逸境\",\"name\":\"台中太平逸境\",\"address\":\"台中市太平區新仁路一段88號\",\"latitude\":24.109912,\"longitude\":120.737201,\"fromMemory\":false}'}},\n",
425
+ " {'from': 'observation',\n",
426
+ " 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'},\n",
427
+ " {'from': 'gpt', 'value': '導航已設定,往「台中太平逸境」出發。'},\n",
428
+ " {'from': 'human', 'value': '請帶我去澳洲大堡道科韓的書店'},\n",
429
+ " {'from': 'function_call',\n",
430
+ " 'value': {'name': 'search_and_show_place',\n",
431
+ " 'arguments': '{\"keyword\":\"澳洲大堡道科韓的書店\"}'}},\n",
432
+ " {'from': 'observation',\n",
433
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"咔啦咔啦雞\", \"address\": \"南投市中山路120號\", \"latitude\": 23.9133, \"longitude\": 120.685, \"fromMemory\": true}, {\"name\": \"炸物車站\", \"address\": \"新竹市香山區中華路六段88號\", \"latitude\": 24.776, \"longitude\": 120.9331, \"fromMemory\": true}]}'},\n",
434
+ " {'from': 'gpt', 'value': '上次您是去「咔啦咔啦雞」,這次還要去這裡嗎?'},\n",
435
+ " {'from': 'human', 'value': '這好像不像吧'},\n",
436
+ " {'from': 'gpt',\n",
437
+ " 'value': '請告訴��目的地附近的標誌性景點或描述「澳洲大堡道科韓的書店」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
438
+ " {'from': 'human', 'value': '這裡好像不對'},\n",
439
+ " {'from': 'gpt',\n",
440
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「澳洲大堡道科韓的書店」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
441
+ " {'from': 'human', 'value': '我有印象他們有大片玻璃門面'},\n",
442
+ " {'from': 'function_call',\n",
443
+ " 'value': {'name': 'search_and_show_place',\n",
444
+ " 'arguments': '{\"keyword\":\"澳洲大堡道科韓的書店大片玻璃門面\"}'}},\n",
445
+ " {'from': 'observation',\n",
446
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"雞丁炸舖\", \"address\": \"台東市中華路一段210號\", \"latitude\": 22.7575, \"longitude\": 121.1457, \"fromMemory\": false}, {\"name\": \"即刻炸\", \"address\": \"桃園市桃園區中正路400號\", \"latitude\": 24.9933, \"longitude\": 121.3121, \"fromMemory\": false}, {\"name\": \"BOOM炸雞\", \"address\": \"台南市永康區中華路88號\", \"latitude\": 23.0349, \"longitude\": 120.25, \"fromMemory\": false}, {\"name\": \"雞咔咔速食站\", \"address\": \"台中市南區建成路160號\", \"latitude\": 24.1217, \"longitude\": 120.6852, \"fromMemory\": false}, {\"name\": \"炸雞一下\", \"address\": \"彰化市民生路188號\", \"latitude\": 24.0801, \"longitude\": 120.5429, \"fromMemory\": false}]}'},\n",
447
+ " {'from': 'gpt', 'value': '我找到以下與「澳洲大堡道科韓的書店」相關的地點,請問您要導航到哪一個地點?'},\n",
448
+ " {'from': 'human', 'value': '還是算了,去比較好'},\n",
449
+ " {'from': 'gpt', 'value': '已為您取消導航清單'},\n",
450
+ " {'from': 'human', 'value': '還是算了,不去比較好'},\n",
451
+ " {'from': 'gpt', 'value': '已為您取消導航清單'}]"
452
+ ]
453
+ },
454
+ "execution_count": 7,
455
+ "metadata": {},
456
+ "output_type": "execute_result"
457
+ }
458
+ ],
459
+ "source": [
460
+ "history"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": 3,
466
+ "metadata": {},
467
+ "outputs": [
468
+ {
469
+ "data": {
470
+ "text/plain": [
471
+ "[]"
472
+ ]
473
+ },
474
+ "execution_count": 3,
475
+ "metadata": {},
476
+ "output_type": "execute_result"
477
+ }
478
+ ],
479
+ "source": [
480
+ "a = []\n",
481
+ "import json\n",
482
+ "json.loads(str(a))"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": null,
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": []
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": 7,
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "data": {
499
+ "text/plain": [
500
+ "[{'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'},\n",
501
+ " {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'},\n",
502
+ " {'from': 'human', 'value': '請帶我去義式鄉村風的海灘式漢堡'},\n",
503
+ " {'from': 'function_call',\n",
504
+ " 'value': {'name': 'search_and_show_place',\n",
505
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡\"}'}},\n",
506
+ " {'from': 'observation',\n",
507
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Benny’s漢堡坊\", \"address\": \"新北市三重區重新路四段201號\", \"latitude\": 25.0601, \"longitude\": 121.4876, \"fromMemory\": true}, {\"name\": \"Chubby’s Burger\", \"address\": \"台中市西屯區黎明路三段122號\", \"latitude\": 24.1812, \"longitude\": 120.6422, \"fromMemory\": true}]}'},\n",
508
+ " {'from': 'gpt', 'value': '上次您是去「Benny’s漢堡坊」,這次還要去這裡嗎?'},\n",
509
+ " {'from': 'human', 'value': '這個看起來不符合'},\n",
510
+ " {'from': 'gpt',\n",
511
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「義式鄉村風的海灘式漢堡」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
512
+ " {'from': 'human', 'value': '店名叫大和水셰店'},\n",
513
+ " {'from': 'function_call',\n",
514
+ " 'value': {'name': 'search_and_show_place',\n",
515
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡大和水셰店\"}'}},\n",
516
+ " {'from': 'observation',\n",
517
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Classic Burger屋\", \"address\": \"高雄市鼓山區裕誠路350號\", \"latitude\": 22.6503, \"longitude\": 120.2911, \"fromMemory\": false}, {\"name\": \"Burger Kingdom\", \"address\": \"花蓮市中正路310號\", \"latitude\": 23.9782, \"longitude\": 121.6023, \"fromMemory\": false}, {\"name\": \"大口咬咬漢堡店\", \"address\": \"台北市大安區安和路二段95號\", \"latitude\": 25.03, \"longitude\": 121.5509, \"fromMemory\": false}]}'},\n",
518
+ " {'from': 'gpt', 'value': '我找到以下與「義式鄉村風的海灘式漢堡」相關的地點,請問您要導航到哪一個地點?'},\n",
519
+ " {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'},\n",
520
+ " {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'},\n",
521
+ " {'from': 'human', 'value': '全錯'},\n",
522
+ " {'from': 'gpt',\n",
523
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「義式鄉村風的海灘式漢堡」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
524
+ " {'from': 'human', 'value': '地點選在松山文創園區旁邊,特色為日式壽司'},\n",
525
+ " {'from': 'function_call',\n",
526
+ " 'value': {'name': 'search_and_show_place',\n",
527
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡松山文創園區旁邊日式壽司\"}'}},\n",
528
+ " {'from': 'observation',\n",
529
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Station 13漢堡屋\", \"address\": \"新北市土城區中央路三段260號\", \"latitude\": 24.979, \"longitude\": 121.4456, \"fromMemory\": false}, {\"name\": \"牛仔漢堡屋\", \"address\": \"台南市永康區中正南路150號\", \"latitude\": 23.0271, \"longitude\": 120.2485, \"fromMemory\": false}, {\"name\": \"Happy Cow美式餐館\", \"address\": \"高雄市左營區自由三路88號\", \"latitude\": 22.6845, \"longitude\": 120.3077, \"fromMemory\": false}, {\"name\": \"Chef John’s Burger\", \"address\": \"台中市太平區太平路250號\", \"latitude\": 24.1275, \"longitude\": 120.728, \"fromMemory\": false}, {\"name\": \"Rolling Burger\", \"address\": \"嘉義市東區忠孝路88號\", \"latitude\": 23.4866, \"longitude\": 120.4511, \"fromMemory\": false}, {\"name\": \"Wagyu Burger\", \"address\": \"新竹市東區金山街101號\", \"latitude\": 24.8001, \"longitude\": 121.0102, \"fromMemory\": false}]}'},\n",
530
+ " {'from': 'gpt', 'value': '我找到以下與「義式鄉村風的海灘式漢堡」相關的地點,請問您要導航到哪一個地點?'},\n",
531
+ " {'from': 'human', 'value': '沒有選中'},\n",
532
+ " {'from': 'gpt',\n",
533
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「義式鄉村風的海灘式漢堡」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
534
+ " {'from': 'human', 'value': '地點應該是板橋大遠百對面,那家好像叫什麼懸的'},\n",
535
+ " {'from': 'function_call',\n",
536
+ " 'value': {'name': 'search_and_show_place',\n",
537
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡板橋大遠百對面懸\"}'}},\n",
538
+ " {'from': 'observation',\n",
539
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Classic Burger屋\", \"address\": \"高雄市鼓山區裕誠路350號\", \"latitude\": 22.6503, \"longitude\": 120.2911, \"fromMemory\": false}, {\"name\": \"大漢堡俱樂部\", \"address\": \"台中市北區雙十路二段170號\", \"latitude\": 24.1578, \"longitude\": 120.6842, \"fromMemory\": false}, {\"name\": \"Burger Bank\", \"address\": \"新竹市北區湳雅街55號\", \"latitude\": 24.8154, \"longitude\": 120.9766, \"fromMemory\": false}, {\"name\": \"熊堡堡\", \"address\": \"高雄市楠梓區德賢路89號\", \"latitude\": 22.7261, \"longitude\": 120.3319, \"fromMemory\": false}]}'},\n",
540
+ " {'from': 'gpt', 'value': '我找到以下與「義式鄉村風的海灘式漢堡」相關的地點,請問您要導航到哪一個地點?'},\n",
541
+ " {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'},\n",
542
+ " {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'},\n",
543
+ " {'from': 'human', 'value': '沒看到對的'},\n",
544
+ " {'from': 'gpt',\n",
545
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「義式鄉村風的海灘式漢堡」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
546
+ " {'from': 'human', 'value': '位在公館商圈,有西班牙燉飯'},\n",
547
+ " {'from': 'function_call',\n",
548
+ " 'value': {'name': 'search_and_show_place',\n",
549
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡公館商圈西班牙燉飯\"}'}},\n",
550
+ " {'from': 'observation',\n",
551
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Rolling Burger\", \"address\": \"嘉義市東區忠孝路88號\", \"latitude\": 23.4866, \"longitude\": 120.4511, \"fromMemory\": false}, {\"name\": \"Barrel & Beef\", \"address\": \"宜蘭市女中路三段122號\", \"latitude\": 24.7576, \"longitude\": 121.7523, \"fromMemory\": false}]}'},\n",
552
+ " {'from': 'gpt', 'value': '我找到以下與「義式鄉村風的海灘式漢堡」相關的地點,請問您要導航到哪一個地點?'},\n",
553
+ " {'from': 'human', 'value': '我找不到我要的選項'},\n",
554
+ " {'from': 'gpt',\n",
555
+ " 'value': '請告訴我目的地附近的標誌性景點或描述「義式鄉村風的海灘式漢堡」的具體特徵,這樣我可以更精準地幫你搜尋並導航到正確的地址'},\n",
556
+ " {'from': 'human', 'value': '印象中是什麼B開頭的店名'},\n",
557
+ " {'from': 'function_call',\n",
558
+ " 'value': {'name': 'search_and_show_place',\n",
559
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡B\"}'}},\n",
560
+ " {'from': 'observation',\n",
561
+ " 'value': '{\"name\": \"search_and_show_place\", \"status\": \"success\", \"message\": \"地點查詢成功\", \"poi\": [{\"name\": \"Station 13漢堡屋\", \"address\": \"新北市土城區中央路三段260號\", \"latitude\": 24.979, \"longitude\": 121.4456, \"fromMemory\": false}, {\"name\": \"Burger Bank\", \"address\": \"新竹市北區湳雅街55號\", \"latitude\": 24.8154, \"longitude\": 120.9766, \"fromMemory\": false}, {\"name\": \"Big Jack漢堡屋\", \"address\": \"台北市中山區民生東路三段56號\", \"latitude\": 25.0563, \"longitude\": 121.5387, \"fromMemory\": false}, {\"name\": \"Happy Cow美式餐館\", \"address\": \"高雄市左營區自由三路88號\", \"latitude\": 22.6845, \"longitude\": 120.3077, \"fromMemory\": false}, {\"name\": \"美式胖堡\", \"address\": \"台南市中西區西門路二段210號\", \"latitude\": 22.9987, \"longitude\": 120.1976, \"fromMemory\": false}, {\"name\": \"大口咬咬漢堡店\", \"address\": \"台北市大安區安和路二段95號\", \"latitude\": 25.03, \"longitude\": 121.5509, \"fromMemory\": false}]}'},\n",
562
+ " {'from': 'gpt', 'value': '我找到以下與「義式鄉村風的海灘式漢堡」相關的地點,請問您要導航到哪一個地點?'},\n",
563
+ " {'from': 'human', 'value': '我記得去了'},\n",
564
+ " {'from': 'gpt', 'value': '已為您取消導航清單'},\n",
565
+ " {'from': 'human', 'value': '直奔第五個'},\n",
566
+ " {'from': 'function_call',\n",
567
+ " 'value': {'name': 'nav_start',\n",
568
+ " 'arguments': '{\"keyword\":\"義式鄉村風的海灘式漢堡\",\"name\":\"美式胖堡\",\"address\":\"台南市中西區西門路二段210號\",\"latitude\":22.9987,\"longitude\":120.1976,\"fromMemory\":false}'}},\n",
569
+ " {'from': 'observation',\n",
570
+ " 'value': '{\"name\": \"nav_start\", \"status\": \"success\", \"message\": \"導航已啟動\"}'},\n",
571
+ " {'from': 'gpt', 'value': '導航已設定,往「美式胖堡」出發,祝您用餐愉快!'},\n",
572
+ " {'from': 'human', 'value': 'HUMAN_VOICE_CANNOT_RECOGNIZE'},\n",
573
+ " {'from': 'gpt', 'value': '抱歉我聽不清楚能麻煩您再說一次嗎'},\n",
574
+ " {'from': 'human', 'value': '請終止目前的路線安排'},\n",
575
+ " {'from': 'function_call', 'value': {'name': 'nav_stop', 'arguments': '{}'}},\n",
576
+ " {'from': 'observation',\n",
577
+ " 'value': '{\"name\": \"nav_stop\", \"status\": \"success\", \"message\": \"導航已取消\"}'},\n",
578
+ " {'from': 'gpt', 'value': '已為您取消導航,有需要再出發的話可以隨時告訴我喔!'}]"
579
+ ]
580
+ },
581
+ "execution_count": 7,
582
+ "metadata": {},
583
+ "output_type": "execute_result"
584
+ }
585
+ ],
586
+ "source": [
587
+ "history"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "metadata": {},
594
+ "outputs": [],
595
+ "source": [
596
+ "data[1]['conversations']"
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": null,
602
+ "metadata": {},
603
+ "outputs": [],
604
+ "source": [
605
+ "print(pipeline.processor.apply_chat_template(\n",
606
+ " history, add_generation_prompt=True, tokenize=False,\n",
607
+ " return_dict=True, return_tensors=\"pt\", tools=json.loads(tools)\n",
608
+ " ))"
609
+ ]
610
+ },
611
+ {
612
+ "cell_type": "code",
613
+ "execution_count": null,
614
+ "metadata": {},
615
+ "outputs": [],
616
+ "source": [
617
+ "history = [{'from': 'function_call', 'value': {\"name\": json.loads(tmp['value'])[\"name\"], \"arguments\": str(json.loads(tmp['value'])[\"arguments\"])}} \n",
618
+ " if tmp['from']=='function_call' else tmp for tmp in data[0]['conversations']]"
619
+ ]
620
+ },
621
+ {
622
+ "cell_type": "code",
623
+ "execution_count": null,
624
+ "metadata": {},
625
+ "outputs": [],
626
+ "source": [
627
+ "pipeline.generate(history[:4],tools=data[0]['tools'],audio_path='/home/jeff/jeff/codes/llm/InCar/data/test_data/audio_noisy/ctrl_toolcall_train 2-00015.wav')"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": null,
633
+ "metadata": {},
634
+ "outputs": [],
635
+ "source": []
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": null,
640
+ "metadata": {},
641
+ "outputs": [],
642
+ "source": [
643
+ "import json\n",
644
+ "data = json.load(open('/home/jeff/jeff/codes/llm/InCar/data/test_data/nav_0730_noisy.json'))\n",
645
+ "data[0]['conversations'][-2] = {'from':'human',\n",
646
+ " 'value':[{\"type\": \"audio\", \"audio\": '/home/jeff/jeff/codes/llm/InCar/data/test_data/audio_noisy/nav_toolcall_train_0730-00005.wav'}]\n",
647
+ "}\n",
648
+ "{\n",
649
+ " \"role\": \"user\",\n",
650
+ " \"content\": [\n",
651
+ " # ans is what_is_shown_in_this_image\n",
652
+ " {\"type\": \"audio\", \"audio\": '/home/jeff/jeff/codes/llm/InCar/data/test_data/audio_noisy/nav_toolcall_train_0730-00003_purenoisy.wav'},\n",
653
+ " {\"type\": \"text\", \"text\": \"Transcribe this audio clip into text.\"}\n",
654
+ " ]\n",
655
+ " }\n",
656
+ "for conv in data[0]['conversations']:\n",
657
+ " if conv['from']=='function_call':\n",
658
+ " conv['value']=json.loads(conv['value'])\n",
659
+ " conv['value']['arguments'] = str(conv['value']['arguments'])\n",
660
+ "print(processor.apply_chat_template(data[0]['conversations'], add_generation_prompt=True, tokenize=False,\n",
661
+ " return_dict=True,tools=json.loads(data[0]['tools'])\n",
662
+ " ))"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": null,
668
+ "metadata": {},
669
+ "outputs": [],
670
+ "source": [
671
+ "audio_path = '/home/jeff/jeff/codes/llm/InCar/data/test_data/audio_noisy/nav_toolcall_train_0730-00005.wav'\n",
672
+ "messages = [\n",
673
+ " {\n",
674
+ " \"from\": \"human\",\n",
675
+ " \"value\": [\n",
676
+ " # ans is what_is_shown_in_this_image\n",
677
+ " {\"type\": \"audio\", \"audio\": audio_path},\n",
678
+ " {\"type\": \"text\", \"text\": \"Transcribe this audio clip into text.\"}\n",
679
+ " ]\n",
680
+ " }\n",
681
+ "]\n",
682
+ "\n",
683
+ "inputs_text = processor.apply_chat_template(\n",
684
+ " data[0]['conversations'][:-1], add_generation_prompt=True, tokenize=False,\n",
685
+ " return_dict=True, return_tensors=\"pt\", tools={}\n",
686
+ ")\n",
687
+ "inputs = processor(text=inputs_text, \n",
688
+ " audio=[torchaudio.load(audio_path)[0]], \n",
689
+ " add_special_tokens=False, \n",
690
+ " return_tensors='pt'\n",
691
+ " )\n",
692
+ "\n",
693
+ "with torch.inference_mode():\n",
694
+ " inputs = {k:inputs[k].to('cuda') for k in inputs}\n",
695
+ " generate_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)\n",
696
+ " generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]\n",
697
+ " response = processor.batch_decode(\n",
698
+ " generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
699
+ " )[0]\n",
700
+ "print(response)\n"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": null,
706
+ "metadata": {},
707
+ "outputs": [],
708
+ "source": []
709
+ },
710
+ {
711
+ "cell_type": "code",
712
+ "execution_count": null,
713
+ "metadata": {},
714
+ "outputs": [],
715
+ "source": [
716
+ "inputs_text"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": null,
722
+ "metadata": {},
723
+ "outputs": [],
724
+ "source": [
725
+ "inputs = processor.apply_chat_template(\n",
726
+ " messages, add_generation_prompt=True, tokenize=True,\n",
727
+ " return_dict=True, return_tensors=\"pt\", tools=json.loads(data[0]['tools'])\n",
728
+ ")"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "code",
733
+ "execution_count": null,
734
+ "metadata": {},
735
+ "outputs": [],
736
+ "source": []
737
+ },
738
+ {
739
+ "cell_type": "code",
740
+ "execution_count": null,
741
+ "metadata": {},
742
+ "outputs": [],
743
+ "source": [
744
+ "from transformers import AutoProcessor, AutoModel,AutoModelForCausalLM\n",
745
+ "import torch\n",
746
+ "model_id = \"/home/jeff/codes/llm/InCar/Phi-4-multimodal-instruct\"\n",
747
+ "revision = \"main\"\n",
748
+ "model2 = AutoModelForCausalLM.from_pretrained(#AutoModel.from_pretrained(\n",
749
+ " model_id, device_map=\"cpu\", \n",
750
+ " revision = revision, trust_remote_code=True,\n",
751
+ " _attn_implementation='flash_attention_2'\n",
752
+ " # torch_dtype=torch.float16\n",
753
+ ").eval()"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "code",
758
+ "execution_count": null,
759
+ "metadata": {},
760
+ "outputs": [],
761
+ "source": [
762
+ "messages = [\n",
763
+ " {\n",
764
+ " \"role\": \"user\",\n",
765
+ " \"content\": [\n",
766
+ " # ans is what_is_shown_in_this_image\n",
767
+ " {\"type\": \"audio\", \"audio\": \"https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav\"},\n",
768
+ " {\"type\": \"text\", \"text\": \"Transcribe this audio clip into text.\"}\n",
769
+ " ]\n",
770
+ " }\n",
771
+ "]\n",
772
+ "\n",
773
+ "inputs = processor.apply_chat_template(\n",
774
+ " messages, add_generation_prompt=True, tokenize=True,\n",
775
+ " return_dict=True, return_tensors=\"pt\"\n",
776
+ ")\n",
777
+ "\n",
778
+ "# with torch.inference_mode():\n",
779
+ "# inputs = {k:inputs[k].to('cuda') for k in inputs}\n",
780
+ "# generate_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)\n",
781
+ "# generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]\n",
782
+ "# response = processor.batch_decode(\n",
783
+ "# generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
784
+ "# )[0]\n",
785
+ "# print(response)\n"
786
+ ]
787
+ },
788
+ {
789
+ "cell_type": "code",
790
+ "execution_count": null,
791
+ "metadata": {},
792
+ "outputs": [],
793
+ "source": [
794
+ "len(inputs['input_ids'][0]),len(inputs['input_audio_embeds'][0])"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": null,
800
+ "metadata": {},
801
+ "outputs": [],
802
+ "source": [
803
+ "model2.model.embed_tokens_extend.audio_embed.encoder(inputs['input_audio_embeds'],None)"
804
+ ]
805
+ },
806
+ {
807
+ "cell_type": "code",
808
+ "execution_count": null,
809
+ "metadata": {},
810
+ "outputs": [],
811
+ "source": []
812
+ },
813
+ {
814
+ "cell_type": "code",
815
+ "execution_count": null,
816
+ "metadata": {},
817
+ "outputs": [],
818
+ "source": [
819
+ "import json\n",
820
+ "with open('/mnt/data-2t/jeff/codes/llm/InCar/gemma-3-4b-it-omni/output_org_nav_2025-07-24 08:34.json') as f:\n",
821
+ " nav_res = json.load(f)\n",
822
+ "remove_sign = lambda x:x.replace('User transcribe is','').replace('GPT output is','').replace('\\n','').\\\n",
823
+ " replace(' ','').replace('?','').replace('?','').replace('!','').replace('。','').\\\n",
824
+ " replace('.','').replace('!','')\n",
825
+ "total_func_call=0\n",
826
+ "func_error=0\n",
827
+ "errors=[]\n",
828
+ "for res in nav_res:\n",
829
+ " if res['cer']!=0:\n",
830
+ " errors.append(res)\n",
831
+ " if 'Action:' in res['label']:\n",
832
+ " func_error+=remove_sign(res['label'])!=remove_sign(res['output'])\n",
833
+ " total_func_call+=1\n",
834
+ "avg_cer = sum(a['cer'] for a in nav_res)/len(nav_res)\n",
835
+ "total_error = sum(a['cer']!=0 for a in nav_res)\n",
836
+ "print('total',len(nav_res))\n",
837
+ "print('total_error & rate',total_error,total_error/len(nav_res))\n",
838
+ "print('avg_cer',avg_cer)\n",
839
+ "print('total_func_call',total_func_call)\n",
840
+ "print('func_error & rate',func_error,',',func_error/total_func_call)"
841
+ ]
842
+ },
843
+ {
844
+ "cell_type": "code",
845
+ "execution_count": null,
846
+ "metadata": {},
847
+ "outputs": [],
848
+ "source": [
849
+ "488/11046"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "code",
854
+ "execution_count": null,
855
+ "metadata": {},
856
+ "outputs": [],
857
+ "source": [
858
+ "aa = []\n",
859
+ "for e in errors:\n",
860
+ " if '}' in e['output'] and remove_sign(e['output'][:e['output'].index('}')+1])==remove_sign(e['label']):continue\n",
861
+ " if remove_sign(e['label']) in remove_sign(e['output']):continue\n",
862
+ " aa.append(e)\n",
863
+ "len(aa)"
864
+ ]
865
+ },
866
+ {
867
+ "cell_type": "code",
868
+ "execution_count": null,
869
+ "metadata": {},
870
+ "outputs": [],
871
+ "source": [
872
+ "5/4830"
873
+ ]
874
+ },
875
+ {
876
+ "cell_type": "code",
877
+ "execution_count": null,
878
+ "metadata": {},
879
+ "outputs": [],
880
+ "source": [
881
+ "len(aa)"
882
+ ]
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "execution_count": null,
887
+ "metadata": {},
888
+ "outputs": [],
889
+ "source": [
890
+ "aa"
891
+ ]
892
+ },
893
+ {
894
+ "cell_type": "code",
895
+ "execution_count": null,
896
+ "metadata": {},
897
+ "outputs": [],
898
+ "source": []
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "execution_count": null,
903
+ "metadata": {},
904
+ "outputs": [],
905
+ "source": [
906
+ "import json\n",
907
+ "with open('/mnt/data-2t/jeff/codes/llm/InCar/gemma-3-4b-it-omni/output_org_ctrl_2025-07-24 05:43.json') as f:\n",
908
+ " nav_res = json.load(f)\n",
909
+ "remove_sign = lambda x:x.replace('User transcribe is','').replace('GPT output is','').replace('\\n','').\\\n",
910
+ " replace(' ','').replace('?','').replace('?','').replace('!','').replace('。','').\\\n",
911
+ " replace('.','').replace('!','')\n",
912
+ "total_func_call=0\n",
913
+ "func_error=0\n",
914
+ "errors=[]\n",
915
+ "for res in nav_res:\n",
916
+ " if res['cer']!=0:\n",
917
+ " errors.append(res)\n",
918
+ " if 'Action:' in res['label']:\n",
919
+ " func_error+=remove_sign(res['label'])!=remove_sign(res['output'])\n",
920
+ " total_func_call+=1\n",
921
+ "avg_cer = sum(a['cer'] for a in nav_res)/len(nav_res)\n",
922
+ "total_error = sum(a['cer']!=0 for a in nav_res)\n",
923
+ "print('total',len(nav_res))\n",
924
+ "print('total_error & rate',total_error,total_error/len(nav_res))\n",
925
+ "print('avg_cer',avg_cer)\n",
926
+ "print('total_func_call',total_func_call)\n",
927
+ "print('func_error & rate',func_error,',',func_error/total_func_call)"
928
+ ]
929
+ },
930
+ {
931
+ "cell_type": "code",
932
+ "execution_count": null,
933
+ "metadata": {},
934
+ "outputs": [],
935
+ "source": [
936
+ "errors"
937
+ ]
938
+ },
939
+ {
940
+ "cell_type": "code",
941
+ "execution_count": null,
942
+ "metadata": {},
943
+ "outputs": [],
944
+ "source": [
945
+ "print(errors[0]['input'])"
946
+ ]
947
+ },
948
+ {
949
+ "cell_type": "code",
950
+ "execution_count": null,
951
+ "metadata": {},
952
+ "outputs": [],
953
+ "source": []
954
+ }
955
+ ],
956
+ "metadata": {
957
+ "kernelspec": {
958
+ "display_name": "py10",
959
+ "language": "python",
960
+ "name": "python3"
961
+ },
962
+ "language_info": {
963
+ "codemirror_mode": {
964
+ "name": "ipython",
965
+ "version": 3
966
+ },
967
+ "file_extension": ".py",
968
+ "mimetype": "text/x-python",
969
+ "name": "python",
970
+ "nbconvert_exporter": "python",
971
+ "pygments_lexer": "ipython3",
972
+ "version": "3.10.18"
973
+ }
974
+ },
975
+ "nbformat": 4,
976
+ "nbformat_minor": 2
977
+ }
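
The evaluation cells above compare labels against outputs on their `Action:` / `Action Input:` lines after stripping punctuation. As a minimal sketch of how such a reply can be parsed back into a tool call (`parse_action_output` is a hypothetical helper, not part of this commit; it assumes the `Action:`/`Action Input:` layout that test.jinja below emits):

```python
import json
import re

def parse_action_output(text: str):
    # Hypothetical helper: matches the "Action: <tool>\nAction Input: <json kwargs>"
    # layout emitted by the chat template below for function_call turns.
    match = re.search(
        r"Action:\s*(?P<name>\w+)\s*\nAction Input:\s*(?P<args>\{.*\})",
        text, re.S,
    )
    if match is None:
        return None  # plain text reply, no tool call
    return match.group("name"), json.loads(match.group("args"))

# e.g. the nav_stop call seen in the history output above:
print(parse_action_output("Action:nav_stop\nAction Input:{}"))  # ('nav_stop', {})
```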
test.jinja ADDED
@@ -0,0 +1,51 @@
1
+ {% set tools_json = tools %}
2
+ {{ bos_token }}You have access to the following tools:
3
+ {% for tool in tools_json %}
4
+ > Tool Name: {{ tool.name }}
5
+ Tool Description: {{ tool.description }}
6
+ Tool Args:
7
+ {% if tool.parameters.properties %}
8
+ {% for prop_name, prop_data in tool.parameters.properties.items() %}
9
+ - {{ prop_name }} ({{ prop_data.type }}, {% if prop_name in tool.parameters.required %}required{% else %}optional{% endif %}): {{ prop_data.description }}
10
+ {% endfor %}
11
+ {% else %}
12
+ {% endif %}
13
+
14
+ {% endfor %}, where each item should be an object
15
+ Use the following format if using a tool:
16
+ ```
17
+ Action: tool name (one of [{{ tools_json | map(attribute='name') | join(', ') }}])
18
+ Action Input: the input to the tool, in a JSON format representing the kwargs (e.g. ```{"input": "hello world", "num_beams": 5}```)
19
+ ```
20
+
21
+ {{'\n'}}
22
+ {%- for message in messages -%}
23
+ {%- if message['from'] == "human" -%}
24
+ {%- if message['value'] is string -%}
25
+ {{'<start_of_turn>user\n' + message['value'] + '<end_of_turn>\n'}}
26
+ {%- elif message['value'] is iterable -%}
27
+ {{'<start_of_turn>user\n'}}
28
+ {%- for item in message['value'] -%}
29
+ {%- if item['type'] == 'image' -%}
30
+ {{ '<start_of_image>' }}
31
+ {%- elif item['type'] == 'audio' -%}
32
+ {{ '<start_of_audio>' }}
33
+ {%- elif item['type'] == 'text' -%}
34
+ {{ item['text'] | trim }}
35
+ {%- endif -%}
36
+ {{'<end_of_turn>\n'}}
37
+ {%- endfor -%}
38
+ {%- endif -%}
39
+ {%- elif message['from'] == "gpt" -%}
40
+ {{'<start_of_turn>model\n' + message['value'] + '<end_of_turn>\n'}}
41
+ {%- elif message['from'] == "function_call" -%}
42
+ {%- set func_call = message['value'] -%}
43
+ {{'<start_of_turn>model\n' + 'Action:' + func_call.name + '\n' + 'Action Input:' + func_call.arguments + '<end_of_turn>\n'}}
44
+ {%- elif message['from'] == "observation" -%}
45
+ {{'<start_of_turn>tool\n' + message['value'] + '<end_of_turn>\n'}}
46
+ {%- endif -%}
47
+ {%- endfor -%}
48
+ {%- if add_generation_prompt -%}
49
+ {{'<start_of_turn>model
50
+ '}}
51
+ {%- endif -%}
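
For reference, test.jinja can be exercised standalone with jinja2. This is a sketch assuming `messages` follow the same `from`/`value` schema as the notebook data; the actual pipeline applies the template through `processor.apply_chat_template` instead.

```python
from jinja2 import Template

with open("test.jinja") as f:
    chat_template = Template(f.read())

tools = [{
    "name": "nav_stop",
    "description": "Stop the current navigation.",
    "parameters": {"properties": {}, "required": []},
}]
messages = [
    {"from": "human", "value": "請終止目前的路線安排"},
    {"from": "function_call", "value": {"name": "nav_stop", "arguments": "{}"}},
]

# Renders the tool list, the user turn, the Action/Action Input turn,
# and (because add_generation_prompt is set) an open model turn.
print(chat_template.render(
    tools=tools, messages=messages,
    bos_token="<bos>", add_generation_prompt=True,
))
```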
tmp.sh ADDED
@@ -0,0 +1,3 @@
1
+ python eval_multiturn_textonly.py
2
+ cd /mnt/data-2t/jeff/codes/LLaMA-Factory
3
+ bash train_nav_ctrl.sh
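
tmp.sh chains a text-only multiturn evaluation with a LLaMA-Factory training run. `eval_multiturn_textonly.py` is not part of this commit, but the `cer` field in the evaluation outputs loaded back in the notebook above is a standard character error rate; since the evaluation script is not shown, the sketch below is illustrative rather than the committed implementation.

```python
def cer(ref: str, hyp: str) -> float:
    """Character error rate: Levenshtein distance divided by reference length."""
    m, n = len(ref), len(hyp)
    dp = list(range(n + 1))  # dp[j] = edit distance between "" and hyp[:j]
    for i in range(1, m + 1):
        prev, dp[0] = dp[0], i  # prev holds dp[i-1][j-1]
        for j in range(1, n + 1):
            cur = dp[j]
            dp[j] = min(
                dp[j] + 1,      # deletion
                dp[j - 1] + 1,  # insertion
                prev + (ref[i - 1] != hyp[j - 1]),  # substitution (0 if equal)
            )
            prev = cur
    return dp[n] / max(m, 1)

print(cer("導航已取消", "導航已取消"))  # 0.0
print(cer("導航已取消", "導航取消"))    # 0.2
```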
training.py CHANGED
@@ -34,7 +34,7 @@ _IGNORE_INDEX = -100
34
  class BaseAudioDataset(Dataset):
35
  def __init__(self, processor, split, sampling_rate=16000, debug=False):
36
  self.processor = processor
37
- self.training = "train" in split or 'other' in split
38
  self.debug = debug
39
  self.sampling_rate = sampling_rate
40
  self.name = ""
@@ -183,7 +183,7 @@ class LibriSpeechDataset(BaseAudioDataset):
183
 
184
  return self.prepare_model_inputs(
185
  data["audio"]["array"],
186
- self.instruction,
187
  answer_text
188
  )
189
 
@@ -198,11 +198,7 @@ class CommonVoiceDataset(BaseAudioDataset):
198
  self.lang=source_lang
199
 
200
  # load dataset
201
- if source_lang=="zh-TW":
202
- data_path = "/mnt/jeff/InCar/data/common_voice_16_1"
203
- else:
204
- data_path = "/mnt/jeff/InCar/data/common_voice_17_0"
205
- self.data = load_dataset(data_path,
206
  source_lang,
207
  split=split,
208
  trust_remote_code=True,
@@ -223,63 +219,10 @@ class CommonVoiceDataset(BaseAudioDataset):
223
  batch["sentence"] = transcription
224
 
225
  return batch
226
-
227
-
228
- import opencc
229
- converter = opencc.OpenCC('s2tw.json')
230
- def To_zhTW(batch):
231
-
232
- transcription = converter.convert(batch["sentence"])
233
- batch["sentence"] = transcription
234
-
235
- return batch
236
- self.data = self.data.map(prepare_dataset, desc="preprocess dataset")
237
- if source_lang=='zh-CN':
238
- self.data = self.data.map(To_zhTW, desc="preprocess dataset To_zhTW")
239
-
240
 
241
  # (Optional) Audio length Filtering
242
  self.data = self.filter_by_audio_length(self.data, "audio")
243
-
244
- if source_lang == "zh-TW" and split=='train':
245
- import torchaudio
246
- from torchaudio import transforms
247
- import copy
248
- import pickle
249
- import os
250
- def subsample(batch):
251
- batch['audio']['array']=torchaudio.functional.resample(torch.FloatTensor(batch['audio']['array']), orig_freq=batch['audio']['sampling_rate'], new_freq=16000)
252
- batch['audio']['sampling_rate']=16000
253
- return batch
254
- def TW_data_augment_fast(batch):
255
- speed_perturb_fast = transforms.SpeedPerturbation(batch['audio']['sampling_rate'], [1.1])
256
- new_array_fast = speed_perturb_fast(torch.FloatTensor(batch['audio']['array']))[0]
257
- batch['audio']['array'] = new_array_fast
258
- return batch
259
- def TW_data_augment_slow(batch):
260
- speed_perturb_slow = transforms.SpeedPerturbation(batch['audio']['sampling_rate'], [0.9])
261
- new_array_slow = speed_perturb_slow(torch.FloatTensor(batch['audio']['array']))[0]
262
- batch['audio']['array'] = new_array_slow
263
- return batch
264
- # data = self.data.map(subsample, num_proc=1, desc="subsample")
265
- fast_path = '/mnt/jeff/InCar/data/tw_fast.pkl'
266
- if not os.path.exists(fast_path):
267
- data_fast = self.data.map(TW_data_augment_fast, num_proc=1, desc="augment fast")
268
- with open(fast_path,'wb') as f:
269
- pickle.dump(data_fast,f)
270
- else:
271
- with open(fast_path,'rb') as f:
272
- data_fast=pickle.load(f)
273
-
274
- slow_path = '/mnt/jeff/InCar/data/data_slow.pkl'
275
- if not os.path.exists(slow_path):
276
- data_slow = self.data.map(TW_data_augment_slow, num_proc=1, desc="augment slow")
277
- with open(slow_path,'wb') as f:
278
- pickle.dump(data_slow,f)
279
- else:
280
- with open(slow_path,'rb') as f:
281
- data_slow=pickle.load(f)
282
- self.data = [d for d in self.data]+[d for d in data_fast]+[d for d in data_slow]
283
 
284
  # Instruction Setting
285
  self.instruction = random.choice(INSTRUCTION["asr"])
@@ -293,7 +236,7 @@ class CommonVoiceDataset(BaseAudioDataset):
293
  answer_text = data["sentence"]
294
  return self.prepare_model_inputs(
295
  data["audio"]["array"],
296
- self.instruction,
297
  answer_text
298
  )
299
 
@@ -325,15 +268,6 @@ class FleursDataset(BaseAudioDataset):
325
  trust_remote_code=True,
326
  cache_dir=Path("/mnt/jeff/InCar/data")
327
  )
328
- import opencc
329
- converter = opencc.OpenCC('s2tw.json')
330
- def prepare_dataset(batch):
331
- transcription = converter.convert(batch["transcription"])
332
- batch["transcription"] = transcription
333
-
334
- return batch
335
- if (source_lang=="cmn_hans_cn"):
336
- self.data = self.data.map(prepare_dataset, desc="preprocess dataset")
337
 
338
  # (Optional) Audio length Filtering
339
  self.data = self.filter_by_audio_length(self.data, "audio")
@@ -353,8 +287,7 @@ class FleursDataset(BaseAudioDataset):
353
  trust_remote_code=True,
354
  cache_dir=Path("/mnt/jeff/InCar/data")
355
  )
356
- if target_lang=="cmn_hans_cn":
357
- target_data=target_data.map(prepare_dataset, desc="preprocess dataset")
358
  source_dict = {item['id']: item for item in self.data}
359
  target_dict = {item['id']: item for item in target_data}
360
 
@@ -368,11 +301,11 @@ class FleursDataset(BaseAudioDataset):
368
 
369
  # Instruction Setting - use target language name
370
  self.target_lang_name = self.lang_names.get(target_lang, target_lang.capitalize())
371
- self.instruction = random.choice(INSTRUCTION["ast"])
372
  else:
373
  # ASR mode
374
  self.lang = source_lang
375
- self.instruction = random.choice(INSTRUCTION["asr"])
376
 
377
  if self.debug:
378
  print(f"FLEURS dataset loaded: {self.mode.upper()} mode")
@@ -395,7 +328,7 @@ class FleursDataset(BaseAudioDataset):
395
 
396
  return self.prepare_model_inputs(
397
  audio_array,
398
- self.instruction.format(self.target_lang_name),
399
  answer_text
400
  )
401
 
@@ -552,9 +485,9 @@ def create_model(model_name_or_path, revision="main", use_flash_attention = Fals
552
  model = AutoModel.from_pretrained(
553
  model_name_or_path,
554
  revision=revision,
555
- torch_dtype=torch.bfloat16,
556
  device_map="auto",
557
- attn_implementation="flash_attention_2" if use_flash_attention else "eager",
558
  trust_remote_code=True,
559
  )
560
 
@@ -564,13 +497,22 @@ def create_model(model_name_or_path, revision="main", use_flash_attention = Fals
564
  # Freeze all parameters
565
  for param in model.parameters():
566
  param.requires_grad = False
567
 
568
- model.set_lora_adapter('speech')
569
- model.to(torch.bfloat16)
570
 
571
  # (Optional) unfreeze audio_tower parameters
572
- # for param in model.audio_tower.parameters():
573
- # param.requires_grad = True
574
 
575
  # Only unfreeze audio_projector parameters
576
  for param in model.audio_projector.parameters():
@@ -654,12 +596,12 @@ _IGNORE_INDEX = -100
654
  model_name_or_path = '/mnt/jeff/gemma-3-4b-it-omni'
655
  use_flash_attention = True
656
 
657
- output_dir = '../gemma_tmp7'
658
- batch_size = 128
659
  batch_size_per_gpu = 16
660
  learning_rate = 4.0e-5 # 1.0e-4 for fine-tuning
661
  wd = 0.01
662
- num_train_epochs = 15
663
 
664
  revision = "main" #"v1.0"
665
 
@@ -677,22 +619,23 @@ model = create_model(
677
 
678
  train_datasets = []
679
 
680
- # common voice asr
681
- commonvoice_speech_tw2 = CommonVoiceDataset(
682
- processor=processor,
683
- source_lang="zh-TW",
684
- split="other[:70%]"
685
- )
686
- train_datasets.append(commonvoice_speech_tw2)
687
-
688
- commonvoice_speech_cn = CommonVoiceDataset(
689
  processor=processor,
690
- source_lang="zh-CN",
691
- split="train[:50%]"
692
  )
693
- train_datasets.append(commonvoice_speech_cn)
694
 
695
 
 
696
  commonvoice_speech_tw = CommonVoiceDataset(
697
  processor=processor,
698
  source_lang="zh-TW",
@@ -701,17 +644,6 @@ commonvoice_speech_tw = CommonVoiceDataset(
701
  train_datasets.append(commonvoice_speech_tw)
702
 
703
 
704
-
705
-
706
- # Libri Speech Clean ASR mode (English -> English text)
707
- libri_speech_clean = LibriSpeechDataset(
708
- processor=processor,
709
- subset="clean",
710
- split="train.360[:50%]"
711
- )
712
- train_datasets.append(libri_speech_clean)
713
-
714
-
715
  # Fleurs ASR mode (English -> English text)
716
  en_asr_fleurs = FleursDataset(
717
  processor=processor,
@@ -722,14 +654,14 @@ en_asr_fleurs = FleursDataset(
722
  train_datasets.append(en_asr_fleurs)
723
 
724
 
725
- # en_ch_ast_fleurs = FleursDataset(
726
- # processor=processor,
727
- # split="train",
728
- # source_lang="en_us",
729
- # target_lang="cmn_hans_cn",
730
- # mode="ast"
731
- # )
732
- # train_datasets.append(en_ch_ast_fleurs)
733
 
734
 
735
 
@@ -742,14 +674,14 @@ ch_asr_fleurs = FleursDataset(
742
  train_datasets.append(ch_asr_fleurs)
743
 
744
 
745
- # ch_en_ast_fleurs = FleursDataset(
746
- # processor=processor,
747
- # split="train",
748
- # source_lang="cmn_hans_cn",
749
- # target_lang="en_us",
750
- # mode="ast"
751
- # )
752
- # train_datasets.append(ch_en_ast_fleurs)
753
 
754
  print("Count Num of Datasets", len(train_datasets))
755
  print([len(dataset) for dataset in train_datasets])
@@ -830,19 +762,19 @@ training_args = TrainingArguments(
830
  max_grad_norm=1.0,
831
  lr_scheduler_type='cosine',
832
  warmup_steps=50,
833
- logging_steps=10,
834
  output_dir=output_dir,
835
  save_total_limit=10,
836
  save_only_model=True,
837
- bf16=True,
838
  fp16=False,
839
  remove_unused_columns=False,
840
  report_to='none',
841
- deepspeed=dp_config if num_gpus==1 else None,
842
  disable_tqdm=False,
843
  dataloader_num_workers=4,
844
  save_strategy='steps',
845
- save_steps=1000,
846
  ddp_find_unused_parameters=True,
847
 
848
  )
 
  class BaseAudioDataset(Dataset):
      def __init__(self, processor, split, sampling_rate=16000, debug=False):
          self.processor = processor
+         self.training = "train" in split
          self.debug = debug
          self.sampling_rate = sampling_rate
          self.name = ""
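The added `self.training` flag infers train/eval mode from the Hugging Face split string, so sliced specs such as `train.360` or `train[:50%]` still register as training; note that a spec like `other[:70%]` (used for Common Voice in the old code) would not. A quick check of the heuristic:

    # "train" is matched as a plain substring of the HF split spec
    for s in ("train", "train.360", "train[:50%]", "other[:70%]", "test"):
        print(s, "->", "train" in s)  # other[:70%] and test -> False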
 

          return self.prepare_model_inputs(
              data["audio"]["array"],
+             random.choice(INSTRUCTION["asr"]),
              answer_text
          )
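Drawing the prompt with `random.choice` inside `__getitem__`, rather than fixing one instruction at construction time, gives every example its own draw and adds prompt diversity across epochs. The `INSTRUCTION` table lives earlier in ASRDataset.py; a hypothetical shape, consistent with how it is indexed here and in the FLEURS fragments below, would be:

    # hypothetical shape only -- the real table is defined earlier in the file
    INSTRUCTION = {
        "asr": ["Transcribe the audio clip into text.",
                "Write down what is said in the recording."],
        "ast": ["Translate the audio into {}.",
                "Render the speech in {}."],
    }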
 
 
          self.lang=source_lang

          # load dataset
+         self.data = load_dataset("/mnt/jeff/InCar/data/common_voice_16_1",
              source_lang,
              split=split,
              trust_remote_code=True,
 
              batch["sentence"] = transcription

              return batch
+         self.data.map(prepare_dataset, desc="preprocess dataset")

          # (Optional) Audio length Filtering
          self.data = self.filter_by_audio_length(self.data, "audio")

          # Instruction Setting
          self.instruction = random.choice(INSTRUCTION["asr"])
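One caveat with the added preprocessing call: `datasets.Dataset.map` is not in-place, so as written the mapped result is discarded and `self.data` keeps the raw sentences. If the normalization in `prepare_dataset` is meant to take effect, the result presumably needs to be assigned back:

    # map returns a new Dataset; assign it back for the transform to stick
    self.data = self.data.map(prepare_dataset, desc="preprocess dataset")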
 
          answer_text = data["sentence"]
          return self.prepare_model_inputs(
              data["audio"]["array"],
+             random.choice(INSTRUCTION["asr"]),
              answer_text
          )
 
 
              trust_remote_code=True,
              cache_dir=Path("/mnt/jeff/InCar/data")
          )

          # (Optional) Audio length Filtering
          self.data = self.filter_by_audio_length(self.data, "audio")
287
  trust_remote_code=True,
288
  cache_dir=Path("/mnt/jeff/InCar/data")
289
  )
290
+
 
291
  source_dict = {item['id']: item for item in self.data}
292
  target_dict = {item['id']: item for item in target_data}
293
 
 
301
 
302
  # Instruction Setting - use target language name
303
  self.target_lang_name = self.lang_names.get(target_lang, target_lang.capitalize())
304
+ self.instruction = INSTRUCTION["ast"]
305
  else:
306
  # ASR mode
307
  self.lang = source_lang
308
+ self.instruction = INSTRUCTION["asr"]
309
 
310
  if self.debug:
311
  print(f"FLEURS dataset loaded: {self.mode.upper()} mode")
 
328
 
329
  return self.prepare_model_inputs(
330
  audio_array,
331
+ random.choice(self.instruction).format(self.target_lang_name),
332
  answer_text
333
  )
334
 
 
      model = AutoModel.from_pretrained(
          model_name_or_path,
          revision=revision,
+         # torch_dtype=torch.float16,
          device_map="auto",
+         attn_implementation="eager",
          trust_remote_code=True,
      )
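Switching to `attn_implementation="eager"` follows the common guidance for Gemma-family models during training, but hardcoding it means the `use_flash_attention` argument of `create_model` (and the `use_flash_attention = True` set below) appears to no longer influence the attention backend. A sketch of keeping the flag live instead:

    # derive the kwarg from the existing flag rather than hardcoding it
    attn_impl = "flash_attention_2" if use_flash_attention else "eager"
    model = AutoModel.from_pretrained(
        model_name_or_path,
        attn_implementation=attn_impl,
        trust_remote_code=True,
    )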
 
 
      # Freeze all parameters
      for param in model.parameters():
          param.requires_grad = False
+     from peft import LoraConfig, get_peft_model
+     lora_config = LoraConfig(
+         r=320,
+         lora_alpha=32,
+         lora_dropout=0.05,
+         bias="none",
+         task_type="CAUSAL_LM",
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+     )
+     model.language_model.model = get_peft_model(model.language_model.model, lora_config)

+     # model.to(torch.float16)

      # (Optional) unfreeze audio_tower parameters
+     for param in model.audio_tower.parameters():
+         param.requires_grad = True

      # Only unfreeze audio_projector parameters
      for param in model.audio_projector.parameters():
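The new adapter covers every attention and MLP projection of the language model, and the rank is unusually large: `r=320` against `lora_alpha=32` gives an effective scaling of alpha/r = 0.1. Since `get_peft_model` returns a `PeftModel`, its built-in summary is a quick sanity check on adapter size:

    # peft's built-in report of adapter vs. frozen base parameters
    model.language_model.model.print_trainable_parameters()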
 
  model_name_or_path = '/mnt/jeff/gemma-3-4b-it-omni'
  use_flash_attention = True

+ output_dir = '../gemma_tmp'
+ batch_size = 16
  batch_size_per_gpu = 16
  learning_rate = 4.0e-5  # 1.0e-4 for fine-tuning
  wd = 0.01
+ num_train_epochs = 5

  revision = "main"  # "v1.0"
 

  train_datasets = []

+ # Libri Speech Clean ASR mode (English -> English text)
+ libri_speech_clean = LibriSpeechDataset(
      processor=processor,
+     subset="clean",
+     split="train.360"
  )
+ train_datasets.append(libri_speech_clean)

+ # # Libri Speech Other ASR mode (English -> English text)
+ # libri_speech_other = LibriSpeechDataset(
+ #     processor=processor,
+ #     subset="other",
+ #     split="train.500"
+ # )
+ # train_datasets.append(libri_speech_other)

+ # common voice asr
  commonvoice_speech_tw = CommonVoiceDataset(
      processor=processor,
      source_lang="zh-TW",

  train_datasets.append(commonvoice_speech_tw)


  # Fleurs ASR mode (English -> English text)
  en_asr_fleurs = FleursDataset(
      processor=processor,
 
  train_datasets.append(en_asr_fleurs)


+ en_ch_ast_fleurs = FleursDataset(
+     processor=processor,
+     split="train",
+     source_lang="en_us",
+     target_lang="cmn_hans_cn",
+     mode="ast"
+ )
+ train_datasets.append(en_ch_ast_fleurs)
 
  train_datasets.append(ch_asr_fleurs)


+ ch_en_ast_fleurs = FleursDataset(
+     processor=processor,
+     split="train",
+     source_lang="cmn_hans_cn",
+     target_lang="en_us",
+     mode="ast"
+ )
+ train_datasets.append(ch_en_ast_fleurs)

  print("Count Num of Datasets", len(train_datasets))
  print([len(dataset) for dataset in train_datasets])
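Both FLEURS translation directions (en→zh and zh→en) are re-enabled alongside the ASR sets, and the two `print` calls report per-dataset sizes before training. The list is presumably merged the standard torch way; a sketch under that assumption:

    # assumption, not shown in the diff: merge the map-style datasets
    from torch.utils.data import ConcatDataset
    train_dataset = ConcatDataset(train_datasets)
    print(len(train_dataset))  # total examples across ASR and AST sets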
 
      max_grad_norm=1.0,
      lr_scheduler_type='cosine',
      warmup_steps=50,
+     logging_steps=5,
      output_dir=output_dir,
      save_total_limit=10,
      save_only_model=True,
+     bf16=False,
      fp16=False,
      remove_unused_columns=False,
      report_to='none',
+     deepspeed=dp_config,
      disable_tqdm=False,
      dataloader_num_workers=4,
      save_strategy='steps',
+     save_steps=10000,
      ddp_find_unused_parameters=True,

  )
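Two changes land together here: `bf16` is switched off (so, with `fp16` also False, mixed precision is left to DeepSpeed), and `deepspeed=dp_config` is now passed unconditionally rather than only when `num_gpus==1`. The `dp_config` dict is defined elsewhere in the script; a minimal ZeRO-2 sketch of the kind of config it might hold (an assumption, not the author's actual settings):

    # hypothetical dp_config -- illustrative ZeRO-2 settings only
    dp_config = {
        "train_micro_batch_size_per_gpu": batch_size_per_gpu,
        "gradient_accumulation_steps": 1,
        "zero_optimization": {"stage": 2},
        "bf16": {"enabled": False},
        "gradient_clipping": 1.0,
    }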