Update my_model/results/evaluation.py
my_model/results/evaluation.py (CHANGED)
@@ -31,6 +31,7 @@ class KBVQAEvaluator:
         gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
         gpt4_temperature (float): Temperature setting for GPT-4 responses.
     """
+
 
     def __init__(self) -> None:
         """
@@ -55,6 +56,7 @@ class KBVQAEvaluator:
         self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
         self.gpt4_temperature = config.GPT4_TEMPERATURE
 
+
     def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
         """
         Apply Porter Stemmer to either a single string or a list of strings.
@@ -72,6 +74,7 @@
         words = answers.split()
         return " ".join(self.stemmer.stem(word.strip()) for word in words)
 
+
     def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
         """
         Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
@@ -91,6 +94,7 @@
         count = Counter(ground_truths)
         return min(count.get(model_answer, 0) / 3, 1)
 
+
     def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
         """
         Calculate Exact Match score, with optional fuzzy matching.
@@ -108,10 +112,13 @@
         else:
             return int(model_answer in ground_truths)
 
+
     def syntactic_evaluation(self) -> None:
         """
         Process the DataFrame: stem answers, calculate scores, and store results.
 
+        Returns:
+            None.
         """
 
         self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
@@ -127,6 +134,7 @@
         self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean()*100, 2)
         self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean()*100, 2)
 
+
     def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
         """
         Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
@@ -158,6 +166,9 @@
     def semantic_evaluation(self) -> None:
         """
         Perform semantic evaluation using GPT-4 for each model configuration.
+
+        Returns:
+            None.
         """
         openai.api_key = self.openai_api_key
         model_configurations_for_semantic_evaluation = self.model_configurations[:2]  # considering only main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved.
@@ -192,6 +203,8 @@
         self.df.to_excel(writer, sheet_name='Main Data', index=False)
         scores_df.to_excel(writer, sheet_name='Scores', index=False)
 
+
+
 def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     """
     Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
@@ -199,6 +212,9 @@ def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     Args:
         save (bool): Whether to save the results to an Excel file. Defaults to False.
         save_filename (str): The filename to save the results if save is True. Defaults to "results".
+
+    Returns:
+        None.
     """

    # Instantiate the evaluator
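
For context on the scoring logic visible in the diff context above: calculate_vqa_score follows the standard VQA soft-accuracy convention, where a predicted answer counts as fully correct once at least three ground-truth annotators gave it. The snippet below is a minimal standalone sketch of that rule (vqa_soft_score is an illustrative name; the stemming and fuzzy-matching steps handled elsewhere in the class are omitted):

from collections import Counter
from typing import List

def vqa_soft_score(ground_truths: List[str], model_answer: str) -> float:
    # Soft accuracy: min(number of annotators who gave this answer / 3, 1).
    count = Counter(ground_truths)
    return min(count.get(model_answer, 0) / 3, 1)

# Four of ten annotators answered "bus", so the score is capped at 1;
# a single matching annotator would only score about 0.33.
print(vqa_soft_score(["bus"] * 4 + ["car"] * 6, "bus"))  # -> 1 (capped)
print(vqa_soft_score(["bus"] + ["car"] * 9, "bus"))      # -> 0.333...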
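
Assuming the module is importable as my_model.results.evaluation (inferred from the file path shown above, not confirmed by this commit), the run_evaluation entry point touched by this change could be driven roughly like this:

# Hypothetical usage sketch; the import path is inferred from the file
# location my_model/results/evaluation.py.
from my_model.results.evaluation import run_evaluation

# Runs the KBVQAEvaluator pipeline and, because save=True, writes the
# per-question data ('Main Data') and aggregate scores ('Scores') sheets
# to an Excel workbook named from save_filename.
run_evaluation(save=True, save_filename="results")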