Update my_model/results/evaluation.py
my_model/results/evaluation.py (CHANGED)
@@ -31,6 +31,7 @@ class KBVQAEvaluator:
         gpt4_max_tokens (int): Maximum tokens for GPT-4 responses.
         gpt4_temperature (float): Temperature setting for GPT-4 responses.
     """
+
 
     def __init__(self) -> None:
         """
@@ -55,6 +56,7 @@ class KBVQAEvaluator:
         self.gpt4_max_tokens = config.GPT4_MAX_TOKENS
         self.gpt4_temperature = config.GPT4_TEMPERATURE
 
+
     def stem_answers(self, answers: Union[str, List[str]]) -> Union[str, List[str]]:
         """
         Apply Porter Stemmer to either a single string or a list of strings.
@@ -72,6 +74,7 @@
         words = answers.split()
         return " ".join(self.stemmer.stem(word.strip()) for word in words)
 
+
     def calculate_vqa_score(self, ground_truths: List[str], model_answer: str) -> float:
         """
         Calculate VQA score based on the number of matching answers, with optional fuzzy matching.
@@ -91,6 +94,7 @@
         count = Counter(ground_truths)
         return min(count.get(model_answer, 0) / 3, 1)
 
+
     def calculate_exact_match_score(self, ground_truths: List[str], model_answer: str) -> int:
         """
         Calculate Exact Match score, with optional fuzzy matching.
@@ -108,10 +112,13 @@
         else:
             return int(model_answer in ground_truths)
 
+
     def syntactic_evaluation(self) -> None:
         """
         Process the DataFrame: stem answers, calculate scores, and store results.
 
+        Returns:
+            None.
         """
 
         self.df['raw_answers_stemmed'] = self.df['raw_answers'].apply(literal_eval).apply(self.stem_answers)
@@ -127,6 +134,7 @@
         self.vqa_scores[full_config] = round(self.df[f'vqa_score_{full_config}'].mean()*100, 2)
         self.exact_match_scores[full_config] = round(self.df[f'exact_match_score_{full_config}'].mean()*100, 2)
 
+
     def create_GPT4_messages_template(self, question: str, ground_truths: List[str], model_answer: str) -> List[dict]:
         """
         Create a message list for the GPT-4 API call based on the question, ground truths, and model answer.
@@ -158,6 +166,9 @@
     def semantic_evaluation(self) -> None:
         """
         Perform semantic evaluation using GPT-4 for each model configuration.
+
+        Returns:
+            None.
         """
         openai.api_key = self.openai_api_key
         model_configurations_for_semantic_evaluation = self.model_configurations[:2]  # considering only main model configs ['caption+detic', 'caption+yolov5'] without ablation, due to the cost involved.
@@ -192,6 +203,8 @@
         self.df.to_excel(writer, sheet_name='Main Data', index=False)
         scores_df.to_excel(writer, sheet_name='Scores', index=False)
 
+
+
 def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     """
     Run the full evaluation process using KBVQAEvaluator and save the results to an Excel file.
@@ -199,6 +212,9 @@ def run_evaluation(save: bool = False, save_filename: str = "results") -> None:
     Args:
         save (bool): Whether to save the results to an Excel file. Defaults to False.
         save_filename (str): The filename to save the results if save is True. Defaults to "results".
+
+    Returns:
+        None.
     """

    # Instantiate the evaluator
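
For context on the scoring logic visible in the diff context above: calculate_vqa_score follows the standard VQA soft-accuracy convention, where a predicted answer counts as fully correct once at least three ground-truth annotators gave it. The snippet below is a minimal standalone sketch of that rule (vqa_soft_score is an illustrative name; the stemming and fuzzy-matching steps handled elsewhere in the class are omitted):

from collections import Counter
from typing import List

def vqa_soft_score(ground_truths: List[str], model_answer: str) -> float:
    # Soft accuracy: min(number of annotators who gave this answer / 3, 1).
    count = Counter(ground_truths)
    return min(count.get(model_answer, 0) / 3, 1)

# Four of ten annotators answered "bus", so the score is capped at 1;
# a single matching annotator would only score about 0.33.
print(vqa_soft_score(["bus"] * 4 + ["car"] * 6, "bus"))  # -> 1 (capped)
print(vqa_soft_score(["bus"] + ["car"] * 9, "bus"))      # -> 0.333...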
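
Assuming the module is importable as my_model.results.evaluation (inferred from the file path shown above, not confirmed by this commit), the run_evaluation entry point touched by this change could be driven roughly like this:

# Hypothetical usage sketch; the import path is inferred from the file
# location my_model/results/evaluation.py.
from my_model.results.evaluation import run_evaluation

# Runs the KBVQAEvaluator pipeline and, because save=True, writes the
# per-question data ('Main Data') and aggregate scores ('Scores') sheets
# to an Excel workbook named from save_filename.
run_evaluation(save=True, save_filename="results")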