Spaces:

RugNlpFlashcards
/

Speech_Language_Processing_Jurafsky_Martin

Build error

App Files Files Community

Robert committed on Mar 14, 2022

Commit

2827202

1 Parent(s): 83870cc

Added a way to evaluate overall performance of our model based on exact match and F1-score.

Browse files

Files changed (3) hide show

base_model/evaluate.py +66 -0
base_model/main.py +5 -0
base_model/retriever.py +33 -0

base_model/evaluate.py ADDED Viewed

	@@ -0,0 +1,66 @@

def normalize_text(s: str) -> str:
    """Normalize a sentence so that answers can be compared token by token.

    The steps are applied in this order: lowercase, strip ASCII punctuation,
    replace the English articles "a", "an" and "the" with a space, and
    finally collapse all runs of whitespace into single spaces (which also
    trims the ends).

    Args:
        s (str): the sentence

    Returns:
        str: normalized sentence
    """
    import re
    import string

    text = s.lower()
    # Delete every ASCII punctuation character in a single pass.
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Articles are matched as whole words only, after punctuation removal,
    # so e.g. "the-end" has already become "theend" and is left untouched.
    text = re.sub(r"\b(a|an|the)\b", " ", text)
    # split()/join collapses the gaps left by the removed articles.
    return " ".join(text.split())
def compute_exact_match(prediction: str, answer: str) -> int:
    """Check whether a prediction equals the gold answer after normalization.

    Both strings are passed through normalize_text before being compared.

    Args:
        prediction (str): the predicted answer
        answer (str): the gold answer

    Returns:
        int: 1 when the normalized strings are identical, 0 otherwise
    """
    normalized_prediction = normalize_text(prediction)
    normalized_answer = normalize_text(answer)
    return 1 if normalized_prediction == normalized_answer else 0
def compute_f1(prediction: str, answer: str) -> float:
    """Compute the token-overlap F1-score between a prediction and an answer.

    Both strings are normalized with normalize_text and split on whitespace.
    Overlap is counted per token occurrence (multiset intersection), so a
    token repeated in both strings contributes once per shared occurrence
    and two identical strings always score 1.0.

    Args:
        prediction (str): the predicted answer
        answer (str): the gold answer

    Returns:
        float: the F1-score in the range [0.0, 1.0]
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    answer_tokens = normalize_text(answer).split()

    # If either side normalizes to nothing, the score is 1.0 only when both
    # sides are empty, and 0.0 otherwise.
    if not pred_tokens or not answer_tokens:
        return float(pred_tokens == answer_tokens)

    # Counter & Counter keeps the minimum count per token, i.e. the number
    # of occurrences the two token lists actually share.
    overlap = sum((Counter(pred_tokens) & Counter(answer_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(answer_tokens)
    return 2 * precision * recall / (precision + recall)

base_model/main.py CHANGED Viewed

@@ -13,3 +13,8 @@ if __name__ == '__main__':
         print(f"Result {i+1} (score: {score:.02f}):")
         print(result['text'][i])
         print()  # Newline

         print(f"Result {i+1} (score: {score:.02f}):")
         print(result['text'][i])
         print()  # Newline
+    # Compute overall performance
+    exact_match, f1_score, total = r.evaluate()
+    print(f"Exact match: {exact_match} / {total}\n"
+          f"F1-score: {f1_score:.02f}")

base_model/retriever.py CHANGED Viewed

@@ -7,6 +7,9 @@ from transformers import (
 from datasets import load_dataset
 import torch
 import os.path
 # Hacky fix for FAISS error on macOS
 # See https://stackoverflow.com/a/63374568/4545692
@@ -49,6 +52,7 @@ class Retriever:
         # Dataset building
         self.dataset = self.__init_dataset(dataset)
     def __init_dataset(self,
                        dataset: str,
                        fname: str = "./models/paragraphs_embedding.faiss"):
@@ -65,6 +69,7 @@ class Retriever:
         """
         # Load dataset
         ds = load_dataset(dataset, name="paragraphs")["train"]
         if os.path.exists(fname):
             # If we already have FAISS embeddings, load them from disk
@@ -112,4 +117,32 @@ class Retriever:
         scores, results = self.dataset.get_nearest_examples(
             "embeddings", question_embedding, k=k
         )
         return scores, results

 from datasets import load_dataset
 import torch
 import os.path
+import numpy
+import evaluate
 # Hacky fix for FAISS error on macOS
 # See https://stackoverflow.com/a/63374568/4545692
         # Dataset building
         self.dataset = self.__init_dataset(dataset)
     def __init_dataset(self,
                        dataset: str,
                        fname: str = "./models/paragraphs_embedding.faiss"):
         """
         # Load dataset
         ds = load_dataset(dataset, name="paragraphs")["train"]
+        print(ds)
         if os.path.exists(fname):
             # If we already have FAISS embeddings, load them from disk
         scores, results = self.dataset.get_nearest_examples(
             "embeddings", question_embedding, k=k
         )
         return scores, results
+    def evaluate(self):
+        """Evaluates the entire model by computing F1-score and exact match on the
+        entire dataset.
+        Returns:
+            int: overall exact match
+            float: overall F1-score
+            int: total amount of questions handled
+        """
+        questions_ds = load_dataset("GroNLP/ik-nlp-22_slp", name="questions")['test']
+        questions = questions_ds['question']
+        answers = questions_ds['answer']
+        predictions = []
+        scores = 0
+        # Currently just takes the first answer and does not look at scores yet
+        for question in questions:
+            score, result = self.retrieve(question, 1)
+            scores += score[0]
+            predictions.append(result['text'][0])
+        exact_match = max((evaluate.compute_exact_match(predictions[i], answers[i])) for i in range(len(answers)))
+        f1_score = max((evaluate.compute_f1(predictions[i], answers[i])) for i in range(len(answers)))
+        return exact_match, f1_score, len(answers)