Spaces:

RugNlpFlashcards
/

Speech_Language_Processing_Jurafsky_Martin

Build error

GGroenendaal committed on Mar 18, 2022

Commit

081d5bf

1 Parent(s): 8fe5a80

move preprocessing to dependency injection

Files changed (2) hide show

base_model/evaluate.py CHANGED Viewed

@@ -1,29 +1,27 @@
-def normalize_text(s: str) -> str:
     """Preprocesses the sentence string by normalizing.
     Args:
         s (str): the sentence
     Returns:
-        string: normalized sentence
     """
-    import string, re
-    def remove_articles(text):
-        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
-        return re.sub(regex, " ", text)
-    def white_space_fix(text):
-        return " ".join(text.split())
-    def remove_punc(text):
-        exclude = set(string.punctuation)
-        return "".join(ch for ch in text if ch not in exclude)
-    def lower(text):
-        return text.lower()
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
 def compute_exact_match(prediction: str, answer: str) -> int:
@@ -36,7 +34,7 @@ def compute_exact_match(prediction: str, answer: str) -> int:
     Returns:
         int: 1 for exact match, 0 for not
     """
-    return int(normalize_text(prediction) == normalize_text(answer))
 def compute_f1(prediction: str, answer: str) -> float:
@@ -49,8 +47,8 @@ def compute_f1(prediction: str, answer: str) -> float:
     Returns:
         boolean: the f1 score
     """
-    pred_tokens = normalize_text(prediction).split()
-    answer_tokens = normalize_text(answer).split()
     if len(pred_tokens) == 0 or len(answer_tokens) == 0:
         return int(pred_tokens == answer_tokens)

+from typing import Callable, List
+from base_model.string_utils import lower, remove_articles, remove_punc, white_space_fix
+def normalize_text(inp: str, functions: List[Callable[[str], str]]):
+    """Run a string through a sequence of transformations.
+
+    Args:
+        inp (str): the text to transform
+        functions (List[Callable[[str], str]]): the transformations, applied
+            one after another in list order, each receiving the result of
+            the previous one
+    Returns:
+        string: the text after the last transformation; `inp` unchanged
+            when `functions` is empty
+    """
+    result = inp
+    for step in functions:
+        result = step(result)
+    return result
+def normalize_text_default(inp: str) -> str:
     """Preprocesses the sentence string by normalizing.
     Args:
         s (str): the sentence
     Returns:
+        string: normalized sentence, using the default steps
     """
+    # normalize_text applies the steps in list order, so the order matters:
+    # lowercase first so the case-sensitive article pattern also matches
+    # "The"/"A"/"An", strip punctuation before removing articles, and
+    # collapse whitespace last to clean up the gaps both removals leave.
+    # This is the same order as the nested calls this function replaces.
+    steps = [lower, remove_punc, remove_articles, white_space_fix]
+    return normalize_text(inp, steps)
 def compute_exact_match(prediction: str, answer: str) -> int:
     Returns:
         int: 1 for exact match, 0 for not
     """
+    return int(normalize_text_default(prediction) == normalize_text_default(answer))
 def compute_f1(prediction: str, answer: str) -> float:
     Returns:
         boolean: the f1 score
     """
+    pred_tokens = normalize_text_default(prediction).split()
+    answer_tokens = normalize_text_default(answer).split()
     if len(pred_tokens) == 0 or len(answer_tokens) == 0:
         return int(pred_tokens == answer_tokens)

base_model/string_utils.py ADDED Viewed

+import re
+import string
+def remove_articles(text):
+    """Replace each standalone "a", "an" or "the" in the text with a space.
+
+    Matching is case-sensitive, so capitalised articles are left untouched.
+    """
+    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+    return article_pattern.sub(" ", text)
+def white_space_fix(text):
+    """Collapse every run of whitespace to one space and trim both ends."""
+    words = text.split()
+    return " ".join(words)
+def remove_punc(text):
+    """Delete every character listed in string.punctuation from the text."""
+    # A translation table with only a deletion set drops those characters
+    # in a single pass and leaves everything else as it was.
+    drop_table = str.maketrans("", "", string.punctuation)
+    return text.translate(drop_table)
+def lower(text):
+    """Return the text converted to lowercase."""
+    lowered = text.lower()
+    return lowered