Ramon Meffert committed · Commit 83870cc · Parent: 8bbe3aa

Add base model retriever

Changed files:

- README.md (+48, -0)
- main.py → base_model/main.py (+7, -6)
- base_model/retriever.py (+53, -24)
- poetry.lock (+29, -1)
- pyproject.toml (+1, -0)
README.md
CHANGED

````diff
@@ -25,3 +25,51 @@ Most QA systems consist of two components:
 
 - Huggingface QA tutorial: <https://huggingface.co/docs/transformers/tasks/question_answering#finetune-with-tensorflow>
 - Overview of open-domain question answering techniques: <https://lilianweng.github.io/posts/2020-10-29-odqa/>
+
+## Base model
+
+So far this is only a retriever that fetches the top-k relevant documents
+based on a question. It does reach high similarity scores for many questions,
+but the documents it retrieves are usually not very relevant.
+
+```bash
+poetry shell
+cd base_model
+poetry run python main.py
+```
+
+### Example
+
+"What is the perplexity of a language model?"
+
+> Result 1 (score: 74.10):
+> Figure 10.17 A sample alignment between sentences in English and French, with
+> sentences extracted from Antoine de Saint-Exupery's Le Petit Prince and a
+> hypothetical translation. Sentence alignment takes sentences e1, ..., en,
+> and f1, ..., fn and finds minimal sets of sentences that are translations
+> of each other, including single sentence mappings like (e1, f1), (e4-f3),
+> (e5-f4), (e6-f6) as well as 2-1 alignments (e2/e3, f2), (e7/e8-f7), and
+> null alignments (f5).
+>
+> Result 2 (score: 74.23):
+> Character or word overlap-based metrics like chrF (or BLEU, or etc.) are
+> mainly used to compare two systems, with the goal of answering questions like:
+> did the new algorithm we just invented improve our MT system? To know if the
+> difference between the chrF scores of two MT systems is a significant
+> difference, we use the paired bootstrap test, or the similar randomization
+> test.
+>
+> Result 3 (score: 74.43):
+> The model thus predicts the class negative for the test sentence.
+>
+> Result 4 (score: 74.95):
+> Translating from languages with extensive pro-drop, like Chinese or Japanese,
+> to non-pro-drop languages like English can be difficult since the model must
+> somehow identify each zero and recover who or what is being talked about in
+> order to insert the proper pronoun.
+>
+> Result 5 (score: 76.22):
+> Similarly, a recent challenge set, the WinoMT dataset (Stanovsky et al., 2019)
+> shows that MT systems perform worse when they are asked to translate sentences
+> that describe people with non-stereotypical gender roles, like "The doctor
+> asked the nurse to help her in the operation".
````
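One way to read the example scores above: they increase from Result 1 (74.10) to Result 5 (76.22), and FAISS returns the nearest matches first. Assuming the index is built with `add_faiss_index`'s defaults (a flat L2-distance index, as far as I can tell; treat that as an assumption), the scores are distances rather than similarities, so lower means closer, while DPR embeddings are trained for inner-product similarity, where higher means more similar. A minimal sketch of the two readings, using random stand-in vectors (illustrative only, not real DPR embeddings):

```python
import numpy as np

rng = np.random.default_rng(0)
q = rng.random(768, dtype=np.float32)  # stand-in question embedding
p = rng.random(768, dtype=np.float32)  # stand-in passage embedding

# What a flat L2 FAISS index reports: squared distance, lower = closer.
l2_sq = float(np.sum((q - p) ** 2))

# What DPR is trained to maximize: inner product, higher = more similar.
ip = float(q @ p)

print(f"squared L2 distance: {l2_sq:.2f}, inner product: {ip:.2f}")
```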
main.py → base_model/main.py
RENAMED

```diff
@@ -1,14 +1,15 @@
-from …
+from retriever import Retriever
+
 
 if __name__ == '__main__':
     # Initialize retriever
     r = Retriever()
 
     # Retrieve example
-    …
-    "…
+    scores, result = r.retrieve(
+        "What is the perplexity of a language model?")
 
-    for i, …
-        print(f"Result {i+1} (score: {score…
-        print(result['text'][…
+    for i, score in enumerate(scores):
+        print(f"Result {i+1} (score: {score:.02f}):")
+        print(result['text'][i])
         print()  # Newline
```
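A note on the `result['text'][i]` indexing above: `retrieve` returns the output of `datasets`' `get_nearest_examples` unchanged, i.e. an array of k scores plus a dict mapping each dataset column to a list of k field values. A sketch of an equivalent loop written with `zip`, assuming it is run from `base_model/` like `main.py`:

```python
from retriever import Retriever

r = Retriever()
scores, result = r.retrieve("What is the perplexity of a language model?")

# result is a dict of columns; result["text"] holds the k retrieved
# passages, aligned index-by-index with the scores array.
for rank, (score, passage) in enumerate(zip(scores, result["text"]), start=1):
    print(f"Result {rank} (score: {score:.02f}):")
    print(passage)
    print()
```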
base_model/retriever.py
CHANGED

```diff
@@ -1,10 +1,21 @@
-from transformers import …
-…
+from transformers import (
+    DPRContextEncoder,
+    DPRContextEncoderTokenizer,
+    DPRQuestionEncoder,
+    DPRQuestionEncoderTokenizer,
+)
 from datasets import load_dataset
 import torch
+import os.path
 
+# Hacky fix for FAISS error on macOS
+# See https://stackoverflow.com/a/63374568/4545692
+import os
 
-class Retriever():
+os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
+
+
+class Retriever:
     """A class used to retrieve relevant documents based on some query.
     based on https://huggingface.co/docs/datasets/faiss_es#faiss.
     """
@@ -21,47 +32,64 @@ class Retriever():
 
         # Context encoding and tokenization
         self.ctx_encoder = DPRContextEncoder.from_pretrained(
-            "facebook/dpr-ctx_encoder-single-nq-base"…
+            "facebook/dpr-ctx_encoder-single-nq-base"
+        )
         self.ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
-            "facebook/dpr-ctx_encoder-single-nq-base"…
+            "facebook/dpr-ctx_encoder-single-nq-base"
+        )
 
         # Question encoding and tokenization
         self.q_encoder = DPRQuestionEncoder.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"…
+            "facebook/dpr-question_encoder-single-nq-base"
+        )
         self.q_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
-            "facebook/dpr-question_encoder-single-nq-base"…
+            "facebook/dpr-question_encoder-single-nq-base"
+        )
 
         # Dataset building
         self.dataset = self.__init_dataset(dataset)
 
-    def __init_dataset(self, …
+    def __init_dataset(self,
+                       dataset: str,
+                       fname: str = "./models/paragraphs_embedding.faiss"):
         """Loads the dataset and adds FAISS embeddings.
 
         Args:
             dataset (str): A HuggingFace dataset name.
+            fname (str): The name to use to save the embeddings to disk for
+                faster loading after the first run.
 
         Returns:
             Dataset: A dataset with a new column 'embeddings' containing FAISS
                 embeddings.
         """
-        # TODO: save ds w/ embeddings to disk and retrieve it if it already exists
-
         # Load dataset
-        ds = load_dataset(dataset, name=…
-        …
-        #…
-        …
+        ds = load_dataset(dataset, name="paragraphs")["train"]
 
-        …
-        ds_with_embeddings.add_faiss_index(column='embeddings')
-        return ds_with_embeddings
+        if os.path.exists(fname):
+            # If we already have FAISS embeddings, load them from disk
+            ds.load_faiss_index('embeddings', fname)
+            return ds
+        else:
+            # If there are no FAISS embeddings, generate them
+            def embed(row):
+                # Inline helper function to perform embedding
+                p = row["text"]
+                tok = self.ctx_tokenizer(
+                    p, return_tensors="pt", truncation=True)
+                enc = self.ctx_encoder(**tok)[0][0].numpy()
+                return {"embeddings": enc}
+
+            # Add FAISS embeddings
+            ds_with_embeddings = ds.map(embed)
+
+            ds_with_embeddings.add_faiss_index(column="embeddings")
 
+            # Save dataset w/ embeddings
+            os.makedirs("./models/", exist_ok=True)
+            ds_with_embeddings.save_faiss_index("embeddings", fname)
 
+            return ds_with_embeddings
 
     def retrieve(self, query: str, k: int = 5):
         """Retrieve the top k matches for a search query.
@@ -77,10 +105,11 @@ class Retriever():
 
         def embed(q):
             # Inline helper function to perform embedding
-            tok = self.q_tokenizer(q, return_tensors=…
+            tok = self.q_tokenizer(q, return_tensors="pt", truncation=True)
             return self.q_encoder(**tok)[0][0].numpy()
 
         question_embedding = embed(query)
         scores, results = self.dataset.get_nearest_examples(
-            …
+            "embeddings", question_embedding, k=k
+        )
         return scores, results
```
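Related to the scoring caveat noted under the README example: `add_faiss_index(column="embeddings")` with no further arguments builds an L2-distance index, while DPR encoders are trained with an inner-product objective. `datasets` exposes a `metric_type` parameter that is forwarded to FAISS, so one possible refinement (not part of this commit) is to index with inner-product scoring instead. A minimal self-contained sketch with a hypothetical two-passage dataset standing in for the real paragraphs set:

```python
import faiss
import numpy as np
from datasets import Dataset

# Hypothetical stand-in dataset; the real one comes from load_dataset.
ds = Dataset.from_dict({
    "text": ["passage one", "passage two"],
    "embeddings": np.random.rand(2, 768).astype("float32").tolist(),
})

# Build the index with inner-product scoring, matching the objective DPR
# embeddings were trained with: larger scores then mean more similar.
ds.add_faiss_index(column="embeddings", metric_type=faiss.METRIC_INNER_PRODUCT)

query = np.random.rand(768).astype("float32")
scores, results = ds.get_nearest_examples("embeddings", query, k=2)
print(scores, results["text"])  # best match (largest inner product) first
```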
poetry.lock
CHANGED

```diff
@@ -51,6 +51,18 @@ docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"]
 tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"]
 tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"]
 
+[[package]]
+name = "autopep8"
+version = "1.6.0"
+description = "A tool that automatically formats Python code to conform to the PEP 8 style guide"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pycodestyle = ">=2.8.0"
+toml = "*"
+
 [[package]]
 name = "certifi"
 version = "2021.10.8"
@@ -460,6 +472,14 @@ python-versions = "*"
 docs = ["sphinx", "sphinx-rtd-theme", "setuptools-rust"]
 testing = ["pytest", "requests", "numpy", "datasets"]
 
+[[package]]
+name = "toml"
+version = "0.10.2"
+description = "Python Library for Tom's Obvious, Minimal Language"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
 [[package]]
 name = "torch"
 version = "1.11.0"
@@ -590,7 +610,7 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "…
+content-hash = "227b922ee14abf36ca75bb238d239d712bed9213d54c567996566d465e465733"
 
 [metadata.files]
 aiohttp = [
@@ -679,6 +699,10 @@ attrs = [
     {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"},
     {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"},
 ]
+autopep8 = [
+    {file = "autopep8-1.6.0-py2.py3-none-any.whl", hash = "sha256:ed77137193bbac52d029a52c59bec1b0629b5a186c495f1eb21b126ac466083f"},
+    {file = "autopep8-1.6.0.tar.gz", hash = "sha256:44f0932855039d2c15c4510d6df665e4730f2b8582704fa48f9c55bd3e17d979"},
+]
 certifi = [
     {file = "certifi-2021.10.8-py2.py3-none-any.whl", hash = "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"},
     {file = "certifi-2021.10.8.tar.gz", hash = "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872"},
@@ -1161,6 +1185,10 @@ tokenizers = [
     {file = "tokenizers-0.11.6-cp39-cp39-win_amd64.whl", hash = "sha256:b28966c68a2cdecd5120f4becea159eebe0335b8202e21e292eb381031026edc"},
     {file = "tokenizers-0.11.6.tar.gz", hash = "sha256:562b2022faf0882586c915385620d1f11798fc1b32bac55353a530132369a6d0"},
 ]
+toml = [
+    {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
+    {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
+]
 torch = [
     {file = "torch-1.11.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:62052b50fffc29ca7afc0c04ef8206b6f1ca9d10629cb543077e12967e8d0398"},
     {file = "torch-1.11.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:866bfba29ac98dec35d893d8e17eaec149d0ac7a53be7baae5c98069897db667"},
```
pyproject.toml
CHANGED

```diff
@@ -14,6 +14,7 @@ faiss-cpu = "^1.7.2"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^4.0.1"
+autopep8 = "^1.6.0"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
```