Spaces:

fracapuano
/

AISandbox

Runtime error

App Files Files Community

fracapuano committed on Aug 30, 2023

Commit

02556c2

1 Parent(s): acbe90b

fix: major code restructuring

Browse files

Files changed (1) hide show

qa/utils.py +58 -29

qa/utils.py CHANGED Viewed

@@ -7,7 +7,7 @@ from langchain.llms import OpenAI
 from langchain.docstore.document import Document
 from langchain.vectorstores import FAISS, VectorStore
 import docx2txt
-from typing import List, Dict, Any, Union, Text, Tuple
 import re
 from io import BytesIO
 import streamlit as st
@@ -15,12 +15,38 @@ from .prompts import STUFF_PROMPT
 from pypdf import PdfReader
 from openai.error import AuthenticationError
 class HashDocument(Document):
     """A document that uses the page content as the hash."""
     def __hash__(self):
         content = self.page_content + "".join(self.metadata[k] for k in self.metadata.keys())
         return hash(content)
 @st.cache_data
 def parse_docx(file: BytesIO) -> str:
     text = docx2txt.process(file)
@@ -43,7 +69,6 @@ def parse_pdf(file: BytesIO) -> List[str]:
         text = re.sub(r"\n\s*\n", "\n\n", text)
         output.append(text)
     return output
@@ -54,6 +79,19 @@ def parse_txt(file: BytesIO) -> str:
     text = re.sub(r"\n\s*\n", "\n\n", text)
     return text
 @st.cache_data
 def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
@@ -61,10 +99,13 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
     Converts a string or frozenset of strings to a list of Documents
     with metadata.
     """
-    if isinstance(text, str):
-        # Take a single string as one page
-        text = tuple([text])
-    elif isinstance(text, tuple):
         # map each page into a document instance
         page_docs = [HashDocument(page_content=page) for page in text]
         # Add page numbers as metadata
@@ -72,52 +113,40 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
             doc.metadata["page"] = i + 1
         # Split pages into chunks
         doc_chunks = []
-        # text splitter to split the text into chunks
-        text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=800,
-                separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
-                chunk_overlap=20,  # minimal overlap to capture sematic overlap across chunks
-            )
         for doc in page_docs:
             chunks = text_splitter.split_text(doc.page_content)
             for i, chunk in enumerate(chunks):
                 # Create a new document for each individual chunk
                 doc = HashDocument(
                     page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
                 )
-                # Add sources a metadata
                 doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
                 doc_chunks.append(doc)
         return doc_chunks
-    else:
-        raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
 @st.cache_data
 def embed_docs(_docs: Tuple[Document]) -> VectorStore:
     """Embeds a list of Documents and returns a FAISS index"""
-    docs = _docs
-    if not st.session_state.get("OPENAI_API_KEY"):
-        raise AuthenticationError(
-            "Enter your OpenAI API key in the sidebar. You can get a key at https://platform.openai.com/account/api-keys."
-        )
-    else:
-        # Embed the chunks
-        embeddings = OpenAIEmbeddings(openai_api_key=st.session_state.get("OPENAI_API_KEY"))
-        index = FAISS.from_documents(list(docs), embeddings)
-        return index
 @st.cache_data
-def search_docs(_index: VectorStore, query: str) -> List[Document]:
     """Searches a FAISS index for similar chunks to the query
     and returns a list of Documents."""
     # Search for similar chunks
-    docs = _index.similarity_search(query, k=5)
     return docs

 from langchain.docstore.document import Document
 from langchain.vectorstores import FAISS, VectorStore
 import docx2txt
+from typing import List, Dict, Any, Union, Text, Tuple, Iterable
 import re
 from io import BytesIO
 import streamlit as st
 from pypdf import PdfReader
 from openai.error import AuthenticationError
class PDFFile:
    """A PDF file class for typing purposes."""

    @staticmethod
    def is_pdf(file: Any) -> bool:
        """Return True when ``file.name`` ends with ``.pdf``.

        Declared as a static method: the helper takes only the file object,
        so a ``@classmethod`` would bind the class itself to ``file`` and
        make ``PDFFile.is_pdf(uploaded)`` fail with a TypeError.

        Args:
            file: Any object exposing a string ``name`` attribute.
        """
        return file.name.endswith(".pdf")
class DocxFile:
    """A Docx file class for typing purposes."""

    @staticmethod
    def is_docx(file: Any) -> bool:
        """Return True when ``file.name`` ends with ``.docx``.

        Declared as a static method: the helper takes only the file object,
        so a ``@classmethod`` would bind the class itself to ``file`` and
        make ``DocxFile.is_docx(uploaded)`` fail with a TypeError.

        Args:
            file: Any object exposing a string ``name`` attribute.
        """
        return file.name.endswith(".docx")
class TxtFile:
    """A Txt file class for typing purposes."""

    @staticmethod
    def is_txt(file: Any) -> bool:
        """Return True when ``file.name`` ends with ``.txt``.

        Declared as a static method: the helper takes only the file object,
        so a ``@classmethod`` would bind the class itself to ``file`` and
        make ``TxtFile.is_txt(uploaded)`` fail with a TypeError.

        Args:
            file: Any object exposing a string ``name`` attribute.
        """
        return file.name.endswith(".txt")
class CodeFile:
    """A scripting-file class for typing purposes."""

    @staticmethod
    def is_code(file: Any) -> bool:
        """Return True when ``file.name`` ends with a recognised code extension.

        Recognised extensions are ``.py``, ``.json``, ``.html``, ``.css`` and
        ``.md``. The suffix is tested with ``str.endswith`` because splitting
        on ``"."`` drops the dot (so ``"py"`` never equals ``".py"``), raises
        IndexError for names without a dot, and picks the wrong segment for
        names containing several dots.

        Declared as a static method: the helper takes only the file object,
        so a ``@classmethod`` would bind the class itself to ``file``.

        Args:
            file: Any object exposing a string ``name`` attribute.
        """
        return file.name.endswith((".py", ".json", ".html", ".css", ".md"))
 class HashDocument(Document):
     """A document hashed from its page content plus its metadata values."""

     def __hash__(self):
         """Return the hash of the page content joined with every metadata value.

         Metadata values are passed through ``str()`` before joining because
         they are not guaranteed to be strings (this module stores the page
         number as an int), and ``"".join`` raises TypeError on non-string
         items. For all-string metadata the resulting hash is unchanged.
         """
         content = self.page_content + "".join(str(value) for value in self.metadata.values())
         return hash(content)
 @st.cache_data
 def parse_docx(file: BytesIO) -> str:
     text = docx2txt.process(file)
         text = re.sub(r"\n\s*\n", "\n\n", text)
         output.append(text)
     return output
     text = re.sub(r"\n\s*\n", "\n\n", text)
     return text
@st.cache_data
def get_text_splitter(
    chunk_size: int = 500,
    chunk_overlap: int = 50,
    separators: Iterable[Text] = ("\n\n", "\n", ".", "!", "?", ",", " ", ""),
) -> RecursiveCharacterTextSplitter:
    """Return a text splitter instance with the given parameters.

    The function is wrapped in ``st.cache_data``.

    Args:
        chunk_size: Passed to the splitter as ``chunk_size``; a limited chunk
            size ensures smaller chunks and more precise answers.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``; a minimal
            overlap to capture semantic overlap across chunks.
        separators: Separators to split the text on. The default is an
            immutable tuple so that no mutable object is shared between calls.

    Returns:
        A ``RecursiveCharacterTextSplitter`` built from the arguments.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        # Hand the splitter its own list so it never aliases the caller's
        # iterable or the shared default.
        separators=list(separators),
        chunk_overlap=chunk_overlap,
    )
 @st.cache_data
 def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
     Converts a string or frozenset of strings to a list of Documents
     with metadata.
     """
+    # sanity check on the input provided
+    if not isinstance(text, (str, tuple)):
+        raise ValueError("Text must be either a string or a list of strings. Got: {type(text)}")
+    elif isinstance(text, str):
+        # Take a single string as one page - make it a tuple so that is hashable
+        text = (text, )
+    if isinstance(text, tuple):
         # map each page into a document instance
         page_docs = [HashDocument(page_content=page) for page in text]
         # Add page numbers as metadata
             doc.metadata["page"] = i + 1
         # Split pages into chunks
         doc_chunks = []
+        # Get the text splitter
+        text_splitter = get_text_splitter()
         for doc in page_docs:
+            # this splits the page into chunks
             chunks = text_splitter.split_text(doc.page_content)
             for i, chunk in enumerate(chunks):
                 # Create a new document for each individual chunk
                 doc = HashDocument(
                     page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
                 )
+                # Add sources to metadata for retrieval later on
                 doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
                 doc_chunks.append(doc)
         return doc_chunks
 @st.cache_data
 def embed_docs(_docs: Tuple[Document]) -> VectorStore:
     """Build and return a FAISS index from the given documents.

     The embedding model is an ``OpenAIEmbeddings`` instance configured with
     whatever ``st.session_state`` holds under ``"OPENAI_API_KEY"``.
     """
     api_key = st.session_state.get("OPENAI_API_KEY")
     embedder = OpenAIEmbeddings(openai_api_key=api_key)
     # FAISS.from_documents is given a list copy of the incoming documents
     return FAISS.from_documents(list(_docs), embedder)
 @st.cache_data
 def search_docs(_index: VectorStore, query: str, k: int = 5) -> List[Document]:
     """Run a similarity search for ``query`` on ``_index``.

     ``k`` is forwarded to ``similarity_search``; the list of Documents it
     returns is handed back unchanged.
     """
     return _index.similarity_search(query, k=k)