Spaces:
Runtime error
Runtime error
fracapuano
committed on
Commit
·
cda0f94
1
Parent(s):
51a7497
add: multi-chunksize splitter for better semantic precision
Browse files- qa/utils.py +18 -18
qa/utils.py
CHANGED
|
@@ -137,22 +137,22 @@ def text_to_docs(pages: Union[Text, Tuple[Text]], **kwargs) -> List[HashDocument
|
|
| 137 |
|
| 138 |
# Split pages into chunks
|
| 139 |
doc_chunks = []
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
|
| 157 |
return doc_chunks
|
| 158 |
|
|
@@ -193,8 +193,8 @@ def get_answer(
|
|
| 193 |
chain = load_qa_with_sources_chain(
|
| 194 |
ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
|
| 195 |
chain_type="stuff",
|
| 196 |
-
prompt=STUFF_PROMPT
|
| 197 |
-
|
| 198 |
# chain_type_kwargs={
|
| 199 |
# "verbose": True,
|
| 200 |
# "prompt": query,
|
|
|
|
| 137 |
|
| 138 |
# Split pages into chunks
|
| 139 |
doc_chunks = []
|
| 140 |
+
for ntokens in [50,250,500,750]:
|
| 141 |
+
# Get the text splitter
|
| 142 |
+
text_splitter = get_text_splitter(chunk_size=ntokens, chunk_overlap=ntokens//10)
|
| 143 |
+
for doc in page_docs:
|
| 144 |
+
# this splits the page into chunks
|
| 145 |
+
chunks = text_splitter.split_text(doc.page_content)
|
| 146 |
+
for i, chunk in enumerate(chunks):
|
| 147 |
+
# Create a new document for each individual chunk
|
| 148 |
+
new_doc = HashDocument(
|
| 149 |
+
page_content=chunk,
|
| 150 |
+
metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
|
| 151 |
+
)
|
| 152 |
+
# Add sources to metadata for retrieval later on
|
| 153 |
+
new_doc.metadata["source"] = \
|
| 154 |
+
f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}/Chunksize-{ntokens}"
|
| 155 |
+
doc_chunks.append(new_doc)
|
| 156 |
|
| 157 |
return doc_chunks
|
| 158 |
|
|
|
|
| 193 |
chain = load_qa_with_sources_chain(
|
| 194 |
ChatOpenAI(temperature=0, openai_api_key=st.session_state.get("OPENAI_API_KEY"), model=model, streaming=stream_answer),
|
| 195 |
chain_type="stuff",
|
| 196 |
+
prompt=STUFF_PROMPT,
|
| 197 |
+
verbose=True,
|
| 198 |
# chain_type_kwargs={
|
| 199 |
# "verbose": True,
|
| 200 |
# "prompt": query,
|