req added, changed spacy to recursive
- app.py +6 -6
- requirements.txt +4 -0
app.py CHANGED

@@ -9,7 +9,7 @@ import requests
 import logging
 
 from aiohttp import ClientSession
-from langchain.text_splitter import SpacyTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from datasets import Dataset, load_dataset
 from tqdm import tqdm
 from tqdm.asyncio import tqdm_asyncio

@@ -26,8 +26,8 @@ class Chunker:
     def __init__(self, strategy, split_seq=".", chunk_len=512):
         self.split_seq = split_seq
         self.chunk_len = chunk_len
-        if strategy == "spacy":
-            self.split = SpacyTextSplitter().split_text
+        if strategy == "recursive":
+            self.split = RecursiveCharacterTextSplitter().split_text
         if strategy == "sequence":
             self.split = self.seq_splitter
         if strategy == "constant":
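As an aside between the hunks: a minimal sketch of what the new "recursive" branch does at runtime, assuming the langchain 0.0.x pin below. The class name and sample text are illustrative stand-ins, not code from app.py; as in the diff, the splitter is constructed with its defaults, so chunk_len is not forwarded to it.

# Minimal sketch (not app.py itself): the "recursive" strategy simply binds
# RecursiveCharacterTextSplitter().split_text as the chunking function.
from langchain.text_splitter import RecursiveCharacterTextSplitter

class RecursiveChunkerSketch:
    def __init__(self, strategy, split_seq=".", chunk_len=512):
        self.split_seq = split_seq
        self.chunk_len = chunk_len
        if strategy == "recursive":
            # Default splitter settings, mirroring the diff; chunk_len is not used here.
            self.split = RecursiveCharacterTextSplitter().split_text

text = "Retrieval corpora are usually chunked before embedding. " * 100
chunks = RecursiveChunkerSketch("recursive").split(text)
print(len(chunks), len(chunks[0]))

The remaining two hunks update the Gradio UI to match the renamed strategy.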
@@ -138,7 +138,7 @@ def run_embed(input_ds, input_splits, embed_in_text_col, output_ds, tei_url, pri
 
 
 def change_dropdown(choice):
-    if choice == "spacy" or choice == "sequence":
+    if choice == "recursive" or choice == "sequence":
         return [
             gr.Textbox(visible=True),
             gr.Textbox(visible=False)

@@ -166,8 +166,8 @@ with gr.Blocks() as demo:
         chunk_private = gr.Checkbox(label="Make chunked dataset private")
         with gr.Row():
             dropdown = gr.Dropdown(
-                ["spacy", "sequence", "constant"], label="Chunking strategy",
-                info="'spacy' uses a spacy tokenizer, 'sequence' splits texts by a chosen sequence, "
+                ["recursive", "sequence", "constant"], label="Chunking strategy",
+                info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
                 "'constant' makes chunks of the constant size",
                 scale=2
             )
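Taken together, the UI hunks rename the strategy everywhere the user sees it. A hedged sketch of how the dropdown typically drives the two textboxes follows; the component names split_seq_box and chunk_len_box, their labels, the fallback branch, and the .change() wiring are assumptions for illustration. Only the shown if branch and the Dropdown arguments come from the diff.

# Hedged sketch of the dropdown/visibility wiring; component names and the
# fallback branch are assumed, the rest mirrors the changed lines.
import gradio as gr

def change_dropdown(choice):
    if choice == "recursive" or choice == "sequence":
        # Show the split-sequence field, hide the chunk-length field.
        return [gr.Textbox(visible=True), gr.Textbox(visible=False)]
    # Assumed fallback for "constant": reverse the visibility.
    return [gr.Textbox(visible=False), gr.Textbox(visible=True)]

with gr.Blocks() as demo:
    with gr.Row():
        dropdown = gr.Dropdown(
            ["recursive", "sequence", "constant"], label="Chunking strategy",
            info="'recursive' uses a Langchain recursive tokenizer, 'sequence' splits texts by a chosen sequence, "
            "'constant' makes chunks of the constant size",
            scale=2,
        )
        split_seq_box = gr.Textbox(label="Split sequence", value=".")                # assumed name/label
        chunk_len_box = gr.Textbox(label="Chunk length", value="512", visible=False)  # assumed name/label
    dropdown.change(change_dropdown, dropdown, [split_seq_box, chunk_len_box])

# demo.launch()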
requirements.txt ADDED

@@ -0,0 +1,4 @@
+langchain==0.0.*
+aiohttp==3.8.*
+datasets==2.16.*
+numpy==1.25.*
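A small, hypothetical smoke test for these pins, to be run after pip install -r requirements.txt; the patch versions pip resolves for the wildcard pins may differ between environments.

# Hypothetical smoke test: confirm the pinned packages import and report their versions.
import aiohttp
import datasets
import langchain
import numpy

for mod in (langchain, aiohttp, datasets, numpy):
    print(f"{mod.__name__}=={mod.__version__}")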