Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,12 +4,15 @@ import gradio as gr
|
|
| 4 |
# from concurrent.futures import ThreadPoolExecutor
|
| 5 |
import pdfplumber
|
| 6 |
import pandas as pd
|
|
|
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer, models, util
|
| 8 |
word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
|
| 9 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
| 10 |
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
| 11 |
-
|
| 12 |
-
|
|
|
|
| 13 |
headers = {
|
| 14 |
'Content-Type': 'application/json',
|
| 15 |
}
|
|
@@ -41,7 +44,7 @@ def doc_emb(doc: str):
|
|
| 41 |
# emb_list.append(f.result())
|
| 42 |
print('\n'.join(texts))
|
| 43 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
| 44 |
-
value="""操作说明 step 3:PDF
|
| 45 |
|
| 46 |
|
| 47 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
@@ -89,7 +92,7 @@ def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
| 89 |
req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
|
| 90 |
data = {"content": json.dumps(req_json)}
|
| 91 |
print('data:\n', req_json)
|
| 92 |
-
result = requests.post(url=
|
| 93 |
data=json.dumps(data),
|
| 94 |
headers=headers
|
| 95 |
)
|
|
@@ -107,6 +110,17 @@ def up_file(files):
|
|
| 107 |
# 读取PDF文档第i+1页
|
| 108 |
page = pdf.pages[i]
|
| 109 |
res_list = page.extract_text().split('\n')[:-1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
tables = page.extract_tables()
|
| 111 |
for table in tables:
|
| 112 |
# 第一列当成表头:
|
|
@@ -124,7 +138,7 @@ def up_file(files):
|
|
| 124 |
print(i)
|
| 125 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
| 126 |
visible=True), gr.Markdown.update(
|
| 127 |
-
value="操作说明 step 2:确认PDF
|
| 128 |
|
| 129 |
|
| 130 |
with gr.Blocks() as demo:
|
|
|
|
| 4 |
# from concurrent.futures import ThreadPoolExecutor
|
| 5 |
import pdfplumber
|
| 6 |
import pandas as pd
|
| 7 |
+
import time
|
| 8 |
+
from cnocr import CnOcr
|
| 9 |
from sentence_transformers import SentenceTransformer, models, util
|
| 10 |
word_embedding_model = models.Transformer('uer/sbert-base-chinese-nli', do_lower_case=True)
|
| 11 |
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode='cls')
|
| 12 |
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
|
| 13 |
+
ocr = CnOcr()
|
| 14 |
+
# chat_url = 'https://souljoy-my-api.hf.space/sale'
|
| 15 |
+
chat_url = 'https://souljoy-my-api.hf.space/chatpdf'
|
| 16 |
headers = {
|
| 17 |
'Content-Type': 'application/json',
|
| 18 |
}
|
|
|
|
| 44 |
# emb_list.append(f.result())
|
| 45 |
print('\n'.join(texts))
|
| 46 |
return texts, emb_list, gr.Textbox.update(visible=True), gr.Button.update(visible=True), gr.Markdown.update(
|
| 47 |
+
value="""操作说明 step 3:PDF解析提交成功! 🙋 可以开始对话啦~"""), gr.Chatbot.update(visible=True)
|
| 48 |
|
| 49 |
|
| 50 |
def get_response(msg, bot, doc_text_list, doc_embeddings):
|
|
|
|
| 92 |
req_json['doc'] = '' if len(sub_doc_list) == 0 else '\n'.join(sub_doc_list)
|
| 93 |
data = {"content": json.dumps(req_json)}
|
| 94 |
print('data:\n', req_json)
|
| 95 |
+
result = requests.post(url=chat_url,
|
| 96 |
data=json.dumps(data),
|
| 97 |
headers=headers
|
| 98 |
)
|
|
|
|
| 110 |
# 读取PDF文档第i+1页
|
| 111 |
page = pdf.pages[i]
|
| 112 |
res_list = page.extract_text().split('\n')[:-1]
|
| 113 |
+
|
| 114 |
+
for j in range(len(page.images)):
|
| 115 |
+
# 获取图片的二进制流
|
| 116 |
+
img = page.images[j]
|
| 117 |
+
file_name = '{}-{}-{}.png'.format(str(time.time()), str(i), str(j))
|
| 118 |
+
with open(file_name, mode='wb') as f:
|
| 119 |
+
f.write(img['stream'].get_data())
|
| 120 |
+
res = ocr.ocr(file_name)
|
| 121 |
+
if len(res) > 0:
|
| 122 |
+
res_list.append(' '.join([re['text'] for re in res]))
|
| 123 |
+
|
| 124 |
tables = page.extract_tables()
|
| 125 |
for table in tables:
|
| 126 |
# 第一列当成表头:
|
|
|
|
| 138 |
print(i)
|
| 139 |
return gr.Textbox.update(value='\n'.join(doc_text_list), visible=True), gr.Button.update(
|
| 140 |
visible=True), gr.Markdown.update(
|
| 141 |
+
value="操作说明 step 2:确认PDF解析结果(可修正),点击“提交解析结果”,随后进行对话")
|
| 142 |
|
| 143 |
|
| 144 |
with gr.Blocks() as demo:
|