File size: 6,257 Bytes
fe0ce42 9fa6897 fe0ce42 9fa6897 fe0ce42 9fa6897 fe0ce42 9fa6897 fe0ce42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
import requests
import logging
import json
import tiktoken
import gradio as gr
from typing import Any, List
from langchain.schema import Document
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils import json_validator, fetch_chat
class LatexTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Latex-formatted layout elements."""

    def __init__(self, **kwargs: Any):
        """Initialize a LatexTextSplitter.

        Args:
            **kwargs: Forwarded to ``RecursiveCharacterTextSplitter``
                (e.g. ``chunk_size``, ``chunk_overlap``).
        """
        # Raw strings are required: "\c", "\s", "\b" are invalid escape
        # sequences in ordinary string literals (DeprecationWarning now,
        # SyntaxError in future CPython).
        separators = [
            # First, try to split along Latex sections.
            r"\chapter{",
            r"\section{",
            r"\subsection{",
            r"\subsubsection{",
            # Then split by environments.
            # BUG FIX: the original list was missing a comma here, so
            # "\begin{" implicitly concatenated with the " " literal below,
            # yielding the bogus separator "\begin{ " and silently dropping
            # the single-space fallback separator.
            r"\begin{",
            # Finally fall back to words, then individual characters.
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)
class Suggest():
    """Generates LLM-powered improvement suggestions for a LaTeX paper.

    Splits the document into chunks, asks an OpenAI chat model to review
    each chunk, and exposes the results as Gradio UI component updates.
    """

    def __init__(self, max_ideas: int, model: str = "gpt-3.5-turbo"):
        """
        Args:
            max_ideas: Upper bound on suggestions surfaced in the UI.
            model: OpenAI chat model name; also selects the tiktoken encoder.
        """
        self.max_ideas = max_ideas
        self.encoder = tiktoken.encoding_for_model(model)
        self.model = model
        self.idea_list = []
        # Sample document bundled with the app (shown as a demo input).
        with open("./sample/sample.tex", "r") as f:
            self.sample_content = f.read()

    def split_chunk(self, latex_whole_document: str, chunk_size: int = 2000, retry: int = 5) -> List[Document]:
        """Split LaTeX source into chunks, halving chunk_size on each failure.

        Args:
            latex_whole_document: Full LaTeX source text.
            chunk_size: Initial target chunk size (capped at document length).
            retry: Maximum number of split attempts.

        Returns:
            List of langchain ``Document`` chunks.

        Raises:
            Exception: If splitting still fails after ``retry`` attempts.
        """
        chunk_size = min(chunk_size, len(latex_whole_document))
        for _ in range(retry):
            try:
                latex_splitter = LatexTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=0,
                )
                return latex_splitter.create_documents([latex_whole_document])
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate instead of being retried.
            except Exception:
                chunk_size = chunk_size // 2
        raise Exception("Latex document split check failed.")

    def analyze(self, latex_whole_document: str, openai_key: str, progress: gr.Progress):
        """Review each chunk of the document and collect suggestions.

        Args:
            latex_whole_document: Full LaTeX source text.
            openai_key: OpenAI API key used by ``fetch_chat``.
            progress: Gradio progress tracker for UI feedback.

        Returns:
            List of suggestion dicts (title/thought/action/original/improved),
            at most ``self.max_ideas`` plus the overflow of the final chunk.

        Raises:
            gr.Error: If no suggestions could be generated at all.
        """
        logging.info("start analysis")
        docs = self.split_chunk(latex_whole_document)
        progress(0.05)

        output_format = """
```json
[
\\ Potential point for improvement 1
{{
"title": string \\ What this modification is about
"thought": string \\ The reason why this should be improved
"action": string \\ how to make improvement
"original": string \\ the original latex snippet that can be improved
"improved": string \\ the improved latex snippet which address your point
}},
{{}}
]
```
"""
        ideas = []
        for doc in progress.tqdm(docs):
            prompt = f"""
I'm a computer science student.
You are my editor.
Your goal is to improve my paper quality at your best.
```
{doc.page_content}
```
The above is a segment of my research paper. If the end of the segment is not complete, just ignore it.
Point out the parts that can be improved.
Focus on grammar, writing, content, section structure.
Ignore comments and those that are outside the document environment.
List out all the points with a latex snippet which is the improved version addressing your point.
Same paragraph should be only address once.
Output the response in the following valid json format:
{output_format}
"""
            idea = fetch_chat(prompt, openai_key, model=self.model)
            # json_validator returns a parsed list on success; anything else
            # (e.g. an error string) is skipped rather than aborting the run.
            idea = json_validator(idea, openai_key)
            if isinstance(idea, list):
                ideas += idea
                if len(ideas) >= self.max_ideas:
                    break
            else:
                # raise gr.Error(idea)
                continue

        if not ideas:
            raise gr.Error('No suggestions generated.')
        logging.info('complete analysis')
        return ideas

    def read_file(self, f: str):
        """Extract text from an uploaded .pdf or .tex file.

        Args:
            f: Gradio file object (has a ``.name`` path attribute) or None.

        Returns:
            Extracted text, "" for None, or an unsupported-format notice.
        """
        if f is None:
            return ""
        elif f.name.endswith('pdf'):
            loader = UnstructuredPDFLoader(f.name)
            pages = loader.load_and_split()
            return "\n".join([p.page_content for p in pages])
        elif f.name.endswith('tex'):
            # Renamed the inner handle: the original shadowed the parameter
            # `f` inside its own `with open(...) as f` block.
            with open(f.name, "r") as tex_file:
                return tex_file.read()
        else:
            return "Only support .tex & .pdf"

    def generate(self, txt: str, openai_key: str, progress=gr.Progress()):
        """Run the analysis and build the Gradio component updates.

        NOTE(review): the mutable default `progress=gr.Progress()` is the
        Gradio-documented idiom for progress injection, so it is kept as-is.

        Args:
            txt: The LaTeX source to analyze.
            openai_key: OpenAI API key; must be non-empty.
            progress: Gradio progress tracker.

        Returns:
            List of Gradio update dicts: header textbox, analyze button,
            four detail textboxes, then ``max_ideas`` suggestion buttons
            (hidden ones padded at the end).

        Raises:
            gr.Error: On missing key or any failure during analysis.
        """
        if not openai_key:
            raise gr.Error("Please provide openai key !")
        try:
            idea_list = self.analyze(txt, openai_key, progress)
            self.idea_list = idea_list
            # One visible button per suggestion, capped at max_ideas;
            # pad with hidden buttons so the output count is constant.
            idea_buttons = [
                gr.Button.update(visible=True, value=idea['title'])
                for idea in idea_list[:self.max_ideas]
            ]
            idea_buttons += [
                gr.Button.update(visible=False)
            ] * (self.max_ideas - len(idea_buttons))
            idea_details = [
                gr.Textbox.update(value="", label="thought", visible=True),
                gr.Textbox.update(value="", label="action", visible=True),
                gr.Textbox.update(value="", label="original", visible=True, max_lines=5, lines=5),
                gr.Textbox.update(value="", label="improved", visible=True, max_lines=5, lines=5),
            ]
            return [
                gr.Textbox.update("Suggestions", interactive=False, show_label=False),
                gr.Button.update(visible=True, value="Analyze")
            ] + idea_details + idea_buttons
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise gr.Error(str(e)) from e