File size: 6,257 Bytes
fe0ce42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fa6897
fe0ce42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fa6897
 
 
 
 
fe0ce42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9fa6897
fe0ce42
 
 
 
 
 
 
 
 
 
 
 
 
 
9fa6897
fe0ce42
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import requests
import logging
import json
import tiktoken
import gradio as gr
from typing import Any, List
from langchain.schema import Document
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from utils import json_validator, fetch_chat


class LatexTextSplitter(RecursiveCharacterTextSplitter):
    """Attempts to split the text along Latex-formatted layout elements."""

    def __init__(self, **kwargs: Any):
        """Initialize a LatexTextSplitter.

        Args:
            **kwargs: Forwarded unchanged to ``RecursiveCharacterTextSplitter``
                together with the LaTeX-specific ``separators`` list built here.
        """
        # Backslashes are escaped explicitly. In a plain (non-raw) literal
        # "\b" is the backspace character, so "\begin{" would never match
        # LaTeX source, and "\c" / "\s" are invalid escape sequences.
        separators = [
            # First, try to split along Latex sections
            "\\chapter{",
            "\\section{",
            "\\subsection{",
            "\\subsubsection{",

            # Now split by environments
            # (the trailing comma matters: without it this literal is
            # implicitly concatenated with the " " separator below)
            "\\begin{",
            # "\n\\begin{enumerate}",
            # "\n\\begin{itemize}",
            # "\n\\begin{description}",
            # "\n\\begin{list}",
            # "\n\\begin{quote}",
            # "\n\\begin{quotation}",
            # "\n\\begin{verse}",
            # "\n\\begin{verbatim}",

            ## Now split by math environments
            # "\n\\begin{align}",
            # "$$",
            # "$",

            # Now split by the normal type of lines
            " ",
            "",
        ]
        super().__init__(separators=separators, **kwargs)
        

class Suggest():
    """Produce paper-improvement suggestions and expose them as Gradio updates.

    A document is split into chunks, each chunk is sent to a chat model via
    ``fetch_chat``, and the validated replies are collected as a list of
    suggestion dicts. ``generate`` turns that list into component updates.
    """

    def __init__(self, max_ideas: int, model: str = "gpt-3.5-turbo"):
        """Store the configuration and load the bundled sample document.

        Args:
            max_ideas: Number of suggestion buttons the UI provides; also the
                threshold at which ``analyze`` stops querying further chunks.
            model: Model name passed to ``tiktoken`` and to ``fetch_chat``.
        """
        self.max_ideas = max_ideas
        self.encoder = tiktoken.encoding_for_model(model)
        self.model = model
        self.idea_list = []
        with open("./sample/sample.tex", "r", encoding="utf-8") as sample_file:
            self.sample_content = sample_file.read()

    def split_chunk(self, latex_whole_document: str, chunk_size: int = 2000, retry: int = 5) -> List[Document]:
        """Split a document into chunks, halving the chunk size on failure.

        Args:
            latex_whole_document: Full text to split.
            chunk_size: Initial chunk size, capped at the document length.
            retry: Maximum number of split attempts.

        Returns:
            The documents produced by ``LatexTextSplitter.create_documents``.

        Raises:
            Exception: If every attempt fails; the last underlying error is
                attached as the cause.
        """
        # Never start (or retry) with a chunk size of 0, which is meaningless.
        chunk_size = max(1, min(chunk_size, len(latex_whole_document)))

        last_error = None
        for _ in range(retry):
            try:
                latex_splitter = LatexTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=0,
                )
                return latex_splitter.create_documents([latex_whole_document])
            except Exception as err:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt / SystemExit still propagate.
                last_error = err
                logging.warning("split failed with chunk_size=%d: %s", chunk_size, err)
                chunk_size = max(1, chunk_size // 2)

        raise Exception("Latex document split check failed.") from last_error

    def analyze(self, latex_whole_document: str, openai_key: str, progress: gr.Progress):
        """Ask the chat model for improvement suggestions, chunk by chunk.

        Args:
            latex_whole_document: Full text of the paper.
            openai_key: API key forwarded to ``fetch_chat`` and ``json_validator``.
            progress: Gradio progress tracker, used for the progress bar.

        Returns:
            A list of suggestion dicts, each containing at least a ``"title"``
            key. It may hold more than ``max_ideas`` entries, because the
            limit is only checked after a whole chunk has been added.

        Raises:
            gr.Error: If no usable suggestion was produced.
        """
        logging.info("start analysis")
        docs = self.split_chunk(latex_whole_document)
        progress(0.05)

        # This is a plain string, not a format template: it is interpolated
        # into the f-string below as a value, so braces must be written
        # singly. Doubled braces would reach the model literally.
        output_format = """

        ```json
        [
            \\ Potential point for improvement 1
            {
                "title": string \\ What this modification is about
                "thought": string \\ The reason why this should be improved
                "action": string \\ how to make improvement
                "original": string \\ the original latex snippet that can be improved
                "improved": string \\ the improved latex snippet which address your point
            },
            {}
        ]
        ```
        """

        ideas = []
        for doc in progress.tqdm(docs):

            prompt = f"""
            I'm a computer science student.
            You are my editor.
            Your goal is to improve my paper quality at your best.


            ```
            {doc.page_content}
            ```
            The above is a segment of my research paper. If the end of the segment is not complete, just ignore it.
            Point out the parts that can be improved.
            Focus on grammar, writing, content, section structure.
            Ignore comments and those that are outside the document environment.
            List out all the points with a latex snippet which is the improved version addressing your point.
            Same paragraph should be only address once.
            Output the response in the following valid json format:
            {output_format}

            """

            reply = fetch_chat(prompt, openai_key, model=self.model)
            # NOTE(review): json_validator presumably returns the parsed list
            # on success and something else on failure -- confirm in utils.
            parsed = json_validator(reply, openai_key)
            if not isinstance(parsed, list):
                # Best effort: skip chunks whose reply could not be parsed.
                continue

            # Drop placeholder entries such as the empty object the format
            # sample ends with; consumers index suggestions by "title".
            ideas += [
                item for item in parsed
                if isinstance(item, dict) and "title" in item
            ]
            if len(ideas) >= self.max_ideas:
                break

        if not ideas:
            raise gr.Error('No suggestions generated.')

        logging.info('complete analysis')
        return ideas

    def read_file(self, f: Any):
        """Return the text content of an uploaded ``.tex`` or ``.pdf`` file.

        Args:
            f: Object exposing the file path as ``f.name``, or ``None``.

        Returns:
            ``""`` when ``f`` is ``None``, the extracted text for PDF / TeX
            files, otherwise the message ``"Only support .tex & .pdf"``.
        """
        if f is None:
            return ""

        path = f.name
        # Compare case-insensitively so "paper.PDF" / "paper.TEX" are accepted.
        lowered = path.lower()
        if lowered.endswith('pdf'):
            loader = UnstructuredPDFLoader(path)
            pages = loader.load_and_split()
            return "\n".join(p.page_content for p in pages)
        if lowered.endswith('tex'):
            # Use a distinct name so the parameter ``f`` is not rebound.
            with open(path, "r", encoding="utf-8") as tex_file:
                return tex_file.read()
        return "Only support .tex & .pdf"

    def generate(self, txt: str, openai_key: str, progress=gr.Progress()):
        """Run the analysis and build the Gradio updates for the result view.

        Args:
            txt: Document text to analyze.
            openai_key: API key; must be non-empty.
            progress: Gradio progress tracker.

        Returns:
            A list of component updates: the header textbox, the "Analyze"
            button, four detail textboxes (thought, action, original,
            improved) and exactly ``max_ideas`` button updates, where unused
            buttons are hidden.

        Raises:
            gr.Error: If the key is missing or the analysis fails.
        """
        if not openai_key:
            raise gr.Error("Please provide openai key !")

        try:
            idea_list = self.analyze(txt, openai_key, progress)
            self.idea_list = idea_list

            idea_buttons = [
                gr.Button.update(visible=True, value=idea['title'])
                for idea in idea_list[:self.max_ideas]
            ]
            # Pad with hidden buttons so the output count is always max_ideas.
            idea_buttons += [
                gr.Button.update(visible=False)
            ] * (self.max_ideas - len(idea_buttons))

            idea_details = [
                gr.Textbox.update(value="", label="thought", visible=True),
                gr.Textbox.update(value="", label="action", visible=True),
                gr.Textbox.update(value="", label="original", visible=True, max_lines=5, lines=5),
                gr.Textbox.update(value="", label="improved", visible=True, max_lines=5, lines=5),
            ]

            return [
                gr.Textbox.update("Suggestions", interactive=False, show_label=False),
                gr.Button.update(visible=True, value="Analyze")
            ] + idea_details + idea_buttons
        except gr.Error:
            # Already a user-facing error; re-raise without wrapping it again.
            raise
        except Exception as e:
            raise gr.Error(str(e)) from e