EasySci committed on
Commit
564d1d7
·
1 Parent(s): 315e5a1

Upload 6 files

Browse files
src/__pycache__/chains.cpython-311.pyc ADDED
Binary file (6.27 kB). View file
 
src/__pycache__/constants.cpython-311.pyc ADDED
Binary file (9.16 kB). View file
 
src/__pycache__/search.cpython-311.pyc ADDED
Binary file (5.51 kB). View file
 
src/chains.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
2
+ from langchain.indexes import VectorstoreIndexCreator
3
+ from langchain.llms import AzureOpenAI, OpenAI
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
8
+ from langchain.chains.question_answering import load_qa_chain
9
+ from langchain.memory import ConversationBufferMemory
10
+
11
+ from langchain.chat_models import AzureChatOpenAI
12
+
13
+
14
import os
import openai

# Remember the directory the process was started from; it is published as the
# CWD environment variable.
os.environ['CWD'] = os.getcwd()

# for testing
import src.constants as constants
# import constants

# Export the Azure (FR) key, endpoint and API settings as OPENAI_* environment
# variables in a single step.
os.environ.update({
    'OPENAI_API_KEY': constants.AZURE_OPENAI_KEY_FR,
    'OPENAI_API_BASE': constants.AZURE_OPENAI_ENDPOINT_FR,
    'OPENAI_API_VERSION': "2023-05-15",
    'OPENAI_API_TYPE': "azure",
})
# openai.api_type = "azure"
# openai.api_base = constants.AZURE_OPENAI_ENDPOINT_FR
# openai.api_version = "2023-05-15"

# NOTE(review): this sets the client key from a different constant than the
# OPENAI_API_KEY environment variable above -- confirm which one is intended.
openai.api_key = constants.OPEN_AI_KEY
29
+
30
def get_document_key(doc):
    """Build an identifier for one page of a document.

    Reads the ``source`` and ``page`` entries of ``doc.metadata`` and returns
    them joined as ``<source>_page_<page>``.
    """
    source = doc.metadata['source']
    page_number = doc.metadata['page']
    return source + '_page_' + str(page_number)
32
+
33
+
34
+ import os
35
+ from typing import Optional
36
+
37
class PDFEmbeddings():
    """Embed a directory of PDFs into a persistent Chroma store and query it.

    The instance owns the text splitter, the embedding client, the vector
    store, a similarity retriever over that store, and a conversation memory
    used by :meth:`conversational_search`.
    """

    def __init__(self, path: Optional[str] = None):
        """Create the splitter, embeddings, vector store, retriever and memory.

        Args:
            path: Directory containing the PDF files. When omitted, the
                ``archive`` folder under the ``CWD`` environment variable is
                used (falling back to the current working directory if that
                variable is not set).
        """
        base_dir = os.environ.get('CWD') or os.getcwd()
        self.path = path or os.path.join(base_dir, 'archive')
        self.text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200)
        self.embeddings = OpenAIEmbeddings(
            deployment=constants.AZURE_ENGINE_NAME_US,
            chunk_size=1,
            openai_api_key=constants.AZURE_OPENAI_KEY_US,
            openai_api_base=constants.AZURE_OPENAI_ENDPOINT_US,
            openai_api_version="2023-05-15",
            openai_api_type="azure",
        )
        self.vectorstore = Chroma(persist_directory=constants.persistent_dir, embedding_function=self.embeddings)
        # Fixed: the keyword was misspelled "search_kwags", so the k=5 setting
        # never reached the retriever.
        self.retriever = self.vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
        # Fixed: ConversationalRetrievalChain expects its history under the
        # "chat_history" input key; any other memory key leaves that input
        # missing when the chain is called.
        self.memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    def process_documents(self):
        """Load every PDF under ``self.path``, split it and add it to the store."""
        loader = PyPDFDirectoryLoader(self.path)
        documents = loader.load()
        chunks = self.text_splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)

    def search(self, query: str, chain_type: str = "stuff"):
        """Answer ``query`` with a RetrievalQA chain over the retriever.

        Args:
            query: The question to answer.
            chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``.

        Returns:
            The chain's result mapping; source documents are requested via
            ``return_source_documents=True``.
        """
        llm = AzureChatOpenAI(deployment_name=constants.AZURE_ENGINE_NAME_FR, temperature=0)
        chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=self.retriever,
            chain_type=chain_type,
            return_source_documents=True,
        )
        return chain({"query": query})

    def conversational_search(self, query: str, chain_type: str = "stuff"):
        """Answer ``query`` using the retriever plus the stored conversation memory.

        Args:
            query: The user's question.
            chain_type: Chain type forwarded to
                ``ConversationalRetrievalChain.from_llm``.

        Returns:
            The ``answer`` entry of the chain result.
        """
        llm = AzureChatOpenAI(deployment_name=constants.AZURE_ENGINE_NAME_FR)
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=self.retriever,
            memory=self.memory,
            chain_type=chain_type,
        )
        result = chain({"question": query})
        return result['answer']

    def load_and_run_chain(self, query: str, chain_type: str = "stuff"):
        """Retrieve documents for ``query`` and run a plain QA chain over them.

        Args:
            query: The question to answer.
            chain_type: Chain type forwarded to ``load_qa_chain``.

        Returns:
            Whatever ``chain.run`` returns for the retrieved documents.
        """
        chain = load_qa_chain(llm=AzureChatOpenAI(deployment_name=constants.AZURE_ENGINE_NAME_FR), chain_type=chain_type)
        # Fixed: the chain needs the retrieved documents, not the retriever
        # object itself.
        documents = self.retriever.get_relevant_documents(query)
        return chain.run(input_documents=documents, question=query)
72
+
73
if __name__ == '__main__':
    # Manual smoke test: ask one question and print the answer followed by
    # the source of every document returned with it.
    pdf_embed = PDFEmbeddings()
    # pdf_embed.process_documents() # This takes a while, so we only do it once
    question = "Give me a list of short relevant queries to look for papers related to the topics of the papers in the source documents."
    result = pdf_embed.search(question)
    print("\n\n", result['result'], "\n")
    print("Source documents:")
    for source_doc in result['source_documents']:
        print(source_doc.metadata['source'])
src/constants.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# SECURITY: API keys must not be stored in the repository. They are read from
# environment variables of the same name instead. The keys that used to be
# hard-coded here remain visible in the commit history and must be revoked
# and rotated.
# A missing variable yields an empty string so that importing this module
# (and copying the value into os.environ) keeps working; the API call itself
# will then fail with an authentication error.

# Azure OpenAI resource (France) -- chat model deployment.
AZURE_OPENAI_KEY_FR = os.environ.get("AZURE_OPENAI_KEY_FR", "")
AZURE_OPENAI_ENDPOINT_FR = os.environ.get(
    "AZURE_OPENAI_ENDPOINT_FR",
    "https://openai-resource-team-11-france.openai.azure.com/",
)
AZURE_ENGINE_NAME_FR = os.environ.get("AZURE_ENGINE_NAME_FR", "gpt35-team-11")

# Azure OpenAI resource (East US) -- embedding model deployment.
AZURE_OPENAI_KEY_US = os.environ.get("AZURE_OPENAI_KEY_US", "")
AZURE_OPENAI_ENDPOINT_US = os.environ.get(
    "AZURE_OPENAI_ENDPOINT_US",
    "https://openai-resource-team-11-east-us.openai.azure.com/",
)
AZURE_ENGINE_NAME_US = os.environ.get("AZURE_ENGINE_NAME_US", "ada002-team-11")

# Key for the openai client.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY", "")

# Directory name used for the persistent vector store.
persistent_dir = "db"
12
+
13
# arXiv taxonomy: maps each category code to its human-readable name.
arxiv_taxonomy = {
    # Astrophysics
    "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
    "astro-ph.EP": "Earth and Planetary Astrophysics",
    "astro-ph.GA": "Astrophysics of Galaxies",
    "astro-ph.HE": "High Energy Astrophysical Phenomena",
    "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
    "astro-ph.SR": "Solar and Stellar Astrophysics",
    # Condensed matter
    "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
    "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
    "cond-mat.mtrl-sci": "Materials Science",
    "cond-mat.other": "Other Condensed Matter",
    "cond-mat.quant-gas": "Quantum Gases",
    "cond-mat.soft": "Soft Condensed Matter",
    "cond-mat.stat-mech": "Statistical Mechanics",
    "cond-mat.str-el": "Strongly Correlated Electrons",
    "cond-mat.supr-con": "Superconductivity",
    # Computer science
    "cs.AI": "Artificial Intelligence",
    "cs.AR": "Hardware Architecture",
    "cs.CC": "Computational Complexity",
    "cs.CE": "Computational Engineering, Finance, and Science",
    "cs.CG": "Computational Geometry",
    "cs.CL": "Computation and Language",
    "cs.CR": "Cryptography and Security",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.CY": "Computers and Society",
    "cs.DB": "Databases",
    "cs.DC": "Distributed, Parallel, and Cluster Computing",
    "cs.DL": "Digital Libraries",
    "cs.DM": "Discrete Mathematics",
    "cs.DS": "Data Structures and Algorithms",
    "cs.ET": "Emerging Technologies",
    "cs.FL": "Formal Languages and Automata Theory",
    "cs.GL": "General Literature",
    "cs.GR": "Graphics",
    "cs.GT": "Computer Science and Game Theory",
    "cs.HC": "Human-Computer Interaction",
    "cs.IR": "Information Retrieval",
    "cs.IT": "Information Theory",
    "cs.LG": "Machine Learning",
    "cs.LO": "Logic in Computer Science",
    "cs.MA": "Multiagent Systems",
    "cs.MM": "Multimedia",
    "cs.MS": "Mathematical Software",
    "cs.NA": "Numerical Analysis",
    "cs.NE": "Neural and Evolutionary Computing",
    "cs.NI": "Networking and Internet Architecture",
    "cs.OH": "Other Computer Science",
    "cs.OS": "Operating Systems",
    "cs.PF": "Performance",
    "cs.PL": "Programming Languages",
    "cs.RO": "Robotics",
    "cs.SC": "Symbolic Computation",
    "cs.SD": "Sound",
    "cs.SE": "Software Engineering",
    "cs.SI": "Social and Information Networks",
    "cs.SY": "Systems and Control",
    # Economics
    "econ.EM": "Econometrics",
    # Electrical engineering and systems science
    "eess.AS": "Audio and Speech Processing",
    "eess.IV": "Image and Video Processing",
    "eess.SP": "Signal Processing",
    # General relativity and high energy physics
    "gr-qc": "General Relativity and Quantum Cosmology",
    "hep-ex": "High Energy Physics - Experiment",
    "hep-lat": "High Energy Physics - Lattice",
    "hep-ph": "High Energy Physics - Phenomenology",
    "hep-th": "High Energy Physics - Theory",
    # Mathematics
    "math.AC": "Commutative Algebra",
    "math.AG": "Algebraic Geometry",
    "math.AP": "Analysis of PDEs",
    "math.AT": "Algebraic Topology",
    "math.CA": "Classical Analysis and ODEs",
    "math.CO": "Combinatorics",
    "math.CT": "Category Theory",
    "math.CV": "Complex Variables",
    "math.DG": "Differential Geometry",
    "math.DS": "Dynamical Systems",
    "math.FA": "Functional Analysis",
    "math.GM": "General Mathematics",
    "math.GN": "General Topology",
    "math.GR": "Group Theory",
    "math.GT": "Geometric Topology",
    "math.HO": "History and Overview",
    "math.IT": "Information Theory",
    "math.KT": "K-Theory and Homology",
    "math.LO": "Logic",
    "math.MG": "Metric Geometry",
    "math.MP": "Mathematical Physics",
    "math.NA": "Numerical Analysis",
    "math.NT": "Number Theory",
    "math.OA": "Operator Algebras",
    "math.OC": "Optimization and Control",
    "math.PR": "Probability",
    "math.QA": "Quantum Algebra",
    "math.RA": "Rings and Algebras",
    "math.RT": "Representation Theory",
    "math.SG": "Symplectic Geometry",
    "math.SP": "Spectral Theory",
    "math.ST": "Statistics Theory",
    "math-ph": "Mathematical Physics",
    # Nonlinear sciences
    "nlin.AO": "Adaptation and Self-Organizing Systems",
    "nlin.CD": "Chaotic Dynamics",
    "nlin.CG": "Cellular Automata and Lattice Gases",
    "nlin.PS": "Pattern Formation and Solitons",
    "nlin.SI": "Exactly Solvable and Integrable Systems",
    # Nuclear physics
    "nucl-ex": "Nuclear Experiment",
    "nucl-th": "Nuclear Theory",
    # Physics
    "physics.acc-ph": "Accelerator Physics",
    "physics.ao-ph": "Atmospheric and Oceanic Physics",
    "physics.app-ph": "Applied Physics",
    "physics.atom-ph": "Atomic Physics",
    "physics.atm-clus": "Atomic and Molecular Clusters",
    "physics.bio-ph": "Biological Physics",
    "physics.chem-ph": "Chemical Physics",
    "physics.class-ph": "Classical Physics",
    "physics.comp-ph": "Computational Physics",
    "physics.data-an": "Data Analysis, Statistics and Probability",
    "physics.ed-ph": "Physics Education",
    "physics.flu-dyn": "Fluid Dynamics",
    "physics.gen-ph": "General Physics",
    "physics.geo-ph": "Geophysics",
    "physics.hist-ph": "History and Philosophy of Physics",
    "physics.ins-det": "Instrumentation and Detectors",
    "physics.med-ph": "Medical Physics",
    "physics.optics": "Optics",
    "physics.plasm-ph": "Plasma Physics",
    "physics.pop-ph": "Popular Physics",
    "physics.soc-ph": "Physics and Society",
    "physics.space-ph": "Space Physics",
    # Quantitative biology
    "q-bio.BM": "Biomolecules",
    "q-bio.CB": "Cell Behavior",
    "q-bio.GN": "Genomics",
    "q-bio.MN": "Molecular Networks",
    "q-bio.NC": "Neurons and Cognition",
    "q-bio.OT": "Other Quantitative Biology",
    "q-bio.PE": "Populations and Evolution",
    "q-bio.QM": "Quantitative Methods",
    "q-bio.SC": "Subcellular Processes",
    "q-bio.TO": "Tissues and Organs",
    # Quantitative finance
    "q-fin.CP": "Computational Finance",
    "q-fin.EC": "Economics",
    "q-fin.GN": "General Finance",
    "q-fin.MF": "Mathematical Finance",
    "q-fin.PM": "Portfolio Management",
    "q-fin.PR": "Pricing of Securities",
    "q-fin.RM": "Risk Management",
    "q-fin.ST": "Statistical Finance",
    "q-fin.TR": "Trading and Market Microstructure",
    # Quantum physics
    "quant-ph": "Quantum Physics",
    # Statistics
    "stat.AP": "Applications",
    "stat.CO": "Computation",
    "stat.ME": "Methodology",
    "stat.ML": "Machine Learning",
    "stat.OT": "Other Statistics",
    "stat.TH": "Statistics Theory",
}
src/search.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import arxiv
2
+ from crossref.restful import Works
3
+ import pytz
4
+ from datetime import date
5
+ from datetime import datetime
6
+
7
+
8
class Search_Papers():
    """Search arXiv or Crossref for papers matching a query.

    A search is bounded either by a number of results
    (``search_by == "NumberResults"``) or by a cut-off date (every other
    ``search_by`` value, e.g. ``"Timeframe"``).
    """

    def __init__(self, query, search_by, search_by_query, sort_by, sort_order):
        """Store the search settings.

        Args:
            query: Search text handed to the backend.
            search_by: ``"NumberResults"`` or ``"Timeframe"``.
            search_by_query: The result count for ``"NumberResults"``;
                otherwise the cut-off. ``search_arxiv`` compares it with each
                result's ``published`` value and ``search_general`` calls
                ``strftime`` on it.
            sort_by: ``"PublishDate"`` or ``"LastUpdatedDate"``; any other
                value means sort by relevance.
            sort_order: ``"Ascending"``; any other value means descending.
        """
        self.query = query
        self.search_mode = search_by
        self.search_mode_query = search_by_query
        self.sort_by = sort_by
        self.sort_order = sort_order
        # Moment this search object was created, in UTC with microseconds dropped.
        self.time_search = datetime.now(pytz.utc).replace(microsecond=0)

    def search_arxiv_NResults(self, query, max_results, sort_by, sort_order):
        """Return the arXiv results for ``query``, limited to ``max_results``."""
        search_results = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=sort_by,
            sort_order=sort_order,
        )
        return search_results.results()

    def search_arxiv_Timeframe(self, query, timeframe, sort_by, sort_order):
        """Collect arXiv results published after ``timeframe``.

        Results are requested in growing batches of 10. Collection stops at
        the first result whose ``published`` value is not later than
        ``timeframe``, or when arXiv has no further results.

        NOTE(review): stopping at the first older result only gives a complete
        answer when results arrive newest-first -- confirm the callers' sort
        settings.

        Args:
            query: Search text.
            timeframe: Cut-off compared against each result's ``published``.
            sort_by: An ``arxiv.SortCriterion`` value.
            sort_order: An ``arxiv.SortOrder`` value.

        Returns:
            A list of the results newer than the cut-off.
        """
        collection = []
        batch_size = 10
        n_iter = 0

        while True:
            requested = (n_iter + 1) * batch_size
            search_list = arxiv.Search(
                query=query,
                max_results=requested,
                sort_by=sort_by,
                sort_order=sort_order,
            )
            results = list(search_list.results())

            # Only the tail of the list is new; earlier entries were handled
            # in previous iterations.
            for paper in results[n_iter * batch_size:]:
                if paper.published > timeframe:
                    collection.append(paper)
                else:
                    return collection

            # Fixed: fewer results than requested means arXiv is exhausted.
            # Without this check the loop re-queried forever when no result
            # at or before the cut-off was ever seen.
            if len(results) < requested:
                return collection

            n_iter += 1

    def search_arxiv(self):
        """Run the configured search against arXiv.

        Returns:
            The value of ``search_arxiv_NResults`` when the search mode is
            ``"NumberResults"``, otherwise the list built by
            ``search_arxiv_Timeframe``.
        """
        if self.sort_by == "PublishDate":
            sort_by = arxiv.SortCriterion.SubmittedDate
        elif self.sort_by == "LastUpdatedDate":
            sort_by = arxiv.SortCriterion.LastUpdatedDate
        else:
            sort_by = arxiv.SortCriterion.Relevance

        if self.sort_order == "Ascending":
            sort_order = arxiv.SortOrder.Ascending
        else:
            sort_order = arxiv.SortOrder.Descending

        if self.search_mode == "NumberResults":
            return self.search_arxiv_NResults(self.query, self.search_mode_query, sort_by, sort_order)
        return self.search_arxiv_Timeframe(self.query, self.search_mode_query, sort_by, sort_order)

    def search_general_NResults(self, query, max_results, sort_by, sort_order):
        """Query Crossref bibliographically and sample ``max_results`` works."""
        works = Works()
        # search_results = works.filter(category_name = query).sort(sort_by).order(sort_order).sample(max_results)
        search_results = works.query(bibliographic=query).sort(sort_by).order(sort_order).sample(max_results)
        return search_results

    def search_general_Timeframe(self, query, timeframe, sort_by, sort_order):
        """Query Crossref for works created between ``timeframe`` and today.

        Args:
            query: Bibliographic search text.
            timeframe: Start date as a ``YYYY-MM-DD`` string.
            sort_by: Crossref sort field.
            sort_order: ``"asc"`` or ``"desc"``.
        """
        works = Works()
        today = date.today().strftime("%Y-%m-%d")

        # search_results = works.filter(category_name = query, from_created_date=timeframe, until_created_date = today).sort(sort_by).order(sort_order)
        search_results = works.query(bibliographic=query).filter(
            from_created_date=timeframe,
            until_created_date=today,
        ).sort(sort_by).order(sort_order)
        return search_results

    #'Magnetic Field Conditions Upstream of Ganymede'

    def _crossref_sort_params(self):
        """Translate the stored sort settings into Crossref (field, order) values."""
        if self.sort_by == "PublishDate":
            sort_by = 'created'
        elif self.sort_by == "LastUpdatedDate":
            sort_by = 'updated'
        else:
            sort_by = 'relevance'

        sort_order = "asc" if self.sort_order == "Ascending" else "desc"
        return sort_by, sort_order

    def search_general(self):
        """Run the configured search against Crossref.

        Returns:
            The Crossref query object built by ``search_general_NResults``
            (mode ``"NumberResults"``) or ``search_general_Timeframe`` (any
            other mode).
        """
        sort_by, sort_order = self._crossref_sort_params()

        if self.search_mode == "NumberResults":
            return self.search_general_NResults(self.query, self.search_mode_query, sort_by, sort_order)

        # Fixed: the cut-off string used to be computed only when the mode was
        # exactly 'Timeframe', so any other non-"NumberResults" mode hit an
        # unbound local here. Every such mode is now treated as a timeframe
        # search, matching search_arxiv.
        timeframe = self.search_mode_query.strftime("%Y-%m-%d")
        return self.search_general_Timeframe(self.query, timeframe, sort_by, sort_order)
145
+
146
+