Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files
- src/__pycache__/chains.cpython-311.pyc +0 -0
- src/__pycache__/constants.cpython-311.pyc +0 -0
- src/__pycache__/search.cpython-311.pyc +0 -0
- src/chains.py +80 -0
- src/constants.py +167 -0
- src/search.py +146 -0
src/__pycache__/chains.cpython-311.pyc
ADDED
|
Binary file (6.27 kB). View file
|
|
|
src/__pycache__/constants.cpython-311.pyc
ADDED
|
Binary file (9.16 kB). View file
|
|
|
src/__pycache__/search.cpython-311.pyc
ADDED
|
Binary file (5.51 kB). View file
|
|
|
src/chains.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
|
| 2 |
+
from langchain.indexes import VectorstoreIndexCreator
|
| 3 |
+
from langchain.llms import AzureOpenAI, OpenAI
|
| 4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 5 |
+
from langchain.embeddings import OpenAIEmbeddings
|
| 6 |
+
from langchain.vectorstores import Chroma
|
| 7 |
+
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, RetrievalQAWithSourcesChain
|
| 8 |
+
from langchain.chains.question_answering import load_qa_chain
|
| 9 |
+
from langchain.memory import ConversationBufferMemory
|
| 10 |
+
|
| 11 |
+
from langchain.chat_models import AzureChatOpenAI
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
import os
import openai

# Record the launch directory; PDFEmbeddings reads os.environ['CWD'] to
# locate its default 'archive' folder.
os.environ['CWD'] = os.getcwd()

# Package-style import (used for testing). When running this file from
# inside src/, a plain "import constants" is the alternative.
import src.constants as constants

# Expose the Azure OpenAI (France resource) settings through the
# OPENAI_* environment variables.
os.environ.update({
    'OPENAI_API_KEY': constants.AZURE_OPENAI_KEY_FR,
    'OPENAI_API_BASE': constants.AZURE_OPENAI_ENDPOINT_FR,
    'OPENAI_API_VERSION': "2023-05-15",
    'OPENAI_API_TYPE': "azure",
})

# NOTE(review): the module-level key below is the non-Azure OpenAI key while
# the environment above is configured for Azure - confirm this is intended.
openai.api_key = constants.OPEN_AI_KEY
|
| 29 |
+
|
| 30 |
+
def get_document_key(doc):
    """Return a string key for *doc* built from its source and page number.

    Args:
        doc: An object whose ``metadata`` mapping has ``'source'`` and
            ``'page'`` entries.

    Returns:
        ``"<source>_page_<page>"``.

    Raises:
        KeyError: If ``'source'`` or ``'page'`` is missing from the metadata.
    """
    meta = doc.metadata
    return f"{meta['source']}_page_{meta['page']}"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
import os
|
| 35 |
+
from typing import Optional
|
| 36 |
+
|
| 37 |
+
class PDFEmbeddings():
    """Index a directory of PDFs in a Chroma store and answer questions on it.

    Embeddings are produced through the Azure OpenAI deployment named by
    ``constants.AZURE_ENGINE_NAME_US``; answers are generated by the chat
    deployment named by ``constants.AZURE_ENGINE_NAME_FR``.
    """

    def __init__(self, path: Optional[str] = None):
        """Set up the splitter, embeddings, vector store, retriever and memory.

        Args:
            path: Directory containing the PDFs. Defaults to the ``archive``
                folder under the directory stored in ``os.environ['CWD']``.
        """
        self.path = path or os.path.join(os.environ['CWD'], 'archive')
        self.text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=1000, chunk_overlap=200
        )
        # NOTE(review): chunk_size=1 sends one text per embedding request;
        # presumably required by the Azure deployment - confirm.
        self.embeddings = OpenAIEmbeddings(
            deployment=constants.AZURE_ENGINE_NAME_US,
            chunk_size=1,
            openai_api_key=constants.AZURE_OPENAI_KEY_US,
            openai_api_base=constants.AZURE_OPENAI_ENDPOINT_US,
            openai_api_version="2023-05-15",
            openai_api_type="azure",
        )
        self.vectorstore = Chroma(
            persist_directory=constants.persistent_dir,
            embedding_function=self.embeddings,
        )
        # The keyword is ``search_kwargs``; the previous spelling
        # ``search_kwags`` meant the requested k=5 was never applied.
        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity", search_kwargs={"k": 5}
        )
        # ConversationalRetrievalChain requires a ``chat_history`` input, so
        # the memory must publish its buffer under that key.
        self.memory = ConversationBufferMemory(
            memory_key='chat_history', return_messages=True
        )

    def process_documents(self):
        """Load every PDF under ``self.path``, split it, and add it to the store."""
        loader = PyPDFDirectoryLoader(self.path)
        documents = loader.load()
        chunks = self.text_splitter.split_documents(documents)
        self.vectorstore.add_documents(chunks)

    def search(self, query: str, chain_type: str = "stuff"):
        """Answer *query* with a RetrievalQA chain.

        Args:
            query: The question to ask.
            chain_type: Document-combination strategy passed to the chain.

        Returns:
            The chain output; it is built with ``return_source_documents=True``
            and callers read its ``'result'`` and ``'source_documents'`` keys.
        """
        llm = AzureChatOpenAI(
            deployment_name=constants.AZURE_ENGINE_NAME_FR, temperature=0
        )
        chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=self.retriever,
            chain_type=chain_type,
            return_source_documents=True,
        )
        return chain({"query": query})

    def conversational_search(self, query: str, chain_type: str = "stuff"):
        """Answer *query* using the conversation stored in ``self.memory``.

        Args:
            query: The question to ask.
            chain_type: Document-combination strategy passed to the chain.

        Returns:
            The ``'answer'`` entry of the chain output.
        """
        llm = AzureChatOpenAI(deployment_name=constants.AZURE_ENGINE_NAME_FR)
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=self.retriever,
            memory=self.memory,
            chain_type=chain_type,
        )
        result = chain({"question": query})
        return result['answer']

    def load_and_run_chain(self, query: str, chain_type: str = "stuff"):
        """Retrieve documents for *query* and run a plain QA chain over them.

        Args:
            query: The question to ask.
            chain_type: Document-combination strategy passed to the chain.

        Returns:
            The value returned by ``chain.run``.
        """
        llm = AzureChatOpenAI(deployment_name=constants.AZURE_ENGINE_NAME_FR)
        chain = load_qa_chain(llm=llm, chain_type=chain_type)
        # The QA chain consumes documents, not a retriever object, so fetch
        # the relevant documents first.
        documents = self.retriever.get_relevant_documents(query)
        return chain.run(input_documents=documents, question=query)
|
| 72 |
+
|
| 73 |
+
if __name__ == '__main__':
    # Manual smoke test: ask one question against the existing index and
    # print the answer followed by the source of each supporting document.
    pdf_embed = PDFEmbeddings()
    # pdf_embed.process_documents() # This takes a while, so we only do it once
    question = "Give me a list of short relevant queries to look for papers related to the topics of the papers in the source documents."
    answer = pdf_embed.search(question)
    print("\n\n", answer['result'], "\n")
    print("Source documents:")
    for source_doc in answer['source_documents']:
        print(source_doc.metadata['source'])
|
src/constants.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# SECURITY: API keys must never be committed. They are read from environment
# variables of the same name and default to "" when unset, so importing this
# module never fails. The keys that were previously hard-coded here have been
# published and must be revoked/rotated.

# Azure OpenAI resource (France) - chat deployment.
AZURE_OPENAI_KEY_FR = os.environ.get("AZURE_OPENAI_KEY_FR", "")
AZURE_OPENAI_ENDPOINT_FR = "https://openai-resource-team-11-france.openai.azure.com/"
AZURE_ENGINE_NAME_FR = "gpt35-team-11"

# Azure OpenAI resource (East US) - embeddings deployment.
AZURE_OPENAI_KEY_US = os.environ.get("AZURE_OPENAI_KEY_US", "")
AZURE_OPENAI_ENDPOINT_US = "https://openai-resource-team-11-east-us.openai.azure.com/"
AZURE_ENGINE_NAME_US = "ada002-team-11"

# Non-Azure OpenAI key.
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY", "")

# Directory name handed to the vector store as its persist directory.
persistent_dir = "db"
|
| 12 |
+
|
| 13 |
+
# arXiv taxonomy: maps each category identifier to its human-readable name.
arxiv_taxonomy = {
    "astro-ph.CO": "Cosmology and Nongalactic Astrophysics",
    "astro-ph.EP": "Earth and Planetary Astrophysics",
    "astro-ph.GA": "Astrophysics of Galaxies",
    "astro-ph.HE": "High Energy Astrophysical Phenomena",
    "astro-ph.IM": "Instrumentation and Methods for Astrophysics",
    "astro-ph.SR": "Solar and Stellar Astrophysics",
    "cond-mat.dis-nn": "Disordered Systems and Neural Networks",
    "cond-mat.mes-hall": "Mesoscale and Nanoscale Physics",
    "cond-mat.mtrl-sci": "Materials Science",
    "cond-mat.other": "Other Condensed Matter",
    "cond-mat.quant-gas": "Quantum Gases",
    "cond-mat.soft": "Soft Condensed Matter",
    "cond-mat.stat-mech": "Statistical Mechanics",
    "cond-mat.str-el": "Strongly Correlated Electrons",
    "cond-mat.supr-con": "Superconductivity",
    "cs.AI": "Artificial Intelligence",
    "cs.AR": "Hardware Architecture",
    "cs.CC": "Computational Complexity",
    "cs.CE": "Computational Engineering, Finance, and Science",
    "cs.CG": "Computational Geometry",
    "cs.CL": "Computation and Language",
    "cs.CR": "Cryptography and Security",
    "cs.CV": "Computer Vision and Pattern Recognition",
    "cs.CY": "Computers and Society",
    "cs.DB": "Databases",
    "cs.DC": "Distributed, Parallel, and Cluster Computing",
    "cs.DL": "Digital Libraries",
    "cs.DM": "Discrete Mathematics",
    "cs.DS": "Data Structures and Algorithms",
    "cs.ET": "Emerging Technologies",
    "cs.FL": "Formal Languages and Automata Theory",
    "cs.GL": "General Literature",
    "cs.GR": "Graphics",
    "cs.GT": "Computer Science and Game Theory",
    "cs.HC": "Human-Computer Interaction",
    "cs.IR": "Information Retrieval",
    "cs.IT": "Information Theory",
    "cs.LG": "Machine Learning",
    "cs.LO": "Logic in Computer Science",
    "cs.MA": "Multiagent Systems",
    "cs.MM": "Multimedia",
    "cs.MS": "Mathematical Software",
    "cs.NA": "Numerical Analysis",
    "cs.NE": "Neural and Evolutionary Computing",
    "cs.NI": "Networking and Internet Architecture",
    "cs.OH": "Other Computer Science",
    "cs.OS": "Operating Systems",
    "cs.PF": "Performance",
    "cs.PL": "Programming Languages",
    "cs.RO": "Robotics",
    "cs.SC": "Symbolic Computation",
    "cs.SD": "Sound",
    "cs.SE": "Software Engineering",
    "cs.SI": "Social and Information Networks",
    "cs.SY": "Systems and Control",
    "econ.EM": "Econometrics",
    "eess.AS": "Audio and Speech Processing",
    "eess.IV": "Image and Video Processing",
    "eess.SP": "Signal Processing",
    "gr-qc": "General Relativity and Quantum Cosmology",
    "hep-ex": "High Energy Physics - Experiment",
    "hep-lat": "High Energy Physics - Lattice",
    "hep-ph": "High Energy Physics - Phenomenology",
    "hep-th": "High Energy Physics - Theory",
    "math.AC": "Commutative Algebra",
    "math.AG": "Algebraic Geometry",
    "math.AP": "Analysis of PDEs",
    "math.AT": "Algebraic Topology",
    "math.CA": "Classical Analysis and ODEs",
    "math.CO": "Combinatorics",
    "math.CT": "Category Theory",
    "math.CV": "Complex Variables",
    "math.DG": "Differential Geometry",
    "math.DS": "Dynamical Systems",
    "math.FA": "Functional Analysis",
    "math.GM": "General Mathematics",
    "math.GN": "General Topology",
    "math.GR": "Group Theory",
    "math.GT": "Geometric Topology",
    "math.HO": "History and Overview",
    "math.IT": "Information Theory",
    "math.KT": "K-Theory and Homology",
    "math.LO": "Logic",
    "math.MG": "Metric Geometry",
    "math.MP": "Mathematical Physics",
    "math.NA": "Numerical Analysis",
    "math.NT": "Number Theory",
    "math.OA": "Operator Algebras",
    "math.OC": "Optimization and Control",
    "math.PR": "Probability",
    "math.QA": "Quantum Algebra",
    "math.RA": "Rings and Algebras",
    "math.RT": "Representation Theory",
    "math.SG": "Symplectic Geometry",
    "math.SP": "Spectral Theory",
    "math.ST": "Statistics Theory",
    "math-ph": "Mathematical Physics",
    "nlin.AO": "Adaptation and Self-Organizing Systems",
    "nlin.CD": "Chaotic Dynamics",
    "nlin.CG": "Cellular Automata and Lattice Gases",
    "nlin.PS": "Pattern Formation and Solitons",
    "nlin.SI": "Exactly Solvable and Integrable Systems",
    "nucl-ex": "Nuclear Experiment",
    "nucl-th": "Nuclear Theory",
    "physics.acc-ph": "Accelerator Physics",
    "physics.ao-ph": "Atmospheric and Oceanic Physics",
    "physics.app-ph": "Applied Physics",
    "physics.atom-ph": "Atomic Physics",
    "physics.atm-clus": "Atomic and Molecular Clusters",
    "physics.bio-ph": "Biological Physics",
    "physics.chem-ph": "Chemical Physics",
    "physics.class-ph": "Classical Physics",
    "physics.comp-ph": "Computational Physics",
    "physics.data-an": "Data Analysis, Statistics and Probability",
    "physics.ed-ph": "Physics Education",
    "physics.flu-dyn": "Fluid Dynamics",
    "physics.gen-ph": "General Physics",
    "physics.geo-ph": "Geophysics",
    "physics.hist-ph": "History and Philosophy of Physics",
    "physics.ins-det": "Instrumentation and Detectors",
    "physics.med-ph": "Medical Physics",
    "physics.optics": "Optics",
    "physics.plasm-ph": "Plasma Physics",
    "physics.pop-ph": "Popular Physics",
    "physics.soc-ph": "Physics and Society",
    "physics.space-ph": "Space Physics",
    "q-bio.BM": "Biomolecules",
    "q-bio.CB": "Cell Behavior",
    "q-bio.GN": "Genomics",
    "q-bio.MN": "Molecular Networks",
    "q-bio.NC": "Neurons and Cognition",
    "q-bio.OT": "Other Quantitative Biology",
    "q-bio.PE": "Populations and Evolution",
    "q-bio.QM": "Quantitative Methods",
    "q-bio.SC": "Subcellular Processes",
    "q-bio.TO": "Tissues and Organs",
    "q-fin.CP": "Computational Finance",
    "q-fin.EC": "Economics",
    "q-fin.GN": "General Finance",
    "q-fin.MF": "Mathematical Finance",
    "q-fin.PM": "Portfolio Management",
    "q-fin.PR": "Pricing of Securities",
    "q-fin.RM": "Risk Management",
    "q-fin.ST": "Statistical Finance",
    "q-fin.TR": "Trading and Market Microstructure",
    "quant-ph": "Quantum Physics",
    "stat.AP": "Applications",
    "stat.CO": "Computation",
    "stat.ME": "Methodology",
    "stat.ML": "Machine Learning",
    "stat.OT": "Other Statistics",
    "stat.TH": "Statistics Theory"
}
|
src/search.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import arxiv
|
| 2 |
+
from crossref.restful import Works
|
| 3 |
+
import pytz
|
| 4 |
+
from datetime import date
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Search_Papers():
    """Search arXiv or Crossref for papers matching a query.

    A search runs in one of two modes, selected by ``search_by``:
    ``"NumberResults"`` (``search_by_query`` is the number of results) or a
    timeframe search (``search_by_query`` is the earliest date to include).
    """

    def __init__(self, query, search_by, search_by_query, sort_by, sort_order):
        """Store the search parameters and record when the search was created.

        Args:
            query: Free-text search query.
            search_by: ``"NumberResults"``; any other value selects the
                timeframe mode.
            search_by_query: Result count (``"NumberResults"`` mode) or the
                cut-off datetime (timeframe mode).
            sort_by: ``"PublishDate"``, ``"LastUpdatedDate"``; anything else
                sorts by relevance.
            sort_order: ``"Ascending"``; anything else sorts descending.
        """
        self.query = query
        self.search_mode = search_by
        self.search_mode_query = search_by_query
        self.sort_by = sort_by
        self.sort_order = sort_order
        # Timezone-aware UTC timestamp, truncated to whole seconds.
        self.time_search = datetime.now(pytz.utc).replace(microsecond=0)

    def search_arxiv_NResults(self, query, max_results, sort_by, sort_order):
        """Return an iterator over at most *max_results* arXiv results."""
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=sort_by,
            sort_order=sort_order,
        )
        return search.results()

    def search_arxiv_Timeframe(self, query, timeframe, sort_by, sort_order):
        """Collect arXiv results published after *timeframe*.

        Results are consumed in the order arXiv returns them and collection
        stops at the first result whose ``published`` is not later than
        *timeframe*, or when the results run out. The previous implementation
        never terminated in the second case and re-fetched every earlier
        result on each pass.

        NOTE(review): stopping at the first older paper only yields every
        recent paper when results arrive newest-first - confirm callers use a
        descending date sort in this mode.

        Args:
            query: Free-text search query.
            timeframe: Cut-off datetime compared against ``published``.
            sort_by: An ``arxiv.SortCriterion`` value.
            sort_order: An ``arxiv.SortOrder`` value.

        Returns:
            A list of arXiv result objects.
        """
        # No max_results: the result generator pages lazily, and breaking out
        # of the loop stops further requests.
        search = arxiv.Search(
            query=query,
            sort_by=sort_by,
            sort_order=sort_order,
        )

        collection = []
        for paper in search.results():
            if paper.published > timeframe:
                collection.append(paper)
            else:
                break
        return collection

    def search_arxiv(self):
        """Run the configured search against arXiv and return its results."""
        if self.sort_by == "PublishDate":
            sort_by = arxiv.SortCriterion.SubmittedDate
        elif self.sort_by == "LastUpdatedDate":
            sort_by = arxiv.SortCriterion.LastUpdatedDate
        else:
            sort_by = arxiv.SortCriterion.Relevance

        if self.sort_order == "Ascending":
            sort_order = arxiv.SortOrder.Ascending
        else:
            sort_order = arxiv.SortOrder.Descending

        if self.search_mode == "NumberResults":
            return self.search_arxiv_NResults(
                self.query, self.search_mode_query, sort_by, sort_order
            )
        return self.search_arxiv_Timeframe(
            self.query, self.search_mode_query, sort_by, sort_order
        )

    def search_general_NResults(self, query, max_results, sort_by, sort_order):
        """Query Crossref bibliographically, requesting *max_results* items.

        NOTE(review): ``sample`` may request a random sample, which would
        ignore the requested ordering - confirm against the Crossref API.
        """
        works = Works()
        return works.query(bibliographic=query).sort(sort_by).order(sort_order).sample(max_results)

    def search_general_Timeframe(self, query, timeframe, sort_by, sort_order):
        """Query Crossref for works created between *timeframe* and today.

        Args:
            query: Bibliographic search text.
            timeframe: Start date as a ``YYYY-MM-DD`` string.
            sort_by: Crossref sort field.
            sort_order: ``"asc"`` or ``"desc"``.
        """
        works = Works()
        today = date.today().strftime("%Y-%m-%d")
        return (
            works.query(bibliographic=query)
            .filter(from_created_date=timeframe, until_created_date=today)
            .sort(sort_by)
            .order(sort_order)
        )

    def search_general(self):
        """Run the configured search against Crossref and return its results."""
        if self.sort_by == "PublishDate":
            sort_by = 'created'
        elif self.sort_by == "LastUpdatedDate":
            sort_by = 'updated'
        else:
            sort_by = 'relevance'

        if self.sort_order == "Ascending":
            sort_order = "asc"
        else:
            sort_order = "desc"

        if self.search_mode == "NumberResults":
            return self.search_general_NResults(
                self.query, self.search_mode_query, sort_by, sort_order
            )

        # Like search_arxiv, every other mode is a timeframe search. Computing
        # the date string here avoids the UnboundLocalError previously raised
        # when search_mode was neither "NumberResults" nor "Timeframe".
        timeframe = self.search_mode_query.strftime("%Y-%m-%d")
        return self.search_general_Timeframe(
            self.query, timeframe, sort_by, sort_order
        )
|
| 145 |
+
|
| 146 |
+
|