from llama_index import LLMPredictor, PromptHelper, StorageContext, ServiceContext, load_index_from_storage, SimpleDirectoryReader, GPTVectorStoreIndex
from langchain.chat_models import ChatOpenAI
import gradio as gr
import sys
import os
import openai
from ratelimit import limits, sleep_and_retry
from langchain import HuggingFaceHub  # imported but not used below


# Bug-fix notes:
# 1. OpenAI key / tenacity RetryError: https://stackoverflow.com/questions/76425556/tenacity-retryerror-retryerrorfuture-at-0x7f89bc35eb90-state-finished-raised
# 2. Rate-limit error with the default langchain version - pin langchain==0.0.188: https://github.com/jerryjliu/llama_index/issues/924
# 3. Set the Config variable to True in langchain (pydantic): https://github.com/pydantic/pydantic/issues/3320
# 4. Deploy on Hugging Face Spaces: https://huggingface.co/welcome
#    create a Hugging Face token: https://huggingface.co/settings/tokens
#    log in with: huggingface-cli login
#    add a requirements.txt file (a possible sketch follows below): https://huggingface.co/docs/hub/spaces-dependencies
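#
# A plausible requirements.txt for the Space. Only langchain==0.0.188 comes from the
# notes above; the other entries mirror the imports in this file, and any versions are
# assumptions to adjust as needed:
#
#     llama-index
#     langchain==0.0.188
#     openai
#     gradio
#     ratelimit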

os.environ["OPENAI_API_KEY"] = os.environ.get("openai_key")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define the rate limit for API calls (requests per second)
RATE_LIMIT = 3

# Implement the rate limiting decorator
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def create_service_context():

    # Constraint parameters. Values used for other models, kept for reference:
    #   gpt-4-32k:          max_input_size = 32768, num_outputs = 30000
    #   gpt-4:              max_input_size = 8192,  num_outputs = 7500
    #   gpt-4o:             num_outputs = 3500 (max 4096)
    #   gpt-3.5-turbo-0125: max_input_size = 16384, num_outputs = 15000
    # (Older PromptHelper versions took a max_chunk_overlap argument; the call below
    #  uses chunk_overlap_ratio instead.)
    max_input_size = 4096
    num_outputs = 3300
    chunk_size_limit = 600

    # Allows the user to explicitly set certain constraint parameters
    prompt_helper = PromptHelper(max_input_size, num_outputs, chunk_overlap_ratio=0.1, chunk_size_limit=chunk_size_limit)

    # LLMPredictor is a wrapper class around LangChain's LLMChain that allows easy integration into LlamaIndex
    llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.5, model_name="gpt-3.5-turbo", max_tokens=num_outputs))

    # construct the service context from the predictor and the prompt helper
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
    return service_context
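

# Optional alternative (not used in this app): the HuggingFaceHub import above could
# back the predictor instead of ChatOpenAI. A minimal sketch; the repo_id, the
# model_kwargs, and the HUGGINGFACEHUB_API_TOKEN requirement are assumptions here,
# not part of the original setup.
def create_hf_service_context():
    hf_llm = HuggingFaceHub(
        repo_id="google/flan-t5-large",  # hypothetical model choice
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )  # needs HUGGINGFACEHUB_API_TOKEN set in the environment
    llm_predictor = LLMPredictor(llm=hf_llm)
    return ServiceContext.from_defaults(llm_predictor=llm_predictor)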


# Implement the rate limiting decorator
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def data_ingestion_indexing(directory_path):

    # load the documents from the specified directory path
    documents = SimpleDirectoryReader(directory_path).load_data()

    # build the index when first ingesting the documents
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=create_service_context()
    )

    # persist the index to disk (default "storage" folder)
    index.storage_context.persist()

    return index
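

# Optional helper (a sketch, not part of the original flow): reuse the persisted
# index when "./storage" already exists instead of re-embedding the documents on
# every launch. The function name is an assumption.
def get_or_build_index(directory_path):
    if os.path.exists("./storage"):
        storage_context = StorageContext.from_defaults(persist_dir="./storage")
        return load_index_from_storage(storage_context, service_context=create_service_context())
    return data_ingestion_indexing(directory_path)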

@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def data_querying(input_text):

    # rebuild the storage context from the persisted folder
    storage_context = StorageContext.from_defaults(persist_dir="./storage")

    # load the index from storage
    index = load_index_from_storage(storage_context, service_context=create_service_context())

    # query the index with the input text
    response = index.as_query_engine().query(input_text)

    return response.response

iface = gr.Interface(fn=data_querying,
                     inputs=gr.components.Textbox(lines=20, label="Enter your question"),
                     outputs=gr.components.Textbox(lines=25, label="Response"),
                     title="Legi GPT - Monitorul oficial 25.06 - 5.07.2024, 157 pdfs, 150mb")

# build the index from the data directory, then launch the Gradio app
index = data_ingestion_indexing("books-philosophy")
iface.launch(inline=True)
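
# Running this (assumptions: the file is saved as app.py and a local "books-philosophy"
# folder with documents exists):
#   openai_key=sk-... python app.py
# On a Hugging Face Space (Gradio SDK), set "openai_key" as a Space secret; the Space
# runs app.py automatically.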