rkihacker committed
Commit 0e51f31 · verified · 1 Parent(s): a50d397

Update main.py

Files changed (1)
  1. main.py +120 -91
main.py CHANGED
@@ -1,33 +1,39 @@
 from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import StreamingResponse
 from pydantic import BaseModel, Field
 import httpx
 import os
-from typing import List, Dict, Any, Optional
+import json
+import time
+import uuid
+from typing import List, Dict, Any, Optional, AsyncGenerator

 # --- Configuration ---
-# Your actual Inference API key should be set as an environment variable
 INFERENCE_API_KEY = os.environ.get("INFERENCE_API_KEY", "inference-00050468cc1c4a20bd5ca0997c752329")
 INFERENCE_API_URL = "https://api.inference.net/v1/chat/completions"
 SEARCH_API_URL = "https://rkihacker-brave.hf.space/search"
+MODEL_NAME = "Binglity-Lite"
+BACKEND_MODEL = "meta-llama/llama-3.1-8b-instruct/fp-8"

-# --- System Prompt ---
+# --- A More Advanced System Prompt ---
 SYSTEM_PROMPT = """
-You are "Binglity-Lite", a large language model acting as a helpful AI assistant.
-Your primary function is to provide accurate, comprehensive, and helpful answers by synthesizing information from real-time web search results.
-When you are given a user's query and a set of search results, you must follow these rules:
-1. Carefully analyze the user's query to understand their intent.
-2. Review all the provided search results to gather relevant facts, data, and perspectives.
-3. Construct a single, coherent, and well-written response that directly answers the user's query.
-4. Base your answer **only** on the information found in the provided search results. Do not use any prior knowledge.
-5. If the search results do not contain enough information to answer the question, state that you couldn't find a definitive answer based on the search.
-6. Do not list the search results. Instead, integrate the information from them into your response.
+You are "Binglity-Lite", a state-of-the-art AI assistant. Your purpose is to provide accurate, unbiased, and comprehensive answers by synthesizing information from real-time web search results.
+
+**Your Instructions:**
+1. **Analyze the User's Query**: Deeply understand the user's question, intent, and the specific information they are seeking.
+2. **Synthesize, Don't List**: Do not simply list or summarize the search results. Your primary task is to integrate the information from the multiple sources provided into a single, cohesive, and well-structured response.
+3. **Be Factual and Unbiased**: Base your entire response ONLY on the information contained within the provided search results. Do not introduce any external knowledge or personal opinions.
+4. **Handle Contradictions**: If the search results present conflicting information, acknowledge the discrepancy and present the different viewpoints as found in the sources.
+5. **Address Insufficient Information**: If the search results do not contain enough information to provide a complete answer, explicitly state that. Do not speculate or fill in the gaps.
+6. **Maintain a Helpful Tone**: Your persona is knowledgeable, helpful, and neutral.
+7. **Structure for Clarity**: Use clear language and logical formatting (like paragraphs or bullet points if appropriate) to make the information easy to understand.
 """

 # --- FastAPI App ---
 app = FastAPI(
     title="Binglity-Lite API",
-    description="A web search-powered chat completions API.",
-    version="1.0.0",
+    description="A web search-powered, streaming-capable chat completions API.",
+    version="1.1.0",
 )

 # --- Pydantic Models for OpenAI Compatibility ---
@@ -38,14 +44,12 @@ class ChatMessage(BaseModel):
 class ChatCompletionRequest(BaseModel):
     model: str
     messages: List[ChatMessage]
-    max_tokens: Optional[int] = 1024
+    max_tokens: Optional[int] = 2048
     temperature: Optional[float] = 0.7
+    stream: Optional[bool] = False

 # --- Web Search Function ---
 async def perform_web_search(query: str) -> List[Dict[str, Any]]:
-    """
-    Performs a web search using an external API.
-    """
     async with httpx.AsyncClient() as client:
         try:
             response = await client.get(
@@ -61,102 +65,127 @@ async def perform_web_search(query: str) -> List[Dict[str, Any]]:
             print(f"An unexpected error occurred during web search: {str(e)}")
             return []

-# --- Helper to format search results for the LLM ---
 def format_search_results_for_prompt(results: List[Dict[str, Any]]) -> str:
-    """
-    Formats the list of search result dictionaries into a string for the LLM prompt.
-    """
     if not results:
-        return "No search results found."
-
-    formatted = "Here are the web search results:\n\n"
+        return "No relevant search results were found. Please inform the user that you could not find information on their query."
+
+    formatted = "### Web Search Results ###\n\n"
     for i, result in enumerate(results):
-        formatted += f"Result [{i+1}]:\n"
+        formatted += f"Source [{i+1}]:\n"
         formatted += f"Title: {result.get('title', 'N/A')}\n"
         formatted += f"URL: {result.get('url', 'N/A')}\n"
-        formatted += f"Description: {result.get('description', 'N/A')}\n\n"
+        formatted += f"Content: {result.get('description', 'N/A')}\n\n"
     return formatted

-# --- API Endpoint ---
-@app.post("/v1/chat/completions")
-async def chat_completions(request: ChatCompletionRequest):
+# --- Streaming Logic ---
+async def stream_response_generator(payload: Dict[str, Any]) -> AsyncGenerator[str, None]:
     """
-    Implements a chat completions endpoint compatible with OpenAI's API.
-    It performs a web search based on the user's last message.
+    Yields chunks from the inference API, formatted for OpenAI compatibility.
     """
-    if request.model != "Binglity-Lite":
-        raise HTTPException(
-            status_code=400,
-            detail=f"Model not supported. Please use 'Binglity-Lite'. You used '{request.model}'.",
-        )
-
-    if not request.messages:
-        raise HTTPException(status_code=400, detail="The 'messages' field is required.")
+    headers = {
+        "Authorization": f"Bearer {INFERENCE_API_KEY}",
+        "Content-Type": "application/json",
+        "Accept": "text/event-stream"
+    }
+
+    # Create a unique ID for the response stream
+    response_id = f"chatcmpl-{uuid.uuid4()}"
+    created_time = int(time.time())
+
+    async with httpx.AsyncClient(timeout=300.0) as client:
+        async with client.stream("POST", INFERENCE_API_URL, json=payload, headers=headers) as response:
+            if response.status_code != 200:
+                error_content = await response.aread()
+                raise HTTPException(status_code=response.status_code, detail=f"Error from inference API: {error_content.decode()}")
+
+            # Stream the response line by line
+            async for line in response.aiter_lines():
+                if line.startswith("data:"):
+                    line_data = line[5:].strip()
+                    if line_data == "[DONE]":
+                        # Send the final data chunk and the done message
+                        yield f"data: {json.dumps({'id': response_id, 'model': MODEL_NAME, 'object': 'chat.completion.chunk', 'created': created_time, 'choices': [{'index': 0, 'delta': {}, 'finish_reason': 'stop'}]})}\n\n"
+                        yield "data: [DONE]\n\n"
+                        break
+
+                    try:
+                        chunk = json.loads(line_data)
+                        # Reformat the chunk to be OpenAI compliant
+                        formatted_chunk = {
+                            "id": response_id,
+                            "object": "chat.completion.chunk",
+                            "created": created_time,
+                            "model": MODEL_NAME,
+                            "choices": [{
+                                "index": 0,
+                                "delta": chunk["choices"][0].get("delta", {}),
+                                "finish_reason": chunk["choices"][0].get("finish_reason")
+                            }]
+                        }
+                        yield f"data: {json.dumps(formatted_chunk)}\n\n"
+                    except json.JSONDecodeError:
+                        print(f"Could not decode JSON from line: {line_data}")
+                        continue

-    # Extract the last user message as the query
-    user_query = request.messages[-1].content
-    user_role = request.messages[-1].role
+# --- API Endpoint ---
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    if request.model != MODEL_NAME:
+        raise HTTPException(status_code=400, detail=f"Model not supported. Please use '{MODEL_NAME}'.")

-    if user_role.lower() != 'user':
-        raise HTTPException(status_code=400, detail="The last message must be from the 'user'.")
+    user_query = request.messages[-1].content if request.messages else ""
+    if not user_query or request.messages[-1].role.lower() != 'user':
+        raise HTTPException(status_code=400, detail="The last message must be from the 'user' and contain content.")

     # 1. Perform Web Search
     search_results = await perform_web_search(user_query)
     formatted_results = format_search_results_for_prompt(search_results)
-
-    # 2. Construct the prompt for the external LLM
-    final_prompt = f"User Query: {user_query}\n\n{formatted_results}"
-
-    # 3. Call the external Inference API
-    headers = {
-        "Authorization": f"Bearer {INFERENCE_API_KEY}",
-        "Content-Type": "application/json",
-    }
-
-    # The payload for the external API uses our system prompt and the combined user query + search results
+
+    # 2. Construct prompt for the backend model
+    final_user_prompt = f"User's question: \"{user_query}\"\n\nBased ONLY on the provided search results below, answer the user's question.\n\n{formatted_results}"
+
+    # 3. Prepare payload for Inference API
     payload = {
-        "model": "meta-llama/llama-3.1-8b-instruct/fp-8",  # The actual model used by the inference API
+        "model": BACKEND_MODEL,
         "messages": [
             {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": final_prompt},
+            {"role": "user", "content": final_user_prompt},
         ],
         "max_tokens": request.max_tokens,
         "temperature": request.temperature,
+        "stream": request.stream,
     }

-    async with httpx.AsyncClient(timeout=60.0) as client:
-        try:
-            response = await client.post(INFERENCE_API_URL, json=payload, headers=headers)
-            response.raise_for_status()
-            model_response = response.json()
-
-            # 4. Format the response to be OpenAI API compliant
-            # This part depends on the exact structure of the inference API's response
-            # Assuming it's similar to OpenAI's, we extract the message content
-            generated_content = model_response["choices"][0]["message"]["content"]
-
-            api_response = {
-                "id": model_response.get("id", "chatcmpl-binglity-lite-123"),
-                "object": "chat.completion",
-                "created": model_response.get("created", 0),
-                "model": "Binglity-Lite",
-                "choices": [{
-                    "index": 0,
-                    "message": {
-                        "role": "assistant",
-                        "content": generated_content,
-                    },
-                    "finish_reason": "stop",
-                }],
-                "usage": model_response.get("usage", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}),
-            }
-            return api_response
-
-        except httpx.HTTPStatusError as e:
-            raise HTTPException(status_code=e.response.status_code, detail=f"Error from inference API: {e.response.text}")
-        except Exception as e:
-            raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
-
+    # 4. Handle streaming or single response
+    if request.stream:
+        return StreamingResponse(stream_response_generator(payload), media_type="text/event-stream")
+    else:
+        # Standard non-streaming request
+        headers = {"Authorization": f"Bearer {INFERENCE_API_KEY}"}
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            try:
+                response = await client.post(INFERENCE_API_URL, json=payload, headers=headers)
+                response.raise_for_status()
+                model_response = response.json()
+
+                # Format response to be OpenAI API compliant
+                return {
+                    "id": model_response.get("id", f"chatcmpl-{uuid.uuid4()}"),
+                    "object": "chat.completion",
+                    "created": model_response.get("created", int(time.time())),
+                    "model": MODEL_NAME,
+                    "choices": [{
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": model_response["choices"][0]["message"]["content"],
+                        },
+                        "finish_reason": "stop",
+                    }],
+                    "usage": model_response.get("usage", {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}),
+                }
+            except httpx.HTTPStatusError as e:
+                raise HTTPException(status_code=e.response.status_code, detail=f"Error from inference API: {e.response.text}")

 @app.get("/")
 def read_root():
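
For reference, a minimal client sketch of how the updated endpoint could be exercised once the app is running. It is not part of the commit: the base URL (http://localhost:8000), the helper names ask_blocking and ask_streaming, and the assumption that streamed deltas carry a "content" field are illustrative only.

# Hypothetical client sketch (not part of the commit). Assumes a local
# uvicorn instance serving this app at http://localhost:8000.
import json

import httpx

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment


def ask_blocking(question: str) -> str:
    # Non-streaming call: mirrors the else-branch of chat_completions.
    payload = {
        "model": "Binglity-Lite",
        "messages": [{"role": "user", "content": question}],
        "stream": False,
    }
    r = httpx.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=120.0)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]


def ask_streaming(question: str) -> None:
    # Streaming call: consumes the SSE chunks emitted by stream_response_generator.
    payload = {
        "model": "Binglity-Lite",
        "messages": [{"role": "user", "content": question}],
        "stream": True,
    }
    with httpx.stream("POST", f"{BASE_URL}/v1/chat/completions", json=payload, timeout=300.0) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            if not line.startswith("data:"):
                continue
            data = line[5:].strip()
            if data == "[DONE]":
                break
            chunk = json.loads(data)
            delta = chunk["choices"][0].get("delta", {})
            print(delta.get("content", ""), end="", flush=True)
    print()


if __name__ == "__main__":
    print(ask_blocking("Who maintains FastAPI?"))
    ask_streaming("Who maintains FastAPI?")

Per the validation added in this commit, the request must use model "Binglity-Lite" and its last message must have role "user"; anything else returns a 400.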