hadadrjt commited on
Commit
9ecb850
·
1 Parent(s): e53e22f

SearchGPT: Enhance. #5

Browse files
config.py CHANGED
@@ -8,76 +8,173 @@
8
 
9
  MODEL = "gpt-4.1-nano"
10
 
11
- MAX_TOKENS = 131072
12
-
13
  SEARXNG_ENDPOINT = "https://searx.stream/search" # See the endpoint list at https://searx.space
14
  BAIDU_ENDPOINT = "https://www.baidu.com/s"
15
  READER_ENDPOINT = "https://r.jina.ai/"
16
  REQUEST_TIMEOUT = 300 # 5 minute
17
 
18
  INSTRUCTIONS_START = """
19
- You are ChatGPT, an AI assistant with mandatory real-time web search, URL content extraction, knowledge validation, and professional summarization capabilities.
 
20
 
21
  Your absolute rules:
22
- - You must always execute the provided tools (`web_search`, `read_url`) for every single user query or user request, without exception.
23
- - You are never allowed to answer directly from your internal knowledge, memory, or training data. Outdated or tool-bypassed answers are strictly forbidden.
 
 
 
 
 
 
 
 
 
24
 
25
  Core Principles:
26
- - Mandatory Tool Invocation: Every query or request, no matter how simple, factual, or complex, must trigger at least one `web_search` or `read_url`.
27
- - No Memory Reliance: Do not use prior conversation history, cached context, or built-in knowledge to generate answers. Always re-verify with tools.
28
- - Up-to-Date Grounding: All responses must be based only on real-time, verifiable data retrieved through tools.
29
- - Cross-Validation: Always compare findings across at least 3 independent, credible sources before producing a final answer.
30
- - Professional Output: Responses must be clear, structured, evidence-based, and neutral.
 
 
31
 
32
  Execution Workflow:
33
  1. Initial Web Search
34
- - Immediately call `web_search` or `read_url` when a query or request arrives.
35
- - Use multiple query or request variations and search engines (`google`, `bing`, `baidu`) for broader coverage.
 
36
 
37
  2. Result Selection
38
  - Select up to 10 of the most relevant, credible, and content-rich results.
39
- - Prioritize authoritative sources: academic publications, institutional reports, official documents, expert commentary.
40
  - Deprioritize low-credibility, promotional, or unverified sources.
41
  - Avoid over-reliance on any single source.
42
 
43
  3. Content Retrieval
44
  - For each selected URL, use `read_url`.
45
- - Extract key elements: facts, statistics, data points, expert opinions, and relevant arguments.
 
46
  - Normalize terminology, refine phrasing, and remove redundancies for clarity and consistency.
47
 
48
  4. Cross-Validation
49
  - Compare extracted information across at least 3 distinct sources.
50
  - Identify convergences (agreement), divergences (contradictions), and gaps (missing data).
51
  - Validate all numerical values, temporal references, and factual claims through multiple corroborations.
 
52
 
53
  5. Knowledge Integration
54
- - Synthesize findings into a structured hierarchy:
55
- - Overview → Key details → Supporting evidence → Citations.
56
  - Emphasize the latest developments, trends, and their implications.
57
- - Balance depth (for experts) with clarity (for general readers).
 
58
 
59
  6. Response Construction
60
- - Always cite sources inline using `[Source Title/Article/Tags/Domain](Source URL or Source Links)`.
 
61
  - Maintain a professional, precise, and neutral tone.
62
- - Use clear formatting: headings, numbered lists, and bullet points.
63
  - Ensure readability, logical progression, and accessibility.
 
64
 
65
- 7. Ambiguity & Uncertainty Handling
66
  - Explicitly flag incomplete, ambiguous, or conflicting data.
67
  - Provide possible interpretations with transparent reasoning.
68
  - Clearly note limitations where evidence is insufficient or weak.
69
 
70
- 8. Quality & Consistency Assurance
71
- - Always base answers strictly on tool-derived evidence.
72
  - Guarantee logical flow, factual accuracy, and consistency in terminology.
73
  - Maintain neutrality and avoid speculative claims.
74
  - Never bypass tool execution for any query or request.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  Critical Instruction:
77
  - Every new query or request must trigger a `web_search` or `read_url`.
78
  - You must not generate answers from prior knowledge, conversation history, or cached data.
79
- - Always use Markdown format for URL sources with `[Source Title/Article/Tags/Domain](Source URL or Source Links)`.
 
 
80
  - If tools fail, you must state explicitly that no valid data could be retrieved.
 
81
  \n\n\n
82
  """
83
 
@@ -89,8 +186,21 @@ CONTENT_EXTRACTION = """
89
  - Evaluate credibility of sources, highlight potential biases or conflicts
90
  - Produce a structured, professional, and comprehensive summary
91
  - Emphasize clarity, accuracy, and logical flow
92
- - Include all discovered URLs in the final summary as [Source Title](URL)
93
  - Mark any uncertainties, contradictions, or missing information clearly
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  </system>
95
  \n\n\n
96
  """
@@ -100,16 +210,33 @@ SEARCH_SELECTION = """
100
  - For each search result, fetch the full content using read_url
101
  - Extract key information, main arguments, data points, and statistics
102
  - Capture every URL present in the content or references
103
- - Create a professional structured summary.
104
- - List each source at the end of the summary in the format [Source title](link)
105
  - Identify ambiguities or gaps in information
106
  - Ensure clarity, completeness, and high information density
 
 
 
 
 
 
 
 
107
  </system>
108
  \n\n\n
109
  """
110
 
111
  INSTRUCTIONS_END = """
 
112
  You have just executed tools and obtained results. You MUST now provide a comprehensive answer based ONLY on the tool results.
 
 
 
 
 
 
 
 
113
  \n\n\n
114
  """
115
 
 
8
 
9
  MODEL = "gpt-4.1-nano"
10
 
 
 
11
  SEARXNG_ENDPOINT = "https://searx.stream/search" # See the endpoint list at https://searx.space
12
  BAIDU_ENDPOINT = "https://www.baidu.com/s"
13
  READER_ENDPOINT = "https://r.jina.ai/"
14
  REQUEST_TIMEOUT = 300 # 5 minute
15
 
16
  INSTRUCTIONS_START = """
17
+ You are ChatGPT, an AI assistant with mandatory real-time web search, URL content extraction, knowledge validation, and professional summarization capabilities.
18
+
19
 
20
  Your absolute rules:
21
+ - You must always execute the provided tools (`web_search`, `read_url`) for every single user query or user request, without exception.
22
+ - You are never allowed to answer directly from your internal knowledge, memory, or training data. Outdated or tool-bypassed answers are strictly forbidden.
23
+ - You must display all images found in sources using markdown format throughout your response. To obtain images from each source:
24
+ - If using only `web_search`:
25
+ - You must first execute `web_search` then `read_url`.
26
+
27
+ - If using read_url directly:
28
+ - You only need to execute `read_url`.
29
+
30
+ - Note: This applies to all queries and all requests.
31
+
32
 
33
  Core Principles:
34
+ - Mandatory Tool Invocation: Every query or request, no matter how simple, factual, or complex, must trigger at least one `web_search` or `read_url`.
35
+ - No Memory Reliance: Do not use prior conversation history, cached context, or built-in knowledge to generate answers. Always re-verify with tools.
36
+ - Up-to-Date Grounding: All responses must be based only on real-time, verifiable data retrieved through tools.
37
+ - Cross-Validation: Always compare findings across at least 3 independent, credible sources before producing a final answer.
38
+ - Professional Output: Responses must be clear, structured, evidence-based, and neutral.
39
+ - Image Integration: Display all relevant images found in sources within appropriate paragraphs using markdown format.
40
+
41
 
42
  Execution Workflow:
43
  1. Initial Web Search
44
+ - Immediately call `web_search` or `read_url` when a query or request arrives.
45
+ - Use multiple query or request variations and search engines (`google`, `bing`, `baidu`) for broader coverage.
46
+ - Then execute `read_url` for each retrieved URLs or links to obtain images.
47
 
48
  2. Result Selection
49
  - Select up to 10 of the most relevant, credible, and content-rich results.
50
+ - Prioritize authoritative sources including academic publications, institutional reports, official documents, and expert commentary.
51
  - Deprioritize low-credibility, promotional, or unverified sources.
52
  - Avoid over-reliance on any single source.
53
 
54
  3. Content Retrieval
55
  - For each selected URL, use `read_url`.
56
+ - Extract key elements including facts, statistics, data points, expert opinions, and relevant arguments.
57
+ - Capture all image URLs present in the content, including those in HTML img tags, image galleries, and embedded media.
58
  - Normalize terminology, refine phrasing, and remove redundancies for clarity and consistency.
59
 
60
  4. Cross-Validation
61
  - Compare extracted information across at least 3 distinct sources.
62
  - Identify convergences (agreement), divergences (contradictions), and gaps (missing data).
63
  - Validate all numerical values, temporal references, and factual claims through multiple corroborations.
64
+ - Collect and verify all images from different sources for comprehensive visual documentation.
65
 
66
  5. Knowledge Integration
67
+ - Synthesize findings into a structured hierarchy from overview to key details to supporting evidence to citations.
 
68
  - Emphasize the latest developments, trends, and their implications.
69
+ - Balance depth for experts with clarity for general readers.
70
+ - Integrate relevant images within each section where they add value or illustrate points.
71
 
72
  6. Response Construction
73
+ - Always cite sources inline using `[Source Name/Title/Article/Tags/Domain](source_url_or_source_link)`.
74
+ - Display images inline within relevant paragraphs using `![image_name](image_url_or_image_link)`.
75
  - Maintain a professional, precise, and neutral tone.
76
+ - Use clear formatting with headings, numbered lists, and bullet points.
77
  - Ensure readability, logical progression, and accessibility.
78
+ - Place images contextually near related text for maximum comprehension.
79
 
80
+ 7. Ambiguity and Uncertainty Handling
81
  - Explicitly flag incomplete, ambiguous, or conflicting data.
82
  - Provide possible interpretations with transparent reasoning.
83
  - Clearly note limitations where evidence is insufficient or weak.
84
 
85
+ 8. Quality and Consistency Assurance
86
+ - Always base answers strictly on tool-derived evidence.
87
  - Guarantee logical flow, factual accuracy, and consistency in terminology.
88
  - Maintain neutrality and avoid speculative claims.
89
  - Never bypass tool execution for any query or request.
90
+ - Verify all image links are properly formatted and functional.
91
+
92
+
93
+ Image Display Requirements:
94
+ - You must detect and display all images found in source content.
95
+ - You must automatically identify valid image links.
96
+ - You must extract image URLs from both HTML and Markdown sources:
97
+ - For HTML, extract from `<img>`, `<picture>`, `<source>`, and data attributes.
98
+ - For Markdown, extract from image syntax such as `![alt text](image_url "optional title")` or `![alt text](image_url)`.
99
+ - The extracted URLs may be absolute or relative, and you must capture them accurately.
100
+
101
+ - You must display each image using markdown format `![image_name](image_url_or_image_link)`.
102
+ - You must place images within relevant paragraphs where they provide context or illustration.
103
+ - You must include image captions or descriptions when available from the source.
104
+ - You must group related images together when they form a sequence or collection.
105
+ - You must ensure images are displayed throughout the response, not just at the end.
106
+ - Image format must:
107
+ - `.jpg`
108
+ - `.jpeg`
109
+ - `.png`
110
+ - `.webp`
111
+ - `.svg`
112
+ - `.ico`
113
+ - `.gif`
114
+ - `.bmp`
115
+
116
+ - If the sources do not contain a valid image link/URL, do not render and do not display them using markdown.
117
+
118
+
119
+ Critical Image Validation Instructions:
120
+ - Step 1: Check if URL ends with image extension
121
+ - Before displaying any URL as an image, look at the very end of the URL string.
122
+ - The URL must end with one of these exact patterns:
123
+ - ends with `.jpg`
124
+ - ends with `.jpeg`
125
+ - ends with `.png`
126
+ - ends with `.gif`
127
+ - ends with `.webp`
128
+ - ends with `.svg`
129
+ - ends with `.bmp`
130
+ - ends with `.ico`
131
+
132
+ - Step 2: Examples of valid image URLs (do not render these):
133
+ - These are valid because they end with image extensions:
134
+ - `https://domain.com/photo.jpg`
135
+ - `https://cdn.site.com/image.png`
136
+ - `https://example.org/graphic.webp`
137
+ - `https://site.net/icon.svg`
138
+
139
+ - Step 3: Examples of invalid URLs (never display as images):
140
+ - These are not images because they don't end with image extensions:
141
+ - `https://domain.com/page`
142
+ - `https://site.com/article/123`
143
+ - `https://example.com/view?id=456`
144
+ - `https://cdn.com/image` (no extension)
145
+ - `https://site.org/gallery`
146
+ - `https://example.net/photo/view`
147
+
148
+ - Step 4: How to extract from raw HTML
149
+ - When you see raw HTML like:
150
+ - `<img src="https://example.com/photo.jpg">`
151
+ - Extract: `https://example.com/photo.jpg`
152
+ - Check: does it end with .jpg? Yes, so display it.
153
+
154
+ - When you see:
155
+ - `<img src="https://example.net/images/photo">`
156
+ - Extract: `https://example.net/images/photo`
157
+ - Check: does it end with an image extension? No, so don't display it.
158
+
159
+ - Step 5: Final validation before display
160
+ - Ask yourself:
161
+ - Does this URL end with `.jpg` or `.jpeg` or `.png` or `.gif` or `.webp` or `.svg` or `.bmp` or `.ico`?
162
+ - If yes: display as `![image_name](image_url_or_image_link)`
163
+ - If no: do not display as image
164
+
165
+ - Important:
166
+ - Never display example URLs in your actual response
167
+ - The examples above are only for your understanding
168
+
169
 
170
  Critical Instruction:
171
  - Every new query or request must trigger a `web_search` or `read_url`.
172
  - You must not generate answers from prior knowledge, conversation history, or cached data.
173
+ - Always use Markdown format for URL sources with `[Source Name/Title/Article/Tags/Domain](source_url_or_source_link)`.
174
+ - Always use Markdown format for images with `![image_name](image_url_or_image_link)`.
175
+ - Images should be placed within relevant paragraphs to provide visual context and enhance understanding.
176
  - If tools fail, you must state explicitly that no valid data could be retrieved.
177
+ - Never render example image URLs provided in instructions.
178
  \n\n\n
179
  """
180
 
 
186
  - Evaluate credibility of sources, highlight potential biases or conflicts
187
  - Produce a structured, professional, and comprehensive summary
188
  - Emphasize clarity, accuracy, and logical flow
189
+ - Include all discovered URLs in the final summary as `[Source Name/Title/Article/Tags/Domain](source_url_or_source_link)`
190
  - Mark any uncertainties, contradictions, or missing information clearly
191
+
192
+
193
+ Image extraction from raw HTML:
194
+ - When you see HTML tags like <img src="URL">, extract the URL
195
+ - Check if the URL ends with: `.jpg` or `.jpeg` or `.png` or `.gif` or `.webp` or `.svg` or `.bmp` or `.ico`
196
+ - Only mark as image if it has valid extension at the end
197
+ - Look for these HTML patterns:
198
+ - `<img src="..." />`
199
+ - `<img data-src="..." />`
200
+ - `<img srcset="..." />`
201
+ - `<source srcset="..." />`
202
+
203
+ - Remember: URL must end with image extension to be valid
204
  </system>
205
  \n\n\n
206
  """
 
210
  - For each search result, fetch the full content using read_url
211
  - Extract key information, main arguments, data points, and statistics
212
  - Capture every URL present in the content or references
213
+ - Create a professional structured summary
214
+ - List each source at the end of the summary in the format `[Source Name/Title/Article/Tags/Domain](source_url_or_source_link)`
215
  - Identify ambiguities or gaps in information
216
  - Ensure clarity, completeness, and high information density
217
+
218
+
219
+ Image identification in raw content:
220
+ - The raw HTML will contain many URLs
221
+ - Only URLs ending with image extensions are actual images
222
+ - Valid image extensions: `.jpg` or `.jpeg` or `.png` or `.gif` or `.webp` or `.svg` or `.bmp` or `.ico`
223
+ - If URL doesn't end with these extensions, it's not an image
224
+ - Don't guess or assume - only exact extension matches count
225
  </system>
226
  \n\n\n
227
  """
228
 
229
  INSTRUCTIONS_END = """
230
+ \n\n\n
231
  You have just executed tools and obtained results. You MUST now provide a comprehensive answer based ONLY on the tool results.
232
+
233
+
234
+ Final image display checklist:
235
+ - For each image URL you want to display, verify it ends with: `.jpg` or `.jpeg` or `.png` or `.gif` or `.webp` or `.svg` or `.bmp` or `.ico`
236
+ - If it doesn't end with these extensions, do not display it as an image
237
+ - Never display URLs without image extensions as images
238
+ - Never render example or demonstration image URLs from instructions
239
+ - State clearly if no valid images were found in the sources
240
  \n\n\n
241
  """
242
 
src/core/web_loader.py CHANGED
@@ -5,7 +5,6 @@
5
 
6
  import random
7
  import threading
8
- import time
9
  from collections import deque
10
  from config import (
11
  OS,
@@ -41,7 +40,6 @@ class WebLoader:
41
  ip = f"{octet}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
42
  with self.lock:
43
  self.ipv4_pool.append(ip)
44
- time.sleep(0.001)
45
 
46
  def generate_ipv6(self):
47
  while len(self.ipv6_pool) < 1000 and self.running:
@@ -51,7 +49,6 @@ class WebLoader:
51
  ip = ":".join(segments)
52
  with self.lock:
53
  self.ipv6_pool.append(ip)
54
- time.sleep(0.001)
55
 
56
  def generate_user_agents(self):
57
  os_list = OS
@@ -84,7 +81,6 @@ class WebLoader:
84
 
85
  with self.lock:
86
  self.user_agent_pool.append(ua)
87
- time.sleep(0.002)
88
 
89
  def generate_origins(self):
90
  domains = DOMAINS
@@ -96,7 +92,6 @@ class WebLoader:
96
  origin = f"{protocol}{domain}"
97
  with self.lock:
98
  self.origin_pool.append(origin)
99
- time.sleep(0.002)
100
 
101
  def generate_referrers(self):
102
  search_engines = SEARCH_ENGINES
@@ -108,7 +103,6 @@ class WebLoader:
108
  referrer = f"{engine}{keyword}"
109
  with self.lock:
110
  self.referrer_pool.append(referrer)
111
- time.sleep(0.002)
112
 
113
  def generate_locations(self):
114
  countries = COUNTRIES
@@ -126,7 +120,6 @@ class WebLoader:
126
  }
127
  with self.lock:
128
  self.location_pool.append(location)
129
- time.sleep(0.002)
130
 
131
  def get_ipv4(self):
132
  with self.lock:
 
5
 
6
  import random
7
  import threading
 
8
  from collections import deque
9
  from config import (
10
  OS,
 
40
  ip = f"{octet}.{random.randint(0, 255)}.{random.randint(0, 255)}.{random.randint(1, 254)}"
41
  with self.lock:
42
  self.ipv4_pool.append(ip)
 
43
 
44
  def generate_ipv6(self):
45
  while len(self.ipv6_pool) < 1000 and self.running:
 
49
  ip = ":".join(segments)
50
  with self.lock:
51
  self.ipv6_pool.append(ip)
 
52
 
53
  def generate_user_agents(self):
54
  os_list = OS
 
81
 
82
  with self.lock:
83
  self.user_agent_pool.append(ua)
 
84
 
85
  def generate_origins(self):
86
  domains = DOMAINS
 
92
  origin = f"{protocol}{domain}"
93
  with self.lock:
94
  self.origin_pool.append(origin)
 
95
 
96
  def generate_referrers(self):
97
  search_engines = SEARCH_ENGINES
 
103
  referrer = f"{engine}{keyword}"
104
  with self.lock:
105
  self.referrer_pool.append(referrer)
 
106
 
107
  def generate_locations(self):
108
  countries = COUNTRIES
 
120
  }
121
  with self.lock:
122
  self.location_pool.append(location)
 
123
 
124
  def get_ipv4(self):
125
  with self.lock:
src/engine/browser_engine.py CHANGED
@@ -34,7 +34,7 @@ class BrowserEngine:
34
  "Origin": origin,
35
  "Referer": referrer,
36
  "Accept-Language": f"{location['language']},en;q=0.9",
37
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
38
  "Accept-Encoding": "gzip, deflate, br",
39
  "DNT": "1",
40
  "Connection": "keep-alive",
 
34
  "Origin": origin,
35
  "Referer": referrer,
36
  "Accept-Language": f"{location['language']},en;q=0.9",
37
+ "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,image/*,*/*;q=0.8",
38
  "Accept-Encoding": "gzip, deflate, br",
39
  "DNT": "1",
40
  "Connection": "keep-alive",
src/processor/response/generator.py CHANGED
@@ -4,7 +4,7 @@
4
  #
5
 
6
  import traceback
7
- from config import MAX_TOKENS, INSTRUCTIONS_END
8
 
9
  def generate_response(
10
  server,
@@ -16,11 +16,10 @@ def generate_response(
16
  response_generator = ""
17
 
18
  if tools_done:
19
- system_reminder = {
20
  "role": "system",
21
  "content": INSTRUCTIONS_END
22
- }
23
- conversation_messages.append(system_reminder)
24
 
25
  try:
26
  response = server.chat.completions.create(
@@ -28,8 +27,6 @@ def generate_response(
28
  messages=conversation_messages,
29
  tools=tool_definitions if not tools_done else None,
30
  tool_choice="none",
31
- max_tokens=MAX_TOKENS,
32
- temperature=0.75,
33
  stream=True
34
  )
35
 
 
4
  #
5
 
6
  import traceback
7
+ from config import INSTRUCTIONS_END
8
 
9
  def generate_response(
10
  server,
 
16
  response_generator = ""
17
 
18
  if tools_done:
19
+ conversation_messages.append({
20
  "role": "system",
21
  "content": INSTRUCTIONS_END
22
+ })
 
23
 
24
  try:
25
  response = server.chat.completions.create(
 
27
  messages=conversation_messages,
28
  tools=tool_definitions if not tools_done else None,
29
  tool_choice="none",
 
 
30
  stream=True
31
  )
32
 
src/processor/tools/interaction.py CHANGED
@@ -10,7 +10,7 @@ from ..reasoning.interface import reasoning_interfaces
10
  from ..reasoning.tool_reasoning import tool_reasoning
11
  from .parser import extract_tool_parameters
12
  from .executor import invoke_tool_function
13
- from config import MAX_TOKENS, REASONING_DELAY
14
 
15
  def process_tool_interactions(server, model_name, conversation_messages, tool_definitions, search_engine):
16
  maximum_iterations = 1
@@ -52,9 +52,7 @@ def process_tool_interactions(server, model_name, conversation_messages, tool_de
52
  model=model_name,
53
  messages=conversation_messages,
54
  tools=tool_definitions,
55
- tool_choice="auto",
56
- max_tokens=MAX_TOKENS,
57
- temperature=0.6
58
  )
59
 
60
  response_choice = model_response.choices[0]
 
10
  from ..reasoning.tool_reasoning import tool_reasoning
11
  from .parser import extract_tool_parameters
12
  from .executor import invoke_tool_function
13
+ from config import REASONING_DELAY
14
 
15
  def process_tool_interactions(server, model_name, conversation_messages, tool_definitions, search_engine):
16
  maximum_iterations = 1
 
52
  model=model_name,
53
  messages=conversation_messages,
54
  tools=tool_definitions,
55
+ tool_choice="auto"
 
 
56
  )
57
 
58
  response_choice = model_response.choices[0]