AUXteam committed
Commit 58d52ab · verified · 1 Parent(s): 0c1d8cd

Upload folder using huggingface_hub

Files changed (2):
  1. app.py +153 -151
  2. requirements.txt +2 -1
app.py CHANGED
@@ -18,6 +18,7 @@ from typing import List, Dict, Any, Optional, Tuple, Generator
  import traceback
  import base64
  from transformers import AutoTokenizer
+ from gradio_client import Client as GradioClient
@@ -30,10 +31,10 @@ except ImportError:
  # ============================================================
  # Configuration
  # ============================================================
- MODEL_NAME = os.getenv("MODEL_NAME", "OpenResearcher/Nemotron-3-Nano-30B-A3B")
- REMOTE_API_BASE = os.getenv("REMOTE_API_BASE", "")
- SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")
- MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))  # Safe limit for ZeroGPU
+ MODEL_NAME = os.getenv("MODEL_NAME", "alias-fast")
+ REMOTE_API_BASE = os.getenv("REMOTE_API_BASE", "https://api.helmholtz-blablador.fz-juelich.de/v1")
+ BLABLADOR_API_KEY = os.getenv("BLABLADOR_API_KEY", "")
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))

  # ============================================================
  # System Prompt & Tools
@@ -153,14 +154,20 @@ TOOL_CONTENT = """
  # Browser Tool Implementation
  # ============================================================
  class SimpleBrowser:
-     """Browser tool using Serper API."""
+     """Browser tool using the victor/websearch Gradio API."""

-     def __init__(self, serper_key: str):
-         self.serper_key = serper_key
+     def __init__(self):
          self.pages: Dict[str, Dict] = {}
          self.page_stack: List[str] = []
          self.link_map: Dict[int, Dict] = {}  # Map from cursor ID (int) to {url, title}
          self.used_citations = []  # List of cursor IDs (int) in order of first appearance
+         try:
+             # victor/websearch is a public Space, but pass a token if one is available
+             hf_token = os.getenv("HF_TOKEN", "")
+             self.client = GradioClient("victor/websearch", hf_token=hf_token if hf_token else None)
+         except Exception as e:
+             print(f"Error initializing Gradio client: {e}")
+             self.client = None

      @property
      def current_cursor(self) -> int:
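For orientation, the Space call that the new `search()` wraps can be exercised on its own. A minimal sketch, assuming the public victor/websearch Space is reachable and exposes `/search_web` with the parameters this diff uses:

```python
# Minimal sketch of the underlying Space call; the endpoint name and kwargs
# are taken from search() in this diff, Space availability is an assumption.
from gradio_client import Client

client = Client("victor/websearch")
markdown_results = client.predict(
    query="helmholtz blablador",  # free-text web query
    search_type="search",
    num_results=4,
    api_name="/search_web",
)
# The Space returns a single markdown string; _parse_websearch_output
# (added below) splits it into per-result dicts.
print(markdown_results[:300])
```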
@@ -178,11 +185,8 @@ class SimpleBrowser:
          return self.used_citations.index(cursor)

      def get_page_info(self, cursor: int) -> Optional[Dict[str, str]]:
-         # Prioritize link_map as it stores search result metadata
          if cursor in self.link_map:
              return self.link_map[cursor]
-
-         # Fallback to page_stack for opened pages
          if 0 <= cursor < len(self.page_stack):
              url = self.page_stack[cursor]
              page = self.pages.get(url)
@@ -194,71 +198,104 @@ class SimpleBrowser:
          lines = text.split('\n')
          return '\n'.join(f"L{i + offset}: {line}" for i, line in enumerate(lines))

-     def _clean_links(self, results: List[Dict], query: str) -> Tuple[str, Dict[int, str]]:
-         link_map = {}
-         lines = []
-
-         for i, r in enumerate(results):
-             title = html.escape(r.get('title', 'No Title'))
-             url = r.get('link', r.get('url', ''))
-             snippet = html.escape(r.get('snippet', r.get('summary', '')))
-
-             try:
-                 domain = url.split('/')[2] if url else ''
-             except:
-                 domain = ''
-
-             try:
-                 domain = url.split('/')[2] if url else ''
-             except:
-                 domain = ''
-
-             self.link_map[i] = {'url': url, 'title': title}
-             link_map[i] = {'url': url, 'title': title}
-             link_text = f"【{i}†{title}†{domain}】" if domain else f"【{i}†{title}】"
-             lines.append(f"{link_text}")
-             lines.append(f" {snippet}")
-             lines.append("")
-
-         return '\n'.join(lines), link_map
-
-     async def search(self, query: str, topn: int = 10) -> str:
-         url = "https://google.serper.dev/search"
-         headers = {'X-API-KEY': self.serper_key, 'Content-Type': 'application/json'}
-         payload = json.dumps({"q": query, "num": topn})
-
-         async with httpx.AsyncClient() as client:
-             try:
-                 response = await client.post(url, headers=headers, data=payload, timeout=20.0)
-                 if response.status_code != 200:
-                     return f"Error: Search failed with status {response.status_code}"
-
-                 data = response.json()
-                 results = data.get("organic", [])
-                 if not results:
-                     return f"No results found for: '{query}'"
-
-                 content, new_link_map = self._clean_links(results, query)
-                 self.link_map.update(new_link_map)  # Merge new links
-                 pseudo_url = f"web-search://q={query}&ts={int(time.time())}"
-                 cursor = self.current_cursor + 1
-
-                 page_data = {
-                     'url': pseudo_url,
-                     'title': f"Search Results: {query}",
-                     'text': content,
-                     'urls': {str(k): v['url'] for k, v in new_link_map.items()}
-                 }
-                 self.pages[pseudo_url] = page_data
-                 self.page_stack.append(pseudo_url)
-
-                 header = f"{page_data['title']} ({pseudo_url})\n**viewing lines [0 - {len(content.split(chr(10)))-1}]**\n\n"
-                 body = self._format_line_numbers(content)
-
-                 return f"[{cursor}] {header}{body}"
-
-             except Exception as e:
-                 return f"Error during search: {str(e)}"
+     def _parse_websearch_output(self, output: str) -> List[Dict]:
+         results = []
+         # Split on the --- separator, handling variations in surrounding newlines
+         parts = re.split(r'\n---\n|^\s*---\s*$', output, flags=re.MULTILINE)
+         for part in parts:
+             part = part.strip()
+             if not part or "Successfully extracted content" in part:
+                 continue
+
+             title_match = re.search(r'## (.*)', part)
+             domain_match = re.search(r'\*\*Domain:\*\* (.*)', part)
+             url_match = re.search(r'\*\*URL:\*\* (.*)', part)
+
+             if title_match and url_match:
+                 title = title_match.group(1).strip()
+                 url = url_match.group(1).strip()
+                 domain = domain_match.group(1).strip() if domain_match else ""
+
+                 # Content starts after the metadata lines
+                 metadata_end = url_match.end()
+                 content = part[metadata_end:].strip()
+
+                 results.append({
+                     'title': title,
+                     'url': url,
+                     'domain': domain,
+                     'content': content
+                 })
+         return results
+
+     async def search(self, query: str, topn: int = 4) -> str:
+         if not self.client:
+             return "Error: Search client not initialized"
+
+         try:
+             # Call the Gradio API in a thread so the event loop is not blocked
+             loop = asyncio.get_event_loop()
+             result_str = await loop.run_in_executor(
+                 None,
+                 lambda: self.client.predict(
+                     query=query,
+                     search_type="search",
+                     num_results=topn,
+                     api_name="/search_web"
+                 )
+             )
+
+             results = self._parse_websearch_output(result_str)
+             if not results:
+                 return f"No results found for: '{query}'"
+
+             # Populate pages and link_map
+             new_link_map = {}
+             lines = []
+
+             for i, r in enumerate(results):
+                 title = r['title']
+                 url = r['url']
+                 domain = r['domain']
+                 content = r['content']
+
+                 # Create a snippet for the search result view
+                 snippet = content[:200].replace('\n', ' ') + "..."
+
+                 self.link_map[i] = {'url': url, 'title': title}
+                 new_link_map[i] = {'url': url, 'title': title}
+
+                 # Cache the full content so open() can serve it later
+                 self.pages[url] = {
+                     'url': url,
+                     'title': title,
+                     'text': content
+                 }
+
+                 link_text = f"【{i}†{title}†{domain}】" if domain else f"【{i}†{title}】"
+                 lines.append(f"{link_text}")
+                 lines.append(f" {snippet}")
+                 lines.append("")
+
+             formatted_content = '\n'.join(lines)
+             pseudo_url = f"web-search://q={query}&ts={int(time.time())}"
+             cursor = self.current_cursor + 1
+
+             self.pages[pseudo_url] = {
+                 'url': pseudo_url,
+                 'title': f"Search Results: {query}",
+                 'text': formatted_content,
+                 'urls': {str(k): v['url'] for k, v in new_link_map.items()}
+             }
+             self.page_stack.append(pseudo_url)
+
+             header = f"Search Results: {query} ({pseudo_url})\n**viewing lines [0 - {len(formatted_content.split(chr(10)))-1}]**\n\n"
+             body = self._format_line_numbers(formatted_content)
+
+             return f"[{cursor}] {header}{body}"
+
+         except Exception as e:
+             return f"Error during search: {str(e)}"

      async def open(self, id: int | str = -1, cursor: int = -1, loc: int = -1, num_lines: int = -1, **kwargs) -> str:
          target_url = None
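To make the parsing contract concrete: `_parse_websearch_output` assumes results formatted as `## Title`, `**Domain:** …`, `**URL:** …` blocks separated by `---`. A hypothetical sample, run in the app's context where `SimpleBrowser` is defined (the sample text is illustrative, not captured Space output):

```python
# Hypothetical input in the shape _parse_websearch_output expects;
# real victor/websearch output may differ.
sample = """## Example Page
**Domain:** example.com
**URL:** https://example.com/page

Body text of the first result.
---
## Second Page
**Domain:** example.org
**URL:** https://example.org/other

Body text of the second result."""

browser = SimpleBrowser()  # __init__ swallows client errors, so this also works offline
for r in browser._parse_websearch_output(sample):
    print(r['title'], '->', r['url'])
# Example Page -> https://example.com/page
# Second Page -> https://example.org/other
```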
@@ -288,50 +325,23 @@ class SimpleBrowser:
          if not target_url:
              return "Error: Could not determine target URL"

-         headers = {'X-API-KEY': self.serper_key, 'Content-Type': 'application/json'}
-         payload = json.dumps({"url": target_url})
-
-         async with httpx.AsyncClient() as client:
-             try:
-                 response = await client.post("https://scrape.serper.dev/", headers=headers, data=payload, timeout=30.0)
-                 if response.status_code != 200:
-                     return f"Error fetching URL: {response.status_code}"
-
-                 data = response.json()
-                 text = data.get("text", "")
-                 title = data.get("metadata", {}).get("title", "") if isinstance(data.get("metadata"), dict) else ""
-
-                 if not text:
-                     return f"No content found at URL"
-
-                 lines = text.split('\n')
-                 content = '\n'.join(lines)
-
-                 max_lines = 150
-                 if len(lines) > max_lines:
-                     content = '\n'.join(lines[:max_lines]) + "\n\n...(content truncated)..."
-
-                 new_cursor = self.current_cursor + 1
-                 page_data = {
-                     'url': target_url,
-                     'title': title or target_url,
-                     'text': content,
-                     'urls': {}
-                 }
-                 self.pages[target_url] = page_data
-                 self.page_stack.append(target_url)
-
-                 start = max(0, loc) if loc >= 0 else 0
-                 display_lines = content.split('\n')
-                 end = min(len(display_lines), start + num_lines) if num_lines > 0 else len(display_lines)
-
-                 header = f"{title or target_url} ({target_url})\n**viewing lines [{start} - {end-1}] of {len(display_lines)-1}**\n\n"
-                 body = self._format_line_numbers('\n'.join(display_lines[start:end]), offset=start)
-
-                 return f"[{new_cursor}] {header}{body}"
-
-             except Exception as e:
-                 return f"Error fetching URL: {str(e)}"
+         # Check if we already have the page content cached
+         if target_url in self.pages:
+             page = self.pages[target_url]
+             text = page['text']
+             lines = text.split('\n')
+
+             new_cursor = self.current_cursor + 1
+             self.page_stack.append(target_url)
+
+             start = max(0, loc) if loc >= 0 else 0
+             end = min(len(lines), start + num_lines) if num_lines > 0 else len(lines)
+
+             header = f"{page['title']} ({target_url})\n**viewing lines [{start} - {end-1}] of {len(lines)-1}**\n\n"
+             body = self._format_line_numbers('\n'.join(lines[start:end]), offset=start)
+             return f"[{new_cursor}] {header}{body}"
+
+         return f"Error: Content for {target_url} not found in search results. The current search API only provides content for pages returned in search results."

      def find(self, pattern: str, cursor: int = -1) -> str:
          if not self.page_stack:
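The windowing arithmetic in the new cached-page branch is worth a worked example: with `loc=3` and `num_lines=4` on a 10-line page, the view covers lines 3 through 6. A minimal check of that slice logic, lifted from the branch above:

```python
# Worked example of the loc/num_lines windowing used by open().
lines = [f"line {i}" for i in range(10)]  # a 10-line cached page

loc, num_lines = 3, 4
start = max(0, loc) if loc >= 0 else 0
end = min(len(lines), start + num_lines) if num_lines > 0 else len(lines)

assert (start, end) == (3, 7)  # header reports "viewing lines [3 - 6] of 9"
print('\n'.join(f"L{i + start}: {line}" for i, line in enumerate(lines[start:end])))
```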
@@ -389,10 +399,12 @@ tokenizer = None
  def load_tokenizer():
      global tokenizer
      if tokenizer is None:
-         print(f"Loading tokenizer: {MODEL_NAME}")
+         # We use Nemotron as a proxy tokenizer for token counting
+         token_model = "OpenResearcher/Nemotron-3-Nano-30B-A3B"
+         print(f"Loading tokenizer: {token_model}")
          try:
              tokenizer = AutoTokenizer.from_pretrained(
-                 MODEL_NAME,
+                 token_model,
                  trust_remote_code=True
              )
              print("Tokenizer loaded successfully!")
@@ -765,7 +777,7 @@ def render_tool_result(result: str, fn_name: str) -> str:
      if len(result) > max_length:
          formatted_result = formatted_result[:max_length] + '<br><br><em style="color: #9ca3af;">...(content truncated for display)...</em>'

-     return f'''<div class="result-card-expanded" style="border-left: 3px solid {border_color};">
+     return f'''<div class="result-card-expanded" style="border-left: 3px solid {border_color};">
          <div class="result-header-expanded">{tool_label}</div>
          <div class="result-content-expanded" style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', sans-serif; line-height: 1.7; color: #374151;">{title_html}{formatted_result}</div>
      </div>'''
@@ -789,20 +801,31 @@ def render_user_message(question: str) -> str:


  # ============================================================
- # Remote API Generation (via vLLM-compatible endpoint)
+ # Remote API Generation (via OpenAI-compatible endpoint)
  # ============================================================

+ def count_tokens(text: str) -> int:
+     """Count tokens in text using the loaded tokenizer."""
+     try:
+         tok = load_tokenizer()
+         return len(tok.encode(text))
+     except Exception:
+         # Fall back to a rough estimate if the tokenizer fails
+         return len(text) // 4
+
  async def generate_response(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
-     """Generate response using vLLM OpenAI-compatible API."""
-     # Use /completions endpoint for raw prompt
+     """Generate response using an OpenAI-compatible API with model switching."""
+     # Choose the model based on prompt length
+     prompt_tokens = count_tokens(prompt)
+     selected_model = "alias-large" if prompt_tokens > 4000 else "alias-fast"
+
      url = f"{REMOTE_API_BASE}/completions"
      headers = {
          "Content-Type": "application/json",
-         "ngrok-skip-browser-warning": "true",  # Bypass the browser warning page on ngrok's free tier
+         "Authorization": f"Bearer {BLABLADOR_API_KEY}"
      }
      payload = {
-         "model": MODEL_NAME,
+         "model": selected_model,
          "prompt": prompt,
          "max_tokens": max_new_tokens,
          "temperature": 0.7,
@@ -814,7 +837,7 @@ async def generate_response(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
          response = await client.post(url, json=payload, headers=headers, timeout=300.0)

          if response.status_code != 200:
-             raise Exception(f"vLLM API error {response.status_code}: {response.text}")
+             raise Exception(f"LLM API error {response.status_code}: {response.text}")

          data = response.json()
          return data["choices"][0]["text"]
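Put together, the provider switch amounts to: count prompt tokens, pick a Blablador alias, POST to `/completions` with a Bearer key. A standalone sketch under those assumptions (the alias names and 4000-token threshold come from the diff; the endpoint must accept raw-prompt completions):

```python
# Standalone sketch of the routing + completion call introduced above.
# Assumes BLABLADOR_API_KEY is set and the base URL serves /v1/completions.
import os
import httpx

API_BASE = "https://api.helmholtz-blablador.fz-juelich.de/v1"

def pick_model(prompt: str, threshold: int = 4000) -> str:
    # Rough length-based estimate; the app uses the Nemotron tokenizer instead.
    return "alias-large" if len(prompt) // 4 > threshold else "alias-fast"

def complete(prompt: str) -> str:
    payload = {"model": pick_model(prompt), "prompt": prompt,
               "max_tokens": 256, "temperature": 0.7}
    headers = {"Content-Type": "application/json",
               "Authorization": f"Bearer {os.environ['BLABLADOR_API_KEY']}"}
    r = httpx.post(f"{API_BASE}/completions", json=payload, headers=headers, timeout=300.0)
    r.raise_for_status()
    return r.json()["choices"][0]["text"]
```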
@@ -825,7 +848,6 @@ async def generate_response(prompt: str, max_new_tokens: int = MAX_NEW_TOKENS) -> str:
  # ============================================================
  async def run_agent_streaming(
      question: str,
-     serper_key: str,
      max_rounds: int
  ) -> Generator[str, None, None]:
      global tokenizer
@@ -834,14 +856,6 @@ async def run_agent_streaming(
          yield "<p style='color: var(--body-text-color-subdued); text-align: center; padding: 2rem;'>Please enter a question to begin.</p>"
          return

-     if not serper_key:
-         yield """<div class="error-message">
-         <p><strong>Serper API Key Required</strong></p>
-         <p>Please configure your Serper API Key in the left sidebar under <strong>Settings</strong>.</p>
-         <p>Don't have an API key? <a href="https://serper.dev/" target="_blank" style="color: #667eea; text-decoration: underline;">Get one here →</a></p>
-         </div>"""
-         return
-
      # Load tokenizer for prompt formatting
      try:
          load_tokenizer()
@@ -849,7 +863,7 @@ async def run_agent_streaming(
          yield f"<p style='color:#dc2626;'>Error loading tokenizer: {html.escape(str(e))}</p>"
          return

-     browser = SimpleBrowser(serper_key)
+     browser = SimpleBrowser()
      tools = json.loads(TOOL_CONTENT)

      system_prompt = DEVELOPER_CONTENT + f"\n\nToday's date: {datetime.now().strftime('%Y-%m-%d')}"
@@ -886,7 +900,7 @@ async def run_agent_streaming(
          html_parts.append('<div class="thinking-streaming">Processing...</div>')
          yield ''.join(html_parts)

-         # Call ZeroGPU function
+         # Call generation function
          generated = await generate_response(prompt, max_new_tokens=MAX_NEW_TOKENS)

          # Remove placeholder
@@ -947,7 +961,7 @@ async def run_agent_streaming(
              result = ""
              try:
                  if actual_fn == "search":
-                     result = await browser.search(args.get("query", ""), args.get("topn", 10))
+                     result = await browser.search(args.get("query", ""), args.get("topn", 4))
                  elif actual_fn == "open":
                      result = await browser.open(**args)
                  elif actual_fn == "find":
@@ -2344,15 +2358,13 @@ def create_interface():
      """

      with gr.Blocks(css=INLINE_CSS, theme=gr.themes.Soft(), js=CAROUSEL_JS) as demo:
-         # Header with logo and title images - convert to base64 for proper rendering
-         # Files are in the same directory as app.py (test1/)
+         # Header with logo and title images
          logo_path = os.path.join(script_dir, "or-logo1.png")
          title_path = os.path.join(script_dir, "openresearcher-title.svg")

          logo_base64 = image_to_base64(logo_path)
          title_base64 = image_to_base64(title_path)

-         # Build header HTML with base64 images
          header_html = f"""
          <div style="
              text-align: center;
@@ -2406,16 +2418,6 @@ def create_interface():
              <span class="settings-title">⚙️ Settings</span>
          </div>
          ''')
-         serper_input = gr.Textbox(
-             label="",
-             value=SERPER_API_KEY,
-             type="password",
-             placeholder="Enter your Serper API key...",
-             show_label=False,
-             elem_id="serper-api-input",
-             container=False,
-             visible=False
-         )
          max_rounds_input = gr.Slider(
              minimum=1,
              maximum=200,
@@ -2513,7 +2515,7 @@ def create_interface():
          clear_btn = gr.Button("🗑 Clear", scale=1)

          # Function to hide welcome and show output
-         async def start_research(question, serper_key, max_rounds):
+         async def start_research(question, max_rounds):
              # Generator that first hides welcome, then streams results
              # Also clears the input box for the next question
@@ -2521,13 +2523,13 @@ def create_interface():
              # IMPORTANT: Don't use empty string for output, or JS will hide the output area!
              yield "", '<div style="text-align: center; padding: 2rem; color: #6b7280;">Delving into it...</div>', ""

-             async for result in run_agent_streaming(question, serper_key, max_rounds):
+             async for result in run_agent_streaming(question, max_rounds):
                  yield "", result, ""

          # Event handlers
          submit_event = submit_btn.click(
              fn=start_research,
-             inputs=[question_input, serper_input, max_rounds_input],
+             inputs=[question_input, max_rounds_input],
              outputs=[welcome_html, output_area, question_input],
              show_progress="hidden",
              concurrency_limit=20
@@ -2535,7 +2537,7 @@ def create_interface():

          question_input.submit(
              fn=start_research,
-             inputs=[question_input, serper_input, max_rounds_input],
+             inputs=[question_input, max_rounds_input],
              outputs=[welcome_html, output_area, question_input],
              show_progress="hidden",
              concurrency_limit=20
@@ -2563,7 +2565,7 @@

  if __name__ == "__main__":
      print("="*60)
-     print("OpenResearcher DeepSearch Agent - ZeroGPU Space")
+     print("OpenResearcher DeepSearch Agent - Helmholtz Blablador Provider")
      print("="*60)
      demo = create_interface()
      demo.queue(default_concurrency_limit=20).launch()
requirements.txt CHANGED
@@ -8,4 +8,5 @@ bitsandbytes
  sentencepiece
  protobuf
  json5
- accelerate
+ accelerate
+ gradio_client