Ksjsjjdj committed
Commit 950252a · verified · Parent(s): 4cb338c

Update app.py

Files changed (1): app.py (+202, -80)
app.py CHANGED
@@ -3,6 +3,7 @@ import re
 import gc
 import sys
 import time
+import json
 import queue
 import random
 import asyncio
@@ -36,6 +37,8 @@ GPU_LOCK = asyncio.Lock()
 class ChatMessage(BaseModel):
     role: str = Field()
     content: str = Field()
+    name: Optional[str] = Field(None)
+    tool_call_id: Optional[str] = Field(None)

 class Logprob(BaseModel):
     token: str
@@ -76,6 +79,15 @@ class ChatCompletionChunk(BaseModel):
     choices: List[ChatCompletionChoice]
     usage: Optional[Usage]

+class ToolFunction(BaseModel):
+    name: str
+    description: str
+    parameters: Dict[str, Any]
+
+class Tool(BaseModel):
+    type: Literal["function"] = "function"
+    function: ToolFunction
+
 def remove_nested_think_tags_stack(text):
     stack = []
     result = ""
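The hunk shows only the head of remove_nested_think_tags_stack; its body sits outside the diff context. A minimal sketch of a stack-based stripper of this shape (an assumption about the body, hence the _sketch suffix; the real code in app.py may differ):

# Hedged sketch: drop (possibly nested) <think>...</think> spans, keeping only
# the text that sits outside every open tag.
def remove_nested_think_tags_stack_sketch(text: str) -> str:
    stack = []
    result = ""
    i = 0
    while i < len(text):
        if text.startswith("<think>", i):
            stack.append(i)              # entering a (possibly nested) think block
            i += len("<think>")
        elif text.startswith("</think>", i) and stack:
            stack.pop()                  # closing the innermost open block
            i += len("</think>")
        else:
            if not stack:                # emit only characters outside all blocks
                result += text[i]
            i += 1
    return result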
@@ -106,7 +118,17 @@ def cleanMessages(messages: List[ChatMessage], removeThinkingContent: bool = Fal
         role_str = message.role.strip().lower().capitalize()
         if role_str == 'Assistant' and removeThinkingContent:
             content = remove_nested_think_tags_stack(content)
-        promptStrList.append(f"{role_str}: {content}")
+
+        if message.role == "tool":
+            promptStrList.append(f"Tool Output ({message.name}): {content}")
+        elif message.role == "system":
+            promptStrList.append(f"System: {content}")
+        elif message.role == "user":
+            promptStrList.append(f"User: {content}")
+        elif message.role == "assistant":
+            promptStrList.append(f"Assistant: {content}")
+        else:
+            promptStrList.append(f"{role_str}: {content}")
     return "\n\n".join(promptStrList)

 class SamplerConfig(BaseModel):
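With the new role branches, an OpenAI-style history containing a tool result flattens into the RWKV prompt as follows (an illustrative call against the ChatMessage and cleanMessages defined in this file):

# How cleanMessages renders the new "tool" role alongside a user turn.
msgs = [
    ChatMessage(role="user", content="What is RWKV?"),
    ChatMessage(role="tool", content="RWKV is an RNN with transformer-level quality.", name="google_search"),
]
print(cleanMessages(msgs))
# User: What is RWKV?
#
# Tool Output (google_search): RWKV is an RNN with transformer-level quality.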
@@ -252,6 +274,8 @@ class ChatCompletionRequest(BaseModel):
     stream: Optional[bool] = Field(default=False)
     stop: Optional[List[str]] = Field(["\n\n"])
     stop_tokens: Optional[List[int]] = Field([0])
+    tools: Optional[List[Tool]] = Field(default=None)
+    tool_choice: Optional[Union[str, Dict]] = Field(default="auto")

     @model_validator(mode="before")
     @classmethod
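The two new fields accept OpenAI-style tool declarations, so a request can now carry a schema such as this (field values are illustrative):

# Declaring a tool with the ToolFunction/Tool models added earlier in this diff.
tool = Tool(function=ToolFunction(
    name="google_search",
    description="Search the web for fresh information.",
    parameters={"type": "object", "properties": {"query": {"type": "string"}}},
))
print(tool.model_dump_json())  # "type" defaults to "function"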
@@ -261,57 +285,105 @@ class ChatCompletionRequest(BaseModel):
             raise ValueError("messages and prompt cannot coexist.")
         return data

-class TruthProtocol:
-    STRICT_SYSTEM_PROMPT = """
-CRITICAL PROTOCOL: OBJECTIVE_REALITY_ONLY
-1. NEUTRALITY: Abandon all bias, opinions, and emotional language.
-2. FACTS ONLY: State verified facts. If data is missing, say [NO DATA]. Do not guess.
-3. NO HALLUCINATIONS: Do not invent dates, names, or events.
-4. LABELS: Tag assertions with [VERIFIED] or [UNCERTAIN].
-5. TONE: Robotic, precise, dense. No pleasantries.
+class ToolEngine:
+    TOOL_SYSTEM_PROMPT = """
+CAPABILITY: You have access to real-time tools.
+INSTRUCTION: To use a tool, output exactly: <call>tool_name("argument")</call>
+Do not describe the tool, just call it. After the System provides the result, synthesize the answer.
+
+AVAILABLE TOOLS:
+1. google_search(query): Searches Google and DuckDuckGo for real-time information.
+2. visit_page(url): Accesses a specific link, reads the text, and finds sub-links.
 """.strip()

     @staticmethod
-    def enforce_truth_params(request: ChatCompletionRequest):
-        request.temperature = 0.12
-        request.top_p = 0.1
-        request.count_penalty = 1.1
-        request.presence_penalty = 0.6
-        request.penalty_decay = 0.996
+    def google_search_request(query: str) -> str:
+        try:
+            headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
+            resp = requests.get("https://www.google.com/search", params={"q": query, "gl": "us", "hl": "en"}, headers=headers, timeout=6)
+
+            if resp.status_code != 200: raise Exception("Google blocked request")
+
+            clean_text = re.sub(r'<script.*?>.*?</script>', '', resp.text, flags=re.DOTALL)
+            clean_text = re.sub(r'<style.*?>.*?</style>', '', clean_text, flags=re.DOTALL)
+
+            headings = re.findall(r'<h3.*?>(.*?)</h3>', clean_text)
+            links = re.findall(r'<a href="/url\?q=(.*?)&', clean_text)
+
+            limit = min(len(headings), len(links), 5)
+            output = "Google Results:\n"
+            for i in range(limit):
+                output += f"{i+1}. {re.sub(r'<.*?>', '', headings[i])} - Link: {links[i]}\n"
+
+            if not headings:
+                return ToolEngine.duckduckgo_fallback(query)
+
+            return output
+        except:
+            return ToolEngine.duckduckgo_fallback(query)
+
+    @staticmethod
+    def duckduckgo_fallback(query: str) -> str:
+        try:
+            if HAS_DDG:
+                res = DDGS().text(query, max_results=5)
+                return "\n".join([f"- {r['title']}: {r['body']} ({r['href']})" for r in res])
+
+            resp = requests.get("https://html.duckduckgo.com/html/", params={"q": query}, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
+            titles = re.findall(r'<a class="result__a"[^>]*>(.*?)</a>', resp.text)
+            snippets = re.findall(r'<a class="result__snippet"[^>]*>(.*?)</a>', resp.text)
+
+            limit = min(len(titles), len(snippets), 4)
+            out = "DuckDuckGo HTML Results:\n"
+            for i in range(limit):
+                t = re.sub(r'<.*?>', '', titles[i]).strip()
+                s = re.sub(r'<.*?>', '', snippets[i]).strip()
+                out += f"{i+1}. {t}: {s}\n"
+            return out
+        except Exception as e:
+            return f"Search failed: {str(e)}"
+
+    @staticmethod
+    def visit_page(url: str) -> str:
+        try:
+            headers = {"User-Agent": "Mozilla/5.0 (compatible; RWKV-Bot/1.0)"}
+            resp = requests.get(url, headers=headers, timeout=8)
+            resp.encoding = resp.apparent_encoding
+
+            text = re.sub(r'<head.*?>.*?</head>', '', resp.text, flags=re.DOTALL)
+            text = re.sub(r'<script.*?>.*?</script>', '', text, flags=re.DOTALL)
+            text = re.sub(r'<style.*?>.*?</style>', '', text, flags=re.DOTALL)
+            text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
+            text = re.sub(r'<[^>]+>', ' ', text)
+            text = re.sub(r'\s+', ' ', text).strip()
+
+            links = re.findall(r'href=["\'](http[s]?://[^"\']+)["\']', resp.text)
+            unique_links = list(set(links))[:5]
+
+            content_preview = text[:3000] + ("..." if len(text) > 3000 else "")
+
+            return f"PAGE CONTENT ({url}):\n{content_preview}\n\nFOUND SUB-LINKS:\n" + "\n".join(unique_links)
+        except Exception as e:
+            return f"Error visiting page: {str(e)}"

     @staticmethod
-    def sanitise_search(query: str, results: List[dict]) -> str:
-        context = "RAW DATA STREAM (IGNORE OPINIONS, EXTRACT FACTS):\n"
-        for i, res in enumerate(results):
-            clean_body = res['body'].replace("\n", " ").strip()
-            context += f"SOURCE [{i+1}]: {clean_body} (Origin: {res['title']})\n"
-        return context
-
-search_cache = collections.OrderedDict()
-
-def search_facts(query: str) -> str:
-    if not HAS_DDG: return ""
-    if query in search_cache: return search_cache[query]
-    try:
-        ddgs = DDGS()
-        results = ddgs.text(query, max_results=4)
-        if any(x in query.lower() for x in ["verdad", "fake", "cierto", "mentira"]):
-            check = ddgs.text(f"{query} fact check verified", max_results=2)
-            if check: results.extend(check)
-        if not results: return ""
-        ctx = TruthProtocol.sanitise_search(query, results)
-        if len(search_cache) > 50: search_cache.popitem(last=False)
-        search_cache[query] = ctx
-        return ctx
-    except:
-        return ""
-
-def needs_verification(msg: str, model: str) -> bool:
-    if ":online" in model: return True
-    triggers = ["es verdad", "dato", "precio", "cuando", "quien", "noticia", "actualidad", "verify"]
-    return any(t in msg.lower() for t in triggers)
-
-app = FastAPI(title="RWKV Ultimate Server")
+    def execute(call_str: str) -> str:
+        try:
+            match = re.match(r'(\w+)\(["\'](.*?)["\']\)', call_str)
+            if not match: return "Invalid tool call syntax."
+
+            func, arg = match.groups()
+
+            if func == "google_search":
+                return ToolEngine.google_search_request(arg)
+            elif func == "visit_page":
+                return ToolEngine.visit_page(arg)
+            else:
+                return f"Unknown tool: {func}"
+        except Exception as e:
+            return f"Tool execution error: {e}"
+
+app = FastAPI(title="RWKV Ultimate Agent Server")

 app.add_middleware(
     CORSMiddleware,
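The dispatch regex in ToolEngine.execute only matches calls of the form name("single argument") with one quoted parameter (single or double quotes). A stand-alone check of how it parses:

import re

# The exact pattern used by ToolEngine.execute above.
CALL_RE = r'(\w+)\(["\'](.*?)["\']\)'

print(re.match(CALL_RE, 'google_search("rwkv v7 paper")').groups())
# ('google_search', 'rwkv v7 paper')

# Unquoted or multi-argument calls do not match, so execute() returns
# "Invalid tool call syntax." for them:
print(re.match(CALL_RE, 'visit_page(https://example.com)'))  # None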
@@ -374,6 +446,7 @@ def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model
     cache_word_list = []

     stop_sequences = request.stop if request.stop else []
+    stop_sequences.append("<call>")

     for i in range(max_tokens):
         for n in occurrence: out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
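One subtlety in the new line: when request.stop is non-empty, stop_sequences aliases it, so the append mutates the request object and accumulates one "<call>" per agent step in the loop introduced below. A defensive copy avoids that (a suggested variant, not what the commit does):

# Suggested variant: copy first so request.stop is never mutated in place.
stop_sequences = list(request.stop or [])
stop_sequences.append("<call>")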
@@ -396,38 +469,92 @@ def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model
             out_last = i + 1

         current_buffer = "".join(cache_word_list)
+
+        if "<call>" in current_buffer:
+            pre_call = current_buffer.split("<call>")[0]
+            yield {"content": pre_call, "finish_reason": "tool_start", "state": model_state}
+            del out; gc.collect(); return
+
         for s in stop_sequences:
-            if s in current_buffer:
+            if s in current_buffer and s != "<call>":
                 final_content = current_buffer.split(s)[0]
                 yield {"content": final_content, "finish_reason": "stop", "state": model_state}
                 del out; gc.collect(); return

-        if len(cache_word_list) > 1:
+        if len(cache_word_list) > 2:
             yield {"content": cache_word_list.pop(0), "finish_reason": None}

     yield {"content": "".join(cache_word_list), "finish_reason": "length"}

 async def chatResponseStream(request: ChatCompletionRequest, model_state: any, completionId: str, enableReasoning: bool):
-    clean_msg = cleanMessages(request.messages, enableReasoning)
-    prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
-
-    async with GPU_LOCK:
-        try:
-            out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
-
-            yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"
-
-            for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
-                content = chunk.get("content", "")
-                finish = chunk.get("finish_reason", None)
-                if content:
-                    yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=content), finish_reason=None)]).model_dump_json()}\n\n"
-                if finish:
-                    yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=''), finish_reason=finish)]).model_dump_json()}\n\n"
-                    break
-                await asyncio.sleep(0)
-        finally:
-            pass
+    current_messages = request.messages
+
+    for step in range(4):
+        clean_msg = cleanMessages(current_messages, enableReasoning)
+        prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
+
+        tool_buffer = ""
+        tool_call_mode = False
+
+        async with GPU_LOCK:
+            try:
+                out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
+
+                if step == 0:
+                    yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"
+
+                for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
+                    content = chunk.get("content", "")
+                    finish = chunk.get("finish_reason", None)
+
+                    if finish == "tool_start":
+                        tool_call_mode = True
+                        if content:
+                            yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=content), finish_reason=None)]).model_dump_json()}\n\n"
+                        break
+
+                    if content:
+                        yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=content), finish_reason=None)]).model_dump_json()}\n\n"
+
+                    if finish:
+                        yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=''), finish_reason=finish)]).model_dump_json()}\n\n"
+                        return
+
+            finally:
+                pass
+
+        if tool_call_mode:
+            full_tool_call = ""
+
+            async with GPU_LOCK:
+                try:
+                    tool_out, tool_tokens, tool_state = await runPrefill(request, "", [0], model_state)
+                    temp_tokens = []
+
+                    current_gen = ""
+
+                    for i in range(200):
+                        args = PIPELINE_ARGS(temperature=0.1, top_p=0.1)
+                        tool_token = MODEL_STORAGE[request.model].pipeline.sample_logits(tool_out, temperature=0.1, top_p=0.1)
+                        tool_out, tool_state = MODEL_STORAGE[request.model].model.forward([tool_token], tool_state)
+
+                        char = MODEL_STORAGE[request.model].pipeline.decode([tool_token])
+                        current_gen += char
+
+                        if "</call>" in current_gen:
+                            full_tool_call = current_gen.split("</call>")[0]
+                            break
+                finally:
+                    pass
+
+            if full_tool_call:
+                result = ToolEngine.execute(full_tool_call)
+                current_messages.append(ChatMessage(role="assistant", content=f"<call>{full_tool_call}</call>"))
+                current_messages.append(ChatMessage(role="tool", content=result, name="system"))
+            else:
+                break
+        else:
+            break

     yield "data: [DONE]\n\n"
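Stripped of streaming, locking, and state plumbing, the new chatResponseStream is a bounded tool loop: generate until a <call> marker appears, execute the call, feed the result back as a tool message, and re-prompt. A distilled sketch (run_model and flatten are hypothetical stand-ins for the prefill/generate and cleanMessages machinery):

# Bounded agent loop: at most 4 hops; each hop either answers or calls a tool.
for step in range(4):
    reply = run_model(flatten(current_messages))            # hypothetical helpers
    if "<call>" not in reply:
        break                                               # plain answer: done
    call = reply.split("<call>")[1].split("</call>")[0]     # e.g. google_search("...")
    current_messages.append(ChatMessage(role="assistant", content=f"<call>{call}</call>"))
    current_messages.append(ChatMessage(role="tool", content=ToolEngine.execute(call), name="system"))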
@@ -453,21 +580,16 @@ async def chat_completions(request: ChatCompletionRequest):
         if req_data.get(k) is None: req_data[k] = v
     realRequest = ChatCompletionRequest(**req_data)

-    sys_msg = ChatMessage(role="System", content=TruthProtocol.STRICT_SYSTEM_PROMPT)
-    if realRequest.messages:
-        if realRequest.messages[0].role == "System":
-            realRequest.messages[0].content = f"{TruthProtocol.STRICT_SYSTEM_PROMPT}\n\n{realRequest.messages[0].content}"
-        else:
-            realRequest.messages.insert(0, sys_msg)
-
-    last_msg = realRequest.messages[-1]
-    if last_msg.role == "user" and needs_verification(last_msg.content, raw_model):
-        ctx = search_facts(last_msg.content)
-        if ctx:
-            realRequest.messages.insert(-1, ChatMessage(role="System", content=ctx))
-
-    TruthProtocol.enforce_truth_params(realRequest)
+    enable_tools = ":online" in raw_model or realRequest.tools is not None
+
+    if enable_tools:
+        sys_msg = ChatMessage(role="System", content=ToolEngine.TOOL_SYSTEM_PROMPT)
+        if realRequest.messages:
+            if realRequest.messages[0].role == "System":
+                realRequest.messages[0].content += f"\n\n{ToolEngine.TOOL_SYSTEM_PROMPT}"
+            else:
+                realRequest.messages.insert(0, sys_msg)

     realRequest.messages = prune_context(realRequest.messages, target_model, realRequest.max_tokens or 1024)

     return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")
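Agent mode is opt-in per request: suffix the model name with ":online" or send a tools array. A client call might look like this (host, port, route, and model name are assumptions based on the OpenAI-compatible handler, not confirmed by the diff):

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # assumed host/port/route
    json={
        "model": "rwkv-latest:online",            # ":online" switches on ToolEngine
        "messages": [{"role": "user", "content": "What is in the news today?"}],
        "stream": True,
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode())                      # SSE "data: {...}" chunks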
 