Ksjsjjdj committed on
Commit 3dcb100 · verified · 1 parent: 9556615

Create app.py

Files changed (1)
  1. app.py +491 -0
app.py ADDED
@@ -0,0 +1,491 @@
import os
import re
import gc
import sys
import time
import queue
import random
import asyncio
import threading
import requests
import collections
import torch
import numpy as np
from typing import List, Optional, Dict, Any, Literal, Union
from pydantic import BaseModel, Field, model_validator
from pydantic_settings import BaseSettings
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.gzip import GZipMiddleware
from huggingface_hub import hf_hub_download
from loguru import logger
from snowflake import SnowflakeGenerator

if os.environ.get("MODELSCOPE_ENVIRONMENT") == "studio":
    from modelscope import patch_hub
    patch_hub()

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["RWKV_V7_ON"] = "1"
os.environ["RWKV_JIT_ON"] = "1"

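# ---- OpenAI-compatible request/response schemas ----
# Pydantic models mirroring the chat-completions wire format (choices, deltas,
# usage counters, logprobs) that the endpoints below serialize.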
class ChatMessage(BaseModel):
    role: str = Field()
    content: str = Field()

class Logprob(BaseModel):
    token: str
    logprob: float
    top_logprobs: Optional[List[Dict[str, Any]]] = None

class LogprobsContent(BaseModel):
    content: Optional[List[Logprob]] = None
    refusal: Optional[List[Logprob]] = None

class FunctionCall(BaseModel):
    name: str
    arguments: str

class ChatCompletionMessage(BaseModel):
    role: Optional[str] = Field(None)
    content: Optional[str] = Field(None)
    reasoning_content: Optional[str] = Field(None)
    tool_calls: Optional[List[Dict[str, Any]]] = Field(None)

class PromptTokensDetails(BaseModel):
    cached_tokens: int

class CompletionTokensDetails(BaseModel):
    reasoning_tokens: int
    accepted_prediction_tokens: int
    rejected_prediction_tokens: int

class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    # Optional in the payload; pydantic v2 requires an explicit default for that.
    prompt_tokens_details: Optional[PromptTokensDetails] = None

class ChatCompletionChoice(BaseModel):
    index: int
    message: Optional[ChatCompletionMessage] = None
    delta: Optional[ChatCompletionMessage] = None
    logprobs: Optional[LogprobsContent] = None
    finish_reason: Optional[str] = Field(...)

class ChatCompletion(BaseModel):
    id: str = Field(...)
    object: Literal["chat.completion"] = "chat.completion"
    created: int = Field(...)
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage

class ChatCompletionChunk(BaseModel):
    id: str = Field(...)
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    created: int = Field(...)
    model: str
    choices: List[ChatCompletionChoice]
    # Streamed chunks are built without a usage payload, so this needs a default.
    usage: Optional[Usage] = None

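# ---- Prompt helpers ----
# Strip <think>...</think> blocks from earlier assistant turns, split a finished
# completion into reasoning vs. answer text, and flatten the chat history into
# the plain "Role: content" prompt format used by the RWKV chat models.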
def remove_nested_think_tags_stack(text):
    stack = []
    result = ""
    i = 0
    while i < len(text):
        if text[i : i + 7] == "<think>":
            stack.append("<think>")
            i += 7
        elif text[i : i + 8] == "</think>":
            if stack and stack[-1] == "<think>":
                stack.pop()
                i += 8
            else:
                result += text[i : i + 8]
                i += 8
        elif not stack:
            result += text[i]
            i += 1
        else:
            i += 1
    return result

def parse_think_response(full_response: str):
    think_start = full_response.find("<think")
    if think_start == -1:
        return None, full_response.strip()
    think_end = full_response.find("</think>")
    if think_end == -1:
        reasoning = full_response[think_start:].strip()
        content = ""
    else:
        reasoning = full_response[think_start : think_end + len("</think>")].strip()
        content = full_response[think_end + len("</think>") :].strip()
    reasoning_content = reasoning.replace("<think", "").replace("</think>", "").strip()
    return reasoning_content, content

def cleanMessages(messages: List[ChatMessage], removeThinkingContent: bool = False):
    promptStrList = []
    for message in messages:
        content = message.content.strip()
        content = re.sub(r"\n+", "\n", content)
        role_str = message.role.strip().lower().capitalize()
        if role_str == "Assistant" and removeThinkingContent:
            content = remove_nested_think_tags_stack(content)
        promptStrList.append(f"{role_str}: {content}")
    return "\n\n".join(promptStrList)

def format_bytes(size):
    power = 2**10
    n = 0
    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
    while size > power:
        size /= power
        n += 1
    return f"{size:.4f}{power_labels[n]}B"

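# ---- Optional request logging ----
# A background thread drains a small queue and POSTs each entry to the URL in
# the LOG_PORT environment variable; failures and overflow are silently dropped.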
LOGGER_QUEUE = queue.Queue(5)

def logger_worker():
    while True:
        item = LOGGER_QUEUE.get()
        try:
            requests.post(
                os.environ.get("LOG_PORT"),
                headers={"Content-Type": "application/json"},
                json=item,
            )
        except Exception:
            pass

if os.environ.get("LOG_PORT"):
    # Daemon thread so logging never blocks interpreter shutdown.
    threading.Thread(target=logger_worker, daemon=True).start()

def log(item):
    try:
        LOGGER_QUEUE.put_nowait(item)
    except queue.Full:
        # Drop the entry instead of raising on the request path.
        pass

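# ---- Configuration ----
# Sampling defaults, per-model settings, and server options. Config is a
# pydantic-settings model, so fields such as HOST, PORT and STRATEGY can be
# overridden through environment variables.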
class SamplerConfig(BaseModel):
    max_tokens: int = 4096
    temperature: float = 1.0
    top_p: float = 0.3
    presence_penalty: float = 0.5
    count_penalty: float = 0.5
    penalty_decay: float = 0.996
    stop: List[str] = ["\n\n"]
    stop_tokens: List[int] = [0]

class ModelConfig(BaseModel):
    SERVICE_NAME: str
    DOWNLOAD_MODEL_FILE_NAME: str
    DOWNLOAD_MODEL_REPO_ID: str
    DOWNLOAD_MODEL_DIR: str = "models"
    MODEL_FILE_PATH: Optional[str] = None
    DEFAULT_CHAT: bool = False
    DEFAULT_REASONING: bool = False
    REASONING: bool = False
    VOCAB: str = "rwkv_vocab_v20230424"
    DEFAULT_SAMPLER: SamplerConfig = Field(default_factory=SamplerConfig)

class Config(BaseSettings):
    HOST: str = "0.0.0.0"
    PORT: int = 7860
    STRATEGY: str = "cuda fp16"
    RWKV_CUDA_ON: bool = True
    CHUNK_LEN: int = 256
    MODELS: List[ModelConfig] = [
        ModelConfig(
            SERVICE_NAME="rwkv7-g1a4-2.9b-20251118-ctx8192",
            DOWNLOAD_MODEL_FILE_NAME="rwkv7-g1a4-2.9b-20251118-ctx8192.pth",
            DOWNLOAD_MODEL_REPO_ID="BlinkDL/rwkv7-g1",
            REASONING=True,
        ),
        ModelConfig(
            SERVICE_NAME="rwkv7-g1a3-1.5b-20251015-ctx8192",
            DOWNLOAD_MODEL_FILE_NAME="rwkv7-g1a3-1.5b-20251015-ctx8192.pth",
            DOWNLOAD_MODEL_REPO_ID="BlinkDL/rwkv7-g1",
            REASONING=True,
        ),
        ModelConfig(
            SERVICE_NAME="rwkv7-g1a-0.4b-20250905-ctx4096",
            DOWNLOAD_MODEL_FILE_NAME="rwkv7-g1a-0.4b-20250905-ctx4096.pth",
            DOWNLOAD_MODEL_REPO_ID="BlinkDL/rwkv7-g1",
            REASONING=True,
        ),
        ModelConfig(
            SERVICE_NAME="rwkv7-g1a-0.1b-20250728-ctx4096",
            DOWNLOAD_MODEL_FILE_NAME="rwkv7-g1a-0.1b-20250728-ctx4096.pth",
            DOWNLOAD_MODEL_REPO_ID="BlinkDL/rwkv7-g1",
            REASONING=True,
            DEFAULT_CHAT=True,
            DEFAULT_REASONING=True,
        ),
    ]

CONFIG = Config()

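# ---- Optional dependencies and hardware setup ----
# duckduckgo_search and faker are soft dependencies; the strategy falls back to
# CPU when CUDA is unavailable, and the rwkv CUDA kernel is toggled through
# RWKV_CUDA_ON before the rwkv package is imported.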
try:
    from duckduckgo_search import DDGS
    HAS_DDG = True
except ImportError:
    HAS_DDG = False

try:
    from faker import Faker
    fake = Faker()
    HAS_FAKER = True
except ImportError:
    HAS_FAKER = False

CompletionIdGenerator = SnowflakeGenerator(42, timestamp=1741101491595)

if "cuda" in CONFIG.STRATEGY.lower() and not torch.cuda.is_available():
    CONFIG.STRATEGY = "cpu fp16"
    CONFIG.RWKV_CUDA_ON = False

if CONFIG.RWKV_CUDA_ON and "cuda" in CONFIG.STRATEGY.lower():
    from pynvml import *
    nvmlInit()
    gpu_h = nvmlDeviceGetHandleByIndex(0)
    os.environ["RWKV_CUDA_ON"] = "1"
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True
else:
    os.environ["RWKV_CUDA_ON"] = "0"

from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS

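# ---- Model loading ----
# Every configured checkpoint is downloaded via huggingface_hub (when no local
# path is given) and loaded eagerly, keeping one RWKV instance plus tokenizer
# PIPELINE per SERVICE_NAME in memory.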
class ModelStorage:
    MODEL_CONFIG: Optional[ModelConfig] = None
    model: Optional[RWKV] = None
    pipeline: Optional[PIPELINE] = None

MODEL_STORAGE: Dict[str, ModelStorage] = {}
DEFAULT_MODEL_NAME = None
DEFAULT_REASONING_MODEL_NAME = None

for model_config in CONFIG.MODELS:
    if model_config.MODEL_FILE_PATH is None:
        model_config.MODEL_FILE_PATH = hf_hub_download(
            repo_id=model_config.DOWNLOAD_MODEL_REPO_ID,
            filename=model_config.DOWNLOAD_MODEL_FILE_NAME,
            local_dir=model_config.DOWNLOAD_MODEL_DIR,
        )
    if model_config.DEFAULT_CHAT:
        DEFAULT_MODEL_NAME = model_config.SERVICE_NAME
    if model_config.DEFAULT_REASONING:
        DEFAULT_REASONING_MODEL_NAME = model_config.SERVICE_NAME

    MODEL_STORAGE[model_config.SERVICE_NAME] = ModelStorage()
    MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
    MODEL_STORAGE[model_config.SERVICE_NAME].model = RWKV(
        model=model_config.MODEL_FILE_PATH.replace(".pth", ""),
        strategy=CONFIG.STRATEGY,
    )
    MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = PIPELINE(
        MODEL_STORAGE[model_config.SERVICE_NAME].model, model_config.VOCAB
    )
    if "cuda" in CONFIG.STRATEGY:
        torch.cuda.empty_cache()
    gc.collect()

class ChatCompletionRequest(BaseModel):
    model: str = Field(default="rwkv-latest")
    messages: Optional[List[ChatMessage]] = Field(default=None)
    prompt: Optional[str] = Field(default=None)
    max_tokens: Optional[int] = Field(default=None)
    temperature: Optional[float] = Field(default=None)
    top_p: Optional[float] = Field(default=None)
    presence_penalty: Optional[float] = Field(default=None)
    count_penalty: Optional[float] = Field(default=None)
    penalty_decay: Optional[float] = Field(default=None)
    stream: Optional[bool] = Field(default=False)
    stop: Optional[List[str]] = Field(default=["\n\n"])
    stop_tokens: Optional[List[int]] = Field(default=[0])

    @model_validator(mode="before")
    @classmethod
    def validate_mutual_exclusivity(cls, data: Any) -> Any:
        if not isinstance(data, dict):
            return data
        if "messages" in data and "prompt" in data and data["messages"] and data["prompt"]:
            raise ValueError("messages and prompt cannot coexist.")
        return data

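# ---- "Truth protocol" and web grounding ----
# A strict system prompt plus hard-coded low-temperature sampling overrides, and
# a cached DuckDuckGo text search whose results are injected as extra system
# context when a request looks like it needs fact verification.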
class TruthProtocol:
    STRICT_SYSTEM_PROMPT = """
CRITICAL PROTOCOL: OBJECTIVE_REALITY_ONLY
1. NEUTRALITY: Abandon all bias, opinions, and emotional language.
2. FACTS ONLY: State verified facts. If data is missing, say [NO DATA]. Do not guess.
3. NO HALLUCINATIONS: Do not invent dates, names, or events.
4. LABELS: Tag assertions with [VERIFIED] or [UNCERTAIN].
5. TONE: Robotic, precise, dense. No pleasantries.
""".strip()

    @staticmethod
    def enforce_truth_params(request: ChatCompletionRequest):
        request.temperature = 0.12
        request.top_p = 0.1
        request.count_penalty = 1.1
        request.presence_penalty = 0.6
        request.penalty_decay = 0.996

    @staticmethod
    def sanitise_search(query: str, results: List[dict]) -> str:
        context = "RAW DATA STREAM (IGNORE OPINIONS, EXTRACT FACTS):\n"
        for i, res in enumerate(results):
            clean_body = res["body"].replace("\n", " ").strip()
            context += f"SOURCE [{i+1}]: {clean_body} (Origin: {res['title']})\n"
        return context

search_cache = collections.OrderedDict()

def search_facts(query: str) -> str:
    if not HAS_DDG:
        return ""
    if query in search_cache:
        return search_cache[query]
    try:
        ddgs = DDGS()
        results = ddgs.text(query, max_results=4)
        if any(x in query.lower() for x in ["verdad", "fake", "cierto", "mentira"]):
            check = ddgs.text(f"{query} fact check verified", max_results=2)
            if check:
                results.extend(check)
        if not results:
            return ""
        ctx = TruthProtocol.sanitise_search(query, results)
        if len(search_cache) > 50:
            search_cache.popitem(last=False)
        search_cache[query] = ctx
        return ctx
    except Exception:
        return ""

def needs_verification(msg: str, model: str) -> bool:
    if ":online" in model:
        return True
    triggers = ["es verdad", "dato", "precio", "cuando", "quien", "noticia", "actualidad", "verify"]
    return any(t in msg.lower() for t in triggers)

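# ---- FastAPI app wiring ----
# Permissive CORS, gzip compression for large responses, and a middleware that,
# when Faker is available, replaces the client IP seen by the app with a random
# address.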
app = FastAPI(title="RWKV Zero-Bias Server")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)

@app.middleware("http")
async def privacy_middleware(request: Request, call_next):
    if HAS_FAKER:
        request.scope["client"] = (fake.ipv4(), request.client.port if request.client else 80)
    return await call_next(request)

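# ---- Core inference ----
# runPrefill feeds the prompt through the model in CHUNK_LEN-sized slices
# (yielding to the event loop between chunks); generate() then samples token by
# token, applying presence/frequency penalties with per-step decay and buffering
# partial UTF-8 sequences before emitting decoded text.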
async def runPrefill(request: ChatCompletionRequest, ctx: str, model_tokens: List[int], model_state):
    ctx = ctx.replace("\r\n", "\n")
    tokens = MODEL_STORAGE[request.model].pipeline.encode(ctx)
    model_tokens.extend([int(x) for x in tokens])
    while len(tokens) > 0:
        out, model_state = MODEL_STORAGE[request.model].model.forward(tokens[:CONFIG.CHUNK_LEN], model_state)
        tokens = tokens[CONFIG.CHUNK_LEN:]
        await asyncio.sleep(0)
    return out, model_tokens, model_state

def generate(request: ChatCompletionRequest, out, model_tokens: List[int], model_state, max_tokens=2048):
    args = PIPELINE_ARGS(
        temperature=request.temperature,
        top_p=request.top_p,
        alpha_frequency=request.count_penalty,
        alpha_presence=request.presence_penalty,
        token_ban=[],
        token_stop=[0],
    )
    occurrence = {}
    out_tokens = []
    out_last = 0
    cache_word_list = []
    for i in range(max_tokens):
        for n in occurrence:
            out[n] -= args.alpha_presence + occurrence[n] * args.alpha_frequency
        token = MODEL_STORAGE[request.model].pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token == 0:
            yield {"content": "".join(cache_word_list), "finish_reason": "stop", "state": model_state}
            del out
            gc.collect()
            return
        out, model_state = MODEL_STORAGE[request.model].model.forward([token], model_state)
        model_tokens.append(token)
        out_tokens.append(token)
        for xxx in occurrence:
            occurrence[xxx] *= request.penalty_decay
        occurrence[token] = 1 + (occurrence.get(token, 0))
        tmp = MODEL_STORAGE[request.model].pipeline.decode(out_tokens[out_last:])
        if "\ufffd" in tmp:
            continue
        cache_word_list.append(tmp)
        out_last = i + 1
        if len(cache_word_list) > 1:
            yield {"content": cache_word_list.pop(0), "finish_reason": None}
    yield {"content": "".join(cache_word_list), "finish_reason": "length"}

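# ---- Streaming handler ----
# Builds the prompt (pre-seeding "<think" for reasoning mode), prefills it, and
# relays generate() output as OpenAI-style server-sent events terminated by
# "data: [DONE]".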
async def chatResponseStream(request: ChatCompletionRequest, model_state: Any, completionId: str, enableReasoning: bool):
    clean_msg = cleanMessages(request.messages, enableReasoning)
    prompt = f"{clean_msg}\n\nAssistant:{' <think' if enableReasoning else ''}"
    out, model_tokens, model_state = await runPrefill(request, prompt, [0], model_state)
    yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(role='Assistant', content=''), finish_reason=None)]).model_dump_json()}\n\n"
    for chunk in generate(request, out, model_tokens, model_state, max_tokens=request.max_tokens or 4096):
        content = chunk["content"]
        if content:
            yield f"data: {ChatCompletionChunk(id=completionId, created=int(time.time()), model=request.model, choices=[ChatCompletionChoice(index=0, delta=ChatCompletionMessage(content=content), finish_reason=None)]).model_dump_json()}\n\n"
        if chunk.get("finish_reason"):
            break
        await asyncio.sleep(0)
    yield "data: [DONE]\n\n"

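# ---- Chat endpoint ----
# Resolves model aliases ("rwkv-latest", ":thinking", ":online"), fills unset
# sampling fields from the model's DEFAULT_SAMPLER, prepends the strict system
# prompt, optionally injects search context before the last user message, and
# always streams the reply.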
@app.post("/api/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    completionId = str(next(CompletionIdGenerator))
    raw_model = request.model
    model_key = request.model.split(":")[0]
    is_reasoning = ":thinking" in request.model
    target_model = model_key
    if "rwkv-latest" in model_key:
        if is_reasoning and DEFAULT_REASONING_MODEL_NAME:
            target_model = DEFAULT_REASONING_MODEL_NAME
        elif DEFAULT_MODEL_NAME:
            target_model = DEFAULT_MODEL_NAME
    if target_model not in MODEL_STORAGE:
        raise HTTPException(404, f"Model {target_model} not loaded.")
    request.model = target_model
    default_sampler = MODEL_STORAGE[target_model].MODEL_CONFIG.DEFAULT_SAMPLER
    req_data = request.model_dump()
    for k, v in default_sampler.model_dump().items():
        if req_data.get(k) is None:
            req_data[k] = v
    realRequest = ChatCompletionRequest(**req_data)
    sys_msg = ChatMessage(role="System", content=TruthProtocol.STRICT_SYSTEM_PROMPT)
    if realRequest.messages:
        # Roles from OpenAI-style clients are usually lowercase, so compare case-insensitively.
        if realRequest.messages[0].role.lower() == "system":
            realRequest.messages[0].content = f"{TruthProtocol.STRICT_SYSTEM_PROMPT}\n\n{realRequest.messages[0].content}"
        else:
            realRequest.messages.insert(0, sys_msg)
        last_msg = realRequest.messages[-1]
        if last_msg.role.lower() == "user" and needs_verification(last_msg.content, raw_model):
            ctx = search_facts(last_msg.content)
            if ctx:
                realRequest.messages.insert(-1, ChatMessage(role="System", content=ctx))
    TruthProtocol.enforce_truth_params(realRequest)
    return StreamingResponse(chatResponseStream(realRequest, None, completionId, is_reasoning), media_type="text/event-stream")

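# ---- Model listing ----
# Each loaded model is advertised plainly and with an ":online" (search-augmented)
# variant, plus "rwkv-latest" aliases when default chat/reasoning models are set.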
@app.get("/api/v1/models")
@app.get("/models")
async def list_models():
    models_list = []
    ts = int(time.time())
    for model_id in MODEL_STORAGE.keys():
        models_list.append({"id": model_id, "object": "model", "created": ts, "owned_by": "rwkv-server"})
        models_list.append({"id": f"{model_id}:online", "object": "model", "created": ts, "owned_by": "rwkv-server"})
    if DEFAULT_MODEL_NAME:
        models_list.append({"id": "rwkv-latest", "object": "model", "created": ts, "owned_by": "rwkv-system"})
        models_list.append({"id": "rwkv-latest:online", "object": "model", "created": ts, "owned_by": "rwkv-system"})
    if DEFAULT_REASONING_MODEL_NAME:
        models_list.append({"id": "rwkv-latest:thinking", "object": "model", "created": ts, "owned_by": "rwkv-system"})
        models_list.append({"id": "rwkv-latest:thinking:online", "object": "model", "created": ts, "owned_by": "rwkv-system"})
    return {"object": "list", "data": models_list}

app.mount("/", StaticFiles(directory="dist-frontend", html=True), name="static")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=CONFIG.HOST, port=CONFIG.PORT)
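
# ------------------------------------------------------------------------------
# Illustrative client sketch (not part of the committed application code): one
# way to call this server once it is running, assuming the default host/port
# from Config (0.0.0.0:7860) and the "rwkv-latest" alias defined above.
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:7860/api/v1/chat/completions",
#       json={
#           "model": "rwkv-latest:thinking",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "stream": True,
#       },
#       stream=True,
#   )
#   for line in resp.iter_lines():
#       if line:
#           # Each event is "data: {...ChatCompletionChunk...}"; the stream
#           # ends with "data: [DONE]".
#           print(line.decode())
# ------------------------------------------------------------------------------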