skullcandy42 committed
Commit 53128f5 · verified · 1 Parent(s): 1f4e090

Update app.py

Files changed (1)
app.py +33 -84
app.py CHANGED
@@ -1,40 +1,47 @@
 import gradio as gr
-import spaces
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download

 CSS = """
 #qwen-md .katex-display { display: inline; }
-#qwen-md .katex-display>.katex { display: inline; }
-#qwen-md .katex-display>.katex>.katex-html { display: inline; }
 """

+# Download the GGUF model file (skipped if it has already been downloaded locally)
 hf_hub_download(
     repo_id="bartowski/Qwen2.5-Math-7B-Instruct-GGUF",
     filename="Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
     local_dir="./models",
 )

-# Model-loading function, explicitly wrapped in the GPU environment
-@spaces.GPU
-def load_llm():
-    llm = Llama(
-        model_path="models/Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
-        flash_attn=True,
-        n_ctx=4096,
-        n_batch=512,
-        chat_format="chatml",
-        n_gpu_layers=-1,
-    )
-    return llm
+# Load the model in CPU-only mode; flash attention and the VRAM-tuning parameters are removed
+llm = Llama(
+    model_path="models/Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
+    n_ctx=2048,   # shorter context for more speed
+    n_batch=256,  # smaller batches to reduce CPU load
+    n_threads=8,  # explicit thread count (adjust to your CPU core count)
+    chat_format="chatml",
+    verbose=False
+)

-# Load the model globally (load once only, for speed)
-llm = load_llm()
+# Gradio component definitions (simplified configuration)
+input_text = gr.Textbox(label="Ask math questions here")
+output_md = gr.Markdown(label="Answer", elem_id="qwen-md", show_copy_button=True)
+target_lang = gr.Dropdown(choices=["Chinese", "English"], value="Chinese", label="Output Language")
+new_tokens = gr.Slider(minimum=1, maximum=512, value=256, step=1, label="Max new tokens")
+temperature = gr.Slider(minimum=0, maximum=1.0, value=0.2, step=0.05, label="Temperature")
+top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, step=0.05, label="Top P")
+submit_btn = gr.Button("Ask")
+banner = gr.Markdown(value="### 📖 **Qwen2.5-Math 7B GGUF** - Optimized for CPU")

-# The inference function also uses the GPU environment
-@spaces.GPU
-def generate_response(messages, max_tokens, temperature, top_p):
-    response = llm.create_chat_completion(
+# Response function (streamlined and faster)
+def respond(input_text, lang, max_tokens, temperature, top_p):
+    sys_msg = "你是一个乐于助人的数学助手,请使用中文回答。" if lang == "Chinese" else "You are a helpful math assistant. Please answer in English."
+    messages = [
+        {"role": "system", "content": sys_msg},
+        {"role": "user", "content": input_text},
+    ]
+
+    stream_response = llm.create_chat_completion(
         messages=messages,
         stream=True,
         max_tokens=max_tokens,
@@ -42,69 +49,11 @@ def generate_response(messages, max_tokens, temperature, top_p):
         top_p=top_p,
     )

-    message_repl = ""
-    for chunk in response:
-        delta = chunk['choices'][0]["delta"]
-        if "content" in delta:
-            message_repl += delta["content"]
-            yield message_repl
-
-# Gradio component setup
-output_md = gr.Markdown(
-    label="Answer",
-    value="Answer will be presented here",
-    latex_delimiters=[
-        {"left": "\\(", "right": "\\)", "display": True},
-        {"left": "\\[", "right": "\\]", "display": True},
-    ],
-    elem_id="qwen-md",
-    show_copy_button=True,
-    container=True,
-    render=False,
-)
-
-target_lang = gr.Dropdown(
-    choices=["Chinese", "English"],
-    value="Chinese",
-    label="Output Language",
-    interactive=True,
-    render=False,
-)
-
-new_tokens = gr.Slider(
-    minimum=1, maximum=1024, value=128, step=1, label="Max new tokens", render=False
-)
-
-temperature = gr.Slider(
-    minimum=0, maximum=1.0, value=0.2, step=0.1, label="Temperature", render=False
-)
-
-top_p = gr.Slider(
-    minimum=0.0, maximum=1.0, value=0.8, step=0.05, label="Top P", render=False
-)
-
-input_text = gr.Textbox(label="Ask math questions here", render=False)
-submit_btn = gr.Button(value="Ask", render=False)
-
-banner = gr.Markdown(value="""
-# 📖 Qwen2.5-Math GGUF
-Fast mathematical reasoning with ZeroGPU optimized.
-""")
-
-# Gradio response function; calls GPU inference
-def respond(input_text, lang, max_tokens, temperature, top_p):
-    sys_msg = (
-        "你是一个乐于助人的数学助手. 你使用中文回答问题"
-        if lang == "Chinese" else
-        "You are a helpful math assistant. You should always provide your answer in English."
-    )
-    messages = [
-        {"role": "system", "content": sys_msg},
-        {"role": "user", "content": input_text},
-    ]
-
-    # Yield directly from the GPU generator
-    yield from generate_response(messages, max_tokens, temperature, top_p)
+    result = ""
+    for chunk in stream_response:
+        content = chunk['choices'][0]["delta"].get("content", "")
+        result += content
+        yield result.strip()

 with gr.Blocks(css=CSS, theme="NoCrypt/miku") as demo:
     submit_btn.click(
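
Note: the diff above is cut off at `submit_btn.click(`, so the commit's actual UI wiring is not visible here. Below is a minimal sketch of how the module-level components could be attached and wired inside the Blocks context; the `.render()` calls, the layout, the `inputs`/`outputs` lists, and `demo.launch()` are assumptions for illustration, not lines from this commit.

# Hypothetical wiring sketch (not part of the commit)
with gr.Blocks(css=CSS, theme="NoCrypt/miku") as demo:
    banner.render()        # components created at module level are
    input_text.render()    # attached to the page via .render()
    with gr.Row():
        target_lang.render()
        new_tokens.render()
        temperature.render()
        top_p.render()
    submit_btn.render()
    output_md.render()

    # respond() is a generator, so Gradio streams each partial result
    # it yields into the Markdown output.
    submit_btn.click(
        fn=respond,
        inputs=[input_text, target_lang, new_tokens, temperature, top_p],
        outputs=output_md,
    )

demo.launch()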
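
A further note on the new `n_threads=8`: the inline comment already says to adjust it to the machine's core count. A minimal sketch of deriving it at runtime instead of hard-coding it, using only the standard library (a suggestion, not part of the commit):

import os
from llama_cpp import Llama

# Match the llama.cpp thread count to the host instead of hard-coding 8.
# os.cpu_count() can return None, so fall back to a conservative default.
n_threads = os.cpu_count() or 4

llm = Llama(
    model_path="models/Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
    n_ctx=2048,
    n_batch=256,
    n_threads=n_threads,
    chat_format="chatml",
    verbose=False,
)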