TobDeBer committed on
Commit 5facf65 · verified · 1 Parent(s): ea7eeb6

Update app.py

Files changed (1)
  1. app.py +124 -96
app.py CHANGED
@@ -1,63 +1,78 @@
  from collections.abc import Iterator
  from datetime import datetime
  from pathlib import Path
- from typing import Iterator, List, Dict
- from huggingface_hub import hf_hub_download
  from themes.research_monochrome import ResearchMonochrome
- import spaces
- import gradio as gr
- from llama_cpp import Llama  # <-- New: import the Llama class
- import os

- # --- Configuration ---

  today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002
- SYS_PROMPT = f"""Today's Date: {today_date}.You are Granite, developed by IBM. You are a helpful AI assistant"""
- TITLE = "IBM Granite 4 Tiny Preview served via llama-cpp-python"
- DESCRIPTION = """<p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.<span class="gr_docs_link"><a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a></span></p>"""

  MAX_NEW_TOKENS = 1024
  TEMPERATURE = 0.7
  TOP_P = 0.85
  TOP_K = 50
  REPETITION_PENALTY = 1.05
- CONTEXT_WINDOW = 2048  # set the context window size

- # --- Model setup ---
-
- # Download the model
- gguf_name = "granite-4.0-tiny-preview-Q4_K_M.gguf"
- # The path where the model is stored
- model_path = hf_hub_download(
-     repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
-     filename=gguf_name,
-     local_dir="."
  )
- print(f"Model downloaded to: {model_path}")

- # Load the Llama model
- # Note: the number of layers offloaded to the GPU (n_gpu_layers)
- # should be set to a high value such as 999 to force full GPU offload.
- # 'n_ctx' sets the context size.
- # 'chat_format' is needed so the conversation is formatted correctly.
- try:
-     llama_model = Llama(
-         model_path=model_path,
-         n_ctx=CONTEXT_WINDOW,
-         n_gpu_layers=999,  # offload all layers to the GPU
-         chat_format="chatml",  # Granite 4 Tiny uses a format similar to the ChatML standard
-         verbose=False
-     )
-     print("Llama model initialized successfully.")
- except Exception as e:
-     print(f"Error initializing Llama model: {e}")
-     llama_model = None  # set to None if an error occurs

- # --- Gradio functions ---

  custom_theme = ResearchMonochrome()

- @spaces.GPU(duration=30)
  def generate(
      message: str,
      chat_history: List[Dict],
@@ -67,70 +82,85 @@ def generate(
      top_k: float = TOP_K,
      max_new_tokens: int = MAX_NEW_TOKENS,
  ) -> Iterator[str]:
-     """Generation function for the chat demo using llama-cpp-python."""
-
-     if llama_model is None:
-         yield "Error: The model failed to initialize."
-         return
-
-     # 1. Prepare the messages for llama-cpp-python
-     # llama-cpp-python expects an OpenAI chat format
-     messages = []
-     messages.append({"role": "system", "content": SYS_PROMPT})
-
-     # Add the chat history
-     for item in chat_history:
-         # Gradio stores it as a list of lists: [["user_msg", "assistant_msg"], ...]
-         # However, the structure of `chat_history` is (typically) a list of dictionaries
-         # [..., {"role": "user", "content": "..."}] per the Gradio ChatInterface documentation
-         if item["role"] == "user":
-             messages.append({"role": "user", "content": item["content"]})
-         elif item["role"] == "assistant":
-             messages.append({"role": "assistant", "content": item["content"]})
-
-     # Add the current user message
-     messages.append({"role": "user", "content": message})
-
-     # 2. Start generation
-     full_response = ""
      try:
-         # Use the OpenAI-compatible streaming API of llama-cpp-python
-         stream = llama_model.create_chat_completion_openai_v1(
-             messages=messages,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             max_tokens=max_new_tokens,
-             repeat_penalty=repetition_penalty,
-             stop=["<|file_separator|>"],  # stop token as in the original code
-             stream=True
-         )
-
-         # 3. Stream the response
-         for chunk in stream:
-             if chunk and "choices" in chunk and len(chunk["choices"]) > 0:
-                 delta = chunk["choices"][0]["delta"]
-                 if "content" in delta:
-                     text = delta["content"]
-                     full_response += text
-                     yield full_response
-
      except Exception as e:
-         print(f"An error occurred during generation: {e}")
-         yield f"Error: {e}"


- # --- Gradio UI setup (unchanged) ---
-
  css_file_path = Path(Path(__file__).parent / "app.css")

  # advanced settings (displayed in Accordion)
  temperature_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"])
  top_p_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"])
  top_k_slider = gr.Slider(
-     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"])
  repetition_penalty_slider = gr.Slider(
      minimum=0,
      maximum=2.0,
@@ -155,14 +185,12 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=custom_theme, ti
      chat_interface = gr.ChatInterface(
          fn=generate,
          examples=[
-             ["What is 1+1?"],
              ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
              ["What is OpenShift?"],
              ["What's the importance of low latency inference?"],
              ["Help me boost productivity habits."],
          ],
          example_labels=[
-             "What is 1+1?",
              "Explain quantum computing",
              "What is OpenShift?",
              "Importance of low latency inference",
 
  from collections.abc import Iterator
  from datetime import datetime
  from pathlib import Path
+ from threading import Thread
+ from huggingface_hub import hf_hub_download, login
  from themes.research_monochrome import ResearchMonochrome
+ from typing import Iterator, List, Dict

+ import os
+ import requests
+ import json
+ import subprocess
+ import gradio as gr

  today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002

+ SYS_PROMPT = f"""Today's Date: {today_date}.
+ You are Granite, developed by IBM. You are a helpful AI assistant"""
+ TITLE = "IBM Granite 4 Micro served from local GGUF server"
+ DESCRIPTION = """
+ <p>Granite 4 Micro is an open-source LLM supporting a 128k context window. This demo uses only 2K context.
+ <span class="gr_docs_link">
+ <a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a>
+ </span>
+ </p>
+ """
+ LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
  MAX_NEW_TOKENS = 1024
  TEMPERATURE = 0.7
  TOP_P = 0.85
  TOP_K = 50
  REPETITION_PENALTY = 1.05

+ # determine platform: CUDA or CPU
+ try:
+     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+     platform = "CUDA"
+ except subprocess.CalledProcessError:
+     platform = "CPU"
+ except FileNotFoundError:
+     platform = "CPU"
+
+ print(f"Detected platform {platform}")
+
+ # login to HF with space secret and download gguf and executable
+ #hf_token = os.getenv("HF_TOKEN")  # Set this in your environment before running
+ #if hf_token:
+ #    login(token=hf_token)
+ #else:
+ #    raise ValueError("Hugging Face token not found. Please set HF_TOKEN environment variable.")
+
+ gguf_name = "granite-4.0-h-micro-UD-Q2_K_XL.gguf"
+ gguf_path = hf_hub_download(
+     repo_id="unsloth/granite-4.0-h-micro-GGUF",
+     filename=gguf_name,
+     local_dir="."
  )

+ # set exe_name depending on platform
+ exe_name = "llama-server-6343-cuda" if platform == "CUDA" else "llama-server-6343-blas"
+ exe_path = hf_hub_download(
+     repo_id="TobDeBer/Skipper",
+     filename=exe_name,
+     local_dir="."
+ )

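Note on the download step: hf_hub_download returns the local path of the fetched file, so gguf_path and exe_path could be used directly instead of relying on local_dir="." plus relative file names, and the external chmod call below could become an os.chmod. A minimal sketch of that variant, assuming the variables defined above (not part of this commit):

    # hypothetical variant: launch the server from the returned paths
    import os
    import stat
    import subprocess

    os.chmod(exe_path, os.stat(exe_path).st_mode | stat.S_IEXEC)  # mark the binary executable without shelling out to chmod
    command = [exe_path, "-m", gguf_path, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
    process = subprocess.Popen(command)  # same flags as the committed command below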
+ # start llama-server
+ subprocess.run(["chmod", "+x", exe_name])
+ command = ["./" + exe_name, "-m", gguf_name, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
+ process = subprocess.Popen(command)
+ print(f"Llama-server process started with PID {process.pid}")
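subprocess.Popen returns as soon as the process is spawned, while llama-server still needs time to load the GGUF, so the very first chat request can arrive before the model is ready. A small readiness wait, assuming the bundled llama-server build exposes the usual /health endpoint that returns HTTP 200 once loading has finished (sketch, not part of this commit):

    # poll the server until it reports ready; give up after ~120 s
    import time
    import requests

    for _ in range(120):
        try:
            if requests.get(f"{LLAMA_CPP_SERVER}/health", timeout=1).status_code == 200:
                break
        except requests.exceptions.RequestException:
            pass  # server not accepting connections yet
        time.sleep(1)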
 
  custom_theme = ResearchMonochrome()
+ print("Theme type:", type(custom_theme))


  def generate(
      message: str,
      chat_history: List[Dict],

      top_k: float = TOP_K,
      max_new_tokens: int = MAX_NEW_TOKENS,
  ) -> Iterator[str]:
+     """Generate function for chat demo using Llama.cpp server."""
+
+     # Build messages
+     conversation = []
+     conversation.append({"role": "system", "content": SYS_PROMPT})
+     conversation += chat_history
+     conversation.append({"role": "user", "content": message})
+
+     # Prepare the prompt for the Llama.cpp server
+     prompt = ""
+     for item in conversation:
+         if item["role"] == "system":
+             prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
+         elif item["role"] == "user":
+             prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
+         elif item["role"] == "assistant":
+             prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
+     prompt += "<|model|>\n"  # Add the beginning token for the assistant
+
+     # Construct the request payload
+     payload = {
+         "prompt": prompt,
+         "stream": True,  # Enable streaming
+         "max_tokens": max_new_tokens,
+         "temperature": temperature,
+         "repeat_penalty": repetition_penalty,
+         "top_p": top_p,
+         "top_k": top_k,
+         "stop": ["<|file_separator|>"],  # stops after it sees this
+     }
+
      try:
+         # Make the request to the Llama.cpp server
+         with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
+             response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+             # Stream the response from the server
+             outputs = []
+             for line in response.iter_lines():
+                 if line:
+                     # Decode the line
+                     decoded_line = line.decode('utf-8')
+                     # Remove 'data: ' prefix if present
+                     if decoded_line.startswith("data: "):
+                         decoded_line = decoded_line[6:]
+
+                     # Handle potential JSON decoding errors
+                     try:
+                         json_data = json.loads(decoded_line)
+                         text = json_data.get("content", "")  # Extract the content field
+                         if text:
+                             outputs.append(text)
+                             yield "".join(outputs)
+
+                     except json.JSONDecodeError:
+                         print(f"JSONDecodeError: {decoded_line}")
+                         # Handle the error, potentially skipping the line or logging it.
+
+     except requests.exceptions.RequestException as e:
+         print(f"Request failed: {e}")
+         yield f"Error: {e}"  # Yield an error message to the user
      except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+         yield f"Error: {e}"  # Yield error message
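Each streamed line from /completion is a server-sent event of the form data: {"content": "...", "stop": false}, which is what the data:-prefix stripping and json.loads above rely on. The <|system|>/<|user|>/<|model|> tags are a hand-rolled prompt template; recent llama-server builds also expose an OpenAI-compatible /v1/chat/completions endpoint that applies the chat template embedded in the GGUF itself. A sketch of that alternative, non-streaming for brevity and assuming the endpoint is available in the bundled binary (not part of this commit):

    # hypothetical alternative: let the server apply the model's own chat template
    import requests

    resp = requests.post(
        f"{LLAMA_CPP_SERVER}/v1/chat/completions",
        json={
            "messages": [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": "What is OpenShift?"},
            ],
            "max_tokens": MAX_NEW_TOKENS,
            "temperature": TEMPERATURE,
        },
        timeout=60,
    )
    print(resp.json()["choices"][0]["message"]["content"])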
 

  css_file_path = Path(Path(__file__).parent / "app.css")

  # advanced settings (displayed in Accordion)
  temperature_slider = gr.Slider(
+     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
+ )
  top_p_slider = gr.Slider(
+     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
+ )
  top_k_slider = gr.Slider(
+     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
+ )
  repetition_penalty_slider = gr.Slider(
      minimum=0,
      maximum=2.0,
 
      chat_interface = gr.ChatInterface(
          fn=generate,
          examples=[
              ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
              ["What is OpenShift?"],
              ["What's the importance of low latency inference?"],
              ["Help me boost productivity habits."],
          ],
          example_labels=[
              "Explain quantum computing",
              "What is OpenShift?",
              "Importance of low latency inference",