TobDeBer committed on
Commit 5facf65 · verified · 1 Parent(s): ea7eeb6

Update app.py

Files changed (1)
  1. app.py +124 -96
app.py CHANGED
@@ -1,63 +1,78 @@
  from collections.abc import Iterator
  from datetime import datetime
  from pathlib import Path
- from typing import Iterator, List, Dict
- from huggingface_hub import hf_hub_download
  from themes.research_monochrome import ResearchMonochrome
- import spaces
- import gradio as gr
- from llama_cpp import Llama  # <-- New: import the Llama class
- import os

- # --- Configuration ---

  today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002
- SYS_PROMPT = f"""Today's Date: {today_date}.You are Granite, developed by IBM. You are a helpful AI assistant"""
- TITLE = "IBM Granite 4 Tiny Preview served via llama-cpp-python"
- DESCRIPTION = """<p>Granite 4 Tiny is an open-source LLM supporting a 128k context window. This demo uses only 2K context.<span class="gr_docs_link"><a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a></span></p>"""

  MAX_NEW_TOKENS = 1024
  TEMPERATURE = 0.7
  TOP_P = 0.85
  TOP_K = 50
  REPETITION_PENALTY = 1.05
- CONTEXT_WINDOW = 2048  # set the context window size

- # --- Model setup ---
-
- # Download the model
- gguf_name = "granite-4.0-tiny-preview-Q4_K_M.gguf"
- # The path where the model is stored
- model_path = hf_hub_download(
-     repo_id="ibm-granite/granite-4.0-tiny-preview-GGUF",
-     filename=gguf_name,
-     local_dir="."
  )
- print(f"Model downloaded to: {model_path}")

- # Load the Llama model
- # Note: the number of layers offloaded to the GPU (n_gpu_layers)
- # should be set to a high value such as 999 to force full GPU offload.
- # 'n_ctx' sets the context size.
- # 'chat_format' is needed so the conversation is formatted correctly.
- try:
-     llama_model = Llama(
-         model_path=model_path,
-         n_ctx=CONTEXT_WINDOW,
-         n_gpu_layers=999,  # offload all layers to the GPU
-         chat_format="chatml",  # Granite 4 Tiny uses a format similar to the ChatML standard
-         verbose=False
-     )
-     print("Llama model initialized successfully.")
- except Exception as e:
-     print(f"Error initializing Llama model: {e}")
-     llama_model = None  # set to None if an error occurs

- # --- Gradio functions ---

  custom_theme = ResearchMonochrome()

- @spaces.GPU(duration=30)
  def generate(
      message: str,
      chat_history: List[Dict],
@@ -67,70 +82,85 @@ def generate(
      top_k: float = TOP_K,
      max_new_tokens: int = MAX_NEW_TOKENS,
  ) -> Iterator[str]:
-     """Generation function for the chat demo using llama-cpp-python."""
-
-     if llama_model is None:
-         yield "Error: The model failed to initialize."
-         return
-
-     # 1. Prepare the messages for llama-cpp-python
-     # llama-cpp-python expects an OpenAI chat format
-     messages = []
-     messages.append({"role": "system", "content": SYS_PROMPT})
-
-     # Add the chat history
-     for item in chat_history:
-         # Gradio stores it as a list of lists: [["user_msg", "assistant_msg"], ...]
-         # However, the structure of `chat_history` is (typically) a list of dictionaries
-         # [..., {"role": "user", "content": "..."}] per the Gradio ChatInterface documentation
-         if item["role"] == "user":
-             messages.append({"role": "user", "content": item["content"]})
-         elif item["role"] == "assistant":
-             messages.append({"role": "assistant", "content": item["content"]})
-
-     # Add the current user message
-     messages.append({"role": "user", "content": message})
-
-     # 2. Start generation
-     full_response = ""
      try:
-         # Use the OpenAI-compatible streaming API of llama-cpp-python
-         stream = llama_model.create_chat_completion_openai_v1(
-             messages=messages,
-             temperature=temperature,
-             top_p=top_p,
-             top_k=top_k,
-             max_tokens=max_new_tokens,
-             repeat_penalty=repetition_penalty,
-             stop=["<|file_separator|>"],  # stop token as in the original code
-             stream=True
-         )
-
-         # 3. Stream the response
-         for chunk in stream:
-             if chunk and "choices" in chunk and len(chunk["choices"]) > 0:
-                 delta = chunk["choices"][0]["delta"]
-                 if "content" in delta:
-                     text = delta["content"]
-                     full_response += text
-                     yield full_response
-
      except Exception as e:
-         print(f"An error occurred during generation: {e}")
-         yield f"Error: {e}"


- # --- Gradio UI setup (unchanged) ---
-
  css_file_path = Path(Path(__file__).parent / "app.css")

  # advanced settings (displayed in Accordion)
  temperature_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"])
  top_p_slider = gr.Slider(
-     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"])
  top_k_slider = gr.Slider(
-     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"])
  repetition_penalty_slider = gr.Slider(
      minimum=0,
      maximum=2.0,
@@ -155,14 +185,12 @@ with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=custom_theme, ti
      chat_interface = gr.ChatInterface(
          fn=generate,
          examples=[
-             ["What is 1+1?"],
              ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
              ["What is OpenShift?"],
              ["What's the importance of low latency inference?"],
              ["Help me boost productivity habits."],
          ],
          example_labels=[
-             "What is 1+1?",
              "Explain quantum computing",
              "What is OpenShift?",
              "Importance of low latency inference",
 
  from collections.abc import Iterator
  from datetime import datetime
  from pathlib import Path
+ from threading import Thread
+ from huggingface_hub import hf_hub_download, login
  from themes.research_monochrome import ResearchMonochrome
+ from typing import Iterator, List, Dict

+ import os
+ import requests
+ import json
+ import subprocess
+ import gradio as gr

  today_date = datetime.today().strftime("%B %-d, %Y") # noqa: DTZ002

+ SYS_PROMPT = f"""Today's Date: {today_date}.
+ You are Granite, developed by IBM. You are a helpful AI assistant"""
+ TITLE = "IBM Granite 4 Micro served from local GGUF server"
+ DESCRIPTION = """
+ <p>Granite 4 Micro is an open-source LLM supporting a 128k context window. This demo uses only 2K context.
+ <span class="gr_docs_link">
+ <a href="https://www.ibm.com/granite/docs/">View Documentation <i class="fa fa-external-link"></i></a>
+ </span>
+ </p>
+ """
+ LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
  MAX_NEW_TOKENS = 1024
  TEMPERATURE = 0.7
  TOP_P = 0.85
  TOP_K = 50
  REPETITION_PENALTY = 1.05

+ # determine platform: CUDA or CPU
+ try:
+     subprocess.run(["nvidia-smi"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+     platform = "CUDA"
+ except subprocess.CalledProcessError:
+     platform = "CPU"
+ except FileNotFoundError:
+     platform = "CPU"
+
+ print(f"Detected platform {platform}")
+
+ # login to HF with space secret and download gguf and executable
+ #hf_token = os.getenv("HF_TOKEN")  # Set this in your environment before running
+ #if hf_token:
+ #    login(token=hf_token)
+ #else:
+ #    raise ValueError("Hugging Face token not found. Please set HF_TOKEN environment variable.")
+
+ gguf_name = "granite-4.0-h-micro-UD-Q2_K_XL.gguf"
+ gguf_path = hf_hub_download(
+     repo_id="unsloth/granite-4.0-h-micro-GGUF",
+     filename=gguf_name,
+     local_dir="."
  )

+ # set exe_name depending on platform
+ exe_name = "llama-server-6343-cuda" if platform == "CUDA" else "llama-server-6343-blas"
+ exe_path = hf_hub_download(
+     repo_id="TobDeBer/Skipper",
+     filename=exe_name,
+     local_dir="."
+ )

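Note on the download step: hf_hub_download returns the local path of the fetched file, so gguf_path and exe_path could be used directly instead of relying on local_dir="." plus relative file names, and the external chmod call below could become an os.chmod. A minimal sketch of that variant, assuming the variables defined above (not part of this commit):

    # hypothetical variant: launch the server from the returned paths
    import os
    import stat
    import subprocess

    os.chmod(exe_path, os.stat(exe_path).st_mode | stat.S_IEXEC)  # mark the binary executable without shelling out to chmod
    command = [exe_path, "-m", gguf_path, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
    process = subprocess.Popen(command)  # same flags as the committed command below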
+ # start llama-server
+ subprocess.run(["chmod", "+x", exe_name])
+ command = ["./" + exe_name, "-m", gguf_name, "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"]
+ process = subprocess.Popen(command)
+ print(f"Llama-server process started with PID {process.pid}")
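subprocess.Popen returns as soon as the process is spawned, while llama-server still needs time to load the GGUF, so the very first chat request can arrive before the model is ready. A small readiness wait, assuming the bundled llama-server build exposes the usual /health endpoint that returns HTTP 200 once loading has finished (sketch, not part of this commit):

    # poll the server until it reports ready; give up after ~120 s
    import time
    import requests

    for _ in range(120):
        try:
            if requests.get(f"{LLAMA_CPP_SERVER}/health", timeout=1).status_code == 200:
                break
        except requests.exceptions.RequestException:
            pass  # server not accepting connections yet
        time.sleep(1)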
 
  custom_theme = ResearchMonochrome()
+ print("Theme type:", type(custom_theme))


  def generate(
      message: str,
      chat_history: List[Dict],

      top_k: float = TOP_K,
      max_new_tokens: int = MAX_NEW_TOKENS,
  ) -> Iterator[str]:
+     """Generate function for chat demo using Llama.cpp server."""
+
+     # Build messages
+     conversation = []
+     conversation.append({"role": "system", "content": SYS_PROMPT})
+     conversation += chat_history
+     conversation.append({"role": "user", "content": message})
+
+     # Prepare the prompt for the Llama.cpp server
+     prompt = ""
+     for item in conversation:
+         if item["role"] == "system":
+             prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
+         elif item["role"] == "user":
+             prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
+         elif item["role"] == "assistant":
+             prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
+     prompt += "<|model|>\n"  # Add the beginning token for the assistant
+
+     # Construct the request payload
+     payload = {
+         "prompt": prompt,
+         "stream": True,  # Enable streaming
+         "max_tokens": max_new_tokens,
+         "temperature": temperature,
+         "repeat_penalty": repetition_penalty,
+         "top_p": top_p,
+         "top_k": top_k,
+         "stop": ["<|file_separator|>"],  # stops after it sees this
+     }
+
      try:
+         # Make the request to the Llama.cpp server
+         with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
+             response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
+
+             # Stream the response from the server
+             outputs = []
+             for line in response.iter_lines():
+                 if line:
+                     # Decode the line
+                     decoded_line = line.decode('utf-8')
+                     # Remove 'data: ' prefix if present
+                     if decoded_line.startswith("data: "):
+                         decoded_line = decoded_line[6:]
+
+                     # Handle potential JSON decoding errors
+                     try:
+                         json_data = json.loads(decoded_line)
+                         text = json_data.get("content", "")  # Extract the content field
+                         if text:
+                             outputs.append(text)
+                             yield "".join(outputs)
+
+                     except json.JSONDecodeError:
+                         print(f"JSONDecodeError: {decoded_line}")
+                         # Handle the error, potentially skipping the line or logging it.
+
+     except requests.exceptions.RequestException as e:
+         print(f"Request failed: {e}")
+         yield f"Error: {e}"  # Yield an error message to the user
      except Exception as e:
+         print(f"An unexpected error occurred: {e}")
+         yield f"Error: {e}"  # Yield error message
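Each streamed line from /completion is a server-sent event of the form data: {"content": "...", "stop": false}, which is what the data:-prefix stripping and json.loads above rely on. The <|system|>/<|user|>/<|model|> tags are a hand-rolled prompt template; recent llama-server builds also expose an OpenAI-compatible /v1/chat/completions endpoint that applies the chat template embedded in the GGUF itself. A sketch of that alternative, non-streaming for brevity and assuming the endpoint is available in the bundled binary (not part of this commit):

    # hypothetical alternative: let the server apply the model's own chat template
    import requests

    resp = requests.post(
        f"{LLAMA_CPP_SERVER}/v1/chat/completions",
        json={
            "messages": [
                {"role": "system", "content": SYS_PROMPT},
                {"role": "user", "content": "What is OpenShift?"},
            ],
            "max_tokens": MAX_NEW_TOKENS,
            "temperature": TEMPERATURE,
        },
        timeout=60,
    )
    print(resp.json()["choices"][0]["message"]["content"])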
 

  css_file_path = Path(Path(__file__).parent / "app.css")

  # advanced settings (displayed in Accordion)
  temperature_slider = gr.Slider(
+     minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
+ )
  top_p_slider = gr.Slider(
+     minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
+ )
  top_k_slider = gr.Slider(
+     minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
+ )
  repetition_penalty_slider = gr.Slider(
      minimum=0,
      maximum=2.0,
 
      chat_interface = gr.ChatInterface(
          fn=generate,
          examples=[
              ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
              ["What is OpenShift?"],
              ["What's the importance of low latency inference?"],
              ["Help me boost productivity habits."],
          ],
          example_labels=[
              "Explain quantum computing",
              "What is OpenShift?",
              "Importance of low latency inference",