raphael-gl (HF Staff) committed
Commit 6ab4719 · verified · 1 Parent(s): eb6b4ca

Update app.py


1. Preload the model at import time (ZeroGPU tensor packing, so loading the weights does not consume the user's GPU quota)
2. Stream the model output as it is generated
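
In outline, the two changes reduce to the sketch below. This is a condensed, hypothetical rewrite rather than the literal app.py: the default model id and the TextIteratorStreamer pattern come from the diff that follows, while stream_reply is an illustrative helper name.

import threading
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

MODEL_ID = "tlhv/osb-minier"  # default model id used in this commit

# 1) Load once at import time, outside any @spaces.GPU function, so (per the
#    commit message) ZeroGPU can pack the weights up front instead of charging
#    the load to a user request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

def stream_reply(messages, max_new_tokens=512):
    # 2) Stream tokens as they are produced instead of returning one final string.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    ).to(model.device)
    threading.Thread(
        target=model.generate,
        kwargs=dict(input_ids=input_ids, max_new_tokens=max_new_tokens, streamer=streamer),
    ).start()
    text = ""
    for chunk in streamer:  # yields decoded text pieces as generation progresses
        text += chunk
        yield text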

Files changed (1)
  1. app.py +95 -78
app.py CHANGED
@@ -1,19 +1,30 @@
+import spaces
+
+import logging
 import os
+import re
 import time
 from typing import List, Dict, Tuple
+import threading
 
+import torch
 import gradio as gr
-from transformers import pipeline
-import spaces
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+
+
+logging.basicConfig(level=logging.INFO)
+LOG = logging.getLogger(__name__)
 
 # === Config (override via Space secrets/env vars) ===
-MODEL_ID = os.environ.get("MODEL_ID", "gpt-oss-safeguard-20b")
+MODEL_ID = os.environ.get("MODEL_ID", "tlhv/osb-minier")
 DEFAULT_MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 512))
 DEFAULT_TEMPERATURE = float(os.environ.get("TEMPERATURE", 1))
 DEFAULT_TOP_P = float(os.environ.get("TOP_P", 1.0))
 DEFAULT_REPETITION_PENALTY = float(os.environ.get("REPETITION_PENALTY", 1.0))
 ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", 120)) # seconds
 
+ANALYSIS_PATTERN = analysis_match = re.compile(r'^(.*)assistantfinal', flags=re.DOTALL)
+
 SAMPLE_POLICY = """
 Spam Policy (#SP)
 GOAL: Identify spam. Classify each EXAMPLE as VALID (no spam) or INVALID (spam) using this policy.
@@ -123,13 +134,42 @@ If financial harm or fraud → classify SP4.
 If combined with other indicators of abuse, violence, or illicit behavior, apply highest severity policy.
 """
 
-_pipe = None # cached pipeline
+# Globals so we only load once
+_tokenizer = None
+_model = None
+_device = None
+
+
+def _ensure_loaded():
+    LOG.info("Loading model and tokenizer")
+    global _tokenizer, _model, _device
+    if _tokenizer is not None and _model is not None:
+        return
+    _tokenizer = AutoTokenizer.from_pretrained(
+        MODEL_ID, trust_remote_code=True
+    )
+    _model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        trust_remote_code=True,
+        # torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        low_cpu_mem_usage=True,
+        device_map="auto" if torch.cuda.is_available() else None,
+    )
+    if _tokenizer.pad_token_id is None and _tokenizer.eos_token_id is not None:
+        _tokenizer.pad_token = _tokenizer.eos_token
+    _model.eval()
+    _device = next(_model.parameters()).device
+
+
+_ensure_loaded()
+LOG.info("DEVICE %s", _device)
 
 
 # ----------------------------
 # Helpers (simple & explicit)
 # ----------------------------
 
+
 def _to_messages(policy: str, user_prompt: str) -> List[Dict[str, str]]:
     msgs: List[Dict[str, str]] = []
     if policy.strip():
@@ -138,94 +178,71 @@ def _to_messages(policy: str, user_prompt: str) -> List[Dict[str, str]]:
     return msgs
 
 
-def _extract_assistant_content(outputs) -> str:
-    """Extract the assistant's content from the known shape:
-    outputs = [
-        {
-            'generated_text': [
-                {'role': 'system', 'content': ...},
-                {'role': 'user', 'content': ...},
-                {'role': 'assistant', 'content': 'analysis...assistantfinal...'}
-            ]
-        }
-    ]
-    Keep this forgiving and minimal.
-    """
-    try:
-        msgs = outputs[0]["generated_text"]
-        for m in reversed(msgs):
-            if isinstance(m, dict) and m.get("role") == "assistant":
-                return m.get("content", "")
-        last = msgs[-1]
-        return last.get("content", "") if isinstance(last, dict) else str(last)
-    except Exception:
-        return str(outputs)
-
-
-def _parse_harmony_output_from_string(s: str) -> Tuple[str, str]:
-    """Split a Harmony-style concatenated string into (analysis, final).
-    Expects markers 'analysis' ... 'assistantfinal'.
-    No heavy parsing — just string finds.
-    """
-    if not isinstance(s, str):
-        s = str(s)
-    final_key = "assistantfinal"
-    j = s.find(final_key)
-    if j != -1:
-        final_text = s[j + len(final_key):].strip()
-        i = s.find("analysis")
-        if i != -1 and i < j:
-            analysis_text = s[i + len("analysis"): j].strip()
-        else:
-            analysis_text = s[:j].strip()
-        return analysis_text, final_text
-    # no explicit final marker
-    if s.startswith("analysis"):
-        return s[len("analysis"):].strip(), ""
-    return "", s.strip()
-
-
 # ----------------------------
 # Inference
 # ----------------------------
 
 @spaces.GPU(duration=ZGPU_DURATION)
-def generate_long_prompt(
-    policy: str,
-    prompt: str,
-    max_new_tokens: int,
-    temperature: float,
-    top_p: float,
-    repetition_penalty: float,
+def generate_stream(
+    policy: str,
+    prompt: str,
+    max_new_tokens: int,
+    temperature: float,
+    top_p: float,
+    repetition_penalty: float,
 ) -> Tuple[str, str, str]:
-    global _pipe
-    start = time.time()
 
-    if _pipe is None:
-        _pipe = pipeline(
-            task="text-generation",
-            model=MODEL_ID,
-            torch_dtype="auto",
-            device_map="auto",
-        )
+    start = time.time()
 
     messages = _to_messages(policy, prompt)
 
-    outputs = _pipe(
+    streamer = TextIteratorStreamer(
+        _tokenizer,
+        skip_special_tokens=True,
+        skip_prompt=True,  # <-- key fix
+    )
+
+    inputs = _tokenizer.apply_chat_template(
         messages,
+        return_tensors="pt",
+        add_generation_prompt=True,
+    )
+    input_ids = inputs["input_ids"] if isinstance(inputs, dict) else inputs
+    input_ids = input_ids.to(_device)
+
+    gen_kwargs = dict(
+        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
+        do_sample=temperature > 0.0,
+        temperature=float(temperature),
        top_p=top_p,
-        repetition_penalty=repetition_penalty,
+        pad_token_id=_tokenizer.pad_token_id,
+        eos_token_id=_tokenizer.eos_token_id,
+        streamer=streamer,
    )
 
-    assistant_str = _extract_assistant_content(outputs)
-    analysis_text, final_text = _parse_harmony_output_from_string(assistant_str)
-
-    elapsed = time.time() - start
-    meta = f"Model: {MODEL_ID} | Time: {elapsed:.1f}s | max_new_tokens={max_new_tokens}"
-    return analysis_text or "(No analysis)", final_text or "(No answer)", meta
+    thread = threading.Thread(target=_model.generate, kwargs=gen_kwargs)
+    thread.start()
+
+    analysis = ""
+    output = ""
+    for new_text in streamer:
+        output += new_text
+        if not analysis:
+            m = ANALYSIS_PATTERN.match(output)
+            if m:
+                analysis = re.sub(r'^analysis\s*', '', m.group(1))
+                output = ""
+
+    if not analysis:
+        analysis_text = re.sub(r'^analysis\s*', '', output)
+        final_text = None
+    else:
+        analysis_text = analysis
+        final_text = output
+    elapsed = time.time() - start
+    meta = f"Model: {MODEL_ID} | Time: {elapsed:.1f}s | max_new_tokens={max_new_tokens}"
+    yield analysis_text or "(No analysis)", final_text or "(No answer)", meta
 
 
 # ----------------------------
@@ -269,7 +286,7 @@ with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
         meta = gr.Markdown()
 
     btn.click(
-        fn=generate_long_prompt,
+        fn=generate_stream,
        inputs=[policy, prompt, max_new_tokens, temperature, top_p, repetition_penalty],
        outputs=[analysis, answer, meta],
        concurrency_limit=1,