Update app.py
app.py CHANGED

Before (removed lines marked with "-"):

@@ -1,24 +1,21 @@
-import os
-
-# --- CRITICAL: SET ENVIRONMENT VARIABLES BEFORE IMPORTING GRADIO ---
-# This ensures a stable Gradio environment.
 os.environ["GRADIO_ENABLE_SSR"] = "0"
-
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import login
 
-# --- Hugging Face Login ---
 HF_READONLY_API_KEY = os.getenv("HF_READONLY_API_KEY")
-
-login(token=HF_READONLY_API_KEY)
 
-
 SYSTEM_PROMPT = """You are a guardian model evaluating…</explanation>"""
-COT_OPENING = "<think>"
 
-# --- Helper Functions ---
 def format_rules(rules):
     formatted_rules = "<rules>\n"
     for i, rule in enumerate(rules):
@@ -30,113 +27,159 @@ def format_transcript(transcript):
     formatted_transcript = f"<transcript>\n{transcript}\n</transcript>\n"
     return formatted_transcript
 
-def
-""
-
-""
-
-
-
-
-
-
-
-
-
-        if len(candidate.encode('utf-8')) <= max_bytes:
-            result = candidate
-            left = mid + 1
-        else:
-            right = mid - 1
-
-    # Add a truncation notice if the text was shortened
-    if len(result) < len(text):
-        notice = "\n\n[Response truncated to prevent server errors]"
-        notice_bytes = len(notice.encode('utf-8'))
-        # Make space for the notice itself
-        if len(result.encode('utf-8')) + notice_bytes > max_bytes:
-            result = result[:len(result) - len(notice)]
-        result += notice
-
-    return result
 
-# --- Your Original ModelWrapper Class ---
-# Bringing this back as it's a good way to organize your model logic.
 class ModelWrapper:
     def __init__(self, model_name="Qwen/Qwen3-0.6B"):
-        print(f"Loading model: {model_name}")
         self.model_name = model_name
-
         self.tokenizer.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
         self.model = AutoModelForCausalLM.from_pretrained(
             model_name, device_map="auto", torch_dtype=torch.bfloat16).eval()
-        print("Model loaded successfully.")
 
-    def
-
         with torch.no_grad():
-
                 **inputs,
                 max_new_tokens=max_new_tokens,
                 temperature=temperature,
                 top_p=top_p,
                 pad_token_id=self.tokenizer.pad_token_id,
                 do_sample=True,
                 eos_token_id=self.tokenizer.eos_token_id
             )
-
-
 
-#
-
 
-#
 def compliance_check(rules_text, transcript_text, thinking):
     try:
-
-        if not rules_text.strip():
-            return "Error: Please provide at least one rule."
-        if not transcript_text.strip():
-            return "Error: Please provide a transcript to analyze."
-
-        rules = [r.strip() for r in rules_text.split("\n") if r.strip()]
         inp = format_rules(rules) + format_transcript(transcript_text)
 
-        #
-
-            {'role': 'system', 'content': SYSTEM_PROMPT},
-            {'role': 'user', 'content': inp}
-        ]
-        prompt = model_wrapper.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 
-
-
 
-
-        out = model_wrapper.get_response(prompt)
 
-        if not out.strip():
-            out = "No response generated from the model."
-
     except Exception as e:
-
-
 
-    # Apply safe truncation to ALL possible outputs (both success and error)
-    return safe_truncate_to_bytes(out.strip())
 
-# —
 demo = gr.Interface(
     fn=compliance_check,
     inputs=[
-        gr.Textbox(lines=5, label="Rules (one per line)",
-        gr.Textbox(lines=10, label="Transcript",
         gr.Checkbox(label="Enable ⟨think⟩ mode", value=True)
     ],
-    outputs=gr.Textbox(label="Compliance Output", lines=10,
     title="DynaGuard Compliance Checker",
     description="Paste your rules & transcript, then hit Submit.",
-
 )
 
 if __name__ == "__main__":
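The helper removed above, safe_truncate_to_bytes, is only partially captured in this view. From the surviving lines it appears to binary-search for the longest prefix whose UTF-8 encoding fits within a byte budget and then append a truncation notice. A minimal sketch of that idea follows; the signature, the default max_bytes, and the loop setup are assumptions, not the original code.

# Sketch only: the definition is truncated above; signature and loop setup are assumed.
def safe_truncate_to_bytes(text, max_bytes=2500):
    # Binary-search the longest prefix whose UTF-8 encoding fits within max_bytes.
    result = ""
    left, right = 0, len(text)
    while left <= right:
        mid = (left + right) // 2
        candidate = text[:mid]
        if len(candidate.encode('utf-8')) <= max_bytes:
            result = candidate
            left = mid + 1
        else:
            right = mid - 1

    # Add a truncation notice if the text was shortened
    if len(result) < len(text):
        notice = "\n\n[Response truncated to prevent server errors]"
        notice_bytes = len(notice.encode('utf-8'))
        # Make space for the notice itself
        if len(result.encode('utf-8')) + notice_bytes > max_bytes:
            result = result[:len(result) - len(notice)]
        result += notice

    return result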

After (added lines marked with "+"):

@@ -1,24 +1,21 @@
 os.environ["GRADIO_ENABLE_SSR"] = "0"
+import os
 import gradio as gr
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
 from huggingface_hub import login
 
 HF_READONLY_API_KEY = os.getenv("HF_READONLY_API_KEY")
+login(token=HF_READONLY_API_KEY)
 
+COT_OPENING = "<think>"
+EXPLANATION_OPENING = "<explanation>"
+LABEL_OPENING = "<answer>"
+LABEL_CLOSING = "</answer>"
+INPUT_FIELD = "question"
 SYSTEM_PROMPT = """You are a guardian model evaluating…</explanation>"""
 
 def format_rules(rules):
     formatted_rules = "<rules>\n"
     for i, rule in enumerate(rules):

@@ -30,113 +27,159 @@ def format_transcript(transcript):
     formatted_transcript = f"<transcript>\n{transcript}\n</transcript>\n"
     return formatted_transcript
 
+def get_example(
+    dataset_path="tomg-group-umd/compliance_benchmark",
+    subset="compliance",
+    split="test_handcrafted",
+    example_idx=0,
+):
+    dataset = load_dataset(dataset_path, subset, split=split)
+    example = dataset[example_idx]
+    return example[INPUT_FIELD]
+
+def get_message(model, input, system_prompt=SYSTEM_PROMPT, enable_thinking=True):
+    message = model.apply_chat_template(system_prompt, input, enable_thinking=enable_thinking)
+    return message
 
 class ModelWrapper:
     def __init__(self, model_name="Qwen/Qwen3-0.6B"):
         self.model_name = model_name
+        if "nemoguard" in model_name:
+            self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+        else:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.tokenizer.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
         self.model = AutoModelForCausalLM.from_pretrained(
             model_name, device_map="auto", torch_dtype=torch.bfloat16).eval()
 
+    def get_message_template(self, system_content=None, user_content=None, assistant_content=None):
+        """Compile sys, user, assistant inputs into the proper dictionaries"""
+        message = []
+        if system_content is not None:
+            message.append({'role': 'system', 'content': system_content})
+        if user_content is not None:
+            message.append({'role': 'user', 'content': user_content})
+        if assistant_content is not None:
+            message.append({'role': 'assistant', 'content': assistant_content})
+        if not message:
+            raise ValueError("No content provided for any role.")
+        return message
+
+    def apply_chat_template(self, system_content, user_content, assistant_content=None, enable_thinking=True):
+        """Call the tokenizer's chat template with exactly the right arguments for whether we want it to generate thinking before the answer (which differs depending on whether it is Qwen3 or not)."""
+        if assistant_content is not None:
+            # If assistant content is passed we simply use it.
+            # This works for both Qwen3 and non-Qwen3 models. With Qwen3 any time assistant_content is provided, it automatically adds the <think></think> pair before the content, which is what we want.
+            message = self.get_message_template(system_content, user_content, assistant_content)
+            prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True)
+        else:
+            if enable_thinking:
+                if "qwen3" in self.model_name.lower():
+                    # Let the Qwen chat template handle the thinking token
+                    message = self.get_message_template(system_content, user_content)
+                    prompt = self.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True, enable_thinking=True)
+                    # The way the Qwen3 chat template works is it adds a <think></think> pair when enable_thinking=False, but for enable_thinking=True, it adds nothing and lets the model decide. Here we force the <think> tag to be there.
+                    prompt = prompt + f"\n{COT_OPENING}"
+                else:
+                    message = self.get_message_template(system_content, user_content, assistant_content=COT_OPENING)
+                    prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True)
+            else:
+                # This works for both Qwen3 and non-Qwen3 models.
+                # When Qwen3 gets assistant_content, it automatically adds the <think></think> pair before the content like we want. And other models ignore the enable_thinking argument.
+                message = self.get_message_template(system_content, user_content, assistant_content=LABEL_OPENING)
+                prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True, enable_thinking=False)
+        return prompt
+
+    def get_response(self, input, temperature=0.7, top_k=20, top_p=0.8, max_new_tokens=256, enable_thinking=True, system_prompt=SYSTEM_PROMPT):
+        """Generate and decode the response with the recommended temperature settings for thinking and non-thinking."""
+        print("Generating response...")
+
+        if "qwen3" in self.model_name.lower() and enable_thinking:
+            # Use values from https://huggingface.co/Qwen/Qwen3-8B#switching-between-thinking-and-non-thinking-mode
+            temperature = 0.6
+            top_p = 0.95
+            top_k = 20
+
+        message = self.apply_chat_template(system_prompt, input, enable_thinking=enable_thinking)
+        inputs = self.tokenizer(message, return_tensors="pt").to(self.model.device)
+
         with torch.no_grad():
+            output_content = self.model.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
+                num_return_sequences=1,
                 temperature=temperature,
+                top_k=top_k,
                 top_p=top_p,
+                min_p=0,
                 pad_token_id=self.tokenizer.pad_token_id,
                 do_sample=True,
                 eos_token_id=self.tokenizer.eos_token_id
             )
+
+        output_text = self.tokenizer.decode(output_content[0], skip_special_tokens=True)
+
+        try:
+            sys_prompt_text = output_text.split("Brief explanation\n</explanation>")[0]
+            remainder = output_text.split("Brief explanation\n</explanation>")[-1]
+            rules_transcript_text = remainder.split("</transcript>")[0]
+            thinking_answer_text = remainder.split("</transcript>")[-1]
+            return thinking_answer_text
+        except:
+            # If parsing fails, return the portion after the input
+            input_length = len(message)
+            return output_text[input_length:] if len(output_text) > input_length else "No response generated."
 
+# — instantiate your model —
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+model = ModelWrapper(MODEL_NAME)
 
+# — Gradio inference function —
 def compliance_check(rules_text, transcript_text, thinking):
     try:
+        rules = [r for r in rules_text.split("\n") if r.strip()]
         inp = format_rules(rules) + format_transcript(transcript_text)
 
+        # Limit max tokens to prevent oversized responses
+        out = model.get_response(inp, enable_thinking=thinking, max_new_tokens=256)
 
+        # Clean up any malformed output and ensure it's a string
+        out = str(out).strip()
+        if not out:
+            out = "No response generated. Please try with different input."
+
+        # Ensure the response isn't too long for an HTTP response by checking byte length
+        max_bytes = 2500  # A more generous limit, in bytes
+        out_bytes = out.encode('utf-8')
+
+        if len(out_bytes) > max_bytes:
+            # Truncate the byte string, then decode back to a string, ignoring errors
+            # This prevents cutting a multi-byte character in half
+            truncated_bytes = out_bytes[:max_bytes]
+            out = truncated_bytes.decode('utf-8', errors='ignore')
+            out += "\n\n[Response truncated to prevent server errors]"
 
+        return out
 
     except Exception as e:
+        error_msg = f"Error: {str(e)[:200]}"  # Limit error message length
+        print(f"Full error: {e}")
+        return error_msg
 
 
+# — build Gradio interface —
 demo = gr.Interface(
     fn=compliance_check,
     inputs=[
+        gr.Textbox(lines=5, label="Rules (one per line)", max_lines=10),
+        gr.Textbox(lines=10, label="Transcript", max_lines=15),
         gr.Checkbox(label="Enable ⟨think⟩ mode", value=True)
     ],
+    outputs=gr.Textbox(label="Compliance Output", lines=10, max_lines=15),
     title="DynaGuard Compliance Checker",
     description="Paste your rules & transcript, then hit Submit.",
+    allow_flagging="never",
+    show_progress=True
 )
 
 if __name__ == "__main__":
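For a quick check outside the Gradio UI, here is a minimal sketch of how the updated code might be exercised. It assumes the new app.py is importable as app (importing it runs the module-level login and model load), that HF_READONLY_API_KEY is set, and that the benchmark dataset is reachable; the example rules and transcript are invented placeholders, not data from the commit.

# Sketch only: drives the new helpers directly; the inputs below are illustrative.
from app import compliance_check, get_example, model

# Run the guardian model on one handcrafted benchmark example.
sample = get_example(example_idx=0)
print(model.get_response(sample, enable_thinking=True, max_new_tokens=256))

# Or go through the same function the Gradio interface calls.
rules = "Never share internal account numbers.\nAlways offer to escalate to a human agent."
transcript = "User: Can you read me my account number?\nAgent: Sure, it's 12345678."
print(compliance_check(rules, transcript, thinking=True))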