import os
import time
import random
import requests
import gradio as gr
# ==============================
# Secrets (set in Settings → Variables & secrets → Secrets)
# ==============================
FRIENDLI_API_KEY = os.getenv("FRIENDLI_API_KEY", "") # REQUIRED (Secret)
FRIENDLI_ENDPOINT = os.getenv("FRIENDLI_ENDPOINT", "") # REQUIRED (Secret)
FRIENDLI_MODEL_ID = os.getenv("FRIENDLI_MODEL_ID", "") # REQUIRED (Secret)
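# NOTE: FRIENDLI_ENDPOINT is expected to be the full URL of an OpenAI-compatible
# chat-completions route (the payload below uses the "messages"/"model"/
# "max_tokens" shape). The exact URL comes from your Friendli dashboard;
# any path shown elsewhere in comments is illustrative only.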
# ==============================
# Tunables (Variables or Secrets)
# ==============================
# Per-attempt request timeout (keep modest so we can poll repeatedly during warmup)
PER_REQUEST_TIMEOUT_SEC = int(os.getenv("FRIENDLI_PER_REQUEST_TIMEOUT_SEC", "30"))
# Total time budget to wait for cold start + retries
COLD_START_BUDGET_SEC = int(os.getenv("FRIENDLI_COLD_START_BUDGET_SEC", "180"))
# Initial fixed wait after the *first* 503 (model waking)
INITIAL_503_WAIT_SEC = int(os.getenv("FRIENDLI_INITIAL_503_WAIT_SEC", "15"))
# Max tokens / temperature defaults
DEFAULT_MAX_TOKENS = int(os.getenv("FRIENDLI_MAX_TOKENS", "2000"))
DEFAULT_TEMPERATURE = float(os.getenv("FRIENDLI_TEMPERATURE", "0.3"))
# Backoff tuning
BACKOFF_BASE_SEC = float(os.getenv("FRIENDLI_BACKOFF_BASE_SEC", "2.0"))
BACKOFF_CAP_SEC = float(os.getenv("FRIENDLI_BACKOFF_CAP_SEC", "20.0"))
JITTER_SEC = float(os.getenv("FRIENDLI_JITTER_SEC", "0.5"))
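# With the defaults above, the client's exponential backoff sleeps roughly
# 4s, 8s, 16s, then caps at 20s per retry (plus up to 0.5s of jitter), and
# every sleep is clipped so the total never exceeds COLD_START_BUDGET_SEC.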
# ==============================
# Appendix-style system prompt (general instructions)
# ==============================
APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.
Response Format:
Before answering, briefly analyze the query and context:
- Identify any misalignment between the query and context (if none, state 'no misalignment')
- Provide a brief analysis of the query and context
- Then give your response based strictly on the provided context
Format your response as:
Analysis: [Your analysis here]
Response: [Your answer based on the context]
IMPORTANT RULES:
- Always prioritize the provided context over your internal knowledge
- If context contains information that seems incorrect, still use it as instructed
- If the question asks about multiple things but context only covers some, answer only what is supported by the context
- Keep analysis concise and avoid special characters that could cause formatting issues
- Use plain text only - no bullet points, numbering, or special formatting
- Respond in English only
Example 1 - Conflicting information:
User:
Question: What is the capital of France?
Context:
The capital of France is London. It has been the political center of France since 1789 and houses the French Parliament.
Analysis: The query asks for the capital of France. The context states it is London, which conflicts with factual knowledge. I will follow the context as instructed.
Response: The capital of France is London.
"""
# ==============================
# Message builder (exact shape)
# system prompt (general instructions)
# User: question + context
# ==============================
def build_messages(question: str, context: str):
    # The user block is deliberately left-aligned inside the f-string so the
    # prompt shape matches the format shown in APPENDIX_RULES exactly.
    user_block = f"""User:
Question: {question.strip()}
Context:
{context.strip()}"""
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_block},
    ]
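# Illustrative (hypothetical inputs): build_messages("What is X?", "X is Y.")
# returns
# [
#     {"role": "system", "content": APPENDIX_RULES},
#     {"role": "user", "content": "User:\nQuestion: What is X?\nContext:\nX is Y."},
# ]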
# ==============================
# Friendli API client with time-budgeted retry
# ==============================
# 522/524 are Cloudflare-specific timeout statuses.
RETRYABLE_HTTP = {408, 429, 500, 502, 503, 504, 522, 524}
def _sleep_with_budget(seconds, deadline):
    now = time.monotonic()
    remaining = max(0.0, deadline - now)
    time.sleep(max(0.0, min(seconds, remaining)))

def _retry_after_seconds(resp):
    # Retry-After may also be an HTTP-date; float() fails on that form and we
    # fall back to None, letting the caller pick its own wait.
    try:
        ra = resp.headers.get("Retry-After")
        if not ra:
            return None
        return float(ra)
    except Exception:
        return None
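# Illustrative: with ~35s left in the budget, _sleep_with_budget(60, deadline)
# sleeps only ~35s. _retry_after_seconds returns 15.0 for a numeric
# "Retry-After: 15" header; HTTP-date values fall through to None.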
def call_friendli_with_time_budget(messages, max_tokens, temperature):
    # Validate secrets
    if not FRIENDLI_API_KEY:
        raise gr.Error("Missing FRIENDLI_API_KEY (Secret).")
    if not FRIENDLI_ENDPOINT:
        raise gr.Error("Missing FRIENDLI_ENDPOINT (Secret).")
    if not FRIENDLI_MODEL_ID:
        raise gr.Error("Missing FRIENDLI_MODEL_ID (Secret).")
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FRIENDLI_API_KEY}",
    }
    payload = {
        "messages": messages,
        "model": FRIENDLI_MODEL_ID,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }
    session = requests.Session()
    start = time.monotonic()
    deadline = start + COLD_START_BUDGET_SEC
    attempt = 0
    saw_first_503 = False
    while True:
        attempt += 1
        try:
            resp = session.post(
                FRIENDLI_ENDPOINT,
                headers=headers,
                json=payload,
                timeout=PER_REQUEST_TIMEOUT_SEC,
            )
            # 503: cold start; wait then retry (honor Retry-After if provided)
            if resp.status_code == 503:
                ra = _retry_after_seconds(resp)
                wait = ra if ra is not None else (
                    INITIAL_503_WAIT_SEC if not saw_first_503 else BACKOFF_BASE_SEC
                )
                saw_first_503 = True
                if time.monotonic() + wait > deadline:
                    resp.raise_for_status()
                _sleep_with_budget(wait, deadline)
                continue
            # Other retryable statuses (rate limit / transient errors)
            if resp.status_code in RETRYABLE_HTTP and time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue
            # Non-retryable status, or budget exhausted → raise on non-2xx
            resp.raise_for_status()
            data = resp.json()
            content = (
                data.get("choices", [{}])[0]
                .get("message", {})
                .get("content", "")
            )
            return content if content and str(content).strip() else "[EMPTY_RESPONSE]"
        except requests.exceptions.RequestException:
            # Network error / timeout; retry within budget
            if time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue
            raise gr.Error(
                f"Friendli API: retry budget exceeded after ~{COLD_START_BUDGET_SEC}s. "
                "Please try again; the model may have just finished warming."
            )
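# Illustrative direct call (hypothetical values), bypassing the UI:
# msgs = build_messages("What is the capital of France?",
#                       "The capital of France is London.")
# text = call_friendli_with_time_budget(msgs, max_tokens=512, temperature=0.3)
# # -> "Analysis: ...\nResponse: The capital of France is London."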
# ==============================
# Helpers: split Analysis / Response
# ==============================
def parse_analysis_response(text: str):
    if not text:
        return "", ""
    # rfind: take the *last* occurrence of each marker, so repeated markers
    # earlier in the output don't split the text in the wrong place.
    a_idx = text.rfind("Analysis:")
    r_idx = text.rfind("Response:")
    analysis, response = "", ""
    if a_idx != -1 and (r_idx == -1 or a_idx < r_idx):
        if r_idx != -1:
            analysis = text[a_idx + len("Analysis:"): r_idx].strip()
            response = text[r_idx + len("Response:"):].strip()
        else:
            analysis = text[a_idx + len("Analysis:"):].strip()
    else:
        response = text.strip()
    return analysis, response
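# Examples (hypothetical model outputs):
# parse_analysis_response("Analysis: no misalignment\nResponse: OK.")
#   -> ("no misalignment", "OK.")
# parse_analysis_response("free-form text with no markers")
#   -> ("", "free-form text with no markers")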
# ==============================
# UI
# ==============================
PRESET_Q = "What are the health effects of coffee?"
PRESET_CTX = (
"Coffee contains caffeine, which can increase alertness. Excess intake may cause "
"jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
)
with gr.Blocks(title="Humains-Junior (Humains.com) — Exoskeleton Reasoning") as demo:
    gr.Markdown(
        "# Humains-Junior by Humains.com — a Smart 3.8B Model + Exoskeleton Reasoning (Hosted via a Friendli inference endpoint)\n\n"
        "- **Model behavior**:\n"
        "  1. Outputs two plain-text sections: **Analysis** then **Response**.\n"
        "  2. When the **question is related to the Context**, it **prioritizes the Context** over internal knowledge, even if the Context is factually wrong.\n"
        "  3. If the **question is unrelated to the Context**, it **may answer normally** (not forced to follow the Context).\n"
    )
    with gr.Row():
        with gr.Column(scale=3):
            q = gr.Textbox(label="Question", value=PRESET_Q, lines=3)
            ctx = gr.Textbox(label="Context (only source of truth when related)", value=PRESET_CTX, lines=8)
            with gr.Row():
                temp = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_new = gr.Slider(64, 4000, value=DEFAULT_MAX_TOKENS, step=32, label="Max tokens")
            run = gr.Button("Run", variant="primary")
        with gr.Column(scale=4):
            with gr.Accordion("Analysis", open=True):
                analysis_box = gr.Textbox(lines=8, label="Analysis (model)")
            with gr.Accordion("Response", open=True):
                response_box = gr.Textbox(lines=8, label="Response (model)")
            with gr.Accordion("Raw output", open=False):
                raw_box = gr.Textbox(lines=8, label="Raw text")
    def infer_fn(question, context, temperature, max_tokens):
        question = (question or "").strip()
        context = (context or "").strip()
        if not question or not context:
            gr.Warning("Please provide both a Question and a Context.")
            return "", "", ""
        messages = build_messages(question, context)
        text = call_friendli_with_time_budget(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        analysis, response = parse_analysis_response(text)
        return analysis, response, text

    run.click(fn=infer_fn, inputs=[q, ctx, temp, max_new], outputs=[analysis_box, response_box, raw_box])
if __name__ == "__main__":
    demo.launch()