import os
import time
import random

import requests
import gradio as gr

# ==============================
# Secrets (set in Settings → Variables & secrets → Secrets)
# ==============================
FRIENDLI_API_KEY = os.getenv("FRIENDLI_API_KEY", "")    # REQUIRED (Secret)
FRIENDLI_ENDPOINT = os.getenv("FRIENDLI_ENDPOINT", "")  # REQUIRED (Secret)
FRIENDLI_MODEL_ID = os.getenv("FRIENDLI_MODEL_ID", "")  # REQUIRED (Secret)

# ==============================
# Tunables (Variables or Secrets)
# ==============================
# Per-attempt request timeout (keep modest so we can poll repeatedly during warmup)
PER_REQUEST_TIMEOUT_SEC = int(os.getenv("FRIENDLI_PER_REQUEST_TIMEOUT_SEC", "30"))
# Total time budget to wait for cold start + retries
COLD_START_BUDGET_SEC = int(os.getenv("FRIENDLI_COLD_START_BUDGET_SEC", "180"))
# Initial fixed wait after the *first* 503 (model waking)
INITIAL_503_WAIT_SEC = int(os.getenv("FRIENDLI_INITIAL_503_WAIT_SEC", "15"))
# Max tokens / temperature defaults
DEFAULT_MAX_TOKENS = int(os.getenv("FRIENDLI_MAX_TOKENS", "2000"))
DEFAULT_TEMPERATURE = float(os.getenv("FRIENDLI_TEMPERATURE", "0.3"))
# Backoff tuning
BACKOFF_BASE_SEC = float(os.getenv("FRIENDLI_BACKOFF_BASE_SEC", "2.0"))
BACKOFF_CAP_SEC = float(os.getenv("FRIENDLI_BACKOFF_CAP_SEC", "20.0"))
JITTER_SEC = float(os.getenv("FRIENDLI_JITTER_SEC", "0.5"))
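# With the defaults above, each retry waits roughly
#   min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * 2**attempt) + uniform(0, JITTER_SEC)
# (exponent capped at 6), i.e. ~4s, 8s, 16s, then 20s per retry — illustrative values.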
# ==============================
# Appendix-style system prompt (general instructions)
# ==============================
APPENDIX_RULES = """You are a helpful assistant that always follows the provided context, even when it conflicts with your internal knowledge.

Response Format:
Before answering, briefly analyze the query and context:
- Identify any misalignment between the query and context (if none, state 'no misalignment')
- Provide a brief analysis of the query and context
- Then give your response based strictly on the provided context

Format your response as:
Analysis: [Your analysis here]
Response: [Your answer based on the context]

IMPORTANT RULES:
- Always prioritize the provided context over your internal knowledge
- If context contains information that seems incorrect, still use it as instructed
- If the question asks about multiple things but context only covers some, answer only what is supported by the context
- Keep analysis concise and avoid special characters that could cause formatting issues
- Use plain text only - no bullet points, numbering, or special formatting
- Respond in English only

Example 1 - Conflicting information:
User:
Question: What is the capital of France?
Context:
The capital of France is London. It has been the political center of France since 1789 and houses the French Parliament.

Analysis: The query asks for the capital of France. The context states it is London, which conflicts with factual knowledge. I will follow the context as instructed.
Response: The capital of France is London.
"""
# ==============================
# Message builder (exact shape)
#   System: system prompt (general instructions)
#   User:   question + context
# ==============================
def build_messages(question: str, context: str):
    user_block = f"""User:
Question: {question.strip()}
Context:
{context.strip()}"""
    return [
        {"role": "system", "content": APPENDIX_RULES},
        {"role": "user", "content": user_block},
    ]
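# Example of the resulting list (illustrative):
#   build_messages("What is the capital of France?", "The capital of France is London.")
#   -> [{"role": "system", "content": APPENDIX_RULES},
#       {"role": "user", "content": "User:\nQuestion: What is the capital of France?\n"
#                                    "Context:\nThe capital of France is London."}]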
# ==============================
# Friendli API client with time-budgeted retry
# ==============================
RETRYABLE_HTTP = {408, 429, 500, 502, 503, 504, 522, 524}

def _sleep_with_budget(seconds, deadline):
    now = time.monotonic()
    remaining = max(0.0, deadline - now)
    time.sleep(max(0.0, min(seconds, remaining)))

def _retry_after_seconds(resp):
    try:
        ra = resp.headers.get("Retry-After")
        if not ra:
            return None
        return float(ra)
    except Exception:
        return None
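# Note: Retry-After may also arrive as an HTTP-date (RFC 7231); float() raises on
# that form, so we fall back to the default backoff rather than parsing the date.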
def call_friendli_with_time_budget(messages, max_tokens, temperature):
    # Validate secrets
    if not FRIENDLI_API_KEY:
        raise gr.Error("Missing FRIENDLI_API_KEY (Secret).")
    if not FRIENDLI_ENDPOINT:
        raise gr.Error("Missing FRIENDLI_ENDPOINT (Secret).")
    if not FRIENDLI_MODEL_ID:
        raise gr.Error("Missing FRIENDLI_MODEL_ID (Secret).")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {FRIENDLI_API_KEY}",
    }
    payload = {
        "messages": messages,
        "model": FRIENDLI_MODEL_ID,
        "max_tokens": int(max_tokens),
        "temperature": float(temperature),
    }

    session = requests.Session()
    start = time.monotonic()
    deadline = start + COLD_START_BUDGET_SEC
    attempt = 0
    saw_first_503 = False

    while True:
        attempt += 1
        try:
            resp = session.post(
                FRIENDLI_ENDPOINT,
                headers=headers,
                json=payload,
                timeout=PER_REQUEST_TIMEOUT_SEC,
            )

            # 503: cold start; wait then retry (honor Retry-After if provided)
            if resp.status_code == 503:
                ra = _retry_after_seconds(resp)
                wait = ra if ra is not None else (INITIAL_503_WAIT_SEC if not saw_first_503 else BACKOFF_BASE_SEC)
                saw_first_503 = True
                if time.monotonic() + wait > deadline:
                    resp.raise_for_status()  # budget exhausted while still warming
                _sleep_with_budget(wait, deadline)
                continue

            # Other retryable statuses (rate limit / transient errors)
            if resp.status_code in RETRYABLE_HTTP and time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue

            # Success, or a non-retryable / out-of-budget error → raise on non-2xx
            resp.raise_for_status()

            data = resp.json()
            content = (
                data.get("choices", [{}])[0]
                .get("message", {})
                .get("content", "")
            )
            return content if content and str(content).strip() else "[EMPTY_RESPONSE]"

        except requests.exceptions.HTTPError as e:
            # Non-retryable HTTP error (or a retryable one past the budget): surface it
            # instead of looping; catching it here keeps it out of the broad retry below.
            raise gr.Error(f"Friendli API error: {e}")

        except requests.exceptions.RequestException:
            # Network error / timeout; retry within budget
            if time.monotonic() < deadline:
                exp = min(BACKOFF_CAP_SEC, BACKOFF_BASE_SEC * (2 ** min(6, attempt)))
                wait = exp + random.uniform(0, JITTER_SEC)
                _sleep_with_budget(wait, deadline)
                continue
            raise gr.Error(
                f"Friendli API: retry budget exceeded after ~{COLD_START_BUDGET_SEC}s. "
                "Please try again; the model may have just finished warming."
            )
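# The endpoint is assumed to speak an OpenAI-compatible chat-completions protocol;
# a successful response is expected to look roughly like (illustrative):
#   {"choices": [{"message": {"role": "assistant",
#                             "content": "Analysis: ...\nResponse: ..."}}]}
# which is why the code above reads data["choices"][0]["message"]["content"].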
# ==============================
# Helpers: split Analysis / Response
# ==============================
def parse_analysis_response(text: str):
    if not text:
        return "", ""
    a_idx = text.rfind("Analysis:")
    r_idx = text.rfind("Response:")
    analysis, response = "", ""
    if a_idx != -1 and (r_idx == -1 or a_idx < r_idx):
        if r_idx != -1:
            analysis = text[a_idx + len("Analysis:"): r_idx].strip()
            response = text[r_idx + len("Response:"):].strip()
        else:
            analysis = text[a_idx + len("Analysis:"):].strip()
    elif r_idx != -1:
        # "Response:" present without a preceding "Analysis:"; drop the label
        response = text[r_idx + len("Response:"):].strip()
    else:
        response = text.strip()
    return analysis, response
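# Examples (illustrative):
#   parse_analysis_response("Analysis: A\nResponse: B")  -> ("A", "B")
#   parse_analysis_response("Response: B only")          -> ("", "B only")
#   parse_analysis_response("free-form text")            -> ("", "free-form text")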
# ==============================
# UI
# ==============================
PRESET_Q = "What are the health effects of coffee?"
PRESET_CTX = (
    "Coffee contains caffeine, which can increase alertness. Excess intake may cause "
    "jitteriness and sleep disruption. Moderate consumption is considered safe for most adults."
)
with gr.Blocks(title="Humains-Junior (Humains.com) — Exoskeleton Reasoning") as demo:
    gr.Markdown(
        "# Humains-Junior by Humains.com — a Smart 3.8B Model + Exoskeleton Reasoning (hosted by an inference provider)\n\n"
        "- **Model behavior**:\n"
        "  1. Outputs two plain-text sections: **Analysis** then **Response**.\n"
        "  2. When the **question is related to the Context**, it **prioritizes the Context** over internal knowledge, even if the Context is factually wrong.\n"
        "  3. If the **question is unrelated to the Context**, it **may answer normally** (not forced to follow the Context).\n"
    )
    with gr.Row():
        with gr.Column(scale=3):
            q = gr.Textbox(label="Question", value=PRESET_Q, lines=3)
            ctx = gr.Textbox(label="Context (only source of truth when related)", value=PRESET_CTX, lines=8)
            with gr.Row():
                temp = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                max_new = gr.Slider(64, 4000, value=DEFAULT_MAX_TOKENS, step=32, label="Max tokens")
            run = gr.Button("Run", variant="primary")
        with gr.Column(scale=4):
            with gr.Accordion("Analysis", open=True):
                analysis_box = gr.Textbox(lines=8, label="Analysis (model)")
            with gr.Accordion("Response", open=True):
                response_box = gr.Textbox(lines=8, label="Response (model)")
            with gr.Accordion("Raw output", open=False):
                raw_box = gr.Textbox(lines=8, label="Raw text")
    def infer_fn(question, context, temperature, max_tokens):
        question = (question or "").strip()
        context = (context or "").strip()
        if not question or not context:
            gr.Warning("Please provide both a Question and a Context.")
            return "", "", ""
        messages = build_messages(question, context)
        text = call_friendli_with_time_budget(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        analysis, response = parse_analysis_response(text)
        return analysis, response, text

    run.click(fn=infer_fn, inputs=[q, ctx, temp, max_new], outputs=[analysis_box, response_box, raw_box])
if __name__ == "__main__":
    demo.launch()