kai-api-gateway / sanitizer.py
"""
Response Sanitizer
------------------
Cleans AI responses by removing promotional spam, junk text, and
unwanted artifacts injected by some g4f providers.
Preserves legitimate markdown/rich text formatting — only strips
known spam patterns and obvious junk.
"""
import re
import json
# Known promotional spam patterns (case-insensitive)
# ORDER MATTERS — broader patterns first, narrower ones after.
SPAM_PATTERNS = [
# llmplayground.net spam
r"(?i)\n*\s*want\s+best\s+roleplay\s+experience\s*\??\s*\n*\s*https?://llmplayground\.net\s*",
r"(?i)\n*\s*https?://llmplayground\.net\s*",
# Pollinations.AI ad block
r"(?is)\n+\s*[-—–]+\s*\n+\s*\*\*Support\s+Pollinations\.AI:?\*\*\s*\n+\s*[-—–]+\s*\n+\s*🌸\s*\*\*Ad\*\*\s*🌸.+?accessible\s+for\s+everyone\.?",
# "Generated by BLACKBOX.AI" style headers (entire line + trailing promo)
# MUST come before the partial "powered/generated/created by" pattern below
r"(?im)^generated\s+by\s+.+$\n*",
# General promotional / self-referencing spam
r"(?i)\n*\s*(?:powered|generated|created)\s+by\s+[\w\s.-]+\.(?:com|net|org|ai|io)\s*",
r"(?i)\n*\s*(?:visit|check\s+out|try)\s+https?://[\w./-]+\s+for\s+(?:more|better|best)\s*[\w\s]*\s*",
r"(?i)\n*\s*(?:get|try)\s+(?:pro|premium|plus)\s+(?:at|on)\s+https?://[\w./-]+\s*",
# Common g4f provider junk footers
r"(?i)\n+[-—–]+\s*\n*\s*(?:this|note|disclaimer)[\s\S]{0,200}(?:free|open[\s-]?source|community|g4f)[\s\S]{0,100}$",
# Trailing promotional URLs (not part of the actual answer)
r"\n{2,}\s*https?://(?!(?:en\.)?wikipedia\.org|docs\.|github\.com)[\w./-]+\s*$",
# Gemini UI Artifacts
r"(?i)\n*\s*Export to Sheets\s*",
r"(?i)\n*\s*Show drafts\s*",
r"(?i)\n*\s*Regenerate drafts\s*",
r"(?i)\n*\s*tuneshare\s*",
r"(?i)\n*\s*more_vert\s*",
]
# Compile patterns once for performance
COMPILED_SPAM = [re.compile(p) for p in SPAM_PATTERNS]
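# A minimal illustration of the patterns above (hypothetical provider output;
# the URL comes from the spam list, not from a real response):
#   "Paris is the capital of France.\n\nWant best roleplay experience?\nhttps://llmplayground.net"
#   -> sanitize_response() below returns "Paris is the capital of France."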
# Whitespace cleanup patterns
EXCESS_NEWLINES = re.compile(r"\n{3,}")
TRAILING_WHITESPACE = re.compile(r"[ \t]+$", re.MULTILINE)
# Pattern for standalone /N artifact
ARTIFACT_N_PATTERN = re.compile(r"(?<!\w)/N(?!\w)")
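# Illustrative behavior (hypothetical inputs):
#   "Summary complete. /N Next steps:"  -> the standalone "/N" becomes a newline
#   "See /News" or "A/N:"               -> untouched; the \w lookarounds reject both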
def sanitize_response(text: str) -> str:
"""
Clean an AI response by removing spam and junk while preserving
legitimate markdown formatting.
    Also extracts the actual answer when the model returns a JSON-wrapped
    payload, strips internal reasoning traces (CoT), and decodes stray
    escape sequences.
Keeps:
- **bold**, *italic*, ~~strikethrough~~
- # headers
- - bullet lists
- 1. numbered lists
- `code` and ```code blocks```
- [links](url) that are part of the actual answer
- line breaks and paragraph structure
"""
if not text:
return ""
cleaned = text.strip()
# === JSON Unwrapping (Fix for Pollinations/Reasoning Models) ===
# Some models return a JSON string instead of raw text.
# E.g. {"role": "assistant", "reasoning_content": "...", "content": "Hello"}
if cleaned.startswith("{") and cleaned.endswith("}"):
try:
data = json.loads(cleaned)
if isinstance(data, dict):
# Extract actual content, ignoring reasoning/internal thought
candidate = (
data.get("content") or
data.get("response") or
data.get("message") or
data.get("answer")
)
if candidate and isinstance(candidate, str):
cleaned = candidate.strip()
except (json.JSONDecodeError, ValueError):
pass # Not valid JSON, process as regular text
# === Reasoning Cleanup (Chain-of-Thought) ===
# Remove <think> tags used by DeepSeek and similar reasoning models
cleaned = re.sub(r"<think>.*?</think>", "", cleaned, flags=re.DOTALL)
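    # e.g. (illustrative input) "<think>plan the reply...</think>Final answer." -> "Final answer."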
# === Artifact & Escape Sequence Cleanup ===
    # Fix literal escape sequences that leak through from poor JSON dumps
cleaned = cleaned.replace("\\n", "\n")
cleaned = cleaned.replace("\\r", "")
cleaned = cleaned.replace("\\t", "\t")
# Fix user-reported "/N" artifact (treat as newline if standalone)
cleaned = ARTIFACT_N_PATTERN.sub("\n", cleaned)
    # === Unescape JSON/Raw Literals ===
    # Robustly decode escape sequences like \n, \", \t using Python's codec.
    # Run up to 2 passes to catch double-encoded strings (common in some scraped JSON).
    for _ in range(2):
        if "\\n" not in cleaned and '\\"' not in cleaned:
            break
        # If it looks like a JSON string literal (wrapped in quotes), strip them first
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]
        try:
            # Round-trip through latin-1 so multi-byte UTF-8 characters
            # (e.g. emoji) are not mangled by the unicode_escape pass
            cleaned = cleaned.encode("latin-1", "backslashreplace").decode("unicode_escape")
        except Exception:
            # Fallback to manual replacement if the codec fails
            cleaned = cleaned.replace("\\n", "\n").replace('\\"', '"')
# === Spam Removal ===
for pattern in COMPILED_SPAM:
cleaned = pattern.sub("", cleaned)
# === Whitespace Cleanup ===
cleaned = EXCESS_NEWLINES.sub("\n\n", cleaned)
cleaned = TRAILING_WHITESPACE.sub("", cleaned)
return cleaned.strip()
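

if __name__ == "__main__":
    # Quick manual check: a rough sketch with made-up provider outputs, meant only
    # to exercise the main paths (spam stripping, JSON unwrapping, escape decoding).
    samples = [
        "Paris is the capital of France.\n\nWant best roleplay experience?\nhttps://llmplayground.net",
        '{"role": "assistant", "reasoning_content": "thinking...", "content": "Hello! How can I help?"}',
        "Line one\\nLine two\\ttabbed",
    ]
    for sample in samples:
        print(repr(sanitize_response(sample)))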