kai-api-gateway / sanitizer.py
"""
Response Sanitizer
------------------
Cleans AI responses by removing promotional spam, junk text, and
unwanted artifacts injected by some g4f providers.
Preserves legitimate markdown/rich text formatting — only strips
known spam patterns and obvious junk.
"""
import re
import json
# Known promotional spam patterns (case-insensitive)
# ORDER MATTERS — broader patterns first, narrower ones after.
SPAM_PATTERNS = [
# llmplayground.net spam
r"(?i)\n*\s*want\s+best\s+roleplay\s+experience\s*\??\s*\n*\s*https?://llmplayground\.net\s*",
r"(?i)\n*\s*https?://llmplayground\.net\s*",
# Pollinations.AI ad block
r"(?is)\n+\s*[-—–]+\s*\n+\s*\*\*Support\s+Pollinations\.AI:?\*\*\s*\n+\s*[-—–]+\s*\n+\s*🌸\s*\*\*Ad\*\*\s*🌸.+?accessible\s+for\s+everyone\.?",
# "Generated by BLACKBOX.AI" style headers (entire line + trailing promo)
# MUST come before the partial "powered/generated/created by" pattern below
r"(?im)^generated\s+by\s+.+$\n*",
# General promotional / self-referencing spam
r"(?i)\n*\s*(?:powered|generated|created)\s+by\s+[\w\s.-]+\.(?:com|net|org|ai|io)\s*",
r"(?i)\n*\s*(?:visit|check\s+out|try)\s+https?://[\w./-]+\s+for\s+(?:more|better|best)\s*[\w\s]*\s*",
r"(?i)\n*\s*(?:get|try)\s+(?:pro|premium|plus)\s+(?:at|on)\s+https?://[\w./-]+\s*",
# Common g4f provider junk footers
r"(?i)\n+[-—–]+\s*\n*\s*(?:this|note|disclaimer)[\s\S]{0,200}(?:free|open[\s-]?source|community|g4f)[\s\S]{0,100}$",
# Trailing promotional URLs (not part of the actual answer)
r"\n{2,}\s*https?://(?!(?:en\.)?wikipedia\.org|docs\.|github\.com)[\w./-]+\s*$",
# Gemini UI Artifacts
r"(?i)\n*\s*Export to Sheets\s*",
r"(?i)\n*\s*Show drafts\s*",
r"(?i)\n*\s*Regenerate drafts\s*",
r"(?i)\n*\s*tuneshare\s*",
r"(?i)\n*\s*more_vert\s*",
]
# Compile patterns once for performance
COMPILED_SPAM = [re.compile(p) for p in SPAM_PATTERNS]
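# A minimal illustration of the patterns above (hypothetical provider output;
# the URL comes from the spam list, not from a real response):
#   "Paris is the capital of France.\n\nWant best roleplay experience?\nhttps://llmplayground.net"
#   -> sanitize_response() below returns "Paris is the capital of France."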
# Whitespace cleanup patterns
EXCESS_NEWLINES = re.compile(r"\n{3,}")
TRAILING_WHITESPACE = re.compile(r"[ \t]+$", re.MULTILINE)
# Pattern for standalone /N artifact
ARTIFACT_N_PATTERN = re.compile(r"(?<!\w)/N(?!\w)")
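# Illustrative behavior (hypothetical inputs):
#   "Summary complete. /N Next steps:"  -> the standalone "/N" becomes a newline
#   "See /News" or "A/N:"               -> untouched; the \w lookarounds reject both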
def sanitize_response(text: str) -> str:
"""
Clean an AI response by removing spam and junk while preserving
legitimate markdown formatting.
    Also extracts the actual answer when the model returns a JSON-wrapped
    payload, strips internal reasoning traces (CoT), and decodes stray
    escape sequences.
Keeps:
- **bold**, *italic*, ~~strikethrough~~
- # headers
- - bullet lists
- 1. numbered lists
- `code` and ```code blocks```
- [links](url) that are part of the actual answer
- line breaks and paragraph structure
"""
if not text:
return ""
cleaned = text.strip()
# === JSON Unwrapping (Fix for Pollinations/Reasoning Models) ===
# Some models return a JSON string instead of raw text.
# E.g. {"role": "assistant", "reasoning_content": "...", "content": "Hello"}
if cleaned.startswith("{") and cleaned.endswith("}"):
try:
data = json.loads(cleaned)
if isinstance(data, dict):
# Extract actual content, ignoring reasoning/internal thought
candidate = (
data.get("content") or
data.get("response") or
data.get("message") or
data.get("answer")
)
if candidate and isinstance(candidate, str):
cleaned = candidate.strip()
except (json.JSONDecodeError, ValueError):
pass # Not valid JSON, process as regular text
# === Reasoning Cleanup (Chain-of-Thought) ===
# Remove <think> tags used by DeepSeek and similar reasoning models
cleaned = re.sub(r"<think>.*?</think>", "", cleaned, flags=re.DOTALL)
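    # e.g. (illustrative input) "<think>plan the reply...</think>Final answer." -> "Final answer."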
# === Artifact & Escape Sequence Cleanup ===
    # Fix literal escape sequences that leak through from poor JSON dumps
cleaned = cleaned.replace("\\n", "\n")
cleaned = cleaned.replace("\\r", "")
cleaned = cleaned.replace("\\t", "\t")
# Fix user-reported "/N" artifact (treat as newline if standalone)
cleaned = ARTIFACT_N_PATTERN.sub("\n", cleaned)
    # === Unescape JSON/Raw Literals ===
    # Robustly decode escape sequences like \n, \", \t using Python's codec.
    # Run up to 2 passes to catch double-encoded strings (common in some scraped JSON).
    for _ in range(2):
        if "\\n" not in cleaned and '\\"' not in cleaned:
            break
        # If it looks like a JSON string literal (wrapped in quotes), strip them first
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]
        try:
            # Round-trip through latin-1 so multi-byte UTF-8 characters
            # (e.g. emoji) are not mangled by the unicode_escape pass
            cleaned = cleaned.encode("latin-1", "backslashreplace").decode("unicode_escape")
        except Exception:
            # Fallback to manual replacement if the codec fails
            cleaned = cleaned.replace("\\n", "\n").replace('\\"', '"')
# === Spam Removal ===
for pattern in COMPILED_SPAM:
cleaned = pattern.sub("", cleaned)
# === Whitespace Cleanup ===
cleaned = EXCESS_NEWLINES.sub("\n\n", cleaned)
cleaned = TRAILING_WHITESPACE.sub("", cleaned)
return cleaned.strip()
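

if __name__ == "__main__":
    # Quick manual check: a rough sketch with made-up provider outputs, meant only
    # to exercise the main paths (spam stripping, JSON unwrapping, escape decoding).
    samples = [
        "Paris is the capital of France.\n\nWant best roleplay experience?\nhttps://llmplayground.net",
        '{"role": "assistant", "reasoning_content": "thinking...", "content": "Hello! How can I help?"}',
        "Line one\\nLine two\\ttabbed",
    ]
    for sample in samples:
        print(repr(sanitize_response(sample)))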