Spaces:
Running
Running
| """ | |
| Response Sanitizer | |
| ------------------ | |
| Cleans AI responses by removing promotional spam, junk text, and | |
| unwanted artifacts injected by some g4f providers. | |
| Preserves legitimate markdown/rich text formatting — only strips | |
| known spam patterns and obvious junk. | |
| """ | |
| import re | |
| import json | |
# Known promotional spam patterns. Each entry carries its own inline flags
# ((?i), (?s), (?m)) so the list can be compiled without extra arguments.
# ORDER MATTERS — the patterns are applied sequentially, broader ones first
# and narrower ones after.
SPAM_PATTERNS = [
    # llmplayground.net spam: the teaser line followed by the URL, then any
    # remaining bare occurrence of the URL.
    r"(?i)\n*\s*want\s+best\s+roleplay\s+experience\s*\??\s*\n*\s*https?://llmplayground\.net\s*",
    r"(?i)\n*\s*https?://llmplayground\.net\s*",
    # Pollinations.AI ad block (rule line, bold header, rule line, ad body
    # up to "accessible for everyone").
    r"(?is)\n+\s*[-—–]+\s*\n+\s*\*\*Support\s+Pollinations\.AI:?\*\*\s*\n+\s*[-—–]+\s*\n+\s*🌸\s*\*\*Ad\*\*\s*🌸.+?accessible\s+for\s+everyone\.?",
    # "Generated by BLACKBOX.AI" style headers (entire line + trailing newlines).
    # MUST come before the partial "powered/generated/created by" pattern below.
    r"(?im)^generated\s+by\s+.+$\n*",
    # General promotional / self-referencing spam.
    # The \b after the TLD keeps the match from stopping inside a longer
    # word (e.g. ".com" inside "open.community"), which would otherwise
    # delete half a word from legitimate text.
    r"(?i)\n*\s*(?:powered|generated|created)\s+by\s+[\w\s.-]+\.(?:com|net|org|ai|io)\b\s*",
    r"(?i)\n*\s*(?:visit|check\s+out|try)\s+https?://[\w./-]+\s+for\s+(?:more|better|best)\s*[\w\s]*\s*",
    r"(?i)\n*\s*(?:get|try)\s+(?:pro|premium|plus)\s+(?:at|on)\s+https?://[\w./-]+\s*",
    # Common g4f provider junk footers: a rule line followed by a short
    # "this/note/disclaimer ..." paragraph running to the end of the text.
    r"(?i)\n+[-—–]+\s*\n*\s*(?:this|note|disclaimer)[\s\S]{0,200}(?:free|open[\s-]?source|community|g4f)[\s\S]{0,100}$",
    # Trailing promotional URLs (not part of the actual answer). URLs that
    # start with wikipedia.org / en.wikipedia.org, docs.* or github.com are
    # excluded by the negative lookahead.
    r"\n{2,}\s*https?://(?!(?:en\.)?wikipedia\.org|docs\.|github\.com)[\w./-]+\s*$",
    # Gemini UI artifacts
    r"(?i)\n*\s*Export to Sheets\s*",
    r"(?i)\n*\s*Show drafts\s*",
    r"(?i)\n*\s*Regenerate drafts\s*",
    r"(?i)\n*\s*tuneshare\s*",
    r"(?i)\n*\s*more_vert\s*",
]

# Compile patterns once at import time so each call only pays for matching.
COMPILED_SPAM = [re.compile(p) for p in SPAM_PATTERNS]

# Whitespace cleanup patterns
EXCESS_NEWLINES = re.compile(r"\n{3,}")  # three or more consecutive newlines
TRAILING_WHITESPACE = re.compile(r"[ \t]+$", re.MULTILINE)  # spaces/tabs at end of any line

# Standalone "/N" artifact: not preceded or followed by a word character,
# so things like "Y/N" are left alone.
ARTIFACT_N_PATTERN = re.compile(r"(?<!\w)/N(?!\w)")
def _extract_json_content(text: str) -> str:
    """Return the answer string from a JSON-object response, else *text* unchanged."""
    # Cheap shape check before attempting a full parse.
    if not (text.startswith("{") and text.endswith("}")):
        return text
    try:
        data = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return text  # Not valid JSON, process as regular text
    if not isinstance(data, dict):
        return text
    # Take the first key that actually holds a non-empty string, so a
    # non-string value under an earlier key (e.g. a list under "content")
    # does not hide a usable string under a later one. Reasoning fields
    # such as "reasoning_content" are deliberately not consulted.
    for key in ("content", "response", "message", "answer"):
        value = data.get(key)
        if isinstance(value, str) and value:
            return value.strip()
    return text


def _decode_escapes(text: str) -> str:
    """Decode literal backslash escapes (up to two passes) without corrupting non-ASCII text."""
    # Two passes to catch double-encoded strings.
    for _ in range(2):
        if "\\n" not in text and '\\"' not in text:
            break
        # If it looks like a JSON string literal (wrapped in quotes), strip them first.
        if text.startswith('"') and text.endswith('"'):
            text = text[1:-1]
        try:
            # unicode_escape decodes raw bytes as Latin-1, so feeding it UTF-8
            # bytes turns every non-ASCII character into mojibake. Encoding as
            # Latin-1 with backslashreplace keeps Latin-1 characters as single
            # bytes and turns everything else into \uXXXX / \UXXXXXXXX escapes,
            # which the decoder then restores.
            # NOTE: a literal backslash directly in front of a non-Latin-1
            # character is still ambiguous with this approach.
            text = text.encode("latin-1", "backslashreplace").decode("unicode_escape")
        except UnicodeError:
            # Malformed escape (e.g. truncated \x or a trailing backslash):
            # fall back to manual replacement of the common cases.
            text = text.replace("\\n", "\n").replace('\\"', '"')
    return text


def sanitize_response(text: str) -> str:
    """
    Clean an AI response by removing spam and junk while preserving
    legitimate markdown formatting.

    Also extracts content if the model returns a JSON object instead of raw
    text, strips <think>...</think> reasoning traces, and fixes literal
    escape sequences.

    Keeps:
    - **bold**, *italic*, ~~strikethrough~~
    - # headers
    - - bullet lists
    - 1. numbered lists
    - `code` and ```code blocks```
    - [links](url) that are part of the actual answer
    - line breaks and paragraph structure

    Args:
        text: Raw response text. Any falsy value yields "".

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    if not text:
        return ""

    # === JSON Unwrapping ===
    # Some models return a JSON string instead of raw text, e.g.
    # {"role": "assistant", "reasoning_content": "...", "content": "Hello"}
    cleaned = _extract_json_content(text.strip())

    # === Reasoning Cleanup (Chain-of-Thought) ===
    # Remove <think> blocks used by DeepSeek and similar reasoning models.
    cleaned = re.sub(r"<think>.*?</think>", "", cleaned, flags=re.DOTALL)

    # === Artifact & Escape Sequence Cleanup ===
    # Turn literal two-character "\n" / "\t" sequences into real whitespace
    # and drop literal "\r".
    cleaned = cleaned.replace("\\n", "\n")
    cleaned = cleaned.replace("\\r", "")
    cleaned = cleaned.replace("\\t", "\t")

    # Fix user-reported "/N" artifact (treat as newline if standalone).
    cleaned = ARTIFACT_N_PATTERN.sub("\n", cleaned)

    # === Unescape JSON/Raw Literals ===
    cleaned = _decode_escapes(cleaned)

    # === Spam Removal ===
    # Patterns are applied in list order; see SPAM_PATTERNS for why it matters.
    for pattern in COMPILED_SPAM:
        cleaned = pattern.sub("", cleaned)

    # === Whitespace Cleanup ===
    cleaned = EXCESS_NEWLINES.sub("\n\n", cleaned)
    cleaned = TRAILING_WHITESPACE.sub("", cleaned)
    return cleaned.strip()