KiWA001 committed on
Commit
d2b69dd
·
1 Parent(s): 0dd1f34

fix: recursive unescape for double-encoded responses

Browse files
Files changed (1) hide show
  1. sanitizer.py +14 -9
sanitizer.py CHANGED
@@ -111,15 +111,20 @@ def sanitize_response(text: str) -> str:
111
 
112
  # === Unescape JSON/Raw Literals ===
113
  # Robustly decode escape sequences like \n, \", \t using Python's codec
114
- try:
115
- # If it looks like a JSON string literal (wrapped in quotes), strip them first
116
- if cleaned.startswith('"') and cleaned.endswith('"'):
117
- cleaned = cleaned[1:-1]
118
-
119
- cleaned = cleaned.encode('utf-8').decode('unicode_escape')
120
- except Exception:
121
- # Fallback to manual replacement if codec fails
122
- cleaned = cleaned.replace("\\n", "\n").replace('\\"', '"')
 
 
 
 
 
123
 
124
  # === Spam Removal ===
125
  for pattern in COMPILED_SPAM:
 
111
 
112
  # === Unescape JSON/Raw Literals ===
113
  # Robustly decode escape sequences like \n, \", \t using Python's codec
114
+ # Repeat up to 2 times to catch double-encoded strings (common in some scraped JSON); a pass is skipped once no literal \n or \" remains
115
+ for _ in range(2):
116
+ try:
117
+ if "\\n" not in cleaned and '\\"' not in cleaned:
118
+ break
119
+
120
+ # If it looks like a JSON string literal (wrapped in quotes), strip them first
121
+ if cleaned.startswith('"') and cleaned.endswith('"'):
122
+ cleaned = cleaned[1:-1]
123
+
124
+ cleaned = cleaned.encode('utf-8').decode('unicode_escape')
125
+ except Exception:
126
+ # Fallback to manual replacement if codec fails
127
+ cleaned = cleaned.replace("\\n", "\n").replace('\\"', '"')
128
 
129
  # === Spam Removal ===
130
  for pattern in COMPILED_SPAM: