# ModerationModelCode/src/service/invisibletext_service.py
import unicodedata
import time
import traceback
import uuid
from werkzeug.exceptions import InternalServerError
from config.logger import CustomLogger,request_id_var
# Module-wide logger instance used by the scanner below.
log = CustomLogger()
# Per-request error records keyed by request id: InvisibleText.scan registers
# an empty list under the id it generates and appends error details on failure.
log_dict={}
class InvisibleText:
    """Scan a prompt for invisible (typically non-printable) characters.

    Characters are classified with ``unicodedata.category``. The categories
    this scanner is meant to be used with are:

    * ``Cf`` (format characters),
    * ``Cc`` (control characters),
    * ``Co`` (private-use characters), and
    * ``Cn`` (unassigned characters).

    The scanner only *reports* matching characters; the prompt itself is
    never modified.
    """

    def scan(self, prompt: str, banned_categories: list) -> dict:
        """Collect the characters of ``prompt`` that fall in a banned category.

        Args:
            prompt: Text to inspect.
            banned_categories: Unicode general-category codes, as returned by
                ``unicodedata.category`` (e.g. ``"Cf"``), whose characters
                should be reported.

        Returns:
            A dict with two keys:

            * ``"result"``: list of matching characters in prompt order
              (duplicates kept); empty when nothing matches.
            * ``"time_taken"``: elapsed wall-clock seconds rounded to three
              decimals, with an ``"s"`` suffix.

        Raises:
            InternalServerError: If anything fails while scanning. The
                original exception is attached as ``__cause__``.
        """
        log.info("inside invisible_text check")
        request_id = uuid.uuid4().hex
        request_id_var.set(request_id)
        log_dict[request_id] = []
        try:
            started = time.time()
            contains_unicode = any(ord(char) > 127 for char in prompt)
            log.info(f"contains_unicode: {contains_unicode}")
            # NOTE(review): a pure-ASCII prompt is never inspected, so ASCII
            # control characters are not reported even when "Cc" is banned.
            # Kept as-is because callers may rely on it (e.g. "\n" and "\t"
            # are category Cc) -- confirm whether this is intended.
            if contains_unicode:
                found = [
                    char
                    for char in prompt
                    if unicodedata.category(char) in banned_categories
                ]
            else:
                found = []
            output = {
                'result': found,
                'time_taken': str(round(time.time() - started, 3)) + "s",
            }
            log.info(f"output: {output}")
            return output
        except Exception as e:
            line_no = str(traceback.extract_tb(e.__traceback__)[0].lineno)
            log.error("Error occured in invisibletext_check")
            log.error(f"Exception: {(line_no, e)}")
            errors = log_dict[request_id]
            errors.append({"Line number": line_no, "Error": str(e),
                           "Error Module": "Failed at invisibletext_check call"})
            logobj = {"_id": request_id, "error": errors}
            if errors:
                log.debug(str(logobj))
            raise InternalServerError() from e
        finally:
            # Always drop this request's entry; previously it was removed only
            # on failure, so every successful scan leaked a key in log_dict.
            log_dict.pop(request_id, None)