| import unicodedata | |
| import time | |
| import traceback | |
| import uuid | |
| from werkzeug.exceptions import InternalServerError | |
| from config.logger import CustomLogger,request_id_var | |
# Module-level logger used by InvisibleText.scan for info/error/debug output.
| log = CustomLogger() | |
# Per-request error buffer: maps a request id (uuid hex string) to the list of
# error-detail dicts that InvisibleText.scan records when a scan fails.
| log_dict={} | |
class InvisibleText:
    """Scanner that reports invisible (typically non-printable) characters.

    Characters are classified with ``unicodedata.category`` and compared
    against the categories supplied by the caller. Categories commonly
    banned are:
        'Cf' (Format characters),
        'Cc' (Control characters),
        'Co' (Private use characters), and
        'Cn' (Unassigned characters).

    The scanner only *reports* the offending characters; it does not remove
    them from the prompt.
    """

    def scan(self, prompt: str, banned_categories: list):
        """Return the characters of *prompt* that fall in a banned category.

        Args:
            prompt: Text to inspect.
            banned_categories: Unicode general-category codes, as returned
                by ``unicodedata.category``, that should be reported.

        Returns:
            A dict with two keys:
                ``result``: list of offending characters in order of
                    appearance (duplicates are kept); empty when none found.
                ``time_taken``: elapsed wall-clock seconds rounded to three
                    decimals, as a string suffixed with ``"s"``.

        Raises:
            InternalServerError: if any step of the scan fails. The failure
                details are logged before the error is raised.
        """
        log.info("inside invisible_text check")
        request_id = uuid.uuid4().hex
        request_id_var.set(request_id)
        log_dict[request_id] = []
        try:
            started = time.time()
            contains_unicode = any(ord(char) > 127 for char in prompt)
            log.info(f"contains_unicode: {contains_unicode}")
            # NOTE(review): pure-ASCII prompts take a fast path and are never
            # inspected, so ASCII control characters (category 'Cc') are only
            # reported when the prompt also contains a non-ASCII character.
            # Kept unchanged to preserve existing results -- confirm intent.
            if contains_unicode:
                flagged = [
                    char
                    for char in prompt
                    if unicodedata.category(char) in banned_categories
                ]
            else:
                flagged = []
            output = {
                "result": flagged,
                "time_taken": str(round(time.time() - started, 3)) + "s",
            }
            log.info(f"output: {output}")
            return output
        except Exception as e:
            # Line number of the outermost traceback frame, i.e. the line in
            # this method where the failure surfaced.
            line_no = str(traceback.extract_tb(e.__traceback__)[0].lineno)
            log.error("Error occured in invisibletext_check")
            log.error(f"Exception: {(line_no, e)}")
            errors = log_dict[request_id]
            errors.append({"Line number": line_no, "Error": str(e),
                           "Error Module": "Failed at invisibletext_check call"})
            logobj = {"_id": request_id, "error": errors}
            if errors:
                log.debug(str(logobj))
            # Chain the cause so the original failure stays in the traceback.
            raise InternalServerError() from e
        finally:
            # Always drop this request's buffer; previously it was removed
            # only on failure, so every successful scan leaked an entry.
            log_dict.pop(request_id, None)