Spaces:
Sleeping
Sleeping
| import io | |
| import docx | |
def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify a file as "pdf", "image", "docx", "txt" or "unknown".

    The file name's extension (compared case-insensitively) is checked
    first. Only when no known extension matches is the content inspected,
    and then only for the ``%PDF`` signature at the very start of the data.

    Args:
        filename: Name of the file; ``None`` or an empty string is treated
            as "no extension available".
        file_bytes: Raw file content; ``None`` or empty content simply
            skips the signature check instead of raising.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"`` or
        ``"unknown"``.
    """
    fname = (filename or "").lower()
    if fname.endswith(".pdf"):
        return "pdf"
    # str.endswith accepts a tuple, so one call covers every image suffix.
    if fname.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"
    # Extension gave no answer: fall back to sniffing the PDF signature.
    # The truthiness guard keeps a missing payload from raising TypeError,
    # mirroring the None-tolerance already applied to `filename`.
    if file_bytes and file_bytes[:4] == b"%PDF":
        return "pdf"
    return "unknown"
def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from a DOCX or TXT payload.

    Args:
        filetype: Type tag for the payload; only ``"docx"`` and ``"txt"``
            are handled here.
        file_bytes: Raw file content.

    Returns:
        For ``"docx"``: the text of every paragraph in ``doc.paragraphs``,
        joined with newlines. For ``"txt"``: the content decoded as UTF-8
        with undecodable bytes dropped and a leading byte-order mark
        removed. For any other ``filetype``: an empty string.
    """
    if filetype == "docx":
        # python-docx reads from a file-like object, so wrap the bytes.
        doc = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(p.text for p in doc.paragraphs)
    if filetype == "txt":
        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM,
        # so such files no longer yield a stray "\ufeff" at the start.
        return file_bytes.decode("utf-8-sig", errors="ignore")
    return ""