import io

try:
    import docx
except ImportError:
    # python-docx is only needed by the "docx" branch of load_doc_text();
    # keep type detection and plain-text loading usable without it.
    docx = None

# Filename suffixes (lower-case) that are classified as "image".
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".tiff", ".bmp")

# Leading bytes that identify a PDF when the filename gives no answer.
_PDF_MAGIC = b"%PDF"


def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify a file as "pdf", "image", "docx", "txt" or "unknown".

    The filename extension is checked first (case-insensitively). If it
    matches none of the known extensions, the first four bytes of the
    content are compared against the PDF signature.

    Args:
        filename: Name of the file; ``None`` or ``""`` is treated as having
            no extension.
        file_bytes: Raw file content; ``None`` or empty content is treated
            as having no signature.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"``, ``"unknown"``.
    """
    fname = (filename or "").lower()

    if fname.endswith(".pdf"):
        return "pdf"
    if fname.endswith(_IMAGE_EXTENSIONS):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"

    # Extension was inconclusive: fall back to sniffing the PDF header.
    # The truthiness guard avoids a TypeError when no content was supplied.
    if file_bytes and file_bytes[:4] == _PDF_MAGIC:
        return "pdf"
    return "unknown"


def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from a "docx" or "txt" payload.

    Args:
        filetype: Type label such as the ones returned by
            ``detect_filetype``.
        file_bytes: Raw file content.

    Returns:
        For ``"docx"``: the text of ``doc.paragraphs`` joined with newlines.
        For ``"txt"``: the content decoded as UTF-8, with a leading byte
        order mark removed and undecodable bytes dropped.
        For any other ``filetype``: the empty string.

    Raises:
        ImportError: if ``filetype`` is ``"docx"`` and python-docx is not
            installed.
    """
    if filetype == "docx":
        if docx is None:
            raise ImportError("python-docx is required to read .docx files")
        doc = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(p.text for p in doc.paragraphs)

    if filetype == "txt":
        # "utf-8-sig" strips a leading BOM that plain "utf-8" would leave in
        # the result as "\ufeff"; errors="ignore" keeps this best-effort.
        return file_bytes.decode("utf-8-sig", errors="ignore")

    return ""