Corin1998's picture
Create utils.py
1a1b2af verified
raw
history blame contribute delete
794 Bytes
import io
import docx
def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify an uploaded file as "pdf", "image", "docx", "txt" or "unknown".

    The filename extension is checked first (case-insensitively). If it does
    not match any known extension, the leading bytes of the content are
    checked for the ``%PDF`` signature.

    Args:
        filename: Name of the file; ``None`` or empty is treated as having
            no extension.
        file_bytes: Raw file content; ``None`` or empty is treated as having
            no signature.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"``, ``"unknown"``.
    """
    fname = (filename or "").lower()

    if fname.endswith(".pdf"):
        return "pdf"
    # str.endswith accepts a tuple, so one call covers every image extension.
    if fname.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"

    # Fall back to content sniffing. Missing content is handled like empty
    # content (mirroring the filename guard) instead of raising TypeError.
    if (file_bytes or b"")[:4] == b"%PDF":
        return "pdf"
    return "unknown"
def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from a DOCX or TXT payload.

    Args:
        filetype: File kind label; only ``"docx"`` and ``"txt"`` are handled.
        file_bytes: Raw file content.

    Returns:
        For ``"docx"``, the text of every paragraph in the document joined
        with newlines. For ``"txt"``, the content decoded as UTF-8 with
        undecodable bytes dropped and a leading byte-order mark removed.
        For any other ``filetype``, an empty string.
    """
    if filetype == "docx":
        # python-docx reads from a file-like object, so wrap the bytes.
        document = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if filetype == "txt":
        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM,
        # which would otherwise leak into the text as U+FEFF.
        return file_bytes.decode("utf-8-sig", errors="ignore")

    return ""