Spaces:
Sleeping
Sleeping
File size: 794 Bytes
1a1b2af |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import io
import docx
def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify a file as "pdf", "image", "docx", "txt" or "unknown".

    The filename extension is checked first (case-insensitively). When the
    extension is not recognised, the leading bytes of the content are
    compared against the ``%PDF`` signature as a fallback.

    Args:
        filename: Name of the file; ``None`` or empty is treated as having
            no extension.
        file_bytes: Raw file content; ``None`` or empty simply skips the
            content-based fallback.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"``, ``"unknown"``.
    """
    fname = (filename or "").lower()
    if fname.endswith(".pdf"):
        return "pdf"
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    if fname.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"
    # Extension was inconclusive: sniff the content for the PDF signature.
    # Guard against None so a missing payload yields "unknown" instead of a
    # TypeError, mirroring how a missing filename is tolerated above.
    if file_bytes is not None and file_bytes[:4] == b"%PDF":
        return "pdf"
    return "unknown"
def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from a DOCX or TXT payload.

    Args:
        filetype: Type label for the payload; only ``"docx"`` and ``"txt"``
            are handled here.
        file_bytes: Raw file content.

    Returns:
        For ``"docx"``: the text of every entry in the document's
        ``paragraphs`` collection, joined with newlines. For ``"txt"``: the
        content decoded as UTF-8, with undecodable bytes dropped and a
        leading byte-order mark removed. For any other ``filetype``: an
        empty string.
    """
    if filetype == "docx":
        # python-docx reads from a file-like object, so wrap the raw bytes.
        document = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    if filetype == "txt":
        # "utf-8-sig" behaves like "utf-8" but strips a leading BOM, which
        # would otherwise leak into the text as a stray U+FEFF character.
        return file_bytes.decode("utf-8-sig", errors="ignore")
    return ""
|