File size: 794 Bytes
1a1b2af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import io
import docx

def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify a file as "pdf", "image", "docx", "txt" or "unknown".

    The file name extension is checked first (case-insensitively). Only when
    no known extension matches is the content inspected, and then only for
    the ``%PDF`` signature at the start of the data.

    Args:
        filename: Name of the file; ``None`` or an empty string is treated
            as having no extension.
        file_bytes: Raw file content; ``None`` is treated as having no
            content, so the signature check is skipped.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"`` or ``"unknown"``.
    """
    fname = (filename or "").lower()
    if fname.endswith(".pdf"):
        return "pdf"
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    if fname.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"
    # Fall back to sniffing the content. Missing content is tolerated the
    # same way a missing filename is, instead of raising TypeError.
    if file_bytes is not None and file_bytes[:4] == b"%PDF":
        return "pdf"
    return "unknown"

def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from a docx or txt payload.

    Args:
        filetype: Type label for the payload; only ``"docx"`` and ``"txt"``
            are handled here.
        file_bytes: Raw file content.

    Returns:
        For ``"docx"``, the text of every entry in the document's
        ``paragraphs`` joined with newlines. For ``"txt"``, the content
        decoded as UTF-8 with undecodable bytes dropped and a leading
        byte-order mark removed. For any other ``filetype``, an empty string.
    """
    if filetype == "docx":
        # The docx loader is given an in-memory file-like object.
        doc = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(p.text for p in doc.paragraphs)
    if filetype == "txt":
        # "utf-8-sig" behaves like "utf-8" but strips a leading BOM, which
        # would otherwise leak into the text as "\ufeff".
        return file_bytes.decode("utf-8-sig", errors="ignore")
    return ""