hmnshudhmn24's picture
Upload 14 files
1108401 verified
"""FastAPI app for DOCINTEL: upload PDF, extract text/OCR, get entities, summarize."""
import os, uuid, tempfile
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from utils import ensure_dir, load_config, save_json
from ocr_extractor import extract_full_text, pdf_to_images
from entity_tagger import extract_entities
from summarize_doc import summarize_image, summarize_text
app = FastAPI(title='DOCINTEL API')
cfg = load_config()
STORAGE = cfg.get('storage_dir', './storage')
ensure_dir(STORAGE)
class QARequest(BaseModel):
question: str
@app.post('/upload_pdf')
async def upload_pdf(file: UploadFile = File(...)):
if not file.filename.lower().endswith('.pdf'):
raise HTTPException(status_code=400, detail='Only PDF files are allowed')
doc_id = str(uuid.uuid4())
save_path = os.path.join(STORAGE, f"{doc_id}_{file.filename}")
with open(save_path, 'wb') as f:
f.write(await file.read())
return {'doc_id': doc_id, 'filename': file.filename, 'path': save_path}
@app.get('/doc/{doc_id}/text')
def get_text(doc_id: str):
files = [f for f in os.listdir(STORAGE) if f.startswith(doc_id+'_')]
if not files:
raise HTTPException(status_code=404, detail='Document not found')
path = os.path.join(STORAGE, files[0])
text, ocr_pages = extract_full_text(path)
return {'doc_id': doc_id, 'text': text, 'ocr_pages_count': len(ocr_pages)}
@app.get('/doc/{doc_id}/entities')
def get_entities(doc_id: str):
files = [f for f in os.listdir(STORAGE) if f.startswith(doc_id+'_')]
if not files:
raise HTTPException(status_code=404, detail='Document not found')
path = os.path.join(STORAGE, files[0])
text, _ = extract_full_text(path)
ents = extract_entities(text)
return JSONResponse(content={'doc_id': doc_id, 'entities': ents})
@app.post('/doc/{doc_id}/summarize')
def post_summarize(doc_id: str):
files = [f for f in os.listdir(STORAGE) if f.startswith(doc_id+'_')]
if not files:
raise HTTPException(status_code=404, detail='Document not found')
path = os.path.join(STORAGE, files[0])
# convert to images and summarize first page with DONUT
pages = pdf_to_images(path, out_dir=tempfile.mkdtemp())
if not pages:
text, _ = extract_full_text(path)
summary = summarize_text(text)
return {'doc_id': doc_id, 'summary': summary}
# use first page image
summary = summarize_image(pages[0])
return {'doc_id': doc_id, 'summary': summary}