File size: 4,064 Bytes
447d423 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import os
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from fastapi import UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi import UploadFile, File
from inference import extract_text_from_pdf, split_text_by_language, predict_idiom, normalize_text, load_model, IdiomMatcher
from nltk.tokenize import sent_tokenize
from langdetect import detect
from fastapi import HTTPException
import re
import fitz # PyMuPDF
# Browser origins allowed to call this API (consumed by the CORS middleware).
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would merge two origins into one
# invalid URL and block both sites.
origins = [
    "http://localhost:3000",  # React dev server
    "https://language-learning-base-website.vercel.app",
    "https://www.idiomator.com",
    "https://idiomator.com",
]
app = FastAPI()

# CORS: only the entries of `origins` may call the API from a browser.
app.add_middleware(
    CORSMiddleware,
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
    allow_origins=origins,  # ["*"] would admit every origin (not recommended in production)
)
# The model is loaded a single time, when this module is imported.
# Apple's Metal backend (MPS) is used when available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Checkpoints live in a "checkpoints" directory next to this file.
checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
model, tokenizer = load_model(checkpoint_path)
model = model.to(device)
model.eval()
# Request body accepted by the text-based idiom extraction endpoints in this file.
class TextRequest(BaseModel):
    # Raw text to scan for idioms.
    text: str
    # Language code chosen by the client (handlers lower-case it before use).
    language: str = "en"  # Default to English
# Response body declared as `response_model` by the idiom extraction endpoints in this file.
class IdiomResponse(BaseModel):
    # Idioms found in the submitted text or PDF.
    idioms: list[str]
    # Language reported back to the client; falls back to "en" whenever a
    # handler's return value does not include a "language" key.
    language: str = "en"  # Default to English
@app.get("/")
def root():
    """Answer GET / with a constant status payload."""
    payload = {"status": "ok"}
    return payload
@app.post("/extract_idioms_ai", response_model=IdiomResponse)
def extract_idioms(request: TextRequest):
    """Extract idioms from raw text using the loaded model.

    The text is normalized, split into sentences for the requested language,
    and every sentence is passed to ``predict_idiom``; the per-sentence
    results are concatenated in sentence order.

    Args:
        request: Body carrying the text and the client-selected language.

    Returns:
        A dict with the collected ``idioms`` and the lower-cased ``language``
        that was actually used (previously the response always reported the
        model default "en", whatever the client asked for).
    """
    import time

    received_at = time.time()
    # A monotonic clock is used for the duration so wall-clock adjustments
    # cannot distort the logged timing.
    started = time.perf_counter()
    print(f"[📥] Request received at: {received_at}")

    text = normalize_text(request.text)
    language = request.language.lower()  # user-selected language
    sentences = split_text_by_language(text, language=language)

    idioms = []
    for sent in sentences:
        idioms.extend(predict_idiom(sent, model, tokenizer, device))

    print(f"[✅] Done in {time.perf_counter() - started:.3f}s")
    return {"idioms": idioms, "language": language}
from fastapi import Form
def check_pdf_page_limit(pdf_bytes, max_pages=10):
    """Reject uploads that are not readable PDFs or that exceed the page limit.

    Args:
        pdf_bytes: Raw bytes of the uploaded file.
        max_pages: Maximum number of pages accepted (inclusive).

    Raises:
        HTTPException: 400 when the upload is empty, cannot be opened as a
            PDF, or has more than ``max_pages`` pages.
    """
    if not pdf_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_count = len(doc)
    except (RuntimeError, ValueError) as exc:
        # PyMuPDF reports unreadable/corrupt data with RuntimeError subclasses
        # (e.g. FileDataError) or ValueError depending on version; surface it
        # as a client error instead of an unhandled 500.
        raise HTTPException(
            status_code=400,
            detail="Uploaded file could not be read as a PDF.",
        ) from exc

    if page_count > max_pages:
        raise HTTPException(
            status_code=400,
            detail=f"PDF has {page_count} pages. Limit is {max_pages}.",
        )
@app.post("/extract_idioms_pdf_ai", response_model=IdiomResponse)
async def extract_idioms_pdf(
    file: UploadFile = File(...),
    language: str = Form(...),  # language is supplied by the client
):
    """Extract idioms from an uploaded PDF using the loaded model.

    The upload is checked against a 10-page limit, its text is extracted and
    normalized, split into sentences for the requested language, and every
    sentence is passed to ``predict_idiom``.

    Args:
        file: The uploaded PDF.
        language: Language code sent as a form field.

    Returns:
        A dict with the collected ``idioms`` and the lower-cased ``language``
        that was used.

    Raises:
        HTTPException: Propagated from ``check_pdf_page_limit`` when the PDF
            is rejected.
    """
    # Lower-case the code so this endpoint treats it exactly like the
    # JSON text endpoints do.
    language = language.lower()

    pdf_bytes = await file.read()
    check_pdf_page_limit(pdf_bytes, max_pages=10)

    # NOTE(review): extraction and inference below are synchronous and run
    # inside an async handler — confirm blocking the event loop is acceptable.
    text = normalize_text(extract_text_from_pdf(pdf_bytes))
    sentences = split_text_by_language(text, language=language)

    idioms = []
    for sent in sentences:
        idioms.extend(predict_idiom(sent, model, tokenizer, device))

    return {"idioms": idioms, "language": language}
# Seed idiom files handed to the heuristic matcher, keyed by language code.
_SEED_IDIOM_FILES = {
    "en": "idioms_structured_1/seed_idioms_en_cleaned.jsonl",
    "es": "idioms_structured_1/seed_idioms_es_cleaned.jsonl",
}

# Shared matcher instance used by the heuristic endpoints.
idiom_matcher = IdiomMatcher(_SEED_IDIOM_FILES)
@app.post("/extract_idioms_heuristic", response_model=IdiomResponse)
def extract_idioms_heuristic(request: TextRequest):
    """Extract idioms from raw text with the heuristic matcher (no model).

    Args:
        request: Body carrying the text and the client-selected language.

    Returns:
        A dict with the ``idiom`` value of every match returned by
        ``idiom_matcher.match`` and the lower-cased ``language`` that was
        used (previously the response always reported the default "en").
    """
    text = normalize_text(request.text)
    language = request.language.lower()  # user-selected language
    idiom_matches = idiom_matcher.match(text, lang=language)
    idioms = [match["idiom"] for match in idiom_matches]
    return {"idioms": idioms, "language": language}
@app.post("/extract_idioms_pdf_heuristic", response_model=IdiomResponse)
async def extract_idioms_pdf_(
    file: UploadFile = File(...),
    language: str = Form(...),  # language is supplied by the client
):
    """Extract idioms from an uploaded PDF with the heuristic matcher.

    The upload is checked against a 10-page limit, its text is extracted and
    normalized, and the whole text is handed to ``idiom_matcher.match``.

    Args:
        file: The uploaded PDF.
        language: Language code sent as a form field.

    Returns:
        A dict with the ``idiom`` value of every match and the lower-cased
        ``language`` that was used.

    Raises:
        HTTPException: Propagated from ``check_pdf_page_limit`` when the PDF
            is rejected.
    """
    # Lower-case the code so this endpoint treats it exactly like the
    # JSON text endpoints do.
    language = language.lower()

    pdf_bytes = await file.read()
    check_pdf_page_limit(pdf_bytes, max_pages=10)

    text = normalize_text(extract_text_from_pdf(pdf_bytes))

    # The matcher works on the full text, so no sentence splitting is done
    # here (the earlier split result was never used).
    idiom_matches = idiom_matcher.match(text, lang=language)
    idioms = [match["idiom"] for match in idiom_matches]
    return {"idioms": idioms, "language": language}
|