import os
import re
import time

import torch
import fitz  # PyMuPDF
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from nltk.tokenize import sent_tokenize
from langdetect import detect

from inference import extract_text_from_pdf, split_text_by_language, predict_idiom, normalize_text, load_model, IdiomMatcher
# Allow requests from the React front end (dev server and production domains)
origins = [
    "http://localhost:3000",  # React dev server
    "https://language-learning-base-website.vercel.app",
    "https://www.idiomator.com",
    "https://idiomator.com",
]
app = FastAPI()
app.add_middleware(
CORSMiddleware,
    allow_origins=origins,  # or ["*"] for all origins (not recommended in production)
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Load model once at startup
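# Prefer Apple's MPS backend (Apple Silicon GPU) when available; otherwise fall back to CPU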
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
model, tokenizer = load_model(checkpoint_path)
model = model.to(device)
model.eval()
class TextRequest(BaseModel):
text: str
language: str = "en" # Default to English
class IdiomResponse(BaseModel):
idioms: list[str]
language: str = "en" # Default to English
@app.get("/")
def root():
return {"status": "ok"}
@app.post("/extract_idioms_ai", response_model=IdiomResponse)
def extract_idioms(request: TextRequest):
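    """Detect idioms in the submitted text by running the model over each sentence."""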
    start = time.time()
    print(f"[πŸ“₯] Request received at: {start}")
text = normalize_text(request.text)
language = request.language.lower() # Get the user-selected language
sentences = split_text_by_language(text, language=language)
idioms = []
for sent in sentences:
idioms.extend(predict_idiom(sent, model, tokenizer, device))
print(f"[βœ…] Done in {time.time() - start:.3f}s")
return {"idioms": idioms}
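# Example request (assuming the API is served locally on port 8000):
#   curl -X POST http://localhost:8000/extract_idioms_ai \
#        -H "Content-Type: application/json" \
#        -d '{"text": "It was raining cats and dogs.", "language": "en"}'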
def check_pdf_page_limit(pdf_bytes, max_pages=10):
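    """Reject PDFs with more than max_pages pages before any further processing."""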
with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
if len(doc) > max_pages:
raise HTTPException(status_code=400, detail=f"PDF has {len(doc)} pages. Limit is {max_pages}.")
@app.post("/extract_idioms_pdf_ai", response_model=IdiomResponse)
async def extract_idioms_pdf(
file: UploadFile = File(...),
language: str = Form(...) # βœ… Get language from the client
):
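    """Extract the text of an uploaded PDF (max 10 pages) and run the model over each sentence."""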
pdf_bytes = await file.read()
check_pdf_page_limit(pdf_bytes, max_pages=10)
text = extract_text_from_pdf(pdf_bytes)
# Normalize the extracted text!
text = normalize_text(text)
sentences = split_text_by_language(text, language=language)
idioms = []
for sent in sentences:
idioms.extend(predict_idiom(sent, model, tokenizer, device))
return {"idioms": idioms}
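# Example request (assuming a local server on port 8000):
#   curl -X POST http://localhost:8000/extract_idioms_pdf_ai \
#        -F "file=@sample.pdf" -F "language=en"

# Heuristic matcher backed by per-language seed idiom lexicons (JSONL files)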
idiom_matcher = IdiomMatcher({
"en": "idioms_structured_1/seed_idioms_en_cleaned.jsonl",
"es": "idioms_structured_1/seed_idioms_es_cleaned.jsonl"
})
@app.post("/extract_idioms_heuristic", response_model=IdiomResponse)
def extract_idioms_heuristic(request: TextRequest):
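    """Match normalized text against the seed idiom lexicon for the requested language (no model inference)."""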
text = normalize_text(request.text)
language = request.language.lower() # get the language from request
idiom_matches = idiom_matcher.match(text, lang=language)
idioms = [idiom["idiom"] for idiom in idiom_matches]
return {"idioms": idioms}
@app.post("/extract_idioms_pdf_heuristic", response_model=IdiomResponse)
async def extract_idioms_pdf_heuristic(
file: UploadFile = File(...),
language: str = Form(...) # βœ… Get language from the client
):
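    """Extract text from an uploaded PDF (max 10 pages) and match it against the seed idiom lexicon."""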
pdf_bytes = await file.read()
check_pdf_page_limit(pdf_bytes, max_pages=10)
text = extract_text_from_pdf(pdf_bytes)
# Normalize the extracted text!
text = normalize_text(text)
idiom_matches = idiom_matcher.match(text, lang=language)
idioms = [idiom["idiom"] for idiom in idiom_matches]
return {"idioms": idioms}
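
# Minimal sketch for running the API locally (assumes uvicorn is installed and
# this module is the app entry point; the deployment command may differ).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)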