import os
import time
import torch
import fitz  # PyMuPDF
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from inference import (
    extract_text_from_pdf,
    split_text_by_language,
    predict_idiom,
    normalize_text,
    load_model,
    IdiomMatcher,
)

# Allow requests from the React app
origins = [
    "http://localhost:3000",  # React dev server
    "https://language-learning-base-website.vercel.app",
    "https://www.idiomator.com",
    "https://idiomator.com",
]

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # or ["*"] for all origins (not recommended in production)
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model once at startup
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
model, tokenizer = load_model(checkpoint_path)
model = model.to(device)
model.eval()


class TextRequest(BaseModel):
    text: str
    language: str = "en"  # Default to English


class IdiomResponse(BaseModel):
    idioms: list[str]
    language: str = "en"  # Default to English


@app.get("/")
def root():
    return {"status": "ok"}


@app.post("/extract_idioms_ai", response_model=IdiomResponse)
def extract_idioms(request: TextRequest):
    start = time.time()
    print(f"[📥] Request received at: {start}")

    text = normalize_text(request.text)
    language = request.language.lower()  # Get the user-selected language
    sentences = split_text_by_language(text, language=language)

    idioms = []
    for sent in sentences:
        idioms.extend(predict_idiom(sent, model, tokenizer, device))

    print(f"[✅] Done in {time.time() - start:.3f}s")
    return {"idioms": idioms}


def check_pdf_page_limit(pdf_bytes, max_pages=10):
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        if len(doc) > max_pages:
            raise HTTPException(
                status_code=400,
                detail=f"PDF has {len(doc)} pages. Limit is {max_pages}.",
            )


@app.post("/extract_idioms_pdf_ai", response_model=IdiomResponse)
async def extract_idioms_pdf(
    file: UploadFile = File(...),
    language: str = Form(...),  # ✅ Get language from the client
):
    pdf_bytes = await file.read()
    check_pdf_page_limit(pdf_bytes, max_pages=10)

    text = extract_text_from_pdf(pdf_bytes)
    # Normalize the extracted text!
    text = normalize_text(text)
    sentences = split_text_by_language(text, language=language)

    idioms = []
    for sent in sentences:
        idioms.extend(predict_idiom(sent, model, tokenizer, device))

    return {"idioms": idioms}


idiom_matcher = IdiomMatcher({
    "en": "idioms_structured_1/seed_idioms_en_cleaned.jsonl",
    "es": "idioms_structured_1/seed_idioms_es_cleaned.jsonl",
})


@app.post("/extract_idioms_heuristic", response_model=IdiomResponse)
def extract_idioms_heuristic(request: TextRequest):
    text = normalize_text(request.text)
    language = request.language.lower()  # Get the language from the request

    idiom_matches = idiom_matcher.match(text, lang=language)
    idioms = [idiom["idiom"] for idiom in idiom_matches]
    return {"idioms": idioms}


@app.post("/extract_idioms_pdf_heuristic", response_model=IdiomResponse)
async def extract_idioms_pdf_heuristic(
    file: UploadFile = File(...),
    language: str = Form(...),  # ✅ Get language from the client
):
    pdf_bytes = await file.read()
    check_pdf_page_limit(pdf_bytes, max_pages=10)

    text = extract_text_from_pdf(pdf_bytes)
    # Normalize the extracted text!
    text = normalize_text(text)

    idiom_matches = idiom_matcher.match(text, lang=language)
    idioms = [idiom["idiom"] for idiom in idiom_matches]
    return {"idioms": idioms}
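

# Example usage (a minimal sketch, assuming this module is saved as main.py,
# uvicorn is installed, and the server listens on the default port 8000; the
# sample text and sample.pdf below are hypothetical):
#
#   uvicorn main:app --reload
#
#   # Plain-text endpoint
#   curl -X POST http://localhost:8000/extract_idioms_ai \
#        -H "Content-Type: application/json" \
#        -d '{"text": "It was raining cats and dogs.", "language": "en"}'
#
#   # PDF endpoint (multipart form upload with the language field)
#   curl -X POST http://localhost:8000/extract_idioms_pdf_ai \
#        -F "file=@sample.pdf" -F "language=en"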