GitHub Actions committed
Commit · 447d423
Track large files with LFS

Files changed:
- .gitattributes +4 -0
- Backend/Database/Idiom_Id_Generator.py +34 -0
- Backend/Database/crud.py +211 -0
- Backend/Database/db.py +17 -0
- Backend/Database/main.py +120 -0
- Backend/Database/models.py +16 -0
- Backend/Database/requirements.txt +12 -0
- Backend/Database/schemas.py +59 -0
- Backend/Idiom_lexicon.py +3 -0
- Backend/__init__.py +0 -0
- Backend/checkpoints/README.md +3 -0
- Backend/checkpoints/adapter_config.json +3 -0
- Backend/checkpoints/adapter_model.safetensors +3 -0
- Backend/checkpoints/model.pt +3 -0
- Backend/checkpoints/model_config.json +3 -0
- Backend/checkpoints/special_tokens_map.json +3 -0
- Backend/checkpoints/tokenizer.json +3 -0
- Backend/checkpoints/tokenizer_config.json +3 -0
- Backend/checkpoints/training_args.bin +3 -0
- Backend/checkpoints/vocab.txt +3 -0
- Backend/idioms_structured_1/seed_idioms_en_cleaned.jsonl +3 -0
- Backend/idioms_structured_1/seed_idioms_es_cleaned.jsonl +3 -0
- Backend/inference.py +219 -0
- Backend/main.py +122 -0
- Backend/requirements.txt +3 -0
- Dockerfile +37 -0
- README.md +14 -0
.gitattributes
ADDED
@@ -0,0 +1,4 @@
+Backend/*.txt filter=lfs diff=lfs merge=lfs -text
+Backend/*.numbers filter=lfs diff=lfs merge=lfs -text
+Backend/checkpoints/* filter=lfs diff=lfs merge=lfs -text
+Backend/idioms_structured_1/*.jsonl filter=lfs diff=lfs merge=lfs -text
Backend/Database/Idiom_Id_Generator.py
ADDED
@@ -0,0 +1,34 @@
+from supabase import create_client, Client
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+url = os.getenv("database_url")
+key = os.getenv("database_service_Key")
+
+supabase: Client = create_client(url, key)
+
+def generate_id(lang_code: str, dialect: str | None) -> str:
+    # Normalize the dialect if one was provided
+    if dialect and dialect.strip():
+        dialect_clean = dialect.replace(" ", "_").lower()
+    else:
+        dialect_clean = None
+
+    # Build the base query: count existing rows for this language
+    query = supabase.table("idioms").select("id", count="exact").eq("language", lang_code)
+    if dialect_clean:
+        query = query.eq("dialect", dialect_clean)
+    else:
+        query = query.is_("dialect", "null")  # if the dialect is stored as NULL
+
+    res = query.execute()
+    counter = res.count + 1
+
+    # Generate the ID
+    if dialect_clean:
+        return f"{lang_code}_{dialect_clean}_{str(counter).zfill(4)}"
+    else:
+        return f"{lang_code}_unspecified_{str(counter).zfill(4)}"
+
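For reference, a minimal usage sketch of generate_id (not part of the commit; it assumes the database_url / database_service_Key variables above are set and the idioms table exists, and the example dialect is made up):

# Hypothetical example only -- illustrates the ID format produced by generate_id.
from Idiom_Id_Generator import generate_id

new_id = generate_id("es", "Rioplatense Spanish")
print(new_id)  # e.g. "es_rioplatense_spanish_0007"; the counter depends on existing rows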
Backend/Database/crud.py
ADDED
@@ -0,0 +1,211 @@
+# crud.py (replaces the SQLAlchemy version)
+import os
+import json
+import httpx
+from dotenv import load_dotenv
+from typing import Dict, Optional, List
+from fastapi import HTTPException
+load_dotenv()
+
+SUPABASE_URL = os.getenv("SUPABASE_URL").rstrip("/")
+SUPABASE_KEY = os.getenv("SUPABASE_KEY")
+database_service_Key = os.getenv("database_service_Key")
+TABLE = os.getenv("TABLE_NAME", "idioms")
+
+HEADERS = {
+    "apikey": database_service_Key,
+    "Authorization": f"Bearer {database_service_Key}",
+    "Content-Type": "application/json",
+    "Accept": "application/json",
+}
+
+# helpers
+async def _client():
+    return httpx.AsyncClient(timeout=30.0)
+
+# CRUD
+async def get_idioms(skip: int = 0, limit: int = 100000):
+    async with httpx.AsyncClient() as client:
+        url = f"{SUPABASE_URL}/rest/v1/{TABLE}?select=*&offset={skip}&limit={limit}"
+        print("Supabase GET URL:", url)
+        r = await client.get(url, headers=HEADERS)
+        print("Supabase GET status:", r.status_code)
+        print("Supabase GET response:", r.text)
+        r.raise_for_status()
+        return r.json()
+
+async def get_all_idioms():
+    all_idioms = []
+    limit = 1000  # Supabase max per request
+    offset = 0
+
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        while True:
+            url = f"{SUPABASE_URL}/rest/v1/{TABLE}?select=*&limit={limit}&offset={offset}"
+            r = await client.get(url, headers=HEADERS)
+            r.raise_for_status()
+            data = r.json()
+            if not data:
+                break
+            # ensure validation_count is a dict
+            for item in data:
+                if not isinstance(item.get("validation_count"), dict):
+                    item["validation_count"] = {}
+            all_idioms.extend(data)
+            offset += limit
+
+    return all_idioms
+
+async def get_idiom(idiom_id: str):
+    async with httpx.AsyncClient() as client:
+        try:
+            # Include both examples and meanings via foreign key embedding
+            url = (
+                f"{SUPABASE_URL}/rest/v1/{TABLE}?"
+                f"id=eq.{idiom_id}&"
+                f"select=*,"
+                f"idiom_meanings!idiom_meanings_idiom_id_fkey(*),"
+                f"examples!examples_idiom_id_fkey(*)"
+            )
+            print(f"Fetching idiom from Supabase: {url}")  # debug
+            r = await client.get(url, headers=HEADERS)
+            print("HTTP status code:", r.status_code)  # debug
+            r.raise_for_status()
+            data = r.json()
+            print("Raw data from Supabase:", data)  # debug
+        except httpx.RequestError as e:
+            print("Request failed:", e)
+            raise HTTPException(status_code=500, detail=f"Supabase request failed: {e}")
+        except httpx.HTTPStatusError as e:
+            print("HTTP error:", e)
+            raise HTTPException(status_code=500, detail=f"Supabase HTTP error: {e}")
+        except Exception as e:
+            print("Unexpected error:", e)
+            raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
+
+    if not data:
+        print(f"No idiom found for id: {idiom_id}")  # debug
+        return None
+
+    idiom = data[0]
+    if not isinstance(idiom, dict):
+        print(f"Unexpected data type for idiom: {type(idiom)}")  # debug
+        raise ValueError(f"Expected dict, got: {type(idiom)}")
+
+    # --- Transform examples ---
+    raw_examples = idiom.get("examples") or []
+    idiom["examples"] = [
+        {
+            "id": ex.get("id"),
+            "source_text": ex.get("source_text") or "",
+            "source_language": ex.get("source_language") or idiom.get("language"),
+            "translations": json.loads(ex["translations"]) if isinstance(ex.get("translations"), str) else ex.get("translations") or [],
+            "dialect": ex.get("dialect"),
+            "url": ex.get("url"),
+            "source": ex.get("source"),
+        }
+        for ex in raw_examples
+    ]
+    print(f"Found {len(idiom['examples'])} examples")  # debug
+
+    # --- Transform meanings ---
+    raw_meanings = idiom.get("idiom_meanings") or []
+    print("Raw meanings data:", raw_meanings)  # debug
+    idiom["meanings"] = [
+        {
+            "meaning_id": m.get("meaning_id"),
+            "idiom_id": m.get("idiom_id"),
+            "sense_number": m.get("sense_number"),
+            "register": m.get("register") or [],
+            "region": m.get("region") or [],
+            "definitions": m.get("definitions") or [],
+            "version": m.get("version"),  # optional, if you need it
+        }
+        for m in raw_meanings
+    ]
+    print("Transformed meanings data:", idiom["meanings"])  # debug
+    print(f"Found {len(idiom['meanings'])} meanings")  # debug
+    return idiom
+
+
+
+async def search_idioms(query: str = "", language: Optional[str] = None, skip: int = 0, limit: int = 50):
+    async with httpx.AsyncClient() as client:
+        # Compose select param to embed idiom_meanings
+        select_query = "*,idiom_meanings!idiom_meanings_idiom_id_fkey(*)"
+
+        url = (
+            f"{SUPABASE_URL}/rest/v1/{TABLE}"
+            f"?offset={skip}&limit={limit}&select={select_query}"
+        )
+
+        # Maintain partial text match on idiom column
+        if query:
+            url += f"&idiom=ilike.*{query}*"
+
+        # Maintain language filter if specified and not "all"
+        if language and language.lower() not in ("all", "*"):
+            url += f"&language=eq.{language}"
+
+        r = await client.get(url, headers=HEADERS)
+        r.raise_for_status()
+        data = r.json()
+
+        # Ensure validation_count is a dict
+        for item in data:
+            if not isinstance(item.get("validation_count"), dict):
+                item["validation_count"] = {}
+
+            # Transform embedded idiom_meanings to meanings field for UI use
+            raw_meanings = item.get("idiom_meanings") or []
+            item["meanings"] = [
+                {
+                    "meaning_id": m.get("meaning_id"),
+                    "idiom_id": m.get("idiom_id"),
+                    "sense_number": m.get("sense_number"),
+                    "register": m.get("register") or [],
+                    "region": m.get("region") or [],
+                    "definitions": m.get("definitions") or [],
+                    "version": m.get("version"),
+                }
+                for m in raw_meanings
+            ]
+
+        return data
+
+
+
+async def create_idiom(item: dict):
+    async with httpx.AsyncClient() as client:
+        url = f"{SUPABASE_URL}/rest/v1/{TABLE}"
+        r = await client.post(url, json=item, headers=HEADERS)
+        r.raise_for_status()  # fail if not 2xx
+
+        try:
+            data = r.json()
+        except ValueError:
+            # Supabase returned empty body, fall back to the original item
+            data = item
+
+        if isinstance(data, list) and data:
+            return data[0]
+        if isinstance(data, dict) and data:
+            return data
+        # final fallback
+        return item
+async def update_idiom(idiom_id: str, item: dict):
+    async with httpx.AsyncClient() as client:
+        url = f"{SUPABASE_URL}/rest/v1/{TABLE}?id=eq.{idiom_id}"
+        r = await client.patch(url, json=item, headers=HEADERS)
+        if r.status_code not in (200, 204):
+            raise httpx.HTTPStatusError("Update failed", request=r.request, response=r)
+        # After patch, fetch the updated row
+        return await get_idiom(idiom_id)
+
+async def delete_idiom(idiom_id: str):
+    async with httpx.AsyncClient() as client:
+        url = f"{SUPABASE_URL}/rest/v1/{TABLE}?id=eq.{idiom_id}"
+        r = await client.delete(url, headers=HEADERS)
+        if r.status_code not in (200, 204):
+            raise httpx.HTTPStatusError("Delete failed", request=r.request, response=r)
+        return {"status": "deleted"}
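A hedged sketch of driving these async helpers from a one-off script (not part of the commit; it assumes SUPABASE_URL and database_service_Key are set in the environment and that a few rows exist; the search term is illustrative):

# Hypothetical driver script for crud.py -- illustrative only.
import asyncio
import crud

async def demo():
    rows = await crud.search_idioms(query="rain", language="en", limit=5)
    for row in rows:
        print(row["id"], row["idiom"], len(row["meanings"]))

asyncio.run(demo())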
Backend/Database/db.py
ADDED
@@ -0,0 +1,17 @@
+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker, declarative_base
+from dotenv import load_dotenv
+
+load_dotenv()
+
+DATABASE_URL = os.getenv("DATABASE_URL")
+db_url = DATABASE_URL
+# Engine with SSL (required by Supabase)
+engine = create_engine(db_url, connect_args={"sslmode": "require"})
+
+# Session factory
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+# Base class for models
+Base = declarative_base()
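db.py only defines the engine, session factory and declarative base. A conventional FastAPI-style session dependency built on top of it would look roughly like this (illustrative sketch, not part of the commit):

# Hypothetical helper -- a standard per-request session pattern using SessionLocal.
from db import SessionLocal

def get_db():
    db = SessionLocal()
    try:
        yield db       # hand the session to the caller / route
    finally:
        db.close()     # always release the connection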
Backend/Database/main.py
ADDED
@@ -0,0 +1,120 @@
+# main.py
+from fastapi import FastAPI, HTTPException, APIRouter
+from fastapi.middleware.cors import CORSMiddleware
+import crud
+import schemas
+from Idiom_Id_Generator import generate_id
+from dotenv import load_dotenv
+import os
+from pydantic import BaseModel
+from typing import List, Dict, Optional
+
+
+
+load_dotenv()
+
+app = FastAPI(title="Idioms API - Supabase REST")
+
+allowed_origins = [
+    "http://localhost",
+    "http://localhost:3000",
+    "http://localhost:8000",
+    "https://idiomator.vercel.app",
+    "https://www.idiomator.vercel.app",
+    "https://idiomator.com",
+    "https://www.idiomator.com",
+]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=allowed_origins,
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.get("/idioms", response_model=list[schemas.IdiomResponse])
+async def read_idioms(skip: int = 0, limit: int = 50):
+    try:
+        data = await crud.get_idioms(skip=skip, limit=limit)
+        return data
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/idioms", response_model=schemas.IdiomResponse)
+async def create_idiom(idiom: schemas.IdiomCreate):
+    try:
+        # keep your ID generation
+        idiom_id = generate_id(idiom.language, idiom.dialect)
+        idiom_dict = idiom.dict()
+        idiom_dict["id"] = idiom_id
+        created = await crud.create_idiom(idiom_dict)
+        return created
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/idioms/all_ids", response_model=List[Dict[str, str]])
+async def get_all_idioms_route():
+    idioms = await crud.get_all_idioms()
+    print(f"Fetched {len(idioms)} idioms")
+    print(f"Sample idioms: {idioms[:3]}")  # Print first 3 idioms for verification
+    return [
+        {"id": idiom["id"], "idiom": idiom["idiom"], "language": idiom["language"]}
+        for idiom in idioms
+    ]
+
+@app.get("/idioms/search", response_model=List[schemas.IdiomResponse])
+async def search_idioms(
+    q: Optional[str] = None,
+    language: Optional[str] = None,
+    skip: int = 0,
+    limit: int = 50
+):
+
+    try:
+        results = await crud.search_idioms(query=q, language=language, skip=skip, limit=limit)
+        return results
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/idioms/{idiom_id}", response_model=schemas.IdiomResponse)
+async def read_idiom(idiom_id: str):
+    try:
+        db_idiom = await crud.get_idiom(idiom_id)
+        if not db_idiom:
+            raise HTTPException(status_code=404, detail="Idiom not found")
+        # Transform validation_count if needed
+        if isinstance(db_idiom.get("validation_count"), int):
+            db_idiom["validation_count"] = {"count": db_idiom["validation_count"]}
+        return db_idiom
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+
+@app.patch("/idioms/{idiom_id}", response_model=schemas.IdiomResponse)
+async def patch_idiom(idiom_id: str, idiom_update: schemas.IdiomBase):
+    try:
+        updated = await crud.update_idiom(idiom_id, idiom_update.dict(exclude_unset=True))
+        if not updated:
+            raise HTTPException(status_code=404, detail="Idiom not found")
+        return updated
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.delete("/idioms/{idiom_id}")
+async def delete_idiom(idiom_id: str):
+    try:
+        return await crud.delete_idiom(idiom_id)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/ping")
+async def ping():
+    return {"status": "ok"}
+
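A hedged client-side sketch of calling the search endpoint defined above (the host, port and query values are placeholders):

# Illustrative client call, not part of the commit.
import httpx

resp = httpx.get(
    "http://localhost:8000/idioms/search",
    params={"q": "break", "language": "en", "skip": 0, "limit": 10},
)
resp.raise_for_status()
for idiom in resp.json():
    print(idiom["id"], idiom["idiom"])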
Backend/Database/models.py
ADDED
@@ -0,0 +1,16 @@
+from sqlalchemy import Column, String, Integer, JSON
+from db import Base
+
+class Idiom(Base):
+    __tablename__ = "idioms"
+
+    id = Column(String, primary_key=True, index=True)
+    idiom = Column(String, nullable=False)
+    language = Column(String, nullable=False)
+    dialect = Column(String, default="unspecified")
+    idiomatic_meaning = Column(String, default="")
+    literal_meaning = Column(String, default="")
+    example = Column(String, default="")
+    validation_count = Column(JSON, default={"approved": 0, "rejected": 0})
+    quality = Column(String, default="seed")
+    status = Column(String, default="pending")
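The ORM model mirrors the Supabase "idioms" table used by the REST layer. If the table ever needs to be created locally from this model, a minimal sketch (assumption: DATABASE_URL points at a database you control):

# Illustrative only -- create the table declared in models.py with the engine from db.py.
from db import Base, engine
import models  # noqa: F401  (importing registers the Idiom model on Base)

Base.metadata.create_all(bind=engine)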
Backend/Database/requirements.txt
ADDED
@@ -0,0 +1,12 @@
+uvicorn
+gunicorn
+sqlalchemy
+supabase
+fastapi
+pydantic
+databases
+asyncpg
+alembic
+psycopg2-binary
+python-dotenv
+requests
Backend/Database/schemas.py
ADDED
@@ -0,0 +1,59 @@
+from pydantic import BaseModel, ConfigDict
+from typing import Dict, Optional, List
+
+
+class Translation(BaseModel):
+    language: str
+    text: str
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class Example(BaseModel):
+    id: int
+    source_language: str
+    source_text: str
+    translations: List[Translation] = []
+    dialect: Optional[str] = None
+    url: Optional[str] = None
+    source: Optional[str] = None
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class IdiomBase(BaseModel):
+    idiom: str
+    language: str
+    dialect: str = "unspecified"
+    idiomatic_meaning: Optional[str] = ""
+    literal_meaning: Optional[str] = ""
+    validation_count: Dict[str, int] = {}
+    quality: str = "seed"
+    status: str = "pending"
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class Meaning(BaseModel):
+    meaning_id: Optional[str] = None
+    idiom_id: Optional[str] = None
+    sense_number: Optional[int] = None
+    register: List[str] = []
+    region: List[str] = []
+    definitions: List[str] = []
+    version: Optional[int] = None
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class IdiomResponse(IdiomBase):
+    id: str
+    examples: List[Example] = []
+    meanings: List[Meaning] = []  # add this
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class IdiomCreate(IdiomBase):
+    id: str
+    examples: List[Example] = []
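A small sketch of how a raw Supabase row would validate against these schemas (the field values are made up; unknown columns are dropped because of extra="ignore"):

# Illustrative only, not part of the commit.
from schemas import IdiomResponse

row = {
    "id": "en_unspecified_0001",
    "idiom": "break the ice",
    "language": "en",
    "validation_count": {"approved": 2, "rejected": 0},
    "examples": [],
    "meanings": [],
    "some_extra_column": "ignored",   # discarded by extra="ignore"
}
print(IdiomResponse.model_validate(row))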
Backend/Idiom_lexicon.py
ADDED
@@ -0,0 +1,3 @@
+KNOWN_IDIOMS = {
+
+}
Backend/__init__.py
ADDED
File without changes
Backend/checkpoints/README.md
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e46bcd33d5c550de1d2d1219ee5ba9e76ad3150e4f8db750bcc06d427db2580e
+size 5102
Backend/checkpoints/adapter_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf0bbe4850ef1d5b7e70925a08c46c7bf6f77983a4b389d5d61fefedd4fe9c3
+size 813
Backend/checkpoints/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe56effed85c0e03c87510d661319fac10c770a2510078d307742b6acc2d6385
+size 1785652
Backend/checkpoints/model.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fa8a09531fb96548794a2bf557b30a66009d2f4b5bef9da95974faaa562f191
+size 710950671
Backend/checkpoints/model_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:018070b58b7084b2f3bb109d8d8f9e867ef69fbe0ad0f2de417931973187c5ea
+size 114
Backend/checkpoints/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
+size 125
Backend/checkpoints/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:672146ee6867dc02a01c474090e237789f8a066ee7247bb2cb6c8688a27536a8
+size 2919627
Backend/checkpoints/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c441dfe412d9d7e47c960029a48a0159c23a38a9ab41b465d90fb1f520d4ced
+size 1222
Backend/checkpoints/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d3f804318f6d6b34171ceee3619b326cca29abcc800f2d37fcb85fb0714f761f
+size 5649
Backend/checkpoints/vocab.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c
+size 995526
Backend/idioms_structured_1/seed_idioms_en_cleaned.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67f1af4458c3f0defb5f12bccdc3c4cacfeb68e73a03174735e0d64787891af5
+size 12976644
Backend/idioms_structured_1/seed_idioms_es_cleaned.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09b6e067e5ac4e5ddff228011494f1e221a52dfe42ffb45b9ee1183f920c9c4d
+size 1613110
Backend/inference.py
ADDED
@@ -0,0 +1,219 @@
+# filepath: src/model/inference.py
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+from peft import PeftModel, PeftConfig
+import torch
+import torch.nn.functional as F
+from Idiom_lexicon import KNOWN_IDIOMS
+import fitz  # PyMuPDF
+import tempfile
+from PIL import Image
+import pytesseract
+import nltk
+import spacy
+import json
+from pathlib import Path
+from fastapi import HTTPException
+
+nltk.download('punkt_tab', quiet=True)
+from langdetect import detect
+from nltk.tokenize import sent_tokenize
+import re
+
+LANG_MAP = {
+    'en': 'english',
+    'es': 'spanish',
+    # add more if needed
+}
+
+def split_text_by_language(text, language: str):
+    # Map input language (e.g., 'en', 'es') to NLTK language names
+    nltk_lang = LANG_MAP.get(language.lower(), 'english')
+    sentences = sent_tokenize(text, language=nltk_lang)
+    return sentences
+
+
+def load_model(checkpoint_path):
+    config = PeftConfig.from_pretrained(checkpoint_path)
+    base_model = AutoModelForTokenClassification.from_pretrained(
+        config.base_model_name_or_path,
+        num_labels=3  # O, B-IDIOM, I-IDIOM
+    )
+    model = PeftModel.from_pretrained(base_model, checkpoint_path)
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
+    return model, tokenizer
+
+def normalize_text(text):
+    # Join hyphenated words split across lines
+    text = re.sub(r'-\s*\n\s*', '', text)
+    # Replace newlines with spaces
+    text = re.sub(r'\n+', ' ', text)
+    # Collapse multiple spaces into one
+    text = re.sub(r'\s+', ' ', text)
+    return text.strip()
+
+def filter_idioms(candidate_idioms, known_idioms, min_len=2):
+    filtered = []
+    for idiom in candidate_idioms:
+        norm = idiom.lower().strip()
+        if norm in known_idioms or len(norm.split()) >= min_len:
+            filtered.append(idiom)
+    return filtered
+
+
+class IdiomMatcher:
+    def __init__(self, idiom_files: dict[str, str]):
+        self.models = {
+            "en": spacy.load("en_core_web_sm"),
+            "es": spacy.load("es_core_news_sm"),
+        }
+        self.idioms_by_lang = {lang: [] for lang in idiom_files}
+        self._load_idioms(idiom_files)
+
+    def _lemmatize(self, text: str, lang: str) -> str:
+        doc = self.models[lang](text)
+        return " ".join(token.lemma_ for token in doc)
+
+    def _load_idioms(self, idiom_files: dict[str, str]):
+        for lang, file_path in idiom_files.items():
+            path = Path(file_path)
+            if not path.exists():
+                raise FileNotFoundError(f"Idiom file not found for {lang}: {file_path}")
+
+            with open(path, "r", encoding="utf-8") as f:
+                for line in f:
+                    entry = json.loads(line)
+                    idiom_text = entry.get("idiom", "").strip()
+                    if not idiom_text:
+                        continue
+                    entry["lemmatized"] = self._lemmatize(idiom_text, lang)
+                    self.idioms_by_lang[lang].append(entry)
+
+    def match(self, sentence: str, lang: str):
+        if lang not in self.models:
+            raise ValueError(f"Unsupported language: {lang}")
+        sent_lemma = self._lemmatize(sentence, lang)
+        return [
+            idiom for idiom in self.idioms_by_lang[lang]
+            if idiom["lemmatized"] in sent_lemma
+        ]
+
+
+def predict_idiom(text, model, tokenizer, device, conf_threshold=0.9):
+    words = text.split()
+    if not words:
+        print("[⚠️] Empty input text")
+        return []
+
+    inputs = tokenizer(
+        words,
+        is_split_into_words=True,
+        truncation=True,
+        padding=True,
+        max_length=128,
+        return_tensors="pt"
+    ).to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        probs = F.softmax(logits, dim=-1)
+
+    max_probs, predictions = torch.max(probs, dim=-1)
+    max_probs = max_probs.cpu().numpy()[0]
+    predictions = predictions.cpu().numpy()[0]
+    word_ids = inputs.word_ids(batch_index=0)
+
+    idioms = []
+    current_idiom_start = -1
+    current_idiom_end = -1
+
+    for i, (pred_label, conf, word_idx) in enumerate(zip(predictions, max_probs, word_ids)):
+        if word_idx is None:
+            if current_idiom_start != -1:
+                idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+                current_idiom_start = -1
+                current_idiom_end = -1
+            continue
+
+        if conf < conf_threshold:
+            pred_label = 0
+
+        if pred_label == 1:  # B-IDIOM
+            if current_idiom_start != -1:
+                idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+            current_idiom_start = word_idx
+            current_idiom_end = word_idx
+        elif pred_label == 2:  # I-IDIOM
+            if current_idiom_start != -1 and (word_idx == current_idiom_end or word_idx == current_idiom_end + 1):
+                current_idiom_end = word_idx
+            else:
+                if current_idiom_start != -1:
+                    idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+                current_idiom_start = -1
+                current_idiom_end = -1
+        else:  # O
+            if current_idiom_start != -1:
+                idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+                current_idiom_start = -1
+                current_idiom_end = -1
+
+    if current_idiom_start != -1:
+        idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+
+    idioms = filter_idioms(idioms, known_idioms=KNOWN_IDIOMS)
+    return idioms
+
+import pdfplumber  # imported but not used below; page text comes from PyMuPDF
+
+def extract_text_from_pdf(pdf_bytes: bytes) -> str:
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+        tmp.write(pdf_bytes)
+        tmp_path = tmp.name
+
+    doc = fitz.open(tmp_path)
+    text = ""
+    for i, page in enumerate(doc):
+        page_text = page.get_text()
+        print(f"[DEBUG] Page {i+1} extracted text (first 100 chars): {repr(page_text[:100])}")
+        text += page_text
+    doc.close()
+    text = normalize_text(text)
+    print("[DEBUG] Cleaned extracted text from PDF (first 500 chars):", repr(text[:500]))
+    if not text:
+        print("[⚠️] No text extracted from PDF. It may be blank or not readable.")
+    return text
+
+
+def reconstruct_words(tokens, labels):
+    """
+    Reconstruct words from BERT tokens and their corresponding labels.
+    This function is used to map the BERT token predictions back to the original words.
+    """
+    words = []
+    current_word = []
+    current_label = None
+
+    for token, label in zip(tokens, labels):
+        if label == 'O':
+            if current_word:
+                words.append(''.join(current_word))
+                current_word = []
+            continue
+
+        if label.startswith('B-'):
+            if current_word:
+                words.append(''.join(current_word))
+                current_word = []
+            current_label = label[2:]  # Get the idiom type
+            current_word.append(token)
+        elif label.startswith('I-') and current_label == label[2:]:
+            current_word.append(token)
+
+    if current_word:
+        words.append(''.join(current_word))
+
+    return words
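A hedged sketch of running the tagger directly, outside FastAPI (assumes the checkpoints/ directory from this commit is available locally and falls back to CPU; the sentence is illustrative):

# Illustrative only, not part of the commit.
import torch
from inference import load_model, predict_idiom

device = torch.device("cpu")
model, tokenizer = load_model("checkpoints")
model = model.to(device).eval()

print(predict_idiom("It was raining cats and dogs all night.", model, tokenizer, device))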
Backend/main.py
ADDED
@@ -0,0 +1,122 @@
+import os
+import torch
+from fastapi import FastAPI
+from pydantic import BaseModel
+from fastapi import UploadFile, File
+from fastapi.middleware.cors import CORSMiddleware
+from inference import extract_text_from_pdf, split_text_by_language, predict_idiom, normalize_text, load_model, IdiomMatcher
+from nltk.tokenize import sent_tokenize
+from langdetect import detect
+from fastapi import HTTPException
+import re
+import fitz  # PyMuPDF
+# Allow requests from your React app
+origins = [
+    "http://localhost:3000",  # React dev server
+    "https://language-learning-base-website.vercel.app",
+    "https://www.idiomator.com",
+    "https://idiomator.com",
+
+]
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,  # or ["*"] for all origins (not recommended in production)
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Load model once at startup
+device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
+model, tokenizer = load_model(checkpoint_path)
+model = model.to(device)
+model.eval()
+
+class TextRequest(BaseModel):
+    text: str
+    language: str = "en"  # Default to English
+
+class IdiomResponse(BaseModel):
+    idioms: list[str]
+    language: str = "en"  # Default to English
+
+@app.get("/")
+def root():
+    return {"status": "ok"}
+
+
+@app.post("/extract_idioms_ai", response_model=IdiomResponse)
+def extract_idioms(request: TextRequest):
+    import time
+    start = time.time()
+    print(f"[📥] Request received at: {start}")
+
+    text = normalize_text(request.text)
+    language = request.language.lower()  # Get the user-selected language
+
+    sentences = split_text_by_language(text, language=language)
+    idioms = []
+    for sent in sentences:
+        idioms.extend(predict_idiom(sent, model, tokenizer, device))
+    print(f"[✅] Done in {time.time() - start:.3f}s")
+    return {"idioms": idioms}
+
+
+from fastapi import Form
+
+def check_pdf_page_limit(pdf_bytes, max_pages=10):
+    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+        if len(doc) > max_pages:
+            raise HTTPException(status_code=400, detail=f"PDF has {len(doc)} pages. Limit is {max_pages}.")
+
+@app.post("/extract_idioms_pdf_ai", response_model=IdiomResponse)
+async def extract_idioms_pdf(
+    file: UploadFile = File(...),
+    language: str = Form(...)  # ✅ Get language from the client
+):
+
+    pdf_bytes = await file.read()
+    check_pdf_page_limit(pdf_bytes, max_pages=10)
+    text = extract_text_from_pdf(pdf_bytes)
+    # Normalize the extracted text!
+    text = normalize_text(text)
+    sentences = split_text_by_language(text, language=language)
+    idioms = []
+    for sent in sentences:
+        idioms.extend(predict_idiom(sent, model, tokenizer, device))
+    return {"idioms": idioms}
+
+
+idiom_matcher = IdiomMatcher({
+    "en": "idioms_structured_1/seed_idioms_en_cleaned.jsonl",
+    "es": "idioms_structured_1/seed_idioms_es_cleaned.jsonl"
+})
+
+@app.post("/extract_idioms_heuristic", response_model=IdiomResponse)
+def extract_idioms_heuristic(request: TextRequest):
+    text = normalize_text(request.text)
+    language = request.language.lower()  # get the language from request
+    idiom_matches = idiom_matcher.match(text, lang=language)
+    idioms = [idiom["idiom"] for idiom in idiom_matches]
+
+    return {"idioms": idioms}
+
+@app.post("/extract_idioms_pdf_heuristic", response_model=IdiomResponse)
+async def extract_idioms_pdf_(
+    file: UploadFile = File(...),
+    language: str = Form(...)  # ✅ Get language from the client
+):
+    pdf_bytes = await file.read()
+    check_pdf_page_limit(pdf_bytes, max_pages=10)
+    text = extract_text_from_pdf(pdf_bytes)
+    # Normalize the extracted text!
+    text = normalize_text(text)
+    sentences = split_text_by_language(text, language=language)
+    idioms = []
+    idiom_matches = idiom_matcher.match(text, lang=language)
+    idioms = [idiom["idiom"] for idiom in idiom_matches]
+    return {"idioms": idioms}
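A hedged client sketch for the PDF endpoint above (host, port, filename and language are placeholders; the request mirrors the UploadFile and Form parameters):

# Illustrative client call, not part of the commit.
import httpx

with open("sample.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:7860/extract_idioms_pdf_ai",
        files={"file": ("sample.pdf", f, "application/pdf")},
        data={"language": "en"},
        timeout=120.0,
    )
resp.raise_for_status()
print(resp.json()["idioms"])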
Backend/requirements.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa0d502a504c0e0ebdff60428904ef42b61af5846a8ff1b0673e6501ef89ff38
+size 515
Dockerfile
ADDED
@@ -0,0 +1,37 @@
+FROM python:3.10-slim
+
+# Create user but don't switch yet
+RUN useradd -m -u 1000 user
+
+WORKDIR /app
+
+# Install Tesseract OCR and system dependencies as root
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends tesseract-ocr libglib2.0-0 libsm6 libxext6 libxrender-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# Now switch to the non-root user
+USER user
+
+# Copy requirements.txt
+COPY --chown=user Backend/requirements.txt .
+
+# Install Python dependencies (including nltk and langdetect)
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+# (Re)install nltk and download punkt as user, ensuring clean install and data in user dir
+RUN pip install --upgrade --force-reinstall nltk && \
+    python -m nltk.downloader punkt_tab
+
+# Verify punkt is present (will print path in build logs)
+RUN python -c "import nltk; print(nltk.data.find('tokenizers/punkt_tab'))"
+
+# Set PATH for user-installed Python packages
+ENV PATH="/home/user/.local/bin:${PATH}"
+
+# Copy all backend code
+COPY --chown=user Backend/ .
+
+EXPOSE 7860
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
@@ -0,0 +1,14 @@
+---
+title: Multilingual Idiom Extractor
+emoji: π
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+sdk_version: "latest"
+app_file: main.py
+pinned: false
+---
+
+# Language_Learning_BaseWebsite
+I am just changing this so I have a change
+