GitHub Actions committed on
Commit
447d423
·
0 Parent(s):

Track large files with LFS

.gitattributes ADDED
@@ -0,0 +1,4 @@
+ Backend/*.txt filter=lfs diff=lfs merge=lfs -text
+ Backend/*.numbers filter=lfs diff=lfs merge=lfs -text
+ Backend/checkpoints/* filter=lfs diff=lfs merge=lfs -text
+ Backend/idioms_structured_1/*.jsonl filter=lfs diff=lfs merge=lfs -text
Backend/Database/Idiom_Id_Generator.py ADDED
@@ -0,0 +1,34 @@
+ from supabase import create_client, Client
+ from dotenv import load_dotenv
+ import os
+
+ load_dotenv()
+
+ url = os.getenv("database_url")
+ key = os.getenv("database_service_Key")
+
+ supabase: Client = create_client(url, key)
+
+ def generate_id(lang_code: str, dialect: str | None) -> str:
+     # Normalize the dialect if one was provided
+     if dialect and dialect.strip():
+         dialect_clean = dialect.replace(" ", "_").lower()
+     else:
+         dialect_clean = None
+
+     # Build the base query: count existing idioms for this language (and dialect)
+     query = supabase.table("idioms").select("id", count="exact").eq("language", lang_code)
+     if dialect_clean:
+         query = query.eq("dialect", dialect_clean)
+     else:
+         query = query.is_("dialect", "null")  # if the dialect is stored as NULL
+
+     res = query.execute()
+     counter = res.count + 1
+
+     # Generate the ID
+     if dialect_clean:
+         return f"{lang_code}_{dialect_clean}_{str(counter).zfill(4)}"
+     else:
+         return f"{lang_code}_unspecified_{str(counter).zfill(4)}"
+
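
A minimal usage sketch of the generator above (not part of the commit): it assumes the Supabase credentials in .env are valid and the idioms table exists; the counter values in the comments are illustrative only.

from Idiom_Id_Generator import generate_id

new_id = generate_id("es", "Rio de la Plata")   # e.g. "es_rio_de_la_plata_0007"
fallback = generate_id("en", None)              # e.g. "en_unspecified_0142"
print(new_id, fallback)
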
Backend/Database/crud.py ADDED
@@ -0,0 +1,211 @@
+ # crud.py (replaces the SQLAlchemy version)
+ import os
+ import json
+ import httpx
+ from dotenv import load_dotenv
+ from typing import Dict, Optional, List
+ from fastapi import HTTPException
+ load_dotenv()
+
+ SUPABASE_URL = os.getenv("SUPABASE_URL").rstrip("/")
+ SUPABASE_KEY = os.getenv("SUPABASE_KEY")
+ database_service_Key = os.getenv("database_service_Key")
+ TABLE = os.getenv("TABLE_NAME", "idioms")
+
+ HEADERS = {
+     "apikey": database_service_Key,
+     "Authorization": f"Bearer {database_service_Key}",
+     "Content-Type": "application/json",
+     "Accept": "application/json",
+ }
+
+ # helpers
+ async def _client():
+     return httpx.AsyncClient(timeout=30.0)
+
+ # CRUD
+ async def get_idioms(skip: int = 0, limit: int = 100000):
+     async with httpx.AsyncClient() as client:
+         url = f"{SUPABASE_URL}/rest/v1/{TABLE}?select=*&offset={skip}&limit={limit}"
+         print("Supabase GET URL:", url)
+         r = await client.get(url, headers=HEADERS)
+         print("Supabase GET status:", r.status_code)
+         print("Supabase GET response:", r.text)
+         r.raise_for_status()
+         return r.json()
+
+ async def get_all_idioms():
+     all_idioms = []
+     limit = 1000  # Supabase max per request
+     offset = 0
+
+     async with httpx.AsyncClient(timeout=60.0) as client:
+         while True:
+             url = f"{SUPABASE_URL}/rest/v1/{TABLE}?select=*&limit={limit}&offset={offset}"
+             r = await client.get(url, headers=HEADERS)
+             r.raise_for_status()
+             data = r.json()
+             if not data:
+                 break
+             # ensure validation_count is a dict
+             for item in data:
+                 if not isinstance(item.get("validation_count"), dict):
+                     item["validation_count"] = {}
+             all_idioms.extend(data)
+             offset += limit
+
+     return all_idioms
+
+ async def get_idiom(idiom_id: str):
+     async with httpx.AsyncClient() as client:
+         try:
+             # Include both examples and meanings via foreign key embedding
+             url = (
+                 f"{SUPABASE_URL}/rest/v1/{TABLE}?"
+                 f"id=eq.{idiom_id}&"
+                 f"select=*,"
+                 f"idiom_meanings!idiom_meanings_idiom_id_fkey(*),"
+                 f"examples!examples_idiom_id_fkey(*)"
+             )
+             print(f"Fetching idiom from Supabase: {url}")  # debug
+             r = await client.get(url, headers=HEADERS)
+             print("HTTP status code:", r.status_code)  # debug
+             r.raise_for_status()
+             data = r.json()
+             print("Raw data from Supabase:", data)  # debug
+         except httpx.RequestError as e:
+             print("Request failed:", e)
+             raise HTTPException(status_code=500, detail=f"Supabase request failed: {e}")
+         except httpx.HTTPStatusError as e:
+             print("HTTP error:", e)
+             raise HTTPException(status_code=500, detail=f"Supabase HTTP error: {e}")
+         except Exception as e:
+             print("Unexpected error:", e)
+             raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
+
+         if not data:
+             print(f"No idiom found for id: {idiom_id}")  # debug
+             return None
+
+         idiom = data[0]
+         if not isinstance(idiom, dict):
+             print(f"Unexpected data type for idiom: {type(idiom)}")  # debug
+             raise ValueError(f"Expected dict, got: {type(idiom)}")
+
+         # --- Transform examples ---
+         raw_examples = idiom.get("examples") or []
+         idiom["examples"] = [
+             {
+                 "id": ex.get("id"),
+                 "source_text": ex.get("source_text") or "",
+                 "source_language": ex.get("source_language") or idiom.get("language"),
+                 "translations": json.loads(ex["translations"]) if isinstance(ex.get("translations"), str) else ex.get("translations") or [],
+                 "dialect": ex.get("dialect"),
+                 "url": ex.get("url"),
+                 "source": ex.get("source"),
+             }
+             for ex in raw_examples
+         ]
+         print(f"Found {len(idiom['examples'])} examples")  # debug
+
+         # --- Transform meanings ---
+         raw_meanings = idiom.get("idiom_meanings") or []
+         print("Raw meanings data:", raw_meanings)  # debug
+         idiom["meanings"] = [
+             {
+                 "meaning_id": m.get("meaning_id"),
+                 "idiom_id": m.get("idiom_id"),
+                 "sense_number": m.get("sense_number"),
+                 "register": m.get("register") or [],
+                 "region": m.get("region") or [],
+                 "definitions": m.get("definitions") or [],
+                 "version": m.get("version"),  # optional, if you need it
+             }
+             for m in raw_meanings
+         ]
+         print("Transformed meanings data:", idiom["meanings"])  # debug
+         print(f"Found {len(idiom['meanings'])} meanings")  # debug
+         return idiom
+
+
+ async def search_idioms(query: str = "", language: Optional[str] = None, skip: int = 0, limit: int = 50):
+     async with httpx.AsyncClient() as client:
+         # Compose select param to embed idiom_meanings
+         select_query = "*,idiom_meanings!idiom_meanings_idiom_id_fkey(*)"
+
+         url = (
+             f"{SUPABASE_URL}/rest/v1/{TABLE}"
+             f"?offset={skip}&limit={limit}&select={select_query}"
+         )
+
+         # Maintain partial text match on idiom column
+         if query:
+             url += f"&idiom=ilike.*{query}*"
+
+         # Maintain language filter if specified and not "all"
+         if language and language.lower() not in ("all", "*"):
+             url += f"&language=eq.{language}"
+
+         r = await client.get(url, headers=HEADERS)
+         r.raise_for_status()
+         data = r.json()
+
+         for item in data:
+             # Ensure validation_count is a dict
+             if not isinstance(item.get("validation_count"), dict):
+                 item["validation_count"] = {}
+
+             # Transform embedded idiom_meanings to meanings field for UI use
+             raw_meanings = item.get("idiom_meanings") or []
+             item["meanings"] = [
+                 {
+                     "meaning_id": m.get("meaning_id"),
+                     "idiom_id": m.get("idiom_id"),
+                     "sense_number": m.get("sense_number"),
+                     "register": m.get("register") or [],
+                     "region": m.get("region") or [],
+                     "definitions": m.get("definitions") or [],
+                     "version": m.get("version"),
+                 }
+                 for m in raw_meanings
+             ]
+
+         return data
+
+
+ async def create_idiom(item: dict):
+     async with httpx.AsyncClient() as client:
+         url = f"{SUPABASE_URL}/rest/v1/{TABLE}"
+         r = await client.post(url, json=item, headers=HEADERS)
+         r.raise_for_status()  # fail if not 2xx
+
+         try:
+             data = r.json()
+         except ValueError:
+             # Supabase returned empty body, fall back to the original item
+             data = item
+
+         if isinstance(data, list) and data:
+             return data[0]
+         if isinstance(data, dict) and data:
+             return data
+         # final fallback
+         return item
+
+ async def update_idiom(idiom_id: str, item: dict):
+     async with httpx.AsyncClient() as client:
+         url = f"{SUPABASE_URL}/rest/v1/{TABLE}?id=eq.{idiom_id}"
+         r = await client.patch(url, json=item, headers=HEADERS)
+         if r.status_code not in (200, 204):
+             raise httpx.HTTPStatusError("Update failed", request=r.request, response=r)
+     # After patch, fetch the updated row
+     return await get_idiom(idiom_id)
+
+ async def delete_idiom(idiom_id: str):
+     async with httpx.AsyncClient() as client:
+         url = f"{SUPABASE_URL}/rest/v1/{TABLE}?id=eq.{idiom_id}"
+         r = await client.delete(url, headers=HEADERS)
+         if r.status_code not in (200, 204):
+             raise httpx.HTTPStatusError("Delete failed", request=r.request, response=r)
+     return {"status": "deleted"}
Backend/Database/db.py ADDED
@@ -0,0 +1,17 @@
+ import os
+ from sqlalchemy import create_engine
+ from sqlalchemy.orm import sessionmaker, declarative_base
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ DATABASE_URL = os.getenv("DATABASE_URL")
+ db_url = DATABASE_URL
+ # Engine with SSL (required by Supabase)
+ engine = create_engine(db_url, connect_args={"sslmode": "require"})
+
+ # Session factory
+ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+ # Base class for models
+ Base = declarative_base()
Backend/Database/main.py ADDED
@@ -0,0 +1,120 @@
+ # main.py
+ from fastapi import FastAPI, HTTPException, APIRouter
+ from fastapi.middleware.cors import CORSMiddleware
+ import crud
+ import schemas
+ from Idiom_Id_Generator import generate_id
+ from dotenv import load_dotenv
+ import os
+ from pydantic import BaseModel
+ from typing import List, Dict, Optional
+
+
+ load_dotenv()
+
+ app = FastAPI(title="Idioms API - Supabase REST")
+
+ allowed_origins = [
+     "http://localhost",
+     "http://localhost:3000",
+     "http://localhost:8000",
+     "https://idiomator.vercel.app",
+     "https://www.idiomator.vercel.app",
+     "https://idiomator.com",
+     "https://www.idiomator.com",
+ ]
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=allowed_origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ @app.get("/idioms", response_model=list[schemas.IdiomResponse])
+ async def read_idioms(skip: int = 0, limit: int = 50):
+     try:
+         data = await crud.get_idioms(skip=skip, limit=limit)
+         return data
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/idioms", response_model=schemas.IdiomResponse)
+ async def create_idiom(idiom: schemas.IdiomCreate):
+     try:
+         # keep your ID generation
+         idiom_id = generate_id(idiom.language, idiom.dialect)
+         idiom_dict = idiom.dict()
+         idiom_dict["id"] = idiom_id
+         created = await crud.create_idiom(idiom_dict)
+         return created
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/idioms/all_ids", response_model=List[Dict[str, str]])
+ async def get_all_idioms_route():
+     idioms = await crud.get_all_idioms()
+     print(f"Fetched {len(idioms)} idioms")
+     print(f"Sample idioms: {idioms[:3]}")  # Print first 3 idioms for verification
+     return [
+         {"id": idiom["id"], "idiom": idiom["idiom"], "language": idiom["language"]}
+         for idiom in idioms
+     ]
+
+ @app.get("/idioms/search", response_model=List[schemas.IdiomResponse])
+ async def search_idioms(
+     q: Optional[str] = None,
+     language: Optional[str] = None,
+     skip: int = 0,
+     limit: int = 50
+ ):
+     try:
+         results = await crud.search_idioms(query=q, language=language, skip=skip, limit=limit)
+         return results
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.get("/idioms/{idiom_id}", response_model=schemas.IdiomResponse)
+ async def read_idiom(idiom_id: str):
+     try:
+         db_idiom = await crud.get_idiom(idiom_id)
+         if not db_idiom:
+             raise HTTPException(status_code=404, detail="Idiom not found")
+         # Transform validation_count if needed
+         if isinstance(db_idiom.get("validation_count"), int):
+             db_idiom["validation_count"] = {"count": db_idiom["validation_count"]}
+         return db_idiom
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.patch("/idioms/{idiom_id}", response_model=schemas.IdiomResponse)
+ async def patch_idiom(idiom_id: str, idiom_update: schemas.IdiomBase):
+     try:
+         updated = await crud.update_idiom(idiom_id, idiom_update.dict(exclude_unset=True))
+         if not updated:
+             raise HTTPException(status_code=404, detail="Idiom not found")
+         return updated
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.delete("/idioms/{idiom_id}")
+ async def delete_idiom(idiom_id: str):
+     try:
+         return await crud.delete_idiom(idiom_id)
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.get("/ping")
+ async def ping():
+     return {"status": "ok"}
+
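
A hedged client-side sketch against this API (not part of the commit), assuming it is served locally, for example with `uvicorn main:app --port 8000`; the search term is arbitrary.

import httpx

BASE = "http://localhost:8000"  # assumed local dev address

resp = httpx.get(f"{BASE}/idioms/search", params={"q": "rain", "language": "en", "limit": 5})
resp.raise_for_status()
for idiom in resp.json():
    print(idiom["id"], "->", idiom["idiom"])
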
Backend/Database/models.py ADDED
@@ -0,0 +1,16 @@
+ from sqlalchemy import Column, String, Integer, JSON
+ from db import Base
+
+ class Idiom(Base):
+     __tablename__ = "idioms"
+
+     id = Column(String, primary_key=True, index=True)
+     idiom = Column(String, nullable=False)
+     language = Column(String, nullable=False)
+     dialect = Column(String, default="unspecified")
+     idiomatic_meaning = Column(String, default="")
+     literal_meaning = Column(String, default="")
+     example = Column(String, default="")
+     validation_count = Column(JSON, default={"approved": 0, "rejected": 0})  # { "approved": 0, "rejected": 0 }
+     quality = Column(String, default="seed")
+     status = Column(String, default="pending")
Backend/Database/requirements.txt ADDED
@@ -0,0 +1,12 @@
+ uvicorn
+ gunicorn
+ sqlalchemy
+ supabase
+ fastapi
+ pydantic
+ databases
+ asyncpg
+ alembic
+ psycopg2-binary
+ python-dotenv
+ requests
Backend/Database/schemas.py ADDED
@@ -0,0 +1,59 @@
+ from pydantic import BaseModel, ConfigDict
+ from typing import Dict, Optional, List
+
+
+ class Translation(BaseModel):
+     language: str
+     text: str
+
+     model_config = ConfigDict(extra="ignore")
+
+
+ class Example(BaseModel):
+     id: int
+     source_language: str
+     source_text: str
+     translations: List[Translation] = []
+     dialect: Optional[str] = None
+     url: Optional[str] = None
+     source: Optional[str] = None
+
+     model_config = ConfigDict(extra="ignore")
+
+
+ class IdiomBase(BaseModel):
+     idiom: str
+     language: str
+     dialect: str = "unspecified"
+     idiomatic_meaning: Optional[str] = ""
+     literal_meaning: Optional[str] = ""
+     validation_count: Dict[str, int] = {}
+     quality: str = "seed"
+     status: str = "pending"
+
+     model_config = ConfigDict(extra="ignore")
+
+
+ class Meaning(BaseModel):
+     meaning_id: Optional[str] = None
+     idiom_id: Optional[str] = None
+     sense_number: Optional[int] = None
+     register: List[str] = []
+     region: List[str] = []
+     definitions: List[str] = []
+     version: Optional[int] = None
+
+     model_config = ConfigDict(extra="ignore")
+
+
+ class IdiomResponse(IdiomBase):
+     id: str
+     examples: List[Example] = []
+     meanings: List[Meaning] = []
+
+     model_config = ConfigDict(extra="ignore")
+
+
+ class IdiomCreate(IdiomBase):
+     id: str
+     examples: List[Example] = []
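
For reference, a sketch of building a request body with these models (not part of the commit); the ID and texts are made up and only illustrate the shape the POST /idioms route expects.

from schemas import Example, IdiomCreate, Translation

payload = IdiomCreate(
    id="en_unspecified_0001",  # hypothetical ID in the generator's format
    idiom="break the ice",
    language="en",
    idiomatic_meaning="to ease initial tension",
    examples=[
        Example(
            id=1,
            source_language="en",
            source_text="He told a joke to break the ice.",
            translations=[Translation(language="es", text="Contó un chiste para romper el hielo.")],
        )
    ],
)
print(payload.model_dump())  # Pydantic v2 serialization
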
Backend/Idiom_lexicon.py ADDED
@@ -0,0 +1,3 @@
+ KNOWN_IDIOMS = {
+
+ }
Backend/__init__.py ADDED
File without changes
Backend/checkpoints/README.md ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e46bcd33d5c550de1d2d1219ee5ba9e76ad3150e4f8db750bcc06d427db2580e
+ size 5102
Backend/checkpoints/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:baf0bbe4850ef1d5b7e70925a08c46c7bf6f77983a4b389d5d61fefedd4fe9c3
+ size 813
Backend/checkpoints/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe56effed85c0e03c87510d661319fac10c770a2510078d307742b6acc2d6385
+ size 1785652
Backend/checkpoints/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fa8a09531fb96548794a2bf557b30a66009d2f4b5bef9da95974faaa562f191
+ size 710950671
Backend/checkpoints/model_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:018070b58b7084b2f3bb109d8d8f9e867ef69fbe0ad0f2de417931973187c5ea
+ size 114
Backend/checkpoints/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6d346be366a7d1d48332dbc9fdf3bf8960b5d879522b7799ddba59e76237ee3
+ size 125
Backend/checkpoints/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:672146ee6867dc02a01c474090e237789f8a066ee7247bb2cb6c8688a27536a8
+ size 2919627
Backend/checkpoints/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c441dfe412d9d7e47c960029a48a0159c23a38a9ab41b465d90fb1f520d4ced
+ size 1222
Backend/checkpoints/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3f804318f6d6b34171ceee3619b326cca29abcc800f2d37fcb85fb0714f761f
+ size 5649
Backend/checkpoints/vocab.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c
+ size 995526
Backend/idioms_structured_1/seed_idioms_en_cleaned.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67f1af4458c3f0defb5f12bccdc3c4cacfeb68e73a03174735e0d64787891af5
+ size 12976644
Backend/idioms_structured_1/seed_idioms_es_cleaned.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:09b6e067e5ac4e5ddff228011494f1e221a52dfe42ffb45b9ee1183f920c9c4d
+ size 1613110
Backend/inference.py ADDED
@@ -0,0 +1,219 @@
+ # filepath: src/model/inference.py
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ from peft import PeftModel, PeftConfig
+ import torch
+ import torch.nn.functional as F
+ from Idiom_lexicon import KNOWN_IDIOMS
+ import fitz  # PyMuPDF
+ import tempfile
+ from PIL import Image
+ import pytesseract
+ import nltk
+ import spacy
+ import json
+ from pathlib import Path
+ from fastapi import HTTPException
+
+ nltk.download('punkt_tab', quiet=True)
+ from langdetect import detect
+ from nltk.tokenize import sent_tokenize
+ import re
+
+ LANG_MAP = {
+     'en': 'english',
+     'es': 'spanish',
+     # add more if needed
+ }
+
+ def split_text_by_language(text, language: str):
+     # Map input language (e.g., 'en', 'es') to NLTK language codes
+     nltk_lang = LANG_MAP.get(language.lower(), 'english')
+     sentences = sent_tokenize(text, language=nltk_lang)
+     return sentences
+
+
+ def load_model(checkpoint_path):
+     config = PeftConfig.from_pretrained(checkpoint_path)
+     base_model = AutoModelForTokenClassification.from_pretrained(
+         config.base_model_name_or_path,
+         num_labels=3  # O, B-IDIOM, I-IDIOM
+     )
+     model = PeftModel.from_pretrained(base_model, checkpoint_path)
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
+     return model, tokenizer
+
+ def normalize_text(text):
+     # Join hyphenated words split across lines
+     text = re.sub(r'-\s*\n\s*', '', text)
+     # Replace newlines with spaces
+     text = re.sub(r'\n+', ' ', text)
+     # Collapse multiple spaces into one
+     text = re.sub(r'\s+', ' ', text)
+     return text.strip()
+
+ def filter_idioms(candidate_idioms, known_idioms, min_len=2):
+     filtered = []
+     for idiom in candidate_idioms:
+         norm = idiom.lower().strip()
+         if norm in known_idioms or len(norm.split()) >= min_len:
+             filtered.append(idiom)
+     return filtered
+
+
+ class IdiomMatcher:
+     def __init__(self, idiom_files: dict[str, str]):
+         self.models = {
+             "en": spacy.load("en_core_web_sm"),
+             "es": spacy.load("es_core_news_sm"),
+         }
+         self.idioms_by_lang = {lang: [] for lang in idiom_files}
+         self._load_idioms(idiom_files)
+
+     def _lemmatize(self, text: str, lang: str) -> str:
+         doc = self.models[lang](text)
+         return " ".join(token.lemma_ for token in doc)
+
+     def _load_idioms(self, idiom_files: dict[str, str]):
+         for lang, file_path in idiom_files.items():
+             path = Path(file_path)
+             if not path.exists():
+                 raise FileNotFoundError(f"Idiom file not found for {lang}: {file_path}")
+
+             with open(path, "r", encoding="utf-8") as f:
+                 for line in f:
+                     entry = json.loads(line)
+                     idiom_text = entry.get("idiom", "").strip()
+                     if not idiom_text:
+                         continue
+                     entry["lemmatized"] = self._lemmatize(idiom_text, lang)
+                     self.idioms_by_lang[lang].append(entry)
+
+     def match(self, sentence: str, lang: str):
+         if lang not in self.models:
+             raise ValueError(f"Unsupported language: {lang}")
+         sent_lemma = self._lemmatize(sentence, lang)
+         return [
+             idiom for idiom in self.idioms_by_lang[lang]
+             if idiom["lemmatized"] in sent_lemma
+         ]
+
+
+ def predict_idiom(text, model, tokenizer, device, conf_threshold=0.9):
+     words = text.split()
+     if not words:
+         print("[⚠️] Empty input text")
+         return []
+
+     inputs = tokenizer(
+         words,
+         is_split_into_words=True,
+         truncation=True,
+         padding=True,
+         max_length=128,
+         return_tensors="pt"
+     ).to(device)
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+         logits = outputs.logits
+         probs = F.softmax(logits, dim=-1)
+
+     max_probs, predictions = torch.max(probs, dim=-1)
+     max_probs = max_probs.cpu().numpy()[0]
+     predictions = predictions.cpu().numpy()[0]
+     word_ids = inputs.word_ids(batch_index=0)
+
+     idioms = []
+     current_idiom_start = -1
+     current_idiom_end = -1
+
+     for i, (pred_label, conf, word_idx) in enumerate(zip(predictions, max_probs, word_ids)):
+         if word_idx is None:
+             if current_idiom_start != -1:
+                 idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+             current_idiom_start = -1
+             current_idiom_end = -1
+             continue
+
+         if conf < conf_threshold:
+             pred_label = 0
+
+         if pred_label == 1:  # B-IDIOM
+             if current_idiom_start != -1:
+                 idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+             current_idiom_start = word_idx
+             current_idiom_end = word_idx
+         elif pred_label == 2:  # I-IDIOM
+             if current_idiom_start != -1 and (word_idx == current_idiom_end or word_idx == current_idiom_end + 1):
+                 current_idiom_end = word_idx
+             else:
+                 if current_idiom_start != -1:
+                     idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+                 current_idiom_start = -1
+                 current_idiom_end = -1
+         else:  # O
+             if current_idiom_start != -1:
+                 idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+             current_idiom_start = -1
+             current_idiom_end = -1
+
+     if current_idiom_start != -1:
+         idioms.append(' '.join(words[current_idiom_start:current_idiom_end + 1]))
+
+     idioms = filter_idioms(idioms, known_idioms=KNOWN_IDIOMS)
+     return idioms
+
+
+ def extract_text_from_pdf(pdf_bytes: bytes) -> str:
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+         tmp.write(pdf_bytes)
+         tmp_path = tmp.name
+
+     doc = fitz.open(tmp_path)
+     text = ""
+     for i, page in enumerate(doc):
+         page_text = page.get_text()
+         print(f"[DEBUG] Page {i+1} extracted text (first 100 chars): {repr(page_text[:100])}")
+         text += page_text
+     doc.close()
+     text = normalize_text(text)
+     print("[DEBUG] Cleaned extracted text from PDF (first 500 chars):", repr(text[:500]))
+     if not text:
+         print("[⚠️] No text extracted from PDF. It may be blank or not readable.")
+     return text
+
+
+ def reconstruct_words(tokens, labels):
+     """
+     Reconstruct words from BERT tokens and their corresponding labels.
+     This function is used to map the BERT token predictions back to the original words.
+     """
+     words = []
+     current_word = []
+     current_label = None
+
+     for token, label in zip(tokens, labels):
+         if label == 'O':
+             if current_word:
+                 words.append(''.join(current_word))
+                 current_word = []
+             continue
+
+         if label.startswith('B-'):
+             if current_word:
+                 words.append(''.join(current_word))
+                 current_word = []
+             current_label = label[2:]  # Get the idiom type
+             current_word.append(token)
+         elif label.startswith('I-') and current_label == label[2:]:
+             current_word.append(token)
+
+     if current_word:
+         words.append(''.join(current_word))
+
+     return words
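
A local sketch of running this pipeline outside FastAPI (not part of the commit); it assumes the LFS checkpoints have been pulled and the spaCy/NLTK data is installed, and the sample sentence is arbitrary.

import torch
from inference import load_model, normalize_text, predict_idiom, split_text_by_language

model, tokenizer = load_model("checkpoints")
device = torch.device("cpu")
model = model.to(device).eval()

text = normalize_text("She decided to bite the bullet and apologise.")
for sentence in split_text_by_language(text, language="en"):
    print(predict_idiom(sentence, model, tokenizer, device))
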
Backend/main.py ADDED
@@ -0,0 +1,122 @@
+ import os
+ import torch
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from fastapi import UploadFile, File
+ from fastapi.middleware.cors import CORSMiddleware
+ from inference import extract_text_from_pdf, split_text_by_language, predict_idiom, normalize_text, load_model, IdiomMatcher
+ from nltk.tokenize import sent_tokenize
+ from langdetect import detect
+ from fastapi import HTTPException
+ import re
+ import fitz  # PyMuPDF
+
+ # Allow requests from your React app
+ origins = [
+     "http://localhost:3000",  # React dev server
+     "https://language-learning-base-website.vercel.app",
+     "https://www.idiomator.com",
+     "https://idiomator.com",
+ ]
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,  # or ["*"] for all origins (not recommended in production)
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Load model once at startup
+ device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
+ checkpoint_path = os.path.join(os.path.dirname(__file__), "checkpoints")
+ model, tokenizer = load_model(checkpoint_path)
+ model = model.to(device)
+ model.eval()
+
+ class TextRequest(BaseModel):
+     text: str
+     language: str = "en"  # Default to English
+
+ class IdiomResponse(BaseModel):
+     idioms: list[str]
+     language: str = "en"  # Default to English
+
+ @app.get("/")
+ def root():
+     return {"status": "ok"}
+
+
+ @app.post("/extract_idioms_ai", response_model=IdiomResponse)
+ def extract_idioms(request: TextRequest):
+     import time
+     start = time.time()
+     print(f"[📥] Request received at: {start}")
+
+     text = normalize_text(request.text)
+     language = request.language.lower()  # Get the user-selected language
+
+     sentences = split_text_by_language(text, language=language)
+     idioms = []
+     for sent in sentences:
+         idioms.extend(predict_idiom(sent, model, tokenizer, device))
+     print(f"[✅] Done in {time.time() - start:.3f}s")
+     return {"idioms": idioms}
+
+
+ from fastapi import Form
+
+ def check_pdf_page_limit(pdf_bytes, max_pages=10):
+     with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
+         if len(doc) > max_pages:
+             raise HTTPException(status_code=400, detail=f"PDF has {len(doc)} pages. Limit is {max_pages}.")
+
+ @app.post("/extract_idioms_pdf_ai", response_model=IdiomResponse)
+ async def extract_idioms_pdf(
+     file: UploadFile = File(...),
+     language: str = Form(...)  # ✅ Get language from the client
+ ):
+     pdf_bytes = await file.read()
+     check_pdf_page_limit(pdf_bytes, max_pages=10)
+     text = extract_text_from_pdf(pdf_bytes)
+     # Normalize the extracted text!
+     text = normalize_text(text)
+     sentences = split_text_by_language(text, language=language)
+     idioms = []
+     for sent in sentences:
+         idioms.extend(predict_idiom(sent, model, tokenizer, device))
+     return {"idioms": idioms}
+
+
+ idiom_matcher = IdiomMatcher({
+     "en": "idioms_structured_1/seed_idioms_en_cleaned.jsonl",
+     "es": "idioms_structured_1/seed_idioms_es_cleaned.jsonl"
+ })
+
+ @app.post("/extract_idioms_heuristic", response_model=IdiomResponse)
+ def extract_idioms_heuristic(request: TextRequest):
+     text = normalize_text(request.text)
+     language = request.language.lower()  # get the language from request
+     idiom_matches = idiom_matcher.match(text, lang=language)
+     idioms = [idiom["idiom"] for idiom in idiom_matches]
+     return {"idioms": idioms}
+
+ @app.post("/extract_idioms_pdf_heuristic", response_model=IdiomResponse)
+ async def extract_idioms_pdf_(
+     file: UploadFile = File(...),
+     language: str = Form(...)  # ✅ Get language from the client
+ ):
+     pdf_bytes = await file.read()
+     check_pdf_page_limit(pdf_bytes, max_pages=10)
+     text = extract_text_from_pdf(pdf_bytes)
+     # Normalize the extracted text!
+     text = normalize_text(text)
+     sentences = split_text_by_language(text, language=language)
+     idioms = []
+     idiom_matches = idiom_matcher.match(text, lang=language)
+     idioms = [idiom["idiom"] for idiom in idiom_matches]
+     return {"idioms": idioms}
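
A client sketch against the extractor service (not part of the commit), assuming the container is running locally on the port exposed in the Dockerfile (7860); the input text is arbitrary.

import httpx

resp = httpx.post(
    "http://localhost:7860/extract_idioms_heuristic",
    json={"text": "It was raining cats and dogs.", "language": "en"},
    timeout=60.0,
)
resp.raise_for_status()
print(resp.json()["idioms"])
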
Backend/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa0d502a504c0e0ebdff60428904ef42b61af5846a8ff1b0673e6501ef89ff38
+ size 515
Dockerfile ADDED
@@ -0,0 +1,37 @@
+ FROM python:3.10-slim
+
+ # Create user but don't switch yet
+ RUN useradd -m -u 1000 user
+
+ WORKDIR /app
+
+ # Install Tesseract OCR and system dependencies as root
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends tesseract-ocr libglib2.0-0 libsm6 libxext6 libxrender-dev && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Now switch to the non-root user
+ USER user
+
+ # Copy requirements.txt
+ COPY --chown=user Backend/requirements.txt .
+
+ # Install Python dependencies (including nltk and langdetect)
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ # (Re)install nltk and download punkt as user, ensuring clean install and data in user dir
+ RUN pip install --upgrade --force-reinstall nltk && \
+     python -m nltk.downloader punkt_tab
+
+ # Verify punkt is present (will print path in build logs)
+ RUN python -c "import nltk; print(nltk.data.find('tokenizers/punkt_tab'))"
+
+ # Set PATH for user-installed Python packages
+ ENV PATH="/home/user/.local/bin:${PATH}"
+
+ # Copy all backend code
+ COPY --chown=user Backend/ .
+
+ EXPOSE 7860
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Multilingual Idiom Extractor
+ emoji: 🌍
+ colorFrom: blue
+ colorTo: indigo
+ sdk: docker
+ sdk_version: "latest"
+ app_file: main.py
+ pinned: false
+ ---
+
+ # Language_Learning_BaseWebsite
+ I am just changing this so I have a change
+