Spaces:

Edgar-Demeude
/

argument-backend

Sleeping

App Files Files Community

p2002814 committed on Oct 12

Commit

bb34072

1 Parent(s): a1fcb70

Now using the best fine-tuned BERT model, uploaded to the Hugging Face Hub

Browse files

Files changed (7) hide show

.gitattributes +1 -2
app.py +30 -26
hugging_hub.py +9 -0
models/model.pth +0 -3
relations/predict.py +1 -1
relations/predict_bert.py +49 -0
relations/tests.py +9 -24

.gitattributes CHANGED Viewed

	@@ -1,2 +1 @@
1	- models/*.pth filter=lfs diff=lfs merge=lfs -text
2	- models/model.pth filter=lfs diff=lfs merge=lfs -text


1	+ models/** filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -3,15 +3,16 @@ from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 import pandas as pd
 from pathlib import Path
 from relations.tests import run_tests
-from relations.predict import predict_relation
-from relations.mlp import load_model_and_metadata
-from relations.processor import ArgumentDataProcessor
 from exemples.claims import test_cases
 # ABA imports
-from aba.aba_builder import build_aba_framework, prepare_aba_plus_framework, build_aba_framework_from_text
 app = FastAPI(title="Argument Mining API")
@@ -31,59 +32,60 @@ app.add_middleware(
 EXAMPLES_DIR = Path("./aba/exemples")
-# Load ML model at startup
-PYTORCH_MODEL_PATH = "models/model.pth"
-model_type = "pytorch"
-model, embedding_model, best_threshold, label_encoder = load_model_and_metadata(
-    PYTORCH_MODEL_PATH, model_type
-)
-processor = ArgumentDataProcessor()
 @app.get("/")
 def root():
     return {"message": "Argument Mining API is running..."}
-# ---------------- ML Prediction Endpoints ---------------- #
 @app.post("/predict-test")
 def predict_test():
-    """Run predefined test cases for model validation."""
-    run_tests(model, embedding_model, processor, best_threshold, label_encoder, model_type, test_cases)
     return {"message": "Test cases executed. Check server logs for details."}
 @app.post("/predict-text")
 def predict_text(arg1: str = Form(...), arg2: str = Form(...)):
-    """Predict relation between two text arguments."""
-    relation = predict_relation(arg1, arg2, model, embedding_model, processor, best_threshold, label_encoder, model_type)
-    return {"arg1": arg1, "arg2": arg2, "relation": relation}
 @app.post("/predict-csv")
 async def predict_csv(file: UploadFile):
     """Predict relations for pairs of arguments from a CSV file (max 100 rows)."""
     df = pd.read_csv(file.file)
     if len(df) > 100:
         df = df.head(100)
     results = []
     for _, row in df.iterrows():
-        relation = predict_relation(
             row["parent"],
             row["child"],
             model,
-            embedding_model,
-            processor,
-            best_threshold,
-            label_encoder,
-            model_type
         )
         results.append({
             "parent": row["parent"],
             "child": row["child"],
-            "relation": relation
         })
     return {"results": results, "note": "Limited to 100 rows max"}
@@ -114,16 +116,18 @@ async def aba_upload(file: UploadFile = File(...)):
     }
     return results
 @app.get("/aba-examples")
 def list_aba_examples():
     """Lists all sample files available on the server side."""
     examples = [f.name for f in EXAMPLES_DIR.glob("*.txt")]
     return {"examples": examples}
 @app.get("/aba-examples/{filename}")
 def get_aba_example(filename: str):
     """Returns the contents of a specific ABA sample file."""
     file_path = EXAMPLES_DIR / filename
     if not file_path.exists() or not file_path.is_file():
         return {"error": "File not found"}
-    return FileResponse(file_path, media_type="text/plain", filename=filename)

 from fastapi.middleware.cors import CORSMiddleware
 import pandas as pd
 from pathlib import Path
+import torch
 from relations.tests import run_tests
+from relations.predict_bert import predict_relation
 from exemples.claims import test_cases
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
 # ABA imports
+from aba.aba_builder import prepare_aba_plus_framework, build_aba_framework_from_text
 app = FastAPI(title="Argument Mining API")
 EXAMPLES_DIR = Path("./aba/exemples")
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load model at startup once
+model_name = "edgar-demeude/bert-argument"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+model.to(device)
 @app.get("/")
 def root():
     return {"message": "Argument Mining API is running..."}
+# ---------------- BERT Prediction Endpoints ---------------- #
 @app.post("/predict-test")
 def predict_test():
+    """Run predefined test cases for BERT model validation."""
+    run_tests(model, tokenizer, device, test_cases)
     return {"message": "Test cases executed. Check server logs for details."}
 @app.post("/predict-text")
 def predict_text(arg1: str = Form(...), arg2: str = Form(...)):
+    """Predict relation between two text arguments using BERT."""
+    result = predict_relation(arg1, arg2, model, tokenizer, device)
+    return {
+        "arg1": arg1,
+        "arg2": arg2,
+        "relation": result
+    }
 @app.post("/predict-csv")
 async def predict_csv(file: UploadFile):
     """Predict relations for pairs of arguments from a CSV file (max 100 rows)."""
     df = pd.read_csv(file.file)
     if len(df) > 100:
         df = df.head(100)
     results = []
     for _, row in df.iterrows():
+        result = predict_relation(
             row["parent"],
             row["child"],
             model,
+            tokenizer,
+            device
         )
         results.append({
             "parent": row["parent"],
             "child": row["child"],
+            "relation": result
         })
     return {"results": results, "note": "Limited to 100 rows max"}
     }
     return results
 @app.get("/aba-examples")
 def list_aba_examples():
     """Lists all sample files available on the server side."""
     examples = [f.name for f in EXAMPLES_DIR.glob("*.txt")]
     return {"examples": examples}
 @app.get("/aba-examples/{filename}")
 def get_aba_example(filename: str):
     """Returns the contents of a specific ABA sample file."""
     file_path = EXAMPLES_DIR / filename
     if not file_path.exists() or not file_path.is_file():
         return {"error": "File not found"}
+    return FileResponse(file_path, media_type="text/plain", filename=filename)

hugging_hub.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# File to automatically push the model to hugging face hub
+from huggingface_hub import upload_folder
+upload_folder(
+    repo_id="edgar-demeude/bert-argument",
+    folder_path="./models/bert-argument",
+    repo_type="model"
+)

models/model.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4a8a1413586023e10aadde9a86e01b6425814605767924e818f1e189926ad86b
-size 9667219

relations/predict.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import torch
 from .embeddings import generate_embeddings
-def predict_relation(arg1, arg2, model, embedding_model, processor, best_threshold, label_encoder, model_type="pytorch"):
     embeddings = generate_embeddings(arg1, arg2, embedding_model, processor)
     if model_type == "pytorch":

 import torch
 from .embeddings import generate_embeddings
+def predict_relation_old(arg1, arg2, model, embedding_model, processor, best_threshold, label_encoder, model_type="pytorch"):
     embeddings = generate_embeddings(arg1, arg2, embedding_model, processor)
     if model_type == "pytorch":

relations/predict_bert.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+def load_bert_model(model_path="../models/bert-argument", device=None):
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    model = AutoModelForSequenceClassification.from_pretrained(model_path)
+    model.to(device)
+    model.eval()
+    return model, tokenizer, device
+def predict_relation(parent_text, child_text, model, tokenizer, device, max_length=256):
+    """
+    Predicts whether the relation between parent and child is Support or Attack.
+    """
+    model.eval()
+    # Tokenization
+    encoding = tokenizer(
+        parent_text,
+        child_text,
+        add_special_tokens=True,
+        max_length=max_length,
+        padding='max_length',
+        truncation='only_second',
+        return_attention_mask=True,
+        return_tensors='pt'
+    )
+    input_ids = encoding['input_ids'].to(device)
+    attention_mask = encoding['attention_mask'].to(device)
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+        probs = torch.softmax(logits, dim=1)
+        pred = torch.argmax(probs, dim=1).item()
+        confidence = probs[0][pred].item()
+    relation = "Support" if pred == 1 else "Attack"
+    return {
+        "predicted_label": relation,
+        "probability": confidence,
+        "confidence": confidence
+    }

relations/tests.py CHANGED Viewed

@@ -1,12 +1,11 @@
 import time
-from .predict import predict_relation
-def print_pretty_prediction(result: dict, test_case_num: int, expected: str, claim1: str, claim2: str, best_threshold: float):
     """Pretty-print the result of a single test case."""
     prediction = result["predicted_label"]
     confidence = result["confidence"]
     probability = result["probability"]
     status = "✅ Correct" if prediction == expected else "❌ Incorrect"
     print(f"\n{'='*70}")
@@ -16,16 +15,13 @@ def print_pretty_prediction(result: dict, test_case_num: int, expected: str, cla
     print(f"Claim 2: {claim2}")
     print(f"\nExpected:   {expected}")
     print(f"Predicted:  {prediction}")
-    print(f"Probability: {probability:.4f} (threshold: {best_threshold:.3f})")
     print(f"Confidence: {confidence:.2%}")
     print(f"Status:     {status}")
-def run_tests(model, embedding_model, processor, best_threshold, label_encoder, model_type, test_cases):
-    """
-    Run a list of test cases and display results.
-    Each test case must be a dict with keys: 'claim1', 'claim2', 'expected'
-    """
     print("\n" + "="*70)
     print("RUNNING TEST CASES")
     print("="*70)
@@ -38,28 +34,17 @@ def run_tests(model, embedding_model, processor, best_threshold, label_encoder,
             case["claim1"],
             case["claim2"],
             model,
-            embedding_model,
-            processor,
-            best_threshold,
-            label_encoder,
-            model_type,
-        )
-        print_pretty_prediction(
-            result,
-            test_case_num=i,
-            expected=case["expected"],
-            claim1=case["claim1"],
-            claim2=case["claim2"],
-            best_threshold=best_threshold
         )
         if result["predicted_label"] == case["expected"]:
             correct_predictions += 1
-    # Final summary
     accuracy = (correct_predictions / len(test_cases)) * 100
     elapsed_time = time.time() - start_time
     print(f"\n{'='*70}")
     print("SUMMARY")
     print(f"{'='*70}")

 import time
+from .predict_bert import predict_relation
+def print_pretty_prediction(result: dict, test_case_num: int, expected: str, claim1: str, claim2: str):
     """Pretty-print the result of a single test case."""
     prediction = result["predicted_label"]
     confidence = result["confidence"]
     probability = result["probability"]
     status = "✅ Correct" if prediction == expected else "❌ Incorrect"
     print(f"\n{'='*70}")
     print(f"Claim 2: {claim2}")
     print(f"\nExpected:   {expected}")
     print(f"Predicted:  {prediction}")
+    print(f"Probability: {probability:.4f}")
     print(f"Confidence: {confidence:.2%}")
     print(f"Status:     {status}")
+def run_tests(model, tokenizer, device, test_cases):
+    """Run test cases using the BERT model."""
     print("\n" + "="*70)
     print("RUNNING TEST CASES")
     print("="*70)
             case["claim1"],
             case["claim2"],
             model,
+            tokenizer,
+            device
         )
+        print_pretty_prediction(result, i, case["expected"], case["claim1"], case["claim2"])
         if result["predicted_label"] == case["expected"]:
             correct_predictions += 1
+    # Summary
     accuracy = (correct_predictions / len(test_cases)) * 100
     elapsed_time = time.time() - start_time
     print(f"\n{'='*70}")
     print("SUMMARY")
     print(f"{'='*70}")