# Source: backend/chatbot/training/train_intent.py
# (uploaded by davidtran999 via huggingface_hub, revision 980fef7)
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
ROOT_DIR = Path(__file__).resolve().parents[2]
if str(ROOT_DIR) not in sys.path:
sys.path.insert(0, str(ROOT_DIR))
BASE_DIR = Path(__file__).resolve().parent
DEFAULT_DATASET = BASE_DIR / "intent_dataset.json"
GENERATED_QA_DIR = BASE_DIR / "generated_qa"
ARTIFACT_DIR = BASE_DIR / "artifacts"
LOG_DIR = ROOT_DIR / "logs" / "intent"
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)
def load_dataset(path: Path):
payload = json.loads(path.read_text(encoding="utf-8"))
texts = []
labels = []
for intent in payload.get("intents", []):
name = intent["name"]
for example in intent.get("examples", []):
texts.append(example)
labels.append(name)
return texts, labels, payload
def load_generated_qa(directory: Path):
"""
Load generated QA questions as additional intent training samples.
Each JSON file is expected to contain a list of objects compatible
with `QAItem` from `generated_qa`, at minimum having:
- question: str
- intent: str
"""
texts: list[str] = []
labels: list[str] = []
if not directory.exists():
return texts, labels
for path in sorted(directory.glob("*.json")):
try:
payload = json.loads(path.read_text(encoding="utf-8"))
except Exception:
# Skip malformed files but continue loading others
continue
if not isinstance(payload, list):
continue
for item in payload:
if not isinstance(item, dict):
continue
question = str(item.get("question") or "").strip()
intent = str(item.get("intent") or "").strip() or "search_legal"
if not question:
continue
texts.append(question)
labels.append(intent)
return texts, labels
def load_combined_dataset(path: Path, generated_dir: Path):
"""
Load seed intent dataset and merge with generated QA questions.
"""
texts, labels, meta = load_dataset(path)
gen_texts, gen_labels = load_generated_qa(generated_dir)
texts.extend(gen_texts)
labels.extend(gen_labels)
return texts, labels, meta
def build_pipelines():
vectorizer = TfidfVectorizer(
analyzer="word",
ngram_range=(1, 2),
lowercase=True,
token_pattern=r"\b\w+\b",
)
nb_pipeline = Pipeline([
("tfidf", vectorizer),
("clf", MultinomialNB()),
])
logreg_pipeline = Pipeline([
("tfidf", vectorizer),
("clf", LogisticRegression(max_iter=1000, solver="lbfgs")),
])
return {
"multinomial_nb": nb_pipeline,
"logistic_regression": logreg_pipeline,
}
def train(dataset_path: Path, test_size: float = 0.2, random_state: int = 42):
texts, labels, meta = load_combined_dataset(dataset_path, GENERATED_QA_DIR)
if not texts:
raise ValueError("Dataset rỗng, không thể huấn luyện")
X_train, X_test, y_train, y_test = train_test_split(
texts, labels, test_size=test_size, random_state=random_state, stratify=labels
)
pipelines = build_pipelines()
best_model = None
best_metrics = None
for name, pipeline in pipelines.items():
start = time.perf_counter()
pipeline.fit(X_train, y_train)
train_duration = time.perf_counter() - start
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)
cm = confusion_matrix(y_test, y_pred, labels=sorted(set(labels)))
metrics = {
"model": name,
"accuracy": acc,
"train_duration_sec": train_duration,
"classification_report": report,
"confusion_matrix": cm.tolist(),
"labels": sorted(set(labels)),
"dataset_version": meta.get("version"),
"timestamp": datetime.utcnow().isoformat() + "Z",
"test_size": test_size,
"samples": len(texts),
}
if best_model is None or acc > best_metrics["accuracy"]:
best_model = pipeline
best_metrics = metrics
assert best_model is not None
model_path = ARTIFACT_DIR / "intent_model.joblib"
metrics_path = ARTIFACT_DIR / "metrics.json"
joblib.dump(best_model, model_path)
metrics_path.write_text(json.dumps(best_metrics, ensure_ascii=False, indent=2), encoding="utf-8")
log_entry = {
"event": "train_intent",
"model": best_metrics["model"],
"accuracy": best_metrics["accuracy"],
"timestamp": best_metrics["timestamp"],
"samples": best_metrics["samples"],
"dataset_version": best_metrics["dataset_version"],
"artifact": str(model_path.relative_to(ROOT_DIR)),
}
log_file = LOG_DIR / "train.log"
with log_file.open("a", encoding="utf-8") as fh:
fh.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
return model_path, metrics_path, best_metrics
def parse_args():
parser = argparse.ArgumentParser(description="Huấn luyện model intent cho chatbot")
parser.add_argument("--dataset", type=Path, default=DEFAULT_DATASET, help="Đường dẫn tới intent_dataset.json")
parser.add_argument("--test-size", type=float, default=0.2, help="Tỉ lệ dữ liệu test")
parser.add_argument("--seed", type=int, default=42, help="Giá trị random seed")
return parser.parse_args()
def main():
args = parse_args()
model_path, metrics_path, metrics = train(args.dataset, test_size=args.test_size, random_state=args.seed)
print("Huấn luyện hoàn tất:")
print(f" Model: {metrics['model']}")
print(f" Accuracy: {metrics['accuracy']:.4f}")
print(f" Model artifact: {model_path}")
print(f" Metrics: {metrics_path}")
if __name__ == "__main__":
main()