Spaces:
Sleeping
Sleeping
| # src/on_topic.py | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import roc_auc_score | |
| import joblib | |
# Project root: the directory one level above the folder containing this file.
ROOT = Path(__file__).resolve().parents[1]

# Input feature table read by the training script.
DATA = ROOT.joinpath("data", "processed", "features_with_semantics_q4.csv")

# Destination of the serialized model bundle.
MODEL_PATH = ROOT.joinpath("models", "on_topic.pkl")

# Columns of the feature table used as model inputs; the order is stored
# alongside the model when it is saved.
FEATURES = [
    "semantic_sim",
    "ans_len_words",
    "ans_ttr",
    "ans_avg_sent_len",
]
def main():
    """Train and persist the binary on-topic classifier.

    Reads the feature table at ``DATA``, labels a row as on-topic (1) when
    its ``score`` is greater than zero, fits a logistic regression on the
    ``FEATURES`` columns, prints the ROC AUC measured on a stratified 20%
    hold-out split, and writes ``{"model": ..., "features": ...}`` to
    ``MODEL_PATH`` with joblib.

    Raises:
        KeyError: If ``score`` or any of the ``FEATURES`` columns is
            missing from the CSV.
    """
    df = pd.read_csv(DATA, encoding="utf-8-sig")

    # Fail early with one readable message instead of an opaque pandas
    # KeyError raised at the first column access further down.
    required = ["score", *FEATURES]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"{DATA} is missing required column(s): {missing}")

    # Binary target: any score > 0 counts as "on topic".
    # NOTE(review): a missing (NaN) score compares as False here and is
    # therefore labelled off-topic -- confirm this is intended.
    y = (df["score"] > 0).astype(int).values
    # Missing feature values are replaced with 0 before training.
    X = df[FEATURES].fillna(0).values

    # Fixed seed and stratification keep the hold-out split reproducible
    # and preserve the class balance in both parts.
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # n_jobs=None is the documented default, so it is not passed explicitly.
    clf = LogisticRegression(max_iter=200)
    clf.fit(Xtr, ytr)

    # Column 1 of predict_proba is the probability of the second class in
    # clf.classes_, i.e. label 1 ("on topic").
    p = clf.predict_proba(Xte)[:, 1]
    print(f"AUC on holdout: {roc_auc_score(yte, p):.3f}")

    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    # The feature list is saved with the model so consumers can build the
    # input matrix with the same columns in the same order.
    joblib.dump({"model": clf, "features": FEATURES}, MODEL_PATH)
    print(f"✅ on_topic model saved: {MODEL_PATH}")


if __name__ == "__main__":
    main()