Spaces:
Sleeping
Sleeping
File size: 1,211 Bytes
bcb314a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# src/on_topic.py
from __future__ import annotations
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import joblib
# Project root: the parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# CSV feature table that main() reads for training.
DATA = ROOT.joinpath("data", "processed", "features_with_semantics_q4.csv")
# File that main() writes the fitted model bundle to.
MODEL_PATH = ROOT.joinpath("models", "on_topic.pkl")
# Columns of the feature table used as classifier inputs, in model order.
FEATURES = [
    "semantic_sim",
    "ans_len_words",
    "ans_ttr",
    "ans_avg_sent_len",
]
def main() -> None:
    """Train the binary on-topic classifier and save it to ``MODEL_PATH``.

    Reads the feature table at ``DATA``, labels a row as on-topic (1) when
    its ``score`` is greater than 0 and off-topic (0) otherwise, fits a
    logistic regression on the ``FEATURES`` columns, prints the ROC AUC
    measured on a stratified 20% holdout, and dumps
    ``{"model": clf, "features": FEATURES}`` to ``MODEL_PATH`` with joblib.

    Raises:
        KeyError: if ``score`` or any of ``FEATURES`` is absent from the CSV.
        ValueError: if the labelled rows do not contain both classes.
    """
    df = pd.read_csv(DATA, encoding="utf-8-sig")

    # Fail early with a message that names the file and the missing columns.
    missing = [col for col in ["score", *FEATURES] if col not in df.columns]
    if missing:
        raise KeyError(f"{DATA} is missing required columns: {missing}")

    # NaN > 0 evaluates to False, so rows without a score would otherwise be
    # silently trained on as off-topic examples; skip them instead.
    has_score = df["score"].notna()
    if not has_score.all():
        print(f"Skipping {int((~has_score).sum())} rows with missing score")
        df = df.loc[has_score]

    # Binary target: score > 0 counts as on-topic.
    y = (df["score"] > 0).astype(int).values
    # Missing feature values are imputed with 0.
    X = df[FEATURES].fillna(0).values

    # Both the stratified split and the AUC need positive and negative rows.
    if np.unique(y).size < 2:
        raise ValueError(
            f"{DATA} must contain both on-topic (score > 0) and off-topic rows"
        )

    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # n_jobs=None is the estimator's default, so it is not passed explicitly.
    clf = LogisticRegression(max_iter=200)
    clf.fit(Xtr, ytr)

    # Column 1 of predict_proba is the probability of the positive class.
    p = clf.predict_proba(Xte)[:, 1]
    print(f"AUC on holdout: {roc_auc_score(yte, p):.3f}")

    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    # The feature list is stored next to the model so that loaders can build
    # the input columns in the same order.
    joblib.dump({"model": clf, "features": FEATURES}, MODEL_PATH)
    print(f"✅ on_topic model saved: {MODEL_PATH}")
# Train and save the model only when this file is executed as a script;
# importing the module must not trigger training.
if __name__ == "__main__":
    main()
|