lucas066001
feat(libs.langage_detection): Adding models saving and unittest
07f841f
import csv
import joblib
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import travel_resolver.libs.nlp.langage_detection.variables as var
def read_data():
    """
    Retrieve and format the training data from the per-language CSV files.

    For every language in ``var.CORRESP_LANG`` the file
    ``../data/langage_detection/trainset/<lang>_trainset.csv`` is read
    (the path is relative to the current working directory). The first row
    of each file is treated as a header and skipped; every remaining row is
    converted to a ``float64`` vector and labelled with the 1-based position
    of its language in the iteration order of ``var.CORRESP_LANG``.

    Returns:
        The ``x_train, x_test, y_train, y_test`` split produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=5``.

    Raises:
        OSError: if one of the trainset files cannot be opened.
        ValueError: if a data row contains a value that cannot be converted
            to ``float64``.
    """
    x, y = [], []
    # Labels start at 1 and follow the iteration order of var.CORRESP_LANG.
    for label, lang in enumerate(var.CORRESP_LANG, start=1):
        current_file = "../data/langage_detection/trainset/"
        current_file += lang + "_trainset.csv"
        # newline="" lets the csv module handle line endings itself, as its
        # documentation recommends for files passed to csv.reader.
        with open(current_file, "r", newline="") as csv_file:
            csv_reader = csv.reader(csv_file)
            # Skip the header row; the default keeps an empty file harmless.
            next(csv_reader, None)
            for row in csv_reader:
                x.append(np.array(row, dtype=np.float64))
                y.append(label)
    return train_test_split(np.array(x), y, test_size=0.2, random_state=5)
def train():
    """
    Fit a language classifier, report its accuracy and save it to disk.

    The data split comes from ``read_data()``. An ``SGDClassifier`` with
    default settings is fitted on the training part and scored on the test
    part; the accuracy is printed as a percentage. The fitted model is then
    dumped with ``joblib`` under ``../models/langage_detection/`` in a file
    whose name embeds the accuracy rounded to three decimals.
    """
    features_train, features_test, labels_train, labels_test = read_data()

    classifier = SGDClassifier()
    classifier.fit(features_train, labels_train)

    predictions = classifier.predict(features_test)
    accuracy = accuracy_score(y_true=labels_test, y_pred=predictions)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # The decimal point is replaced by "_" so the score can sit inside the
    # file name without looking like an extra extension.
    accuracy_tag = str(round(accuracy, 3)).replace(".", "_")
    model_path = "../models/langage_detection/model_" + accuracy_tag + ".sav"
    joblib.dump(classifier, model_path)
def main():
    """
    Script entry point: run the training routine.
    """
    train()


# Run the training only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()