File size: 1,486 Bytes
0d41747 90cfe35 0d41747 07f841f 90cfe35 0d41747 90cfe35 0d41747 07f841f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import csv
import joblib
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
import travel_resolver.libs.nlp.langage_detection.variables as var
def read_data():
    """
    Retrieve and format data from the csv input files.

    One file named ``<lang>_trainset.csv`` is read for every entry of
    ``var.CORRESP_LANG`` from ``../data/langage_detection/trainset/``
    (relative to the current working directory). The first row of each
    file is skipped (presumably a header row -- confirm against the data
    files) and every remaining non-blank row is parsed as a vector of
    float64 values.

    Labels are integers starting at 1 that follow the iteration order of
    ``var.CORRESP_LANG``.

    Returns:
        The output of ``train_test_split``, unpacked by the caller as
        ``x_train, x_test, y_train, y_test``, with 20% of the samples held
        out and a fixed ``random_state`` of 5.

    Raises:
        OSError: if a trainset file cannot be opened.
        ValueError: if a field cannot be converted to a float.
    """
    x, y = [], []
    for label, lang in enumerate(var.CORRESP_LANG, start=1):
        current_file = (
            "../data/langage_detection/trainset/" + lang + "_trainset.csv"
        )
        # The csv module expects files opened with newline="" so that it
        # can handle line endings itself.
        with open(current_file, "r", newline="") as csv_file:
            csv_reader = csv.reader(csv_file)
            # Drop the first row; the default keeps an empty file from
            # raising StopIteration.
            next(csv_reader, None)
            for row in csv_reader:
                if not row:
                    # Blank lines come back as empty lists; turning them
                    # into zero-length vectors would break the stacking
                    # into a 2-D array below.
                    continue
                x.append(np.array(row, dtype=np.float64))
                y.append(label)
    return train_test_split(np.array(x), y, test_size=0.2, random_state=5)
def train():
    """
    Fit the classifier, report its accuracy and save it to disk.

    The data returned by ``read_data`` is used to fit an ``SGDClassifier``
    with default settings. Accuracy on the held-out split is printed as a
    percentage, then the fitted model is dumped with joblib under
    ``../models/langage_detection/``. The file name embeds the accuracy
    rounded to 3 decimals, with the decimal point replaced by an
    underscore.
    """
    features_train, features_test, labels_train, labels_test = read_data()

    classifier = SGDClassifier()
    classifier.fit(features_train, labels_train)

    predictions = classifier.predict(features_test)
    accuracy = accuracy_score(y_true=labels_test, y_pred=predictions)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    # A "." inside the score would read like a file extension separator,
    # so it is swapped for "_" before being put in the file name.
    score_tag = str(round(accuracy, 3)).replace(".", "_")
    backup_path = "../models/langage_detection/model_" + score_tag + ".sav"
    joblib.dump(classifier, backup_path)
def main():
    """
    Script entry point: run the training routine.
    """
    train()
# Only start training when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|