| import csv | |
| import joblib | |
| import numpy as np | |
| from sklearn.metrics import accuracy_score | |
| from sklearn.linear_model import SGDClassifier | |
| from sklearn.model_selection import train_test_split | |
| import travel_resolver.libs.nlp.langage_detection.variables as var | |
def read_data():
    """
    Load the per-language training sets and split them into train/test parts.

    For each ``lang`` obtained by iterating ``var.CORRESP_LANG``, the file
    ``../data/langage_detection/trainset/<lang>_trainset.csv`` is read (the
    path is relative to the current working directory). The first row of each
    file is skipped (presumably a header row), and every remaining non-empty
    row is converted to a ``float64`` vector. Samples are labelled with the
    1-based position of their language in the iteration order of
    ``var.CORRESP_LANG``.

    Returns:
        The value returned by ``train_test_split`` with ``test_size=0.2`` and
        ``random_state=5``, i.e. ``x_train, x_test, y_train, y_test``.

    Raises:
        OSError: if one of the training files cannot be opened.
        ValueError: if a cell cannot be converted to ``float64``.
    """
    features, labels = [], []
    for label, lang in enumerate(var.CORRESP_LANG, start=1):
        csv_path = "../data/langage_detection/trainset/" + lang + "_trainset.csv"
        # newline="" lets the csv module do its own newline handling, as its
        # documentation requires for file objects passed to csv.reader.
        with open(csv_path, "r", newline="") as csv_file:
            reader = csv.reader(csv_file)
            # Discard the first row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; a zero-length
                    # vector would make the feature matrix ragged.
                    continue
                features.append(np.array(row, dtype=np.float64))
                labels.append(label)
    return train_test_split(
        np.array(features), labels, test_size=0.2, random_state=5
    )
def train():
    """
    Fit a classifier on the training split and save it to disk.

    The data comes from ``read_data()``. An ``SGDClassifier`` with default
    settings is fitted on the training part, its accuracy on the test part is
    printed as a percentage, and the fitted model is dumped with ``joblib``
    to ``../models/langage_detection/model_<accuracy>.sav``, where
    ``<accuracy>`` is the accuracy rounded to 3 decimals with the ``.``
    replaced by ``_``.
    """
    features_train, features_test, labels_train, labels_test = read_data()

    classifier = SGDClassifier()
    classifier.fit(features_train, labels_train)

    predictions = classifier.predict(features_test)
    score = accuracy_score(y_true=labels_test, y_pred=predictions)
    print("Accuracy: {:.2f}%".format(score * 100))

    # Encode the score in the file name; dots are replaced so the only "."
    # left in the name is the one before the extension.
    score_tag = str(round(score, 3)).replace(".", "_")
    destination = "../models/langage_detection/model_" + score_tag + ".sav"
    joblib.dump(classifier, destination)
def main():
    """
    Script entry point: run a full training pass via ``train()``.
    """
    train()
# Run the training only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()