| from collections import Counter | |
| import csv | |
| from typing import List | |
| import travel_resolver.libs.nlp.langage_detection.variables as var | |
| def extract_data_from_csv(f_in: str, f_out: str): | |
| """ | |
| Take a csv file containing strings and convert it | |
| into a csv file containig letter frequencies infos. | |
| Args: | |
| f_in (str): File path to analyse, must contain extension. | |
| f_out (str): File path containing result, must contain extension. | |
| """ | |
| with open(f_in, "r") as csv_file: | |
| csv_reader = csv.reader(csv_file) | |
| with open(f_out, "w", newline="") as output_csv: | |
| csv_writer = csv.writer(output_csv) | |
| for row in csv_reader: | |
| str = "".join(row).lower() | |
| modified_row = extract_data_from_string(str) | |
| csv_writer.writerow(modified_row) | |
| def extract_data_from_string(str_in: str) -> List: | |
| """ | |
| Retreive tab containing letter frequency informations | |
| and special char frequency of a given string. | |
| Args: | |
| str_in (str): String to analyse. | |
| Returns: | |
| (List): Tab containing special char and alphabetical frequencies. | |
| """ | |
| str_data = [] | |
| str_data = str_data + frequence_letters(str_in) | |
| str_data = str_data + frequence_char_part(str_in) | |
| return str_data | |
| def frequence_letters(str_in: str) -> List: | |
| """ | |
| Retreive tab containing letter frequency informations | |
| of a given string. | |
| Args: | |
| str_in (str): String to analyse. | |
| Returns: | |
| (List): Tab containing alphabetical char frequencies. | |
| """ | |
| counter = Counter(str_in.lower()) | |
| freq_tab = [ | |
| round(counter.get(chr(i), 0) / len(counter) * 100, 2) for i in range(97, 123) | |
| ] | |
| return freq_tab | |
| def frequence_char_part(str_in: str) -> List: | |
| """ | |
| Retreive tab containing special char frequency | |
| informations of a given string. | |
| Args: | |
| str_in (str): String to analyse. | |
| Returns: | |
| (List): Tab containing special char char frequencies. | |
| """ | |
| counter = Counter(str_in.lower()) | |
| freq_tab = [ | |
| round(counter.get(char, 0) / len(str_in) * 100, 2) for char in var.SPECIAL_CHARS | |
| ] | |
| return freq_tab | |
| def main(): | |
| for lang in var.TRAD_TARGETS: | |
| input_file = "../../assets/data/prompts/csv/" + lang + "_prompts.csv" | |
| output_csv_file = "../../assets/data/trainset/" + lang + "_trainset.csv" | |
| extract_data_from_csv(input_file, output_csv_file) | |
| if __name__ == "__main__": | |
| main() | |