lucas066001
style: Formatting files with Black formatter
90cfe35
from collections import Counter
import csv
from typing import List
import travel_resolver.libs.nlp.langage_detection.variables as var
def extract_data_from_csv(f_in: str, f_out: str):
    """
    Convert a csv file of strings into a csv file of character
    frequency features.

    The cells of each input row are concatenated and lower-cased, the
    resulting text is analysed with extract_data_from_string, and the
    returned list is written as one output row. Rows that contain no text
    (e.g. blank lines) are skipped.

    Args:
        f_in (str): File path to analyse, must contain extension.
        f_out (str): File path containing result, must contain extension.
    """
    # newline="" is what the csv module expects for both reading and writing;
    # the explicit encoding keeps the decoding of accented characters from
    # depending on the platform's locale default.
    # NOTE(review): assumes the prompt files are UTF-8 encoded - confirm.
    with open(f_in, "r", newline="", encoding="utf-8") as csv_file, open(
        f_out, "w", newline="", encoding="utf-8"
    ) as output_csv:
        csv_reader = csv.reader(csv_file)
        csv_writer = csv.writer(output_csv)
        for row in csv_reader:
            # Named "text" rather than "str" so the builtin is not shadowed.
            text = "".join(row).lower()
            if not text:
                # An empty string has no frequencies to compute.
                continue
            csv_writer.writerow(extract_data_from_string(text))
def extract_data_from_string(str_in: str) -> List:
    """
    Build the feature list of a string: the letter frequencies followed by
    the special character frequencies.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): Output of frequence_letters followed by the output of
            frequence_char_part.
    """
    features: List = []
    # Order matters: alphabetical frequencies first, special chars second.
    for extractor in (frequence_letters, frequence_char_part):
        features.extend(extractor(str_in))
    return features
def frequence_letters(str_in: str) -> List:
    """
    Retrieve the frequency of each letter from 'a' to 'z' in a given string.

    The string is lower-cased before counting. Each value is the share of
    the string's characters equal to that letter, expressed as a percentage
    rounded to 2 decimals.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): 26 percentages ordered from 'a' to 'z'. All zeros when the
            string is empty.
    """
    total = len(str_in)
    if total == 0:
        # Nothing to count; also avoids a division by zero.
        return [0.0] * 26

    counter = Counter(str_in.lower())
    # Divide by the total number of characters, not by len(counter) (the
    # number of *distinct* characters), so the values are real percentages
    # and use the same denominator as frequence_char_part.
    return [
        round(counter[chr(code)] / total * 100, 2)
        for code in range(ord("a"), ord("z") + 1)
    ]
def frequence_char_part(str_in: str) -> List:
    """
    Retrieve the frequency of each special character in a given string.

    The string is lower-cased before counting. One value is produced per
    entry of var.SPECIAL_CHARS, in that order: the number of occurrences
    divided by the length of the string, as a percentage rounded to
    2 decimals.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): One percentage per entry of var.SPECIAL_CHARS. All zeros
            when the string is empty.
    """
    total = len(str_in)
    if total == 0:
        # Nothing to count; also avoids a division by zero.
        return [0.0 for _ in var.SPECIAL_CHARS]

    counter = Counter(str_in.lower())
    return [round(counter.get(char, 0) / total * 100, 2) for char in var.SPECIAL_CHARS]
def main():
    """
    Generate the trainset csv of every language in var.TRAD_TARGETS from
    the matching prompts csv.

    Both paths are relative, so they are resolved against the current
    working directory.
    """
    prompts_dir = "../../assets/data/prompts/csv/"
    trainset_dir = "../../assets/data/trainset/"
    for lang in var.TRAD_TARGETS:
        extract_data_from_csv(
            prompts_dir + lang + "_prompts.csv",
            trainset_dir + lang + "_trainset.csv",
        )


# Only run the generation when executed as a script, not on import.
if __name__ == "__main__":
    main()