File size: 2,495 Bytes
0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 90cfe35 0d41747 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
from collections import Counter
import csv
from typing import List
import travel_resolver.libs.nlp.langage_detection.variables as var
def extract_data_from_csv(f_in: str, f_out: str):
    """
    Convert a csv file of strings into a csv file of character frequencies.

    Each input row is flattened by joining its cells without separator,
    lower-cased and handed to ``extract_data_from_string``; the list it
    returns is written as one row of the output file.

    Args:
        f_in (str): File path to analyse, must contain extension.
        f_out (str): File path containing result, must contain extension.
            Opened in "w" mode, so an existing file is overwritten.
    """
    # newline="" leaves newline handling to the csv module on both sides
    # (needed for quoted fields with embedded line breaks). The explicit
    # encoding keeps accented characters from being decoded differently
    # depending on the platform locale.
    with open(f_in, "r", newline="", encoding="utf-8") as csv_file, open(
        f_out, "w", newline="", encoding="utf-8"
    ) as output_csv:
        csv_writer = csv.writer(output_csv)
        for row in csv.reader(csv_file):
            # Named `text` rather than `str` so the builtin is not shadowed.
            text = "".join(row).lower()
            csv_writer.writerow(extract_data_from_string(text))
def extract_data_from_string(str_in: str) -> List:
    """
    Build the full frequency list of a string.

    The result is the output of ``frequence_letters`` followed by the
    output of ``frequence_char_part``, both computed on the same string.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): Letter frequencies first, then special char frequencies.
    """
    letter_part = frequence_letters(str_in)
    special_part = frequence_char_part(str_in)
    return [*letter_part, *special_part]
def frequence_letters(str_in: str) -> List:
    """
    Compute the percentage of each letter from 'a' to 'z' in a string.

    The string is lower-cased first, so upper and lower case letters are
    counted together. Each percentage is relative to the total number of
    characters of the lower-cased string (letters or not) and is rounded
    to two decimals.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): 26 percentages, index 0 for 'a' up to index 25 for 'z'.
            All zeros when the string is empty.
    """
    text = str_in.lower()
    total = len(text)
    if total == 0:
        # Nothing to count; also avoids dividing by zero on empty rows.
        return [0.0] * 26
    counter = Counter(text)
    # Divide by the number of characters, not by len(counter) (the number
    # of DISTINCT characters), so each value is a real share of the string
    # and never exceeds 100.
    # NOTE(review): data generated with the previous denominator is not
    # comparable with these values; regenerate it before reuse.
    return [round(counter[chr(code)] / total * 100, 2) for code in range(97, 123)]
def frequence_char_part(str_in: str) -> List:
    """
    Compute the percentage of each special char in a string.

    The chars looked up are the entries of ``var.SPECIAL_CHARS``. The
    string is lower-cased before counting, and each percentage is relative
    to the total number of characters of the lower-cased string, rounded
    to two decimals.

    Args:
        str_in (str): String to analyse.
    Returns:
        (List): One percentage per entry of ``var.SPECIAL_CHARS``, in the
            same order. All zeros when the string is empty.
    """
    text = str_in.lower()
    total = len(text)
    if total == 0:
        # Nothing to count; also avoids dividing by zero on empty rows.
        return [0.0 for _ in var.SPECIAL_CHARS]
    # Count and measure the same (lower-cased) text so the ratio is coherent.
    # NOTE(review): an upper-case entry in SPECIAL_CHARS could never match
    # the lower-cased text -- confirm the constant holds none.
    counter = Counter(text)
    return [round(counter[char] / total * 100, 2) for char in var.SPECIAL_CHARS]
def main():
    """
    Generate one trainset csv per entry of ``var.TRAD_TARGETS``.

    For each entry, the matching prompts csv is read and the corresponding
    trainset csv is written through ``extract_data_from_csv``. Both paths
    are relative, so they resolve against the current working directory.
    """
    prompts_dir = "../../assets/data/prompts/csv/"
    trainset_dir = "../../assets/data/trainset/"
    for lang in var.TRAD_TARGETS:
        source_path = prompts_dir + lang + "_prompts.csv"
        target_path = trainset_dir + lang + "_trainset.csv"
        extract_data_from_csv(source_path, target_path)
# Run the trainset generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|