import re

import nltk
import tensorflow as tf
from tqdm import tqdm

# Download the NLTK resources on first use; skip if they are already available locally.
try:
    stopwords = nltk.corpus.stopwords.words("french")
except LookupError:
    nltk.download("punkt_tab")  # tokenizer models used by nltk.word_tokenize
    nltk.download("stopwords")
    stopwords = nltk.corpus.stopwords.words("french")


def get_tagged_content(sentence: str, tag: str) -> str | None:
    """
    Extract the content enclosed between two occurrences of a tag in a sentence.

    Args:
        sentence (str): The sentence to extract the content from.
        tag (str): The tag that delimits the content.

    Returns:
        str | None: The content between the tags, or None if the tag is not found.

    Raises:
        ValueError: If tag is not a non-empty string.

    Example:
        >>> get_tagged_content("Je voudrais voyager de <Dep>Nice<Dep> à <Arr>Clermont Ferrand<Arr>.", "<Dep>")
        "Nice"
    """
    if not tag or not isinstance(tag, str):
        raise ValueError("tag must be a non-empty string")
    # re.escape makes the search robust to tags that contain regex metacharacters
    tag_match = re.search(rf"{re.escape(tag)}(.*?){re.escape(tag)}", sentence)
    if tag_match:
        return tag_match.group(1)
    return None


def process_sentence(
    sentence: str,
    rm_stopwords: bool = False,
    stemming: bool = False,
    return_tokens: bool = False,
    labels_to_adapt: list[int | str] | None = None,
    stopwords_to_keep: list[str] = [],
) -> str | list[str] | tuple:
    """
    Apply the selected processing steps to a sentence and return the processed sentence.

    **Note**: We stem the tokens instead of lemmatizing them because stemming is faster,
    and in our case we want to produce a response as quickly as possible.

    Args:
        sentence (str): The sentence to process.
        rm_stopwords (bool): Whether to remove stopwords.
        stemming (bool): Whether to stem the tokens.
        return_tokens (bool): Whether to return a list of tokens instead of a string.
        labels_to_adapt (list[int | str] | None): Per-token labels to keep aligned with the processed tokens.
        stopwords_to_keep (list[str]): Stopwords that should not be removed even when rm_stopwords is True.

    Returns:
        str | list[str] | tuple: The processed sentence (or its tokens), or a tuple of the
        processed sentence and the labels adapted to what is left of the sentence.
    """
    tokenized_sentence = nltk.word_tokenize(sentence)
    stemmer = nltk.stem.snowball.FrenchStemmer()
    return_labels = bool(labels_to_adapt)
    labels_to_adapt = (
        [0] * len(tokenized_sentence) if not labels_to_adapt else labels_to_adapt
    )  # default labels
    labels = []
    processed_sentence = ""
    for token, label in zip(tokenized_sentence, labels_to_adapt):
        # Skip stopwords, except those explicitly kept
        if token in stopwords and rm_stopwords and token not in stopwords_to_keep:
            continue
        token = token if not stemming else stemmer.stem(token)
        processed_sentence += token + " "
        labels.append(label)
    processed_sentence = processed_sentence.strip()
    processed_sentence = (
        processed_sentence if not return_tokens else processed_sentence.split(" ")
    )
    return processed_sentence if not return_labels else (processed_sentence, labels)
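

# Illustrative sketch (not part of the original module; the sentence and label
# values are made up): shows how process_sentence keeps the label vector aligned
# with the tokens that survive stopword removal. Exact stems depend on the
# installed NLTK FrenchStemmer, so the printed output is indicative only.
def _example_process_sentence() -> None:
    sentence = "Je voudrais voyager de Nice à Clermont Ferrand"
    # One label per token produced by nltk.word_tokenize(sentence)
    labels = [0, 0, 0, 0, 1, 0, 2, 2]
    tokens, kept_labels = process_sentence(
        sentence,
        rm_stopwords=True,
        stemming=True,
        return_tokens=True,
        labels_to_adapt=labels,
        stopwords_to_keep=["de"],
    )
    print(tokens)       # stemmed tokens, stopwords removed (except "de")
    print(kept_labels)  # labels of the tokens that were kept, in the same order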


def convert_tagged_sentence_to_bio(
    sentence: str, tag_entities_pairs: list[tuple[str, str]]
) -> str:
    """
    Convert a sentence with tags to BIO format.

    Args:
        sentence (str): The sentence to convert to BIO format.
        tag_entities_pairs (list[tuple[str, str]]): The (tag, entity) pairs to convert to BIO format.

    Returns:
        str: The sentence in BIO format.

    Example:
        >>> convert_tagged_sentence_to_bio("Je voudrais voyager de <Dep>Nice<Dep> à <Arr>Clermont Ferrand<Arr>.", [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")])
        Je O
        voudrais O
        voyager O
        de O
        Nice B-LOC-DEP
        à O
        Clermont B-LOC-ARR
        Ferrand I-LOC-ARR
        . O
    """
    bare_sentence = sentence
    tags = [pair[0] for pair in tag_entities_pairs]
    entities = [pair[1] for pair in tag_entities_pairs]
    for tag in tags:
        bare_sentence = bare_sentence.replace(tag, "")
    # All possible extended (B-/I-) entity labels
    ext_entities = []
    for entity in entities:
        ext_entities.extend(["B-" + entity, "I-" + entity])
    # Replace each tagged span with its B-/I- labels, one label per token of the span
    for tag, entity in tag_entities_pairs:
        while re.search(f"{re.escape(tag)}(.*?){re.escape(tag)}", sentence):
            match = re.search(f"{re.escape(tag)}(.*?){re.escape(tag)}", sentence)
            temp_entities = [entity] * len(nltk.word_tokenize(match.group(1)))
            temp_entities[0] = "B-" + entity
            for i in range(1, len(temp_entities)):
                temp_entities[i] = "I-" + entity
            sentence = (
                sentence[: match.start()]
                + " ".join(temp_entities)
                + sentence[match.end() :]
            )
    tokens = nltk.word_tokenize(sentence)
    bare_sentence_tokens = nltk.word_tokenize(bare_sentence)
    tokenized_entities = [
        "O" if token not in ext_entities else token for token in tokens
    ]
    bio_format = [
        " ".join([token, entity])
        for token, entity in zip(bare_sentence_tokens, tokenized_entities)
    ]
    return "\n".join(bio_format)


def from_tagged_file_to_bio_file(
    input_file: str, output_file: str, tag_entities_pairs: list[tuple[str, str]]
) -> None:
    """
    Read the input file, convert its content to BIO format, and write the result to the output file.

    Args:
        input_file (str): The path to the input file (one tagged sentence per line).
        output_file (str): The path to the output file.
        tag_entities_pairs (list[tuple[str, str]]): The (tag, entity) pairs to convert to BIO format.
    """
    with open(input_file, "r") as file:
        content = file.read()
    with open(output_file, "w") as file:
        sentences = content.split("\n")
        for sentence in tqdm(sentences):
            # Skip empty lines
            if not sentence:
                continue
            bio_format = convert_tagged_sentence_to_bio(sentence, tag_entities_pairs)
            file.write(bio_format + "\n\n")
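

# Illustrative sketch (the file names are hypothetical, not from the original
# project): converts a file of tagged sentences into a BIO-formatted file with
# one "token label" pair per line and a blank line between sentences.
def _example_from_tagged_file_to_bio_file() -> None:
    from_tagged_file_to_bio_file(
        input_file="data/tagged_sentences.txt",
        output_file="data/bio_sentences.txt",
        tag_entities_pairs=[("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")],
    )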


def from_bio_file_to_examples(file_path: str) -> tuple:
    """
    Read a BIO-formatted file and convert its content into sentences and their respective label vectors.

    Args:
        file_path (str): The path to the file to read.

    Returns:
        tuple: (sentences, labels, vocab, unique_labels) where sentences is a list of sentence strings,
        labels is a list of per-token label-id lists, vocab is the set of words seen in the file,
        and unique_labels maps each label name to its integer id.
    """
    with open(file_path, "r") as file:
        content = file.read()
    lines = content.split("\n")
    sentences = []
    labels = []
    unique_labels = set()
    # Collect all the unique labels
    for line in lines:
        if len(line.split(" ")) < 2:
            continue
        word, label = line.split(" ")
        # Strip the B-/I- prefix so that B-LOC-DEP and I-LOC-DEP map to the same label
        label = (
            "-".join(label.split("-")[-2:])
            if label.startswith("B") or label.startswith("I")
            else label
        )
        unique_labels.add(label)
    unique_labels = list(unique_labels)
    # Fixed ordering: "O" first, then "LOC-DEP" (if present), then "LOC-ARR"
    SORT_ORDER = {"O": 0, "LOC-DEP": 1, "LOC-ARR": 2}
    unique_labels = sorted(unique_labels, key=lambda x: SORT_ORDER[x])
    # Map labels to ids
    unique_labels = {label: i for i, label in enumerate(unique_labels)}
    # Track the vocabulary
    vocab = set()
    sentence_words = []
    sentence_labels = []
    for line in lines:
        if len(line.split(" ")) < 2:
            # A blank line marks the end of a sentence
            if len(sentence_words) == 0:
                continue
            sentences.append(" ".join(sentence_words))
            labels.append(sentence_labels)
            sentence_words = []
            sentence_labels = []
            continue
        word, label = line.split(" ")
        label = (
            "-".join(label.split("-")[-2:])
            if label.startswith("B") or label.startswith("I")
            else label
        )
        label = unique_labels[label]
        sentence_words.append(word)
        sentence_labels.append(label)
        vocab.add(word)
    return (sentences, labels, vocab, unique_labels)
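

# Illustrative sketch (the file path is hypothetical): unpacks the 4-tuple
# returned by from_bio_file_to_examples. `label_to_id` maps label names
# ("O", "LOC-DEP", "LOC-ARR") to integer ids, and `labels` contains those ids,
# one per token of the corresponding sentence.
def _example_from_bio_file_to_examples() -> None:
    sentences, labels, vocab, label_to_id = from_bio_file_to_examples(
        "data/bio_sentences.txt"
    )
    print(len(sentences), len(labels))  # same number of sentences and label vectors
    print(label_to_id)                  # e.g. {"O": 0, "LOC-DEP": 1, "LOC-ARR": 2}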


def from_examples_to_tf_dataset(
    inputs: tuple[list[list[int]], list[list[int]]],
) -> tf.data.Dataset:
    """
    Convert a tuple of encoded inputs and labels into a TensorFlow dataset.

    Args:
        inputs (tuple[list[list[int]], list[list[int]]]): A tuple containing the inputs and labels (inputs, labels).

    Returns:
        tf.data.Dataset: The TensorFlow dataset, yielding (input, label) pairs of 1-D int32 tensors.
    """

    def gen():
        for tokens, label in zip(inputs[0], inputs[1]):
            yield tokens, label

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32),
        ),
    )
    return dataset
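

# Illustrative sketch (the encoded ids below are made up): the dataset yields
# variable-length (tokens, labels) pairs, so it is typically batched with
# padded_batch before training.
def _example_from_examples_to_tf_dataset() -> None:
    encoded_inputs = [[4, 12, 7], [9, 3]]
    encoded_labels = [[0, 1, 0], [0, 2]]
    dataset = from_examples_to_tf_dataset((encoded_inputs, encoded_labels))
    batched = dataset.padded_batch(2)  # pads both tensors with 0 to the batch max length
    for inputs, labels in batched:
        print(inputs.shape, labels.shape)  # (2, 3) (2, 3)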


def encode_and_pad_sentence(sentence: list[str], vocab: list[str], max_len: int) -> list[int]:
    """
    Encode a tokenized sentence with a vocabulary and pad it to the maximum length.

    Args:
        sentence (list[str]): The tokens of the sentence to encode and pad.
        vocab (list[str]): The vocabulary to use for encoding; unknown words map to "<UNK>".
        max_len (int): The maximum length to pad the sentence to.

    Returns:
        list[int]: The encoded and padded sentence.
    """
    encoded_sentence = [
        vocab.index(word) if word in vocab else vocab.index("<UNK>")
        for word in sentence
    ]
    return tf.keras.utils.pad_sequences(
        [encoded_sentence], maxlen=max_len, padding="post", value=0
    )[0]


def process_sentences_and_labels(
    sentences,
    labels,
    rm_stopwords: bool = False,
    stemming: bool = True,
    return_tokens: bool = False,
    stopwords_to_keep: list[str] = [],
):
    """
    Process the sentences and labels using the process_sentence function from the data_processing module.

    Args:
        sentences (list): List of sentences to process.
        labels (list): List of per-token label vectors to keep aligned with the sentences.
        rm_stopwords (bool): Whether to remove stopwords from the sentences.
        stemming (bool): Whether to apply stemming to the sentences.
        return_tokens (bool): Whether to return the tokens of the sentences.
        stopwords_to_keep (list): Stopwords that should not be removed even when rm_stopwords is True.

    Returns:
        processed_sentences (list): List of processed sentences.
        processed_labels (list): List of processed labels.
    """
    processed_sentences = []
    processed_labels = []
    for sentence, label in zip(sentences, labels):
        sentence, label = process_sentence(
            sentence,
            labels_to_adapt=label,
            rm_stopwords=rm_stopwords,
            stemming=stemming,
            return_tokens=return_tokens,
            stopwords_to_keep=stopwords_to_keep,
        )
        processed_sentences.append(sentence)
        processed_labels.append(label)
    return processed_sentences, processed_labels
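

# Illustrative sketch (the sentences and label ids are made up): processes a
# small batch of sentences together with their per-token label vectors.
def _example_process_sentences_and_labels() -> None:
    sentences = ["Je voudrais aller à Paris", "Départ de Lyon demain"]
    labels = [[0, 0, 0, 0, 2], [0, 0, 1, 0]]
    processed_sentences, processed_labels = process_sentences_and_labels(
        sentences,
        labels,
        rm_stopwords=True,
        stemming=True,
        return_tokens=True,
    )
    print(processed_sentences[0], processed_labels[0])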


# Note: this redefinition shadows the encode_and_pad_sentence defined above; it only
# differs in the default value of the maximum length.
def encode_and_pad_sentence(sentence: list[str], vocab: list[str], max_length: int = 100):
    """
    Encode a tokenized sentence into a list of integers and pad it.

    Parameters:
        sentence (list[str]): The tokens of the sentence to encode; unknown words map to "<UNK>".
        vocab (list[str]): The vocabulary.
        max_length (int): The length to pad the encoded sentence to.

    Returns:
        list: The list of integers.
    """
    encoded_sentence = [
        vocab.index(word) if word in vocab else vocab.index("<UNK>")
        for word in sentence
    ]
    return tf.keras.utils.pad_sequences(
        [encoded_sentence], maxlen=max_length, padding="post", value=0
    )[0]
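

# Illustrative sketch (the vocabulary is a toy example): the sentence is passed
# as a list of tokens, unknown words map to the "<UNK>" id, and the result is
# padded with 0 up to max_length.
def _example_encode_and_pad_sentence() -> None:
    vocab = ["<PAD>", "<UNK>", "je", "voyager", "nice"]
    tokens = ["je", "voyager", "marseille"]
    encoded = encode_and_pad_sentence(tokens, vocab, max_length=6)
    print(encoded)  # ids 2, 3, 1 followed by zero padding -- "marseille" falls back to <UNK>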


def encode_and_pad_sentence_pos(
    sentence_pos: list[str], pos_tags: list[str], max_length: int = 100
):
    """
    Encode the POS tags of a sentence into a list of integers and pad it.

    Parameters:
        sentence_pos (list[str]): The POS tags of the sentence to encode.
        pos_tags (list[str]): The inventory of possible POS tags.
        max_length (int): The length to pad the encoded sequence to.

    Returns:
        list: The list of integers.
    """
    encoded_sentence = [pos_tags.index(pos) for pos in sentence_pos]
    return tf.keras.utils.pad_sequences(
        [encoded_sentence], maxlen=max_length, padding="post", value=0
    )[0]
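

# Illustrative sketch (the POS tag inventory is a toy example): each POS tag in
# the sentence is mapped to its index in pos_tags, then the sequence is padded with 0.
def _example_encode_and_pad_sentence_pos() -> None:
    pos_tags = ["<PAD>", "DET", "NOUN", "VERB", "ADP"]
    sentence_pos = ["DET", "NOUN", "VERB", "ADP", "NOUN"]
    encoded = encode_and_pad_sentence_pos(sentence_pos, pos_tags, max_length=8)
    print(encoded)  # ids 1, 2, 3, 4, 2 followed by zero padding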