"""NER model wrappers exposing a common interface over three backends:
an LSTM with POS features, a BiLSTM, and a fine-tuned CamemBERT."""

from abc import ABC, abstractmethod
import os

import numpy as np
import pandas as pd
import stanza
import tensorflow as tf
from transformers import TFCamembertForTokenClassification, CamembertTokenizerFast

from .data_processing import (
    process_sentence,
    encode_and_pad_sentence,
    encode_and_pad_sentence_pos,
)
from .metrics import masked_loss, masked_accuracy, entity_accuracy
from .models_definitions.bilstm.architecture import BiLSTM
from .models_definitions.lstm_with_pos.architecture import LSTM

# French tokenization + POS-tagging pipeline, loaded once at import time.
# It produces the UPOS features consumed by the LSTM backend.
nlp = stanza.Pipeline("fr", processors="tokenize,pos")
class NERModel(ABC):
    """Common interface for the NER backends. The shared vocabulary and
    POS-tag lookup tables are loaded once as class attributes."""

    file_path = os.path.dirname(os.path.abspath(__file__))
    vocab_path = os.path.join(file_path, "vocab.pkl")
    pos_tags_path = os.path.join(file_path, "pos_tags.pkl")
    vocab = pd.read_pickle(vocab_path)
    pos_tags = pd.read_pickle(pos_tags_path)

    @abstractmethod
    def get_entities(self, text: str):
        """Return one predicted label per word of `text`."""

    @abstractmethod
    def predict(self, encoded_sentence):
        """Run the underlying model on an already-encoded sentence."""

    @abstractmethod
    def encode_sentence(self, sentence: str):
        """Turn a raw sentence into the tensors the model expects."""
class LSTM_NER(NERModel):
    def __init__(self):
        self.model_weights_path = os.path.join(
            self.file_path,
            "models_definitions",
            "lstm_with_pos",
            "lstm_with_pos.weights.h5",
        )
        self.model = LSTM(self.vocab, 3, self.pos_tags)  # 3 output labels
        self.model.load_from_weights(self.model_weights_path)

    def encode_sentence(self, sentence: str):
        # Token ids from the stemmed, tokenized sentence.
        processed_sentence = process_sentence(
            sentence, stemming=True, return_tokens=True
        )
        encoded_sentence = tf.convert_to_tensor(
            [encode_and_pad_sentence(processed_sentence, self.vocab, max_length=100)]
        )
        # UPOS tags from Stanza, encoded and padded to the same length.
        sentence_pos = nlp(sentence)
        pos_tags = [word.upos for sent in sentence_pos.sentences for word in sent.words]
        encoded_pos = tf.convert_to_tensor(
            [encode_and_pad_sentence_pos(pos_tags, self.pos_tags, max_length=100)]
        )
        return [encoded_sentence, encoded_pos]

    def get_entities(self, text: str):
        encoded_sentence = self.encode_sentence(text)
        predictions = self.predict(encoded_sentence)
        return predictions[0].numpy()

    def predict(self, encoded_sentence):
        # argmax over the label dimension yields one label id per position.
        return tf.math.argmax(self.model.predict(encoded_sentence, verbose=0), axis=-1)
class BiLSTM_NER(NERModel):
    def __init__(self):
        self.model_weights_path = os.path.join(
            self.file_path, "models_definitions", "bilstm", "bilstm.weights.h5"
        )
        self.model = BiLSTM(self.vocab, 3)  # 3 output labels
        self.model.load_from_weights(self.model_weights_path)

    def encode_sentence(self, sentence: str):
        # Token ids only; the BiLSTM takes no POS features.
        processed_sentence = process_sentence(
            sentence, stemming=True, return_tokens=True
        )
        return tf.convert_to_tensor(
            [encode_and_pad_sentence(processed_sentence, self.vocab, max_length=100)]
        )

    def get_entities(self, text: str):
        encoded_sentence = self.encode_sentence(text)
        predictions = self.predict(encoded_sentence)
        return predictions[0].numpy()

    def predict(self, encoded_sentence):
        return tf.math.argmax(self.model.predict(encoded_sentence, verbose=0), axis=-1)
class CamemBERT_NER(NERModel):
    def __init__(self, num_labels=3):
        self.model = TFCamembertForTokenClassification.from_pretrained(
            "Az-r-ow/CamemBERT-NER-Travel", num_labels=num_labels
        )
        self.tokenizer = CamembertTokenizerFast.from_pretrained(
            "cmarkea/distilcamembert-base"
        )

    def encode_sentence(self, sentence: str):
        return self.tokenizer(
            sentence,
            return_tensors="tf",
            padding="max_length",
            truncation=True,  # guard against inputs longer than max_length
            max_length=150,
        )

    def get_entities(self, text: str):
        encoded_sentence = self.encode_sentence(text)
        predictions = self.predict(encoded_sentence).logits
        predictions = tf.math.argmax(predictions, axis=-1)[0].numpy()
        # Map subword-level predictions back onto the original words.
        return self.align_labels_with_original_sentence(
            encoded_sentence, [predictions]
        )[0]

    def predict(self, encoded_sentence):
        return self.model.predict(encoded_sentence)
    def align_labels_with_original_sentence(self, tokenized_inputs, predictions):
        """
        Aligns token-level predictions back to the original sentence words,
        keeping the first subword's label for each word.

        Args:
            tokenized_inputs (BatchEncoding): Tokenized input from CamembertTokenizerFast.
            predictions: Predicted label ids, shape (batch_size, seq_len),
                i.e. logits already reduced with argmax.

        Returns:
            List[List[int]]: One label id per word for each original sentence.
        """
        aligned_labels = []
        for i in range(len(predictions)):  # Iterate through each example in the batch
            # Word ids map each token position to its source word
            # (None for special tokens such as <s> and </s>).
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            sentence_labels = []
            current_word = None
            word_label = None

            for token_idx, word_idx in enumerate(word_ids):
                # Skip special tokens, where word_idx is None.
                if word_idx is None:
                    continue

                if word_idx != current_word:
                    # New word: flush the label of the completed word...
                    if current_word is not None:
                        sentence_labels.append(word_label)
                    # ...and keep the first subword's label for this one.
                    current_word = word_idx
                    word_label = predictions[i][token_idx]
                # Subsequent subwords are ignored; taking the last subword's
                # label instead would be an alternative policy.

            # Append the last word's label.
            if current_word is not None:
                sentence_labels.append(word_label)

            aligned_labels.append(sentence_labels)

        return aligned_labels
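

# Minimal usage sketch (illustrative, not part of the module above): all three
# wrappers expose the same get_entities(text) entry point. Running this
# requires the pickled vocab/POS tables, the .h5 weight files, and network
# access for the Hugging Face checkpoints; what label ids 0/1/2 denote depends
# on the label scheme the models were trained with.
if __name__ == "__main__":
    model = CamemBERT_NER()
    labels = model.get_entities("Je veux aller de Paris à Marseille demain.")
    print(labels)  # one predicted label id per word of the input sentence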