# CamemBERT fine-tuning

Because of dependency conflicts, the model is fine-tuned in this notebook; the fine-tuned model is then loaded and evaluated in [deepl_ner.ipynb](./deepl_ner.ipynb).


In [86]:
!pip install --upgrade transformers tf-keras focal-loss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [87]:
import os

# Opt in to the legacy Keras implementation; this cell runs before the
# `import tensorflow` cell.
os.environ.update({"TF_USE_LEGACY_KERAS": "1"})

In [88]:
import tensorflow as tf

In [89]:
from app.travel_resolver.libs.nlp import data_processing as dp

# Main corpus: besides sentences/labels we keep the vocabulary and the set of
# distinct labels (the latter sizes the classification head later on).
sentences, labels, vocab, unique_labels = dp.from_bio_file_to_examples(
    "./data/bio/fr.bio/10k_train_small_samples.bio"
)

# The extra values returned for the auxiliary corpora are discarded through a
# starred target instead of `_` / `__`: those names are IPython's
# output-history variables and assigning to them stops IPython from updating
# them for the rest of the session.

# To avoid overfitting the model on sentences that don't have any labels
lambda_sentences, lambda_labels, *_unused_lambda = dp.from_bio_file_to_examples(
    "./data/bio/fr.bio/1k_train_unlabeled_samples.bio"
)

# Longer samples, loaded from the "large samples" file.
long_sentences, long_labels, *_unused_long = dp.from_bio_file_to_examples(
    "./data/bio/fr.bio/1k_train_large_samples.bio"
)

In [90]:
# Merge the three corpora (main, unlabeled, long) into a single pool; sentences
# and labels are concatenated in the same order so they stay index-aligned.
# NOTE(review): this cell rebinds `sentences` / `labels` to the merged lists, so
# running it a second time without re-running the loading cell appends the
# unlabeled and long samples again.
sentences = sentences + lambda_sentences + long_sentences
labels = labels + lambda_labels + long_labels

In [91]:
import app.travel_resolver.libs.nlp.data_processing as dp

# Run the project's preprocessing over the merged corpus.
# NOTE(review): return_tokens=True presumably yields every sentence as a list
# of word tokens (the tokenizer is later called with is_split_into_words=True),
# and stemming=False presumably leaves word forms untouched -- confirm against
# data_processing.process_sentences_and_labels.
processed_sentences, processed_labels = dp.process_sentences_and_labels(
    sentences, labels, return_tokens=True, stemming=False
)

In [92]:
"""
  This variable controls the fixed length to which every label sequence
  and every tokenized sentence is padded or truncated.
"""

MAX_LEN = 150

In [93]:
# Bring every label sequence to exactly MAX_LEN entries (0-padded at the end).
# truncating="post" cuts over-long sequences at the END. The Keras default
# ("pre") would drop the first labels instead, while the tokenizer call
# truncates sentences on the right by default, so label[i] would no longer
# belong to word i for any sentence longer than MAX_LEN words.
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    processed_labels, maxlen=MAX_LEN, padding="post", truncating="post"
)

In [94]:
from transformers import TFAutoModelForTokenClassification, CamembertTokenizerFast
import numpy as np

# Checkpoint whose fast tokenizer is used to encode the corpus.
TOKENIZER_CHECKPOINT = "cmarkea/distilcamembert-base"
tokenizer = CamembertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)

In [95]:
# Encoding options: inputs are already split into words, and every sentence is
# padded / truncated to MAX_LEN tokens. Offsets are requested as well.
encoding_options = {
    "is_split_into_words": True,
    "return_offsets_mapping": True,
    "truncation": True,
    "padding": "max_length",
    "max_length": MAX_LEN,
}

tokenized_sentences = tokenizer(processed_sentences, **encoding_options)

In [96]:
def align_labels_with_tokens(
    encodings, labels, label_all_tokens=True, ignore_index=-100
):
    """
    Aligns word-level labels with the (sub)tokens produced by the tokenizer.

    Args:
        encodings (BatchEncoding): Tokenized outputs from the Hugging Face tokenizer (must use a fast tokenizer,
                                   since `word_ids` is required).
        labels (List[List[int]]): Original labels for each sentence before tokenization. Each inner sequence
                                  corresponds to one sentence and is indexed by word position.
        label_all_tokens (bool): When True (default, the historical behaviour) every sub-token of a word
                                 receives the word's label. When False only the first sub-token of each word
                                 is labelled and the following ones receive `ignore_index`.
        ignore_index (int): Value assigned to positions that carry no label (default -100).

    Returns:
        List[List[int]]: Aligned labels, where each inner list corresponds to the aligned labels for the
                         tokenized sentence. Tokens without a word id (special tokens and padding) are
                         assigned `ignore_index`.
    """
    adapted_labels = []

    for i, label in enumerate(labels):
        # One entry per token: the index of the word it came from, or None.
        word_ids = encodings.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_id = None

        for word_id in word_ids:
            if word_id is None:
                # Token that does not map to any input word.
                aligned_labels.append(ignore_index)
            elif label_all_tokens or word_id != previous_word_id:
                # First sub-token of a word, or any sub-token when every
                # token is to be labelled.
                aligned_labels.append(label[word_id])
            else:
                # Continuation sub-token that the caller asked to ignore.
                aligned_labels.append(ignore_index)
            previous_word_id = word_id

        adapted_labels.append(aligned_labels)

    return adapted_labels

In [97]:
# Expand the word-level labels to one label per token of the encoded batch.
readapted_labels = align_labels_with_tokens(
    encodings=tokenized_sentences, labels=padded_labels
)

In [98]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is identical after a kernel restart;
# without it every run trains and validates on a different split.
SPLIT_SEED = 42

# The three parallel sequences are split with the same shuffled indices, so
# ids, attention masks and labels stay aligned; 20% of the samples are held out.
(
    train_input_ids,
    test_input_ids,
    train_attention_masks,
    test_attention_masks,
    train_labels,
    test_labels,
) = train_test_split(
    tokenized_sentences["input_ids"],
    tokenized_sentences["attention_mask"],
    readapted_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [99]:
def _as_tf_dataset(input_ids, attention_masks, token_labels):
    """Wrap one split into a tf.data.Dataset of (model inputs dict, labels) pairs."""
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
    }
    return tf.data.Dataset.from_tensor_slices((model_inputs, token_labels))


train_dataset = _as_tf_dataset(train_input_ids, train_attention_masks, train_labels)

test_dataset = _as_tf_dataset(test_input_ids, test_attention_masks, test_labels)

In [100]:
def entity_accuracy(y_true, y_pred):
    """
    Calculate the accuracy over entity positions only, which means that
    correctly predicted `O` tags are not taken into account.

    Only positions whose true label is strictly positive are scored; label 0
    and negative labels (such as the -100 ignore value) are masked out.
    NOTE(review): this assumes label id 0 is the `O` tag -- confirm against
    the label mapping.

    Parameters:
    y_true (tensor): True labels.
    y_pred (tensor): Predicted logits, with the class scores on the last axis.

    Returns:
    accuracy (tensor): Fraction of scored positions whose predicted class
    equals the true label; 0.0 when there is no scored position.
    """

    y_true = tf.cast(y_true, tf.float32)
    # 1.0 where the position is scored, 0.0 elsewhere.
    mask = tf.cast(y_true > 0, tf.float32)

    y_pred_class = tf.cast(tf.math.argmax(y_pred, axis=-1), tf.float32)

    matches_true_pred = tf.cast(tf.equal(y_true, y_pred_class), tf.float32)
    matches_true_pred *= mask

    # A batch made only of unlabeled sentences has an all-zero mask; a plain
    # division would give 0/0 = NaN and poison the metric averaged over the
    # epoch, so such a batch yields 0.0 instead.
    return tf.math.divide_no_nan(
        tf.reduce_sum(matches_true_pred), tf.reduce_sum(mask)
    )

In [101]:
from focal_loss import SparseCategoricalFocalLoss

# Token-classification head sized to the number of distinct labels in the
# training corpus.
camembert = TFAutoModelForTokenClassification.from_pretrained(
    "cmarkea/distilcamembert-base", num_labels=len(unique_labels)
)

# Alternative loss kept for experiments; it is NOT the loss passed to
# compile() below.
# NOTE(review): class_weight has three entries, so it presumably assumes
# len(unique_labels) == 3 -- confirm before switching to this loss.
loss_func = SparseCategoricalFocalLoss(
    gamma=2, class_weight=[1, 10, 10], from_logits=True
)

camembert.compile(
    # NOTE(review): 8e-4 is high compared with the learning rates usually
    # used to fine-tune transformer checkpoints -- confirm it is intended.
    optimizer=tf.keras.optimizers.legacy.Adam(8e-4),
    # The aligned labels mark special and padding tokens with -100, which is
    # not a valid class id; ignore_class makes the loss skip those positions
    # instead of feeding an out-of-range label to the cross-entropy.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, ignore_class=-100
    ),
    metrics=[entity_accuracy],
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFCamembertForTokenClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFCamembertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFCamembertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [102]:
# Group both splits into batches of 64 samples (train first, then test).
train_dataset, test_dataset = (
    split.batch(64) for split in (train_dataset, test_dataset)
)

In [103]:
# Stop at the first epoch whose validation loss fails to improve by at least
# 0.001 (patience=0) and roll the weights back to the best epoch seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=0,
    min_delta=0.001,
    monitor="val_loss",
)

# Per-epoch metrics are written to "training.log".
csv_logger = tf.keras.callbacks.CSVLogger("training.log")

fit_options = {
    "validation_data": test_dataset,
    "epochs": 10,
    "callbacks": [early_stopping, csv_logger],
}

camembert.fit(train_dataset, **fit_options)

Epoch 1/10


TypeError: in user code:

    File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py", line 1381, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/engine/training.py", line 1370, in run_step  **
        outputs = model.train_step(data)
    File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py", line 1672, in train_step
        y_pred = self(x, training=True)
    File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/3h/5n6s9rcj3sx0gpncsxbq_99m0000gn/T/__autograph_generated_filepc984rni.py", line 40, in tf__run_call_with_unpacked_inputs
        raise

    TypeError: Exception encountered when calling layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification).
    
    in user code:
    
        File "/Users/az-r-ow/Developer/TravelOrderResolver/venv/lib/python3.12/site-packages/transformers/modeling_tf_utils.py", line 1393, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
    
        TypeError: outer_factory.<locals>.inner_factory.<locals>.tf__call() got an unexpected keyword argument 'offset_mapping'
    
    
    Call arguments received by layer 'tf_camembert_for_token_classification_5' (type TFCamembertForTokenClassification):
      • input_ids={'input_ids': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 150), dtype=int32)', 'offset_mapping': 'tf.Tensor(shape=(None, 150, 2), dtype=int32)'}
      • attention_mask=None
      • token_type_ids=None
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=None
      • training=True


In [76]:
from focal_loss import SparseCategoricalFocalLoss

# Stand-alone sanity check of the focal loss on three hand-written probability
# rows, one per class. Demo-specific names keep this scratch cell from
# overwriting the `loss_func` defined in the model cell.
demo_focal_loss = SparseCategoricalFocalLoss(gamma=1)
demo_y_true = [0, 1, 2]
demo_y_pred = [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]]
demo_focal_loss(demo_y_true, demo_y_pred)

<tf.Tensor: shape=(), dtype=float32, numpy=0.1186538115143776>

In [79]:
# Persist the fine-tuned model to a local directory.
# NOTE(review): only the model is saved here, not the tokenizer -- presumably
# the evaluation notebook reloads the tokenizer from the base checkpoint;
# confirm.
camembert.save_pretrained("./models/distilcamembert-base-ner-cross-entropy-11")

In [78]:
# camembert.push_to_hub("CamemBERT-NER-Travel")