Az-r-ow committed on
Commit
32bf4a3
·
1 Parent(s): 9f61aa9

feat(CamemBERT): Fine-tuned camembert model for NER

Browse files
camemBERT_finetuning.ipynb CHANGED
@@ -11,7 +11,7 @@
11
  },
12
  {
13
  "cell_type": "code",
14
- "execution_count": null,
15
  "metadata": {},
16
  "outputs": [
17
  {
@@ -19,6 +19,9 @@
19
  "output_type": "stream",
20
  "text": [
21
  "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.46.3)\n",
 
 
 
22
  "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from transformers) (3.16.1)\n",
23
  "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in ./venv/lib/python3.12/site-packages (from transformers) (0.26.3)\n",
24
  "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from transformers) (1.26.4)\n",
@@ -29,53 +32,44 @@
29
  "Requirement already satisfied: tokenizers<0.21,>=0.20 in ./venv/lib/python3.12/site-packages (from transformers) (0.20.3)\n",
30
  "Requirement already satisfied: safetensors>=0.4.1 in ./venv/lib/python3.12/site-packages (from transformers) (0.4.5)\n",
31
  "Requirement already satisfied: tqdm>=4.27 in ./venv/lib/python3.12/site-packages (from transformers) (4.66.5)\n",
 
32
  "Requirement already satisfied: fsspec>=2023.5.0 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.10.0)\n",
33
  "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  "Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.4.0)\n",
35
  "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.10)\n",
36
  "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2.2.3)\n",
37
  "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2024.8.30)\n",
38
- "\n",
39
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
40
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
41
- "Requirement already satisfied: tf_keras in ./venv/lib/python3.12/site-packages (2.18.0)\n",
42
- "Requirement already satisfied: tensorflow<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tf_keras) (2.18.0)\n",
43
- "Requirement already satisfied: absl-py>=1.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (2.1.0)\n",
44
- "Requirement already satisfied: astunparse>=1.6.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (1.6.3)\n",
45
- "Requirement already satisfied: flatbuffers>=24.3.25 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (24.3.25)\n",
46
- "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (0.6.0)\n",
47
- "Requirement already satisfied: google-pasta>=0.1.1 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (0.2.0)\n",
48
- "Requirement already satisfied: libclang>=13.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (18.1.1)\n",
49
- "Requirement already satisfied: opt-einsum>=2.3.2 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (3.4.0)\n",
50
- "Requirement already satisfied: packaging in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (24.1)\n",
51
- "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (4.25.5)\n",
52
- "Requirement already satisfied: requests<3,>=2.21.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (2.32.3)\n",
53
- "Requirement already satisfied: setuptools in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (75.2.0)\n",
54
- "Requirement already satisfied: six>=1.12.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (1.16.0)\n",
55
- "Requirement already satisfied: termcolor>=1.1.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (2.5.0)\n",
56
- "Requirement already satisfied: typing-extensions>=3.6.6 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (4.12.2)\n",
57
- "Requirement already satisfied: wrapt>=1.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (1.16.0)\n",
58
- "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (1.67.0)\n",
59
- "Requirement already satisfied: tensorboard<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (2.18.0)\n",
60
- "Requirement already satisfied: keras>=3.5.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (3.7.0)\n",
61
- "Requirement already satisfied: numpy<2.1.0,>=1.26.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (1.26.4)\n",
62
- "Requirement already satisfied: h5py>=3.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (3.12.1)\n",
63
- "Requirement already satisfied: ml-dtypes<0.5.0,>=0.4.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf_keras) (0.4.1)\n",
64
- "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./venv/lib/python3.12/site-packages (from astunparse>=1.6.0->tensorflow<2.19,>=2.18->tf_keras) (0.44.0)\n",
65
- "Requirement already satisfied: rich in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (13.9.2)\n",
66
- "Requirement already satisfied: namex in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (0.0.8)\n",
67
- "Requirement already satisfied: optree in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (0.13.0)\n",
68
- "Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests<3,>=2.21.0->tensorflow<2.19,>=2.18->tf_keras) (3.4.0)\n",
69
- "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests<3,>=2.21.0->tensorflow<2.19,>=2.18->tf_keras) (3.10)\n",
70
- "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests<3,>=2.21.0->tensorflow<2.19,>=2.18->tf_keras) (2.2.3)\n",
71
- "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests<3,>=2.21.0->tensorflow<2.19,>=2.18->tf_keras) (2024.8.30)\n",
72
- "Requirement already satisfied: markdown>=2.6.8 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf_keras) (3.7)\n",
73
- "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf_keras) (0.7.2)\n",
74
- "Requirement already satisfied: werkzeug>=1.0.1 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf_keras) (3.0.4)\n",
75
- "Requirement already satisfied: MarkupSafe>=2.1.1 in ./venv/lib/python3.12/site-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf_keras) (3.0.2)\n",
76
- "Requirement already satisfied: markdown-it-py>=2.2.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (3.0.0)\n",
77
- "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (2.18.0)\n",
78
- "Requirement already satisfied: mdurl~=0.1 in ./venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf_keras) (0.1.2)\n",
79
  "\n",
80
  "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
81
  "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
@@ -83,12 +77,12 @@
83
  }
84
  ],
85
  "source": [
86
- "!pip install --upgrade transformers tf-keras numpy sentencepiece"
87
  ]
88
  },
89
  {
90
  "cell_type": "code",
91
- "execution_count": 2,
92
  "metadata": {},
93
  "outputs": [],
94
  "source": [
@@ -99,7 +93,7 @@
99
  },
100
  {
101
  "cell_type": "code",
102
- "execution_count": 3,
103
  "metadata": {},
104
  "outputs": [],
105
  "source": [
@@ -108,19 +102,9 @@
108
  },
109
  {
110
  "cell_type": "code",
111
- "execution_count": 4,
112
  "metadata": {},
113
- "outputs": [
114
- {
115
- "name": "stderr",
116
- "output_type": "stream",
117
- "text": [
118
- "[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
119
- "[nltk_data] ow/nltk_data...\n",
120
- "[nltk_data] Package punkt_tab is already up-to-date!\n"
121
- ]
122
- }
123
- ],
124
  "source": [
125
  "from app.travel_resolver.libs.nlp import data_processing as dp\n",
126
  "\n",
@@ -128,21 +112,22 @@
128
  " \"./data/bio/fr.bio/10k_train_small_samples.bio\"\n",
129
  ")\n",
130
  "\n",
131
- "lambda_sentences, lambda_labels, _, __ = dp.from_bio_file_to_examples(\n",
132
- " \"./data/bio/fr.bio/1k_train_unlabeled_samples.bio\"\n",
133
- ")\n",
 
134
  "\n",
135
  "large_sentences, large_labels, _, __ = dp.from_bio_file_to_examples(\n",
136
  " \"./data/bio/fr.bio/1k_train_large_samples.bio\"\n",
137
  ")\n",
138
  "\n",
139
- "sentences = sentences + lambda_sentences + large_sentences\n",
140
- "labels = labels + lambda_labels + large_labels"
141
  ]
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 5,
146
  "metadata": {},
147
  "outputs": [],
148
  "source": [
@@ -155,7 +140,7 @@
155
  },
156
  {
157
  "cell_type": "code",
158
- "execution_count": 6,
159
  "metadata": {},
160
  "outputs": [],
161
  "source": [
@@ -181,7 +166,7 @@
181
  },
182
  {
183
  "cell_type": "code",
184
- "execution_count": 8,
185
  "metadata": {},
186
  "outputs": [],
187
  "source": [
@@ -192,7 +177,7 @@
192
  },
193
  {
194
  "cell_type": "code",
195
- "execution_count": null,
196
  "metadata": {},
197
  "outputs": [],
198
  "source": [
@@ -204,7 +189,7 @@
204
  },
205
  {
206
  "cell_type": "code",
207
- "execution_count": 23,
208
  "metadata": {},
209
  "outputs": [],
210
  "source": [
@@ -219,7 +204,7 @@
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": 24,
223
  "metadata": {},
224
  "outputs": [],
225
  "source": [
@@ -242,7 +227,7 @@
242
  },
243
  {
244
  "cell_type": "code",
245
- "execution_count": 26,
246
  "metadata": {},
247
  "outputs": [],
248
  "source": [
@@ -269,7 +254,65 @@
269
  },
270
  {
271
  "cell_type": "code",
272
- "execution_count": 32,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  "metadata": {},
274
  "outputs": [
275
  {
@@ -279,57 +322,64 @@
279
  "All PyTorch model weights were used when initializing TFCamembertForTokenClassification.\n",
280
  "\n",
281
  "Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
282
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
283
- "WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs, please use the legacy TF-Keras optimizer instead, located at `tf.keras.optimizers.legacy.Adam`.\n"
284
  ]
285
  }
286
  ],
287
  "source": [
 
 
288
  "camembert = TFAutoModelForTokenClassification.from_pretrained(\n",
289
  " \"camembert-base\", num_labels=len(unique_labels)\n",
290
  ")\n",
291
  "\n",
 
 
 
 
292
  "camembert.compile(\n",
293
- " optimizer=tf.keras.optimizers.Adam(5e-5),\n",
294
- " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
295
- " metrics=[\"accuracy\"],\n",
296
  ")"
297
  ]
298
  },
299
  {
300
  "cell_type": "code",
301
- "execution_count": 33,
302
  "metadata": {},
303
  "outputs": [],
304
  "source": [
305
- "train_dataset = train_dataset.batch(64)\n",
306
- "test_dataset = test_dataset.batch(64)"
307
  ]
308
  },
309
  {
310
  "cell_type": "code",
311
- "execution_count": 34,
312
  "metadata": {},
313
  "outputs": [
314
  {
315
  "name": "stdout",
316
  "output_type": "stream",
317
  "text": [
318
- "Epoch 1/3\n",
319
- "148/148 [==============================] - 1725s 12s/step - loss: 0.2062 - accuracy: 0.9711 - val_loss: 0.0952 - val_accuracy: 0.9873\n",
320
- "Epoch 2/3\n",
321
- "148/148 [==============================] - 1782s 12s/step - loss: 0.0681 - accuracy: 0.9922 - val_loss: 0.0442 - val_accuracy: 0.9953\n",
322
- "Epoch 3/3\n",
323
- "148/148 [==============================] - 1749s 12s/step - loss: 0.0377 - accuracy: 0.9956 - val_loss: 0.0260 - val_accuracy: 0.9964\n"
 
 
324
  ]
325
  },
326
  {
327
  "data": {
328
  "text/plain": [
329
- "<tf_keras.src.callbacks.History at 0x295a015b0>"
330
  ]
331
  },
332
- "execution_count": 34,
333
  "metadata": {},
334
  "output_type": "execute_result"
335
  }
@@ -340,7 +390,7 @@
340
  ")\n",
341
  "\n",
342
  "camembert.fit(\n",
343
- " train_dataset, validation_data=test_dataset, epochs=3, callbacks=[callback]\n",
344
  ")"
345
  ]
346
  },
@@ -348,9 +398,51 @@
348
  "cell_type": "code",
349
  "execution_count": null,
350
  "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  "outputs": [],
352
  "source": [
353
- "camembert.save_pretrained(\"./camembert\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  ]
355
  }
356
  ],
 
11
  },
12
  {
13
  "cell_type": "code",
14
+ "execution_count": 20,
15
  "metadata": {},
16
  "outputs": [
17
  {
 
19
  "output_type": "stream",
20
  "text": [
21
  "Requirement already satisfied: transformers in ./venv/lib/python3.12/site-packages (4.46.3)\n",
22
+ "Requirement already satisfied: tf-keras in ./venv/lib/python3.12/site-packages (2.18.0)\n",
23
+ "Collecting focal-loss\n",
24
+ " Downloading focal_loss-0.0.7-py3-none-any.whl.metadata (5.1 kB)\n",
25
  "Requirement already satisfied: filelock in ./venv/lib/python3.12/site-packages (from transformers) (3.16.1)\n",
26
  "Requirement already satisfied: huggingface-hub<1.0,>=0.23.2 in ./venv/lib/python3.12/site-packages (from transformers) (0.26.3)\n",
27
  "Requirement already satisfied: numpy>=1.17 in ./venv/lib/python3.12/site-packages (from transformers) (1.26.4)\n",
 
32
  "Requirement already satisfied: tokenizers<0.21,>=0.20 in ./venv/lib/python3.12/site-packages (from transformers) (0.20.3)\n",
33
  "Requirement already satisfied: safetensors>=0.4.1 in ./venv/lib/python3.12/site-packages (from transformers) (0.4.5)\n",
34
  "Requirement already satisfied: tqdm>=4.27 in ./venv/lib/python3.12/site-packages (from transformers) (4.66.5)\n",
35
+ "Requirement already satisfied: tensorflow<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tf-keras) (2.18.0)\n",
36
  "Requirement already satisfied: fsspec>=2023.5.0 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2024.10.0)\n",
37
  "Requirement already satisfied: typing-extensions>=3.7.4.3 in ./venv/lib/python3.12/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\n",
38
+ "Requirement already satisfied: absl-py>=1.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.1.0)\n",
39
+ "Requirement already satisfied: astunparse>=1.6.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.6.3)\n",
40
+ "Requirement already satisfied: flatbuffers>=24.3.25 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (24.3.25)\n",
41
+ "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (0.6.0)\n",
42
+ "Requirement already satisfied: google-pasta>=0.1.1 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (0.2.0)\n",
43
+ "Requirement already satisfied: libclang>=13.0.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (18.1.1)\n",
44
+ "Requirement already satisfied: opt-einsum>=2.3.2 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.4.0)\n",
45
+ "Requirement already satisfied: protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (4.25.5)\n",
46
+ "Requirement already satisfied: setuptools in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (75.2.0)\n",
47
+ "Requirement already satisfied: six>=1.12.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.16.0)\n",
48
+ "Requirement already satisfied: termcolor>=1.1.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.5.0)\n",
49
+ "Requirement already satisfied: wrapt>=1.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.16.0)\n",
50
+ "Requirement already satisfied: grpcio<2.0,>=1.24.3 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (1.67.0)\n",
51
+ "Requirement already satisfied: tensorboard<2.19,>=2.18 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
52
+ "Requirement already satisfied: keras>=3.5.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.7.0)\n",
53
+ "Requirement already satisfied: h5py>=3.11.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (3.12.1)\n",
54
+ "Requirement already satisfied: ml-dtypes<0.5.0,>=0.4.0 in ./venv/lib/python3.12/site-packages (from tensorflow<2.19,>=2.18->tf-keras) (0.4.1)\n",
55
  "Requirement already satisfied: charset-normalizer<4,>=2 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.4.0)\n",
56
  "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.12/site-packages (from requests->transformers) (3.10)\n",
57
  "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2.2.3)\n",
58
  "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.12/site-packages (from requests->transformers) (2024.8.30)\n",
59
+ "Requirement already satisfied: wheel<1.0,>=0.23.0 in ./venv/lib/python3.12/site-packages (from astunparse>=1.6.0->tensorflow<2.19,>=2.18->tf-keras) (0.44.0)\n",
60
+ "Requirement already satisfied: rich in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (13.9.2)\n",
61
+ "Requirement already satisfied: namex in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.0.8)\n",
62
+ "Requirement already satisfied: optree in ./venv/lib/python3.12/site-packages (from keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.13.0)\n",
63
+ "Requirement already satisfied: markdown>=2.6.8 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.7)\n",
64
+ "Requirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (0.7.2)\n",
65
+ "Requirement already satisfied: werkzeug>=1.0.1 in ./venv/lib/python3.12/site-packages (from tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.0.4)\n",
66
+ "Requirement already satisfied: MarkupSafe>=2.1.1 in ./venv/lib/python3.12/site-packages (from werkzeug>=1.0.1->tensorboard<2.19,>=2.18->tensorflow<2.19,>=2.18->tf-keras) (3.0.2)\n",
67
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (3.0.0)\n",
68
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in ./venv/lib/python3.12/site-packages (from rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (2.18.0)\n",
69
+ "Requirement already satisfied: mdurl~=0.1 in ./venv/lib/python3.12/site-packages (from markdown-it-py>=2.2.0->rich->keras>=3.5.0->tensorflow<2.19,>=2.18->tf-keras) (0.1.2)\n",
70
+ "Downloading focal_loss-0.0.7-py3-none-any.whl (19 kB)\n",
71
+ "Installing collected packages: focal-loss\n",
72
+ "Successfully installed focal-loss-0.0.7\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  "\n",
74
  "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
75
  "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
 
77
  }
78
  ],
79
  "source": [
80
+ "!pip install --upgrade transformers tf-keras focal-loss"
81
  ]
82
  },
83
  {
84
  "cell_type": "code",
85
+ "execution_count": 21,
86
  "metadata": {},
87
  "outputs": [],
88
  "source": [
 
93
  },
94
  {
95
  "cell_type": "code",
96
+ "execution_count": 22,
97
  "metadata": {},
98
  "outputs": [],
99
  "source": [
 
102
  },
103
  {
104
  "cell_type": "code",
105
+ "execution_count": 23,
106
  "metadata": {},
107
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
108
  "source": [
109
  "from app.travel_resolver.libs.nlp import data_processing as dp\n",
110
  "\n",
 
112
  " \"./data/bio/fr.bio/10k_train_small_samples.bio\"\n",
113
  ")\n",
114
  "\n",
115
+ "# To avoid overfitting the model on sentences that don't have any labels\n",
116
+ "# lambda_sentences, lambda_labels, _, __ = dp.from_bio_file_to_examples(\n",
117
+ "# \"./data/bio/fr.bio/1k_train_unlabeled_samples.bio\"\n",
118
+ "# )\n",
119
  "\n",
120
  "large_sentences, large_labels, _, __ = dp.from_bio_file_to_examples(\n",
121
  " \"./data/bio/fr.bio/1k_train_large_samples.bio\"\n",
122
  ")\n",
123
  "\n",
124
+ "sentences = sentences + large_sentences\n",
125
+ "labels = labels + large_labels"
126
  ]
127
  },
128
  {
129
  "cell_type": "code",
130
+ "execution_count": 24,
131
  "metadata": {},
132
  "outputs": [],
133
  "source": [
 
140
  },
141
  {
142
  "cell_type": "code",
143
+ "execution_count": 25,
144
  "metadata": {},
145
  "outputs": [],
146
  "source": [
 
166
  },
167
  {
168
  "cell_type": "code",
169
+ "execution_count": 26,
170
  "metadata": {},
171
  "outputs": [],
172
  "source": [
 
177
  },
178
  {
179
  "cell_type": "code",
180
+ "execution_count": 27,
181
  "metadata": {},
182
  "outputs": [],
183
  "source": [
 
189
  },
190
  {
191
  "cell_type": "code",
192
+ "execution_count": 28,
193
  "metadata": {},
194
  "outputs": [],
195
  "source": [
 
204
  },
205
  {
206
  "cell_type": "code",
207
+ "execution_count": 33,
208
  "metadata": {},
209
  "outputs": [],
210
  "source": [
 
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 39,
231
  "metadata": {},
232
  "outputs": [],
233
  "source": [
 
254
  },
255
  {
256
  "cell_type": "code",
257
+ "execution_count": 40,
258
+ "metadata": {},
259
+ "outputs": [],
260
+ "source": [
261
+ "def entity_accuracy(y_true, y_pred):\n",
262
+ " \"\"\"\n",
263
+ " Calculate the accuracy based on the entities, which means that correct `O` tags will not be taken into account.\n",
264
+ "\n",
265
+ " Parameters:\n",
266
+ " y_true (tensor): True labels.\n",
267
+ " y_pred (tensor): Predicted logits.\n",
268
+ "\n",
269
+ " Returns:\n",
270
+ " accuracy (tensor): Tag accuracy.\n",
271
+ " \"\"\"\n",
272
+ "\n",
273
+ " y_true = tf.cast(y_true, tf.float32)\n",
274
+ " # We ignore the padding and the O tag\n",
275
+ " mask = y_true > 0\n",
276
+ " mask = tf.cast(mask, tf.float32)\n",
277
+ "\n",
278
+ " y_pred_class = tf.math.argmax(y_pred, axis=-1)\n",
279
+ " y_pred_class = tf.cast(y_pred_class, tf.float32)\n",
280
+ "\n",
281
+ " matches_true_pred = tf.equal(y_true, y_pred_class)\n",
282
+ " matches_true_pred = tf.cast(matches_true_pred, tf.float32)\n",
283
+ "\n",
284
+ " matches_true_pred *= mask\n",
285
+ "\n",
286
+ " masked_acc = tf.reduce_sum(matches_true_pred) / tf.reduce_sum(mask)\n",
287
+ "\n",
288
+ " return masked_acc"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": 14,
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "class_weights = {0: 0.1, 1: 20.0, 2: 20.0}\n",
298
+ "\n",
299
+ "\n",
300
+ "def weighted_loss(y_true, y_pred):\n",
301
+ " weights = tf.constant(\n",
302
+ " [class_weights[i] for i in range(len(class_weights))], dtype=tf.float32\n",
303
+ " )\n",
304
+ " weights = tf.gather(\n",
305
+ " weights, tf.cast(y_true, tf.int32)\n",
306
+ " ) # Get weights for true labels\n",
307
+ " loss = tf.keras.losses.sparse_categorical_crossentropy(\n",
308
+ " y_true, y_pred, from_logits=True\n",
309
+ " )\n",
310
+ " return loss * weights"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 61,
316
  "metadata": {},
317
  "outputs": [
318
  {
 
322
  "All PyTorch model weights were used when initializing TFCamembertForTokenClassification.\n",
323
  "\n",
324
  "Some weights or buffers of the TF 2.0 model TFCamembertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
325
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
 
326
  ]
327
  }
328
  ],
329
  "source": [
330
+ "from focal_loss import SparseCategoricalFocalLoss\n",
331
+ "\n",
332
  "camembert = TFAutoModelForTokenClassification.from_pretrained(\n",
333
  " \"camembert-base\", num_labels=len(unique_labels)\n",
334
  ")\n",
335
  "\n",
336
+ "loss_func = SparseCategoricalFocalLoss(\n",
337
+ " gamma=2, class_weight=[0.1, 2, 2], from_logits=True\n",
338
+ ")\n",
339
+ "\n",
340
  "camembert.compile(\n",
341
+ " optimizer=tf.keras.optimizers.legacy.Adam(5e-5),\n",
342
+ " loss=loss_func,\n",
343
+ " metrics=[\"accuracy\", entity_accuracy],\n",
344
  ")"
345
  ]
346
  },
347
  {
348
  "cell_type": "code",
349
+ "execution_count": 46,
350
  "metadata": {},
351
  "outputs": [],
352
  "source": [
353
+ "train_dataset = train_dataset.batch(32)\n",
354
+ "test_dataset = test_dataset.batch(32)"
355
  ]
356
  },
357
  {
358
  "cell_type": "code",
359
+ "execution_count": 62,
360
  "metadata": {},
361
  "outputs": [
362
  {
363
  "name": "stdout",
364
  "output_type": "stream",
365
  "text": [
366
+ "Epoch 1/4\n",
367
+ "272/272 [==============================] - 1596s 6s/step - loss: 0.0124 - accuracy: 0.9677 - entity_accuracy: 0.8099 - val_loss: 0.0038 - val_accuracy: 0.9799 - val_entity_accuracy: 0.9682\n",
368
+ "Epoch 2/4\n",
369
+ "272/272 [==============================] - 1560s 6s/step - loss: 0.0031 - accuracy: 0.9852 - entity_accuracy: 0.9684 - val_loss: 0.0019 - val_accuracy: 0.9885 - val_entity_accuracy: 0.9820\n",
370
+ "Epoch 3/4\n",
371
+ "272/272 [==============================] - 1560s 6s/step - loss: 0.0020 - accuracy: 0.9907 - entity_accuracy: 0.9767 - val_loss: 0.0016 - val_accuracy: 0.9941 - val_entity_accuracy: 0.9775\n",
372
+ "Epoch 4/4\n",
373
+ "272/272 [==============================] - 1605s 6s/step - loss: 0.0016 - accuracy: 0.9923 - entity_accuracy: 0.9789 - val_loss: 0.0017 - val_accuracy: 0.9920 - val_entity_accuracy: 0.9831\n"
374
  ]
375
  },
376
  {
377
  "data": {
378
  "text/plain": [
379
+ "<tf_keras.src.callbacks.History at 0x2dab031a0>"
380
  ]
381
  },
382
+ "execution_count": 62,
383
  "metadata": {},
384
  "output_type": "execute_result"
385
  }
 
390
  ")\n",
391
  "\n",
392
  "camembert.fit(\n",
393
+ " train_dataset, validation_data=test_dataset, epochs=4, callbacks=[callback]\n",
394
  ")"
395
  ]
396
  },
 
398
  "cell_type": "code",
399
  "execution_count": null,
400
  "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "<tf.Tensor: shape=(), dtype=float32, numpy=0.1186538115143776>"
406
+ ]
407
+ },
408
+ "execution_count": 54,
409
+ "metadata": {},
410
+ "output_type": "execute_result"
411
+ }
412
+ ],
413
+ "source": [
414
+ "from focal_loss import SparseCategoricalFocalLoss\n",
415
+ "\n",
416
+ "loss_func = SparseCategoricalFocalLoss(gamma=1)\n",
417
+ "y_true = [0, 1, 2]\n",
418
+ "y_pred = [[0.8, 0.1, 0.1], [0.2, 0.7, 0.1], [0.2, 0.2, 0.6]]\n",
419
+ "loss_func(y_true, y_pred)"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 63,
425
+ "metadata": {},
426
  "outputs": [],
427
  "source": [
428
+ "camembert.save_pretrained(\"./models/camembert\")"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "metadata": {},
435
+ "outputs": [
436
+ {
437
+ "name": "stderr",
438
+ "output_type": "stream",
439
+ "text": [
440
+ "tf_model.h5: 100%|██████████| 440M/440M [00:20<00:00, 21.8MB/s] \n"
441
+ ]
442
+ }
443
+ ],
444
+ "source": [
445
+ "# camembert.push_to_hub(\"CamemBERT-NER-Travel\")"
446
  ]
447
  }
448
  ],
conv_tagged_file_to_bio.py CHANGED
@@ -1,9 +1,21 @@
1
  from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
2
 
3
 
4
- INPUT_FILE = "./data/french_text/1k_unlabeled_samples.txt"
5
- OUTPUT_FILE = "./data/bio/fr.bio/1k_unlabeled_samples.bio"
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
8
 
9
- from_tagged_file_to_bio_file(INPUT_FILE, OUTPUT_FILE, tag_entities_pairs)
 
 
1
  from app.travel_resolver.libs.nlp.data_processing import from_tagged_file_to_bio_file
2
 
3
 
4
+ INPUT_FILES = [
5
+ "./data/scripting_lcs_1/1k_train_large_samples.txt",
6
+ "./data/scripting_lcs_1/10k_train_small_samples.txt",
7
+ "./data/scripting_lcs_1/100_eval_large_samples.txt",
8
+ "./data/scripting_lcs_1/800_eval_small_samples.txt",
9
+ ]
10
+
11
+ OUTPUT_FILES = [
12
+ "./data/bio/fr.bio/1k_train_large_samples.bio",
13
+ "./data/bio/fr.bio/10k_train_small_samples.bio",
14
+ "./data/bio/fr.bio/100_eval_large_samples.bio",
15
+ "./data/bio/fr.bio/800_eval_small_samples.bio",
16
+ ]
17
 
18
  tag_entities_pairs = [("<Dep>", "LOC-DEP"), ("<Arr>", "LOC-ARR")]
19
 
20
+ for i, input_file in enumerate(INPUT_FILES):
21
+ from_tagged_file_to_bio_file(input_file, OUTPUT_FILES[i], tag_entities_pairs)
deepl_ner.ipynb CHANGED
The diff for this file is too large to render. See raw diff