Az-r-ow committed on
Commit
e54bb7f
·
1 Parent(s): fd76209

WIP(ner): HMM for NER

Browse files
app/travel_resolver/libs/nlp/data_processing.py CHANGED
@@ -34,40 +34,35 @@ def get_tagged_content(sentence: str, tag: str) -> str | None:
34
  return None
35
 
36
 
37
- def process_sentence(sentence: str, dep_token="<Dep>", arr_token="<Arr>") -> tuple:
 
 
 
38
  """
39
- Given a sentence, extract the departure and arrival locations and tokenize the sentence.
40
- Then assign labels to the tokens based on whether they are part of the departure or arrival locations.
41
- Finally, return the inputs and labels will be returned.
42
 
43
- Args:
44
- sentence (str): The sentence to process.
45
- dep_token (str): The token to mark the departure location.
46
- arr_token (str): The token to mark the arrival location.
47
 
48
- Returns:
49
- tuple: A tuple containing the inputs and labels (inputs, labels).
 
 
 
 
50
  """
51
- bare_sentence = sentence.replace(dep_token, "").replace(arr_token, "")
52
- departure = get_tagged_content(sentence, dep_token)
53
- arrival = get_tagged_content(sentence, arr_token)
54
- tokenized_sentence = nltk.word_tokenize(bare_sentence)
55
- labels = []
56
- inputs = []
57
  for token in tokenized_sentence:
58
- if token in departure:
59
- departure_labels = [2] * len(token)
60
- labels.extend(departure_labels)
61
- elif token in arrival:
62
- arrival_labels = [3] * len(token)
63
- labels.extend(arrival_labels)
64
- else:
65
- default_labels = [1] * len(token)
66
- labels.extend(default_labels)
67
- int_chars = [ord(char) for char in token]
68
- inputs.extend(int_chars)
69
 
70
- return (inputs, labels)
71
 
72
 
73
  def convert_tagged_sentence_to_bio(
@@ -161,9 +156,12 @@ def from_tagged_file_to_bio_file(
161
  file.write(bio_format + "\n")
162
 
163
 
164
- def from_bio_file_to_examples(file_path: str) -> tuple:
165
  """
166
- Given a file path, read the file and convert the content to a tuple of inputs and labels.
 
 
 
167
 
168
  Args:
169
  file_path (str): The path to the file to read.
@@ -172,12 +170,13 @@ def from_bio_file_to_examples(file_path: str) -> tuple:
172
  tuple: A tuple containing the inputs and labels (inputs, labels).
173
  """
174
  stop_sentences = [".", "?", "!"]
 
175
 
176
  with open(file_path, "r") as file:
177
  content = file.read()
178
  lines = content.split("\n")
179
 
180
- inputs = []
181
  labels = []
182
 
183
  unique_labels = set()
@@ -187,6 +186,11 @@ def from_bio_file_to_examples(file_path: str) -> tuple:
187
  if (len(line.split(" "))) < 2:
188
  continue
189
  word, label = line.split(" ")
 
 
 
 
 
190
  unique_labels.add(label)
191
 
192
  unique_labels = list(unique_labels)
@@ -195,25 +199,34 @@ def from_bio_file_to_examples(file_path: str) -> tuple:
195
  unique_labels = sorted(unique_labels, key=lambda x: (x != "O", x))
196
 
197
  # mapping labels to ids
198
- unique_labels = {label: i + 1 for i, label in enumerate(unique_labels)}
 
 
 
199
 
200
- sentence_logits = []
201
  sentence_labels = []
202
  for line in lines:
203
  if (len(line.split(" "))) < 2:
204
  continue
205
  word, label = line.split(" ")
206
- ascii_code_chars = [ord(char) for char in word]
207
- chars_labels = [unique_labels[label]] * len(ascii_code_chars)
208
- sentence_logits.extend(ascii_code_chars)
209
- sentence_labels.extend(chars_labels)
 
 
 
 
 
 
210
  if word in stop_sentences:
211
- inputs.append(sentence_logits)
212
  labels.append(sentence_labels)
213
- sentence_logits = []
214
  sentence_labels = []
215
 
216
- return (inputs, labels)
217
 
218
 
219
  def from_examples_to_tf_dataset(
 
34
  return None
35
 
36
 
37
+ def process_sentence(
38
+ sentence: str,
39
+ stemming: bool = False,
40
+ ) -> str:
41
  """
42
+ Given a sentence, apply some processing techniques to the sentence and return the processed sentence
 
 
43
 
44
+ **Note**: We are stemming the tokens instead of lemmatizing them because stemming is faster and in our case
45
+ we are interested in getting a response the fastest way possible.
 
 
46
 
47
+ Args:
48
+ sentence (str): The sentence to process.
49
+ stemming (bool): Whether to stem the tokens.
50
+
51
+ Returns:
52
+ str: The processed sentence
53
  """
54
+ tokenized_sentence = nltk.word_tokenize(sentence)
55
+ stemmer = nltk.stem.snowball.FrenchStemmer()
56
+
57
+ processed_sentence = ""
58
+
 
59
  for token in tokenized_sentence:
60
+ token = token if not stemming else stemmer.stem(token)
61
+ processed_sentence += token + " "
62
+
63
+ processed_sentence = processed_sentence.strip()
 
 
 
 
 
 
 
64
 
65
+ return processed_sentence
66
 
67
 
68
  def convert_tagged_sentence_to_bio(
 
156
  file.write(bio_format + "\n")
157
 
158
 
159
+ def from_bio_file_to_examples(file_path: str, process_sentence: bool = False) -> tuple:
160
  """
161
+ Given a file path, read the file and convert the content to a tuple of sentences and their respective labels vectors.
162
+
163
+ **Note**: We are stemming the tokens instead of lemmatizing them because stemming is faster and in our case
164
+ we are interested in getting a response the fastest way possible.
165
 
166
  Args:
167
  file_path (str): The path to the file to read.
 
170
  tuple: A tuple containing the inputs and labels (inputs, labels).
171
  """
172
  stop_sentences = [".", "?", "!"]
173
+ stemmer = nltk.stem.snowball.FrenchStemmer()
174
 
175
  with open(file_path, "r") as file:
176
  content = file.read()
177
  lines = content.split("\n")
178
 
179
+ sentences = []
180
  labels = []
181
 
182
  unique_labels = set()
 
186
  if (len(line.split(" "))) < 2:
187
  continue
188
  word, label = line.split(" ")
189
+ label = (
190
+ "-".join(label.split("-")[-2:])
191
+ if label.startswith("B") or label.startswith("I")
192
+ else label
193
+ )
194
  unique_labels.add(label)
195
 
196
  unique_labels = list(unique_labels)
 
199
  unique_labels = sorted(unique_labels, key=lambda x: (x != "O", x))
200
 
201
  # mapping labels to ids
202
+ unique_labels = {label: i for i, label in enumerate(unique_labels)}
203
+
204
+ # tracking the vocabulary
205
+ vocab = set()
206
 
207
+ sentence_words = []
208
  sentence_labels = []
209
  for line in lines:
210
  if (len(line.split(" "))) < 2:
211
  continue
212
  word, label = line.split(" ")
213
+ label = (
214
+ "-".join(label.split("-")[-2:])
215
+ if label.startswith("B") or label.startswith("I")
216
+ else label
217
+ )
218
+ word = word if not process_sentence else stemmer.stem(word)
219
+ label = unique_labels[label]
220
+ sentence_words.append(word)
221
+ sentence_labels.append(label)
222
+ vocab.add(word)
223
  if word in stop_sentences:
224
+ sentences.append(" ".join(sentence_words))
225
  labels.append(sentence_labels)
226
+ sentence_words = []
227
  sentence_labels = []
228
 
229
+ return (sentences, labels, vocab, unique_labels)
230
 
231
 
232
  def from_examples_to_tf_dataset(
data/scripting_lcs_1/script.py CHANGED
@@ -56,7 +56,7 @@ def get_cities() -> List:
56
  """
57
 
58
  cities = []
59
- with open("../sncf_stations_database.csv", "r") as csvfile:
60
  reader = csv.DictReader(csvfile, delimiter=";")
61
  for row in reader:
62
  cities.append(row["COMMUNE"])
 
56
  """
57
 
58
  cities = []
59
+ with open("../sncf/sncf_stations_database.csv", "r") as csvfile:
60
  reader = csv.DictReader(csvfile, delimiter=";")
61
  for row in reader:
62
  cities.append(row["COMMUNE"])
hmm_ner.ipynb ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Hidden Markov Model for NER\n"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "[nltk_data] Downloading package punkt_tab to /Users/az-r-\n",
20
+ "[nltk_data] ow/nltk_data...\n",
21
+ "[nltk_data] Package punkt_tab is already up-to-date!\n"
22
+ ]
23
+ }
24
+ ],
25
+ "source": [
26
+ "from app.travel_resolver.libs.nlp.data_processing import from_bio_file_to_examples\n",
27
+ "\n",
28
+ "BIO_FILE = \"data/bio/fr.bio/1k_samples.bio\"\n",
29
+ "\n",
30
+ "sentences, labels, vocab, unique_labels = from_bio_file_to_examples(\n",
31
+ " BIO_FILE, process_sentence=True\n",
32
+ ")"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 3,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "def t2_given_t1(\n",
42
+ " t2: str | int,\n",
43
+ " t1: str | int,\n",
44
+ " train_bag=labels,\n",
45
+ " unique_labels_mapping: dict = unique_labels,\n",
46
+ "):\n",
47
+ " \"\"\"\n",
48
+ " Get the probability of getting t2 given t1 in the given labels\n",
49
+ "\n",
50
+ " Args:\n",
51
+ " t2: str | int, the second tag\n",
52
+ " t1: str | int, the first tag\n",
53
+ " train_bag: list, the list of labels\n",
54
+ "\n",
55
+ " Returns:\n",
56
+ " float, the probability of getting t2 given t1\n",
57
+ " \"\"\"\n",
58
+ " t1 = t1 if isinstance(t1, int) else unique_labels_mapping[t1]\n",
59
+ " t2 = t2 if isinstance(t2, int) else unique_labels_mapping[t2]\n",
60
+ " count_t1 = 0\n",
61
+ " count_t2_t1 = 0\n",
62
+ " for row in train_bag:\n",
63
+ " for index in range(len(row) - 1):\n",
64
+ " if row[index] == t1:\n",
65
+ " count_t1 += 1\n",
66
+ " if row[index] == t1 and row[index + 1] == t2:\n",
67
+ " count_t2_t1 += 1\n",
68
+ " return count_t2_t1 / count_t1"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "markdown",
73
+ "metadata": {},
74
+ "source": [
75
+ "In the next part, we compute the **transition matrix**, which represents the _probability_ of transitioning from one state to another, $P(S_2 | S_1)$. In our case this would be, for example, $P(O | \\text{LOC-ARR})$.\n"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 17,
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/plain": [
86
+ "array([[0.81054505, 0.93784787, 0.93186004],\n",
87
+ " [0.09468065, 0.06215213, 0. ],\n",
88
+ " [0.0947743 , 0. , 0.06813996]])"
89
+ ]
90
+ },
91
+ "execution_count": 17,
92
+ "metadata": {},
93
+ "output_type": "execute_result"
94
+ }
95
+ ],
96
+ "source": [
97
+ "import numpy as np\n",
98
+ "\n",
99
+ "tags = list(unique_labels.keys())\n",
100
+ "n_tags = len(tags)\n",
101
+ "\n",
102
+ "trans_matrix = np.zeros((n_tags, n_tags))\n",
103
+ "\n",
104
+ "for t1 in range(n_tags):\n",
105
+ " for t2 in range(n_tags):\n",
106
+ " trans_matrix[t1][t2] = t2_given_t1(tags[t1], tags[t2])\n",
107
+ "\n",
108
+ "trans_matrix"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 19,
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "data": {
118
+ "text/html": [
119
+ "<div>\n",
120
+ "<style scoped>\n",
121
+ " .dataframe tbody tr th:only-of-type {\n",
122
+ " vertical-align: middle;\n",
123
+ " }\n",
124
+ "\n",
125
+ " .dataframe tbody tr th {\n",
126
+ " vertical-align: top;\n",
127
+ " }\n",
128
+ "\n",
129
+ " .dataframe thead th {\n",
130
+ " text-align: right;\n",
131
+ " }\n",
132
+ "</style>\n",
133
+ "<table border=\"1\" class=\"dataframe\">\n",
134
+ " <thead>\n",
135
+ " <tr style=\"text-align: right;\">\n",
136
+ " <th></th>\n",
137
+ " <th>O</th>\n",
138
+ " <th>LOC-ARR</th>\n",
139
+ " <th>LOC-DEP</th>\n",
140
+ " </tr>\n",
141
+ " </thead>\n",
142
+ " <tbody>\n",
143
+ " <tr>\n",
144
+ " <th>O</th>\n",
145
+ " <td>0.810545</td>\n",
146
+ " <td>0.937848</td>\n",
147
+ " <td>0.93186</td>\n",
148
+ " </tr>\n",
149
+ " <tr>\n",
150
+ " <th>LOC-ARR</th>\n",
151
+ " <td>0.094681</td>\n",
152
+ " <td>0.062152</td>\n",
153
+ " <td>0.00000</td>\n",
154
+ " </tr>\n",
155
+ " <tr>\n",
156
+ " <th>LOC-DEP</th>\n",
157
+ " <td>0.094774</td>\n",
158
+ " <td>0.000000</td>\n",
159
+ " <td>0.06814</td>\n",
160
+ " </tr>\n",
161
+ " </tbody>\n",
162
+ "</table>\n",
163
+ "</div>"
164
+ ],
165
+ "text/plain": [
166
+ " O LOC-ARR LOC-DEP\n",
167
+ "O 0.810545 0.937848 0.93186\n",
168
+ "LOC-ARR 0.094681 0.062152 0.00000\n",
169
+ "LOC-DEP 0.094774 0.000000 0.06814"
170
+ ]
171
+ },
172
+ "execution_count": 19,
173
+ "metadata": {},
174
+ "output_type": "execute_result"
175
+ }
176
+ ],
177
+ "source": [
178
+ "import pandas as pd\n",
179
+ "\n",
180
+ "trans_matrix_df = pd.DataFrame(trans_matrix, columns=tags, index=tags)\n",
181
+ "\n",
182
+ "trans_matrix_df"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 34,
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "image/png": "",
193
+ "text/plain": [
194
+ "<Figure size 1800x1200 with 2 Axes>"
195
+ ]
196
+ },
197
+ "metadata": {},
198
+ "output_type": "display_data"
199
+ }
200
+ ],
201
+ "source": [
202
+ "import matplotlib.pyplot as plt\n",
203
+ "import seaborn as sns\n",
204
+ "\n",
205
+ "plt.figure(figsize=(18, 12))\n",
206
+ "\n",
207
+ "sns.set(font_scale=1.9)\n",
208
+ "\n",
209
+ "sns.heatmap(trans_matrix_df, annot=True, square=True, annot_kws={\"fontsize\": 20})\n",
210
+ "\n",
211
+ "plt.title(\"Transition Matrix\")\n",
212
+ "\n",
213
+ "plt.show()"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": 48,
219
+ "metadata": {},
220
+ "outputs": [],
221
+ "source": [
222
+ "def get_emission_prob(sentences=sentences, labels=labels, unique_labels=unique_labels):\n",
223
+ " tags = list(unique_labels.keys())\n",
224
+ " word_label_count = {}\n",
225
+ " sample_count = {tag: 0 for tag in tags}\n",
226
+ " for i in range(len(sentences)):\n",
227
+ " for word, label in zip(sentences[i].split(\" \"), labels[i]):\n",
228
+ " if word not in word_label_count:\n",
229
+ " word_label_count[word] = sample_count.copy()\n",
230
+ " else:\n",
231
+ " word_label_count[word][tags[label]] += 1\n",
232
+ " return word_label_count"
233
+ ]
234
+ }
235
+ ],
236
+ "metadata": {
237
+ "kernelspec": {
238
+ "display_name": "venv",
239
+ "language": "python",
240
+ "name": "python3"
241
+ },
242
+ "language_info": {
243
+ "codemirror_mode": {
244
+ "name": "ipython",
245
+ "version": 3
246
+ },
247
+ "file_extension": ".py",
248
+ "mimetype": "text/x-python",
249
+ "name": "python",
250
+ "nbconvert_exporter": "python",
251
+ "pygments_lexer": "ipython3",
252
+ "version": "3.12.4"
253
+ }
254
+ },
255
+ "nbformat": 4,
256
+ "nbformat_minor": 2
257
+ }
requirements.txt CHANGED
@@ -1,4 +1,7 @@
1
  nltk==3.9.1
2
  numpy
3
  tensorflow==2.17.0
4
- tqdm==4.66.5
 
 
 
 
1
  nltk==3.9.1
2
  numpy
3
  tensorflow==2.17.0
4
+ tqdm==4.66.5
5
+ seaborn
6
+ pillow
7
+ matplotlib