TymaaHammouda commited on
Commit
cfe897e
·
1 Parent(s): 2b51d25

Update ner service

Browse files
Files changed (5) hide show
  1. IBO_to_XML.py +135 -0
  2. NER_Distiller.py +138 -0
  3. XML_to_HTML.py +32 -0
  4. app.py +118 -35
  5. requirements.txt +1 -2
IBO_to_XML.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function take a list a set of annotated entities, in this format: [["صرح","O"],
4
+ # ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
5
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
6
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
7
+ # after that it will return text of xml in this fomrat: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> يوم في <ORG>
8
+ # جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
9
+ # This function assume the input is correct and each tag must start with B- or I-, not empty tag and discard all tags
10
+ # start with ignore I- tags if they don’t have B-tags.
11
+ import numpy as np
12
+
13
+
14
+ def IBO_to_XML(temp):
15
+ xml_output = ""
16
+
17
+ temp_entities = sortTags(temp)
18
+
19
+ temp_list = list()
20
+
21
+ # initlize the temp_list
22
+ temp_list.append("")
23
+ word_position = 0
24
+
25
+ # For each entity, convert ibo to xml list.
26
+ for entity in temp_entities:
27
+ counter_tag = 0
28
+ # For each tag
29
+ for tag in str(entity[1]).split():
30
+
31
+ # If the counter tag greater than or equal to lenght of templist, if yes then we will append the empt value in templist
32
+ if counter_tag >= len(temp_list):
33
+ temp_list.append("")
34
+
35
+ # If the tag is equal O then and word position not equal zero then add all from templist to output ist
36
+ if "O" == tag and word_position != 0:
37
+ for j in range(len(temp_list),0,-1):
38
+ if temp_list[j-1]!= "":
39
+ xml_output+=" </"+str(temp_list[j-1])+">"
40
+ temp_list[j-1] = ""
41
+
42
+ # if its not equal O and its correct tag like B-tag or I-tag and its B
43
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
44
+ # if the templist of counter tag is not empty then we need add xml word that contains
45
+ # </name of previous tag> its mean that we closed the tag in xml in xml_output
46
+ if temp_list[counter_tag] != "":
47
+ xml_output+=" </"+str(temp_list[counter_tag])+">"
48
+ # After that we replace the previous tag from templist in new tag
49
+ temp_list[counter_tag] = str(tag).split("-")[1]
50
+ # And add xml word that contains <name of new tag> its mean we open the tag in xml in xml_output
51
+ xml_output += " <" + str(temp_list[counter_tag]) + ">"
52
+
53
+
54
+
55
+ # if its not equal O and its correct tag like B-tag or I-tag and its i and not first word postion
56
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
57
+ # we need to check if this tag like previous tag
58
+ for j in range(counter_tag,len(temp_list)):
59
+ # if its equal then will break the loop and continue
60
+ if temp_list[j] == tag[2:]:
61
+ break
62
+ # if not then we need to add xml word to close the tag like </name of previous> in xml_output
63
+ else:
64
+ if temp_list[j] != "":
65
+ xml_output+=" </"+str(temp_list[j])+">"
66
+ temp_list[j] = ""
67
+ counter_tag += 1
68
+ word_position += 1
69
+ # Add word in xml_output
70
+ xml_output +=" "+str(entity[0])
71
+ # Add all xml words in xml_output
72
+ for j in range(0, len(temp_list)):
73
+ if temp_list[j] != "":
74
+ xml_output+=" </"+str(temp_list[j])+">"
75
+ return xml_output.strip()
76
+
77
+
78
def sortTags(entities):
    """Normalise and order the IOB tags of each [word, tags] pair.

    For every word (after the first) this: (1) adds a missing I- tag when the
    previous word has more open entities of that type than this word
    continues; (2) sorts the tags so I- tags come before B- tags; (3) aligns
    the I- tags with the positions of the tags they continue on the previous
    word.

    NOTE: mutates `entities` in place (no copy is made) and also returns it.
    """
    temp_entities = entities
    temp_counter = 0
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only from the second word on — the first word has no
            # predecessor to compare against.
            if temp_counter != 0:
                # For a continuation (I-) tag, count occurrences of the same
                # entity type on this word and on the previous word.
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter - 1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # The previous word opened more entities of this type than
                    # this word continues: add the missing I- tag.
                    # NOTE(review): appends to `tags` while iterating it —
                    # newly added tags are also visited; verify intended.
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort then reverse => descending order, so I- tags ('I' > 'B')
        # precede B- tags.
        tags.sort()
        tags.reverse()
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only reorder when neither word is entirely outside entities.
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Place each I- tag at the position of the previous word's tag
                # it continues; stop scanning at the first B- tag so new
                # entities keep their relative order.
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                # Whatever was not aligned keeps its order at the end.
                sorted_tags += this_tags
                tags = sorted_tags
        # Re-join the tags into the space-separated string form.
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
NER_Distiller.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # By Wasim Khatib
2
+ # Version 2.0
3
+ # This function take a list a set of annotated entities, in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
4
+ # ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
5
+ # ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
6
+ # after that it will return array of the set of distilled entities and their positions (start, end) and tags, such as
7
+ # [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
8
+ # [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
9
def distill_entities(entities):
    """Extract distilled entity spans from IOB-annotated [word, tags] pairs.

    Returns a list of [entity_text, tag, start, end] entries sorted by start
    position, e.g. for nested input:
        [["رئيس نقابة العاملين في جامعة بيرزيت", "OCC", 1, 6], ...]

    NOTE(review): the header example in the original shows end positions one
    past the trace of this code (e.g. 7 vs 6) — the end index looks inclusive
    here; confirm against callers.
    """
    # Collected [text, tag, start, end] results.
    list_output = list()

    # Normalise/align the tags first (mutates `entities` in place).
    temp_entities = sortTags(entities)

    # temp_list[i] is the in-progress entity at nesting depth i:
    # [accumulated text, tag type ("" = empty slot), start pos, end pos].
    temp_list = list()
    temp_list.append(["", "", 0, 0])
    word_position = 0

    for entity in temp_entities:
        counter_tag = 0  # nesting depth of the tag currently being processed
        for tag in str(entity[1]).split():
            # Grow the slot list when this word carries more tags than any
            # previous word did.
            if counter_tag >= len(temp_list):
                temp_list.append(["", "", 0, 0])

            if "O" == tag and word_position != 0:
                # Outside any entity: flush every non-empty slot to the
                # output and reset it.
                for j in range(0, len(temp_list)):
                    if temp_list[j][1] != "":
                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                    temp_list[j][0] = ""
                    temp_list[j][1] = ""
                    temp_list[j][2] = word_position
                    temp_list[j][3] = word_position

            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
                # A new entity starts at this depth: flush whatever was open
                # in this slot, then start accumulating the new one.
                if temp_list[counter_tag][1] != "":
                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
                temp_list[counter_tag][0] = str(entity[0]) + " "
                temp_list[counter_tag][1] = str(tag).split("-")[1]
                temp_list[counter_tag][2] = word_position
                temp_list[counter_tag][3] = word_position

            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
                # Continuation tag: find the matching open slot (not already
                # extended by this word) and append the word to it; flush and
                # reset any non-matching slots scanned on the way.
                for j in range(counter_tag, len(temp_list)):
                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
                        temp_list[j][0] += str(entity[0]) + " "
                        temp_list[j][3] += 1
                        break
                    else:
                        if temp_list[j][1] != "":
                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                        temp_list[j][0] = ""
                        temp_list[j][1] = ""
                        temp_list[j][2] = word_position
                        temp_list[j][3] = word_position

            counter_tag += 1
        word_position += 1

    # Flush entities still open at the end of the input.
    for j in range(0, len(temp_list)):
        if temp_list[j][1] != "":
            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
    return sorted(list_output, key=lambda x: (x[2]))
80
+
81
def sortTags(entities):
    """Normalise and order the IOB tags of each [word, tags] pair.

    For every word (after the first) this: (1) adds a missing I- tag when the
    previous word has more open entities of that type than this word
    continues; (2) sorts the tags so I- tags come before B- tags; (3) aligns
    the I- tags with the positions of the tags they continue on the previous
    word.

    NOTE: mutates `entities` in place (no copy is made) and also returns it.
    NOTE(review): duplicated in IBO_to_XML.py — consider a shared module.
    """
    temp_entities = entities
    temp_counter = 0
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only from the second word on — the first word has no
            # predecessor to compare against.
            if temp_counter != 0:
                # For a continuation (I-) tag, count occurrences of the same
                # entity type on this word and on the previous word.
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter - 1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # The previous word opened more entities of this type than
                    # this word continues: add the missing I- tag.
                    # NOTE(review): appends to `tags` while iterating it —
                    # newly added tags are also visited; verify intended.
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort then reverse => descending order, so I- tags ('I' > 'B')
        # precede B- tags.
        tags.sort()
        tags.reverse()
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only reorder when neither word is entirely outside entities.
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Place each I- tag at the position of the previous word's tag
                # it continues; stop scanning at the first B- tag so new
                # entities keep their relative order.
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                # Whatever was not aligned keeps its order at the end.
                sorted_tags += this_tags
                tags = sorted_tags
        # Re-join the tags into the space-separated string form.
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
XML_to_HTML.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def NER_XML_to_HTML(xml):
    """Rewrite NER XML tags (<PERS>, <ORG>, ...) as styled HTML spans.

    Every opening entity tag becomes
    '<span class="ner_..." data-entity="...">' and every closing tag
    becomes '</span>'.
    """
    # WORK_OF_ART's underscores would not match the generic </[A-Z]+>
    # closing-tag pattern below, so normalise the name first.
    html = xml.replace('WORK_OF_ART', 'WORKOFART')

    # Collapse every closing tag into a single </span>.
    html = re.sub(r'</[A-Z]+>', '</span>', html)

    # (xml tag, css class, data-entity value) for each opening tag.
    tag_table = [
        ('PERS', 'ner_pers', 'PERS'),
        ('GROUP', 'ner_group', 'NORP'),
        ('OCC', 'ner_occ', 'OCC'),
        ('ORG', 'ner_org', 'ORG'),
        ('LOC', 'ner_loc', 'LOC'),
        ('GPE', 'ner_gpe', 'GPE'),
        ('FAC', 'ner_fac', 'FAC'),
        ('EVENT', 'ner_event', 'EVENT'),
        ('DATE', 'ner_date', 'DATE'),
        ('TIME', 'ner_time', 'TIME'),
        ('CARDINAL', 'ner_cardinal', 'CARDINAL'),
        ('ORDINAL', 'ner_ordinal', 'ORDINAL'),
        ('PERCENT', 'ner_percent', 'PERCENT'),
        ('QUANTITY', 'ner_quantity', 'QUANTITY'),
        ('UNIT', 'ner_unit', 'UNIT'),
        ('MONEY', 'ner_money', 'MONEY'),
        ('CURR', 'ner_currency', 'CURRENCY'),
        ('LANGUAGE', 'ner_language', 'LANGUAGE'),
        ('PRODUCT', 'ner_product', 'PRODUCT'),
        ('WORKOFART', 'ner_work_of_art', 'WORK_OF_ART'),
        ('LAW', 'ner_law', 'LAW'),
    ]
    # All opening-tag patterns are literal and mutually exclusive, so plain
    # string replacement in any order matches the original re.sub chain.
    for tag_name, css_class, entity in tag_table:
        opening = '<span class="' + css_class + '" data-entity="' + entity + '">'
        html = html.replace('<' + tag_name + '>', opening)
    return html
app.py CHANGED
@@ -11,7 +11,9 @@ from Nested.utils.data import get_dataloaders, text2segments
11
  import json
12
  from pydantic import BaseModel
13
  from fastapi.responses import JSONResponse
14
- from sinatools.utils.tokenizer import sentence_tokenizer
 
 
15
 
16
  app = FastAPI()
17
  print("Version 2...")
@@ -56,7 +58,7 @@ id2label = {i: s for i, s in enumerate(label_vocab.itos)}
56
 
57
  def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
58
  # Split the text into words
59
- words = simple_word_tokenize(sentence)
60
 
61
  # Initialize variables
62
  groups = []
@@ -83,6 +85,118 @@ def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
83
 
84
  return groups
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  class NERRequest(BaseModel):
87
  text: str
88
  mode: str
@@ -90,8 +204,6 @@ class NERRequest(BaseModel):
90
  @app.post("/predict")
91
  def predict(request: NERRequest):
92
  # Load tagger
93
- tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
94
-
95
  text = request.text
96
  mode = request.mode
97
 
@@ -103,37 +215,8 @@ def predict(request: NERRequest):
103
  for sentence in sentences:
104
  se = split_text_into_groups_of_Ns(sentence, max_words_per_sentence=300)
105
  for s in se:
106
- dataset, token_vocab = text2segments(sentence)
107
-
108
- vocabs = namedtuple("Vocab", ["tags", "tokens"])
109
- vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
110
-
111
- dataloader = get_dataloaders(
112
- (dataset,),
113
- vocab,
114
- args_data,
115
- batch_size=32,
116
- shuffle=(False,),
117
- )[0]
118
-
119
- segments = tagger.infer(dataloader)
120
-
121
- # lists = []
122
-
123
- for segment in segments:
124
- for token in segment:
125
- item = {}
126
- item["token"] = token.text
127
-
128
- list_of_tags = [t["tag"] for t in token.pred_tag]
129
- list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
130
-
131
- if not list_of_tags:
132
- item["tags"] = ["O"]
133
- else:
134
- item["tags"] = list_of_tags
135
-
136
- lists.append(item)
137
 
138
  content = {
139
  "resp": lists,
 
11
  import json
12
  from pydantic import BaseModel
13
  from fastapi.responses import JSONResponse
14
+ from IBO_to_XML import IBO_to_XML
15
+ from XML_to_HTML import NER_XML_to_HTML
16
+ from NER_Distiller import distill_entities
17
 
18
  app = FastAPI()
19
  print("Version 2...")
 
58
 
59
  def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
60
  # Split the text into words
61
+ words = sentence.split()
62
 
63
  # Initialize variables
64
  groups = []
 
85
 
86
  return groups
87
 
88
+
89
+
90
def remove_empty_values(sentences):
    """Return a copy of *sentences* with empty-string entries dropped."""
    kept = []
    for sentence in sentences:
        if sentence != '':
            kept.append(sentence)
    return kept
92
+
93
+
94
+ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
95
+ separators = []
96
+ split_text = [text]
97
+ if new_line==True:
98
+ separators.append('\n')
99
+ if dot==True:
100
+ separators.append('.')
101
+ if question_mark==True:
102
+ separators.append('?')
103
+ separators.append('؟')
104
+ if exclamation_mark==True:
105
+ separators.append('!')
106
+
107
+ for sep in separators:
108
+ new_split_text = []
109
+ for part in split_text:
110
+ tokens = part.split(sep)
111
+ tokens_with_separator = [token + sep for token in tokens[:-1]]
112
+ tokens_with_separator.append(tokens[-1].strip())
113
+ new_split_text.extend(tokens_with_separator)
114
+ split_text = new_split_text
115
+
116
+ split_text = remove_empty_values(split_text)
117
+ return split_text
118
+
119
def jsons_to_list_of_lists(json_list):
    """Convert [{'token': t, 'tags': g}, ...] dicts into [[t, g], ...] pairs."""
    pairs = []
    for item in json_list:
        pairs.append([item['token'], item['tags']])
    return pairs
121
+
122
+ tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
123
+
124
def extract(sentence):
    """Run the NER tagger on *sentence* and return per-token predictions.

    Returns a list of {"token": str, "tags": [str, ...]} dicts; a token with
    no predicted entity tag gets the single tag "O".  Uses the module-level
    `tagger`, `tag_vocab` and `args_data` loaded at import time.
    """
    dataset, token_vocab = text2segments(sentence)

    # Bundle the token and tag vocabularies the way the dataloader expects.
    Vocab = namedtuple("Vocab", ["tags", "tokens"])
    vocab = Vocab(tokens=token_vocab, tags=tag_vocab)

    dataloader = get_dataloaders(
        (dataset,),
        vocab,
        args_data,
        batch_size=32,
        shuffle=(False,),
    )[0]

    predictions = []
    for segment in tagger.infer(dataloader):
        for token in segment:
            # Keep only real entity tags; drop "O" and blank placeholders.
            raw_tags = [t["tag"] for t in token.pred_tag]
            entity_tags = [tag for tag in raw_tags if tag not in ("O", " ", "")]
            predictions.append({
                "token": token.text,
                "tags": entity_tags if entity_tags else ["O"],
            })
    return predictions
156
+
157
+
158
def NER(sentence, mode):
    """Run NER on *sentence* and format the result according to *mode*.

    mode "1": list of [token, tags] pairs (IOB format).
    mode "2": XML string with entity tags.
    mode "3": HTML string with styled <span> elements.
    mode "4": distilled entity list [[text, tag, start, end], ...].
    Any other mode returns None (preserving the original fall-through).

    Cleanup vs. the original: the `if output_list != []:` / `if xml != "":`
    branches were dead code (both locals were freshly initialised empty just
    above the checks), and the debug prints are removed.
    """
    mode = mode.strip()
    if mode not in ("1", "2", "3", "4"):
        return None

    # Every mode starts from the raw token/tag predictions.
    output_list = jsons_to_list_of_lists(extract(sentence))
    if mode == "1":
        return output_list
    if mode == "4":
        return distill_entities(output_list)

    # Modes "2" and "3" both need the XML form.
    xml = IBO_to_XML(output_list)
    if mode == "2":
        return xml
    return NER_XML_to_HTML(xml)  # mode == "3"
197
+
198
+
199
+
200
  class NERRequest(BaseModel):
201
  text: str
202
  mode: str
 
204
  @app.post("/predict")
205
  def predict(request: NERRequest):
206
  # Load tagger
 
 
207
  text = request.text
208
  mode = request.mode
209
 
 
215
  for sentence in sentences:
216
  se = split_text_into_groups_of_Ns(sentence, max_words_per_sentence=300)
217
  for s in se:
218
+ output_list = NER(s, mode)
219
+ lists.append(output_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
  content = {
222
  "resp": lists,
requirements.txt CHANGED
@@ -5,5 +5,4 @@ numpy
5
  huggingface_hub
6
  transformers
7
  natsort
8
- seqeval
9
- sinatools
 
5
  huggingface_hub
6
  transformers
7
  natsort
8
+ seqeval