Commit cfe897e · Parent: 2b51d25
Update ner service

Files changed:
- IBO_to_XML.py +135 -0
- NER_Distiller.py +138 -0
- XML_to_HTML.py +32 -0
- app.py +118 -35
- requirements.txt +1 -2
IBO_to_XML.py ADDED

@@ -0,0 +1,135 @@
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"],
# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"], ["الخميس","I-DATE"]]
# and returns XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> في <ORG>
# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
# This function assumes the input is correct: each tag must start with B- or I- and may not be empty;
# I- tags that do not have a matching B- tag are discarded.
import numpy as np


def IBO_to_XML(temp):
    xml_output = ""

    temp_entities = sortTags(temp)

    temp_list = list()

    # Initialize the temp_list
    temp_list.append("")
    word_position = 0

    # For each entity, convert the IOB tags to XML.
    for entity in temp_entities:
        counter_tag = 0
        # For each tag
        for tag in str(entity[1]).split():

            # If counter_tag is greater than or equal to the length of temp_list, append an empty slot to temp_list
            if counter_tag >= len(temp_list):
                temp_list.append("")

            # If the tag is O and the word position is not zero, close every open tag from temp_list in the output
            if "O" == tag and word_position != 0:
                for j in range(len(temp_list), 0, -1):
                    if temp_list[j-1] != "":
                        xml_output += " </" + str(temp_list[j-1]) + ">"
                        temp_list[j-1] = ""

            # If the tag is not O, is a well-formed B-tag or I-tag, and starts with B
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
                # If the slot at counter_tag is not empty, append the closing tag
                # </name of previous tag> to xml_output, i.e. close the previous tag
                if temp_list[counter_tag] != "":
                    xml_output += " </" + str(temp_list[counter_tag]) + ">"
                # Then replace the previous tag in temp_list with the new one
                temp_list[counter_tag] = str(tag).split("-")[1]
                # And append the opening tag <name of new tag> to xml_output
                xml_output += " <" + str(temp_list[counter_tag]) + ">"

            # If the tag is not O, is a well-formed B-tag or I-tag, starts with I, and this is not the first word position
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
                # Check whether this tag matches an open tag
                for j in range(counter_tag, len(temp_list)):
                    # If it matches, break the loop and continue
                    if temp_list[j] == tag[2:]:
                        break
                    # If not, append the closing tag </name of previous tag> to xml_output
                    else:
                        if temp_list[j] != "":
                            xml_output += " </" + str(temp_list[j]) + ">"
                            temp_list[j] = ""
            counter_tag += 1
        word_position += 1
        # Append the word to xml_output
        xml_output += " " + str(entity[0])
    # Close any tags still open at the end
    for j in range(0, len(temp_list)):
        if temp_list[j] != "":
            xml_output += " </" + str(temp_list[j]) + ">"
    return xml_output.strip()


def sortTags(entities):
    temp_entities = entities
    temp_counter = 0
    # For each entity, this loop sorts its tags: first it checks whether the previous
    # entity's tags have more occurrences of a tag type than this one, then it sorts
    # the tags and checks that they are consistent
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only applies from the second entity onwards
            if temp_counter != 0:
                # If this tag starts with I-, count how many times the tag type occurs
                # in this entity's tags and in the previous entity's tags
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter-1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # If the previous entity has more occurrences than this one,
                    # add an I- tag to this entity's tags
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort the tags
        tags.sort()
        # Reverse the tags because they should begin with I-
        tags.reverse()
        # Only applies from the second entity onwards
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only continue if neither this entity's tags nor the previous entity's
            # tags contain O; otherwise leave the tags as they are
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Order this entity's I- tags to match the previous entity's tags; B- tags are left in place
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                sorted_tags += this_tags
                tags = sorted_tags
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
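Usage sketch (illustrative, not part of the commit), feeding the example input from the header comment through the converter. Note that sortTags rewrites each entity's tag string in place, so the input list is mutated:

from IBO_to_XML import IBO_to_XML

entities = [["صرح", "O"], ["رئيس", "B-OCC"], ["نقابة", "B-OCC B-ORG"],
            ["العاملين", "I-OCC B-ORG"], ["في", "I-OCC I-ORG"],
            ["جامعة", "I-OCC I-ORG B-ORG"], ["بيرزيت", "I-OCC I-ORG I-ORG B-LOC"],
            ["ان", "O"], ["غدا", "O"], ["هو", "O"],
            ["يوم", "B-DATE"], ["الخميس", "I-DATE"]]

# Expect nested XML along the lines of the header comment's example,
# ending with ... <DATE> يوم الخميس </DATE>
print(IBO_to_XML(entities))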
NER_Distiller.py ADDED

@@ -0,0 +1,138 @@
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"], ["الخميس","I-DATE"]]
# and returns an array of the distilled entities with their tags and positions (start, end), such as
# [["رئيس نقابة العاملين في جامعة بيرزيت", OCC, 1, 7],
# ["نقابة العاملين في جامعة بيرزيت", ORG, 2, 7], ["جامعة بيرزيت", ORG, 5, 7], ["يوم الخميس", DATE, 10, 11]]
def distill_entities(entities):
    # The list that collects the output
    list_output = list()

    # Sort the tags and save the result in temp_entities
    temp_entities = sortTags(entities)

    # This list helps build the output
    temp_list = list()

    # Initialize the temp_list
    temp_list.append(["", "", 0, 0])
    word_position = 0

    # For each entity, convert the IOB tags to distilled entries.
    for entity in temp_entities:
        # Tag counter for this entity
        counter_tag = 0
        # For each tag
        for tag in str(entity[1]).split():
            # If counter_tag is greater than or equal to the length of temp_list, append an empty slot to temp_list
            if counter_tag >= len(temp_list):
                temp_list.append(["", "", 0, 0])

            # If the tag is O and the word position of this tag is not zero, move every
            # non-empty element of temp_list to the output list
            if "O" == tag and word_position != 0:
                for j in range(0, len(temp_list)):
                    if temp_list[j][1] != "":
                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                        temp_list[j][0] = ""
                        temp_list[j][1] = ""
                        temp_list[j][2] = word_position
                        temp_list[j][3] = word_position
            # If the tag is not O, splits on '-' into exactly two parts, and the first part is B
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
                # If the slot at counter_tag is not empty, flush it to the output list,
                # then reinitialize the slot with the new word and tag
                if temp_list[counter_tag][1] != "":
                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
                temp_list[counter_tag][0] = str(entity[0]) + " "
                temp_list[counter_tag][1] = str(tag).split("-")[1]
                temp_list[counter_tag][2] = word_position
                temp_list[counter_tag][3] = word_position

            # If the tag is not O, splits on '-' into exactly two parts, the first part is I,
            # and this is not the first word position
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
                # For each slot of temp_list: if the slot holds the same tag as this one,
                # extend it; otherwise flush it to the output list and check the next slot
                for j in range(counter_tag, len(temp_list)):
                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
                        temp_list[j][0] += str(entity[0]) + " "
                        temp_list[j][3] += 1
                        break
                    else:
                        if temp_list[j][1] != "":
                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                            temp_list[j][0] = ""
                            temp_list[j][1] = ""
                            temp_list[j][2] = word_position
                            temp_list[j][3] = word_position
            counter_tag += 1
        word_position += 1
    # At the end of the previous loop there may still be open entries in temp_list;
    # flush them to the output list
    for j in range(0, len(temp_list)):
        if temp_list[j][1] != "":
            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
    return sorted(list_output, key=lambda x: (x[2]))


def sortTags(entities):
    temp_entities = entities
    temp_counter = 0
    # For each entity, this loop sorts its tags: first it checks whether the previous
    # entity's tags have more occurrences of a tag type than this one, then it sorts
    # the tags and checks that they are consistent
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only applies from the second entity onwards
            if temp_counter != 0:
                # If this tag starts with I-, count how many times the tag type occurs
                # in this entity's tags and in the previous entity's tags
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter-1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # If the previous entity has more occurrences than this one,
                    # add an I- tag to this entity's tags
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort the tags
        tags.sort()
        # Reverse the tags because they should begin with I-
        tags.reverse()
        # Only applies from the second entity onwards
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only continue if neither this entity's tags nor the previous entity's
            # tags contain O; otherwise leave the tags as they are
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Order this entity's I- tags to match the previous entity's tags; B- tags are left in place
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                sorted_tags += this_tags
                tags = sorted_tags
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
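A self-contained sketch of the distiller (illustrative, not part of the commit), showing the [text, tag, start, end] rows it produces, sorted by start position:

from NER_Distiller import distill_entities

entities = [["غدا", "O"], ["يوم", "B-DATE"], ["الخميس", "I-DATE"]]
for text, tag, start, end in distill_entities(entities):
    print(tag, start, end, text)
# -> DATE 1 2 يوم الخميس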
XML_to_HTML.py ADDED

@@ -0,0 +1,32 @@
import re


def NER_XML_to_HTML(xml):
    # WORK_OF_ART contains underscores, which the closing-tag pattern below would not match
    html = re.sub(r'WORK_OF_ART', 'WORKOFART', xml)

    # Replace every end tag with a closing span tag "</span>"
    html = re.sub(r'</[A-Z]+>', '</span>', html)

    # Replace every start tag with the appropriate CSS class
    html = re.sub(r'<PERS>', '<span class="ner_pers" data-entity="PERS">', html)
    html = re.sub(r'<GROUP>', '<span class="ner_group" data-entity="NORP">', html)
    html = re.sub(r'<OCC>', '<span class="ner_occ" data-entity="OCC">', html)
    html = re.sub(r'<ORG>', '<span class="ner_org" data-entity="ORG">', html)
    html = re.sub(r'<LOC>', '<span class="ner_loc" data-entity="LOC">', html)
    html = re.sub(r'<GPE>', '<span class="ner_gpe" data-entity="GPE">', html)
    html = re.sub(r'<FAC>', '<span class="ner_fac" data-entity="FAC">', html)
    html = re.sub(r'<EVENT>', '<span class="ner_event" data-entity="EVENT">', html)
    html = re.sub(r'<DATE>', '<span class="ner_date" data-entity="DATE">', html)
    html = re.sub(r'<TIME>', '<span class="ner_time" data-entity="TIME">', html)
    html = re.sub(r'<CARDINAL>', '<span class="ner_cardinal" data-entity="CARDINAL">', html)
    html = re.sub(r'<ORDINAL>', '<span class="ner_ordinal" data-entity="ORDINAL">', html)
    html = re.sub(r'<PERCENT>', '<span class="ner_percent" data-entity="PERCENT">', html)
    html = re.sub(r'<QUANTITY>', '<span class="ner_quantity" data-entity="QUANTITY">', html)
    html = re.sub(r'<UNIT>', '<span class="ner_unit" data-entity="UNIT">', html)
    html = re.sub(r'<MONEY>', '<span class="ner_money" data-entity="MONEY">', html)
    html = re.sub(r'<CURR>', '<span class="ner_currency" data-entity="CURRENCY">', html)
    html = re.sub(r'<LANGUAGE>', '<span class="ner_language" data-entity="LANGUAGE">', html)
    html = re.sub(r'<PRODUCT>', '<span class="ner_product" data-entity="PRODUCT">', html)
    html = re.sub(r'<WORKOFART>', '<span class="ner_work_of_art" data-entity="WORK_OF_ART">', html)
    html = re.sub(r'<LAW>', '<span class="ner_law" data-entity="LAW">', html)
    return html
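A quick check of the XML-to-HTML mapping (illustrative, not part of the commit):

from XML_to_HTML import NER_XML_to_HTML

print(NER_XML_to_HTML("غدا هو <DATE> يوم الخميس </DATE>"))
# -> غدا هو <span class="ner_date" data-entity="DATE"> يوم الخميس </span>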
app.py CHANGED

@@ -11,7 +11,9 @@ from Nested.utils.data import get_dataloaders, text2segments
 import json
 from pydantic import BaseModel
 from fastapi.responses import JSONResponse
-from
+from IBO_to_XML import IBO_to_XML
+from XML_to_HTML import NER_XML_to_HTML
+from NER_Distiller import distill_entities
 
 app = FastAPI()
 print("Version 2...")

@@ -56,7 +58,7 @@ id2label = {i: s for i, s in enumerate(label_vocab.itos)}
 
 def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
     # Split the text into words
-    words =
+    words = sentence.split()
 
     # Initialize variables
     groups = []

@@ -83,6 +85,118 @@ def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
 
     return groups
 
+
+def remove_empty_values(sentences):
+    return [value for value in sentences if value != '']
+
+
+def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
+    separators = []
+    split_text = [text]
+    if new_line:
+        separators.append('\n')
+    if dot:
+        separators.append('.')
+    if question_mark:
+        separators.append('?')
+        separators.append('؟')
+    if exclamation_mark:
+        separators.append('!')
+
+    # Split on each separator in turn, keeping the separator attached to its sentence
+    for sep in separators:
+        new_split_text = []
+        for part in split_text:
+            tokens = part.split(sep)
+            tokens_with_separator = [token + sep for token in tokens[:-1]]
+            tokens_with_separator.append(tokens[-1].strip())
+            new_split_text.extend(tokens_with_separator)
+        split_text = new_split_text
+
+    split_text = remove_empty_values(split_text)
+    return split_text
+
+
+def jsons_to_list_of_lists(json_list):
+    # Join each token's tag list into a space-separated string, the format
+    # that sortTags / IBO_to_XML / distill_entities expect
+    return [[d['token'], ' '.join(d['tags'])] for d in json_list]
+
+
+# Load the tagger once at startup instead of on every request
+tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
+
+
+def extract(sentence):
+    dataset, token_vocab = text2segments(sentence)
+
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
+
+    dataloader = get_dataloaders(
+        (dataset,),
+        vocab,
+        args_data,
+        batch_size=32,
+        shuffle=(False,),
+    )[0]
+
+    segments = tagger.infer(dataloader)
+
+    lists = []
+
+    for segment in segments:
+        for token in segment:
+            item = {}
+            item["token"] = token.text
+
+            list_of_tags = [t["tag"] for t in token.pred_tag]
+            list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
+
+            if not list_of_tags:
+                item["tags"] = ["O"]
+            else:
+                item["tags"] = list_of_tags
+            lists.append(item)
+    return lists
+
+
+def NER(sentence, mode):
+    output_list = jsons_to_list_of_lists(extract(sentence))
+    mode = mode.strip()
+    if mode == "1":
+        # IOB token/tag list
+        return output_list
+    elif mode == "2":
+        # XML
+        return IBO_to_XML(output_list)
+    elif mode == "3":
+        # HTML
+        return NER_XML_to_HTML(IBO_to_XML(output_list))
+    elif mode == "4":
+        # Distilled JSON: [entity text, tag, start, end]
+        return distill_entities(output_list)
+
+
 class NERRequest(BaseModel):
     text: str
     mode: str

@@ -90,8 +204,6 @@ class NERRequest(BaseModel):
 @app.post("/predict")
 def predict(request: NERRequest):
     # Load tagger
-    tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
-
     text = request.text
     mode = request.mode
 

@@ -103,37 +215,8 @@
     for sentence in sentences:
         se = split_text_into_groups_of_Ns(sentence, max_words_per_sentence=300)
         for s in se:
-
-
-            vocabs = namedtuple("Vocab", ["tags", "tokens"])
-            vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
-
-            dataloader = get_dataloaders(
-                (dataset,),
-                vocab,
-                args_data,
-                batch_size=32,
-                shuffle=(False,),
-            )[0]
-
-            segments = tagger.infer(dataloader)
-
-            # lists = []
-
-            for segment in segments:
-                for token in segment:
-                    item = {}
-                    item["token"] = token.text
-
-                    list_of_tags = [t["tag"] for t in token.pred_tag]
-                    list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
-
-                    if not list_of_tags:
-                        item["tags"] = ["O"]
-                    else:
-                        item["tags"] = list_of_tags
-
-                    lists.append(item)
+            output_list = NER(s, mode)
+            lists.append(output_list)
 
     content = {
         "resp": lists,
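For reference, a sketch of how a client might call the updated endpoint (the /predict route, request fields, and "resp" key come from app.py; the host and port are assumptions, not part of the commit):

import requests

payload = {"text": "صرح رئيس نقابة العاملين في جامعة بيرزيت ان غدا هو يوم الخميس",
           "mode": "2"}  # 1 = IOB list, 2 = XML, 3 = HTML, 4 = distilled JSON
resp = requests.post("http://localhost:7860/predict", json=payload)  # assumed host/port
print(resp.json()["resp"])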
requirements.txt CHANGED

@@ -5,5 +5,4 @@ numpy
 huggingface_hub
 transformers
 natsort
-seqeval
-sinatools
+seqeval