wojood-api / NER_Distiller.py
TymaaHammouda's picture
Update ner service
cfe897e
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated tokens, each a [token, tags] pair whose tags are
# space-separated IOB labels (one per nesting level), in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","I-OCC B-ORG"],
# ["العاملين","I-OCC I-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It returns a list of the distilled entities with their tags and their 0-based, inclusive
# word positions (start, end), sorted by start position, such as
# [["رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,6],
# ["نقابة العاملين في جامعة بيرزيت", ORG,2,6], ["جامعة بيرزيت", ORG,5,6], ["بيرزيت", LOC,6,6], ["يوم الخميس", DATE,10,11]]
def _close_slot(slot, position, output):
    """Emit *slot* to *output* if it holds an open entity, then reset it at *position*."""
    if slot[1] != "":
        output.append([slot[0].strip(), slot[1], slot[2], slot[3]])
    slot[0] = ""
    slot[1] = ""
    slot[2] = position
    slot[3] = position


def distill_entities(entities):
    """Convert nested IOB-tagged tokens into a flat list of entity spans.

    Args:
        entities: list of ``[token, tags]`` rows, where ``tags`` is a string of
            space-separated IOB tags (``O``, ``B-<LABEL>``, ``I-<LABEL>``), one
            tag per nesting level. The rows are first normalised by
            ``sortTags``, which rewrites ``row[1]`` of the given rows in place.

    Returns:
        A list of ``[text, label, start, end]`` entries sorted by ``start``.
        ``text`` is the entity's tokens joined by single spaces; ``start`` is
        the 0-based index of its first token and ``end`` is advanced by one
        for every token appended to it (so it is the inclusive index of the
        last token for a contiguous entity).
    """
    list_output = []

    # Normalise the per-token tag order so that the n-th tag of consecutive
    # tokens refers to the same nesting level.
    temp_entities = sortTags(entities)

    # One slot per nesting level, each holding the entity currently being
    # built at that level as [text, label, start, end]; an empty label means
    # the slot is free.
    slots = [["", "", 0, 0]]

    for word_position, entity in enumerate(temp_entities):
        word = str(entity[0])
        for counter_tag, tag in enumerate(str(entity[1]).split()):
            # Grow the slot list when this token is nested deeper than any
            # token seen so far.
            if counter_tag >= len(slots):
                slots.append(["", "", 0, 0])

            # Split on the FIRST hyphen only so that labels which themselves
            # contain hyphens are kept whole, for B- and I- tags alike.
            prefix, separator, label = tag.partition("-")

            if tag == "O":
                # An outside token ends every entity that is still open.
                # Nothing can be open yet at the very first token.
                if word_position != 0:
                    for slot in slots:
                        _close_slot(slot, word_position, list_output)
            elif separator and prefix == "B":
                # A new entity starts at this level: emit whatever was open
                # here, then start collecting the new one.
                slot = slots[counter_tag]
                if slot[1] != "":
                    list_output.append([slot[0].strip(), slot[1], slot[2], slot[3]])
                slot[0] = word + " "
                slot[1] = label
                slot[2] = word_position
                slot[3] = word_position
            elif separator and prefix == "I" and word_position != 0:
                # Continue the first open entity, at this level or deeper,
                # that carries the same label and has not already been
                # extended by this token; every non-matching slot passed over
                # on the way is closed.
                for slot in slots[counter_tag:]:
                    if slot[1] == label and slot[3] != word_position:
                        slot[0] += word + " "
                        slot[3] += 1
                        break
                    _close_slot(slot, word_position, list_output)

    # Entities still open after the last token have not been emitted yet.
    for slot in slots:
        if slot[1] != "":
            list_output.append([slot[0].strip(), slot[1], slot[2], slot[3]])

    return sorted(list_output, key=lambda item: item[2])
def _iob_label(tag):
    """Return the label of a ``B-``/``I-`` tag, or ``None`` for any other tag."""
    if tag[:2] in ("B-", "I-"):
        return tag[2:]
    return None


def _count_label(tags, label):
    """Count the tags in *tags* whose label is exactly *label*."""
    return sum(1 for tag in tags if _iob_label(tag) == label)


def _align_with_previous(tags, previous_tags):
    """Order the ``I-`` tags of *tags* like the matching labels in *previous_tags*."""
    remaining = list(tags)
    # If either token is outside any entity there is nothing to align.
    if "O" in remaining or "O" in previous_tags:
        return remaining
    aligned = []
    for previous in previous_tags:
        for index, tag in enumerate(remaining):
            # Tags arrive sorted with every I- tag ahead of every B- tag, so
            # the first B- tag means no continuation is left to match.
            if tag[:2] == "B-":
                break
            if tag[:2] == "I-" and tag[2:] == previous[2:]:
                aligned.append(remaining.pop(index))
                break
    # Unmatched tags (new B- entities and leftovers) keep their sorted order.
    return aligned + remaining


def sortTags(entities):
    """Normalise the tag order of every token so nesting levels line up.

    For each ``[token, tags]`` row (``tags`` being space-separated IOB tags):

    1. For every ``I-<LABEL>`` tag, if the previous row carries more tags with
       that exact label than this row does, extra ``I-<LABEL>`` tags are
       appended until the counts match.
    2. The tags are sorted in descending order, which puts ``O`` first, then
       ``I-`` tags, then ``B-`` tags.
    3. Unless either row contains ``O``, the ``I-`` tags are reordered to
       follow the order of the matching labels in the previous row.

    NOTE: the rows of *entities* are rewritten in place (``row[1]`` is
    replaced by the normalised tag string) and the same list object is
    returned. Each row is compared against the already-normalised previous
    row.

    Args:
        entities: list of mutable ``[token, tags]`` rows.

    Returns:
        The same ``entities`` list, with normalised tag strings.
    """
    temp_entities = entities
    for position, entity in enumerate(temp_entities):
        tags = entity[1].split()
        previous_tags = []

        if position:
            previous_tags = temp_entities[position - 1][1].split()
            # Walk by index on purpose: tags appended below are visited too,
            # so padding repeats until the counts are level.
            index = 0
            while index < len(tags):
                tag = tags[index]
                if tag[:2] == "I-":
                    # Use the whole label after the prefix and compare it
                    # exactly, so labels that contain a hyphen or that are
                    # substrings of another label are not miscounted.
                    label = tag[2:]
                    if _count_label(previous_tags, label) > _count_label(tags, label):
                        tags.append("I-" + label)
                index += 1

        # Descending order: "O" first, then I- tags, then B- tags.
        tags.sort(reverse=True)

        if position:
            tags = _align_with_previous(tags, previous_tags)

        entity[1] = " ".join(tags)
    return temp_entities