Spaces:

SinaLab
/

wojood-api

Running

App Files Files Community

wojood-api / NER_Distiller.py

TymaaHammouda

Update ner service

cfe897e 11 days ago

raw

history blame contribute delete

7.47 kB

	# By Wasim Khatib
	# Version 2.0
	# This function takes a list of annotated tokens. Each token is a [word, tags] pair, where tags is a
	# space-separated string with one IOB tag per nesting level, for example:
	# [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","I-OCC B-ORG"],
	# ["العاملين","I-OCC I-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
	# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"], ["الخميس","I-DATE"]]
	# It returns the distilled entities as [text, tag, start, end] lists sorted by start, where start and end
	# are the 0-based positions of the first and last token of the entity. For the input above:
	# [["رئيس نقابة العاملين في جامعة بيرزيت", "OCC", 1, 6],
	# ["نقابة العاملين في جامعة بيرزيت", "ORG", 2, 6], ["جامعة بيرزيت", "ORG", 5, 6],
	# ["بيرزيت", "LOC", 6, 6], ["يوم الخميس", "DATE", 10, 11]]
	def distill_entities(entities):
	    """Collapse per-token nested IOB tags into a list of entity spans.

	    Args:
	        entities: Sequence of ``[word, tags]`` pairs. ``tags`` is a
	            space-separated string with one tag per nesting level
	            (``"O"``, ``"B-<LABEL>"`` or ``"I-<LABEL>"``). The caller's
	            data is not modified.

	    Returns:
	        A list of ``[text, label, start, end]`` lists sorted by ``start``.
	        ``start`` is the 0-based position of the token carrying the ``B-``
	        tag; ``end`` is ``start`` plus the number of ``I-`` tokens that
	        continued the entity (the last token's position for a contiguous
	        entity).

	    Example:
	        ``[["said", "O"], ["head", "B-OCC"], ["of", "I-OCC"],
	        ["Birzeit", "I-OCC B-ORG"], ["University", "I-OCC I-ORG"],
	        ["today", "O"]]`` yields
	        ``[["head of Birzeit University", "OCC", 1, 4],
	        ["Birzeit University", "ORG", 3, 4]]``.
	    """
	    # sortTags rewrites the tag strings of the list it is given, so hand it
	    # per-token copies instead of the caller's own lists.
	    sorted_entities = sortTags([list(entity) for entity in entities])

	    distilled = []
	    # One slot per nesting level; each slot is [text, label, start, end] and
	    # an empty label means no entity is currently open at that level.
	    slots = [["", "", 0, 0]]

	    for word_position, entity in enumerate(sorted_entities):
	        word = str(entity[0])
	        for level, tag in enumerate(str(entity[1]).split()):
	            # Every tag, recognised or not, occupies one nesting level.
	            if level >= len(slots):
	                slots.append(["", "", 0, 0])

	            if tag == "O":
	                # An "O" closes every open entity; at position 0 nothing
	                # can be open yet, so there is nothing to do.
	                if word_position != 0:
	                    for slot in slots:
	                        _flush_slot(slot, word_position, distilled)
	                continue

	            parts = tag.split("-")
	            if len(parts) != 2:
	                # Tags that are not "<prefix>-<label>" are ignored.
	                continue
	            prefix, label = parts

	            if prefix == "B":
	                # Close whatever was open at this level, then start anew.
	                slot = slots[level]
	                _flush_slot(slot, word_position, distilled)
	                slot[0] = word + " "
	                slot[1] = label
	            elif prefix == "I" and word_position != 0:
	                # Continue the first slot at or below this level that holds
	                # the same label and has not already been touched by this
	                # token; every slot passed over on the way is closed.
	                for slot in slots[level:]:
	                    if slot[1] == label and slot[3] != word_position:
	                        slot[0] += word + " "
	                        slot[3] += 1
	                        break
	                    _flush_slot(slot, word_position, distilled)

	    # Entities still open after the last token are emitted as well.
	    for slot in slots:
	        if slot[1] != "":
	            distilled.append([slot[0].strip(), slot[1], slot[2], slot[3]])
	    return sorted(distilled, key=lambda item: item[2])


	def _flush_slot(slot, position, output):
	    """Append the entity held in ``slot`` (if any) to ``output`` and empty the slot at ``position``."""
	    if slot[1] != "":
	        output.append([slot[0].strip(), slot[1], slot[2], slot[3]])
	    slot[0] = ""
	    slot[1] = ""
	    slot[2] = position
	    slot[3] = position

	def sortTags(entities):
	    """Normalise each token's tag string so nesting levels line up with the previous token.

	    For every ``[word, tags]`` pair, in order:

	    1. (not for the first token) for each label that has an ``I-`` tag on
	       this token, append extra ``I-<label>`` tags until the token carries
	       as many tags with that label as the previous token does;
	    2. sort the tags in descending order, which puts ``I-`` tags before
	       ``B-`` tags;
	    3. (not for the first token, and only when neither token has an ``O``
	       tag) move the ``I-`` tags to the front in the order their labels
	       appear on the previous token, followed by the remaining tags.

	    Args:
	        entities: List of ``[word, tags]`` lists; ``tags`` is a
	            space-separated string of ``O`` / ``B-<LABEL>`` / ``I-<LABEL>``.

	    Returns:
	        The same ``entities`` list. Each inner list's tag string is
	        rewritten in place, and each token is compared against the
	        already-rewritten tags of the token before it.
	    """
	    for position, entity in enumerate(entities):
	        tags = entity[1].split()
	        previous_tags = entities[position - 1][1].split() if position else None

	        if previous_tags is not None:
	            # Labels are compared exactly: a substring test would let "ORG"
	            # count "B-ORG_FAC" and pad the token with spurious tags.
	            # The labels are collected before padding so the list is never
	            # extended while it is being iterated.
	            continued_labels = dict.fromkeys(
	                tag[2:] for tag in tags if tag.startswith("I-")
	            )
	            for label in continued_labels:
	                here = sum(1 for tag in tags if tag[2:] == label)
	                before = sum(1 for tag in previous_tags if tag[2:] == label)
	                # A non-positive deficit multiplies to an empty list.
	                tags.extend(["I-" + label] * (before - here))

	        # Descending order: "O" first, then "I-..." tags, then "B-..." tags.
	        tags.sort(reverse=True)

	        if (
	            previous_tags is not None
	            and "O" not in tags
	            and "O" not in previous_tags
	        ):
	            ordered = []
	            for previous_tag in previous_tags:
	                for index, tag in enumerate(tags):
	                    if tag.startswith("B-"):
	                        # "I-" tags sort before "B-" tags, so none follow.
	                        break
	                    if tag.startswith("I-") and tag[2:] == previous_tag[2:]:
	                        ordered.append(tags.pop(index))
	                        break
	            tags = ordered + tags

	        entity[1] = " ".join(tags)
	    return entities