Spaces:
Running
Running
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"],["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"],["الخميس","I-DATE"]]
# It returns a list of the distilled entities together with their tags and positions (start, end), such as
# [[" رئيس نقابة العاملين في جامعة بيرزيت", OCC,1,7],
# [" نقابة العاملين في جامعة بيرزيت", ORG,2,7], [" جامعة بيرزيت", ORG,5,7],["يوم الخميس", DATE,10,11]]
def distill_entities(entities):
    """Collapse token-level nested IOB tags into a list of entity spans.

    Args:
        entities: Sequence of ``[word, tags]`` rows, one per token. ``tags``
            is a whitespace-separated string whose items are ``"O"``,
            ``"B-<LABEL>"`` or ``"I-<LABEL>"`` (for example ``"I-OCC B-ORG"``).
            Items of any other shape, including labels that themselves
            contain ``"-"``, are ignored. The rows are first passed through
            ``sortTags`` so that the n-th tag of consecutive tokens refers
            to the same nesting level.

    Returns:
        A list of ``[text, label, start, end]`` lists ordered by ``start``.
        ``text`` is the space-joined words of the span, ``label`` is the tag
        without its ``B-``/``I-`` prefix, and ``start``/``end`` are 0-based
        token indices (``end`` is advanced by one for every ``I-`` token
        that extends the span).

    Example:
        ``[["a", "O"], ["b", "B-ORG"], ["c", "I-ORG"], ["d", "O"]]``
        yields ``[["b c", "ORG", 1, 2]]``.
    """
    distilled = []
    rows = sortTags(entities)

    # One slot per nesting level: [text, label, start, end].
    # An empty label marks a slot that currently holds no open entity.
    slots = [["", "", 0, 0]]

    def emit(slot):
        # Record a finished span; the trailing separator is stripped here.
        distilled.append([slot[0].strip(), slot[1], slot[2], slot[3]])

    def close(slot, position):
        # Emit the slot if it holds an open entity, then mark it idle.
        if slot[1] != "":
            emit(slot)
            slot[:] = ["", "", position, position]

    for word_position, entity in enumerate(rows):
        word = str(entity[0])
        for level, tag in enumerate(str(entity[1]).split()):
            # Grow the slot list lazily to the deepest nesting level seen.
            if level >= len(slots):
                slots.append(["", "", 0, 0])

            if tag == "O":
                # An "O" ends every open entity. Nothing can be open before
                # the first token, so position 0 is skipped.
                if word_position != 0:
                    for slot in slots:
                        close(slot, word_position)
                continue

            parts = tag.split("-")
            if len(parts) != 2:
                continue
            prefix, label = parts

            if prefix == "B":
                # A new entity starts at this level; whatever was open here
                # is finished first.
                slot = slots[level]
                if slot[1] != "":
                    emit(slot)
                slot[:] = [word + " ", label, word_position, word_position]
            elif prefix == "I" and word_position != 0:
                # Extend the first slot at or below this level that carries
                # the same label and was not already extended by this token;
                # every non-matching slot passed on the way is closed.
                for slot in slots[level:]:
                    if slot[1] == label and slot[3] != word_position:
                        slot[0] += word + " "
                        slot[3] += 1
                        break
                    close(slot, word_position)

    # Entities still open after the last token are emitted as well.
    for slot in slots:
        if slot[1] != "":
            emit(slot)

    return sorted(distilled, key=lambda span: span[2])
def sortTags(entities):
    """Return the rows with each token's nested IOB tags in a stable order.

    For every ``[word, tags]`` row (``tags`` is a whitespace-separated
    string) the tags are processed in three steps, using the already
    processed tags of the preceding row as reference:

    1. For each ``I-<LABEL>`` tag, if the preceding row has more tags
       containing ``<LABEL>`` than this row, ``I-<LABEL>`` is added again.
    2. The tags are sorted in descending string order, which places
       ``I-`` tags before ``B-`` tags.
    3. Unless either row contains ``"O"``, the ``I-`` tags are moved to the
       front in the order in which their labels occur in the preceding row.

    Args:
        entities: Sequence of ``[word, tags]`` rows. The input rows are not
            modified; each row is copied into a new list.

    Returns:
        A new list of rows (lists) whose second element is the normalised
        tag string.
    """
    # Work on copies so the caller's rows keep their original tag strings.
    sorted_entities = [list(entity) for entity in entities]

    for position, entity in enumerate(sorted_entities):
        tags = entity[1].split()
        # The preceding row has already been normalised at this point.
        previous_tags = (
            sorted_entities[position - 1][1].split() if position else []
        )

        if position:
            # `tags` can grow inside this loop and the appended tags have to
            # be visited too, hence the explicit index instead of a for loop.
            i = 0
            while i < len(tags):
                tag = tags[i]
                if tag[0:2] == "I-":
                    label = tag.split("-")[1]
                    # Substring match on purpose: B- and I- tags both count.
                    count_here = sum(1 for t in tags if label in t)
                    count_before = sum(
                        1 for t in previous_tags if label in t
                    )
                    if count_before > count_here:
                        tags.append("I-" + label)
                i += 1

        # Descending order puts "I-..." tags ahead of "B-..." tags.
        tags.sort(reverse=True)

        if position:
            aligned = []
            if "O" not in tags and "O" not in previous_tags:
                for previous in previous_tags:
                    for j, candidate in enumerate(tags):
                        if (
                            candidate[0:2] == "I-"
                            and candidate[2:] == previous[2:]
                        ):
                            # Popping is safe: the scan stops right after.
                            aligned.append(tags.pop(j))
                            break
                        if candidate[0:2] == "B-":
                            # I- tags sort first, so none are left to match.
                            break
            tags = aligned + tags

        entity[1] = " ".join(tags).strip()

    return sorted_entities