Commit cfe897e · Parent: 2b51d25
Update ner service

Files changed:
- IBO_to_XML.py +135 -0
- NER_Distiller.py +138 -0
- XML_to_HTML.py +32 -0
- app.py +118 -35
- requirements.txt +1 -2
IBO_to_XML.py ADDED

@@ -0,0 +1,135 @@
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"],
# ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"], ["الخميس","I-DATE"]]
# and returns XML text in this format: صرح <OCC> رئيس <ORG> نقابة العاملين </ORG> </OCC> في <ORG>
# جامعة <LOC> بيرزيت </LOC> </ORG> ان غدا هو <DATE> يوم الخميس </DATE>
# This function assumes the input is correct: each tag must start with B- or I- and may not be empty;
# I- tags that do not have a matching B- tag are discarded.
import numpy as np


def IBO_to_XML(temp):
    xml_output = ""

    temp_entities = sortTags(temp)

    temp_list = list()

    # Initialize the temp_list
    temp_list.append("")
    word_position = 0

    # For each entity, convert the IOB tags to XML.
    for entity in temp_entities:
        counter_tag = 0
        # For each tag
        for tag in str(entity[1]).split():

            # If counter_tag is greater than or equal to the length of temp_list, append an empty slot to temp_list
            if counter_tag >= len(temp_list):
                temp_list.append("")

            # If the tag is O and the word position is not zero, close every open tag from temp_list in the output
            if "O" == tag and word_position != 0:
                for j in range(len(temp_list), 0, -1):
                    if temp_list[j-1] != "":
                        xml_output += " </" + str(temp_list[j-1]) + ">"
                        temp_list[j-1] = ""

            # If the tag is not O, is a well-formed B-tag or I-tag, and starts with B
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
                # If the slot at counter_tag is not empty, append the closing tag
                # </name of previous tag> to xml_output, i.e. close the previous tag
                if temp_list[counter_tag] != "":
                    xml_output += " </" + str(temp_list[counter_tag]) + ">"
                # Then replace the previous tag in temp_list with the new one
                temp_list[counter_tag] = str(tag).split("-")[1]
                # And append the opening tag <name of new tag> to xml_output
                xml_output += " <" + str(temp_list[counter_tag]) + ">"

            # If the tag is not O, is a well-formed B-tag or I-tag, starts with I, and this is not the first word position
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
                # Check whether this tag matches an open tag
                for j in range(counter_tag, len(temp_list)):
                    # If it matches, break the loop and continue
                    if temp_list[j] == tag[2:]:
                        break
                    # If not, append the closing tag </name of previous tag> to xml_output
                    else:
                        if temp_list[j] != "":
                            xml_output += " </" + str(temp_list[j]) + ">"
                            temp_list[j] = ""
            counter_tag += 1
        word_position += 1
        # Append the word to xml_output
        xml_output += " " + str(entity[0])
    # Close any tags still open at the end
    for j in range(0, len(temp_list)):
        if temp_list[j] != "":
            xml_output += " </" + str(temp_list[j]) + ">"
    return xml_output.strip()


def sortTags(entities):
    temp_entities = entities
    temp_counter = 0
    # For each entity, this loop sorts its tags: first it checks whether the previous
    # entity's tags have more occurrences of a tag type than this one, then it sorts
    # the tags and checks that they are consistent
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only applies from the second entity onwards
            if temp_counter != 0:
                # If this tag starts with I-, count how many times the tag type occurs
                # in this entity's tags and in the previous entity's tags
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter-1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # If the previous entity has more occurrences than this one,
                    # add an I- tag to this entity's tags
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort the tags
        tags.sort()
        # Reverse the tags because they should begin with I-
        tags.reverse()
        # Only applies from the second entity onwards
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only continue if neither this entity's tags nor the previous entity's
            # tags contain O; otherwise leave the tags as they are
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Order this entity's I- tags to match the previous entity's tags; B- tags are left in place
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                sorted_tags += this_tags
                tags = sorted_tags
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
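Usage sketch (illustrative, not part of the commit), feeding the example input from the header comment through the converter. Note that sortTags rewrites each entity's tag string in place, so the input list is mutated:

from IBO_to_XML import IBO_to_XML

entities = [["صرح", "O"], ["رئيس", "B-OCC"], ["نقابة", "B-OCC B-ORG"],
            ["العاملين", "I-OCC B-ORG"], ["في", "I-OCC I-ORG"],
            ["جامعة", "I-OCC I-ORG B-ORG"], ["بيرزيت", "I-OCC I-ORG I-ORG B-LOC"],
            ["ان", "O"], ["غدا", "O"], ["هو", "O"],
            ["يوم", "B-DATE"], ["الخميس", "I-DATE"]]

# Expect nested XML along the lines of the header comment's example,
# ending with ... <DATE> يوم الخميس </DATE>
print(IBO_to_XML(entities))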
NER_Distiller.py ADDED

@@ -0,0 +1,138 @@
# By Wasim Khatib
# Version 2.0
# This function takes a list of annotated entities in this format: [["صرح","O"], ["رئيس","B-OCC"], ["نقابة","B-OCC B-ORG"],
# ["العاملين","I-OCC B-ORG"], ["في","I-OCC I-ORG"], ["جامعة","I-OCC I-ORG B-ORG"],
# ["بيرزيت","I-OCC I-ORG I-ORG B-LOC"], ["ان","O"], ["غدا","O"], ["هو","O"], ["يوم","B-DATE"], ["الخميس","I-DATE"]]
# and returns an array of the distilled entities with their tags and positions (start, end), such as
# [["رئيس نقابة العاملين في جامعة بيرزيت", OCC, 1, 7],
# ["نقابة العاملين في جامعة بيرزيت", ORG, 2, 7], ["جامعة بيرزيت", ORG, 5, 7], ["يوم الخميس", DATE, 10, 11]]
def distill_entities(entities):
    # The list that collects the output
    list_output = list()

    # Sort the tags and save the result in temp_entities
    temp_entities = sortTags(entities)

    # This list helps build the output
    temp_list = list()

    # Initialize the temp_list
    temp_list.append(["", "", 0, 0])
    word_position = 0

    # For each entity, convert the IOB tags to distilled entries.
    for entity in temp_entities:
        # Tag counter for this entity
        counter_tag = 0
        # For each tag
        for tag in str(entity[1]).split():
            # If counter_tag is greater than or equal to the length of temp_list, append an empty slot to temp_list
            if counter_tag >= len(temp_list):
                temp_list.append(["", "", 0, 0])

            # If the tag is O and the word position of this tag is not zero, move every
            # non-empty element of temp_list to the output list
            if "O" == tag and word_position != 0:
                for j in range(0, len(temp_list)):
                    if temp_list[j][1] != "":
                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                        temp_list[j][0] = ""
                        temp_list[j][1] = ""
                        temp_list[j][2] = word_position
                        temp_list[j][3] = word_position
            # If the tag is not O, splits on '-' into exactly two parts, and the first part is B
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
                # If the slot at counter_tag is not empty, flush it to the output list,
                # then reinitialize the slot with the new word and tag
                if temp_list[counter_tag][1] != "":
                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
                temp_list[counter_tag][0] = str(entity[0]) + " "
                temp_list[counter_tag][1] = str(tag).split("-")[1]
                temp_list[counter_tag][2] = word_position
                temp_list[counter_tag][3] = word_position

            # If the tag is not O, splits on '-' into exactly two parts, the first part is I,
            # and this is not the first word position
            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
                # For each slot of temp_list: if the slot holds the same tag as this one,
                # extend it; otherwise flush it to the output list and check the next slot
                for j in range(counter_tag, len(temp_list)):
                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
                        temp_list[j][0] += str(entity[0]) + " "
                        temp_list[j][3] += 1
                        break
                    else:
                        if temp_list[j][1] != "":
                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
                            temp_list[j][0] = ""
                            temp_list[j][1] = ""
                            temp_list[j][2] = word_position
                            temp_list[j][3] = word_position
            counter_tag += 1
        word_position += 1
    # At the end of the previous loop there may still be open entries in temp_list;
    # flush them to the output list
    for j in range(0, len(temp_list)):
        if temp_list[j][1] != "":
            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
    return sorted(list_output, key=lambda x: (x[2]))


def sortTags(entities):
    temp_entities = entities
    temp_counter = 0
    # For each entity, this loop sorts its tags: first it checks whether the previous
    # entity's tags have more occurrences of a tag type than this one, then it sorts
    # the tags and checks that they are consistent
    for entity in temp_entities:
        tags = entity[1].split()
        for tag in tags:
            # Only applies from the second entity onwards
            if temp_counter != 0:
                # If this tag starts with I-, count how many times the tag type occurs
                # in this entity's tags and in the previous entity's tags
                if "I-" == tag[0:2]:
                    counter_of_this_tag = 0
                    counter_of_previous_tag = 0
                    for word in tags:
                        if tag.split("-")[1] in word:
                            counter_of_this_tag += 1
                    for word in temp_entities[temp_counter-1][1].split():
                        if tag.split("-")[1] in word:
                            counter_of_previous_tag += 1
                    # If the previous entity has more occurrences than this one,
                    # add an I- tag to this entity's tags
                    if counter_of_previous_tag > counter_of_this_tag:
                        tags.append("I-" + tag.split("-")[1])
        # Sort the tags
        tags.sort()
        # Reverse the tags because they should begin with I-
        tags.reverse()
        # Only applies from the second entity onwards
        if temp_counter != 0:
            this_tags = tags
            previous_tags = temp_entities[temp_counter - 1][1].split()
            sorted_tags = list()

            # Only continue if neither this entity's tags nor the previous entity's
            # tags contain O; otherwise leave the tags as they are
            if "O" not in this_tags and "O" not in previous_tags:
                index = 0
                # Order this entity's I- tags to match the previous entity's tags; B- tags are left in place
                for i in previous_tags:
                    j = 0
                    while this_tags and j < len(this_tags):
                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
                            sorted_tags.insert(index, this_tags.pop(j))
                            break
                        elif this_tags[j][0:2] == "B-":
                            break
                        j += 1
                    index += 1
                sorted_tags += this_tags
                tags = sorted_tags
        str_tag = " "
        str_tag = str_tag.join(tags)
        str_tag = str_tag.strip()
        temp_entities[temp_counter][1] = str_tag
        temp_counter += 1
    return temp_entities
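A self-contained sketch of the distiller (illustrative, not part of the commit), showing the [text, tag, start, end] rows it produces, sorted by start position:

from NER_Distiller import distill_entities

entities = [["غدا", "O"], ["يوم", "B-DATE"], ["الخميس", "I-DATE"]]
for text, tag, start, end in distill_entities(entities):
    print(tag, start, end, text)
# -> DATE 1 2 يوم الخميس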
XML_to_HTML.py ADDED

@@ -0,0 +1,32 @@
import re


def NER_XML_to_HTML(xml):
    # WORK_OF_ART contains underscores, which the closing-tag pattern below would not match
    html = re.sub(r'WORK_OF_ART', 'WORKOFART', xml)

    # Replace every end tag with a closing span tag "</span>"
    html = re.sub(r'</[A-Z]+>', '</span>', html)

    # Replace every start tag with the appropriate CSS class
    html = re.sub(r'<PERS>', '<span class="ner_pers" data-entity="PERS">', html)
    html = re.sub(r'<GROUP>', '<span class="ner_group" data-entity="NORP">', html)
    html = re.sub(r'<OCC>', '<span class="ner_occ" data-entity="OCC">', html)
    html = re.sub(r'<ORG>', '<span class="ner_org" data-entity="ORG">', html)
    html = re.sub(r'<LOC>', '<span class="ner_loc" data-entity="LOC">', html)
    html = re.sub(r'<GPE>', '<span class="ner_gpe" data-entity="GPE">', html)
    html = re.sub(r'<FAC>', '<span class="ner_fac" data-entity="FAC">', html)
    html = re.sub(r'<EVENT>', '<span class="ner_event" data-entity="EVENT">', html)
    html = re.sub(r'<DATE>', '<span class="ner_date" data-entity="DATE">', html)
    html = re.sub(r'<TIME>', '<span class="ner_time" data-entity="TIME">', html)
    html = re.sub(r'<CARDINAL>', '<span class="ner_cardinal" data-entity="CARDINAL">', html)
    html = re.sub(r'<ORDINAL>', '<span class="ner_ordinal" data-entity="ORDINAL">', html)
    html = re.sub(r'<PERCENT>', '<span class="ner_percent" data-entity="PERCENT">', html)
    html = re.sub(r'<QUANTITY>', '<span class="ner_quantity" data-entity="QUANTITY">', html)
    html = re.sub(r'<UNIT>', '<span class="ner_unit" data-entity="UNIT">', html)
    html = re.sub(r'<MONEY>', '<span class="ner_money" data-entity="MONEY">', html)
    html = re.sub(r'<CURR>', '<span class="ner_currency" data-entity="CURRENCY">', html)
    html = re.sub(r'<LANGUAGE>', '<span class="ner_language" data-entity="LANGUAGE">', html)
    html = re.sub(r'<PRODUCT>', '<span class="ner_product" data-entity="PRODUCT">', html)
    html = re.sub(r'<WORKOFART>', '<span class="ner_work_of_art" data-entity="WORK_OF_ART">', html)
    html = re.sub(r'<LAW>', '<span class="ner_law" data-entity="LAW">', html)
    return html
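A quick check of the XML-to-HTML mapping (illustrative, not part of the commit):

from XML_to_HTML import NER_XML_to_HTML

print(NER_XML_to_HTML("غدا هو <DATE> يوم الخميس </DATE>"))
# -> غدا هو <span class="ner_date" data-entity="DATE"> يوم الخميس </span>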
app.py CHANGED

@@ -11,7 +11,9 @@ from Nested.utils.data import get_dataloaders, text2segments
 import json
 from pydantic import BaseModel
 from fastapi.responses import JSONResponse
-from
+from IBO_to_XML import IBO_to_XML
+from XML_to_HTML import NER_XML_to_HTML
+from NER_Distiller import distill_entities
 
 app = FastAPI()
 print("Version 2...")

@@ -56,7 +58,7 @@ id2label = {i: s for i, s in enumerate(label_vocab.itos)}
 
 def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
     # Split the text into words
-    words =
+    words = sentence.split()
 
     # Initialize variables
     groups = []

@@ -83,6 +85,118 @@ def split_text_into_groups_of_Ns(sentence, max_words_per_sentence):
 
     return groups
 
+
+def remove_empty_values(sentences):
+    return [value for value in sentences if value != '']
+
+
+def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
+    separators = []
+    split_text = [text]
+    if new_line:
+        separators.append('\n')
+    if dot:
+        separators.append('.')
+    if question_mark:
+        separators.append('?')
+        separators.append('؟')
+    if exclamation_mark:
+        separators.append('!')
+
+    # Split on each separator in turn, keeping the separator attached to its sentence
+    for sep in separators:
+        new_split_text = []
+        for part in split_text:
+            tokens = part.split(sep)
+            tokens_with_separator = [token + sep for token in tokens[:-1]]
+            tokens_with_separator.append(tokens[-1].strip())
+            new_split_text.extend(tokens_with_separator)
+        split_text = new_split_text
+
+    split_text = remove_empty_values(split_text)
+    return split_text
+
+
+def jsons_to_list_of_lists(json_list):
+    # Join each token's tag list into a space-separated string, the format
+    # that sortTags / IBO_to_XML / distill_entities expect
+    return [[d['token'], ' '.join(d['tags'])] for d in json_list]
+
+
+# Load the tagger once at startup instead of on every request
+tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
+
+
+def extract(sentence):
+    dataset, token_vocab = text2segments(sentence)
+
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
+
+    dataloader = get_dataloaders(
+        (dataset,),
+        vocab,
+        args_data,
+        batch_size=32,
+        shuffle=(False,),
+    )[0]
+
+    segments = tagger.infer(dataloader)
+
+    lists = []
+
+    for segment in segments:
+        for token in segment:
+            item = {}
+            item["token"] = token.text
+
+            list_of_tags = [t["tag"] for t in token.pred_tag]
+            list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
+
+            if not list_of_tags:
+                item["tags"] = ["O"]
+            else:
+                item["tags"] = list_of_tags
+            lists.append(item)
+    return lists
+
+
+def NER(sentence, mode):
+    output_list = jsons_to_list_of_lists(extract(sentence))
+    mode = mode.strip()
+    if mode == "1":
+        # IOB token/tag list
+        return output_list
+    elif mode == "2":
+        # XML
+        return IBO_to_XML(output_list)
+    elif mode == "3":
+        # HTML
+        return NER_XML_to_HTML(IBO_to_XML(output_list))
+    elif mode == "4":
+        # Distilled JSON: [entity text, tag, start, end]
+        return distill_entities(output_list)
+
+
 class NERRequest(BaseModel):
     text: str
     mode: str

@@ -90,8 +204,6 @@ class NERRequest(BaseModel):
 @app.post("/predict")
 def predict(request: NERRequest):
     # Load tagger
-    tagger, tag_vocab, train_config = load_checkpoint(checkpoint_path)
-
     text = request.text
     mode = request.mode
 

@@ -103,37 +215,8 @@
     for sentence in sentences:
         se = split_text_into_groups_of_Ns(sentence, max_words_per_sentence=300)
         for s in se:
-
-
-            vocabs = namedtuple("Vocab", ["tags", "tokens"])
-            vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
-
-            dataloader = get_dataloaders(
-                (dataset,),
-                vocab,
-                args_data,
-                batch_size=32,
-                shuffle=(False,),
-            )[0]
-
-            segments = tagger.infer(dataloader)
-
-            # lists = []
-
-            for segment in segments:
-                for token in segment:
-                    item = {}
-                    item["token"] = token.text
-
-                    list_of_tags = [t["tag"] for t in token.pred_tag]
-                    list_of_tags = [i for i in list_of_tags if i not in ("O", " ", "")]
-
-                    if not list_of_tags:
-                        item["tags"] = ["O"]
-                    else:
-                        item["tags"] = list_of_tags
-
-                    lists.append(item)
+            output_list = NER(s, mode)
+            lists.append(output_list)
 
     content = {
         "resp": lists,
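For reference, a sketch of how a client might call the updated endpoint (the /predict route, request fields, and "resp" key come from app.py; the host and port are assumptions, not part of the commit):

import requests

payload = {"text": "صرح رئيس نقابة العاملين في جامعة بيرزيت ان غدا هو يوم الخميس",
           "mode": "2"}  # 1 = IOB list, 2 = XML, 3 = HTML, 4 = distilled JSON
resp = requests.post("http://localhost:7860/predict", json=payload)  # assumed host/port
print(resp.json()["resp"])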
requirements.txt CHANGED

@@ -5,5 +5,4 @@ numpy
 huggingface_hub
 transformers
 natsort
-seqeval
-sinatools
+seqeval