File size: 2,561 Bytes
e7de395 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# recognize: processes two-levels-topic-recognition
# Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project
# Installation: Install spacy then...
# Usage: python recognize.py l1-model l2-models "this is a text"
# l1-model path: a relative path (starting from this script) pointing to the level-1 model folder
# l2-model path: a relative path (starting from this script) pointing to the folder containing a folder for each L2-label
# "this is a text": the text to recognize
import sys # System-specific parameters and functions, part of Py
import spacy # Natural language processing
from pathlib import Path # Object-oriented filesystem paths, part of Py
import json # JSON object dumping functions
RoundTo = 2 # Round to precision of n decimals (mirrors filterDoc's roundTo default)
Encoding = 'utf8' # Encoding of the html file to be read and parsed via BeautifulSoup — NOTE(review): not used in this script; presumably shared with a sibling enrichment tool
ScoreThreshold = 0.2 # Min. spacy probability value for an element to be analysed/enriched.
MaxResults = 3 # Max. number of concepts/labels added to an html element
ParagraphMinLetters = 10 # Min. number of letters of paragraph to be considered in analysis — NOTE(review): unused here
ListMinLetters = 10 # Min. number of letters of <ul> and <ol> to be considered in analysis — NOTE(review): unused here
SubModels = {} # NOTE(review): declared but never written in this script — looks like an intended cache of loaded L2 models, keyed by label
Nlp = spacy.load(sys.argv[1]) # level-1 text-classification model (argv[1] is a model folder path)
SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute() # folder containing one sub-folder (L2 model) per L1 label
input = sys.argv[3] # the text to classify — NOTE(review): shadows the builtin input(); rename would require touching the call site below
def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Extract the strongest category labels from a spacy doc.

    Keeps only categories whose score is strictly above *scoreThreshold*,
    orders them by descending score, truncates to the *maxResults* best,
    and rounds each surviving score to *roundTo* decimals.

    Returns a dict mapping label -> rounded score.
    """
    passing = [(label, score) for label, score in doc.cats.items() if score > scoreThreshold]
    passing.sort(key=lambda pair: pair[1], reverse=True)
    return {label: round(score, roundTo) for label, score in passing[:maxResults]}
def recognize(text):
    """Run two-level topic recognition on *text*.

    First classifies with the level-1 model (Nlp); then, for each L1 label
    that has a matching sub-model folder under SubModelDir, refines the
    result with that L2 model.

    Returns a dict mapping each L1 label to {'score': ..., 'subs': {...}}
    ('subs' only present when a sub-model exists). If any sub-model folder
    is missing, a 'messages' entry lists the missing paths.

    Fixes vs. previous version:
    - No longer reassigns the global Nlp to an L2 model (which clobbered
      the L1 model for any subsequent call); loaded sub-models are cached
      in the module-level SubModels dict so they are read from disk once.
    - Missing-sub-model messages are accumulated instead of overwritten,
      so more than one missing path is reported.
    """
    # find L1 labels
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults)
    # find L2 labels
    relabels = dict()
    messages = []
    for label, score in labels.items():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()
        if subModelPath.exists():
            if label2 not in SubModels:
                # load lazily, once per label, into the module-level cache
                SubModels[label2] = spacy.load(subModelPath)
            docSub = filterDoc(SubModels[label2](text), ScoreThreshold, MaxResults)
            relabels[label2] = {'score': score, 'subs': docSub}
        else:
            relabels[label2] = {'score': score}
            messages.append("Submodel path \"" + str(subModelPath) + "\" not found")
    if messages:
        relabels["messages"] = "; ".join(messages)
    return relabels
print(json.dumps(recognize(input)))
|