# recognize: performs two-level topic recognition
# Authors: Paul Libbrecht, Pierre Günthner and Alexander Gantikow from the AISOP project

# Installation: Install spacy then...
# Usage: python recognize.py l1-model l2-models "this is a text"
#   l1-model: a relative path (starting from this script) pointing to the level-1 model folder
#   l2-models: a relative path (starting from this script) pointing to the folder containing one sub-model folder per L1 label
#   "this is a text": the text whose topics should be recognized


import sys                       # System-specific parameters and functions (Python standard library)
import spacy                     # Natural language processing
from pathlib import Path         # Object-oriented filesystem paths (Python standard library)
import json                      # JSON serialization of the result


RoundTo = 2                      # Round reported scores to this number of decimals
Encoding = 'utf8'                # Encoding of the HTML file to be read and parsed (not used in this script)
ScoreThreshold = 0.2             # Min. spacy probability for a label to be reported
MaxResults = 3                   # Max. number of labels reported per level
ParagraphMinLetters = 10         # Min. number of letters of a paragraph to be analysed (not used in this script)
ListMinLetters = 10              # Min. number of letters of <ul>/<ol> to be analysed (not used in this script)

SubModels = {}                   # Cache of loaded L2 sub-models, keyed by L1 label

Nlp = spacy.load(sys.argv[1])                                        # level-1 model
SubModelDir = Path(__file__).parent.joinpath(sys.argv[2]).absolute() # folder of L2 sub-models
inputText = sys.argv[3]          # the text to classify ('input' would shadow the builtin)


def filterDoc(doc, scoreThreshold, maxResults, roundTo=2):
    """Keep the categories of a spacy doc that score above scoreThreshold,
    sorted by descending score, truncated to maxResults and rounded."""
    kept = [c for c in doc.cats.items() if c[1] > scoreThreshold]
    best = sorted(kept, key=lambda c: c[1], reverse=True)[0:maxResults]
    return {label: round(score, roundTo) for label, score in best}
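
# For instance (hypothetical scores): if doc.cats is
#   {'algebra': 0.91, 'geometry': 0.35, 'analysis': 0.05},
# filterDoc(doc, 0.2, 3) returns {'algebra': 0.91, 'geometry': 0.35}.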



def recognize(text):
    # Find L1 labels with the level-1 model.
    labels = filterDoc(Nlp(text), ScoreThreshold, MaxResults, RoundTo)

    # Find L2 labels: each L1 label may have its own sub-model folder.
    relabels = dict()
    messages = []
    for label in labels.keys():
        label2 = label.strip()
        subModelPath = SubModelDir.joinpath(label2).absolute()
        if subModelPath.exists():
            if label2 not in SubModels:  # load each sub-model at most once
                SubModels[label2] = spacy.load(subModelPath)
            docSub = filterDoc(SubModels[label2](text), ScoreThreshold, MaxResults, RoundTo)
            relabels[label2] = {'score': labels[label], 'subs': docSub}
        else:
            relabels[label2] = {'score': labels[label]}
            messages.append('Submodel path "' + str(subModelPath) + '" not found')
    if messages:
        relabels["messages"] = messages

    return relabels

print(json.dumps(recognize(inputText)))
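
# Example output shape (labels and scores are illustrative, not from a real model):
#   {"algebra": {"score": 0.91, "subs": {"linear-equations": 0.47}},
#    "geometry": {"score": 0.35}}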