bowphs committed
Commit 2be1637 · verified · 1 Parent(s): 3f3b639

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+stanza/stanza/pipeline/demo/loading.gif filter=lfs diff=lfs merge=lfs -text
stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dc9da2885756397790f0328e9b390e39dbd12d4d9eea538b23b83a87b4d42d1
+size 665353634
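
The pointer above adds a Latin transformer-based dependency parser checkpoint. As a rough sketch of how such a checkpoint could be loaded, assuming the LFS file has been pulled to the path shown and that a compatible Latin (la) package with pretrained embeddings is installed locally; the launch details are assumptions, not part of this commit:

import stanza

# Hedged sketch: point the depparse processor at the newly added checkpoint.
# The local path and the availability of matching "la" resources are assumptions.
nlp = stanza.Pipeline(
    lang="la",
    processors="tokenize,pos,lemma,depparse",
    depparse_model_path="stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt",
    use_gpu=False,
)
doc = nlp("Gallia est omnis divisa in partes tres.")
doc.sentences[0].print_dependencies()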
stanza/stanza/pipeline/demo/demo_server.py ADDED
@@ -0,0 +1,56 @@
+from flask import Flask, request, abort
+import json
+import stanza
+import os
+app = Flask(__name__, static_url_path='', static_folder=os.path.abspath(os.path.dirname(__file__)))
+
+pipelineCache = dict()
+
+def get_file(path):
+    res = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
+    print(res)
+    return res
+
+@app.route('/<path:path>')
+def static_file(path):
+    if path in ['stanza-brat.css', 'stanza-brat.js', 'stanza-parseviewer.js', 'loading.gif',
+                'favicon.png', 'stanza-logo.png']:
+        return app.send_static_file(path)
+    elif path in 'index.html':
+        return app.send_static_file('stanza-brat.html')
+    else:
+        abort(403)
+
+@app.route('/', methods=['GET'])
+def index():
+    return static_file('index.html')
+
+@app.route('/', methods=['POST'])
+def annotate():
+    global pipelineCache
+
+    properties = request.args.get('properties', '')
+    lang = request.args.get('pipelineLanguage', '')
+    text = list(request.form.keys())[0]
+
+    if lang not in pipelineCache:
+        pipelineCache[lang] = stanza.Pipeline(lang=lang, use_gpu=False)
+
+    res = pipelineCache[lang](text)
+
+    annotated_sentences = []
+    for sentence in res.sentences:
+        tokens = []
+        deps = []
+        for word in sentence.words:
+            tokens.append({'index': word.id, 'word': word.text, 'lemma': word.lemma, 'pos': word.xpos, 'upos': word.upos, 'feats': word.feats, 'ner': word.parent.ner if word.parent.ner is None or word.parent.ner == 'O' else word.parent.ner[2:]})
+            deps.append({'dep': word.deprel, 'governor': word.head, 'governorGloss': sentence.words[word.head-1].text,
+                         'dependent': word.id, 'dependentGloss': word.text})
+        annotated_sentences.append({'basicDependencies': deps, 'tokens': tokens})
+        if hasattr(sentence, 'constituency') and sentence.constituency is not None:
+            annotated_sentences[-1]['parse'] = str(sentence.constituency)
+
+    return json.dumps({'sentences': annotated_sentences})
+
+def create_app():
+    return app
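
The demo server above reads the language from the pipelineLanguage query parameter and takes the text to annotate as the first form key of a POST to /. A minimal client sketch against a locally running instance; the host, port, and launch command are assumptions, only the request shape follows from the handler above:

import json
import requests

# Hedged sketch of a client for the demo server above.
# Assumes the app is running locally, e.g. via `flask --app demo_server:create_app run`.
resp = requests.post(
    "http://localhost:5000/",
    params={"pipelineLanguage": "en", "properties": ""},
    data={"The quick brown fox jumped over the lazy dog.": ""},
)
print(json.dumps(json.loads(resp.text), indent=2))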
stanza/stanza/pipeline/demo/loading.gif ADDED

Git LFS Details

  • SHA256: b2615978dc2c29ce5b6f6c2d45b76e11a439f79dc7554c9203b8aaa6a581aa03
  • Pointer size: 131 Bytes
  • Size of remote file: 366 kB
stanza/stanza/pipeline/demo/stanza-brat.html ADDED
@@ -0,0 +1,175 @@
+<html>
+<head profile="http://www.w3.org/2005/10/profile">
+<link rel='icon' href='favicon.png' type='image/png'/ >
+<!-- JQuery -->
+<script src="https://code.jquery.com/jquery-2.1.4.min.js"></script>
+<!-- Bootstrap -->
+<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
+<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
+<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
+<!-- Chosen Dropdown Library -->
+<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
+<script src="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>
+<!-- Brat -->
+<link rel="stylesheet" type="text/css" href="https://nlp.stanford.edu/js/brat/style-vis.css"/>
+<script type="text/javascript" src="https://nlp.stanford.edu/js/brat/client/lib/head.load.min.js"></script>
+<!-- d3 -->
+<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js"></script>
+<script type="text/javascript" src="http://cdnjs.cloudflare.com/ajax/libs/dagre-d3/0.4.17/dagre-d3.min.js"></script>
+
+<!-- CoreNLP -->
+<link rel="stylesheet" type="text/css" href="stanza-brat.css"/>
+<script type="text/javascript" src="stanza-brat.js"></script>
+
+<meta charset="UTF-8">
+</head>
+
+<body>
+<div id="wrap">
+  <!-- A header bar -->
+  <nav class="navbar navbar-default navbar-static-top">
+    <div class="container">
+      <div class="navbar-header">
+        <a class="navbar-brand" style="height:70px;font-size:20px" href="https://stanfordnlp.github.io/stanfordnlp/"><img src="stanza-logo.png" height="30px" style="display:inline-block; margin-bottom:8px"/> 1.6.0 (updated October 2023)</a>
+      </div>
+    </div>
+  </nav>
+
+  <!-- The main content of the page -->
+  <div class="container">
+    <div class="row">
+
+      <!-- Text area input -->
+      <form id="form_annotate" accept-charset="UTF-8" onsubmit="return false;">
+        <div class="col-sm-12" style="margin-bottom: 5px;">
+          <label for="text" class="label">&mdash; Text to annotate &mdash;</label>
+          <textarea class="form-control" rows="2" id="text" placeholder="e.g., The quick brown fox jumped over the lazy dog." autofocus maxlength="10000"></textarea>
+        </div>
+
+        <!-- Annotators select -->
+        <div class="col-sm-8">
+          <label for="annotators" class="label">&mdash; Annotations &mdash;</label>
+          <select id="annotators" data-placeholder="CoreNLP annotators"
+                  multiple class="chosen-select" title="Select CoreNLP annotators">
+            <option value="pos" selected > parts-of-speech </option>
+            <option value="upos" > universal parts-of-speech </option>
+            <option value="ner" selected > named entities </option>
+            <option value="lemma" selected > lemmas </option>
+            <option value="depparse" selected > dependency parse </option>
+            <option value="parse" selected > constituency parse </option>
+          </select>
+        </div>
+
+        <div class="col-sm-2">
+          <label for="language" class="label">&mdash; Language &mdash;</label>
+          <select id="language" data-placeholder="Language"
+                  class="chosen-select" title="Language">
+            <option value="af">Afrikaans</option>
+            <option value="grc">Ancient Greek</option>
+            <option value="ar">Arabic</option>
+            <option value="hy">Armenian</option>
+            <option value="eu">Basque</option>
+            <option value="be">Belarusian</option>
+            <option value="bg">Bulgarian</option>
+            <option value="bxr">Buryat</option>
+            <option value="ca">Catalan</option>
+            <option value="zh">Chinese (simplified)</option>
+            <option value="zh-Hant">Chinese (traditional)</option>
+            <option value="lzh">Classical Chinese</option>
+            <option value="cop">Coptic</option>
+            <option value="hr">Croatian</option>
+            <option value="cs">Czech</option>
+            <option value="da">Danish</option>
+            <option value="nl">Dutch</option>
+            <option value="en" selected>English</option>
+            <option value="et">Estonian</option>
+            <option value="fi">Finnish</option>
+            <option value="fr">French</option>
+            <option value="gl">Galician</option>
+            <option value="de">German</option>
+            <option value="got">Gothic</option>
+            <option value="el">Greek</option>
+            <option value="he">Hebrew</option>
+            <option value="hi">Hindi</option>
+            <option value="hu">Hungarian</option>
+            <option value="id">Indonesian</option>
+            <option value="ga">Irish</option>
+            <option value="it">Italian</option>
+            <option value="ja">Japanese</option>
+            <option value="kk">Kazakh</option>
+            <option value="ko">Korean</option>
+            <option value="kmr">Kurmanji</option>
+            <option value="la">Latin</option>
+            <option value="lv">Latvian</option>
+            <option value="lt">Lithuanian</option>
+            <option value="olo">Livvi</option>
+            <option value="mt">Maltese</option>
+            <option value="mr">Marathi</option>
+            <option value="sme">North Sami</option>
+            <option value="no">Norwegian (Bokmål)</option>
+            <option value="nn">Norwegian (Nynorsk)</option>
+            <option value="cu">Old Church Slavonic</option>
+            <option value="fro">Old French</option>
+            <option value="orv">Old Russian</option>
+            <option value="fa">Persian</option>
+            <option value="pl">Polish</option>
+            <option value="pt">Portuguese</option>
+            <option value="ro">Romanian</option>
+            <option value="ru">Russian</option>
+            <option value="gd">Scottish Gaelic</option>
+            <option value="sr">Serbian</option>
+            <option value="sk">Slovak</option>
+            <option value="sl">Slovenian</option>
+            <option value="es">Spanish</option>
+            <option value="sv">Swedish</option>
+            <option value="swl">Swedish Sign Language</option>
+            <option value="ta">Tamil</option>
+            <option value="te">Telugu</option>
+            <option value="tr">Turkish</option>
+            <option value="uk">Ukrainian</option>
+            <option value="hsb">Upper Sorbian</option>
+            <option value="ur">Urdu</option>
+            <option value="ug">Uyghur</option>
+            <option value="vi">Vietnamese</option>
+            <option value="wo">Wolof</option>
+
+          </select>
+        </div>
+
+        <!-- Submit button -->
+        <div class="col-sm-2" style="text-align: center; margin-top: 7px; ">
+          <button id="submit" class="btn btn-block btn-default">Submit</button>
+        </div>
+      </form>
+
+    </div>
+    <div class="row">
+      <!-- A panel for errors to show up in -->
+      <div id="errors" class="row">
+      </div>
+
+      <!-- Loading gif -->
+      <div id="loading" class="row" style="display:none">
+        <img src="loading.gif" height="200px" style="margin-left: 200px"/>
+      </div>
+
+      <!-- Annotation population area -->
+      <div id="annotations" class="row" style="display:none">
+      </div>
+    </div>
+
+
+  </div>
+</div>
+
+<!-- The footer of the page -->
+<footer id="footer" class="footer">
+  <div class="container">
+    <p class="text-muted">
+      Visualisation provided using the <a href="http://brat.nlplab.org/">brat visualisation/annotation software</a>.
+    </p>
+  </div>
+</footer>
+
+</body>
+</html>
stanza/stanza/pipeline/external/pythainlp.py ADDED
@@ -0,0 +1,86 @@
+"""
+Processors related to PyThaiNLP in the pipeline.
+
+GitHub Home: https://github.com/PyThaiNLP/pythainlp
+"""
+
+from stanza.models.common import doc
+from stanza.pipeline._constants import TOKENIZE
+from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
+
+def check_pythainlp():
+    """
+    Import necessary components from pythainlp to perform tokenization.
+    """
+    try:
+        import pythainlp
+    except ImportError:
+        raise ImportError(
+            "The pythainlp library is required. "
+            "Try to install it with `pip install pythainlp`. "
+            "Go to https://github.com/PyThaiNLP/pythainlp for more information."
+        )
+    return True
+
+@register_processor_variant(TOKENIZE, 'pythainlp')
+class PyThaiNLPTokenizer(ProcessorVariant):
+    def __init__(self, config):
+        """ Construct a PyThaiNLP-based tokenizer.
+
+        Note that we always use the default tokenizer of PyThaiNLP for sentence and word segmentation.
+        Currently this is a CRF model for sentence segmentation and a dictionary-based model (newmm) for word segmentation.
+        """
+        if config['lang'] != 'th':
+            raise Exception("PyThaiNLP tokenizer is only allowed in Thai pipeline.")
+
+        check_pythainlp()
+        from pythainlp.tokenize import sent_tokenize as pythai_sent_tokenize
+        from pythainlp.tokenize import word_tokenize as pythai_word_tokenize
+
+        self.pythai_sent_tokenize = pythai_sent_tokenize
+        self.pythai_word_tokenize = pythai_word_tokenize
+        self.no_ssplit = config.get('no_ssplit', False)
+
+    def process(self, document):
+        """ Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.
+        """
+        if isinstance(document, doc.Document):
+            text = document.text
+        else:
+            text = document
+        if not isinstance(text, str):
+            raise Exception("Must supply a string or Stanza Document object to the PyThaiNLP tokenizer.")
+
+        sentences = []
+        current_sentence = []
+        offset = 0
+
+        if self.no_ssplit:
+            # skip sentence segmentation
+            sent_strs = [text]
+        else:
+            sent_strs = self.pythai_sent_tokenize(text, engine='crfcut')
+        for sent_str in sent_strs:
+            for token_str in self.pythai_word_tokenize(sent_str, engine='newmm'):
+                # by default pythainlp will output whitespace as a token
+                # we need to skip these tokens to be consistent with other tokenizers
+                if token_str.isspace():
+                    offset += len(token_str)
+                    continue
+
+                # create token entry
+                token_entry = {
+                    doc.TEXT: token_str,
+                    doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token_str)}"
+                }
+                current_sentence.append(token_entry)
+                offset += len(token_str)
+
+            # finish sentence
+            sentences.append(current_sentence)
+            current_sentence = []
+
+        if len(current_sentence) > 0:
+            sentences.append(current_sentence)
+
+        return doc.Document(sentences, text)
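
The variant registered above is picked up through the tokenize processor of a Thai pipeline. A minimal usage sketch follows; selecting the variant via the dict form of the processors argument reflects Stanza's usual variant mechanism, but the exact call is an assumption rather than something shown in this commit:

import stanza

# Hedged sketch: route Thai tokenization through the PyThaiNLP variant above.
# Requires `pip install pythainlp`; the dict form of `processors` selects the
# processor variant by its registered name.
nlp = stanza.Pipeline(lang="th", processors={"tokenize": "pythainlp"}, use_gpu=False)
doc = nlp("สวัสดีครับ วันนี้อากาศดีมาก")
for sentence in doc.sentences:
    print([token.text for token in sentence.tokens])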
stanza/stanza/tests/data/aws_annotations.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b77fd2ff28fcbe3d8bc3c96ce08ef5f8f08afbc5838cc29f41dcc91cb1109fb
+size 14600
stanza/stanza/tests/data/tiny_emb.gz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eadc38121a965a9666418f60be45a9cda5b0a5b59461e36f9cef9619d7297f82
+size 78
stanza/stanza/tests/data/tiny_emb.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc143a71d627ff8f8dfd0823af5e4ddaef47e0999ab2e6d2ead60860ae84a6b8
+size 698
stanza/stanza/tests/data/tiny_emb.xz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68085a67ccee721b457701c281fff7741db0ab4769237ed075a894f5d254b76a
+size 104
stanza/stanza/tests/data/tiny_emb.zip ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fa97fee0345fa19e2c7f41494362a7385c59df068ea84421048708aa2de3d2a
+size 220