Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt +3 -0
- stanza/stanza/pipeline/demo/demo_server.py +56 -0
- stanza/stanza/pipeline/demo/loading.gif +3 -0
- stanza/stanza/pipeline/demo/stanza-brat.html +175 -0
- stanza/stanza/pipeline/external/pythainlp.py +86 -0
- stanza/stanza/tests/data/aws_annotations.zip +3 -0
- stanza/stanza/tests/data/tiny_emb.gz +3 -0
- stanza/stanza/tests/data/tiny_emb.pt +3 -0
- stanza/stanza/tests/data/tiny_emb.xz +3 -0
- stanza/stanza/tests/data/tiny_emb.zip +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
stanza/stanza/pipeline/demo/loading.gif filter=lfs diff=lfs merge=lfs -text
|
stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5dc9da2885756397790f0328e9b390e39dbd12d4d9eea538b23b83a87b4d42d1
|
| 3 |
+
size 665353634
|
stanza/stanza/pipeline/demo/demo_server.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, abort
|
| 2 |
+
import json
|
| 3 |
+
import stanza
|
| 4 |
+
import os
|
| 5 |
+
# Flask application whose static folder is the directory holding this module,
# exposed at the URL root (empty static_url_path).
app = Flask(
    __name__,
    static_url_path='',
    static_folder=os.path.abspath(os.path.dirname(__file__)),
)

# Pipelines that have already been constructed, keyed by language code.
pipelineCache = {}
|
| 8 |
+
|
| 9 |
+
def get_file(path):
    """Resolve ``path`` against the directory that contains this module.

    The resolved location is printed to stdout and then returned.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    resolved = os.path.join(module_dir, path)
    print(resolved)
    return resolved
|
| 13 |
+
|
| 14 |
+
@app.route('/<path:path>')
def static_file(path):
    """Serve one of the whitelisted demo assets, or refuse the request.

    Args:
        path: the URL path requested by the client.

    Returns:
        The static file response for a whitelisted asset; ``index.html`` is
        answered with ``stanza-brat.html``.

    Any other path is rejected with HTTP 403.
    """
    if path in ['stanza-brat.css', 'stanza-brat.js', 'stanza-parseviewer.js', 'loading.gif',
                'favicon.png', 'stanza-logo.png']:
        return app.send_static_file(path)
    elif path == 'index.html':
        # Exact comparison: `path in 'index.html'` was a substring test, so
        # requests such as "index", "html" or "x" were also served the page.
        return app.send_static_file('stanza-brat.html')
    else:
        abort(403)
|
| 23 |
+
|
| 24 |
+
@app.route('/', methods=['GET'])
def index():
    """Answer GET requests on the site root by delegating to ``static_file``."""
    landing_page = 'index.html'
    return static_file(landing_page)
|
| 27 |
+
|
| 28 |
+
@app.route('/', methods=['POST'])
def annotate():
    """Annotate the posted text with a Stanza pipeline and return it as JSON.

    The language code is read from the ``pipelineLanguage`` query argument and
    the text is taken from the first key of the submitted form.  One pipeline
    per language is built on first use and kept in ``pipelineCache``.

    Returns:
        A JSON string of the form ``{"sentences": [...]}``.  Every sentence
        carries ``tokens`` and ``basicDependencies``, plus ``parse`` when the
        sentence has a constituency tree.

    Aborts with HTTP 400 when the request has no form data.
    """
    lang = request.args.get('pipelineLanguage', '')

    # The text is the first form key.  An empty form used to raise IndexError
    # here and surface as a 500; report it as a client error instead.
    form_keys = list(request.form.keys())
    if not form_keys:
        abort(400)
    text = form_keys[0]

    if lang not in pipelineCache:
        pipelineCache[lang] = stanza.Pipeline(lang=lang, use_gpu=False)

    res = pipelineCache[lang](text)

    annotated_sentences = []
    for sentence in res.sentences:
        tokens = []
        deps = []
        for word in sentence.words:
            ner = word.parent.ner
            if ner is not None and ner != 'O':
                # Drop the two-character prefix in front of the entity type
                # (presumably a BIO-style marker such as "B-" -- TODO confirm).
                ner = ner[2:]
            tokens.append({'index': word.id, 'word': word.text, 'lemma': word.lemma, 'pos': word.xpos,
                           'upos': word.upos, 'feats': word.feats, 'ner': ner})

            head = word.head
            if head is None:
                # No head was assigned to this word (e.g. the pipeline has no
                # dependency parser), so there is no edge to report.
                continue
            # head == 0 marks the root; indexing words[head - 1] would wrap
            # around to the last word of the sentence and mislabel the governor.
            governor_gloss = 'ROOT' if head == 0 else sentence.words[head - 1].text
            deps.append({'dep': word.deprel, 'governor': head, 'governorGloss': governor_gloss,
                         'dependent': word.id, 'dependentGloss': word.text})

        annotated = {'basicDependencies': deps, 'tokens': tokens}
        if hasattr(sentence, 'constituency') and sentence.constituency is not None:
            annotated['parse'] = str(sentence.constituency)
        annotated_sentences.append(annotated)

    return json.dumps({'sentences': annotated_sentences})
|
| 54 |
+
|
| 55 |
+
def create_app():
    """Return the module-level Flask ``app`` object."""
    return app
|
stanza/stanza/pipeline/demo/loading.gif
ADDED
|
Git LFS Details
|
stanza/stanza/pipeline/demo/stanza-brat.html
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<html>
|
| 2 |
+
<head profile="http://www.w3.org/2005/10/profile">
|
| 3 |
+
<link rel='icon' href='favicon.png' type='image/png'/>
|
| 4 |
+
<!-- JQuery -->
|
| 5 |
+
<script src="https://code.jquery.com/jquery-2.1.4.min.js"></script>
|
| 6 |
+
<!-- Bootstrap -->
|
| 7 |
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
|
| 8 |
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
|
| 9 |
+
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
|
| 10 |
+
<!-- Chosen Dropdown Library -->
|
| 11 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
|
| 12 |
+
<script src="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>
|
| 13 |
+
<!-- Brat -->
|
| 14 |
+
<link rel="stylesheet" type="text/css" href="https://nlp.stanford.edu/js/brat/style-vis.css"/>
|
| 15 |
+
<script type="text/javascript" src="https://nlp.stanford.edu/js/brat/client/lib/head.load.min.js"></script>
|
| 16 |
+
<!-- d3 -->
|
| 17 |
+
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js"></script>
|
| 18 |
+
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/dagre-d3/0.4.17/dagre-d3.min.js"></script>
|
| 19 |
+
|
| 20 |
+
<!-- CoreNLP -->
|
| 21 |
+
<link rel="stylesheet" type="text/css" href="stanza-brat.css"/>
|
| 22 |
+
<script type="text/javascript" src="stanza-brat.js"></script>
|
| 23 |
+
|
| 24 |
+
<meta charset="UTF-8">
|
| 25 |
+
</head>
|
| 26 |
+
|
| 27 |
+
<body>
|
| 28 |
+
<div id="wrap">
|
| 29 |
+
<!-- A header bar -->
|
| 30 |
+
<nav class="navbar navbar-default navbar-static-top">
|
| 31 |
+
<div class="container">
|
| 32 |
+
<div class="navbar-header">
|
| 33 |
+
<a class="navbar-brand" style="height:70px;font-size:20px" href="https://stanfordnlp.github.io/stanfordnlp/"><img src="stanza-logo.png" height="30px" style="display:inline-block; margin-bottom:8px"/> 1.6.0 (updated October 2023)</a>
|
| 34 |
+
</div>
|
| 35 |
+
</div>
|
| 36 |
+
</nav>
|
| 37 |
+
|
| 38 |
+
<!-- The main content of the page -->
|
| 39 |
+
<div class="container">
|
| 40 |
+
<div class="row">
|
| 41 |
+
|
| 42 |
+
<!-- Text area input -->
|
| 43 |
+
<form id="form_annotate" accept-charset="UTF-8" onsubmit="return false;">
|
| 44 |
+
<div class="col-sm-12" style="margin-bottom: 5px;">
|
| 45 |
+
<label for="text" class="label">— Text to annotate —</label>
|
| 46 |
+
<textarea class="form-control" rows="2" id="text" placeholder="e.g., The quick brown fox jumped over the lazy dog." autofocus maxlength="10000"></textarea>
|
| 47 |
+
</div>
|
| 48 |
+
|
| 49 |
+
<!-- Annotators select -->
|
| 50 |
+
<div class="col-sm-8">
|
| 51 |
+
<label for="annotators" class="label">— Annotations —</label>
|
| 52 |
+
<select id="annotators" data-placeholder="CoreNLP annotators"
|
| 53 |
+
multiple class="chosen-select" title="Select CoreNLP annotators">
|
| 54 |
+
<option value="pos" selected > parts-of-speech </option>
|
| 55 |
+
<option value="upos" > universal parts-of-speech </option>
|
| 56 |
+
<option value="ner" selected > named entities </option>
|
| 57 |
+
<option value="lemma" selected > lemmas </option>
|
| 58 |
+
<option value="depparse" selected > dependency parse </option>
|
| 59 |
+
<option value="parse" selected > constituency parse </option>
|
| 60 |
+
</select>
|
| 61 |
+
</div>
|
| 62 |
+
|
| 63 |
+
<div class="col-sm-2">
|
| 64 |
+
<label for="language" class="label">— Language —</label>
|
| 65 |
+
<select id="language" data-placeholder="Language"
|
| 66 |
+
class="chosen-select" title="Language">
|
| 67 |
+
<option value="af">Afrikaans</option>
|
| 68 |
+
<option value="grc">Ancient Greek</option>
|
| 69 |
+
<option value="ar">Arabic</option>
|
| 70 |
+
<option value="hy">Armenian</option>
|
| 71 |
+
<option value="eu">Basque</option>
|
| 72 |
+
<option value="be">Belarusian</option>
|
| 73 |
+
<option value="bg">Bulgarian</option>
|
| 74 |
+
<option value="bxr">Buryat</option>
|
| 75 |
+
<option value="ca">Catalan</option>
|
| 76 |
+
<option value="zh">Chinese (simplified)</option>
|
| 77 |
+
<option value="zh-Hant">Chinese (traditional)</option>
|
| 78 |
+
<option value="lzh">Classical Chinese</option>
|
| 79 |
+
<option value="cop">Coptic</option>
|
| 80 |
+
<option value="hr">Croatian</option>
|
| 81 |
+
<option value="cs">Czech</option>
|
| 82 |
+
<option value="da">Danish</option>
|
| 83 |
+
<option value="nl">Dutch</option>
|
| 84 |
+
<option value="en" selected>English</option>
|
| 85 |
+
<option value="et">Estonian</option>
|
| 86 |
+
<option value="fi">Finnish</option>
|
| 87 |
+
<option value="fr">French</option>
|
| 88 |
+
<option value="gl">Galician</option>
|
| 89 |
+
<option value="de">German</option>
|
| 90 |
+
<option value="got">Gothic</option>
|
| 91 |
+
<option value="el">Greek</option>
|
| 92 |
+
<option value="he">Hebrew</option>
|
| 93 |
+
<option value="hi">Hindi</option>
|
| 94 |
+
<option value="hu">Hungarian</option>
|
| 95 |
+
<option value="id">Indonesian</option>
|
| 96 |
+
<option value="ga">Irish</option>
|
| 97 |
+
<option value="it">Italian</option>
|
| 98 |
+
<option value="ja">Japanese</option>
|
| 99 |
+
<option value="kk">Kazakh</option>
|
| 100 |
+
<option value="ko">Korean</option>
|
| 101 |
+
<option value="kmr">Kurmanji</option>
|
| 102 |
+
<option value="la">Latin</option>
|
| 103 |
+
<option value="lv">Latvian</option>
|
| 104 |
+
<option value="lt">Lithuanian</option>
|
| 105 |
+
<option value="olo">Livvi</option>
|
| 106 |
+
<option value="mt">Maltese</option>
|
| 107 |
+
<option value="mr">Marathi</option>
|
| 108 |
+
<option value="sme">North Sami</option>
|
| 109 |
+
<option value="no">Norwegian (Bokmål)</option>
|
| 110 |
+
<option value="nn">Norwegian (Nynorsk)</option>
|
| 111 |
+
<option value="cu">Old Church Slavonic</option>
|
| 112 |
+
<option value="fro">Old French</option>
|
| 113 |
+
<option value="orv">Old Russian</option>
|
| 114 |
+
<option value="fa">Persian</option>
|
| 115 |
+
<option value="pl">Polish</option>
|
| 116 |
+
<option value="pt">Portuguese</option>
|
| 117 |
+
<option value="ro">Romanian</option>
|
| 118 |
+
<option value="ru">Russian</option>
|
| 119 |
+
<option value="gd">Scottish Gaelic</option>
|
| 120 |
+
<option value="sr">Serbian</option>
|
| 121 |
+
<option value="sk">Slovak</option>
|
| 122 |
+
<option value="sl">Slovenian</option>
|
| 123 |
+
<option value="es">Spanish</option>
|
| 124 |
+
<option value="sv">Swedish</option>
|
| 125 |
+
<option value="swl">Swedish Sign Language</option>
|
| 126 |
+
<option value="ta">Tamil</option>
|
| 127 |
+
<option value="te">Telugu</option>
|
| 128 |
+
<option value="tr">Turkish</option>
|
| 129 |
+
<option value="uk">Ukrainian</option>
|
| 130 |
+
<option value="hsb">Upper Sorbian</option>
|
| 131 |
+
<option value="ur">Urdu</option>
|
| 132 |
+
<option value="ug">Uyghur</option>
|
| 133 |
+
<option value="vi">Vietnamese</option>
|
| 134 |
+
<option value="wo">Wolof</option>
|
| 135 |
+
|
| 136 |
+
</select>
|
| 137 |
+
</div>
|
| 138 |
+
|
| 139 |
+
<!-- Submit button -->
|
| 140 |
+
<div class="col-sm-2" style="text-align: center; margin-top: 7px; ">
|
| 141 |
+
<button id="submit" class="btn btn-block btn-default">Submit</button>
|
| 142 |
+
</div>
|
| 143 |
+
</form>
|
| 144 |
+
|
| 145 |
+
</div>
|
| 146 |
+
<div class="row">
|
| 147 |
+
<!-- A panel for errors to show up in -->
|
| 148 |
+
<div id="errors" class="row">
|
| 149 |
+
</div>
|
| 150 |
+
|
| 151 |
+
<!-- Loading gif -->
|
| 152 |
+
<div id="loading" class="row" style="display:none">
|
| 153 |
+
<img src="loading.gif" height="200px" style="margin-left: 200px"/>
|
| 154 |
+
</div>
|
| 155 |
+
|
| 156 |
+
<!-- Annotation population area -->
|
| 157 |
+
<div id="annotations" class="row" style="display:none">
|
| 158 |
+
</div>
|
| 159 |
+
</div>
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
</div>
|
| 163 |
+
</div>
|
| 164 |
+
|
| 165 |
+
<!-- The footer of the page -->
|
| 166 |
+
<footer id="footer" class="footer">
|
| 167 |
+
<div class="container">
|
| 168 |
+
<p class="text-muted">
|
| 169 |
+
Visualisation provided using the <a href="http://brat.nlplab.org/">brat visualisation/annotation software</a>.
|
| 170 |
+
</p>
|
| 171 |
+
</div>
|
| 172 |
+
</footer>
|
| 173 |
+
|
| 174 |
+
</body>
|
| 175 |
+
</html>
|
stanza/stanza/pipeline/external/pythainlp.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Processors related to PyThaiNLP in the pipeline.
|
| 3 |
+
|
| 4 |
+
GitHub Home: https://github.com/PyThaiNLP/pythainlp
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from stanza.models.common import doc
|
| 8 |
+
from stanza.pipeline._constants import TOKENIZE
|
| 9 |
+
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
|
| 10 |
+
|
| 11 |
+
def check_pythainlp():
    """
    Verify that the pythainlp package can be imported.

    Returns True on success; raises ImportError with installation
    instructions when the package is missing.
    """
    try:
        import pythainlp
    except ImportError:
        install_hint = (
            "The pythainlp library is required. "
            "Try to install it with `pip install pythainlp`. "
            "Go to https://github.com/PyThaiNLP/pythainlp for more information."
        )
        raise ImportError(install_hint)
    else:
        return True
|
| 24 |
+
|
| 25 |
+
@register_processor_variant(TOKENIZE, 'pythainlp')
class PyThaiNLPTokenizer(ProcessorVariant):
    """ Tokenizer variant, registered under the name 'pythainlp', that delegates to PyThaiNLP. """

    def __init__(self, config):
        """ Construct a PyThaiNLP-based tokenizer.

        Note that we always use the default tokenizer of PyThaiNLP for sentence and word segmentation.
        Currently this is a CRF model for sentence segmentation and a dictionary-based model (newmm) for word segmentation.

        Reads config['lang'] (must be 'th') and the optional config['no_ssplit'] flag (default False).
        Raises an Exception for any other language and an ImportError when pythainlp is not installed.
        """
        if config['lang'] != 'th':
            raise Exception("PyThaiNLP tokenizer is only allowed in Thai pipeline.")

        check_pythainlp()
        # imported lazily so that this module can be loaded without pythainlp installed
        from pythainlp.tokenize import sent_tokenize as pythai_sent_tokenize
        from pythainlp.tokenize import word_tokenize as pythai_word_tokenize

        self.pythai_sent_tokenize = pythai_sent_tokenize
        self.pythai_word_tokenize = pythai_word_tokenize
        self.no_ssplit = config.get('no_ssplit', False)

    def process(self, document):
        """ Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.

        `document` is either a raw string or a Stanza Document, in which case its `.text` is tokenized.
        Every token records its character span in the MISC field.
        Raises an Exception when no string can be obtained from `document`.
        """
        if isinstance(document, doc.Document):
            text = document.text
        else:
            text = document
        if not isinstance(text, str):
            raise Exception("Must supply a string or Stanza Document object to the PyThaiNLP tokenizer.")

        sentences = []
        # running character position: advanced by the length of every token
        # PyThaiNLP emits, including the whitespace tokens that are skipped
        offset = 0

        if self.no_ssplit:
            # skip sentence segmentation
            sent_strs = [text]
        else:
            sent_strs = self.pythai_sent_tokenize(text, engine='crfcut')

        for sent_str in sent_strs:
            current_sentence = []
            for token_str in self.pythai_word_tokenize(sent_str, engine='newmm'):
                # by default pythainlp will output whitespace as a token
                # we need to skip these tokens to be consistent with other tokenizers
                if token_str.isspace():
                    offset += len(token_str)
                    continue

                # create token entry
                token_entry = {
                    doc.TEXT: token_str,
                    doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token_str)}"
                }
                current_sentence.append(token_entry)
                offset += len(token_str)

            # finish sentence
            # A segment that held only whitespace produces no tokens.  The old
            # emptiness check sat after the loop, where the list had always just
            # been reset, so it never ran and empty sentences were still added.
            # NOTE(review): Document presumably expects every sentence to hold
            # at least one token -- confirm.
            if current_sentence:
                sentences.append(current_sentence)

        return doc.Document(sentences, text)
|
stanza/stanza/tests/data/aws_annotations.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b77fd2ff28fcbe3d8bc3c96ce08ef5f8f08afbc5838cc29f41dcc91cb1109fb
|
| 3 |
+
size 14600
|
stanza/stanza/tests/data/tiny_emb.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eadc38121a965a9666418f60be45a9cda5b0a5b59461e36f9cef9619d7297f82
|
| 3 |
+
size 78
|
stanza/stanza/tests/data/tiny_emb.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bc143a71d627ff8f8dfd0823af5e4ddaef47e0999ab2e6d2ead60860ae84a6b8
|
| 3 |
+
size 698
|
stanza/stanza/tests/data/tiny_emb.xz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68085a67ccee721b457701c281fff7741db0ab4769237ed075a894f5d254b76a
|
| 3 |
+
size 104
|
stanza/stanza/tests/data/tiny_emb.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2fa97fee0345fa19e2c7f41494362a7385c59df068ea84421048708aa2de3d2a
|
| 3 |
+
size 220
|