Add files using upload-large-folder tool

Browse files

Files changed (11) hide show

.gitattributes +1 -0
stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt +3 -0
stanza/stanza/pipeline/demo/demo_server.py +56 -0
stanza/stanza/pipeline/demo/loading.gif +3 -0
stanza/stanza/pipeline/demo/stanza-brat.html +175 -0
stanza/stanza/pipeline/external/pythainlp.py +86 -0
stanza/stanza/tests/data/aws_annotations.zip +3 -0
stanza/stanza/tests/data/tiny_emb.gz +3 -0
stanza/stanza/tests/data/tiny_emb.pt +3 -0
stanza/stanza/tests/data/tiny_emb.xz +3 -0
stanza/stanza/tests/data/tiny_emb.zip +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+stanza/stanza/pipeline/demo/loading.gif filter=lfs diff=lfs merge=lfs -text

stanza/saved_models/depparse/la_giuseppe_transformer_parser.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dc9da2885756397790f0328e9b390e39dbd12d4d9eea538b23b83a87b4d42d1
+size 665353634

stanza/stanza/pipeline/demo/demo_server.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from flask import Flask, request, abort
+import json
+import stanza
+import os
+# Flask application; static files are served from the directory that contains this module.
+app = Flask(__name__, static_url_path='', static_folder=os.path.abspath(os.path.dirname(__file__)))
+# Loaded stanza pipelines keyed by language code, filled lazily by annotate().
+pipelineCache = dict()
+def get_file(path):
+    res = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
+    print(res)
+    return res
+@app.route('/<path:path>')
+def static_file(path):
+    if path in ['stanza-brat.css', 'stanza-brat.js', 'stanza-parseviewer.js', 'loading.gif',
+            'favicon.png', 'stanza-logo.png']:
+        return app.send_static_file(path)
+    elif path in 'index.html':
+        return app.send_static_file('stanza-brat.html')
+    else:
+        abort(403)
+@app.route('/', methods=['GET'])
+def index():
+    return static_file('index.html')
+@app.route('/', methods=['POST'])
+def annotate():
+    global pipelineCache
+    properties = request.args.get('properties', '')
+    lang = request.args.get('pipelineLanguage', '')
+    text = list(request.form.keys())[0]
+    if lang not in pipelineCache:
+        pipelineCache[lang] = stanza.Pipeline(lang=lang, use_gpu=False)
+    res = pipelineCache[lang](text)
+    annotated_sentences = []
+    for sentence in res.sentences:
+        tokens = []
+        deps = []
+        for word in sentence.words:
+            tokens.append({'index': word.id, 'word': word.text, 'lemma': word.lemma, 'pos': word.xpos, 'upos': word.upos, 'feats': word.feats, 'ner': word.parent.ner if word.parent.ner is None or word.parent.ner == 'O' else word.parent.ner[2:]})
+            deps.append({'dep': word.deprel, 'governor': word.head, 'governorGloss': sentence.words[word.head-1].text,
+                'dependent': word.id, 'dependentGloss': word.text})
+        annotated_sentences.append({'basicDependencies': deps, 'tokens': tokens})
+        if hasattr(sentence, 'constituency') and sentence.constituency is not None:
+            annotated_sentences[-1]['parse'] = str(sentence.constituency)
+    return json.dumps({'sentences': annotated_sentences})
+def create_app():
+    return app

stanza/stanza/pipeline/demo/loading.gif ADDED Viewed

Git LFS Details

SHA256: b2615978dc2c29ce5b6f6c2d45b76e11a439f79dc7554c9203b8aaa6a581aa03
Pointer size: 131 Bytes
Size of remote file: 366 kB

stanza/stanza/pipeline/demo/stanza-brat.html ADDED Viewed

	@@ -0,0 +1,175 @@

+<html>
+<head profile="http://www.w3.org/2005/10/profile">
+  <link rel='icon' href='favicon.png' type='image/png'/>
+  <!-- JQuery -->
+  <script src="https://code.jquery.com/jquery-2.1.4.min.js"></script>
+  <!-- Bootstrap -->
+  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css"/>
+  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap-theme.min.css"/>
+  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/js/bootstrap.min.js"></script>
+  <!-- Chosen Dropdown Library -->
+  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.css"/>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/chosen/1.4.2/chosen.jquery.min.js"></script>
+  <!-- Brat -->
+  <link rel="stylesheet" type="text/css" href="https://nlp.stanford.edu/js/brat/style-vis.css"/>
+  <script type="text/javascript" src="https://nlp.stanford.edu/js/brat/client/lib/head.load.min.js"></script>
+  <!-- d3 -->
+  <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.17/d3.min.js"></script>
+  <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/dagre-d3/0.4.17/dagre-d3.min.js"></script>
+    <!-- CoreNLP -->
+  <link rel="stylesheet" type="text/css" href="stanza-brat.css"/>
+  <script type="text/javascript" src="stanza-brat.js"></script>
+  <meta charset="UTF-8">
+</head>
+<body>
+<div id="wrap">
+<!-- A header bar -->
+<nav class="navbar navbar-default navbar-static-top">
+  <div class="container">
+    <div class="navbar-header">
+        <a class="navbar-brand" style="height:70px;font-size:20px" href="https://stanfordnlp.github.io/stanfordnlp/"><img src="stanza-logo.png" height="30px" style="display:inline-block; margin-bottom:8px"/> 1.6.0 (updated October 2023)</a>
+    </div>
+  </div>
+</nav>
+<!-- The main content of the page -->
+<div class="container">
+  <div class="row">
+    <!-- Text area input -->
+    <form id="form_annotate" accept-charset="UTF-8" onsubmit="return false;">
+    <div class="col-sm-12" style="margin-bottom: 5px;">
+      <label for="text" class="label">&mdash; Text to annotate &mdash;</label>
+      <textarea class="form-control" rows="2" id="text" placeholder="e.g., The quick brown fox jumped over the lazy dog." autofocus maxlength="10000"></textarea>
+    </div>
+    <!-- Annotators select -->
+    <div class="col-sm-8">
+      <label for="annotators" class="label">&mdash; Annotations &mdash;</label>
+      <select id="annotators" data-placeholder="CoreNLP annotators"
+              multiple class="chosen-select" title="Select CoreNLP annotators">
+        <option value="pos"            selected > parts-of-speech           </option>
+        <option value="upos"                    > universal parts-of-speech </option>
+        <option value="ner"            selected > named entities            </option>
+        <option value="lemma"          selected > lemmas                    </option>
+        <option value="depparse"       selected > dependency parse          </option>
+        <option value="parse"          selected > constituency parse        </option>
+      </select>
+    </div>
+    <div class="col-sm-2">
+        <label for="language" class="label">&mdash; Language &mdash;</label>
+        <select id="language" data-placeholder="Language"
+                class="chosen-select" title="Language">
+                    <option value="af">Afrikaans</option>
+                    <option value="grc">Ancient Greek</option>
+                    <option value="ar">Arabic</option>
+                    <option value="hy">Armenian</option>
+                    <option value="eu">Basque</option>
+                    <option value="be">Belarusian</option>
+                    <option value="bg">Bulgarian</option>
+                    <option value="bxr">Buryat</option>
+                    <option value="ca">Catalan</option>
+                    <option value="zh">Chinese (simplified)</option>
+                    <option value="zh-Hant">Chinese (traditional)</option>
+                    <option value="lzh">Classical Chinese</option>
+                    <option value="cop">Coptic</option>
+                    <option value="hr">Croatian</option>
+                    <option value="cs">Czech</option>
+                    <option value="da">Danish</option>
+                    <option value="nl">Dutch</option>
+                    <option value="en" selected>English</option>
+                    <option value="et">Estonian</option>
+                    <option value="fi">Finnish</option>
+                    <option value="fr">French</option>
+                    <option value="gl">Galician</option>
+                    <option value="de">German</option>
+                    <option value="got">Gothic</option>
+                    <option value="el">Greek</option>
+                    <option value="he">Hebrew</option>
+                    <option value="hi">Hindi</option>
+                    <option value="hu">Hungarian</option>
+                    <option value="id">Indonesian</option>
+                    <option value="ga">Irish</option>
+                    <option value="it">Italian</option>
+                    <option value="ja">Japanese</option>
+                    <option value="kk">Kazakh</option>
+                    <option value="ko">Korean</option>
+                    <option value="kmr">Kurmanji</option>
+                    <option value="la">Latin</option>
+                    <option value="lv">Latvian</option>
+                    <option value="lt">Lithuanian</option>
+                    <option value="olo">Livvi</option>
+                    <option value="mt">Maltese</option>
+                    <option value="mr">Marathi</option>
+                    <option value="sme">North Sami</option>
+                    <option value="no">Norwegian (Bokmål)</option>
+                    <option value="nn">Norwegian (Nynorsk)</option>
+                    <option value="cu">Old Church Slavonic</option>
+                    <option value="fro">Old French</option>
+                    <option value="orv">Old Russian</option>
+                    <option value="fa">Persian</option>
+                    <option value="pl">Polish</option>
+                    <option value="pt">Portuguese</option>
+                    <option value="ro">Romanian</option>
+                    <option value="ru">Russian</option>
+                    <option value="gd">Scottish Gaelic</option>
+                    <option value="sr">Serbian</option>
+                    <option value="sk">Slovak</option>
+                    <option value="sl">Slovenian</option>
+                    <option value="es">Spanish</option>
+                    <option value="sv">Swedish</option>
+                    <option value="swl">Swedish Sign Language</option>
+                    <option value="ta">Tamil</option>
+                    <option value="te">Telugu</option>
+                    <option value="tr">Turkish</option>
+                    <option value="uk">Ukrainian</option>
+                    <option value="hsb">Upper Sorbian</option>
+                    <option value="ur">Urdu</option>
+                    <option value="ug">Uyghur</option>
+                    <option value="vi">Vietnamese</option>
+                    <option value="wo">Wolof</option>
+        </select>
+    </div>
+    <!-- Submit button -->
+    <div class="col-sm-2" style="text-align: center; margin-top: 7px; ">
+        <button id="submit" class="btn btn-block btn-default">Submit</button>
+    </div>
+    </form>
+  </div>
+  <div class="row">
+    <!-- A panel for errors to show up in -->
+    <div id="errors" class="row">
+    </div>
+    <!-- Loading gif -->
+    <div id="loading" class="row" style="display:none">
+      <img src="loading.gif" height="200px" style="margin-left: 200px"/>
+    </div>
+    <!-- Annotation population area -->
+    <div id="annotations" class="row" style="display:none">
+    </div>
+  </div>
+</div>
+</div>
+<!-- The footer of the page -->
+<footer id="footer" class="footer">
+  <div class="container">
+    <p class="text-muted">
+      Visualisation provided using the <a href="http://brat.nlplab.org/">brat visualisation/annotation software</a>.
+    </p>
+  </div>
+</footer>
+</body>
+</html>

stanza/stanza/pipeline/external/pythainlp.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""
+Processors related to PyThaiNLP in the pipeline.
+GitHub Home: https://github.com/PyThaiNLP/pythainlp
+"""
+from stanza.models.common import doc
+from stanza.pipeline._constants import TOKENIZE
+from stanza.pipeline.processor import ProcessorVariant, register_processor_variant
+def check_pythainlp():
+    """
+    Import necessary components from pythainlp to perform tokenization.
+    """
+    try:
+        import pythainlp
+    except ImportError:
+        raise ImportError(
+            "The pythainlp library is required. "
+            "Try to install it with `pip install pythainlp`. "
+            "Go to https://github.com/PyThaiNLP/pythainlp for more information."
+        )
+    return True
+@register_processor_variant(TOKENIZE, 'pythainlp')
+class PyThaiNLPTokenizer(ProcessorVariant):
+    def __init__(self, config):
+        """ Construct a PyThaiNLP-based tokenizer.
+        Note that we always uses the default tokenizer of PyThaiNLP for sentence and word segmentation.
+        Currently this is a CRF model for sentence segmentation and a dictionary-based model (newmm) for word segmentation.
+        """
+        if config['lang'] != 'th':
+            raise Exception("PyThaiNLP tokenizer is only allowed in Thai pipeline.")
+        check_pythainlp()
+        from pythainlp.tokenize import sent_tokenize as pythai_sent_tokenize
+        from pythainlp.tokenize import word_tokenize as pythai_word_tokenize
+        self.pythai_sent_tokenize = pythai_sent_tokenize
+        self.pythai_word_tokenize = pythai_word_tokenize
+        self.no_ssplit = config.get('no_ssplit', False)
+    def process(self, document):
+        """ Tokenize a document with the PyThaiNLP tokenizer and wrap the results into a Doc object.
+        """
+        if isinstance(document, doc.Document):
+            text = document.text
+        else:
+            text = document
+        if not isinstance(text, str):
+            raise Exception("Must supply a string or Stanza Document object to the PyThaiNLP tokenizer.")
+        sentences = []
+        current_sentence = []
+        offset = 0
+        if self.no_ssplit:
+            # skip sentence segmentation
+            sent_strs = [text]
+        else:
+            sent_strs = self.pythai_sent_tokenize(text, engine='crfcut')
+        for sent_str in sent_strs:
+            for token_str in self.pythai_word_tokenize(sent_str, engine='newmm'):
+                # by default pythainlp will output whitespace as a token
+                # we need to skip these tokens to be consistent with other tokenizers
+                if token_str.isspace():
+                    offset += len(token_str)
+                    continue
+                # create token entry
+                token_entry = {
+                    doc.TEXT: token_str,
+                    doc.MISC: f"{doc.START_CHAR}={offset}|{doc.END_CHAR}={offset+len(token_str)}"
+                }
+                current_sentence.append(token_entry)
+                offset += len(token_str)
+            # finish sentence
+            sentences.append(current_sentence)
+            current_sentence = []
+        if len(current_sentence) > 0:
+            sentences.append(current_sentence)
+        return doc.Document(sentences, text)

stanza/stanza/tests/data/aws_annotations.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b77fd2ff28fcbe3d8bc3c96ce08ef5f8f08afbc5838cc29f41dcc91cb1109fb
+size 14600

stanza/stanza/tests/data/tiny_emb.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eadc38121a965a9666418f60be45a9cda5b0a5b59461e36f9cef9619d7297f82
+size 78

stanza/stanza/tests/data/tiny_emb.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc143a71d627ff8f8dfd0823af5e4ddaef47e0999ab2e6d2ead60860ae84a6b8
+size 698

stanza/stanza/tests/data/tiny_emb.xz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68085a67ccee721b457701c281fff7741db0ab4769237ed075a894f5d254b76a
+size 104

stanza/stanza/tests/data/tiny_emb.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fa97fee0345fa19e2c7f41494362a7385c59df068ea84421048708aa2de3d2a
+size 220