|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
|
|
|
import numpy as np |
|
|
|
|
|
def _read_sentences(path):
    """Read the file at *path* and return its sentences as a list of strings.

    A sentence is every line up to and including the next empty line (a line
    that is empty once its trailing "\\r"/"\\n" characters are removed). Each
    returned string is the concatenation of those lines, terminator included,
    so writing the strings back in order reproduces the input.

    Raises:
        ValueError: if the file ends with lines that are not followed by an
            empty line.
    """
    sentences = []
    pending = []
    with open(path, "r", encoding="utf-8") as source:
        for line in source:
            pending.append(line)
            if not line.rstrip("\r\n"):
                sentences.append("".join(pending))
                pending = []
    # An explicit raise instead of an assert: asserts are removed under
    # `python -O`, which would silently drop the unterminated last sentence.
    if pending:
        raise ValueError("Missing empty line after the last sentence")
    return sentences


def _write_split(sentences, dev_indices, train_path, dev_path):
    """Write each sentence to *dev_path* or *train_path*, keeping input order.

    Sentences whose position in *sentences* is in *dev_indices* go to the
    development file; all others go to the training file.
    """
    with open(train_path, "w", encoding="utf-8") as train, open(dev_path, "w", encoding="utf-8") as dev:
        for i, sentence in enumerate(sentences):
            (dev if i in dev_indices else train).write(sentence)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("source", type=str, help="CoNLL-U file to split")
    parser.add_argument("train", type=str, help="CoNLL-U file to write training data to")
    parser.add_argument("dev", type=str, help="CoNLL-U file to write development data to")
    parser.add_argument("--dev_size", type=float, default=0.1, help="Size of the development data")
    args = parser.parse_args()

    # Reject fractions that cannot be sampled (negative, above 1, or NaN) up
    # front with a usage error instead of failing later inside the sampler.
    if not 0 <= args.dev_size <= 1:
        parser.error("--dev_size must be between 0 and 1")

    sentences = _read_sentences(args.source)

    # Fixed seed so the same source always yields the same split.
    dev_count = int(len(sentences) * args.dev_size)
    if dev_count:
        dev_indices = set(np.random.RandomState(42).choice(len(sentences), dev_count, replace=False))
    else:
        # Nothing to draw: skip the sampler so that an empty source does not
        # depend on how it treats an empty population.
        dev_indices = set()

    _write_split(sentences, dev_indices, args.train, args.dev)
|
|
|