# coding=utf-8
import jieba
import torch


def jieba_tokenize(text):
    """Tokenize a string into a list of words with jieba."""
    return jieba.lcut(text)


# Unicode ranges used to decide whether a character counts as Chinese/CJK.
# Supplementary-plane ranges use the 8-digit \U escape; a 4-digit \u escape
# cannot encode code points above U+FFFF.
_UCODE_RANGES = (
    ("\u3400", "\u4db5"),  # CJK Unified Ideographs Extension A, release 3.0
    ("\u4e00", "\u9fa5"),  # CJK Unified Ideographs, release 1.1
    ("\u9fa6", "\u9fbb"),  # CJK Unified Ideographs, release 4.1
    ("\uf900", "\ufa2d"),  # CJK Compatibility Ideographs, release 1.1
    ("\ufa30", "\ufa6a"),  # CJK Compatibility Ideographs, release 3.2
    ("\ufa70", "\ufad9"),  # CJK Compatibility Ideographs, release 4.1
    ("\U00020000", "\U0002a6d6"),  # CJK Unified Ideographs Extension B, release 3.1
    ("\U0002f800", "\U0002fa1d"),  # CJK Compatibility Ideographs Supplement, release 3.1
    ("\uff00", "\uffef"),  # Halfwidth and Fullwidth Forms (fullwidth ASCII and
                           # punctuation, halfwidth katakana, halfwidth hangul)
    ("\u2e80", "\u2eff"),  # CJK Radicals Supplement
    ("\u3000", "\u303f"),  # CJK Symbols and Punctuation
    ("\u31c0", "\u31ef"),  # CJK Strokes
    ("\u2f00", "\u2fdf"),  # Kangxi Radicals
    ("\u2ff0", "\u2fff"),  # Ideographic Description Characters
    ("\u3100", "\u312f"),  # Bopomofo (phonetic symbols)
    ("\u31a0", "\u31bf"),  # Bopomofo Extended (Taiwanese and Hakka)
    ("\ufe10", "\ufe1f"),  # Vertical Forms
    ("\ufe30", "\ufe4f"),  # CJK Compatibility Forms
    ("\u2600", "\u26ff"),  # Miscellaneous Symbols
    ("\u2700", "\u27bf"),  # Dingbats
    ("\u3200", "\u32ff"),  # Enclosed CJK Letters and Months
    ("\u3300", "\u33ff"),  # CJK Compatibility
)


def is_chinese_char(uchar):
    """Return True if the single character ``uchar`` lies in any of the CJK ranges above."""
    for start, end in _UCODE_RANGES:
        if start <= uchar <= end:
            return True
    return False
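
# Spot checks (illustrative only, not part of the module's API):
#   is_chinese_char("中") -> True   (U+4E2D, CJK Unified Ideographs)
#   is_chinese_char("a")  -> False  (ASCII letters fall outside every listed range)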


def chinese_char_tokenize(line):
    """Insert spaces around every CJK character so the result splits cleanly on whitespace."""
    line = line.strip()
    line_in_chars = ""
    for char in line:
        if is_chinese_char(char):
            # Surround CJK characters with spaces; other characters pass through unchanged.
            line_in_chars += " " + char + " "
        else:
            line_in_chars += char
    return line_in_chars


# s = '中国的首都是哪里?1,2,3d回答'
# print(chinese_char_tokenize(s))


def report_memory(name):
    """Simple GPU memory report."""
    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(
        torch.cuda.memory_allocated() / mega_bytes)
    string += ' | max allocated: {}'.format(
        torch.cuda.max_memory_allocated() / mega_bytes)
    string += ' | reserved: {}'.format(
        torch.cuda.memory_reserved() / mega_bytes)
    string += ' | max reserved: {}'.format(
        torch.cuda.max_memory_reserved() / mega_bytes)
    print(string)
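

# Minimal usage sketch (illustrative; assumes this file is run directly and that
# jieba's default dictionary is available -- the exact word segmentation depends
# on it). report_memory only reports useful numbers on a CUDA-enabled machine.
if __name__ == "__main__":
    sample = "中国的首都是哪里?1,2,3d回答"
    print(jieba_tokenize(sample))         # word-level tokens from jieba
    print(chinese_char_tokenize(sample))  # CJK characters separated by spaces
    if torch.cuda.is_available():
        report_memory("demo")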