From a26cb9e73a7cfa2cb4af26495644bf27277b84d2 Mon Sep 17 00:00:00 2001 From: Ayaka Mikazuki Date: Mon, 25 Apr 2022 12:59:02 +0800 Subject: [PATCH] Implement PresetConversion --- .gitignore | 1 + README.md | 39 +++++++++++++++++++------------------ setup.py | 2 +- src/StarCC/__init__.py | 44 ++++++++++++++++++++++++++++++++++++++++++ test/main.py | 40 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 test/main.py diff --git a/.gitignore b/.gitignore index 9e31d05..8987d3b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ dist .mypy_cache /src/StarCC/dict +/test/testcases diff --git a/README.md b/README.md index e9417e8..bd021a4 100644 --- a/README.md +++ b/README.md @@ -4,34 +4,35 @@ [![Discussion - on Telegram](https://img.shields.io/badge/Discussion-on_Telegram-2ca5e0?logo=telegram)](https://t.me/+jOyC1UnIqZE3OGQ1) -## 用法 Usage +## 安裝 Installation ```sh pip install starcc ``` +## 用法 Usage + +不轉換用詞 Without phrase conversion: + ```python -from StarCC import Conversion, Dicts -convert = Conversion((Dicts.CN2ST, Dicts.ST2HK)) # change conversion mode here +from StarCC import PresetConversion +convert = PresetConversion(src='cn', dst='hk', with_phrase=False) print(convert('为什么你在床里面睡着?我们的硅二极管坏了,要去老挝修理。')) # 為什麼你在牀裏面睡着?我們的硅二極管壞了,要去老撾修理。 ``` +轉換用詞 With phrase conversion: + +```python +from StarCC import PresetConversion +convert = PresetConversion(src='cn', dst='tw', with_phrase=True) +print(convert('为什么你在床里面睡着?我们的硅二极管坏了,要去老挝修理。')) +# 為什麼你在床裡面睡著?我們的矽二極體壞了,要去寮國修理。 +``` + ## 轉換模式一覽 Supported conversion modes -| 源文本
From | 目標文本
To | 轉換詞彙?
Convert Phrases? | 配置
Config |
-| :-: | :-: | :-: | :-: |
-| `zh-CN` | `zh-HK` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2HK))` |
-| `zh-CN` | `zh-TW` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2TW))` |
-| `zh-CN` | `zh-JP` | ❌ | `Conversion((Dicts.CN2ST, Dicts.ST2JP))` |
-| `zh-HK` | `zh-CN` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2CN))` |
-| `zh-HK` | `zh-TW` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2TW))` |
-| `zh-HK` | `zh-JP` | ❌ | `Conversion((Dicts.HK2ST, Dicts.ST2JP))` |
-| `zh-TW` | `zh-CN` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2CN))` |
-| `zh-TW` | `zh-HK` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2HK))` |
-| `zh-TW` | `zh-JP` | ❌ | `Conversion((Dicts.TW2ST, Dicts.ST2JP))` |
-| `zh-JP` | `zh-CN` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2CN))` |
-| `zh-JP` | `zh-HK` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2HK))` |
-| `zh-JP` | `zh-TW` | ❌ | `Conversion((Dicts.JP2ST, Dicts.ST2TW))` |
-| `zh-CN` | `zh-TW` | ✅ | `Conversion((Dicts.CN2ST, Dicts.ST2TWP))` |
-| `zh-TW` | `zh-CN` | ✅ | `Conversion((Dicts.TWP2ST, Dicts.ST2CN))` |
+- `cn`: Simplified Chinese (Mainland China)
+- `hk`: Traditional Chinese (Hong Kong)
+- `tw`: Traditional Chinese (Taiwan)
+- `jp`: Japanese Shinjitai
diff --git a/setup.py b/setup.py
index d881acd..967092d 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name='starcc',
-    version='0.0.1',
+    version='0.0.2',
     description='Python implementation of StarCC',
     long_description=long_description,
     long_description_content_type='text/markdown',
diff --git a/src/StarCC/__init__.py b/src/StarCC/__init__.py
index 35d3fe6..8db3444 100644
--- a/src/StarCC/__init__.py
+++ b/src/StarCC/__init__.py
@@ -71,3 +71,69 @@ def __call__(self, s: str) -> str:
         for trie in self.tries:
             s = _convert(trie, s)
         return s
+
+class PresetConversion(Conversion):
+    """Conversion configured from a source and a destination variant code.
+
+    The dictionary for ``src`` -> ``st`` is queued first and the one for
+    ``st`` -> ``dst`` second; a step is left out when that side is ``st``.
+    """
+
+    def __init__(self, src='cn', dst='hk', with_phrase: bool=False) -> None:
+        """Pick the dictionaries that convert ``src`` text into ``dst`` text.
+
+        Args:
+            src: Variant of the input: 'st', 'cn', 'hk', 'tw' or 'jp'.
+            dst: Variant of the output; same choices, must differ from src.
+            with_phrase: Use the phrase-converting dictionaries. In this
+                mode each side must be 'cn', 'tw' or 'st'.
+
+        Raises:
+            ValueError: If src or dst is not a known variant, if they are
+                equal, or if with_phrase is set for an unsupported variant.
+        """
+        if src not in ('st', 'cn', 'hk', 'tw', 'jp'):
+            raise ValueError(f'Invalid src value: {src}')
+        if dst not in ('st', 'cn', 'hk', 'tw', 'jp'):
+            raise ValueError(f'Invalid dst value: {dst}')
+        # Raise rather than assert: assertions are stripped under `python -O`.
+        if src == dst:
+            raise ValueError(f'src and dst must be different, got {src} for both')
+
+        dicts_list = []
+
+        if src != 'st':
+            if not with_phrase:
+                dicts_list.append({
+                    'cn': Dicts.CN2ST,
+                    'hk': Dicts.HK2ST,
+                    'tw': Dicts.TW2ST,
+                    'jp': Dicts.JP2ST,
+                }[src])
+            else: # with_phrase
+                if src not in ('cn', 'tw'):
+                    raise ValueError(f'Phrase conversion for {src} is currently not supported')
+                dicts_list.append({
+                    'cn': Dicts.CN2ST, # CN does not need to convert phrases
+                    'tw': Dicts.TWP2ST,
+                }[src])
+
+        if dst != 'st':
+            if not with_phrase:
+                dicts_list.append({
+                    'cn': Dicts.ST2CN,
+                    'hk': Dicts.ST2HK,
+                    'tw': Dicts.ST2TW,
+                    'jp': Dicts.ST2JP,
+                }[dst])
+            else: # with_phrase
+                # Check dst, not src: the lookup below is keyed by dst, so an
+                # unsupported destination would otherwise raise a bare KeyError.
+                if dst not in ('cn', 'tw'):
+                    raise ValueError(f'Phrase conversion for {dst} is currently not supported')
+                dicts_list.append({
+                    'cn': Dicts.ST2CN, # CN does not need to convert phrases
+                    'tw': Dicts.ST2TWP,
+                }[dst])
+
+        super().__init__(dicts_list)
diff --git a/test/main.py b/test/main.py
new file mode 100644
index 0000000..0d12573
--- /dev/null
+++ b/test/main.py
@@ -0,0 +1,72 @@
+"""Run the StarCC test cases against PresetConversion.
+
+Run from the repository root: every path below is relative to it.
+"""
+
+import os
+import subprocess
+import sys
+from os import path
+from StarCC import PresetConversion
+
+# (test case name, (src, dst, with_phrase)). The name selects the files
+# test/testcases/<name>.in (input) and test/testcases/<name>.ans (expected).
+tests = (
+    ('hk2s', ('hk', 'cn', False)),
+    ('hk2t', ('hk', 'st', False)),
+    ('jp2t', ('jp', 'st', False)),
+    ('s2hk', ('cn', 'hk', False)),
+    ('s2t', ('cn', 'st', False)),
+    ('s2tw', ('cn', 'tw', False)),
+    ('s2twp', ('cn', 'tw', True)),
+    ('t2hk', ('st', 'hk', False)),
+    ('t2jp', ('st', 'jp', False)),
+    ('t2s', ('st', 'cn', False)),
+    ('tw2s', ('tw', 'cn', False)),
+    ('tw2sp', ('tw', 'cn', True)),
+    ('tw2t', ('tw', 'st', False)),
+)
+
+
+def fetch_testcases():
+    """Clone the test case repository if it is missing, then update it."""
+    if not path.exists('test/testcases'):
+        # Without the clone there is nothing to test, so a failure is fatal.
+        subprocess.run(
+            ['git', '-C', 'test', 'clone', 'https://github.com/StarCC0/testcases.git'],
+            check=True,
+        )
+    # Pull inside the clone: `git -C test pull` would instead pull the
+    # repository that contains test/. Best effort, as a stale or offline
+    # checkout can still be tested.
+    subprocess.run(['git', '-C', 'test/testcases', 'pull'])
+
+
+def run_test(name, config):
+    """Convert <name>.in and compare with <name>.ans; return True on a match."""
+    with open(f'test/testcases/{name}.in', encoding='utf-8') as f:
+        xs = f.read()
+    with open(f'test/testcases/{name}.ans', encoding='utf-8') as f:
+        ys = f.read()
+
+    convert = PresetConversion(*config)
+    ys_ = convert(xs)
+
+    if ys != ys_:
+        print(f'Error found in {name}\n'
+              f'Expected: {ys}\n'
+              f'Got: {ys_}\n\n')
+        return False
+    return True
+
+
+def main():
+    """Fetch the test cases, run all tests and return the exit status."""
+    fetch_testcases()
+    # Build the full list first so that every mismatch is reported.
+    results = [run_test(*test) for test in tests]
+    return 0 if all(results) else 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())