diff --git a/cloudia/main.py b/cloudia/main.py
index 5882c93..740c539 100644
--- a/cloudia/main.py
+++ b/cloudia/main.py
@@ -15,10 +15,11 @@ def __init__(self,
                  stop_words: List[str] = STOPWORDS,
                  extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
                  parse_func: Any = default_parse_func,
-                 multiprocess: bool = True,
+                 parser: Any = 'default',
+                 multiprocess: bool = False,
                  individual: bool = False,
                  **args):
-        args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags))
+        args.update(dict(single_words=single_words, stop_words=stop_words, extract_postags=extract_postags, parser=parser))
         self.wd = WordData(data, parse_func, multiprocess, individual, **args)
 
     def make_wordcloud(self, dark_theme: bool, rate: int) -> List[Tuple[str, WordCloud]]:
diff --git a/cloudia/pandas_accessor.py b/cloudia/pandas_accessor.py
index 0c9a26c..3076b82 100644
--- a/cloudia/pandas_accessor.py
+++ b/cloudia/pandas_accessor.py
@@ -18,14 +18,15 @@ def plot(self,
              stop_words: List[str] = STOPWORDS,
              extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
              parse_func: Any = default_parse_func,
+             parser: Any = 'default',
              dark_theme: bool = False,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True,
+             multiprocess: bool = False,
              individual: bool = False,
              **args):
-        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+        Cloudia(self.df, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
                 **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):
@@ -43,14 +44,15 @@ def plot(self,
              stop_words: List[str] = STOPWORDS,
              extract_postags: List[str] = ['名詞', '英単語', 'ローマ字文'],
              parse_func: Any = default_parse_func,
+             parser: Any = 'default',
              dark_theme: bool = False,
              title_size: int = 12,
              row_num: int = 3,
              figsize_rate: int = 2,
-             multiprocess: bool = True,
+             multiprocess: bool = False,
              individual: bool = False,
              **args):
-        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, multiprocess, individual,
+        Cloudia(self.series, single_words, stop_words, extract_postags, parse_func, parser, multiprocess, individual,
                 **args).plot(dark_theme, title_size, row_num, figsize_rate)
 
     def save(self, fig_path: str, dark_theme: bool, **args: Any):
diff --git a/cloudia/utils.py b/cloudia/utils.py
index 77e01ae..f00c9f9 100644
--- a/cloudia/utils.py
+++ b/cloudia/utils.py
@@ -11,8 +11,13 @@
 NUM_REGEX = re.compile('^[0-9]+$')
 
 
-def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str]) -> List[str]:
-    parser = nagisa.Tagger(single_word_list=single_words)
+def make_nagisa_tagger(single_words: List[str]):
+    return nagisa.Tagger(single_word_list=single_words)
+
+
+def default_parse_func(text: str, single_words: List[str], extract_postags: List[str], stop_words: List[str], parser) -> List[str]:
+    if parser == 'default':
+        parser = make_nagisa_tagger(single_words)
     for x in ['"', ';', ',', '(', ')', '\u3000']:
         text = text.replace(x, ' ')
     text = text.lower()
diff --git a/cloudia/word_data.py b/cloudia/word_data.py
index 2e4641d..2688705 100644
--- a/cloudia/word_data.py
+++ b/cloudia/word_data.py
@@ -5,7 +5,7 @@
 from joblib import Parallel, delayed
 import pandas as pd
 
-from cloudia.utils import function_wrapper
+from cloudia.utils import function_wrapper, make_nagisa_tagger
 
 
 class WordData:
@@ -42,6 +42,8 @@ def _parse(self, words: List[str], parse_func: Callable[..., List[str]], multipr
         return self._single_thread_parse(words, parse_func, **args)
 
     def _single_thread_parse(self, words: List[str], parse_func: Callable[..., List[str]], **args) -> List[Counter]:
+        if args['parser'] == 'default':
+            args.update({'parser': make_nagisa_tagger(args['single_words'])})
         return [Counter(parse_func(x, **args)) for x in words]
 
     def _parallel_parse(self, words: List[str], parse_func: Callable, **args) -> List[List[Counter]]:
diff --git a/test/unit_test/test_utils.py b/test/unit_test/test_utils.py
index d07a4b6..4703ea9 100644
--- a/test/unit_test/test_utils.py
+++ b/test/unit_test/test_utils.py
@@ -5,7 +5,7 @@
 
 class TestUtils(unittest.TestCase):
     def test_default_parse_func(self):
-        output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'])
+        output = default_parse_func('This is a simple test.', ['simple test'], ['英単語'], ['is'], 'default')
         self.assertListEqual(output, ['this', 'simple\u3000test'])
 
     def test_function_wrapper(self):
diff --git a/test/unit_test/test_word_data.py b/test/unit_test/test_word_data.py
index 2c9a978..c89e9e6 100644
--- a/test/unit_test/test_word_data.py
+++ b/test/unit_test/test_word_data.py
@@ -79,10 +79,10 @@ def test_convert_weight(self):
         self.assertDictEqual(output, {'hoge': 1, 'piyo': 0.5})
 
     def test_single_thread_parse(self):
-        def f(x):
+        def f(x, parser, single_words):
             return x.split(' ')
 
-        output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f)
+        output = self.cls._single_thread_parse(['hoge hoge', 'piyo'], f, **{'parser': 'default', 'single_words': []})
        target = [Counter(['hoge', 'hoge']), Counter(['piyo'])]
         for o, t in zip(output, target):
             self.assertEqual(type(o), type(t))
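
Usage note: the diff makes the tagger injectable, so a pre-built nagisa tagger can be reused instead of being constructed inside default_parse_func every time parsing runs. A minimal sketch of what this enables, assuming the import paths follow the file layout above; the sample DataFrame and its column name are made up, while Cloudia, make_nagisa_tagger, and the single_words/parser arguments come from this diff, and the plot() arguments mirror the pandas accessor call.

import pandas as pd

from cloudia.main import Cloudia
from cloudia.utils import make_nagisa_tagger

# Hypothetical sample data; any DataFrame with text content works.
df = pd.DataFrame({'text': ['simple test text', 'another simple test']})

# Build the nagisa tagger once and inject it via the new `parser` argument,
# instead of leaving parser='default' and having the tagger rebuilt on demand.
tagger = make_nagisa_tagger(single_words=['simple test'])
cloud = Cloudia(df, single_words=['simple test'], parser=tagger)

# Same positional arguments the pandas accessor passes:
# dark_theme, title_size, row_num, figsize_rate.
cloud.plot(False, 12, 3, 2)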