diff --git a/.gitignore b/.gitignore
index 29e25cf..344c176 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 Pipfile.lock
+TODO.rst
+dummy.py
 
 # .idea (JetBrains)
 .idea/
diff --git a/HISTORY.rst b/HISTORY.rst
index bcf005f..33d0218 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -2,6 +2,16 @@
 History
 #######
 
+**************************
+1.2.0 (February 2nd, 2020)
+**************************
+* **BREAKING**: :code:`._.conll` now outputs a dictionary :code:`fieldname: [value1, value2...]` for sentences, and
+  a list of such dictionaries for a Doc
+* Added a :code:`conversion_maps` argument where one can define a mapping to have better control over the model's
+  tagset (see the advanced example in README.rst)
+* Tests for usage with :code:`spacy-stanfordnlp`
+* Better documentation, including an advanced example
+
 **************************
 1.1.0 (January 21st, 2020)
 **************************
@@ -15,10 +25,10 @@ Minor documentation changes for PyPi.
 
 **************************
 1.0.0 (January 13th, 2020)
 **************************
-- Complete overhaul. Can now be used a custom pipeline component in spaCy.
-- Spacy2ConllParser is now deprecated.
-- The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead.
-- Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`,
+* Complete overhaul. Can now be used as a custom pipeline component in spaCy.
+* Spacy2ConllParser is now deprecated.
+* The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead.
+* Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`,
   parsed sentences will be numbered incrementally. This can be disabled so that the sentence numbering depends on
   how spaCy segments the sentences.
diff --git a/Pipfile b/Pipfile
index b723932..3d35a55 100644
--- a/Pipfile
+++ b/Pipfile
@@ -8,4 +8,7 @@ spacy = "*"
 packaging = "*"
 
 [dev-packages]
+torch = "*"
+spacy-stanfordnlp = "*"
 pytest = "*"
+pygments = "*"
diff --git a/README.rst b/README.rst
index 9bb5106..1c73fad 100644
--- a/README.rst
+++ b/README.rst
@@ -1,18 +1,35 @@
-===========================
-Parsing to CoNLL with spaCy
-===========================
+================================================
+Parsing to CoNLL with spaCy or spacy-stanfordnlp
+================================================
 
 This module allows you to parse a text to `CoNLL-U format`_. You can use it as a command line tool, or embed it in your
-own scripts by adding it as a custom component to a spaCy pipeline.
+own scripts by adding it as a custom component to a spaCy or spacy-stanfordnlp pipeline.
 
-Note that the module simply takes spaCy output and puts it in a formatted string adhering to the linked ConLL-U format. It does not as of yet do an explicit tagset mapping of spaCy to UD tags. The output tags depend on the spaCy model used.
+Note that the module simply takes a parser's output and puts it in a formatted string adhering to the linked CoNLL-U
+format. The output tags depend on the spaCy model used. If you want Universal Dependencies tags as output, I advise you
+to use this library in combination with `spacy_stanfordnlp`_, which is a spaCy interface using :code:`stanfordnlp` and
+its models behind the scenes. Those models use the Universal Dependencies formalism. See the remainder of this README
+for more information and usage guidelines.
 
 .. _`CoNLL-U format`: https://universaldependencies.org/format.html
+.. _`spacy_stanfordnlp`: https://github.com/explosion/spacy-stanfordnlp
 
 ============
 Installation
 ============
 
-Requires `spaCy`_ and an `installed spaCy language model`_. When using the module from the command line, you also need the :code:`packaging` package.
+Requires `spaCy`_ and an `installed spaCy language model`_. When using the module from the command line, you also need
+the :code:`packaging` package. See the section `spaCy`_ for usage.
+
+Because `spaCy's models`_ are not necessarily trained on Universal Dependencies conventions, their output labels are
+not UD either. By using :code:`spacy-stanfordnlp`, we get the easy-to-use interface of spaCy as a wrapper around
+:code:`stanfordnlp` and its models that *are* trained on UD data. If you want to use the Stanford NLP models, you also
+need :code:`spacy-stanfordnlp` and `a corresponding model`_. See the section `spacy-stanfordnlp`_ for usage.
+
+**NOTE**: :code:`spacy-stanfordnlp` is not automatically installed as a dependency of this library, because it might be
+too much overhead for those who don't need UD. If you wish to use its functionality, you have to install it manually.
+By default, only :code:`spacy` and :code:`packaging` are installed as dependencies.
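+
+For example, installing it manually with pip:
+
+.. code:: bash
+
+    pip install spacy-stanfordnlp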
+
+To install the library, simply use pip.
 
 .. code:: bash
 
@@ -20,6 +37,7 @@ Requires `spaCy`_ and an `installed spaCy language model`_. When using the modul
 
 .. _spaCy: https://spacy.io/usage/models#section-quickstart
 .. _installed spaCy language model: https://spacy.io/usage/models
+.. _`a corresponding model`: https://stanfordnlp.github.io/stanfordnlp/models.html
 
 =====
 Usage
 =====
 
 Command line
 ------------
 
     > python -m spacy_conll -h
     usage: [-h] [-f INPUT_FILE] [-a INPUT_ENCODING] [-b INPUT_STR]
-                [-t] [-o OUTPUT_FILE] [-c OUTPUT_ENCODING] [-m MODEL] [-s]
-                [-d] [-e] [-j N_PROCESS] [-v]
+                [-o OUTPUT_FILE] [-c OUTPUT_ENCODING] [-m MODEL_OR_LANG]
+                [-s] [-t] [-d] [-e] [-j N_PROCESS] [-u] [-v]
 
     Parse an input string or input file to CoNLL-U format.
 
     -h, --help            show this help message and exit
     -f INPUT_FILE, --input_file INPUT_FILE
                           Path to file with sentences to parse. Has precedence
                           over 'input_str'. (default: None)
     -a INPUT_ENCODING, --input_encoding INPUT_ENCODING
                           Encoding of the input file. Default value is system
                           default.
     -b INPUT_STR, --input_str INPUT_STR
                           Input string to parse. (default: None)
-    -t, --is_tokenized    Indicates whether your text has already been tokenized
-                          (space-seperated). (default: False)
     -o OUTPUT_FILE, --output_file OUTPUT_FILE
                           Path to output file. If not specified, the output will
                           be printed on standard output. (default: None)
     -c OUTPUT_ENCODING, --output_encoding OUTPUT_ENCODING
                           Encoding of the output file. Default value is system
                           default.
-    -m MODEL, --model MODEL
-                          spaCy model to use (must be installed). (default:
-                          en_core_web_sm)
+    -m MODEL_OR_LANG, --model_or_lang MODEL_OR_LANG
+                          spaCy or stanfordnlp model or language to use (must be
+                          installed). (default: None)
     -s, --disable_sbd     Disables spaCy automatic sentence boundary detection.
                           In practice, disabling means that every line will be
                           parsed as one sentence, regardless of its actual
-                          content. (default: False)
+                          content. Only works when using spaCy. (default: False)
+    -t, --is_tokenized    Indicates whether your text has already been tokenized
+                          (space-separated). When used in conjunction with
+                          spacy-stanfordnlp, it will also be assumed that the
+                          text is split into sentences by newlines. (default: False)
     -d, --include_headers
                           To include headers before the output of every
                           sentence. These headers include the sentence text and
@@ -73,11 +93,14 @@ Command line
                           Number of processes to use in nlp.pipe(). -1 will use
                           as many cores as available. Requires spaCy v2.2.2.
                           (default: 1)
+    -u, --use_stanfordnlp
+                          Use stanfordnlp models rather than spaCy models.
+                          Requires spacy-stanfordnlp. (default: False)
     -v, --verbose         To print the output to stdout, regardless of
                           'output_file'. (default: False)
 
-For example, parsing a sentence:
+For example, parsing a single line, multi-sentence string:
 
 .. code:: bash
 
@@ -102,8 +125,17 @@ For example, parsing a large input file and writing output to output file, using
 
     > python -m spacy_conll --input_file large-input.txt --output_file large-conll-output.txt --include_headers --disable_sbd -j 4
 
+You can also use Stanford NLP's models to retrieve UD tags by using the :code:`-u` flag. **NOTE**: using Stanford's
+models comes with limited options due to the API of :code:`stanfordnlp`: it is not possible to disable sentence
+segmentation and control the tokenisation at the same time. When using the :code:`-u` flag, you can only enable the
+:code:`--is_tokenized` flag, which behaves differently than it does with spaCy. With spaCy, it will simply not try to
+tokenize the text and will use spaces as token separators. With :code:`spacy-stanfordnlp`, it will also be assumed
+that the text is split into sentences by newlines. No further sentence segmentation is done.
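+
+For example, parsing a pre-tokenized, newline-separated input file with Stanford NLP's models (a sketch combining the
+:code:`-u` and :code:`-t` flags described above; the file names are placeholders):
+
+.. code:: bash
+
+    > python -m spacy_conll --input_file large-input.txt --output_file large-conll-output.txt --use_stanfordnlp --is_tokenized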
+
 In Python
 ---------
+spaCy
++++++
 
 :code:`spacy_conll` is intended to be used as a custom pipeline component in spaCy. Three custom extensions are
 accessible, by default named :code:`conll_str`, :code:`conll_str_headers`, and :code:`conll`.
@@ -145,7 +177,111 @@ The snippet above will return (and print) the following string:
 
     2	you	-PRON-	PRON	PRP	PronType=prs	1	nsubj	_	_
     3	?	?	PUNCT	.	PunctType=peri	1	punct	_	_
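+
+The :code:`conll` attribute, on the other hand, holds the parse as data rather than as a string: a dictionary mapping
+each field name to the list of its values for a sentence, and a list of such dictionaries for a Doc. A minimal sketch,
+reusing the :code:`nlp` object from the snippet above:
+
+.. code:: python
+
+    doc = nlp('I like cookies.')
+    # A Doc yields one dict per sentence, e.g. {'id': [1, 2, 3, 4], 'form': ['I', 'like', 'cookies', '.'], ...}
+    print(doc._.conll[0]['form'])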
+
+An advanced example, showing the more complex options:
+
+* :code:`ext_names`: changes the attribute names to a custom key by using a dictionary. You can change:
+
+  * :code:`conll_str`: a string representation of the CoNLL format
+  * :code:`conll_str_headers`: the same as conll_str but with leading lines containing the sentence index and sentence text
+  * :code:`conll`: a dictionary containing the field names and their values. For a Doc object, this returns a list of
+    dictionaries where each dictionary is a sentence
+
+* :code:`field_names`: a dictionary containing a mapping of field names so that you can name them as you wish
+* :code:`conversion_maps`: a two-level dictionary that looks like :code:`{field_name: {tag_name: replacement}}`.
+  In other words, you can specify in which field a certain value should be replaced by another.
+  This is especially useful when you are not satisfied with the tagset of a model and wish
+  to change some tags to an alternative.
+
+The example below:
+
+* changes the custom attribute :code:`conll` to :code:`conll_for_pd`;
+* changes the :code:`lemma` field to :code:`word_lemma`;
+* converts any :code:`-PRON-` to :code:`PRON`;
+* as a bonus: uses the output dictionary to create a pandas DataFrame.
+
+.. code:: python
+
+    import pandas as pd
+    import spacy
+    from spacy_conll import ConllFormatter
+
+
+    nlp = spacy.load('en')
+    conllformatter = ConllFormatter(nlp,
+                                    ext_names={'conll': 'conll_for_pd'},
+                                    field_names={'lemma': 'word_lemma'},
+                                    conversion_maps={'word_lemma': {'-PRON-': 'PRON'}})
+    nlp.add_pipe(conllformatter, after='parser')
+    doc = nlp('I like cookies.')
+    df = pd.DataFrame.from_dict(doc._.conll_for_pd[0])
+    print(df)
+
+The snippet above will output a pandas DataFrame:
+
+.. code:: text
+
+       id     form word_lemma upostag  ... head deprel deps misc
+    0   1        I       PRON    PRON  ...    2  nsubj    _    _
+    1   2     like       like    VERB  ...    0   ROOT    _    _
+    2   3  cookies     cookie    NOUN  ...    2   dobj    _    _
+    3   4        .          .   PUNCT  ...    2  punct    _    _
+
+    [4 rows x 10 columns]
+
+spacy-stanfordnlp
++++++++++++++++++
+
+Using :code:`spacy_conll` in conjunction with :code:`spacy-stanfordnlp` is similar to using it with :code:`spacy`:
+in practice we are still simply adding a custom pipeline component to the existing pipeline, but this time that
+pipeline is a Stanford NLP pipeline that is wrapped in spaCy's API.
+
+.. code:: python
+
+    from spacy_stanfordnlp import StanfordNLPLanguage
+    import stanfordnlp
+
+    from spacy_conll import ConllFormatter
+
+
+    snlp = stanfordnlp.Pipeline(lang='en')
+    nlp = StanfordNLPLanguage(snlp)
+    conllformatter = ConllFormatter(nlp)
+    nlp.add_pipe(conllformatter, last=True)
+
+    s = 'A cookie is a baked or cooked food that is typically small, flat and sweet.'
+
+    doc = nlp(s)
+    print(doc._.conll_str)
+
+Output:
+
+.. code:: text
+
+    1	A	a	DET	DT	_	2	det	_	_
+    2	cookie	cookie	NOUN	NN	Number=sing	8	nsubj	_	_
+    3	is	be	AUX	VBZ	VerbForm=fin|Tense=pres|Number=sing|Person=three	8	cop	_	_
+    4	a	a	DET	DT	_	8	det	_	_
+    5	baked	bake	VERB	VBN	VerbForm=part|Tense=past|Aspect=perf	8	amod	_	_
+    6	or	or	CCONJ	CC	ConjType=comp	7	cc	_	_
+    7	cooked	cook	VERB	VBN	VerbForm=part|Tense=past|Aspect=perf	5	conj	_	_
+    8	food	food	NOUN	NN	Number=sing	0	root	_	_
+    9	that	that	PRON	WDT	_	12	nsubj	_	_
+    10	is	be	AUX	VBZ	VerbForm=fin|Tense=pres|Number=sing|Person=three	12	cop	_	_
+    11	typically	typically	ADV	RB	Degree=pos	12	advmod	_	_
+    12	small	small	ADJ	JJ	Degree=pos	8	acl:relcl	_	_
+    13	,	,	PUNCT	,	PunctType=comm	14	punct	_	_
+    14	flat	flat	ADJ	JJ	Degree=pos	12	conj	_	_
+    15	and	and	CCONJ	CC	ConjType=comp	16	cc	_	_
+    16	sweet	sweet	ADJ	JJ	Degree=pos	12	conj	_	_
+    17	.	.	PUNCT	.	PunctType=peri	8	punct	_	_
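+
+The advanced options shown earlier work here as well. As a sketch, :code:`conversion_maps` could be used to
+upper-case the :code:`root` dependency label that these models output:
+
+.. code:: python
+
+    conllformatter = ConllFormatter(nlp, conversion_maps={'deprel': {'root': 'ROOT'}})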
""" for obj in Span, Doc: diff --git a/spacy_conll/__main__.py b/spacy_conll/__main__.py index ec56ad6..e8693bc 100644 --- a/spacy_conll/__main__.py +++ b/spacy_conll/__main__.py @@ -12,17 +12,45 @@ SENT_ID_RE = re.compile(r"(?<=# sent_id = )(\d+)") +def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp): + if model_or_lang is None: + model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm' + + nlp = None + if use_stanfordnlp: + from spacy_stanfordnlp import StanfordNLPLanguage + import stanfordnlp + + snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized) + nlp = StanfordNLPLanguage(snlp) + else: + # Init model: + # Initialize model, with custom pipe + # taking into account 'is_tokenized', 'disable_sbd', and 'include_headers' + nlp = spacy.load(model_or_lang) + if is_tokenized: + nlp.tokenizer = nlp.tokenizer.tokens_from_list + if disable_sbd: + nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser') + + conllformatter = ConllFormatter(nlp) + nlp.add_pipe(conllformatter, last=True) + + return nlp + + def main(input_file=None, input_encoding=getpreferredencoding(), input_str=None, is_tokenized=False, output_file=None, output_encoding=getpreferredencoding(), - model='en_core_web_sm', + model_or_lang=None, disable_sbd=False, include_headers=False, no_force_counting=False, n_process=1, + use_stanfordnlp=False, verbose=False ): """ Parse an input string or input file to CoNLL-U format @@ -33,35 +61,27 @@ def main(input_file=None, :param is_tokenized: indicates whether your text has already been tokenized (space-seperated) :param output_file: path to output file. If not specified, the output will be printed on standard output :param output_encoding: encoding of the output file. Default value is system default - :param model: spaCy model to use (must be installed) - :param disable_sbd: disables spaCy automatic sentence boundary detection + :param model_or_lang: spaCy or stanfordnlp model or language to use (must be installed) + :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy) :param include_headers: to include headers before the output of every sentence :param no_force_counting: to disable force counting the 'sent_id', starting from 1 and increasing for each sentence :param n_process: number of processes to use in nlp.pipe(). 
diff --git a/spacy_conll/__main__.py b/spacy_conll/__main__.py
index ec56ad6..e8693bc 100644
--- a/spacy_conll/__main__.py
+++ b/spacy_conll/__main__.py
@@ -12,17 +12,45 @@
 SENT_ID_RE = re.compile(r"(?<=# sent_id = )(\d+)")
 
 
+def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
+    if model_or_lang is None:
+        model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm'
+
+    if use_stanfordnlp:
+        from spacy_stanfordnlp import StanfordNLPLanguage
+        import stanfordnlp
+
+        snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized)
+        nlp = StanfordNLPLanguage(snlp)
+    else:
+        # Initialize the spaCy model with a custom pipeline,
+        # taking into account 'is_tokenized' and 'disable_sbd'
+        nlp = spacy.load(model_or_lang)
+        if is_tokenized:
+            nlp.tokenizer = nlp.tokenizer.tokens_from_list
+        if disable_sbd:
+            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')
+
+    conllformatter = ConllFormatter(nlp)
+    nlp.add_pipe(conllformatter, last=True)
+
+    return nlp
+
+
 def main(input_file=None,
          input_encoding=getpreferredencoding(),
         input_str=None,
         is_tokenized=False,
         output_file=None,
         output_encoding=getpreferredencoding(),
-        model='en_core_web_sm',
+        model_or_lang=None,
         disable_sbd=False,
         include_headers=False,
         no_force_counting=False,
         n_process=1,
+        use_stanfordnlp=False,
         verbose=False
         ):
    """ Parse an input string or input file to CoNLL-U format
@@ -33,35 +61,27 @@ def main(input_file=None,
        :param is_tokenized: indicates whether your text has already been tokenized (space-separated)
        :param output_file: path to output file. If not specified, the output will be printed on standard output
        :param output_encoding: encoding of the output file. Default value is system default
-       :param model: spaCy model to use (must be installed)
-       :param disable_sbd: disables spaCy automatic sentence boundary detection
+       :param model_or_lang: spaCy or stanfordnlp model or language to use (must be installed)
+       :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
        :param include_headers: to include headers before the output of every sentence
        :param no_force_counting: to disable force counting the 'sent_id', starting from 1 and increasing for each
                                  sentence
        :param n_process: number of processes to use in nlp.pipe(). -1 will use as many cores as available
+       :param use_stanfordnlp: whether to use stanfordnlp models rather than spaCy models
        :param verbose: to print the output to stdout, regardless of 'output_file'
        :return:
    """
-   # Init model:
-   # Initialize model, with custom pipe
-   # taking into account 'is_tokenized', 'disable_sbd', and 'include_headers'
-   nlp = spacy.load(model)
-   if is_tokenized:
-       nlp.tokenizer = nlp.tokenizer.tokens_from_list
-   if disable_sbd:
-       nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')
-   conllformatter = ConllFormatter(nlp)
-   nlp.add_pipe(conllformatter, after='parser')
+   nlp = _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp)
 
    # Gather input:
    # Collect lines in 'lines' variable, taking into account 'is_tokenized'
    lines = []
    if input_str is not None:
-       lines.append(input_str.strip().split(' ') if is_tokenized else input_str)
+       lines.append(input_str.strip().split(' ') if is_tokenized and not use_stanfordnlp else input_str)
    elif input_file is not None:
        with Path(input_file).open(encoding=input_encoding) as fhin:
            lines = [l.strip() for l in fhin.readlines()]
-           if is_tokenized:
+           if is_tokenized and not use_stanfordnlp:
                lines = [l.split(' ') for l in lines]
    else:
        raise ValueError("'input_file' or 'input_str' must be given.")
@@ -120,18 +140,24 @@ def _prevent_sbd(doc):
    parser.add_argument('-a', '--input_encoding', default=getpreferredencoding(),
                        help='Encoding of the input file. Default value is system default.')
    parser.add_argument('-b', '--input_str', default=None, help='Input string to parse.')
-   parser.add_argument('-t', '--is_tokenized', default=False, action='store_true',
-                       help='Indicates whether your text has already been tokenized (space-seperated).')
+
    # Output arguments
    parser.add_argument('-o', '--output_file', default=None,
                        help='Path to output file. If not specified, the output will be printed on standard output.')
    parser.add_argument('-c', '--output_encoding', default=getpreferredencoding(),
                        help='Encoding of the output file. Default value is system default.')
-   # Model arguments
-   parser.add_argument('-m', '--model', default='en_core_web_sm', help='spaCy model to use (must be installed).')
+
+   # Model/pipeline arguments
+   parser.add_argument('-m', '--model_or_lang', default=None,
+                       help='spaCy or stanfordnlp model or language to use (must be installed).')
    parser.add_argument('-s', '--disable_sbd', default=False, action='store_true',
                        help='Disables spaCy automatic sentence boundary detection. In practice, disabling means that'
-                            ' every line will be parsed as one sentence, regardless of its actual content.')
+                            ' every line will be parsed as one sentence, regardless of its actual content.'
+                            ' Only works when using spaCy.')
+   parser.add_argument('-t', '--is_tokenized', default=False, action='store_true',
+                       help='Indicates whether your text has already been tokenized (space-separated).'
+                            ' When used in conjunction with spacy-stanfordnlp, it will also be assumed that'
+                            ' the text is split into sentences by newlines.')
 
    # Additional arguments
    parser.add_argument('-d', '--include_headers', default=False, action='store_true',
@@ -144,6 +170,8 @@ def _prevent_sbd(doc):
    parser.add_argument('-j', '--n_process', type=int, default=1,
                        help='Number of processes to use in nlp.pipe(). -1 will use as many cores as available.'
                             ' Requires spaCy v2.2.2.')
+   parser.add_argument('-u', '--use_stanfordnlp', default=False, action='store_true',
+                       help='Use stanfordnlp models rather than spaCy models. Requires spacy-stanfordnlp.')
    parser.add_argument('-v', '--verbose', default=False, action='store_true',
                        help="To print the output to stdout, regardless of 'output_file'.")
diff --git a/spacy_conll/tests/conftest.py b/spacy_conll/tests/conftest.py
index 3e6e665..f61413c 100644
--- a/spacy_conll/tests/conftest.py
+++ b/spacy_conll/tests/conftest.py
@@ -1,17 +1,27 @@
 import pytest
 import spacy
+from spacy_stanfordnlp import StanfordNLPLanguage
+import stanfordnlp
 
 from spacy_conll import ConllFormatter
 
 
 @pytest.fixture(scope='session')
-def en_small_with_formatter():
+def spacy_en_small_with_formatter():
     nlp = spacy.load('en_core_web_sm')
     conllformatter = ConllFormatter(nlp)
     nlp.add_pipe(conllformatter, after='parser')
     return nlp
 
+@pytest.fixture(scope='session')
+def spacy_stanfordnlp_en_with_formatter():
+    snlp = stanfordnlp.Pipeline(lang='en')
+    nlp = StanfordNLPLanguage(snlp)
+    conllformatter = ConllFormatter(nlp)
+    nlp.add_pipe(conllformatter, last=True)
+    return nlp
+
 @pytest.fixture(scope='session')
 def single_string_single_sentence():
     return 'A cookie is a baked or cooked food that is typically small, flat and sweet.'
diff --git a/spacy_conll/tests/test_component.py b/spacy_conll/tests/test_component.py
index 560d616..8ef0af5 100644
--- a/spacy_conll/tests/test_component.py
+++ b/spacy_conll/tests/test_component.py
@@ -1,2 +1,5 @@
-def test_nlp_has_component(en_small_with_formatter):
-    assert 'conll_formatter' in en_small_with_formatter.pipe_names
\ No newline at end of file
+def test_spacy_has_component(spacy_en_small_with_formatter):
+    assert 'conll_formatter' in spacy_en_small_with_formatter.pipe_names
+
+def test_spacy_stanfordnlp_has_component(spacy_stanfordnlp_en_with_formatter):
+    assert 'conll_formatter' in spacy_stanfordnlp_en_with_formatter.pipe_names
diff --git a/spacy_conll/tests/test_has_extension.py b/spacy_conll/tests/test_has_extension.py
deleted file mode 100644
index fac0989..0000000
--- a/spacy_conll/tests/test_has_extension.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Doc: single sentence
-def test_doc_has_conll_str_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    assert doc.has_extension('conll_str')
-    assert doc._.conll_str is not None
-    assert isinstance(doc._.conll_str, str)
-
-def test_doc_has_conll_str_headers_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    assert doc.has_extension('conll_str_headers')
-    assert doc._.conll_str_headers is not None
-    assert isinstance(doc._.conll_str_headers, str)
-
-def test_doc_has_conll_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    assert doc.has_extension('conll')
-    assert doc._.conll is not None
-    assert isinstance(doc._.conll, list)
-
-# Doc: multi-sentence
-def test_doc_has_conll_str_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    assert doc.has_extension('conll_str')
-    assert doc._.conll_str is not None
-    assert isinstance(doc._.conll_str, str)
-
-def test_doc_has_conll_str_headers_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    assert doc.has_extension('conll_str_headers')
-    assert doc._.conll_str_headers is not None
-    assert isinstance(doc._.conll_str_headers, str)
-
-def test_doc_has_conll_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    assert doc.has_extension('conll')
-    assert doc._.conll is not None
-    assert isinstance(doc._.conll, list)
-
-
-# Sents
-def test_sents_has_conll_str_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll_str')
-        assert sent._.conll_str is not None
-        assert isinstance(sent._.conll_str, str)
-
-def test_sents_has_conll_str_headers_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll_str_headers')
-        assert sent._.conll_str_headers is not None
-        assert isinstance(sent._.conll_str_headers, str)
-
-def test_sents_has_conll_single_sentence(en_small_with_formatter, single_string_single_sentence):
-    doc = en_small_with_formatter(single_string_single_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll')
-        assert sent._.conll is not None
-        assert isinstance(sent._.conll, list)
-
-# Sents: multi-sentence
-def test_sents_has_conll_str_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll_str')
-        assert sent._.conll_str is not None
-        assert isinstance(sent._.conll_str, str)
-
-def test_sents_has_conll_str_headers_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll_str_headers')
-        assert sent._.conll_str_headers is not None
-        assert isinstance(sent._.conll_str_headers, str)
-
-def test_sents_has_conll_multi_sentence(en_small_with_formatter, single_string_multi_sentence):
-    doc = en_small_with_formatter(single_string_multi_sentence)
-    for sent in doc.sents:
-        assert sent.has_extension('conll')
-        assert sent._.conll is not None
-        assert isinstance(sent._.conll, list)
\ No newline at end of file
diff --git a/spacy_conll/tests/test_spacy_has_extension.py b/spacy_conll/tests/test_spacy_has_extension.py
new file mode 100644
index 0000000..f074390
--- /dev/null
+++ b/spacy_conll/tests/test_spacy_has_extension.py
@@ -0,0 +1,82 @@
+# Doc: single sentence
+def test_doc_has_conll_str_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll_str')
+    assert doc._.conll_str is not None
+    assert isinstance(doc._.conll_str, str)
+
+def test_doc_has_conll_str_headers_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll_str_headers')
+    assert doc._.conll_str_headers is not None
+    assert isinstance(doc._.conll_str_headers, str)
+
+def test_doc_has_conll_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll')
+    assert doc._.conll is not None
+    assert isinstance(doc._.conll, list)
+
+# Doc: multi-sentence
+def test_doc_has_conll_str_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll_str')
+    assert doc._.conll_str is not None
+    assert isinstance(doc._.conll_str, str)
+
+def test_doc_has_conll_str_headers_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll_str_headers')
+    assert doc._.conll_str_headers is not None
+    assert isinstance(doc._.conll_str_headers, str)
+
+def test_doc_has_conll_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll')
+    assert doc._.conll is not None
+    assert isinstance(doc._.conll, list)
+
+
+# Sents
+def test_sents_has_conll_str_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str')
+        assert sent._.conll_str is not None
+        assert isinstance(sent._.conll_str, str)
+
+def test_sents_has_conll_str_headers_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str_headers')
+        assert sent._.conll_str_headers is not None
+        assert isinstance(sent._.conll_str_headers, str)
+
+def test_sents_has_conll_single_sentence(spacy_en_small_with_formatter, single_string_single_sentence):
+    doc = spacy_en_small_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll')
+        assert sent._.conll is not None
+        assert isinstance(sent._.conll, dict)
+
+# Sents: multi-sentence
+def test_sents_has_conll_str_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str')
+        assert sent._.conll_str is not None
+        assert isinstance(sent._.conll_str, str)
+
+def test_sents_has_conll_str_headers_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str_headers')
+        assert sent._.conll_str_headers is not None
+        assert isinstance(sent._.conll_str_headers, str)
+
+def test_sents_has_conll_multi_sentence(spacy_en_small_with_formatter, single_string_multi_sentence):
+    doc = spacy_en_small_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll')
+        assert sent._.conll is not None
+        assert isinstance(sent._.conll, dict)
diff --git a/spacy_conll/tests/test_spacy_stanfordnlp_has_extension.py b/spacy_conll/tests/test_spacy_stanfordnlp_has_extension.py
new file mode 100644
index 0000000..ba6317c
--- /dev/null
+++ b/spacy_conll/tests/test_spacy_stanfordnlp_has_extension.py
@@ -0,0 +1,82 @@
+# Doc: single sentence
+def test_doc_has_conll_str_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll_str')
+    assert doc._.conll_str is not None
+    assert isinstance(doc._.conll_str, str)
+
+def test_doc_has_conll_str_headers_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll_str_headers')
+    assert doc._.conll_str_headers is not None
+    assert isinstance(doc._.conll_str_headers, str)
+
+def test_doc_has_conll_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    assert doc.has_extension('conll')
+    assert doc._.conll is not None
+    assert isinstance(doc._.conll, list)
+
+# Doc: multi-sentence
+def test_doc_has_conll_str_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll_str')
+    assert doc._.conll_str is not None
+    assert isinstance(doc._.conll_str, str)
+
+def test_doc_has_conll_str_headers_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll_str_headers')
+    assert doc._.conll_str_headers is not None
+    assert isinstance(doc._.conll_str_headers, str)
+
+def test_doc_has_conll_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    assert doc.has_extension('conll')
+    assert doc._.conll is not None
+    assert isinstance(doc._.conll, list)
+
+
+# Sents
+def test_sents_has_conll_str_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str')
+        assert sent._.conll_str is not None
+        assert isinstance(sent._.conll_str, str)
+
+def test_sents_has_conll_str_headers_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str_headers')
+        assert sent._.conll_str_headers is not None
+        assert isinstance(sent._.conll_str_headers, str)
+
+def test_sents_has_conll_single_sentence(spacy_stanfordnlp_en_with_formatter, single_string_single_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_single_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll')
+        assert sent._.conll is not None
+        assert isinstance(sent._.conll, dict)
+
+# Sents: multi-sentence
+def test_sents_has_conll_str_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str')
+        assert sent._.conll_str is not None
+        assert isinstance(sent._.conll_str, str)
+
+def test_sents_has_conll_str_headers_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll_str_headers')
+        assert sent._.conll_str_headers is not None
+        assert isinstance(sent._.conll_str_headers, str)
+
+def test_sents_has_conll_multi_sentence(spacy_stanfordnlp_en_with_formatter, single_string_multi_sentence):
+    doc = spacy_stanfordnlp_en_with_formatter(single_string_multi_sentence)
+    for sent in doc.sents:
+        assert sent.has_extension('conll')
+        assert sent._.conll is not None
+        assert isinstance(sent._.conll, dict)