diff --git a/HISTORY.rst b/HISTORY.rst index bcf005f..33d0218 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,6 +2,16 @@ History ####### +************************** +1.2.0 (February 2nd, 2020) +************************** +* **BREAKING**: :code:`._.conll` now outputs a dictionary for sentences :code:`fieldname: [value1, value2...]`, and + a list of such dictionaries for a Doc +* Added a :code:`conversion_maps` argument where one can define a mapping to have better control over the model's tagset + (see the advanced example in README.rst) +* Tests for usage with :code:`spacy-stanfordnlp` +* Better documentation, including advanced example + ************************** 1.1.0 (January 21st, 2020) ************************** @@ -15,10 +25,10 @@ Minor documentation changes for PyPi. ************************** 1.0.0 (January 13th, 2020) ************************** -- Complete overhaul. Can now be used a custom pipeline component in spaCy. -- Spacy2ConllParser is now deprecated. -- The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead. -- Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`, +* Complete overhaul. Can now be used as a custom pipeline component in spaCy. +* Spacy2ConllParser is now deprecated. +* The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead. +* Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`, parsed sentence will be numbered incrementally. This can be disabled so that the sentence numbering depends on how spaCy segments the sentences. diff --git a/README.rst b/README.rst index 0d8f66f..1c73fad 100644 --- a/README.rst +++ b/README.rst @@ -178,6 +178,57 @@ The snippet above will return (and print) the following string: 3 ? ? PUNCT . 
PunctType=peri 1 punct _ _ +An advanced example, showing the more complex options: + +* :code:`ext_names`: changes the attribute names to a custom key by using a dictionary. You can change: + + * :code:`conll_str`: a string representation of the CoNLL format + * :code:`conll_str_headers`: the same as conll_str but with leading lines containing sentence index and sentence text + * :code:`conll`: a dictionary containing the field names and their values. For a Doc object, this returns a list of + dictionaries where each dictionary is a sentence + +* :code:`field_names`: a dictionary containing a mapping of field names so that you can name them as you wish +* :code:`conversion_maps`: a two-level dictionary that looks like :code:`{field_name: {tag_name: replacement}}`. + In other words, you can specify in which field a certain value should be replaced by another. + This is especially useful when you are not satisfied with the tagset of a model and wish + to change some tags to an alternative. + +The example below + +* changes the custom attribute :code:`conll` to :code:`connl_for_pd`; +* changes the :code:`lemma` field to :code:`word_lemma`; +* converts any :code:`-PRON-` to :code:`PRON`; +* as a bonus: uses the output dictionary to create a pandas DataFrame. + +.. code:: python + + import pandas as pd + import spacy + from spacy_conll import ConllFormatter + + + nlp = spacy.load('en') + conllformatter = ConllFormatter(nlp, + ext_names={'conll': 'connl_for_pd'}, + field_names={'lemma': 'word_lemma'}, + conversion_maps={'word_lemma': {'-PRON-': 'PRON'}}) + nlp.add_pipe(conllformatter, after='parser') + doc = nlp('I like cookies.') + df = pd.DataFrame.from_dict(doc._.connl_for_pd[0]) + print(df) + +The snippet above will output a pandas DataFrame: + +.. code:: text + + id form word_lemma upostag ... head deprel deps misc + 0 1 I PRON PRON ... 2 nsubj _ _ + 1 2 like like VERB ... 0 ROOT _ _ + 2 3 cookies cookie NOUN ... 2 dobj _ _ + 3 4 . . PUNCT ... 
2 punct _ _ + + [4 rows x 10 columns] + spacy-stanfordnlp +++++++++++++++++ diff --git a/setup.py b/setup.py index 7028dbc..0f8fe22 100644 --- a/setup.py +++ b/setup.py @@ -6,12 +6,12 @@ setup( name='spacy_conll', - version='1.1.0', + version='1.2.0', description='A custom pipeline component for spaCy that can convert any parsed Doc' ' and its sentences into CoNLL-U format. Also provides a command line entry point.', long_description=long_description, long_description_content_type='text/x-rst', - keywords='nlp spacy spacy-extension conll conllu tagging', + keywords='nlp spacy spacy-extension conll conllu tagging parsing stanfordnlp spacy_stanfordnlp', packages=['spacy_conll'], url='https://github.com/BramVanroy/spacy_conll', author='Bram Vanroy, Raquel G. Alhama',