Prepare 1.2.0 release

BramVanroy · Feb 2, 2020 · 958eb80 · 958eb80
1 parent 4240656
commit 958eb80
Show file tree

Hide file tree

Showing 3 changed files with 67 additions and 6 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -2,6 +2,16 @@
 History
 #######
 
+**************************
+1.2.0 (February 2nd, 2020)
+**************************
+* **BREAKING**: :code:`._.conll` now outputs a dictionary for sentences :code:`fieldname: [value1, value2...]`, and
+  a list of such dictionaries for a Doc
+* Added a :code:`conversion_maps` argument where one can define a mapping to have better control over the model's tagset
+  (see the advanced example in README.rst)
+* Tests for usage with :code:`spacy-stanfordnlp`
+* Better documentation, including advanced example
+
 **************************
 1.1.0 (January 21st, 2020)
 **************************
@@ -15,10 +25,10 @@ Minor documentation changes for PyPi.
 **************************
 1.0.0 (January 13th, 2020)
 **************************
-- Complete overhaul. Can now be used a custom pipeline component in spaCy.
-- Spacy2ConllParser is now deprecated.
-- The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead.
-- Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`,
+* Complete overhaul. Can now be used as a custom pipeline component in spaCy.
+* Spacy2ConllParser is now deprecated.
+* The CLI interface does not rely on Spacy2ConllParser anymore but uses the custom pipeline component instead.
+* Added :code:`-e|--no_force_counting` to the CLI options. By default, when using :code:`-d|--include_headers`,
   parsed sentence will be numbered incrementally. This can be disabled so that the sentence numbering depends on how
   spaCy segments the sentences.
 

diff --git a/README.rst b/README.rst
@@ -178,6 +178,57 @@ The snippet above will return (and print) the following string:
     3	?	?	PUNCT	.	PunctType=peri	1	punct	_	_
 
 
+An advanced example, showing the more complex options:
+
+* :code:`ext_names`: changes the attribute names to a custom key by using a dictionary. You can change:
+
+ * :code:`conll_str`: a string representation of the CoNLL format
+ * :code:`conll_str_headers`: the same as :code:`conll_str` but with leading lines containing sentence index and sentence text
+ * :code:`conll`: a dictionary containing the field names and their values. For a Doc object, this returns a list of
+                  dictionaries where each dictionary is a sentence
+
+* :code:`field_names`: a dictionary containing a mapping of field names so that you can name them as you wish
+* :code:`conversion_maps`: a two-level dictionary that looks like :code:`{field_name: {tag_name: replacement}}`.
+                           In other words, you can specify in which field a certain value should be replaced by another.
+                           This is especially useful when you are not satisfied with the tagset of a model and wish
+                           to change some tags to an alternative.
+
+The example below
+
+* changes the custom attribute :code:`conll` to :code:`connl_for_pd`;
+* changes the :code:`lemma` field to :code:`word_lemma`;
+* converts any :code:`-PRON-` to :code:`PRON`;
+* as a bonus: uses the output dictionary to create a pandas DataFrame.
+
+.. code:: python
+
+    import pandas as pd
+    import spacy
+    from spacy_conll import ConllFormatter
+
+
+    nlp = spacy.load('en')
+    conllformatter = ConllFormatter(nlp,
+                                    ext_names={'conll': 'connl_for_pd'},
+                                    field_names={'lemma': 'word_lemma'},
+                                    conversion_maps={'word_lemma': {'-PRON-': 'PRON'}})
+    nlp.add_pipe(conllformatter, after='parser')
+    doc = nlp('I like cookies.')
+    df = pd.DataFrame.from_dict(doc._.connl_for_pd[0])
+    print(df)
+
+The snippet above will output a pandas DataFrame:
+
+.. code:: text
+
+       id     form word_lemma upostag  ... head deprel  deps misc
+    0   1        I       PRON    PRON  ...    2  nsubj     _    _
+    1   2     like       like    VERB  ...    0   ROOT     _    _
+    2   3  cookies     cookie    NOUN  ...    2   dobj     _    _
+    3   4        .          .   PUNCT  ...    2  punct     _    _
+
+    [4 rows x 10 columns]
+
 spacy-stanfordnlp
 +++++++++++++++++
 

diff --git a/setup.py b/setup.py
@@ -6,12 +6,12 @@
 
 setup(
     name='spacy_conll',
-    version='1.1.0',
+    version='1.2.0',
     description='A custom pipeline component for spaCy that can convert any parsed Doc'
                 ' and its sentences into CoNLL-U format. Also provides a command line entry point.',
     long_description=long_description,
     long_description_content_type='text/x-rst',
-    keywords='nlp spacy spacy-extension conll conllu tagging',
+    keywords='nlp spacy spacy-extension conll conllu tagging parsing stanfordnlp spacy_stanfordnlp',
     packages=['spacy_conll'],
     url='https://github.com/BramVanroy/spacy_conll',
     author='Bram Vanroy, Raquel G. Alhama',