From a1322bc71cea63f9ffa25e02ba190cde706bcf78 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 24 Jan 2024 19:07:04 +0100 Subject: [PATCH 01/21] DOC: Refactor documentation for faster builds - Switch from Jupyter Book to Sphinx - Use sphinx-book-theme as docs theme - Use myst-parser for Markdown support - Add commands for incremental development - Generate API docs statically with sphinx-apidoc --- .readthedocs.yaml | 7 - docs/_config.yml | 80 ---- docs/_static/medkit-icon.png | Bin 0 -> 6238 bytes .../medkit-logo.png} | Bin docs/_templates/autosummary/module.rst | 46 --- docs/_toc.yml | 49 --- docs/api-gen/index.md | 14 - docs/changelog.md | 2 + docs/conf.py | 48 +++ docs/index.md | 90 ++++- .../context_detection.md | 0 .../entity_matching.md | 0 docs/user_guide/first_steps.md | 357 +++++++++-------- docs/user_guide/install.md | 92 ++--- docs/user_guide/module.md | 14 - docs/user_guide/pipeline.md | 322 +++++++-------- docs/user_guide/provenance.md | 366 ++++++++---------- pyproject.toml | 33 +- 18 files changed, 668 insertions(+), 852 deletions(-) delete mode 100644 docs/_config.yml create mode 100644 docs/_static/medkit-icon.png rename docs/{img/medkit_logo.png => _static/medkit-logo.png} (100%) delete mode 100644 docs/_templates/autosummary/module.rst delete mode 100644 docs/_toc.yml delete mode 100644 docs/api-gen/index.md create mode 100644 docs/changelog.md create mode 100644 docs/conf.py rename docs/{user_guide => tutorial}/context_detection.md (100%) rename docs/{user_guide => tutorial}/entity_matching.md (100%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 0d0b8f36..9d42c9d3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,10 +6,6 @@ python: path: . extra_requirements: - docs - - metrics-ner - - nlstruct - - srt-io-converter - - webrtc-voice-detector build: os: ubuntu-22.04 @@ -18,6 +14,3 @@ build: apt_packages: - graphviz - libsndfile1 - jobs: - pre_build: - - jupyter-book config sphinx docs/ diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index 6db7653d..00000000 --- a/docs/_config.yml +++ /dev/null @@ -1,80 +0,0 @@ -# Book settings -# Learn more at https://jupyterbook.org/customize/config.html - -title: medkit documentation -author: HeKA Research Team -copyright: 2022-2024 -logo: img/medkit_logo.png - -# Force re-execution of notebooks on each build. 
-# See https://jupyterbook.org/content/execute.html -execute: - execute_notebooks: auto - stderr_output: error - raise_on_error: true - -# Define the name of the latex output file for PDF builds -latex: - latex_documents: - targetname: book.tex - -# Information about where the book exists on the web -repository: - url: https://github.com/medkit-lib/medkit # Online location of your book - path_to_book: docs # Optional path to your book, relative to the repository root - branch: main # Which branch of the repository should be used when creating links (optional) - -# Add buttons to your book -# See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository -html: - use_issues_button: true - use_repository_button: true - extra_navbar: "" - extra_footer: 'Contact: medkit-maintainers@inria.fr' - -# Add sphinx configuration options -sphinx: - config: - add_module_names: False - language: en - autoclass_content: 'both' - autodoc_typehints: 'both' - autodoc_typehints_description_target: 'documented' - autodoc_typehints_format: 'short' - autodoc_inherit_docstrings: True - autodoc_member_order: 'bysource' - autodoc_mock_imports: - - edsnlp - - nlstruct - - pandas - - pyannote - - pyannote.audio - - PyRuSH - - quickumls - - resampy - - seqeval - - soundfile - - spacy - - speechbrain - - torch - - torchaudio - - transformers - - webrtcvad - autosummary_ignore_module_all: False - nb_execution_show_tb: True - # napoleon extension config - napoleon_use_ivar: True - napoleon_use_param: True - napoleon_use_rtype: False - napoleon_attr_annotations: True - templates_path: ["_templates"] - suppress_warnings: ["etoc.toctree", "myst.domains"] - exclude_patterns: ["_build", "_templates"] - extra_extensions: - - 'sphinx.ext.autodoc' - - 'sphinx.ext.autosummary' - - 'sphinx.ext.napoleon' - - 'sphinx.ext.viewcode' - - 'sphinx_toolbox.more_autosummary' - - 'sphinx_toolbox.more_autodoc.typehints' - - 'sphinxcontrib.mermaid' diff --git a/docs/_static/medkit-icon.png b/docs/_static/medkit-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..ba8a37ee4b0e93e97299f72225592b5f300e9696 GIT binary patch literal 6238 zcmV-k7@_BhP)Nxjm1hr+Wf9 z-*Hol|wrsdN6S7T8SDK<%ACJ_qe)ft*V`>4LYp z;E5dA^PvH-xhS}6b7utQR4TW39sYk=nP&UA?e$^(LwF|LgF}ag8`^hjWaXomTpTdI zsTHg}Le3SOb3r**7xG!=aTityGYG(R!Y4ow;3K%l2|#HO2!aWutvsMb_|A6>4mE@a z4O2mzFTVmH0cWp+K*^+pHvE7JFJ=M348fTBLd-qpl0=B=Fmf{Tl!}5u_;_EsA$(|v z3fgkj)!=*7(8f+Udc#X2{<{u4!e#duGK_WsyawPA z0CxcB_a)%?ATv=O?M>t8I?sfxecPdi!m|&A)R+W4>!At^DfxU zf?XG!t+@>-eFio&@C_FD5^H!?IfXXjm^uof?kY(8nScdCP(n@uzIPKy+zjw4rlJZ` z;lQr-rOH&$VgR29<8A^v_efaT*a7))9o11?1u*u7uV7KzpP7D=NVRC)zUnI=x&mMy z)-FPpUDb5?&Ro&EBTeN2coze=ZptGO3ZSl9p}sSK8E$Taq@_zWoa9Lm;#1*hlnt1T zhS7%U9s}MqmFBQ#gPov(kM z0gw<7Qe*4H!60$8F?s;Wm?tAkbHfndc1y#zU8FCcQPrX&CzUu&Hbba|+j9+)-3JzF2rK3hcF|!2GuK|1#L5dZ;H zg~M%A+EE8usL{}rX`9@8Rk9ZDrwY6s^SQav9jl}T5e*L7o(Ltx zt5wf(z)32zcpg0Fuqh9XUw0#@wavayd&;a7!U!j8k@G}Po}eV~d`~f|$wJy!uLqDV zi;PATg&&>{zrR6O!mOFzvfA8V2nwjOei5Wb_#ICF`hE1q*P(KGVHV5-s}Gv;AUO#* zEA}VWn!)Ns8&zdKgaG|gGs|ok-dmea8}4ed(+T|mZYqn4Mk`?B?v4QX_B0`jd7(Pn z0M^hN5aKz2M=RAqnBG-jw1D|?I1aO05HMq0B|JE8ka&rv8Ef+viE9|R9>A8Ueg+<8;B)}5zOe&(o z4BN$n%t$qYx)iLB;_-~nquUa#SD$Mo)ZcB5DP_L%9i_)dlW-D$$L77P3jkU)_y+`> zl^5_y!|5z38pCLIq*XW905}A|I{_Tt2^2mvXnv4ApkZyOCJ2f-t!S)U~XY^ONh4&#vU$BCO%OnYzhxYSO z{u%H+#l9!Z@PzN%VG7SD`|iBo#d-gOU|tTQ0YS70QYbYxt|*maHMaGIrrwT6)xf4x z{an6T2_hqK1tfgJSsL`X0A4l%@XOl8QGS@h{BzV+l;5e?7cTu>Y`!`Ce-3q+Q~(>x zetz2`+&VG2V>4jORi6guGf<-gkliaJyu3m_2bImNLN0eKyV-p?=O#IuWzM)%F$2t~ 
zA_ja-c#IkM+R73M>b~+y6b1%}oP^{;?jaQ1BegrBIOodZ3@D)@G*9G2h>9?S?GH@& zJp+T9P+*W$o}Y1&&bdPKp?QKX5JZ8X^X3lTngad%BEa*aU_W6h!h*|Y>yFI;W5_l# z;R$0)!gmN=2K+j&%1o##fU(bB3!T{tcv&FXEbT~fC->$;?sM!;p2N9Zd#EaAqk>E$ zZz15Xnc(?jkH|)1nR#lHstQ2LY5+E$|QnUt!>0p|wy%3?epq?);&V?f6{X2vZM z7KC`5fe!)HUIy=&?RB9-kocBeUBVmPto@-^Bx}8nwZ5cmFvNKwROYwq%DOqYbA$t` z_&cMH{1Ik92llD-e66p@&L0op9soCJ*VRN+%z59Q?w*|h#=h`X>vvNkDX67gLF-dV z$UDGTVk-*DuBw}JJgM8z1ssx~aj;3aW&Ozl)q zVmRO^f-?#@#t);}w|g#EWKSbZ(2G*eAgUt*wb>Y}_v@d9Y-AhdNUauR!h#D&&Vt_~lW zjee9N{Nw9FLvf&Lc**x}HZuT)7oI_G&7BH`yo7ZAkX-Y?DF9yt@Y)dZ909i?RTfuA zk*|6xC(Q8klMH_;$fx6JGhnuk_xhodlhkkVps*zPcBLdm+#ajeX6C zGD;FahiyNR(7wMlA>{`cI8ITqUKO;NH!fnjDM5C5SL=jJr0Y)ya5AdHUG_h3Xaskp zySM9m)r3sfjW=NYx*Hb--?r15#w~R~@;?Dw&-wgp8Y`Hg!<-EEADIdP@45GWdmlXN zQ1}}*LUea1;Yeb5B0!E?Z2!9Ovc8QmMy3|$!X$A{n^A|fjv36Pjy^lYO z;nRB2_riJ35FrAJL#KsI#G6b_THagLRHsvk(mfvG(gN5PL^R~;12s}jX2C1%T1=20Tx=KEIAnNXGatLM3hKV| zuHeH}pTW=p`!O^t(!jxkgN<(5oM@L7KslxV_)?>ru*0;=s-QOt?G`|TX}16xOuLi~ zc)ll%ga)=Y0gLjWq-IJsr#N-`k(|B#K_m{`6TJ^V+304@iFT;h}dPZtnvJ?hoFU zoemin+_C*3O2E`Zm!gA}wuY?q0I^~e8cWd3j5`4w4GxZnvk4I)7qMSRiKX2ch}w;* zU-zb`MaKCItl6^%1#ide*jWJ(NJ@6I(qjK>Oq~F73`#+(jlCkl9&VGsNg+7QCfG30v84$=vE{{Is{8uVmQk9)rw$EcO>Yn0Jp)SF&~MD&DGeb7n3;G64MBkP zgEZ6*OMNB*%ru=XBu@_wsQK7dMd99|&AM5ny!;Omz!_nk#u8{@sNnzbzkhbmJ$&IA?OA)|e+1q!>LR@)t4imlQESKpl&`z=l^dTYs`V zojwIxla|sp!uY74R<(Llu@Y7P=wIx$&#tij9*F}dT!ZM$L zF^LrnaW8ng+Q$IAVzdf?Uu2KQZx*eC@?AhSK}dqWE8Syf!&!sFc&M+a>jjam=OX~V z2w<+c#yC}EUj5pg>25Q6w!^)~>n>(Z?gW_VNr1P_u(!7IG|8G=0^o6Go+zQoMbuz3 z4aQ}FN(hfV;fkbv7l6Y6Tm|5PwsrQH%D)(QgT5Jkxpb^xAN;T0B9EmEFq~#q|69oX z+4gNZ46EVYebsb%yZ}^zY4f0DcXAM{e+~<=<~V&wWst#|TpR%7X)Bn<$>JfSI*Pc& z30VSKa;Y6uanO@4g9055;MOHwxDTMW8vQ=pm6W>laP(?Hz(15hE@#`fI6mqSnLWb5 zJya^2g#dgrwj3tq5wyM@%#-1UfqCCS7;$i-iZacM>Wa4FHKGC3~ zxdi!RklpvNqHI6H&X5Y*@MQu~y8;E!N^MKl_C2G3w;DQaOg z%jfKvsS#z6_YMYrWYm)mzB@SpuO}?FDj%M%J8L9%4CxRHYhL(s&3uy}lGO%ptN^+` z|EaKg(wxm0EpRilz6=y1or9+NWe1_w?6Sv}0jdk6Que%UTUKm*f`KMIg*%n+m~NKJ zGPwIc0k}*9{x)W@!t-S4_1#%!YUa{*zYH5{n136H-k+3GPbbi-3kdQ%k`-hWrY109~SKJ}dGQuzkW{B{j&Z6Ts|B2(VP%pYdZ@eH)j zVw2tk0~ma0qLXjFl;dN-|_pLpK>UmF~KFKqmjCf1n>pCJ0BiGvP4J{6Pjj z7&19v_EZ6wOIi33^9>AKZANmB5ZZ2s-LYMgQJK*@DpD`!OcMr~F@~}DU%z4Rt?$18 zYHU;&`~utkV%S?#6u?UJ-eIk&%nO=r|H@U7aXnS!eoe>xSqZ?6xyht4B)4N5%ru1w)zZ2 zLI^j2vPiXDzv&-u*mvOh=loTFeqv-t3=bXvZ~}neg~;hjAw(|fA^?|!?wuH!`ePDv zIh9>GY!{Ju@8Ix^#}r_$(a`QYeleHFx1;BK(nTLWjEfl+&>6;EOl!B9<(_m(tD`+& z6}xp872$aSoCV-sFuKEyj{;IUU@ST0xTDu6_SzTfH}|Vf0>*-!odXP<58!bEx_J9# zL}awj0dSoFJO$7RgTu9tt;A4W6Ee}NH79KUTLIh%!n5HBT?gQLA|wNUCpzFP_`4Lt zZ(_R)c#488K|@A1qQdwDy`$AaN*mXt;a>lye$m9v1NY7Q2_>KFmd zD>Wh`MlHUU0=NZ)EA2ky;6PQg0{sjmzWvg0i3!IH4i!TVO&WeuTlMp#)O7GV0PiOlKwURsn9{p{+_U` zau!7LI`F(0UwuBQ-ZLPY+Q*i~&P7E)%aorOekitY^>9fDAKNyhIg0f#JCsOK_9x!~ zgaCVmVr6BC7kgxenK>2hn9?o|W+UQua?Tgk9hi0S@DN_;Nz*iX)|5BBZD2-E*deDuDT- znY^3&aXzFsn*bIZEmUwWJ(UjZ#1p+~D#mWO;hyG?}CH10cUGjfnoZ<}|3)0~_XDxuWVOce}%TDY5m@AGVb+iq@ zug0pkXff=f#2>MsBa&Wuata65oo zLIgpr&r+HVaT$E%83f7rx?rEmsjdPzsC0Ay=TZ=?@ewfY3YkD%#udWx)J6 z_{e-w{T$vh`a>x-NzEzA*}Q@#?8m@`OgKd|_GF;Bxa!@>m2eq)p`d|5COpi*k2ySO z8_X6J&CIBGKA<52F!XFF1Y?nnP0(cX!x1c{g@Ht^1)zf5@ zVFEC;t|x6F5)NDZVf2Fv_GfvDjI6ByEHPQsg#F!rCp0=U!AItc8Y+N@)~3_JZ-{Vl z2GD#6AVl2R+!)yt_sDM0*iCO@TBy;`Hx2C;K!fT30{{U3|K#$3X~RnsU;qFB07*qo IM6N<$g8QV+rT_o{ literal 0 HcmV?d00001 diff --git a/docs/img/medkit_logo.png b/docs/_static/medkit-logo.png similarity index 100% rename from docs/img/medkit_logo.png rename to docs/_static/medkit-logo.png diff --git a/docs/_templates/autosummary/module.rst b/docs/_templates/autosummary/module.rst deleted file mode 100644 index 
ec9d4072..00000000 --- a/docs/_templates/autosummary/module.rst +++ /dev/null @@ -1,46 +0,0 @@ -:orphan: - -{{ fullname | escape | underline}} - -{% if not modules %} -.. automodule:: {{ fullname }} - :members: - :inherited-members: dict - :autosummary: - :autosummary-members: - :autosummary-inherited-members: dict -{% endif %} - -{% block members %} -{% if members and modules %} -APIs ----- - -For accessing these APIs, you may use import like this: - -.. code-block:: python - - from {{ fullname }} import - -.. automodule:: {{ fullname }} - :members: {% for item in members %} {{ item }}, {%- endfor %} - :inherited-members: dict - :autosummary: - :autosummary-members: - :autosummary-inherited-members: dict - :autosummary-no-nesting: -{% endif %} -{% endblock %} - -{% block modules %} -{% if modules %} -Subpackages / Submodules ------------------------- -.. autosummary:: - :toctree: - :recursive: -{% for item in modules %} - {{ item }} -{%- endfor %} -{% endif %} -{% endblock %} diff --git a/docs/_toc.yml b/docs/_toc.yml deleted file mode 100644 index 2c7f5128..00000000 --- a/docs/_toc.yml +++ /dev/null @@ -1,49 +0,0 @@ -format: jb-book -root: index -parts: -- caption: User Guide - chapters: - - file: user_guide/install - - file: user_guide/first_steps - - file: user_guide/entity_matching - - file: user_guide/context_detection - - file: user_guide/pipeline - - file: user_guide/provenance - - file: user_guide/module -- caption: Medkit components - chapters: - - file: api/core - sections: - - file: api/core_text - - file: api/core_audio - - file: api/text - - file: api/audio - - file: api/io - - file: api/training - - file: api/tools -- caption: Examples - chapters: - - file: examples/spans - - file: examples/cleaning_text - - file: examples/text_segmentation/index - sections: - - file: examples/text_segmentation/section - - file: examples/text_segmentation/syntagma - - file: examples/text_segmentation/document - - file: examples/brat_io - - file: examples/spacy/index - sections: - - file: examples/spacy/spacy_io - - file: examples/spacy/spacy_pipeline - - file: examples/custom_text_operation - - file: examples/edsnlp - - file: examples/iamsystem - - file: examples/finetuning_hf_model - - file: examples/detecting_text_duplicates - - file: examples/audio_transcription - - file: examples/audio_dataset_metrics - - file: examples/ontotox -- caption: API reference - maxdepth: 2 - chapters: - - file: api-gen/index diff --git a/docs/api-gen/index.md b/docs/api-gen/index.md deleted file mode 100644 index a4f42240..00000000 --- a/docs/api-gen/index.md +++ /dev/null @@ -1,14 +0,0 @@ -# Generated API documentation - -```{eval-rst} -.. autosummary:: - :toctree: _autosummary - :recursive: - - medkit.core - medkit.text - medkit.audio - medkit.io - medkit.training - medkit.tools -``` diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 00000000..b48597b7 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,2 @@ +:::{include} ../CHANGELOG.md +::: \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..a954775b --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,48 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "medkit" +author = "HeKA Research Team" +project_copyright = f"2022-2024, {author}" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "autoapi.extension", + "myst_parser", + "numpydoc", + "sphinxcontrib.mermaid", +] +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- autoapi configuration --------------------------------------------------- +# https://sphinx-autoapi.readthedocs.io/en/latest/reference/config.html + +autoapi_dirs = ["../medkit"] +autoapi_root = "api/_generated" + +# -- myst_parser configuration ----------------------------------------------- +# https://myst-parser.readthedocs.io/en/latest/configuration.html + +myst_enable_extensions = ["attrs_inline", "colon_fence"] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_book_theme" +html_theme_options = { + "path_to_docs": "docs", + "repository_url": "https://github.com/medkit-lib/medkit", + "repository_branch": "main", +} +html_title = "medkit documentation" +html_logo = "_static/medkit-logo.png" +html_favicon = "_static/medkit-icon.png" +html_static_path = ["_static"] diff --git a/docs/index.md b/docs/index.md index 8f23a7bd..ea0f1df6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,20 +1,88 @@ -# medkit - documentation +# medkit -***medkit*** is a Python library for facilitating the **extraction of features** from various modalities of patient data: text, audio for now - relational, image, genetic, and others soon. To this end, medkit offers to compose pipelines with modules, developed by us, yourself or others. +`medkit` is a Python library which facilitates **extraction of features** +from various modalities of patient data, including text and audio for now +-- relational, image, genetic, and others will follow soon. -We pay particular attention to enable the implementation of **non-destructive pipelines** (no loss of information when passing from a module to another) and a flexible tracing of **data provenance**. +To this end, medkit enables composition of pipelines with multiple modules, +developed by us, yourself or others. -***medkit*** aims at accelerating the development of a learning health system, with a strong open-source and community orientation. +`medkit` places a strong emphasis on **non-destructive operations**, +i.e. no loss of information when passing data from a module to another, +and a flexible tracing of **data provenance**. + +`medkit` aims at accelerating the development of a learning health system, +with a strong dedication to open-source and community development. :::{warning} -*medkit* core library is still under development and testing. -Some interfaces may change in the future. +The `medkit` core library is still under heavy development and testing. +Some public interfaces may change in the future. +Please check the **BREAKING CHANGES** section of the project's changelog for details. 
+::: + +:::{toctree} +--- +caption: User Guide +hidden: +titlesonly: +--- +user_guide/install +user_guide/first_steps +user_guide/pipeline +user_guide/provenance +user_guide/module +::: -This library with its documentation is shared for following reasons: -* helping people with some guidelines for using first medkit versions, -* helping core development team to get user feedback for improving the *medkit* library. +:::{toctree} +--- +caption: Examples +hidden: +titlesonly: +--- +examples/spans +examples/cleaning_text +examples/text_segmentation/index +examples/text_segmentation/section +examples/text_segmentation/syntagma +examples/text_segmentation/document +examples/brat_io +examples/spacy/index +examples/spacy/spacy_io +examples/spacy/spacy_pipeline +examples/custom_text_operation +examples/edsnlp +examples/iamsystem +examples/finetuning_hf_model +examples/detecting_text_duplicates +examples/audio_transcription +examples/audio_dataset_metrics +examples/ontotox ::: +:::{toctree} +--- +caption: Tutorial +hidden: +titlesonly: +--- +tutorial/context_detection +tutorial/entity_matching +::: -```{tableofcontents} -``` +:::{toctree} +--- +caption: Reference +hidden: +titlesonly: +--- +api/_generated/index +api/audio +api/core +api/core_audio +api/core_text +api/io +api/text +api/training +api/tools +changelog +::: diff --git a/docs/user_guide/context_detection.md b/docs/tutorial/context_detection.md similarity index 100% rename from docs/user_guide/context_detection.md rename to docs/tutorial/context_detection.md diff --git a/docs/user_guide/entity_matching.md b/docs/tutorial/entity_matching.md similarity index 100% rename from docs/user_guide/entity_matching.md rename to docs/tutorial/entity_matching.md diff --git a/docs/user_guide/first_steps.md b/docs/user_guide/first_steps.md index dcbea060..21b6aeaf 100644 --- a/docs/user_guide/first_steps.md +++ b/docs/user_guide/first_steps.md @@ -1,28 +1,14 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # First steps -This tutorial will show you how to use medkit to annotate a text document, by -applying pre-processing, entity matching and context detections operations. +This tutorial will show you how to use `medkit` to annotate a text document, +by successively applying pre-processing, entity matching +and context detection operations. ## Loading a text document -For starters, let's load a text file using the -{class}`~medkit.core.text.TextDocument` class: +For starters, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: -```{code-cell} ipython3 +:::{code-block} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt @@ -30,112 +16,117 @@ from pathlib import Path from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../data/text/1.txt")) -``` +::: The full raw text can be accessed through the `text` attribute: -```{code-cell} ipython3 +:::{code-block} print(doc.text) -``` +::: -A `TextDocument` can store {class}`~medkit.core.text.TextAnnotation` objects but -for now, our document is empty. +A `TextDocument` can store {class}`~medkit.core.text.TextAnnotation` objects. +For now, our document is free of annotations. 
## Splitting a document in sentences A common task in natural language processing is to split (or tokenize) text -documents in sentences. Medkit provides several segmentation operations, -including a rule-based {class}`~medkit.text.segmentation.SentenceTokenizer` -class that relies on a list of punctuation characters. Let's instantiate it: +documents in sentences. + +`medkit` provides several segmentation operations, +including a rule-based {class}`~medkit.text.segmentation.SentenceTokenizer` class +that relies on a list of punctuation characters. -```{code-cell} ipython3 +:::{code-block} from medkit.text.segmentation import SentenceTokenizer sent_tokenizer = SentenceTokenizer( output_label="sentence", punct_chars=[".", "?", "!"], ) -``` +::: + +As all operations, `SentenceTokenizer` defines a `run()` method. -As all operations, `SentenceTokenizer` defines a `run()` method. This method -returns a list of {class}`~medkit.core.text.Segment` objects (a `Segment` is a -`TextAnnotation` that represents a portion of a document's full raw text). As -input, it also expects a list of `Segment` objects. Here, we can pass a special -segment containing the whole raw text of the document, that we can retrieve -through the `raw_segment` attribute of `TextDocument`: +This method accepts a list of {class}`~medkit.core.text.Segment` objects +(a `Segment` is a `TextAnnotation` that represents parts of a document's raw text) +and returns a list of `Segment` objects. -```{code-cell} ipython3 +Here, we can pass a special `Segment` containing the full text of the document, +which can be retrieved through the `raw_segment` attribute of `TextDocument`: + +:::{code-block} sentences = sent_tokenizer.run([doc.raw_segment]) + for sentence in sentences: print(f"uid={sentence.uid}") print(f"text={sentence.text!r}") print(f"spans={sentence.spans}, label={sentence.label}\n") -``` +::: -As you can see, each segment has: +Each segment features: - an `uid` attribute, which unique value is automatically generated; - a `text` attribute holding the text that the segment refers to; - - a `spans` attribute reflecting the position of this text in the document's - full raw text. Here we only have one span for each segment, but multiple - discontinuous spans are supported; - - and a `label`, always equal to `"sentence"` in our case but it could be - different for other kinds of segments. + - a `spans` attribute reflecting the position of this text in the document's raw text. + Here, there is only one span per segment, but multiple discontinuous spans are supported; + - a `label` attribute (set to "sentence" in our example), + which could be different for other kinds of segments. ## Preprocessing a document -If you take a look at the 13th and 14th detected sentences, you will notice something -strange: +If you take a look at the 13th and 14th detected sentences, +you will notice something strange: -```{code-cell} ipython3 +:::{code-block} print(repr(sentences[12].text)) print(repr(sentences[13].text)) -``` +::: -This is actually one sentence that was split into two segments, because the -sentence tokenizer incorrectly considers the dot in the decimal weight value to -mark the end of a sentence. We could be a little smarter when configuring the -tokenizer, but instead, for the sake of learning, let's fix this with a -pre-processing step that replaces dots by commas in decimal numbers. 
+This is actually one sentence that was split into two segments, +because the sentence tokenizer incorrectly considers the dot in the decimal weight value +to mark the end of a sentence. +We could be a little smarter when configuring the tokenizer, +but instead, for the sake of learning, +let's fix this with a pre-processing step that replaces dots by commas in decimal numbers. -For this, we can use the -{class}`~medkit.text.preprocessing.RegexpReplacer` class, a regexp-based -"search-and-replace" operation. As many medkit operations, it can be configured -with a set of user-determined rules: +For this, we can use the {class}`~medkit.text.preprocessing.RegexpReplacer` class, +a regexp-based "search-and-replace" operation. +As other `medkit` operations, it can be configured with a set of user-determined rules: -```{code-cell} ipython3 +:::{code-block} from medkit.text.preprocessing import RegexpReplacer -rule = (r"(?<=\d)\.(?=\d)", ",") # => (pattern to replace, new text) +rule = (r"(?<=\d)\.(?=\d)", ",") # => (pattern to replace, new text) regexp_replacer = RegexpReplacer(output_label="clean_text", rules=[rule]) -``` +::: -The `run()` method of the normalizer takes a list of `Segment` objects and -returns a list of new `Segment` objects, one for each input `Segment`. In our -case we only want to preprocess the full raw text segment and we will only -receive one preprocessed segment, so we can call it with: +The `run()` method of the normalizer takes a list of `Segment` objects +and returns a list of new `Segment` objects, one for each input `Segment`. +In our case we only want to preprocess the full raw text segment, +and we will only receive one preprocessed segment, +so we can call it with: -```{code-cell} ipython3 +:::{code-block} clean_segment = regexp_replacer.run([doc.raw_segment])[0] print(clean_segment.text) -``` +::: -And then we may use again our previously-defined sentence tokenizer, but this -time on the preprocessed text: +We may use again our previously-defined sentence tokenizer again, +but this time on the preprocessed text: -```{code-cell} ipython3 +:::{code-block} sentences = sent_tokenizer.run([clean_segment]) print(sentences[12].text) -``` +::: Problem fixed! ## Finding entities -the medkit library also comes with operations to perform NER (named entity recognition), for -instance {class}`~medkit.text.ner.regexp_matcher.RegexpMatcher`. Let's -instantiate one with a few simple rules: +The `medkit` library also comes with operations to perform NER (named entity recognition), +for instance with {class}`~medkit.text.ner.regexp_matcher.RegexpMatcher`. +Let's instantiate one with a few simple rules: -```{code-cell} ipython3 +:::{code-block} from medkit.text.ner import RegexpMatcher, RegexpMatcherRule regexp_rules = [ @@ -147,60 +138,61 @@ regexp_rules = [ RegexpMatcherRule(regexp=r"\bnasonex?\b", label="treatment", case_sensitive=False), ] regexp_matcher = RegexpMatcher(rules=regexp_rules) -``` +::: -As you can see, you can also define some rules that ignore case distinctions by -setting `case-sensitive` parameter to `False`. +As you can see, you can also define some rules that ignore case distinctions +by setting `case-sensitive` parameter to `False`. In this example, we decide to make it for drugs (Allegra, Nasonex and Loratadine). -```{note} -When `RegexpMatcher` is instantiated without any rules, it will use a set of -default rules that where initially created to be used with documents in french -from the APHP EDS. 
These rules are stored in the -`regexp_matcher_default_rules.yml` file in the `medkit.text.ner` module. - -You may also define your own rules in a `.yml` file. You can then load them -using the `RegexpMatcher.load_rules()` static method and then pass then to the -`RegexpMatcher` at init. -``` - -Since `RegexpMatcher` is an NER operation, its `run()` method returns a list of -{class}`~medkit.core.text.Entity` objects representing the entities that were -matched (`Entity` is a subclass of `Segment`). As input, it expects a list of -`Segment` objects. Let's give it the sentences returned by the sentence -tokenizer: - -```{code-cell} ipython3 +:::{note} +When `RegexpMatcher` is instantiated without any rules, +it will use a set of default rules that where initially created +to be used with documents in French from the APHP EDS. +These rules are stored in file `regexp_matcher_default_rules.yml` +located in the `medkit.text.ner` module. + +You may also define your own rules in a `.yml` file. +You can then load them using the `RegexpMatcher.load_rules()` static method +and pass them to the `RegexpMatcher` constructor. +::: + +Since `RegexpMatcher` is an NER operation, +its `run()` method returns a list of {class}`~medkit.core.text.Entity` objects +representing the entities that were matched (`Entity` is a subclass of `Segment`). +As input, it expects a list of `Segment` objects. +Let's give it the sentences returned by the sentence tokenizer: + +:::{code-block} entities = regexp_matcher.run(sentences) for entity in entities: print(f"uid={entity.uid}") print(f"text={entity.text!r}, spans={entity.spans}, label={entity.label}\n") -``` +::: -Just like sentences, each entity has `uid`, `text`, `spans` and `label` attributes (in -this case, determined by the rule that was used to match it). +Just like sentences, each entity features `uid`, `text`, `spans` and `label` attributes +(in this case, determined by the rule that was used to match it). ## Detecting negation -So far we have detected several entities with `"problem"` or `"treatment"` -labels in our document. We might be tempted to use them directly to build a list -of problems that the patient faces and treatments that were given, but if we -look at how these entities are used in the document, we will see that some of -these entities actually denote the absence of a problem or treatment. - -To solve this kind of situations, medkit comes with context detectors, such as -{class}`~medkit.text.context.negation_detector.NegationDetector`. -`NegationDetector.run()` receives a list of `Segment` objects. It doesn't return -anything but it will append an {class}`~medkit.core.Attribute` object to each -segment with a boolean value indicating whether a negation was detected or not +So far, we have detected several entities with `"problem"` or `"treatment"` labels in our document. +We might be tempted to use them directly +to build a list of problems that the patient faces and treatments that were given, +but if we look at how these entities are used in the document, +we will see that some of these entities actually denote the absence of a problem or treatment. + +To solve this kind of situation, `medkit` comes with context detectors, +such as {class}`~medkit.text.context.negation_detector.NegationDetector`. +`NegationDetector.run()` receives a list of `Segment` objects. 
+It does not return anything, but it will append an {class}`~medkit.core.Attribute` object +to each segment with a boolean value indicating whether a negation was detected or not (`Segment` and `Entity` objects can have a list of `Attribute` objects, accessible through their {class}`~medkit.core.AttributeContainer`). -Let's instantiate a `NegationDetector` with a couple of simplistic handcrafted -rules and run it on our sentences: +Let's instantiate a `NegationDetector` with a couple of simplistic handcrafted rules +and run it on our sentences: -```{code-cell} ipython3 +:::{code-block} from medkit.text.context import NegationDetector, NegationDetectorRule neg_rules = [ @@ -210,38 +202,40 @@ neg_rules = [ ] neg_detector = NegationDetector(output_label="is_negated", rules=neg_rules) neg_detector.run(sentences) -``` +::: -```{note} -Similarly to `RegexpMatcher`, `DetectionDetector` also comes with a set of -default rules designed for documents from the EDS, stored in -`negation_detector_default_rules.yml` inside `medkit.text.context`. -``` +:::{note} +Similarly to `RegexpMatcher`, `DetectionDetector` also comes with a set of default rules +designed for documents from the EDS, +which are stored in file `negation_detector_default_rules.yml` +located in the `medkit.text.context` module. +::: And now, let's look at which sentence have been detected as being negated: -```{code-cell} ipython3 +:::{code-block} for sentence in sentences: neg_attr = sentence.attrs.get(label="is_negated")[0] if neg_attr.value: print(sentence.text) -``` +::: -Our simple negation detector doesn't work so bad, but sometimes -some part of the sentence has a negation and the other doesn't, and -in that case the whole sentence gets flagged as being negated. +Our simple negation detector does not work too bad, +but sometimes some part of the sentence is tagged with a negation whilst the rest does not, +resulting in the whole sentence getting flagged as being negated. -To mitigate this, we can split each sentence into finer-grained segments called -syntagmas. Medkit provide a {class}`~medkit.text.segmentation.SyntagmaTokenizer` -for that purpose. Let's instantiate one, run it on our sentences and then run -again the negation detector but this time on the syntagmas: +To mitigate this, each sentence can be split into finer-grained segments called syntagmas. +`medkit` provides a {class}`~medkit.text.segmentation.SyntagmaTokenizer` for that purpose. +Let's instantiate one, apply it to our sentences and run the negation detector again, +but this time on the syntagmas: -```{note} +:::{note} `SyntagmaTokenizer` also has default rules designed for documents from the EDS, -stored in `default_syntagma_definition.yml` inside `medkit.text.segmentation`. -``` +which are stored in file `default_syntagma_definition.yml` +located in the `medkit.text.segmentation` module. +::: -```{code-cell} ipython3 +:::{code-block} from medkit.text.segmentation import SyntagmaTokenizer synt_tokenizer = SyntagmaTokenizer( @@ -255,71 +249,74 @@ for syntagma in syntagmas: neg_attr = syntagma.attrs.get(label="is_negated")[0] if neg_attr.value: print(syntagma.text) -``` - -That's a little better. We now have some information about negation attached to -syntagmas, but our end goal is really to know, for each entity, whether it -should be considered as negated or not. In more practical terms, we now have -negation attributes attached to our syntagmas, but what we really want is to -have negation attributes attached to entities. 
- -In medkit, the way to do this is to use the `attrs_to_copy` parameter. This -parameter is available on all NER operations. It is used to tell the operation -which attributes should be copied from the input segments to the newly matched -entities (based on their label). In other words, it provides a way to propagate -context attributes (such as negation attributes) for segments to entities. - -Let's again use a `RegexpMatcher` to find some entities, but this time from -syntagmas rather than from sentences, and using `attrs_to_copy` to copy negation -attributes: - -```{code-cell} ipython3 +::: + +We now have some information about negation attached to syntagmas, +but the end goal is really to know, for each entity, +whether it should be considered as negated or not. +In more practical terms, we have got negation attributes attached to our syntagmas, +but what we would prefer is to have negation attributes attached to entities. + +In `medkit`, the way to do this is to use the `attrs_to_copy` parameter, +which is available for all NER operations. +This parameter tells the operation which attributes should be copied +from the input segments to the newly matched entities (based on their label). +In other words, it provides a way to propagate context attributes +(such as negation attributes) for segments to entities. + +Let's again use a `RegexpMatcher` to find some entities, +but this time from syntagmas rather than from sentences, +and using `attrs_to_copy` to copy negation attributes: + +:::{code-block} regexp_matcher = RegexpMatcher(rules=regexp_rules, attrs_to_copy=["is_negated"]) entities = regexp_matcher.run(syntagmas) for entity in entities: neg_attr = entity.attrs.get(label="is_negated")[0] print(f"text='{entity.text}', label={entity.label}, is_negated={neg_attr.value}") -``` +::: We now have a negation `Attribute` for each entity! ## Augmenting a document -We now have an interesting set of annotations. We might want to process them -directly, for instance to generate table-like data about patient treatment in -order to compute some statistics. But we could also want to attach them back to -our document in order to save them or export them to some format. +We now have an interesting set of annotations. +We might want to process them directly, +for instance to generate table-like data about patient treatment +in order to compute some statistics. +But we could also want to attach them back to our document +in order to save them or export them to some format. The annotations of a text document can be access with `TextDocument.anns`, -an instance of {class}`~medkit.core.text.TextAnnotationContainer`) that behaves -roughly like a list but also offers additional filtering methods. Annotations -can be added by calling its `add()` method: +an instance of {class}`~medkit.core.text.TextAnnotationContainer`) +that behaves roughly like a list but also offers additional filtering methods. 
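+Once entities have been attached to the document (see the `add()` call just below),
+the container can also be queried by label.
+This is only a sketch, assuming `doc.anns` supports iteration and the `get()` filter:
+
+:::{code-block}
+# iterate over all annotations, roughly as with a list
+for ann in doc.anns:
+    print(ann.label, ann.text)
+
+# or keep only the annotations carrying a given label
+problems = doc.anns.get(label="problem")
+:::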
+Annotations can be added by calling its `add()` method: -```{code-cell} ipython3 +:::{code-block} for entity in entities: doc.anns.add(entity) -``` +::: -The document and its entities can then be exported to supported external formats -(cf {class}`~medkit.io.brat.BratOutputConverter` and -{class}`~medkit.io.doccano.DoccanoOutputConverter`), or serialized in the -{mod}`~medkit.io.medkit_json` format: +The document and its corresponding entities can be exported to supported formats +such as brat (see {class}`~medkit.io.brat.BratOutputConverter`) +or Doccano (see {class}`~medkit.io.doccano.DoccanoOutputConverter`), +or serialized to JSON (see {mod}`~medkit.io.medkit_json`): -```{code-cell} ipython3 +:::{code-block} from medkit.io import medkit_json medkit_json.save_text_document(doc, "doc_1.json") -``` +::: ## Visualizing entities with displacy -Rather than printing entities, we can visualize them with `displacy`, a -visualization tool part of the [spaCy](https://spacy.io/) NLP library. Medkit -provides helper functions to facilitate the use of `displacy` in the -{mod}`~medkit.text.spacy.displacy_utils` module: +Rather than printing entities, we can visualize them with `displacy`, +a visualization tool part of the [spaCy](https://spacy.io/) NLP library. +`medkit` provides helper functions to facilitate the use of `displacy` +in the {mod}`~medkit.text.spacy.displacy_utils` module: -```{code-cell} ipython3 +:::{code-block} :tags: [scroll-output] from spacy import displacy @@ -327,7 +324,7 @@ from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy displacy_data = medkit_doc_to_displacy(doc) displacy.render(displacy_data, manual=True, style="ent") -``` +::: ## Wrapping it up @@ -335,15 +332,15 @@ In this tutorial, we have: - created a `TextDocument` from an existing text file; - instantiated several pre-processing, segmentation, context detection and entity matching operations; -- ran these operations sequentially over the document and obtained entities; +- run these operations sequentially over the document and obtained entities; - attached these entities back to the original document. -The operations we have used in this tutorial are rather basic ones, mostly -rule-based, but there are many more available in medkit, including model-based -NER operations. You can learn about them in the [API reference](../api/text.md). +The operations used throughout this tutorial are rather basic ones, mostly rule-based, +but there are many more available in `medkit`, +including model-based NER operations. +You can learn more about them in the [API reference](../api/text.md). -That's a good first overview of what you can do with medkit! To dive in further, -you might take a look at an overview of the [various entity matching methods -available in medkit](entity_matching.md), [context -detection](context_detection.md), or [how to encapsulate all these operations in -a pipeline](pipeline.md). +To dive further into `medkit`, you might be interested in an overview +of the [various entity matching methods available in medkit](entity_matching.md), +[context detection](context_detection.md), +or [how to encapsulate all these operations in a pipeline](pipeline.md). diff --git a/docs/user_guide/install.md b/docs/user_guide/install.md index 9977035e..6b049ec8 100644 --- a/docs/user_guide/install.md +++ b/docs/user_guide/install.md @@ -1,81 +1,83 @@ # Installation -The medkit package supports a version of python >= 3.8. 
+## Supported Python versions -## Install an official version +`medkit` requires a distribution of Python with a minimum version of 3.8. -:::{important} -From 0.4.1 version, medkit package (named **medkit-lib**) is available on -[PyPi](https://pypi.org/project/medkit-lib/). +:::{note} +It is recommended to install `medkit` in a virtual or conda environment. ::: -Releases are published on . +## Install an official version -To install medkit : +Releases of `medkit` are published on [PyPI](https://pypi.org/project/medkit-lib/) +under the name **medkit-lib**. -``` -# Install medkit with required dependencies +To install `medkit` with basic functionalities: + +```console python -m pip install 'medkit-lib' +``` + +To install `medkit` with all functionalities: -# Install medkit with all extra dependencies +```console python -m pip install 'medkit-lib[all]' ``` -:::{note} -We recommend to install the medkit package in a virtual or conda environment. -::: +Using `conda`, `mamba` or `micromamba`: -Here is an example with conda: -``` -conda create -n medkit-tuto python=3.8 -conda activate medkit-tuto +```console +conda create -n medkit python=3.8 +conda activate medkit pip install 'medkit-lib[all]' ``` ## Install a development version -If you want to contribute, clone the `medkit` repository locally: - - SSH: `git clone git@github.com:medkit-lib/medkit.git` - - HTTPS: `git clone https://github.com/medkit-lib/medkit.git` +To start contributing, first clone the `medkit` [repository](https://github.com/medkit-lib/medkit.git) locally: -[Poetry](https://python-poetry.org) is used for managing dependencies and -packaging medkit. +Using Git: -```shell -cd medkit -poetry install +```console +git clone https://github.com/medkit-lib/medkit.git ``` -If you want to also install the extras dependencies, you may use: -```shell -poetry install --all-extras +or the GitHub CLI: + +```console +gh repo clone medkit-lib/medkit.git ``` -For documentation: -```shell -poetry install --with docs +This project uses [Hatch](https://hatch.pypa.io/) to manage its dependencies. +Please follow its [installation instructions](https://hatch.pypa.io/latest/install/). -make docs # for generating documentation -``` +The project can be deployed in a virtual environment and tested with: -Then, a `.venv` folder is created at the root of the project. To activate the -virtual environment: -```shell -source .venv/bin/activate +```console +hatch run test ``` -To make sure everything is set up properly, you may run the tests : +The corresponding documentation can be built with: -``` -# For unit/small tests -pytest -v tests/unit +```console +hatch run docs:build ``` -## Troubleshooting +Or served with interactive reloading with: -Sometimes, for documentation and/or testing, you may need some additional packages: +```console +hatch run docs:serve +``` + +Code linting and formatting can be applied with: +```console +hatch fmt ``` -sudo apt-get install -y gcc g++ libsndfile1 graphviz + +Additional checks may be run using [pre-commit](https://pre-commit.com/): + +```console +pre-commit run --all-files ``` -You may also refer to CI file (e.g., .gitlab-ci.yml) for up-to-date information. 
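+To check that the package can be imported after installation,
+a minimal sanity check can be run.
+The snippet below only relies on classes used elsewhere in this documentation:
+
+```python
+from medkit.core.text import TextDocument
+
+# create a small in-memory document to confirm the core API is available
+doc = TextDocument(text="medkit is installed.")
+print(doc.text)
+```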
diff --git a/docs/user_guide/module.md b/docs/user_guide/module.md index f39c29e3..89874214 100644 --- a/docs/user_guide/module.md +++ b/docs/user_guide/module.md @@ -1,17 +1,3 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Make your own module Here is a tutorial for developing your own operation module. diff --git a/docs/user_guide/pipeline.md b/docs/user_guide/pipeline.md index b7f9c74c..ef0d9b3a 100644 --- a/docs/user_guide/pipeline.md +++ b/docs/user_guide/pipeline.md @@ -1,28 +1,14 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- # Using pipelines This tutorial will show you how to encapsulate operations into a pipeline, -and how to create pipelines to augment documents. +and how to create pipelines to enrich documents. ## Using operations without a pipeline -Let's start by instantiating the preprocessing, segmentation, context detection -and entity recognition operations that we want to use. We are simply going to -reuse the ones from the [First steps](first_steps.md) tutorial: +Let's reuse the preprocessing, segmentation, context detection and entity recognition operations +from the [First steps](./first_steps.md) tutorial: -```{code-cell} ipython3 +:::{code-block} from medkit.text.preprocessing import RegexpReplacer from medkit.text.segmentation import SentenceTokenizer, SyntagmaTokenizer from medkit.text.context import NegationDetector, NegationDetectorRule @@ -33,12 +19,12 @@ rule = (r"(?<=\d)\.(?=\d)", ",") regexp_replacer = RegexpReplacer(output_label="clean_text", rules=[rule]) # segmentation -sent_tokenizer = SentenceTokenizer( +sentence_tokenizer = SentenceTokenizer( output_label="sentence", punct_chars=[".", "?", "!", "\n"], ) -synt_tokenizer = SyntagmaTokenizer( +syntagma_tokenizer = SyntagmaTokenizer( output_label="syntagma", separators=[r"\bmais\b", r"\bet\b"], ) @@ -49,7 +35,10 @@ neg_rules = [ NegationDetectorRule(regexp=r"\bsans\b", exclusion_regexps=[r"\bsans\s*doute\b"]), NegationDetectorRule(regexp=r"\bne\s*semble\s*pas"), ] -neg_detector = NegationDetector(output_label="is_negated", rules=neg_rules) +negation_detector = NegationDetector( + output_label="is_negated", + rules=neg_rules, +) # entity recognition regexp_rules = [ @@ -61,119 +50,104 @@ regexp_rules = [ RegexpMatcherRule(regexp=r"\bnasonex?\b", label="treatment", case_sensitive=False), ] regexp_matcher = RegexpMatcher(rules=regexp_rules, attrs_to_copy=["is_negated"]) -``` - -Each of these operations has a `run()` method, which we could call sequentially, -passing along the output from one operation as the input to the next operation, -and using a document's raw text segment as the initial input: +::: +Each of these operations features a `run()` method, which could be called sequentially. 
+Data need to be routed manually between inputs and outputs for each operation, +using a document's raw text segment as initial input: -```{code-cell} ipython3 +:::{code-block} from pathlib import Path from medkit.core.text import TextDocument # You can download the file available in source code -# !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt -# or create your file and copy the text +# !wget https://raw.githubusercontent.com/TeamHeka/medkit/main/docs/data/text/1.txt +# or create your file and copy the text. doc = TextDocument.from_file(Path("../data/text/1.txt")) # clean_segments contains only 1 segment: the preprocessed full text segment clean_segments = regexp_replacer.run([doc.raw_segment]) -sentences = sent_tokenizer.run(clean_segments) -syntagmas = synt_tokenizer.run(sentences) -# the negation detector doesn't return new annotations -# but rather appends attributes to the segments it received -neg_detector.run(syntagmas) +sentences = sentence_tokenizer.run(clean_segments) +syntagmas = syntagma_tokenizer.run(sentences) + +# Rhe negation detector does not return new annotations, +# but rather appends attributes to the segments it received. +negation_detector.run(syntagmas) entities = regexp_matcher.run(syntagmas) -``` +::: -But it is also possible to wrap all this operations into a `Pipeline` object, -that will be responsible of calling the `run()` method of each operation, with -the appropriate input annotations. +This way of coding is useful for interactive exploration of `medkit`. +In the next section, we will introduce a different way using `Pipeline` objects. ## Why use a pipeline? -What are the advantages of using pipelines instead of just directly calling each -operations as we just did? - -In this particular case, they aren't any real advantages. Because this is a -tutorial and we want to keep things simple, there aren't so many operations and -they are called in a linear fashion. But in real life the chaining of operations -could be more complex and then it could be easier to do that through a pipeline. - -Also, pipelines are composable (each pipeline is an operation that can itself be -put into another pipeline), therefore they can be used to structure complex -flows into smaller units handling a subpart of the processing. This also makes -it possible to reuse a pipeline for different projects, for instance by -regrouping common preprocessing steps. +The previous example features a linear sequence of operations, +which is simple enough to fit on a single page of code. +More advanced use cases may require composition of more operations, +with a more complex dependency graph and more parameters to handle. +Pipelines allows breaking an arbitrary workflow of operations +into functionally simpler and reusable units of computation. -If you are interested in [provenance tracing](provenance.md) (knowing how each -annotation was generated), then it can also be easier to handle that with a -pipeline. +If you are interested in [provenance tracing](./provenance.md) +(providing metadata regarding how each annotation was generated), +then it can also be easier to handle it with a pipeline. -Finally, in the future of medkit the scope of pipelines might be expanded to -handle more things such as batching, parallelization, and maybe training of -trainable components. +Planned extensions to `medkit` include support for batching +(applying a pipeline to multiple documents), parallelization, +and trainable components. 
## Constructing a pipeline -We now want to connect these 4 operations together in a pipeline. For this, we -will stack all the operations in a python list, in the order in which they must -be executed. But we also need to "connect" the operations together, ie. to -indicate which output of an operation should be fed as input to another -operation. This why we wrap the operations in {class}`~medkit.core.PipelineStep` -objects: +We now want to compose these 4 operations together in a pipeline. +For this, we will stack all the operations in a python list, +in the order in which they must be executed. +But we also need to "connect" the operations together, +i.e. to indicate which output of an operation should be fed as input to another operation. +This is the purpose of the {class}`~medkit.core.PipelineStep` objects: -```{code-cell} ipython3 +:::{code-block} from medkit.core import PipelineStep steps = [ PipelineStep(regexp_replacer, input_keys=["full_text"], output_keys=["clean_text"]), - PipelineStep(sent_tokenizer, input_keys=["clean_text"], output_keys=["sentences"]), - PipelineStep(synt_tokenizer, input_keys=["sentences"], output_keys=["syntagmas"]), - PipelineStep(neg_detector, input_keys=["syntagmas"], output_keys=[]), # no output + PipelineStep(sentence_tokenizer, input_keys=["clean_text"], output_keys=["sentences"]), + PipelineStep(syntagma_tokenizer, input_keys=["sentences"], output_keys=["syntagmas"]), + PipelineStep(negation_detector, input_keys=["syntagmas"], output_keys=[]), # no output PipelineStep(regexp_matcher, input_keys=["syntagmas"], output_keys=["entities"]), ] -``` +::: -Each `PipelineStep` associates an operation with “keys”. As we just said, the -operations have to be connected to each other, and the keys are just names we -put on these connections to make it easier to describe them. The steps we just -constructed can be represented like this: +Each `PipelineStep` associates an operation with input and output _keys_. +Pipeline steps with matching input and output keys will be connected to each other. +The resulting pipeline can be represented like this: -```{mermaid} -:align: center +:::{mermaid} +--- +align: center +--- graph TD - A((?)) + A((full_text)):::io B(regexp_replacer) - C(sent_tokenizer) - D(synt_tokenizer) - E(neg_detector) + C(sentence_tokenizer) + D(syntagma_tokenizer) + E(negation_detector) F(entity_matcher) - G((?)):::io + G((entities)):::io - A -- full_text --> B + A --> B B -- clean_text --> C C -- sentences --> D D -- syntagmas --> E E ~~~ F D -- syntagmas --> F - F -- entities --> G + F --> G classDef io fill:#fff4dd,stroke:#edb: -``` - -We see the negation detector has no output: this is because it modifies the -sentences in-place by adding attributes to them (its `run()` function doesn't -return anything). +::: -The 1st question mark, connected to the sentence tokenizer via the "full_text" key, represents the source of the segments that will be fed into the regexp replacer, still unknown at this point since they are not the product of a previous operation. +Pipeline steps can then be used to instantiate a {class}`~medkit.core.Pipeline` object: -The 2d question mark, connected to the entity matcher via the "entities" key, represents the destination of the entities produced by the matcher, also still unknown for now. 
- -We will now use our pipeline steps to create a `Pipeline` object: - -```{code-cell} ipython3 +:::{code-block} from medkit.core import Pipeline pipeline = Pipeline( @@ -188,35 +162,35 @@ pipeline = Pipeline( # (and therefore that it should be the output of the regexp matcher) output_keys=["entities"] ) -``` -Here our pipeline is the equivalent of some operation that would take full text -segments as input and return entities with family attributes. This pipeline only -has one input and one output, but creating more complex pipelines with multiple -input arguments and multiple return values is supported. +::: -Let's run our pipeline and make sure everything is ok: +The resulting pipeline is functionally equivalent to some operation +processing full text segments as input and returning entities with family attributes as output. +This example pipeline features a single input and a single output, +but more complex pipelines with multiple inputs and outputs are supported. -```{code-cell} ipython3 -# run() takes a full text segment and return entities with attributes +Like any other operation, the pipeline can be evaluated using its `run` method: + +:::{code-block} entities = pipeline.run([doc.raw_segment]) for entity in entities: neg_attr = entity.attrs.get(label="is_negated")[0] print(f"text='{entity.text}', label={entity.label}, is_negated={neg_attr.value}") -``` +::: -Seems good! +## Nested pipelines -## Nesting pipelines +Since a pipeline is a `medkit` operation, it can be used as a step for another pipeline. +Nesting pipelines is useful to group operations into functional sub-blocks, +which can be used, tested and exercised in isolation. -Because a pipeline is a medkit operation (it has a `run()` method that takes -input data and return new data), it can itself be used as the step of another -pipeline. We can use this to regroup together our regexp replacer, sentence -tokenizer and family detector into a context subpipeline: +In our example, we can use this feature to regroup together our regexp replacer, +sentence tokenizer and family detector into a context sub-pipeline: -```{code-cell} ipython3 +:::{code-block} # Context pipeline that receives full text segments -# and returns preprocessed syntagmas segments with negation attributes +# and returns preprocessed syntagmas segments with negation attributes. 
context_pipeline = Pipeline( # Optional name to indicate task performed by a pipeline # (will be used in provenance data) @@ -230,14 +204,13 @@ context_pipeline = Pipeline( input_keys=["full_text"], output_keys=["syntagmas"], ) -``` -Likewise, we can add an additional UMLS-based matching operation (see also -[Entity Matching](entity_matching.md)) and group it with our previous regexp -matcher into an NER subpipeline: +::: -```{code-cell} ipython3 -:tags: [skip-execution] +Likewise, we can introduce a NER sub-pipelines +composed of a UMLS-based matching operation (see also [Entity Matching](./entity_matching.md)) +grouped with the previously defined regexp matcher: +:::{code-block} from medkit.text.ner import UMLSMatcher umls_matcher = UMLSMatcher( @@ -247,8 +220,8 @@ umls_matcher = UMLSMatcher( attrs_to_copy=["is_negated"], ) -# NER pipeline that receives syntagmas segments and return entities -# matched by 2 different operations +# NER pipeline that receives syntagmas segments +# and return entities matched by 2 different operations ner_pipeline = Pipeline( name="ner", steps=[ @@ -258,17 +231,15 @@ ner_pipeline = Pipeline( input_keys=["syntagmas"], output_keys=["entities"], ) -``` - -Here, the 2 pipeline steps have the same output key so the pipeline's `run()` -method will return a list containing the entities matched by the regexp matcher -and the UMLS matcher. +::: -These 2 sub-pipelines can now be grouped into an main pipeline and connected together: +Since both pipeline steps feature the same output key (_entities_), +the pipeline will return a list containing the entities matched by +both the regexp matcher and the UMLS matcher. -```{code-cell} ipython3 -:tags: [skip-execution] +The NER and context sub-pipelines can now be sequenced with: +:::{code-block} pipeline = Pipeline( steps=[ PipelineStep(context_pipeline, input_keys=["full_text"], output_keys=["syntagmas"]), @@ -277,70 +248,55 @@ pipeline = Pipeline( input_keys=["full_text"], output_keys=["entities"], ) -``` +::: which can be represented like this: ```{mermaid} :align: center graph TD - subgraph " " - A((?)) B(regexp_replacer) - C(sent_tokenizer) - D(synt_tokenizer) - E(neg_detector) - F((?)):::io - - A -- full_text --> B - B -- clean_text --> C + C(sentence_tokenizer) + D(syntagma_tokenizer) + E(negation_detector) + B -- clean text --> C C -- sentences --> D D -- syntagmas --> E - E ~~~ F - D -- syntagmas --> F - end + A((full text)) + A --> B + G((syntagmas)) + E ~~~ G + D --> G subgraph " " - G((?)) H(regexp_matcher) I(umls_matcher) - J((?)):::io - - G -- syntagmas --> H - G -- syntagmas --> I - H -- entities --> J - I -- entities --> J - + G --> H + G --> I end - K((?)) - K -- full_text--> A - F -- syntagmas --> G - - L((?)) - J -- entities --> L + J((entities)):::io + H --> J + I --> J classDef io fill:#fff4dd,stroke:#edb: ``` -Let's run the pipeline and make sure we still get entities with negation -attributes: - -```{code-cell} ipython3 -:tags: [skip-execution] +Let's run the pipeline and verify entities with negation attributes: +:::{code-block} entities = pipeline.run([doc.raw_segment]) for entity in entities: neg_attr = entity.attrs.get(label="is_negated")[0] print(entity.label, ":", entity.text) print("negation:", neg_attr.value, end="\n\n") -``` +::: -``` +```text problem : allergies negation: False @@ -428,48 +384,42 @@ negation: False ## Using a document pipeline -The pipeline we have created can be seen as an "annotation-level" pipeline. 
It
-takes {class}`~medkit.core.text.Segment` objects as input and returns
-{class}`~medkit.core.text.Entity` objects (`Segment` and `Entity` both being
-subclasses of {class}`~medkit.core.text.TextAnnotation`).
+The pipeline we have created can be considered an "annotation-level" pipeline.
+It takes {class}`~medkit.core.text.Segment` objects as input
+and returns {class}`~medkit.core.text.Entity` objects
+(`Segment` and `Entity` both being subclasses of {class}`~medkit.core.text.TextAnnotation`).
+To scale the processing of such a pipeline to a collection of documents,
+one needs to iterate over each document manually to obtain its entities
+rather than processing all the documents at once:
 
-As mentionned in a [previous tutorial](entity_matching.md), when dealing with a
-collection of documents that we want to enrich with annotations, we need to
-iterate over each document to obtain its entities rather than processing all the
-documents at once:
-
-```{code-cell} ipython3
+:::{code-block}
 docs = TextDocument.from_dir(Path("../data/text"))
 
 for doc in docs:
     entities = pipeline.run([doc.raw_segment])
     for entity in entities:
         doc.anns.add(entity)
-```
+:::
+
+To handle this common use case, `medkit` provides a {class}`~medkit.core.DocPipeline` class,
+which wraps a `Pipeline` instance and runs it on a list of documents.
 
-To handle this common use case, medkit provides a
-{class}`~medkit.core.DocPipeline` class, that wraps a `Pipeline` instance and
-run it on each document that it receives. This is how we would use it:
+Here is an example of its usage:
 
-```{code-cell} ipython3
+:::{code-block}
 from medkit.core import DocPipeline
 
 docs = TextDocument.from_dir(Path("../data/text"))
 
 doc_pipeline = DocPipeline(pipeline=pipeline)
 doc_pipeline.run(docs)
-```
-
-## Wrapping it up
+:::
 
-In this tutorial, we have learnt how to instantiate a `Pipeline` and describe
-how operations are connected with each others through `PipelineStep` objects. We
-have also seen how sub-pipelines can be nested into other pipelines. Finally, we
-have seen how to transform an annotation-level `Pipeline` into a document-level
-`DocPipeline`.
+## Summary
 
-If you have more questions about pipelines or wonder how to build more complex
-flows, you may want to take a look at the [pipeline API
-docs](api:core:pipeline). If you are interested in the advantages of pipelines
-as regard provenance tracing, you may read the [provenance tracing tutorial](provenance.md).
+In this section, we have learnt how to instantiate a `Pipeline`
+and how to connect operations to each other through `PipelineStep` objects.
+We have also seen how sub-pipelines can be nested to compose larger pipelines.
+Finally, we have seen how to transform an annotation-level pipeline
+into a document-level pipeline with `DocPipeline`.
diff --git a/docs/user_guide/provenance.md b/docs/user_guide/provenance.md
index c90ad8bf..08c1a1e5 100644
--- a/docs/user_guide/provenance.md
+++ b/docs/user_guide/provenance.md
@@ -1,49 +1,31 @@
----
-jupytext:
-  formats: md:myst
-  text_representation:
-    extension: .md
-    format_name: myst
-    format_version: 0.13
-    jupytext_version: 1.14.4
-kernelspec:
-  display_name: Python 3 (ipykernel)
-  language: python
-  name: python3
----
-
 # Provenance tracing
 
-```{warning}
+:::{warning}
 Provenance tracing is still under development and may be changed in the future.
-```
+:::
 
-One of the main features of medkit is the tracing of provenance information.
-When used, medkit is able to tell how each annotation was created, that is to -say: -- the operation that generated it; -- the input data that was used by the operation to generate the annotation; +One of the distinctive features of `medkit` is the tracing of provenance information. +When enabled, `medkit` can record how each annotation was created, +i.e. the operation and associated input data used to generate it. -This is true for the whole processing chain, including intermediate steps and -annotations. +This is true for the whole processing pipeline, including intermediate steps and annotations. +Provenance information is stored through the duration of the processing +and can later be retrieved in [PROV-O](https://www.w3.org/TR/prov-o/) format. +This is particularly useful to build a chain of trust through the creation of an annotation. -The goal is to retain enough information to later output in the -[PROV-O](https://www.w3.org/TR/prov-o/) format. More practically, it can also be -useful to know how an annotation was generated in order to know if it is -trustworthy or not. - -This tutorial will teach you how to gather provenance information in medkit. -Before you read it, you should be familiar with the medkit components exposed in -the [first steps](first_steps.md) and [pipeline](pipeline.md) tutorials. +This tutorial will teach you how to gather provenance information with `medkit`. +The readers is assumed to be familiar with basic `medkit` components +introduced in the [first steps](first_steps.md) and [pipeline](pipeline.md) sections. ## A minimalistic provenance graph -Let's start with the simplest use case possible and take a look at provenance -for a single annotation, generated by a single operation. We are going to create -a very simple `TextDocument` containing just one sentence, and run a -`RegexpMatcher` on it that will match a single `Entity`: +Let's start with the simplest use case possible +and take a look at provenance for a single annotation, generated by a single operation. + +We are going to create a very simple `TextDocument` containing just one sentence, +and run a `RegexpMatcher` to match a single `Entity`: -```{code-cell} ipython3 +:::{code-block} from medkit.core.text import TextDocument from medkit.text.ner import RegexpMatcher, RegexpMatcherRule @@ -52,33 +34,32 @@ doc = TextDocument(text=text) regexp_rule = RegexpMatcherRule(regexp=r"\basthme\b", label="problem") regexp_matcher = RegexpMatcher(rules=[regexp_rule]) -``` +::: -Before we actually call the `run()` method of our regexp matcher, we will -activate the tracing of provenance for the entities it creates. This is done by -assigning it a {class}`~medkit.core.ProvTracer` object. The `ProvTracer` is in -charge of gathering all provenance info across all the operations. Operations -need to know it because they will inform it of each annotation they create. +Before calling the `run()` method of our regexp matcher, +we will activate provenance tracing for the generated entities. +This is done by assigning it a {class}`~medkit.core.ProvTracer` object. +The `ProvTracer` is in charge of gathering provenance information across all operations. 
-```{code-cell} ipython3 +:::{code-block} from medkit.core import ProvTracer prov_tracer = ProvTracer() regexp_matcher.set_prov_tracer(prov_tracer) -``` +::: -We may now run the regexp matcher which will, as expected, match one entity: +Now that provenance is enabled, the regexp matcher can be applied to the input document: -```{code-cell} ipython3 +:::{code-block} entities = regexp_matcher.run([doc.raw_segment]) for entity in entities: print(f"text={entity.text!r}, label={entity.label}") -``` +::: -Let's retrieve and inspect the provenance info concerning this entity: +Let's retrieve and inspect provenance information concerning the matched entity: -```{code-cell} ipython3 +:::{code-block} def print_prov(prov): # data item print(f"data_item={prov.data_item.text!r}") @@ -93,56 +74,51 @@ def print_prov(prov): entity = entities[0] prov = prov_tracer.get_prov(entity.uid) print_prov(prov) -``` - -The `get_prov()` method of `ProvTracer` returns a simple -{class}`~medkit.core.Prov` object containing all the provenance info related to -a specific object. It has the following attributes: - - `data_item` contains the object to which the provenance info refers. Here, it - is our entity. Note that it doesn't have to be an `Annotation` subclass. For - instance, it could also be an `Attribute`; - - `op_desc` holds an {class}`~medkit.core.OperationDescription` object, that - describes the operation that created the data item, in our case the regexp - matcher. The `OperationDescription` will contain the name of - the operation and the init parameters that were used; - - `source_data_items` contains the objects that were used by the operation to - create the new data item. Here there is only one source, the raw text - segment, because the entity was found in this particular segment by the - regexp matcher. But it is possible to have more than one data item in the - sources; - - reciprocally, `derived_data_items` contains the objects that were derived - from the data item by further operations. In this simple example, there are - none. - -If we are interested in all the provenance info gathered by our `ProvTracer` -instance rather than the provenance of a specific item, then we can call the -`get_provs()` method: - -```{code-cell} ipython3 +::: + +The `get_prov()` method of `ProvTracer` returns a simple {class}`~medkit.core.Prov` object +containing all the provenance information related to a specific object. +It features the following attributes: +- `data_item` contains the object to which the provenance info refers. Here, it + is our entity. Note that it doesn't have to be an `Annotation` subclass. + For instance, it could also be an `Attribute`; +- `op_desc` holds an {class}`~medkit.core.OperationDescription` object, + that describes the operation that created the data item (here, the regexp matcher). + The `OperationDescription` will contain the name of the operation and the init parameters that were used; +- `source_data_items` contains the objects that were used by the operation to create the new data item. + Here there is only one source, the raw text segment, + because the entity was found in this particular segment by the regexp matcher. + But it is possible to have more than one data item in the sources; + - `derived_data_items` contains the objects that were derived from the data item by further operations. + In this simple example, there are none. 
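To make these attributes more concrete, here is a small helper that walks back through `source_data_items` to display the chain of operations a data item was derived from. This is only a sketch written for illustration (the `print_ancestry` name is ours, not part of the `medkit` API), and it relies solely on the `Prov` attributes described above and on the `entity` and `prov_tracer` objects already in scope:

:::{code-block}
def print_ancestry(tracer, data_item, depth=0):
    prov = tracer.get_prov(data_item.uid)
    # input data items were not created by any operation, so they have no op_desc
    op_name = prov.op_desc.name if prov.op_desc is not None else "<input data>"
    print("  " * depth + f"{prov.data_item.text!r} (created by {op_name})")
    # recurse into the data items this one was derived from
    for source in prov.source_data_items:
        print_ancestry(tracer, source, depth + 1)

print_ancestry(prov_tracer, entity)
:::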
+ +If we are interested in all the provenance information gathered by the `ProvTracer` instance, +rather than the provenance of a specific item, +then we can call the `get_provs()` method: + +:::{code-block} for prov in prov_tracer.get_provs(): print_prov(prov) -``` - -We can see that we have another `Prov` object with partial provenance info about -the raw text segment: we know how it was used (the entity was derived from it) -but we don't know how it was created. This is expected, as the raw segment is a -data item that was fed at the input of our processing flow, it was not created -by any operation. - -Our provenance info has a graph structure, each `Prov` object representing a -node. For visualization, medkit provides a -{func}`~medkit.tools.save_prov_to_dot` helper function that generates -[graphviz](https://graphviz.org/)-compatible `.dot` files: - -```{note} -[graphviz](https://graphviz.org/) is a graph visualization tool that defines a -simple text-based format for describing graphs, the `.dot` file format, and that -provides a `dot` command-line executable to generate images from such files. You -will need to install graphviz on your system to be able to run the following -code. On an ubuntu system, `apt install graphviz` should do the trick. -``` - -```{code-cell} ipython3 +::: + +Here, we have another `Prov` object with partial provenance information about the raw text segment: +we know how it was used (the entity was derived from it) but we don't know how it was created. +This is expected, as the raw segment is a data item that was provided as input to our processing flow, +it was not created by any operation upstream. + +Provenance information can be represented as a graph structure, +with each `Prov` object representing a node. +For visualization purposes, `medkit` provides a {func}`~medkit.tools.save_prov_to_dot` helper function +that generates [graphviz](https://graphviz.org/)-compatible `.dot` files: + +:::{note} +[graphviz](https://graphviz.org/) is a graph visualization tool that defines a simple text-based format for describing graphs, +the `.dot` file format. +It also provides command-line executable named `dot` to generate images from such files. +You will need to install `graphviz` on your system to be able to run the following code. +::: + +:::{code-cell} --- mystnb: image: @@ -165,16 +141,15 @@ dot_file = output_dir / "prov.dot" save_prov_to_dot(prov_tracer, dot_file) display_dot(dot_file) -``` +::: ## Provenance composition -Let's move on to a slightly more complex example: before using the -`RegexpMatcher` matcher, we will split our document into sentences with a -`SentenceTokenizer`. We will also wrap our `SentenceTokenizer` and our -`RegexpMatcher` in a pipeline: +Let's move on to a slightly more complex example. +Before using the `RegexpMatcher` matcher, we will split our document into sentences with a `SentenceTokenizer`. +We will also compose the `SentenceTokenizer` and our `RegexpMatcher` operations in a `Pipeline`. -```{code-cell} ipython3 +:::{code-block} from medkit.text.segmentation import SentenceTokenizer from medkit.core.pipeline import PipelineStep, Pipeline @@ -188,20 +163,16 @@ steps = [ PipelineStep(regexp_matcher, input_keys=["sentences"], output_keys=["entities"]), ] pipeline = Pipeline(steps=steps, input_keys=["full_text"], output_keys=["entities"]) -``` +::: -A pipeline being itself an operation, it also has a `set_prov_tracer()` method, -and calling it will automatically enable provenance tracing for all the -operations in the pipeline. 
+A pipeline being itself an operation, it also features a `set_prov_tracer()` method, +and calling it will automatically enable provenance tracing for all the operations in the pipeline. -```{note} -In this tutorial, we always use a new `ProvTracer` instance for each example. -This is because the provenance tracer accumulates provenance information, but we -don't want to keep the provenance information from the previous examples, so we -create a new one. -``` +:::{important} +Provenance tracers can only accumulate provenance information, not modify or delete it. +::: -```{code-cell} ipython3 +:::{code-block} prov_tracer = ProvTracer() pipeline.set_prov_tracer(prov_tracer) @@ -209,47 +180,44 @@ entities = pipeline.run([doc.raw_segment]) for entity in entities: print(f"text={entity.text!r}, label={entity.label}") -``` +::: -As expected, the result is identical to the first example: we have matched one -entity. However the provenance is structured differently: +As expected, the result is identical to the first example: we have matched one entity. +However, its provenance is structured differently: -```{code-cell} ipython3 +:::{code-block} for prov in prov_tracer.get_provs(): print_prov(prov) -``` +::: -We can see that now, the operation that created the entity is not the -`RegexpMatcher` anymore, but the `Pipeline`. It might seem surprising but it -does make sense: the pipeline is a processing operation itself, it received as -input the raw segment, and used it to create an entity. The sentences are -considered internal intermediary results and are not listed. +Compared to the simpler case, the operation that created the entity is the `Pipeline`, instead of the `RegexpMatcher`. +It might sound a little surprising, but it does make sense: the pipeline is a processing operation itself, +it received the raw segment as input, and used it to create an entity. +The sentences are considered internal intermediary results and are not listed. -However, if we are interested in the details about what happened inside the -`Pipeline`, the information is still available through a sub-provenance tracer +If we are interested in the details about what happened inside the `Pipeline`, +the information is still available through a sub-provenance tracer that can be retrieved with `get_sub_prov_tracer()`: -```{code-cell} ipython3 +:::{code-block} pipeline_prov_tracer = prov_tracer.get_sub_prov_tracer(pipeline.uid) for prov in pipeline_prov_tracer.get_provs(): print_prov(prov) -``` - -Although the order of each `Prov` returned by `get_provs()` isn't the order of -creation of the annotations themselves, we can see the details of what happened -in the pipeline: 2 sentences were derived from the raw text by the -`SentenceTokenizer`, then one entity was derived from one of the sentences by -the `RegexpMatcher`. - -In other words, the provenance information held by the main `ProvTracer` is -composed: it is a graph, but some part of the graph have corresponding nested -sub-graphs, that can be expanded if desired. The `save_prov_to_dot()` helper is -able to leverage this structure. By default, it will expand and display all -sub-provenance info recursively, but it has a optional `max_sub_prov_depth` -parameter that allows to limit the depth of the sub-provenance to show: - -```{code-cell} ipython3 +::: + +Although the order of each `Prov` returned by `get_provs()` is not the order of creation of the annotations themselves, +we can see the details of what happened in the pipeline. 
+Two sentences were derived from the raw text by the `SentenceTokenizer`, +then one entity was derived from one of the sentences by the `RegexpMatcher`. + +In other words, the provenance information held by the main `ProvTracer` is composed. +It is a graph, but some part of the graph have corresponding nested sub-graphs, that can be expanded if desired. +The `save_prov_to_dot()` helper is able to leverage this structure. +By default, it will expand and display all sub-provenance info recursively, +but it has a optional `max_sub_prov_depth` parameter that allows to limit the depth of the sub-provenance to show: + +:::{code-block} --- mystnb: image: @@ -259,9 +227,9 @@ mystnb: # show only outer-most provenance save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=0) display_dot(dot_file) -``` +::: -```{code-cell} ipython3 +:::{code-block} --- mystnb: image: @@ -271,32 +239,30 @@ mystnb: # expand next level of sub-provenance save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=1) display_dot(dot_file) -``` +::: -The same way that pipeline can contain sub-pipelines recursively, the provenance -tracer can contain sub-provenance tracers recursively for the corresponding -sub-pipelines. +The same way that pipeline can contain sub-pipelines recursively, +the provenance tracer can contain sub-provenance tracers recursively for the corresponding sub-pipelines. -Composed provenance makes it possible to preserve exhaustive provenance -information about our data but to chose the appropriate level of detail when -inspecting it. The structure of the provenance will reflect the structure of the -processing flow: if it is built in a composed way, with pipelines containing -sub-pipelines dealing with specific sub-tasks, then the provenance information -will be composed the same way. +Composed provenance makes it possible to preserve exhaustive provenance information about our data, +and chose the appropriate level of detail when inspecting it. +The provenance structure will reflect the structure of the processing flow. +If built in a composed way, with pipelines containing sub-pipelines dealing with specific sub-tasks, +then provenance information will be composed the same way. 
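The composed structure can also be walked programmatically rather than rendered with graphviz. The snippet below is only a sketch (the `print_composed_provenance` helper is ours, it reuses the `print_prov()` function defined earlier, and it assumes the nested pipeline objects are known to the caller); it roughly mirrors what `save_prov_to_dot()` does with `max_sub_prov_depth=1`:

:::{code-block}
def print_composed_provenance(tracer, nested_pipelines=()):
    # outermost level of provenance
    for prov in tracer.get_provs():
        print_prov(prov)
    # expand one level of sub-provenance for each known nested pipeline
    for nested_pipeline in nested_pipelines:
        print(f"--- inside pipeline {nested_pipeline.uid} ---")
        for prov in tracer.get_sub_prov_tracer(nested_pipeline.uid).get_provs():
            print_prov(prov)

print_composed_provenance(prov_tracer, nested_pipelines=[pipeline])
:::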
## A more complete provenance example -To demonstrate a bit more the potential of provenance tracing in medkit, let's -build a more complicated pipeline involving a sub-pipeline and an operation that -creates attributes: +To demonstrate a bit more the potential of provenance tracing in `medkit`, +let's build a more complicated pipeline involving a sub-pipeline +and an operation that creates attributes: -```{code-cell} ipython3 +:::{code-block} from medkit.text.context import NegationDetector, NegationDetectorRule # segmentation -sent_tokenizer = SentenceTokenizer(output_label="sentence") +sentence_tokenizer = SentenceTokenizer(output_label="sentence") # negation detection -neg_detector = NegationDetector(output_label="is_negated") +negation_detector = NegationDetector(output_label="is_negated") # entity recognition regexp_rules = [ RegexpMatcherRule(regexp=r"\basthme\b", label="problem"), @@ -306,8 +272,8 @@ regexp_matcher = RegexpMatcher(rules=regexp_rules, attrs_to_copy=["is_negated"]) # context sub pipeline handling segmentation and negation detection sub_pipeline_steps = [ - PipelineStep(sent_tokenizer, input_keys=["full_text"], output_keys=["sentences"]), - PipelineStep(neg_detector, input_keys=["sentences"], output_keys=[]), # no output + PipelineStep(sentence_tokenizer, input_keys=["full_text"], output_keys=["sentences"]), + PipelineStep(negation_detector, input_keys=["sentences"], output_keys=[]), # no output ] sub_pipeline = Pipeline( sub_pipeline_steps, @@ -327,15 +293,14 @@ pipeline = Pipeline( input_keys=["full_text"], output_keys=["entities"], ) -``` +::: -Note that since we have 2 pipelines, we pass an optional `name` parameter to -each of them that will be used in the operation description and will help us to -distinguish them. +Since there are 2 pipelines, we need to pass an optional `name` parameter to each of them +that will be used in the operation description and will help us to distinguish between them. -Running the pipeline gives us 2 entities with negation attributes: +Running the main pipeline returns 2 entities with negation attributes: -```{code-cell} ipython3 +:::{code-block} prov_tracer = ProvTracer() pipeline.set_prov_tracer(prov_tracer) entities = pipeline.run([doc.raw_segment]) @@ -343,13 +308,12 @@ entities = pipeline.run([doc.raw_segment]) for entity in entities: is_negated = entity.attrs.get(label="is_negated")[0].value print(f"text={entity.text!r}, label={entity.label}, is_negated={is_negated}") -``` +::: -At the outer-most level, the provenance tells us that the main pipeline created -2 entities and 2 attributes. Intermediary data items (sentences) and operations -(`SentenceTokenizer`, `NegationDetector`, `RegexpMatcher`) are hidden . +At the outermost level, provenance tells us that the main pipeline created 2 entities and 2 attributes. +Intermediary data and operations (`SentenceTokenizer`, `NegationDetector`, `RegexpMatcher`) are hidden. -```{code-cell} ipython3 +:::{code-block} --- mystnb: image: @@ -358,17 +322,17 @@ mystnb: --- save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=0) display_dot(dot_file) -``` +::: You can see dotted arrow showing which attribute relates to which annotation. -While this is not strictly speaking provenance information, it is displayed -nonetheless to avoid any confusion, especially in the case where attributes -created by one operation are afterwards copied to new annotations (cf -`attrs_to_copy` as explained in the [First steps tutorial](first_steps.html#detecting-negation)). 
+While this is not strictly speaking provenance information, +it is displayed nonetheless to avoid any confusion, +especially in the case where attributes created by one operation +are copied to new annotations (cf `attrs_to_copy` as explained in the [First steps tutorial](first_steps.html#detecting-negation)) afterwards. -Expanding one more level of sub-provenance gives us the following graph: +Expanding one more level of provenance gives us the following graph: -```{code-cell} ipython3 +:::{code-block} --- mystnb: image: @@ -376,19 +340,16 @@ mystnb: --- save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=1) display_dot(dot_file) -``` - -We now see the details of the operations and data items handled in our main -pipeline: a sub-pipeline created sentence segments and negation -attributes, then the `RegexpMatcher` created entities, using the sentences -segments. The negation attributes were attached to both the sentences and the -entities derived from the sentences. +::: +Now, We can see the details of the operations and data items handled in our main pipeline. +A sub-pipeline created sentence segments and negation attributes, +then the `RegexpMatcher` created entities, using the sentences segments. +The negation attributes were attached to both the sentences and the entities derived from the sentences. -To have more details about the processing inside the context sub-pipeline, we -have to go one step deeper: +To have more details about the processing inside the context sub-pipeline, we have to go one level deeper: -```{code-cell} ipython3 +:::{code-block} --- mystnb: image: @@ -396,21 +357,20 @@ mystnb: --- save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=2) display_dot(dot_file) -``` +::: ## Wrapping it up -In this tutorial, we have seen how we can use `ProvTracer` to keep information -about how annotations and attributes were generated, ie. which operation created -them, using which data as input. +In this tutorial, we have seen how we can use `ProvTracer` +to keep information about how annotations and attributes were generated, +i.e. which operation created them using which data as input. -We have also seen how, when using pipelines and sub-pipelines, the provenance -information in a `ProvTracer` will be composed, the same way that our processing -graph is. This allows us to later display the level of details that we want to -see when inspecting provenance. +Furthermore, we have seen how, when using pipelines and sub-pipelines, +provenance information generated by a `ProvTracer` will be composed, +the same way that our processing graph is. +This allows us to later display the level of details that we want to see when inspecting provenance. Finally, we have seen how the `save_prov_to_dot()` helper function can be used -to quickly visualize the captured provenance information. For more advanced -provenance usage, you may want to look at the [provenance API -docs](api:core:provenance). The source code of `save_prov_to_dot()` can also -serve as a reference on how to use it. +to quickly visualize the captured provenance information. +For more advanced provenance usage, you may want to look at the [provenance API docs](api:core:provenance). +The source code of `save_prov_to_dot()` can also serve as a reference on how to use it. 
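As a final illustration, and purely as a sketch (`medkit` does not provide such a function, and only the outermost provenance level is covered here), the gathered provenance can be flattened into plain dictionaries, for instance for custom serialization, using only the `Prov` attributes introduced in this tutorial:

:::{code-block}
def provs_to_dicts(tracer):
    records = []
    for prov in tracer.get_provs():
        records.append({
            "data_item": prov.data_item.uid,
            # None for input data items that were not created by any operation
            "operation": prov.op_desc.name if prov.op_desc is not None else None,
            "sources": [item.uid for item in prov.source_data_items],
            "derived": [item.uid for item in prov.derived_data_items],
        })
    return records

records = provs_to_dicts(prov_tracer)
:::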
diff --git a/pyproject.toml b/pyproject.toml index 255d56af..575f33a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,13 +160,12 @@ all = [ webrtc-voice-detector]""", ] docs = [ - "jupyter-book==1.0.*", - "matplotlib", - "pandas>=1.4", - "spacy>=3.4", + "myst-parser", + "numpydoc", + "sphinx>=7,<8", + "sphinx-autoapi", "sphinx-autobuild", - "sphinx-book-theme>=1.1.2", - "sphinx-toolbox", + "sphinx-book-theme", "sphinxcontrib-mermaid", ] @@ -205,19 +204,19 @@ cov = [ ] [tool.hatch.envs.docs] -features = [ - "docs", - "metrics-ner", - "nlstruct", - "srt-io-converter", - "webrtc-voice-detector", -] -python = "3.10" +dependencies = [ + "myst-parser", + "numpydoc", + "sphinx>=7,<8", + "sphinx-autoapi", + "sphinx-autobuild", + "sphinx-book-theme", + "sphinxcontrib-mermaid", +] [tool.hatch.envs.docs.scripts] -config = "jupyter-book config sphinx docs/" -build = "sphinx-build docs/ {args:docs/_build/html}" -serve = "sphinx-autobuild docs/ {args:docs/_build/html}" +build = "sphinx-build docs/ docs/_build/html {args}" +serve = "sphinx-autobuild docs/ docs/_build/html {args}" [tool.coverage.run] source_pkgs = ["medkit", "tests"] From 192c16376dd5601d141d660e025e0da7765d2665 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Tue, 27 Feb 2024 14:08:26 +0100 Subject: [PATCH 02/21] DOC: Fix directive and xref errors --- docs/conf.py | 1 + docs/tutorial/context_detection.md | 6 +-- docs/tutorial/entity_matching.md | 6 +-- docs/user_guide/first_steps.md | 42 +++++++++--------- docs/user_guide/module.md | 18 ++++---- docs/user_guide/pipeline.md | 24 +++++----- docs/user_guide/provenance.md | 71 ++++++++---------------------- 7 files changed, 68 insertions(+), 100 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index a954775b..55923735 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,6 +32,7 @@ # https://myst-parser.readthedocs.io/en/latest/configuration.html myst_enable_extensions = ["attrs_inline", "colon_fence"] +myst_heading_anchors = 2 # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output diff --git a/docs/tutorial/context_detection.md b/docs/tutorial/context_detection.md index f638d176..562c9072 100644 --- a/docs/tutorial/context_detection.md +++ b/docs/tutorial/context_detection.md @@ -22,7 +22,7 @@ contextual information to entities such has: - it is related to the patient or it is part of their family's medical history NB: If you are not familiar with medkit, you should probably take a look at the -[First steps](first_steps.md) tutorial before going further. +[First steps](../user_guide/first_steps.md) tutorial before going further. Let's start by loading a document: @@ -74,7 +74,7 @@ for section_seg in section_segs: ## Sentence splitting -We have already seen sentence splitting [previously](first_steps.md) and we will +We have already seen sentence splitting [previously](../user_guide/first_steps.md) and we will reuse the same code, with a little addition: we want the section information to be propagated onto the sentences, ie. we want to be able to tell in which section a sentence belongs. @@ -113,7 +113,7 @@ this is not always the case. To handle this, medkit provides a {class}`~medkit.text.context.FamilyDetector` operation based on regular expressions. 
It is somewhat similar to the {class}`~medkit.text.ner.RegexpMatcher` we have -[previously](entity_matcher.html#regular-expression-matching) seen, but instead +[previously](./entity_matching.md#regular-expression-matching) seen, but instead of returning entities, it attaches attributes to the segments it receives, with a boolean value indicating whether it mentions family history. diff --git a/docs/tutorial/entity_matching.md b/docs/tutorial/entity_matching.md index f39335f8..27962d49 100644 --- a/docs/tutorial/entity_matching.md +++ b/docs/tutorial/entity_matching.md @@ -18,7 +18,7 @@ This tutorial will take you on a tour of the most common methods to perform entity matching on text documents using medkit. NB: If you are new to medkit, you should probably take a look at the [First -steps](first_steps.md) tutorial before going further. +steps](../user_guide/first_steps.md) tutorial before going further. ## Sentence splitting @@ -38,7 +38,7 @@ print(doc.text) ``` We will now use medkit's sentence tokenizing operation to create and display -sentence segments. As seen [before](first_steps.md), the sentence tokenizer +sentence segments. As seen [before](../user_guide/first_steps.md), the sentence tokenizer expects a list of segments as input and will return a list of sentence segments, and since we don't have any segments yet on our document, we use `TextDocument.raw_segment`, which is a special segment that contains the full @@ -558,7 +558,7 @@ characters pans of the entities in the original unprocessed text. If you use different methods or 3d-party tools, it is possible to wrap them into a medkit operation so you can use them within medkit, as described in [this -tutorial](module.md). Contributions to medkit are welcome so you can +tutorial](../user_guide/module.md). Contributions to medkit are welcome so you can submit your operations to be integrated into medkit! diff --git a/docs/user_guide/first_steps.md b/docs/user_guide/first_steps.md index 21b6aeaf..f29d6057 100644 --- a/docs/user_guide/first_steps.md +++ b/docs/user_guide/first_steps.md @@ -8,7 +8,7 @@ and context detection operations. For starters, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: -:::{code-block} +:::{code} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt @@ -20,7 +20,7 @@ doc = TextDocument.from_file(Path("../data/text/1.txt")) The full raw text can be accessed through the `text` attribute: -:::{code-block} +:::{code} print(doc.text) ::: @@ -36,7 +36,7 @@ documents in sentences. including a rule-based {class}`~medkit.text.segmentation.SentenceTokenizer` class that relies on a list of punctuation characters. -:::{code-block} +:::{code} from medkit.text.segmentation import SentenceTokenizer sent_tokenizer = SentenceTokenizer( @@ -54,7 +54,7 @@ and returns a list of `Segment` objects. 
Here, we can pass a special `Segment` containing the full text of the document, which can be retrieved through the `raw_segment` attribute of `TextDocument`: -:::{code-block} +:::{code} sentences = sent_tokenizer.run([doc.raw_segment]) for sentence in sentences: @@ -76,7 +76,7 @@ Each segment features: If you take a look at the 13th and 14th detected sentences, you will notice something strange: -:::{code-block} +:::{code} print(repr(sentences[12].text)) print(repr(sentences[13].text)) ::: @@ -92,7 +92,7 @@ For this, we can use the {class}`~medkit.text.preprocessing.RegexpReplacer` clas a regexp-based "search-and-replace" operation. As other `medkit` operations, it can be configured with a set of user-determined rules: -:::{code-block} +:::{code} from medkit.text.preprocessing import RegexpReplacer rule = (r"(?<=\d)\.(?=\d)", ",") # => (pattern to replace, new text) @@ -105,7 +105,7 @@ In our case we only want to preprocess the full raw text segment, and we will only receive one preprocessed segment, so we can call it with: -:::{code-block} +:::{code} clean_segment = regexp_replacer.run([doc.raw_segment])[0] print(clean_segment.text) ::: @@ -113,7 +113,7 @@ print(clean_segment.text) We may use again our previously-defined sentence tokenizer again, but this time on the preprocessed text: -:::{code-block} +:::{code} sentences = sent_tokenizer.run([clean_segment]) print(sentences[12].text) ::: @@ -126,7 +126,7 @@ The `medkit` library also comes with operations to perform NER (named entity rec for instance with {class}`~medkit.text.ner.regexp_matcher.RegexpMatcher`. Let's instantiate one with a few simple rules: -:::{code-block} +:::{code} from medkit.text.ner import RegexpMatcher, RegexpMatcherRule regexp_rules = [ @@ -162,7 +162,7 @@ representing the entities that were matched (`Entity` is a subclass of `Segment` As input, it expects a list of `Segment` objects. Let's give it the sentences returned by the sentence tokenizer: -:::{code-block} +:::{code} entities = regexp_matcher.run(sentences) for entity in entities: @@ -192,7 +192,7 @@ accessible through their {class}`~medkit.core.AttributeContainer`). Let's instantiate a `NegationDetector` with a couple of simplistic handcrafted rules and run it on our sentences: -:::{code-block} +:::{code} from medkit.text.context import NegationDetector, NegationDetectorRule neg_rules = [ @@ -213,7 +213,7 @@ located in the `medkit.text.context` module. And now, let's look at which sentence have been detected as being negated: -:::{code-block} +:::{code} for sentence in sentences: neg_attr = sentence.attrs.get(label="is_negated")[0] if neg_attr.value: @@ -235,7 +235,7 @@ which are stored in file `default_syntagma_definition.yml` located in the `medkit.text.segmentation` module. ::: -:::{code-block} +:::{code} from medkit.text.segmentation import SyntagmaTokenizer synt_tokenizer = SyntagmaTokenizer( @@ -268,7 +268,7 @@ Let's again use a `RegexpMatcher` to find some entities, but this time from syntagmas rather than from sentences, and using `attrs_to_copy` to copy negation attributes: -:::{code-block} +:::{code} regexp_matcher = RegexpMatcher(rules=regexp_rules, attrs_to_copy=["is_negated"]) entities = regexp_matcher.run(syntagmas) @@ -293,7 +293,7 @@ an instance of {class}`~medkit.core.text.TextAnnotationContainer`) that behaves roughly like a list but also offers additional filtering methods. 
Annotations can be added by calling its `add()` method: -:::{code-block} +:::{code} for entity in entities: doc.anns.add(entity) ::: @@ -303,7 +303,7 @@ such as brat (see {class}`~medkit.io.brat.BratOutputConverter`) or Doccano (see {class}`~medkit.io.doccano.DoccanoOutputConverter`), or serialized to JSON (see {mod}`~medkit.io.medkit_json`): -:::{code-block} +:::{code} from medkit.io import medkit_json medkit_json.save_text_document(doc, "doc_1.json") @@ -316,9 +316,7 @@ a visualization tool part of the [spaCy](https://spacy.io/) NLP library. `medkit` provides helper functions to facilitate the use of `displacy` in the {mod}`~medkit.text.spacy.displacy_utils` module: -:::{code-block} -:tags: [scroll-output] - +:::{code} from spacy import displacy from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy @@ -341,6 +339,6 @@ including model-based NER operations. You can learn more about them in the [API reference](../api/text.md). To dive further into `medkit`, you might be interested in an overview -of the [various entity matching methods available in medkit](entity_matching.md), -[context detection](context_detection.md), -or [how to encapsulate all these operations in a pipeline](pipeline.md). +of the [various entity matching methods available in medkit](../tutorial/entity_matching.md), +[context detection](../tutorial/context_detection.md), +or [how to encapsulate all these operations in a pipeline](./pipeline.md). diff --git a/docs/user_guide/module.md b/docs/user_guide/module.md index 89874214..1b364623 100644 --- a/docs/user_guide/module.md +++ b/docs/user_guide/module.md @@ -12,7 +12,8 @@ You can find several examples of implemented modules in [medkit.text](../api/tex For all operations inheriting from `Operation` abstract class, these 4 lines shall be added in `__init__` method: -``` + +```python def __init__(self, ..., uid=None): ... # Pass all arguments to super (remove self) @@ -22,7 +23,8 @@ def __init__(self, ..., uid=None): ``` Here is an example of a custom segmentation module: -``` + +```python class MyTokenizer(SegmentationOperation): def __init__( @@ -53,7 +55,7 @@ Here is an example of an implementation of our tokenizer. It uses a private method which processes each segment to return a list of tokens for this segment. -``` +```python class MyTokenizer(SegmentationOperation): ... def run(self, segments: List[Segment]) -> List[Segment]: @@ -80,7 +82,7 @@ An example of the functions' usage is available [here](../examples/spans). Here is an example of our tokenizer which role is to cut the segment in two segments. -``` +```python class MyTokenizer(SegmentationOperation): ... def _mytokenmethod(self, segment): @@ -135,7 +137,7 @@ Here is our example which store information about: * which operation produces it (i.e., MyTokenizer) * the source item which has been processed -``` +```python class MyTokenizer(SegmentationOperation): ... def _mytokenmethod(self, segment): @@ -164,7 +166,7 @@ To illustrate what we have seen in a more concrete manner, here is a fictional "days of the week" matcher that takes text segments as input a return entities for week days: -```{code-cell} ipython3 +:::{code} import re from medkit.core import Operation from medkit.core.text import Entity, span_utils @@ -220,7 +222,7 @@ class DayMatcher(Operation): ) return entities -``` +::: Note than since this is a entity matcher, adding support for `attrs_to_copy` -would be nice (cf [Context detection](context_detection.md)). 
+would be nice (cf [Context detection](../tutorial/context_detection.md)). diff --git a/docs/user_guide/pipeline.md b/docs/user_guide/pipeline.md index ef0d9b3a..06956575 100644 --- a/docs/user_guide/pipeline.md +++ b/docs/user_guide/pipeline.md @@ -8,7 +8,7 @@ and how to create pipelines to enrich documents. Let's reuse the preprocessing, segmentation, context detection and entity recognition operations from the [First steps](./first_steps.md) tutorial: -:::{code-block} +:::{code} from medkit.text.preprocessing import RegexpReplacer from medkit.text.segmentation import SentenceTokenizer, SyntagmaTokenizer from medkit.text.context import NegationDetector, NegationDetectorRule @@ -56,7 +56,7 @@ Each of these operations features a `run()` method, which could be called sequen Data need to be routed manually between inputs and outputs for each operation, using a document's raw text segment as initial input: -:::{code-block} +:::{code} from pathlib import Path from medkit.core.text import TextDocument @@ -105,7 +105,7 @@ But we also need to "connect" the operations together, i.e. to indicate which output of an operation should be fed as input to another operation. This is the purpose of the {class}`~medkit.core.PipelineStep` objects: -:::{code-block} +:::{code} from medkit.core import PipelineStep steps = [ @@ -147,7 +147,7 @@ graph TD Pipeline steps can then be used to instantiate a {class}`~medkit.core.Pipeline` object: -:::{code-block} +:::{code} from medkit.core import Pipeline pipeline = Pipeline( @@ -171,7 +171,7 @@ but more complex pipelines with multiple inputs and outputs are supported. Like any other operation, the pipeline can be evaluated using its `run` method: -:::{code-block} +:::{code} entities = pipeline.run([doc.raw_segment]) for entity in entities: @@ -188,7 +188,7 @@ which can be used, tested and exercised in isolation. In our example, we can use this feature to regroup together our regexp replacer, sentence tokenizer and family detector into a context sub-pipeline: -:::{code-block} +:::{code} # Context pipeline that receives full text segments # and returns preprocessed syntagmas segments with negation attributes. context_pipeline = Pipeline( @@ -207,10 +207,10 @@ context_pipeline = Pipeline( ::: Likewise, we can introduce a NER sub-pipelines -composed of a UMLS-based matching operation (see also [Entity Matching](./entity_matching.md)) +composed of a UMLS-based matching operation (see also [Entity Matching](../tutorial/entity_matching.md)) grouped with the previously defined regexp matcher: -:::{code-block} +:::{code} from medkit.text.ner import UMLSMatcher umls_matcher = UMLSMatcher( @@ -239,7 +239,7 @@ both the regexp matcher and the UMLS matcher. The NER and context sub-pipelines can now be sequenced with: -:::{code-block} +:::{code} pipeline = Pipeline( steps=[ PipelineStep(context_pipeline, input_keys=["full_text"], output_keys=["syntagmas"]), @@ -287,7 +287,7 @@ graph TD Let's run the pipeline and verify entities with negation attributes: -:::{code-block} +:::{code} entities = pipeline.run([doc.raw_segment]) for entity in entities: @@ -393,7 +393,7 @@ To scale the processing of such pipeline to a collection of documents, one needs to iterate over each document manually to obtain its entities rather than processing all the documents at once: -:::{code-block} +:::{code} docs = TextDocument.from_dir(Path("..data/text")) for doc in docs: @@ -407,7 +407,7 @@ which wraps a `Pipeline` instance and run it on a list of documents. 
Here is an example of its usage: -:::{code-block} +:::{code} from medkit.core import DocPipeline docs = TextDocument.from_dir(Path("..data/text")) diff --git a/docs/user_guide/provenance.md b/docs/user_guide/provenance.md index 08c1a1e5..226eb6d2 100644 --- a/docs/user_guide/provenance.md +++ b/docs/user_guide/provenance.md @@ -25,7 +25,7 @@ and take a look at provenance for a single annotation, generated by a single ope We are going to create a very simple `TextDocument` containing just one sentence, and run a `RegexpMatcher` to match a single `Entity`: -:::{code-block} +:::{code} from medkit.core.text import TextDocument from medkit.text.ner import RegexpMatcher, RegexpMatcherRule @@ -41,7 +41,7 @@ we will activate provenance tracing for the generated entities. This is done by assigning it a {class}`~medkit.core.ProvTracer` object. The `ProvTracer` is in charge of gathering provenance information across all operations. -:::{code-block} +:::{code} from medkit.core import ProvTracer prov_tracer = ProvTracer() @@ -50,7 +50,7 @@ regexp_matcher.set_prov_tracer(prov_tracer) Now that provenance is enabled, the regexp matcher can be applied to the input document: -:::{code-block} +:::{code} entities = regexp_matcher.run([doc.raw_segment]) for entity in entities: @@ -59,7 +59,7 @@ for entity in entities: Let's retrieve and inspect provenance information concerning the matched entity: -:::{code-block} +:::{code} def print_prov(prov): # data item print(f"data_item={prov.data_item.text!r}") @@ -96,7 +96,7 @@ If we are interested in all the provenance information gathered by the `ProvTrac rather than the provenance of a specific item, then we can call the `get_provs()` method: -:::{code-block} +:::{code} for prov in prov_tracer.get_provs(): print_prov(prov) ::: @@ -118,13 +118,7 @@ It also provides command-line executable named `dot` to generate images from suc You will need to install `graphviz` on your system to be able to run the following code. ::: -:::{code-cell} ---- -mystnb: - image: - align: center - scale: 75% ---- +:::{code} from pathlib import Path import subprocess from IPython.display import Image @@ -149,7 +143,7 @@ Let's move on to a slightly more complex example. Before using the `RegexpMatcher` matcher, we will split our document into sentences with a `SentenceTokenizer`. We will also compose the `SentenceTokenizer` and our `RegexpMatcher` operations in a `Pipeline`. -:::{code-block} +:::{code} from medkit.text.segmentation import SentenceTokenizer from medkit.core.pipeline import PipelineStep, Pipeline @@ -172,7 +166,7 @@ and calling it will automatically enable provenance tracing for all the operatio Provenance tracers can only accumulate provenance information, not modify or delete it. ::: -:::{code-block} +:::{code} prov_tracer = ProvTracer() pipeline.set_prov_tracer(prov_tracer) @@ -185,7 +179,7 @@ for entity in entities: As expected, the result is identical to the first example: we have matched one entity. 
However, its provenance is structured differently: -:::{code-block} +:::{code} for prov in prov_tracer.get_provs(): print_prov(prov) ::: @@ -199,7 +193,7 @@ If we are interested in the details about what happened inside the `Pipeline`, the information is still available through a sub-provenance tracer that can be retrieved with `get_sub_prov_tracer()`: -:::{code-block} +:::{code} pipeline_prov_tracer = prov_tracer.get_sub_prov_tracer(pipeline.uid) for prov in pipeline_prov_tracer.get_provs(): @@ -217,25 +211,13 @@ The `save_prov_to_dot()` helper is able to leverage this structure. By default, it will expand and display all sub-provenance info recursively, but it has a optional `max_sub_prov_depth` parameter that allows to limit the depth of the sub-provenance to show: -:::{code-block} ---- -mystnb: - image: - align: center - scale: 75% ---- +:::{code} # show only outer-most provenance save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=0) display_dot(dot_file) ::: -:::{code-block} ---- -mystnb: - image: - align: center - scale: 85% ---- +:::{code} # expand next level of sub-provenance save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=1) display_dot(dot_file) @@ -256,7 +238,7 @@ To demonstrate a bit more the potential of provenance tracing in `medkit`, let's build a more complicated pipeline involving a sub-pipeline and an operation that creates attributes: -:::{code-block} +:::{code} from medkit.text.context import NegationDetector, NegationDetectorRule # segmentation @@ -300,7 +282,7 @@ that will be used in the operation description and will help us to distinguish b Running the main pipeline returns 2 entities with negation attributes: -:::{code-block} +:::{code} prov_tracer = ProvTracer() pipeline.set_prov_tracer(prov_tracer) entities = pipeline.run([doc.raw_segment]) @@ -313,13 +295,7 @@ for entity in entities: At the outermost level, provenance tells us that the main pipeline created 2 entities and 2 attributes. Intermediary data and operations (`SentenceTokenizer`, `NegationDetector`, `RegexpMatcher`) are hidden. -:::{code-block} ---- -mystnb: - image: - align: center - scale: 85% ---- +:::{code} save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=0) display_dot(dot_file) ::: @@ -328,16 +304,12 @@ You can see dotted arrow showing which attribute relates to which annotation. While this is not strictly speaking provenance information, it is displayed nonetheless to avoid any confusion, especially in the case where attributes created by one operation -are copied to new annotations (cf `attrs_to_copy` as explained in the [First steps tutorial](first_steps.html#detecting-negation)) afterwards. +are copied to new annotations (cf `attrs_to_copy` as explained in the +[First steps tutorial](./first_steps.md#detecting-negation)) afterwards. 
Expanding one more level of provenance gives us the following graph: -:::{code-block} ---- -mystnb: - image: - align: center ---- +:::{code} save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=1) display_dot(dot_file) ::: @@ -349,12 +321,7 @@ The negation attributes were attached to both the sentences and the entities der To have more details about the processing inside the context sub-pipeline, we have to go one level deeper: -:::{code-block} ---- -mystnb: - image: - align: center ---- +:::{code} save_prov_to_dot(prov_tracer, dot_file, max_sub_prov_depth=2) display_dot(dot_file) ::: From 55f7955bacdd611d81046bffc22b7f53534bb8ac Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Tue, 27 Feb 2024 14:18:27 +0100 Subject: [PATCH 03/21] MAINT: Add command to clean the generated docs --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 575f33a2..d78876fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -215,6 +215,7 @@ dependencies = [ ] [tool.hatch.envs.docs.scripts] +clean = "rm -rf docs/_build" build = "sphinx-build docs/ docs/_build/html {args}" serve = "sphinx-autobuild docs/ docs/_build/html {args}" From 051853a95d38af521bb8f4966f5b461ae724d3b4 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Tue, 27 Feb 2024 14:35:31 +0100 Subject: [PATCH 04/21] DOC: Fix warnings in tutorial --- docs/tutorial/context_detection.md | 56 +++++++++-------------- docs/tutorial/entity_matching.md | 73 ++++++++++++------------------ 2 files changed, 50 insertions(+), 79 deletions(-) diff --git a/docs/tutorial/context_detection.md b/docs/tutorial/context_detection.md index 562c9072..6b13fa10 100644 --- a/docs/tutorial/context_detection.md +++ b/docs/tutorial/context_detection.md @@ -1,17 +1,3 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Context Detection In this tutorial, we will use rule-based operations to attach additional @@ -26,13 +12,13 @@ NB: If you are not familiar with medkit, you should probably take a look at the Let's start by loading a document: -```{code-cell} ipython3 +:::{code} from pathlib import Path from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../data/mtsamplesfr/1.txt")) print(doc.text) -``` +::: ## Section detection @@ -47,7 +33,7 @@ default list of possible sections but it is missing some sections that our document has, so we will manually define our own section rules: -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation import SectionTokenizer # Give a definition of the sections we may encounter @@ -70,7 +56,7 @@ for section_seg in section_segs: section_attr = section_seg.attrs.get(label="section")[0] print("section", section_attr.value) print(section_seg.text, end="\n\n\n") -``` +::: ## Sentence splitting @@ -84,7 +70,7 @@ labels that we want to copy from the input segments to the new sentences segments created by the operation. 
Here, we will use it to copy the "section" attribute of the section segments (which has the section name as value): -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation import SentenceTokenizer sentence_tokenizer = SentenceTokenizer( @@ -104,7 +90,7 @@ for sentence_seg in sentence_segs: section_attr = sentence_seg.attrs.get(label="section")[0] print("section:", section_attr.value) print(sentence_seg.text, end="\n\n") -``` +::: ## Family history detection @@ -123,7 +109,7 @@ https://github.com/medkit-lib/medkit/blob/main/medkit/text/context/family_detect that will be used by default if you don't provide any. For the sake of learning, we will manually create a few rules: -```{code-cell} ipython3 +:::{code} from medkit.text.context import FamilyDetector, FamilyDetectorRule family_rule_1 = FamilyDetectorRule( @@ -160,7 +146,7 @@ for sentence_seg in sentence_segs: # Only print sentences about family history if family_attr.value: print(sentence_seg.text) -``` +::: As with all rule-based operations, `FamilyDetector` provides {func}`~medkit.text.context.FamilyDetector.load_rules` and @@ -174,7 +160,7 @@ hypothesis it is better to split sentences into smaller chunks, as the scope of negation and hypothesis can be very limited. For this purpose, medkit comes with a {class}`~medkit.text.segmentation.SyntagmaTokenizer` operation. -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation import SyntagmaTokenizer # Here we will use the default settings of SyntagmaTokenizer, @@ -190,13 +176,13 @@ syntagma_segs = syntagma_tokenizer.run(sentence_segs) for syntagma_seg in syntagma_segs: print(syntagma_seg.text) -``` +::: As you can see, a few sentences where split into smaller parts. We can now run a {class}`~medkit.text.context.NegationDetector` instance on the syntagmas (using the [default rules file](https://github.com/medkit-lib/medkit/blob/main/medkit/text/context/negation_detector_default_rules.yml)). -```{code-cell} ipython3 +:::{code} from medkit.text.context import NegationDetector, NegationDetectorRule # NegationDetectorRule objects have the same structure as FamilyDetectorRule @@ -209,7 +195,7 @@ for syntagma_seg in syntagma_segs: negation_attr = syntagma_seg.attrs.get(label="negation")[0] if negation_attr.value: print(syntagma_seg.text) -``` +::: ## Hypothesis detection @@ -219,7 +205,7 @@ list of conjugated verb forms. By default, verbs at conditional and future tenses will be considered to indicate the presence of an hypothesis. This can be configured, as well as the list of verbs which is far from exhaustive. -```{code-cell} ipython3 +:::{code} from medkit.text.context import HypothesisDetector hypothesis_detector = HypothesisDetector(output_label="hypothesis") @@ -230,16 +216,16 @@ for syntagma_seg in syntagma_segs: hypothesis_attr = syntagma_seg.attrs.get(label="hypothesis")[0] if hypothesis_attr.value: print(syntagma_seg.text) -``` +::: As you can see, no hypothesis was detected in this document. -```{warning} +:::{warning} The default settings (rules and verbs) of `HypothesisDetector` are far from complete and may not give satisfactory results. If you plan on using `HypothesisDetector`, you will need to come up with your own set of regexp rules and conjugated verbs that work well for you data. -``` +::: ## Passing context information to matched entities @@ -248,7 +234,7 @@ it to the entities that we will find in the document. 
This is easily done by using the `attrs_to_copy` mechanism that we have already seen, and that is available for all NER operations: -``` +:::{code} from medkit.text.ner.hf_entity_matcher import HFEntityMatcher # Create a matcher using a pretrained HuggingFace model @@ -274,9 +260,9 @@ for entity in doc.anns.entities: hypothesis_attr = entity.attrs.get(label="hypothesis")[0] print("hypothesis:", hypothesis_attr.value) print() -``` +::: -``` +```text problem : Thrombocytose essentielle section: head family: False @@ -478,7 +464,7 @@ hypothesis: False Let's visualize this in context with `displacy`: -``` +:::{code} from spacy import displacy from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy @@ -507,7 +493,7 @@ def _custom_formatter(entity): # Pass the formatter to medkit_doc_to_displacy() displacy_data = medkit_doc_to_displacy(doc, entity_formatter=_custom_formatter) displacy.render(docs=displacy_data, manual=True, style="ent") -``` +:::
PLAINTE PRINCIPALE :
Thrombocytose essentielleproblem diff --git a/docs/tutorial/entity_matching.md b/docs/tutorial/entity_matching.md index 27962d49..b28c2300 100644 --- a/docs/tutorial/entity_matching.md +++ b/docs/tutorial/entity_matching.md @@ -1,17 +1,3 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Entity Matching This tutorial will take you on a tour of the most common methods to perform @@ -29,13 +15,13 @@ detection operation at the sentence level. Let's start by loading a medical report to work on: -```{code-cell} ipython3 +:::{code} from pathlib import Path from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../data/mtsamplesfr/1.txt")) print(doc.text) -``` +::: We will now use medkit's sentence tokenizing operation to create and display sentence segments. As seen [before](../user_guide/first_steps.md), the sentence tokenizer @@ -44,7 +30,7 @@ and since we don't have any segments yet on our document, we use `TextDocument.raw_segment`, which is a special segment that contains the full unprocessed text. -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation import SentenceTokenizer # By default, SentenceTokenizer will use a list of punctuation chars to detect sentences. @@ -63,7 +49,7 @@ sentence_segs = sentence_tokenizer.run([doc.raw_segment]) # Print all returned sentence segments for sentence_seg in sentence_segs: print(sentence_seg.text, end="\n\n") -``` +::: ## Regular expression matching @@ -74,7 +60,7 @@ expressions. For a complete overview of its features, you can refer to its We are going to use regular expressions to match entities that cannot be detected by a dictionary-based approach, such as age and weight indications: -```{code-cell} ipython3 +:::{code} from medkit.text.ner import RegexpMatcher, RegexpMatcherRule # Rule with simple regexps to match age and weights @@ -108,30 +94,30 @@ regexp_matcher = RegexpMatcher(rules=[regexp_rule_1, regexp_rule_2]) entities = regexp_matcher.run(sentence_segs) for entity in entities: print(entity.text, entity.label) -``` +::: Let's visualize them with `displacy`, using the {func}`~medkit.text.spacy.displacy_utils.entities_to_displacy` helper (similar to {func}`~medkit.text.spacy.displacy_utils.medkit_doc_to_displacy` but we can pass it a list of entities rather than a `TextDocument`): -```{code-cell} ipython3 +:::{code} from spacy import displacy from medkit.text.spacy.displacy_utils import entities_to_displacy displacy_data = entities_to_displacy(entities, doc.text) displacy.render(displacy_data, manual=True, style="ent") -``` +::: Note that you can save a particular list of regexp rules into a yaml file using the {func}`~medkit.text.ner.RegexpMatcher.save_rules` static method, and then reload them with {func}`~medkit.text.ner.RegexpMatcher.load_rules`. 
This makes it easier to share and reuse them: -```{code-cell} ipython3 +:::{code} RegexpMatcher.save_rules([regexp_rule_1, regexp_rule_2], "weight_and_age_rules.yml") rules = RegexpMatcher.load_rules("weight_and_age_rules.yml") -``` +::: Medkit itself comes with a list of predefined regexp rules, available at https://github.com/medkit-lib/medkit/blob/main/medkit/text/ner/regexp_matcher_default_rules.yml, @@ -152,18 +138,18 @@ classification.[^atc_footnote] Let's take a look at it: [^atc_footnote]: This file was created by Bastien Rance, reusing scripts originally from Sébastien Cossin -```{code-cell} ipython3 +:::{code} import pandas as pd drugs = pd.read_csv("../data/bdpm.csv") drugs.head(n=10) -``` +::: Rather than regular expressions, we will used similarity-based matching using the {class}`~medkit.text.ner.SimstringMatcher` operation. This "fuzzy" matcher based on the [simstring algorithm](http://chokkan.org/software/simstring/) will be more tolerant to small spelling errors than the exact matching of a regular expression.We are going to create a rule for each commercial name, and to each rule we will attach the ATC identifier of each molecule when we know them: -```{code-cell} ipython3 +:::{code} from medkit.text.ner import SimstringMatcher, SimstringMatcherRule, SimstringMatcherNormalization simstring_rules = [] @@ -211,7 +197,7 @@ for entity in entities: for norm_attr in entity.attrs.norms: print(norm_attr.kb_name, norm_attr.kb_id) print() -``` +::: ## Advanced entity matching with IAMSystem @@ -224,7 +210,7 @@ terms to match is very large. Let's see how to use it to match a couple of manually-defined terms: -```{code-cell} ipython3 +:::{code} from iamsystem import Matcher, ESpellWiseAlgo from medkit.text.ner.iamsystem_matcher import IAMSystemMatcher @@ -246,7 +232,7 @@ entities = iam_system_matcher.run(sentence_segs) for entity in entities: print(entity.label, ":", entity.text) -``` +::: To learn more about the possibilities of `IAMSystem`, refer to its [documentation](https://iamsystem-python.readthedocs.io/en/). @@ -277,7 +263,7 @@ Note that the UMLS files are not freely distributable nor usable, to download them and use you must request a license on the [UMLS website](https://www.nlm.nih.gov/research/umls/index.html) -``` +:::{code} from medkit.text.ner import UMLSMatcher # Codes of UMLS semantic groups to take into account @@ -321,7 +307,8 @@ def custom_formatter(entity): displacy_data = entities_to_displacy(entities, doc.text, entity_formatter=custom_formatter) displacy.render(displacy_data, manual=True, style="ent") -``` +::: +
PLAINTE PRINCIPALE :
Thrombocytose essentielledisorder (C0040028) @@ -420,7 +407,7 @@ Let's use this model with the {class}`~medkit.text.ner.hf_entity_matcher.HFEntityMatcher` to look for entities in our document: -``` +:::{code} from medkit.text.ner.hf_entity_matcher import HFEntityMatcher # HFEntityMatcher just needs the name of a model on the HuggingFace hub or a path to a local checkpoint @@ -435,7 +422,7 @@ entities = bert_matcher.run(sentence_segs) displacy_data = entities_to_displacy(entities, doc.text) displacy.render(docs=displacy_data, manual=True, style="ent") -``` +:::
PLAINTE PRINCIPALE :
Thrombocytose essentielleproblem @@ -514,36 +501,36 @@ normalization attributes attached to them. Let's consider the more realistic case in which we are dealing with a collection of documents rather than a unique document: -```{code-cell} ipython3 +:::{code} from glob import glob # Let's load all of our sample documents docs = TextDocument.from_dir(Path("../data/mtsamplesfr/")) print(len(docs)) -``` +::: It is possible to run the sentence splitting and entity matching operations on all documents at once: -```{code-cell} ipython3 +:::{code} sentence_segs = sentence_tokenizer.run([d.raw_segment for d in docs]) entities = regexp_matcher.run(sentence_segs) for entity in entities: print(entity.label, entity.text) -``` +::: Here, `entities` contains the entities found by the regexp matcher across all of our documents, in a list. But if we want to attach the entities back to the document they belong to, then we need to process each document independently: -```{code-cell} ipython3 +:::{code} for doc in docs: clean_text_segs = sentence_tokenizer.run([doc.raw_segment]) sentence_segs = sentence_tokenizer.run(clean_text_segs) entities = regexp_matcher.run(sentence_segs) for entity in entities: doc.anns.add(entity) -``` +::: When using pipelines (which will be covered in a later tutorial), this last use case is covered by the {class}`~medkit.core.DocPipeline` class. @@ -561,10 +548,8 @@ a medkit operation so you can use them within medkit, as described in [this tutorial](../user_guide/module.md). Contributions to medkit are welcome so you can submit your operations to be integrated into medkit! - -```{code-cell} ipython3 -:tags: [remove-cell] +:::{code} import os os.unlink("weight_and_age_rules.yml") -``` +::: From d1c637fd2ac4f57a8edd69d782f47b17a9217efb Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 10:38:50 +0100 Subject: [PATCH 05/21] DOC: Update entity matching section --- docs/tutorial/entity_matching.md | 203 ++++++++++++++++--------------- 1 file changed, 102 insertions(+), 101 deletions(-) diff --git a/docs/tutorial/entity_matching.md b/docs/tutorial/entity_matching.md index b28c2300..002f6f06 100644 --- a/docs/tutorial/entity_matching.md +++ b/docs/tutorial/entity_matching.md @@ -1,17 +1,17 @@ # Entity Matching This tutorial will take you on a tour of the most common methods to perform -entity matching on text documents using medkit. +entity matching on text documents using `medkit`. -NB: If you are new to medkit, you should probably take a look at the [First -steps](../user_guide/first_steps.md) tutorial before going further. +NB: If you are new to `medkit`, you should probably take a look at the +[First steps](../user_guide/first_steps.md) tutorial before going further. ## Sentence splitting -Before trying to locate entities in a document, it is often necessary to split -it into sentences, either because some operations expect sentences rather than a -full document as their input, or because we will afterward perform some context -detection operation at the sentence level. +Before trying to locate entities in a document, +it is often necessary to split it into sentences, +either because some operations expect sentences rather than a full document as their input, +or because we will afterward perform some context detection operation at the sentence level. 
Let's start by loading a medical report to work on: @@ -23,12 +23,12 @@ doc = TextDocument.from_file(Path("../data/mtsamplesfr/1.txt")) print(doc.text) ::: -We will now use medkit's sentence tokenizing operation to create and display -sentence segments. As seen [before](../user_guide/first_steps.md), the sentence tokenizer -expects a list of segments as input and will return a list of sentence segments, -and since we don't have any segments yet on our document, we use -`TextDocument.raw_segment`, which is a special segment that contains the full -unprocessed text. +We will now use a sentence tokenizing operation to create and display sentence segments. +As seen [before](../user_guide/first_steps.md), the sentence tokenizer expects +a list of segments as input and will return a list of sentence segments. +Since we don't have any segments yet on our document, +we use {class}`medkit.core.text.document.TextDocument`.raw_segment, +which is a special segment that contains the full unprocessed text. :::{code} from medkit.text.segmentation import SentenceTokenizer @@ -53,12 +53,12 @@ for sentence_seg in sentence_segs: ## Regular expression matching -Medkit comes with a built-in matcher that can identify entities based on regular -expressions. For a complete overview of its features, you can refer to its -{mod}`API doc`. +`medkit` comes with a built-in matcher that can identify entities based on regular expressions. +For a complete overview of its features, you can refer to {mod}`medkit.text.ner.regexp_matcher`. -We are going to use regular expressions to match entities that cannot be -detected by a dictionary-based approach, such as age and weight indications: +We are going to use regular expressions to match entities +that cannot be detected by a dictionary-based approach, +such as age and weight indications: :::{code} from medkit.text.ner import RegexpMatcher, RegexpMatcherRule @@ -96,10 +96,9 @@ for entity in entities: print(entity.text, entity.label) ::: -Let's visualize them with `displacy`, using the -{func}`~medkit.text.spacy.displacy_utils.entities_to_displacy` helper (similar -to {func}`~medkit.text.spacy.displacy_utils.medkit_doc_to_displacy` but we can -pass it a list of entities rather than a `TextDocument`): +Let's visualize them with `displacy`, using {func}`~medkit.text.spacy.displacy_utils.entities_to_displacy` +(similar to {func}`~medkit.text.spacy.displacy_utils.medkit_doc_to_displacy`but we can pass it +a list of entities rather than a `TextDocument`): :::{code} from spacy import displacy @@ -109,32 +108,31 @@ displacy_data = entities_to_displacy(entities, doc.text) displacy.render(displacy_data, manual=True, style="ent") ::: - Note that you can save a particular list of regexp rules into a yaml file using - the {func}`~medkit.text.ner.RegexpMatcher.save_rules` static method, and - then reload them with {func}`~medkit.text.ner.RegexpMatcher.load_rules`. - This makes it easier to share and reuse them: +Note that you can save a particular list of regexp rules into a yaml file +using {func}`~medkit.text.ner.RegexpMatcher.save_rules`, +and reload them with {func}`~medkit.text.ner.RegexpMatcher.load_rules`. 
+This makes rules easier to share and reuse: :::{code} RegexpMatcher.save_rules([regexp_rule_1, regexp_rule_2], "weight_and_age_rules.yml") rules = RegexpMatcher.load_rules("weight_and_age_rules.yml") ::: -Medkit itself comes with a list of predefined regexp rules, available at -https://github.com/medkit-lib/medkit/blob/main/medkit/text/ner/regexp_matcher_default_rules.yml, -that will be used by default if you don't provide any rules to `RegexpMatcher`. +`medkit` comes with a list of predefined regexp rules, +available at https://github.com/medkit-lib/medkit/blob/main/medkit/text/ner/regexp_matcher_default_rules.yml, +which will be used as default if no rules are provided to the `RegexpMatcher` instance. ## Similarity-based entity matching -We will now perform entity matching but this time based on a list of terms that -we want to retrieve. +We will now perform entity matching but this time based on a list of terms +that we want to retrieve. -The medical report we have loaded mentions several drugs that we are interested -in detecting. For this, we are going to take a CSV file that contains commercial -names of drugs, along with the molecules they contain and their corresponding -identifiers in the ATC -(https://www.who.int/tools/atc-ddd-toolkit/atc-classification) -classification.[^atc_footnote] Let's take a look at it: +The medical report we have loaded mentions several drugs that we are interested in detecting. +For this, we are going to take a CSV file that contains commercial names of drugs, +along with the molecules they contain and their corresponding identifiers in the [ATC classification].[^atc_footnote] +Let's take a look at it: +[ATC classification]: https://www.who.int/tools/atc-ddd-toolkit/atc-classification [^atc_footnote]: This file was created by Bastien Rance, reusing scripts originally from Sébastien Cossin @@ -145,9 +143,13 @@ drugs = pd.read_csv("../data/bdpm.csv") drugs.head(n=10) ::: -Rather than regular expressions, we will used similarity-based matching using the {class}`~medkit.text.ner.SimstringMatcher` operation. +Rather than regular expressions, we will use similarity-based matching +using the {class}`~medkit.text.ner.SimstringMatcher` operation. -This "fuzzy" matcher based on the [simstring algorithm](http://chokkan.org/software/simstring/) will be more tolerant to small spelling errors than the exact matching of a regular expression.We are going to create a rule for each commercial name, and to each rule we will attach the ATC identifier of each molecule when we know them: +This "fuzzy" matcher based on the [simstring algorithm](http://chokkan.org/software/simstring/) +will be more tolerant to small spelling errors than the exact matching of a regular expression. +We are going to create a rule for each commercial name, and to each rule we will attach +the ATC identifier of each molecule when we know them: :::{code} from medkit.text.ner import SimstringMatcher, SimstringMatcherRule, SimstringMatcherNormalization @@ -201,12 +203,12 @@ for entity in entities: ## Advanced entity matching with IAMSystem -[IAMSystem](https://iamsystem-python.readthedocs.io/en/latest/) is an advanced -entity matcher developed by Sébastien Cossin.[^footnote_iam] It allows for a fine control of -the matching strategy and should be relatively fast, even when the dictionary of -terms to match is very large. 
+[IAMSystem](https://iamsystem-python.readthedocs.io/en/latest/) is an advanced entity matcher +developed by Sébastien Cossin.[^iam_footnote] +It allows for a fine control of the matching strategy and should be relatively fast, +even when the dictionary of terms to match is very large. -[^footnote_iam]: Cossin S, Jouhet V, Mougin F, Diallo G, Thiessard F. IAM at CLEF eHealth 2018: Concept Annotation and Coding in French Death Certificates. https://arxiv.org/abs/1807.03674 +[^iam_footnote]: Cossin S, Jouhet V, Mougin F, Diallo G, Thiessard F. IAM at CLEF eHealth 2018: Concept Annotation and Coding in French Death Certificates. https://arxiv.org/abs/1807.03674 Let's see how to use it to match a couple of manually-defined terms: @@ -234,34 +236,37 @@ for entity in entities: print(entity.label, ":", entity.text) ::: -To learn more about the possibilities of `IAMSystem`, refer to its -[documentation](https://iamsystem-python.readthedocs.io/en/). +To learn more about the possibilities of `IAMSystem`, +please refer to its [documentation](https://iamsystem-python.readthedocs.io/en/). ## Finding UMLS concepts -Rather than manually building a dictionary of terms to match, we may be -interested in exploiting the terms referenced by the [UMLS -metathesaurus](https://www.nlm.nih.gov/research/umls/). - -Among other things, the UMLS contains a list of medical terms in different -languages, associated with a unique identifier (named CUI) for each concept they -refer to. The concepts are grouped together into "semantic types", themselves -grouped into wider groups caller "semantic groups" such as "ANAT", "CHEM", -"DISO", "PHYSIO", "PROC", etc (cf -https://lhncbc.nlm.nih.gov/semanticnetwork/download/sg_archive/SemGroups-v04.txt -for the complete list). - -Medkit provides a similarity-based fuzzy matcher dedicated to the UMLS. It uses -2 files from the standard UMLS distribution : `MRSTY.RRF`, which contains all -the UMLS concepts with their CUI in a CSV-like format, and `MRCONSO.RRF` which -contains a list of terms in different languages with corresponding CUI. The -{class}`~medkit.text.ner.umls_matcher.UMLSMatcher` operation simply uses this -lists to build a dictionary of terms to match (it does not take advantage of the -hierarchical nature of UMLS concepts). - -Note that the UMLS files are not freely distributable nor usable, to download -them and use you must request a license on the [UMLS -website](https://www.nlm.nih.gov/research/umls/index.html) +Rather than manually building a dictionary of terms to match, +we may be interested in exploiting the terms referenced by the [UMLS metathesaurus]. + +[UMLS metathesaurus]: https://www.nlm.nih.gov/research/umls/ + +Among other things, the UMLS contains a list of medical terms in different languages, +associated with a unique identifier (named CUI) for each concept they refer to. +The concepts are grouped together into _semantic types_, +themselves grouped into wider groups caller [semantic groups], +such as "ANAT", "CHEM", "DISO", "PHYSIO", "PROC", etc... + +[semantic groups]: https://lhncbc.nlm.nih.gov/semanticnetwork/download/sg_archive/SemGroups-v04.txt + +`medkit` provides a similarity-based fuzzy matcher dedicated to the UMLS. + +It uses two files from the standard UMLS distribution: +- `MRSTY.RRF` -- which contains all UMLS concepts with their CUI in a CSV-like format; +- `MRCONSO.RRF` -- which contains a list of terms in different languages with corresponding CUI. 
+ +The {class}`~medkit.text.ner.umls_matcher.UMLSMatcher` operation simply uses these lists +to build a dictionary of terms to match (it does not take advantage of the hierarchical nature of UMLS concepts). + +Note that the UMLS files are not freely reusable nor redistributable nor usable. +To download them, you must request a license on the [UMLS website]. + +[UMLS website]: https://www.nlm.nih.gov/research/umls/index.html :::{code} from medkit.text.ner import UMLSMatcher @@ -386,26 +391,24 @@ displacy.render(displacy_data, manual=True, style="ent") ## Finding entities with BERT models BERT language models are neural network using a transformer architecture, -trained on large amounts of textual data using self-supervised learning -techniques such as masked language modeling and next sentence prediction. +trained on large amounts of textual data using self-supervised learning techniques +such as masked language modeling and next sentence prediction. Additional layers can be added to BERT models to perform various NLP tasks, including named entity recognition. -Medkit makes it possible to use BERT models for NER by wrapping the [HuggingFace -transformers library](https://huggingface.co/docs/transformers/index). This -python deep learning library specializes in reimplementing state of the art -transformers architectures, and also provides a model hub where the weights of -many pre-trained models can be found. +`medkit` makes it possible to use BERT models for NER by wrapping the [HuggingFace transformers library]. +This deep learning library specializes in reimplementing state-of-the-art transformers architectures, +and also provides a model hub with the weights of many pre-trained models. + +[HuggingFace transformers library]: https://huggingface.co/docs/transformers/index [DrBERT](https://drbert.univ-avignon.fr/) is a BERT model trained on french -biomedical documents, available on the HuggingFace hub at -https://huggingface.co/Dr-BERT/DrBERT-7GB. The medkit team has fine-tuned DrBERT -on an annotated version of the [CAS dataset](https://hal.science/hal-01937096) -to perform entity matching: https://huggingface.co/medkit/DrBERT-CASM2 +biomedical documents, available on [HuggingFace](https://huggingface.co/Dr-BERT/DrBERT-7GB). +The medkit team fine-tuned DrBERT on an annotated version of the [CAS dataset](https://hal.science/hal-01937096) +to perform [entity matching](https://huggingface.co/medkit/DrBERT-CASM2). -Let's use this model with the -{class}`~medkit.text.ner.hf_entity_matcher.HFEntityMatcher` to look for entities -in our document: +Let's use this model using {class}`~medkit.text.ner.hf_entity_matcher.HFEntityMatcher` +to look for entities in our document: :::{code} from medkit.text.ner.hf_entity_matcher import HFEntityMatcher @@ -493,13 +496,12 @@ displacy.render(docs=displacy_data, manual=True, style="ent") 85.7 kg.
-Note that the entities obtained with `HFEntityMatcher` don't have any
-normalization attributes attached to them.
+Note that the entities obtained with `HFEntityMatcher` don't have any normalization attributes attached to them.
 
 ## Matching entities in multiple documents
 
-Let's consider the more realistic case in which we are dealing with a collection
-of documents rather than a unique document:
+Let's consider a more realistic case in which we are dealing with a collection of documents
+rather than a unique document:
 
 :::{code}
 from glob import glob
 
 # Let's load all of our sample documents
 docs = TextDocument.from_dir(Path("../data/mtsamplesfr/"))
 print(len(docs))
 :::
 
 It is possible to run the sentence splitting and entity matching operations on
 all documents at once:
 
 :::{code}
 sentence_segs = sentence_tokenizer.run([d.raw_segment for d in docs])
 entities = regexp_matcher.run(sentence_segs)
 for entity in entities:
     print(entity.label, entity.text)
 :::
 
-Here, `entities` contains the entities found by the regexp matcher across
-all of our documents, in a list. But if we want to attach the entities back to
-the document they belong to, then we need to process each document
-independently:
+Here, `entities` contains a list of entities found by the regexp matcher across all of our documents.
+But if we want to attach the entities back to the document they belong to,
+then we need to process each document independently:
 
 :::{code}
 for doc in docs:
     # Split each document into sentences, then match entities on those sentences
     sentence_segs = sentence_tokenizer.run([doc.raw_segment])
     entities = regexp_matcher.run(sentence_segs)
     for entity in entities:
         doc.anns.add(entity)
 :::
 
-When using pipelines (which will be covered in a later tutorial), this last use
-case is covered by the {class}`~medkit.core.DocPipeline` class.
+When using [pipelines](../user_guide/pipeline.md),
+this last use case is covered using {class}`~medkit.core.DocPipeline` (a minimal sketch is given at the end of this page).
 
 ## Wrapping it up
 
-Medkit provides many operations to perform entity matching using various
-methods: regular expressions, fuzzy matching, BERT models, etc.
+`medkit` provides many operations to perform entity matching using various methods:
+regular expressions, fuzzy matching, BERT models, etc.
+
+Even if you do complex pre-processing, `medkit` will be able to give the character spans
+of the entities in the original unprocessed text.
 
-Even if you do complex pre-processing, medkit will be able to give the
-characters pans of the entities in the original unprocessed text.
+If you use different methods or third-party tools, it is possible to wrap them into a `medkit` operation,
+so you can use them anywhere else within `medkit`. See the [module](../user_guide/module.md) section.
 
-If you use different methods or 3d-party tools, it is possible to wrap them into
-a medkit operation so you can use them within medkit, as described in [this
-tutorial](../user_guide/module.md). Contributions to medkit are welcome so you can
-submit your operations to be integrated into medkit!
+
+Contributions to `medkit` are welcome, feel free to submit your operations.
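To make the `DocPipeline` shortcut mentioned above concrete, here is a minimal sketch chaining the `sentence_tokenizer` and `regexp_matcher` operations defined earlier over the `docs` collection. The step wiring, and the assumption that `DocPipeline` feeds each document's raw segment to the pipeline input by default, follow the pipeline guide; use `labels_by_input_key` if the input should come from other annotations instead:

:::{code}
from medkit.core import DocPipeline, Pipeline, PipelineStep

# Chain the two operations defined earlier: raw text -> sentences -> entities
pipeline = Pipeline(
    steps=[
        PipelineStep(sentence_tokenizer, input_keys=["full_text"], output_keys=["sentences"]),
        PipelineStep(regexp_matcher, input_keys=["sentences"], output_keys=["entities"]),
    ],
    input_keys=["full_text"],
    output_keys=["entities"],
)

# DocPipeline runs the pipeline on each document and attaches the resulting
# entities back to the document they were found in (assumption: by default,
# each document's raw segment is used as the pipeline input)
doc_pipeline = DocPipeline(pipeline=pipeline)
doc_pipeline.run(docs)

for doc in docs:
    print(len(doc.anns.get_entities()), "entities attached")
:::

Each document then carries its own entities, without any manual bookkeeping.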
:::{code} import os From 2850b030c8fe3774959c835866b6f7e02a990539 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 12:25:20 +0100 Subject: [PATCH 06/21] DOC: Update context detection section --- docs/tutorial/context_detection.md | 149 ++++++++++++++--------------- 1 file changed, 72 insertions(+), 77 deletions(-) diff --git a/docs/tutorial/context_detection.md b/docs/tutorial/context_detection.md index 6b13fa10..fe0372f1 100644 --- a/docs/tutorial/context_detection.md +++ b/docs/tutorial/context_detection.md @@ -1,14 +1,11 @@ # Context Detection -In this tutorial, we will use rule-based operations to attach additional -contextual information to entities such has: -- the section in which the entity is located -- is the entity negated -- did it appear as part of a hypothesis -- it is related to the patient or it is part of their family's medical history - -NB: If you are not familiar with medkit, you should probably take a look at the -[First steps](../user_guide/first_steps.md) tutorial before going further. +In this tutorial, we will use rule-based operations to attach additional contextual information to entities, +such has: +- the section in which the entity is located; +- is the entity negated; +- whether it appears as part of an hypothesis; +- whether it is related to the patient or part of their family's medical history. Let's start by loading a document: @@ -22,16 +19,17 @@ print(doc.text) ## Section detection -Medkit provides a {class}`~medkit.text.segmentation.SectionTokenizer` operation -that takes a input segments containing full document texts and splits them into -sections, returning a segment for each section. +`medkit` provides a {class}`~medkit.text.segmentation.SectionTokenizer` operation +that takes input segments containing full document texts and splits them into sections, +returning a segment for each section. + +The section tokenizer is configured with a list of trigger terms +signaling the beginning of a section and corresponding section names. +`medkit` provides a [default list of sections], +but it is missing some sections featured in our document, +so we will manually define our own section rules: -The section tokenizer is configured with a list of trigger terms signaling the -beginning of a section, and corresponding section names. Medkit provides a -default list of possible sections -(https://github.com/medkit-lib/medkit/blob/main/medkit/text/segmentation/default_section_definition.yml) -but it is missing some sections that our document has, so we will manually -define our own section rules: +[default list of sections]: https://github.com/medkit-lib/medkit/blob/main/medkit/text/segmentation/default_section_definition.yml :::{code} from medkit.text.segmentation import SectionTokenizer @@ -60,15 +58,16 @@ for section_seg in section_segs: ## Sentence splitting -We have already seen sentence splitting [previously](../user_guide/first_steps.md) and we will -reuse the same code, with a little addition: we want the section information to -be propagated onto the sentences, ie. we want to be able to tell in which -section a sentence belongs. +We have covered sentence splitting [previously](../user_guide/first_steps.md), +and will reuse the same code, with a little addition: +we want the section information to be propagated onto the sentences, +i.e. we want to be able to tell in which section a sentence belongs. -For this, we will use the `attrs_to_copy` init parameter. 
It takes a list of -labels that we want to copy from the input segments to the new sentences -segments created by the operation. Here, we will use it to copy the "section" -attribute of the section segments (which has the section name as value): +For this, we will use the `attrs_to_copy` init parameter. +It takes a list of labels that we want to copy from the input segments +to the new sentences segments created by the operation. +Here, we will use it to copy the "section" attribute of the section segments +(which has the section name as value): :::{code} from medkit.text.segmentation import SentenceTokenizer @@ -94,20 +93,20 @@ for sentence_seg in sentence_segs: ## Family history detection -In this document, we have a section dedicated to family medical history, but, -this is not always the case. To handle this, medkit provides a -{class}`~medkit.text.context.FamilyDetector` operation based on regular -expressions. It is somewhat similar to the -{class}`~medkit.text.ner.RegexpMatcher` we have -[previously](./entity_matching.md#regular-expression-matching) seen, but instead -of returning entities, it attaches attributes to the segments it receives, with -a boolean value indicating whether it mentions family history. - -Like most rule-based medkit operations, `FamilyDetector` comes with [predefined -rules]( -https://github.com/medkit-lib/medkit/blob/main/medkit/text/context/family_detector_default_rules.yml) -that will be used by default if you don't provide any. For the sake of learning, -we will manually create a few rules: +In this document, we have a section dedicated to family medical history, +but this is not always the case. +To handle this, `medkit` provides a {class}`~medkit.text.context.FamilyDetector` operation +based on regular expressions. +It is somewhat similar to {class}`~medkit.text.ner.RegexpMatcher` +encountered [previously](./entity_matching.md#regular-expression-matching), +but instead of returning entities, it attaches attributes to the segments it receives, +with a boolean value indicating whether it mentions family history. + +Like most rule-based operations, `FamilyDetector` comes with [predefined rules] +that will be used by default if none is provided. +For the sake of learning, we will manually create a few rules: + +[predefined rules]: https://github.com/medkit-lib/medkit/blob/main/medkit/text/context/family_detector_default_rules.yml :::{code} from medkit.text.context import FamilyDetector, FamilyDetectorRule @@ -149,16 +148,16 @@ for sentence_seg in sentence_segs: ::: As with all rule-based operations, `FamilyDetector` provides -{func}`~medkit.text.context.FamilyDetector.load_rules` and -{func}`~medkit.text.context.FamilyDetector.save_rules` methods to help you store -then in a yaml file. +the {func}`~medkit.text.context.FamilyDetector.load_rules` +and {func}`~medkit.text.context.FamilyDetector.save_rules` methods +to facilitate their persistence to a YAML file. ## Negation detection -Detecting family history work best at the sentence level, but for negation and -hypothesis it is better to split sentences into smaller chunks, as the scope of -negation and hypothesis can be very limited. For this purpose, medkit comes with -a {class}`~medkit.text.segmentation.SyntagmaTokenizer` operation. +Detecting family history works best at the sentence level. +However, for negation and hypothesis, it is better to split sentences into smaller chunks, +as the scope of negation and hypothesis can be very limited. 
+For this purpose, `medkit` provides a {class}`~medkit.text.segmentation.SyntagmaTokenizer` operation. :::{code} from medkit.text.segmentation import SyntagmaTokenizer @@ -178,9 +177,9 @@ for syntagma_seg in syntagma_segs: print(syntagma_seg.text) ::: -As you can see, a few sentences where split into smaller parts. We can now run a -{class}`~medkit.text.context.NegationDetector` instance on the syntagmas (using -the [default rules file](https://github.com/medkit-lib/medkit/blob/main/medkit/text/context/negation_detector_default_rules.yml)). +As you can see, a few sentences were split into smaller parts. +We can now run a {class}`~medkit.text.context.NegationDetector` instance on the syntagmata +(using the default rules). :::{code} from medkit.text.context import NegationDetector, NegationDetectorRule @@ -199,11 +198,11 @@ for syntagma_seg in syntagma_segs: ## Hypothesis detection -Medkit's {class}`~medkit.text.context.HypothesisDetector` is very similar to -`NegationDetector`, except that in addition to a list of rules, it also uses a -list of conjugated verb forms. By default, verbs at conditional and future -tenses will be considered to indicate the presence of an hypothesis. This can be -configured, as well as the list of verbs which is far from exhaustive. +`medkit` also provides {class}`~medkit.text.context.HypothesisDetector`, +which is very similar to {class}`~medkit.text.context.NegationDetector`, +except it also uses a list of conjugated verb forms in addition to the list of rules. +By default, verbs at conditional and future tenses indicate the presence of an hypothesis. +This can be configured alongside the list of verbs. :::{code} from medkit.text.context import HypothesisDetector @@ -221,18 +220,18 @@ for syntagma_seg in syntagma_segs: As you can see, no hypothesis was detected in this document. :::{warning} -The default settings (rules and verbs) of `HypothesisDetector` are far from -complete and may not give satisfactory results. If you plan on using -`HypothesisDetector`, you will need to come up with your own set of regexp rules -and conjugated verbs that work well for you data. +The default settings (rules and verbs) of `HypothesisDetector` are **NOT** exhaustive +and may not yield satisfactory results. +If you plan on using `HypothesisDetector`, please consider specifying your own set of rules +and conjugated verbs that are specifically tailored to your data. ::: ## Passing context information to matched entities -Now that we have gathered all this contextual information, we want to propagate -it to the entities that we will find in the document. This is easily done by -using the `attrs_to_copy` mechanism that we have already seen, and that is -available for all NER operations: +Now that we have gathered all this contextual information, +we want to propagate it to the entities that we will find in the document. +This can be done using the `attrs_to_copy` mechanism that we have already seen, +which is available to all NER operations: :::{code} from medkit.text.ner.hf_entity_matcher import HFEntityMatcher @@ -563,24 +562,20 @@ displacy.render(docs=displacy_data, manual=True, style="ent") pèseproblem 85.7 kg.
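Once these attributes have been copied onto the entities, they can drive downstream filtering directly. The snippet below is a minimal sketch assuming the matched entities, with the `family` and `hypothesis` attributes configured above, have been added to `doc`:

:::{code}
# Keep only the mentions that concern the patient directly,
# i.e. neither family history nor hypothetical statements
patient_entities = []
for entity in doc.anns.entities:
    is_family = entity.attrs.get(label="family")[0].value
    is_hypothesis = entity.attrs.get(label="hypothesis")[0].value
    if not is_family and not is_hypothesis:
        patient_entities.append(entity)

for entity in patient_entities:
    print(entity.label, ":", entity.text)
:::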
-## Adding context attributes a posteriori +## Adding context attributes retrospectively -What if we already have some entities that we imported from another source and -we want to attach the contextual information that we obtain with medkit -operations? In that case it is possible to use the -{class}`~medkit.text.postprocessing.AttributeDuplicator` operation, that makes -it possible to copy attributes a posteriori without using the `attrs_to_copy` -parameter. +What if we already have some entities that we imported from another source, +and we want to attach the resulting contextual information obtained with `medkit`? +In that case, one can copy attributes retrospectively using the +{class}`~medkit.text.postprocessing.AttributeDuplicator` operation. ## Wrapping it up -In this tutorial, we have seen how medkit can help you to detect contextual -information with built-in rule-based detectors, for which the rules can be -customized. +In this tutorial, we have seen how `medkit` can facilitate detection of contextual information +with built-in and customizable rule-based detectors. These detectors can be run on segments of different granularity, -such as sentences or syntagmas, and the results are stored in attributes. +including as sentences or syntagmas, with their results stored as attributes. -In order to make these contextual attributes propagate from the outer-most -segments down to the entities matched, we use the `attrs_to_copy` operation -init parameter. +In order to propagate these contextual attributes from the outermost segments down to the entities matched, +we use the `attrs_to_copy` operation init parameter. From e350d55edded864116cae6fddc3aa38b3b4bf62b Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 13:46:08 +0100 Subject: [PATCH 07/21] DOC: Fix toctree in text segmentation examples --- docs/examples/text_segmentation/index.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/examples/text_segmentation/index.md b/docs/examples/text_segmentation/index.md index 3a0ae717..aedb27b9 100644 --- a/docs/examples/text_segmentation/index.md +++ b/docs/examples/text_segmentation/index.md @@ -1,9 +1,11 @@ # Text segmentation examples :::{note} -You may refer to [text segmentation section](api:text:segmentation_modules) for -more information. +You may refer to the [text segmentation section](api:text:segmentation_modules) for more information. 
::: -```{tableofcontents} -``` +:::{toctree} +document +section +syntagma +::: From 73d456cae58c1550b83f0ffe34edfef9ce3719f0 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 14:09:21 +0100 Subject: [PATCH 08/21] DOC: Update text segmentation examples --- docs/examples/text_segmentation/document.md | 59 ++++------- docs/examples/text_segmentation/section.md | 108 ++++++++------------ docs/examples/text_segmentation/syntagma.md | 78 ++++++-------- 3 files changed, 95 insertions(+), 150 deletions(-) diff --git a/docs/examples/text_segmentation/document.md b/docs/examples/text_segmentation/document.md index 8e91d9e3..744a3815 100644 --- a/docs/examples/text_segmentation/document.md +++ b/docs/examples/text_segmentation/document.md @@ -1,33 +1,12 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.5 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - - # Document splitter -+++ - This tutorial will show an example of how to split a document using its sections as a reference. -```{seealso} -We combine some operations like **section tokenizer**, **regexp matcher** and **custom operation**. Please see the other examples for more information. -``` -+++ +## Adding annotations to a document -## Adding annotations in a document +Let's detect the sections and add some annotations using `medkit` operations. -Let's detect the sections and add some annotations using medkit operations. - -```{code-cell} ipython3 +:::{code} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt @@ -36,10 +15,11 @@ from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../../data/text/1.txt")) print(doc.text) -``` +::: + **Defining the operations** -```{code-cell} ipython3 +:::{code} from medkit.text.ner import RegexpMatcher, RegexpMatcherRule from medkit.text.segmentation import SectionTokenizer @@ -64,11 +44,11 @@ regexp_rules = [ RegexpMatcherRule(regexp=r"\bnasonex\b", label="treatment", case_sensitive=False), ] regexp_matcher = RegexpMatcher(rules=regexp_rules) -``` +::: -We can now annotate the document +We can now annotate the document: -```{code-cell} ipython3 +:::{code} # Detect annotations sections = section_tokenizer.run([doc.raw_segment]) entities = regexp_matcher.run([doc.raw_segment]) @@ -77,15 +57,18 @@ for ann in sections + entities: doc.anns.add(ann) print(f"The document contains {len(sections)} sections and {len(entities)} entities\n") -``` +::: ## Split the document by sections -Once annotated, we can use the medkit operation {class}`~medkit.text.postprocessing.DocumentSplitter` to create smaller versions of the document using the sections. +Once annotated, we can use {class}`~medkit.text.postprocessing.DocumentSplitter` +to create smaller versions of the document using the sections. -By default, since its `entity_labels`, `attr_labels`, and `relation_labels` are set to `None`, all annotations will be in the resulting documents. You can select the annotations using their labels. +By default, since its `entity_labels`, `attr_labels`, and `relation_labels` are set to `None`, +all annotations will be in the resulting documents. +You can select the annotations using their labels. 
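For instance, assuming we only care about the `problem` entities, a splitter restricted to that label could be configured as sketched below; the full example that follows keeps every annotation by relying on the defaults:

:::{code}
from medkit.text.postprocessing import DocumentSplitter

# Hypothetical variant: only "problem" entities are carried over
# to the mini-documents, other entity labels are dropped
problem_only_splitter = DocumentSplitter(
    segment_label="section",
    entity_labels=["problem"],
)
:::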
-```{code-cell} ipython3 +:::{code} from medkit.text.postprocessing import DocumentSplitter doc_splitter = DocumentSplitter(segment_label="section", # segments of reference @@ -95,11 +78,12 @@ doc_splitter = DocumentSplitter(segment_label="section", # segments of reference ) new_docs = doc_splitter.run([doc]) print(f"The document was divided into {len(new_docs)} documents\n") -``` +::: -Each document contains entities and attributes from the source segment; below, we visualize the new documents via displacy utils. +Each document contains entities and attributes from the source segment. +Below, we visualize the new documents via the `displacy` helpers. -```{code-cell} ipython3 +:::{code} from spacy import displacy from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy @@ -110,5 +94,4 @@ for new_doc in new_docs: # convert new document to displacy displacy_data = medkit_doc_to_displacy(new_doc) displacy.render(displacy_data, manual=True, style="ent", options=options_displacy) -``` - +::: diff --git a/docs/examples/text_segmentation/section.md b/docs/examples/text_segmentation/section.md index f078615a..93338a00 100644 --- a/docs/examples/text_segmentation/section.md +++ b/docs/examples/text_segmentation/section.md @@ -1,31 +1,12 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.5 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - - # Section Tokenizer -+++ - This tutorial will show an example of how to apply section tokenizer medkit operation on a text document. -+++ - ## Loading a text document -For beginners, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: +First, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: - -```{code-cell} ipython3 +:::{code} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt @@ -33,22 +14,20 @@ from pathlib import Path from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../../data/text/1.txt")) -``` +::: The full raw text can be accessed through the `text` attribute: - -```{code-cell} ipython3 +:::{code} print(doc.text) -``` +::: ## Defining section definition rules -To split the text document into medkit segments corresponding to each section, we have to define a set of rules. +To split the text document into segments corresponding to each section, we have to define a set of rules. These rules allow the operation to detect keywords triggering a new section. - -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation.section_tokenizer import SectionTokenizer section_dict = { @@ -60,18 +39,24 @@ section_dict = { } tokenizer = SectionTokenizer(section_dict=section_dict) -``` +::: -The sections definition is a dictionary of key-values where *key* will be the section name and *value* is a list of keywords to detect as the start of the section. +The sections definition is a dictionary of key-values where _key_ will be the section name +and _value_ a list of keywords to detect as the start of the section. -For example, if we detect the keyword `EVALUATION` in text, a new section named `diagnostique` will begin with this keyword and will end with the next detected section or otherwise, the end of the text. 
+For example, if we detect the keyword `EVALUATION` in text, +a new section named `diagnostique` will begin with this keyword, +and will end with the next detected section, or the end of the text otherwise. -As all operations, `SectionTokenizer` defines a `run()` method. This method returns a list of {class}`~medkit.core.text.Segment` objects (a `Segment` is a -`TextAnnotation` that represents a portion of a document's full raw text). -As input, it also expects a list of `Segment` objects. Here, we can pass a special segment containing the whole raw text of the document, that we can retrieve through the `raw_segment` attribute of `TextDocument`: +As all operations, `SectionTokenizer` defines a `run()` method. +This method returns a list of {class}`~medkit.core.text.Segment` objects +(a `Segment` is a `TextAnnotation` that represents a portion of a document's full raw text). +As input, it also expects a list of `Segment` objects. +Here, we can pass a special segment containing the whole raw text of the document, +that we can retrieve through the `raw_segment` attribute of `TextDocument`: -```{code-cell} ipython3 +:::{code} sections = tokenizer.run([doc.raw_segment]) print(f"Number of detected sections: {len(sections)}\n") @@ -81,10 +66,11 @@ for section in sections: print(f"label = {section.label}") print(f"spans = {section.spans}") print(f"text = {section.text!r}\n") -``` +::: As you can see, we have detected 6 different sections. -Each section is a segment which has: + +Each section is a segment which features: - an `uid` attribute, which unique value is automatically generated; - a `text` attribute holding the text that the segment refers to; - a `spans` attribute reflecting the position of this text in the document's @@ -96,12 +82,12 @@ Each section is a segment which has: ## Defining section rules with renaming -`SectionTokenizer` also allows to define rules (i.e., `SectionModificationRule`) for renaming detected sections based on the context of the section in the text. +`SectionTokenizer` also allows to define rules (i.e., `SectionModificationRule`) +for renaming detected sections based on the context of the section in the text. Let's take the same example. - -```{code-cell} ipython3 +:::{code} from medkit.text.segmentation.section_tokenizer import SectionTokenizer, SectionModificationRule section_dict = { @@ -111,14 +97,13 @@ section_dict = { "examen clinique": ["EXAMEN PHYSIQUE"], "diagnostique": ["EVALUATION"], } -``` +::: Now, let's add some rules for managing these cases: - if `traitement` section is detected before `diagnostique` section, then we rename it into `traitement_entree` - if `traitement` section is detected after `diagnostique` section, then we rename it into `traitement_sortie` - -```{code-cell} ipython3 +:::{code} treatment_rules = [ SectionModificationRule( section_name="traitement", @@ -133,12 +118,11 @@ treatment_rules = [ ] tokenizer = SectionTokenizer(section_dict=section_dict, section_rules=treatment_rules) -``` +::: Let's run this new operation on document raw text. - -```{code-cell} ipython3 +:::{code} sections = tokenizer.run([doc.raw_segment]) print(f"Number of detected sections: {len(sections)}\n") @@ -148,20 +132,19 @@ for section in sections: print(f"label = {section.label}") print(f"spans = {section.spans}") print(f"text = {section.text!r}\n") -``` +::: -As you can see, we still detect 6 sections but 2 have been renamed into `traitement_entree` and `traitement_sortie`. 
+There are still 6 sections detected, but 2 have been renamed to `traitement_entree` and `traitement_sortie`. -## Using a yaml definition file +## Using a YAML definition file We have seen how to write rules programmatically. -However, it is also possible to load a yaml file containing all your rules. +However, it is also possible to load a YAML file containing all your rules. -First, let's create the yaml file based on previous steps. +First, let's create the YAML file corresponding to the previous steps. - -```{code-cell} ipython3 +:::{code} import pathlib filepath = pathlib.Path("section.yml") @@ -174,12 +157,11 @@ SectionTokenizer.save_section_definition( with open(filepath, 'r') as f: print(f.read()) -``` - -Now, we will see how to initialize the `SectionTokenizer` operation for using this yaml file. +::: +Now, we will see how to initialize the `SectionTokenizer` operation for using this YAML file. -```{code-cell} ipython3 +:::{code} # Use tokenizer initialized using a yaml file from medkit.text.segmentation.section_tokenizer import SectionTokenizer @@ -189,12 +171,11 @@ print(f"section_dict = {section_dict!r}\n") print(f"section_rules = {section_rules!r}") tokenizer = SectionTokenizer(section_dict=section_dict, section_rules=section_rules) -``` +::: Now, let's run the operation. We can observe that the results are the same. - -```{code-cell} ipython3 +:::{code} sections = tokenizer.run([doc.raw_segment]) print(f"Number of detected sections: {len(sections)}\n") @@ -204,9 +185,8 @@ for section in sections: print(f"label = {section.label}") print(f"spans = {section.spans}") print(f"text = {section.text!r}\n") -``` +::: -```{code-cell} ipython3 -:tags: [remove-cell] +:::{code} filepath.unlink() -``` +::: diff --git a/docs/examples/text_segmentation/syntagma.md b/docs/examples/text_segmentation/syntagma.md index 99a96a01..6b597d5c 100644 --- a/docs/examples/text_segmentation/syntagma.md +++ b/docs/examples/text_segmentation/syntagma.md @@ -1,29 +1,12 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.5 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Syntagma Tokenizer -+++ - -This tutorial will show an example of how to apply syntagma tokenizer medkit operation on a text document. - -+++ +This tutorial will show an example of how to apply syntagma tokenizer `medkit` operation on a text document. ## Loading a text document -For beginners, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: +First, let's load a text file using the {class}`~medkit.core.text.TextDocument` class: -```{code-cell} ipython3 +:::{code} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1.txt @@ -31,20 +14,20 @@ from pathlib import Path from medkit.core.text import TextDocument doc = TextDocument.from_file(Path("../../data/text/1.txt")) -``` +::: The full raw text can be accessed through the `text` attribute: -```{code-cell} ipython3 +:::{code} print(doc.text) -``` +::: ## Defining syntagma definition rules -To split the text document into medkit segments corresponding to a text part, we have to define a set of rules. +To split the text document into segments corresponding to a text part, we have to define a set of rules. These rules allow the operation to split the text based on regular expressions rules. 
-```{code-cell} ipython3 +:::{code} from medkit.text.segmentation.syntagma_tokenizer import SyntagmaTokenizer separators = ( @@ -57,17 +40,19 @@ separators = ( ) tokenizer = SyntagmaTokenizer(separators) -``` +::: The syntagmas definition is a list of regular expressions allowing to trigger the start of a new syntagma. -+++ +Like other operations, `SyntagmaTokenizer` defines a `run()` method. +This method returns a list of {class}`~medkit.core.text.Segment` objects +(a `Segment` is a `TextAnnotation` that represents a portion of a document's full raw text). -As all operations, `SyntagmaTokenizer` defines a `run()` method. This method returns a list of {class}`~medkit.core.text.Segment` objects (a `Segment` is a -`TextAnnotation` that represents a portion of a document's full raw text). -As input, it also expects a list of `Segment` objects. Here, we can pass a special segment containing the whole raw text of the document, that we can retrieve through the `raw_segment` attribute of `TextDocument`: +As input, it also expects a list of `Segment` objects. +Here, we can pass a special segment containing the whole raw text of the document, +that we can retrieve through the `raw_segment` attribute of `TextDocument`: -```{code-cell} ipython3 +:::{code} syntagmas = tokenizer.run([doc.raw_segment]) print(f"Number of detected syntagmas: {len(syntagmas)}") @@ -75,21 +60,20 @@ print(f"Syntagmas label: {syntagmas[0].label}\n") for syntagma in syntagmas: print(f"{syntagma.spans}\t{syntagma.text!r}") -``` - -As you can see, the text have been split into 39 medkit segments which default label is `"SYNTAGMA"`. Corresponding `spans` reflect the position of the text in the document's full raw text. +::: -+++ +As you can see, the text have been split into 39 segments, which default label is `"SYNTAGMA"`. +The corresponding spans reflect the position of the text in the document's raw text. -## Using a yaml definition file +## Using a YAML definition file We have seen how to write rules programmatically. -However, it is also possible to load a yaml file containing all your rules. +However, it is also possible to load a YAML file containing all your rules. -First, let's create the yaml file based on previous steps. +First, let's create the YAML file based on previous steps. -```{code-cell} ipython3 +:::{code} import pathlib filepath = pathlib.Path("syntagma.yml") @@ -101,11 +85,11 @@ SyntagmaTokenizer.save_syntagma_definition( with open(filepath, 'r') as f: print(f.read()) -``` +::: Now, we will see how to initialize the `SyntagmaTokenizer` operation for using this yaml file. -```{code-cell} ipython3 +:::{code} # Use tokenizer initialized using a yaml file from medkit.text.segmentation import SyntagmaTokenizer @@ -115,22 +99,20 @@ print("separators = ") for sep in separators: print(f"- {sep!r}") - tokenizer = SyntagmaTokenizer(separators=separators) -``` +::: Now let's run the operation. We can observe that the results are the same. 
-```{code-cell} ipython3 +:::{code} syntagmas = tokenizer.run([doc.raw_segment]) print(f"Number of detected syntagmas: {len(syntagmas)}\n") for syntagma in syntagmas: print(f"{syntagma.spans}\t{syntagma.text!r}") -``` +::: -```{code-cell} ipython3 -:tags: [remove-cell] +:::{code} filepath.unlink() -``` +::: From c9cb9da08eb8aeaaf0c8c3fc42490000ffd656fd Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 16:25:36 +0100 Subject: [PATCH 09/21] DOC: Update spacy examples --- docs/examples/spacy/index.md | 22 ++-- docs/examples/spacy/spacy_io.md | 148 ++++++++++++-------------- docs/examples/spacy/spacy_pipeline.md | 143 +++++++++++-------------- 3 files changed, 147 insertions(+), 166 deletions(-) diff --git a/docs/examples/spacy/index.md b/docs/examples/spacy/index.md index bbff4d5e..1936a69b 100644 --- a/docs/examples/spacy/index.md +++ b/docs/examples/spacy/index.md @@ -1,18 +1,22 @@ # Spacy integration -[spaCy](https://spacy.io/) is a library for advanced Natural Language Processing in Python. Medkit supports Spacy in input/output conversion as well as annotator. +[spaCy](https://spacy.io/) is a library for advanced Natural Language Processing in Python. -| Task | Medkit Operation | -| :------------------------------------------ | --------------------------------------------------------------------------------------- | -| Load SpacyDocs | {class}`~medkit.io.spacy.SpacyInputConverter` | -| Convert documents to SpacyDocs | {class}`~medkit.io.spacy.SpacyOutputConverter` | -| Annotate segments using a Spacy pipeline | {class}`~medkit.text.spacy.pipeline.SpacyPipeline` | -| Annotate documents using a Spacy pipeline | {class}`~medkit.text.spacy.doc_pipeline.SpacyDocPipeline` | +`medkit` supports `spaCy` through input and output conversions as well as annotators. + +| Task | Operation | +|:--------------------------------------------|-----------------------------------------------------------------------------------------| +| Load a spaCy Doc | {class}`~medkit.io.spacy.SpacyInputConverter` | +| Convert documents to spaCy Doc | {class}`~medkit.io.spacy.SpacyOutputConverter` | +| Annotate segments using a spaCy pipeline | {class}`~medkit.text.spacy.pipeline.SpacyPipeline` | +| Annotate documents using a spaCy pipeline | {class}`~medkit.text.spacy.doc_pipeline.SpacyDocPipeline` | | Detect syntactic relations between entities | {class}`~medkit.text.relations.syntactic_relation_extractor.SyntacticRelationExtractor` | :::{note} You may refer to {mod}`medkit.text.spacy` for more information. ::: -```{tableofcontents} -``` \ No newline at end of file +:::{toctree} +spacy_io +spacy_pipeline +::: diff --git a/docs/examples/spacy/spacy_io.md b/docs/examples/spacy/spacy_io.md index bc4b5e97..16ff3dbb 100644 --- a/docs/examples/spacy/spacy_io.md +++ b/docs/examples/spacy/spacy_io.md @@ -1,36 +1,28 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.0 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- +# Conversions to and from `spaCy` -# Input and Output conversion +`medkit` can load `spaCy` documents with **entities**, **attributes** (custom extensions) and groups of **spans**, +and convert documents back to `spaCy` easily. -Medkit can load spacy documents with **entities**, **attributes** (custom extensions) and groups of **spans** and convert medkit documents to spacy docs easily. 
+In this example, we will show how to import `spaCy` documents into `medkit` +and how to convert `medkit` documents into `spaCy` documents. -In this example, we will show how to import spacy documents into medkit and how to convert medkit documents into Spacy documents. We use some spacy concepts, more information can be found in the official spacy documentation. +We use some `spaCy` concepts, more information can be found in the official spacy documentation. -```{note} -For this example, you should download the french spacy model. You can download it using: -``` +::::{note} +For this example, you should download the French `spaCy` model. -```{code-cell} ipython3 -:tags: [remove-output] +You can download it using: +:::{code} import spacy.cli spacy.cli.download("fr_core_news_sm") -``` +::: + +:::: -Consider the following spacy document: +Consider the following `spaCy` document: -```{code-cell} ipython3 +:::{code} import spacy from spacy.tokens import Span as SpacySpan @@ -55,49 +47,52 @@ if not SpacySpan.has_extension("country"): for e in spacy_doc.ents: if e.label_ == 'LOC': e._.set("country", 'France') -``` +::: -**Description of the spacy document** +Description of the `spaCy` document: ---- -* Entities ---- +- Entities -```{code-cell} ipython3 +:::{code} from spacy import displacy displacy.render(spacy_doc, style="ent") -``` +::: ---- -* Spans ---- +- Spans -```{code-cell} ipython3 +:::{code} displacy.render(spacy_doc, style="span", options={"spans_key": "SECTION"}) -``` +::: The spacy document has **2** entities and **1** span group called `SECTION`. The entity 'LOC' has **1** attribute called `country`. Let's see how to convert this spacy doc in a `TextDocument` with annotations. -## Load SpacyDocs into a list of TextDocuments +## Load a `spaCy` Doc into a list of TextDocuments -The class {class}`~medkit.io.spacy.SpacyInputConverter` is in charge of converting spacy Docs into a list of TextDocuments. By default, it loads **all** entities, span groups and extension attributes for each SpacyDoc object, but you can use the `entities`, `span_groups` and `attrs` parameters to specify which items should be converted, based on their labels. +The class {class}`~medkit.io.spacy.SpacyInputConverter` is in charge of converting +`spaCy` Docs into a list of TextDocuments. -```{tip} -You can enable provenance tracing by assigning a {class}`~medkit.core.ProvTracer` object to the SpacyInputConverter with the `set_prov_tracer()` method. -``` +By default, it loads **all** entities, span groups and extension attributes for each SpacyDoc object, +but you can use the `entities`, `span_groups` and `attrs` parameters to specify which items should be converted, +based on their labels. -```{note} -**Span groups in medkit** +:::{tip} +You can enable provenance tracing by assigning a {class}`~medkit.core.ProvTracer` object +to the SpacyInputConverter with the `set_prov_tracer` method. +::: -In spacy, the spans are grouped with a `key` and each span can have its own label. To be compatible, medkit uses the key as the span `label` and the spacy label is stored as `name` in its metadata. -``` +:::{note} +**Span groups in medkit** +In `spaCy`, the spans are grouped with a _key_ and each span can have its own label. +To remain compatible, `medkit` uses the key as the span _label_ +and the spacy label is stored as _name_ in its metadata. 
+::: -```{code-cell} ipython3 +:::{code} from medkit.io.spacy import SpacyInputConverter # Define default Input Converter @@ -106,44 +101,52 @@ spacy_input_converter = SpacyInputConverter() # Load spacy doc into a list of documents docs = spacy_input_converter.load([spacy_doc]) medkit_doc = docs[0] -``` +::: **Description of the resulting Text document** -+++ -```{code-cell} ipython3 +:::{code} print(f"The medkit doc has {len(medkit_doc.anns)} annotations.") print(f"The medkit doc has {len(medkit_doc.anns.get_entities())} entities.") print(f"The medkit doc has {len(medkit_doc.anns.get_segments())} segment.") -``` +::: + **What about 'LOC' entity?** -```{code-cell} ipython3 + +:::{code} entity = medkit_doc.anns.get(label="LOC")[0] attributes = entity.attrs.get(label="country") print(f"Entity label={entity.label}, Entity text={entity.text}") print("Attributes loaded from spacy") print(attributes) -``` +::: + **Visualizing Medkit annotations** -As explained in other tutorials, we can display medkit entities using `displacy`, a visualizer developed by Spacy. You can use the {func}`~medkit.text.spacy.displacy_utils.medkit_doc_to_displacy` function to format medkit entities. +As explained in other tutorials, we can display `medkit` entities using `displacy`, +a visualizer developed by `spaCy`. +You can use the {func}`~medkit.text.spacy.displacy_utils.medkit_doc_to_displacy` function to format `medkit` entities. -```{code-cell} ipython3 +:::{code} from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy # getting entities in displacy format (default config) entities_data = medkit_doc_to_displacy(medkit_doc) displacy.render(entities_data, style="ent",manual=True) -``` +::: -## Convert TextDocuments to SpacyDocs +## Convert TextDocuments to a `spaCy` Doc -Similarly it is possible to convert a list of TextDocument to Spacy using {class}`~medkit.io.spacy.SpacyOutputConverter`. +Likewise, it is possible to convert a list of TextDocument to `spaCy` +using {class}`~medkit.io.spacy.SpacyOutputConverter`. -You will need to provide an `nlp` object that tokenizes and generates the document with the raw text as reference. By default, it converts **all** medkit annotations and attributes to Spacy, but you can use `anns_labels` and `attrs` parameters to specify which items should be converted. +You will need to provide a `nlp` object that tokenizes and generates +the document with the raw text as reference. By default, it converts +**all** `medkit` annotations and attributes to `spaCy`, but you can use +`anns_labels` and `attrs` parameters to specify which items should be converted. -```{code-cell} ipython3 +:::{code} from medkit.io.spacy import SpacyOutputConverter # define Output Converter with default params @@ -156,39 +159,28 @@ spacy_doc = spacy_docs[0] # Explore new spacy doc print("Text of spacy doc from TextDocument:\n",spacy_doc.text) -``` +::: **Description of the resulting Spacy document** ---- -* Entities imported from medkit ---- +- Entities imported from `medkit` -```{code-cell} ipython3 +:::{code} displacy.render(spacy_doc, style="ent") -``` +::: ---- -* Spans imported from medkit ---- +- Spans imported from `medkit` -```{code-cell} ipython3 +:::{code} displacy.render(spacy_doc, style="span",options={"spans_key": "SECTION"}) - -``` +::: **What about 'LOC' entity?** -```{code-cell} ipython3 + +:::{code} entity = [e for e in spacy_doc.ents if e.label_ == 'LOC'][0] attribute = entity._.get('country') print(f"Entity label={entity.label_}. 
Entity text={entity.text}") print("Attribute imported from medkit") print(f"The attr `country` was imported? : {attribute is not None}, value={entity._.get('country')}") -``` - -:::{seealso} -cf. [Spacy IO module](api:io:spacy). - -Medkit has more components related to spacy, you may see [Spacy text module](api:text:spacy). - ::: diff --git a/docs/examples/spacy/spacy_pipeline.md b/docs/examples/spacy/spacy_pipeline.md index 3232cccf..062174c3 100644 --- a/docs/examples/spacy/spacy_pipeline.md +++ b/docs/examples/spacy/spacy_pipeline.md @@ -1,27 +1,14 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.5 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- +# Annotating with a `spaCy` pipeline -# Annotating with a Spacy pipeline -+++ +This example shows how to combine **medkit** and a **spacy pipeline** to annotate `medkit` documents. -This example shows how to combine **medkit** and a **spacy pipeline** to annotate medkit documents. +`SpaCy` has some projects in its universe with custom versions of pipeline objects. -SpaCy has some projects in its universe with custom versions of spaCy pipeline objects. - -This example uses English documents, as the pipelines we will use do not work with French documents. The aim of this example is to show how to annotate with spacy, but you could use your own custom pipelines that work with French documents. - -```{code-cell} ipython3 +This example uses English documents, as the pipelines we will use do not work with French documents. +The aim of this example is to show how to annotate with `spaCy`, +but you could use your own custom pipelines that work with French documents. +:::{code} # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/data/text/1-EN-version.txt from pathlib import Path @@ -29,31 +16,31 @@ from medkit.core.text import TextDocument medkit_doc = TextDocument.from_file(Path("../../data/text/1-EN-version.txt")) print(medkit_doc.text) -``` +::: -The document has a few sections describing the status of a female patient. We can start by detecting some entities. In the spacy universe, we found a connector [spacy-stanza](https://github.com/explosion/spacy-stanza) to the Stanza library. **Stanza**[^footnote1] is a library developed by the Stanford NLP research group and has some biomedical and clinical NER models for english documents. +The document has a few sections describing the status of a female patient. We can start by detecting some entities. +In the spacy universe, we found a connector [spacy-stanza](https://github.com/explosion/spacy-stanza) to the `Stanza` library. +**Stanza**[^footnote1] is a library developed by the Stanford NLP research group +and has some biomedical and clinical NER models for english documents. -[^footnote1]:Peng Qi, Yuhao Zhang, Yuhui Zhang, Jason Bolton and Christopher D. Manning. 2020. Stanza: A Python Natural Language Processing Toolkit for Many Human Languages. In Association for Computational Linguistics (ACL) System Demonstrations. 2020. +[^footnote1]: Peng Qi, Yuhao Zhang, Yuhui Zhang, Jason Bolton and Christopher D. Manning. 2020. Stanza: A Python Natural Language Processing Toolkit for Many Human Languages. In Association for Computational Linguistics (ACL) System Demonstrations. 2020. 
-```{code-cell} ipython3 -:tags: [remove-output] +:::{code} # install spacy-stanza !python -m pip install spacy-stanza -``` -## Annotating segments with spacy +::: + +## Annotating segments with `spaCy` -Let's see how to create medkit entities with a nlp spacy object +Let's see how to create `medkit` entities with a nlp `spaCy` object. -### Prepare the spacy-stanza nlp pipeline +### Prepare the spacy-stanza NLP pipeline The list of available [biomedical NER packages](https://stanfordnlp.github.io/stanza/available_biomed_models.html#biomedical--clinical-ner-models). Let's download the `i2b2` stanza package, a pretrained model to detect 'PROBLEM', 'TEST', 'TREATMENT' entities. - -```{code-cell} ipython3 -:tags: [remove-output] - +:::{code} # import spacy related modules import stanza import spacy_stanza @@ -61,21 +48,20 @@ import spacy_stanza # stanza creates a nlp object in disk # download and initialize the i2b2 pipeline stanza.download('en', package='i2b2') -``` +::: -```{code-cell} ipython3 -:tags: [remove-output] +:::{code} # Define the nlp object nlp_spacy = spacy_stanza.load_pipeline('en', package='mimic', processors={'ner': 'i2b2'}) -``` +::: -### Define a medkit operation to add the entities +### Define a `medkit` operation to add the entities Medkit has the {class}`~.text.spacy.SpacyPipeline` operation, an operation that can wrap a nlp spacy object to annotate segments. A nlp object may create many spacy annotations, you can select the spacy entities, spans and attributes that will be converted to medkit annotations. By default, all are converted into medkit annotations. -```{code-cell} ipython3 +:::{code} from medkit.text.spacy import SpacyPipeline # Defines the medkit operation @@ -87,55 +73,53 @@ entities = medkit_stanza_matcher.run([medkit_doc.raw_segment]) # Add entities to the medkit document for ent in entities: medkit_doc.anns.add(ent) -``` +::: -```{code-cell} ipython3 +:::{code} print(medkit_doc.anns.get_entities()[0]) -``` +::: -That's all! We have detected entities using the biomedical model developed by the Stanford group. +We have detected entities using the biomedical model developed by the Stanford group. Let's visualize all the detected entities. -```{code-cell} ipython3 +:::{code} from spacy import displacy from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy -``` - +::: -```{code-cell} ipython3 +:::{code} # Add some colors options_displacy = dict(colors={'TREATMENT': "#85C1E9", "PROBLEM": "#cfe2f3"}) # Format the medkit doc to displacy displacy_data = medkit_doc_to_displacy(medkit_doc) displacy.render(displacy_data,style="ent",manual=True, options=options_displacy) -``` +::: - -## Annotating documents with spacy +## Annotating documents with `spaCy` Here, we already have an annotated document. We will see how to use spacy to enrich existing annotations. -Exploring the spacy universe, we found [**negspaCy**](https://spacy.io/universe/project/negspacy), a pipeline that detects negation in spacy entities. Using the 'SpacyDoc' class, we can annotate the entities of the document and add those attributes directly. +Exploring the spacy universe, we found [**negspaCy**](https://spacy.io/universe/project/negspacy), +a pipeline that detects negation in spacy entities. +Using the 'SpacyDoc' class, we can annotate the entities of the document and add those attributes directly. 
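+The two wrappers differ mainly in their calling convention. As a sketch, reusing
+`medkit_stanza_matcher` from the previous section and the `negation_detector`
+defined below:
+
+:::{code}
+# segment-level wrapper: returns new annotations that we attach ourselves
+entities = medkit_stanza_matcher.run([medkit_doc.raw_segment])
+for ent in entities:
+    medkit_doc.anns.add(ent)
+
+# document-level wrapper: annotates the given documents in place
+negation_detector.run([medkit_doc])
+:::
+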
-### Prepare the negspacy nlp object: +### Prepare the `negspacy` nlp object -```{code-cell} ipython3 -:tags: [remove-output] -# install negspacy +:::{code} +# install negspacy !python -m pip install negspacy -``` +::: -```{code-cell} ipython3 -:tags: [remove-output] +:::{code} # download english model from spacy import spacy if not spacy.util.is_package("en_core_web_sm"): spacy.cli.download("en_core_web_sm") -``` +::: -```{code-cell} ipython3 +:::{code} # Import spacy nlp object from negspacy from negspacy.negation import Negex @@ -145,16 +129,18 @@ nlp_spacy_negex = spacy.load("en_core_web_sm",disable=["ner"]) # Disable NER by # Config to detect negation in the i2b2 entities i2b2_labels = ["PROBLEM","TEST","TREATMENT"] nlp_spacy_negex.add_pipe("negex", config={"ent_types":i2b2_labels}) -``` - +::: -### Define a medkit operation to add the attributes +### Define a `medkit` operation to add the attributes -Medkit has the {class}`~.text.spacy.SpacyDocPipeline` operation, an operation that can wrap a nlp spacy object to annotate documents. +`medkit` has the {class}`~.text.spacy.SpacyDocPipeline` operation, +an operation that can wrap a nlp spacy object to annotate documents. -The point is to add attributes to the entities, so we select the entities of interest and do not transfer their current attributes, as they are not needed to detect the negation. +The point is to add attributes to the entities, +so we select the entities of interest and do not transfer their current attributes, +as they are not needed to detect the negation. -```{code-cell} ipython3 +:::{code} from medkit.text.spacy import SpacyDocPipeline # Define the spacy wrapper @@ -167,20 +153,20 @@ negation_detector = SpacyDocPipeline( # The docPipeline automatically adds annotations to the document # it is not necessary to add annotations as in the case of `medkit_stanza_matcher` negation_detector.run([medkit_doc]) -``` +::: Let's see if the negation has been detected in the entities. - -```{code-cell} ipython3 +:::{code} print(medkit_doc.anns.get_entities()[0]) -``` +::: -As we can see, the entity now has an attribute called **negex** with `value=false`. Which means that the entity is not part of a negation. +As we can see, the entity now has an attribute called _negex_ with `value=false`, +which means that the entity is not part of a negation. Let's find the negated entities: -```{code-cell} ipython3 +:::{code} print("The following entities are negated: \n\n") for entity in medkit_doc.anns.get_entities(): # Get the negex attr @@ -189,12 +175,11 @@ for entity in medkit_doc.anns.get_entities(): # If the attr exists and is positive, show a message. if len(attrs) > 0 and attrs[0].value: print(entity.label,entity.text,entity.spans) -``` +::: -We can show the attribute value using displacy with more information in the labels +We can show the attribute value using `displacy` with more information in the labels. - -```{code-cell} ipython3 +:::{code} # enrich entity labels with [NEG] suffix def format_entity(entity): label = entity.label @@ -208,7 +193,7 @@ options_displacy = dict(colors={'TREATMENT [NEG]': "#D28E98", "PROBLEM [NEG]": " # Format the medkit doc to displacy with a entity formatter displacy_data = medkit_doc_to_displacy(medkit_doc,entity_formatter=format_entity) displacy.render(displacy_data,style="ent",manual=True, options=options_displacy) -``` - -For more information about advanced usage of spacy related operations, you may refer to the API doc of {mod}`medkit.text.spacy`. 
+::: +For more information about advanced usage of spacy related operations, +you may refer to the API doc of {mod}`medkit.text.spacy`. From cb9e2f818528ebd5fdf6c03d1cf9c1560c394a0a Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 16:26:24 +0100 Subject: [PATCH 10/21] DOC: Update spans example --- docs/examples/spans.md | 61 +++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/docs/examples/spans.md b/docs/examples/spans.md index 6156080f..18d4fcd8 100644 --- a/docs/examples/spans.md +++ b/docs/examples/spans.md @@ -1,26 +1,13 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Text spans Here are some examples about usage of span utilities. -```{code-cell} ipython3 +:::{code} from medkit.core.text.span import Span from medkit.core.text.span_utils import replace, remove, move, extract, insert -``` +::: -```{code-cell} ipython3 +:::{code} raw_text = ( "Cher M. Dupond,\nJ’ai vu en consultation (à mon cabinet le 2019-02-01) " "Bertrand AGITE, né le 2008-02-25," @@ -28,9 +15,9 @@ raw_text = ( ) text = raw_text spans = [Span(0, len(raw_text))] -``` +::: -```{code-cell} ipython3 +:::{code} import re # replace "M." by "M @@ -39,9 +26,9 @@ match = re.search(r"M.", text, re.M) text, spans = replace(text, spans, [match.span()], ["M"]) print(text) print(spans) -``` +::: -```{code-cell} ipython3 +:::{code} # remove final endline match = re.search(r"\n\Z", text, re.M) text, spans = remove(text, spans, [match.span()]) @@ -50,9 +37,9 @@ text, spans = remove(text, spans, [match.span()]) ranges = [m.span() for m in re.finditer(r"\n+", text, re.M)] text, spans = replace(text, spans, ranges, [" "] * len(ranges)) print(text) -``` +::: -```{code-cell} ipython3 +:::{code} # extract sentences sentences = [] for match in re.finditer(r"[^\.]+\.", text, re.M): @@ -63,17 +50,17 @@ text_1, spans_1 = sentences[0] text_2, spans_2 = sentences[1] print(text_1) print(text_2) -``` +::: -```{code-cell} ipython3 +:::{code} # move parenthesized text to end in 1st sentence match = re.search(r" *\((.*)\)", text_1, re.M) text_1, spans_1 = insert(text_1, spans_1, [len(text_1) - 1], [" ; "]) text_1, spans_1 = move(text_1, spans_1, match.span(1), len(text_1) - 1) print(text_1) -``` +::: -```{code-cell} ipython3 +:::{code} # reformat dates in 1st sentence matches = list(re.finditer(r"\d{4}-\d{2}-\d{2}", text_1, re.M)) ranges = [m.span() for m in matches] @@ -83,32 +70,32 @@ new_dates = [ ] text_1, spans_1 = replace(text_1, spans_1, ranges, new_dates) print(text_1) -``` +::: -```{code-cell} ipython3 +:::{code} # replace "(-)" by "negatif" in 2d sentence match = re.search(r"\(-\)", text_2, re.M) text_2, spans_2 = replace(text_2, spans_2, [match.span()], ["negatif"]) print(text_2) -``` +::: -```{code-cell} ipython3 +:::{code} # find person entity in 1st sentence match = re.search(r"M [a-zA-Z]+", text_1) person_text, person_spans = extract( text_1, spans_1, [match.span()] ) -``` +::: -```{code-cell} ipython3 +:::{code} # find date entities in 1st sentence dates = [] for match in re.finditer(r"\d{2}/\d{2}/\d{4}", text_1): date_text, date_spans = extract(text_1, spans_1, [match.span()]) dates.append((date_text, date_spans)) -``` +::: -```{code-cell} ipython3 +:::{code} from medkit.core.text.span_utils import normalize_spans entities = [] @@ -119,9 +106,9 @@ for _, date_spans in dates: date_spans 
= normalize_spans(date_spans) entities.append(("date", date_spans)) print(entities) -``` +::: -```{code-cell} ipython3 +:::{code} from spacy import displacy entities_data = [ @@ -132,4 +119,4 @@ entities_data = [ entities_data = sorted(entities_data, key=lambda e: e["start"]) data = {"text": raw_text, "ents": entities_data, "uuid": 0} displacy.render(data, manual=True, style="ent", jupyter=True, minify=True) -``` +::: From 68209dc146cf023e3cadb43fb44704aaf590dee6 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 16:36:52 +0100 Subject: [PATCH 11/21] DOC: Fixup toc tree for examples --- docs/index.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index ea0f1df6..724e4039 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,13 +42,8 @@ titlesonly: examples/spans examples/cleaning_text examples/text_segmentation/index -examples/text_segmentation/section -examples/text_segmentation/syntagma -examples/text_segmentation/document examples/brat_io examples/spacy/index -examples/spacy/spacy_io -examples/spacy/spacy_pipeline examples/custom_text_operation examples/edsnlp examples/iamsystem From 99a0236798f69cf82386614d6688deaa4cd85e5d Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Wed, 28 Feb 2024 16:41:11 +0100 Subject: [PATCH 12/21] DOC: Move text segmentation to tutorial --- docs/index.md | 2 +- docs/{examples => tutorial}/text_segmentation/document.md | 0 docs/{examples => tutorial}/text_segmentation/index.md | 2 +- docs/{examples => tutorial}/text_segmentation/section.md | 0 docs/{examples => tutorial}/text_segmentation/syntagma.md | 0 5 files changed, 2 insertions(+), 2 deletions(-) rename docs/{examples => tutorial}/text_segmentation/document.md (100%) rename docs/{examples => tutorial}/text_segmentation/index.md (84%) rename docs/{examples => tutorial}/text_segmentation/section.md (100%) rename docs/{examples => tutorial}/text_segmentation/syntagma.md (100%) diff --git a/docs/index.md b/docs/index.md index 724e4039..8c5daa51 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,7 +41,6 @@ titlesonly: --- examples/spans examples/cleaning_text -examples/text_segmentation/index examples/brat_io examples/spacy/index examples/custom_text_operation @@ -62,6 +61,7 @@ titlesonly: --- tutorial/context_detection tutorial/entity_matching +tutorial/text_segmentation/index ::: :::{toctree} diff --git a/docs/examples/text_segmentation/document.md b/docs/tutorial/text_segmentation/document.md similarity index 100% rename from docs/examples/text_segmentation/document.md rename to docs/tutorial/text_segmentation/document.md diff --git a/docs/examples/text_segmentation/index.md b/docs/tutorial/text_segmentation/index.md similarity index 84% rename from docs/examples/text_segmentation/index.md rename to docs/tutorial/text_segmentation/index.md index aedb27b9..79861138 100644 --- a/docs/examples/text_segmentation/index.md +++ b/docs/tutorial/text_segmentation/index.md @@ -1,4 +1,4 @@ -# Text segmentation examples +# Text segmentation :::{note} You may refer to the [text segmentation section](api:text:segmentation_modules) for more information. 
diff --git a/docs/examples/text_segmentation/section.md b/docs/tutorial/text_segmentation/section.md similarity index 100% rename from docs/examples/text_segmentation/section.md rename to docs/tutorial/text_segmentation/section.md diff --git a/docs/examples/text_segmentation/syntagma.md b/docs/tutorial/text_segmentation/syntagma.md similarity index 100% rename from docs/examples/text_segmentation/syntagma.md rename to docs/tutorial/text_segmentation/syntagma.md From 9c13dd6f8a61eb41893ecdf71a5216c1c7330a55 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 4 Mar 2024 17:22:52 +0100 Subject: [PATCH 13/21] DOC: Rework index page, add license section --- docs/changelog.md | 4 +-- docs/index.md | 63 ++++++++++++++++++++++------------------------- docs/license.md | 5 ++++ 3 files changed, 37 insertions(+), 35 deletions(-) create mode 100644 docs/license.md diff --git a/docs/changelog.md b/docs/changelog.md index b48597b7..66efc0fe 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,2 +1,2 @@ -:::{include} ../CHANGELOG.md -::: \ No newline at end of file +```{include} ../CHANGELOG.md +``` diff --git a/docs/index.md b/docs/index.md index 8c5daa51..3d523845 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,10 +1,10 @@ -# medkit +# Overview `medkit` is a Python library which facilitates **extraction of features** from various modalities of patient data, including text and audio for now -- relational, image, genetic, and others will follow soon. -To this end, medkit enables composition of pipelines with multiple modules, +To this end, `medkit` enables composition of pipelines with multiple modules, developed by us, yourself or others. `medkit` places a strong emphasis on **non-destructive operations**, @@ -20,25 +20,33 @@ Some public interfaces may change in the future. Please check the **BREAKING CHANGES** section of the project's changelog for details. 
::: -:::{toctree} ---- -caption: User Guide -hidden: -titlesonly: ---- +```{toctree} +:caption: User Guide +:hidden: +:titlesonly: + user_guide/install user_guide/first_steps user_guide/pipeline user_guide/provenance user_guide/module -::: +``` + +```{toctree} +:caption: Tutorial +:hidden: +:titlesonly: + +tutorial/context_detection +tutorial/entity_matching +tutorial/text_segmentation/index +``` + +```{toctree} +:caption: Cookbook +:hidden: +:titlesonly: -:::{toctree} ---- -caption: Examples -hidden: -titlesonly: ---- examples/spans examples/cleaning_text examples/brat_io @@ -51,25 +59,13 @@ examples/detecting_text_duplicates examples/audio_transcription examples/audio_dataset_metrics examples/ontotox -::: +``` -:::{toctree} ---- -caption: Tutorial -hidden: -titlesonly: ---- -tutorial/context_detection -tutorial/entity_matching -tutorial/text_segmentation/index -::: +```{toctree} +:caption: Reference +:hidden: +:titlesonly: -:::{toctree} ---- -caption: Reference -hidden: -titlesonly: ---- api/_generated/index api/audio api/core @@ -80,4 +76,5 @@ api/text api/training api/tools changelog -::: +license +``` diff --git a/docs/license.md b/docs/license.md new file mode 100644 index 00000000..22567b68 --- /dev/null +++ b/docs/license.md @@ -0,0 +1,5 @@ +# License + +```{literalinclude} ../LICENSE +:language: text +``` From 4a11195c93f09e8d61e408078c10742e39fa132e Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 4 Mar 2024 18:00:26 +0100 Subject: [PATCH 14/21] DOC: Add overview explaining all 4 types of documentation --- docs/conf.py | 1 + docs/index.md | 41 ++++++++++++++++++++++++++++++++++++++--- pyproject.toml | 1 + 3 files changed, 40 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 55923735..82b2c636 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,6 +18,7 @@ "myst_parser", "numpydoc", "sphinxcontrib.mermaid", + "sphinx_design", ] templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/docs/index.md b/docs/index.md index 3d523845..2495ec5c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,16 +4,51 @@ from various modalities of patient data, including text and audio for now -- relational, image, genetic, and others will follow soon. -To this end, `medkit` enables composition of pipelines with multiple modules, -developed by us, yourself or others. - `medkit` places a strong emphasis on **non-destructive operations**, i.e. no loss of information when passing data from a module to another, and a flexible tracing of **data provenance**. +It enables composition of pipelines with multiple modules, +developed by the _HeKA Research Team_, contributors, and eventually yourself. `medkit` aims at accelerating the development of a learning health system, with a strong dedication to open-source and community development. +::::{grid} 2 +:gutter: 2 + +:::{grid-item-card} User Guide + +To get started with `medkit` + ++++ +[Learn more »](user_guide/first_steps) +::: + +:::{grid-item-card} Tutorial + +To walk through `medkit` features + ++++ +[Learn more »](tutorial/entity_matching) +::: + +:::{grid-item-card} Cookbook + +To learn `medkit` by examples + ++++ +[Learn more »](examples/spans) +::: + +:::{grid-item-card} Reference + +For developers and contributors + ++++ +[Learn more »](api/core) +::: +:::: + :::{warning} The `medkit` core library is still under heavy development and testing. Some public interfaces may change in the future. 
diff --git a/pyproject.toml b/pyproject.toml index d78876fc..c612ebef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -211,6 +211,7 @@ dependencies = [ "sphinx-autoapi", "sphinx-autobuild", "sphinx-book-theme", + "sphinx-design", "sphinxcontrib-mermaid", ] From 41b55b20e80fcf4e5a946185e340300f497b9e2e Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Tue, 5 Mar 2024 10:49:31 +0100 Subject: [PATCH 15/21] DOC: Prettify captions with icons --- docs/index.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/index.md b/docs/index.md index 2495ec5c..94f75992 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,7 +16,7 @@ with a strong dedication to open-source and community development. ::::{grid} 2 :gutter: 2 -:::{grid-item-card} User Guide +:::{grid-item-card} {octicon}`people;2em;sd-mr-1` User Guide To get started with `medkit` @@ -24,7 +24,7 @@ To get started with `medkit` [Learn more »](user_guide/first_steps) ::: -:::{grid-item-card} Tutorial +:::{grid-item-card} {octicon}`rocket;2em;sd-mr-1` Tutorial To walk through `medkit` features @@ -32,7 +32,7 @@ To walk through `medkit` features [Learn more »](tutorial/entity_matching) ::: -:::{grid-item-card} Cookbook +:::{grid-item-card} {octicon}`book;2em;sd-mr-1` Cookbook To learn `medkit` by examples @@ -40,7 +40,7 @@ To learn `medkit` by examples [Learn more »](examples/spans) ::: -:::{grid-item-card} Reference +:::{grid-item-card} {octicon}`search;2em;sd-mr-1` Reference For developers and contributors @@ -56,7 +56,7 @@ Please check the **BREAKING CHANGES** section of the project's changelog for det ::: ```{toctree} -:caption: User Guide +:caption: 👥 User Guide :hidden: :titlesonly: @@ -68,7 +68,7 @@ user_guide/module ``` ```{toctree} -:caption: Tutorial +:caption: 🚀 Tutorial :hidden: :titlesonly: @@ -78,7 +78,7 @@ tutorial/text_segmentation/index ``` ```{toctree} -:caption: Cookbook +:caption: 📖 Cookbook :hidden: :titlesonly: @@ -97,7 +97,7 @@ examples/ontotox ``` ```{toctree} -:caption: Reference +:caption: 🔍 Reference :hidden: :titlesonly: From 8ed0ce07a5fadf025b7a4087e908bd8aec453f9a Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 11 Mar 2024 10:13:07 +0100 Subject: [PATCH 16/21] MAINT: Add sphinx-design to docs dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index c612ebef..e24c9c44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -166,6 +166,7 @@ docs = [ "sphinx-autoapi", "sphinx-autobuild", "sphinx-book-theme", + "sphinx-design", "sphinxcontrib-mermaid", ] From 18a286966496d4f5729ff16e2cffe29ab5c45e55 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 11 Mar 2024 10:33:59 +0100 Subject: [PATCH 17/21] MAINT: Update URL for changelog --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e24c9c44..3e52b18b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,7 +171,7 @@ docs = [ ] [project.urls] -Changelog = "https://github.com/medkit-lib/medkit/blob/main/CHANGELOG.md" +Changelog = "https://medkit.readthedocs.io/changelog.html" Documentation = "https://medkit.readthedocs.io" Issues = "https://github.com/medkit-lib/medkit/issues" Source = "https://github.com/medkit-lib/medkit" From b445276caa90302398ed8692248e726e2855456e Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 11 Mar 2024 13:40:58 +0100 Subject: [PATCH 18/21] DOC: Fix Sphinx errors and warnings --- medkit/io/medkit_json/_common.py | 11 +++++++++++ 
medkit/text/ner/edsnlp_tnm_matcher.py | 2 +- medkit/tools/e3c_corpus.py | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/medkit/io/medkit_json/_common.py b/medkit/io/medkit_json/_common.py index d4c8016c..48565876 100644 --- a/medkit/io/medkit_json/_common.py +++ b/medkit/io/medkit_json/_common.py @@ -14,6 +14,17 @@ class ContentType(enum.Enum): + """Type of content + + Attributes + ---------- + TEXT_DOCUMENT : Text document + TEXT_DOCUMENT_LIST : List of text documents + TEXT_ANNOTATION_LIST : List of text annotations + AUDIO_DOCUMENT : Audio document + AUDIO_DOCUMENT_LIST : List of audio documents + AUDIO_ANNOTATION_LIST : List of audio annotations + """ TEXT_DOCUMENT = "text_document" TEXT_DOCUMENT_LIST = "text_document_list" TEXT_ANNOTATION_LIST = "text_annotation_list" diff --git a/medkit/text/ner/edsnlp_tnm_matcher.py b/medkit/text/ner/edsnlp_tnm_matcher.py index 75bd0d10..ec65d641 100644 --- a/medkit/text/ner/edsnlp_tnm_matcher.py +++ b/medkit/text/ner/edsnlp_tnm_matcher.py @@ -13,7 +13,7 @@ class EDSNLPTNMMatcher(NEROperation): - """TNM (Tumour/Node/Metastasis) string matcher based on the `_EDS-NLP TNM`_ pipeline. + """TNM (Tumour/Node/Metastasis) string matcher based on the `EDS-NLP TNM`_ pipeline. For each TNM string that is found, an entity will be created with an :class:`~medkit.text.ner.TNMAttribute` attribute attached to it containing diff --git a/medkit/tools/e3c_corpus.py b/medkit/tools/e3c_corpus.py index 1292b59a..4893835f 100644 --- a/medkit/tools/e3c_corpus.py +++ b/medkit/tools/e3c_corpus.py @@ -2,7 +2,7 @@ Notes ----- -The `E3C corpus `_ is released under a +The `E3C corpus `_ [1]_ [2]_ is released under a Creative Commons NonCommercial license (CC-BY-NC). From efc40b85e17c067aa1f2f79b7c7a5328ea577eb9 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 11 Mar 2024 14:34:18 +0100 Subject: [PATCH 19/21] DOC: Fix more warnings --- medkit/audio/transcription/__init__.py | 16 ++++++++++------ medkit/core/text/umls_norm_attribute.py | 2 +- medkit/io/medkit_json/_common.py | 3 ++- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/medkit/audio/transcription/__init__.py b/medkit/audio/transcription/__init__.py index 2dcdf492..bb5d43ea 100644 --- a/medkit/audio/transcription/__init__.py +++ b/medkit/audio/transcription/__init__.py @@ -1,15 +1,19 @@ +from medkit.audio.transcription.doc_transcriber import DocTranscriber, TranscriptionOperation +from medkit.audio.transcription.transcribed_text_document import TranscribedTextDocument +from medkit.core.utils import modules_are_available + __all__ = [ "DocTranscriber", "TranscriptionOperation", "TranscribedTextDocument", ] -from medkit.audio.transcription.doc_transcriber import DocTranscriber, TranscriptionOperation -from medkit.audio.transcription.transcribed_text_document import TranscribedTextDocument -from medkit.core.utils import modules_are_available - if modules_are_available(["torchaudio", "transformers"]): - __all__ += ["hf_transcriber"] + from medkit.audio.transcription.hf_transcriber import HFTranscriber + + __all__ += ["HFTranscriber"] if modules_are_available(["torch", "speechbrain"]): - __all__ += ["sb_transcriber"] + from medkit.audio.transcription.sb_transcriber import SBTranscriber + + __all__ += ["SBTranscriber"] diff --git a/medkit/core/text/umls_norm_attribute.py b/medkit/core/text/umls_norm_attribute.py index a19175ea..05b426f6 100644 --- a/medkit/core/text/umls_norm_attribute.py +++ b/medkit/core/text/umls_norm_attribute.py @@ -8,7 +8,7 @@ from typing_extensions import 
Self from medkit.core import dict_conv -from medkit.core.text import EntityNormAttribute +from medkit.core.text.entity_norm_attribute import EntityNormAttribute @dataclasses.dataclass(init=False) diff --git a/medkit/io/medkit_json/_common.py b/medkit/io/medkit_json/_common.py index 48565876..fde9cc21 100644 --- a/medkit/io/medkit_json/_common.py +++ b/medkit/io/medkit_json/_common.py @@ -14,7 +14,7 @@ class ContentType(enum.Enum): - """Type of content + """Type of content. Attributes ---------- @@ -25,6 +25,7 @@ class ContentType(enum.Enum): AUDIO_DOCUMENT_LIST : List of audio documents AUDIO_ANNOTATION_LIST : List of audio annotations """ + TEXT_DOCUMENT = "text_document" TEXT_DOCUMENT_LIST = "text_document_list" TEXT_ANNOTATION_LIST = "text_annotation_list" From f293de83de6c5dff2f7ebf085b1ef6a52a9fdb08 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Thu, 14 Mar 2024 14:39:24 +0100 Subject: [PATCH 20/21] DOC: Disable navigation with keys explicitly --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index 82b2c636..cd991c08 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -43,6 +43,7 @@ "path_to_docs": "docs", "repository_url": "https://github.com/medkit-lib/medkit", "repository_branch": "main", + "navigation_with_keys": False, } html_title = "medkit documentation" html_logo = "_static/medkit-logo.png" From c7c5a24de6eda27e3716fce2f5bb79b76b002f34 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Thu, 14 Mar 2024 18:52:06 +0100 Subject: [PATCH 21/21] DOC: Fix warnings on invalid directives --- docs/examples/audio_dataset_metrics.md | 42 +++------- docs/examples/audio_transcription.md | 35 ++------ docs/examples/brat_io.md | 31 +++----- docs/examples/cleaning_text.md | 32 +++----- docs/examples/custom_text_operation.md | 52 +++++------- docs/examples/detecting_text_duplicates.md | 38 +++------ docs/examples/edsnlp.md | 48 ++++------- docs/examples/finetuning_hf_model.md | 92 +++++++++------------- docs/examples/iamsystem.md | 26 ++---- docs/examples/ontotox.md | 47 ++++++----- docs/examples/spans.md | 42 +++++----- 11 files changed, 169 insertions(+), 316 deletions(-) diff --git a/docs/examples/audio_dataset_metrics.md b/docs/examples/audio_dataset_metrics.md index 28aa931d..8ac3f4a9 100644 --- a/docs/examples/audio_dataset_metrics.md +++ b/docs/examples/audio_dataset_metrics.md @@ -1,17 +1,3 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Computing metrics on an audio dataset This demo shows how to compute diarization and transcription metrics on an audio @@ -19,10 +5,8 @@ dataset such as [simsamu](https://huggingface.co/datasets/medkit/simsamu) Download the dataset from the HuggingFace hub: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python import huggingface_hub as hf_hub -from medkit.io import SRTInputConverter simsamu_dir = hf_hub.snapshot_download("medkit/simsamu", repo_type="dataset") ``` @@ -31,8 +15,7 @@ Load the `.m4a` audio files into audio documents, as well as reference diarization and transcription annotated documents from corresponding `.rttm` and `.srt` files: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from pathlib import Path from medkit.core.audio import AudioDocument from medkit.io.rttm import RTTMInputConverter @@ -58,8 +41,8 @@ for rec_dir in sorted(Path(simsamu_dir).glob("*"))[:4]: # 
convert m4a to wav with ffmpeg wav_file = m4a_file.with_suffix(".wav") - if not wav_file.exists(): - !ffmpeg -i {m4a_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file} + # if not wav_file.exists(): + # !ffmpeg -i {m4a_file} -acodec pcm_s16le -ac 1 -ar 16000 {wav_file} # load empty audio doc doc = AudioDocument.from_file(wav_file) @@ -74,8 +57,7 @@ for rec_dir in sorted(Path(simsamu_dir).glob("*"))[:4]: Initialize the diarization operation with the [simsamu-diarization pipeline](https://huggingface.co/medkit/simsamu-diarization) -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python import torch from medkit.audio.segmentation.pa_speaker_detector import PASpeakerDetector @@ -94,8 +76,7 @@ speaker_detector = PASpeakerDetector( Initialize the transcription operation with the [simsamu-transcription model](https://huggingface.co/medkit/simsamu-transcription): -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.audio.transcription.sb_transcriber import SBTranscriber transcriber = SBTranscriber( @@ -109,8 +90,7 @@ transcriber = SBTranscriber( Diarize and transcribe all documents: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from tqdm import tqdm # list of list of segments, per document @@ -125,8 +105,7 @@ for doc in tqdm(docs): Compute the DER (Diarization Error Rate): -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.audio.metrics.diarization import DiarizationEvaluator diarization_evaluator = DiarizationEvaluator( @@ -145,8 +124,7 @@ der=13.45% Compute the WER (Word Error Rate) and CER (Character Error Rate): -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.audio.metrics.transcription import TranscriptionEvaluator transcription_evaluator = TranscriptionEvaluator( @@ -164,4 +142,4 @@ wer=20.77%, cer=15.13% Note that running the transcription operation on the reference speech turns rather than those returned by the diarization operation will give lower WER and -CER values (around 15% and 9%). \ No newline at end of file +CER values (around 15% and 9%). diff --git a/docs/examples/audio_transcription.md b/docs/examples/audio_transcription.md index f5e610e7..8dd9e2f7 100644 --- a/docs/examples/audio_transcription.md +++ b/docs/examples/audio_transcription.md @@ -1,17 +1,3 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.4 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Audio transcription This demo shows how to transcribe an audio document and then perform text @@ -22,7 +8,7 @@ operations on it. 
Instantiate an {class}`~.core.audio.AudioDocument` with a {class}`~.core.audio.FileAudioBuffer`: -```{code-cell} ipython3 +```{code} python from pathlib import Path import IPython.display from medkit.core.audio import AudioDocument, FileAudioBuffer @@ -41,7 +27,7 @@ Prepare pipeline to perform voice detection on audio documents, using a also use other segmentation operations such as {class}`~.audio.segmentation.pa_speaker_detector.PASpeakerDetector` ): -```{code-cell} ipython3 +```{code} python from medkit.core import Pipeline, PipelineStep, DocPipeline from medkit.audio.preprocessing import Downmixer from medkit.audio.segmentation.webrtc_voice_detector import WebRTCVoiceDetector @@ -74,7 +60,7 @@ audio_doc_pipeline = DocPipeline(audio_pipeline) Run voice detection on audio document: -```{code-cell} ipython3 +```{code} python audio_doc_pipeline.run([audio_doc]) for seg in audio_doc.anns.get(label="voice"): print(f"label={seg.label}, span={seg.span}") @@ -89,8 +75,7 @@ transcriber creating text segments from audio segments (you can also use other transcription operations such as {class}`~.audio.transcription.sb_transcriber.SBTranscriber`): -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.audio.transcription import DocTranscriber from medkit.audio.transcription.hf_transcriber import HFTranscriber @@ -109,8 +94,7 @@ doc_transcriber = DocTranscriber( Transcribe audio document: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python transcribed_doc = doc_transcriber.run([audio_doc])[0] print(f"fulltext={transcribed_doc.text!r}", end="\n\n") for seg in transcribed_doc.anns.get(label="transcription"): @@ -128,8 +112,7 @@ label=transcription, text=' I also have high blood pressure.' Run text entity matching on transcribed document: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.core.text import TextDocument from medkit.text.ner import RegexpMatcher, RegexpMatcherRule @@ -152,8 +135,7 @@ text_doc_pipeline.run([transcribed_doc]) Locate matched entities in original audio: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python entities = transcribed_doc.anns.get_entities() for entity in entities: @@ -165,8 +147,7 @@ for entity in entities: IPython.display.display(IPython.display.Audio(data=audio.read(), rate=audio.sample_rate)) ``` -```{code-cell} ipython3 -:tags: [remove-input] +```{code} python # hardcoded display of audio spans to workaround # the fact that cells are not executed print("label=problem, text='headaches'") diff --git a/docs/examples/brat_io.md b/docs/examples/brat_io.md index 0597563f..4f2df1af 100644 --- a/docs/examples/brat_io.md +++ b/docs/examples/brat_io.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.0 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Brat integration +++ @@ -28,7 +15,7 @@ In this example, we will show how to import Brat annotated files into medkit and Consider this text file: +++ -```{code-cell} ipython3 +```{code} python # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/examples/input/brat/doc_01.txt @@ -41,7 +28,7 @@ print(Path("./input/brat/doc_01.txt").read_text(encoding="utf-8")) It has the following brat annotation file: -```{code-cell} ipython3 +```{code} python # You can download the file available in source code # !wget 
https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/examples/input/brat/doc_01.ann @@ -52,11 +39,11 @@ print(Path("./input/brat/doc_01.ann").read_text(encoding="utf-8")) To load Brat Files, medkit provides the {class}`~medkit.io.brat.BratInputConverter` class. This converter returns a list of `TextDocument`. -```{tip} +:::{tip} You can enable provenance tracing by assigning a {class}`~medkit.core.ProvTracer` object to the BratInputConverter with the `set_prov_tracer()` method. -``` +::: -```{code-cell} ipython3 +```{code} python from medkit.io.brat import BratInputConverter # Define Input Converter @@ -78,7 +65,7 @@ print(f"Where {len(entities_disease)} annotations have 'disease' as label") The created document contains the annotations defined in the brat annotation file. We can show the entities information, for example. -```{code-cell} ipython3 +```{code} python for entity in medkit_doc.anns.get_entities(): print(f"label={entity.label}, spans={entity.spans}, text={entity.text!r}") ``` @@ -99,7 +86,7 @@ If you also want to include the segments in the brat collection, the parameter ` To facilitate integration and ensure correct visualisation, medkit automatically generates an `annotation.conf` for each collection. +++ -```{code-cell} ipython3 +```{code} python from medkit.io.brat import BratOutputConverter # Define Output Converter with default params, @@ -118,9 +105,9 @@ The collection is saved on disk including the following files: By default the name is the `document_id`, you can change it using the `doc_names` parameter. -```{note} +:::{note} Since the values of the attributes in brat must be defined in the configuration, medkit shows the top50 for each attribute. In case you want to show more values in the configuration, you can change `top_values_by_attr` in the brat output converter. - ``` +::: :::{seealso} cf. [Brat IO module](api:io:brat). diff --git a/docs/examples/cleaning_text.md b/docs/examples/cleaning_text.md index ba85491b..b6d4976b 100644 --- a/docs/examples/cleaning_text.md +++ b/docs/examples/cleaning_text.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.0 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Cleaning text with a predefined operation +++ @@ -23,7 +10,7 @@ In this example, we will use a predefined {class}`~medkit.text.preprocessing.eds Consider the following document: -```{code-cell} ipython3 +```{code} python # You can download the file available in source code # !wget https://raw.githubusercontent.com/medkit-lib/medkit/main/docs/examples/input/text/text_to_clean.txt @@ -52,7 +39,7 @@ The main idea is to transform the `raw_segment` and keep track of the modificati The `EDSCleaner` is configurable, we initialize `keep_endlines=True` to facilitate the visualization. Otherwise, the output segment would be a plain text with no newlines `(\n)` characters. -```{code-cell} ipython3 +```{code} python from medkit.text.preprocessing import EDSCleaner eds_cleaner = EDSCleaner(keep_endlines=True) @@ -69,16 +56,16 @@ The class works on `Segments`. In the `run` method it performs several operation * Deletes multiple newline characters. * Deletes multiple whitespaces. -```{note} +:::{note} There are two special operations that process parentheses and dots near French keywords such as Dr., Mme. and others. To enable/disable these operations you can use `handle_parentheses_eds` and `handle_points_eds`. 
-``` +::: ## Extract text from the clean text Now that we have a **clean segment**, we can run an operation on the new segment. We can detect the sentences, for example. -```{code-cell} ipython3 +```{code} python from medkit.text.segmentation import SentenceTokenizer sentences = SentenceTokenizer().run([clean_segment]) @@ -91,7 +78,7 @@ for sent in sentences: The span of each generated sentence contains the modifications made by *eds_cleaner* object. Let's look at the second sentence: -```{code-cell} ipython3 +```{code} python sentence = sentences[1] print(f"text={sentence.text!r}") print("spans=\n","\n".join(f"{sp}" for sp in sentence.spans)) @@ -105,7 +92,7 @@ Since the sentence contains the information from the original spans, it will alw To get the original spans, we can use {func}`~medkit.core.text.span_utils.normalize_spans`. Next, we can extract the raw text using {func}`~medkit.core.text.span_utils.extract`. -```{code-cell} ipython3 +```{code} python from medkit.core.text.span_utils import normalize_spans, extract spans_sentence = normalize_spans(sentence.spans) @@ -116,8 +103,7 @@ print(f"- Sentence in the ORIGINAL version:\n \"{extracted_text}\"") That's how an operation transforms text and extracts information without losing the raw text. -```{seealso} +:::{seealso} For further information on the utilities used in this class, see {class}`~medkit.core.text.utils`. To see more examples of span operations [here](../examples/spans) - -``` +::: diff --git a/docs/examples/custom_text_operation.md b/docs/examples/custom_text_operation.md index 1fb09dcf..15a4f305 100644 --- a/docs/examples/custom_text_operation.md +++ b/docs/examples/custom_text_operation.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.7 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Creating a custom text operation If you want to initialize a custom text operation from a simple user-defined function, you can take a look to the following examples. @@ -26,7 +13,7 @@ In this example, Jane wants to detect some entities (problems) from a raw text. ### 1. Create medkit document -```{code-cell} ipython3 +```{code} python from medkit.core.text import TextDocument text = "The patient has asthma and is using ventoline. The patient has diabetes" @@ -38,7 +25,7 @@ doc = TextDocument(text=text) Jane would like to reuse a collegue's file containing a list of regular expression rules for detecting entities. To this purpose, she had to split text into sentences before using the `RegexpMatcher` component. -```{code-cell} ipython3 +```{code} python from medkit.text.segmentation import SentenceTokenizer sentence_tokenizer = SentenceTokenizer() @@ -46,12 +33,13 @@ sentence_tokenizer = SentenceTokenizer() In real life, Jane should load the rules from a path using this instruction: -``` +```{code} python regexp_rules = RegexpMatcher.load_rules(path_to_rules_file) ``` + But for this example, it is simpler for us to define this set of rules manually. 
-```{code-cell} ipython3 +```{code} python from medkit.text.ner import RegexpMatcher, RegexpMatcherRule regexp_rules = [ @@ -61,7 +49,7 @@ regexp_rules = [ ] ``` -```{code-cell} ipython3 +```{code} python regexp_matcher = RegexpMatcher(rules=regexp_rules) ``` @@ -71,7 +59,7 @@ As `RegexpMatcher` is based on her collegue's file, Jane would like to add a fil For that, she has to define her own filter function and use medkit tools to instantiate this custom operation. -```{code-cell} ipython3 +```{code} python from medkit.core.text import Entity def keep_entities_with_label_problem(entity): @@ -90,7 +78,7 @@ filter_operation = create_text_operation(function=keep_entities_with_label_probl ### 4. Construct and run the pipeline -```{code-cell} ipython3 +```{code} python from medkit.core import Pipeline, PipelineStep steps=[ @@ -118,7 +106,7 @@ There are 3 results. **IMPORTANT: the following code is only for demo purpose, all pipeline steps are executed, we just select what pipeline outputs** -```{code-cell} ipython3 +```{code} python pipeline = Pipeline( steps=steps, input_keys=["raw_text"], @@ -139,7 +127,7 @@ In this example, Jane wants to pre-process the text before detecting entities. ### 1. Create medkit document -```{code-cell} ipython3 +```{code} python from medkit.core.text import TextDocument text = """IRM : Lésion de la CPMI périphérique, @@ -153,7 +141,7 @@ doc = TextDocument(text=text) Jane wants to use a dictionary to convert all abbreviations into their long text. To make it, she may define a custom function and use medkit `span_utils` to preserve spans during text modifications. -```{code-cell} ipython3 +```{code} python import re from typing import Dict from medkit.core.text import Segment, span_utils @@ -206,7 +194,7 @@ After executing the operation on the document raw text, we can observe that the * a text with abbreviations replaced by their long text, * spans which is a mix of modified spans (for replaced parts of text) and original spans (for not replaced text). -```{code-cell} ipython3 +```{code} python segments = preprocessing_operation.run([doc.raw_segment]) for segment in segments: @@ -225,7 +213,7 @@ In this example, Jane wants to count detected UMLS cui on a set of documents. In this example, we use translated .uid documents. For more info, you may refer to {mod}`medkit.tools.mtsamples`. -```{code-cell} ipython3 +```{code} python from medkit.tools.mtsamples import load_mtsamples docs = load_mtsamples(nb_max=10) @@ -239,13 +227,13 @@ print(docs[0].text) Let's initialize same operations as above (i.e., sentence tokenizer, then regexp matcher with default rules) without the filter operation. -```{code-cell} ipython3 +```{code} python from medkit.text.segmentation import SentenceTokenizer sentence_tokenizer = SentenceTokenizer() ``` -```{code-cell} ipython3 +```{code} python from medkit.text.ner import RegexpMatcher regexp_matcher = RegexpMatcher() @@ -258,7 +246,7 @@ regexp_matcher = RegexpMatcher() The extraction function is defined with a label parameter for filtering entities. Our custom operation allows to retrieve only attributes from entity with `disorder` label. -```{code-cell} ipython3 +```{code} python import re from typing import List from medkit.core.text import Entity, UMLSNormAttribute @@ -281,7 +269,7 @@ attr_extraction_operation = create_text_operation( When running the pipeline on the set of documents, the output is a list of umls normalization attributes. 
-```{code-cell} ipython3 +```{code} python from medkit.core import Pipeline, PipelineStep steps=[ @@ -297,7 +285,7 @@ pipeline = Pipeline( ) ``` -```{code-cell} ipython3 +```{code} python attrs = pipeline.run([doc.raw_segment for doc in docs]) attrs[:5] ``` @@ -306,12 +294,12 @@ attrs[:5] Now, Jane can analyze the number of cuis detected on her set of documents. -```{code-cell} ipython3 +```{code} python import pandas as pd df = pd.DataFrame.from_records([attr.to_dict() for attr in attrs], columns=["cui", "umls_version"]) print(df) ``` -```{code-cell} ipython3 +```{code} python df.value_counts(subset="cui") ``` diff --git a/docs/examples/detecting_text_duplicates.md b/docs/examples/detecting_text_duplicates.md index 406296ad..d8c6baaf 100644 --- a/docs/examples/detecting_text_duplicates.md +++ b/docs/examples/detecting_text_duplicates.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.0 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Detecting text duplicates Medkit provides support for detecting duplicates (zones of identical text) @@ -22,7 +9,8 @@ library developed at the HEGP. No optional dependencies are required to use {class}`~.preprocessing.DuplicateFinder` but it may perform faster if the `ncls` package is installed: -``` + +```console pip install ncls ``` @@ -40,7 +28,7 @@ For the purpose of this tutorial, we have created 2 folders, each folder containing 2 text files regarding the same patient. The contents of one of the documents of the first patient were copy-pasted into the other document: -```{code-cell} +```{code} python from pathlib import Path main_dir = Path("data/duplicate_detection") @@ -48,7 +36,7 @@ file_1 = main_dir / "patient_1/a10320aa-2008_04_13.txt" print(file_1.read_text()) ``` -```{code-cell} +```{code} python file_2 = main_dir / "patient_1/f1d3e530-2008_04_14.txt" print(file_2.read_text()) ``` @@ -56,7 +44,7 @@ print(file_2.read_text()) Let's create a list of collections, with one collection per patient: -```{code-cell} +```{code} python from medkit.core import Collection from medkit.core.text import TextDocument @@ -74,7 +62,7 @@ for patient_subdir in sorted(main_dir.glob("*")): Let's now instantiate a duplicate finder and run in on our collections: -```{code-cell} +```{code} python from medkit.text.preprocessing import DuplicateFinder dup_finder = DuplicateFinder(output_label="duplicate") @@ -113,7 +101,7 @@ Let's rebuild our collection of text documents, adding a `"creation_date"` entry to the metadata of each doc (that we extract from the filename for the purpose of the example): -```{code-cell} +```{code} python collections = [] for patient_subdir in sorted(main_dir.glob("*")): docs = [] @@ -130,7 +118,7 @@ for patient_subdir in sorted(main_dir.glob("*")): and let's use that metadata when finding duplicates: -```{code-cell} +```{code} python # tell DuplicateFinder to use the "creation_date" metadata to order documents dup_finder = DuplicateFinder(output_label="duplicate", date_metadata_key="creation_date") dup_finder.run(collections) @@ -153,7 +141,7 @@ added to documents[^1]. 
Let's see an example of how to run a minimalistic NER pipeline on the non-duplicate zones of our documents: -```{code-cell} +```{code} python from medkit.core import DocPipeline, Pipeline, PipelineStep from medkit.text.segmentation import SentenceTokenizer from medkit.text.ner import RegexpMatcher, RegexpMatcherRule @@ -191,9 +179,7 @@ for collection in collections: Let's now visualize the annotations of the 2 documents of the first patient: -```{code-cell} ipython3 -:tags: [scroll-output] - +```{code} python from spacy import displacy from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy @@ -202,9 +188,7 @@ displacy_data = medkit_doc_to_displacy(doc_1) displacy.render(displacy_data, manual=True, style="ent") ``` -```{code-cell} ipython3 -:tags: [scroll-output] - +```{code} python doc_2 = collections[0].text_docs[1] displacy_data = medkit_doc_to_displacy(doc_2) displacy.render(displacy_data, manual=True, style="ent") diff --git a/docs/examples/edsnlp.md b/docs/examples/edsnlp.md index 6c01af05..28b05357 100644 --- a/docs/examples/edsnlp.md +++ b/docs/examples/edsnlp.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.0 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Using EDS-NLP with medkit [EDS-NLP](https://aphp.github.io/edsnlp/) provides a set of @@ -20,16 +7,16 @@ EDS-NLP within medkit is supported, as we will see. To follow this tutorial, you will need to install medkit spaCy support and EDS-NLP with -```{code-cell} ipython3 -:tags: [remove-output] +```console pip install 'medkit-lib[edsnlp]' ``` + ## Running an EDS-NLP spaCy pipeline on entire documents We will need a sample text document to annotate: -```{code-cell} ipython3 +```{code} python from medkit.core.text import TextDocument text = """COMPTE RENDU D'HOSPITALISATION @@ -42,9 +29,7 @@ doc = TextDocument(text) and a spaCy pipeline with a few EDS-NLP components: -```{code-cell} ipython3 -:tags: [remove-output] - +```{code} python import spacy nlp = spacy.blank("eds") @@ -71,8 +56,7 @@ a dedicated {class}`~.EDSNLPDocPipeline` operation, with some additional support for specific EDS-NLP components: -```{code-cell} ipython3 -:tags: [remove-output] +```{code} python from medkit.text.spacy.edsnlp import EDSNLPDocPipeline eds_nlp_pipeline = EDSNLPDocPipeline(nlp) @@ -80,13 +64,13 @@ eds_nlp_pipeline = EDSNLPDocPipeline(nlp) The operation is executed by applying its `run()` method on a list of documents: -```{code-cell} ipython3 +```{code} python eds_nlp_pipeline.run([doc]) ``` Let's look at the entities and segments that were found: -```{code-cell} ipython3 +```{code} python for entity in doc.anns.entities: print(f"{entity.label}: {entity.text!r}") for segment in doc.anns.segments: @@ -95,7 +79,7 @@ for segment in doc.anns.segments: Here are the attributes attached to the `"covid"` entity: -```{code-cell} ipython3 +```{code} python entity = doc.anns.get_entities(label="covid")[0] for attr in entity.attrs: print(f"{attr.label}={attr.value}") @@ -103,7 +87,7 @@ for attr in entity.attrs: and the attributes of the first `"dates"` segment: -```{code-cell} ipython3 +```{code} python date_seg = doc.anns.get_segments(label="dates")[0] for attr in date_seg.attrs: print(f"{attr.label}={attr.value}") @@ -111,7 +95,7 @@ for attr in date_seg.attrs: Let's now examine more closely the `"date"` attribute: -```{code-cell} ipython3 +```{code} python date_seg = doc.anns.get_segments(label="dates")[0] 
date_attr = date_seg.attrs.get(label="date")[0] date_attr @@ -121,7 +105,8 @@ This attribute is an instance of {class}`~medkit.text.ner.DateAttribute`, a subclass of {class}`~medkit.core.Attribute`.It has `year`, `month`, `day` (etc) fields containing the different parts of the date that was detected, as well as a normalized string representation in its `value` field: -```{code-cell} ipython3 + +```{code} python date_attr.value ``` @@ -137,13 +122,13 @@ Here are the supported EDS-NLP attributes values and the corresponding medkit cl - `RelativeDate` (created by `eds.dates`): {class}`medkit.text.ner.RelativeDateAttribute` - `Duration` (created by `eds.dates`): {class}`medkit.text.ner.DurationAttribute` -```{note} +:::{note} The transformations performed by {class}`~.EDSNLPDocPipeline` can be overridden or extended with the `medkit_attribute_factories` init parameter. For a list of all the default transformations, see {const}`~medkit.text.spacy.edsnlp.DEFAULT_ATTRIBUTE_FACTORIES` and corresponding functions in {mod}`medkit.text.spacy.edsnlp`. -``` +::: ## Running an EDL-NLP spaCy pipeline at the annotation level @@ -154,8 +139,7 @@ pipeline on text annotations instead of a document with using pure medkit operations for sentence tokenization and entity matching, and EDS-NLP spaCy components for covid entity matching: -```{code-cell} ipython3 -:tags: [remove-output] +```{code} python from medkit.core import Pipeline, PipelineStep from medkit.text.ner import RegexpMatcher, RegexpMatcherRule from medkit.text.segmentation import SentenceTokenizer @@ -179,7 +163,7 @@ pipeline = Pipeline( ) ``` -```{code-cell} ipython3 +```{code} python doc = TextDocument(text) entities = pipeline.run([doc.raw_segment]) for entity in entities: diff --git a/docs/examples/finetuning_hf_model.md b/docs/examples/finetuning_hf_model.md index a535e47f..5d7595b5 100644 --- a/docs/examples/finetuning_hf_model.md +++ b/docs/examples/finetuning_hf_model.md @@ -1,24 +1,13 @@ ---- -jupytext: - formats: md:myst - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.13.8 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # Fine-tuning a Transformers model with medkit -```{note} +:::{note} This example may require optional modules from medkit, use the following to install them: -`pip install medkit-lib[training,hf-entity-matcher]` +```console +pip install 'medkit-lib[training,hf-entity-matcher]' ``` +::: + In recent years, Large Language Models (LLMs) have achieved very good performance in natural language processing (NLP) tasks. However, training a LLM (involving billions of parameters) from scratch requires a lot of resources and large quantities of text. Since these models are trained on general domain data, they learn complex patterns. We can adapt (fine-tune) the last layers to a specific task using our data and low resources. LLMs are PreTrained and accessible with libraries like [🤗 **Transformers**](https://huggingface.co/docs/transformers/index). Medkit has some components to fine-tune these models. @@ -31,9 +20,7 @@ biomedical article titles and medication leaflets, in french. Entities were annotated with UMLS semantic groups labels (ex: "ANAT", "CHEMI", "DISO", "PROC", etc). The dataset is available in the BRAT format. 
Let's download it: -```{code-cell} ipython3 -:tags: [skip-execution] - +```{code} python import os import urllib import zipfile @@ -41,17 +28,16 @@ import zipfile QUAERO_URL = "https://quaerofrenchmed.limsi.fr/QUAERO_FrenchMed_brat.zip" QUAERO_DIR = "QUAERO_FrenchMed/corpus/" -if not os.path.exists(QUAERO_DIR): - # download and unzip quaero dataset - !wget -O quaero.zip https://quaerofrenchmed.limsi.fr/QUAERO_FrenchMed_brat.zip - !unzip -o ./quaero.zip +# if not os.path.exists(QUAERO_DIR): +# !wget -O quaero.zip https://quaerofrenchmed.limsi.fr/QUAERO_FrenchMed_brat.zip +# !unzip -o ./quaero.zip ``` The corpus has been pre-split into train/dev/test groups, and inside each split the documents are grouped between EMEA files (drug leaflets) and MEDLINES files (article titles): -``` +```text QUAERO_FrenchMed/corpus ├── train │ ├── EMEA @@ -82,8 +68,7 @@ Instead of directly loading all the documents and annotations in each subdirecto {meth}`~.io.BratInputConverter.load_annotations()` and filter the annotations with `filter_overlapping_entities()`: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from glob import glob from medkit.core.text import TextDocument from medkit.io.brat import BratInputConverter @@ -125,8 +110,7 @@ to be usable during training. Here is the code to split each EMEA doc by sentence and create new mini-docs for each sentence with the entities they contain: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.core.text import Entity, Span from medkit.text.segmentation import SentenceTokenizer from medkit.text.postprocessing import DocumentSplitter @@ -162,8 +146,7 @@ test_docs_emea_sentences = split_emea_docs(test_docs_emea) Let's save this preprocessed version of our dataset in medkit json files, so we can reuse easily another time if needed: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python import shutil from medkit.io import medkit_json @@ -199,8 +182,7 @@ It allows us to get the following metrics: We will use the IOB2 tagging scheme to classify the tokens before computing the metrics. The metrics are computed in strict mode, which means that each token of the entity has to be properly labelled for the entity to be considered as properly identified. -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from pprint import pprint from medkit.text.ner import UMLSMatcher from medkit.text.metrics.ner import SeqEvalEvaluator @@ -272,12 +254,14 @@ umls_matcher_scores = ner_metrics_evaluator.compute(test_docs, predicted_entitie pprint(umls_matcher_scores) ``` -``` -{'accuracy': 0.7683723661992512, - 'macro_f1-score': 0.4246273234375716, - 'macro_precision': 0.5621040123228653, - 'macro_recall': 0.3528176174287537, - 'support': 4085} +```python +{ + 'accuracy': 0.7683723661992512, + 'macro_f1-score': 0.4246273234375716, + 'macro_precision': 0.5621040123228653, + 'macro_recall': 0.3528176174287537, + 'support': 4085, +} ``` We reach a f1-score of approximately 42%. @@ -296,8 +280,7 @@ Medkit provides simple training tools that make it possible to train or fine-tun Let's define a trainable instance for this example: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python import torch from medkit.text.ner.hf_entity_matcher import HFEntityMatcher @@ -324,8 +307,7 @@ trainable_matcher = HFEntityMatcher.make_trainable( At this point, we have prepared the data and the component to fine-tune. 
All we need to do is define the trainer with its configuration -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from medkit.training import TrainerConfig, Trainer from medkit.text.metrics.ner import SeqEvalMetricsComputer @@ -369,16 +351,14 @@ trainer = Trainer( We can now run the training loop with `trainer.train()`. It returns a dictionary with the training history and saves a checkpoint with the tuned model: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python # Run training and keep history of losses and metrics history = trainer.train() ``` Let's take a look at how the metrics evolved during the training: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python recall = [epoch["eval"]["macro_recall"] for epoch in history] precision = [epoch["eval"]["macro_precision"] for epoch in history] f1_score = [epoch["eval"]["macro_f1-score"] for epoch in history] @@ -389,14 +369,13 @@ plt.plot(f1_score, label="f1_score") plt.legend() ``` -![png](finetuning_hf_model_plot.png) +![](finetuning_hf_model_plot.png) After 10 epochs, we should reach a f1-score around 50% on the dev split. Let's look at the final metrics on the test split. For this we will reinstantiate an `HFEntityMatcher` with the last checkpoint: -```{code-cell} ipython3 -:tags: [skip-execution] +```{code} python from glob import glob # Retrieve best checkpoint and use it to instantiate the HuggingFace entity matcher @@ -413,12 +392,15 @@ for test_doc in test_docs: metrics = ner_metrics_evaluator.compute(test_docs, predicted_entities) pprint(metrics) ``` -``` -{'accuracy': 0.8604727993539387, - 'macro_f1-score': 0.48133173293574166, - 'macro_precision': 0.5292726724036432, - 'macro_recall': 0.4690750888795575, - 'support': 4085} + +```python +{ + 'accuracy': 0.8604727993539387, + 'macro_f1-score': 0.48133173293574166, + 'macro_precision': 0.5292726724036432, + 'macro_recall': 0.4690750888795575, + 'support': 4085, +} ``` Our fine-tuned BERT model has a better f1-score than the fuzzy simstring matcher (48% vs 42%), thanks to its better recall (47% vs 35%). diff --git a/docs/examples/iamsystem.md b/docs/examples/iamsystem.md index e0163e94..97613688 100644 --- a/docs/examples/iamsystem.md +++ b/docs/examples/iamsystem.md @@ -1,16 +1,3 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.14.5 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - # IAMSystem Matcher +++ @@ -23,7 +10,7 @@ This tutorial will show an example of iamsystem matcher operation usage. For beginners, let's create a medkit text document from the following text. -```{code-cell} ipython +```{code} python from medkit.core.text import TextDocument text = """Le patient présente une asténie de grade 2 et une anémie de grade 3. @@ -35,7 +22,7 @@ doc = TextDocument(text=text) The full raw text can be accessed through the `text` attribute: -```{code-cell} ipython3 +```{code} python print(doc.text) ``` @@ -47,7 +34,7 @@ Before using entity matcher, we want to split the raw text in sentences, and the First, let's configure the three text operations. -```{code-cell} ipython +```{code} python from medkit.text.segmentation import SentenceTokenizer, SyntagmaTokenizer from medkit.text.context import NegationDetector, NegationDetectorRule, FamilyDetector, FamilyDetectorRule @@ -63,7 +50,7 @@ fam_detector = FamilyDetector(output_label="family") Now, let's run the operations. 
-```{code-cell} ipython +```{code} python sentences = sent_tokenizer.run([doc.raw_segment]) neg_detector.run(sentences) fam_detector.run(sentences) @@ -90,7 +77,7 @@ In the same manner, the sentence `Il n'a pas de vascularite` contains a negation Let's configure the iam system matcher (cf. [iamsystem official documentation](https://iamsystem-python.readthedocs.io/en/latest/)). -```{code-cell} ipython +```{code} python from medkit.text.ner.iamsystem_matcher import MedkitKeyword from iamsystem import Matcher @@ -133,7 +120,7 @@ In this example, we have defined two keywords then configured matcher with: Now, let's configure and run our medkit operation : {class}`~.text.ner.iamsystem_matcher.IAMSystemMatcher`. -```{code-cell} ipython +```{code} python from medkit.text.ner.iamsystem_matcher import IAMSystemMatcher # Configuring medkit operation with iam system matcher and @@ -155,5 +142,4 @@ for entity in entities: print(f"is_negated = {entity.attrs.get(label='is_negated')}") print(f"family = {entity.attrs.get(label='family')}") print(f"spans = {entity.spans}\n") - ``` diff --git a/docs/examples/ontotox.md b/docs/examples/ontotox.md index 105c3679..0d30f77e 100644 --- a/docs/examples/ontotox.md +++ b/docs/examples/ontotox.md @@ -27,7 +27,7 @@ We combine the following operations in a medkit pipeline to get the entities. - UMLSMatcher: Detects chemotherapy toxicity entities using the ontotox prepared terms - RegexMatcher: Detects grading information using regular expressions -```python +```{code} python from medkit.core import DocPipeline, Pipeline, PipelineStep from medkit.core.text import TextDocument from medkit.text.context import FamilyDetector, HypothesisDetector, NegationDetector @@ -35,9 +35,6 @@ from medkit.text.ner import RegexpMatcher, RegexpMatcherRule, UMLSMatcher from medkit.text.preprocessing import EDSCleaner from medkit.text.relations.syntactic_relation_extractor import SyntacticRelationExtractor from medkit.text.segmentation import SentenceTokenizer, SyntagmaTokenizer -``` - -```python def show_relations(doc): relations = doc.anns.get_relations() @@ -70,7 +67,7 @@ def show_entity_context(entity): We want to add the entities in the document, so we need to use a `DocPipeline` to do it. Let's define this component using the steps for entity detection. -```python +```{code} python # preprocessing and tokenization eds_cleaner = EDSCleaner() sentence_tokenizer = SentenceTokenizer() @@ -110,7 +107,7 @@ umls_matcher = UMLSMatcher( ) ``` -```python +```{code} python # build the pipeline pipeline_entities = Pipeline( steps=[ @@ -140,14 +137,14 @@ pipeline_entities = Pipeline( As we have already defined the input and output keys in the pipeline, all that remains is to define the `full_text` key : the raw text -```python +```{code} python entities_detector = DocPipeline( pipeline=pipeline_entities, labels_by_input_key={"full_text": [TextDocument.RAW_LABEL]}, ) ``` -```python +```{code} python # Testing the 'entities_detector' with a phrase doc = TextDocument("Une dysarthrie de grade 3. Pas de perte de poids") entities_detector.run([doc]) @@ -156,7 +153,8 @@ entities_detector.run([doc]) for entity in doc.anns.entities: show_entity_context(entity) ``` -``` + +```text disorder : dysarthrie Normalization: umls:C0013362 family: False @@ -185,7 +183,7 @@ The second part extracts the Relations. In this case, we are looking to find rel You can change the tagger (a spacy nlp object), the `relation_extractor` uses the french tagger by default. 
::: -```python +```{code} python relation_extractor = SyntacticRelationExtractor( entities_target=["grade"],relation_label="has_grade" ) @@ -193,7 +191,7 @@ relation_extractor = SyntacticRelationExtractor( **Define the ontotox pipeline** -```python +```{code} python # build the pipeline ontoTOX_pipeline = Pipeline( steps=[ @@ -207,7 +205,7 @@ ontoTOX_pipeline = Pipeline( ## Running the OntoTOX pipeline -```python +```{code} python # Create a doc with the ontotox examples docs = TextDocument.from_dir("../data/text/ontotox/docs") @@ -218,7 +216,7 @@ ontoTOX_pipeline.run(docs) Let's look in detail the first document -```python +```{code} python doc = docs[0] print(f"\"{doc.text}\"") @@ -229,7 +227,8 @@ for entity in doc.anns.get_entities(): # Show relations show_relations(doc) ``` -``` + +```text "Pas de perte de poids. Le patient présente une dysarthrie de grade 3. Douleurs abdominales : grade 1. @@ -276,7 +275,7 @@ The "dysarthrie"(disorder) and "grade 3"(grade) are related. Dependency: [nmod,l The "Douleurs abdominales"(disorder) and "grade 1"(grade) are related. Dependency: [nsubj,right_to_left] ``` -```python +```{code} python for doc in docs[1:]: print(f"\"{doc.text}\"") print("Entities:") @@ -287,7 +286,7 @@ for doc in docs[1:]: print("\u2500" * 10) ``` -``` +```text "Le patient présente une oesophagite peptique de grade 2. Pas de thrombopénie. Il présente cependant également une alopécie sévère et un amaigrissment. @@ -355,11 +354,9 @@ This is how we have implemented ontotox in medkit, using the extracted informati Medkit includes some helpers to define simple operations, you may see this [tutorial](../examples/custom_text_operation) for more information. ::: -```python +```{code} python from medkit.core.text import CustomTextOpType,create_text_operation -``` -```python new_doc = TextDocument("Le patient présente une dysarthrie mais pas de grade 3") ontoTOX_pipeline.run([new_doc]) @@ -369,7 +366,8 @@ for entity in new_doc.anns.entities: show_relations(new_doc) print("\u2500" * 10) ``` -``` + +```text disorder : dysarthrie Normalization: umls:C0013362 family: False @@ -386,23 +384,24 @@ The "dysarthrie"(disorder) and "grade 3"(grade) are related. Dependency: [conj,l ────────── ``` + **Define the filter method** -```python +```{code} python def filter_negated_entities(entity): attr = entity.attrs.get(label="negation") # only keep entities without negation (negation is false) return len(attr)>0 and attr[0].value is False ``` -```python +```{code} python filter_operation = create_text_operation(function=filter_negated_entities, function_type=CustomTextOpType.FILTER) ``` **Define the new pipeline** -```python +```{code} python # add the filter operation steps_with_filter = entities_detector.pipeline.steps + [PipelineStep(filter_operation, input_keys=["entities_umls","entities_grade"], @@ -427,7 +426,7 @@ ontoTOX_pipeline_filter = Pipeline( ) ``` -```python +```{code} python new_doc = TextDocument("Le patient présente une dysarthrie mais pas de grade 3") ontoTOX_pipeline_filter.run([new_doc]) diff --git a/docs/examples/spans.md b/docs/examples/spans.md index 18d4fcd8..85aee3e9 100644 --- a/docs/examples/spans.md +++ b/docs/examples/spans.md @@ -2,12 +2,10 @@ Here are some examples about usage of span utilities. -:::{code} +```{code} python from medkit.core.text.span import Span from medkit.core.text.span_utils import replace, remove, move, extract, insert -::: -:::{code} raw_text = ( "Cher M. 
Dupond,\nJ’ai vu en consultation (à mon cabinet le 2019-02-01) " "Bertrand AGITE, né le 2008-02-25," @@ -15,9 +13,9 @@ raw_text = ( ) text = raw_text spans = [Span(0, len(raw_text))] -::: +``` -:::{code} +```{code} python import re # replace "M." by "M @@ -37,9 +35,9 @@ text, spans = remove(text, spans, [match.span()]) ranges = [m.span() for m in re.finditer(r"\n+", text, re.M)] text, spans = replace(text, spans, ranges, [" "] * len(ranges)) print(text) -::: +``` -:::{code} +```{code} python # extract sentences sentences = [] for match in re.finditer(r"[^\.]+\.", text, re.M): @@ -50,17 +48,17 @@ text_1, spans_1 = sentences[0] text_2, spans_2 = sentences[1] print(text_1) print(text_2) -::: +``` -:::{code} +```{code} python # move parenthesized text to end in 1st sentence match = re.search(r" *\((.*)\)", text_1, re.M) text_1, spans_1 = insert(text_1, spans_1, [len(text_1) - 1], [" ; "]) text_1, spans_1 = move(text_1, spans_1, match.span(1), len(text_1) - 1) print(text_1) -::: +``` -:::{code} +```{code} python # reformat dates in 1st sentence matches = list(re.finditer(r"\d{4}-\d{2}-\d{2}", text_1, re.M)) ranges = [m.span() for m in matches] @@ -70,32 +68,32 @@ new_dates = [ ] text_1, spans_1 = replace(text_1, spans_1, ranges, new_dates) print(text_1) -::: +``` -:::{code} +```{code} python # replace "(-)" by "negatif" in 2d sentence match = re.search(r"\(-\)", text_2, re.M) text_2, spans_2 = replace(text_2, spans_2, [match.span()], ["negatif"]) print(text_2) -::: +``` -:::{code} +```{code} python # find person entity in 1st sentence match = re.search(r"M [a-zA-Z]+", text_1) person_text, person_spans = extract( text_1, spans_1, [match.span()] ) -::: +``` -:::{code} +```{code} python # find date entities in 1st sentence dates = [] for match in re.finditer(r"\d{2}/\d{2}/\d{4}", text_1): date_text, date_spans = extract(text_1, spans_1, [match.span()]) dates.append((date_text, date_spans)) -::: +``` -:::{code} +```{code} python from medkit.core.text.span_utils import normalize_spans entities = [] @@ -106,9 +104,9 @@ for _, date_spans in dates: date_spans = normalize_spans(date_spans) entities.append(("date", date_spans)) print(entities) -::: +``` -:::{code} +```{code} python from spacy import displacy entities_data = [ @@ -119,4 +117,4 @@ entities_data = [ entities_data = sorted(entities_data, key=lambda e: e["start"]) data = {"text": raw_text, "ents": entities_data, "uuid": 0} displacy.render(data, manual=True, style="ent", jupyter=True, minify=True) -::: +```