-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DOC : Add supporting code for NER benchmark paper
Co-Authored-By: Ghislain Vaillant <[email protected]>
- Loading branch information
Showing
8 changed files
with
1,058 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
[paths] | ||
examples = "examples.json" | ||
|
||
[nlp] | ||
lang = "fr" | ||
pipeline = ["llm"] | ||
|
||
[components] | ||
|
||
[components.llm] | ||
factory = "llm" | ||
|
||
[components.llm.task] | ||
@llm_tasks = "spacy.NER.v3" | ||
labels = ["Anatomie", "Appareil", "Organisme", "Médicament", "Objet", "Pathologie", "Phénomène", "Physiologie", "Procédure", "Région"] | ||
description = Les entités ne doivent strictement pas faire plus 2 mots de longueur. | ||
Tu dois faire le plus d entités possible. | ||
Tu dois faire des entités le plus petit possible. | ||
Les pronoms, les articles, les prépositions, les contraction ne font pas partie des entités. | ||
Les entités à privilégier sont Appareil, Médicament et Pathologie | ||
|
||
[components.llm.task.label_definitions] | ||
Anatomie = "Représente toute structure anatomique, organe, ou région du corps humain." | ||
Médicament = "Comprend médicaments, substances chimiques, enzymes, et matériaux biomédicaux." | ||
Appareil = "Englobe tout dispositif médical, d'administration de médicaments ou de recherche." | ||
Pathologie = "Inclut les anomalies acquises, congénitales, maladies, syndromes, et dysfonctionnements." | ||
Région = "Représente toute zone géographique." | ||
Organisme = "Comprend groupes d'âge, animaux, plantes, micro-organismes, et virus." | ||
Objet = "Englobe entités, aliments, objets manufacturés, et substances." | ||
Phénomène = "Inclut fonctions biologiques, effets environnementaux, phénomènes causés par l'homme ou naturels." | ||
Physiologie = "Ensemble des fonctions cellulaires, génétiques, mentales, moléculaires, et physiologiques." | ||
Procédure = "Comprend procédures de diagnostic, activités éducatives, soins de santé, et techniques de recherche." | ||
|
||
[components.llm.task.examples] | ||
@misc = "spacy.FewShotReader.v1" | ||
path = "${paths.examples}" | ||
|
||
[components.llm.model] | ||
@llm_models = "spacy.GPT-3-5.v3" | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
[paths] | ||
examples = "examples.json" | ||
|
||
[nlp] | ||
lang = "fr" | ||
pipeline = ["llm"] | ||
|
||
[components] | ||
|
||
[components.llm] | ||
factory = "llm" | ||
|
||
[components.llm.task] | ||
@llm_tasks = "spacy.NER.v3" | ||
labels = ["Anatomie", "Appareil", "Organisme", "Médicament", "Objet", "Pathologie", "Phénomène", "Physiologie", "Procédure", "Région"] | ||
description = Les entités ne doivent strictement pas faire plus 2 mots de longueur. | ||
Tu dois faire le plus d entités possible. | ||
Tu dois faire des entités le plus petit possible. | ||
Les pronoms, les articles, les prépositions, les contraction ne font pas partie des entités. | ||
Les entités à privilégier sont Appareil, Médicament et Pathologie | ||
|
||
[components.llm.task.label_definitions] | ||
Anatomie = "Représente toute structure anatomique, organe, ou région du corps humain." | ||
Médicament = "Comprend médicaments, substances chimiques, enzymes, et matériaux biomédicaux." | ||
Appareil = "Englobe tout dispositif médical, d'administration de médicaments ou de recherche." | ||
Pathologie = "Inclut les anomalies acquises, congénitales, maladies, syndromes, et dysfonctionnements." | ||
Région = "Représente toute zone géographique." | ||
Organisme = "Comprend groupes d'âge, animaux, plantes, micro-organismes, et virus." | ||
Objet = "Englobe entités, aliments, objets manufacturés, et substances." | ||
Phénomène = "Inclut fonctions biologiques, effets environnementaux, phénomènes causés par l'homme ou naturels." | ||
Physiologie = "Ensemble des fonctions cellulaires, génétiques, mentales, moléculaires, et physiologiques." | ||
Procédure = "Comprend procédures de diagnostic, activités éducatives, soins de santé, et techniques de recherche." | ||
|
||
[components.llm.task.examples] | ||
@misc = "spacy.FewShotReader.v1" | ||
path = "${paths.examples}" | ||
|
||
[components.llm.model] | ||
@llm_models = "spacy.GPT-4.v3" | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,199 @@ | ||
--- | ||
jupytext: | ||
text_representation: | ||
extension: .md | ||
format_name: myst | ||
format_version: 0.13 | ||
jupytext_version: 1.16.1 | ||
kernelspec: | ||
display_name: .venv | ||
language: python | ||
name: python3 | ||
--- | ||
|
||
# Data loading | ||
|
||
```{code-cell} ipython3 | ||
from glob import glob | ||
from medkit.core.text import TextDocument | ||
from medkit.io.brat import BratInputConverter | ||
from medkit.text.postprocessing import filter_overlapping_entities, DocumentSplitter | ||
from statistics import mean | ||
import pandas as pd | ||
from medkit.text.segmentation import SentenceTokenizer, SyntagmaTokenizer | ||
from medkit.core import Pipeline, DocPipeline, PipelineStep | ||
from pathlib import Path | ||
from medkit.tools.e3c_corpus import load_data_annotation | ||
from medkit.io.doccano import DoccanoInputConverter,DoccanoTask | ||
from sklearn.model_selection import train_test_split | ||
from medkit.tools.mtsamples import load_mtsamples | ||
from medkit.io.medkit_json import save_text_documents | ||
import json | ||
# Sentence tokenizer: emits "sentence" segments, using "." as the only
# punctuation delimiter (kept in the segment) and also splitting on newlines.
sentence_tok = SentenceTokenizer(
    output_label="sentence",
    punct_chars=["."],
    keep_punct=True,
    split_on_newlines=True,
)

# One-step pipeline: full document text in, sentence segments out.
pipeline_phrase_creator = Pipeline(
    steps=[
        PipelineStep(
            sentence_tok,
            input_keys=["full_text"],
            output_keys=["sentences"],
        ),
    ],
    input_keys=["full_text"],
    output_keys=["sentences"],
)

# Document-level wrapper so the pipeline can be run on a list of documents.
phrase_creator = DocPipeline(pipeline_phrase_creator)

# Splits documents on their "sentence" segments; no attribute labels are kept.
splitter = DocumentSplitter(segment_label="sentence", attr_labels=[])
def corpus_specs(_corpus, _title, num_docs):
    """Summarise a sentence-level corpus as a one-row ``pandas.DataFrame``.

    Args:
        _corpus: Sized iterable of documents. Each item must expose ``.text``
            and ``.anns.get_entities()``, whose items expose ``.label``.
        _title: Index label of the single returned row (e.g. the split name).
        num_docs: Number of source documents, stored as-is under
            ``'Documents'``.

    Returns:
        A one-row DataFrame with the columns ``'Documents'``, ``'Sentences'``
        (``len(_corpus)``), ``'MSL'`` (rounded mean of ``len(text)``),
        ``'All'`` (total number of entities), followed by one column per
        entity label, in first-seen order, holding that label's rounded
        percentage of all entities.

    An empty corpus yields ``MSL == 0`` instead of raising
    ``statistics.StatisticsError``.
    """
    # Fetch each item's entities once; they feed both the total and the
    # per-label counts.
    entities_per_item = [item.anns.get_entities() for item in _corpus]
    total_entities = sum(len(entities) for entities in entities_per_item)

    # Plain dicts keep insertion order, so label columns appear in the order
    # the labels are first encountered.
    label_counts = {}
    for entities in entities_per_item:
        for ent in entities:
            label_counts[ent.label] = label_counts.get(ent.label, 0) + 1

    text_lengths = [len(item.text) for item in _corpus]
    doc_data = {
        'Documents': num_docs,
        'Sentences': len(_corpus),
        # mean() raises on empty input, so guard the empty-corpus case.
        'MSL': round(mean(text_lengths)) if text_lengths else 0,
        'All': total_entities,
    }
    # label_counts is non-empty only when total_entities > 0, so the division
    # below cannot hit zero.
    # NOTE(review): a label named like one of the summary columns above would
    # overwrite that column — presumably never the case for these corpora.
    for label, count in label_counts.items():
        doc_data[label] = round(count / total_entities * 100)

    return pd.DataFrame(doc_data, index=[_title])
def load_quaero_split(_split):
    """Load one QUAERO split from disk and cut it into sentence documents.

    Every ``<_split>/*/*.txt`` file under the QUAERO corpus directory is read
    together with its sibling ``.ann`` file; the annotations are passed
    through ``filter_overlapping_entities`` before being attached to the
    document.

    Args:
        _split: Name of the split sub-directory; also used as the row title
            of the returned statistics.

    Returns:
        Tuple ``(sentence_docs, specs)``: the output of the sentence splitter
        and the one-row statistics frame built by ``corpus_specs``.
    """
    QUAERO_DIR = Path.home() / "src/corpus/QUAERO_FrenchMed/corpus"
    brat_converter = BratInputConverter()

    documents = []
    # sorted() gives a deterministic document order across runs.
    for txt_path in sorted(QUAERO_DIR.glob(f"{_split}/*/*.txt")):
        document = TextDocument.from_file(txt_path)
        annotations = brat_converter.load_annotations(txt_path.with_suffix(".ann"))
        for entity in filter_overlapping_entities(annotations):
            document.anns.add(entity)
        documents.append(document)

    # Add sentence segments in place, then split on them.
    phrase_creator.run(documents)
    sentence_docs = splitter.run(documents)

    specs = corpus_specs(sentence_docs, _title=_split, num_docs=len(documents))
    return sentence_docs, specs
def load_e3c_split(_split):
    """Load one E3C layer directory and cut it into sentence documents.

    Every entity of every loaded document is relabelled ``"DISO"`` before
    sentence splitting.

    Args:
        _split: Name of the sub-directory of the E3C corpus to load; also
            used as the row title of the returned statistics.

    Returns:
        Tuple ``(sentence_docs, specs)``: the output of the sentence splitter
        and the one-row statistics frame built by ``corpus_specs``.
    """
    corpus_root = Path.home() / "src/corpus/E3C_corpus"
    documents = list(
        load_data_annotation(dir_path=corpus_root / _split, keep_sentences = True)
    )

    # Collapse every entity onto the single "DISO" label.
    for document in documents:
        for entity in document.anns.get_entities():
            entity.label = "DISO"

    phrase_creator.run(documents)
    sentence_docs = splitter.run(documents)

    specs = corpus_specs(sentence_docs, _title=_split, num_docs=len(documents))
    return sentence_docs, specs
def load_casm2():
    """Load the CasM2 Doccano annotations and build train/test/val splits.

    Documents are split at document level with two seeded
    ``train_test_split`` calls (first test vs. rest, then validation vs.
    train), then cut into sentence documents keeping only the ``treatment``,
    ``test`` and ``problem`` entities, which are renamed to ``CHEM``,
    ``PROC`` and ``DISO`` respectively.

    Returns:
        Tuple ``(casm2_splitted, specs)``: a dict with keys ``'train'``,
        ``'test'`` and ``'val'`` mapping to the sentence documents of each
        split, and a DataFrame with one ``corpus_specs`` row per split.
    """
    ANNOTATION_DIR = Path.home() / "src/corpus/CasM2_Files/m2annotations"
    SPLIT_SEEDS = (67, 33)
    TEST_SIZE = 0.2
    VALIDATION_SIZE = 0.2
    label_map = {'treatment':'CHEM', 'test':'PROC','problem':'DISO'}

    doccano_converter = DoccanoInputConverter(task=DoccanoTask.RELATION_EXTRACTION)
    documents = doccano_converter.load_from_directory_zip(ANNOTATION_DIR)
    phrase_creator.run(documents)

    # First carve out the test set, then take validation from what remains.
    train_and_val, test_docs = train_test_split(
        documents, random_state=SPLIT_SEEDS[0], test_size=TEST_SIZE
    )
    train_docs, val_docs = train_test_split(
        train_and_val, random_state=SPLIT_SEEDS[1], test_size=VALIDATION_SIZE
    )
    document_splits = {'train': train_docs, 'test': test_docs, 'val': val_docs}

    casm2_splitter = DocumentSplitter(
        segment_label="sentence",
        entity_labels=['treatment', 'test', 'problem'],
        attr_labels=[],
    )

    casm2_splitted = {}
    spec_frames = []
    for split_name, split_docs in document_splits.items():
        sentence_docs = casm2_splitter.run(split_docs)
        # Rename the source labels in place; unknown labels are left as-is.
        for sentence_doc in sentence_docs:
            for entity in sentence_doc.anns.get_entities():
                if entity.label in label_map:
                    entity.label = label_map[entity.label]
        casm2_splitted[split_name] = sentence_docs
        spec_frames.append(corpus_specs(sentence_docs, split_name, len(split_docs)))

    return casm2_splitted, pd.concat(spec_frames)
def load_quaero():
    """Load the QUAERO train, test and dev splits.

    The on-disk ``dev`` split is exposed under the key ``'val'``.

    Returns:
        Tuple ``(quaero, specs)``: a dict with keys ``'train'``, ``'test'``
        and ``'val'`` mapping to sentence documents, and a DataFrame with one
        statistics row per on-disk split (rows titled train, test, dev).
    """
    # On-disk split name -> key used in the returned dict.
    exposed_names = {"train": "train", "test": "test", "dev": "val"}

    quaero = {}
    spec_frames = []
    for disk_name, key in exposed_names.items():
        sentence_docs, split_specs = load_quaero_split(disk_name)
        quaero[key] = sentence_docs
        spec_frames.append(split_specs)

    return quaero, pd.concat(spec_frames)
def load_e3c():
    """Load the E3C layers used as test, train and validation sets.

    ``layer1_test``, ``layer1_train`` and ``layer2_val`` are exposed under
    the keys ``'test'``, ``'train'`` and ``'val'``.

    Returns:
        Tuple ``(e3c, specs)``: a dict with keys ``'test'``, ``'train'`` and
        ``'val'`` mapping to sentence documents, and a DataFrame with one
        statistics row per layer (rows titled with the layer names).
    """
    # Layer directory name -> key used in the returned dict.
    exposed_names = {
        "layer1_test": "test",
        "layer1_train": "train",
        "layer2_val": "val",
    }

    e3c = {}
    spec_frames = []
    for layer_name, key in exposed_names.items():
        sentence_docs, layer_specs = load_e3c_split(layer_name)
        e3c[key] = sentence_docs
        spec_frames.append(layer_specs)

    return e3c, pd.concat(spec_frames)
def load_processed_mtsamples():
    """Load the MTSamples corpus and cut it into sentence documents.

    Returns:
        Tuple ``(mt_splitted, specs)``: the output of the sentence splitter
        and a one-row statistics frame titled ``'mtsamples'``.
    """
    documents = load_mtsamples()
    # Record the document count before sentence splitting.
    document_count = len(documents)

    phrase_creator.run(documents)
    mt_splitted = splitter.run(documents)

    return mt_splitted, corpus_specs(mt_splitted, 'mtsamples', document_count)
``` | ||
|
||
```{code-cell} ipython3 | ||
# Load each corpus as a dict of sentence-level splits plus its statistics table.
quaero, specs_quaero = load_quaero()
e3c, specs_e3c = load_e3c()
casm2, specs_casm2 = load_casm2()
# MTSamples loading is currently disabled:
#mt, specs_mt = load_processed_mtsamples()
``` | ||
|
||
```{code-cell} ipython3 | ||
specs_quaero.T | ||
``` | ||
|
||
```{code-cell} ipython3 | ||
specs_e3c.T | ||
``` | ||
|
||
```{code-cell} ipython3 | ||
specs_casm2.T | ||
``` | ||
|
||
```{code-cell} ipython3 | ||
# Write every split of every corpus to datasets/<corpus>/<split>.jsonl.
corpus = {'quaero':quaero, 'e3c':e3c, 'casm2':casm2}
for corpa_name, corpa in corpus.items():
    for split_name, split in corpa.items():
        output = f"datasets/{corpa_name}/{split_name}.jsonl"
        # Make sure the target directory exists before writing, so the export
        # also works on a fresh checkout (no-op when it is already there).
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        save_text_documents(split, output)
``` |
Oops, something went wrong.