diff --git a/additional_data_exploration.ipynb b/additional_data_exploration.ipynb new file mode 100644 index 0000000..9863f02 --- /dev/null +++ b/additional_data_exploration.ipynb @@ -0,0 +1,1423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summarizer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# tokenizer = AutoTokenizer.from_pretrained(\"google/bigbird-pegasus-large-pubmed\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# # by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64\n", + "# model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\")\n", + "\n", + "# # decoder attention type can't be changed & will be \"original_full\"\n", + "# # you can change `attention_type` (encoder only) to full attention like this:\n", + "# # model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\", attention_type=\"original_full\")\n", + "\n", + "# # you can change `block_size` & `num_random_blocks` like this:\n", + "# # model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\", block_size=16, num_random_blocks=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# text = \"Our patient is a 78-year-old male with a past medical history of cutaneous T-cell lymphoma/mycosis fungoides (on regular outpatient extracorporeal photopheresis), type II diabetes mellitus, atrial flutter on Xarelto, and sick sinus syndrome on dual-chamber pacemaker, presented to the hospital with right upper quadrant abdominal pain. The patient was a former smoker and denied any alcohol use.\\nIn the emergency department, he was hemodynamically stable. Laboratory workup was significant for abnormally elevated liver function tests including aspartate aminotransferase/alanine aminotransferase (AST/ALT) of 204/188 U/L, alkaline phosphatase (ALP) of 550 U/L, and total bilirubin of 2.5 mg/dL. Ultrasound of the abdomen was negative for any focal liver or gallbladder lesions. There was no evidence of intrahepatic or extrahepatic biliary duct dilation. Hepatobiliary iminodiacetic acid (HIDA) scan was normal, and hence cholecystitis was ruled out. CT abdomen and pelvis and CT angiography of the chest were negative for acute pathology. As the patient had a pacemaker, magnetic resonance cholangiopancreatography (MRCP) could not be performed. Further laboratory evaluation for elevated liver enzymes, including viral hepatitis panel, thyroid-stimulating hormone (TSH), iron panel, antinuclear antibody (ANA), anti-mitochondrial antibody, alpha-1-antitrypsin antibody, anti-smooth muscle antibody, and ceruloplasmin was negative.\\nGiven that the patient has a history of cutaneous T-cell lymphoma, the important differential diagnosis included leukemic infiltration of the liver and adverse reaction to the prior chemotherapy. However, the patient received only a short course of the chemotherapeutic regimen mogamulizumab (due to insurance issues), and hence it was unlikely to cause this current clinical picture. Subsequently, a percutaneous liver biopsy was performed to confirm the diagnosis, which showed replacement of the normal liver parenchymal cells by high-grade tumor cells with a high nuclear-cytoplasmic ratio (Figures -). The tumor cells showed positive immunohistochemical staining for cytokeratin AE1/AE3, cytokeratin 20 (CK20), synaptophysin, chromogranin, and negative for CK7, caudal type homeobox transcription factor 2 (CDX-2), and thyroid transcription factor 1 (TTF-1) (Figures -). All these features were suggestive of metastatic Merkel cell carcinoma. There was no evidence of leukemic infiltrates. As the patient had no evidence of MCC involvement of the skin, he was diagnosed with metastatic MCC of the liver of unknown primary.\\nHematology/Oncology and Dermatology was consulted. Considering the medical comorbidities, the patient and family opted for comfort care measures and were discharged home.\"\n", + "# inputs = tokenizer(text, return_tensors='pt')\n", + "# prediction = model.generate(**inputs)\n", + "# prediction = tokenizer.batch_decode(prediction)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading pytorch_model.bin: 5%|▍ | 105M/2.31G [00:20<01:11, 30.8MB/s]" + ] + } + ], + "source": [ + "from transformers import pipeline\n", + "\n", + "summarizer = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Our patient is a 78-year-old male with a past medical history of cutaneous T-cell lymphoma/mycosis fungoides (on regular outpatient extracorporeal photopheresis), type II diabetes mellitus, atrial flutter on Xarelto, and sick sinus syndrome on dual-chamber pacemaker, presented to the hospital with right upper quadrant abdominal pain. The patient was a former smoker and denied any alcohol use.\\nIn the emergency department, he was hemodynamically stable. Laboratory workup was significant for abnormally elevated liver function tests including aspartate aminotransferase/alanine aminotransferase (AST/ALT) of 204/188 U/L, alkaline phosphatase (ALP) of 550 U/L, and total bilirubin of 2.5 mg/dL. Ultrasound of the abdomen was negative for any focal liver or gallbladder lesions. There was no evidence of intrahepatic or extrahepatic biliary duct dilation. Hepatobiliary iminodiacetic acid (HIDA) scan was normal, and hence cholecystitis was ruled out. CT abdomen and pelvis and CT angiography of the chest were negative for acute pathology. As the patient had a pacemaker, magnetic resonance cholangiopancreatography (MRCP) could not be performed. Further laboratory evaluation for elevated liver enzymes, including viral hepatitis panel, thyroid-stimulating hormone (TSH), iron panel, antinuclear antibody (ANA), anti-mitochondrial antibody, alpha-1-antitrypsin antibody, anti-smooth muscle antibody, and ceruloplasmin was negative.\\nGiven that the patient has a history of cutaneous T-cell lymphoma, the important differential diagnosis included leukemic infiltration of the liver and adverse reaction to the prior chemotherapy. However, the patient received only a short course of the chemotherapeutic regimen mogamulizumab (due to insurance issues), and hence it was unlikely to cause this current clinical picture. Subsequently, a percutaneous liver biopsy was performed to confirm the diagnosis, which showed replacement of the normal liver parenchymal cells by high-grade tumor cells with a high nuclear-cytoplasmic ratio (Figures -). The tumor cells showed positive immunohistochemical staining for cytokeratin AE1/AE3, cytokeratin 20 (CK20), synaptophysin, chromogranin, and negative for CK7, caudal type homeobox transcription factor 2 (CDX-2), and thyroid transcription factor 1 (TTF-1) (Figures -). All these features were suggestive of metastatic Merkel cell carcinoma. There was no evidence of leukemic infiltrates. As the patient had no evidence of MCC involvement of the skin, he was diagnosed with metastatic MCC of the liver of unknown primary.\\nHematology/Oncology and Dermatology was consulted. Considering the medical comorbidities, the patient and family opted for comfort care measures and were discharged home.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'summary_text': 'A 78-year-old male with a past medical history of cutaneous T-cell lymphoma/mycosis fungoides, type II diabetes mellitus, atrial flutter on Xarelto, and sick sinus syndrome on dual-chamber pacemaker, presented to the hospital with right upper quadrant abdominal pain. The patient was a former smoker and denied any alcohol use. He was diagnosed with metastatic MCC of the liver of unknown primary.'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "summarizer(text, max_length=130, min_length=30, do_sample=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DDXPLUS" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import random\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def load_json(file_path):\n", + " with open(file_path, 'r') as file:\n", + " return json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Assuming the data is loaded from the respective JSON and CSV files\n", + "evidences = load_json('DDxPLUS/release_evidences.json')\n", + "conditions = load_json('DDxPLUS/release_conditions.json')\n", + "patients = pd.read_csv('DDxPLUS/release_train_patients.csv') # Example for the training set" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | AGE | \n", + "DIFFERENTIAL_DIAGNOSIS | \n", + "SEX | \n", + "PATHOLOGY | \n", + "EVIDENCES | \n", + "INITIAL_EVIDENCE | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "18 | \n", + "[['Bronchite', 0.19171203430383882], ['Pneumon... | \n", + "M | \n", + "IVRS ou virémie | \n", + "['crowd', 'diaph', 'douleurxx', 'douleurxx_car... | \n", + "fievre | \n", + "
1 | \n", + "21 | \n", + "[['VIH (Primo-infection)', 0.5189500564407601]... | \n", + "M | \n", + "VIH (Primo-infection) | \n", + "['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '... | \n", + "diaph | \n", + "
2 | \n", + "19 | \n", + "[['Bronchite', 0.11278064619119596], ['Pneumon... | \n", + "F | \n", + "Pneumonie | \n", + "['douleurxx', 'douleurxx_carac_@_un_coup_de_co... | \n", + "expecto | \n", + "
3 | \n", + "34 | \n", + "[['IVRS ou virémie', 0.23859396799565236], ['C... | \n", + "F | \n", + "IVRS ou virémie | \n", + "['crowd', 'douleurxx', 'douleurxx_carac_@_une_... | \n", + "douleurxx | \n", + "
4 | \n", + "36 | \n", + "[['IVRS ou virémie', 0.23677812769175735], ['P... | \n", + "M | \n", + "IVRS ou virémie | \n", + "['dayc', 'diaph', 'douleurxx', 'douleurxx_cara... | \n", + "toux | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1025597 | \n", + "18 | \n", + "[['Épiglottite', 0.28156957795466475], ['VIH (... | \n", + "M | \n", + "Épiglottite | \n", + "['bw_bending', 'douleurxx', 'douleurxx_carac_@... | \n", + "fievre | \n", + "
1025598 | \n", + "28 | \n", + "[['Épiglottite', 0.3703962237298842], ['Laryng... | \n", + "F | \n", + "Épiglottite | \n", + "['douleurxx', 'douleurxx_carac_@_vive', 'doule... | \n", + "fievre | \n", + "
1025599 | \n", + "0 | \n", + "[['Épiglottite', 0.13193905052537108], ['Laryn... | \n", + "F | \n", + "Épiglottite | \n", + "['bw_bending', 'douleurxx', 'douleurxx_carac_@... | \n", + "stridor | \n", + "
1025600 | \n", + "26 | \n", + "[['Épiglottite', 0.3028258988138983], ['Laryng... | \n", + "F | \n", + "Épiglottite | \n", + "['douleurxx', 'douleurxx_carac_@_un_coup_de_co... | \n", + "stridor | \n", + "
1025601 | \n", + "25 | \n", + "[['Épiglottite', 0.12896823203696775], ['Laryn... | \n", + "F | \n", + "Épiglottite | \n", + "['douleurxx', 'douleurxx_carac_@_un_coup_de_co... | \n", + "douleurxx | \n", + "
1025602 rows × 6 columns
\n", + "\n", + " | patient | \n", + "doctor | \n", + "
---|---|---|
0 | \n", + "throat a bit sore and want to get a good imune... | \n", + "during this pandemic. throat pain can be from ... | \n", + "
1 | \n", + "hey there i have had cold \"symptoms\" for over ... | \n", + "yes. protection. it is not enough symptoms to ... | \n", + "
2 | \n", + "i have a tight and painful chest with a dry co... | \n", + "possible. top symptoms include fever, dry coug... | \n", + "
3 | \n", + "what will happen after the incubation period f... | \n", + "in brief: symptoms if you are infected, sympto... | \n", + "
4 | \n", + "just found out i was pregnant. yesterday diagn... | \n", + "thanks for your question on healthcare magic.i... | \n", + "
... | \n", + "... | \n", + "... | \n", + "
477 | \n", + "my 5 year old son woke up not feeling well. i ... | \n", + "in brief: arrange testing stay home, provide f... | \n", + "
478 | \n", + "i have a dry cough and sore throat- it's been ... | \n", + "in brief: covid good guidelines can be found a... | \n", + "
479 | \n", + "how do i know if i have a normal cold or maybe... | \n", + "common cold with sin. the corona virus causes ... | \n", + "
480 | \n", + "hi- i was diagnosed a month ago with community... | \n", + "hello! just because you have previously had a ... | \n", + "
481 | \n", + "i have a aunt that is in the hospital at st vi... | \n", + "hello welcome to the health care magic your co... | \n", + "
482 rows × 2 columns
\n", + "\n", + " | 0 | \n", + "1 | \n", + "2 | \n", + "3 | \n", + "4 | \n", + "5 | \n", + "6 | \n", + "7 | \n", + "8 | \n", + "9 | \n", + "10 | \n", + "11 | \n", + "12 | \n", + "13 | \n", + "14 | \n", + "15 | \n", + "16 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "patient: throat a bit sore and want to get a g... | \n", + "doctor: during this pandemic. throat pain can ... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
1 | \n", + "patient: hey there i have had cold \"symptoms\" ... | \n", + "doctor: yes. protection. it is not enough symp... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
2 | \n", + "patient: i have a tight and painful chest with... | \n", + "doctor: possible. top symptoms include fever, ... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
3 | \n", + "patient: what will happen after the incubation... | \n", + "doctor: in brief: symptoms if you are infected... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
4 | \n", + "patient: just found out i was pregnant. yester... | \n", + "doctor: thanks for your question on healthcare... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
477 | \n", + "patient: my 5 year old son woke up not feeling... | \n", + "doctor: in brief: arrange testing stay home, p... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
478 | \n", + "patient: i have a dry cough and sore throat- i... | \n", + "doctor: in brief: covid good guidelines can be... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
479 | \n", + "patient: how do i know if i have a normal cold... | \n", + "doctor: common cold with sin. the corona virus... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
480 | \n", + "patient: hi- i was diagnosed a month ago with ... | \n", + "doctor: hello! just because you have previousl... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
481 | \n", + "patient: i have a aunt that is in the hospital... | \n", + "doctor: hello welcome to the health care magic... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
482 rows × 17 columns
\n", + "\n", + " | id | \n", + "Description | \n", + "Doctor | \n", + "Patient | \n", + "
---|---|---|---|---|
0 | \n", + "0 | \n", + "Q. What does abutment of the nerve root mean? | \n", + "Hi. I have gone through your query with dilige... | \n", + "Hi doctor,I am just wondering what is abutting... | \n", + "
1 | \n", + "1 | \n", + "Q. Every time I eat spicy food, I poop blood. ... | \n", + "Hello. I have gone through your information an... | \n", + "Hi doctor, I am a 26 year old male. I am 5 fee... | \n", + "
2 | \n", + "2 | \n", + "Q. Will Nano-Leo give permanent solution for e... | \n", + "Hi. For further doubts consult a sexologist on... | \n", + "Hello doctor, I am 48 years old. I am experien... | \n", + "
3 | \n", + "3 | \n", + "Q. Will Kalarchikai cure multiple ovarian cyst... | \n", + "Hello. I just read your query. See Kalarachi K... | \n", + "Hello doctor, I have multiple small cysts in b... | \n", + "
4 | \n", + "4 | \n", + "Q. I masturbate only by rubbing the tip of the... | \n", + "Hi. For further doubts consult a sexologist on... | \n", + "Hi doctor, During masturbation I just rub the ... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
257464 | \n", + "257464 | \n", + "Unprotected sex after periods, took morning af... | \n", + "Hormonal method of birth control like pills an... | \n", + "Hello, I am , age 26 years old. On 7th of may,... | \n", + "
257465 | \n", + "257465 | \n", + "Delivered baby, plan for second child after 4-... | \n", + "Do you know how this pills act and how your me... | \n", + "okay so i got this loette pill right its a rea... | \n", + "
257466 | \n", + "257466 | \n", + "Taking loette pill, have started half way thro... | \n", + "Hi thanks for your question your taking contra... | \n", + "taking the mini pill Cerazette and missed taki... | \n", + "
257467 | \n", + "257467 | \n", + "On Cerazette, missed pills twice at night, fol... | \n", + "Hi Cassctiexx Thanks for writing in to Healthc... | \n", + "Hi I recently received the depo-provera shot o... | \n", + "
257468 | \n", + "257468 | \n", + "Excessive tiredness, depression, body aches si... | \n", + "Don't worry abt card, you need advice at this ... | \n", + "i had unprotected sex on 20th sep, n after 5hr... | \n", + "
257469 rows × 4 columns
\n", + "\n", + " | desc | \n", + "doctor | \n", + "patient | \n", + "
---|---|---|---|
0 | \n", + "Q. What does abutment of the nerve root mean? | \n", + "Hi. I have gone through your query with dilige... | \n", + "Hi doctor,I am just wondering what is abutting... | \n", + "
1 | \n", + "Q. Every time I eat spicy food, I poop blood. ... | \n", + "Hello. I have gone through your information an... | \n", + "Hi doctor, I am a 26 year old male. I am 5 fee... | \n", + "
2 | \n", + "Q. Will Nano-Leo give permanent solution for e... | \n", + "Hi. For further doubts consult a sexologist on... | \n", + "Hello doctor, I am 48 years old. I am experien... | \n", + "
3 | \n", + "Q. Will Kalarchikai cure multiple ovarian cyst... | \n", + "Hello. I just read your query. See Kalarachi K... | \n", + "Hello doctor, I have multiple small cysts in b... | \n", + "
4 | \n", + "Q. I masturbate only by rubbing the tip of the... | \n", + "Hi. For further doubts consult a sexologist on... | \n", + "Hi doctor, During masturbation I just rub the ... | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
257464 | \n", + "Unprotected sex after periods, took morning af... | \n", + "Hormonal method of birth control like pills an... | \n", + "Hello, I am , age 26 years old. On 7th of may,... | \n", + "
257465 | \n", + "Delivered baby, plan for second child after 4-... | \n", + "Do you know how this pills act and how your me... | \n", + "okay so i got this loette pill right its a rea... | \n", + "
257466 | \n", + "Taking loette pill, have started half way thro... | \n", + "Hi thanks for your question your taking contra... | \n", + "taking the mini pill Cerazette and missed taki... | \n", + "
257467 | \n", + "On Cerazette, missed pills twice at night, fol... | \n", + "Hi Cassctiexx Thanks for writing in to Healthc... | \n", + "Hi I recently received the depo-provera shot o... | \n", + "
257468 | \n", + "Excessive tiredness, depression, body aches si... | \n", + "Don't worry abt card, you need advice at this ... | \n", + "i had unprotected sex on 20th sep, n after 5hr... | \n", + "
257469 rows × 3 columns
\n", + "