diff --git a/additional_data_exploration.ipynb b/additional_data_exploration.ipynb new file mode 100644 index 0000000..9863f02 --- /dev/null +++ b/additional_data_exploration.ipynb @@ -0,0 +1,1423 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summarizer" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# tokenizer = AutoTokenizer.from_pretrained(\"google/bigbird-pegasus-large-pubmed\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# # by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64\n", + "# model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\")\n", + "\n", + "# # decoder attention type can't be changed & will be \"original_full\"\n", + "# # you can change `attention_type` (encoder only) to full attention like this:\n", + "# # model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\", attention_type=\"original_full\")\n", + "\n", + "# # you can change `block_size` & `num_random_blocks` like this:\n", + "# # model = BigBirdPegasusForConditionalGeneration.from_pretrained(\"google/bigbird-pegasus-large-pubmed\", block_size=16, num_random_blocks=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# text = \"Our patient is a 78-year-old male with a past medical history of cutaneous T-cell lymphoma/mycosis fungoides (on regular outpatient extracorporeal photopheresis), type II diabetes mellitus, atrial flutter on Xarelto, and sick sinus 
syndrome on dual-chamber pacemaker, presented to the hospital with right upper quadrant abdominal pain. The patient was a former smoker and denied any alcohol use.\\nIn the emergency department, he was hemodynamically stable. Laboratory workup was significant for abnormally elevated liver function tests including aspartate aminotransferase/alanine aminotransferase (AST/ALT) of 204/188 U/L, alkaline phosphatase (ALP) of 550 U/L, and total bilirubin of 2.5 mg/dL. Ultrasound of the abdomen was negative for any focal liver or gallbladder lesions. There was no evidence of intrahepatic or extrahepatic biliary duct dilation. Hepatobiliary iminodiacetic acid (HIDA) scan was normal, and hence cholecystitis was ruled out. CT abdomen and pelvis and CT angiography of the chest were negative for acute pathology. As the patient had a pacemaker, magnetic resonance cholangiopancreatography (MRCP) could not be performed. Further laboratory evaluation for elevated liver enzymes, including viral hepatitis panel, thyroid-stimulating hormone (TSH), iron panel, antinuclear antibody (ANA), anti-mitochondrial antibody, alpha-1-antitrypsin antibody, anti-smooth muscle antibody, and ceruloplasmin was negative.\\nGiven that the patient has a history of cutaneous T-cell lymphoma, the important differential diagnosis included leukemic infiltration of the liver and adverse reaction to the prior chemotherapy. However, the patient received only a short course of the chemotherapeutic regimen mogamulizumab (due to insurance issues), and hence it was unlikely to cause this current clinical picture. Subsequently, a percutaneous liver biopsy was performed to confirm the diagnosis, which showed replacement of the normal liver parenchymal cells by high-grade tumor cells with a high nuclear-cytoplasmic ratio (Figures -). 
The tumor cells showed positive immunohistochemical staining for cytokeratin AE1/AE3, cytokeratin 20 (CK20), synaptophysin, chromogranin, and negative for CK7, caudal type homeobox transcription factor 2 (CDX-2), and thyroid transcription factor 1 (TTF-1) (Figures -). All these features were suggestive of metastatic Merkel cell carcinoma. There was no evidence of leukemic infiltrates. As the patient had no evidence of MCC involvement of the skin, he was diagnosed with metastatic MCC of the liver of unknown primary.\\nHematology/Oncology and Dermatology was consulted. Considering the medical comorbidities, the patient and family opted for comfort care measures and were discharged home.\"\n", + "# inputs = tokenizer(text, return_tensors='pt')\n", + "# prediction = model.generate(**inputs)\n", + "# prediction = tokenizer.batch_decode(prediction)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading pytorch_model.bin: 5%|▍ | 105M/2.31G [00:20<01:11, 30.8MB/s]" + ] + } + ], + "source": [ + "from transformers import pipeline\n", + "\n", + "summarizer = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"Our patient is a 78-year-old male with a past medical history of cutaneous T-cell lymphoma/mycosis fungoides (on regular outpatient extracorporeal photopheresis), type II diabetes mellitus, atrial flutter on Xarelto, and sick sinus syndrome on dual-chamber pacemaker, presented to the hospital with right upper quadrant abdominal pain. The patient was a former smoker and denied any alcohol use.\\nIn the emergency department, he was hemodynamically stable. 
Laboratory workup was significant for abnormally elevated liver function tests including aspartate aminotransferase/alanine aminotransferase (AST/ALT) of 204/188 U/L, alkaline phosphatase (ALP) of 550 U/L, and total bilirubin of 2.5 mg/dL. Ultrasound of the abdomen was negative for any focal liver or gallbladder lesions. There was no evidence of intrahepatic or extrahepatic biliary duct dilation. Hepatobiliary iminodiacetic acid (HIDA) scan was normal, and hence cholecystitis was ruled out. CT abdomen and pelvis and CT angiography of the chest were negative for acute pathology. As the patient had a pacemaker, magnetic resonance cholangiopancreatography (MRCP) could not be performed. Further laboratory evaluation for elevated liver enzymes, including viral hepatitis panel, thyroid-stimulating hormone (TSH), iron panel, antinuclear antibody (ANA), anti-mitochondrial antibody, alpha-1-antitrypsin antibody, anti-smooth muscle antibody, and ceruloplasmin was negative.\\nGiven that the patient has a history of cutaneous T-cell lymphoma, the important differential diagnosis included leukemic infiltration of the liver and adverse reaction to the prior chemotherapy. However, the patient received only a short course of the chemotherapeutic regimen mogamulizumab (due to insurance issues), and hence it was unlikely to cause this current clinical picture. Subsequently, a percutaneous liver biopsy was performed to confirm the diagnosis, which showed replacement of the normal liver parenchymal cells by high-grade tumor cells with a high nuclear-cytoplasmic ratio (Figures -). The tumor cells showed positive immunohistochemical staining for cytokeratin AE1/AE3, cytokeratin 20 (CK20), synaptophysin, chromogranin, and negative for CK7, caudal type homeobox transcription factor 2 (CDX-2), and thyroid transcription factor 1 (TTF-1) (Figures -). All these features were suggestive of metastatic Merkel cell carcinoma. There was no evidence of leukemic infiltrates. 
def load_json(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform locale encoding. The condition names used as keys in this
    # notebook contain accented characters (e.g. 'Épiglottite'), which a
    # non-UTF-8 default would mis-decode or reject.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
"cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGEDIFFERENTIAL_DIAGNOSISSEXPATHOLOGYEVIDENCESINITIAL_EVIDENCE
018[['Bronchite', 0.19171203430383882], ['Pneumon...MIVRS ou virémie['crowd', 'diaph', 'douleurxx', 'douleurxx_car...fievre
121[['VIH (Primo-infection)', 0.5189500564407601]...MVIH (Primo-infection)['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '...diaph
219[['Bronchite', 0.11278064619119596], ['Pneumon...FPneumonie['douleurxx', 'douleurxx_carac_@_un_coup_de_co...expecto
334[['IVRS ou virémie', 0.23859396799565236], ['C...FIVRS ou virémie['crowd', 'douleurxx', 'douleurxx_carac_@_une_...douleurxx
436[['IVRS ou virémie', 0.23677812769175735], ['P...MIVRS ou virémie['dayc', 'diaph', 'douleurxx', 'douleurxx_cara...toux
.....................
102559718[['Épiglottite', 0.28156957795466475], ['VIH (...MÉpiglottite['bw_bending', 'douleurxx', 'douleurxx_carac_@...fievre
102559828[['Épiglottite', 0.3703962237298842], ['Laryng...FÉpiglottite['douleurxx', 'douleurxx_carac_@_vive', 'doule...fievre
10255990[['Épiglottite', 0.13193905052537108], ['Laryn...FÉpiglottite['bw_bending', 'douleurxx', 'douleurxx_carac_@...stridor
102560026[['Épiglottite', 0.3028258988138983], ['Laryng...FÉpiglottite['douleurxx', 'douleurxx_carac_@_un_coup_de_co...stridor
102560125[['Épiglottite', 0.12896823203696775], ['Laryn...FÉpiglottite['douleurxx', 'douleurxx_carac_@_un_coup_de_co...douleurxx
\n", + "

1025602 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " AGE DIFFERENTIAL_DIAGNOSIS SEX \\\n", + "0 18 [['Bronchite', 0.19171203430383882], ['Pneumon... M \n", + "1 21 [['VIH (Primo-infection)', 0.5189500564407601]... M \n", + "2 19 [['Bronchite', 0.11278064619119596], ['Pneumon... F \n", + "3 34 [['IVRS ou virémie', 0.23859396799565236], ['C... F \n", + "4 36 [['IVRS ou virémie', 0.23677812769175735], ['P... M \n", + "... ... ... .. \n", + "1025597 18 [['Épiglottite', 0.28156957795466475], ['VIH (... M \n", + "1025598 28 [['Épiglottite', 0.3703962237298842], ['Laryng... F \n", + "1025599 0 [['Épiglottite', 0.13193905052537108], ['Laryn... F \n", + "1025600 26 [['Épiglottite', 0.3028258988138983], ['Laryng... F \n", + "1025601 25 [['Épiglottite', 0.12896823203696775], ['Laryn... F \n", + "\n", + " PATHOLOGY \\\n", + "0 IVRS ou virémie \n", + "1 VIH (Primo-infection) \n", + "2 Pneumonie \n", + "3 IVRS ou virémie \n", + "4 IVRS ou virémie \n", + "... ... \n", + "1025597 Épiglottite \n", + "1025598 Épiglottite \n", + "1025599 Épiglottite \n", + "1025600 Épiglottite \n", + "1025601 Épiglottite \n", + "\n", + " EVIDENCES INITIAL_EVIDENCE \n", + "0 ['crowd', 'diaph', 'douleurxx', 'douleurxx_car... fievre \n", + "1 ['adp_dlr', 'atcd_its', 'diaph', 'diarrhee', '... diaph \n", + "2 ['douleurxx', 'douleurxx_carac_@_un_coup_de_co... expecto \n", + "3 ['crowd', 'douleurxx', 'douleurxx_carac_@_une_... douleurxx \n", + "4 ['dayc', 'diaph', 'douleurxx', 'douleurxx_cara... toux \n", + "... ... ... \n", + "1025597 ['bw_bending', 'douleurxx', 'douleurxx_carac_@... fievre \n", + "1025598 ['douleurxx', 'douleurxx_carac_@_vive', 'doule... fievre \n", + "1025599 ['bw_bending', 'douleurxx', 'douleurxx_carac_@... stridor \n", + "1025600 ['douleurxx', 'douleurxx_carac_@_un_coup_de_co... stridor \n", + "1025601 ['douleurxx', 'douleurxx_carac_@_un_coup_de_co... 
def _parse_evidences(raw):
    """Return a patient's evidence codes as a list of strings.

    The EVIDENCES column is read from CSV, so each cell is the *repr* of a
    Python list (e.g. "['fievre', 'douleurxx_intens_@_3']"), not a delimited
    string. An already-materialised list/tuple is accepted as-is; anything
    else (such as NaN for a missing cell) yields an empty list.
    """
    import ast  # local import: only needed here, keeps the cell self-contained

    if isinstance(raw, str):
        stripped = raw.strip()
        parsed = ast.literal_eval(stripped) if stripped else []
    elif isinstance(raw, (list, tuple)):
        parsed = raw
    else:
        parsed = []

    # A bare quoted string would otherwise be iterated character by character.
    if isinstance(parsed, str):
        parsed = [parsed]
    return [str(code) for code in parsed]


def generate_medical_prompts(patients, evidences, conditions, rng=None):
    """Build one instruction/context/response prompt per patient row.

    For every patient, one evidence code is drawn at random from the
    patient's EVIDENCES, its English question text is looked up in
    ``evidences`` and embedded in the instruction.

    Parameters
    ----------
    patients : pandas.DataFrame
        Rows must provide the columns AGE, SEX, PATHOLOGY, EVIDENCES and
        INITIAL_EVIDENCE.
    evidences : dict
        Maps an evidence code to its definition; the definition's
        'question_en' entry is used when present.
    conditions : dict
        Accepted for backward compatibility with existing callers; it is not
        consulted, because the response uses the PATHOLOGY value directly.
    rng : random.Random, optional
        Source of randomness for picking the evidence. Defaults to the
        module-level ``random`` functions; pass ``random.Random(seed)`` for
        reproducible prompts.

    Returns
    -------
    list of str
        One prompt per row, in row order.

    Raises
    ------
    ValueError, SyntaxError
        If an EVIDENCES cell is a string that is not a valid Python literal.
    """
    chooser = rng if rng is not None else random
    prompts = []

    for _, patient in patients.iterrows():
        codes = _parse_evidences(patient['EVIDENCES'])

        if codes:
            chosen_code = chooser.choice(codes)
            # Valued evidences are encoded as '<name>_@_<value>'; only the
            # '<name>' part can match a key of the evidence definitions.
            base_code = chosen_code.partition('_@_')[0]
            evidence_details = evidences.get(chosen_code) or evidences.get(base_code, {})
        else:
            # No evidence to draw from: fall back to the 'N/A' placeholder
            # instead of letting random.choice raise IndexError.
            evidence_details = {}

        # Create the instruction based on the evidence
        instruction = f"Based on the following evidence: {evidence_details.get('question_en', 'N/A')}, what could be the potential diagnosis?"

        # Context includes the patient's demographic data and initial evidence
        context = f"Patient Age: {patient['AGE']}, Sex: {patient['SEX']}, Initial Evidence: {patient['INITIAL_EVIDENCE']}"

        # Response is the ground truth pathology
        response = patient['PATHOLOGY']

        # Unify into a single prompt
        prompts.append(f"Instruction: {instruction}\nContext: {context}\nResponse: {response}")

    return prompts
"outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['description', 'utterances'],\n", + " num_rows: 482\n", + " })\n", + " validation: Dataset({\n", + " features: ['description', 'utterances'],\n", + " num_rows: 60\n", + " })\n", + " test: Dataset({\n", + " features: ['description', 'utterances'],\n", + " num_rows: 61\n", + " })\n", + "})" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dialogues" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['description', 'utterances'],\n", + " num_rows: 482\n", + "})" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dialogues['train']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['description', 'utterances'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dialogues['train'][0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'throat a bit sore and want to get a good imune booster, especially in light of the virus. please advise. 
def _strip_speaker(utterance, speaker):
    """Drop a leading '<speaker>: ' tag from an utterance, leaving the rest untouched."""
    tag = f"{speaker}: "
    # Only the leading tag is removed; str.replace would also delete the same
    # text wherever it appears later inside the message.
    return utterance[len(tag):] if utterance.startswith(tag) else utterance


# Flatten each training dialogue into its opening patient/doctor exchange.
# Only the first two utterances are used; later turns are ignored.
diag_list = []
for record in dialogues['train']:
    turns = record["utterances"]
    diag_list.append({
        # Guarded access: a record with fewer than two utterances yields an
        # empty string instead of raising IndexError.
        "patient": _strip_speaker(turns[0], "patient") if len(turns) > 0 else "",
        "doctor": _strip_speaker(turns[1], "doctor") if len(turns) > 1 else "",
    })

# NOTE(review): a later cell displays a variable named `diag`, which no cell
# in the visible part of this notebook defines — confirm it survives
# Restart & Run All.
dialogue_df = pd.DataFrame(diag_list)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
patientdoctor
0throat a bit sore and want to get a good imune...during this pandemic. throat pain can be from ...
1hey there i have had cold \"symptoms\" for over ...yes. protection. it is not enough symptoms to ...
2i have a tight and painful chest with a dry co...possible. top symptoms include fever, dry coug...
3what will happen after the incubation period f...in brief: symptoms if you are infected, sympto...
4just found out i was pregnant. yesterday diagn...thanks for your question on healthcare magic.i...
.........
477my 5 year old son woke up not feeling well. i ...in brief: arrange testing stay home, provide f...
478i have a dry cough and sore throat- it's been ...in brief: covid good guidelines can be found a...
479how do i know if i have a normal cold or maybe...common cold with sin. the corona virus causes ...
480hi- i was diagnosed a month ago with community...hello! just because you have previously had a ...
481i have a aunt that is in the hospital at st vi...hello welcome to the health care magic your co...
\n", + "

482 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " patient \\\n", + "0 throat a bit sore and want to get a good imune... \n", + "1 hey there i have had cold \"symptoms\" for over ... \n", + "2 i have a tight and painful chest with a dry co... \n", + "3 what will happen after the incubation period f... \n", + "4 just found out i was pregnant. yesterday diagn... \n", + ".. ... \n", + "477 my 5 year old son woke up not feeling well. i ... \n", + "478 i have a dry cough and sore throat- it's been ... \n", + "479 how do i know if i have a normal cold or maybe... \n", + "480 hi- i was diagnosed a month ago with community... \n", + "481 i have a aunt that is in the hospital at st vi... \n", + "\n", + " doctor \n", + "0 during this pandemic. throat pain can be from ... \n", + "1 yes. protection. it is not enough symptoms to ... \n", + "2 possible. top symptoms include fever, dry coug... \n", + "3 in brief: symptoms if you are infected, sympto... \n", + "4 thanks for your question on healthcare magic.i... \n", + ".. ... \n", + "477 in brief: arrange testing stay home, provide f... \n", + "478 in brief: covid good guidelines can be found a... \n", + "479 common cold with sin. the corona virus causes ... \n", + "480 hello! just because you have previously had a ... \n", + "481 hello welcome to the health care magic your co... \n", + "\n", + "[482 rows x 2 columns]" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dialogue_df" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910111213141516
0patient: throat a bit sore and want to get a g...doctor: during this pandemic. throat pain can ...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1patient: hey there i have had cold \"symptoms\" ...doctor: yes. protection. it is not enough symp...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2patient: i have a tight and painful chest with...doctor: possible. top symptoms include fever, ...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3patient: what will happen after the incubation...doctor: in brief: symptoms if you are infected...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4patient: just found out i was pregnant. yester...doctor: thanks for your question on healthcare...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
......................................................
477patient: my 5 year old son woke up not feeling...doctor: in brief: arrange testing stay home, p...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
478patient: i have a dry cough and sore throat- i...doctor: in brief: covid good guidelines can be...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
479patient: how do i know if i have a normal cold...doctor: common cold with sin. the corona virus...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
480patient: hi- i was diagnosed a month ago with ...doctor: hello! just because you have previousl...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
481patient: i have a aunt that is in the hospital...doctor: hello welcome to the health care magic...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

482 rows × 17 columns

\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "0 patient: throat a bit sore and want to get a g... \n", + "1 patient: hey there i have had cold \"symptoms\" ... \n", + "2 patient: i have a tight and painful chest with... \n", + "3 patient: what will happen after the incubation... \n", + "4 patient: just found out i was pregnant. yester... \n", + ".. ... \n", + "477 patient: my 5 year old son woke up not feeling... \n", + "478 patient: i have a dry cough and sore throat- i... \n", + "479 patient: how do i know if i have a normal cold... \n", + "480 patient: hi- i was diagnosed a month ago with ... \n", + "481 patient: i have a aunt that is in the hospital... \n", + "\n", + " 1 2 3 4 5 \\\n", + "0 doctor: during this pandemic. throat pain can ... NaN NaN NaN NaN \n", + "1 doctor: yes. protection. it is not enough symp... NaN NaN NaN NaN \n", + "2 doctor: possible. top symptoms include fever, ... NaN NaN NaN NaN \n", + "3 doctor: in brief: symptoms if you are infected... NaN NaN NaN NaN \n", + "4 doctor: thanks for your question on healthcare... NaN NaN NaN NaN \n", + ".. ... ... ... ... ... \n", + "477 doctor: in brief: arrange testing stay home, p... NaN NaN NaN NaN \n", + "478 doctor: in brief: covid good guidelines can be... NaN NaN NaN NaN \n", + "479 doctor: common cold with sin. the corona virus... NaN NaN NaN NaN \n", + "480 doctor: hello! just because you have previousl... NaN NaN NaN NaN \n", + "481 doctor: hello welcome to the health care magic... NaN NaN NaN NaN \n", + "\n", + " 6 7 8 9 10 11 12 13 14 15 16 \n", + "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + ".. ... ... ... ... ... ... ... ... ... ... ... 
\n", + "477 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "478 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "479 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "480 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "481 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[482 rows x 17 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diag" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Diagnose-ME" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.kaggle.com/datasets/dsxavier/diagnoise-me" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_json(\"diagnose-me/en_medical_dialog.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idDescriptionDoctorPatient
00Q. What does abutment of the nerve root mean?Hi. I have gone through your query with dilige...Hi doctor,I am just wondering what is abutting...
11Q. Every time I eat spicy food, I poop blood. ...Hello. I have gone through your information an...Hi doctor, I am a 26 year old male. I am 5 fee...
22Q. Will Nano-Leo give permanent solution for e...Hi. For further doubts consult a sexologist on...Hello doctor, I am 48 years old. I am experien...
33Q. Will Kalarchikai cure multiple ovarian cyst...Hello. I just read your query. See Kalarachi K...Hello doctor, I have multiple small cysts in b...
44Q. I masturbate only by rubbing the tip of the...Hi. For further doubts consult a sexologist on...Hi doctor, During masturbation I just rub the ...
...............
257464257464Unprotected sex after periods, took morning af...Hormonal method of birth control like pills an...Hello, I am , age 26 years old. On 7th of may,...
257465257465Delivered baby, plan for second child after 4-...Do you know how this pills act and how your me...okay so i got this loette pill right its a rea...
257466257466Taking loette pill, have started half way thro...Hi thanks for your question your taking contra...taking the mini pill Cerazette and missed taki...
257467257467On Cerazette, missed pills twice at night, fol...Hi Cassctiexx Thanks for writing in to Healthc...Hi I recently received the depo-provera shot o...
257468257468Excessive tiredness, depression, body aches si...Don't worry abt card, you need advice at this ...i had unprotected sex on 20th sep, n after 5hr...
\n", + "

257469 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id Description \\\n", + "0 0 Q. What does abutment of the nerve root mean? \n", + "1 1 Q. Every time I eat spicy food, I poop blood. ... \n", + "2 2 Q. Will Nano-Leo give permanent solution for e... \n", + "3 3 Q. Will Kalarchikai cure multiple ovarian cyst... \n", + "4 4 Q. I masturbate only by rubbing the tip of the... \n", + "... ... ... \n", + "257464 257464 Unprotected sex after periods, took morning af... \n", + "257465 257465 Delivered baby, plan for second child after 4-... \n", + "257466 257466 Taking loette pill, have started half way thro... \n", + "257467 257467 On Cerazette, missed pills twice at night, fol... \n", + "257468 257468 Excessive tiredness, depression, body aches si... \n", + "\n", + " Doctor \\\n", + "0 Hi. I have gone through your query with dilige... \n", + "1 Hello. I have gone through your information an... \n", + "2 Hi. For further doubts consult a sexologist on... \n", + "3 Hello. I just read your query. See Kalarachi K... \n", + "4 Hi. For further doubts consult a sexologist on... \n", + "... ... \n", + "257464 Hormonal method of birth control like pills an... \n", + "257465 Do you know how this pills act and how your me... \n", + "257466 Hi thanks for your question your taking contra... \n", + "257467 Hi Cassctiexx Thanks for writing in to Healthc... \n", + "257468 Don't worry abt card, you need advice at this ... \n", + "\n", + " Patient \n", + "0 Hi doctor,I am just wondering what is abutting... \n", + "1 Hi doctor, I am a 26 year old male. I am 5 fee... \n", + "2 Hello doctor, I am 48 years old. I am experien... \n", + "3 Hello doctor, I have multiple small cysts in b... \n", + "4 Hi doctor, During masturbation I just rub the ... \n", + "... ... \n", + "257464 Hello, I am , age 26 years old. On 7th of may,... \n", + "257465 okay so i got this loette pill right its a rea... \n", + "257466 taking the mini pill Cerazette and missed taki... \n", + "257467 Hi I recently received the depo-provera shot o... 
\n", + "257468 i had unprotected sex on 20th sep, n after 5hr... \n", + "\n", + "[257469 rows x 4 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Strip URLs from the doctor replies. regex=True is required: pandas >= 2.0\n", + "# treats the str.replace pattern as a literal string by default.\n", + "data[\"Doctor\"] = data[\"Doctor\"].str.replace(r'\\s*https?://\\S+(\\s+|$)', ' ', regex=True).str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Strip URLs from the patient messages (regex=True: the pattern is a regular expression).\n", + "data[\"Patient\"] = data[\"Patient\"].str.replace(r'\\s*https?://\\S+(\\s+|$)', ' ', regex=True).str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Strip URLs from the question summaries (regex=True: the pattern is a regular expression).\n", + "data[\"Description\"] = data[\"Description\"].str.replace(r'\\s*https?://\\S+(\\s+|$)', ' ', regex=True).str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the redundant id column and shorten the column names. Written without\n", + "# inplace mutation or positional renaming so the cell can be re-run safely.\n", + "data = data.drop(columns=[\"id\"], errors=\"ignore\").rename(\n", + "    columns={\"Description\": \"desc\", \"Doctor\": \"doctor\", \"Patient\": \"patient\"}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descdoctorpatient
0Q. What does abutment of the nerve root mean?Hi. I have gone through your query with dilige...Hi doctor,I am just wondering what is abutting...
1Q. Every time I eat spicy food, I poop blood. ...Hello. I have gone through your information an...Hi doctor, I am a 26 year old male. I am 5 fee...
2Q. Will Nano-Leo give permanent solution for e...Hi. For further doubts consult a sexologist on...Hello doctor, I am 48 years old. I am experien...
3Q. Will Kalarchikai cure multiple ovarian cyst...Hello. I just read your query. See Kalarachi K...Hello doctor, I have multiple small cysts in b...
4Q. I masturbate only by rubbing the tip of the...Hi. For further doubts consult a sexologist on...Hi doctor, During masturbation I just rub the ...
............
257464Unprotected sex after periods, took morning af...Hormonal method of birth control like pills an...Hello, I am , age 26 years old. On 7th of may,...
257465Delivered baby, plan for second child after 4-...Do you know how this pills act and how your me...okay so i got this loette pill right its a rea...
257466Taking loette pill, have started half way thro...Hi thanks for your question your taking contra...taking the mini pill Cerazette and missed taki...
257467On Cerazette, missed pills twice at night, fol...Hi Cassctiexx Thanks for writing in to Healthc...Hi I recently received the depo-provera shot o...
257468Excessive tiredness, depression, body aches si...Don't worry abt card, you need advice at this ...i had unprotected sex on 20th sep, n after 5hr...
\n", + "

257469 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " desc \\\n", + "0 Q. What does abutment of the nerve root mean? \n", + "1 Q. Every time I eat spicy food, I poop blood. ... \n", + "2 Q. Will Nano-Leo give permanent solution for e... \n", + "3 Q. Will Kalarchikai cure multiple ovarian cyst... \n", + "4 Q. I masturbate only by rubbing the tip of the... \n", + "... ... \n", + "257464 Unprotected sex after periods, took morning af... \n", + "257465 Delivered baby, plan for second child after 4-... \n", + "257466 Taking loette pill, have started half way thro... \n", + "257467 On Cerazette, missed pills twice at night, fol... \n", + "257468 Excessive tiredness, depression, body aches si... \n", + "\n", + " doctor \\\n", + "0 Hi. I have gone through your query with dilige... \n", + "1 Hello. I have gone through your information an... \n", + "2 Hi. For further doubts consult a sexologist on... \n", + "3 Hello. I just read your query. See Kalarachi K... \n", + "4 Hi. For further doubts consult a sexologist on... \n", + "... ... \n", + "257464 Hormonal method of birth control like pills an... \n", + "257465 Do you know how this pills act and how your me... \n", + "257466 Hi thanks for your question your taking contra... \n", + "257467 Hi Cassctiexx Thanks for writing in to Healthc... \n", + "257468 Don't worry abt card, you need advice at this ... \n", + "\n", + " patient \n", + "0 Hi doctor,I am just wondering what is abutting... \n", + "1 Hi doctor, I am a 26 year old male. I am 5 fee... \n", + "2 Hello doctor, I am 48 years old. I am experien... \n", + "3 Hello doctor, I have multiple small cysts in b... \n", + "4 Hi doctor, During masturbation I just rub the ... \n", + "... ... \n", + "257464 Hello, I am , age 26 years old. On 7th of may,... \n", + "257465 okay so i got this loette pill right its a rea... \n", + "257466 taking the mini pill Cerazette and missed taki... \n", + "257467 Hi I recently received the depo-provera shot o... 
\n", + "257468 i had unprotected sex on 20th sep, n after 5hr... \n", + "\n", + "[257469 rows x 3 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "adl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}