From ac3d106789637be4b3725a98517df41f38b1186f Mon Sep 17 00:00:00 2001 From: Zhiwei Zhang Date: Thu, 28 Nov 2024 23:18:31 -0800 Subject: [PATCH 1/3] updated outliers check --- analysis/analysis.ipynb | 236 ++++++++++++++++------------------------ 1 file changed, 92 insertions(+), 144 deletions(-) diff --git a/analysis/analysis.ipynb b/analysis/analysis.ipynb index 23ba29c..27ff2f5 100644 --- a/analysis/analysis.ipynb +++ b/analysis/analysis.ipynb @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -411,19 +411,19 @@ "\n", "`citric_acid`: a float column between 0 and 1. Citric acid levels rarely exceed 1. A value of 0 is acceptable but beyond 1.0 may suggest data anomalies. \n", "\n", - "`residual_sugar`: a float column between 0.5 and 60. While white wines have higher residual sugar, 60 is a reasonable upper limit based on the observed range. Values exceeding this are highly unusual and likely outliers.\n", + "`residual_sugar`: a float column between 0.5 and 66. 
While white wines have higher residual sugar, 66 is a reasonable upper limit based on the observed range. Values exceeding this are highly unusual and likely outliers.\n", "\n", - "`chlorides`: a float column between 0.01 and 0.3. The majority of chloride values are tightly distributed below 0.3.\n", + "`chlorides`: a float column between 0.01 and 0.7. The majority of chloride values are tightly distributed below 0.7.\n", "\n", - "`free_sulfur_dioxide`: a float column between 0 and 100. Values beyond 100 are rare and not typical in wines. \n", + "`free_sulfur_dioxide`: a float column between 0 and 200. Values beyond 200 are rare and not typical in wines. \n", "\n", - "`total_sulfur_dioxide`: a float column between 0 and 300. Total sulfur dioxide values above 300 are uncommon and may indicate anomalies in the data or unusual winemaking practices.\n", + "`total_sulfur_dioxide`: a float column between 0 and 400. Total sulfur dioxide values above 300 are uncommon and may indicate anomalies in the data or unusual winemaking practices.\n", "\n", "`density`: a float column between 0.985 and 1.04. Density for wines typically lies within this range.\n", "\n", - "`pH`: a float column between 2.8 and 4. Wine pH is generally between 2.8 and 4.0, consistent with its acidic nature. Values outside this range are rare and potentially invalid. \n", + "`pH`: a float column between 2.5 and 4. Wine pH is generally between 2.5 and 4.0, consistent with its acidic nature. Values outside this range are rare and potentially invalid. \n", "\n", - "`sulphates`: a float column between 0.2 and 1.5. Most sulphates fall within this range; higher values indicate unusual conditions. Lower values below 0.2 are atypical and may require closer inspection.\n", + "`sulphates`: a float column between 0.2 and 1.8. Most sulphates fall within this range; higher values indicate unusual conditions. 
Lower values below 0.2 are atypical and may require closer inspection.\n", "\n", "`alcohol`: a float column between 8 and 15. Wine alcohol content is normally within 8% to 15%. Values outside this range suggest non-standard wine or errors in measurement.\n", "\n", @@ -434,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -459,26 +459,26 @@ " {\n", " \"schema\": null,\n", " \"column\": \"free_sulfur_dioxide\",\n", - " \"check\": \"in_range(0, 100)\",\n", - " \"error\": \"Column 'free_sulfur_dioxide' failed element-wise validator number 0: in_range(0, 100) failure cases: 131.0, 122.5, 118.5, 146.5, 128.0, 110.0, 138.5, 124.0, 105.0, 105.0, 101.0, 101.0, 108.0, 108.0, 112.0, 108.0, 289.0\"\n", + " \"check\": \"in_range(0, 200)\",\n", + " \"error\": \"Column 'free_sulfur_dioxide' failed element-wise validator number 0: in_range(0, 200) failure cases: 289.0\"\n", " },\n", " {\n", " \"schema\": null,\n", " \"column\": \"total_sulfur_dioxide\",\n", - " \"check\": \"in_range(0, 300)\",\n", - " \"error\": \"Column 'total_sulfur_dioxide' failed element-wise validator number 0: in_range(0, 300) failure cases: 313.0, 366.5, 307.5, 344.0, 303.0, 440.0\"\n", + " \"check\": \"in_range(0, 400)\",\n", + " \"error\": \"Column 'total_sulfur_dioxide' failed element-wise validator number 0: in_range(0, 400) failure cases: 440.0\"\n", " },\n", " {\n", " \"schema\": null,\n", " \"column\": \"pH\",\n", - " \"check\": \"in_range(2.8, 4.0)\",\n", - " \"error\": \"Column 'pH' failed element-wise validator number 0: in_range(2.8, 4.0) failure cases: 2.74, 4.01, 4.01, 2.74, 2.72, 2.79, 2.79, 2.77, 2.79\"\n", + " \"check\": \"in_range(2.5, 4.0)\",\n", + " \"error\": \"Column 'pH' failed element-wise validator number 0: in_range(2.5, 4.0) failure cases: 4.01, 4.01\"\n", " },\n", " {\n", " \"schema\": null,\n", " \"column\": \"sulphates\",\n", - " \"check\": \"in_range(0.2, 1.5)\",\n", - " \"error\": \"Column 'sulphates' 
failed element-wise validator number 0: in_range(0.2, 1.5) failure cases: 1.56, 1.95, 1.95, 1.98, 2.0, 1.59, 1.61, 1.62\"\n", + " \"check\": \"in_range(0.2, 1.8)\",\n", + " \"error\": \"Column 'sulphates' failed element-wise validator number 0: in_range(0.2, 1.8) failure cases: 1.95, 1.95, 1.98, 2.0\"\n", " }\n", " ]\n", " }\n", @@ -486,93 +486,29 @@ "Failure Details:\n", " schema_context column check check_number \\\n", "0 Column citric_acid in_range(0.0, 1) 0 \n", - "32 Column pH in_range(2.8, 4.0) 0 \n", - "24 Column total_sulfur_dioxide in_range(0, 300) 0 \n", - "25 Column total_sulfur_dioxide in_range(0, 300) 0 \n", - "26 Column pH in_range(2.8, 4.0) 0 \n", - "27 Column pH in_range(2.8, 4.0) 0 \n", - "28 Column pH in_range(2.8, 4.0) 0 \n", - "29 Column pH in_range(2.8, 4.0) 0 \n", - "30 Column pH in_range(2.8, 4.0) 0 \n", - "31 Column pH in_range(2.8, 4.0) 0 \n", - "33 Column pH in_range(2.8, 4.0) 0 \n", - "22 Column total_sulfur_dioxide in_range(0, 300) 0 \n", - "34 Column pH in_range(2.8, 4.0) 0 \n", - "35 Column sulphates in_range(0.2, 1.5) 0 \n", - "36 Column sulphates in_range(0.2, 1.5) 0 \n", - "37 Column sulphates in_range(0.2, 1.5) 0 \n", - "38 Column sulphates in_range(0.2, 1.5) 0 \n", - "39 Column sulphates in_range(0.2, 1.5) 0 \n", - "40 Column sulphates in_range(0.2, 1.5) 0 \n", - "41 Column sulphates in_range(0.2, 1.5) 0 \n", - "23 Column total_sulfur_dioxide in_range(0, 300) 0 \n", - "21 Column total_sulfur_dioxide in_range(0, 300) 0 \n", "1 Column citric_acid in_range(0.0, 1) 0 \n", - "10 Column free_sulfur_dioxide in_range(0, 100) 0 \n", "2 Column chlorides in_range(0.01, 0.7) 0 \n", - "3 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "4 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "5 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "6 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "7 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "8 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "9 Column free_sulfur_dioxide 
in_range(0, 100) 0 \n", - "11 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "20 Column total_sulfur_dioxide in_range(0, 300) 0 \n", - "12 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "13 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "14 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "15 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "16 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "17 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "18 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "19 Column free_sulfur_dioxide in_range(0, 100) 0 \n", - "42 Column sulphates in_range(0.2, 1.5) 0 \n", + "3 Column free_sulfur_dioxide in_range(0, 200) 0 \n", + "4 Column total_sulfur_dioxide in_range(0, 400) 0 \n", + "5 Column pH in_range(2.5, 4.0) 0 \n", + "6 Column pH in_range(2.5, 4.0) 0 \n", + "7 Column sulphates in_range(0.2, 1.8) 0 \n", + "8 Column sulphates in_range(0.2, 1.8) 0 \n", + "9 Column sulphates in_range(0.2, 1.8) 0 \n", + "10 Column sulphates in_range(0.2, 1.8) 0 \n", "\n", " failure_case index \n", "0 1.660 2344 \n", - "32 2.790 3559 \n", - "24 303.000 4253 \n", - "25 440.000 6344 \n", - "26 2.740 151 \n", - "27 4.010 1316 \n", - "28 4.010 1321 \n", - "29 2.740 2813 \n", - "30 2.720 3499 \n", - "31 2.790 3558 \n", - "33 2.770 3761 \n", - "22 307.500 3530 \n", - "34 2.790 5361 \n", - "35 1.560 13 \n", - "36 1.950 86 \n", - "37 1.950 91 \n", - "38 1.980 92 \n", - "39 2.000 151 \n", - "40 1.590 169 \n", - "41 1.610 226 \n", - "23 344.000 3726 \n", - "21 366.500 3016 \n", "1 1.230 4751 \n", - "10 124.000 4906 \n", "2 0.009 5372 \n", - "3 131.000 1924 \n", - "4 122.500 2258 \n", - "5 118.500 3287 \n", - "6 146.500 3530 \n", - "7 128.000 3933 \n", - "8 110.000 3935 \n", - "9 138.500 4649 \n", - "11 105.000 5060 \n", - "20 313.000 1924 \n", - "12 105.000 5069 \n", - "13 101.000 5119 \n", - "14 101.000 5122 \n", - "15 108.000 5219 \n", - "16 108.000 5460 \n", - "17 112.000 5467 \n", - "18 108.000 5468 \n", - "19 289.000 
6344 \n", - "42 1.620 723 \n" + "3 289.000 6344 \n", + "4 440.000 6344 \n", + "5 4.010 1316 \n", + "6 4.010 1321 \n", + "7 1.950 86 \n", + "8 1.950 91 \n", + "9 1.980 92 \n", + "10 2.000 151 \n" ] } ], @@ -585,11 +521,11 @@ " \"citric_acid\": pa.Column(float, pa.Check.between(0.0, 1)),\n", " \"residual_sugar\": pa.Column(float, pa.Check.between(0.5, 66)),\n", " \"chlorides\": pa.Column(float, pa.Check.between(0.01, 0.7)),\n", - " \"free_sulfur_dioxide\": pa.Column(float, pa.Check.between(0, 100)),\n", - " \"total_sulfur_dioxide\": pa.Column(float, pa.Check.between(0, 300)),\n", + " \"free_sulfur_dioxide\": pa.Column(float, pa.Check.between(0, 200)),\n", + " \"total_sulfur_dioxide\": pa.Column(float, pa.Check.between(0, 400)),\n", " \"density\": pa.Column(float, pa.Check.between(0.985, 1.04)),\n", - " \"pH\": pa.Column(float, pa.Check.between(2.8, 4.0)),\n", - " \"sulphates\": pa.Column(float, pa.Check.between(0.2, 1.5)),\n", + " \"pH\": pa.Column(float, pa.Check.between(2.5, 4.0)),\n", + " \"sulphates\": pa.Column(float, pa.Check.between(0.2, 1.8)),\n", " \"alcohol\": pa.Column(float, pa.Check.between(8.0, 15.0)),\n", " \"quality\": pa.Column(int, pa.Check.between(3, 9)),\n", " \"color\": pa.Column(str, pa.Check.isin([\"red\", \"white\"])), \n", @@ -609,7 +545,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The results suggests that we have several outliers for column `citric_acid`. Considering the outliers don't deviate significantly, we will ignore them in this case." + "Based on the test result, we do have several outliers for feature `citric_acid`, `chlorides`, `free_sulfur_dioxid`, `total_sulfur_dioxide`, `pH`, and `sulphates`. The outliers exhibit minimal deviations from the predefined thresholds and represent only a small fraction of the total dataset. These slight exceedances are unlikely to significantly influence the analysis or model performance. Therefore, we will ignore them in this case." 
] }, { @@ -623,7 +559,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -634,7 +570,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -674,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -877,7 +813,7 @@ "max 4.010000 2.000000 14.200000 9.000000 " ] }, - "execution_count": 12, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -889,7 +825,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -904,23 +840,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.ConcatChart(...)" ] }, - "execution_count": 13, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -987,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1003,23 +939,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.ConcatChart(...)" ] }, - "execution_count": 14, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1093,9 +1029,21 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'deepchecks'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[29], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# imports for correlation anomality validations\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepchecks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtabular\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepchecks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtabular\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchecks\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureLabelCorrelation, FeatureFeatureCorrelation, PredictionDrift\n\u001b[1;32m 5\u001b[0m wine_train_ds \u001b[38;5;241m=\u001b[39m Dataset(train_df, label\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolor\u001b[39m\u001b[38;5;124m\"\u001b[39m, cat_features\u001b[38;5;241m=\u001b[39m[])\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'deepchecks'" + ] + } + ], "source": [ "# imports for correlation anomality validations\n", "from deepchecks.tabular import Dataset\n", @@ -1132,7 +1080,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1154,7 +1102,7 @@ }, { "cell_type": "code", - "execution_count": 17, + 
"execution_count": null, "metadata": {}, "outputs": [ { @@ -1313,7 +1261,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1325,7 +1273,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1343,7 +1291,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1432,7 +1380,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1550,7 +1498,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1693,7 +1641,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1819,7 +1767,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1846,7 +1794,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1908,7 +1856,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1943,7 +1891,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [ { From de0e685a6a83c74a8df72fe33b793b14589014a7 Mon Sep 17 00:00:00 2001 From: Zhiwei Zhang Date: Fri, 29 Nov 2024 11:09:23 -0800 Subject: [PATCH 2/3] updated outlier checks and category level check --- analysis/analysis.ipynb | 210 ++++++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 93 deletions(-) diff --git a/analysis/analysis.ipynb b/analysis/analysis.ipynb index 27ff2f5..f8d567e 100644 --- a/analysis/analysis.ipynb +++ b/analysis/analysis.ipynb @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 2, "metadata": {}, "outputs": [], "source": 
[ @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -300,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -417,7 +417,7 @@ "\n", "`free_sulfur_dioxide`: a float column between 0 and 200. Values beyond 200 are rare and not typical in wines. \n", "\n", - "`total_sulfur_dioxide`: a float column between 0 and 400. Total sulfur dioxide values above 300 are uncommon and may indicate anomalies in the data or unusual winemaking practices.\n", + "`total_sulfur_dioxide`: a float column between 0 and 400. Total sulfur dioxide values above 400 are uncommon and may indicate anomalies in the data or unusual winemaking practices.\n", "\n", "`density`: a float column between 0.985 and 1.04. Density for wines typically lies within this range.\n", "\n", @@ -427,14 +427,12 @@ "\n", "`alcohol`: a float column between 8 and 15. Wine alcohol content is normally within 8% to 15%. Values outside this range suggest non-standard wine or errors in measurement.\n", "\n", - "`quality`: a float column between 3 and 9. The wine quality score ranges from 3 to 9 based on the dataset. \n", - "\n", - "`color`: a categorical column with levels \"red\" and \"white\".\n" + "`quality`: a float column between 3 and 9. 
The wine quality score ranges from 3 to 9 based on the dataset. \n" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -528,7 +526,7 @@ " \"sulphates\": pa.Column(float, pa.Check.between(0.2, 1.8)),\n", " \"alcohol\": pa.Column(float, pa.Check.between(8.0, 15.0)),\n", " \"quality\": pa.Column(int, pa.Check.between(3, 9)),\n", - " \"color\": pa.Column(str, pa.Check.isin([\"red\", \"white\"])), \n", + " \"color\": pa.Column(str, pa.Check.isin([\"red\", \"white\"]))\n", " }\n", ")\n", "\n", @@ -548,6 +546,44 @@ "Based on the test result, we do have several outliers for feature `citric_acid`, `chlorides`, `free_sulfur_dioxid`, `total_sulfur_dioxide`, `pH`, and `sulphates`. The outliers exhibit minimal deviations from the predefined thresholds and represent only a small fraction of the total dataset. These slight exceedances are unlikely to significantly influence the analysis or model performance. Therefore, we will ignore them in this case." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7. Check Category levels\n", + "The following will check if the categorical feature has any string mismatch or single value. For this data set, there's one categorical feature `color`, which should have levels of \"red\" and \"white\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test pass!\n" + ] + } + ], + "source": [ + "# Category levels check\n", + "schema = pa.DataFrameSchema(\n", + " {\n", + " \"color\": pa.Column(str, pa.Check.isin([\"red\", \"white\"]))\n", + " }\n", + ")\n", + "\n", + "try:\n", + " schema.validate(wine, lazy = True)\n", + " print(\"Test pass!\")\n", + "except pa.errors.SchemaErrors as e:\n", + " print(\"Test fail:\", e)\n", + " print(\"Failure Details:\")\n", + " print(e.failure_cases)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -559,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -570,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -610,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -813,7 +849,7 @@ "max 4.010000 2.000000 14.200000 9.000000 " ] }, - "execution_count": 26, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -825,7 +861,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -840,23 +876,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.ConcatChart(...)" ] }, - "execution_count": 27, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -923,7 +959,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -939,23 +975,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.ConcatChart(...)" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1029,21 +1065,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 17, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'deepchecks'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[29], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# imports for correlation anomality validations\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepchecks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtabular\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdeepchecks\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtabular\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mchecks\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureLabelCorrelation, FeatureFeatureCorrelation, PredictionDrift\n\u001b[1;32m 5\u001b[0m wine_train_ds \u001b[38;5;241m=\u001b[39m Dataset(train_df, label\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcolor\u001b[39m\u001b[38;5;124m\"\u001b[39m, cat_features\u001b[38;5;241m=\u001b[39m[])\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'deepchecks'" - ] - } - ], + "outputs": [], "source": [ "# imports for correlation anomality validations\n", "from deepchecks.tabular import Dataset\n", @@ -1080,7 +1104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1102,7 +1126,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 19, "metadata": {}, "outputs": [ { @@ -1249,7 +1273,7 @@ "1104 12.4 6 red " ] }, - "execution_count": 17, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1261,7 +1285,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -1273,7 +1297,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -1291,7 +1315,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1365,7 +1389,7 @@ " random_state=123))])" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1380,7 +1404,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1411,12 +1435,12 @@ " \n", " \n", " fit_time\n", - " 0.008\n", - " 0.002\n", + " 0.045\n", + " 0.028\n", " \n", " \n", " score_time\n", - " 0.012\n", + " 0.017\n", " 0.005\n", " \n", " \n", @@ -1465,8 +1489,8 @@ ], "text/plain": [ " mean std\n", - "fit_time 0.008 0.002\n", - "score_time 0.012 0.005\n", + "fit_time 0.045 0.028\n", + "score_time 0.017 0.005\n", "test_accuracy 0.994 0.003\n", "train_accuracy 0.994 0.001\n", "test_precision 0.987 0.006\n", @@ -1477,7 +1501,7 @@ "train_f1 0.988 0.001" ] }, - "execution_count": 21, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1498,7 +1522,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1533,7 +1557,7 @@ " max_iter=1000,\n", " random_state=123))]),\n", " n_jobs=-1,\n", - " param_distributions={'logisticregression__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x36f232490>},\n", + " param_distributions={'logisticregression__C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x32bfbc510>},\n", 
" random_state=123, return_train_score=True,\n", " scoring=make_scorer(f1_score, pos_label=red), verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
@@ -8763,7 +9031,7 @@

Modelling

@@ -9400,7 +9264,7 @@

Modelling