OtterLawyer · OtterLawyer · Feb 26, 2024 · Feb 27, 2024
diff --git a/lecture_3_unsupervised_learning/homework/Cluster_tSNE.ipynb b/lecture_3_unsupervised_learning/homework/Cluster_tSNE.ipynb
diff --git a/lecture_3_unsupervised_learning/homework/__pycache__/my_awesome_eda.cpython-39.pyc b/lecture_3_unsupervised_learning/homework/__pycache__/my_awesome_eda.cpython-39.pyc
diff --git a/lecture_3_unsupervised_learning/homework/labeled_fc_data.csv b/lecture_3_unsupervised_learning/homework/labeled_fc_data.csv
diff --git a/lecture_3_unsupervised_learning/homework/my_awesome_eda.py b/lecture_3_unsupervised_learning/homework/my_awesome_eda.py
@@ -0,0 +1,87 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+
+def run_eda(df):
+    import pip
+    import sys
+    try:
+        __import__('cowsay')
+    except ImportError:
+        pip.main(['install', 'cowsay'])
+
+    import cowsay
+
+    cowsay.cow('Namaste')
+
+    print(f'There are {df.shape[0]} rows and {df.shape[1]} columns')
+    print('-------------')
+
+    cols = df.columns
+    DATA_TYPES_MAP = {'int64': 'int', 'object': 'str', 'float64': 'float', 'bool': 'bool',
+                      'datetime64': 'datetime', 'timedelta': 'timedelta', 'category': 'categorical'}
+
+    def get_type(col):
+        if len(col) < 7 and DATA_TYPES_MAP[str(df[col].dtypes)] == 'int':
+            return 'categorical'
+        return DATA_TYPES_MAP[str(df[col].dtypes)]
+
+    type_map = dict()
+
+    for i in cols:
+        type_map[i] = get_type(i)
+        print(f'{i} has {df[i].dtypes} dtype which can be categorised as {type_map[i]}')
+
+        if type_map[i] == 'categorical':
+            print(df[i].value_counts())
+            print((df[i].value_counts(normalize=True)))
+
+        if type_map[i] == 'int' or type_map[i] == 'float':
+            print(f'Minimal is {df[i].min()}')
+            print(f'Maximum is {df[i].max()}')
+            mean = df[i].mean()
+            print(f'Mean is {mean}')
+            q1 = df[i].quantile(0.25)
+            print(f'q0.25 equals {q1}')
+            print(f'Median is {df[i].median()}')
+            q3 = df[i].quantile(0.75)
+            print(f'q0.75 equals {q3}')
+            iqr = q3 - q1
+            print(f'There are {len(df[i][df[i] < mean - iqr * 1.5]) + len(df[i][df[i] > mean + iqr * 1.5])} outliers')
+        print('-------------')
+
+    print(
+        f'There are {sum(df.isna().sum())} of NaN values in {df.shape[0] - df.dropna().shape[0]} rows in columns {df.columns[df.isna().any()].tolist()} in dataframe ')
+
+    print(f'There are duplicates in {len(df.groupby(df.columns.tolist(), as_index=False).size())} rows')
+
+    # Plotting the percentage of missing values in each column
+    missing_values = df.isnull().mean() * 100
+    plt.figure(figsize=(10, 5))
+    sns.barplot(x=missing_values.index, y=missing_values.values)
+    plt.title('Percentage of missing values in each column')
+    plt.ylabel('Missing values (%)')
+    plt.xticks(rotation=90)
+    plt.show()
+
+    # Plotting the correlation heatmap for all variables
+    plt.figure(figsize=(10, 10))
+    sns.heatmap(df.corr(), annot=True, square=True, cmap='coolwarm')
+    plt.title('Correlation heatmap')
+    plt.show()
+
+    # Plotting the histogram and boxplot for each numeric variable
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+
+    for col in numeric_cols:
+        plt.figure(figsize=(12, 6))
+        plt.subplot(211)
+        sns.histplot(df[col], kde=False)
+        plt.title(f'Distribution of {col}')
+        plt.subplot(212)
+        sns.boxplot(df[col])
+        plt.title(f'Boxplot of {col}')
+        plt.tight_layout()
+        plt.show()
diff --git a/lecture_3_unsupervised_learning/lecture/Unsupervised learning.pdf b/lecture_3_unsupervised_learning/lecture/Unsupervised learning.pdf
diff --git a/lecture_4_dt/README.md b/lecture_4_dt/README.md
@@ -0,0 +1 @@
+Decision Tree.
diff --git a/lecture_4_dt/homework/dt_lecture_practice.ipynb b/lecture_4_dt/homework/dt_lecture_practice.ipynb
@@ -0,0 +1,135 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b084d51a-694e-4cc8-99cb-f2e8cc62269d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab8e95f5-5349-423d-8c28-e80f50a61ce2",
+   "metadata": {},
+   "source": [
+    "Будем учиться классифицировать жанры музыки, пользуясь не аудиодорожками, а уже осмысленными признаками (правда, не все из них имеют понятные названия). Сами данные можно найти [здесь](https://www.kaggle.com/purumalgi/music-genre-classification). Будем использовать файл `train.csv`. Нашей задачей будет предсказание переменной `Class` (жанр песни) при помощи деревьев решений. Можно попробовать также другие методы классификации, которые мы прошли, и сравнить результаты. При обучении всех моделей не забывайте про разделение на тренировку и тест (или кросс-валидацию [ссылка](https://www.youtube.com/watch?v=fSytzGwwBVw))."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04e0c9f5-2043-4ed7-92e4-e53677b60df8",
+   "metadata": {},
+   "source": [
+    "### Задание 1. EDA (10 баллов)\n",
+    "\n",
+    "Как обычно, начнем с того, что посмотрим на наши данные: типы переменных, пропущенные значения, баланс классов и все такое. Ваш ход:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ab19c5c-4a2b-4388-8063-ec56e415408a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = pd.read_csv(\"train.csv\")\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d19b38c0-2bbd-476d-8ec8-c244fb8b2a78",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7dfab66-247d-423e-a4f9-5891ce11627b",
+   "metadata": {},
+   "source": [
+    "### Задание 2. Обучение дерева решений (10 баллов)\n",
+    "\n",
+    "Предобработайте признаки так, чтобы их можно было подать в модель дерева решений, и обучите ее, постарайтесь подобрать оптимальные гиперпараметры, которые вам известны. Постарайтесь также обосновать выбор метрики в данном случае. При подборе гиперпараметров может помочь `GridSearchCV` [ссылка](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f69d6a02-fac9-48ec-9ae8-1613f27283d5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11c54184-da20-4f53-9d5f-575e2cad3a4d",
+   "metadata": {},
+   "source": [
+    "### Задание 3. Другие модели (без баллов, просто если есть желание)\n",
+    "\n",
+    "Проделайте операции из **Задания 2** с другими известными вам классификаторами. Стало ли лучше?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eb381e31-ecb5-47aa-8eb0-1fed6e248c61",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a4d857d4-15fc-456d-adb3-d0ebd1690cfa",
+   "metadata": {},
+   "source": [
+    "### Задание 4. (0.5 доп. балла)\n",
+    "\n",
+    "Расскажите про свои любимые фильмы, мультики и т. д.\n",
+    "\n",
+    "1.\n",
+    "\n",
+    "2.\n",
+    "\n",
+    "3."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ab7a7eb-eae9-4b07-bcc1-d5deca2aba00",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "configs_task",
+   "language": "python",
+   "name": "configs_task"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}