# %% [markdown]
# ### CSE4005: DWDM Lab - 3
# #### Name: S Vijay Balaji
# #### Reg. No.: 19BCE7571.
# #### Slot: L29 + L30

# %%
# 1) Import pandas under the alias pd
#
# Every import of the notebook lives in this first code cell, so a fresh kernel
# only has to run it once and the dependencies are visible at a glance.

import numpy as np
import pandas as pd

# %%
# 2) Print the version of pandas that has been imported.

print(pd.__version__)

# %%
# 3) Print out all the version information of the libraries that are required by the pandas library.

pd.show_versions()

# %%
# 4) Create a DataFrame df from this dictionary data which has the index labels.

# Column-oriented records; the two np.nan entries in 'age' are deliberate missing
# values that later exercises select on.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# One row label per record, in the same order as the lists above.
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

df = pd.DataFrame(data, index=labels)
df
# %%
# 5) Display a summary of the basic information about this DataFrame and its data.

# DataFrame.info() writes the summary itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line to the output.
df.info()

# %%
# 6) Return the first 3 rows of the DataFrame df.

df.head(3)
# %%
# 7) Select just the 'animal' and 'age' columns from the DataFrame df.

# A list of labels inside [] returns a DataFrame with exactly those columns.
df[['animal', 'age']]
# %%
# 8) Select the data in rows [3, 4, 8] and in columns ['animal', 'age'].

# Narrow to the two columns by label first, then pick the rows by integer position.
df[['animal', 'age']].iloc[[3, 4, 8]]
# %%
# 9) Select only the rows where the number of visits is greater than 3.

# Boolean mask built with the method form of ">"; no row in this data set has
# more than 3 visits, so the result is an empty frame with the same columns.
more_than_three_visits = df['visits'].gt(3)
df.loc[more_than_three_visits]
# %%
# 10) Select the rows where the age is missing, i.e. it is NaN.

# isna() is the same check as isnull(); the mask is True where 'age' is NaN.
age_is_missing = df['age'].isna()
df.loc[age_is_missing]
# %%
# 11) Select the rows where the animal is a cat and the age is less than 3.

# Two element-wise conditions combined with &; a NaN age compares as False and
# is therefore excluded.
is_cat = df['animal'].eq('cat')
is_under_three = df['age'].lt(3)
df.loc[is_cat & is_under_three]
# %%
# 12) Select the rows where the age is between 2 and 4 (inclusive).

# Series.between() includes both end points by default; NaN ages are not selected.
df[df['age'].between(2, 4)]

# %%
# 13) Change the age in row 'f' to 1.5.

# Label-based scalar assignment; this modifies df in place, as the exercise asks.
df.loc['f', 'age'] = 1.5
df

# %%
# 14) Calculate the sum of all visits (the total number of visits).

# Vectorised Series.sum() instead of a Python loop. The result gets its own name
# so the built-in sum() is not shadowed for the rest of the notebook.
total_visits = df['visits'].sum()
print(total_visits)

# %%
# 15) Calculate the mean age for each different animal in df.

# mean() skips NaN ages within each group.
df.groupby('animal')['age'].mean()
# %%
# 16) Append a new row 'k' to df with your choice of values for each column. Then delete that row to return the original DataFrame.

# Remember the column dtypes so the round trip really restores the original
# frame. Supplying the numeric fields as strings (e.g. "3") turns 'age' into an
# object column, which then breaks numeric aggregation such as the pivot table
# in exercise 21.
dtypes_before = df.dtypes.to_dict()

df.loc['k'] = ['bunny', 3.0, 2, 'yes']  # numeric values for the numeric columns
print(df.loc['k'])

df = df.drop('k').astype(dtypes_before)
try:
    print(df.loc['k'])
except KeyError:
    # Only the expected "label not found" error is handled; anything else should surface.
    print("\nk is not in the dataframe")

# %%
# 17) Count the number of each type of animal in df.

df['animal'].value_counts()

# %%
# 18) Sort df first by the values in 'age' in descending order, then by the values in the 'visits' column in ascending order.

df.sort_values(by=['age', 'visits'], ascending=[False, True])

# %%
# 19) The 'priority' column contains the values 'yes' and 'no'.
# Replace this column with a column of boolean values: 'yes' should be True and 'no' should be False.

# Guarded so the cell can be re-run: mapping an already-boolean column through
# this dict would turn every value into NaN.
if df['priority'].dtype != bool:
    df['priority'] = df['priority'].map({'yes': True, 'no': False})
df

# %%
# 20) In the 'animal' column, change the 'snake' entries to 'python'.

df['animal'] = df['animal'].replace('snake', 'python')
df['animal']

# %%
# 21) For each animal type and each number of visits, find the mean age.
# In other words, each row is an animal, each column is a number of visits and the values are the mean ages (hint: use a pivot table).

# Requires a numeric 'age' column (see exercise 16). Combinations of animal and
# visit count without any known age appear as NaN.
pd.pivot_table(df, index='animal', columns='visits', values='age', aggfunc='mean')
# %%
# 22) You have a DataFrame df with a column 'A' of integers (example below).
# How do you filter out rows which contain the same integer as the row immediately above?

df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})

# shift() moves every value down one row, so comparing against it asks "does this
# row differ from the one above?". The first row is compared with NaN and is kept.
differs_from_previous = df['A'].ne(df['A'].shift())
df[differs_from_previous]
# %%
# 23) Given a DataFrame of numeric values, say df, how do you subtract the row mean from each element in the row?

# A seeded generator keeps the sample identical across kernel restarts.
df = pd.DataFrame(np.random.RandomState(8765).random_sample(size=(5, 3)))

# mean(axis=1) gives one mean per row; sub(..., axis=0) aligns those means with
# the row index so each row has its own mean subtracted.
row_centered = df.sub(df.mean(axis=1), axis=0)
row_centered

# %%
# 24) Suppose you have DataFrame with 10 columns of real numbers, Which column of numbers has the smallest sum? (Find that column's label.)

df = pd.DataFrame(np.random.RandomState(8765).random_sample(size=(5, 10)), columns=list('abcdefghij'))

# sum() totals each column; idxmin() returns the label of the smallest total.
smallest_sum_col = df.sum().idxmin()
smallest_sum_col

# %%
# 25) How do you count how many unique rows a DataFrame has (i.e. ignore all rows that are duplicates)?

# keep=False discards every row that has a duplicate, leaving only the rows that
# occur exactly once.
n_unique_rows = len(df.drop_duplicates(keep=False))
n_unique_rows

# %%
# 26) You have a DataFrame that consists of 10 columns of floating-point numbers. Suppose that exactly 5 entries in each row are NaN values.
# For each row of the DataFrame, find the column which contains the third NaN value.

nan = np.nan
data = [[0.04, nan, nan, 0.25, nan, 0.43, 0.71, 0.51, nan, nan],
        [nan, nan, nan, 0.04, 0.76, nan, nan, 0.67, 0.76, 0.16],
        [nan, nan, 0.5, nan, 0.31, 0.4, nan, nan, 0.24, 0.01],
        [0.49, nan, nan, 0.62, 0.73, 0.26, 0.85, nan, nan, nan],
        [nan, nan, 0.41, nan, 0.05, nan, 0.61, nan, 0.48, 0.68]]
columns = list('abcdefghij')
df = pd.DataFrame(data, columns=columns)

# The running count of NaNs along each row first equals 3 at the third NaN;
# idxmax() returns the label of the first True in each row.
third_nan_col = (df.isnull().cumsum(axis=1) == 3).idxmax(axis=1)
print(third_nan_col)

# %%
# 27) A DataFrame has a column of groups 'grps' and a column of numbers 'vals'.
# For each group, find the sum of the three greatest values.

df = pd.DataFrame({'grps': list('aaabbcaabcccbbc'),
                   'vals': [12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87]})

# nlargest(3) per group yields a Series indexed by (group, original row). It is
# regrouped on the first index level to add up each group's top three.
# Series.sum(level=0) is not used because that argument was removed in pandas 2.0.
top3_sum_by_group = df.groupby('grps')['vals'].nlargest(3).groupby(level=0).sum()
top3_sum_by_group

# %%
# 28) A DataFrame has two integer columns 'A' and 'B'. The values in 'A' are between 1 and 100 (inclusive).
# For each group of 10 consecutive integers in 'A' (i.e. (0, 10], (10, 20], ...), calculate the sum of the corresponding values in column 'B'.

df = pd.DataFrame(np.random.RandomState(8765).randint(1, 101, size=(100, 2)), columns=["A", "B"])

# pd.cut() assigns each 'A' value to a right-closed bin of width 10.
# observed=False is spelled out so every bin is reported, even an empty one.
a_bins = pd.cut(df['A'], np.arange(0, 101, 10))
b_sum_by_bin = df.groupby(a_bins, observed=False)['B'].sum()
b_sum_by_bin