{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "# Part 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The file \"Purchase_Data.csv\" contains purchase data at a Chinese supermarket for various products. Read this csv into a dataframe called df_data and use the head() method to check that you have read in the data correctly. Make sure you import pandas." ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itemprice
02.020000e+117/15/15 10:45G81670003002101011.02.0
12.020000e+117/15/15 12:16G8167070884702031.02.0
22.020000e+117/15/15 12:16G8167070884702031.02.0
32.020000e+117/15/15 12:16G8167070891704011.06.0
42.020000e+117/15/15 12:16G8167090363704011.06.0
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id num_item \\\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101 1.0 \n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "2 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "3 2.020000e+11 7/15/15 12:16 G816 7070891 70401 1.0 \n", "4 2.020000e+11 7/15/15 12:16 G816 7090363 70401 1.0 \n", "\n", " price \n", "0 2.0 \n", "1 2.0 \n", "2 2.0 \n", "3 6.0 \n", "4 6.0 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "#Write your code here\n", "df_data = pd.read_csv(\"Pandas_Data_Practice/Test/Purchase_Data.csv\")\n", "df_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use the dtypes attribute to view how each column is stored" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "invoice_number float64\n", "invoice_time object\n", "store_id object\n", "product_id int64\n", "category_id int64\n", "num_item float64\n", "price float64\n", "dtype: object" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slice the first two columns using .loc and store the result in a variable called result_1." ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_time
02.020000e+117/15/15 10:45
12.020000e+117/15/15 12:16
22.020000e+117/15/15 12:16
32.020000e+117/15/15 12:16
42.020000e+117/15/15 12:16
\n", "
" ], "text/plain": [ " invoice_number invoice_time\n", "0 2.020000e+11 7/15/15 10:45\n", "1 2.020000e+11 7/15/15 12:16\n", "2 2.020000e+11 7/15/15 12:16\n", "3 2.020000e+11 7/15/15 12:16\n", "4 2.020000e+11 7/15/15 12:16" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write you code here\n", "result_1 = df_data.loc[:,\"invoice_number\":\\\n", " \"invoice_time\"]\n", "result_1.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slice the first two rows using .loc and store the result in a variable called result_2." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itemprice
02.020000e+117/15/15 10:45G81670003002101011.02.0
12.020000e+117/15/15 12:16G8167070884702031.02.0
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id num_item \\\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101 1.0 \n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "\n", " price \n", "0 2.0 \n", "1 2.0 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write you code here\n", "result_2 = df_data.loc[0:1, :]\n", "result_2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slice the first four rows and the first five columns and store the result in a variable called result_3." ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_id
02.020000e+117/15/15 10:45G8167000300210101
12.020000e+117/15/15 12:16G816707088470203
22.020000e+117/15/15 12:16G816707088470203
32.020000e+117/15/15 12:16G816707089170401
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101\n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203\n", "2 2.020000e+11 7/15/15 12:16 G816 7070884 70203\n", "3 2.020000e+11 7/15/15 12:16 G816 7070891 70401" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "result_3 = df_data.loc[0:3, \"invoice_number\":\\\n", " \"category_id\"]\n", "result_3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Slice rows 0,4,6 and columns invoice time and price and store the result in variable called result_4." ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_timeprice
07/15/15 10:452.0
47/15/15 12:166.0
67/15/15 15:282.0
\n", "
" ], "text/plain": [ " invoice_time price\n", "0 7/15/15 10:45 2.0\n", "4 7/15/15 12:16 6.0\n", "6 7/15/15 15:28 2.0" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "result_4 = df_data.loc[[0,4,6], [\"invoice_time\",\\\n", " \"price\"]]\n", "result_4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Store the number rows in a variable called num_rows." ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "5382" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dim = df_data.shape\n", "num_rows = dim[0]\n", "num_rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Print out the last row of the data to dataframe. **Hint:** use the variable num_rows from the previous exercise." ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2.0" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "last_row = df_data.loc[num_rows-1,:]\n", "last_row[\"price\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Part 2" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itemprice
02.020000e+117/15/15 10:45G81670003002101011.02.0
12.020000e+117/15/15 12:16G8167070884702031.02.0
22.020000e+117/15/15 12:16G8167070884702031.02.0
32.020000e+117/15/15 12:16G8167070891704011.06.0
42.020000e+117/15/15 12:16G8167090363704011.06.0
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id num_item \\\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101 1.0 \n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "2 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "3 2.020000e+11 7/15/15 12:16 G816 7070891 70401 1.0 \n", "4 2.020000e+11 7/15/15 12:16 G816 7090363 70401 1.0 \n", "\n", " price \n", "0 2.0 \n", "1 2.0 \n", "2 2.0 \n", "3 6.0 \n", "4 6.0 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the average and max price charged. Store the results in variables called avg_price and max_price" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(3.1920522110741354, 6.01)" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "avg_price = df_data.price.mean()\n", "max_price = df_data.price.max()\n", "avg_price,max_price" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the average price and average number of items purchased with only a single call to the mean() method. Store these averages in avg_price and avg_num_items." ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(3.1920522110741354, 1.2983091787439613)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write you code here\n", "avgs = df_data[[\"price\", \"num_item\"]].mean()\n", "avg_price = avgs[\"price\"]\n", "avg_num_items = avgs[\"num_item\"]\n", "\n", "avg_price, avg_num_items" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a column called \"total\", which is the product of the price and num_item columns." ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itempricetotal
02.020000e+117/15/15 10:45G81670003002101011.02.02.0
12.020000e+117/15/15 12:16G8167070884702031.02.02.0
22.020000e+117/15/15 12:16G8167070884702031.02.02.0
32.020000e+117/15/15 12:16G8167070891704011.06.06.0
42.020000e+117/15/15 12:16G8167090363704011.06.06.0
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id num_item \\\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101 1.0 \n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "2 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "3 2.020000e+11 7/15/15 12:16 G816 7070891 70401 1.0 \n", "4 2.020000e+11 7/15/15 12:16 G816 7090363 70401 1.0 \n", "\n", " price total \n", "0 2.0 2.0 \n", "1 2.0 2.0 \n", "2 2.0 2.0 \n", "3 6.0 6.0 \n", "4 6.0 6.0 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data[\"total\"] = df_data.price*df_data.num_item\n", "\n", "df_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What is the average price of the three most expensive purchases? " ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "164.0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data_sorted = df_data.sort_values(\\\n", " by = \"total\",\\\n", " inplace = False,\n", " ascending = False)\n", "\n", "top_three = df_data_sorted.loc[\\\n", " [4510, 3784, 4288],\"total\"]\n", "top_three_mean = top_three.mean()\n", "top_three_mean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many different products are there?" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "53" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "list_prods = df_data.product_id.unique()\n", "num_prods = len(list_prods)\n", "num_prods" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find the difference between the mean and median of the column total. Use the describe() method and then index the dataframe it returns." ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9318078781127945" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_summary = df_data.describe()\n", "avg_total = df_summary.loc[\"mean\", \"total\"]\n", "median_total = df_summary.loc[\"50%\", \"total\"]\n", "diff = avg_total - median_total\n", "diff" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use the drop() method to delete the column total from the dataframe df_data." ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itempricetotal
02.020000e+117/15/15 10:45G81670003002101011.02.02.0
12.020000e+117/15/15 12:16G8167070884702031.02.02.0
22.020000e+117/15/15 12:16G8167070884702031.02.02.0
32.020000e+117/15/15 12:16G8167070891704011.06.06.0
42.020000e+117/15/15 12:16G8167090363704011.06.06.0
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id num_item \\\n", "0 2.020000e+11 7/15/15 10:45 G816 7000300 210101 1.0 \n", "1 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "2 2.020000e+11 7/15/15 12:16 G816 7070884 70203 1.0 \n", "3 2.020000e+11 7/15/15 12:16 G816 7070891 70401 1.0 \n", "4 2.020000e+11 7/15/15 12:16 G816 7090363 70401 1.0 \n", "\n", " price total \n", "0 2.0 2.0 \n", "1 2.0 2.0 \n", "2 2.0 2.0 \n", "3 6.0 6.0 \n", "4 6.0 6.0 " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data_no_total = df_data.\\\n", " drop(\"total\", axis = 1,inplace = False)\n", "df_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's dig a little deeper into the data set and answer some basic question." ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "invoice_number float64\n", "invoice_time object\n", "store_id object\n", "product_id int64\n", "category_id int64\n", "num_item float64\n", "price float64\n", "total float64\n", "dtype: object" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_data.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, covert the column invoice time to a datetime column. Print out the new column types to make sure this was done correctly" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "invoice_number float64\n", "invoice_time datetime64[ns]\n", "store_id object\n", "product_id int64\n", "category_id int64\n", "num_item float64\n", "price float64\n", "dtype: object" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data = pd.read_csv(\\\n", " \"Pandas_Data_Practice/Test/Purchase_Data.csv\",\\\n", " parse_dates = [\"invoice_time\"])\n", "df_data.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, let's create columns called \"DOW\" and \"Month\", which store the corresponding weekday and month of the invoice respectively. Print the head of the dataframe to make sure this was done correctly." ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
invoice_numberinvoice_timestore_idproduct_idcategory_idnum_itempricemonthDOW
02.020000e+112015-07-15 10:45:00G81670003002101011.02.07Wednesday
12.020000e+112015-07-15 12:16:00G8167070884702031.02.07Wednesday
22.020000e+112015-07-15 12:16:00G8167070884702031.02.07Wednesday
32.020000e+112015-07-15 12:16:00G8167070891704011.06.07Wednesday
42.020000e+112015-07-15 12:16:00G8167090363704011.06.07Wednesday
\n", "
" ], "text/plain": [ " invoice_number invoice_time store_id product_id category_id \\\n", "0 2.020000e+11 2015-07-15 10:45:00 G816 7000300 210101 \n", "1 2.020000e+11 2015-07-15 12:16:00 G816 7070884 70203 \n", "2 2.020000e+11 2015-07-15 12:16:00 G816 7070884 70203 \n", "3 2.020000e+11 2015-07-15 12:16:00 G816 7070891 70401 \n", "4 2.020000e+11 2015-07-15 12:16:00 G816 7090363 70401 \n", "\n", " num_item price month DOW \n", "0 1.0 2.0 7 Wednesday \n", "1 1.0 2.0 7 Wednesday \n", "2 1.0 2.0 7 Wednesday \n", "3 1.0 6.0 7 Wednesday \n", "4 1.0 6.0 7 Wednesday " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_data[\"month\"] = df_data.invoice_time.dt.month\n", "df_data[\"DOW\"] = df_data.invoice_time.dt.\\\n", " weekday_name\n", "df_data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a variable called num_purchases_wed, which stores the number of purchases made on Wednesday? You should use value_counts and then index the series it return." ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "959" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "counts_dow = df_data.DOW.value_counts()\n", "num_purchases_wed = counts_dow[\"Wednesday\"]\n", "num_purchases_wed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Part 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this set of practice exercises, we will be working with a demographic data regarding the passengers aboard the Titanic. Read in the data frame and use the head() method to check that it was read in correctly." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "#Write your code here\n", "df_titanic = pd.read_csv(\"Pandas_Data_Practice/Titanic.csv\")\n", "df_titanic.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use the rename method to change the column \"Name\" to \"Passenger_Name\" and the column \"Ticket\" to \"Ticket_Num\"." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassPassenger_NameSexAgeSibSpParchTicket_NumFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Passenger_Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket_Num Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_titanic.rename(columns={\"Name\":\"Passenger_Name\",\n", " \"Ticket\":\"Ticket_Num\"},\\\n", " inplace = True)\n", "df_titanic.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Change the index of the df_titanic to the column PasengerID." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassPassenger_NameSexAgeSibSpParchTicket_NumFareCabinEmbarked
PassengerId
8923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
8933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
8942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
8953Wirz, Mr. Albertmale27.0003151548.6625NaNS
8963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " Pclass Passenger_Name Sex \\\n", "PassengerId \n", "892 3 Kelly, Mr. James male \n", "893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "894 2 Myles, Mr. Thomas Francis male \n", "895 3 Wirz, Mr. Albert male \n", "896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket_Num Fare Cabin Embarked \n", "PassengerId \n", "892 34.5 0 0 330911 7.8292 NaN Q \n", "893 47.0 1 0 363272 7.0000 NaN S \n", "894 62.0 0 0 240276 9.6875 NaN Q \n", "895 27.0 0 0 315154 8.6625 NaN S \n", "896 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_titanic.set_index(\"PassengerId\", inplace = True)\n", "df_titanic.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Select the name of passenger 896" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'Hirvonen, Mrs. Alexander (Helga E Lindqvist)'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_titanic.loc[896, \"Passenger_Name\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Change the index back to row numbers but making sure that we don't lose the PassengerId column. " ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassPassenger_NameSexAgeSibSpParchTicket_NumFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Passenger_Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket_Num Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_titanic.reset_index(drop = False, inplace = True)\n", "df_titanic.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How many missing entries are there in the Age column? Use the isnull() method and recall than Booleans are exactly like 1s (True) and 0s (False)." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "86" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write you code here\n", "df_titanic.isnull().sum()[\"Age\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the avg age of passengers ignoring the missing data." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "30.272590361445783" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "mean_age = df_titanic.Age.mean()\n", "mean_age" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using the fillna() method replace the missing values in the Age column with the mean." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassPassenger_NameSexAgeSibSpParchTicket_NumFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Passenger_Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket_Num Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_titanic.fillna({\"Age\":mean_age}, inplace = True)\n", "df_titanic.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What is the average age of the 5 oldest passengers? The reset_index method will be helpful here." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "67.0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Write your code here\n", "df_titanic_sorted = df_titanic.sort_values(by = \"Age\",\\\n", " inplace = False,\\\n", " ascending = False)\n", "\n", "final_df = df_titanic_sorted.reset_index(inplace = False,\\\n", " drop = True)\n", "\n", "final_df.loc[0:4, \"Age\"].mean()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" }, "toc": { "nav_menu": { "height": "48px", "width": "252px" }, "number_sections": true, "sideBar": true, "skip_h1_title": false, "toc_cell": false, "toc_position": {}, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }