{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filtering 2 Practice - Groupby\n",
    "\n",
    "In this practice exercise, we will analyze a yelp dataset. Import pandas and numpy below and read in the data set. Then get rid of all rows with NaN."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(72742, 40)"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import math\n",
    "\n",
    "df_yelp = pd.read_csv(\"../Data/Yelp_Data.csv\")\n",
    "df_yelp.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(72739, 40)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_yelp.dropna(axis = 0, inplace=True)\n",
    "\n",
    "df_yelp.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Aggregate Practice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 1:** Find the average star rating for each state (You can ignore the number of reviews for each restaurant). Store the result df_1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "state\n",
       "AL     3.500000\n",
       "AR     5.000000\n",
       "AZ     3.709510\n",
       "BW     3.768531\n",
       "CA     3.500000\n",
       "EDH    3.807777\n",
       "ELN    3.791667\n",
       "FIF    3.700000\n",
       "HAM    4.500000\n",
       "IL     3.549728\n",
       "KHL    3.500000\n",
       "MA     5.000000\n",
       "MLN    3.842466\n",
       "MN     5.000000\n",
       "NC     3.587095\n",
       "NM     3.000000\n",
       "NTH    2.000000\n",
       "NV     3.682979\n",
       "NW     4.500000\n",
       "ON     3.577825\n",
       "OR     2.500000\n",
       "PA     3.699276\n",
       "QC     3.579906\n",
       "RP     4.083333\n",
       "SC     3.568266\n",
       "SCB    4.000000\n",
       "TX     4.500000\n",
       "WI     3.659335\n",
       "Name: stars, dtype: float64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_1 = df_yelp.groupby(by = [\"state\"]).stars.mean()\n",
    "df_1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 2:** For each state compute the avg number of stars taking into account the number of reviews they got. First create a column called \"total_stars\", which is stars*review.  Then use groupby to compute the avg of total_stars as well as a count for the number of restaurants rated in each state and store the result in df_2. After this groupby remove all states that have fewer than 100 reviews. Then use this df to create column that is the average star rating."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>total_stars</th>\n",
       "      <th>review_count</th>\n",
       "      <th>final_rating</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>state</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AZ</th>\n",
       "      <td>3415460.5</td>\n",
       "      <td>905891</td>\n",
       "      <td>3.770278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>BW</th>\n",
       "      <td>31424.0</td>\n",
       "      <td>8408</td>\n",
       "      <td>3.737393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CA</th>\n",
       "      <td>427.5</td>\n",
       "      <td>126</td>\n",
       "      <td>3.392857</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>EDH</th>\n",
       "      <td>151256.0</td>\n",
       "      <td>39082</td>\n",
       "      <td>3.870222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IL</th>\n",
       "      <td>63702.5</td>\n",
       "      <td>17273</td>\n",
       "      <td>3.687981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>MLN</th>\n",
       "      <td>5079.5</td>\n",
       "      <td>1283</td>\n",
       "      <td>3.959080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NC</th>\n",
       "      <td>531745.0</td>\n",
       "      <td>144071</td>\n",
       "      <td>3.690854</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NV</th>\n",
       "      <td>3904524.0</td>\n",
       "      <td>1047481</td>\n",
       "      <td>3.727537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ON</th>\n",
       "      <td>18858.0</td>\n",
       "      <td>5191</td>\n",
       "      <td>3.632826</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PA</th>\n",
       "      <td>373508.5</td>\n",
       "      <td>99219</td>\n",
       "      <td>3.764486</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>QC</th>\n",
       "      <td>27409.5</td>\n",
       "      <td>7612</td>\n",
       "      <td>3.600828</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SC</th>\n",
       "      <td>14587.0</td>\n",
       "      <td>3993</td>\n",
       "      <td>3.653143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TX</th>\n",
       "      <td>4020.0</td>\n",
       "      <td>1002</td>\n",
       "      <td>4.011976</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>WI</th>\n",
       "      <td>241349.5</td>\n",
       "      <td>64401</td>\n",
       "      <td>3.747605</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       total_stars  review_count  final_rating\n",
       "state                                         \n",
       "AZ       3415460.5        905891      3.770278\n",
       "BW         31424.0          8408      3.737393\n",
       "CA           427.5           126      3.392857\n",
       "EDH       151256.0         39082      3.870222\n",
       "IL         63702.5         17273      3.687981\n",
       "MLN         5079.5          1283      3.959080\n",
       "NC        531745.0        144071      3.690854\n",
       "NV       3904524.0       1047481      3.727537\n",
       "ON         18858.0          5191      3.632826\n",
       "PA        373508.5         99219      3.764486\n",
       "QC         27409.5          7612      3.600828\n",
       "SC         14587.0          3993      3.653143\n",
       "TX          4020.0          1002      4.011976\n",
       "WI        241349.5         64401      3.747605"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_yelp[\"total_stars\"] = df_yelp[\"stars\"]*df_yelp[\"review_count\"]\n",
    "\n",
    "df_2 = df_yelp.groupby(by = [\"state\"])[\"total_stars\", \"review_count\"]\\\n",
    "                    .agg({\"total_stars\":sum, \"review_count\": sum})\n",
    "    \n",
    "\n",
    "\n",
    "df_2 = df_2.loc[df_2.review_count>=100, :].copy()    \n",
    "    \n",
    "df_2[\"final_rating\"] = df_2[\"total_stars\"]/df_2[\"review_count\"]\n",
    "df_2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 3**: Find the most popular zip code for restaurants rated in AZ and NV.  You will have to write a custom function called Get_Most_Popular_Zip.  Create the function below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Get_Most_Popular_Zip(group):\n",
    "    \n",
    "    D_count_zip = {}\n",
    "    \n",
    "    for index in list(group.index):\n",
    "        address = group[index]\n",
    "        parsed_address = address.strip(\" \").split(\" \")\n",
    "        zip_code = parsed_address[-1]\n",
    "        if zip_code in D_count_zip:\n",
    "            D_count_zip[zip_code]+=1\n",
    "        else:\n",
    "            D_count_zip[zip_code] =1\n",
    "            \n",
    "    max_count = max(D_count_zip.values())\n",
    "    \n",
    "    for zip_code in D_count_zip:\n",
    "        if D_count_zip[zip_code] == max_count:\n",
    "            \n",
    "            return zip_code\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now pick out the restaurants from the states of interest and perform the group by and store the result in df_zip."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>full_address</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>state</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>AZ</th>\n",
       "      <td>85251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NV</th>\n",
       "      <td>89109</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      full_address\n",
       "state             \n",
       "AZ           85251\n",
       "NV           89109"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_yelp_2 = df_yelp.loc[df_yelp[\"state\"].isin([\"AZ\", \"NV\"]),:]\n",
    "\n",
    "df_zip = df_yelp_2.groupby(by = [\"state\"]).agg({\"full_address\": Get_Most_Popular_Zip})\n",
    "\n",
    "df_zip"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tranform Practice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 4:** First select only the restaurants from AZ, IL, NC, and NV. Add a column called \"Fraction_Reviews\", which gives the fraction of reviews from the given state."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>business_id</th>\n",
       "      <th>city</th>\n",
       "      <th>full_address</th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "      <th>name</th>\n",
       "      <th>review_count</th>\n",
       "      <th>stars</th>\n",
       "      <th>state</th>\n",
       "      <th>type</th>\n",
       "      <th>...</th>\n",
       "      <th>Vietnamese</th>\n",
       "      <th>Asian Fusion</th>\n",
       "      <th>Diners</th>\n",
       "      <th>Greek</th>\n",
       "      <th>Vegetarian</th>\n",
       "      <th>Number_of_Checkins</th>\n",
       "      <th>Number_of_Tips</th>\n",
       "      <th>Number_of_Tip_Likes</th>\n",
       "      <th>Total_Reviews_By_State</th>\n",
       "      <th>Fraction_Reviews</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>738</th>\n",
       "      <td>xXxJ7b_WUy0LBLIEM7614w</td>\n",
       "      <td>Belmont</td>\n",
       "      <td>196 YMCA Dr Belmont, NC 28012</td>\n",
       "      <td>35.263370</td>\n",
       "      <td>-81.019970</td>\n",
       "      <td>Stowe Family YMCA</td>\n",
       "      <td>5</td>\n",
       "      <td>4.0</td>\n",
       "      <td>NC</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>55</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>144071</td>\n",
       "      <td>0.000035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>739</th>\n",
       "      <td>fvKQpza6x2daBFXaGSm4Fg</td>\n",
       "      <td>Belmont</td>\n",
       "      <td>7016 Wilkinson Blvd Belmont, NC 28012</td>\n",
       "      <td>35.248484</td>\n",
       "      <td>-81.021369</td>\n",
       "      <td>Carolina Auto Repair</td>\n",
       "      <td>3</td>\n",
       "      <td>3.5</td>\n",
       "      <td>NC</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>144071</td>\n",
       "      <td>0.000021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>740</th>\n",
       "      <td>xkrhaKM1YtiLEcorBI5zcQ</td>\n",
       "      <td>Belmont</td>\n",
       "      <td>6928 W Wilkinson Blvd Belmont, NC 28012</td>\n",
       "      <td>35.248667</td>\n",
       "      <td>-81.021242</td>\n",
       "      <td>Demetrios Restaurant</td>\n",
       "      <td>5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>NC</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>144071</td>\n",
       "      <td>0.000035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>741</th>\n",
       "      <td>qm6yyQUe8Ln29NzDfWqxVA</td>\n",
       "      <td>Belmont</td>\n",
       "      <td>23 N Main St Belmont, NC 28012</td>\n",
       "      <td>35.243132</td>\n",
       "      <td>-81.038059</td>\n",
       "      <td>Cherubs Craft and Coffee Shop</td>\n",
       "      <td>5</td>\n",
       "      <td>5.0</td>\n",
       "      <td>NC</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>144071</td>\n",
       "      <td>0.000035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>742</th>\n",
       "      <td>EZrCQtZxiEo1kkAYt2EQqw</td>\n",
       "      <td>Belmont</td>\n",
       "      <td>660 Park St Belmont, NC 28012</td>\n",
       "      <td>35.253329</td>\n",
       "      <td>-81.027326</td>\n",
       "      <td>Wendy's</td>\n",
       "      <td>7</td>\n",
       "      <td>2.5</td>\n",
       "      <td>NC</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>16</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>144071</td>\n",
       "      <td>0.000049</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 42 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                business_id     city                             full_address  \\\n",
       "738  xXxJ7b_WUy0LBLIEM7614w  Belmont            196 YMCA Dr Belmont, NC 28012   \n",
       "739  fvKQpza6x2daBFXaGSm4Fg  Belmont    7016 Wilkinson Blvd Belmont, NC 28012   \n",
       "740  xkrhaKM1YtiLEcorBI5zcQ  Belmont  6928 W Wilkinson Blvd Belmont, NC 28012   \n",
       "741  qm6yyQUe8Ln29NzDfWqxVA  Belmont           23 N Main St Belmont, NC 28012   \n",
       "742  EZrCQtZxiEo1kkAYt2EQqw  Belmont            660 Park St Belmont, NC 28012   \n",
       "\n",
       "      latitude  longitude                           name  review_count  stars  \\\n",
       "738  35.263370 -81.019970              Stowe Family YMCA             5    4.0   \n",
       "739  35.248484 -81.021369           Carolina Auto Repair             3    3.5   \n",
       "740  35.248667 -81.021242           Demetrios Restaurant             5    3.5   \n",
       "741  35.243132 -81.038059  Cherubs Craft and Coffee Shop             5    5.0   \n",
       "742  35.253329 -81.027326                        Wendy's             7    2.5   \n",
       "\n",
       "    state      type        ...         Vietnamese  Asian Fusion  Diners  \\\n",
       "738    NC  business        ...                  0             0       0   \n",
       "739    NC  business        ...                  0             0       0   \n",
       "740    NC  business        ...                  0             0       0   \n",
       "741    NC  business        ...                  0             0       0   \n",
       "742    NC  business        ...                  0             0       0   \n",
       "\n",
       "     Greek  Vegetarian  Number_of_Checkins  Number_of_Tips  \\\n",
       "738      0           0                  55               0   \n",
       "739      0           0                   0               0   \n",
       "740      0           0                   8               0   \n",
       "741      0           0                   5               0   \n",
       "742      0           0                  16               3   \n",
       "\n",
       "     Number_of_Tip_Likes  Total_Reviews_By_State  Fraction_Reviews  \n",
       "738                    0                  144071          0.000035  \n",
       "739                    0                  144071          0.000021  \n",
       "740                    0                  144071          0.000035  \n",
       "741                    0                  144071          0.000035  \n",
       "742                    0                  144071          0.000049  \n",
       "\n",
       "[5 rows x 42 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Get the states we care about\n",
    "df_yelp_3 = df_yelp.loc[df_yelp[\"state\"].isin([\"AZ\", \"IL\", \"NC\", \"NV\"]),:].copy()\n",
    "\n",
    "#Use a tranform to add a column that gives the total reviews from each state\n",
    "df_yelp_3[\"Total_Reviews_By_State\"] = df_yelp_3.groupby(by = [\"state\"]).review_count.transform(lambda x: x.sum())\n",
    "\n",
    "df_yelp_3[\"Fraction_Reviews\"] = df_yelp_3[\"review_count\"]/df_yelp_3[\"Total_Reviews_By_State\"]\n",
    "\n",
    "df_yelp_3.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filter Practice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 5: **  Create a dataframe (df_well_reviewed) which only keeps states that have at least 100 different restaurants reviews. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>business_id</th>\n",
       "      <th>city</th>\n",
       "      <th>full_address</th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "      <th>name</th>\n",
       "      <th>review_count</th>\n",
       "      <th>stars</th>\n",
       "      <th>state</th>\n",
       "      <th>type</th>\n",
       "      <th>...</th>\n",
       "      <th>Pakistani</th>\n",
       "      <th>Barbeque</th>\n",
       "      <th>Vietnamese</th>\n",
       "      <th>Asian Fusion</th>\n",
       "      <th>Diners</th>\n",
       "      <th>Greek</th>\n",
       "      <th>Vegetarian</th>\n",
       "      <th>Number_of_Checkins</th>\n",
       "      <th>Number_of_Tips</th>\n",
       "      <th>Number_of_Tip_Likes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5UmKMjUEUNdYWqANhGckJw</td>\n",
       "      <td>Dravosburg</td>\n",
       "      <td>4734 Lebanon Church Rd Dravosburg, PA 15034</td>\n",
       "      <td>40.354327</td>\n",
       "      <td>-79.900706</td>\n",
       "      <td>Mr Hoagie</td>\n",
       "      <td>4</td>\n",
       "      <td>4.5</td>\n",
       "      <td>PA</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>UsFtqoBl7naz8AVUBZMjQQ</td>\n",
       "      <td>Dravosburg</td>\n",
       "      <td>202 McClure St Dravosburg, PA 15034</td>\n",
       "      <td>40.350553</td>\n",
       "      <td>-79.886814</td>\n",
       "      <td>Clancy's Pub</td>\n",
       "      <td>4</td>\n",
       "      <td>3.5</td>\n",
       "      <td>PA</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3eu6MEFlq2Dg7bQh8QbdOg</td>\n",
       "      <td>Dravosburg</td>\n",
       "      <td>1 Ravine St Dravosburg, PA 15034</td>\n",
       "      <td>40.350956</td>\n",
       "      <td>-79.889059</td>\n",
       "      <td>Joe Cislo's Auto</td>\n",
       "      <td>3</td>\n",
       "      <td>5.0</td>\n",
       "      <td>PA</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>cE27W9VPgO88Qxe4ol6y_g</td>\n",
       "      <td>Bethel Park</td>\n",
       "      <td>1530 Hamilton Rd Bethel Park, PA 15234</td>\n",
       "      <td>40.354116</td>\n",
       "      <td>-80.014660</td>\n",
       "      <td>Cool Springs Golf Center</td>\n",
       "      <td>5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>PA</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HZdLhv6COCleJMo7nPl-RA</td>\n",
       "      <td>Pittsburgh</td>\n",
       "      <td>301 South Hills Village Pittsburgh, PA 15241</td>\n",
       "      <td>40.357620</td>\n",
       "      <td>-80.059980</td>\n",
       "      <td>Verizon</td>\n",
       "      <td>5</td>\n",
       "      <td>2.5</td>\n",
       "      <td>PA</td>\n",
       "      <td>business</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              business_id         city  \\\n",
       "0  5UmKMjUEUNdYWqANhGckJw   Dravosburg   \n",
       "1  UsFtqoBl7naz8AVUBZMjQQ   Dravosburg   \n",
       "2  3eu6MEFlq2Dg7bQh8QbdOg   Dravosburg   \n",
       "3  cE27W9VPgO88Qxe4ol6y_g  Bethel Park   \n",
       "4  HZdLhv6COCleJMo7nPl-RA   Pittsburgh   \n",
       "\n",
       "                                   full_address   latitude  longitude  \\\n",
       "0   4734 Lebanon Church Rd Dravosburg, PA 15034  40.354327 -79.900706   \n",
       "1           202 McClure St Dravosburg, PA 15034  40.350553 -79.886814   \n",
       "2              1 Ravine St Dravosburg, PA 15034  40.350956 -79.889059   \n",
       "3        1530 Hamilton Rd Bethel Park, PA 15234  40.354116 -80.014660   \n",
       "4  301 South Hills Village Pittsburgh, PA 15241  40.357620 -80.059980   \n",
       "\n",
       "                       name  review_count  stars state      type  \\\n",
       "0                 Mr Hoagie             4    4.5    PA  business   \n",
       "1              Clancy's Pub             4    3.5    PA  business   \n",
       "2          Joe Cislo's Auto             3    5.0    PA  business   \n",
       "3  Cool Springs Golf Center             5    2.5    PA  business   \n",
       "4                   Verizon             5    2.5    PA  business   \n",
       "\n",
       "          ...           Pakistani  Barbeque  Vietnamese  Asian Fusion  Diners  \\\n",
       "0         ...                   0         0           0             0       0   \n",
       "1         ...                   0         0           0             0       0   \n",
       "2         ...                   0         0           0             0       0   \n",
       "3         ...                   0         0           0             0       0   \n",
       "4         ...                   0         0           0             0       0   \n",
       "\n",
       "   Greek  Vegetarian  Number_of_Checkins  Number_of_Tips  Number_of_Tip_Likes  \n",
       "0      0           0                   0               0                    0  \n",
       "1      0           0                   0               0                    0  \n",
       "2      0           0                   0               0                    0  \n",
       "3      0           0                   9               1                    0  \n",
       "4      0           0                   0               0                    0  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_well_reviewed = df_yelp.groupby(\"state\").filter(lambda x: len(x)>=100)\n",
    "\n",
    "df_well_reviewed.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "**Question 6:** For states that have at least 1000 reviews over all restaurants, compute the total number of Diners. Store the result in a dataframe called df_diners."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "state\n",
       "AZ     95\n",
       "BW      1\n",
       "EDH     8\n",
       "IL      8\n",
       "MLN     2\n",
       "NC     31\n",
       "NV     48\n",
       "ON      8\n",
       "PA     51\n",
       "QC      4\n",
       "SC      2\n",
       "TX      0\n",
       "WI     16\n",
       "Name: Diners, dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_diners  = df_yelp.groupby(\"state\").filter(lambda x: x.review_count.sum() >=1000).groupby(\"state\").Diners.sum()\n",
    "\n",
    "df_diners"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Question 7:** For each state that has at least two cities whose name starts with a P, compute the number of tips. For this you will need to write a custom function to compute the number of cities that start with a P."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Count_P(df):\n",
    "    \n",
    "    count_p=0\n",
    "    list_cities = df.city.unique()\n",
    "    for city in list_cities:\n",
    "        first_letter = city[0].lower()\n",
    "        if first_letter == \"p\":\n",
    "            count_p+=1\n",
    "            \n",
    "    if count_p>=2:\n",
    "        return True\n",
    "    else:\n",
    "        return False\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "state\n",
       "AZ     223405\n",
       "BW        129\n",
       "EDH      4395\n",
       "NC      31845\n",
       "NV     283645\n",
       "PA      16602\n",
       "QC       1240\n",
       "Name: Number_of_Tips, dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_num_tips = df_yelp.groupby(\"state\").filter(Count_P).groupby(\"state\").Number_of_Tips.sum()\n",
    "df_num_tips"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  },
  "toc": {
   "nav_menu": {
    "height": "84px",
    "width": "252px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}