{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Practice with apply\n",
    "\n",
    "Consider the following dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hr</th>\n",
       "      <th>names</th>\n",
       "      <th>pos</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>25</td>\n",
       "      <td>Chapman, Matt</td>\n",
       "      <td>3B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30</td>\n",
       "      <td>Olson, Matt</td>\n",
       "      <td>1B</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>Davis, Khris</td>\n",
       "      <td>OF</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   hr          names pos\n",
       "0  25  Chapman, Matt  3B\n",
       "1  30    Olson, Matt  1B\n",
       "2  45   Davis, Khris  OF"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Create dataframe\n",
    "names = [\"Chapman, Matt\", \"Olson, Matt\", \"Davis, Khris\"]\n",
    "positions = [\"3B\", \"1B\", \"OF\"]\n",
    "hrs = [25,30,45]\n",
    "df_players = pd.DataFrame({\"names\":names, \"pos\":positions, \"hr\":hrs})\n",
    "df_players"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use apply to add a column for each players last name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hr</th>\n",
       "      <th>names</th>\n",
       "      <th>pos</th>\n",
       "      <th>last_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>25</td>\n",
       "      <td>Chapman, Matt</td>\n",
       "      <td>3B</td>\n",
       "      <td>Matt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30</td>\n",
       "      <td>Olson, Matt</td>\n",
       "      <td>1B</td>\n",
       "      <td>Matt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>Davis, Khris</td>\n",
       "      <td>OF</td>\n",
       "      <td>Khris</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   hr          names pos last_name\n",
       "0  25  Chapman, Matt  3B      Matt\n",
       "1  30    Olson, Matt  1B      Matt\n",
       "2  45   Davis, Khris  OF     Khris"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def Get_Last_Name(name):\n",
    "    \n",
    "    return name.split(\" \")[-1]\n",
    "\n",
    "#Write your code here\n",
    "df_players[\"last_name\"] = df_players.names.apply(Get_Last_Name)\n",
    "df_players"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add a column denoted whether the player is an all-star.  We will say a player is an all-star if they play third base or outfield and have at least threshold_hr = 20 homeruns. If the player is an all-star the row in the new column should read \"all-star\" and otherwise it shoul read \"not selected\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>hr</th>\n",
       "      <th>names</th>\n",
       "      <th>pos</th>\n",
       "      <th>AS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>25</td>\n",
       "      <td>Chapman, Matt</td>\n",
       "      <td>3B</td>\n",
       "      <td>all-star</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>30</td>\n",
       "      <td>Olson, Matt</td>\n",
       "      <td>1B</td>\n",
       "      <td>not selected</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>45</td>\n",
       "      <td>Davis, Khris</td>\n",
       "      <td>OF</td>\n",
       "      <td>all-star</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   hr          names pos            AS\n",
       "0  25  Chapman, Matt  3B      all-star\n",
       "1  30    Olson, Matt  1B  not selected\n",
       "2  45   Davis, Khris  OF      all-star"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Write your code here\n",
    "def Check_All_Star(row, threshold_hr):\n",
    "    \n",
    "    pos = row[\"pos\"]\n",
    "    hr = row[\"hr\"]\n",
    "    if pos in [\"3B\", \"OF\"] and hr>threshold_hr:\n",
    "        return \"all-star\"\n",
    "    else:\n",
    "        return \"not selected\"\n",
    "    \n",
    "df_players[\"AS\"] = df_players.apply(Check_All_Star, threshold_hr = 20, axis = 1)\n",
    "df_players"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movie Review Sentiments Analysis\n",
    "\n",
    "In this practice exercise, we will use logistic regression to classify the sentiment of movie reviews.  The file \"Movie_Reviews.tsv\" contains sentences from movie review whose sentiment has already been classified.  The \"Sentiment\" column has the following meaning:\n",
    "\n",
    "- 0: negative\n",
    "- 1: somewhat negative\n",
    "- 2: neutral\n",
    "- 3: somewhat positive\n",
    "- 4: positive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Place imports here\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "argument of type 'WordListCorpusReader' is not iterable",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-3-f07a495566bf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m\"a\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m: argument of type 'WordListCorpusReader' is not iterable"
     ]
    }
   ],
   "source": [
    "\"a\" in stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>of escapades demonstrating the adage that what...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>of</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>escapades demonstrating the adage that what is...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>escapades</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>demonstrating the adage that what is good for ...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PhraseId  SentenceId                                             Phrase  \\\n",
       "0         1           1  A series of escapades demonstrating the adage ...   \n",
       "1         2           1  A series of escapades demonstrating the adage ...   \n",
       "2         3           1                                           A series   \n",
       "3         4           1                                                  A   \n",
       "4         5           1                                             series   \n",
       "5         6           1  of escapades demonstrating the adage that what...   \n",
       "6         7           1                                                 of   \n",
       "7         8           1  escapades demonstrating the adage that what is...   \n",
       "8         9           1                                          escapades   \n",
       "9        10           1  demonstrating the adage that what is good for ...   \n",
       "\n",
       "   Sentiment  \n",
       "0          1  \n",
       "1          2  \n",
       "2          2  \n",
       "3          2  \n",
       "4          2  \n",
       "5          2  \n",
       "6          2  \n",
       "7          2  \n",
       "8          2  \n",
       "9          2  "
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Write your code here\n",
    "movie_reviews = pd.read_csv(\"Data/Movie_Reviews.tsv\", delimiter=\"\\t\")\n",
    "\n",
    "movie_reviews.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In order to run a logistic regression, we need to create features.  Let's use something similar to the sentiment scores from a HW #3 as features.  In other words, lets use (# pos words/# total words) and (# neg words/# total words).  To do this, lets write functions again that read in the positive and negative words again in lowercase."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "354"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def Read_Words(file_path):\n",
    "    \n",
    "    f= open(file_path)\n",
    "    \n",
    "    return f.read().lower().strip('\\n').split(\" \")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's get rid of all phrases that have fewer than 10 words. Write a function that counts the words and add a column using apply with these counts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Count_Words(phrase):\n",
    "    \n",
    "    words = word_tokenize(phrase)\n",
    "    total_words = [word for word in words if word.isalpha()]\n",
    "\n",
    "    return len(total_words)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>Phrase_Length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PhraseId  SentenceId                                             Phrase  \\\n",
       "0         1           1  A series of escapades demonstrating the adage ...   \n",
       "1         2           1  A series of escapades demonstrating the adage ...   \n",
       "2         3           1                                           A series   \n",
       "3         4           1                                                  A   \n",
       "4         5           1                                             series   \n",
       "\n",
       "   Sentiment  Phrase_Length  \n",
       "0          1             35  \n",
       "1          2             14  \n",
       "2          2              2  \n",
       "3          2              1  \n",
       "4          2              1  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Add phrase_length column\n",
    "movie_reviews[\"Phrase_Length\"] = movie_reviews.Phrase.apply(Count_Words)\n",
    "\n",
    "movie_reviews.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>Phrase_Length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>of escapades demonstrating the adage that what...</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>escapades demonstrating the adage that what is...</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>demonstrating the adage that what is good for ...</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PhraseId  SentenceId                                             Phrase  \\\n",
       "0         1           1  A series of escapades demonstrating the adage ...   \n",
       "1         2           1  A series of escapades demonstrating the adage ...   \n",
       "5         6           1  of escapades demonstrating the adage that what...   \n",
       "7         8           1  escapades demonstrating the adage that what is...   \n",
       "9        10           1  demonstrating the adage that what is good for ...   \n",
       "\n",
       "   Sentiment  Phrase_Length  \n",
       "0          1             35  \n",
       "1          2             14  \n",
       "5          2             12  \n",
       "7          2             11  \n",
       "9          2             10  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Get ride of phrases that have fewer than 10 words\n",
    "movie_reviews = movie_reviews.loc[movie_reviews.Phrase_Length>=10,:]\n",
    "\n",
    "movie_reviews.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we need to write a function that counts the fraction of each type of word, which we can then pass to the apply function. This function will first have to parse the phrase and get counts of the total number of words (ignoring stop words) and the number of positive or negative words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def Get_Fraction(phrase,stop_words, all_sent_words):\n",
    "    \n",
    "    words = [word.lower() for word in word_tokenize(phrase)]\n",
    "    total_words = [word for word in words if word not in stop_words and word.isalpha()]\n",
    "    sent_words = [word for word in words if word in all_sent_words and word.isalpha()]\n",
    "    \n",
    "    return len(sent_words)/len(total_words)\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Use the apply() method to add columns to the data frames with these two proportions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>Phrase_Length</th>\n",
       "      <th>Pos_Score</th>\n",
       "      <th>Neg_Score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>35</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>of escapades demonstrating the adage that what...</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>escapades demonstrating the adage that what is...</td>\n",
       "      <td>2</td>\n",
       "      <td>11</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>demonstrating the adage that what is good for ...</td>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PhraseId  SentenceId                                             Phrase  \\\n",
       "0         1           1  A series of escapades demonstrating the adage ...   \n",
       "1         2           1  A series of escapades demonstrating the adage ...   \n",
       "5         6           1  of escapades demonstrating the adage that what...   \n",
       "7         8           1  escapades demonstrating the adage that what is...   \n",
       "9        10           1  demonstrating the adage that what is good for ...   \n",
       "\n",
       "   Sentiment  Phrase_Length  Pos_Score  Neg_Score  \n",
       "0          1             35   0.133333        0.0  \n",
       "1          2             14   0.166667        0.0  \n",
       "5          2             12   0.200000        0.0  \n",
       "7          2             11   0.200000        0.0  \n",
       "9          2             10   0.250000        0.0  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Get positive, negative and stop words\n",
    "pos_words = Read_Words(\"Data/pos_words.txt\")\n",
    "neg_words = Read_Words(\"Data/neg_words.txt\")\n",
    "stop_words = stopwords.words('english')\n",
    "\n",
    "#Add the two new columns\n",
    "movie_reviews[\"Pos_Score\"] = movie_reviews.Phrase.apply(Get_Fraction, stop_words = stop_words, all_sent_words = pos_words)\n",
    "movie_reviews[\"Neg_Score\"] = movie_reviews.Phrase.apply(Get_Fraction, stop_words = stop_words, all_sent_words = neg_words)\n",
    "movie_reviews.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we will use sklearn's logistic regression package."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "feature_list = [\"Pos_Score\", \"Neg_Score\"]\n",
    "X = np.array(movie_reviews.loc[:, feature_list])\n",
    "y = np.array(movie_reviews.Sentiment)\n",
    "    \n",
    "# instantiate a logistic regression model, and fit with X and y\n",
    "model = LogisticRegression(multi_class = \"multinomial\", solver = 'newton-cg')\n",
    "model = model.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's see how it works by giving it a review and seeing how our model classified this review. Let's see what our model predicts for the following two reviews."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 3])"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "review_1 = \"This movie was the worst movie I've ever seen.  The acting was horrible!\"\n",
    "review_2 = \"Amazing! simply the best movie ever. I would love to see it again.\"\n",
    "pos_score_1 = Get_Fraction(review_1 ,stop_words, pos_words)\n",
    "neg_score_1 = Get_Fraction(review_1 ,stop_words, neg_words)\n",
    "pos_score_2 = Get_Fraction(review_2 ,stop_words, pos_words)\n",
    "neg_score_2 = Get_Fraction(review_2 ,stop_words, neg_words)\n",
    "\n",
    "model.predict(np.array([[pos_score_1, neg_score_1], [pos_score_2, neg_score_2]]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "12px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "threshold": 4,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}