Reading in CSV

# Read the CSV into a data frame named app_df.
# Make sure you have correctly set your working directory. The file name
# must be in quotes.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads text
# columns as character by default, while this lecture treats the text
# columns (SEX, CITZ.CODE, DECISION, ...) as factors.
app_df <- read.csv("Applicant_Data.csv", stringsAsFactors = TRUE)

# head(df, i) prints the first i rows of df.
# Use this to check that everything was read in correctly.
head(app_df[, 1:8], 8)

  X  ID ROUND AGE SEX CITZ.CODE          X1ST.CONTACT JOB.MONTHS
1 1   7     1  29   M         U            Phone call         60
2 2  24     1  28   M         U             Home Page         60
3 3  46     1  27   M         U            Phone call         45
4 4  91     1  26   F         U                Letter         36
5 5 115     1  28   F         U            Phone call         56
6 6 118     1  25   M         U            Phone call         48
7 7 130     1  30   F         N            Phone call        100
8 8 184     1  27   M         U Test Score Tape (DNU)         48

# Inspect the type R assigned to every column. Anything seem off here?
str(object = app_df)

'data.frame':   1999 obs. of  15 variables:
 $ X             : int  1 2 3 4 5 6 7 8 9 10 ...
 $ ID            : int  7 24 46 91 115 118 130 184 249 338 ...
 $ ROUND         : int  1 1 1 1 1 1 1 1 1 1 ...
 $ AGE           : num  29 28 27 26 28 25 30 27 27 32 ...
 $ SEX           : Factor w/ 2 levels "F","M": 2 2 2 1 1 2 1 2 2 2 ...
 $ CITZ.CODE     : Factor w/ 2 levels "N","U": 2 2 2 2 2 2 1 2 1 1 ...
 $ X1ST.CONTACT  : Factor w/ 16 levels "Email","Fax",..: 9 4 9 6 9 9 9 14 4 1 ...
 $ JOB.MONTHS    : num  60 60 45 36 56 48 100 48 56 102 ...
 $ INDUSTRY      : int  10 10 10 10 10 10 10 10 10 10 ...
 $ INDUSTRY.DESC.: Factor w/ 29 levels "Accounting","Advertising/Marketing Services",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DECISION      : Factor w/ 9 levels "Cancel Enrollment",..: 4 4 4 4 9 4 4 4 8 4 ...
 $ GMAT          : num  652 650 710 640 710 ...
 $ PCT           : num  87 90 97 88 98 ...
 $ DEGREE        : Factor w/ 26 levels "AB","BA","BBA",..: 7 7 7 2 7 7 25 7 24 2 ...
 $ MJR           : Factor w/ 44 levels "BUSI","ENGG",..: 6 3 3 6 3 6 6 6 8 6 ...

# ROUND is a category rather than a quantity, so store it as a factor
app_df[["ROUND"]] <- as.factor(app_df[["ROUND"]])

Plotting with ggplot

-Make sure you have loaded the ggplot2 library by running library(ggplot2)

-Basic syntax for plotting: ggplot(df, aes(x,y)) + geom_type_of_plot

df is where you put the name of the data frame
aes stands for aesthetic. Put x,y column names here.
geom_point for scatter plots, dot plots, etc…
geom_line for time series, trend lines, etc…
geom_boxplot for boxplots
geom_bar for bar plots
geom_histogram for histogram

Basic Scatterplot

# Look at the relationship between AGE and GMAT score
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point()

Scatterplot - Changing Color

# A color argument given directly to geom_point() (outside aes())
# changes the color of every point.
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point(color = "blue")

Scatterplot - Changing Titles, Annotating and Changing Range

# The nice thing about ggplot is that we can use the + sign to add
# features to our plots. Here is how we add a title and axis labels.
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point() +
  labs(
    title = "Are Older People Smarter?",
    x = "Ages (years)",
    y = "GMAT Score"
  )

# There are clearly a few outliers that we should get rid of. We can use
# annotate() to add text to our plots.
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point() +
  labs(
    title = "Are Older People Smarter?",
    x = "Ages (years)",
    y = "GMAT Score"
  ) +
  annotate(geom = "text", x = 8, y = 570, size = 5, label = "OUTLIERS!")

# annotate() can also draw shapes, such as a shaded rectangle
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point() +
  labs(
    title = "Are Older People Smarter?",
    x = "Ages (years)",
    y = "GMAT Score"
  ) +
  annotate(geom = "text", x = 8, y = 570, size = 5, label = "OUTLIERS!") +
  annotate(
    geom = "rect",
    xmin = 0, xmax = 2,
    ymin = 500, ymax = 650,
    alpha = 0.2
  )

# Here is one way to remove the outliers and control the range over which
# we plot
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT)) +
  geom_point() +
  xlim(c(20, 60)) +
  ylim(c(0, 800)) +
  labs(
    title = "Are Older People Smarter?",
    x = "Ages (years)",
    y = "GMAT Score"
  )

Scatterplot - Changing Color by Group

# Mapping a column to color inside the aes() of ggplot() colors the points
# by that column's categories
ggplot(data = app_df, mapping = aes(x = AGE, y = GMAT, color = SEX)) +
  geom_point() +
  xlim(c(20, 60)) +
  ylim(c(0, 800)) +
  labs(
    title = "Are Older People Smarter?",
    x = "Ages (years)",
    y = "GMAT Score"
  )

Scatterplot - Facet Wrap/Grid

# Let's see whether age is correlated with months on the job
ggplot(data = app_df, mapping = aes(x = AGE, y = JOB.MONTHS, color = SEX)) +
  geom_point()

# facet_wrap() lets us split the plot into one panel per category
ggplot(data = app_df, mapping = aes(x = AGE, y = JOB.MONTHS)) +
  geom_point() +
  facet_wrap(~ SEX)

# We can color the points and use facet_wrap() at the same time
ggplot(data = app_df, mapping = aes(x = AGE, y = JOB.MONTHS, color = SEX)) +
  geom_point() +
  facet_wrap(~ ROUND)

# We can facet wrap on more than one feature
ggplot(data = app_df, mapping = aes(x = AGE, y = JOB.MONTHS, color = SEX)) +
  geom_point() +
  facet_wrap(ROUND ~ CITZ.CODE)

# facet_grid() looks a little better when you are considering two features
ggplot(data = app_df, mapping = aes(x = AGE, y = JOB.MONTHS, color = SEX)) +
  geom_point() +
  facet_grid(ROUND ~ CITZ.CODE)

Creating a Barplot

# Basic bar plot where the y-axis is the count of applicants in each round
ggplot(data = app_df, mapping = aes(x = ROUND)) +
  geom_bar(width = 0.5) +
  labs(title = "Applicants per Round")

# Here we try to differentiate by sex with color, but it doesn't look good
ggplot(data = app_df, mapping = aes(x = ROUND, color = SEX)) +
  geom_bar(width = 0.75) +
  labs(title = "Applicants per Round")

# Use fill instead!
ggplot(data = app_df, mapping = aes(x = ROUND, fill = SEX)) +
  geom_bar(width = 0.75) +
  labs(title = "Applicants per Round")

# We can facet wrap
ggplot(data = app_df, mapping = aes(x = ROUND, fill = SEX)) +
  geom_bar(width = 0.75) +
  labs(title = "Applicants per Round") +
  facet_wrap(~ CITZ.CODE)

# Here is how we make the bars appear side by side
ggplot(data = app_df, mapping = aes(x = ROUND, fill = SEX)) +
  geom_bar(width = 0.75, position = position_dodge()) +
  labs(title = "Applicants per Round")

Creating Histograms

# Here is how we can make a histogram. The y-axis is the count.
# Notice that we can fill by a category and control the bin width.
ggplot(data = app_df, mapping = aes(x = GMAT, fill = SEX)) +
  geom_histogram(binwidth = 25) +
  labs(title = "GMAT By Sex")

Creating Boxplots

# Basic box plot
ggplot(app_df, aes(ROUND, GMAT)) +
  geom_boxplot()

# Look what happens if ROUND is not a factor. Generally, when making a
# boxplot the x input should be a factor.
# ROUND was converted to a factor earlier, and as.integer() on a factor
# returns the internal level codes rather than the original values, so we
# go through as.character() to recover the actual round numbers.
ggplot(app_df, aes(as.integer(as.character(ROUND)), GMAT)) +
  geom_boxplot()

# Flipping the coordinates
ggplot(app_df, aes(ROUND, GMAT)) +
  geom_boxplot() +
  coord_flip()

# Facet wrapping on sex
ggplot(app_df, aes(ROUND, GMAT)) +
  geom_boxplot() +
  facet_wrap(~ SEX)

Practice with mtcars

Let’s do some practice with the built in data set mtcars

# Print the first six rows of the built-in mtcars data set
head(mtcars, n = 6L)

                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

The 0’s and 1’s in the am and vs columns are a bit ambiguous (the vs column refers to the shape of the engine; R’s documentation for mtcars lists 0 as V-shaped and 1 as straight). For the vs column change the 0’s to “vertical” and the 1’s to “straight”. For the am column change the 0’s to “automatic” and the 1’s to “manual”.

# am and vs are categories, so convert both columns to factors
mtcars[["vs"]] <- as.factor(mtcars[["vs"]])
mtcars[["am"]] <- as.factor(mtcars[["am"]])

# The 0/1 indicators are ambiguous, so give the levels readable names
# (the first name replaces level "0", the second replaces level "1")
levels(mtcars[["vs"]]) <- c("Vertical", "Straight")
levels(mtcars[["am"]]) <- c("Automatic", "Manual")

# This is how you set the "order" of the categories
mtcars[["vs"]] <- factor(mtcars[["vs"]], levels = c("Vertical", "Straight"))

head(mtcars, n = 6L)

                   mpg cyl disp  hp drat    wt  qsec       vs        am
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46 Vertical    Manual
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02 Vertical    Manual
Datsun 710        22.8   4  108  93 3.85 2.320 18.61 Straight    Manual
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44 Straight Automatic
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02 Vertical Automatic
Valiant           18.1   6  225 105 2.76 3.460 20.22 Straight Automatic
                  gear carb
Mazda RX4            4    4
Mazda RX4 Wag        4    4
Datsun 710           4    1
Hornet 4 Drive       3    1
Hornet Sportabout    3    2
Valiant              3    1

Find the mean and median mpg of the cars with 4, 6 and 8 cylinders (this means I want three different means and medians). Since we haven’t learned how to group rows yet, you have to accomplish this task using sorting and slicing.

# First, build a copy of mtcars with its rows ordered by number of cylinders
mtcars_sorted_bycyl <- mtcars[order(mtcars[["cyl"]]), ]
head(mtcars_sorted_bycyl, n = 13)

                mpg cyl  disp  hp drat    wt  qsec       vs        am gear
Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61 Straight    Manual    4
Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00 Straight Automatic    4
Merc 230       22.8   4 140.8  95 3.92 3.150 22.90 Straight Automatic    4
Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47 Straight    Manual    4
Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52 Straight    Manual    4
Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90 Straight    Manual    4
Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01 Straight Automatic    3
Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90 Straight    Manual    4
Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70 Vertical    Manual    5
Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90 Straight    Manual    5
Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60 Straight    Manual    4
Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46 Vertical    Manual    4
Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02 Vertical    Manual    4
               carb
Datsun 710        1
Merc 240D         2
Merc 230          2
Fiat 128          1
Honda Civic       2
Toyota Corolla    1
Toyota Corona     1
Fiat X1-9         1
Porsche 914-2     2
Lotus Europa      2
Volvo 142E        2
Mazda RX4         4
Mazda RX4 Wag     4

# Now create three separate data frames, one per cylinder count. Rows are
# selected by the value of cyl rather than by typed-in row ranges read off
# the sorted output by eye, so the slices stay correct even if the rows of
# the data change.
mtcars_4cyl <- mtcars_sorted_bycyl[mtcars_sorted_bycyl$cyl == 4, ]
mtcars_6cyl <- mtcars_sorted_bycyl[mtcars_sorted_bycyl$cyl == 6, ]
mtcars_8cyl <- mtcars_sorted_bycyl[mtcars_sorted_bycyl$cyl == 8, ]

# We could compute the mean and median of mpg separately for each df, or
# we can use the summary command, which gives us both statistics at once

# 4 cylinders
summary(mtcars_4cyl$mpg)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  21.40   22.80   26.00   26.66   30.40   33.90

# mpg summary for the 6-cylinder cars
summary(mtcars_6cyl[["mpg"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  17.80   18.65   19.70   19.74   21.00   21.40

# mpg summary for the 8-cylinder cars
summary(mtcars_8cyl[["mpg"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10.40   14.40   15.20   15.10   16.25   19.20

Create a bar plot that shows the number of cars with 4,6, and 8 cylinders across the four different combinations of the columns vs and am. This should be 4 bar plots.

# We will create a bar plot and use a facet grid: facet_grid(am ~ vs) puts
# the am categories in rows and the vs categories in columns.
# labs() names the axes with x = and y =; xlab/ylab are not label names it
# applies to the axes. The x-axis shows cyl and the y-axis shows the count
# of cars.
plot_cyl_byamvs <- ggplot(mtcars, aes(cyl)) +
  geom_bar() +
  labs(
    title = "Distribution of Number of Cylinders ",
    x = "Number of Cylinders",
    y = "Number of Cars"
  ) +
  facet_grid(am ~ vs)
plot_cyl_byamvs

# cyl is numeric, so the plot looks a little weird.
# Let's convert it to a factor.
plot_cyl_byamvs <- ggplot(mtcars, aes(as.factor(cyl))) +
  geom_bar() +
  labs(
    title = "Distribution of Number of Cylinders ",
    x = "Number of Cylinders",
    y = "Number of Cars"
  ) +
  facet_grid(am ~ vs)
plot_cyl_byamvs

Show the relationship between mpg and hp for 4 cylinder cars and then again for 6 cylinder cars.

# First, sort mtcars by the number of cylinders, then pull out the 4- and
# 6-cylinder cars. Rows are selected by the value of cyl rather than by
# typed-in row ranges, so the slices stay correct even if the rows change.
mtcars_sorted_bycyl <- mtcars[order(mtcars$cyl), ]
mtcars_4cyl <- mtcars_sorted_bycyl[mtcars_sorted_bycyl$cyl == 4, ]
mtcars_6cyl <- mtcars_sorted_bycyl[mtcars_sorted_bycyl$cyl == 6, ]

# This is how we can plot things from two different data frames: start from
# an empty ggplot() and give each layer its own data
plot <- ggplot() +
  geom_point(
    data = mtcars_4cyl,
    mapping = aes(x = hp, y = mpg, color = "4 cylinder")
  ) +
  geom_point(
    data = mtcars_6cyl,
    mapping = aes(x = hp, y = mpg, color = "6 cylinder")
  )
plot

# We can even add labels to each point
plot +
  geom_label(
    data = mtcars_4cyl,
    mapping = aes(x = hp, y = mpg, label = rownames(mtcars_4cyl)),
    vjust = -0.3,
    size = 3
  )

Plotting with ggplot

Topics for this Lecture

Reading in CSV

Plotting with ggplot

Basic Scatterplot

Scatterplot - Changing Color

Scatterplot - Changing Titles, Annotating and Changing Range

Scatterplot - Changing Color by Group

Scatterplot - Facet Wrap/Grid

Creating a Barplot

Creating Histograms

Creating Boxplots

Practice with mtcars

Take Homes

Next Lecture + HW1