High Level

#Creating a vector
v=c("a","b","c", "f", "z")
v

[1] "a" "b" "c" "f" "z"

-How do I access/change/add a column to a data frame?

-How do I access/change the individual elements? How do I access/change chunks of the vector?

-How do I find how big the vector is? How do I apply built in functions to the vector?

Accessing Elements of Vector

#Creating a vector
v=c("a","b","c", "f", "z")
#Picking out single element. The first element has index 1.  This command picks out the second element. 
v[2]

[1] "b"

#Picking out contiguous chunks (The 1st-4th element inclusive)
v[1:4]

[1] "a" "b" "c" "f"

#Picking out non-contiguous chunk (elements in positions 1,3,5)
v[c(1,3,5)]

[1] "a" "c" "z"

Changing Elements of a Vector

v=c("a","b","c", "f", "z")
#Changing the 2nd element
v[2] = 1
v

[1] "a" "1" "c" "f" "z"

#Change elements 1-4
v[1:4]= c(1,2,3,4)
v

[1] "1" "2" "3" "4" "z"

#Change elements 1,3,5
v[c(1,3,5)] = c("Joe")
v

[1] "Joe" "2"   "Joe" "4"   "Joe"

Appending and Finding Length

v=c("a","b","c", "f", "z")
#Finding length of vector
length(v)

[1] 5

#Appending single element
newV = c(v,"q")
newV

[1] "a" "b" "c" "f" "z" "q"

#Appending vector
newerV = c(v, c(1,2,3,4))
newerV

[1] "a" "b" "c" "f" "z" "1" "2" "3" "4"

Creating a Data Frame

-Data frame has two dimensions and it can hold multiple data types

#Build columns
names= c("Micheal", "Deandre", "Christian") 
age=c(28,22,17)
club = c("Toronto FC", "Newcastle", "Borussia")
#Make df.  This is more readable and less error prone
df_Soccer = data.frame(names, age, club)

#View data frame
df_Soccer

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

Column Names

df_Soccer

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Get column names. This returns a vector
colnames(df_Soccer)

[1] "names" "age"   "club"

#Change a column name. Why would we want to do this?
colnames(df_Soccer)[1] = "Names"
colnames(df_Soccer)

[1] "Names" "age"   "club"

Accessing Elements of DF

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Picking out single element (1st row, 2nd column)
df_Soccer[1,2]

[1] 28

#Picking out contiguous chunk (1st and 2nd rows, all columns)
df_Soccer[1:2,]

    Names age       club
1 Micheal  28 Toronto FC
2 Deandre  22  Newcastle

Accessing Elements of DF Cont’d

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Picking out non-contiguous chunk (columns 2 and 3 in rows 1 and 3)
df_Soccer[c(1,3),c(2,3)]

  age       club
1  28 Toronto FC
3  17   Borussia

Another Way for DFs

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Picking out column (What is the difference?)
df_Soccer["club"]

        club
1 Toronto FC
2  Newcastle
3   Borussia

df_Soccer$club

[1] Toronto FC Newcastle  Borussia  
Levels: Borussia Newcastle Toronto FC

Another Way for DFs Cont’d

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#This returns the column as a data frame so there are two dimensions
class(df_Soccer["club"])

[1] "data.frame"

#This returns a single dimensional vector
class(df_Soccer$club)

[1] "factor"

Another Way for DFs Cont’d

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Picking out the first two rows of column club.  This returns a vector
df_Soccer["club"][1:2,]

[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC

#Returns the same thing
df_Soccer[1:2,"club"]

[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC

#Returns the same thing
df_Soccer$club[1:2]

[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC

Another Way for DFs Cont’d

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Picking out multiple columns. Returns a df.
df_Soccer[,c("Names","club")]

      Names       club
1   Micheal Toronto FC
2   Deandre  Newcastle
3 Christian   Borussia

#Return first two rows of the age and club columns. Notice that this returns a df.
df_Soccer[1:2,c("age" ,"club")]

  age       club
1  28 Toronto FC
2  22  Newcastle

Changing Elements of DF

df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Changing the third entry of the age column
df_Soccer[3,"age"]=18
df_Soccer

      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  18   Borussia

#Changing entire age column
df_Soccer$age=c(30,25,18)
df_Soccer

      Names age       club
1   Micheal  30 Toronto FC
2   Deandre  25  Newcastle
3 Christian  18   Borussia

Adding Columns

df_Soccer

      Names age       club
1   Micheal  30 Toronto FC
2   Deandre  25  Newcastle
3 Christian  18   Borussia

#Adding a single column. Note the new name of this column
df_Soccer$goals=c(4,2,8)
df_Soccer

      Names age       club goals
1   Micheal  30 Toronto FC     4
2   Deandre  25  Newcastle     2
3 Christian  18   Borussia     8

#Add multiple columns
#First we create the columns as a df
#(Note: df_Soccer already has a goals column, so goals appears twice after cbind)
df_Stats = data.frame(goals = c(4,2,8), assists = c(10,1,7))
#Use the cbind (column bind)
df_Soccer_Full = cbind(df_Soccer, df_Stats)
df_Soccer_Full

      Names age       club goals goals assists
1   Micheal  30 Toronto FC     4     4      10
2   Deandre  25  Newcastle     2     2       1
3 Christian  18   Borussia     8     8       7

Adding Multiple Rows

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
df_Soccer

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Create df with the new rows
df_newPlayer = data.frame(names = c("Jake","Marshall"), 
              age = c(28,27), club=rep("CMS",2))

#Add multiple rows with rbind (row bind) function 
df_Soccer_Full = rbind(df_Soccer, df_newPlayer)
df_Soccer_Full

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
4      Jake  28        CMS
5  Marshall  27        CMS

Getting Dimensions of DF

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

#Returns a vector: first entry is the number of rows and second is the number of columns
dim(df_Soccer)

[1] 3 3

#Returns number of columns
ncol(df_Soccer)

[1] 3

#Return number of rows. (Careful: length(df) gives the number of columns, not rows)
nrow(df_Soccer)

[1] 3

Creating New Column as Function of Other Columns

#Add a new column to df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"), 
experienceYears = c(9,5,2) )
df_Soccer

      names age       club experienceYears
1   Micheal  28 Toronto FC               9
2   Deandre  22  Newcastle               5
3 Christian  17   Borussia               2

#Create column that is the age of the player in months
df_Soccer$ageMonths= df_Soccer$age*12
df_Soccer

      names age       club experienceYears ageMonths
1   Micheal  28 Toronto FC               9       336
2   Deandre  22  Newcastle               5       264
3 Christian  17   Borussia               2       204

#Create a column that is the ratio of age to years of experience
df_Soccer$ratio = df_Soccer$age/df_Soccer$experienceYears
df_Soccer

      names age       club experienceYears ageMonths    ratio
1   Micheal  28 Toronto FC               9       336 3.111111
2   Deandre  22  Newcastle               5       264 4.400000
3 Christian  17   Borussia               2       204 8.500000

Applying Built in Functions to Columns

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

df_Soccer

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Using summary (the non-numeric columns do not give meaningful statistics)
summary(df_Soccer)

       names        age                club  
 Christian:1   Min.   :17.00   Borussia  :1  
 Deandre  :1   1st Qu.:19.50   Newcastle :1  
 Micheal  :1   Median :22.00   Toronto FC:1  
               Mean   :22.33                 
               3rd Qu.:25.00                 
               Max.   :28.00

#Using the mean and median function
mean(df_Soccer$age)

[1] 22.33333

median(df_Soccer$age)

[1] 22

Sorting a DF

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

df_Soccer

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Sort by age ascending
df_Soccer[order(df_Soccer$age),]

      names age       club
3 Christian  17   Borussia
2   Deandre  22  Newcastle
1   Micheal  28 Toronto FC

#Sort by age descending
df_Soccer[order(-df_Soccer$age),]

      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

#Add in some new columns
stats=data.frame(goals = c(10,2,10), assists = c(5,2,9))
df_addedStats = cbind(df_Soccer,stats)

df_addedStats

      names age       club goals assists
1   Micheal  28 Toronto FC    10       5
2   Deandre  22  Newcastle     2       2
3 Christian  17   Borussia    10       9

#Sort by goals and then by assists
#(This means we break ties in goals using assists; both are sorted descending)
df_addedStats[order(-df_addedStats$goals, -df_addedStats$assists),]

      names age       club goals assists
3 Christian  17   Borussia    10       9
1   Micheal  28 Toronto FC    10       5
2   Deandre  22  Newcastle     2       2

Converting Column to Factor

#Let's say we have a new column that is 1 if they play in Europe and 0
#otherwise
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"), 
Europe = c(0,1,1))
df_Soccer

      names age       club Europe
1   Micheal  28 Toronto FC      0
2   Deandre  22  Newcastle      1
3 Christian  17   Borussia      1

#Let's see how R stores the Europe column
str(df_Soccer)

'data.frame':   3 obs. of  4 variables:
 $ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
 $ age   : num  28 22 17
 $ club  : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
 $ Europe: num  0 1 1

#Convert Europe to factor using the as.factor command
#(There are also as.numeric and as.character commands)
df_Soccer$Europe = as.factor(df_Soccer$Europe)

str(df_Soccer)

'data.frame':   3 obs. of  4 variables:
 $ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
 $ age   : num  28 22 17
 $ club  : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
 $ Europe: Factor w/ 2 levels "0","1": 1 2 2

#Get meaningful levels
levels(df_Soccer$Europe)

[1] "0" "1"

levels(df_Soccer$Europe) = c("US", "Europe")
levels(df_Soccer$Europe)

[1] "US"     "Europe"

df_Soccer

      names age       club Europe
1   Micheal  28 Toronto FC     US
2   Deandre  22  Newcastle Europe
3 Christian  17   Borussia Europe

Slicing and Indexing

High Level

Accessing Elements of Vector

Changing Elements of a Vector

Appending and Finding Length

Creating a Data Frame

Column Names

Accessing Elements of DF

Accessing Elements of DF Cont’d

Another Way for DFs

Another Way for DFs Cont’d

Another Way for DFs Cont’d

Another Way for DFs Cont’d

Changing Elements of DF

Adding Columns

Adding Multiple Rows

Getting Dimensions of DF

Creating New Column as Function of Other Columns

Applying Built in Functions to Columns

Sorting a DF

Converting Column to Factor

Take Homes