Slicing and Indexing

Jake Feldman

High Level

#Creating a vector
v=c("a","b","c", "f", "z")
v
[1] "a" "b" "c" "f" "z"

-How do I access/change/add a column to a data frame?

-How do I access/change the individual elements? How do I access/change chunks of the vector?

-How do I find how big the vector is? How do I apply built in functions to the vector?

Accessing Elements of Vector

#Creating a vector
v=c("a","b","c", "f", "z")
#Picking out single element. The first element has index 1.  This command picks out the second element. 
v[2]
[1] "b"
#Picking out contiguous chunks (The 1st-4th element inclusive)
v[1:4]
[1] "a" "b" "c" "f"
#Picking out non-contiguous chunk (elements in positions 1,3,5)
v[c(1,3,5)]
[1] "a" "c" "z"

Changing Elements of a Vector

v=c("a","b","c", "f", "z")
#Changing the 2nd element
v[2] = 1
v
[1] "a" "1" "c" "f" "z"
#Change elements 1-4
v[1:4]= c(1,2,3,4)
v
[1] "1" "2" "3" "4" "z"
#Change elements 1,3,5
v[c(1,3,5)] = c("Joe")
v
[1] "Joe" "2"   "Joe" "4"   "Joe"

Appending and Finding Length

v=c("a","b","c", "f", "z")
#Finding length of vector
length(v)
[1] 5
#Appending single element
newV = c(v,"q")
newV
[1] "a" "b" "c" "f" "z" "q"
#Appending vector
newerV = c(v, c(1,2,3,4))
newerV
[1] "a" "b" "c" "f" "z" "1" "2" "3" "4"

Creating a Data Frame

-Data frame has two dimensions and it can hold multiple data types

#Build columns
names= c("Micheal", "Deandre", "Christian") 
age=c(28,22,17)
club = c("Toronto FC", "Newcastle", "Borussia")
#Make df.  This is more readable and less error prone
df_Soccer = data.frame(names, age, club)

#View data frame
df_Soccer
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia

Column Names

df_Soccer
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Get column names. This returns a vector
colnames(df_Soccer)
[1] "names" "age"   "club" 
#Change a column name. Why would we want to do this?
colnames(df_Soccer)[1] = "Names"
colnames(df_Soccer)
[1] "Names" "age"   "club" 

Accessing Elements of DF

df_Soccer
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Picking out single element (1st row, 2nd column)
df_Soccer[1,2]
[1] 28
#Picking out contiguous chunk (1st and 2nd rows, all columns)
df_Soccer[1:2,]
    Names age       club
1 Micheal  28 Toronto FC
2 Deandre  22  Newcastle

Accessing Elements of DF Cont’d

df_Soccer
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Picking out non-contiguous chunk (columns 2 and 3 in rows 1 and 3)
df_Soccer[c(1,3),c(2,3)]
  age       club
1  28 Toronto FC
3  17   Borussia

Another Way for DFs

df_Soccer
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Picking out column (What is the difference?)
df_Soccer["club"]
        club
1 Toronto FC
2  Newcastle
3   Borussia
df_Soccer$club
[1] Toronto FC Newcastle  Borussia  
Levels: Borussia Newcastle Toronto FC

Another Way for DFs Cont’d

df_Soccer 
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#This returns the column as a data frame so there are two dimensions
class(df_Soccer["club"])
[1] "data.frame"
#This returns a one-dimensional vector (here, a factor)
class(df_Soccer$club)
[1] "factor"

Another Way for DFs Cont’d

df_Soccer 
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Picking out the first two rows of column club.  This returns a vector
df_Soccer["club"][1:2,]
[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC
#Return same thing
df_Soccer[1:2,"club"]
[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC
#Return same thing
df_Soccer$club[1:2]
[1] Toronto FC Newcastle 
Levels: Borussia Newcastle Toronto FC

Another Way for DFs Cont’d

df_Soccer
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Picking out multiple columns. Returns a df.
df_Soccer[,c("Names","club")]
      Names       club
1   Micheal Toronto FC
2   Deandre  Newcastle
3 Christian   Borussia
#Return first two rows of the age and club columns. Notice that this returns a df.
df_Soccer[1:2,c("age" ,"club")]
  age       club
1  28 Toronto FC
2  22  Newcastle

Changing Elements of DF

df_Soccer 
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Changing the third entry of the age column
df_Soccer[3,"age"]=18
df_Soccer
      Names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  18   Borussia
#Changing entire age column
df_Soccer$age=c(30,25,18)
df_Soccer
      Names age       club
1   Micheal  30 Toronto FC
2   Deandre  25  Newcastle
3 Christian  18   Borussia

Adding Columns

df_Soccer
      Names age       club
1   Micheal  30 Toronto FC
2   Deandre  25  Newcastle
3 Christian  18   Borussia
#Adding a single column. Note the new name of this column
df_Soccer$goals=c(4,2,8)
df_Soccer
      Names age       club goals
1   Micheal  30 Toronto FC     4
2   Deandre  25  Newcastle     2
3 Christian  18   Borussia     8
#Add multiple columns
#First we create the new columns as a df
df_Stats = data.frame(goals = c(4,2,8), assists = c(10,1,7))
#Use the cbind (column bind) function. Note: df_Soccer already has a goals column, so the result contains two goals columns
df_Soccer_Full = cbind(df_Soccer, df_Stats)
df_Soccer_Full
      Names age       club goals goals assists
1   Micheal  30 Toronto FC     4     4      10
2   Deandre  25  Newcastle     2     2       1
3 Christian  18   Borussia     8     8       7

Adding Multiple Rows

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
df_Soccer 
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Create df with the new rows
df_newPlayer = data.frame(names = c("Jake","Marshall"), 
              age = c(28,27), club=rep("CMS",2))

#Add multiple rows with rbind (row bind) function 
df_Soccer_Full = rbind(df_Soccer, df_newPlayer)
df_Soccer_Full
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
4      Jake  28        CMS
5  Marshall  27        CMS

Getting Dimensions of DF

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

#Returns a vector: first entry is the number of rows and second is the number of columns
dim(df_Soccer)
[1] 3 3
#Returns number of columns
ncol(df_Soccer)
[1] 3
#Return number of rows. (Note: length(df_Soccer) gives the number of columns, not rows)
nrow(df_Soccer)
[1] 3

Creating New Column as Function of Other Columns

#Add a new column to df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"), 
experienceYears = c(9,5,2) )
df_Soccer
      names age       club experienceYears
1   Micheal  28 Toronto FC               9
2   Deandre  22  Newcastle               5
3 Christian  17   Borussia               2
#Create column that is the age of the player in months
df_Soccer$ageMonths= df_Soccer$age*12
df_Soccer
      names age       club experienceYears ageMonths
1   Micheal  28 Toronto FC               9       336
2   Deandre  22  Newcastle               5       264
3 Christian  17   Borussia               2       204
#Create a column that is the ratio of age to years of experience
df_Soccer$ratio = df_Soccer$age/df_Soccer$experienceYears
df_Soccer
      names age       club experienceYears ageMonths    ratio
1   Micheal  28 Toronto FC               9       336 3.111111
2   Deandre  22  Newcastle               5       264 4.400000
3 Christian  17   Borussia               2       204 8.500000

Applying Built in Functions to Columns

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

df_Soccer 
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Using summary (non-numeric columns only get counts per level, not numeric statistics)
summary(df_Soccer)
       names        age                club  
 Christian:1   Min.   :17.00   Borussia  :1  
 Deandre  :1   1st Qu.:19.50   Newcastle :1  
 Micheal  :1   Median :22.00   Toronto FC:1  
               Mean   :22.33                 
               3rd Qu.:25.00                 
               Max.   :28.00                 
#Using the mean and median function
mean(df_Soccer$age)
[1] 22.33333
median(df_Soccer$age)
[1] 22

Sorting a DF

#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))

df_Soccer 
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Sort by age ascending
df_Soccer[order(df_Soccer$age),]
      names age       club
3 Christian  17   Borussia
2   Deandre  22  Newcastle
1   Micheal  28 Toronto FC
#Sort by age descending
df_Soccer[order(-df_Soccer$age),]
      names age       club
1   Micheal  28 Toronto FC
2   Deandre  22  Newcastle
3 Christian  17   Borussia
#Add in some new columns
stats=data.frame(goals = c(10,2,10), assists = c(5,2,9))
df_addedStats = cbind(df_Soccer,stats)

df_addedStats
      names age       club goals assists
1   Micheal  28 Toronto FC    10       5
2   Deandre  22  Newcastle     2       2
3 Christian  17   Borussia    10       9
#Sort by goals and then by assists
#(This means ties in goals are broken using assists)
df_addedStats[order(-df_addedStats$goals, -df_addedStats$assists),]
      names age       club goals assists
3 Christian  17   Borussia    10       9
1   Micheal  28 Toronto FC    10       5
2   Deandre  22  Newcastle     2       2

Converting Column to Factor

#Let's say we have a new column that is 1 if they play in Europe and 0
#otherwise
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"), 
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"), 
Europe = c(0,1,1))
df_Soccer
      names age       club Europe
1   Micheal  28 Toronto FC      0
2   Deandre  22  Newcastle      1
3 Christian  17   Borussia      1
#Let's see how R stores the Europe column
str(df_Soccer)
'data.frame':   3 obs. of  4 variables:
 $ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
 $ age   : num  28 22 17
 $ club  : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
 $ Europe: num  0 1 1
#Convert Europe to factor using the as.factor command
#(There are also as.numeric and as.character commands)
df_Soccer$Europe = as.factor(df_Soccer$Europe)

str(df_Soccer)
'data.frame':   3 obs. of  4 variables:
 $ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
 $ age   : num  28 22 17
 $ club  : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
 $ Europe: Factor w/ 2 levels "0","1": 1 2 2
#Get meaningful levels
levels(df_Soccer$Europe)
[1] "0" "1"
levels(df_Soccer$Europe) = c("US", "Europe")
levels(df_Soccer$Europe)
[1] "US"     "Europe"
df_Soccer
      names age       club Europe
1   Micheal  28 Toronto FC     US
2   Deandre  22  Newcastle Europe
3 Christian  17   Borussia Europe

Take Homes

-Manipulating Data Frames: