Jake Feldman
#Creating a vector
v=c("a","b","c", "f", "z")
v
[1] "a" "b" "c" "f" "z"
-How do I access/change/add a column in a data frame?
-How do I access/change the individual elements? How do I access/change chunks of the vector?
-How do I find how big the vector is? How do I apply built in functions to the vector?
#Creating a vector
v=c("a","b","c", "f", "z")
#Picking out single element. The first element has index 1. This command picks out the second element.
v[2]
[1] "b"
#Picking out contiguous chunks (The 1st-4th element inclusive)
v[1:4]
[1] "a" "b" "c" "f"
#Picking out non-contiguous chunk (elements in positions 1,3,5)
v[c(1,3,5)]
[1] "a" "c" "z"
v=c("a","b","c", "f", "z")
#Changing the 2nd element
v[2] = 1
v
[1] "a" "1" "c" "f" "z"
#Change elements 1-4
v[1:4]= c(1,2,3,4)
v
[1] "1" "2" "3" "4" "z"
#Change elements 1,3,5
v[c(1,3,5)] = c("Joe")
v
[1] "Joe" "2" "Joe" "4" "Joe"
v=c("a","b","c", "f", "z")
#Finding length of vector
length(v)
[1] 5
#Appending single element
newV = c(v,"q")
newV
[1] "a" "b" "c" "f" "z" "q"
#Appending vector
newerV = c(v, c(1,2,3,4))
newerV
[1] "a" "b" "c" "f" "z" "1" "2" "3" "4"
-Data frame has two dimensions and it can hold multiple data types
#Build columns
names= c("Micheal", "Deandre", "Christian")
age=c(28,22,17)
club = c("Toronto FC", "Newcastle", "Borussia")
#Make df. This is more readable and less error prone
df_Soccer = data.frame(names, age, club)
#View data frame
df_Soccer
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
df_Soccer
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Get column names. This returns a vector
colnames(df_Soccer)
[1] "names" "age" "club"
#Change a column name. Why would we want to do this?
colnames(df_Soccer)[1] = "Names"
colnames(df_Soccer)
[1] "Names" "age" "club"
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Picking out single element (1st row, 2nd column)
df_Soccer[1,2]
[1] 28
#Picking out contiguous chunk (1st and 2nd rows, all columns)
df_Soccer[1:2,]
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Picking out non-contiguous chunk (columns 2 and 3 in rows 1 and 3)
df_Soccer[c(1,3),c(2,3)]
age club
1 28 Toronto FC
3 17 Borussia
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Picking out column (What is the difference?)
df_Soccer["club"]
club
1 Toronto FC
2 Newcastle
3 Borussia
df_Soccer$club
[1] Toronto FC Newcastle Borussia
Levels: Borussia Newcastle Toronto FC
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#This returns the column as a data frame so there are two dimensions
class(df_Soccer["club"])
[1] "data.frame"
#This returns a one-dimensional vector (here stored as a factor)
class(df_Soccer$club)
[1] "factor"
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Picking out the first two rows of column club. This returns a vector
df_Soccer["club"][1:2,]
[1] Toronto FC Newcastle
Levels: Borussia Newcastle Toronto FC
#Return same thing
df_Soccer[1:2,"club"]
[1] Toronto FC Newcastle
Levels: Borussia Newcastle Toronto FC
#Return same thing
df_Soccer$club[1:2]
[1] Toronto FC Newcastle
Levels: Borussia Newcastle Toronto FC
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Picking out multiple columns. Returns a df.
df_Soccer[,c("Names","club")]
Names club
1 Micheal Toronto FC
2 Deandre Newcastle
3 Christian Borussia
#Return first two rows of the age and club columns. Notice that this returns a df.
df_Soccer[1:2,c("age" ,"club")]
age club
1 28 Toronto FC
2 22 Newcastle
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Changing the third entry of the age column
df_Soccer[3,"age"]=18
df_Soccer
Names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 18 Borussia
#Changing entire age column
df_Soccer$age=c(30,25,18)
df_Soccer
Names age club
1 Micheal 30 Toronto FC
2 Deandre 25 Newcastle
3 Christian 18 Borussia
df_Soccer
Names age club
1 Micheal 30 Toronto FC
2 Deandre 25 Newcastle
3 Christian 18 Borussia
#Adding a single column. Note the new name of this column
df_Soccer$goals=c(4,2,8)
df_Soccer
Names age club goals
1 Micheal 30 Toronto FC 4
2 Deandre 25 Newcastle 2
3 Christian 18 Borussia 8
#Add multiple columns
#First we create the column as a df
df_Stats = data.frame(goals = c(4,2,8), assists = c(10,1,7))
#Use the cbind (column bind) function. Note: df_Soccer already has a goals
#column, so goals appears twice in the result
df_Soccer_Full = cbind(df_Soccer, df_Stats)
df_Soccer_Full
Names age club goals goals assists
1 Micheal 30 Toronto FC 4 4 10
2 Deandre 25 Newcastle 2 2 1
3 Christian 18 Borussia 8 8 7
#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
df_Soccer
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Create df with the new rows
df_newPlayer = data.frame(names = c("Jake","Marshall"),
age = c(28,27), club=rep("CMS",2))
#Add multiple rows with rbind (row bind) function
df_Soccer_Full = rbind(df_Soccer, df_newPlayer)
df_Soccer_Full
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
4 Jake 28 CMS
5 Marshall 27 CMS
#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
#Returns a vector: first entry is the number of rows and second is the number of columns
dim(df_Soccer)
[1] 3 3
#Returns number of columns
ncol(df_Soccer)
[1] 3
#Returns number of rows. (Careful: length(df) gives the number of columns, not rows)
nrow(df_Soccer)
[1] 3
#Add a new column to df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"),
experienceYears = c(9,5,2) )
df_Soccer
names age club experienceYears
1 Micheal 28 Toronto FC 9
2 Deandre 22 Newcastle 5
3 Christian 17 Borussia 2
#Create column that is the age of the player in months
df_Soccer$ageMonths= df_Soccer$age*12
df_Soccer
names age club experienceYears ageMonths
1 Micheal 28 Toronto FC 9 336
2 Deandre 22 Newcastle 5 264
3 Christian 17 Borussia 2 204
#Create a column that is the ratio of age to years of experience
df_Soccer$ratio = df_Soccer$age/df_Soccer$experienceYears
df_Soccer
names age club experienceYears ageMonths ratio
1 Micheal 28 Toronto FC 9 336 3.111111
2 Deandre 22 Newcastle 5 264 4.400000
3 Christian 17 Borussia 2 204 8.500000
#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
df_Soccer
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Using summary (for the non-numeric columns it only reports a count per level, not numeric statistics)
summary(df_Soccer)
names age club
Christian:1 Min. :17.00 Borussia :1
Deandre :1 1st Qu.:19.50 Newcastle :1
Micheal :1 Median :22.00 Toronto FC:1
Mean :22.33
3rd Qu.:25.00
Max. :28.00
#Using the mean and median function
mean(df_Soccer$age)
[1] 22.33333
median(df_Soccer$age)
[1] 22
#Re-initialize df_Soccer
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"))
df_Soccer
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Sort by age ascending
df_Soccer[order(df_Soccer$age),]
names age club
3 Christian 17 Borussia
2 Deandre 22 Newcastle
1 Micheal 28 Toronto FC
#Sort by age descending
df_Soccer[order(-df_Soccer$age),]
names age club
1 Micheal 28 Toronto FC
2 Deandre 22 Newcastle
3 Christian 17 Borussia
#Add in some new columns
stats=data.frame(goals = c(10,2,10), assists = c(5,2,9))
df_addedStats = cbind(df_Soccer,stats)
df_addedStats
names age club goals assists
1 Micheal 28 Toronto FC 10 5
2 Deandre 22 Newcastle 2 2
3 Christian 17 Borussia 10 9
#Sort by goals and then by assists (both descending)
#(This means ties in goals are broken by assists)
df_addedStats[order(-df_addedStats$goals, -df_addedStats$assists),]
names age club goals assists
3 Christian 17 Borussia 10 9
1 Micheal 28 Toronto FC 10 5
2 Deandre 22 Newcastle 2 2
#Let's say we have a new column that is 1 if they play in Europe and 0
#otherwise
df_Soccer = data.frame(names= c("Micheal", "Deandre", "Christian"),
age=c(28,22,17), club = c("Toronto FC", "Newcastle", "Borussia"),
Europe = c(0,1,1))
df_Soccer
names age club Europe
1 Micheal 28 Toronto FC 0
2 Deandre 22 Newcastle 1
3 Christian 17 Borussia 1
#Let's see how R stores the Europe column
str(df_Soccer)
'data.frame': 3 obs. of 4 variables:
$ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
$ age : num 28 22 17
$ club : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
$ Europe: num 0 1 1
#Convert Europe to factor using the as.factor command
#(There are also as.numeric and as.character commands)
df_Soccer$Europe = as.factor(df_Soccer$Europe)
str(df_Soccer)
'data.frame': 3 obs. of 4 variables:
$ names : Factor w/ 3 levels "Christian","Deandre",..: 3 2 1
$ age : num 28 22 17
$ club : Factor w/ 3 levels "Borussia","Newcastle",..: 3 2 1
$ Europe: Factor w/ 2 levels "0","1": 1 2 2
#Get meaningful levels
levels(df_Soccer$Europe)
[1] "0" "1"
levels(df_Soccer$Europe) = c("US", "Europe")
levels(df_Soccer$Europe)
[1] "US" "Europe"
df_Soccer
names age club Europe
1 Micheal 28 Toronto FC US
2 Deandre 22 Newcastle Europe
3 Christian 17 Borussia Europe
-Manipulating Data Frames:
Accessing and changing parts
Creating new columns and rows
Applying functions to columns
Converting the data types of columns