Jake Feldman
-Consider the built in data set chickwts and answer the following questions:
How many chickens have a weight above 200? Give this Column an appropriate name using Aliasing
What is the average weigth of the chicks given each feed type?
What is the average weight of chicks having a weight between 200 and 300 for each feed type?
df = sqldf("SELECT COUNT(*) AS 'Heavier than 200' FROM chickwts
WHERE weight >=200")
df
Heavier than 200
1 54
df = sqldf("SELECT feed , AVG(Weight) AS 'Avg. Weight' FROM chickwts
GROUP BY feed")
df
feed Avg. Weight
1 casein 323.5833
2 horsebean 160.2000
3 linseed 218.7500
4 meatmeal 276.9091
5 soybean 246.4286
6 sunflower 328.9167
df = sqldf("SELECT feed , AVG(Weight) AS 'Avg. Weight' FROM chickwts
WHERE weight >=200 AND weight<=300 GROUP BY feed")
df
feed Avg. Weight
1 casein 245.2500
2 horsebean 222.0000
3 linseed 239.5714
4 meatmeal 245.2000
5 soybean 251.0000
6 sunflower 272.6667
-Like WHERE clause but used after a GROUP BY
-Filters after groups have been formed
-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition
#Using HAVING
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars GROUP BY cyl
HAVING count>8")
df
cyl count
1 4 11
2 8 14
#HAVING with a WHERE clause
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars WHERE mpg>20
GROUP BY cyl HAVING count>8")
df
cyl count
1 4 11
-ORDER BY determines order in which rows are return: ASC DESC
-Recall that for basic sorting we saw how to do this with out SQL
-Comes after the HAVING
-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition ORDER BY columns ASC/DESC
#Sorted by mpg
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC")
head(df)
mpg cyl
1 33.9 4
2 32.4 4
3 30.4 4
4 30.4 4
5 27.3 4
6 26.0 4
#Order by multiple columns
df = sqldf("SELECT cyl,hp FROM mtcars ORDER BY cyl DESC, hp DESC")
head(df,12)
cyl hp
1 8 335
2 8 264
3 8 245
4 8 245
5 8 230
6 8 215
7 8 205
8 8 180
9 8 180
10 8 180
11 8 175
12 8 175
#Combining everything so far. You can see that the ordering here happens after
#the groups have been formed
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars WHERE mpg>15
GROUP BY cyl HAVING count>3 ORDER BY count DESC")
df
cyl count
1 4 11
2 8 8
3 6 7
-LIMIT controls the number of rows we get
-Good way to find the top 3, for example, of some list
-Can specify starting point
-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition ORDER BY columns ASC/DESC LIMIT number
#Sorted by mpg show only 5 rows
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 5")
df
mpg cyl
1 33.9 4
2 32.4 4
3 30.4 4
4 30.4 4
5 27.3 4
#Sorted by mpg show rows 3-8
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 3,5")
df
mpg cyl
1 30.4 4
2 27.3 4
3 26.0 4
4 24.4 4
5 22.8 4
#BETWEEN (includes boundaries)
df = sqldf("SELECT mpg FROM mtcars WHERE mpg BETWEEN 30.4 AND 32.4 ")
head(df,10)
mpg
1 32.4
2 30.4
3 30.4
# NOT BETWEEN (does not include boundaries)
df = sqldf("SELECT mpg FROM mtcars WHERE mpg NOT BETWEEN 30.4 AND 32.4 ")
head(df,10)
mpg
1 21.0
2 21.0
3 22.8
4 21.4
5 18.7
6 18.1
7 14.3
8 24.4
9 22.8
10 19.2
#IN
df = sqldf("SELECT mpg,cyl FROM mtcars WHERE cyl IN (4,6) ")
head(df,10)
mpg cyl
1 21.0 6
2 21.0 6
3 22.8 4
4 21.4 6
5 18.1 6
6 24.4 4
7 22.8 4
8 19.2 6
9 17.8 6
10 32.4 4
#LIKE
#Create column with car names to filter on
mtcars$Cars = rownames(mtcars)
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Cars
Mazda RX4 Mazda RX4
Mazda RX4 Wag Mazda RX4 Wag
Datsun 710 Datsun 710
Hornet 4 Drive Hornet 4 Drive
Hornet Sportabout Hornet Sportabout
Valiant Valiant
#Get all Mercedes Cars - This gives us 0 rows! Tries to match exactly
df = sqldf("SELECT mpg,cyl FROM mtcars WHERE Cars Like 'Merc' ")
head(df,10)
[1] mpg cyl
<0 rows> (or 0-length row.names)
#Get a specific Mercedes Cars
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars WHERE Cars Like 'Merc 240D' ")
head(df,10)
Cars mpg cyl
1 Merc 240D 24.4 4
#% is a substitute for zero or more characters
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars WHERE Cars Like '%Merc%' ")
df
Cars mpg cyl
1 Merc 240D 24.4 4
2 Merc 230 22.8 4
3 Merc 280 19.2 6
4 Merc 280C 17.8 6
5 Merc 450SE 16.4 8
6 Merc 450SL 17.3 8
7 Merc 450SLC 15.2 8
#_ is a substitute for one character
#(this is how you do two conditions in the like)
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars WHERE Cars Like '_oyo%'
OR Cars Like '%Ford%' ")
df
Cars mpg cyl
1 Toyota Corolla 33.9 4
2 Toyota Corona 21.5 4
3 Ford Pantera L 15.8 8
-Use in SELECT, WHERE, HAVING
-Use Like an IF statement
#In SELECT statement we can use it to create a new column
df= sqldf("SELECT hp, CASE WHEN hp>=150 THEN 'Fast'
WHEN hp >= 100 AND hp <=150 THEN 'Medium'
ELSE 'Slow' END AS Speed FROM mtcars ")
head(df,8)
hp Speed
1 110 Medium
2 110 Medium
3 93 Slow
4 110 Medium
5 175 Fast
6 105 Medium
7 245 Fast
8 62 Slow
#With HAVING or WHERE we use it to filter on different criterion
df= sqldf("SELECT cyl, gear, AVG(mpg) AS avg_mpg FROM mtcars
GROUP BY cyl, gear HAVING CASE WHEN cyl =4 THEN avg_mpg > 20
ELSE avg_mpg>15 END ORDER BY cyl, gear ")
df
cyl gear avg_mpg
1 4 3 21.500
2 4 4 26.925
3 4 5 28.200
4 6 3 19.750
5 6 4 19.750
6 6 5 19.700
7 8 3 15.050
8 8 5 15.400
-Can’t use these functions with Group by
-CEILING(num): Closest integer below num
-FLOOR(num): Closest intger above num
-ROUND(num1, num2): Round num1 to num2 decimal places
-SQRT(num): Takes square root of num
-MOD(num1, num2): Remainder when num1 is divided by num2
-POWER(num1, num2): Raises num1 to the power of num2
#ROUND
df= sqldf("SELECT ROUND(mpg) AS rounded_mpg FROM mtcars")
head(df)
rounded_mpg
1 21
2 21
3 23
4 21
5 19
6 18
#POWER
df= sqldf("SELECT POWER(cyl, 2) AS random FROM mtcars")
head(df)
random
1 36
2 36
3 16
4 36
5 64
6 36
# Read in the data
df_wine = read.csv("wine.csv")
#Check to see that the data has been read in correctly
head(df_wine)
No Grape Winery Appelation State
1 1 Zinfandel Robert Biale St. Helena California
2 2 Zinfandel Chiarello Family Napa Valley California
3 3 Zinfandel Robert Biale Napa Valley California
4 4 Zinfandel Robert Biale Napa Valley California
5 5 Zinfandel Robert Biale St. Helena California
6 6 Zinfandel Pedroncelli Dry Creek Valley California
Name Year Price Score Cases Drink
1 Old Kraft Vineyard 2008 44 93 275 now
2 Giana 2008 35 93 480 now
3 Black Chicken 2008 40 91 2700 2012
4 Napa Ranches 2008 38 89 525 now
5 Varozza Vineyard 2008 44 88 275 2012
6 Mother Clone 2008 15 88 6000 now
#We want Name and Grape to be a character column and Cases to be a numeric
df_wine$Name = as.character(df_wine$Name)
df_wine$Grape = as.character(df_wine$Grape)
#Finding the cheapest wine at each score and sort this
df_rankings = sqldf("SELECT Score, MIN(Price) FROM df_wine GROUP BY Score ORDER BY Score DESC")
head(df_rankings)
Score MIN(Price)
1 98 43
2 97 135
3 96 135
4 95 33
5 94 40
6 93 25
#Find this same thing by year
df_rankings_year = sqldf("SELECT Year, Score, MIN(Price) FROM df_wine GROUP BY Year, Score ORDER BY Year, Score DESC")
head(df_rankings_year)
Year Score MIN(Price)
1 2004 92 75
2 2004 91 50
3 2004 90 45
4 2005 95 60
5 2005 93 29
6 2005 92 28
#Which type of grape has the highest average score
df_rankings_grape = sqldf("SELECT Grape, Avg(Score) AS avg_score FROM df_wine GROUP BY Grape ORDER BY avg_score DESC")
head(df_rankings_grape)
Grape avg_score
1 Cabernet Franc 92.00000
2 Chardonnay 91.09615
3 Roussanne 91.00000
4 Cabernet Sauvingnon 90.95588
5 Pinot Noir 90.68807
6 Syrah 90.54098
#Add the gold label
df_gold_label = sqldf("SELECT *, CASE WHEN (Year BETWEEN 2004 AND 2006) AND Score >=90 THEN 'Gold'
WHEN (Year BETWEEN 2007 AND 2008) AND Score >=93 THEN 'Gold'
WHEN Year >2008 AND Score >=96 THEN 'Gold'
ELSE 'None' END AS medal FROM df_wine ")
#Check to see if it worked
head(df_gold_label)
No Grape Winery Appelation State
1 1 Zinfandel Robert Biale St. Helena California
2 2 Zinfandel Chiarello Family Napa Valley California
3 3 Zinfandel Robert Biale Napa Valley California
4 4 Zinfandel Robert Biale Napa Valley California
5 5 Zinfandel Robert Biale St. Helena California
6 6 Zinfandel Pedroncelli Dry Creek Valley California
Name Year Price Score Cases Drink medal
1 Old Kraft Vineyard 2008 44 93 275 now Gold
2 Giana 2008 35 93 480 now Gold
3 Black Chicken 2008 40 91 2700 2012 None
4 Napa Ranches 2008 38 89 525 now None
5 Varozza Vineyard 2008 44 88 275 2012 None
6 Mother Clone 2008 15 88 6000 now None
#Pick out the gold rows
gold_count = sqldf("SELECT COUNT(*) FROM df_gold_label WHERE medal='Gold'")
gold_count
COUNT(*)
1 155