Warm Up Solutions

How many chickens have a weight above 200?

df = sqldf("SELECT COUNT(*) AS 'Heavier than 200' FROM chickwts 
           WHERE weight >=200")
df

  Heavier than 200
1               54

What is the average weigth of chicken given each feed type?

df = sqldf("SELECT feed , AVG(Weight) AS 'Avg. Weight' FROM chickwts 
           GROUP BY feed")
df

       feed Avg. Weight
1    casein    323.5833
2 horsebean    160.2000
3   linseed    218.7500
4  meatmeal    276.9091
5   soybean    246.4286
6 sunflower    328.9167

What is the average weight of chicks having a weight between 200 and 300 for each feed type?

df = sqldf("SELECT feed , AVG(Weight) AS 'Avg. Weight'  FROM chickwts 
           WHERE weight >=200 AND weight<=300 GROUP BY feed")
df

       feed Avg. Weight
1    casein    245.2500
2 horsebean    222.0000
3   linseed    239.5714
4  meatmeal    245.2000
5   soybean    251.0000
6 sunflower    272.6667

Having Clause

-Like WHERE clause but used after a GROUP BY

-Filters after groups have been formed

-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition

#Using HAVING
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars  GROUP BY cyl 
           HAVING count>8")
df

  cyl count
1   4    11
2   8    14

#HAVING with a WHERE clause
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars WHERE mpg>20  
 GROUP BY cyl HAVING count>8")
df

  cyl count
1   4    11

Order By Clause

-ORDER BY determines order in which rows are return: ASC DESC

-Recall that for basic sorting we saw how to do this with out SQL

-Comes after the HAVING

-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition ORDER BY columns ASC/DESC

#Sorted by mpg
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC")
head(df)

#Order by multiple columns
df = sqldf("SELECT cyl,hp FROM mtcars ORDER BY cyl DESC, hp DESC")
head(df,12)

#Combining everything so far. You can see that the ordering here happens after
#the groups have been formed
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars WHERE mpg>15  
GROUP BY cyl HAVING count>3 ORDER BY count DESC")
df

  cyl count
1   4    11
2   8     8
3   6     7

Limit Clause

-LIMIT controls the number of rows we get

-Good way to find the top 3, for example, of some list

-Can specify starting point

-SELECT FUNC(columns) FROM df WHERE condition GROUP BY column HAVING condition ORDER BY columns ASC/DESC LIMIT number

#Sorted by mpg show only 5 rows
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 5")
df

#Sorted by mpg show rows 3-8
df = sqldf("SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 3,5")
df

Advanced Filtering

#BETWEEN (includes boundaries)
df = sqldf("SELECT mpg FROM mtcars  WHERE mpg BETWEEN 30.4 AND 32.4 ")
head(df,10)

   mpg
1 32.4
2 30.4
3 30.4

# NOT BETWEEN (does not include boundaries)
df = sqldf("SELECT mpg FROM mtcars  WHERE mpg NOT BETWEEN 30.4 AND 32.4 ")
head(df,10)

#IN
df = sqldf("SELECT mpg,cyl FROM mtcars  WHERE cyl IN (4,6) ")
head(df,10)

    mpg cyl
1  21.0   6
2  21.0   6
3  22.8   4
4  21.4   6
5  18.1   6
6  24.4   4
7  22.8   4
8  19.2   6
9  17.8   6
10 32.4   4

#LIKE
#Create column with car names to filter on
mtcars$Cars = rownames(mtcars)
head(mtcars)

                   mpg cyl disp  hp drat    wt  qsec vs am gear carb
Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
                               Cars
Mazda RX4                 Mazda RX4
Mazda RX4 Wag         Mazda RX4 Wag
Datsun 710               Datsun 710
Hornet 4 Drive       Hornet 4 Drive
Hornet Sportabout Hornet Sportabout
Valiant                     Valiant

#Get all Mercedes Cars - This gives us 0 rows! Tries to match exactly
df = sqldf("SELECT mpg,cyl FROM mtcars  WHERE Cars Like 'Merc' ")
head(df,10)

[1] mpg cyl
<0 rows> (or 0-length row.names)

#Get a specific Mercedes Cars 
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars  WHERE Cars Like 'Merc 240D' ")
head(df,10)

       Cars  mpg cyl
1 Merc 240D 24.4   4

#% is a substitute for zero or more characters
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars  WHERE Cars Like '%Merc%' ")
df

         Cars  mpg cyl
1   Merc 240D 24.4   4
2    Merc 230 22.8   4
3    Merc 280 19.2   6
4   Merc 280C 17.8   6
5  Merc 450SE 16.4   8
6  Merc 450SL 17.3   8
7 Merc 450SLC 15.2   8

#_ is a substitute for one character 
#(this is how you do two conditions in the like)
df = sqldf("SELECT Cars, mpg,cyl FROM mtcars  WHERE Cars Like '_oyo%' 
           OR Cars Like '%Ford%' ")
df

            Cars  mpg cyl
1 Toyota Corolla 33.9   4
2  Toyota Corona 21.5   4
3 Ford Pantera L 15.8   8

CASE

-Use in SELECT, WHERE, HAVING

-Use Like an IF statement

#In SELECT statement we can use it to create a new column
df= sqldf("SELECT hp, CASE WHEN hp>=150 THEN 'Fast' 
          WHEN hp >= 100 AND hp <=150 THEN 'Medium' 
          ELSE 'Slow' END AS Speed FROM mtcars ")
head(df,8)

   hp  Speed
1 110 Medium
2 110 Medium
3  93   Slow
4 110 Medium
5 175   Fast
6 105 Medium
7 245   Fast
8  62   Slow

#With HAVING or WHERE we use it to filter on different criterion
df= sqldf("SELECT cyl, gear,  AVG(mpg) AS avg_mpg FROM mtcars 
          GROUP BY cyl, gear HAVING CASE WHEN cyl =4 THEN avg_mpg > 20 
          ELSE avg_mpg>15 END ORDER BY cyl, gear  ")
df

  cyl gear avg_mpg
1   4    3  21.500
2   4    4  26.925
3   4    5  28.200
4   6    3  19.750
5   6    4  19.750
6   6    5  19.700
7   8    3  15.050
8   8    5  15.400

Individual Row Functions

-Can’t use these functions with Group by

-CEILING(num): Closest integer below num

-FLOOR(num): Closest intger above num

-ROUND(num1, num2): Round num1 to num2 decimal places

-SQRT(num): Takes square root of num

-MOD(num1, num2): Remainder when num1 is divided by num2

-POWER(num1, num2): Raises num1 to the power of num2

#ROUND
df= sqldf("SELECT ROUND(mpg) AS rounded_mpg FROM mtcars")
head(df)

  rounded_mpg
1          21
2          21
3          23
4          21
5          19
6          18

#POWER
df= sqldf("SELECT POWER(cyl, 2) AS random FROM mtcars")
head(df)

Wine Data Set

# Read in the data
df_wine = read.csv("wine.csv")

#Check to see that the data has been read in correctly
head(df_wine)

  No     Grape           Winery       Appelation      State
1  1 Zinfandel     Robert Biale       St. Helena California
2  2 Zinfandel Chiarello Family      Napa Valley California
3  3 Zinfandel     Robert Biale      Napa Valley California
4  4 Zinfandel     Robert Biale      Napa Valley California
5  5 Zinfandel     Robert Biale       St. Helena California
6  6 Zinfandel      Pedroncelli Dry Creek Valley California
                Name Year Price Score Cases Drink
1 Old Kraft Vineyard 2008    44    93   275   now
2              Giana 2008    35    93   480   now
3      Black Chicken 2008    40    91  2700  2012
4       Napa Ranches 2008    38    89   525   now
5   Varozza Vineyard 2008    44    88   275  2012
6       Mother Clone 2008    15    88  6000   now

#We want Name and Grape to be a character column and Cases to be a numeric
df_wine$Name = as.character(df_wine$Name)
df_wine$Grape = as.character(df_wine$Grape)

Find the cheapest wine at each score and sort this by descending score.

#Finding the cheapest wine at each score and sort this
df_rankings = sqldf("SELECT Score, MIN(Price) FROM df_wine GROUP BY Score ORDER BY Score DESC")
head(df_rankings)

  Score MIN(Price)
1    98         43
2    97        135
3    96        135
4    95         33
5    94         40
6    93         25

Find the cheapest wine at each score in each year. First sort by year and then by score.

#Find this same thing  by year
df_rankings_year = sqldf("SELECT Year, Score, MIN(Price) FROM df_wine GROUP BY Year, Score ORDER BY Year, Score DESC")
head(df_rankings_year)

  Year Score MIN(Price)
1 2004    92         75
2 2004    91         50
3 2004    90         45
4 2005    95         60
5 2005    93         29
6 2005    92         28

Which type of grape has the highest average score?

#Which type of grape has the highest average score
df_rankings_grape = sqldf("SELECT Grape, Avg(Score) AS avg_score FROM df_wine GROUP BY Grape ORDER BY avg_score DESC")
head(df_rankings_grape)

                Grape avg_score
1      Cabernet Franc  92.00000
2          Chardonnay  91.09615
3           Roussanne  91.00000
4 Cabernet Sauvingnon  90.95588
5          Pinot Noir  90.68807
6               Syrah  90.54098

Between the years 2004 and 2006 a wine received a gold medal if it had a score of at least 90. In 2007 and 2008 this threshold increased to 93. After 2008, it increased to 96. How many wines in this list received a gold medal?

#Add the gold label
df_gold_label = sqldf("SELECT *, CASE WHEN (Year BETWEEN 2004 AND 2006) AND Score >=90 THEN 'Gold'
                WHEN (Year BETWEEN 2007 AND 2008) AND Score >=93 THEN 'Gold'
                WHEN Year >2008 AND Score >=96 THEN 'Gold'
                ELSE 'None' END AS medal FROM df_wine ")

#Check to see if it worked
head(df_gold_label)

  No     Grape           Winery       Appelation      State
1  1 Zinfandel     Robert Biale       St. Helena California
2  2 Zinfandel Chiarello Family      Napa Valley California
3  3 Zinfandel     Robert Biale      Napa Valley California
4  4 Zinfandel     Robert Biale      Napa Valley California
5  5 Zinfandel     Robert Biale       St. Helena California
6  6 Zinfandel      Pedroncelli Dry Creek Valley California
                Name Year Price Score Cases Drink medal
1 Old Kraft Vineyard 2008    44    93   275   now  Gold
2              Giana 2008    35    93   480   now  Gold
3      Black Chicken 2008    40    91  2700  2012  None
4       Napa Ranches 2008    38    89   525   now  None
5   Varozza Vineyard 2008    44    88   275  2012  None
6       Mother Clone 2008    15    88  6000   now  None

#Pick out the gold rows
gold_count = sqldf("SELECT COUNT(*) FROM df_gold_label WHERE medal='Gold'")
gold_count

  COUNT(*)
1      155

Basic SQL 2

Warm UP