Joins 2

Jake Feldman

Warm UP

-Download and open Warm_Up.r. I have already the author/book data frames from last lecture

  1. Find the titles and and authors of all books in English

Warm Up Solutions

  1. Find the titles and and authors of all books in English
authors
  author_id      name  COB
1         1      J.K. U.S.
2         2 Steinbeck U.S.
books
  book_id              title translation
1       1               H.P.     English
2       2               H.P.      German
3       3               H.P.      French
4       4 Travels w/ Charley     English
5       5 Travels w/ Charley     Turkish
6       6    Grapes of Wrath     English
book_author
  book_id author_id
1       1         1
2       2         1
3       3         1
4       4         2
5       5         2
6       6         2
#Get the author IDs for each book
get_Author_id = sqldf("SELECT B.*, AB.author_id FROM books B 
                      JOIN book_author AB ON B.book_id = AB.book_id ")
Loading required package: tcltk
get_Author_id
  book_id              title translation author_id
1       1               H.P.     English         1
2       2               H.P.      German         1
3       3               H.P.      French         1
4       4 Travels w/ Charley     English         2
5       5 Travels w/ Charley     Turkish         2
6       6    Grapes of Wrath     English         2
#Get the author names and filter by the books in English
sqldf("SELECT A.name, title, translation FROM get_Author_id B 
JOIN authors A ON B.author_id = A.author_id WHERE translation ='English' ")
       name              title translation
1      J.K.               H.P.     English
2 Steinbeck Travels w/ Charley     English
3 Steinbeck    Grapes of Wrath     English
#This is how you can do it in one query.  Just like arithmetic, 
#queries are executed from inside to out
sqldf("SELECT  A.name, title, translation FROM authors A JOIN 
(SELECT B.*, AB.author_id FROM books B JOIN book_author AB 
ON B.book_id = AB.book_id) AS C ON A.author_id = C.author_id 
WHERE translation = 'English' ")
     A.name              title translation
1      J.K.               H.P.     English
2 Steinbeck Travels w/ Charley     English
3 Steinbeck    Grapes of Wrath     English

Airlines Data Set

head(airlines)
  Id            Airline Abbreviation Country
1  1    United Airlines          UAL     USA
2  2         US Airways        USAir     USA
3  3     Delta Airlines        Delta     USA
4  4 Southwest Airlines    Southwest     USA
5  5  American Airlines     American     USA
6  6 Northwest Airlines    Northwest     USA
head(airports)
       City AirportCode         AirportName        Country CountryAbbrev
1 Aberdeen          APG       Phillips AAF  United States            US 
2 Aberdeen          ABR          Municipal  United States            US 
3  Abilene          DYS          Dyess AFB  United States            US 
4  Abilene          ABI          Municipal  United States            US 
5 Abingdon          VJI Virginia Highlands  United States            US 
6      Ada          ADT                Ada  United States            US 
head(flights)
  Airline FlightNo SourceAirport DestAirport
1       1       28           APG         ASY
2       1       29           ASY         APG
3       1       44           CVO         ACV
4       1       45           ACV         CVO
5       1       54           AHD         AHT
6       1       55           AHT         AHD

Airlines Data Practice

For each airline, find the most popular source and destination city

Gameplan:

-Find the airline associated with each city (W/ Join)

-Find the city associated with each source/destination airport (W/ Join)

-Get counts for each airline

-Pick out the most popular

Find the airline associated with each city (W/ Join)

df_step1 = sqldf("SELECT A.Airline, B.FlightNo, B.SourceAirport, 
B.DestAirport FROM flights B JOIN airlines A 
ON A.Id = B.Airline ")

head(df_step1)
          Airline FlightNo SourceAirport DestAirport
1 United Airlines       28           APG         ASY
2 United Airlines       29           ASY         APG
3 United Airlines       44           CVO         ACV
4 United Airlines       45           ACV         CVO
5 United Airlines       54           AHD         AHT
6 United Airlines       55           AHT         AHD

Find the city associated with each source/destination airport (W/ Join)

df_step2a  =sqldf("SELECT A.*,B.City As SourceCity FROM df_step1 A JOIN 
airports B ON B.AirportCode = A.SourceAirport  ")

df_step2  =sqldf("SELECT A.*,B.City As DestCity FROM df_step2a A JOIN 
airports B ON B.AirportCode = A.DestAirport  ")

head(df_step2)
          Airline FlightNo SourceAirport DestAirport SourceCity  DestCity
1 United Airlines       28           APG         ASY  Aberdeen    Ashley 
2 United Airlines       29           ASY         APG    Ashley  Aberdeen 
3 United Airlines       44           CVO         ACV    Albany    Arcata 
4 United Airlines       45           ACV         CVO    Arcata    Albany 
5 United Airlines       54           AHD         AHT   Ardmore  Amchitka 
6 United Airlines       55           AHT         AHD  Amchitka   Ardmore 

Get counts for each airlines for each source and destination

df_step3_source = sqldf("SELECT Airline, SourceCity, COUNT(*) 
AS numFlightsSource FROM df_step2 GROUP BY Airline, SourceCity")

df_step3_dest = sqldf("SELECT Airline, DestCity, COUNT(*) 
AS numFlightsDest FROM df_step2 GROUP BY Airline, DestCity")

head(df_step3_dest)
          Airline         DestCity numFlightsDest
1 AirTran Airways  Alexandria LA                1
2 AirTran Airways  Aliceville AL                3
3 AirTran Airways      Amarillo                 2
4 AirTran Airways   Ann Arbor MI                2
5 AirTran Airways    Anniston AL                1
6 AirTran Airways        Aberdeen               2

Find Maximums

df_step4_maxsource = sqldf("SELECT Airline, SourceCity,MAX(numFlightsSource) AS popSource 
                           FROM df_step3_source GROUP BY Airline")

head(df_step4_maxsource)
               Airline SourceCity popSource
1      AirTran Airways   Abilene          5
2        Allegiant Air     Altus          5
3    American Airlines    Albany          5
4 Continental Airlines     Akiak          5
5       Delta Airlines  Appleton          4
6    Frontier Airlines   Abilene          5

Query As Character

-We can first write the query and then give it to sqldf

-This is one way we can write general queries that take inputs

query= "SELECT cyl, AVG(mpg) FROM mtcars GROUP BY cyl"
class(query)
[1] "character"
sqldf(query)
  cyl AVG(mpg)
1   4 26.66364
2   6 19.74286
3   8 15.10000

Using sprintf

-We use %s for strings

a = "string"
sprintf("This is where a %s goes.", a)
[1] "This is where a string goes."

-The %f is for decimals

sprintf("Pi as is: %f", pi)        
[1] "Pi as is: 3.141593"
sprintf("Pi with 3 decimals of precision :%.3f", pi)     
[1] "Pi with 3 decimals of precision :3.142"
sprintf("Pi as integer: %.0f", pi)     
[1] "Pi as integer: 3"

-We use %d for integers

sprintf("I had %d goals old %d %s at age %0.1f ", 5,6,"assists", 23.5)        
[1] "I had 5 goals old 6 assists at age 23.5 "

Writing General Queries

#With WHERE clause
query= "SELECT cyl, mpg FROM mtcars WHERE cyl=%1.0f"
cyl=4
newQuery = sprintf(query, cyl)
newQuery
[1] "SELECT cyl, mpg FROM mtcars WHERE cyl=4"
#Run the query
sqldf(newQuery)
   cyl  mpg
1    4 22.8
2    4 24.4
3    4 22.8
4    4 32.4
5    4 30.4
6    4 33.9
7    4 21.5
8    4 27.3
9    4 26.0
10   4 30.4
11   4 21.4
#With HAVING clause
threshold =15
query= "SELECT cyl, AVG(mpg) FROM mtcars WHERE mpg>=%1.0f GROUP by cyl "
newQuery = sprintf(query, threshold)
newQuery
[1] "SELECT cyl, AVG(mpg) FROM mtcars WHERE mpg>=15 GROUP by cyl "
#Run the query
sqldf(newQuery)
  cyl AVG(mpg)
1   4 26.66364
2   6 19.74286
3   8 16.47778
#With GROUP BY clause
categories = c("cyl", "am")
query= "SELECT %s , AVG(mpg) FROM mtcars  GROUP by %s"
newQuery = sprintf(query, categories[1],categories[2])
newQuery
[1] "SELECT cyl , AVG(mpg) FROM mtcars  GROUP by am"
#Run the query
sqldf(newQuery)
  cyl AVG(mpg)
1   8 17.14737
2   4 24.39231
#With Like clause
query = "SELECT feed,AVG(weight) FROM chickwts WHERE feed LIKE  '%s'"
feed = "casein"
newQuery = sprintf(query, feed)
newQuery
[1] "SELECT feed,AVG(weight) FROM chickwts WHERE feed LIKE  'casein'"
#Run the query
sqldf(newQuery)
    feed AVG(weight)
1 casein    323.5833

Writing Nested Queries

-Queries are executed in to out as if this were an arithmetic operation

-Never need to write nested queries but it can make things more concise

#This is how you can do it in one query.  Just like arithmetic, 
#queries are executed from inside to out
sqldf("SELECT  A.name, title, translation FROM authors A JOIN 
(SELECT B.*, AB.author_id FROM books B JOIN book_author AB 
ON B.book_id = AB.book_id) AS C ON A.author_id = C.author_id 
WHERE translation = 'English' ")
     A.name              title translation
1      J.K.               H.P.     English
2 Steinbeck Travels w/ Charley     English
3 Steinbeck    Grapes of Wrath     English
#Lets do the Standings question in one query (without the team names)
sqldf("SELECT HomeTeamID, AVG(BinaryWL) AS avgMOV, AVG(MOV) WinPerc FROM (SELECT HomeTeamID, CASE WHEN HomeScore >
      AwayScore THEN 1 ELSE 0 END AS BinaryWL,HomeScore-AwayScore AS MOV FROM games) GROUP BY HomeTeamID ORDER BY
      WinPerc   DESC, avgMOV DESC")
   HomeTeamID    avgMOV    WinPerc
1          31 1.0000000  23.166667
2          11 1.0000000  18.333333
3           4 1.0000000  16.000000
4           2 1.0000000  12.833333
5           1 0.6666667  12.666667
6           9 0.8333333  11.000000
7           7 0.7142857  10.714286
8          23 0.8333333  10.333333
9          32 0.8333333   8.666667
10         25 0.6000000   8.200000
11         17 1.0000000   8.166667
12         10 0.6666667   7.166667
13         28 0.6666667   6.833333
14         20 0.5000000   6.500000
15         24 0.5714286   5.714286
16         29 0.6666667   5.333333
17         18 0.5000000   4.000000
18         27 0.6666667   3.333333
19         13 0.5000000   2.833333
20         22 0.5000000   1.833333
21         21 0.5000000  -1.000000
22          5 0.4285714  -2.000000
23         16 0.5000000  -2.166667
24         14 0.3333333  -2.166667
25         15 0.3333333  -3.000000
26         30 0.4000000  -4.200000
27          8 0.4000000  -5.600000
28         26 0.2857143  -5.714286
29         12 0.2000000  -6.400000
30          6 0.3333333  -7.333333
31         19 0.0000000  -9.333333
32          3 0.1666667 -12.333333
#With team names
sqldf("SELECT B.TeamName, A.WinPerc, A.avgMOV FROM (SELECT HomeTeamID, AVG(BinaryWL) AS avgMOV, 
      AVG(MOV) WinPerc FROM (SELECT HomeTeamID, CASE WHEN HomeScore > AwayScore THEN 1 ELSE 0 END AS BinaryWL,
      HomeScore-AwayScore AS MOV FROM games) GROUP BY HomeTeamID ORDER BY WinPerc DESC, avgMOV DESC) AS A JOIN teams
      B ON A.HomeTeamID = B.TeamID")
               TeamName    WinPerc    avgMOV
1      Baltimore Ravens  12.666667 0.6666667
2        Denver Broncos  12.833333 1.0000000
3       Oakland Raiders -12.333333 0.1666667
4   Philadelphia Eagles  16.000000 1.0000000
5        Dallas Cowboys  -2.000000 0.4285714
6  Jacksonville Jaguars  -7.333333 0.3333333
7    Indianapolis Colts  10.714286 0.7142857
8   Washington Redskins  -5.600000 0.4000000
9    San Diego Chargers  11.000000 0.8333333
10   Kansas City Chiefs   7.166667 0.6666667
11 New England Patriots  18.333333 1.0000000
12     Tennessee Titans  -6.400000 0.2000000
13       Houston Texans   2.833333 0.5000000
14      New York Giants  -2.166667 0.3333333
15    Carolina Panthers  -3.000000 0.3333333
16  San Francisco 49ers  -2.166667 0.5000000
17    Arizona Cardinals   8.166667 1.0000000
18        St Louis Rams   4.000000 0.5000000
19 Tampa Bay Buccaneers  -9.333333 0.0000000
20      Atlanta Falcons   6.500000 0.5000000
21    Minnesota Vikings  -1.000000 0.5000000
22   New Orleans Saints   1.833333 0.5000000
23     Seattle Seahawks  10.333333 0.8333333
24        Buffalo Bills   5.714286 0.5714286
25       Miami Dolphins   8.200000 0.6000000
26        New York Jets  -5.714286 0.2857143
27     Cleveland Browns   3.333333 0.6666667
28  Pittsburgh Steelers   6.833333 0.6666667
29   Cincinnati Bengals   5.333333 0.6666667
30        Chicago Bears  -4.200000 0.4000000
31    Green Bay Packers  23.166667 1.0000000
32        Detroit Lions   8.666667 0.8333333

Builing Fantasy Football Tool

Let me show you how to make the error bars

#Covert cyl to a factor for nicer plotting
mtcars$cyl = as.factor(mtcars$cyl)

#We will get the average mpg and std for each of the three cylinders
stats = sqldf("SELECT cyl, AVG(mpg) AS avg_mpg, STDEV(mpg) AS stdev_mpg FROM mtcars GROUP BY cyl")


#Lets make the stdev the size of the error bars 
ggplot(stats, aes(x = cyl,y=avg_mpg)) + geom_point(size=3, shape=21, fill="blue") +
  geom_errorbar(aes(x=cyl,ymin=avg_mpg-stdev_mpg, ymax=avg_mpg+stdev_mpg), colour="red", width=.1) +
  geom_label(aes(x= cyl, y =avg_mpg+stdev_mpg, label = cyl), vjust=-0.3 ,size=4)