Jake Feldman
-Download and open Warm_Up.r. I have already the author/book data frames from last lecture
authors
author_id name COB
1 1 J.K. U.S.
2 2 Steinbeck U.S.
books
book_id title translation
1 1 H.P. English
2 2 H.P. German
3 3 H.P. French
4 4 Travels w/ Charley English
5 5 Travels w/ Charley Turkish
6 6 Grapes of Wrath English
book_author
book_id author_id
1 1 1
2 2 1
3 3 1
4 4 2
5 5 2
6 6 2
#Get the author IDs for each book
get_Author_id = sqldf("SELECT B.*, AB.author_id FROM books B
JOIN book_author AB ON B.book_id = AB.book_id ")
Loading required package: tcltk
get_Author_id
book_id title translation author_id
1 1 H.P. English 1
2 2 H.P. German 1
3 3 H.P. French 1
4 4 Travels w/ Charley English 2
5 5 Travels w/ Charley Turkish 2
6 6 Grapes of Wrath English 2
#Get the author names and filter by the books in English
sqldf("SELECT A.name, title, translation FROM get_Author_id B
JOIN authors A ON B.author_id = A.author_id WHERE translation ='English' ")
name title translation
1 J.K. H.P. English
2 Steinbeck Travels w/ Charley English
3 Steinbeck Grapes of Wrath English
#This is how you can do it in one query. Just like arithmetic,
#queries are executed from inside to out
sqldf("SELECT A.name, title, translation FROM authors A JOIN
(SELECT B.*, AB.author_id FROM books B JOIN book_author AB
ON B.book_id = AB.book_id) AS C ON A.author_id = C.author_id
WHERE translation = 'English' ")
A.name title translation
1 J.K. H.P. English
2 Steinbeck Travels w/ Charley English
3 Steinbeck Grapes of Wrath English
head(airlines)
Id Airline Abbreviation Country
1 1 United Airlines UAL USA
2 2 US Airways USAir USA
3 3 Delta Airlines Delta USA
4 4 Southwest Airlines Southwest USA
5 5 American Airlines American USA
6 6 Northwest Airlines Northwest USA
head(airports)
City AirportCode AirportName Country CountryAbbrev
1 Aberdeen APG Phillips AAF United States US
2 Aberdeen ABR Municipal United States US
3 Abilene DYS Dyess AFB United States US
4 Abilene ABI Municipal United States US
5 Abingdon VJI Virginia Highlands United States US
6 Ada ADT Ada United States US
head(flights)
Airline FlightNo SourceAirport DestAirport
1 1 28 APG ASY
2 1 29 ASY APG
3 1 44 CVO ACV
4 1 45 ACV CVO
5 1 54 AHD AHT
6 1 55 AHT AHD
For each airline, find the most popular source and destination city
Gameplan:
-Find the airline associated with each city (W/ Join)
-Find the city associated with each source/destination airport (W/ Join)
-Get counts for each airline
-Pick out the most popular
Find the airline associated with each city (W/ Join)
df_step1 = sqldf("SELECT A.Airline, B.FlightNo, B.SourceAirport,
B.DestAirport FROM flights B JOIN airlines A
ON A.Id = B.Airline ")
head(df_step1)
Airline FlightNo SourceAirport DestAirport
1 United Airlines 28 APG ASY
2 United Airlines 29 ASY APG
3 United Airlines 44 CVO ACV
4 United Airlines 45 ACV CVO
5 United Airlines 54 AHD AHT
6 United Airlines 55 AHT AHD
Find the city associated with each source/destination airport (W/ Join)
df_step2a =sqldf("SELECT A.*,B.City As SourceCity FROM df_step1 A JOIN
airports B ON B.AirportCode = A.SourceAirport ")
df_step2 =sqldf("SELECT A.*,B.City As DestCity FROM df_step2a A JOIN
airports B ON B.AirportCode = A.DestAirport ")
head(df_step2)
Airline FlightNo SourceAirport DestAirport SourceCity DestCity
1 United Airlines 28 APG ASY Aberdeen Ashley
2 United Airlines 29 ASY APG Ashley Aberdeen
3 United Airlines 44 CVO ACV Albany Arcata
4 United Airlines 45 ACV CVO Arcata Albany
5 United Airlines 54 AHD AHT Ardmore Amchitka
6 United Airlines 55 AHT AHD Amchitka Ardmore
Get counts for each airlines for each source and destination
df_step3_source = sqldf("SELECT Airline, SourceCity, COUNT(*)
AS numFlightsSource FROM df_step2 GROUP BY Airline, SourceCity")
df_step3_dest = sqldf("SELECT Airline, DestCity, COUNT(*)
AS numFlightsDest FROM df_step2 GROUP BY Airline, DestCity")
head(df_step3_dest)
Airline DestCity numFlightsDest
1 AirTran Airways Alexandria LA 1
2 AirTran Airways Aliceville AL 3
3 AirTran Airways Amarillo 2
4 AirTran Airways Ann Arbor MI 2
5 AirTran Airways Anniston AL 1
6 AirTran Airways Aberdeen 2
Find Maximums
df_step4_maxsource = sqldf("SELECT Airline, SourceCity,MAX(numFlightsSource) AS popSource
FROM df_step3_source GROUP BY Airline")
head(df_step4_maxsource)
Airline SourceCity popSource
1 AirTran Airways Abilene 5
2 Allegiant Air Altus 5
3 American Airlines Albany 5
4 Continental Airlines Akiak 5
5 Delta Airlines Appleton 4
6 Frontier Airlines Abilene 5
-We can first write the query and then give it to sqldf
-This is one way we can write general queries that take inputs
query= "SELECT cyl, AVG(mpg) FROM mtcars GROUP BY cyl"
class(query)
[1] "character"
sqldf(query)
cyl AVG(mpg)
1 4 26.66364
2 6 19.74286
3 8 15.10000
-We use %s for strings
a = "string"
sprintf("This is where a %s goes.", a)
[1] "This is where a string goes."
-The %f is for decimals
sprintf("Pi as is: %f", pi)
[1] "Pi as is: 3.141593"
sprintf("Pi with 3 decimals of precision :%.3f", pi)
[1] "Pi with 3 decimals of precision :3.142"
sprintf("Pi as integer: %.0f", pi)
[1] "Pi as integer: 3"
-We use %d for integers
sprintf("I had %d goals old %d %s at age %0.1f ", 5,6,"assists", 23.5)
[1] "I had 5 goals old 6 assists at age 23.5 "
#With WHERE clause
query= "SELECT cyl, mpg FROM mtcars WHERE cyl=%1.0f"
cyl=4
newQuery = sprintf(query, cyl)
newQuery
[1] "SELECT cyl, mpg FROM mtcars WHERE cyl=4"
#Run the query
sqldf(newQuery)
cyl mpg
1 4 22.8
2 4 24.4
3 4 22.8
4 4 32.4
5 4 30.4
6 4 33.9
7 4 21.5
8 4 27.3
9 4 26.0
10 4 30.4
11 4 21.4
#With HAVING clause
threshold =15
query= "SELECT cyl, AVG(mpg) FROM mtcars WHERE mpg>=%1.0f GROUP by cyl "
newQuery = sprintf(query, threshold)
newQuery
[1] "SELECT cyl, AVG(mpg) FROM mtcars WHERE mpg>=15 GROUP by cyl "
#Run the query
sqldf(newQuery)
cyl AVG(mpg)
1 4 26.66364
2 6 19.74286
3 8 16.47778
#With GROUP BY clause
categories = c("cyl", "am")
query= "SELECT %s , AVG(mpg) FROM mtcars GROUP by %s"
newQuery = sprintf(query, categories[1],categories[2])
newQuery
[1] "SELECT cyl , AVG(mpg) FROM mtcars GROUP by am"
#Run the query
sqldf(newQuery)
cyl AVG(mpg)
1 8 17.14737
2 4 24.39231
#With Like clause
query = "SELECT feed,AVG(weight) FROM chickwts WHERE feed LIKE '%s'"
feed = "casein"
newQuery = sprintf(query, feed)
newQuery
[1] "SELECT feed,AVG(weight) FROM chickwts WHERE feed LIKE 'casein'"
#Run the query
sqldf(newQuery)
feed AVG(weight)
1 casein 323.5833
-Queries are executed in to out as if this were an arithmetic operation
-Never need to write nested queries but it can make things more concise
#This is how you can do it in one query. Just like arithmetic,
#queries are executed from inside to out
sqldf("SELECT A.name, title, translation FROM authors A JOIN
(SELECT B.*, AB.author_id FROM books B JOIN book_author AB
ON B.book_id = AB.book_id) AS C ON A.author_id = C.author_id
WHERE translation = 'English' ")
A.name title translation
1 J.K. H.P. English
2 Steinbeck Travels w/ Charley English
3 Steinbeck Grapes of Wrath English
#Lets do the Standings question in one query (without the team names)
sqldf("SELECT HomeTeamID, AVG(BinaryWL) AS avgMOV, AVG(MOV) WinPerc FROM (SELECT HomeTeamID, CASE WHEN HomeScore >
AwayScore THEN 1 ELSE 0 END AS BinaryWL,HomeScore-AwayScore AS MOV FROM games) GROUP BY HomeTeamID ORDER BY
WinPerc DESC, avgMOV DESC")
HomeTeamID avgMOV WinPerc
1 31 1.0000000 23.166667
2 11 1.0000000 18.333333
3 4 1.0000000 16.000000
4 2 1.0000000 12.833333
5 1 0.6666667 12.666667
6 9 0.8333333 11.000000
7 7 0.7142857 10.714286
8 23 0.8333333 10.333333
9 32 0.8333333 8.666667
10 25 0.6000000 8.200000
11 17 1.0000000 8.166667
12 10 0.6666667 7.166667
13 28 0.6666667 6.833333
14 20 0.5000000 6.500000
15 24 0.5714286 5.714286
16 29 0.6666667 5.333333
17 18 0.5000000 4.000000
18 27 0.6666667 3.333333
19 13 0.5000000 2.833333
20 22 0.5000000 1.833333
21 21 0.5000000 -1.000000
22 5 0.4285714 -2.000000
23 16 0.5000000 -2.166667
24 14 0.3333333 -2.166667
25 15 0.3333333 -3.000000
26 30 0.4000000 -4.200000
27 8 0.4000000 -5.600000
28 26 0.2857143 -5.714286
29 12 0.2000000 -6.400000
30 6 0.3333333 -7.333333
31 19 0.0000000 -9.333333
32 3 0.1666667 -12.333333
#With team names
sqldf("SELECT B.TeamName, A.WinPerc, A.avgMOV FROM (SELECT HomeTeamID, AVG(BinaryWL) AS avgMOV,
AVG(MOV) WinPerc FROM (SELECT HomeTeamID, CASE WHEN HomeScore > AwayScore THEN 1 ELSE 0 END AS BinaryWL,
HomeScore-AwayScore AS MOV FROM games) GROUP BY HomeTeamID ORDER BY WinPerc DESC, avgMOV DESC) AS A JOIN teams
B ON A.HomeTeamID = B.TeamID")
TeamName WinPerc avgMOV
1 Baltimore Ravens 12.666667 0.6666667
2 Denver Broncos 12.833333 1.0000000
3 Oakland Raiders -12.333333 0.1666667
4 Philadelphia Eagles 16.000000 1.0000000
5 Dallas Cowboys -2.000000 0.4285714
6 Jacksonville Jaguars -7.333333 0.3333333
7 Indianapolis Colts 10.714286 0.7142857
8 Washington Redskins -5.600000 0.4000000
9 San Diego Chargers 11.000000 0.8333333
10 Kansas City Chiefs 7.166667 0.6666667
11 New England Patriots 18.333333 1.0000000
12 Tennessee Titans -6.400000 0.2000000
13 Houston Texans 2.833333 0.5000000
14 New York Giants -2.166667 0.3333333
15 Carolina Panthers -3.000000 0.3333333
16 San Francisco 49ers -2.166667 0.5000000
17 Arizona Cardinals 8.166667 1.0000000
18 St Louis Rams 4.000000 0.5000000
19 Tampa Bay Buccaneers -9.333333 0.0000000
20 Atlanta Falcons 6.500000 0.5000000
21 Minnesota Vikings -1.000000 0.5000000
22 New Orleans Saints 1.833333 0.5000000
23 Seattle Seahawks 10.333333 0.8333333
24 Buffalo Bills 5.714286 0.5714286
25 Miami Dolphins 8.200000 0.6000000
26 New York Jets -5.714286 0.2857143
27 Cleveland Browns 3.333333 0.6666667
28 Pittsburgh Steelers 6.833333 0.6666667
29 Cincinnati Bengals 5.333333 0.6666667
30 Chicago Bears -4.200000 0.4000000
31 Green Bay Packers 23.166667 1.0000000
32 Detroit Lions 8.666667 0.8333333
Let me show you how to make the error bars
#Covert cyl to a factor for nicer plotting
mtcars$cyl = as.factor(mtcars$cyl)
#We will get the average mpg and std for each of the three cylinders
stats = sqldf("SELECT cyl, AVG(mpg) AS avg_mpg, STDEV(mpg) AS stdev_mpg FROM mtcars GROUP BY cyl")
#Lets make the stdev the size of the error bars
ggplot(stats, aes(x = cyl,y=avg_mpg)) + geom_point(size=3, shape=21, fill="blue") +
geom_errorbar(aes(x=cyl,ymin=avg_mpg-stdev_mpg, ymax=avg_mpg+stdev_mpg), colour="red", width=.1) +
geom_label(aes(x= cyl, y =avg_mpg+stdev_mpg, label = cyl), vjust=-0.3 ,size=4)