Tidy tuesday seemed like a really nice game for me, hence I’ve tried to do something quick with the dataset of the week.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::vars() masks ggplot2::vars()
library(readxl)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
The dataset has been downloaded and stored in a data
folder of the project.
download.file(url = "https://github.com/rfordatascience/tidytuesday/blob/master/data/us_avg_tuition.xlsx?raw=true", destfile = "../data/us_avg_tuition.xlsx")
raw <- readxl::read_excel("../data/us_avg_tuition.xlsx")
class(raw)
## [1] "tbl_df" "tbl" "data.frame"
glimpse(raw)
## Observations: 50
## Variables: 13
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor...
## $ `2004-05` <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 47...
## $ `2005-06` <dbl> 5840.550, 4632.623, 5415.516, 6082.379, 5527.881, 54...
## $ `2006-07` <dbl> 5753.496, 4918.501, 5481.419, 6231.977, 5334.826, 55...
## $ `2007-08` <dbl> 6008.169, 5069.822, 5681.638, 6414.900, 5672.472, 62...
## $ `2008-09` <dbl> 6475.092, 5075.482, 6058.464, 6416.503, 5897.888, 62...
## $ `2009-10` <dbl> 7188.954, 5454.607, 7263.204, 6627.092, 7258.771, 69...
## $ `2010-11` <dbl> 8071.134, 5759.153, 8839.605, 6900.912, 8193.739, 77...
## $ `2011-12` <dbl> 8451.902, 5762.421, 9966.716, 7028.991, 9436.426, 83...
## $ `2012-13` <dbl> 9098.069, 6026.143, 10133.503, 7286.580, 9360.574, 8...
## $ `2013-14` <dbl> 9358.929, 6012.445, 10296.200, 7408.495, 9274.193, 9...
## $ `2014-15` <dbl> 9496.084, 6148.808, 10413.844, 7606.410, 9186.824, 9...
## $ `2015-16` <dbl> 9751.101, 6571.340, 10646.278, 7867.297, 9269.844, 9...
There are twelve year observations per state. I decided to use all the years.
# tidy dataset
mydata <- gather(raw, key = Year, value = Tuition, -State)
glimpse(mydata)
## Observations: 600
## Variables: 3
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ Year <chr> "2004-05", "2004-05", "2004-05", "2004-05", "2004-05",...
## $ Tuition <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 4703...
# clean years to have for digit numbers
mydata <-
mydata %>%
mutate(Year = str_sub(Year, start = 1, end = 4))
glimpse(mydata)
## Observations: 600
## Variables: 3
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ Year <chr> "2004", "2004", "2004", "2004", "2004", "2004", "2004"...
## $ Tuition <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 4703...
unique(mydata$State)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
mydata1 <-
mydata %>%
select(State, Tuition) %>%
group_by(State) %>%
summarise(Mean_State = mean(Tuition)) %>%
arrange(desc(Mean_State))
glimpse(mydata1)
## Observations: 50
## Variables: 2
## $ State <chr> "Vermont", "New Hampshire", "New Jersey", "Pennsylv...
## $ Mean_State <dbl> 13067.232, 12781.375, 12054.295, 11969.935, 11227.8...
ggplot(mydata1) +
geom_bar(aes(x = reorder(State, Mean_State), y = Mean_State),
stat = "identity", position = "dodge",
fill = "cadetblue3") +
coord_flip() +
theme_LPD()
mydata2 <-
mydata %>%
group_by(State) %>%
mutate(Mean_State = mean(Tuition)) %>%
ungroup() %>%
mutate(AwayFromMean = Mean_State - mean(Mean_State)) %>%
mutate(Category = ifelse(AwayFromMean < 0, "less than the mean", "higher than the mean")) %>%
arrange(desc(AwayFromMean)) %>%
select(State, AwayFromMean, Category) %>%
distinct()
glimpse(mydata2)
## Observations: 50
## Variables: 3
## $ State <chr> "Vermont", "New Hampshire", "New Jersey", "Pennsy...
## $ AwayFromMean <dbl> 5167.96992, 4882.11302, 4155.03306, 4070.67317, 3...
## $ Category <chr> "higher than the mean", "higher than the mean", "...
ggplot(mydata2) +
geom_bar(aes(x = reorder(State, -AwayFromMean), y = AwayFromMean,
fill = Category),
stat = "identity", position = "dodge") +
geom_hline(yintercept = 0, linetype = 2) +
coord_flip() +
scale_fill_brewer() +
labs(x = "State", y = "Away from the mean tuition") +
theme_LPD()
This is an Rmarkdown document, you can find the code here.