Tidy tuesday

Tidy tuesday seemed like a really nice game for me, hence I’ve tried to do something quick with the dataset of the week.

1. Libraries

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::vars()   masks ggplot2::vars()
library(readxl)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

2. download the dataset

The dataset has been downloaded and stored in a data folder of the project.

download.file(url = "https://github.com/rfordatascience/tidytuesday/blob/master/data/us_avg_tuition.xlsx?raw=true", destfile = "../data/us_avg_tuition.xlsx")

3. Read the data

raw <- readxl::read_excel("../data/us_avg_tuition.xlsx")

4. Explore the data a bit

class(raw)
## [1] "tbl_df"     "tbl"        "data.frame"
glimpse(raw)
## Observations: 50
## Variables: 13
## $ State     <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Califor...
## $ `2004-05` <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 47...
## $ `2005-06` <dbl> 5840.550, 4632.623, 5415.516, 6082.379, 5527.881, 54...
## $ `2006-07` <dbl> 5753.496, 4918.501, 5481.419, 6231.977, 5334.826, 55...
## $ `2007-08` <dbl> 6008.169, 5069.822, 5681.638, 6414.900, 5672.472, 62...
## $ `2008-09` <dbl> 6475.092, 5075.482, 6058.464, 6416.503, 5897.888, 62...
## $ `2009-10` <dbl> 7188.954, 5454.607, 7263.204, 6627.092, 7258.771, 69...
## $ `2010-11` <dbl> 8071.134, 5759.153, 8839.605, 6900.912, 8193.739, 77...
## $ `2011-12` <dbl> 8451.902, 5762.421, 9966.716, 7028.991, 9436.426, 83...
## $ `2012-13` <dbl> 9098.069, 6026.143, 10133.503, 7286.580, 9360.574, 8...
## $ `2013-14` <dbl> 9358.929, 6012.445, 10296.200, 7408.495, 9274.193, 9...
## $ `2014-15` <dbl> 9496.084, 6148.808, 10413.844, 7606.410, 9186.824, 9...
## $ `2015-16` <dbl> 9751.101, 6571.340, 10646.278, 7867.297, 9269.844, 9...

There are twelve year observations per state. I decided to use all the years.

5. tidy the data

# tidy dataset
mydata <- gather(raw, key = Year, value = Tuition, -State)
glimpse(mydata)
## Observations: 600
## Variables: 3
## $ State   <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ Year    <chr> "2004-05", "2004-05", "2004-05", "2004-05", "2004-05",...
## $ Tuition <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 4703...
# clean years to have for digit numbers
mydata <- 
    mydata %>% 
    mutate(Year = str_sub(Year, start = 1, end = 4))

glimpse(mydata)
## Observations: 600
## Variables: 3
## $ State   <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi...
## $ Year    <chr> "2004", "2004", "2004", "2004", "2004", "2004", "2004"...
## $ Tuition <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, 4703...

5.1 How many states are in this dataset?

unique(mydata$State)
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

6. Mean by state

mydata1 <-
    mydata %>% 
    select(State, Tuition) %>% 
    group_by(State) %>% 
    summarise(Mean_State = mean(Tuition)) %>% 
    arrange(desc(Mean_State))
glimpse(mydata1)
## Observations: 50
## Variables: 2
## $ State      <chr> "Vermont", "New Hampshire", "New Jersey", "Pennsylv...
## $ Mean_State <dbl> 13067.232, 12781.375, 12054.295, 11969.935, 11227.8...

7. Visualising the mean tuition by state

ggplot(mydata1) +
    geom_bar(aes(x = reorder(State, Mean_State), y = Mean_State),
             stat = "identity", position = "dodge",
                 fill = "cadetblue3") +
    coord_flip() +
    theme_LPD()

8. Another way of visualizing in respect of the mean

mydata2 <-
    mydata %>% 
    group_by(State) %>% 
    mutate(Mean_State = mean(Tuition)) %>% 
    ungroup() %>% 
    mutate(AwayFromMean = Mean_State - mean(Mean_State)) %>% 
    mutate(Category = ifelse(AwayFromMean < 0, "less than the mean", "higher than the mean")) %>% 
    arrange(desc(AwayFromMean)) %>% 
    select(State, AwayFromMean, Category) %>% 
    distinct()
glimpse(mydata2)
## Observations: 50
## Variables: 3
## $ State        <chr> "Vermont", "New Hampshire", "New Jersey", "Pennsy...
## $ AwayFromMean <dbl> 5167.96992, 4882.11302, 4155.03306, 4070.67317, 3...
## $ Category     <chr> "higher than the mean", "higher than the mean", "...

9. Which states have higher and lower tuition from the mean?

ggplot(mydata2) +
    geom_bar(aes(x = reorder(State, -AwayFromMean), y = AwayFromMean, 
                 fill = Category),
             stat = "identity", position = "dodge") +
    geom_hline(yintercept = 0, linetype = 2) +
    coord_flip() +
    scale_fill_brewer() +
    labs(x = "State", y = "Away from the mean tuition") +
    theme_LPD()

This is an Rmarkdown document, you can find the code here.