class: center, middle, inverse, title-slide .title[ # Data types and recoding
💽 ] .author[ ### S. Mason Garrison ] --- layout: true <div class="my-footer"> <span> <a href="https://DataScience4Psych.github.io/DataScience4Psych/" target="_blank">Data Science for Psychologists</a> </span> </div> --- class: middle # Why should you care about data types? --- ## Example: Cat lovers A survey asked respondents their name and number of cats. The instructions said to enter the number of cats as a numerical value. .medi[ ``` r cat_lovers <- read_csv("data/cat-lovers.csv") ``` ] ``` ## # A tibble: 60 × 3 ## name number_of_cats handedness ## <chr> <chr> <chr> ## 1 Bernice Warren 0 left ## 2 Woodrow Stone 0 left ## 3 Willie Bass 1 left ## 4 Tyrone Estrada 3 left ## 5 Alex Daniels 3 left ## 6 Jane Bates 2 left ## 7 Latoya Simpson 1 left ## 8 Darin Woods 1 left ## 9 Agnes Cobb 0 left ## 10 Tabitha Grant 0 left ## # ℹ 50 more rows ``` --- ## Oh why won't you work?! ``` r cat_lovers %>% summarize(mean_cats = mean(number_of_cats)) ``` ``` ## Warning: There was 1 warning in `summarize()`. ## ℹ In argument: `mean_cats = mean(number_of_cats)`. ## Caused by warning in `mean.default()`: ## ! argument is not numeric or logical: returning NA ``` ``` ## # A tibble: 1 × 1 ## mean_cats ## <dbl> ## 1 NA ``` --- .medi[ ``` r ?mean ``` ] <img src="img/mean-help.png" width="75%" style="display: block; margin: auto;" /> --- ## Oh why won't you still work??!! ``` r cat_lovers %>% summarize(mean_cats = mean(number_of_cats, na.rm = TRUE)) ``` ``` ## Warning: There was 1 warning in `summarize()`. ## ℹ In argument: `mean_cats = mean(number_of_cats, na.rm = TRUE)`. ## Caused by warning in `mean.default()`: ## ! argument is not numeric or logical: returning NA ``` ``` ## # A tibble: 1 × 1 ## mean_cats ## <dbl> ## 1 NA ``` --- ## Take a breath and look at your data .question[ What is the type of the `number_of_cats` variable? 
] ``` r glimpse(cat_lovers) ``` ``` ## Rows: 60 ## Columns: 3 ## $ name <chr> "Bernice Warren", "Woodrow Stone", "Will… ## $ number_of_cats <chr> "0", "0", "1", "3", "3", "2", "1", "1", … ## $ handedness <chr> "left", "left", "left", "left", "left", … ``` --- ## Let's take another look <div id="htmlwidget-4eb53065b7328530dcdf" style="width:100%;height:90%;" class="datatables html-widget"></div> <script type="application/json" data-for="htmlwidget-4eb53065b7328530dcdf">{"x":{"filter":"none","data":[["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30","31","32","33","34","35","36","37","38","39","40","41","42","43","44","45","46","47","48","49","50","51","52","53","54","55","56","57","58","59","60"],["Bernice Warren","Woodrow Stone","Willie Bass","Tyrone Estrada","Alex Daniels","Jane Bates","Latoya Simpson","Darin Woods","Agnes Cobb","Tabitha Grant","Perry Cross","Wanda Silva","Alicia Sims","Emily Logan","Woodrow Elliott","Brent Copeland","Pedro Carlson","Patsy Luna","Brett Robbins","Oliver George","Calvin Perry","Lora Gutierrez","Charlotte Sparks","Earl Mack","Leslie Wade","Santiago Barker","Jose Bell","Lynda Smith","Bradford Marshall","Irving Miller","Caroline Simpson","Frances Welch","Melba Jenkins","Veronica Morales","Juanita Cunningham","Maurice Howard","Teri Pierce","Phil Franklin","Jan Zimmerman","Leslie Price","Bessie Patterson","Ethel Wolfe","Naomi Wright","Sadie Frank","Lonnie Cannon","Tony Garcia","Darla Newton","Ginger Clark","Lionel Campbell","Florence Klein","Harriet Leonard","Terrence Harrington","Travis Garner","Doug Bass","Pat Norris","Dawn Young","Shari Alvarez","Tamara Robinson","Megan Morgan","Kara Obrien"],["0","0","1","3","3","2","1","1","0","0","0","0","1","3","3","2","1","1","0","0","1","1","0","0","4","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","0","1","3","3","2","1","1.5 - honestly I think one of my cats is half 
human","0","0","1","0","1","three","1","1","1","0","0","2"],["left","left","left","left","left","left","left","left","left","left","left","left","left","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","right","ambidextrous","ambidextrous","ambidextrous","ambidextrous","ambidextrous"]],"container":"<table class=\"display\">\n <thead>\n <tr>\n <th> <\/th>\n <th>name<\/th>\n <th>number_of_cats<\/th>\n <th>handedness<\/th>\n <\/tr>\n <\/thead>\n<\/table>","options":{"order":[],"autoWidth":false,"orderClasses":false,"columnDefs":[{"orderable":false,"targets":0}]}},"evals":[],"jsHooks":[]}</script> --- ## You might need to babysit your respondents .midi[ ``` r cat_lovers %>% mutate(number_of_cats = case_when( name == "Ginger Clark" ~ 2, name == "Doug Bass" ~ 3, TRUE ~ as.numeric(number_of_cats) )) %>% summarize(mean_cats = mean(number_of_cats)) ``` ``` ## Warning: There was 1 warning in `mutate()`. ## ℹ In argument: `number_of_cats = case_when(...)`. ## Caused by warning: ## ! NAs introduced by coercion ``` ``` ## # A tibble: 1 × 1 ## mean_cats ## <dbl> ## 1 0.833 ``` ] --- ## Always you need to respect data types ``` r cat_lovers %>% mutate( number_of_cats = case_when( name == "Ginger Clark" ~ "2", name == "Doug Bass" ~ "3", TRUE ~ number_of_cats ), number_of_cats = as.numeric(number_of_cats) ) %>% summarize(mean_cats = mean(number_of_cats)) ``` ``` ## # A tibble: 1 × 1 ## mean_cats ## <dbl> ## 1 0.833 ``` --- ## Now that we know what we're doing... 
``` r *cat_lovers <- cat_lovers %>% mutate( number_of_cats = case_when( name == "Ginger Clark" ~ "2", name == "Doug Bass" ~ "3", TRUE ~ number_of_cats ), number_of_cats = as.numeric(number_of_cats) ) ``` --- ## Moral of the story - If your data does not behave how you expect it to, type coercion upon reading in the data might be the reason. - Go in and investigate your data, apply the fix, *save your data*, live happily ever after. --- class: middle # Wrapping Up... --- class: middle .hand-blue[now that we have a good motivation for] .hand-blue[learning about data types in R] <br> .hand-blue[let's learn about data types in R!] --- class: middle # Data types --- ## Data types in R - **logical** - **double** - **integer** - **character** - and some more, but we won't be focusing on those --- ## Logical & character .pull-left[ **logical** - boolean values `TRUE` and `FALSE` ``` r typeof(TRUE) ``` ``` ## [1] "logical" ``` ] .pull-right[ **character** - character strings ``` r typeof("hello") ``` ``` ## [1] "character" ``` ] --- ## Double & integer .pull-left[ **double** - floating point numerical values (default numerical type) ``` r typeof(1.335) ``` ``` ## [1] "double" ``` ``` r typeof(7) ``` ``` ## [1] "double" ``` ] .pull-right[ **integer** - integer numerical values (indicated with an `L`) ``` r typeof(7L) ``` ``` ## [1] "integer" ``` ``` r typeof(1:3) ``` ``` ## [1] "integer" ``` ] --- ## Concatenation Vectors can be constructed using the `c()` function. ``` r c(1, 2, 3) ``` ``` ## [1] 1 2 3 ``` ``` r c("Hello", "World!") ``` ``` ## [1] "Hello" "World!" ``` ``` r c(c("hi", "hello"), c("bye", "jello")) ``` ``` ## [1] "hi" "hello" "bye" "jello" ``` --- ## Converting between types .hand[with intention...] 
.medi.pull-left[ ``` r x <- 1:3 x ``` ``` ## [1] 1 2 3 ``` ``` r typeof(x) ``` ``` ## [1] "integer" ``` ``` r y <- as.character(x) y ``` ``` ## [1] "1" "2" "3" ``` ``` r typeof(y) ``` ``` ## [1] "character" ``` ] -- .medi.pull-right[ ``` r x <- c(TRUE, FALSE) x ``` ``` ## [1] TRUE FALSE ``` ``` r typeof(x) ``` ``` ## [1] "logical" ``` ``` r y <- as.numeric(x) y ``` ``` ## [1] 1 0 ``` ``` r typeof(y) ``` ``` ## [1] "double" ``` ] --- ## Converting between types .hand[without intention...] R will happily convert between various types without complaint when different types of data are concatenated in a vector, and that's not always a great thing! .pull-left[ ``` r c(1, "Hello") ``` ``` ## [1] "1" "Hello" ``` ``` r c(FALSE, 3L) ``` ``` ## [1] 0 3 ``` ] .pull-right[ ``` r c(1.2, 3L) ``` ``` ## [1] 1.2 3.0 ``` ``` r c(2L, "two") ``` ``` ## [1] "2" "two" ``` ] --- ## Explicit vs. implicit coercion Let's give formal names to what we've seen so far: - **Explicit coercion** is when you call a function like `as.logical()`, `as.numeric()`, `as.integer()`, `as.double()`, or `as.character()`. - **Implicit coercion** happens when you use a vector in a specific context that expects a certain type of vector. --- .medi.your-turn[ - [class git repo](https://github.com/DataScience4Psych) > `AE 05 - Hotels + Data types` > open `type-coercion.Rmd` and knit. - What is the type of the given vectors? First, guess. Then, try it out in R. If your guess was correct, great! If not, discuss why they have that type. ] .midi[ **Example:** Suppose we want to know the type of `c(1, "a")`. First, I'd look at: .pull-left[ ``` r typeof(1) ``` ``` ## [1] "double" ``` ] .pull-right[ ``` r typeof("a") ``` ``` ## [1] "character" ``` ] and make a guess based on these. Then finally I'd check: .pull-left[ ``` r typeof(c(1, "a")) ``` ``` ## [1] "character" ``` ] ] --- class: middle # Wrapping Up... 
--- class: middle # Special values --- ## Special values - `NA`: Not available - `NaN`: Not a number - `Inf`: Positive infinity - `-Inf`: Negative infinity -- .pull-left[ ``` r pi / 0 ``` ``` ## [1] Inf ``` ``` r 0 / 0 ``` ``` ## [1] NaN ``` ] .pull-right[ ``` r 1/0 - 1/0 ``` ``` ## [1] NaN ``` ``` r 1/0 + 1/0 ``` ``` ## [1] Inf ``` ] --- ## `NA`s are special ❄️s ``` r x <- c(1, 2, 3, 4, NA) ``` ``` r mean(x) ``` ``` ## [1] NA ``` ``` r mean(x, na.rm = TRUE) ``` ``` ## [1] 2.5 ``` ``` r summary(x) ``` ``` ## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's ## 1.00 1.75 2.50 2.50 3.25 4.00 1 ``` --- ## `NA`s are logical R uses `NA` to represent missing values in its data structures. ``` r typeof(NA) ``` ``` ## [1] "logical" ``` --- ## Mental model for `NA`s - Unlike `NaN`, `NA`s are genuinely unknown values - But that doesn't mean they can't function in a logical way - Let's think about why `NA`s are logical... -- .question[ Why do the following give different answers? ] .pull-left[ ``` r # TRUE or NA TRUE | NA ``` ``` ## [1] TRUE ``` ] .pull-right[ ``` r # FALSE or NA FALSE | NA ``` ``` ## [1] NA ``` ] `\(\rightarrow\)` See next slide for answers... --- - `NA` is unknown, so it could be `TRUE` or `FALSE` .medi.pull-left[ - `TRUE` or `TRUE` is `TRUE` and `TRUE` or `FALSE` is also `TRUE`; since both are `TRUE`, `TRUE | NA` is `TRUE` ``` r TRUE | TRUE ``` ``` ## [1] TRUE ``` ``` r FALSE | TRUE ``` ``` ## [1] TRUE ``` ] .medi.pull-right[ - `FALSE` or `TRUE` is `TRUE` but `FALSE` or `FALSE` is `FALSE`, so you can't tell which should be the right answer, and `FALSE | NA` is `NA` ``` r FALSE | TRUE ``` ``` ## [1] TRUE ``` ``` r FALSE | FALSE ``` ``` ## [1] FALSE ``` ] - This doesn't make sense for mathematical operations but makes sense in the context of missing data --- class: middle # Wrapping Up... 
--- class: middle # Data classes --- ## Data classes We talked about *types* so far; next we'll introduce the concept of *classes* - Vectors are like Lego building blocks - We stick them together to build more complicated constructs, e.g. *representations of data* - The **class** attribute relates to the S3 class of an object which determines its behaviour - You don't need to worry about what S3 classes really mean, but you can read more about it [here](https://adv-r.hadley.nz/s3.html#s3-classes) if you're curious - Examples: factors, dates, and data frames --- ## Factors R uses factors to handle categorical variables, variables that have a fixed and known set of possible values ``` r x <- factor(c("BS", "MS", "PhD", "MS")) x ``` ``` ## [1] BS MS PhD MS ## Levels: BS MS PhD ``` .pull-left[ ``` r typeof(x) ``` ``` ## [1] "integer" ``` ] .pull-right[ ``` r class(x) ``` ``` ## [1] "factor" ``` ] --- ## More on factors We can think of factors like a character (level labels) and an integer (level numbers) glued together ``` r glimpse(x) ``` ``` ## Factor w/ 3 levels "BS","MS","PhD": 1 2 3 2 ``` ``` r as.integer(x) ``` ``` ## [1] 1 2 3 2 ``` --- ## Dates ``` r y <- as.Date("2020-01-01") y ``` ``` ## [1] "2020-01-01" ``` ``` r typeof(y) ``` ``` ## [1] "double" ``` ``` r class(y) ``` ``` ## [1] "Date" ``` --- ## More on dates We can think of dates like an integer (the number of days since the origin, 1 Jan 1970) and an integer (the origin) glued together ``` r as.integer(y) ``` ``` ## [1] 18262 ``` ``` r as.integer(y) / 365 # roughly 50 yrs ``` ``` ## [1] 50.03288 ``` --- ## Data frames We can think of data frames like vectors of equal length glued together ``` r df <- data.frame(x = 1:2, y = 3:4) df ``` ``` ## x y ## 1 1 3 ## 2 2 4 ``` .pull-left[ ``` r typeof(df) ``` ``` ## [1] "list" ``` ] .pull-right[ ``` r class(df) ``` ``` ## [1] "data.frame" ``` ] --- ## Lists Lists are a generic vector container; vectors of any type can go in them ``` r l <- list( x = 1:4, y = c("hi", 
"hello", "jello"), z = c(TRUE, FALSE) ) l ``` ``` ## $x ## [1] 1 2 3 4 ## ## $y ## [1] "hi" "hello" "jello" ## ## $z ## [1] TRUE FALSE ``` --- ## Lists and data frames - A data frame is a special list containing vectors of equal length - When we use the `pull()` function, we extract a vector from the data frame ``` r df ``` ``` ## x y ## 1 1 3 ## 2 2 4 ``` ``` r df %>% pull(y) ``` ``` ## [1] 3 4 ``` --- class: middle # Wrapping Up... --- class: middle # Working with factors --- ## Read data in as character strings ``` r glimpse(cat_lovers) ``` ``` ## Rows: 60 ## Columns: 3 ## $ name <chr> "Bernice Warren", "Woodrow Stone", "Will… ## $ number_of_cats <dbl> 0, 0, 1, 3, 3, 2, 1, 1, 0, 0, 0, 0, 1, 3… ## $ handedness <chr> "left", "left", "left", "left", "left", … ``` --- ## But coerce when plotting .midi[ ``` r ggplot(cat_lovers, mapping = aes(x = handedness)) + geom_bar() ``` <img src="d11_types_files/figure-html/unnamed-chunk-45-1.png" width="65%" style="display: block; margin: auto;" /> ] --- ## Use forcats to manipulate factors .midi[ ``` r cat_lovers %>% * mutate(handedness = fct_infreq(handedness)) %>% ggplot(mapping = aes(x = handedness)) + geom_bar() ``` <img src="d11_types_files/figure-html/unnamed-chunk-46-1.png" width="65%" style="display: block; margin: auto;" /> ] --- ## Come for the functionality .pull-left[ ... stay for the logo ] .pull-right[ <img src="img/forcats-part-of-tidyverse.png" width="75%" style="display: block; margin: auto;" /> ] - Factors are useful when you have true categorical data and you want to override the ordering of character vectors to improve display - They are also useful in modeling scenarios - The **forcats** package provides a suite of useful tools that solve common problems with factors --- .small.your-turn[ - [class git repo](https://github.com/DataScience4Psych) > `AE 05 - Hotels + Data types` > `hotels-forcats.Rmd` > knit - Recreate the following. The x-axis first, then, as a stretch goal, the y-axis. 
] <img src="d11_types_files/figure-html/unnamed-chunk-48-1.png" width="95%" style="display: block; margin: auto;" /> --- class: middle # Wrapping Up... --- class: middle # Working with dates --- ## Make a date .pull-left[ <img src="img/lubridate-not-part-of-tidyverse.png" width="67%" style="display: block; margin: auto;" /> ] .pull-right[ - **lubridate** is the tidyverse-friendly package that makes dealing with dates a little easier - It's not one of the *core* tidyverse packages, hence it's installed with `install.packages("tidyverse")` but it's not loaded with it, and needs to be explicitly loaded with `library(lubridate)` ] --- class: middle .hand-blue[ we're just going to scratch the surface of working with dates in R here... ] --- .question[ Calculate and visualize the number of bookings on any given arrival date. ] ``` r hotels %>% select(starts_with("arrival_")) ``` ``` ## # A tibble: 119,390 × 4 ## arrival_date_year arrival_date_month arrival_date_week_number ## <dbl> <chr> <dbl> ## 1 2015 July 27 ## 2 2015 July 27 ## 3 2015 July 27 ## 4 2015 July 27 ## 5 2015 July 27 ## 6 2015 July 27 ## 7 2015 July 27 ## 8 2015 July 27 ## 9 2015 July 27 ## 10 2015 July 27 ## # ℹ 119,380 more rows ## # ℹ 1 more variable: arrival_date_day_of_month <dbl> ``` --- ## Step 1. Put together dates. .medi[ ``` r library(glue) hotels %>% mutate( * arrival_date = glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}") ) %>% select(starts_with("arrival_")) ``` ``` ## # A tibble: 119,390 × 5 ## arrival_date_year arrival_date_month arrival_date_week_number ## <dbl> <chr> <dbl> ## 1 2015 July 27 ## 2 2015 July 27 ## 3 2015 July 27 ## 4 2015 July 27 ## 5 2015 July 27 ## 6 2015 July 27 ## 7 2015 July 27 ## 8 2015 July 27 ## 9 2015 July 27 ## 10 2015 July 27 ## # ℹ 119,380 more rows ## # ℹ 2 more variables: arrival_date_day_of_month <dbl>, ## # arrival_date <glue> ``` ] --- ## Step 2. Count number of bookings per date. 
.medi[ ``` r hotels %>% mutate(arrival_date = glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}")) %>% count(arrival_date) ``` ``` ## # A tibble: 793 × 2 ## arrival_date n ## <glue> <int> ## 1 2015 August 1 110 ## 2 2015 August 10 207 ## 3 2015 August 11 117 ## 4 2015 August 12 133 ## 5 2015 August 13 107 ## 6 2015 August 14 329 ## 7 2015 August 15 190 ## 8 2015 August 16 98 ## 9 2015 August 17 188 ## 10 2015 August 18 94 ## # ℹ 783 more rows ``` ] --- ## Step 3. Visualize number of bookings per date. .small[ ``` r hotels %>% mutate(arrival_date = glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}")) %>% count(arrival_date) %>% ggplot(aes(x = arrival_date, y = n, group = 1)) + geom_line()+ theme_bw() ``` <img src="d11_types_files/figure-html/unnamed-chunk-53-1.png" width="55%" style="display: block; margin: auto;" /> ] --- .hand[zooming in a bit...] .question[ Why does the plot start with August when we know our data start in July? And why does 10 August come after 1 August? ] .midi[ <img src="d11_types_files/figure-html/unnamed-chunk-54-1.png" width="70%" style="display: block; margin: auto;" /> ] --- ## Step 1. `REVISED` Put together dates `as dates`. .medi[ ``` r library(lubridate) hotels %>% mutate( * arrival_date = ymd(glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}")) ) %>% select(starts_with("arrival_")) ``` ``` ## # A tibble: 119,390 × 5 ## arrival_date_year arrival_date_month arrival_date_week_number ## <dbl> <chr> <dbl> ## 1 2015 July 27 ## 2 2015 July 27 ## 3 2015 July 27 ## 4 2015 July 27 ## 5 2015 July 27 ## 6 2015 July 27 ## 7 2015 July 27 ## 8 2015 July 27 ## 9 2015 July 27 ## 10 2015 July 27 ## # ℹ 119,380 more rows ## # ℹ 2 more variables: arrival_date_day_of_month <dbl>, ## # arrival_date <date> ``` ] --- ## Step 2. Count number of bookings per date. 
.medi[ ``` r hotels %>% mutate(arrival_date = ymd(glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}"))) %>% count(arrival_date) ``` ``` ## # A tibble: 793 × 2 ## arrival_date n ## <date> <int> ## 1 2015-07-01 122 ## 2 2015-07-02 93 ## 3 2015-07-03 56 ## 4 2015-07-04 88 ## 5 2015-07-05 53 ## 6 2015-07-06 75 ## 7 2015-07-07 54 ## 8 2015-07-08 69 ## 9 2015-07-09 80 ## 10 2015-07-10 51 ## # ℹ 783 more rows ``` ] --- ## Step 3a. Visualize number of bookings per date. .small[ ``` r hotels %>% mutate(arrival_date = ymd(glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}"))) %>% count(arrival_date) %>% ggplot(aes(x = arrival_date, y = n, group = 1)) + geom_line()+theme_bw() ``` <img src="d11_types_files/figure-html/unnamed-chunk-57-1.png" width="55%" style="display: block; margin: auto;" /> ] --- ## Step 3b. Visualize using a smooth curve. .small[ ``` r hotels %>% mutate(arrival_date = ymd(glue("{arrival_date_year} {arrival_date_month} {arrival_date_day_of_month}"))) %>% count(arrival_date) %>% ggplot(aes(x = arrival_date, y = n, group = 1)) + * geom_smooth() + theme_bw() ``` <img src="d11_types_files/figure-html/unnamed-chunk-58-1.png" width="55%" style="display: block; margin: auto;" /> ] --- # Sources - Mine Çetinkaya-Rundel's Data Science in a Box ([link](https://datasciencebox.org/)) - Julia Fukuyama's EDA ([link](https://jfukuyama.github.io/)) --- class: middle # Wrapping Up...