A04 - Data Wrangling

A04 - Data Wrangling


Revised

19 Jun 2023


Programming Environment

library(tidyverse)  # includes lots of data verbs like `group_by()` and `summarise()`
library(mosaicData) # includes the `HELPmiss` data set

# Print the R version, platform, and attached/loaded package versions
# so the results below can be traced to a specific environment
sessionInfo()
── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
 dplyr     1.1.2      readr     2.1.4
 forcats   1.0.0      stringr   1.5.0
 ggplot2   3.4.3      tibble    3.2.1
 lubridate 1.9.2      tidyr     1.3.0
 purrr     1.0.2     
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()
 Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.4.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] mosaicData_0.20.3 lubridate_1.9.2   forcats_1.0.0     stringr_1.5.0    
 [5] dplyr_1.1.2       purrr_1.0.2       readr_2.1.4       tidyr_1.3.0      
 [9] tibble_3.2.1      ggplot2_3.4.3     tidyverse_2.0.0  

loaded via a namespace (and not attached):
 [1] gtable_0.3.3     jsonlite_1.8.5   compiler_4.3.0   crayon_1.5.2    
 [5] tidyselect_1.2.0 IRdisplay_1.1    scales_1.2.1     uuid_1.1-0      
 [9] fastmap_1.1.1    IRkernel_1.3.2   R6_2.5.1         generics_0.1.3  
[13] munsell_0.5.0    pillar_1.9.0     tzdb_0.4.0       rlang_1.1.1     
[17] utf8_1.2.3       stringi_1.7.12   repr_1.1.6       timechange_0.2.0
[21] cli_3.6.1        withr_2.5.0      magrittr_2.0.3   digest_0.6.31   
[25] grid_4.3.0       base64enc_0.1-3  hms_1.1.3        pbdZMQ_0.3-9    
[29] lifecycle_1.0.3  vctrs_0.6.3      evaluate_0.21    glue_1.6.2      
[33] fansi_1.0.4      colorspace_2.1-0 tools_4.3.0      pkgconfig_2.0.3 
[37] htmltools_0.5.5 

# `HELPmiss` is usable below without an explicit load once `mosaicData` is
# attached; the explicit form is kept here for reference:
#   data("HELPmiss", package = "mosaicData")

# Open the data set's help page, then preview its first rows
help("HELPmiss", package = "mosaicData")
head(HELPmiss)
A data.frame: 6 × 28
ageanysubcesdd1daysanysubdayslinkdrugriske2bfemalesexpcspss_frracegrpsatreatsexrisksubstancetreatavg_drinksmax_drinkshospitalizations
<int><fct><int><int><int><int><int><int><int><fct><dbl><int><fct><fct><int><fct><fct><int><int><int>
137yes49 3177225 0NA0male 58.41369 0blackno 4cocaineyes1326 3
237yes3022 2 NA 0NA0male 36.03694 1whiteno 7alcoholyes566222
326yes39 0 336520NA0male 74.8063313blackno 2heroin no 0 0 0
439yes15 2189343 0 11female61.9316811whiteyes4heroin no 5 5 2
532yes3912 2 57 0 10male 37.3455810blackno 6cocaineno 101312
647yes 6 1 31365 0NA1female46.47521 5blackno 5cocaineyes 4 4 1
# How many rows (cases) are in the data set?
HELPmiss |>
  summarise(count = n())
A data.frame: 1 × 1
count
<int>
470
# Sum of `e2b` across all cases, skipping missing values
HELPmiss |>
  summarise(total_e2b = sum(e2b, na.rm = TRUE))
A data.frame: 1 × 1
total_e2b
<int>
549
# Average of `daysanysub` across all cases, skipping missing values
HELPmiss |>
  summarise(mean_daysanysub = mean(daysanysub, na.rm = TRUE))
A data.frame: 1 × 1
mean_daysanysub
<dbl>
75.13095
# The same three summaries (case count, `e2b` total, `daysanysub` mean),
# computed separately for each level of `sex`
HELPmiss |>
  group_by(sex) |>
  summarise(
    count = n(),
    total_e2b = sum(e2b, na.rm = TRUE),
    mean_daysanysub = mean(daysanysub, na.rm = TRUE)
  )
A tibble: 2 × 4
sexcounttotal_e2bmean_daysanysub
<fct><int><int><dbl>
male 35945772.60513
female111 9283.77193
# Case count, `e2b` total, and `daysanysub` mean for each level of `homeless`
HELPmiss |>
  group_by(homeless) |>
  summarise(
    count = n(),
    total_e2b = sum(e2b, na.rm = TRUE),
    mean_daysanysub = mean(daysanysub, na.rm = TRUE)
  )
A tibble: 2 × 4
homelesscounttotal_e2bmean_daysanysub
<fct><int><int><dbl>
housed 25116077.10078
homeless21938973.06504
# Case count, `e2b` total, and `daysanysub` mean for each level of `substance`
HELPmiss |>
  group_by(substance) |>
  summarise(
    count = n(),
    total_e2b = sum(e2b, na.rm = TRUE),
    mean_daysanysub = mean(daysanysub, na.rm = TRUE)
  )
A tibble: 4 × 4
substancecounttotal_e2bmean_daysanysub
<fct><int><int><dbl>
alcohol18524278.49495
cocaine156 9886.67033
heroin 12820852.82258
missing 1 1 NaN
# Summaries for every `homeless` x `sex` combination.
# No `.groups` argument is given, so dplyr reports how the result stays grouped.
HELPmiss |>
  group_by(homeless, sex) |>
  summarise(
    count = n(),
    total_e2b = sum(e2b, na.rm = TRUE),
    mean_daysanysub = mean(daysanysub, na.rm = TRUE)
  )
`summarise()` has grouped output by 'homeless'. You can override using the `.groups` argument.
A grouped_df: 4 × 5
homelesssexcounttotal_e2bmean_daysanysub
<fct><fct><int><int><dbl>
housed male 18211074.30851
housed female 69 5084.60000
homelessmale 17734771.01980
homelessfemale 42 4282.45455
# Summaries for every `homeless` x `substance` combination.
# No `.groups` argument is given, so dplyr reports how the result stays grouped.
HELPmiss |>
  group_by(homeless, substance) |>
  summarise(
    count = n(),
    total_e2b = sum(e2b, na.rm = TRUE),
    mean_daysanysub = mean(daysanysub, na.rm = TRUE)
  )
`summarise()` has grouped output by 'homeless'. You can override using the `.groups` argument.
A grouped_df: 7 × 5
homelesssubstancecounttotal_e2bmean_daysanysub
<fct><fct><int><int><dbl>
housed alcohol 76 34104.79487
housed cocaine 96 37 81.05556
housed heroin 79 89 41.16667
homelessalcohol109208 61.40000
homelesscocaine 60 61 94.86486
homelessheroin 49119 68.96154
homelessmissing 1 1 NaN
# Tally cases per `substance`, most frequent first
HELPmiss |>
  count(substance, sort = TRUE)
A data.frame: 4 × 2
substancen
<fct><int>
alcohol185
cocaine156
heroin 128
missing 1
# Box plots of `age`, one per `substance` level, distinguished by fill colour
ggplot(HELPmiss, aes(x = age, fill = substance)) +
  geom_boxplot()
../../../../_images/699ca13c71dfad44b9a4fe56aec8a7c78ed8656407c37422aefed602f3ce9fac.png
# Set the `repr` plot dimensions used when rendering the faceted chart below
options(repr.plot.width = 20, repr.plot.height = 10)

# Stacked bar chart of case counts by `age`, filled by `substance`,
# laid out as a grid of panels: rows = `sex`, columns = `racegrp`
ggplot(HELPmiss, aes(x = age, fill = substance)) +
  geom_bar(stat = "count", position = "stack", show.legend = TRUE) +
  facet_grid(rows = vars(sex), cols = vars(racegrp)) +
  theme(text = element_text(size = 20))
../../../../_images/30fa5f7d3bfa066972aef2aa56352b5ede1cdd6b3db3fb4739a5c4d32295137d.png