Data Computing 2#
Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
Revised
19 Jun 2023
Programming Environment#
# install.packages('devtools')
# devtools::install_github('mdbeckman/dcData')
packages <- c(
'data.table', # library(data.table)
'dcData', # library(dcData)
'foreign', # library(foreign)
'leaflet', # library(leaflet)
'lubridate', # library(lubridate)
'mosaic', # library(mosaic)
'mosaicData', # library(mosaicData)
'Rcpp', # library(Rcpp)
'rvest', # library(rvest)
'tidyverse' # library(tidyverse)
)
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])
}
# Load packages
invisible(lapply(packages, library, character.only = TRUE))
str_c('EXECUTED : ', now())
sessionInfo()
# R.version.string # R.Version()
# .libPaths()
# installed.packages()
Attaching package: ‘lubridate’
The following objects are masked from ‘package:data.table’:
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
Registered S3 method overwritten by 'mosaic':
method from
fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Attaching package: ‘mosaic’
The following objects are masked from ‘package:dplyr’:
count, do, tally
The following object is masked from ‘package:Matrix’:
mean
The following object is masked from ‘package:ggplot2’:
stat
The following objects are masked from ‘package:stats’:
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
quantile, sd, t.test, var
The following objects are masked from ‘package:base’:
max, mean, min, prod, range, sample, sum
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ purrr 1.0.2 ✔ tibble 3.2.1
✔ readr 2.1.4 ✔ tidyr 1.3.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between() masks data.table::between()
✖ mosaic::count() masks dplyr::count()
✖ purrr::cross() masks mosaic::cross()
✖ mosaic::do() masks dplyr::do()
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks data.table::first()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ lubridate::hour() masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::last() masks data.table::last()
✖ lubridate::mday() masks data.table::mday()
✖ lubridate::minute() masks data.table::minute()
✖ lubridate::month() masks data.table::month()
✖ tidyr::pack() masks Matrix::pack()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second() masks data.table::second()
✖ mosaic::stat() masks ggplot2::stat()
✖ mosaic::tally() masks dplyr::tally()
✖ purrr::transpose() masks data.table::transpose()
✖ tidyr::unpack() masks Matrix::unpack()
✖ lubridate::wday() masks data.table::wday()
✖ lubridate::week() masks data.table::week()
✖ lubridate::yday() masks data.table::yday()
✖ lubridate::year() masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 15.1
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] forcats_1.0.0 stringr_1.5.0 purrr_1.0.2 readr_2.1.4
[5] tidyr_1.3.0 tibble_3.2.1 tidyverse_2.0.0 rvest_1.0.3
[9] Rcpp_1.0.10 mosaic_1.8.4.2 mosaicData_0.20.3 ggformula_0.10.4
[13] dplyr_1.1.2 Matrix_1.5-4 ggplot2_3.4.3 lattice_0.21-8
[17] lubridate_1.9.2 leaflet_2.2.0 foreign_0.8-84 dcData_0.1.0
[21] data.table_1.14.8
loaded via a namespace (and not attached):
[1] utf8_1.2.3 generics_0.1.3 xml2_1.3.4 stringi_1.7.12
[5] hms_1.1.3 digest_0.6.31 magrittr_2.0.3 evaluate_0.21
[9] grid_4.3.0 timechange_0.2.0 pbdZMQ_0.3-9 fastmap_1.1.1
[13] jsonlite_1.8.5 httr_1.4.6 fansi_1.0.4 crosstalk_1.2.0
[17] scales_1.2.1 tweenr_2.0.2 cli_3.6.1 labelled_2.11.0
[21] rlang_1.1.1 crayon_1.5.2 polyclip_1.10-4 munsell_0.5.0
[25] base64enc_0.1-3 withr_2.5.0 repr_1.1.6 tools_4.3.0
[29] tzdb_0.4.0 uuid_1.1-0 colorspace_2.1-0 mosaicCore_0.9.2.1
[33] IRdisplay_1.1 vctrs_0.6.3 R6_2.5.1 ggridges_0.5.4
[37] lifecycle_1.0.3 htmlwidgets_1.6.2 ggstance_0.3.6 MASS_7.3-58.4
[41] pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.3 glue_1.6.2
[45] ggforce_0.4.1 haven_2.5.2 tidyselect_1.2.0 IRkernel_1.3.2
[49] farver_2.1.1 htmltools_0.5.5 compiler_4.3.0
Issue
Issues using Leaflet (and the terra dependency in particular) in VSCode on macOS.
The following is an attempted, but so far failed, approach.
brew install gdal proj
remotes::install_github("rspatial/terra", configure.args = "--with-gdal-config=/opt/homebrew/bin/gdal-config --with-proj-include=/opt/homebrew/opt/proj/include --with-proj-lib=/opt/homebrew/opt/proj/lib --with-proj-share=/opt/homebrew/opt/proj/share/proj", type = "source", force = TRUE)
02 - Computing with R#
data(package = 'dcData')
data('NCHS', package = 'dcData')
View(NCHS)
Show code cell output
sex | age | pregnant | ethnicity | death | followup | smoker | diabetic | height | weight | ⋯ | bmd | fmhm_other | hdl | chol | bps | bpd | income | pop_weight | psu | stratum |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <fct> | <fct> | <chr> | <dbl> | <fct> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
female | 2 | no | Non-Hispanic Black | NA | NA | no | 0 | 0.916 | 12.5 | ⋯ | NA | NA | NA | NA | NA | NA | 0.86 | 2970.8044 | 1 | 5 |
male | 77 | no | Non-Hispanic White | alive | 90 | no | 0 | 1.740 | 75.4 | ⋯ | 1.2196 | 0.12359 | 54 | 215 | 98 | 56 | 5.00 | 10224.1331 | 3 | 1 |
female | 10 | no | Non-Hispanic White | NA | NA | no | 0 | 1.366 | 32.9 | ⋯ | NA | NA | 30 | 129 | 108 | 63 | 1.47 | 14172.3111 | 2 | 7 |
male | 1 | no | Non-Hispanic Black | NA | NA | no | 0 | NA | 13.3 | ⋯ | NA | NA | NA | NA | NA | NA | 0.57 | 3041.5927 | 1 | 2 |
male | 49 | no | Non-Hispanic White | alive | 74 | yes | 0 | 1.783 | 92.5 | ⋯ | 1.0870 | 1.17688 | 42 | 279 | 122 | 83 | 5.00 | 30657.3119 | 2 | 8 |
female | 19 | no | Other/Multi | alive | 86 | no | 0 | 1.620 | 59.2 | ⋯ | 0.8680 | -1.22452 | 61 | 153 | 114 | 70 | 1.21 | 12224.8756 | 2 | 2 |
female | 59 | no | Non-Hispanic Black | alive | 76 | no | 0 | 1.629 | 78.0 | ⋯ | 1.0870 | -0.18566 | 105 | 245 | 123 | 81 | 0.00 | 8930.8802 | 2 | 4 |
male | 13 | no | Non-Hispanic White | NA | NA | no | 0 | 1.620 | 40.7 | ⋯ | 0.8706 | -0.48886 | 67 | 162 | 98 | 52 | 0.53 | 9823.0003 | 1 | 6 |
female | 11 | no | Non-Hispanic Black | NA | NA | no | 0 | 1.569 | 45.5 | ⋯ | NA | NA | 58 | 148 | 112 | 50 | 0.00 | 2345.5760 | 2 | 9 |
male | 43 | no | Non-Hispanic Black | alive | 79 | no | 0 | 1.901 | 111.8 | ⋯ | 1.3000 | -0.18884 | 51 | 140 | 142 | 95 | 0.00 | 7484.2336 | 1 | 7 |
male | 15 | no | Non-Hispanic White | NA | NA | no | 0 | 1.719 | 65.0 | ⋯ | 1.2090 | -0.41861 | 40 | 132 | 107 | 51 | 1.25 | 10758.6505 | 2 | 1 |
male | 37 | no | Non-Hispanic White | alive | 82 | no | 0 | 1.800 | 99.2 | ⋯ | 1.1990 | 0.20224 | 38 | 156 | 174 | 99 | 4.93 | 30448.0967 | 2 | 6 |
male | 70 | no | Mexican American | cardiovascular death | 16 | no | 1 | 1.577 | 63.6 | ⋯ | 0.8464 | 1.36872 | 49 | 314 | 130 | 66 | 1.07 | 752.5150 | 2 | 13 |
male | 81 | no | Non-Hispanic White | alive | 85 | yes | 0 | 1.662 | 75.5 | ⋯ | 1.0550 | 0.95469 | 40 | 174 | 136 | 61 | 2.67 | 6626.6640 | 1 | 12 |
female | 38 | no | Non-Hispanic White | alive | 92 | yes | 0 | 1.749 | 81.6 | ⋯ | 1.1174 | -0.24117 | 58 | 199 | 109 | 69 | 4.52 | 34313.1432 | 2 | 11 |
female | 85 | no | Non-Hispanic Black | other | 62 | no | 0 | 1.442 | 41.5 | ⋯ | 0.8510 | -0.06773 | 55 | 164 | 139 | 60 | 0.38 | 5872.4952 | 1 | 11 |
male | 2 | no | Non-Hispanic Black | NA | NA | no | 0 | 0.886 | 11.4 | ⋯ | NA | NA | NA | NA | NA | NA | 0.92 | 2570.3422 | 1 | 5 |
female | 1 | no | Non-Hispanic White | NA | NA | no | 0 | NA | 11.1 | ⋯ | NA | NA | NA | NA | NA | NA | 2.84 | 9973.7900 | 1 | 8 |
male | 0 | no | Mexican American | NA | NA | NA | NA | NA | 11.5 | ⋯ | NA | NA | NA | NA | NA | NA | 2.40 | 801.5593 | 2 | 13 |
female | 23 | yes | Mexican American | alive | 86 | yes | 0 | 1.589 | 59.8 | ⋯ | NA | -0.12459 | 43 | 145 | 103 | 60 | 3.03 | 6618.1184 | 2 | 2 |
male | 18 | no | Mexican American | alive | 87 | no | 0 | 1.685 | 112.9 | ⋯ | 1.1064 | 0.49518 | 34 | 161 | 119 | 78 | 1.80 | 1283.5993 | 2 | 3 |
female | 13 | no | Non-Hispanic Black | NA | NA | no | 0 | 1.622 | 50.9 | ⋯ | NA | -1.22452 | 62 | 151 | 105 | 73 | 0.75 | 2070.0273 | 2 | 2 |
female | 12 | no | Non-Hispanic Black | NA | NA | no | 0 | 1.691 | 75.0 | ⋯ | NA | 0.30949 | 32 | 160 | 97 | 56 | 5.00 | 1779.4018 | 2 | 4 |
female | 53 | no | Non-Hispanic White | alive | 72 | yes | 0 | 1.642 | 69.9 | ⋯ | 0.9540 | -0.46297 | 105 | 219 | 114 | 71 | 2.67 | 24695.7644 | 2 | 12 |
female | 42 | no | Non-Hispanic White | alive | 84 | no | 0 | 1.667 | 104.5 | ⋯ | 1.2266 | -0.32911 | 55 | 159 | 119 | 85 | 1.77 | 42467.3728 | 1 | 5 |
female | 14 | no | Mexican American | NA | NA | no | 0 | 1.631 | 85.6 | ⋯ | NA | -0.99286 | 49 | 145 | 99 | 61 | 2.51 | 690.6990 | 2 | 13 |
male | 18 | no | Mexican American | alive | 85 | no | 0 | 1.613 | 72.2 | ⋯ | 1.1034 | NA | NA | NA | 117 | 77 | 0.00 | 880.8045 | 2 | 13 |
male | 18 | no | Non-Hispanic White | alive | 91 | no | 0 | 1.796 | 62.7 | ⋯ | 1.1622 | -1.14831 | 64 | 151 | 92 | 43 | 3.43 | 11514.5245 | 2 | 1 |
male | 62 | no | Non-Hispanic White | other | 26 | no | 1 | 1.749 | 113.0 | ⋯ | 1.2130 | 0.60531 | 49 | 216 | 124 | 71 | 1.07 | 16359.4822 | 2 | 12 |
female | 7 | no | Non-Hispanic Black | NA | NA | NA | NA | 1.261 | 21.7 | ⋯ | NA | NA | 72 | 184 | NA | NA | 0.00 | 3608.9188 | 2 | 5 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋱ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
female | 40 | no | Non-Hispanic White | alive | NA | no | 0 | NA | NA | ⋯ | NA | NA | NA | NA | NA | NA | 5.00 | 0.0000 | 1 | 36 |
male | 83 | no | Non-Hispanic White | alive | 26 | no | 0 | 1.608 | 62.6 | ⋯ | 1.2500 | 0.30891 | 73 | 237 | 180 | 36 | 0.84 | 2842.5528 | 2 | 41 |
female | 17 | no | Mexican American | NA | NA | no | 0 | 1.660 | 73.4 | ⋯ | 1.0670 | NA | 63 | 192 | 116 | 78 | 2.43 | 2269.4901 | 1 | 42 |
male | 85 | no | Non-Hispanic White | alive | NA | no | 0 | NA | NA | ⋯ | NA | 0.00000 | NA | NA | NA | NA | 3.00 | 0.0000 | 2 | 35 |
female | 46 | no | Non-Hispanic White | alive | 31 | yes | 0 | 1.738 | 57.5 | ⋯ | 1.1650 | 0.80966 | 28 | 218 | 103 | 72 | 1.04 | 16782.0437 | 1 | 33 |
female | 33 | yes | Other/Multi | alive | 47 | no | 0 | 1.633 | 76.9 | ⋯ | NA | -0.75543 | 72 | 228 | 112 | 63 | 5.00 | 2410.5345 | 1 | 35 |
male | 0 | no | Other/Multi | NA | NA | NA | NA | NA | 10.0 | ⋯ | NA | NA | NA | NA | NA | NA | 4.47 | 1518.4104 | 2 | 37 |
female | 4 | no | Mexican American | NA | NA | no | 0 | 1.080 | 19.3 | ⋯ | NA | NA | NA | NA | NA | NA | 0.80 | 2726.6750 | 2 | 40 |
male | 40 | no | Non-Hispanic Black | alive | 31 | no | 0 | 1.748 | 76.4 | ⋯ | 1.4320 | NA | 82 | 232 | 96 | 57 | 1.83 | 10810.7055 | 2 | 32 |
female | 0 | no | Mexican American | NA | NA | NA | NA | NA | 7.8 | ⋯ | NA | NA | NA | NA | NA | NA | 0.64 | 715.5026 | 2 | 43 |
male | 12 | no | Non-Hispanic Black | NA | NA | no | 0 | 1.428 | 31.1 | ⋯ | 1.0170 | -0.48886 | 74 | 187 | 101 | 63 | 0.56 | 1895.2838 | 1 | 30 |
female | 72 | no | Non-Hispanic White | alive | 27 | no | 0 | 1.502 | 67.5 | ⋯ | 1.0110 | -0.16663 | 79 | 162 | 140 | 70 | 1.18 | 10217.0484 | 2 | 30 |
female | 26 | yes | Other/Multi | alive | 25 | no | 0 | 1.558 | 53.8 | ⋯ | NA | -0.53363 | NA | NA | 106 | 54 | 0.00 | 2791.2864 | 2 | 39 |
female | 14 | no | Mexican American | NA | NA | no | 0 | 1.550 | 54.3 | ⋯ | 1.0890 | -0.96314 | 60 | 168 | 118 | 67 | 1.01 | 1133.9264 | 2 | 40 |
male | 34 | no | Non-Hispanic White | alive | 31 | no | 0 | 1.844 | 100.0 | ⋯ | 1.1810 | -0.20340 | 61 | 197 | 132 | 71 | 4.79 | 36995.3239 | 2 | 32 |
male | 13 | no | Mexican American | NA | NA | no | 0 | 1.559 | 61.2 | ⋯ | 0.9180 | -0.16427 | 29 | 120 | 104 | 60 | 1.33 | 1996.1073 | 2 | 42 |
male | 12 | no | Non-Hispanic White | NA | NA | no | 0 | 1.619 | 46.2 | ⋯ | 0.8916 | NA | NA | NA | 94 | 52 | 2.70 | 12648.6416 | 2 | 42 |
female | 55 | no | Non-Hispanic White | alive | 31 | no | 0 | 1.648 | 57.8 | ⋯ | 1.0690 | -1.22452 | 73 | 157 | 96 | 67 | 1.21 | 15357.9095 | 2 | 31 |
female | 19 | no | Mexican American | alive | 36 | no | 0 | 1.531 | 52.8 | ⋯ | 1.1090 | -0.41705 | 43 | 129 | 106 | 62 | 0.00 | 1410.3413 | 1 | 40 |
female | 54 | no | Non-Hispanic White | alive | 47 | no | 0 | 1.661 | 71.5 | ⋯ | 1.0850 | -0.75543 | 64 | 208 | 115 | 59 | 5.00 | 26313.6174 | 1 | 35 |
female | 2 | no | Non-Hispanic White | NA | NA | no | 0 | NA | 12.5 | ⋯ | NA | NA | NA | NA | NA | NA | 1.34 | 5568.3884 | 2 | 34 |
female | 16 | no | Mexican American | NA | NA | yes | 0 | 1.498 | 43.7 | ⋯ | 1.0110 | -0.50255 | 52 | 120 | 111 | 69 | 0.66 | 2289.3041 | 1 | 38 |
male | 23 | no | Mexican American | alive | 37 | yes | 0 | 1.721 | 77.4 | ⋯ | 0.9940 | 0.84841 | 44 | 211 | 127 | 69 | 3.93 | 9982.5528 | 1 | 39 |
female | 0 | no | Mexican American | NA | NA | NA | NA | NA | 9.5 | ⋯ | NA | NA | NA | NA | NA | NA | 1.24 | 1391.6587 | 2 | 39 |
female | 68 | no | Mexican American | alive | 34 | no | 0 | 1.530 | 68.0 | ⋯ | 0.8650 | 0.51794 | 41 | 202 | 134 | NA | 2.08 | 1128.5607 | 2 | 40 |
female | 5 | no | Non-Hispanic White | NA | NA | no | 0 | 1.098 | 19.3 | ⋯ | NA | NA | NA | NA | NA | NA | 2.96 | 18259.3185 | 2 | 39 |
female | 83 | no | Non-Hispanic White | alive | 38 | no | 0 | 1.624 | 61.9 | ⋯ | 1.0180 | 0.07722 | 101 | 264 | 140 | 52 | 0.00 | 7025.9255 | 1 | 39 |
male | 44 | no | Mexican American | alive | 37 | yes | 0 | 1.734 | 89.7 | ⋯ | 1.0230 | 1.55613 | 51 | 282 | 149 | 90 | 3.93 | 5832.3181 | 1 | 39 |
male | 54 | no | Non-Hispanic White | alive | 36 | no | 0 | 1.835 | 126.8 | ⋯ | 1.1382 | 0.24084 | 35 | 168 | 114 | 62 | 5.00 | 32734.1635 | 1 | 43 |
male | 1 | no | Non-Hispanic White | NA | NA | no | 0 | NA | 10.3 | ⋯ | NA | NA | NA | NA | NA | NA | 5.00 | 10280.9366 | 2 | 29 |
SmallNCHS <- sample_n(tbl = NCHS, size = 100)
head(SmallNCHS)
sex | age | pregnant | ethnicity | death | followup | smoker | diabetic | height | weight | ⋯ | bmd | fmhm_other | hdl | chol | bps | bpd | income | pop_weight | psu | stratum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <fct> | <fct> | <chr> | <dbl> | <fct> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | female | 10 | no | Mexican American | NA | NA | no | 0 | 1.424 | 32.5 | ⋯ | NA | NA | 57 | 174 | 95 | 65 | 0.90 | 3147.735 | 1 | 4 |
2 | male | 14 | no | Non-Hispanic White | NA | NA | no | 0 | 1.754 | 83.0 | ⋯ | 0.906 | -0.66171 | 45 | 133 | 112 | 61 | 5.00 | 12532.205 | 1 | 9 |
3 | male | 70 | no | Non-Hispanic White | alive | 61 | no | 0 | 1.624 | 76.0 | ⋯ | 1.189 | 0.12119 | 32 | 148 | 136 | 52 | 5.00 | 11762.741 | 1 | 22 |
4 | male | 3 | no | Non-Hispanic Black | NA | NA | no | 0 | 1.077 | 20.3 | ⋯ | NA | NA | 62 | 140 | NA | NA | 0.37 | 3400.876 | 2 | 33 |
5 | male | 6 | no | Other/Multi | NA | NA | no | 0 | 1.116 | 18.8 | ⋯ | NA | NA | 59 | 153 | NA | NA | 1.06 | 16155.083 | 2 | 24 |
6 | female | 6 | no | Mexican American | NA | NA | no | 0 | 1.272 | 25.7 | ⋯ | NA | NA | 51 | 184 | NA | NA | 0.12 | 3310.146 | 1 | 31 |
Motors <- readr::read_csv('https://mdbeckman.github.io/dcSupplement/data/engines.csv')
head(Motors)
Rows: 39 Columns: 9
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Engine
dbl (8): mass, ncylinder, strokes, displacement, bore, stroke, BHP, RPM
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Engine | mass | ncylinder | strokes | displacement | bore | stroke | BHP | RPM |
---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
Webra Speedy | 0.135 | 1 | 2 | 1.8 | 13.5 | 12.5 | 0.45 | 22000 |
Motori Cipolla | 0.150 | 1 | 2 | 2.5 | 15.0 | 14.0 | 1.00 | 26000 |
Webra Speed 20 | 0.250 | 1 | 2 | 3.4 | 16.5 | 16.0 | 0.78 | 22000 |
Webra 40 | 0.270 | 1 | 2 | 6.5 | 21.0 | 19.0 | 0.96 | 15500 |
Webra 61 Blackhead | 0.430 | 1 | 2 | 10.0 | 24.0 | 22.0 | 1.55 | 14000 |
Webra 6WR | 0.490 | 1 | 2 | 10.0 | 24.0 | 22.0 | 2.76 | 19000 |
data('CPS85', package = 'mosaicData')
head(CPS85)
wage | educ | race | sex | hispanic | south | married | exper | union | age | sector | |
---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <int> | <fct> | <fct> | <fct> | <fct> | <fct> | <int> | <fct> | <int> | <fct> | |
1 | 9.0 | 10 | W | M | NH | NS | Married | 27 | Not | 43 | const |
2 | 5.5 | 12 | W | M | NH | NS | Married | 20 | Not | 38 | sales |
3 | 3.8 | 12 | W | F | NH | NS | Single | 4 | Not | 22 | sales |
4 | 10.5 | 12 | W | F | NH | NS | Married | 29 | Not | 47 | clerical |
5 | 15.0 | 12 | W | M | NH | NS | Married | 40 | Union | 58 | const |
6 | 9.0 | 16 | W | F | NH | NS | Married | 27 | Not | 49 | clerical |
05 - Intro to data graphics#
head(x = dcData::NCHS, n = 5)
sex | age | pregnant | ethnicity | death | followup | smoker | diabetic | height | weight | ⋯ | bmd | fmhm_other | hdl | chol | bps | bpd | income | pop_weight | psu | stratum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <fct> | <fct> | <chr> | <dbl> | <fct> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | female | 2 | no | Non-Hispanic Black | NA | NA | no | 0 | 0.916 | 12.5 | ⋯ | NA | NA | NA | NA | NA | NA | 0.86 | 2970.804 | 1 | 5 |
2 | male | 77 | no | Non-Hispanic White | alive | 90 | no | 0 | 1.740 | 75.4 | ⋯ | 1.2196 | 0.12359 | 54 | 215 | 98 | 56 | 5.00 | 10224.133 | 3 | 1 |
3 | female | 10 | no | Non-Hispanic White | NA | NA | no | 0 | 1.366 | 32.9 | ⋯ | NA | NA | 30 | 129 | 108 | 63 | 1.47 | 14172.311 | 2 | 7 |
4 | male | 1 | no | Non-Hispanic Black | NA | NA | no | 0 | NA | 13.3 | ⋯ | NA | NA | NA | NA | NA | NA | 0.57 | 3041.593 | 1 | 2 |
5 | male | 49 | no | Non-Hispanic White | alive | 74 | yes | 0 | 1.783 | 92.5 | ⋯ | 1.0870 | 1.17688 | 42 | 279 | 122 | 83 | 5.00 | 30657.312 | 2 | 8 |
head(x = dcData::NCI60, n = 5)
Probe | BR.MCF7 | BR.MDA_MB_231 | BR.HS578T | BR.BT_549 | BR.T47D | CNS.SF_268 | CNS.SF_295 | CNS.SF_539 | CNS.SNB_19 | ⋯ | PR.PC_3 | PR.DU_145 | RE.786_0 | RE.A498 | RE.ACHN | RE.CAKI_1 | RE.RXF_393 | RE.SN12C | RE.TK_10 | RE.UO_31 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | AT_D_3 | -7.45 | -7.51 | -7.30 | -7.37 | -6.15 | -7.16 | -7.03 | -7.44 | -7.43 | ⋯ | -6.88 | -6.78 | -7.25 | -7.17 | -6.51 | -6.66 | -6.53 | -6.86 | -7.03 | -6.85 |
2 | AT_D_5 | -7.05 | -6.62 | -6.88 | -6.78 | -7.18 | -7.25 | -7.22 | -7.37 | -7.26 | ⋯ | -6.27 | -7.04 | -6.68 | -6.83 | -6.65 | -6.61 | -6.23 | -6.74 | -6.85 | -6.67 |
3 | AT_D_M | -7.05 | -7.29 | -7.30 | -7.37 | -7.61 | -6.56 | -7.63 | -7.44 | -7.43 | ⋯ | -6.45 | -7.29 | -7.25 | -7.14 | -6.53 | -6.93 | -6.17 | -7.20 | -6.93 | -6.85 |
4 | AT_L_3 | -7.32 | -7.01 | -7.22 | -6.60 | -7.45 | -7.00 | -7.32 | -7.27 | -7.39 | ⋯ | -6.30 | -7.01 | -6.73 | -6.50 | -6.37 | -6.54 | -7.07 | -6.86 | -6.55 | -6.67 |
5 | AT_L_5 | -7.38 | -7.22 | -7.30 | -7.37 | -7.70 | -7.25 | -7.61 | -7.44 | -7.43 | ⋯ | -6.48 | -7.15 | -7.06 | -6.44 | -6.47 | -6.62 | -7.00 | -7.22 | -6.89 | -6.36 |
10 - More data verbs#
head(x = dcData::CountryData, n = 5)
country | area | pop | growth | birth | death | migr | maternal | infant | life | ⋯ | mainlines | cell | netHosts | netUsers | airports | railways | roadways | waterways | marine | military | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | Afghanistan | 652230 | 31822848 | 2.29 | 38.84 | 14.12 | -1.83 | 460 | 117.23 | 50.49 | ⋯ | 13500 | 18000000 | 223 | 1000000 | 52 | NA | 42150 | 1200 | NA | NA |
2 | Akrotiri | 123 | 15700 | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
3 | Albania | 28748 | 3020209 | 0.30 | 12.73 | 6.47 | -3.31 | 27 | 13.19 | 77.96 | ⋯ | 312000 | 3500000 | 15528 | 1300000 | 4 | 339 | 18000 | 41 | 17 | 1.47 |
4 | Algeria | 2381741 | 38813722 | 1.88 | 23.99 | 4.31 | -0.93 | 97 | 21.76 | 76.39 | ⋯ | 3200000 | 37692000 | 676 | 4700000 | 157 | 3973 | 113655 | NA | 38 | 4.48 |
5 | American Samoa | 199 | 54517 | -0.35 | 22.87 | 4.68 | -21.64 | NA | 8.92 | 74.91 | ⋯ | 10000 | NA | 2387 | NA | 3 | NA | 241 | NA | NA | NA |
11 - Joining two data frames#
head(x = dcData::MigrationFlows, n = 3) %>%
select(destcode, origincode, Y2000)
destcode | origincode | Y2000 | |
---|---|---|---|
<fct> | <fct> | <int> | |
1 | FRA | AFG | 923 |
2 | FRA | DZA | 425229 |
3 | FRA | AUS | 9168 |
head(x = dcData::CountryData, n = 6) %>%
select(country, life, infant)
country | life | infant | |
---|---|---|---|
<chr> | <dbl> | <dbl> | |
1 | Afghanistan | 50.49 | 117.23 |
2 | Akrotiri | NA | NA |
3 | Albania | 77.96 | 13.19 |
4 | Algeria | 76.39 | 21.76 |
5 | American Samoa | 74.91 | 8.92 |
6 | Andorra | 82.65 | 3.69 |
head(x = dcData::CountryCentroids, n = 4) %>%
select(name, iso_a3)
name | iso_a3 | |
---|---|---|
<chr> | <chr> | |
1 | Afghanistan | AFG |
2 | Aland | ALA |
3 | Albania | ALB |
4 | Algeria | DZA |
# to translate the variable `country` in table `CountryData`
# to the code `iso_a3`,
# join table `CountryCentroids` to table `CountryData`
InfantMortality <-
CountryCentroids %>%
select(name, iso_a3) %>%
left_join(y = CountryData %>% select(country, infant),
by = c('name' = 'country'))
head(x = InfantMortality, n = 6)
name | iso_a3 | infant | |
---|---|---|---|
<chr> | <chr> | <dbl> | |
1 | Afghanistan | AFG | 117.23 |
2 | Aland | ALA | NA |
3 | Albania | ALB | 13.19 |
4 | Algeria | DZA | 21.76 |
5 | American Samoa | ASM | 8.92 |
6 | Andorra | AND | 3.69 |
# infant mortality of the destination country
MigrationFlows %>%
left_join(y = InfantMortality, by = c('destcode' = 'iso_a3')) %>%
head(n = 6) %>%
select(sex, destcode, origincode, Y2000, name, infant)
sex | destcode | origincode | Y2000 | name | infant | |
---|---|---|---|---|---|---|
<fct> | <chr> | <fct> | <int> | <chr> | <dbl> | |
1 | Male | FRA | AFG | 923 | France | 3.31 |
2 | Male | FRA | DZA | 425229 | France | 3.31 |
3 | Male | FRA | AUS | 9168 | France | 3.31 |
4 | Male | FRA | AUT | 7764 | France | 3.31 |
5 | Male | FRA | AZE | 118 | France | 3.31 |
6 | Male | FRA | BLR | 245 | France | 3.31 |
# infant mortality of the origin country
MigrationFlows %>%
left_join(y = InfantMortality, by = c('origincode' = 'iso_a3')) %>%
head(n = 6) %>%
select(sex, destcode, origincode, Y2000, name, infant)
sex | destcode | origincode | Y2000 | name | infant | |
---|---|---|---|---|---|---|
<fct> | <fct> | <chr> | <int> | <chr> | <dbl> | |
1 | Male | FRA | AFG | 923 | Afghanistan | 117.23 |
2 | Male | FRA | DZA | 425229 | Algeria | 21.76 |
3 | Male | FRA | AUS | 9168 | Australia | 4.43 |
4 | Male | FRA | AUT | 7764 | Austria | 4.16 |
5 | Male | FRA | AZE | 118 | Azerbaijan | 26.67 |
6 | Male | FRA | BLR | 245 | Belarus | 3.64 |
head(x = dcData::CountryGroups, n = 5)
country | G8 | G20 | GGG | |
---|---|---|---|---|
<fct> | <lgl> | <lgl> | <lgl> | |
1 | Canada | TRUE | TRUE | FALSE |
2 | France | TRUE | TRUE | FALSE |
3 | Germany | TRUE | TRUE | FALSE |
4 | Italy | TRUE | TRUE | FALSE |
5 | Japan | TRUE | TRUE | FALSE |
G8Countries <-
CountryGroups %>%
dplyr::filter(G8) %>%
select(country)
G8Countries
country |
---|
<fct> |
Canada |
France |
Germany |
Italy |
Japan |
Russia |
United Kingdom |
United States |
G8CountryData <-
CountryData %>%
inner_join(y = G8Countries, by = join_by(country))
G8CountryData %>%
select(country, GDP, pop)
country | GDP | pop |
---|---|---|
<chr> | <dbl> | <dbl> |
Canada | 1.518e+12 | 34834841 |
France | 2.276e+12 | 66259012 |
Germany | 3.227e+12 | 80996685 |
Italy | 1.805e+12 | 61680122 |
Japan | 4.729e+12 | 127103388 |
Russia | 2.553e+12 | 142470272 |
United Kingdom | 2.387e+12 | 63742977 |
United States | 1.672e+13 | 318892103 |
Exercises#
[11.1]#
Most data verbs are one table-in one table-out (1I1O) whereas join data verbs are two table-in one table-out (2I1O).
dcData::BabyNames %>%
group_by(year) %>%
summarize(total = sum(count)) %>%
head()
year | total |
---|---|
<int> | <int> |
1880 | 201484 |
1881 | 192700 |
1882 | 221537 |
1883 | 216952 |
1884 | 243468 |
1885 | 240856 |
[11.2]#
The value of the matching variable of row \(n\) of the left table does not necessarily match the value of the matching variable of row \(n\) of the right table.
[11.3]#
head(x = dcData::ZipGeography, n = 5)
State | Population | HousingUnits | LandArea | WaterArea | CityName | PostOfficeName | County | AreaCode | Timezone | Latitude | Longitude | ZIP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <fct> | <fct> | <fct> | <dbl> | <fct> | <dbl> | <dbl> | <chr> | |
1 | New York | 0 | 0 | 0.1 | 46.3 | Holtsville | Holtsville | Suffolk | 631 | EST | 40.92233 | -72.63708 | 00501 |
2 | New York | 0 | 0 | 0.0 | 170.3 | Holtsville | Holtsville | Suffolk | 631 | EST | 40.92233 | -72.63708 | 00544 |
3 | 0 | 0 | 0.0 | 4.7 | Adjuntas | Adjuntas | Adjuntas | 787 | EST+1 | 18.16527 | -66.72258 | 00601 | |
4 | 42042 | 15590 | 80.1 | 0.0 | Aguada | Aguada | Aguada | 787 | EST+1 | 18.39310 | -67.18095 | 00602 | |
5 | 55530 | 21626 | 78.7 | 0.1 | Aguadilla | Aguadilla | Aguadilla | 787 | EST+1 | 18.45591 | -67.14578 | 00603 |
tail(x = dcData::ZipDemography, n = 5)
Totalpopulation | Male | Female | MedianAge | Under5years | X18yearsandover | X65yearsandover | Onerace | White | BlackorAfricanAmerican | ⋯ | Medianfamilyincomedollars | Percapitaincomedollars | Familiesbelowpovertylevel | Individualsbelowpovertylevel | Singlefamilyowneroccupiedhomes | Medianvaluedollars | Medianofselectedmonthlyownercosts | WithaMortgage | Notmortgaged | ZIP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | |
42737 | 1447 | 757 | 690 | 30.3 | 120 | 964 | 99 | 1329 | 141 | 4 | ⋯ | 49803 | 16176 | 26 | 113 | 253 | 104800 | NA | 648 | 325 | 99926 |
42738 | 120 | 69 | 51 | 42.2 | 6 | 88 | 13 | 111 | 105 | 0 | ⋯ | 29750 | 11742 | 8 | 53 | 47 | 80600 | NA | 750 | 178 | 99927 |
42739 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | NA | NA | NA | NA | NA | NA | NA | NA | 99928 |
42740 | 2424 | 1250 | 1174 | 39.6 | 148 | 1726 | 280 | 2191 | 1797 | 3 | ⋯ | 52917 | 21688 | 53 | 235 | 438 | 127600 | NA | 1170 | 405 | 99929 |
42741 | 36 | 22 | 14 | 35.0 | 2 | 29 | 1 | 36 | 33 | 0 | ⋯ | 39792 | 14332 | 0 | 0 | 0 | 0 | NA | 0 | 0 | 99950 |
NewYork <-
ZipGeography %>%
left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
filter(State == 'New York') %>%
select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
transmute(
State = State,
pop = coalesce(Population, Totalpopulation),
age0to4 = Under5years,
age5to17 = pop - Under5years - X18yearsandover,
age18to64 = X18yearsandover - X65yearsandover,
age65plus = X65yearsandover,
total = age0to4 + age5to17 + age18to64 + age65plus,
check1 = total == pop,
)
head(x = NewYork, n = 5)
State | pop | age0to4 | age5to17 | age18to64 | age65plus | total | check1 | |
---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> | |
1 | New York | 0 | NA | NA | NA | NA | NA | NA |
2 | New York | 0 | NA | NA | NA | NA | NA | NA |
3 | New York | 289 | NA | NA | NA | NA | NA | NA |
4 | New York | 17310 | 457 | 1213 | 13189 | 2451 | 17310 | TRUE |
5 | New York | 84870 | 4149 | 12750 | 54797 | 13174 | 84870 | TRUE |
PopulationGroups <-
ZipGeography %>%
left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
transmute(
State = State,
pop = coalesce(Population, Totalpopulation),
age0to4 = Under5years,
age5to17 = pop - Under5years - X18yearsandover,
age18to64 = X18yearsandover - X65yearsandover,
age65plus = X65yearsandover,
total = age0to4 + age5to17 + age18to64 + age65plus,
check1 = total == pop,
) %>%
group_by(State) %>%
summarize(
total0to4 = sum(age0to4, na.rm=TRUE),
total5to17 = sum(age5to17, na.rm=TRUE),
total18to64 = sum(age18to64, na.rm=TRUE),
total65plus = sum(age65plus, na.rm=TRUE),
total = sum(pop, na.rm=TRUE)
) %>%
mutate(
diff = total - (total0to4 + total5to17 + total18to64 + total65plus),
check = total0to4 + total5to17 + total18to64 + total65plus == total
)
PopulationGroups
State | total0to4 | total5to17 | total18to64 | total65plus | total | diff | check |
---|---|---|---|---|---|---|---|
<fct> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <lgl> |
0 | 0 | 0 | 0 | 741752 | 741752 | FALSE | |
Massachusetts | 0 | 0 | 0 | 0 | 6349048 | 6349048 | FALSE |
New Hampshire | 0 | 0 | 0 | 0 | 1235735 | 1235735 | FALSE |
New York | 1239375 | 3450540 | 11837416 | 2448224 | 18975844 | 289 | FALSE |
Rhode Island | 0 | 0 | 0 | 0 | 1048319 | 1048319 | FALSE |
Maine | 0 | 0 | 0 | 0 | 1273094 | 1273094 | FALSE |
Vermont | 0 | 0 | 0 | 0 | 608822 | 608822 | FALSE |
Connecticut | 0 | 0 | 0 | 0 | 3405565 | 3405565 | FALSE |
New Jersey | 0 | 0 | 0 | 0 | 8413990 | 8413990 | FALSE |
Pennsylvania | 727790 | 2194356 | 7439389 | 1919052 | 12280587 | 0 | TRUE |
Delaware | 51529 | 143043 | 487241 | 101712 | 783525 | 0 | TRUE |
District of Columbia | 32536 | 82456 | 387169 | 69898 | 572059 | 0 | TRUE |
Maryland | 353393 | 1002779 | 3341007 | 599307 | 5296486 | 0 | TRUE |
Virginia | 461957 | 1276186 | 4547561 | 792248 | 7077952 | 0 | TRUE |
West Virginia | 101791 | 300500 | 1128772 | 276833 | 1807896 | 0 | TRUE |
North Carolina | 539506 | 1424526 | 5116187 | 969035 | 8049254 | 0 | TRUE |
Georgia | 595087 | 1573898 | 5231348 | 785170 | 8185503 | 0 | TRUE |
South Carolina | 264656 | 744885 | 2516802 | 485272 | 4011615 | 0 | TRUE |
Florida | 945721 | 2700255 | 9526267 | 2807388 | 15979631 | 0 | TRUE |
Alabama | 295936 | 827248 | 2743296 | 579644 | 4446124 | 0 | TRUE |
Mississippi | 204273 | 570596 | 1725052 | 343378 | 2843299 | 0 | TRUE |
Tennessee | 374219 | 1022301 | 3582610 | 703259 | 5682389 | 0 | TRUE |
Kentucky | 266475 | 730021 | 2546187 | 504739 | 4047422 | 0 | TRUE |
Ohio | 754924 | 2133383 | 6956963 | 1507732 | 11353002 | 0 | TRUE |
Indiana | 423215 | 1151181 | 3753258 | 752831 | 6080485 | 0 | TRUE |
Michigan | 672002 | 1923732 | 6123512 | 1218958 | 9938204 | 0 | TRUE |
Iowa | 188416 | 545293 | 1756618 | 436232 | 2926559 | 0 | TRUE |
Wisconsin | 342340 | 1026416 | 3292356 | 702550 | 5363662 | 0 | TRUE |
Minnesota | 329597 | 957361 | 3038349 | 594262 | 4919569 | 0 | TRUE |
North Dakota | 39370 | 121280 | 386412 | 94365 | 641427 | 0 | TRUE |
South Dakota | 51054 | 151454 | 443889 | 108109 | 754506 | 0 | TRUE |
Illinois | 876539 | 2368834 | 7673666 | 1499998 | 12419037 | 0 | TRUE |
Montana | 54853 | 175212 | 551216 | 120960 | 902241 | 0 | TRUE |
Missouri | 369905 | 1057798 | 3412243 | 755444 | 5595390 | 0 | TRUE |
Kansas | 188684 | 524227 | 1619075 | 356199 | 2688185 | 0 | TRUE |
Nebraska | 117035 | 333140 | 1028660 | 232159 | 1710994 | 0 | TRUE |
Louisiana | 317160 | 901852 | 2730442 | 516492 | 4465946 | 0 | TRUE |
Arkansas | 181626 | 498826 | 1619048 | 374034 | 2673534 | 0 | TRUE |
Oklahoma | 236042 | 655289 | 2099793 | 455109 | 3446233 | 0 | TRUE |
Texas | 1624203 | 4260543 | 12888235 | 2071668 | 20844649 | 0 | TRUE |
Colorado | 297494 | 803274 | 2784280 | 416049 | 4301097 | 0 | TRUE |
Arizona | 382483 | 985121 | 3096256 | 667891 | 5131751 | 0 | TRUE |
Idaho | 97558 | 270952 | 777971 | 145737 | 1292218 | 0 | TRUE |
Utah | 209299 | 508994 | 1323734 | 190138 | 2232165 | 0 | TRUE |
Wyoming | 30935 | 97924 | 306976 | 57667 | 493502 | 0 | TRUE |
New Mexico | 130286 | 376732 | 1095624 | 211732 | 1814374 | 0 | TRUE |
California | 2486843 | 6762415 | 21019901 | 3595271 | 33864430 | 0 | TRUE |
Nevada | 145790 | 365847 | 1264174 | 218694 | 1994505 | 0 | TRUE |
Hawaii | 78161 | 217601 | 755139 | 160588 | 1211489 | 0 | TRUE |
Oregon | 222906 | 623356 | 2135687 | 438070 | 3420019 | 0 | TRUE |
Washington | 394280 | 1119436 | 3717866 | 662093 | 5893675 | 0 | TRUE |
Alaska | 47514 | 142771 | 399108 | 35599 | 624992 | 0 | TRUE |
NA | 0 | 0 | 0 | 0 | 0 | 0 | TRUE |
#options(repr.plot.width=20, repr.plot.height=20)
ZipGeography %>%
filter(State != '') %>%
left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
transmute(
State = State,
ageTotal = coalesce(Population, Totalpopulation, 0),
age0to4 = coalesce(Under5years, 0),
age5to17 = coalesce(ageTotal - Under5years - X18yearsandover, 0),
age18to64 = coalesce(X18yearsandover - X65yearsandover, 0),
age65plus = coalesce(X65yearsandover, 0)
) %>%
pivot_longer(c(`age0to4`, `age5to17`, `age18to64`, `age65plus`), names_to = 'ageRange', values_to = 'cases') %>%
ggplot() +
geom_bar(
mapping = aes(y = State, x = cases, fill = ageRange),
position = 'stack',
stat = 'identity',
na.rm = TRUE
) +
theme(
text=element_text(size=20)
)
15 - Statistics#
Data graphics
translate each case into a representation in terms of a glyph’s graphical attributes: color, position, shape, size, etc.
A representation for the collective properties of cases?
collective properties: features of groups of cases
summary functions: max, mean, median, etc.
Statistics
an area of science concerned with characterizing case-to-case variation and the collective properties of cases, while also quantifying uncertainty
Data Science
a collaboration between statistics and computer science
Confidence Interval
Confidence Band
How to display variation among cases? The distribution of values of one variable? The distribution of values of several variables?
15.2 - Density distribution of one variable#
NCHS %>%
ggplot(aes(weight, 1)) +
geom_point()
# density of cases, represented by the degree of darkness
NCHS %>%
ggplot(aes(weight, 1)) +
geom_point(alpha = 0.05, position = 'jitter') +
xlab('Weight (kg)') +
ylab('') +
theme_bw() +
scale_color_grey()
# density of cases, represented by the height of a function
# the scale of the y-axis has units of inverse kilograms
# the scale is arranged so that the area under the curve is equal to unity
# this convention facilitates densities for different groups; and means that narrow distributions tend to have high density
NCHS %>%
ggplot(aes(weight)) +
geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
xlab('Weight (kg)') +
ylab('Density (1/kg)') +
theme_bw() +
scale_color_grey()
NCHS %>%
ggplot(aes(weight)) +
geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
geom_point(
alpha = 0.02,
mapping = aes(y = 0.002),
position = position_jitter(height = 0.002)
) +
xlab('Weight (kg)') +
ylab('Density (1/kg)') +
theme_bw() +
scale_color_grey()
NCHS %>%
ggplot(aes(x = weight, group = sex)) +
geom_density(aes(color = sex, fill = sex), alpha = 0.5) +
xlab('Weight (kg)') +
ylab('Density (1/kg)') +
theme(legend.position = 'top') +
theme_bw() +
scale_color_grey()
NCHS %>%
mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
ggplot(aes(x = weight, group = sex)) +
geom_density(aes(color = sex, fill = sex), alpha = 0.75) +
facet_wrap(~ ageGroup, nrow = 1) +
xlab('Weight (kg)') +
ylab('Density (1/kg)') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
15.3 - Other depictions of density distribution#
NCHS %>%
mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
ggplot(aes(y = weight, x = ageGroup)) +
geom_boxplot(
alpha = 0.25,
mapping = aes(color = ageGroup, fill = ageGroup),
outlier.color = 'gray',
outlier.size = 2
) +
facet_wrap(~ sex) +
xlab('Age Group (yrs)') +
ylab('Weight (kg)') +
theme(legend.position = 'top') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
NCHS %>%
mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
ggplot(aes(y = weight, x = ageGroup)) +
geom_violin(fill = 'gray') +
facet_wrap(~ sex) +
xlab('Age Group (yrs)') +
ylab('Weight (kg)') +
theme(legend.position = 'top') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
NCHS %>%
mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
ggplot(aes(y = weight, x = ageGroup)) +
geom_boxplot(
alpha = 0.75,
mapping = aes(fill = diabetic),
outlier.color = 'gray',
outlier.size = 1,
position = position_dodge(width = 0.8)
) +
facet_grid(sex ~ .) +
xlab('Age Group (yrs)') +
ylab('Weight (kg)') +
theme(legend.position = 'top') +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
15.4 - Confidence intervals#
NCHS %>%
mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
ggplot(aes(y = weight, x = ageGroup)) +
geom_boxplot(
mapping = aes(fill = diabetic),
notch = TRUE,
outlier.color = 'gray',
outlier.size = 1,
position = position_dodge(width = 0.8)
) +
facet_grid(sex ~ .) +
xlab('Age Group (yrs)') +
ylab('Weight (kg)') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
15.5 - Model functions#
head(NCHS)
sex | age | pregnant | ethnicity | death | followup | smoker | diabetic | height | weight | ⋯ | bmd | fmhm_other | hdl | chol | bps | bpd | income | pop_weight | psu | stratum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<fct> | <dbl> | <fct> | <fct> | <chr> | <dbl> | <fct> | <dbl> | <dbl> | <dbl> | ⋯ | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | |
1 | female | 2 | no | Non-Hispanic Black | NA | NA | no | 0 | 0.916 | 12.5 | ⋯ | NA | NA | NA | NA | NA | NA | 0.86 | 2970.804 | 1 | 5 |
2 | male | 77 | no | Non-Hispanic White | alive | 90 | no | 0 | 1.740 | 75.4 | ⋯ | 1.2196 | 0.12359 | 54 | 215 | 98 | 56 | 5.00 | 10224.133 | 3 | 1 |
3 | female | 10 | no | Non-Hispanic White | NA | NA | no | 0 | 1.366 | 32.9 | ⋯ | NA | NA | 30 | 129 | 108 | 63 | 1.47 | 14172.311 | 2 | 7 |
4 | male | 1 | no | Non-Hispanic Black | NA | NA | no | 0 | NA | 13.3 | ⋯ | NA | NA | NA | NA | NA | NA | 0.57 | 3041.593 | 1 | 2 |
5 | male | 49 | no | Non-Hispanic White | alive | 74 | yes | 0 | 1.783 | 92.5 | ⋯ | 1.0870 | 1.17688 | 42 | 279 | 122 | 83 | 5.00 | 30657.312 | 2 | 8 |
6 | female | 19 | no | Other/Multi | alive | 86 | no | 0 | 1.620 | 59.2 | ⋯ | 0.8680 | -1.22452 | 61 | 153 | 114 | 70 | 1.21 | 12224.876 | 2 | 2 |
NCHS %>%
ggplot(aes(x = age, y = weight, color = diabetic)) +
stat_smooth(se = FALSE) +
geom_point(alpha = 0.02) +
ylim(0, 120) +
theme(legend.position = 'top')
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: colour
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”
NCHS %>%
ggplot(aes(x = age, y = weight, color = diabetic)) +
stat_smooth() +
geom_point(alpha = 0.02) +
ylim(0, 120) +
theme(legend.position = 'top')
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: colour
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”
NCHS %>%
ggplot(aes(x = age, y = weight),
color = diabetic,
fill = diabetic
) +
stat_smooth(method = lm) +
geom_point(alpha = 0.02) +
ylim(0, 120) +
theme(legend.position = 'top')
16 - Data scraping and intake methods#
[Data Scraping]
Data scraping is the gathering of data from sources such as web browsers in which they are not already in a tidy (data frame/table) format and the translation of such data to one or more data frames/tables.
[Data Cleaning]
Data clearning is the correcting of errors in data that stem either from blunders in data entry or from deficiencies in the way data is stored or coded.
Data frame-friendly formats
CSV
base::read.csv(stringsAsFactors = FALSE)
data.table::fread()
mosaic::read.file()
readr::read_csv()
technical software package
Octave/MATLAB
Stata
SPSS
Minitab
SAS
Epi
relational databases
Excel
web
HTML
<table>
JSON
XML
Google spreadsheet qua HTML
API
[List]
A list is an R object used to store a collection of other R objects. Elements of a list can even have different types; e.g., data frames, plots, model objects, even other lists.
as.numeric()
as.character()
readr::parse_number()
webURL <- 'https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv'
myDataTable <- readr::read_csv(webURL)
myDataTable %>%
select(price, bedrooms, bathrooms, fuel, air_cond, construction) %>%
head()
Rows: 1728 Columns: 16
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
price | bedrooms | bathrooms | fuel | air_cond | construction |
---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
132500 | 2 | 1.0 | 3 | 0 | 0 |
181115 | 3 | 2.5 | 2 | 0 | 0 |
109000 | 4 | 1.0 | 2 | 0 | 0 |
155000 | 3 | 1.5 | 2 | 0 | 0 |
86060 | 2 | 1.0 | 2 | 1 | 1 |
120000 | 4 | 1.0 | 2 | 0 | 0 |
# RStudio
file_name <- file.choose() # then navigate and click on your file
MyDataTable2 <-
data.table::fread(
input = file_name,
nrows = 0,
select = c(1, 4, 5, 10),
drop = c(2, 3, 6)
) %>%
as.data.frame()
web_page <- 'https://en.wikipedia.org/wiki/Mile_run_world_record_progression'
SetOfTables <-
web_page %>%
read_html() %>%
html_nodes(css = 'table') %>%
html_table(fill = TRUE)
length(SetOfTables)
Table3 <- SetOfTables[[3]]
Table3
Table4 <- SetOfTables[[4]]
Table4
Time | Athlete | Nationality | Date | Venue |
---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> |
4:52 | Cadet Marshall | United Kingdom | 2 September 1852 | Addiscome |
4:45 | Thomas Finch | United Kingdom | 3 November 1858 | Oxford |
4:45 | St. Vincent Hammick | United Kingdom | 15 November 1858 | Oxford |
4:40 | Gerald Surman | United Kingdom | 24 November 1859 | Oxford |
4:33 | George Farran | United Kingdom | 23 May 1862 | Dublin |
X1 | X2 |
---|---|
<lgl> | <chr> |
NA | Ratified |
NA | Not ratified |
NA | Ratified but later rescinded |
NA | Pending ratification |
# Four of the variables from the `houses-for-sale.csv` file giving features of the Saratoga, NY houses stored as integer codes; each case is a different house.
Houses <-
read_csv('https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv')
Houses %>%
select(fuel, heat, sewer, construction) %>%
head()
Rows: 1728 Columns: 16
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fuel | heat | sewer | construction |
---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> |
3 | 4 | 2 | 0 |
2 | 3 | 2 | 0 |
2 | 3 | 3 | 0 |
2 | 2 | 2 | 0 |
2 | 2 | 3 | 1 |
2 | 2 | 2 | 0 |
# Codes for the house system types.
# describes the codes in a format that makes it easy to add new code values as the need arises
Translations <-
read_csv('https://mdbeckman.github.io/dcSupplement/data/house_codes.csv')
Translations %>%
head()
Rows: 13 Columns: 3
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system_type, meaning
dbl (1): code
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
code | system_type | meaning |
---|---|---|
<dbl> | <chr> | <chr> |
0 | new_const | no |
1 | new_const | yes |
1 | sewer_type | none |
2 | sewer_type | private |
3 | sewer_type | public |
0 | central_air | no |
# There is a column for each system type that translates the integer code to a meaningful term.
# In cases where the integer has no corresponding term, `invalid` has been entered; this provides a quick way to distinguish between incorrect entries and missing entries.
CodeVals <-
Translations %>%
spread(key = system_type, value = meaning, fill = 'invalid')
CodeVals %>%
head()
code | central_air | fuel_type | heat_type | new_const | sewer_type |
---|---|---|---|---|---|
<dbl> | <chr> | <chr> | <chr> | <chr> | <chr> |
0 | no | invalid | invalid | no | invalid |
1 | yes | invalid | invalid | yes | none |
2 | invalid | gas | hot air | invalid | private |
3 | invalid | electric | hot water | invalid | public |
4 | invalid | oil | electric | invalid | invalid |
# To carry out the translation, join each variable, one at a time, to the data frame of interest.
# Note how the `by` value changes for each variable.
Houses <-
Houses %>%
left_join(CodeVals %>% select(code, fuel_type), by = c(fuel = 'code')) %>%
left_join(CodeVals %>% select(code, heat_type), by = c(heat = 'code')) %>%
left_join(CodeVals %>% select(code, sewer_type), by = c(sewer = 'code'))
Houses %>% head()
price | lot_size | waterfront | age | land_value | construction | air_cond | fuel | heat | sewer | living_area | pct_college | bedrooms | fireplaces | bathrooms | rooms | fuel_type | heat_type | sewer_type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <chr> |
132500 | 0.09 | 0 | 42 | 50000 | 0 | 0 | 3 | 4 | 2 | 906 | 35 | 2 | 1 | 1.0 | 5 | electric | electric | private |
181115 | 0.92 | 0 | 0 | 22300 | 0 | 0 | 2 | 3 | 2 | 1953 | 51 | 3 | 0 | 2.5 | 6 | gas | hot water | private |
109000 | 0.19 | 0 | 133 | 7300 | 0 | 0 | 2 | 3 | 3 | 1944 | 51 | 4 | 1 | 1.0 | 8 | gas | hot water | public |
155000 | 0.41 | 0 | 13 | 18700 | 0 | 0 | 2 | 2 | 2 | 1944 | 51 | 3 | 1 | 1.5 | 5 | gas | hot air | private |
86060 | 0.11 | 0 | 0 | 15000 | 1 | 1 | 2 | 2 | 3 | 840 | 51 | 2 | 0 | 1.0 | 3 | gas | hot air | public |
120000 | 0.68 | 0 | 31 | 14000 | 0 | 0 | 2 | 2 | 2 | 1152 | 22 | 4 | 1 | 1.0 | 8 | gas | hot air | private |
OrdwayBirds <-
OrdwayBirds %>%
mutate(
Month = as.numeric(Month),
Year = as.numeric(Year),
Day = as.numeric(Day)
)
head(OrdwayBirds)
WhenAndWho <-
OrdwayBirds %>%
select(Who = DataEntryPerson, When = Timestamp) %>%
mutate(When = lubridate::mdy_hms(When))
head(WhenAndWho)
WhenAndWho %>%
ggplot(aes(x = When, y = Who)) +
geom_point(alpha = 0.2)
WhenAndWho %>%
group_by(Who) %>%
summarize(start = min(When, na.rm = TRUE),
finish = max(When, na.rm = TRUE)) %>%
head()
bogus | Timestamp | Year | Day | Month | CaptureTime | SpeciesName | Sex | Age | BandNumber | ⋯ | Condition | Release | Comments | DataEntryPerson | Weight | WingChord | Temperature | RecaptureOriginal | RecapturePrevious | TailLength | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | ⋯ | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
3 | . | 4/14/2010 13:20:56 | 1972 | 16 | 7 | 7:00:00 | Song Sparrow | U | AHY (After Hatch Year) | 107-151187 | ⋯ | none | none | none | Jerald Dosch | ||||||
4 | . | NA | NA | NA | ⋯ | Caitlin Baker | |||||||||||||||
5 | . | 5/13/2010 16:00:30 | 1972 | 16 | 7 | 7:00:00 | Song Sparrow | U | AHY (After Hatch Year) | 107-151187 | ⋯ | none | none | none | Caitlin Baker | ||||||
6 | . | 5/13/2010 16:02:15 | 1972 | 16 | 7 | 7:00:00 | Field Sparrow | U | AHY (After Hatch Year) | 1260-74572 | ⋯ | none | none | none | Caitlin Baker | ||||||
7 | . | 5/13/2010 16:03:18 | 1972 | 16 | 7 | 7:00:00 | Field Sparrow | U | AHY (After Hatch Year) | 1260-74541 | ⋯ | none | none | none | Caitlin Baker | ||||||
8 | . | 5/13/2010 16:04:23 | 1972 | 16 | 7 | 7:00:00 | Song Sparrow | U | AHY (After Hatch Year) | 107-151188 | ⋯ | none | none | none | Caitlin Baker |
Who | When | |
---|---|---|
<chr> | <dttm> | |
3 | Jerald Dosch | 2010-04-14 13:20:56 |
4 | Caitlin Baker | NA |
5 | Caitlin Baker | 2010-05-13 16:00:30 |
6 | Caitlin Baker | 2010-05-13 16:02:15 |
7 | Caitlin Baker | 2010-05-13 16:03:18 |
8 | Caitlin Baker | 2010-05-13 16:04:23 |
Warning message:
“Removed 4 rows containing missing values (`geom_point()`).”
Warning message:
“There were 2 warnings in `summarize()`.
The first warning was:
ℹ In argument: `start = min(When, na.rm = TRUE)`.
ℹ In group 1: `Who = ""`.
Caused by warning in `min.default()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”
Who | start | finish |
---|---|---|
<chr> | <dttm> | <dttm> |
Inf | -Inf | |
Abby Colehour | 2011-04-23 15:50:24 | 2011-04-23 15:50:24 |
Brennan Panzarella | 2010-09-13 10:48:12 | 2011-04-10 21:58:56 |
Caitlin Baker | 2010-05-13 16:00:30 | 2010-05-28 19:41:52 |
Emily Merrill | 2010-06-08 09:10:01 | 2010-06-08 14:47:21 |
Jerald Dosch | 2010-04-14 13:20:56 | 2010-04-14 13:20:56 |
Exercises#
[16.6.1]#
Here are some character strings containing times or dates written in different formats. Your task it twofold: (A) for each, choose an appropriate function from the lubridate
package to translate the character string into a datetime object in R and then (B) use R to calculate the number of days between that date and your birthday.
'April 30, 1777' # Johann Carl Friedrich Gauss
'06-23-1912' # Alan Turing
'3 March 1847' # Alexander Graham Bell
'Nov. 11th, 1918 at 11:00 am' # the armistice ending WWI on the western front
'July 20, 1969' # the first manned moon landing
lubridate::mdy('April 30, 1777')
lubridate::mdy('06-23-1912')
lubridate::dmy('3 March 1847')
lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am')
lubridate::mdy('July 20, 1969')
birthday <- lubridate::mdy('February 4, 1992')
birthday
[1] "1918-11-11 11:00:00 UTC"
birthday - lubridate::mdy('April 30, 1777')
birthday - lubridate::mdy('06-23-1912')
birthday - lubridate::dmy('3 March 1847')
birthday - lubridate::as_date(lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am'))
birthday - lubridate::mdy('July 20, 1969')
Time difference of 78441 days
Time difference of 29080 days
Time difference of 52933 days
Time difference of 26748 days
Time difference of 8234 days
[16.6.2]#
Here are some strings containing numerical amounts. For each one, say whether as.numeric()
or readr::parse_number()
(or both or neither) properly converts the given string to a numeric value.
'42,659.30'
'17%'
'Nineteen'
'£100'
'9.8 m/seconds-square'
'6.62606957 x 10^-34 m2 kg / s'
'6.62606957e-34'
'42.659,30' # a European style
as.numeric('42,659.30') # incorrect
as.numeric('17%') # incorrect
as.numeric('Nineteen') # incorrect
as.numeric('£100') # incorrect
as.numeric('9.8 m/seconds-square') # incorrect
as.numeric('6.62606957 x 10^-34 m2 kg / s') # incorrect
as.numeric('6.62606957e-34') # correct
as.numeric('42.659,30') # incorrect
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
readr::parse_number('42,659.30') # correct
readr::parse_number('17%') # correct
readr::parse_number('Nineteen') # incorrect
readr::parse_number('£100') # correct
readr::parse_number('9.8 m/seconds-square') # correct
readr::parse_number('6.62606957 x 10^-34 m2 kg / s') # incorrect
readr::parse_number('6.62606957e-34') # correct
readr::parse_number('42.659,30') # incorrect
Warning message:
“1 parsing failure.
row col expected actual
1 -- a number Nineteen
”
[16.6.3]#
Grab Table 4 (or another similar table) from the Wikipedia page on world records in the mile (or some similar event). Make a plot of the record time versus the date in which it occurred. Also, mark each point with the name of the athlete written above the point. (Hint: Use geom_text()
.) To convert time entries such as '4:20.5'
into seconds, use the lubridate
package’s as.duration(ms('40:20.5'))
. You can get rid of the footnote markers such as [5]
in the dates using the gsub()
transformation function which replaces the characters identified in the first argument with those in the second argument. The string '\\[.\\]$'
is an example of a regular expression which identifies a pattern of characters, in this case a single character in square brackets just before the end of the string.
Table4 %>%
mutate(
Date = lubridate::dmy(gsub('\\[.\\]$', '', Date)),
Time = lubridate::as.duration(lubridate::ms(Time))
) %>%
ggplot(aes(x = Date, y = Time)) +
geom_point() +
geom_text(aes(label = Athlete, hjust = -0.1, vjust = -1)) +
labs(x = 'Year', y = 'Time (s)')
Error in `mutate()`:
ℹ In argument: `Date = lubridate::dmy(gsub("\\[.\\]$", "", Date))`.
Caused by error in `as.character()`:
! cannot coerce type 'closure' to vector of type 'character'
Traceback:
1. Table4 %>% mutate(Date = lubridate::dmy(gsub("\\[.\\]$", "",
. Date)), Time = lubridate::as.duration(lubridate::ms(Time))) %>%
. ggplot(aes(x = Date, y = Time))
2. ggplot(., aes(x = Date, y = Time))
3. mutate(., Date = lubridate::dmy(gsub("\\[.\\]$", "", Date)),
. Time = lubridate::as.duration(lubridate::ms(Time)))
4. mutate.data.frame(., Date = lubridate::dmy(gsub("\\[.\\]$", "",
. Date)), Time = lubridate::as.duration(lubridate::ms(Time)))
5. mutate_cols(.data, dplyr_quosures(...), by)
6. withCallingHandlers(for (i in seq_along(dots)) {
. poke_error_context(dots, i, mask = mask)
. context_poke("column", old_current_column)
. new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
. }, error = dplyr_error_handler(dots = dots, mask = mask, bullets = mutate_bullets,
. error_call = error_call, error_class = "dplyr:::mutate_error"),
. warning = dplyr_warning_handler(state = warnings_state, mask = mask,
. error_call = error_call))
7. mutate_col(dots[[i]], data, mask, new_columns)
8. mask$eval_all_mutate(quo)
9. eval()
10. lubridate::dmy(gsub("\\[.\\]$", "", Date))
11. .parse_xxx(..., orders = "dmy", quiet = quiet, tz = tz, locale = locale,
. truncated = truncated)
12. unlist(lapply(list(...), .num_to_date), use.names = FALSE)
13. lapply(list(...), .num_to_date)
14. gsub("\\[.\\]$", "", Date)
15. .handleSimpleError(function (cnd)
. {
. local_error_context(dots, i = frame[[i_sym]], mask = mask)
. if (inherits(cnd, "dplyr:::internal_error")) {
. parent <- error_cnd(message = bullets(cnd))
. }
. else {
. parent <- cnd
. }
. message <- c(cnd_bullet_header(action), i = if (has_active_group_context(mask)) cnd_bullet_cur_group_label())
. abort(message, class = error_class, parent = parent, call = error_call)
. }, "cannot coerce type 'closure' to vector of type 'character'",
. base::quote(as.character(x)))
16. h(simpleError(msg, call))
17. abort(message, class = error_class, parent = parent, call = error_call)
18. signal_abort(cnd, .file)
Project: Popular Names#
Project: Popular Names from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
?dcData::BabyNames
BabyNames package:dcData R Documentation
_N_a_m_e_s _o_f _c_h_i_l_d_r_e_n _a_s _r_e_c_o_r_d_e_d _b_y _t_h_e _U_S _S_o_c_i_a_l _S_e_c_u_r_i_t_y _A_d_m_i_n_i_s_t_r_a_t_i_o_n.
_D_e_s_c_r_i_p_t_i_o_n:
The US Social Security Administration provides yearly lists of
names given to babies. These data combine the yearly lists.
`BabyNames` is the raw data from the SSA. The case is a
year-name-sex, for example: Jane F 1922. The count is the number
of children of that sex given that name in that year. Names
assigned to fewer than five children of one sex in any year are
not listed, presumably out of privacy concerns.
_U_s_a_g_e:
data("BabyNames")
_F_o_r_m_a_t:
A data frame with 1,792,091 entries on four variables:
name The given name (character string)
sex F or M (character string)
count The number of babies given that name and of that sex.
(integer)
year Year of birth (integer)
_S_o_u_r_c_e:
The data were compiled from the US Social Security Administration
web site: http://www.ssa.gov/oact/babynames/names.zip
_E_x_a_m_p_l_e_s:
data(BabyNames)
str(BabyNames)
head(x = dcData::BabyNames)
name | sex | count | year | |
---|---|---|---|---|
<chr> | <chr> | <int> | <int> | |
1 | Mary | F | 7065 | 1880 |
2 | Anna | F | 2604 | 1880 |
3 | Emma | F | 2003 | 1880 |
4 | Elizabeth | F | 1939 | 1880 |
5 | Minnie | F | 1746 | 1880 |
6 | Margaret | F | 1578 | 1880 |
count(x = dcData::BabyNames, sex)
sex | n |
---|---|
<chr> | <int> |
F | 1062432 |
M | 729659 |
count(x = dcData::BabyNames, year) %>%
arrange(desc(n)) %>%
head(n = 10)
year | n | |
---|---|---|
<int> | <int> | |
1 | 2008 | 35038 |
2 | 2007 | 34925 |
3 | 2009 | 34668 |
4 | 2006 | 34066 |
5 | 2010 | 34027 |
6 | 2011 | 33858 |
7 | 2012 | 33658 |
8 | 2013 | 33072 |
9 | 2005 | 32527 |
10 | 2004 | 32033 |
select(.data = dcData::BabyNames, name, count) %>%
arrange(desc(count)) %>%
head(n = 10)
name | count | |
---|---|---|
<chr> | <int> | |
1 | Linda | 99674 |
2 | Linda | 96210 |
3 | James | 94758 |
4 | Michael | 92711 |
5 | Robert | 91652 |
6 | Linda | 90994 |
7 | Michael | 90629 |
8 | Michael | 90512 |
9 | James | 88610 |
10 | Michael | 88481 |
select(.data = dcData::BabyNames, name, count) %>%
arrange(desc(count)) %>%
filter(count == 5)
name | count |
---|---|
<chr> | <int> |
Adelle | 5 |
Adina | 5 |
Adrienne | 5 |
Albertine | 5 |
Alys | 5 |
Ana | 5 |
Araminta | 5 |
Arthur | 5 |
Birtha | 5 |
Bulah | 5 |
Caddie | 5 |
Celie | 5 |
Charlotta | 5 |
Clair | 5 |
Concepcion | 5 |
Cordella | 5 |
Corrine | 5 |
Delila | 5 |
Delphine | 5 |
Dosha | 5 |
Edgar | 5 |
Elaine | 5 |
Elisa | 5 |
Ellar | 5 |
Elmire | 5 |
Elvina | 5 |
Ena | 5 |
Estie | 5 |
Etter | 5 |
Fronnie | 5 |
⋮ | ⋮ |
Zayceon | 5 |
Zayid | 5 |
Zaylynn | 5 |
Zecheriah | 5 |
Zedric | 5 |
Zefram | 5 |
Zekhi | 5 |
Zenith | 5 |
Zennon | 5 |
Zepplin | 5 |
Zevon | 5 |
Zhaiden | 5 |
Zhen | 5 |
Zhian | 5 |
Zien | 5 |
Zierre | 5 |
Zimri | 5 |
Ziquan | 5 |
Ziyaad | 5 |
Ziyang | 5 |
Zmari | 5 |
Zolan | 5 |
Zurich | 5 |
Zyeer | 5 |
Zyere | 5 |
Zyhier | 5 |
Zylar | 5 |
Zymari | 5 |
Zymeer | 5 |
Zyree | 5 |
nonGlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Abraham', 'Franklin', 'Woodrow'))
head(x = nonGlyphReadyForm)
nonGlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = count,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Abraham', 'Franklin', 'Woodrow')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Dave', 'Davey', 'David')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
dcData::BabyNames %>%
filter(name %in% c('Davey'))
name | sex | count | year |
---|---|---|---|
<chr> | <chr> | <int> | <int> |
Davey | M | 5 | 1925 |
Davey | M | 9 | 1928 |
Davey | M | 89 | 1929 |
Davey | F | 5 | 1930 |
Davey | M | 54 | 1930 |
Davey | M | 45 | 1931 |
Davey | M | 26 | 1932 |
Davey | M | 15 | 1933 |
Davey | M | 22 | 1934 |
Davey | M | 17 | 1935 |
Davey | M | 9 | 1936 |
Davey | M | 15 | 1937 |
Davey | M | 14 | 1938 |
Davey | M | 16 | 1939 |
Davey | M | 8 | 1940 |
Davey | M | 17 | 1941 |
Davey | M | 18 | 1942 |
Davey | F | 5 | 1943 |
Davey | M | 19 | 1943 |
Davey | M | 10 | 1944 |
Davey | M | 15 | 1945 |
Davey | M | 25 | 1946 |
Davey | M | 37 | 1947 |
Davey | M | 42 | 1948 |
Davey | M | 50 | 1949 |
Davey | M | 41 | 1950 |
Davey | M | 39 | 1951 |
Davey | M | 44 | 1952 |
Davey | M | 74 | 1953 |
Davey | M | 55 | 1954 |
⋮ | ⋮ | ⋮ | ⋮ |
Davey | M | 29 | 1988 |
Davey | M | 33 | 1989 |
Davey | M | 20 | 1990 |
Davey | M | 26 | 1991 |
Davey | M | 26 | 1992 |
Davey | M | 72 | 1993 |
Davey | F | 5 | 1994 |
Davey | M | 75 | 1994 |
Davey | M | 34 | 1995 |
Davey | M | 31 | 1996 |
Davey | M | 34 | 1997 |
Davey | M | 25 | 1998 |
Davey | M | 28 | 1999 |
Davey | F | 6 | 2000 |
Davey | M | 14 | 2000 |
Davey | M | 20 | 2001 |
Davey | M | 29 | 2002 |
Davey | M | 14 | 2003 |
Davey | M | 14 | 2004 |
Davey | M | 22 | 2005 |
Davey | F | 5 | 2006 |
Davey | M | 18 | 2006 |
Davey | M | 25 | 2007 |
Davey | M | 14 | 2008 |
Davey | M | 14 | 2009 |
Davey | M | 26 | 2010 |
Davey | M | 19 | 2011 |
Davey | M | 17 | 2012 |
Davey | F | 7 | 2013 |
Davey | M | 18 | 2013 |
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Davey')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Carol', 'Carole', 'Caroline', 'Carolyn', 'Carrie')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Jeff', 'Jeffrey', 'Geof', 'Geoff', 'Geoffrey')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c('Geof', 'Geoff')) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c(
'Elizabeth',
'Elisa',
'Elise',
'Eliza',
'Ellie',
'Elsa',
'Isabelle',
'Lisa',
'Liz',
'Liza',
'Lizzie'
)) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
GlyphReadyForm <- dcData::BabyNames %>%
filter(name %in% c(
'Bethany',
'Bessie',
'Betsey',
'Betta',
'Betty'
)) %>%
group_by(year, name) %>%
summarize(total = sum(count))
head(x = GlyphReadyForm)
GlyphReadyForm %>%
ggplot() +
geom_line(
alpha = 0.5,
linewidth = 1,
mapping = aes(x = year,
y = total,
color = name,
linetype = name),
position = 'identity',
stat = 'identity',
show.legend=TRUE
) +
xlab('Year') +
ylab('Popularity') +
ggtitle('Popularity of Names Over Time')
Project: Bicycle Sharing#
Project: Bicycle Sharing from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
# information about the location of each of the stations in the system
station_url <- 'https://mdbeckman.github.io/dcSupplement/data/DC-Stations.csv'
Stations <- readr::read_csv(station_url)
# the rental history from 2014 Q4
# the `Trips` data table is a random subset of 10,000 trips from the full quarterly data
# the full data set of more than 600,000 trips can be accessed by removing `-Small` from the url
trip_url <- 'https://mdbeckman.github.io/dcSupplement/data/Trips-History-Data-2014-Q4-Small.rds'
Trips <- readRDS(gzcon(url(trip_url)))
source('https://mdbeckman.github.io/dcSupplement/R/haversine.R')
Rows: 347 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (4): lat, long, nbBikes, nbEmptyDocks
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stations <-
# Stations %>%
# select(lat, long)
# Stations %>%
# head()
# Trips <-
# Trips %>%
# select(sstation, estation, client, sdate, edate)
# Trips %>%
# head()
# the distribution of times that bikes were checked out
Trips %>%
ggplot(aes(x = sdate)) +
geom_density(fill = 'gray', color = NA)
How long?#
Make a box-and-whisker plot showing the distribution of the duration of rental events broken down by client
. The duration of the rental can be calculated as as.numeric(edate - sdate)
. The units will be in either hours, minutes, or seconds. It should not be much trouble for you to figure out which one. When you make your plot, you will likely find that the axis range is being set by a few outliers. These may be bikes that were forgotten. Arrange your scale to ignore these outliers, or filter them out.
When are bikes used?#
[1] Variable Trips$sdate
indicates the datetime that the bicycle was checked out of the station. Make histograms or density plots of each of these discrete components. Explain what each plot is showing about how bikes are checked out. For example, the distribution by hour of the day shows that few bikes are checked out before 5am, and that there are busy times around rush hour: 8am and 5pm.
Day of the year: bike rentals are highest in October; decline in November; and pick up again in the first half of December.
Day of the week: bike rentals are fairly stable over the course of the week; Thursday and Friday see the most rentals, and Saturday and Sunday see the least. Monday sees a little bit more than Tuesday and Wednesday.
Hour of the day: bike rentals are low before 5:00am; spike up during morning rush hour; decrease a bit throughout the day; spike back up during evening rush hour; and rapidly decline to early morning levels into the night.
Minute of the hour: I’m not sure there’s much of a trend along this scale, other than that the numbers of rentals oscillate a bit (i.e., it is not fixed).
[2] A similar sort of display of events per hour can be accomplished by calculating and displaying each hour’s count. The graphic shows a lot of variation of bike use over the course of the day. Now consider two additional variables: the day of the week and the client type. Group the bike rentals by three variables: hour of the day, day of the week, and client type. Find the total number of events in each grouping and plot this count versus hour. Use the group
aesthetic to represent one of the other variables and faceting to represent the other. (Hint: utilize facet_wrap()
in the plotting commands.) Make the same sort of display of how bike rental vary of hour, day of the week, and client type, but use geom_density()
rather than grouping and counting. Compare the two displays–one of discrete counts and one of density–and describe any major differences.
Generally, registered clients rent more bikes than casual clients across hour of the day and across day of the week. On an hourly basis, however, casual client rentals usually reach a maximum in the afternoon, when registered client rentals dip in between morning and evening rush hour. Also, casual client rentals peak over the weekend, when registered client rentals drop the most.
Trips$sdate[1]
lubridate::yday(Trips$sdate)[1] # day of the year from 1 to 365
lubridate::wday(Trips$sdate)[1] # day of the week from Sunday to Saturday
lubridate::hour(Trips$sdate)[1] # hour of the day
lubridate::minute(Trips$sdate)[1] # minute of the hour
[1] "2014-10-15 08:58:00 UTC"
Trips %>%
ggplot(aes(x = lubridate::yday(sdate))) +
geom_density(fill = 'gray', color = NA) +
xlab('Day of the Year')
Trips %>%
ggplot(aes(x = lubridate::wday(sdate))) +
geom_density(fill = 'gray', color = NA) +
xlab('Day of the Week') +
scale_x_continuous(breaks = 0:8)
Trips %>%
ggplot(aes(x = lubridate::hour(sdate))) +
geom_density(fill = 'gray', color = NA) +
xlab('Hour of the Day') +
scale_x_continuous(breaks = 0:24)
Trips %>%
ggplot(aes(x = lubridate::minute(sdate))) +
geom_density(fill = 'gray', color = NA) +
xlab('Minute of the Hour') +
scale_x_continuous(breaks = 0:60)
Trips %>%
mutate(
HOD = lubridate::hour(sdate),
DOW = lubridate::wday(sdate),
) %>%
group_by(HOD, DOW, client) %>%
summarize(total = n()) %>%
head()
`summarise()` has grouped output by 'HOD', 'DOW'. You can override using the
`.groups` argument.
HOD | DOW | client | total |
---|---|---|---|
<int> | <dbl> | <chr> | <int> |
0 | 1 | Casual | 1 |
0 | 1 | Registered | 30 |
0 | 2 | Registered | 5 |
0 | 3 | Casual | 2 |
0 | 3 | Registered | 8 |
0 | 4 | Registered | 7 |
options(repr.plot.width = 20)
Trips %>%
mutate(
HOD = lubridate::hour(sdate),
DOW = lubridate::wday(sdate),
) %>%
group_by(HOD, DOW, client) %>%
summarize(total = n()) %>%
ggplot(mapping = aes(x = HOD, y = total)) +
geom_line(mapping = aes(color = client)) + # the `group` aesthetic is unnecessary when using the `color` aesthetic
geom_point() +
facet_wrap(. ~ DOW, nrow = 1) +
xlab('H') +
scale_x_continuous(breaks = 0:23)
options(repr.plot.width = 7)
How far?#
Find the distance between each pair of stations. You know the position from the lat
and long
variables in Stations
. This is enough information to find the distance. The calculation has been implemented in the haversine()
function. haversine()
is a transformation function. To use it, create a data table where a case is a pair of stations and there are variables for the latitude and longitude of the starting station and the ending station. To do this, join the Station
data to itself. The statements Simple
and Simple2
show how to create appropriately named variables for joining. Look the head()
of Simple
and Simple2
and make sure you understand how they are related to Stations
. The joining of Simple
and Simple2
should match every station to every other station. (Since a ride can start and end at the same station, it also makes sense to match each station to itself.) This sort of matching does not make use of any matching variables; everything is matched to everything else. This is called a full outer join. (A full outer join matches every case in the left table to each and every case in the right table.) Make sure you understand what the full outer join does before proceeding. For instance, you should be able to predict how many cases the output will have when the left input has n cases and the right has m cases.
[1] There are 347 cases in the Stations
data table. How many cases will there be in a full outer join of Simple
to Simple2
?
\(120409 = 347 \times 347\)
It’s often impractical to carry out a full outer join. For example, joining BabyNames
to itself with a full outer join will generate a result with more than three trillion cases. (Three trillion cases from the BabyNames
data is the equivalent of about 5 million hours of MP3 compressed music. A typical human lifespan is about 0.6 million hours.) Perform the full outer join and then use haversine()
to compute the distance between each pair of stations. Check your result for sensibility. Make a histogram of the station-to-station distances and explain where it looks like what you would expect. (Hint: you could use the Internet to look up the distance from one end of Washington, D.C. to the other.) Once you have PairDistances
, you can join it with Trips
to calculate the start-to-end distance of each trip. (Of course, a rider may not go directly from one station to another.)
[2] Look at the variables in Stations
and Trips
and explain why Simple
and Simple2
were given different variable names for the station.
Simple
and Simple2
were given different variable names for the station because we want start station-end station pairs, which serves as the composite key on which we join the resulting table to the Trips
table. If we don’t explicitly change the name of the name
variable, then we won’t have a composite key on which to join the resulting table to the Trips
table.
An inner_join()
is appropriate for finding the distance of each ride. (Watch out! The Trips
data and the PairDistances
data are large enough that the join is expensive: it takes about a minute.) Display the distribution of the ride distances of the ride.
[3] Compare it to the distances between pairs of stations. Are they similar? Why or why not?
The visualization shows that the pairwise distance between stations has a larger spread than trip distance, which is skewed to the right (i.e., most trips are short, whereas a fair number of station-station distances are quite large: these routes are probably not used very often).
Simple <-
Stations %>%
select(name, lat, long) %>%
rename(sstation = name)
Simple2 <-
Simple %>%
rename(estation = sstation, lat2 = lat, long2 = long)
StationPairs <-
merge(Simple, Simple2, by = NULL)
PairDistances <-
StationPairs %>%
mutate(distance = haversine(lat, long, lat2, long2)) %>%
select(sstation, estation, distance)
Trips %>%
inner_join(PairDistances, by = c('sstation', 'estation')) %>%
ggplot(aes(x = distance)) +
geom_density()
Mapping the stations#
You can draw detailed maps with the leaflet
package. You may need to install it. leaflet
works much like ggplot()
but provides special facilities for maps.
stationMap <-
leaflet(Stations) %>%
addTiles() %>%
addCircleMarkers(radius = 2, color = 'red')
#setView(-77.04, 38.9, zoom = 12)
stationMap
Assuming "long" and "lat" are longitude and latitude, respectively
Long-distance stations#
Around each station on the map, draw a circle whose radius reflects the median distance covered by rentals starting at that station. To draw the circles, use the same leaflet
commands as before, but add in a line like this: addCircles(radius = ~ mid, color = "blue", opacity = 0.0001)
. For addCircles()
to draw circles at the right scale, the units of the median distance should be presented in meters rather than kilometers. This will create too much overlap, unfortunately. So, set the radius to be half or one-third the median distance in meters. From your map, explain the pattern you see in the relationship between station location and median distance.
Project: Bird Species#
Project: Bird Species from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
?dcData::OrdwayBirds
OrdwayBirds package:dcData R Documentation
_B_i_r_d_s _c_a_p_t_u_r_e_d _a_n_d _r_e_l_e_a_s_e_d _a_t _O_r_d_w_a_y, _c_o_m_p_l_e_t_e _a_n_d _u_n_c_l_e_a_n_e_d
_D_e_s_c_r_i_p_t_i_o_n:
The historical record of birds captured and released at the
Katharine Ordway Natural History Study Area, a 278-acre preserve
in Inver Grove Heights, Minnesota, owned and managed by Macalester
College.
_U_s_a_g_e:
data("OrdwayBirds")
_F_o_r_m_a_t:
A data frame with 15,829 observations on the following 26
variables:
bogus
Timestamp indicates when the data were entered into an electronic
record, not anything about the bird being described
Year year of capture
Day day of capture
Month month of capture
CaptureTime time of capture
SpeciesName
Sex
Age
BandNumber
TrapID
Weather
BandingReport
RecaptureYN
RecaptureMonth
RecaptureDay
Condition
Release
Comments
DataEntryPerson
Weight
WingChord
Temperature
RecaptureOriginal
RecapturePrevious
TailLength
_D_e_t_a_i_l_s:
There are many extraneous levels of variables such as species.
Part of the purpose of this data set is to teach about data
cleaning.
_S_o_u_r_c_e:
Jerald Dosch, Dept. of Biology, Macalester College: the manager of
the Study Area.
_S_e_e _A_l_s_o:
`OrdwaySpeciesNames`
_E_x_a_m_p_l_e_s:
data(OrdwayBirds)
str(OrdwayBirds)
?dcData::OrdwaySpeciesNames
OrdwaySpeciesNames package:dcData R Documentation
_C_o_r_r_e_c_t_e_d _S_p_e_c_i_e_s _N_a_m_e_s _f_o_r _t_h_e _O_r_d_w_a_y _B_i_r_d_s
_D_e_s_c_r_i_p_t_i_o_n:
This data frame lists all the species name that appear in
`OrdwayBirds`. In many cases, the species name was mis-spelled in
the original. As a result, many birds are listed as separate
species even though, in reality, they all belong to the same
species. For each potentially mis-spelled species name, this table
gives a standardized name.
_U_s_a_g_e:
data("OrdwaySpeciesNames")
_F_o_r_m_a_t:
A data frame with 265 observations on the following 2 variables:
SpeciesName The original spelling, or misspelling, of a bird
species.
SpeciesNameCleaned Corrected spelling (or NA if the original was
not identifiable.)
_S_o_u_r_c_e:
Daniel Kaplan and students in a 2013 Data and Computing
Fundamentals class at Macalester College (Saint Paul, MN) read
through original names in `OrdwayBirds` and typed corrected
spelling shown in `SpeciesNameCleaned`.
_S_e_e _A_l_s_o:
`OrdwayBirds`
_E_x_a_m_p_l_e_s:
data(OrdwaySpeciesNames)
str(OrdwaySpeciesNames)
OrdwayBirds %>%
select(Month, Day) %>%
head()
Month | Day | |
---|---|---|
<dbl> | <dbl> | |
3 | 7 | 16 |
4 | NA | NA |
5 | 7 | 16 |
6 | 7 | 16 |
7 | 7 | 16 |
8 | 7 | 16 |
OrdwayBirds <-
OrdwayBirds %>%
select(SpeciesName, Month, Day) %>%
mutate(
Month = as.numeric(as.character(Month)),
Day = as.numeric(as.character(Day))
)
OrdwayBirds %>%
head()
SpeciesName | Month | Day | |
---|---|---|---|
<chr> | <dbl> | <dbl> | |
3 | Song Sparrow | 7 | 16 |
4 | NA | NA | |
5 | Song Sparrow | 7 | 16 |
6 | Field Sparrow | 7 | 16 |
7 | Field Sparrow | 7 | 16 |
8 | Song Sparrow | 7 | 16 |
OrdwaySpeciesNames %>%
filter(is.na(SpeciesNameCleaned))
SpeciesName | SpeciesNameCleaned |
---|---|
<chr> | <chr> |
NA | |
-lost- | NA |
-missing- | NA |
[Nothing, just dashes] | NA |
13:00:00 | NA |
Bank Swallow | NA |
Barn Swallow | NA |
Bay-breasted Warbler | NA |
Blackpoll Warbler | NA |
Blue Jay | NA |
Blue-headed Vireo | NA |
Blue-winged Warbler | NA |
Bluebird | NA |
Boreal Chickadee | NA |
Brewer's Sparrow | NA |
Brown Creeper | NA |
Brown Thrasher | NA |
Brown Towhee | NA |
Cactus Wren | NA |
Common Crow | NA |
Common Grackle | NA |
Common Nighthawk | NA |
Common Redpoll | NA |
Common Yellowthroat | NA |
Connecticut Warbler | NA |
Downy Woodpecker | NA |
E Bluebird | NA |
Eastern Bluebird | NA |
Eastern Kingbird | NA |
Eastern Meadowlark | NA |
Eastern Robin | NA |
Flicker | NA |
Fox Sparrow | NA |
Goldfinch | NA |
Grackle | NA |
Green Heron | NA |
Ground Dove | NA |
Hairy Woodpecker | NA |
Hermit Thrush | NA |
Horned Lark | NA |
House Finch | NA |
House Sparrow | NA |
Inca Dove | NA |
Indigo Bunting | NA |
Killdeer | NA |
Kingbird | NA |
Kiskadee F.C. | NA |
Magnolia Warbler | NA |
Mockingbird | NA |
Rough-winged Swallow | NA |
Task 1#
[1] Including misspellings, how many different species are there in the OrdwayBirds
data?
There are 275 unique values of the variable SpeciesName
. This reduces to 268 after dropping the following invalid values:
''
'-lost-'
'-missing-'
'13:00:00'
'[Nothing, just dashes]'
'lost'
'none'
[2] Consider the OrdwaySpeciesNames
data frame also found in the dcData
package as well. How many distinct species are there in the SpeciesNameCleaned
variable in OrdwaySpeciesNames
? You will find it helpful to use n_distinct()
a reduction function, which counts the number of unique values in a variable.
There are 108 unique values of the variable SpeciesNameCleaned
after accounting for the value NA
.
OrdwayBirds %>%
count(SpeciesName)
SpeciesName | n |
---|---|
<chr> | <int> |
4 | |
-lost- | 1 |
-missing- | 1 |
13:00:00 | 1 |
Acadian Flycatcher | 1 |
American Gold Finch | 50 |
American Goldfinch | 1153 |
American Golf Finch | 1 |
American Redfinch | 1 |
American Redstart | 3 |
American Robin | 4 |
Arkansas Kingbird | 1 |
Baltimore Oriole | 206 |
Bank Swallow | 21 |
Barn Swallow | 23 |
Batimore Oriole | 1 |
Bay-breasted Warbler | 2 |
Blac-capped Chickadee | 1 |
Black and White Warbler | 9 |
Black-Capped Chickadee | 13 |
Black-and-white Warbler | 1 |
Black-billed Cookoo | 1 |
Black-billed Cuckoo | 15 |
Black-capeed Chickadee | 1 |
Black-capped Chicakdee | 1 |
Black-capped Chickadee | 1110 |
Black-capped Chikadee | 1 |
Black-capped chickadee | 187 |
Black-throat Sparrow | 31 |
Black-throat-Sparrow | 1 |
⋮ | ⋮ |
White-breast Nuthatch | 23 |
White-breasted Nuthatch | 236 |
White-crown Sparrow | 17 |
White-crowned Sparrow | 78 |
White-eyed Vireo | 1 |
White-thorat Sparrow | 1 |
White-throat Sparrow | 86 |
White-throated Sparrow | 229 |
White-winged Junco | 2 |
Wht-brstd Nuthatch | 1 |
Wilson Warbler | 4 |
Wilson's Warbler | 22 |
Winter Wren | 1 |
Wood Pewee | 37 |
Wood Thrush | 3 |
Woodcock | 1 |
Wren | 2 |
Yellow Flicker | 1 |
Yellow Shafted Flicker | 4 |
Yellow Warbler | 19 |
Yellow-bellied Flycatcher | 7 |
Yellow-bellied Sapsucker | 3 |
Yellow-shaft Flicker | 6 |
Yellow-shafted Flicker | 34 |
Yellow-shafted flicker | 6 |
Yellow-tailed Oriole | 1 |
Yellowthroat | 107 |
[Nothing, just dashes] | 1 |
lost | 1 |
none | 2 |
OrdwayBirds %>%
select(SpeciesName) %>%
n_distinct()
OrdwaySpeciesNames %>%
count(SpeciesNameCleaned)
SpeciesNameCleaned | n |
---|---|
<chr> | <int> |
Acadian Flycatcher | 1 |
American Goldfinch | 3 |
American Redfinch | 1 |
American Redstart | 1 |
American Robin | 1 |
Arkansas Kingbird | 1 |
Baltimore Oriole | 3 |
Black and White Warbler | 2 |
Black-billed Cookoo | 2 |
Black-capped Chickadee | 8 |
Black-throat Sparrow | 2 |
Brown-headed Cowbird | 2 |
Cardinal | 2 |
Carolina Chickadee | 1 |
Catbird | 4 |
Cedar Waxwing | 2 |
Chestnut-backed Chickadee | 1 |
Chestnut-sided Warbler | 1 |
Chickadee | 2 |
Chipping Sparrow | 4 |
Clay-colored Sparrow | 3 |
Cowbird | 1 |
Curve-billed Thrasher | 2 |
Eastern Phoebe | 2 |
Eastern Wood Pewee | 2 |
Field Sparrow | 2 |
Golden-Crowned Kinglet | 3 |
Gray - cheeked Thrush | 4 |
Great Crested Flycatcher | 3 |
Harris's Sparrow | 3 |
⋮ | ⋮ |
Tennessee Warbler | 2 |
Traill's Flycatcher | 1 |
Tree L | 1 |
Tree Swallow | 4 |
Tufted Titmouse | 1 |
Unknown | 1 |
Varied Thrush | 1 |
Veery | 1 |
Vesper Sparrow | 1 |
Warbling Vireo | 1 |
White-Crested Sparrow | 1 |
White-Fronted Dove | 1 |
White-breasted Nuthatch | 5 |
White-crowned Sparrow | 2 |
White-eyed Vireo | 1 |
White-throat Sparrow | 5 |
White-winged Junco | 1 |
Wilson's Warbler | 2 |
Winter Wren | 1 |
Wood Pewee | 1 |
Wood Thrush | 1 |
Woodcock | 1 |
Wren | 1 |
Yellow Shafted Flicker | 5 |
Yellow Warbler | 1 |
Yellow-bellied Flycatcher | 1 |
Yellow-bellied Sapsucker | 1 |
Yellow-tailed Oriole | 1 |
Yellowthroat | 1 |
NA | 50 |
OrdwaySpeciesNames %>%
select(SpeciesNameCleaned) %>%
n_distinct()
Task 2#
Use the OrdwaySpeciesNames
table to create a new data frame that corrects the misspellings in SpeciesNames
. This can be done easily using the inner_join()
data verb. Look at the names of the variables in OrdwaySpeciesNames
and OrdwayBirds
.
[1] Which variable(s) was used for matching cases?
The variable SpeciesName
was used for matching cases.
[2] What were the variable(s) that will be added?
The variables SpeciesNameCleaned
(renamed to Species
), Month
, and Day
will be added.
Corrected <-
OrdwayBirds %>%
inner_join(y = OrdwaySpeciesNames) %>%
select(Species = SpeciesNameCleaned, Month, Day) %>%
na.omit()
Corrected %>%
head()
Joining with `by = join_by(SpeciesName)`
Warning message in inner_join(., y = OrdwaySpeciesNames):
“Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 4 of `x` matches multiple rows in `y`.
ℹ Row 211 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.”
Species | Month | Day | |
---|---|---|---|
<chr> | <dbl> | <dbl> | |
1 | Song Sparrow | 7 | 16 |
3 | Song Sparrow | 7 | 16 |
4 | Field Sparrow | 7 | 16 |
5 | Field Sparrow | 7 | 16 |
6 | Field Sparrow | 7 | 16 |
7 | Field Sparrow | 7 | 16 |
Task 3#
Call the variable that contains the total count
. Arrange this into descending order from the species with the most birds, and look through the list. (Hint: Remember n()
. Also, one of the arguments to one of the data verbs will be desc(count)
to arrange the cases into descending order. Display the top 10 species in terms of the number of bird captures.) Define for yourself a “major species” as a species with more than a particular threshold count. Set your threshold so that there are 5 or 6 species designated as major. Filter to produce a data frame with only the birds that belong to a major species. Save the output in a table called Majors
. (Hint: Remember that summary functions can be used case-by-case when filtering or mutating a data frame that has been grouped.)
[1] How many bird captures are reported for each of the corrected species?
See below for the result (major species threshold >= 1000).
Corrected %>%
count(Species) %>%
arrange(desc(n)) %>%
head(n = 10)
Species | n | |
---|---|---|
<chr> | <int> | |
1 | Slate-colored Junco | 2732 |
2 | Tree Swallow | 1537 |
3 | Black-capped Chickadee | 1327 |
4 | American Goldfinch | 1204 |
5 | Field Sparrow | 1164 |
6 | Lincoln's Sparrow | 790 |
7 | Robin | 608 |
8 | Catbird | 554 |
9 | Song Sparrow | 512 |
10 | House Wren | 460 |
Corrected %>%
group_by(Species) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(n = 10)
Species | count |
---|---|
<chr> | <int> |
Slate-colored Junco | 2732 |
Tree Swallow | 1537 |
Black-capped Chickadee | 1327 |
American Goldfinch | 1204 |
Field Sparrow | 1164 |
Lincoln's Sparrow | 790 |
Robin | 608 |
Catbird | 554 |
Song Sparrow | 512 |
House Wren | 460 |
Majors <-
Corrected %>%
group_by(Species) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
filter(count >= 1000)
Majors
Species | count |
---|---|
<chr> | <int> |
Slate-colored Junco | 2732 |
Tree Swallow | 1537 |
Black-capped Chickadee | 1327 |
American Goldfinch | 1204 |
Field Sparrow | 1164 |
Task 4#
When you have correctly produced Majors
, write a command that produces the month-by-month count of each of the major species. Call this table ByMonth
. Display this month-by-month count with a bar chart arranged in a way that you think tells the story of what time of year the various species appear. You can use mplot()
to explore different possibilies. (Warning: mplot()
and similar interactive functions should not appear in your Rmd file, it needs to be used interactively from the console. Use the “Show Expression” button in mplot()
to create an expression that you can cut and paste into a chunk in your Rmd document, so that the graph gets created when you compile it.) Once you have the graph, use it to answer these questions:
[1] Which species are present year-round?
American Goldfinch (11-12 mo)
Black-capped Chickadee (12 mo)
[2] Which species are migratory, that is, primarily present in one or two seasons?
Field Sparrow (6 mo)
Slate-colored Junco (8-9 mo)
Tree Swallow (3-5 mo)
[3] What is the peak month for each major species?
10
American Goldfinch11
Black-capped Chickadee05
Field Sparrow10
Slate-colored Junco06
Tree Swallow
[4] Which major species are seen in good numbers for at least 6 months of the year? (Hint: n_distinct()
and >= 6
.)
Arguably, the only species that is not seen in good numbers for at least 6 months of the year is the tree swallow.
ByMonth <-
OrdwayBirds %>%
group_by(SpeciesName, Month = as.integer(Month)) %>%
summarize(count = n()) %>%
filter(SpeciesName %in% Majors$Species)
ByMonth
`summarise()` has grouped output by 'SpeciesName'. You can override using the
`.groups` argument.
SpeciesName | Month | count |
---|---|---|
<chr> | <int> | <int> |
American Goldfinch | 1 | 10 |
American Goldfinch | 2 | 51 |
American Goldfinch | 3 | 48 |
American Goldfinch | 4 | 21 |
American Goldfinch | 5 | 125 |
American Goldfinch | 6 | 63 |
American Goldfinch | 7 | 67 |
American Goldfinch | 8 | 70 |
American Goldfinch | 9 | 151 |
American Goldfinch | 10 | 364 |
American Goldfinch | 11 | 180 |
American Goldfinch | 12 | 3 |
Black-capped Chickadee | 1 | 56 |
Black-capped Chickadee | 2 | 140 |
Black-capped Chickadee | 3 | 96 |
Black-capped Chickadee | 4 | 51 |
Black-capped Chickadee | 5 | 48 |
Black-capped Chickadee | 6 | 20 |
Black-capped Chickadee | 7 | 13 |
Black-capped Chickadee | 8 | 11 |
Black-capped Chickadee | 9 | 66 |
Black-capped Chickadee | 10 | 173 |
Black-capped Chickadee | 11 | 271 |
Black-capped Chickadee | 12 | 165 |
Field Sparrow | 4 | 83 |
Field Sparrow | 5 | 197 |
Field Sparrow | 6 | 15 |
Field Sparrow | 7 | 79 |
Field Sparrow | 8 | 64 |
Field Sparrow | 9 | 74 |
Field Sparrow | 10 | 69 |
Field Sparrow | 11 | 1 |
Slate-colored Junco | 1 | 113 |
Slate-colored Junco | 2 | 61 |
Slate-colored Junco | 3 | 188 |
Slate-colored Junco | 4 | 694 |
Slate-colored Junco | 5 | 1 |
Slate-colored Junco | 8 | 1 |
Slate-colored Junco | 9 | 35 |
Slate-colored Junco | 10 | 1178 |
Slate-colored Junco | 11 | 272 |
Slate-colored Junco | 12 | 174 |
Tree Swallow | 4 | 2 |
Tree Swallow | 5 | 11 |
Tree Swallow | 6 | 171 |
Tree Swallow | 7 | 16 |
Tree Swallow | 11 | 1 |
ByMonth %>%
group_by(SpeciesName) %>%
summarize(
MonthsPerYear = n(),
SixMonthsOrMore = n_distinct(Month) >= 6
)
SpeciesName | MonthsPerYear | SixMonthsOrMore |
---|---|---|
<chr> | <int> | <lgl> |
American Goldfinch | 12 | TRUE |
Black-capped Chickadee | 12 | TRUE |
Field Sparrow | 8 | TRUE |
Slate-colored Junco | 10 | TRUE |
Tree Swallow | 5 | FALSE |
Project: Scraping Nuclear Reactors#
Project: Scraping Nuclear Reactors from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
In what ways is the table tidy? How is it not tidy? What’s different about it from a tidy table?
Tidy Data Criteria
(i) rows (or cases, observations) must each represent the same underlying attribute (i.e., each observation must have its own row)
(ii) columns (or variables, values) must each contain the same type of value for each row (i.e., each variable must have its own column)
(iii) each value must have its own cell
It’s impossible to only satisfy two of the three criteria. This implies the following.
(i) put each dataset into a tibble
(ii) put each variable into a column
There is at least one row that does not represent a typical case (that is, the header row(s)). Certain columns are blank and need to be removed. The remaining columns may contain heterogeneous data formats or data types. And missing values must be addressed. But with some cleaning, the table hints at what a typical case should look like.
page <- 'https://en.wikipedia.org/wiki/List_of_commerical_nuclear_reactors'
tableList <-
page %>%
read_html() %>%
html_nodes(css = 'table') %>%
html_table(fill = TRUE)
length(tableList)
Japan <-
tableList[[21]] %>%
select(1:9)
#names(Japan)[c(3, 6)] <- c('type', 'grossMW')
head(Japan)
Plantname | UnitNo. | Type | Model | Status | Capacity(MW) | Beginbuilding | Commercialoperation | Closed |
---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
Plantname | UnitNo. | Type | Model | Status | Capacity(MW) | Beginbuilding | Commercialoperation | Closed |
Fukushima Daiichi | 1 | BWR | BWR-3 | Inoperable | 439 | 25 Jul 1967 | 26 Mar 1971 | 19 May 2011 |
Fukushima Daiichi | 2 | BWR | BWR-4 | Inoperable | 760 | 9 Jun 1969 | 18 Jul 1974 | 19 May 2011 |
Fukushima Daiichi | 3 | BWR | BWR-4 | Inoperable | 760 | 28 Dec 1970 | 27 Mar 1976 | 19 May 2011 |
Fukushima Daiichi | 4 | BWR | BWR-4 | Inoperable | 760 | 12 Feb 1973 | 12 Oct 1978 | 19 May 2011 |
Fukushima Daiichi | 5 | BWR | BWR-4 | Shut down | 760 | 22 May 1972 | 18 Apr 1978 | 17 Dec 2013 |
Among other things, some of the variable names appear redundant and others have multiple words separated by spaces. You can rename variables using the data verb rename()
, finding appropriate names from the Wikipedia table. Another problem is that the first row is not data but a continuation of the variable names. So row number 1 should be dropped.
Japan <-
Japan %>%
filter(row_number() > 1) %>%
rename(
name = Plantname,
reactor = `UnitNo.`,
type = Type,
model = Model,
status = Status,
netMW = `Capacity(MW)`,
construction = Beginbuilding,
operation = Commercialoperation,
closure = Closed
)
head(Japan)
name | reactor | type | model | status | netMW | construction | operation | closure |
---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
Fukushima Daiichi | 1 | BWR | BWR-3 | Inoperable | 439 | 25 Jul 1967 | 26 Mar 1971 | 19 May 2011 |
Fukushima Daiichi | 2 | BWR | BWR-4 | Inoperable | 760 | 9 Jun 1969 | 18 Jul 1974 | 19 May 2011 |
Fukushima Daiichi | 3 | BWR | BWR-4 | Inoperable | 760 | 28 Dec 1970 | 27 Mar 1976 | 19 May 2011 |
Fukushima Daiichi | 4 | BWR | BWR-4 | Inoperable | 760 | 12 Feb 1973 | 12 Oct 1978 | 19 May 2011 |
Fukushima Daiichi | 5 | BWR | BWR-4 | Shut down | 760 | 22 May 1972 | 18 Apr 1978 | 17 Dec 2013 |
Fukushima Daiichi | 6 | BWR | BWR-5 | Shut down | 1067 | 26 Oct 1973 | 24 Oct 1979 | 17 Dec 2013 |
str(Japan)
tibble [68 × 9] (S3: tbl_df/tbl/data.frame)
$ name : chr [1:68] "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" ...
$ reactor : chr [1:68] "1" "2" "3" "4" ...
$ type : chr [1:68] "BWR" "BWR" "BWR" "BWR" ...
$ model : chr [1:68] "BWR-3" "BWR-4" "BWR-4" "BWR-4" ...
$ status : chr [1:68] "Inoperable" "Inoperable" "Inoperable" "Inoperable" ...
$ netMW : chr [1:68] "439" "760" "760" "760" ...
$ construction: chr [1:68] "25 Jul 1967" "9 Jun 1969" "28 Dec 1970" "12 Feb 1973" ...
$ operation : chr [1:68] "26 Mar 1971" "18 Jul 1974" "27 Mar 1976" "12 Oct 1978" ...
$ closure : chr [1:68] "19 May 2011" "19 May 2011" "19 May 2011" "19 May 2011" ...
Using your cleaned data, make a plot of net generation capacity versus date of construction. Color the points by the type of reactor (for example, BWR, PWR, etc.) In addition to your plot, give a sentence or two of interpretation; what patterns do you see?
# BWR boiling water reactor
# FBR
# GCR
# PWR pressurized water reactor
Japan <-
Japan %>%
mutate(
netMW = as.integer(netMW),
construction = lubridate::dmy(construction),
operation = lubridate::dmy(operation),
closure = lubridate::dmy(closure)
)
head(Japan)
Japan %>%
ggplot(mapping = aes(x = construction, y = netMW)) +
geom_point(aes(color = type))
Warning message:
“There was 1 warning in `mutate()`.
ℹ In argument: `closure = lubridate::dmy(closure)`.
Caused by warning:
! 6 failed to parse.”
name | reactor | type | model | status | netMW | construction | operation | closure |
---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <int> | <date> | <date> | <date> |
Fukushima Daiichi | 1 | BWR | BWR-3 | Inoperable | 439 | 1967-07-25 | 1971-03-26 | 2011-05-19 |
Fukushima Daiichi | 2 | BWR | BWR-4 | Inoperable | 760 | 1969-06-09 | 1974-07-18 | 2011-05-19 |
Fukushima Daiichi | 3 | BWR | BWR-4 | Inoperable | 760 | 1970-12-28 | 1976-03-27 | 2011-05-19 |
Fukushima Daiichi | 4 | BWR | BWR-4 | Inoperable | 760 | 1973-02-12 | 1978-10-12 | 2011-05-19 |
Fukushima Daiichi | 5 | BWR | BWR-4 | Shut down | 760 | 1972-05-22 | 1978-04-18 | 2013-12-17 |
Fukushima Daiichi | 6 | BWR | BWR-5 | Shut down | 1067 | 1973-10-26 | 1979-10-24 | 2013-12-17 |
Warning message:
“Removed 7 rows containing missing values (`geom_point()`).”
Carry out the same cleaning process for the China reactor table, and then append it with the Japan data. Use mutate()
to add a variable that has the name of the country. (Hint: functions such as bind_cols()
or bind_rows()
form the dplyr
package are helpful for appending data frames.) Collating the data for all countries is a matter of repeating this process over and over. Inevitably, there are inconsistencies. For example, the US data had been organized in a somewhat different format when compared to the Japan and China data for many years until Wikipedia editors decided to reconcile them.
China <-
tableList[[10]] %>%
select(1:9)
China <-
China %>%
filter(row_number() > 2) %>%
rename(
name = Plantname,
reactor = `UnitNo.`,
type = Type,
model = Model,
status = Status,
netMW = `Capacity(MW)`,
construction = Beginbuilding,
operation = Commercialoperation,
closure = Closed
)
China <-
China %>%
mutate(
netMW = as.integer(netMW),
construction = lubridate::dmy(construction),
operation = lubridate::dmy(operation),
closure = lubridate::dmy(closure)
)
China %>%
ggplot(mapping = aes(x = construction, y = netMW)) +
geom_point(aes(color = type))
Warning message:
“There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `construction = lubridate::dmy(construction)`.
Caused by warning:
! 15 failed to parse.
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”
Warning message:
“Removed 47 rows containing missing values (`geom_point()`).”
bind_rows(Japan, China) %>%
mutate(Country = ifelse(name %in% unique(Japan$name), 'Japan', 'China')) %>%
head()
name | reactor | type | model | status | netMW | construction | operation | closure | Country |
---|---|---|---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <int> | <date> | <date> | <date> | <chr> |
Fukushima Daiichi | 1 | BWR | BWR-3 | Inoperable | 439 | 1967-07-25 | 1971-03-26 | 2011-05-19 | Japan |
Fukushima Daiichi | 2 | BWR | BWR-4 | Inoperable | 760 | 1969-06-09 | 1974-07-18 | 2011-05-19 | Japan |
Fukushima Daiichi | 3 | BWR | BWR-4 | Inoperable | 760 | 1970-12-28 | 1976-03-27 | 2011-05-19 | Japan |
Fukushima Daiichi | 4 | BWR | BWR-4 | Inoperable | 760 | 1973-02-12 | 1978-10-12 | 2011-05-19 | Japan |
Fukushima Daiichi | 5 | BWR | BWR-4 | Shut down | 760 | 1972-05-22 | 1978-04-18 | 2013-12-17 | Japan |
Fukushima Daiichi | 6 | BWR | BWR-5 | Shut down | 1067 | 1973-10-26 | 1979-10-24 | 2013-12-17 | Japan |
Make an informative graphic that shows how long it took between start of construction and commissioning for operation of each nuclear reactor in Japan (or another country of your choice). One possibility: use reactor name vs date as the frame. For each reactor, set the glyph to be a line extending from start of construction to commissioning. You can do this with geom_segment()
using name as the y coordinate and time as the x coordinate. (Tip: use the paste()
function to create the reactorID
on the vertical axis.)
Project: Street or Road?#
Project: Street or Road? from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.
Example#
People’s addresses involve streets, lanes, courts, avenues, and so on. How many such road-related words are in common use? In answering this question, you would presumably want to look at lots of addresses and extract the road-related term. You could do this by eye, reading down a list of a few hundred or thousand addresses. But if you want to do it on a really large scale, a city or state or country, you would want some automated help, for instance, a computer program that discards the sorts of entries you have already identified to give a greater concentration of unidentified terms. In this activity, you’re going to build such a program.
# about 15,000 street addresses of registered voters in Wake County, NC
Addresses <- read_csv('https://mdbeckman.github.io/dcSupplement/data/street-addresses.csv')
head(Addresses)
Rows: 15483 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): address
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
address |
---|
<chr> |
2117 MARINER CIRCLE |
101 EPPING WAY |
PO BOX 58592 |
5102 ECHO RIDGE RD |
PO BOX 37218 |
PO BOX 37218 |
# about 900,000 medicare service provider street addresses
download.file(url = 'https://mdbeckman.github.io/dcSupplement/data/CMS_ProvidersSimple.rds',
destfile = 'CMS_ProvidersSimple.rds')
DataTable <- readRDS('CMS_ProvidersSimple.rds')
head(DataTable)
address | first_name | sex | |
---|---|---|---|
<chr> | <chr> | <chr> | |
1 | 900 SETON DR | ARDALAN | M |
2 | 2650 RIDGE AVE | THOMAS | M |
3 | 4126 N HOLLAND SYLVANIA RD | RASHID | M |
4 | 456 MAGEE AVE | DAVID | M |
5 | 11100 EUCLID AVE | JENNIFER | F |
6 | 12605 E 16TH AVE | KEVIN | M |
To solve such problems, start by looking at a few dozen of the addresses to familiarize yourself with common patterns. Suppose you wanted to extract the PO Box number from an address. Read the street address data and pull out a sample of a few dozen cases.
In everyday langauge, describe a pattern that you think will identify the information you are looking for.
The PO Box cases tend to have a substring ‘PO’.
Translate (1) into the form of a regular expression.
The regular expression for ‘PO’ is simply ‘PO’.
Filter to retain the cases that match the expression. Hint:
filter()
andgrepl()
are useful for this.Filter to retain the cases that do not match the expression.
Examine the results of (3) and (4) to identify shortcomings in your patterns.
Improve or extend the pattern to deal with the mistaken cases.
Repeat until satisfied.
Put extraction parentheses around the parts of the regular expression that contain the info you want.
Sample <-
Addresses %>%
sample_n(size = 50)
head(Sample)
address |
---|
<chr> |
1921 MCKINNEY STREET |
PO BOX 2973 |
PO BOX 99181 |
NCSU BOX 22323 |
3585 E 10TH ST |
PO BOX 96 |
pattern <- 'PO'
Matches <-
Sample %>%
filter(grepl(pattern = pattern, address))
head(Matches)
address |
---|
<chr> |
PO BOX 2973 |
PO BOX 99181 |
PO BOX 96 |
PO BOX 28454 |
PO BOX 37297 |
PO BOX 26761 |
Dont <-
Sample %>%
filter(!grepl(pattern = 'PO', address))
head(Dont)
address |
---|
<chr> |
1921 MCKINNEY STREET |
NCSU BOX 22323 |
3585 E 10TH ST |
NCSU BOX 15330 |
NCSU BOX 04082 |
251 N MAIN STREET 3786 |
pattern <- 'BOX\\s+(\\d+)'
Matches <-
Sample %>%
filter( grepl(pattern, address))
head(Matches)
address |
---|
<chr> |
PO BOX 2973 |
PO BOX 99181 |
NCSU BOX 22323 |
PO BOX 96 |
PO BOX 28454 |
NCSU BOX 15330 |
Dont <-
Sample %>%
filter(!grepl(pattern, address))
head(Dont)
address |
---|
<chr> |
1921 MCKINNEY STREET |
3585 E 10TH ST |
251 N MAIN STREET 3786 |
3307 BROOKLINE COURT |
1703 PATTON ROAD |
3231 WALNUT CREEK PKWY |
BoxNumbers <-
Sample %>%
filter(grepl(pattern, address)) %>%
tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)
boxnum |
---|
<chr> |
2973 |
99181 |
22323 |
96 |
28454 |
15330 |
pattern <- 'PO'
Matches <-
Addresses %>%
filter(grepl(pattern = pattern, address))
head(Matches)
address |
---|
<chr> |
PO BOX 58592 |
PO BOX 37218 |
PO BOX 37218 |
PO BOX 1953 |
PO BOX 132 |
PO BOX 360 |
Dont <-
Addresses %>%
filter(!grepl(pattern = 'PO', address))
head(Dont)
address |
---|
<chr> |
2117 MARINER CIRCLE |
101 EPPING WAY |
5102 ECHO RIDGE RD |
5007 PURITAN RD |
04-I ROBIN CIRCLE |
4800 WESTERN BLVD |
pattern <- 'BOX\\s+(\\d+)'
Matches <-
Addresses %>%
filter( grepl(pattern, address))
head(Matches)
address |
---|
<chr> |
PO BOX 58592 |
PO BOX 37218 |
PO BOX 37218 |
PO BOX 1953 |
PO BOX 132 |
PO BOX 360 |
Dont <-
Addresses %>%
filter(!grepl(pattern, address))
head(Dont)
address |
---|
<chr> |
2117 MARINER CIRCLE |
101 EPPING WAY |
5102 ECHO RIDGE RD |
5007 PURITAN RD |
04-I ROBIN CIRCLE |
4800 WESTERN BLVD |
BoxNumbers <-
Addresses %>%
filter(grepl(pattern, address)) %>%
tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)
boxnum |
---|
<chr> |
58592 |
37218 |
37218 |
1953 |
132 |
360 |
Back to the Streets#
Street endings (e.g., ‘ST’, ‘LANE’) are often found at the end of the address string. Use this as a starting point to find the most common endings. Once you have a set of specific street endings, you can use the regex “or” symbol, e.g., ‘(ST|RD|ROAD)’. The parentheses are not incidental. They are there to mark a pattern that you want to extract. In this case, in addition to knowing that there is a ST or RD or ROAD in an address, you want to know which one of those possibilities it is so that you can count the occurrence of each of the possibilities. To find street endings that aren’t in your set, you can filter out the street endings or non street addresses you already know about.
[1] Read the following R statements. Next to each line, give a short explanation of what the line contributes to the task. For each of the regexes, explain in simple everyday language what pattern is being matched.
pattern <- '(ST|RD|ROAD)'
LeftOvers <-
Addresses %>%
filter(
!grepl(pattern, address), # matches other than: 'ST' or 'RD' or 'ROAD'
!grepl('\\sAPT|UNIT\\s[\\d]+$', address), # matches other than: (1 whitespace, 'APT') or ('UNIT', 1 whitespace, 1 or more digits at the end of the string)
!grepl(' BOX ', address) # matches other than: 1 space, 'BOX', 1 space
)
LeftOvers
[2] For each set of patterns that you identify, compute the LeftOvers
. Examine them visually to find new street endings to add to the pattern, e.g., LANE. When you have this working on the small sample, use a larger sample and, eventually, the whole data set. It’s practically impossible to find a method that will work perfectly on new data, but do the best you can. In your report, implement your method and explain how it works, line by line. Present your result: how many addresses there are of each kind of road word?
‘DR’, ‘RD’, and ‘ST’ are the most common street types. In general, abbrviated street types are more common than their longform counterparts.
Breaking addresses into their components is a common task. People who work on this problem intensively sometimes publish their regular expressions. Here’s one from Ross Hammer published at https://regexlib.com/Search.aspx?k=street:
^\s*((?:(?:\d+(?:\x20+\w+\.?)+(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT
|CT|CIRCLE|LANE|LN|BOULEVARD|BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|
(?:General\x20+Delivery)|(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|
FL|HNGR|LOT|PIER|RM|S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|
(?:BSMT|FRNT|LBBY|LOWR|OFC|PH|REAR|SIDE|UPPR))?)\,?\s+((?:(?:\d+(?:\x20+\w+\.?)+
(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT|CT|CIRCLE|LANE|LN|BOULEVARD|
BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|(?:General\x20+Delivery)|
(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|FL|HNGR|LOT|PIER|RM|
S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|(?:BSMT|FRNT|LBBY|
LOWR|OFC|PH|REAR|SIDE|UPPR))?)?\,?\s+((?:[A-Za-z]+\x20*)+)\,\s+(A[LKSZRAP]|C[AOT]|
D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|
S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s+(\d+(?:-\d+)?)\s*$
Addresses %>%
filter(
!grepl(pattern = ' AVENUE |AVENUE$', x = address), # matches other than: 'AVENUE' flanked by whitespace anywhere in the string, or 'AVENUE' at the end of the string
!grepl(pattern = ' AVE |AVE$', x = address), # matches other than: 'AVE' flanked by whitespace anywhere in the string, or 'AVE' at the end of the string
!grepl(pattern = ' BLVD |BLVD$', x = address), # matches other than: 'BLVD' flanked by whitespace anywhere in the string, or 'BLVD' at the end of the string
!grepl(pattern = ' BOULEVARD |BOULEVARD$', x = address), # matches other than: 'BOULEVARD' flanked by whitespace anywhere in the string, or 'BOULEVARD' at the end of the string
!grepl(pattern = ' CIR |CIR$', x = address), # matches other than: 'CIR' flanked by whitespace anywhere in the string, or 'CIR' at the end of the string
!grepl(pattern = ' CIRCLE |CIRCLE$', x = address), # matches other than: 'CIRCLE' flanked by whitespace anywhere in the string, or 'CIRCLE' at the end of the string
!grepl(pattern = ' CT |CT$', x = address), # matches other than: 'CT' flanked by whitespace anywhere in the string, or 'CT' at the end of the string
!grepl(pattern = ' COURT |COURT$', x = address), # matches other than: 'COURT' flanked by whitespace anywhere in the string, or 'COURT' at the end of the string
!grepl(pattern = ' DR |DR$', x = address), # matches other than: 'DR' flanked by whitespace anywhere in the string, or 'DR' at the end of the string
!grepl(pattern = ' DRIVE |DRIVE$', x = address), # matches other than: 'DRIVE' flanked by whitespace anywhere in the string, or 'DRIVE' at the end of the string
!grepl(pattern = ' LN |LN$', x = address), # matches other than: 'LN' flanked by whitespace anywhere in the string, or 'LN' at the end of the string
!grepl(pattern = ' LANE |LANE$', x = address), # matches other than: 'LANE' flanked by whitespace anywhere in the string, or 'LANE' at the end of the string
!grepl(pattern = ' PL |PL$', x = address), # matches other than: 'PL' flanked by whitespace anywhere in the string, or 'PL' at the end of the string
!grepl(pattern = ' PLACE |PLACE$', x = address), # matches other than: 'PLACE' flanked by whitespace anywhere in the string, or 'PLACE' at the end of the string
!grepl(pattern = ' RD |RD$', x = address), # matches other than: 'RD' flanked by whitespace anywhere in the string, or 'RD' at the end of the string
!grepl(pattern = ' ROAD |ROAD$', x = address), # matches other than: 'ROAD' flanked by whitespace anywhere in the string, or 'ROAD' at the end of the string
!grepl(pattern = ' ST |ST$', x = address), # matches other than: 'ST' flanked by whitespace anywhere in the string, or 'ST' at the end of the string
!grepl(pattern = ' STREET |STREET$', x = address), # matches other than: 'STREET' flanked by whitespace anywhere in the string, or 'STREET' at the end of the string
!grepl(pattern = ' TRAIL |TRAIL$', x = address), # matches other than: 'TRAIL' flanked by whitespace anywhere in the string, or 'TRAIL' at the end of the string
!grepl(pattern = ' WAY |WAY$', x = address), # matches other than: 'WAY' flanked by whitespace anywhere in the string, or 'WAY' at the end of the string
!grepl(pattern = ' WINERY |WINERY$', x = address), # matches other than: 'WINERY' flanked by whitespace anywhere in the string, or 'WINERY' at the end of the string
!grepl(pattern = 'BOX\\s+(\\d+)', x = address) # filters out all the 'BOX' from above
) %>%
head()
address |
---|
<chr> |
NCSU B0X 15637 |
HMIA-267 |
1104 PULLEN HALL |
311 BAREFOOT |
512 LIVE OAK |
1706 WINGATE UNIVERSITY |
pattern <- '\\s+('
pattern <- str_c(pattern, 'AVE|AVENUE')
pattern <- str_c(pattern, '|BLVD|BOULEVARD')
pattern <- str_c(pattern, '|CIR|CIRCLE')
pattern <- str_c(pattern, '|CT|COURT')
pattern <- str_c(pattern, '|DR|DRIVE')
pattern <- str_c(pattern, '|LN|LANE')
pattern <- str_c(pattern, '|PL|PLACE')
pattern <- str_c(pattern, '|RD|ROAD')
pattern <- str_c(pattern, '|ST|STREET')
pattern <- str_c(pattern, '|TRAIL')
pattern <- str_c(pattern, '|WAY')
pattern <- str_c(pattern, '|WINERY')
pattern <- str_c(pattern, ')(\\s+|$)?')
pattern
streets <-
Addresses %>%
filter(
grepl(x = address, pattern = pattern, ignore.case = TRUE)
) %>%
tidyr::extract(
col = address,
regex = pattern,
into = c(
'street',
' '
)
) %>%
group_by(street) %>%
summarize(total = n()) %>%
arrange(desc(total))
streets
street | total |
---|---|
<chr> | <int> |
DR | 888 |
RD | 688 |
ST | 676 |
AVE | 259 |
CT | 229 |
ROAD | 188 |
LN | 152 |
PL | 145 |
LANE | 141 |
CIR | 120 |
WAY | 113 |
COURT | 90 |
BLVD | 50 |
TRAIL | 26 |
BOULEVARD | 4 |
WINERY | 1 |
Bibliography#
Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.