Data Computing 2

Data Computing 2 #

Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

Revised

19 Jun 2023

Programming Environment #

# install.packages('devtools')
# devtools::install_github('mdbeckman/dcData')

packages <- c(
  'data.table', # library(data.table)
  'dcData',     # library(dcData)
  'foreign',    # library(foreign)
  'leaflet',    # library(leaflet)
  'lubridate',  # library(lubridate)
  'mosaic',     # library(mosaic)
  'mosaicData', # library(mosaicData)
  'Rcpp',       # library(Rcpp)
  'rvest',      # library(rvest)
  'tidyverse'   # library(tidyverse)
)

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

str_c('EXECUTED : ', now())
sessionInfo()
# R.version.string # R.Version()
# .libPaths()
# installed.packages()

Attaching package: ‘lubridate’

The following objects are masked from ‘package:data.table’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union

Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2

The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.

Attaching package: ‘mosaic’

The following objects are masked from ‘package:dplyr’:

    count, do, tally

The following object is masked from ‘package:Matrix’:

    mean

The following object is masked from ‘package:ggplot2’:

    stat

The following objects are masked from ‘package:stats’:

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var

The following objects are masked from ‘package:base’:

    max, mean, min, prod, range, sample, sum

── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0     ✔ stringr 1.5.0
✔ purrr   1.0.2     ✔ tibble  3.2.1
✔ readr   2.1.4     ✔ tidyr   1.3.0

── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between()        masks data.table::between()
✖ mosaic::count()         masks dplyr::count()
✖ purrr::cross()          masks mosaic::cross()
✖ mosaic::do()            masks dplyr::do()
✖ tidyr::expand()         masks Matrix::expand()
✖ dplyr::filter()         masks stats::filter()
✖ dplyr::first()          masks data.table::first()
✖ readr::guess_encoding() masks rvest::guess_encoding()
✖ lubridate::hour()       masks data.table::hour()
✖ lubridate::isoweek()    masks data.table::isoweek()
✖ dplyr::lag()            masks stats::lag()
✖ dplyr::last()           masks data.table::last()
✖ lubridate::mday()       masks data.table::mday()
✖ lubridate::minute()     masks data.table::minute()
✖ lubridate::month()      masks data.table::month()
✖ tidyr::pack()           masks Matrix::pack()
✖ lubridate::quarter()    masks data.table::quarter()
✖ lubridate::second()     masks data.table::second()
✖ mosaic::stat()          masks ggplot2::stat()
✖ mosaic::tally()         masks dplyr::tally()
✖ purrr::transpose()      masks data.table::transpose()
✖ tidyr::unpack()         masks Matrix::unpack()
✖ lubridate::wday()       masks data.table::wday()
✖ lubridate::week()       masks data.table::week()
✖ lubridate::yday()       masks data.table::yday()
✖ lubridate::year()       masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

'EXECUTED : 2025-01-23 15:23:07.070629'

R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 15.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_1.0.0     stringr_1.5.0     purrr_1.0.2       readr_2.1.4      
 [5] tidyr_1.3.0       tibble_3.2.1      tidyverse_2.0.0   rvest_1.0.3      
 [9] Rcpp_1.0.10       mosaic_1.8.4.2    mosaicData_0.20.3 ggformula_0.10.4 
[13] dplyr_1.1.2       Matrix_1.5-4      ggplot2_3.4.3     lattice_0.21-8   
[17] lubridate_1.9.2   leaflet_2.2.0     foreign_0.8-84    dcData_0.1.0     
[21] data.table_1.14.8

loaded via a namespace (and not attached):
 [1] utf8_1.2.3         generics_0.1.3     xml2_1.3.4         stringi_1.7.12    
 [5] hms_1.1.3          digest_0.6.31      magrittr_2.0.3     evaluate_0.21     
 [9] grid_4.3.0         timechange_0.2.0   pbdZMQ_0.3-9       fastmap_1.1.1     
[13] jsonlite_1.8.5     httr_1.4.6         fansi_1.0.4        crosstalk_1.2.0   
[17] scales_1.2.1       tweenr_2.0.2       cli_3.6.1          labelled_2.11.0   
[21] rlang_1.1.1        crayon_1.5.2       polyclip_1.10-4    munsell_0.5.0     
[25] base64enc_0.1-3    withr_2.5.0        repr_1.1.6         tools_4.3.0       
[29] tzdb_0.4.0         uuid_1.1-0         colorspace_2.1-0   mosaicCore_0.9.2.1
[33] IRdisplay_1.1      vctrs_0.6.3        R6_2.5.1           ggridges_0.5.4    
[37] lifecycle_1.0.3    htmlwidgets_1.6.2  ggstance_0.3.6     MASS_7.3-58.4     
[41] pkgconfig_2.0.3    pillar_1.9.0       gtable_0.3.3       glue_1.6.2        
[45] ggforce_0.4.1      haven_2.5.2        tidyselect_1.2.0   IRkernel_1.3.2    
[49] farver_2.1.1       htmltools_0.5.5    compiler_4.3.0    

Issue

Issues using Leaflet (and the terra dependency in particular) in VSCode on macOS.

The following is an attempted, but so far failed, approach.

brew install gdal proj

remotes::install_github("rspatial/terra", configure.args = "--with-gdal-config=/opt/homebrew/bin/gdal-config --with-proj-include=/opt/homebrew/opt/proj/include --with-proj-lib=/opt/homebrew/opt/proj/lib --with-proj-share=/opt/homebrew/opt/proj/share/proj", type = "source", force = TRUE)

02 - Computing with R #

data(package = 'dcData')

data('NCHS', package = 'dcData')

View(NCHS)

Show code cell output Hide code cell output

A data.frame: 31126 × 31
sex	age	pregnant	ethnicity	death	followup	smoker	diabetic	height	weight	⋯	bmd	fmhm_other	hdl	chol	bps	bpd	income	pop_weight	psu	stratum
<fct>	<dbl>	<fct>	<fct>	<chr>	<dbl>	<fct>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
female	2	no	Non-Hispanic Black	NA	NA	no	0	0.916	12.5	⋯	NA	NA	NA	NA	NA	NA	0.86	2970.8044	1	5
male	77	no	Non-Hispanic White	alive	90	no	0	1.740	75.4	⋯	1.2196	0.12359	54	215	98	56	5.00	10224.1331	3	1
female	10	no	Non-Hispanic White	NA	NA	no	0	1.366	32.9	⋯	NA	NA	30	129	108	63	1.47	14172.3111	2	7
male	1	no	Non-Hispanic Black	NA	NA	no	0	NA	13.3	⋯	NA	NA	NA	NA	NA	NA	0.57	3041.5927	1	2
male	49	no	Non-Hispanic White	alive	74	yes	0	1.783	92.5	⋯	1.0870	1.17688	42	279	122	83	5.00	30657.3119	2	8
female	19	no	Other/Multi	alive	86	no	0	1.620	59.2	⋯	0.8680	-1.22452	61	153	114	70	1.21	12224.8756	2	2
female	59	no	Non-Hispanic Black	alive	76	no	0	1.629	78.0	⋯	1.0870	-0.18566	105	245	123	81	0.00	8930.8802	2	4
male	13	no	Non-Hispanic White	NA	NA	no	0	1.620	40.7	⋯	0.8706	-0.48886	67	162	98	52	0.53	9823.0003	1	6
female	11	no	Non-Hispanic Black	NA	NA	no	0	1.569	45.5	⋯	NA	NA	58	148	112	50	0.00	2345.5760	2	9
male	43	no	Non-Hispanic Black	alive	79	no	0	1.901	111.8	⋯	1.3000	-0.18884	51	140	142	95	0.00	7484.2336	1	7
male	15	no	Non-Hispanic White	NA	NA	no	0	1.719	65.0	⋯	1.2090	-0.41861	40	132	107	51	1.25	10758.6505	2	1
male	37	no	Non-Hispanic White	alive	82	no	0	1.800	99.2	⋯	1.1990	0.20224	38	156	174	99	4.93	30448.0967	2	6
male	70	no	Mexican American	cardiovascular death	16	no	1	1.577	63.6	⋯	0.8464	1.36872	49	314	130	66	1.07	752.5150	2	13
male	81	no	Non-Hispanic White	alive	85	yes	0	1.662	75.5	⋯	1.0550	0.95469	40	174	136	61	2.67	6626.6640	1	12
female	38	no	Non-Hispanic White	alive	92	yes	0	1.749	81.6	⋯	1.1174	-0.24117	58	199	109	69	4.52	34313.1432	2	11
female	85	no	Non-Hispanic Black	other	62	no	0	1.442	41.5	⋯	0.8510	-0.06773	55	164	139	60	0.38	5872.4952	1	11
male	2	no	Non-Hispanic Black	NA	NA	no	0	0.886	11.4	⋯	NA	NA	NA	NA	NA	NA	0.92	2570.3422	1	5
female	1	no	Non-Hispanic White	NA	NA	no	0	NA	11.1	⋯	NA	NA	NA	NA	NA	NA	2.84	9973.7900	1	8
male	0	no	Mexican American	NA	NA	NA	NA	NA	11.5	⋯	NA	NA	NA	NA	NA	NA	2.40	801.5593	2	13
female	23	yes	Mexican American	alive	86	yes	0	1.589	59.8	⋯	NA	-0.12459	43	145	103	60	3.03	6618.1184	2	2
male	18	no	Mexican American	alive	87	no	0	1.685	112.9	⋯	1.1064	0.49518	34	161	119	78	1.80	1283.5993	2	3
female	13	no	Non-Hispanic Black	NA	NA	no	0	1.622	50.9	⋯	NA	-1.22452	62	151	105	73	0.75	2070.0273	2	2
female	12	no	Non-Hispanic Black	NA	NA	no	0	1.691	75.0	⋯	NA	0.30949	32	160	97	56	5.00	1779.4018	2	4
female	53	no	Non-Hispanic White	alive	72	yes	0	1.642	69.9	⋯	0.9540	-0.46297	105	219	114	71	2.67	24695.7644	2	12
female	42	no	Non-Hispanic White	alive	84	no	0	1.667	104.5	⋯	1.2266	-0.32911	55	159	119	85	1.77	42467.3728	1	5
female	14	no	Mexican American	NA	NA	no	0	1.631	85.6	⋯	NA	-0.99286	49	145	99	61	2.51	690.6990	2	13
male	18	no	Mexican American	alive	85	no	0	1.613	72.2	⋯	1.1034	NA	NA	NA	117	77	0.00	880.8045	2	13
male	18	no	Non-Hispanic White	alive	91	no	0	1.796	62.7	⋯	1.1622	-1.14831	64	151	92	43	3.43	11514.5245	2	1
male	62	no	Non-Hispanic White	other	26	no	1	1.749	113.0	⋯	1.2130	0.60531	49	216	124	71	1.07	16359.4822	2	12
female	7	no	Non-Hispanic Black	NA	NA	NA	NA	1.261	21.7	⋯	NA	NA	72	184	NA	NA	0.00	3608.9188	2	5
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋱	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮
female	40	no	Non-Hispanic White	alive	NA	no	0	NA	NA	⋯	NA	NA	NA	NA	NA	NA	5.00	0.0000	1	36
male	83	no	Non-Hispanic White	alive	26	no	0	1.608	62.6	⋯	1.2500	0.30891	73	237	180	36	0.84	2842.5528	2	41
female	17	no	Mexican American	NA	NA	no	0	1.660	73.4	⋯	1.0670	NA	63	192	116	78	2.43	2269.4901	1	42
male	85	no	Non-Hispanic White	alive	NA	no	0	NA	NA	⋯	NA	0.00000	NA	NA	NA	NA	3.00	0.0000	2	35
female	46	no	Non-Hispanic White	alive	31	yes	0	1.738	57.5	⋯	1.1650	0.80966	28	218	103	72	1.04	16782.0437	1	33
female	33	yes	Other/Multi	alive	47	no	0	1.633	76.9	⋯	NA	-0.75543	72	228	112	63	5.00	2410.5345	1	35
male	0	no	Other/Multi	NA	NA	NA	NA	NA	10.0	⋯	NA	NA	NA	NA	NA	NA	4.47	1518.4104	2	37
female	4	no	Mexican American	NA	NA	no	0	1.080	19.3	⋯	NA	NA	NA	NA	NA	NA	0.80	2726.6750	2	40
male	40	no	Non-Hispanic Black	alive	31	no	0	1.748	76.4	⋯	1.4320	NA	82	232	96	57	1.83	10810.7055	2	32
female	0	no	Mexican American	NA	NA	NA	NA	NA	7.8	⋯	NA	NA	NA	NA	NA	NA	0.64	715.5026	2	43
male	12	no	Non-Hispanic Black	NA	NA	no	0	1.428	31.1	⋯	1.0170	-0.48886	74	187	101	63	0.56	1895.2838	1	30
female	72	no	Non-Hispanic White	alive	27	no	0	1.502	67.5	⋯	1.0110	-0.16663	79	162	140	70	1.18	10217.0484	2	30
female	26	yes	Other/Multi	alive	25	no	0	1.558	53.8	⋯	NA	-0.53363	NA	NA	106	54	0.00	2791.2864	2	39
female	14	no	Mexican American	NA	NA	no	0	1.550	54.3	⋯	1.0890	-0.96314	60	168	118	67	1.01	1133.9264	2	40
male	34	no	Non-Hispanic White	alive	31	no	0	1.844	100.0	⋯	1.1810	-0.20340	61	197	132	71	4.79	36995.3239	2	32
male	13	no	Mexican American	NA	NA	no	0	1.559	61.2	⋯	0.9180	-0.16427	29	120	104	60	1.33	1996.1073	2	42
male	12	no	Non-Hispanic White	NA	NA	no	0	1.619	46.2	⋯	0.8916	NA	NA	NA	94	52	2.70	12648.6416	2	42
female	55	no	Non-Hispanic White	alive	31	no	0	1.648	57.8	⋯	1.0690	-1.22452	73	157	96	67	1.21	15357.9095	2	31
female	19	no	Mexican American	alive	36	no	0	1.531	52.8	⋯	1.1090	-0.41705	43	129	106	62	0.00	1410.3413	1	40
female	54	no	Non-Hispanic White	alive	47	no	0	1.661	71.5	⋯	1.0850	-0.75543	64	208	115	59	5.00	26313.6174	1	35
female	2	no	Non-Hispanic White	NA	NA	no	0	NA	12.5	⋯	NA	NA	NA	NA	NA	NA	1.34	5568.3884	2	34
female	16	no	Mexican American	NA	NA	yes	0	1.498	43.7	⋯	1.0110	-0.50255	52	120	111	69	0.66	2289.3041	1	38
male	23	no	Mexican American	alive	37	yes	0	1.721	77.4	⋯	0.9940	0.84841	44	211	127	69	3.93	9982.5528	1	39
female	0	no	Mexican American	NA	NA	NA	NA	NA	9.5	⋯	NA	NA	NA	NA	NA	NA	1.24	1391.6587	2	39
female	68	no	Mexican American	alive	34	no	0	1.530	68.0	⋯	0.8650	0.51794	41	202	134	NA	2.08	1128.5607	2	40
female	5	no	Non-Hispanic White	NA	NA	no	0	1.098	19.3	⋯	NA	NA	NA	NA	NA	NA	2.96	18259.3185	2	39
female	83	no	Non-Hispanic White	alive	38	no	0	1.624	61.9	⋯	1.0180	0.07722	101	264	140	52	0.00	7025.9255	1	39
male	44	no	Mexican American	alive	37	yes	0	1.734	89.7	⋯	1.0230	1.55613	51	282	149	90	3.93	5832.3181	1	39
male	54	no	Non-Hispanic White	alive	36	no	0	1.835	126.8	⋯	1.1382	0.24084	35	168	114	62	5.00	32734.1635	1	43
male	1	no	Non-Hispanic White	NA	NA	no	0	NA	10.3	⋯	NA	NA	NA	NA	NA	NA	5.00	10280.9366	2	29

SmallNCHS <- sample_n(tbl = NCHS, size = 100)
head(SmallNCHS)

A data.frame: 6 × 31
	sex	age	pregnant	ethnicity	death	followup	smoker	diabetic	height	weight	⋯	bmd	fmhm_other	hdl	chol	bps	bpd	income	pop_weight	psu	stratum
	<fct>	<dbl>	<fct>	<fct>	<chr>	<dbl>	<fct>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	male	15	no	Mexican American	NA	NA	no	0	1.767	68.6	⋯	0.9030	0.24084	41	184	107	67	1.72	1513.735	1	6
2	female	0	no	Non-Hispanic White	NA	NA	NA	NA	NA	6.5	⋯	NA	NA	NA	NA	NA	NA	4.78	3953.781	2	26
3	male	82	no	Non-Hispanic White	cardiovascular death	24	no	1	1.672	85.1	⋯	1.1640	0.71159	NA	NA	135	58	5.00	7257.691	2	37
4	female	13	no	Non-Hispanic Black	NA	NA	no	0	1.553	64.8	⋯	NA	NA	NA	NA	100	33	5.00	2126.649	2	2
5	male	0	no	Mexican American	NA	NA	NA	NA	NA	5.8	⋯	NA	NA	NA	NA	NA	NA	0.00	1613.417	1	3
6	male	72	no	Non-Hispanic White	alive	53	no	0	1.788	101.9	⋯	1.2308	NA	NA	NA	NA	NA	5.00	12045.960	1	14

Motors <- readr::read_csv('https://mdbeckman.github.io/dcSupplement/data/engines.csv')
head(Motors)

Rows: 39 Columns: 9

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Engine
dbl (8): mass, ncylinder, strokes, displacement, bore, stroke, BHP, RPM

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 9
Engine	mass	ncylinder	strokes	displacement	bore	stroke	BHP	RPM
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
Webra Speedy	0.135	1	2	1.8	13.5	12.5	0.45	22000
Motori Cipolla	0.150	1	2	2.5	15.0	14.0	1.00	26000
Webra Speed 20	0.250	1	2	3.4	16.5	16.0	0.78	22000
Webra 40	0.270	1	2	6.5	21.0	19.0	0.96	15500
Webra 61 Blackhead	0.430	1	2	10.0	24.0	22.0	1.55	14000
Webra 6WR	0.490	1	2	10.0	24.0	22.0	2.76	19000

data('CPS85', package = 'mosaicData')
head(CPS85)

A data.frame: 6 × 11
	wage	educ	race	sex	hispanic	south	married	exper	union	age	sector
	<dbl>	<int>	<fct>	<fct>	<fct>	<fct>	<fct>	<int>	<fct>	<int>	<fct>
1	9.0	10	W	M	NH	NS	Married	27	Not	43	const
2	5.5	12	W	M	NH	NS	Married	20	Not	38	sales
3	3.8	12	W	F	NH	NS	Single	4	Not	22	sales
4	10.5	12	W	F	NH	NS	Married	29	Not	47	clerical
5	15.0	12	W	M	NH	NS	Married	40	Union	58	const
6	9.0	16	W	F	NH	NS	Married	27	Not	49	clerical

05 - Intro to data graphics #

head(x = dcData::NCHS, n = 5)

A data.frame: 5 × 31
	sex	age	pregnant	ethnicity	death	followup	smoker	diabetic	height	weight	⋯	bmd	fmhm_other	hdl	chol	bps	bpd	income	pop_weight	psu	stratum
	<fct>	<dbl>	<fct>	<fct>	<chr>	<dbl>	<fct>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	female	2	no	Non-Hispanic Black	NA	NA	no	0	0.916	12.5	⋯	NA	NA	NA	NA	NA	NA	0.86	2970.804	1	5
2	male	77	no	Non-Hispanic White	alive	90	no	0	1.740	75.4	⋯	1.2196	0.12359	54	215	98	56	5.00	10224.133	3	1
3	female	10	no	Non-Hispanic White	NA	NA	no	0	1.366	32.9	⋯	NA	NA	30	129	108	63	1.47	14172.311	2	7
4	male	1	no	Non-Hispanic Black	NA	NA	no	0	NA	13.3	⋯	NA	NA	NA	NA	NA	NA	0.57	3041.593	1	2
5	male	49	no	Non-Hispanic White	alive	74	yes	0	1.783	92.5	⋯	1.0870	1.17688	42	279	122	83	5.00	30657.312	2	8

head(x = dcData::NCI60, n = 5)

A data.frame: 5 × 61
	Probe	BR.MCF7	BR.MDA_MB_231	BR.HS578T	BR.BT_549	BR.T47D	CNS.SF_268	CNS.SF_295	CNS.SF_539	CNS.SNB_19	⋯	PR.PC_3	PR.DU_145	RE.786_0	RE.A498	RE.ACHN	RE.CAKI_1	RE.RXF_393	RE.SN12C	RE.TK_10	RE.UO_31
	<fct>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	AT_D_3	-7.45	-7.51	-7.30	-7.37	-6.15	-7.16	-7.03	-7.44	-7.43	⋯	-6.88	-6.78	-7.25	-7.17	-6.51	-6.66	-6.53	-6.86	-7.03	-6.85
2	AT_D_5	-7.05	-6.62	-6.88	-6.78	-7.18	-7.25	-7.22	-7.37	-7.26	⋯	-6.27	-7.04	-6.68	-6.83	-6.65	-6.61	-6.23	-6.74	-6.85	-6.67
3	AT_D_M	-7.05	-7.29	-7.30	-7.37	-7.61	-6.56	-7.63	-7.44	-7.43	⋯	-6.45	-7.29	-7.25	-7.14	-6.53	-6.93	-6.17	-7.20	-6.93	-6.85
4	AT_L_3	-7.32	-7.01	-7.22	-6.60	-7.45	-7.00	-7.32	-7.27	-7.39	⋯	-6.30	-7.01	-6.73	-6.50	-6.37	-6.54	-7.07	-6.86	-6.55	-6.67
5	AT_L_5	-7.38	-7.22	-7.30	-7.37	-7.70	-7.25	-7.61	-7.44	-7.43	⋯	-6.48	-7.15	-7.06	-6.44	-6.47	-6.62	-7.00	-7.22	-6.89	-6.36

10 - More data verbs #

head(x = dcData::CountryData, n = 5)

A data.frame: 5 × 76
	country	area	pop	growth	birth	death	migr	maternal	infant	life	⋯	mainlines	cell	netHosts	netUsers	airports	railways	roadways	waterways	marine	military
	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	Afghanistan	652230	31822848	2.29	38.84	14.12	-1.83	460	117.23	50.49	⋯	13500	18000000	223	1000000	52	NA	42150	1200	NA	NA
2	Akrotiri	123	15700	NA	NA	NA	NA	NA	NA	NA	⋯	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
3	Albania	28748	3020209	0.30	12.73	6.47	-3.31	27	13.19	77.96	⋯	312000	3500000	15528	1300000	4	339	18000	41	17	1.47
4	Algeria	2381741	38813722	1.88	23.99	4.31	-0.93	97	21.76	76.39	⋯	3200000	37692000	676	4700000	157	3973	113655	NA	38	4.48
5	American Samoa	199	54517	-0.35	22.87	4.68	-21.64	NA	8.92	74.91	⋯	10000	NA	2387	NA	3	NA	241	NA	NA	NA

11 - Joining two data frames #

head(x = dcData::MigrationFlows, n = 3) %>%
  select(destcode, origincode, Y2000)

A data.frame: 3 × 3
	destcode	origincode	Y2000
	<fct>	<fct>	<int>
1	FRA	AFG	923
2	FRA	DZA	425229
3	FRA	AUS	9168

head(x = dcData::CountryData, n = 6) %>%
  select(country, life, infant)

A data.frame: 6 × 3
	country	life	infant
	<chr>	<dbl>	<dbl>
1	Afghanistan	50.49	117.23
2	Akrotiri	NA	NA
3	Albania	77.96	13.19
4	Algeria	76.39	21.76
5	American Samoa	74.91	8.92
6	Andorra	82.65	3.69

head(x = dcData::CountryCentroids, n = 4) %>%
  select(name, iso_a3)

A data.frame: 4 × 2
	name	iso_a3
	<chr>	<chr>
1	Afghanistan	AFG
2	Aland	ALA
3	Albania	ALB
4	Algeria	DZA

# to translate the variable `country` in table `CountryData`
# to the code `iso_a3`,
# join table `CountryCentroids` to table `CountryData`

InfantMortality <-
  CountryCentroids %>%
  select(name, iso_a3) %>%
  left_join(y = CountryData %>% select(country, infant),
            by = c('name' = 'country'))
head(x = InfantMortality, n = 6)

A data.frame: 6 × 3
	name	iso_a3	infant
	<chr>	<chr>	<dbl>
1	Afghanistan	AFG	117.23
2	Aland	ALA	NA
3	Albania	ALB	13.19
4	Algeria	DZA	21.76
5	American Samoa	ASM	8.92
6	Andorra	AND	3.69

# infant mortality of the destination country

MigrationFlows %>%
  left_join(y = InfantMortality, by = c('destcode' = 'iso_a3')) %>%
  head(n = 6) %>%
  select(sex, destcode, origincode, Y2000, name, infant)

A data.frame: 6 × 6
	sex	destcode	origincode	Y2000	name	infant
	<fct>	<chr>	<fct>	<int>	<chr>	<dbl>
1	Male	FRA	AFG	923	France	3.31
2	Male	FRA	DZA	425229	France	3.31
3	Male	FRA	AUS	9168	France	3.31
4	Male	FRA	AUT	7764	France	3.31
5	Male	FRA	AZE	118	France	3.31
6	Male	FRA	BLR	245	France	3.31

# infant mortality of the origin country

MigrationFlows %>%
  left_join(y = InfantMortality, by = c('origincode' = 'iso_a3')) %>%
  head(n = 6) %>%
  select(sex, destcode, origincode, Y2000, name, infant)

A data.frame: 6 × 6
	sex	destcode	origincode	Y2000	name	infant
	<fct>	<fct>	<chr>	<int>	<chr>	<dbl>
1	Male	FRA	AFG	923	Afghanistan	117.23
2	Male	FRA	DZA	425229	Algeria	21.76
3	Male	FRA	AUS	9168	Australia	4.43
4	Male	FRA	AUT	7764	Austria	4.16
5	Male	FRA	AZE	118	Azerbaijan	26.67
6	Male	FRA	BLR	245	Belarus	3.64

head(x = dcData::CountryGroups, n = 5)

A data.frame: 5 × 4
	country	G8	G20	GGG
	<fct>	<lgl>	<lgl>	<lgl>
1	Canada	TRUE	TRUE	FALSE
2	France	TRUE	TRUE	FALSE
3	Germany	TRUE	TRUE	FALSE
4	Italy	TRUE	TRUE	FALSE
5	Japan	TRUE	TRUE	FALSE

G8Countries <-
  CountryGroups %>%
  dplyr::filter(G8) %>%
  select(country)
G8Countries

A data.frame: 8 × 1
country
<fct>
Canada
France
Germany
Italy
Japan
Russia
United Kingdom
United States

G8CountryData <-
  CountryData %>%
    inner_join(y = G8Countries, by = join_by(country))
G8CountryData %>%
  select(country, GDP, pop)

A data.frame: 8 × 3
country	GDP	pop
<chr>	<dbl>	<dbl>
Canada	1.518e+12	34834841
France	2.276e+12	66259012
Germany	3.227e+12	80996685
Italy	1.805e+12	61680122
Japan	4.729e+12	127103388
Russia	2.553e+12	142470272
United Kingdom	2.387e+12	63742977
United States	1.672e+13	318892103

Exercises #

[11.1]#

Most data verbs are one table-in one table-out (1I1O) whereas join data verbs are two table-in one table-out (2I1O).

dcData::BabyNames %>%
  group_by(year) %>%
    summarize(total = sum(count)) %>%
      head()

A tibble: 6 × 2
year	total
<int>	<int>
1880	201484
1881	192700
1882	221537
1883	216952
1884	243468
1885	240856

[11.2]#

The value of the matching variable of row $n$ of the left table does not necessarily match the value of the matching variable of row $n$ of the right table.

[11.3]#

head(x = dcData::ZipGeography, n = 5)

A data.frame: 5 × 13
	State	Population	HousingUnits	LandArea	WaterArea	CityName	PostOfficeName	County	AreaCode	Timezone	Latitude	Longitude	ZIP
	<fct>	<dbl>	<dbl>	<dbl>	<dbl>	<fct>	<fct>	<fct>	<dbl>	<fct>	<dbl>	<dbl>	<chr>
1	New York	0	0	0.1	46.3	Holtsville	Holtsville	Suffolk	631	EST	40.92233	-72.63708	00501
2	New York	0	0	0.0	170.3	Holtsville	Holtsville	Suffolk	631	EST	40.92233	-72.63708	00544
3		0	0	0.0	4.7	Adjuntas	Adjuntas	Adjuntas	787	EST+1	18.16527	-66.72258	00601
4		42042	15590	80.1	0.0	Aguada	Aguada	Aguada	787	EST+1	18.39310	-67.18095	00602
5		55530	21626	78.7	0.1	Aguadilla	Aguadilla	Aguadilla	787	EST+1	18.45591	-67.14578	00603

tail(x = dcData::ZipDemography, n = 5)

A data.frame: 5 × 44
	Totalpopulation	Male	Female	MedianAge	Under5years	X18yearsandover	X65yearsandover	Onerace	White	BlackorAfricanAmerican	⋯	Medianfamilyincomedollars	Percapitaincomedollars	Familiesbelowpovertylevel	Individualsbelowpovertylevel	Singlefamilyowneroccupiedhomes	Medianvaluedollars	Medianofselectedmonthlyownercosts	WithaMortgage	Notmortgaged	ZIP
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>
42737	1447	757	690	30.3	120	964	99	1329	141	4	⋯	49803	16176	26	113	253	104800	NA	648	325	99926
42738	120	69	51	42.2	6	88	13	111	105	0	⋯	29750	11742	8	53	47	80600	NA	750	178	99927
42739	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	⋯	NA	NA	NA	NA	NA	NA	NA	NA	NA	99928
42740	2424	1250	1174	39.6	148	1726	280	2191	1797	3	⋯	52917	21688	53	235	438	127600	NA	1170	405	99929
42741	36	22	14	35.0	2	29	1	36	33	0	⋯	39792	14332	0	0	0	0	NA	0	0	99950

NewYork <-
  ZipGeography %>%
    left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
      filter(State == 'New York') %>%
        select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
          transmute(
            State     = State,
            pop       = coalesce(Population, Totalpopulation),
            age0to4   = Under5years,
            age5to17  = pop - Under5years - X18yearsandover,
            age18to64 = X18yearsandover - X65yearsandover,
            age65plus = X65yearsandover,
            total     = age0to4 + age5to17 + age18to64 + age65plus,
            check1    = total == pop,
          )
head(x = NewYork, n = 5)

A data.frame: 5 × 8
	State	pop	age0to4	age5to17	age18to64	age65plus	total	check1
	<fct>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<lgl>
1	New York	0	NA	NA	NA	NA	NA	NA
2	New York	0	NA	NA	NA	NA	NA	NA
3	New York	289	NA	NA	NA	NA	NA	NA
4	New York	17310	457	1213	13189	2451	17310	TRUE
5	New York	84870	4149	12750	54797	13174	84870	TRUE

PopulationGroups <-
  ZipGeography %>%
    left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
      select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
        transmute(
          State     = State,
          pop       = coalesce(Population, Totalpopulation),
          age0to4   = Under5years,
          age5to17  = pop - Under5years - X18yearsandover,
          age18to64 = X18yearsandover - X65yearsandover,
          age65plus = X65yearsandover,
          total     = age0to4 + age5to17 + age18to64 + age65plus,
          check1    = total == pop,
        ) %>%
          group_by(State) %>%
            summarize(
              total0to4   = sum(age0to4,    na.rm=TRUE),
              total5to17  = sum(age5to17,   na.rm=TRUE),
              total18to64 = sum(age18to64,  na.rm=TRUE),
              total65plus = sum(age65plus,  na.rm=TRUE),
              total       = sum(pop, na.rm=TRUE)
            ) %>%
              mutate(
                diff  = total - (total0to4 + total5to17 + total18to64 + total65plus),
                check = total0to4 + total5to17 + total18to64 + total65plus == total
              )
PopulationGroups

A tibble: 53 × 8
State	total0to4	total5to17	total18to64	total65plus	total	diff	check
<fct>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<lgl>
	0	0	0	0	741752	741752	FALSE
Massachusetts	0	0	0	0	6349048	6349048	FALSE
New Hampshire	0	0	0	0	1235735	1235735	FALSE
New York	1239375	3450540	11837416	2448224	18975844	289	FALSE
Rhode Island	0	0	0	0	1048319	1048319	FALSE
Maine	0	0	0	0	1273094	1273094	FALSE
Vermont	0	0	0	0	608822	608822	FALSE
Connecticut	0	0	0	0	3405565	3405565	FALSE
New Jersey	0	0	0	0	8413990	8413990	FALSE
Pennsylvania	727790	2194356	7439389	1919052	12280587	0	TRUE
Delaware	51529	143043	487241	101712	783525	0	TRUE
District of Columbia	32536	82456	387169	69898	572059	0	TRUE
Maryland	353393	1002779	3341007	599307	5296486	0	TRUE
Virginia	461957	1276186	4547561	792248	7077952	0	TRUE
West Virginia	101791	300500	1128772	276833	1807896	0	TRUE
North Carolina	539506	1424526	5116187	969035	8049254	0	TRUE
Georgia	595087	1573898	5231348	785170	8185503	0	TRUE
South Carolina	264656	744885	2516802	485272	4011615	0	TRUE
Florida	945721	2700255	9526267	2807388	15979631	0	TRUE
Alabama	295936	827248	2743296	579644	4446124	0	TRUE
Mississippi	204273	570596	1725052	343378	2843299	0	TRUE
Tennessee	374219	1022301	3582610	703259	5682389	0	TRUE
Kentucky	266475	730021	2546187	504739	4047422	0	TRUE
Ohio	754924	2133383	6956963	1507732	11353002	0	TRUE
Indiana	423215	1151181	3753258	752831	6080485	0	TRUE
Michigan	672002	1923732	6123512	1218958	9938204	0	TRUE
Iowa	188416	545293	1756618	436232	2926559	0	TRUE
Wisconsin	342340	1026416	3292356	702550	5363662	0	TRUE
Minnesota	329597	957361	3038349	594262	4919569	0	TRUE
North Dakota	39370	121280	386412	94365	641427	0	TRUE
South Dakota	51054	151454	443889	108109	754506	0	TRUE
Illinois	876539	2368834	7673666	1499998	12419037	0	TRUE
Montana	54853	175212	551216	120960	902241	0	TRUE
Missouri	369905	1057798	3412243	755444	5595390	0	TRUE
Kansas	188684	524227	1619075	356199	2688185	0	TRUE
Nebraska	117035	333140	1028660	232159	1710994	0	TRUE
Louisiana	317160	901852	2730442	516492	4465946	0	TRUE
Arkansas	181626	498826	1619048	374034	2673534	0	TRUE
Oklahoma	236042	655289	2099793	455109	3446233	0	TRUE
Texas	1624203	4260543	12888235	2071668	20844649	0	TRUE
Colorado	297494	803274	2784280	416049	4301097	0	TRUE
Arizona	382483	985121	3096256	667891	5131751	0	TRUE
Idaho	97558	270952	777971	145737	1292218	0	TRUE
Utah	209299	508994	1323734	190138	2232165	0	TRUE
Wyoming	30935	97924	306976	57667	493502	0	TRUE
New Mexico	130286	376732	1095624	211732	1814374	0	TRUE
California	2486843	6762415	21019901	3595271	33864430	0	TRUE
Nevada	145790	365847	1264174	218694	1994505	0	TRUE
Hawaii	78161	217601	755139	160588	1211489	0	TRUE
Oregon	222906	623356	2135687	438070	3420019	0	TRUE
Washington	394280	1119436	3717866	662093	5893675	0	TRUE
Alaska	47514	142771	399108	35599	624992	0	TRUE
NA	0	0	0	0	0	0	TRUE

#options(repr.plot.width=20, repr.plot.height=20)

ZipGeography %>%
  filter(State != '') %>%
  left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
    select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
      transmute(
        State     = State,
        ageTotal  = coalesce(Population, Totalpopulation, 0),
        age0to4   = coalesce(Under5years, 0),
        age5to17  = coalesce(ageTotal - Under5years - X18yearsandover, 0),
        age18to64 = coalesce(X18yearsandover - X65yearsandover, 0),
        age65plus = coalesce(X65yearsandover, 0)
      ) %>%
        pivot_longer(c(`age0to4`, `age5to17`, `age18to64`, `age65plus`), names_to = 'ageRange', values_to = 'cases') %>%
          ggplot() +
            geom_bar(
              mapping  = aes(y = State, x = cases, fill = ageRange),
              position = 'stack',
              stat     = 'identity',
              na.rm    = TRUE
            ) +
            theme(
              text=element_text(size=20)
            )

../../../_images/c59b8007fd38a0d9829d6006489bb58725e200cdce12fb51568b770b80248da0.png

15 - Statistics #

Data graphics

translate each case into a representation in terms of a glyph’s graphical attributes: color, position, shape, size, etc.

A representation for the collective properties of cases?

collective properties: features of groups of cases
summary functions: max, mean, median, etc.

Statistics

an area of science concerned with characterizing case-to-case variation and the collective properties of cases, while also quantifying uncertainty

Data Science

a collaboration between statistics and computer science

Confidence Interval

Confidence Band

How to display variation among cases? The distribution of values of one variable? The distribution of values of several variables?

15.2 - Density distribution of one variable #

NCHS %>%
  ggplot(aes(weight, 1)) +
    geom_point()

Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”

../../../_images/06c04ca41601ae0b29d89d4781539f37c6ed1ec346fe18c51ceb22a206008f47.png

# density of cases, represented by the degree of darkness

NCHS %>%
  ggplot(aes(weight, 1)) +
    geom_point(alpha = 0.05, position = 'jitter') +
    xlab('Weight (kg)') +
    ylab('') +
    theme_bw() +
    scale_color_grey()

Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”

../../../_images/41486ce71cbd290e0ff3956c925aebff93c4756d7ba28b17b6353275739b8b76.png

# density of cases, represented by the height of a function
# the scale of the y-axis has units of inverse kilograms
# the scale is arranged so that the area under the curve is equal to unity
# this convention facilitates densities for different groups; and means that narrow distributions tend to have high density

NCHS %>%
  ggplot(aes(weight)) +
    geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
    xlab('Weight (kg)') +
    ylab('Density (1/kg)') +
    theme_bw() +
    scale_color_grey()

Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”

../../../_images/d89ad92ef7c8ced7eb524c4beba1c0c53a826305d887c7fa190aa7eaee3f46a9.png

NCHS %>%
  ggplot(aes(weight)) +
    geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
    geom_point(
      alpha    = 0.02, 
      mapping  = aes(y = 0.002),
      position = position_jitter(height = 0.002)
    ) +
    xlab('Weight (kg)') +
    ylab('Density (1/kg)') +
    theme_bw() +
    scale_color_grey()

Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”

Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”

../../../_images/95080c6bd193973a38c33197595f6b1ec8e87448a91b64d95e07b51a35f2fc13.png

NCHS %>%
  ggplot(aes(x = weight, group = sex)) +
  geom_density(aes(color = sex, fill = sex), alpha = 0.5) +
  xlab('Weight (kg)') +
  ylab('Density (1/kg)') +
  theme(legend.position = 'top') +
  theme_bw() +
  scale_color_grey()

Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”

../../../_images/d48c6f35cffde107e978f7ba79cf9b7264b45258d6c32b80f6905b0ba431adfb.png

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(x = weight, group = sex)) +
      geom_density(aes(color = sex, fill = sex), alpha = 0.75) +
      facet_wrap(~ ageGroup, nrow = 1) +
      xlab('Weight (kg)') +
      ylab('Density (1/kg)') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))

Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”

../../../_images/3c7f7f8549f60ec7ab581712e96ffa184a70112231a8198a61115000f77d74b0.png

15.3 - Other depictions of density distribution #

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        alpha         = 0.25,
        mapping       = aes(color = ageGroup, fill = ageGroup),
        outlier.color = 'gray',
        outlier.size  = 2
      ) +
      facet_wrap(~ sex) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))

Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”

../../../_images/e63a2e611e7f67bc88c3aeab89bc687e717c104d8230439932339a3d40e06fa8.png

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_violin(fill = 'gray') +
      facet_wrap(~ sex) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))

Warning message:
“Removed 2575 rows containing non-finite values (`stat_ydensity()`).”

../../../_images/30aec579a0262062b2d9ca7c1ca04a710c2cdd8afc02a33ad819422be5909e93.png

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        alpha         = 0.75,
        mapping       = aes(fill = diabetic),
        outlier.color = 'gray',
        outlier.size  = 1,
        position = position_dodge(width = 0.8)
      ) +
      facet_grid(sex ~ .) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))

Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”

Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

../../../_images/4cc432eec1f4cf3e4ba4b0ae03ba36310c30c283b5973079980f199775303b34.png

15.4 - Confidence intervals #

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        mapping       = aes(fill = diabetic),
        notch         = TRUE,
        outlier.color = 'gray',
        outlier.size  = 1,
        position      = position_dodge(width = 0.8)
      ) +
      facet_grid(sex ~ .) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))

Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”

Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

Warning message:
“The following aesthetics were dropped during statistical transformation: fill
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

../../../_images/82cc6eb339f0eb5e39e3387dadc8a88d7be24faf294723758fb166fd8b401968.png

15.5 - Model functions #

head(NCHS)

A data.frame: 6 × 31
	sex	age	pregnant	ethnicity	death	followup	smoker	diabetic	height	weight	⋯	bmd	fmhm_other	hdl	chol	bps	bpd	income	pop_weight	psu	stratum
	<fct>	<dbl>	<fct>	<fct>	<chr>	<dbl>	<fct>	<dbl>	<dbl>	<dbl>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	female	2	no	Non-Hispanic Black	NA	NA	no	0	0.916	12.5	⋯	NA	NA	NA	NA	NA	NA	0.86	2970.804	1	5
2	male	77	no	Non-Hispanic White	alive	90	no	0	1.740	75.4	⋯	1.2196	0.12359	54	215	98	56	5.00	10224.133	3	1
3	female	10	no	Non-Hispanic White	NA	NA	no	0	1.366	32.9	⋯	NA	NA	30	129	108	63	1.47	14172.311	2	7
4	male	1	no	Non-Hispanic Black	NA	NA	no	0	NA	13.3	⋯	NA	NA	NA	NA	NA	NA	0.57	3041.593	1	2
5	male	49	no	Non-Hispanic White	alive	74	yes	0	1.783	92.5	⋯	1.0870	1.17688	42	279	122	83	5.00	30657.312	2	8
6	female	19	no	Other/Multi	alive	86	no	0	1.620	59.2	⋯	0.8680	-1.22452	61	153	114	70	1.21	12224.876	2	2

NCHS %>%
  ggplot(aes(x = age, y = weight, color = diabetic)) +
    stat_smooth(se = FALSE) +
    geom_point(alpha = 0.02) +
    ylim(0, 120) +
    theme(legend.position = 'top')

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”

Warning message:
“The following aesthetics were dropped during statistical transformation: colour
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”

../../../_images/702e9ec1feff14a88ddf6d9c0e6aed8f0ea400b47d515281fdec581174e67fa8.png

NCHS %>%
  ggplot(aes(x = age, y = weight, color = diabetic)) +
    stat_smooth() +
    geom_point(alpha = 0.02) +
    ylim(0, 120) +
    theme(legend.position = 'top')

`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”

Warning message:
“The following aesthetics were dropped during statistical transformation: colour
ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”

Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”

../../../_images/e0cbed36632f90209909be99e0c05c444ac03b08afaa7bca580ddd073bbe5e9a.png

NCHS %>%
  ggplot(aes(x = age, y = weight),
    color = diabetic,
    fill  = diabetic
  ) +
  stat_smooth(method = lm) +
  geom_point(alpha = 0.02) +
  ylim(0, 120) +
  theme(legend.position = 'top')

`geom_smooth()` using formula = 'y ~ x'

Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”

Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”

../../../_images/9be91693203ade5cd63a8f8fa4901fad0a0733084b6553b1668d4f232289ec36.png

16 - Data scraping and intake methods #

[Data Scraping]

Data scraping is the gathering of data from sources such as web browsers in which they are not already in a tidy (data frame/table) format and the translation of such data to one or more data frames/tables.

[Data Cleaning]

Data clearning is the correcting of errors in data that stem either from blunders in data entry or from deficiencies in the way data is stored or coded.

Data frame-friendly formats

CSV
- base::read.csv(stringsAsFactors = FALSE)
- data.table::fread()
- mosaic::read.file()
- readr::read_csv()
technical software package
- Octave/MATLAB
- Stata
- SPSS
- Minitab
- SAS
- Epi
relational databases
Excel
web
- HTML <table>
- JSON
- XML
- Google spreadsheet qua HTML
- API

[List]

A list is an R object used to store a collection of other R objects. Elements of a list can even have different types; e.g., data frames, plots, model objects, even other lists.

as.numeric()
as.character()
readr::parse_number()

webURL      <- 'https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv'
myDataTable <- readr::read_csv(webURL)
myDataTable %>%
  select(price, bedrooms, bathrooms, fuel, air_cond, construction) %>%
  head()

Rows: 1728 Columns: 16

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 6
price	bedrooms	bathrooms	fuel	air_cond	construction
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
132500	2	1.0	3	0	0
181115	3	2.5	2	0	0
109000	4	1.0	2	0	0
155000	3	1.5	2	0	0
86060	2	1.0	2	1	1
120000	4	1.0	2	0	0

# RStudio
file_name <- file.choose() # then navigate and click on your file
MyDataTable2 <-
  data.table::fread(
    input  = file_name,
    nrows  = 0,
    select = c(1, 4, 5, 10),
    drop   = c(2, 3, 6)
  ) %>%
  as.data.frame()

web_page <- 'https://en.wikipedia.org/wiki/Mile_run_world_record_progression'

SetOfTables <-
  web_page %>%
    read_html() %>%
    html_nodes(css = 'table') %>%
    html_table(fill = TRUE)

length(SetOfTables)

Table3 <- SetOfTables[[3]]
Table3
Table4 <- SetOfTables[[4]]
Table4

15

A tibble: 5 × 5
Time	Athlete	Nationality	Date	Venue
<chr>	<chr>	<chr>	<chr>	<chr>
4:52	Cadet Marshall	United Kingdom	2 September 1852	Addiscome
4:45	Thomas Finch	United Kingdom	3 November 1858	Oxford
4:45	St. Vincent Hammick	United Kingdom	15 November 1858	Oxford
4:40	Gerald Surman	United Kingdom	24 November 1859	Oxford
4:33	George Farran	United Kingdom	23 May 1862	Dublin

A tibble: 4 × 2
X1	X2
<lgl>	<chr>
NA	Ratified
NA	Not ratified
NA	Ratified but later rescinded
NA	Pending ratification

# Four of the variables from the `houses-for-sale.csv` file giving features of the Saratoga, NY houses stored as integer codes; each case is a different house.
Houses <-
  read_csv('https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv')
Houses %>%
  select(fuel, heat, sewer, construction) %>%
  head()

Rows: 1728 Columns: 16

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 4
fuel	heat	sewer	construction
<dbl>	<dbl>	<dbl>	<dbl>
3	4	2	0
2	3	2	0
2	3	3	0
2	2	2	0
2	2	3	1
2	2	2	0

# Codes for the house system types.
#   describes the codes in a format that makes it easy to add new code values as the need arises
Translations <-
  read_csv('https://mdbeckman.github.io/dcSupplement/data/house_codes.csv')
Translations %>%
  head()

Rows: 13 Columns: 3

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system_type, meaning
dbl (1): code

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 3
code	system_type	meaning
<dbl>	<chr>	<chr>
0	new_const	no
1	new_const	yes
1	sewer_type	none
2	sewer_type	private
3	sewer_type	public
0	central_air	no

# There is a column for each system type that translates the integer code to a meaningful term.
# In cases where the integer has no corresponding term, `invalid` has been entered; this provides a quick way to distinguish between incorrect entries and missing entries.
CodeVals <-
  Translations %>%
    spread(key = system_type, value = meaning, fill = 'invalid')
CodeVals %>%
  head()

A tibble: 5 × 6
code	central_air	fuel_type	heat_type	new_const	sewer_type
<dbl>	<chr>	<chr>	<chr>	<chr>	<chr>
0	no	invalid	invalid	no	invalid
1	yes	invalid	invalid	yes	none
2	invalid	gas	hot air	invalid	private
3	invalid	electric	hot water	invalid	public
4	invalid	oil	electric	invalid	invalid

# To carry out the translation, join each variable, one at a time, to the data frame of interest.
# Note how the `by` value changes for each variable.
Houses <-
  Houses %>%
    left_join(CodeVals %>% select(code, fuel_type),  by = c(fuel  = 'code')) %>%
    left_join(CodeVals %>% select(code, heat_type),  by = c(heat  = 'code')) %>%
    left_join(CodeVals %>% select(code, sewer_type), by = c(sewer = 'code'))
Houses %>% head()

A tibble: 6 × 19
price	lot_size	waterfront	age	land_value	construction	air_cond	fuel	heat	sewer	living_area	pct_college	bedrooms	fireplaces	bathrooms	rooms	fuel_type	heat_type	sewer_type
<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<chr>
132500	0.09	0	42	50000	0	0	3	4	2	906	35	2	1	1.0	5	electric	electric	private
181115	0.92	0	0	22300	0	0	2	3	2	1953	51	3	0	2.5	6	gas	hot water	private
109000	0.19	0	133	7300	0	0	2	3	3	1944	51	4	1	1.0	8	gas	hot water	public
155000	0.41	0	13	18700	0	0	2	2	2	1944	51	3	1	1.5	5	gas	hot air	private
86060	0.11	0	0	15000	1	1	2	2	3	840	51	2	0	1.0	3	gas	hot air	public
120000	0.68	0	31	14000	0	0	2	2	2	1152	22	4	1	1.0	8	gas	hot air	private

OrdwayBirds <-
  OrdwayBirds %>%
    mutate(
      Month = as.numeric(Month),
      Year  = as.numeric(Year),
      Day   = as.numeric(Day)
    )
head(OrdwayBirds)

WhenAndWho <-
  OrdwayBirds %>%
    select(Who = DataEntryPerson, When = Timestamp) %>%
    mutate(When = lubridate::mdy_hms(When))
head(WhenAndWho)

WhenAndWho %>%
  ggplot(aes(x = When, y = Who)) +
    geom_point(alpha = 0.2)

WhenAndWho %>%
  group_by(Who) %>%
  summarize(start  = min(When, na.rm = TRUE),
            finish = max(When, na.rm = TRUE)) %>%
  head()

A data.frame: 6 × 26
	bogus	Timestamp	Year	Day	Month	CaptureTime	SpeciesName	Sex	Age	BandNumber	⋯	Condition	Release	Comments	DataEntryPerson	Weight	WingChord	Temperature	RecaptureOriginal	RecapturePrevious	TailLength
	<chr>	<chr>	<dbl>	<dbl>	<dbl>	<chr>	<chr>	<chr>	<chr>	<chr>	⋯	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
3	.	4/14/2010 13:20:56	1972	16	7	7:00:00	Song Sparrow	U	AHY (After Hatch Year)	107-151187	⋯	none	none	none	Jerald Dosch
4	.		NA	NA	NA						⋯				Caitlin Baker
5	.	5/13/2010 16:00:30	1972	16	7	7:00:00	Song Sparrow	U	AHY (After Hatch Year)	107-151187	⋯	none	none	none	Caitlin Baker
6	.	5/13/2010 16:02:15	1972	16	7	7:00:00	Field Sparrow	U	AHY (After Hatch Year)	1260-74572	⋯	none	none	none	Caitlin Baker
7	.	5/13/2010 16:03:18	1972	16	7	7:00:00	Field Sparrow	U	AHY (After Hatch Year)	1260-74541	⋯	none	none	none	Caitlin Baker
8	.	5/13/2010 16:04:23	1972	16	7	7:00:00	Song Sparrow	U	AHY (After Hatch Year)	107-151188	⋯	none	none	none	Caitlin Baker

A data.frame: 6 × 2
	Who	When
	<chr>	<dttm>
3	Jerald Dosch	2010-04-14 13:20:56
4	Caitlin Baker	NA
5	Caitlin Baker	2010-05-13 16:00:30
6	Caitlin Baker	2010-05-13 16:02:15
7	Caitlin Baker	2010-05-13 16:03:18
8	Caitlin Baker	2010-05-13 16:04:23

Warning message:
“Removed 4 rows containing missing values (`geom_point()`).”

Warning message:
“There were 2 warnings in `summarize()`.
The first warning was:
ℹ In argument: `start = min(When, na.rm = TRUE)`.
ℹ In group 1: `Who = ""`.
Caused by warning in `min.default()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”

A tibble: 6 × 3
Who	start	finish
<chr>	<dttm>	<dttm>
	Inf	-Inf
Abby Colehour	2011-04-23 15:50:24	2011-04-23 15:50:24
Brennan Panzarella	2010-09-13 10:48:12	2011-04-10 21:58:56
Caitlin Baker	2010-05-13 16:00:30	2010-05-28 19:41:52
Emily Merrill	2010-06-08 09:10:01	2010-06-08 14:47:21
Jerald Dosch	2010-04-14 13:20:56	2010-04-14 13:20:56

../../../_images/da2fc2283f03596019963d311c54f11231eb1c53205c6568bdda953177e106bb.png

Exercises #

[16.6.1]#

Here are some character strings containing times or dates written in different formats. Your task it twofold: (A) for each, choose an appropriate function from the lubridate package to translate the character string into a datetime object in R and then (B) use R to calculate the number of days between that date and your birthday.

'April 30, 1777'              # Johann Carl Friedrich Gauss
'06-23-1912'                  # Alan Turing
'3 March 1847'                # Alexander Graham Bell
'Nov. 11th, 1918 at 11:00 am' # the armistice ending WWI on the western front
'July 20, 1969'               # the first manned moon landing

lubridate::mdy('April 30, 1777')
lubridate::mdy('06-23-1912')
lubridate::dmy('3 March 1847')
lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am')
lubridate::mdy('July 20, 1969')

birthday <- lubridate::mdy('February 4, 1992')
birthday

1777-04-30

1912-06-23

1847-03-03

[1] "1918-11-11 11:00:00 UTC"

1969-07-20

1992-02-04

birthday - lubridate::mdy('April 30, 1777')
birthday - lubridate::mdy('06-23-1912')
birthday - lubridate::dmy('3 March 1847')
birthday - lubridate::as_date(lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am'))
birthday - lubridate::mdy('July 20, 1969')

Time difference of 78441 days

Time difference of 29080 days

Time difference of 52933 days

Time difference of 26748 days

Time difference of 8234 days

[16.6.2]#

Here are some strings containing numerical amounts. For each one, say whether as.numeric() or readr::parse_number() (or both or neither) properly converts the given string to a numeric value.

'42,659.30'
'17%'
'Nineteen'
'£100'
'9.8 m/seconds-square'
'6.62606957 x 10^-34 m2 kg / s'
'6.62606957e-34'
'42.659,30' # a European style

as.numeric('42,659.30')                     # incorrect
as.numeric('17%')                           # incorrect
as.numeric('Nineteen')                      # incorrect
as.numeric('£100')                          # incorrect
as.numeric('9.8 m/seconds-square')          # incorrect
as.numeric('6.62606957 x 10^-34 m2 kg / s') # incorrect
as.numeric('6.62606957e-34')                #   correct
as.numeric('42.659,30')                     # incorrect

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

6.62606957e-34

Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”

<NA>

readr::parse_number('42,659.30')                     #   correct
readr::parse_number('17%')                           #   correct
readr::parse_number('Nineteen')                      # incorrect
readr::parse_number('£100')                          #   correct
readr::parse_number('9.8 m/seconds-square')          #   correct
readr::parse_number('6.62606957 x 10^-34 m2 kg / s') # incorrect
readr::parse_number('6.62606957e-34')                #   correct
readr::parse_number('42.659,30')                     # incorrect

42659.3

17

Warning message:
“1 parsing failure.
row col expected   actual
  1  -- a number Nineteen
”

<NA>

100

9.8

6.62606957

6.62606957e-34

42.6593

[16.6.3]#

Grab Table 4 (or another similar table) from the Wikipedia page on world records in the mile (or some similar event). Make a plot of the record time versus the date in which it occurred. Also, mark each point with the name of the athlete written above the point. (Hint: Use geom_text().) To convert time entries such as '4:20.5' into seconds, use the lubridate package’s as.duration(ms('40:20.5')). You can get rid of the footnote markers such as [5] in the dates using the gsub() transformation function which replaces the characters identified in the first argument with those in the second argument. The string '\\[.\\]$' is an example of a regular expression which identifies a pattern of characters, in this case a single character in square brackets just before the end of the string.

Table4 %>%
  mutate(
    Date = lubridate::dmy(gsub('\\[.\\]$', '', Date)),
    Time = lubridate::as.duration(lubridate::ms(Time))
  ) %>%
  ggplot(aes(x = Date, y = Time)) +
    geom_point() +
    geom_text(aes(label = Athlete, hjust = -0.1, vjust = -1)) +
    labs(x = 'Year', y = 'Time (s)')

Error in `mutate()`:
ℹ In argument: `Date = lubridate::dmy(gsub("\\[.\\]$", "", Date))`.
Caused by error in `as.character()`:
! cannot coerce type 'closure' to vector of type 'character'
Traceback:

1. Table4 %>% mutate(Date = lubridate::dmy(gsub("\\[.\\]$", "", 
 .     Date)), Time = lubridate::as.duration(lubridate::ms(Time))) %>% 
 .     ggplot(aes(x = Date, y = Time))
2. ggplot(., aes(x = Date, y = Time))
3. mutate(., Date = lubridate::dmy(gsub("\\[.\\]$", "", Date)), 
 .     Time = lubridate::as.duration(lubridate::ms(Time)))
4. mutate.data.frame(., Date = lubridate::dmy(gsub("\\[.\\]$", "", 
 .     Date)), Time = lubridate::as.duration(lubridate::ms(Time)))
5. mutate_cols(.data, dplyr_quosures(...), by)
6. withCallingHandlers(for (i in seq_along(dots)) {
 .     poke_error_context(dots, i, mask = mask)
 .     context_poke("column", old_current_column)
 .     new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
 . }, error = dplyr_error_handler(dots = dots, mask = mask, bullets = mutate_bullets, 
 .     error_call = error_call, error_class = "dplyr:::mutate_error"), 
 .     warning = dplyr_warning_handler(state = warnings_state, mask = mask, 
 .         error_call = error_call))
7. mutate_col(dots[[i]], data, mask, new_columns)
8. mask$eval_all_mutate(quo)
9. eval()
10. lubridate::dmy(gsub("\\[.\\]$", "", Date))
11. .parse_xxx(..., orders = "dmy", quiet = quiet, tz = tz, locale = locale, 
  .     truncated = truncated)
12. unlist(lapply(list(...), .num_to_date), use.names = FALSE)
13. lapply(list(...), .num_to_date)
14. gsub("\\[.\\]$", "", Date)
15. .handleSimpleError(function (cnd) 
  . {
  .     local_error_context(dots, i = frame[[i_sym]], mask = mask)
  .     if (inherits(cnd, "dplyr:::internal_error")) {
  .         parent <- error_cnd(message = bullets(cnd))
  .     }
  .     else {
  .         parent <- cnd
  .     }
  .     message <- c(cnd_bullet_header(action), i = if (has_active_group_context(mask)) cnd_bullet_cur_group_label())
  .     abort(message, class = error_class, parent = parent, call = error_call)
  . }, "cannot coerce type 'closure' to vector of type 'character'", 
  .     base::quote(as.character(x)))
16. h(simpleError(msg, call))
17. abort(message, class = error_class, parent = parent, call = error_call)
18. signal_abort(cnd, .file)

Project: Popular Names #

Project: Popular Names from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

?dcData::BabyNames

BabyNames                package:dcData                R Documentation

_N_a_m_e_s _o_f _c_h_i_l_d_r_e_n _a_s _r_e_c_o_r_d_e_d _b_y _t_h_e _U_S _S_o_c_i_a_l _S_e_c_u_r_i_t_y _A_d_m_i_n_i_s_t_r_a_t_i_o_n.

_D_e_s_c_r_i_p_t_i_o_n:

     The US Social Security Administration provides yearly lists of
     names given to babies. These data combine the yearly lists.

     `BabyNames` is the raw data from the SSA. The case is a
     year-name-sex, for example: Jane F 1922. The count is the number
     of children of that sex given that name in that year. Names
     assigned to fewer than five children of one sex in any year are
     not listed, presumably out of privacy concerns.

_U_s_a_g_e:

     data("BabyNames")
     
_F_o_r_m_a_t:

     A data frame with 1,792,091 entries on four variables:

     name The given name (character string)

     sex F or M (character string)

     count The number of babies given that name and of that sex.
          (integer)

     year Year of birth (integer)

_S_o_u_r_c_e:

     The data were compiled from the US Social Security Administration
     web site: http://www.ssa.gov/oact/babynames/names.zip

_E_x_a_m_p_l_e_s:

     data(BabyNames)
     str(BabyNames)
     

head(x = dcData::BabyNames)

A data.frame: 6 × 4
	name	sex	count	year
	<chr>	<chr>	<int>	<int>
1	Mary	F	7065	1880
2	Anna	F	2604	1880
3	Emma	F	2003	1880
4	Elizabeth	F	1939	1880
5	Minnie	F	1746	1880
6	Margaret	F	1578	1880

count(x = dcData::BabyNames, sex)

A data.frame: 2 × 2
sex	n
<chr>	<int>
F	1062432
M	729659

count(x = dcData::BabyNames, year) %>%
  arrange(desc(n)) %>%
    head(n = 10)

A data.frame: 10 × 2
	year	n
	<int>	<int>
1	2008	35038
2	2007	34925
3	2009	34668
4	2006	34066
5	2010	34027
6	2011	33858
7	2012	33658
8	2013	33072
9	2005	32527
10	2004	32033

select(.data = dcData::BabyNames, name, count) %>%
  arrange(desc(count)) %>%
    head(n = 10)

A data.frame: 10 × 2
	name	count
	<chr>	<int>
1	Linda	99674
2	Linda	96210
3	James	94758
4	Michael	92711
5	Robert	91652
6	Linda	90994
7	Michael	90629
8	Michael	90512
9	James	88610
10	Michael	88481

select(.data = dcData::BabyNames, name, count) %>%
  arrange(desc(count)) %>%
    filter(count == 5)

A data.frame: 249909 × 2
name	count
<chr>	<int>
Adelle	5
Adina	5
Adrienne	5
Albertine	5
Alys	5
Ana	5
Araminta	5
Arthur	5
Birtha	5
Bulah	5
Caddie	5
Celie	5
Charlotta	5
Clair	5
Concepcion	5
Cordella	5
Corrine	5
Delila	5
Delphine	5
Dosha	5
Edgar	5
Elaine	5
Elisa	5
Ellar	5
Elmire	5
Elvina	5
Ena	5
Estie	5
Etter	5
Fronnie	5
⋮	⋮
Zayceon	5
Zayid	5
Zaylynn	5
Zecheriah	5
Zedric	5
Zefram	5
Zekhi	5
Zenith	5
Zennon	5
Zepplin	5
Zevon	5
Zhaiden	5
Zhen	5
Zhian	5
Zien	5
Zierre	5
Zimri	5
Ziquan	5
Ziyaad	5
Ziyang	5
Zmari	5
Zolan	5
Zurich	5
Zyeer	5
Zyere	5
Zyhier	5
Zylar	5
Zymari	5
Zymeer	5
Zyree	5

nonGlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Abraham', 'Franklin', 'Woodrow'))
head(x = nonGlyphReadyForm)

nonGlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = count,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

A data.frame: 6 × 4
	name	sex	count	year
	<chr>	<chr>	<int>	<int>
1	Franklin	M	120	1880
2	Abraham	M	81	1880
3	Franklin	M	117	1881
4	Abraham	M	86	1881
5	Franklin	M	112	1882
6	Abraham	M	91	1882

../../../_images/80e5b8939dba20c76791d97f39f8d25dcbf53ed1111ad66266fee13cb8249179.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Abraham', 'Franklin', 'Woodrow')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Abraham	81
1880	Franklin	120
1881	Abraham	86
1881	Franklin	117
1882	Abraham	91
1882	Franklin	112

../../../_images/37fa182a61a11d560f2f19d31acbabe6a9fd738e8c0519a5f926841042c1e882.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Dave', 'Davey', 'David')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Dave	131
1880	David	869
1881	Dave	135
1881	David	750
1882	Dave	144
1882	David	838

../../../_images/249e0edb373fa317bf3406d0646ed81fd1ac0a2a9066e56d3c9bdacc009627ad.png

dcData::BabyNames %>%
  filter(name %in% c('Davey'))

A data.frame: 94 × 4
name	sex	count	year
<chr>	<chr>	<int>	<int>
Davey	M	5	1925
Davey	M	9	1928
Davey	M	89	1929
Davey	F	5	1930
Davey	M	54	1930
Davey	M	45	1931
Davey	M	26	1932
Davey	M	15	1933
Davey	M	22	1934
Davey	M	17	1935
Davey	M	9	1936
Davey	M	15	1937
Davey	M	14	1938
Davey	M	16	1939
Davey	M	8	1940
Davey	M	17	1941
Davey	M	18	1942
Davey	F	5	1943
Davey	M	19	1943
Davey	M	10	1944
Davey	M	15	1945
Davey	M	25	1946
Davey	M	37	1947
Davey	M	42	1948
Davey	M	50	1949
Davey	M	41	1950
Davey	M	39	1951
Davey	M	44	1952
Davey	M	74	1953
Davey	M	55	1954
⋮	⋮	⋮	⋮
Davey	M	29	1988
Davey	M	33	1989
Davey	M	20	1990
Davey	M	26	1991
Davey	M	26	1992
Davey	M	72	1993
Davey	F	5	1994
Davey	M	75	1994
Davey	M	34	1995
Davey	M	31	1996
Davey	M	34	1997
Davey	M	25	1998
Davey	M	28	1999
Davey	F	6	2000
Davey	M	14	2000
Davey	M	20	2001
Davey	M	29	2002
Davey	M	14	2003
Davey	M	14	2004
Davey	M	22	2005
Davey	F	5	2006
Davey	M	18	2006
Davey	M	25	2007
Davey	M	14	2008
Davey	M	14	2009
Davey	M	26	2010
Davey	M	19	2011
Davey	M	17	2012
Davey	F	7	2013
Davey	M	18	2013

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Davey')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1925	Davey	5
1928	Davey	9
1929	Davey	89
1930	Davey	59
1931	Davey	45
1932	Davey	26

../../../_images/334d92dd8a52adbf6816f0a86d853d44f39ade49d75d5b704fb0dbdb5235cda7.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Carol', 'Carole', 'Caroline', 'Carolyn', 'Carrie')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Carol	7
1880	Caroline	306
1880	Carolyn	80
1880	Carrie	957
1881	Carol	11
1881	Caroline	316

../../../_images/110c65292503f8d1fe62ef8c9a87e76d5458fd877b2351b5c20e6483d5fea8c0.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Jeff', 'Jeffrey', 'Geof', 'Geoff', 'Geoffrey')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Jeff	66
1881	Jeff	61
1882	Jeff	81
1883	Jeff	44
1884	Jeff	74
1885	Jeff	63

../../../_images/f9a0617799d0bcaf39e4e6e5c1a59ef918cd05978633f28de224b7a559c8a543.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c('Geof', 'Geoff')) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1940	Geoff	5
1942	Geoff	7
1943	Geoff	5
1944	Geoff	8
1945	Geoff	8
1946	Geoff	7

../../../_images/e20031c43f908fcf9c1d4d93e88dcc23e9e8bc63d3f99ed0ad40932c20247b77.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c(
    'Elizabeth',
    'Elisa',
    'Elise',
    'Eliza',
    'Ellie',
    'Elsa',
    'Isabelle',
    'Lisa',
    'Liz',
    'Liza',
    'Lizzie'
    )) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Elisa	5
1880	Elise	24
1880	Eliza	258
1880	Elizabeth	1948
1880	Ellie	17
1880	Elsa	20

../../../_images/b11023dce7a24ab6639654a0fbb298a007fbacfaa63328842bd5b3a392a3db37.png

GlyphReadyForm <- dcData::BabyNames %>%
  filter(name %in% c(
    'Bethany',
    'Bessie',
    'Betsey',
    'Betta',
    'Betty'
    )) %>%
    group_by(year, name) %>%
      summarize(total = sum(count))
head(x = GlyphReadyForm)

GlyphReadyForm %>%
  ggplot() +
    geom_line(
      alpha     = 0.5,
      linewidth = 1,
      mapping   = aes(x        = year,
                      y        = total,
                      color    = name,
                      linetype = name),
      position  = 'identity',
      stat      = 'identity',
      show.legend=TRUE
    ) +
    xlab('Year') +
    ylab('Popularity') +
    ggtitle('Popularity of Names Over Time')

`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

A grouped_df: 6 × 3
year	name	total
<int>	<chr>	<int>
1880	Bessie	796
1880	Betty	117
1881	Bessie	911
1881	Betty	112
1882	Bessie	1153
1882	Betty	123

../../../_images/c3be1ec693793122671831b0e0d1a4c15e4d1b25cb0732f2c2b2a8b92317ea27.png

Project: Bicycle Sharing #

Project: Bicycle Sharing from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

# information about the location of each of the stations in the system
station_url <- 'https://mdbeckman.github.io/dcSupplement/data/DC-Stations.csv'
Stations    <- readr::read_csv(station_url)
# the rental history from 2014 Q4
#   the `Trips` data table is a random subset of 10,000 trips from the full quarterly data
#   the full data set of more than 600,000 trips can be accessed by removing `-Small` from the url
trip_url    <- 'https://mdbeckman.github.io/dcSupplement/data/Trips-History-Data-2014-Q4-Small.rds'
Trips       <- readRDS(gzcon(url(trip_url)))

source('https://mdbeckman.github.io/dcSupplement/R/haversine.R')

Rows: 347 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (4): lat, long, nbBikes, nbEmptyDocks

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Stations <-
#   Stations %>%
#     select(lat, long)
# Stations %>%
#   head()

# Trips <-
#   Trips %>%
#     select(sstation, estation, client, sdate, edate)
# Trips %>%
#   head()

# the distribution of times that bikes were checked out
Trips %>%
  ggplot(aes(x = sdate)) +
    geom_density(fill = 'gray', color = NA)

../../../_images/86889dae0003983d59f37a462d0aaa564cb7e10ce06ee705585a34c1eec87491.png

How long?#

Make a box-and-whisker plot showing the distribution of the duration of rental events broken down by client. The duration of the rental can be calculated as as.numeric(edate - sdate). The units will be in either hours, minutes, or seconds. It should not be much trouble for you to figure out which one. When you make your plot, you will likely find that the axis range is being set by a few outliers. These may be bikes that were forgotten. Arrange your scale to ignore these outliers, or filter them out.

Trips %>%
  mutate(duration = as.numeric(edate - sdate)) %>%
  filter(duration <= 3600) %>%
  ggplot() +
    geom_boxplot(
      mapping = aes(x = client, y = duration / 60)
    ) +
    labs(x = 'Type of Client', y = 'Trip Duration [min]')

../../../_images/9347d63885270a62df7d10cb7217044d9107ecd2a6a89cbacb1794d6ab071f93.png

When are bikes used?#

[1] Variable Trips$sdate indicates the datetime that the bicycle was checked out of the station. Make histograms or density plots of each of these discrete components. Explain what each plot is showing about how bikes are checked out. For example, the distribution by hour of the day shows that few bikes are checked out before 5am, and that there are busy times around rush hour: 8am and 5pm.

Day of the year: bike rentals are highest in October; decline in November; and pick up again in the first half of December.
Day of the week: bike rentals are fairly stable over the course of the week; Thursday and Friday see the most rentals, and Saturday and Sunday see the least. Monday sees a little bit more than Tuesday and Wednesday.
Hour of the day: bike rentals are low before 5:00am; spike up during morning rush hour; decrease a bit throughout the day; spike back up during evening rush hour; and rapidly decline to early morning levels into the night.
Minute of the hour: I’m not sure there’s much of a trend along this scale, other than that the numbers of rentals oscillate a bit (i.e., it is not fixed).

[2] A similar sort of display of events per hour can be accomplished by calculating and displaying each hour’s count. The graphic shows a lot of variation of bike use over the course of the day. Now consider two additional variables: the day of the week and the client type. Group the bike rentals by three variables: hour of the day, day of the week, and client type. Find the total number of events in each grouping and plot this count versus hour. Use the group aesthetic to represent one of the other variables and faceting to represent the other. (Hint: utilize facet_wrap() in the plotting commands.) Make the same sort of display of how bike rental vary of hour, day of the week, and client type, but use geom_density() rather than grouping and counting. Compare the two displays–one of discrete counts and one of density–and describe any major differences.

Generally, registered clients rent more bikes than casual clients across hour of the day and across day of the week. On an hourly basis, however, casual client rentals usually reach a maximum in the afternoon, when registered client rentals dip in between morning and evening rush hour. Also, casual client rentals peak over the weekend, when registered client rentals drop the most.

Trips$sdate[1]
lubridate::yday(Trips$sdate)[1] # day of the year from 1 to 365
lubridate::wday(Trips$sdate)[1] # day of the week from Sunday to Saturday
lubridate::hour(Trips$sdate)[1] # hour of the day
lubridate::minute(Trips$sdate)[1] # minute of the hour

[1] "2014-10-15 08:58:00 UTC"

288

4

8

58

Trips %>%
  ggplot(aes(x = lubridate::yday(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Day of the Year')

../../../_images/04756986fc22cb17998db839f9c171ee6d44fda3311a326799d4a336d8872e18.png

Trips %>%
  ggplot(aes(x = lubridate::wday(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Day of the Week') +
    scale_x_continuous(breaks = 0:8)

../../../_images/b32b4203a0a59bff834f80110fe3eee8aeffebc27d13d5200d585cd1c6a3ae51.png

Trips %>%
  ggplot(aes(x = lubridate::hour(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Hour of the Day') +
    scale_x_continuous(breaks = 0:24)

../../../_images/abc0534248a19b4e2753be734e5c1ab24dd0865cc2e0ef1399989d03a3b5ff7b.png

Trips %>%
  ggplot(aes(x = lubridate::minute(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Minute of the Hour') +
    scale_x_continuous(breaks = 0:60)

../../../_images/53a72525b478bbebbc620527772174573b492da9790110d1bae47f403eeff942.png

Trips %>%
  mutate(
    HOD = lubridate::hour(sdate),
    DOW = lubridate::wday(sdate),
  ) %>%
  group_by(HOD, DOW, client) %>%
  summarize(total = n()) %>%
  head()

`summarise()` has grouped output by 'HOD', 'DOW'. You can override using the
`.groups` argument.

A grouped_df: 6 × 4
HOD	DOW	client	total
<int>	<dbl>	<chr>	<int>
0	1	Casual	1
0	1	Registered	30
0	2	Registered	5
0	3	Casual	2
0	3	Registered	8
0	4	Registered	7

options(repr.plot.width = 20)

Trips %>%
  mutate(
    HOD = lubridate::hour(sdate),
    DOW = lubridate::wday(sdate),
  ) %>%
  group_by(HOD, DOW, client) %>%
  summarize(total = n()) %>%
  ggplot(mapping = aes(x = HOD, y = total)) +
    geom_line(mapping = aes(color = client)) + # the `group` aesthetic is unnecessary when using the `color` aesthetic
    geom_point() +
    facet_wrap(. ~ DOW, nrow = 1) +
    xlab('H') +
    scale_x_continuous(breaks = 0:23)

options(repr.plot.width =  7)

`summarise()` has grouped output by 'HOD', 'DOW'. You can override using the
`.groups` argument.

../../../_images/31e2b7bdbaf2a9acd089a8aa52e9b86e792275cae5405954b5cf226931e109d5.png

How far?#

Find the distance between each pair of stations. You know the position from the lat and long variables in Stations. This is enough information to find the distance. The calculation has been implemented in the haversine() function. haversine() is a transformation function. To use it, create a data table where a case is a pair of stations and there are variables for the latitude and longitude of the starting station and the ending station. To do this, join the Station data to itself. The statements Simple and Simple2 show how to create appropriately named variables for joining. Look the head() of Simple and Simple2 and make sure you understand how they are related to Stations. The joining of Simple and Simple2 should match every station to every other station. (Since a ride can start and end at the same station, it also makes sense to match each station to itself.) This sort of matching does not make use of any matching variables; everything is matched to everything else. This is called a full outer join. (A full outer join matches every case in the left table to each and every case in the right table.) Make sure you understand what the full outer join does before proceeding. For instance, you should be able to predict how many cases the output will have when the left input has n cases and the right has m cases.

[1] There are 347 cases in the Stations data table. How many cases will there be in a full outer join of Simple to Simple2?

$120409 = 347 \times 347$

It’s often impractical to carry out a full outer join. For example, joining BabyNames to itself with a full outer join will generate a result with more than three trillion cases. (Three trillion cases from the BabyNames data is the equivalent of about 5 million hours of MP3 compressed music. A typical human lifespan is about 0.6 million hours.) Perform the full outer join and then use haversine() to compute the distance between each pair of stations. Check your result for sensibility. Make a histogram of the station-to-station distances and explain where it looks like what you would expect. (Hint: you could use the Internet to look up the distance from one end of Washington, D.C. to the other.) Once you have PairDistances, you can join it with Trips to calculate the start-to-end distance of each trip. (Of course, a rider may not go directly from one station to another.)

[2] Look at the variables in Stations and Trips and explain why Simple and Simple2 were given different variable names for the station.

Simple and Simple2 were given different variable names for the station because we want start station-end station pairs, which serves as the composite key on which we join the resulting table to the Trips table. If we don’t explicitly change the name of the name variable, then we won’t have a composite key on which to join the resulting table to the Trips table.

An inner_join() is appropriate for finding the distance of each ride. (Watch out! The Trips data and the PairDistances data are large enough that the join is expensive: it takes about a minute.) Display the distribution of the ride distances of the ride.

[3] Compare it to the distances between pairs of stations. Are they similar? Why or why not?

The visualization shows that the pairwise distance between stations has a larger spread than trip distance, which is skewed to the right (i.e., most trips are short, whereas a fair number of station-station distances are quite large: these routes are probably not used very often).

Simple <-
  Stations %>%
    select(name, lat, long) %>%
    rename(sstation = name)
Simple2 <-
  Simple %>%
    rename(estation = sstation, lat2 = lat, long2 = long)
StationPairs <-
  merge(Simple, Simple2, by = NULL)
PairDistances <-
  StationPairs %>%
    mutate(distance = haversine(lat, long, lat2, long2)) %>%
    select(sstation, estation, distance)

PairDistances %>%
  ggplot() +
    geom_density(aes(x = distance))

../../../_images/395d7d39ee42f66906df24fa14f93e5d64bb01cf6b0f391873dece38bc85ae14.png

Trips %>%
  inner_join(PairDistances, by = c('sstation', 'estation')) %>%
  ggplot(aes(x = distance)) +
    geom_density()

../../../_images/ccd9a9e21adf1880bfacd3ebf8cd04282172d9673f484e5ffa897503988b658a.png

Trips %>%
  inner_join(PairDistances, by = c('sstation', 'estation')) %>%
  ggplot(aes(x = distance)) +
    geom_density(
      color = 'blue'
    ) +
    geom_density(
      alpha = 0.2,
      color = NA,
      data  = PairDistances,
      fill  = 'red'
    ) +
    xlab('Trip Distance (km)')

../../../_images/5e04897419fc4e58d288084565f46b1a0c02c48c84d25e93f900fe0d82269a1d.png

Mapping the stations #

You can draw detailed maps with the leaflet package. You may need to install it. leaflet works much like ggplot() but provides special facilities for maps.

stationMap <-
  leaflet(Stations) %>%
    addTiles() %>%
    addCircleMarkers(radius = 2, color = 'red')
    #setView(-77.04, 38.9, zoom = 12)
stationMap

Assuming "long" and "lat" are longitude and latitude, respectively

Long-distance stations #

Around each station on the map, draw a circle whose radius reflects the median distance covered by rentals starting at that station. To draw the circles, use the same leaflet commands as before, but add in a line like this: addCircles(radius = ~ mid, color = "blue", opacity = 0.0001). For addCircles() to draw circles at the right scale, the units of the median distance should be presented in meters rather than kilometers. This will create too much overlap, unfortunately. So, set the radius to be half or one-third the median distance in meters. From your map, explain the pattern you see in the relationship between station location and median distance.

Project: Bird Species #

Project: Bird Species from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

?dcData::OrdwayBirds

OrdwayBirds               package:dcData               R Documentation

_B_i_r_d_s _c_a_p_t_u_r_e_d _a_n_d _r_e_l_e_a_s_e_d _a_t _O_r_d_w_a_y, _c_o_m_p_l_e_t_e _a_n_d _u_n_c_l_e_a_n_e_d

_D_e_s_c_r_i_p_t_i_o_n:

     The historical record of birds captured and released at the
     Katharine Ordway Natural History Study Area, a 278-acre preserve
     in Inver Grove Heights, Minnesota, owned and managed by Macalester
     College.

_U_s_a_g_e:

     data("OrdwayBirds")
     
_F_o_r_m_a_t:

     A data frame with 15,829 observations on the following 26
     variables:

     bogus

     Timestamp indicates when the data were entered into an electronic
          record, not anything about the bird being described

     Year year of capture

     Day day of capture

     Month month of capture

     CaptureTime time of capture

     SpeciesName

     Sex

     Age

     BandNumber

     TrapID

     Weather

     BandingReport

     RecaptureYN

     RecaptureMonth

     RecaptureDay

     Condition

     Release

     Comments

     DataEntryPerson

     Weight

     WingChord

     Temperature

     RecaptureOriginal

     RecapturePrevious

     TailLength

_D_e_t_a_i_l_s:

     There are many extraneous levels of variables such as species.
     Part of the purpose of this data set is to teach about data
     cleaning.

_S_o_u_r_c_e:

     Jerald Dosch, Dept. of Biology, Macalester College: the manager of
     the Study Area.

_S_e_e _A_l_s_o:

     `OrdwaySpeciesNames`

_E_x_a_m_p_l_e_s:

     data(OrdwayBirds)
     str(OrdwayBirds)

?dcData::OrdwaySpeciesNames

OrdwaySpeciesNames           package:dcData            R Documentation

_C_o_r_r_e_c_t_e_d _S_p_e_c_i_e_s _N_a_m_e_s _f_o_r _t_h_e _O_r_d_w_a_y _B_i_r_d_s

_D_e_s_c_r_i_p_t_i_o_n:

     This data frame lists all the species name that appear in
     `OrdwayBirds`. In many cases, the species name was mis-spelled in
     the original. As a result, many birds are listed as separate
     species even though, in reality, they all belong to the same
     species. For each potentially mis-spelled species name, this table
     gives a standardized name.

_U_s_a_g_e:

     data("OrdwaySpeciesNames")
     
_F_o_r_m_a_t:

     A data frame with 265 observations on the following 2 variables:

     SpeciesName The original spelling, or misspelling, of a bird
          species.

     SpeciesNameCleaned Corrected spelling (or NA if the original was
          not identifiable.)

_S_o_u_r_c_e:

     Daniel Kaplan and students in a 2013 Data and Computing
     Fundamentals class at Macalester College (Saint Paul, MN) read
     through original names in `OrdwayBirds` and typed corrected
     spelling shown in `SpeciesNameCleaned`.

_S_e_e _A_l_s_o:

     `OrdwayBirds`

_E_x_a_m_p_l_e_s:

     data(OrdwaySpeciesNames)
     str(OrdwaySpeciesNames)
     

OrdwayBirds %>%
  select(Month, Day) %>%
  head()

A data.frame: 6 × 2
	Month	Day
	<dbl>	<dbl>
3	7	16
4	NA	NA
5	7	16
6	7	16
7	7	16
8	7	16

OrdwayBirds <-
  OrdwayBirds %>%
  select(SpeciesName, Month, Day) %>%
  mutate(
    Month = as.numeric(as.character(Month)),
    Day   = as.numeric(as.character(Day))
  )
OrdwayBirds %>%
  head()

A data.frame: 6 × 3
	SpeciesName	Month	Day
	<chr>	<dbl>	<dbl>
3	Song Sparrow	7	16
4		NA	NA
5	Song Sparrow	7	16
6	Field Sparrow	7	16
7	Field Sparrow	7	16
8	Song Sparrow	7	16

OrdwaySpeciesNames %>%
  filter(is.na(SpeciesNameCleaned))

A data.frame: 50 × 2
SpeciesName	SpeciesNameCleaned
<chr>	<chr>
	NA
-lost-	NA
-missing-	NA
[Nothing, just dashes]	NA
13:00:00	NA
Bank Swallow	NA
Barn Swallow	NA
Bay-breasted Warbler	NA
Blackpoll Warbler	NA
Blue Jay	NA
Blue-headed Vireo	NA
Blue-winged Warbler	NA
Bluebird	NA
Boreal Chickadee	NA
Brewer's Sparrow	NA
Brown Creeper	NA
Brown Thrasher	NA
Brown Towhee	NA
Cactus Wren	NA
Common Crow	NA
Common Grackle	NA
Common Nighthawk	NA
Common Redpoll	NA
Common Yellowthroat	NA
Connecticut Warbler	NA
Downy Woodpecker	NA
E Bluebird	NA
Eastern Bluebird	NA
Eastern Kingbird	NA
Eastern Meadowlark	NA
Eastern Robin	NA
Flicker	NA
Fox Sparrow	NA
Goldfinch	NA
Grackle	NA
Green Heron	NA
Ground Dove	NA
Hairy Woodpecker	NA
Hermit Thrush	NA
Horned Lark	NA
House Finch	NA
House Sparrow	NA
Inca Dove	NA
Indigo Bunting	NA
Killdeer	NA
Kingbird	NA
Kiskadee F.C.	NA
Magnolia Warbler	NA
Mockingbird	NA
Rough-winged Swallow	NA

Task 1 #

[1] Including misspellings, how many different species are there in the OrdwayBirds data?

There are 275 unique values of the variable SpeciesName. This reduces to 268 after dropping the following invalid values:

''
'-lost-'
'-missing-'
'13:00:00'
'[Nothing, just dashes]'
'lost'
'none'

[2] Consider the OrdwaySpeciesNames data frame also found in the dcData package as well. How many distinct species are there in the SpeciesNameCleaned variable in OrdwaySpeciesNames? You will find it helpful to use n_distinct() a reduction function, which counts the number of unique values in a variable.

There are 108 unique values of the variable SpeciesNameCleaned after accounting for the value NA.

OrdwayBirds %>%
  count(SpeciesName)

A data.frame: 275 × 2
SpeciesName	n
<chr>	<int>
	4
-lost-	1
-missing-	1
13:00:00	1
Acadian Flycatcher	1
American Gold Finch	50
American Goldfinch	1153
American Golf Finch	1
American Redfinch	1
American Redstart	3
American Robin	4
Arkansas Kingbird	1
Baltimore Oriole	206
Bank Swallow	21
Barn Swallow	23
Batimore Oriole	1
Bay-breasted Warbler	2
Blac-capped Chickadee	1
Black and White Warbler	9
Black-Capped Chickadee	13
Black-and-white Warbler	1
Black-billed Cookoo	1
Black-billed Cuckoo	15
Black-capeed Chickadee	1
Black-capped Chicakdee	1
Black-capped Chickadee	1110
Black-capped Chikadee	1
Black-capped chickadee	187
Black-throat Sparrow	31
Black-throat-Sparrow	1
⋮	⋮
White-breast Nuthatch	23
White-breasted Nuthatch	236
White-crown Sparrow	17
White-crowned Sparrow	78
White-eyed Vireo	1
White-thorat Sparrow	1
White-throat Sparrow	86
White-throated Sparrow	229
White-winged Junco	2
Wht-brstd Nuthatch	1
Wilson Warbler	4
Wilson's Warbler	22
Winter Wren	1
Wood Pewee	37
Wood Thrush	3
Woodcock	1
Wren	2
Yellow Flicker	1
Yellow Shafted Flicker	4
Yellow Warbler	19
Yellow-bellied Flycatcher	7
Yellow-bellied Sapsucker	3
Yellow-shaft Flicker	6
Yellow-shafted Flicker	34
Yellow-shafted flicker	6
Yellow-tailed Oriole	1
Yellowthroat	107
[Nothing, just dashes]	1
lost	1
none	2

OrdwayBirds %>%
  select(SpeciesName) %>%
    n_distinct()

275

OrdwaySpeciesNames %>%
  count(SpeciesNameCleaned)

A data.frame: 109 × 2
SpeciesNameCleaned	n
<chr>	<int>
Acadian Flycatcher	1
American Goldfinch	3
American Redfinch	1
American Redstart	1
American Robin	1
Arkansas Kingbird	1
Baltimore Oriole	3
Black and White Warbler	2
Black-billed Cookoo	2
Black-capped Chickadee	8
Black-throat Sparrow	2
Brown-headed Cowbird	2
Cardinal	2
Carolina Chickadee	1
Catbird	4
Cedar Waxwing	2
Chestnut-backed Chickadee	1
Chestnut-sided Warbler	1
Chickadee	2
Chipping Sparrow	4
Clay-colored Sparrow	3
Cowbird	1
Curve-billed Thrasher	2
Eastern Phoebe	2
Eastern Wood Pewee	2
Field Sparrow	2
Golden-Crowned Kinglet	3
Gray - cheeked Thrush	4
Great Crested Flycatcher	3
Harris's Sparrow	3
⋮	⋮
Tennessee Warbler	2
Traill's Flycatcher	1
Tree L	1
Tree Swallow	4
Tufted Titmouse	1
Unknown	1
Varied Thrush	1
Veery	1
Vesper Sparrow	1
Warbling Vireo	1
White-Crested Sparrow	1
White-Fronted Dove	1
White-breasted Nuthatch	5
White-crowned Sparrow	2
White-eyed Vireo	1
White-throat Sparrow	5
White-winged Junco	1
Wilson's Warbler	2
Winter Wren	1
Wood Pewee	1
Wood Thrush	1
Woodcock	1
Wren	1
Yellow Shafted Flicker	5
Yellow Warbler	1
Yellow-bellied Flycatcher	1
Yellow-bellied Sapsucker	1
Yellow-tailed Oriole	1
Yellowthroat	1
NA	50

OrdwaySpeciesNames %>%
  select(SpeciesNameCleaned) %>%
    n_distinct()

109

Task 2 #

Use the OrdwaySpeciesNames table to create a new data frame that corrects the misspellings in SpeciesNames. This can be done easily using the inner_join() data verb. Look at the names of the variables in OrdwaySpeciesNames and OrdwayBirds.

[1] Which variable(s) was used for matching cases?

The variable SpeciesName was used for matching cases.

[2] What were the variable(s) that will be added?

The variables SpeciesNameCleaned (renamed to Species), Month, and Day will be added.

Corrected <-
  OrdwayBirds %>%
    inner_join(y = OrdwaySpeciesNames) %>%
    select(Species = SpeciesNameCleaned, Month, Day) %>%
    na.omit()
Corrected %>%
  head()

Joining with `by = join_by(SpeciesName)`
Warning message in inner_join(., y = OrdwaySpeciesNames):
“Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 4 of `x` matches multiple rows in `y`.
ℹ Row 211 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.”

A data.frame: 6 × 3
	Species	Month	Day
	<chr>	<dbl>	<dbl>
1	Song Sparrow	7	16
3	Song Sparrow	7	16
4	Field Sparrow	7	16
5	Field Sparrow	7	16
6	Field Sparrow	7	16
7	Field Sparrow	7	16

Task 3 #

Call the variable that contains the total count. Arrange this into descending order from the species with the most birds, and look through the list. (Hint: Remember n(). Also, one of the arguments to one of the data verbs will be desc(count) to arrange the cases into descending order. Display the top 10 species in terms of the number of bird captures.) Define for yourself a “major species” as a species with more than a particular threshold count. Set your threshold so that there are 5 or 6 species designated as major. Filter to produce a data frame with only the birds that belong to a major species. Save the output in a table called Majors. (Hint: Remember that summary functions can be used case-by-case when filtering or mutating a data frame that has been grouped.)

[1] How many bird captures are reported for each of the corrected species?

See below for the result (major species threshold >= 1000).

Corrected %>%
  count(Species) %>%
  arrange(desc(n)) %>%
  head(n = 10)

A data.frame: 10 × 2
	Species	n
	<chr>	<int>
1	Slate-colored Junco	2732
2	Tree Swallow	1537
3	Black-capped Chickadee	1327
4	American Goldfinch	1204
5	Field Sparrow	1164
6	Lincoln's Sparrow	790
7	Robin	608
8	Catbird	554
9	Song Sparrow	512
10	House Wren	460

Corrected %>%
  group_by(Species) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 10)

A tibble: 10 × 2
Species	count
<chr>	<int>
Slate-colored Junco	2732
Tree Swallow	1537
Black-capped Chickadee	1327
American Goldfinch	1204
Field Sparrow	1164
Lincoln's Sparrow	790
Robin	608
Catbird	554
Song Sparrow	512
House Wren	460

Majors <-
  Corrected %>%
    group_by(Species) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    filter(count >= 1000)
Majors

A tibble: 5 × 2
Species	count
<chr>	<int>
Slate-colored Junco	2732
Tree Swallow	1537
Black-capped Chickadee	1327
American Goldfinch	1204
Field Sparrow	1164

Task 4 #

When you have correctly produced Majors, write a command that produces the month-by-month count of each of the major species. Call this table ByMonth. Display this month-by-month count with a bar chart arranged in a way that you think tells the story of what time of year the various species appear. You can use mplot() to explore different possibilies. (Warning: mplot() and similar interactive functions should not appear in your Rmd file, it needs to be used interactively from the console. Use the “Show Expression” button in mplot() to create an expression that you can cut and paste into a chunk in your Rmd document, so that the graph gets created when you compile it.) Once you have the graph, use it to answer these questions:

[1] Which species are present year-round?

American Goldfinch (11-12 mo)
Black-capped Chickadee (12 mo)

[2] Which species are migratory, that is, primarily present in one or two seasons?

Field Sparrow (6 mo)
Slate-colored Junco (8-9 mo)
Tree Swallow (3-5 mo)

[3] What is the peak month for each major species?

10 American Goldfinch
11 Black-capped Chickadee
05 Field Sparrow
10 Slate-colored Junco
06 Tree Swallow

[4] Which major species are seen in good numbers for at least 6 months of the year? (Hint: n_distinct() and >= 6.)

Arguably, the only species that is not seen in good numbers for at least 6 months of the year is the tree swallow.

ByMonth <-
  OrdwayBirds %>%
    group_by(SpeciesName, Month = as.integer(Month)) %>%
    summarize(count = n()) %>%
    filter(SpeciesName %in% Majors$Species)
ByMonth

`summarise()` has grouped output by 'SpeciesName'. You can override using the
`.groups` argument.

A grouped_df: 47 × 3
SpeciesName	Month	count
<chr>	<int>	<int>
American Goldfinch	1	10
American Goldfinch	2	51
American Goldfinch	3	48
American Goldfinch	4	21
American Goldfinch	5	125
American Goldfinch	6	63
American Goldfinch	7	67
American Goldfinch	8	70
American Goldfinch	9	151
American Goldfinch	10	364
American Goldfinch	11	180
American Goldfinch	12	3
Black-capped Chickadee	1	56
Black-capped Chickadee	2	140
Black-capped Chickadee	3	96
Black-capped Chickadee	4	51
Black-capped Chickadee	5	48
Black-capped Chickadee	6	20
Black-capped Chickadee	7	13
Black-capped Chickadee	8	11
Black-capped Chickadee	9	66
Black-capped Chickadee	10	173
Black-capped Chickadee	11	271
Black-capped Chickadee	12	165
Field Sparrow	4	83
Field Sparrow	5	197
Field Sparrow	6	15
Field Sparrow	7	79
Field Sparrow	8	64
Field Sparrow	9	74
Field Sparrow	10	69
Field Sparrow	11	1
Slate-colored Junco	1	113
Slate-colored Junco	2	61
Slate-colored Junco	3	188
Slate-colored Junco	4	694
Slate-colored Junco	5	1
Slate-colored Junco	8	1
Slate-colored Junco	9	35
Slate-colored Junco	10	1178
Slate-colored Junco	11	272
Slate-colored Junco	12	174
Tree Swallow	4	2
Tree Swallow	5	11
Tree Swallow	6	171
Tree Swallow	7	16
Tree Swallow	11	1

ByMonth %>%
  group_by(SpeciesName) %>%
  summarize(
    MonthsPerYear   = n(),
    SixMonthsOrMore = n_distinct(Month) >= 6
  )

A tibble: 5 × 3
SpeciesName	MonthsPerYear	SixMonthsOrMore
<chr>	<int>	<lgl>
American Goldfinch	12	TRUE
Black-capped Chickadee	12	TRUE
Field Sparrow	8	TRUE
Slate-colored Junco	10	TRUE
Tree Swallow	5	FALSE

ByMonth %>%
  ggplot() +
    geom_bar(
      mapping     = aes(x = Month, y = count, fill = SpeciesName),
      na.rm       = FALSE,
      position    = 'stack',
      show.legend = TRUE,
      stat        = 'identity'
    ) +
    scale_x_continuous(breaks = 1:12)

../../../_images/1bedb06c18461fcfb7f548b06bae4b55139fb3c83d148d96ee5c5b47b35daa60.png

Project: Scraping Nuclear Reactors #

Project: Scraping Nuclear Reactors from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

In what ways is the table tidy? How is it not tidy? What’s different about it from a tidy table?

Tidy Data Criteria

(i) rows (or cases, observations) must each represent the same underlying attribute (i.e., each observation must have its own row)
(ii) columns (or variables, values) must each contain the same type of value for each row (i.e., each variable must have its own column)
(iii) each value must have its own cell
It’s impossible to only satisfy two of the three criteria. This implies the following.
- (i) put each dataset into a tibble
- (ii) put each variable into a column

There is at least one row that does not represent a typical case (that is, the header row(s)). Certain columns are blank and need to be removed. The remaining columns may contain heterogeneous data formats or data types. And missing values must be addressed. But with some cleaning, the table hints at what a typical case should look like.

page      <- 'https://en.wikipedia.org/wiki/List_of_commerical_nuclear_reactors'
tableList <-
  page %>%
    read_html() %>%
    html_nodes(css = 'table') %>%
    html_table(fill = TRUE)
length(tableList)

Japan <-
  tableList[[21]] %>%
    select(1:9)
#names(Japan)[c(3, 6)] <- c('type', 'grossMW')
head(Japan)

55

A tibble: 6 × 9
Plantname	UnitNo.	Type	Model	Status	Capacity(MW)	Beginbuilding	Commercialoperation	Closed
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
Plantname	UnitNo.	Type	Model	Status	Capacity(MW)	Beginbuilding	Commercialoperation	Closed
Fukushima Daiichi	1	BWR	BWR-3	Inoperable	439	25 Jul 1967	26 Mar 1971	19 May 2011
Fukushima Daiichi	2	BWR	BWR-4	Inoperable	760	9 Jun 1969	18 Jul 1974	19 May 2011
Fukushima Daiichi	3	BWR	BWR-4	Inoperable	760	28 Dec 1970	27 Mar 1976	19 May 2011
Fukushima Daiichi	4	BWR	BWR-4	Inoperable	760	12 Feb 1973	12 Oct 1978	19 May 2011
Fukushima Daiichi	5	BWR	BWR-4	Shut down	760	22 May 1972	18 Apr 1978	17 Dec 2013

Among other things, some of the variable names appear redundant and others have multiple words separated by spaces. You can rename variables using the data verb rename(), finding appropriate names from the Wikipedia table. Another problem is that the first row is not data but a continuation of the variable names. So row number 1 should be dropped.

Japan <-
  Japan %>%
    filter(row_number() > 1) %>%
    rename(
      name         = Plantname,
      reactor      = `UnitNo.`,
      type         = Type,
      model        = Model,
      status       = Status,
      netMW        = `Capacity(MW)`,
      construction = Beginbuilding,
      operation    = Commercialoperation,
      closure      = Closed
    )
head(Japan)

A tibble: 6 × 9
name	reactor	type	model	status	netMW	construction	operation	closure
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
Fukushima Daiichi	1	BWR	BWR-3	Inoperable	439	25 Jul 1967	26 Mar 1971	19 May 2011
Fukushima Daiichi	2	BWR	BWR-4	Inoperable	760	9 Jun 1969	18 Jul 1974	19 May 2011
Fukushima Daiichi	3	BWR	BWR-4	Inoperable	760	28 Dec 1970	27 Mar 1976	19 May 2011
Fukushima Daiichi	4	BWR	BWR-4	Inoperable	760	12 Feb 1973	12 Oct 1978	19 May 2011
Fukushima Daiichi	5	BWR	BWR-4	Shut down	760	22 May 1972	18 Apr 1978	17 Dec 2013
Fukushima Daiichi	6	BWR	BWR-5	Shut down	1067	26 Oct 1973	24 Oct 1979	17 Dec 2013

str(Japan)

tibble [68 × 9] (S3: tbl_df/tbl/data.frame)
 $ name        : chr [1:68] "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" ...
 $ reactor     : chr [1:68] "1" "2" "3" "4" ...
 $ type        : chr [1:68] "BWR" "BWR" "BWR" "BWR" ...
 $ model       : chr [1:68] "BWR-3" "BWR-4" "BWR-4" "BWR-4" ...
 $ status      : chr [1:68] "Inoperable" "Inoperable" "Inoperable" "Inoperable" ...
 $ netMW       : chr [1:68] "439" "760" "760" "760" ...
 $ construction: chr [1:68] "25 Jul 1967" "9 Jun 1969" "28 Dec 1970" "12 Feb 1973" ...
 $ operation   : chr [1:68] "26 Mar 1971" "18 Jul 1974" "27 Mar 1976" "12 Oct 1978" ...
 $ closure     : chr [1:68] "19 May 2011" "19 May 2011" "19 May 2011" "19 May 2011" ...

Using your cleaned data, make a plot of net generation capacity versus date of construction. Color the points by the type of reactor (for example, BWR, PWR, etc.) In addition to your plot, give a sentence or two of interpretation; what patterns do you see?

# BWR boiling water reactor
# FBR
# GCR
# PWR pressurized water reactor

Japan <-
  Japan %>%
    mutate(
      netMW        = as.integer(netMW),
      construction = lubridate::dmy(construction),
      operation    = lubridate::dmy(operation),
      closure      = lubridate::dmy(closure)
    )
head(Japan)

Japan %>%
  ggplot(mapping = aes(x = construction, y = netMW)) +
    geom_point(aes(color = type))

Warning message:
“There was 1 warning in `mutate()`.
ℹ In argument: `closure = lubridate::dmy(closure)`.
Caused by warning:
!  6 failed to parse.”

A tibble: 6 × 9
name	reactor	type	model	status	netMW	construction	operation	closure
<chr>	<chr>	<chr>	<chr>	<chr>	<int>	<date>	<date>	<date>
Fukushima Daiichi	1	BWR	BWR-3	Inoperable	439	1967-07-25	1971-03-26	2011-05-19
Fukushima Daiichi	2	BWR	BWR-4	Inoperable	760	1969-06-09	1974-07-18	2011-05-19
Fukushima Daiichi	3	BWR	BWR-4	Inoperable	760	1970-12-28	1976-03-27	2011-05-19
Fukushima Daiichi	4	BWR	BWR-4	Inoperable	760	1973-02-12	1978-10-12	2011-05-19
Fukushima Daiichi	5	BWR	BWR-4	Shut down	760	1972-05-22	1978-04-18	2013-12-17
Fukushima Daiichi	6	BWR	BWR-5	Shut down	1067	1973-10-26	1979-10-24	2013-12-17

Warning message:
“Removed 7 rows containing missing values (`geom_point()`).”

../../../_images/b62e597410a4fcd77faf67a9b810925e9a47f75d382c3b0cc93be60dae6226e2.png

Carry out the same cleaning process for the China reactor table, and then append it with the Japan data. Use mutate() to add a variable that has the name of the country. (Hint: functions such as bind_cols() or bind_rows() form the dplyr package are helpful for appending data frames.) Collating the data for all countries is a matter of repeating this process over and over. Inevitably, there are inconsistencies. For example, the US data had been organized in a somewhat different format when compared to the Japan and China data for many years until Wikipedia editors decided to reconcile them.

China <-
  tableList[[10]] %>%
    select(1:9)
China <-
  China %>%
    filter(row_number() > 2) %>%
    rename(
      name         = Plantname,
      reactor      = `UnitNo.`,
      type         = Type,
      model        = Model,
      status       = Status,
      netMW        = `Capacity(MW)`,
      construction = Beginbuilding,
      operation    = Commercialoperation,
      closure      = Closed
    )
China <-
  China %>%
    mutate(
      netMW        = as.integer(netMW),
      construction = lubridate::dmy(construction),
      operation    = lubridate::dmy(operation),
      closure      = lubridate::dmy(closure)
    )
China %>%
  ggplot(mapping = aes(x = construction, y = netMW)) +
    geom_point(aes(color = type))

Warning message:
“There were 2 warnings in `mutate()`.
The first warning was:
ℹ In argument: `construction = lubridate::dmy(construction)`.
Caused by warning:
!  15 failed to parse.
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”
Warning message:
“Removed 47 rows containing missing values (`geom_point()`).”

../../../_images/37043f5cace06ab2c8957024e65610096536b8cc276dabba2c49fb70a2a4b39d.png

bind_rows(Japan, China) %>%
  mutate(Country = ifelse(name %in% unique(Japan$name), 'Japan', 'China')) %>%
  head()

A tibble: 6 × 10
name	reactor	type	model	status	netMW	construction	operation	closure	Country
<chr>	<chr>	<chr>	<chr>	<chr>	<int>	<date>	<date>	<date>	<chr>
Fukushima Daiichi	1	BWR	BWR-3	Inoperable	439	1967-07-25	1971-03-26	2011-05-19	Japan
Fukushima Daiichi	2	BWR	BWR-4	Inoperable	760	1969-06-09	1974-07-18	2011-05-19	Japan
Fukushima Daiichi	3	BWR	BWR-4	Inoperable	760	1970-12-28	1976-03-27	2011-05-19	Japan
Fukushima Daiichi	4	BWR	BWR-4	Inoperable	760	1973-02-12	1978-10-12	2011-05-19	Japan
Fukushima Daiichi	5	BWR	BWR-4	Shut down	760	1972-05-22	1978-04-18	2013-12-17	Japan
Fukushima Daiichi	6	BWR	BWR-5	Shut down	1067	1973-10-26	1979-10-24	2013-12-17	Japan

Make an informative graphic that shows how long it took between start of construction and commissioning for operation of each nuclear reactor in Japan (or another country of your choice). One possibility: use reactor name vs date as the frame. For each reactor, set the glyph to be a line extending from start of construction to commissioning. You can do this with geom_segment() using name as the y coordinate and time as the x coordinate. (Tip: use the paste() function to create the reactorID on the vertical axis.)

options(warn = -1)

Japan %>%
  mutate(
    reactorID = paste(name, reactor),
    time      = operation - construction
  ) %>%
  ggplot() +
    geom_segment(
      linewidth = 2,
      mapping = aes(
        x     = operation,
        xend  = construction,
        y     = reactorID,
        yend  = reactorID,
        color = type
      )
    )

../../../_images/cb842782173f3b67767816cc811fa9f056637bc6fdb668883964a35844d31018.png

Project: Street or Road?#

Project: Street or Road? from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

Example #

People’s addresses involve streets, lanes, courts, avenues, and so on. How many such road-related words are in common use? In answering this question, you would presumably want to look at lots of addresses and extract the road-related term. You could do this by eye, reading down a list of a few hundred or thousand addresses. But if you want to do it on a really large scale, a city or state or country, you would want some automated help, for instance, a computer program that discards the sorts of entries you have already identified to give a greater concentration of unidentified terms. In this activity, you’re going to build such a program.

# about 15,000 street addresses of registered voters in Wake County, NC
Addresses <- read_csv('https://mdbeckman.github.io/dcSupplement/data/street-addresses.csv')
head(Addresses)

Rows: 15483 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): address

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
PO BOX 58592
5102 ECHO RIDGE RD
PO BOX 37218
PO BOX 37218

# about 900,000 medicare service provider street addresses
download.file(url      = 'https://mdbeckman.github.io/dcSupplement/data/CMS_ProvidersSimple.rds',
              destfile = 'CMS_ProvidersSimple.rds')
DataTable <- readRDS('CMS_ProvidersSimple.rds')
head(DataTable)

A data.frame: 6 × 3
	address	first_name	sex
	<chr>	<chr>	<chr>
1	900 SETON DR	ARDALAN	M
2	2650 RIDGE AVE	THOMAS	M
3	4126 N HOLLAND SYLVANIA RD	RASHID	M
4	456 MAGEE AVE	DAVID	M
5	11100 EUCLID AVE	JENNIFER	F
6	12605 E 16TH AVE	KEVIN	M

To solve such problems, start by looking at a few dozen of the addresses to familiarize yourself with common patterns. Suppose you wanted to extract the PO Box number from an address. Read the street address data and pull out a sample of a few dozen cases.

In everyday langauge, describe a pattern that you think will identify the information you are looking for.
- The PO Box cases tend to have a substring ‘PO’.
Translate (1) into the form of a regular expression.
- The regular expression for ‘PO’ is simply ‘PO’.
Filter to retain the cases that match the expression. Hint: filter() and grepl() are useful for this.
Filter to retain the cases that do not match the expression.
Examine the results of (3) and (4) to identify shortcomings in your patterns.
Improve or extend the pattern to deal with the mistaken cases.
Repeat until satisfied.
Put extraction parentheses around the parts of the regular expression that contain the info you want.

Sample <-
  Addresses %>%
    sample_n(size = 50)
head(Sample)

A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
PO BOX 2973
PO BOX 99181
NCSU BOX 22323
3585 E 10TH ST
PO BOX 96

pattern <- 'PO'

Matches <-
  Sample %>%
    filter(grepl(pattern = pattern, address))
head(Matches)

A tibble: 6 × 1
address
<chr>
PO BOX 2973
PO BOX 99181
PO BOX 96
PO BOX 28454
PO BOX 37297
PO BOX 26761

Dont <-
  Sample %>%
    filter(!grepl(pattern = 'PO', address))
head(Dont)

A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
NCSU BOX 22323
3585 E 10TH ST
NCSU BOX 15330
NCSU BOX 04082
251 N MAIN STREET 3786

pattern <- 'BOX\\s+(\\d+)'

Matches <-
  Sample %>%
    filter( grepl(pattern, address))
head(Matches)

A tibble: 6 × 1
address
<chr>
PO BOX 2973
PO BOX 99181
NCSU BOX 22323
PO BOX 96
PO BOX 28454
NCSU BOX 15330

Dont <-
  Sample %>%
    filter(!grepl(pattern, address))
head(Dont)

A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
3585 E 10TH ST
251 N MAIN STREET 3786
3307 BROOKLINE COURT
1703 PATTON ROAD
3231 WALNUT CREEK PKWY

BoxNumbers <-
  Sample %>%
    filter(grepl(pattern, address)) %>%
    tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)

A tibble: 6 × 1
boxnum
<chr>
2973
99181
22323
96
28454
15330

pattern <- 'PO'

Matches <-
  Addresses %>%
    filter(grepl(pattern = pattern, address))
head(Matches)

A tibble: 6 × 1
address
<chr>
PO BOX 58592
PO BOX 37218
PO BOX 37218
PO BOX 1953
PO BOX 132
PO BOX 360

Dont <-
  Addresses %>%
    filter(!grepl(pattern = 'PO', address))
head(Dont)

A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
5102 ECHO RIDGE RD
5007 PURITAN RD
04-I ROBIN CIRCLE
4800 WESTERN BLVD

pattern <- 'BOX\\s+(\\d+)'

Matches <-
  Addresses %>%
    filter( grepl(pattern, address))
head(Matches)

A tibble: 6 × 1
address
<chr>
PO BOX 58592
PO BOX 37218
PO BOX 37218
PO BOX 1953
PO BOX 132
PO BOX 360

Dont <-
  Addresses %>%
    filter(!grepl(pattern, address))
head(Dont)

A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
5102 ECHO RIDGE RD
5007 PURITAN RD
04-I ROBIN CIRCLE
4800 WESTERN BLVD

BoxNumbers <-
  Addresses %>%
    filter(grepl(pattern, address)) %>%
    tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)

A tibble: 6 × 1
boxnum
<chr>
58592
37218
37218
1953
132
360

Back to the Streets #

Street endings (e.g., ‘ST’, ‘LANE’) are often found at the end of the address string. Use this as a starting point to find the most common endings. Once you have a set of specific street endings, you can use the regex “or” symbol, e.g., ‘(ST|RD|ROAD)’. The parentheses are not incidental. They are there to mark a pattern that you want to extract. In this case, in addition to knowing that there is a ST or RD or ROAD in an address, you want to know which one of those possibilities it is so that you can count the occurrence of each of the possibilities. To find street endings that aren’t in your set, you can filter out the street endings or non street addresses you already know about.

[1] Read the following R statements. Next to each line, give a short explanation of what the line contributes to the task. For each of the regexes, explain in simple everyday language what pattern is being matched.

pattern <- '(ST|RD|ROAD)'

LeftOvers <-
  Addresses %>%
    filter(
      !grepl(pattern, address),                 # matches other than: 'ST' or 'RD' or 'ROAD'
      !grepl('\\sAPT|UNIT\\s[\\d]+$', address), # matches other than: (1 whitespace, 'APT') or ('UNIT', 1 whitespace, 1 or more digits at the end of the string)
      !grepl(' BOX ', address)                  # matches other than: 1 space, 'BOX', 1 space
    )
LeftOvers

[2] For each set of patterns that you identify, compute the LeftOvers. Examine them visually to find new street endings to add to the pattern, e.g., LANE. When you have this working on the small sample, use a larger sample and, eventually, the whole data set. It’s practically impossible to find a method that will work perfectly on new data, but do the best you can. In your report, implement your method and explain how it works, line by line. Present your result: how many addresses there are of each kind of road word?

‘DR’, ‘RD’, and ‘ST’ are the most common street types. In general, abbrviated street types are more common than their longform counterparts.

Breaking addresses into their components is a common task. People who work on this problem intensively sometimes publish their regular expressions. Here’s one from Ross Hammer published at https://regexlib.com/Search.aspx?k=street:

^\s*((?:(?:\d+(?:\x20+\w+\.?)+(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT
|CT|CIRCLE|LANE|LN|BOULEVARD|BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|
(?:General\x20+Delivery)|(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|
FL|HNGR|LOT|PIER|RM|S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|
(?:BSMT|FRNT|LBBY|LOWR|OFC|PH|REAR|SIDE|UPPR))?)\,?\s+((?:(?:\d+(?:\x20+\w+\.?)+
(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT|CT|CIRCLE|LANE|LN|BOULEVARD|
BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|(?:General\x20+Delivery)|
(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|FL|HNGR|LOT|PIER|RM|
S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|(?:BSMT|FRNT|LBBY|
LOWR|OFC|PH|REAR|SIDE|UPPR))?)?\,?\s+((?:[A-Za-z]+\x20*)+)\,\s+(A[LKSZRAP]|C[AOT]|
D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|
S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s+(\d+(?:-\d+)?)\s*$

Addresses %>%
  filter(
    !grepl(pattern = ' AVENUE |AVENUE$', x = address),       # matches other than: 'AVENUE' flanked by whitespace anywhere in the string, or 'AVENUE' at the end of the string
    !grepl(pattern = ' AVE |AVE$', x = address),             # matches other than: 'AVE' flanked by whitespace anywhere in the string, or 'AVE' at the end of the string
    !grepl(pattern = ' BLVD |BLVD$', x = address),           # matches other than: 'BLVD' flanked by whitespace anywhere in the string, or 'BLVD' at the end of the string
    !grepl(pattern = ' BOULEVARD |BOULEVARD$', x = address), # matches other than: 'BOULEVARD' flanked by whitespace anywhere in the string, or 'BOULEVARD' at the end of the string
    !grepl(pattern = ' CIR |CIR$', x = address),             # matches other than: 'CIR' flanked by whitespace anywhere in the string, or 'CIR' at the end of the string
    !grepl(pattern = ' CIRCLE |CIRCLE$', x = address),       # matches other than: 'CIRCLE' flanked by whitespace anywhere in the string, or 'CIRCLE' at the end of the string
    !grepl(pattern = ' CT |CT$', x = address),               # matches other than: 'CT' flanked by whitespace anywhere in the string, or 'CT' at the end of the string
    !grepl(pattern = ' COURT |COURT$', x = address),         # matches other than: 'COURT' flanked by whitespace anywhere in the string, or 'COURT' at the end of the string
    !grepl(pattern = ' DR |DR$', x = address),               # matches other than: 'DR' flanked by whitespace anywhere in the string, or 'DR' at the end of the string
    !grepl(pattern = ' DRIVE |DRIVE$', x = address),         # matches other than: 'DRIVE' flanked by whitespace anywhere in the string, or 'DRIVE' at the end of the string
    !grepl(pattern = ' LN |LN$', x = address),               # matches other than: 'LN' flanked by whitespace anywhere in the string, or 'LN' at the end of the string
    !grepl(pattern = ' LANE |LANE$', x = address),           # matches other than: 'LANE' flanked by whitespace anywhere in the string, or 'LANE' at the end of the string
    !grepl(pattern = ' PL |PL$', x = address),               # matches other than: 'PL' flanked by whitespace anywhere in the string, or 'PL' at the end of the string
    !grepl(pattern = ' PLACE |PLACE$', x = address),         # matches other than: 'PLACE' flanked by whitespace anywhere in the string, or 'PLACE' at the end of the string
    !grepl(pattern = ' RD |RD$', x = address),               # matches other than: 'RD' flanked by whitespace anywhere in the string, or 'RD' at the end of the string
    !grepl(pattern = ' ROAD |ROAD$', x = address),           # matches other than: 'ROAD' flanked by whitespace anywhere in the string, or 'ROAD' at the end of the string
    !grepl(pattern = ' ST |ST$', x = address),               # matches other than: 'ST' flanked by whitespace anywhere in the string, or 'ST' at the end of the string
    !grepl(pattern = ' STREET |STREET$', x = address),       # matches other than: 'STREET' flanked by whitespace anywhere in the string, or 'STREET' at the end of the string
    !grepl(pattern = ' TRAIL |TRAIL$', x = address),         # matches other than: 'TRAIL' flanked by whitespace anywhere in the string, or 'TRAIL' at the end of the string
    !grepl(pattern = ' WAY |WAY$', x = address),             # matches other than: 'WAY' flanked by whitespace anywhere in the string, or 'WAY' at the end of the string
    !grepl(pattern = ' WINERY |WINERY$', x = address),       # matches other than: 'WINERY' flanked by whitespace anywhere in the string, or 'WINERY' at the end of the string
    !grepl(pattern = 'BOX\\s+(\\d+)', x = address)           # filters out all the 'BOX' from above
  ) %>%
  head()

A tibble: 6 × 1
address
<chr>
NCSU B0X 15637
HMIA-267
1104 PULLEN HALL
311 BAREFOOT
512 LIVE OAK
1706 WINGATE UNIVERSITY

pattern <- '\\s+('
pattern <- str_c(pattern, 'AVE|AVENUE')
pattern <- str_c(pattern, '|BLVD|BOULEVARD')
pattern <- str_c(pattern, '|CIR|CIRCLE')
pattern <- str_c(pattern, '|CT|COURT')
pattern <- str_c(pattern, '|DR|DRIVE')
pattern <- str_c(pattern, '|LN|LANE')
pattern <- str_c(pattern, '|PL|PLACE')
pattern <- str_c(pattern, '|RD|ROAD')
pattern <- str_c(pattern, '|ST|STREET')
pattern <- str_c(pattern, '|TRAIL')
pattern <- str_c(pattern, '|WAY')
pattern <- str_c(pattern, '|WINERY')
pattern <- str_c(pattern, ')(\\s+|$)?')
pattern

streets <-
  Addresses %>%
    filter(
      grepl(x = address, pattern = pattern, ignore.case = TRUE)
    ) %>%
    tidyr::extract(
      col   = address,
      regex = pattern,
      into  = c(
        'street',
        ' '
      )
    ) %>%
    group_by(street) %>%
    summarize(total = n()) %>%
    arrange(desc(total))
streets

'\\s+(AVE|AVENUE|BLVD|BOULEVARD|CIR|CIRCLE|CT|COURT|DR|DRIVE|LN|LANE|PL|PLACE|RD|ROAD|ST|STREET|TRAIL|WAY|WINERY)(\\s+|$)?'

A tibble: 16 × 2
street	total
<chr>	<int>
DR	888
RD	688
ST	676
AVE	259
CT	229
ROAD	188
LN	152
PL	145
LANE	141
CIR	120
WAY	113
COURT	90
BLVD	50
TRAIL	26
BOULEVARD	4
WINERY	1

streets %>%
  ggplot(aes(x = reorder(street, -total), y = total)) +
    geom_col() +
    xlab('street')

../../../_images/867724b761323912e73e277a1261147ba62ea3fe7088b452ab5f032bd2815fc5.png

Bibliography #

Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.

Data Computing 2

Contents