Data Computing 2#

Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.


Revised

19 Jun 2023


Programming Environment#

# install.packages('devtools')
# devtools::install_github('mdbeckman/dcData')

packages <- c(
  'data.table', # library(data.table)
  'dcData',     # library(dcData)
  'foreign',    # library(foreign)
  'leaflet',    # library(leaflet)
  'lubridate',  # library(lubridate)
  'mosaic',     # library(mosaic)
  'mosaicData', # library(mosaicData)
  'Rcpp',       # library(Rcpp)
  'rvest',      # library(rvest)
  'tidyverse'   # library(tidyverse)
)

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

# Load packages
invisible(lapply(packages, library, character.only = TRUE))

str_c('EXECUTED : ', now())
sessionInfo()
# R.version.string # R.Version()
# .libPaths()
# installed.packages()
Attaching package: ‘lubridate’
The following objects are masked from ‘package:data.table’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.
Attaching package: ‘mosaic’
The following objects are masked from ‘package:dplyr’:

    count, do, tally
The following object is masked from ‘package:Matrix’:

    mean
The following object is masked from ‘package:ggplot2’:

    stat
The following objects are masked from ‘package:stats’:

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var
The following objects are masked from ‘package:base’:

    max, mean, min, prod, range, sample, sum
── Attaching core tidyverse packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
 forcats 1.0.0      stringr 1.5.0
 purrr   1.0.2      tibble  3.2.1
 readr   2.1.4      tidyr   1.3.0
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::between()        masks data.table::between()
 mosaic::count()         masks dplyr::count()
 purrr::cross()          masks mosaic::cross()
 mosaic::do()            masks dplyr::do()
 tidyr::expand()         masks Matrix::expand()
 dplyr::filter()         masks stats::filter()
 dplyr::first()          masks data.table::first()
 readr::guess_encoding() masks rvest::guess_encoding()
 lubridate::hour()       masks data.table::hour()
 lubridate::isoweek()    masks data.table::isoweek()
 dplyr::lag()            masks stats::lag()
 dplyr::last()           masks data.table::last()
 lubridate::mday()       masks data.table::mday()
 lubridate::minute()     masks data.table::minute()
 lubridate::month()      masks data.table::month()
 tidyr::pack()           masks Matrix::pack()
 lubridate::quarter()    masks data.table::quarter()
 lubridate::second()     masks data.table::second()
 mosaic::stat()          masks ggplot2::stat()
 mosaic::tally()         masks dplyr::tally()
 purrr::transpose()      masks data.table::transpose()
 tidyr::unpack()         masks Matrix::unpack()
 lubridate::wday()       masks data.table::wday()
 lubridate::week()       masks data.table::week()
 lubridate::yday()       masks data.table::yday()
 lubridate::year()       masks data.table::year()
 Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
'EXECUTED : 2025-01-17 18:21:40.31441'
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 15.2

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] forcats_1.0.0     stringr_1.5.0     purrr_1.0.2       readr_2.1.4      
 [5] tidyr_1.3.0       tibble_3.2.1      tidyverse_2.0.0   rvest_1.0.3      
 [9] Rcpp_1.0.10       mosaic_1.8.4.2    mosaicData_0.20.3 ggformula_0.10.4 
[13] dplyr_1.1.2       Matrix_1.5-4      ggplot2_3.4.3     lattice_0.21-8   
[17] lubridate_1.9.2   leaflet_2.2.0     foreign_0.8-84    dcData_0.1.0     
[21] data.table_1.14.8

loaded via a namespace (and not attached):
 [1] utf8_1.2.3         generics_0.1.3     xml2_1.3.4         stringi_1.7.12    
 [5] hms_1.1.3          digest_0.6.31      magrittr_2.0.3     evaluate_0.21     
 [9] grid_4.3.0         timechange_0.2.0   pbdZMQ_0.3-9       fastmap_1.1.1     
[13] jsonlite_1.8.5     httr_1.4.6         fansi_1.0.4        crosstalk_1.2.0   
[17] scales_1.2.1       tweenr_2.0.2       cli_3.6.1          labelled_2.11.0   
[21] rlang_1.1.1        crayon_1.5.2       polyclip_1.10-4    munsell_0.5.0     
[25] base64enc_0.1-3    withr_2.5.0        repr_1.1.6         tools_4.3.0       
[29] tzdb_0.4.0         uuid_1.1-0         colorspace_2.1-0   mosaicCore_0.9.2.1
[33] IRdisplay_1.1      vctrs_0.6.3        R6_2.5.1           ggridges_0.5.4    
[37] lifecycle_1.0.3    htmlwidgets_1.6.2  ggstance_0.3.6     MASS_7.3-58.4     
[41] pkgconfig_2.0.3    pillar_1.9.0       gtable_0.3.3       glue_1.6.2        
[45] ggforce_0.4.1      haven_2.5.2        tidyselect_1.2.0   IRkernel_1.3.2    
[49] farver_2.1.1       htmltools_0.5.5    compiler_4.3.0    

Issue

Issues using Leaflet (and the terra dependency in particular) in VSCode on macOS.

The following is an attempted, but so far failed, approach.

brew install gdal proj
remotes::install_github("rspatial/terra", configure.args = "--with-gdal-config=/opt/homebrew/bin/gdal-config --with-proj-include=/opt/homebrew/opt/proj/include --with-proj-lib=/opt/homebrew/opt/proj/lib --with-proj-share=/opt/homebrew/opt/proj/share/proj", type = "source", force = TRUE)

02 - Computing with R#

data(package = 'dcData')
data('NCHS', package = 'dcData')
View(NCHS)
Hide code cell output
A data.frame: 31126 × 31
sexagepregnantethnicitydeathfollowupsmokerdiabeticheightweightbmdfmhm_otherhdlcholbpsbpdincomepop_weightpsustratum
<fct><dbl><fct><fct><chr><dbl><fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
female 2no Non-Hispanic BlackNA NAno 00.916 12.5 NA NA NA NA NANA0.86 2970.80441 5
male 77no Non-Hispanic Whitealive 90no 01.740 75.41.2196 0.12359 54215 98565.0010224.13313 1
female10no Non-Hispanic WhiteNA NAno 01.366 32.9 NA NA 30129108631.4714172.31112 7
male 1no Non-Hispanic BlackNA NAno 0 NA 13.3 NA NA NA NA NANA0.57 3041.59271 2
male 49no Non-Hispanic Whitealive 74yes 01.783 92.51.0870 1.17688 42279122835.0030657.31192 8
female19no Other/Multi alive 86no 01.620 59.20.8680-1.22452 61153114701.2112224.87562 2
female59no Non-Hispanic Blackalive 76no 01.629 78.01.0870-0.18566105245123810.00 8930.88022 4
male 13no Non-Hispanic WhiteNA NAno 01.620 40.70.8706-0.48886 67162 98520.53 9823.00031 6
female11no Non-Hispanic BlackNA NAno 01.569 45.5 NA NA 58148112500.00 2345.57602 9
male 43no Non-Hispanic Blackalive 79no 01.901111.81.3000-0.18884 51140142950.00 7484.23361 7
male 15no Non-Hispanic WhiteNA NAno 01.719 65.01.2090-0.41861 40132107511.2510758.65052 1
male 37no Non-Hispanic Whitealive 82no 01.800 99.21.1990 0.20224 38156174994.9330448.09672 6
male 70no Mexican American cardiovascular death16no 11.577 63.60.8464 1.36872 49314130661.07 752.5150213
male 81no Non-Hispanic Whitealive 85yes 01.662 75.51.0550 0.95469 40174136612.67 6626.6640112
female38no Non-Hispanic Whitealive 92yes 01.749 81.61.1174-0.24117 58199109694.5234313.1432211
female85no Non-Hispanic Blackother 62no 01.442 41.50.8510-0.06773 55164139600.38 5872.4952111
male 2no Non-Hispanic BlackNA NAno 00.886 11.4 NA NA NA NA NANA0.92 2570.34221 5
female 1no Non-Hispanic WhiteNA NAno 0 NA 11.1 NA NA NA NA NANA2.84 9973.79001 8
male 0no Mexican American NA NANA NA NA 11.5 NA NA NA NA NANA2.40 801.5593213
female23yesMexican American alive 86yes 01.589 59.8 NA-0.12459 43145103603.03 6618.11842 2
male 18no Mexican American alive 87no 01.685112.91.1064 0.49518 34161119781.80 1283.59932 3
female13no Non-Hispanic BlackNA NAno 01.622 50.9 NA-1.22452 62151105730.75 2070.02732 2
female12no Non-Hispanic BlackNA NAno 01.691 75.0 NA 0.30949 32160 97565.00 1779.40182 4
female53no Non-Hispanic Whitealive 72yes 01.642 69.90.9540-0.46297105219114712.6724695.7644212
female42no Non-Hispanic Whitealive 84no 01.667104.51.2266-0.32911 55159119851.7742467.37281 5
female14no Mexican American NA NAno 01.631 85.6 NA-0.99286 49145 99612.51 690.6990213
male 18no Mexican American alive 85no 01.613 72.21.1034 NA NA NA117770.00 880.8045213
male 18no Non-Hispanic Whitealive 91no 01.796 62.71.1622-1.14831 64151 92433.4311514.52452 1
male 62no Non-Hispanic Whiteother 26no 11.749113.01.2130 0.60531 49216124711.0716359.4822212
female 7no Non-Hispanic BlackNA NANA NA1.261 21.7 NA NA 72184 NANA0.00 3608.91882 5
female40no Non-Hispanic WhitealiveNAno 0 NA NA NA NA NA NA NANA5.00 0.0000136
male 83no Non-Hispanic Whitealive26no 01.608 62.61.2500 0.30891 73237180360.84 2842.5528241
female17no Mexican American NA NAno 01.660 73.41.0670 NA 63192116782.43 2269.4901142
male 85no Non-Hispanic WhitealiveNAno 0 NA NA NA 0.00000 NA NA NANA3.00 0.0000235
female46no Non-Hispanic Whitealive31yes 01.738 57.51.1650 0.80966 28218103721.0416782.0437133
female33yesOther/Multi alive47no 01.633 76.9 NA-0.75543 72228112635.00 2410.5345135
male 0no Other/Multi NA NANA NA NA 10.0 NA NA NA NA NANA4.47 1518.4104237
female 4no Mexican American NA NAno 01.080 19.3 NA NA NA NA NANA0.80 2726.6750240
male 40no Non-Hispanic Blackalive31no 01.748 76.41.4320 NA 82232 96571.8310810.7055232
female 0no Mexican American NA NANA NA NA 7.8 NA NA NA NA NANA0.64 715.5026243
male 12no Non-Hispanic BlackNA NAno 01.428 31.11.0170-0.48886 74187101630.56 1895.2838130
female72no Non-Hispanic Whitealive27no 01.502 67.51.0110-0.16663 79162140701.1810217.0484230
female26yesOther/Multi alive25no 01.558 53.8 NA-0.53363 NA NA106540.00 2791.2864239
female14no Mexican American NA NAno 01.550 54.31.0890-0.96314 60168118671.01 1133.9264240
male 34no Non-Hispanic Whitealive31no 01.844100.01.1810-0.20340 61197132714.7936995.3239232
male 13no Mexican American NA NAno 01.559 61.20.9180-0.16427 29120104601.33 1996.1073242
male 12no Non-Hispanic WhiteNA NAno 01.619 46.20.8916 NA NA NA 94522.7012648.6416242
female55no Non-Hispanic Whitealive31no 01.648 57.81.0690-1.22452 73157 96671.2115357.9095231
female19no Mexican American alive36no 01.531 52.81.1090-0.41705 43129106620.00 1410.3413140
female54no Non-Hispanic Whitealive47no 01.661 71.51.0850-0.75543 64208115595.0026313.6174135
female 2no Non-Hispanic WhiteNA NAno 0 NA 12.5 NA NA NA NA NANA1.34 5568.3884234
female16no Mexican American NA NAyes 01.498 43.71.0110-0.50255 52120111690.66 2289.3041138
male 23no Mexican American alive37yes 01.721 77.40.9940 0.84841 44211127693.93 9982.5528139
female 0no Mexican American NA NANA NA NA 9.5 NA NA NA NA NANA1.24 1391.6587239
female68no Mexican American alive34no 01.530 68.00.8650 0.51794 41202134NA2.08 1128.5607240
female 5no Non-Hispanic WhiteNA NAno 01.098 19.3 NA NA NA NA NANA2.9618259.3185239
female83no Non-Hispanic Whitealive38no 01.624 61.91.0180 0.07722101264140520.00 7025.9255139
male 44no Mexican American alive37yes 01.734 89.71.0230 1.55613 51282149903.93 5832.3181139
male 54no Non-Hispanic Whitealive36no 01.835126.81.1382 0.24084 35168114625.0032734.1635143
male 1no Non-Hispanic WhiteNA NAno 0 NA 10.3 NA NA NA NA NANA5.0010280.9366229
SmallNCHS <- sample_n(tbl = NCHS, size = 100)
head(SmallNCHS)
A data.frame: 6 × 31
sexagepregnantethnicitydeathfollowupsmokerdiabeticheightweightbmdfmhm_otherhdlcholbpsbpdincomepop_weightpsustratum
<fct><dbl><fct><fct><chr><dbl><fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1male 14noNon-Hispanic BlackNA NAno01.592 47.81.0382 NA49150 NANA1.61 1618.739229
2female45noMexican American alive33no01.577111.11.1052-0.3259256220110764.99 5860.458242
3female18noNon-Hispanic Whitealive49no01.700 94.51.2060-0.3259259233117672.7613739.259220
4male 8noMexican American NA NAno01.328 32.80.7800 NA50162104620.63 1975.047128
5female 2noMexican American NA NAno00.911 13.2 NA NANA NA NANA2.31 1967.3232 2
6female35noOther Hispanic alive73no01.642101.91.0608 NANA NA111670.0025094.0592 7
Motors <- readr::read_csv('https://mdbeckman.github.io/dcSupplement/data/engines.csv')
head(Motors)
Rows: 39 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Engine
dbl (8): mass, ncylinder, strokes, displacement, bore, stroke, BHP, RPM
 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 9
EnginemassncylinderstrokesdisplacementborestrokeBHPRPM
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
Webra Speedy 0.13512 1.813.512.50.4522000
Motori Cipolla 0.15012 2.515.014.01.0026000
Webra Speed 20 0.25012 3.416.516.00.7822000
Webra 40 0.27012 6.521.019.00.9615500
Webra 61 Blackhead0.4301210.024.022.01.5514000
Webra 6WR 0.4901210.024.022.02.7619000
data('CPS85', package = 'mosaicData')
head(CPS85)
A data.frame: 6 × 11
wageeducracesexhispanicsouthmarriedexperunionagesector
<dbl><int><fct><fct><fct><fct><fct><int><fct><int><fct>
1 9.010WMNHNSMarried27Not 43const
2 5.512WMNHNSMarried20Not 38sales
3 3.812WFNHNSSingle 4Not 22sales
410.512WFNHNSMarried29Not 47clerical
515.012WMNHNSMarried40Union58const
6 9.016WFNHNSMarried27Not 49clerical

05 - Intro to data graphics#

head(x = dcData::NCHS, n = 5)
A data.frame: 5 × 31
sexagepregnantethnicitydeathfollowupsmokerdiabeticheightweightbmdfmhm_otherhdlcholbpsbpdincomepop_weightpsustratum
<fct><dbl><fct><fct><chr><dbl><fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1female 2noNon-Hispanic BlackNA NAno 00.91612.5 NA NANA NA NANA0.86 2970.80415
2male 77noNon-Hispanic Whitealive90no 01.74075.41.21960.1235954215 98565.0010224.13331
3female10noNon-Hispanic WhiteNA NAno 01.36632.9 NA NA30129108631.4714172.31127
4male 1noNon-Hispanic BlackNA NAno 0 NA13.3 NA NANA NA NANA0.57 3041.59312
5male 49noNon-Hispanic Whitealive74yes01.78392.51.08701.1768842279122835.0030657.31228
head(x = dcData::NCI60, n = 5)
A data.frame: 5 × 61
ProbeBR.MCF7BR.MDA_MB_231BR.HS578TBR.BT_549BR.T47DCNS.SF_268CNS.SF_295CNS.SF_539CNS.SNB_19PR.PC_3PR.DU_145RE.786_0RE.A498RE.ACHNRE.CAKI_1RE.RXF_393RE.SN12CRE.TK_10RE.UO_31
<fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1AT_D_3-7.45-7.51-7.30-7.37-6.15-7.16-7.03-7.44-7.43-6.88-6.78-7.25-7.17-6.51-6.66-6.53-6.86-7.03-6.85
2AT_D_5-7.05-6.62-6.88-6.78-7.18-7.25-7.22-7.37-7.26-6.27-7.04-6.68-6.83-6.65-6.61-6.23-6.74-6.85-6.67
3AT_D_M-7.05-7.29-7.30-7.37-7.61-6.56-7.63-7.44-7.43-6.45-7.29-7.25-7.14-6.53-6.93-6.17-7.20-6.93-6.85
4AT_L_3-7.32-7.01-7.22-6.60-7.45-7.00-7.32-7.27-7.39-6.30-7.01-6.73-6.50-6.37-6.54-7.07-6.86-6.55-6.67
5AT_L_5-7.38-7.22-7.30-7.37-7.70-7.25-7.61-7.44-7.43-6.48-7.15-7.06-6.44-6.47-6.62-7.00-7.22-6.89-6.36

10 - More data verbs#

head(x = dcData::CountryData, n = 5)
A data.frame: 5 × 76
countryareapopgrowthbirthdeathmigrmaternalinfantlifemainlinescellnetHostsnetUsersairportsrailwaysroadwayswaterwaysmarinemilitary
<chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1Afghanistan 65223031822848 2.2938.8414.12 -1.83460117.2350.49 1350018000000 2231000000 52 NA 421501200NA NA
2Akrotiri 123 15700 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NANA NA
3Albania 28748 3020209 0.3012.73 6.47 -3.31 27 13.1977.96 312000 3500000155281300000 4 339 18000 41171.47
4Algeria 238174138813722 1.8823.99 4.31 -0.93 97 21.7676.39320000037692000 67647000001573973113655 NA384.48
5American Samoa 199 54517-0.3522.87 4.68-21.64 NA 8.9274.91 10000 NA 2387 NA 3 NA 241 NANA NA

11 - Joining two data frames#

head(x = dcData::MigrationFlows, n = 3) %>%
  select(destcode, origincode, Y2000)
A data.frame: 3 × 3
destcodeorigincodeY2000
<fct><fct><int>
1FRAAFG 923
2FRADZA425229
3FRAAUS 9168
head(x = dcData::CountryData, n = 6) %>%
  select(country, life, infant)
A data.frame: 6 × 3
countrylifeinfant
<chr><dbl><dbl>
1Afghanistan 50.49117.23
2Akrotiri NA NA
3Albania 77.96 13.19
4Algeria 76.39 21.76
5American Samoa74.91 8.92
6Andorra 82.65 3.69
head(x = dcData::CountryCentroids, n = 4) %>%
  select(name, iso_a3)
A data.frame: 4 × 2
nameiso_a3
<chr><chr>
1AfghanistanAFG
2Aland ALA
3Albania ALB
4Algeria DZA
# to translate the variable `country` in table `CountryData`
# to the code `iso_a3`,
# join table `CountryCentroids` to table `CountryData`

InfantMortality <-
  CountryCentroids %>%
  select(name, iso_a3) %>%
  left_join(y = CountryData %>% select(country, infant),
            by = c('name' = 'country'))
head(x = InfantMortality, n = 6)
A data.frame: 6 × 3
nameiso_a3infant
<chr><chr><dbl>
1Afghanistan AFG117.23
2Aland ALA NA
3Albania ALB 13.19
4Algeria DZA 21.76
5American SamoaASM 8.92
6Andorra AND 3.69
# infant mortality of the destination country

MigrationFlows %>%
  left_join(y = InfantMortality, by = c('destcode' = 'iso_a3')) %>%
  head(n = 6) %>%
  select(sex, destcode, origincode, Y2000, name, infant)
A data.frame: 6 × 6
sexdestcodeorigincodeY2000nameinfant
<fct><chr><fct><int><chr><dbl>
1MaleFRAAFG 923France3.31
2MaleFRADZA425229France3.31
3MaleFRAAUS 9168France3.31
4MaleFRAAUT 7764France3.31
5MaleFRAAZE 118France3.31
6MaleFRABLR 245France3.31
# infant mortality of the origin country

MigrationFlows %>%
  left_join(y = InfantMortality, by = c('origincode' = 'iso_a3')) %>%
  head(n = 6) %>%
  select(sex, destcode, origincode, Y2000, name, infant)
A data.frame: 6 × 6
sexdestcodeorigincodeY2000nameinfant
<fct><fct><chr><int><chr><dbl>
1MaleFRAAFG 923Afghanistan117.23
2MaleFRADZA425229Algeria 21.76
3MaleFRAAUS 9168Australia 4.43
4MaleFRAAUT 7764Austria 4.16
5MaleFRAAZE 118Azerbaijan 26.67
6MaleFRABLR 245Belarus 3.64
head(x = dcData::CountryGroups, n = 5)
A data.frame: 5 × 4
countryG8G20GGG
<fct><lgl><lgl><lgl>
1Canada TRUETRUEFALSE
2France TRUETRUEFALSE
3GermanyTRUETRUEFALSE
4Italy TRUETRUEFALSE
5Japan TRUETRUEFALSE
G8Countries <-
  CountryGroups %>%
  dplyr::filter(G8) %>%
  select(country)
G8Countries
A data.frame: 8 × 1
country
<fct>
Canada
France
Germany
Italy
Japan
Russia
United Kingdom
United States
G8CountryData <-
  CountryData %>%
    inner_join(y = G8Countries, by = join_by(country))
G8CountryData %>%
  select(country, GDP, pop)
A data.frame: 8 × 3
countryGDPpop
<chr><dbl><dbl>
Canada 1.518e+12 34834841
France 2.276e+12 66259012
Germany 3.227e+12 80996685
Italy 1.805e+12 61680122
Japan 4.729e+12127103388
Russia 2.553e+12142470272
United Kingdom2.387e+12 63742977
United States 1.672e+13318892103

Exercises#

[11.1]#

Most data verbs are one table-in one table-out (1I1O) whereas join data verbs are two table-in one table-out (2I1O).

dcData::BabyNames %>%
  group_by(year) %>%
    summarize(total = sum(count)) %>%
      head()
A tibble: 6 × 2
yeartotal
<int><int>
1880201484
1881192700
1882221537
1883216952
1884243468
1885240856

[11.2]#

The value of the matching variable of row \(n\) of the left table does not necessarily match the value of the matching variable of row \(n\) of the right table.

[11.3]#

head(x = dcData::ZipGeography, n = 5)
A data.frame: 5 × 13
StatePopulationHousingUnitsLandAreaWaterAreaCityNamePostOfficeNameCountyAreaCodeTimezoneLatitudeLongitudeZIP
<fct><dbl><dbl><dbl><dbl><fct><fct><fct><dbl><fct><dbl><dbl><chr>
1New York 0 0 0.1 46.3HoltsvilleHoltsvilleSuffolk 631EST 40.92233-72.6370800501
2New York 0 0 0.0170.3HoltsvilleHoltsvilleSuffolk 631EST 40.92233-72.6370800544
3 0 0 0.0 4.7Adjuntas Adjuntas Adjuntas 787EST+118.16527-66.7225800601
4 420421559080.1 0.0Aguada Aguada Aguada 787EST+118.39310-67.1809500602
5 555302162678.7 0.1Aguadilla Aguadilla Aguadilla787EST+118.45591-67.1457800603
tail(x = dcData::ZipDemography, n = 5)
A data.frame: 5 × 44
TotalpopulationMaleFemaleMedianAgeUnder5yearsX18yearsandoverX65yearsandoverOneraceWhiteBlackorAfricanAmericanMedianfamilyincomedollarsPercapitaincomedollarsFamiliesbelowpovertylevelIndividualsbelowpovertylevelSinglefamilyowneroccupiedhomesMedianvaluedollarsMedianofselectedmonthlyownercostsWithaMortgageNotmortgagedZIP
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr>
427371447 757 69030.3120 964 991329 141 4498031617626113253104800NA 64832599926
42738 120 69 5142.2 6 88 13 111 105 02975011742 8 53 47 80600NA 75017899927
42739 NA NA NA NA NA NA NA NA NANA NA NANA NA NA NANA NA NA99928
4274024241250117439.6148172628021911797 3529172168853235438127600NA117040599929
42741 36 22 1435.0 2 29 1 36 33 03979214332 0 0 0 0NA 0 099950
NewYork <-
  ZipGeography %>%
    left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
      filter(State == 'New York') %>%
        select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
          transmute(
            State     = State,
            pop       = coalesce(Population, Totalpopulation),
            age0to4   = Under5years,
            age5to17  = pop - Under5years - X18yearsandover,
            age18to64 = X18yearsandover - X65yearsandover,
            age65plus = X65yearsandover,
            total     = age0to4 + age5to17 + age18to64 + age65plus,
            check1    = total == pop,
          )
head(x = NewYork, n = 5)
A data.frame: 5 × 8
Statepopage0to4age5to17age18to64age65plustotalcheck1
<fct><dbl><dbl><dbl><dbl><dbl><dbl><lgl>
1New York 0 NA NA NA NA NA NA
2New York 0 NA NA NA NA NA NA
3New York 289 NA NA NA NA NA NA
4New York17310 457 121313189 245117310TRUE
5New York84870414912750547971317484870TRUE
PopulationGroups <-
  ZipGeography %>%
    left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
      select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
        transmute(
          State     = State,
          pop       = coalesce(Population, Totalpopulation),
          age0to4   = Under5years,
          age5to17  = pop - Under5years - X18yearsandover,
          age18to64 = X18yearsandover - X65yearsandover,
          age65plus = X65yearsandover,
          total     = age0to4 + age5to17 + age18to64 + age65plus,
          check1    = total == pop,
        ) %>%
          group_by(State) %>%
            summarize(
              total0to4   = sum(age0to4,    na.rm=TRUE),
              total5to17  = sum(age5to17,   na.rm=TRUE),
              total18to64 = sum(age18to64,  na.rm=TRUE),
              total65plus = sum(age65plus,  na.rm=TRUE),
              total       = sum(pop, na.rm=TRUE)
            ) %>%
              mutate(
                diff  = total - (total0to4 + total5to17 + total18to64 + total65plus),
                check = total0to4 + total5to17 + total18to64 + total65plus == total
              )
PopulationGroups
A tibble: 53 × 8
Statetotal0to4total5to17total18to64total65plustotaldiffcheck
<fct><dbl><dbl><dbl><dbl><dbl><dbl><lgl>
0 0 0 0 741752 741752FALSE
Massachusetts 0 0 0 0 63490486349048FALSE
New Hampshire 0 0 0 0 12357351235735FALSE
New York 1239375345054011837416244822418975844 289FALSE
Rhode Island 0 0 0 0 10483191048319FALSE
Maine 0 0 0 0 12730941273094FALSE
Vermont 0 0 0 0 608822 608822FALSE
Connecticut 0 0 0 0 34055653405565FALSE
New Jersey 0 0 0 0 84139908413990FALSE
Pennsylvania 7277902194356 7439389191905212280587 0 TRUE
Delaware 51529 143043 487241 101712 783525 0 TRUE
District of Columbia 32536 82456 387169 69898 572059 0 TRUE
Maryland 3533931002779 3341007 599307 5296486 0 TRUE
Virginia 4619571276186 4547561 792248 7077952 0 TRUE
West Virginia 101791 300500 1128772 276833 1807896 0 TRUE
North Carolina 5395061424526 5116187 969035 8049254 0 TRUE
Georgia 5950871573898 5231348 785170 8185503 0 TRUE
South Carolina 264656 744885 2516802 485272 4011615 0 TRUE
Florida 9457212700255 9526267280738815979631 0 TRUE
Alabama 295936 827248 2743296 579644 4446124 0 TRUE
Mississippi 204273 570596 1725052 343378 2843299 0 TRUE
Tennessee 3742191022301 3582610 703259 5682389 0 TRUE
Kentucky 266475 730021 2546187 504739 4047422 0 TRUE
Ohio 7549242133383 6956963150773211353002 0 TRUE
Indiana 4232151151181 3753258 752831 6080485 0 TRUE
Michigan 6720021923732 61235121218958 9938204 0 TRUE
Iowa 188416 545293 1756618 436232 2926559 0 TRUE
Wisconsin 3423401026416 3292356 702550 5363662 0 TRUE
Minnesota 329597 957361 3038349 594262 4919569 0 TRUE
North Dakota 39370 121280 386412 94365 641427 0 TRUE
South Dakota 51054 151454 443889 108109 754506 0 TRUE
Illinois 8765392368834 7673666149999812419037 0 TRUE
Montana 54853 175212 551216 120960 902241 0 TRUE
Missouri 3699051057798 3412243 755444 5595390 0 TRUE
Kansas 188684 524227 1619075 356199 2688185 0 TRUE
Nebraska 117035 333140 1028660 232159 1710994 0 TRUE
Louisiana 317160 901852 2730442 516492 4465946 0 TRUE
Arkansas 181626 498826 1619048 374034 2673534 0 TRUE
Oklahoma 236042 655289 2099793 455109 3446233 0 TRUE
Texas 1624203426054312888235207166820844649 0 TRUE
Colorado 297494 803274 2784280 416049 4301097 0 TRUE
Arizona 382483 985121 3096256 667891 5131751 0 TRUE
Idaho 97558 270952 777971 145737 1292218 0 TRUE
Utah 209299 508994 1323734 190138 2232165 0 TRUE
Wyoming 30935 97924 306976 57667 493502 0 TRUE
New Mexico 130286 376732 1095624 211732 1814374 0 TRUE
California 2486843676241521019901359527133864430 0 TRUE
Nevada 145790 365847 1264174 218694 1994505 0 TRUE
Hawaii 78161 217601 755139 160588 1211489 0 TRUE
Oregon 222906 623356 2135687 438070 3420019 0 TRUE
Washington 3942801119436 3717866 662093 5893675 0 TRUE
Alaska 47514 142771 399108 35599 624992 0 TRUE
NA 0 0 0 0 0 0 TRUE
#options(repr.plot.width=20, repr.plot.height=20)

ZipGeography %>%
  filter(State != '') %>%
  left_join(y = ZipDemography, by = c('ZIP' = 'ZIP')) %>%
    select(State, Population, Totalpopulation, Under5years, X18yearsandover, X65yearsandover) %>%
      transmute(
        State     = State,
        ageTotal  = coalesce(Population, Totalpopulation, 0),
        age0to4   = coalesce(Under5years, 0),
        age5to17  = coalesce(ageTotal - Under5years - X18yearsandover, 0),
        age18to64 = coalesce(X18yearsandover - X65yearsandover, 0),
        age65plus = coalesce(X65yearsandover, 0)
      ) %>%
        pivot_longer(c(`age0to4`, `age5to17`, `age18to64`, `age65plus`), names_to = 'ageRange', values_to = 'cases') %>%
          ggplot() +
            geom_bar(
              mapping  = aes(y = State, x = cases, fill = ageRange),
              position = 'stack',
              stat     = 'identity',
              na.rm    = TRUE
            ) +
            theme(
              text=element_text(size=20)
            )
../../../_images/c59b8007fd38a0d9829d6006489bb58725e200cdce12fb51568b770b80248da0.png

15 - Statistics#

Data graphics

  • translate each case into a representation in terms of a glyph’s graphical attributes: color, position, shape, size, etc.

A representation for the collective properties of cases?

  • collective properties: features of groups of cases

  • summary functions: max, mean, median, etc.

Statistics

  • an area of science concerned with characterizing case-to-case variation and the collective properties of cases, while also quantifying uncertainty

Data Science

  • a collaboration between statistics and computer science

Confidence Interval

Confidence Band

How to display variation among cases? The distribution of values of one variable? The distribution of values of several variables?

15.2 - Density distribution of one variable#

NCHS %>%
  ggplot(aes(weight, 1)) +
    geom_point()
Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”
../../../_images/06c04ca41601ae0b29d89d4781539f37c6ed1ec346fe18c51ceb22a206008f47.png
# density of cases, represented by the degree of darkness

NCHS %>%
  ggplot(aes(weight, 1)) +
    geom_point(alpha = 0.05, position = 'jitter') +
    xlab('Weight (kg)') +
    ylab('') +
    theme_bw() +
    scale_color_grey()
Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”
../../../_images/a88e8ca34f1dceae04807392e8bf200af7ee37afe7b49b124de085c96fb23e5f.png
# density of cases, represented by the height of a function
# the scale of the y-axis has units of inverse kilograms
# the scale is arranged so that the area under the curve is equal to unity
# this convention facilitates densities for different groups; and means that narrow distributions tend to have high density

NCHS %>%
  ggplot(aes(weight)) +
    geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
    xlab('Weight (kg)') +
    ylab('Density (1/kg)') +
    theme_bw() +
    scale_color_grey()
Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”
../../../_images/d89ad92ef7c8ced7eb524c4beba1c0c53a826305d887c7fa190aa7eaee3f46a9.png
NCHS %>%
  ggplot(aes(weight)) +
    geom_density(color = 'gray', fill = 'gray', alpha = 0.75) +
    geom_point(
      alpha    = 0.02, 
      mapping  = aes(y = 0.002),
      position = position_jitter(height = 0.002)
    ) +
    xlab('Weight (kg)') +
    ylab('Density (1/kg)') +
    theme_bw() +
    scale_color_grey()
Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”
Warning message:
“Removed 2575 rows containing missing values (`geom_point()`).”
../../../_images/8bd82b9b7bc29f8191324e2dd0ed08da4163cdf931bd193a2d2f9c7ad8226bfc.png
NCHS %>%
  ggplot(aes(x = weight, group = sex)) +
  geom_density(aes(color = sex, fill = sex), alpha = 0.5) +
  xlab('Weight (kg)') +
  ylab('Density (1/kg)') +
  theme(legend.position = 'top') +
  theme_bw() +
  scale_color_grey()
Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”
../../../_images/d48c6f35cffde107e978f7ba79cf9b7264b45258d6c32b80f6905b0ba431adfb.png
NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(x = weight, group = sex)) +
      geom_density(aes(color = sex, fill = sex), alpha = 0.75) +
      facet_wrap(~ ageGroup, nrow = 1) +
      xlab('Weight (kg)') +
      ylab('Density (1/kg)') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_density()`).”
../../../_images/3c7f7f8549f60ec7ab581712e96ffa184a70112231a8198a61115000f77d74b0.png

15.3 - Other depictions of density distribution#

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        alpha         = 0.25,
        mapping       = aes(color = ageGroup, fill = ageGroup),
        outlier.color = 'gray',
        outlier.size  = 2
      ) +
      facet_wrap(~ sex) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”
../../../_images/e63a2e611e7f67bc88c3aeab89bc687e717c104d8230439932339a3d40e06fa8.png
NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_violin(fill = 'gray') +
      facet_wrap(~ sex) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_ydensity()`).”
../../../_images/30aec579a0262062b2d9ca7c1ca04a710c2cdd8afc02a33ad819422be5909e93.png
NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        alpha         = 0.75,
        mapping       = aes(fill = diabetic),
        outlier.color = 'gray',
        outlier.size  = 1,
        position = position_dodge(width = 0.8)
      ) +
      facet_grid(sex ~ .) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(legend.position = 'top') +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
../../../_images/4cc432eec1f4cf3e4ba4b0ae03ba36310c30c283b5973079980f199775303b34.png

15.4 - Confidence intervals#

NCHS %>%
  mutate(ageGroup = mosaic::ntiles(x = age, n = 6, format = 'interval')) %>%
    ggplot(aes(y = weight, x = ageGroup)) +
      geom_boxplot(
        mapping       = aes(fill = diabetic),
        notch         = TRUE,
        outlier.color = 'gray',
        outlier.size  = 1,
        position      = position_dodge(width = 0.8)
      ) +
      facet_grid(sex ~ .) +
      xlab('Age Group (yrs)') +
      ylab('Weight (kg)') +
      theme(axis.text.x = element_text(angle = 60, hjust = 1))
Warning message:
“Removed 2575 rows containing non-finite values (`stat_boxplot()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“The following aesthetics were dropped during statistical transformation: fill
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
../../../_images/82cc6eb339f0eb5e39e3387dadc8a88d7be24faf294723758fb166fd8b401968.png

15.5 - Model functions#

head(NCHS)
A data.frame: 6 × 31
sexagepregnantethnicitydeathfollowupsmokerdiabeticheightweightbmdfmhm_otherhdlcholbpsbpdincomepop_weightpsustratum
<fct><dbl><fct><fct><chr><dbl><fct><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1female 2noNon-Hispanic BlackNA NAno 00.91612.5 NA NANA NA NANA0.86 2970.80415
2male 77noNon-Hispanic Whitealive90no 01.74075.41.2196 0.1235954215 98565.0010224.13331
3female10noNon-Hispanic WhiteNA NAno 01.36632.9 NA NA30129108631.4714172.31127
4male 1noNon-Hispanic BlackNA NAno 0 NA13.3 NA NANA NA NANA0.57 3041.59312
5male 49noNon-Hispanic Whitealive74yes01.78392.51.0870 1.1768842279122835.0030657.31228
6female19noOther/Multi alive86no 01.62059.20.8680-1.2245261153114701.2112224.87622
NCHS %>%
  ggplot(aes(x = age, y = weight, color = diabetic)) +
    stat_smooth(se = FALSE) +
    geom_point(alpha = 0.02) +
    ylim(0, 120) +
    theme(legend.position = 'top')
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: colour
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”
../../../_images/702e9ec1feff14a88ddf6d9c0e6aed8f0ea400b47d515281fdec581174e67fa8.png
NCHS %>%
  ggplot(aes(x = age, y = weight, color = diabetic)) +
    stat_smooth() +
    geom_point(alpha = 0.02) +
    ylim(0, 120) +
    theme(legend.position = 'top')
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“The following aesthetics were dropped during statistical transformation: colour
 This can happen when ggplot fails to infer the correct grouping structure in the data.
 Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?”
Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”
../../../_images/e0cbed36632f90209909be99e0c05c444ac03b08afaa7bca580ddd073bbe5e9a.png
NCHS %>%
  ggplot(aes(x = age, y = weight),
    color = diabetic,
    fill  = diabetic
  ) +
  stat_smooth(method = lm) +
  geom_point(alpha = 0.02) +
  ylim(0, 120) +
  theme(legend.position = 'top')
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 3189 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“Removed 3189 rows containing missing values (`geom_point()`).”
../../../_images/9be91693203ade5cd63a8f8fa4901fad0a0733084b6553b1668d4f232289ec36.png

16 - Data scraping and intake methods#

[Data Scraping]

Data scraping is the gathering of data from sources such as web browsers in which they are not already in a tidy (data frame/table) format and the translation of such data to one or more data frames/tables.

[Data Cleaning]

Data clearning is the correcting of errors in data that stem either from blunders in data entry or from deficiencies in the way data is stored or coded.

Data frame-friendly formats

  • CSV

    • base::read.csv(stringsAsFactors = FALSE)

    • data.table::fread()

    • mosaic::read.file()

    • readr::read_csv()

  • technical software package

    • Octave/MATLAB

    • Stata

    • SPSS

    • Minitab

    • SAS

    • Epi

  • relational databases

  • Excel

  • web

    • HTML <table>

    • JSON

    • XML

    • Google spreadsheet qua HTML

    • API

[List]

A list is an R object used to store a collection of other R objects. Elements of a list can even have different types; e.g., data frames, plots, model objects, even other lists.

  • as.numeric()

  • as.character()

  • readr::parse_number()

webURL      <- 'https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv'
myDataTable <- readr::read_csv(webURL)
myDataTable %>%
  select(price, bedrooms, bathrooms, fuel, air_cond, construction) %>%
  head()
Rows: 1728 Columns: 16
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...
 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 6
pricebedroomsbathroomsfuelair_condconstruction
<dbl><dbl><dbl><dbl><dbl><dbl>
13250021.0300
18111532.5200
10900041.0200
15500031.5200
8606021.0211
12000041.0200
# RStudio
file_name <- file.choose() # then navigate and click on your file
MyDataTable2 <-
  data.table::fread(
    input  = file_name,
    nrows  = 0,
    select = c(1, 4, 5, 10),
    drop   = c(2, 3, 6)
  ) %>%
  as.data.frame()
web_page <- 'https://en.wikipedia.org/wiki/Mile_run_world_record_progression'

SetOfTables <-
  web_page %>%
    read_html() %>%
    html_nodes(css = 'table') %>%
    html_table(fill = TRUE)

length(SetOfTables)

Table3 <- SetOfTables[[3]]
Table3
Table4 <- SetOfTables[[4]]
Table4
15
A tibble: 5 × 5
TimeAthleteNationalityDateVenue
<chr><chr><chr><chr><chr>
4:52Cadet Marshall United Kingdom2 September 1852Addiscome
4:45Thomas Finch United Kingdom3 November 1858 Oxford
4:45St. Vincent HammickUnited Kingdom15 November 1858Oxford
4:40Gerald Surman United Kingdom24 November 1859Oxford
4:33George Farran United Kingdom23 May 1862 Dublin
A tibble: 4 × 2
X1X2
<lgl><chr>
NARatified
NANot ratified
NARatified but later rescinded
NAPending ratification
# Four of the variables from the `houses-for-sale.csv` file giving features of the Saratoga, NY houses stored as integer codes; each case is a different house.
Houses <-
  read_csv('https://mdbeckman.github.io/dcSupplement/data/houses-for-sale.csv')
Houses %>%
  select(fuel, heat, sewer, construction) %>%
  head()
Rows: 1728 Columns: 16
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (16): price, lot_size, waterfront, age, land_value, construction, air_co...
 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 4
fuelheatsewerconstruction
<dbl><dbl><dbl><dbl>
3420
2320
2330
2220
2231
2220
# Codes for the house system types.
#   describes the codes in a format that makes it easy to add new code values as the need arises
Translations <-
  read_csv('https://mdbeckman.github.io/dcSupplement/data/house_codes.csv')
Translations %>%
  head()
Rows: 13 Columns: 3
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): system_type, meaning
dbl (1): code
 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 3
codesystem_typemeaning
<dbl><chr><chr>
0new_const no
1new_const yes
1sewer_type none
2sewer_type private
3sewer_type public
0central_airno
# There is a column for each system type that translates the integer code to a meaningful term.
# In cases where the integer has no corresponding term, `invalid` has been entered; this provides a quick way to distinguish between incorrect entries and missing entries.
CodeVals <-
  Translations %>%
    spread(key = system_type, value = meaning, fill = 'invalid')
CodeVals %>%
  head()
A tibble: 5 × 6
codecentral_airfuel_typeheat_typenew_constsewer_type
<dbl><chr><chr><chr><chr><chr>
0no invalid invalid no invalid
1yes invalid invalid yes none
2invalidgas hot air invalidprivate
3invalidelectrichot waterinvalidpublic
4invalidoil electric invalidinvalid
# To carry out the translation, join each variable, one at a time, to the data frame of interest.
# Note how the `by` value changes for each variable.
Houses <-
  Houses %>%
    left_join(CodeVals %>% select(code, fuel_type),  by = c(fuel  = 'code')) %>%
    left_join(CodeVals %>% select(code, heat_type),  by = c(heat  = 'code')) %>%
    left_join(CodeVals %>% select(code, sewer_type), by = c(sewer = 'code'))
Houses %>% head()
A tibble: 6 × 19
pricelot_sizewaterfrontageland_valueconstructionair_condfuelheatsewerliving_areapct_collegebedroomsfireplacesbathroomsroomsfuel_typeheat_typesewer_type
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr><chr><chr>
1325000.090 425000000342 90635211.05electricelectric private
1811150.920 02230000232195351302.56gas hot waterprivate
1090000.190133 730000233194451411.08gas hot waterpublic
1550000.410 131870000222194451311.55gas hot air private
860600.110 01500011223 84051201.03gas hot air public
1200000.680 311400000222115222411.08gas hot air private
OrdwayBirds <-
  OrdwayBirds %>%
    mutate(
      Month = as.numeric(Month),
      Year  = as.numeric(Year),
      Day   = as.numeric(Day)
    )
head(OrdwayBirds)

WhenAndWho <-
  OrdwayBirds %>%
    select(Who = DataEntryPerson, When = Timestamp) %>%
    mutate(When = lubridate::mdy_hms(When))
head(WhenAndWho)

WhenAndWho %>%
  ggplot(aes(x = When, y = Who)) +
    geom_point(alpha = 0.2)

WhenAndWho %>%
  group_by(Who) %>%
  summarize(start  = min(When, na.rm = TRUE),
            finish = max(When, na.rm = TRUE)) %>%
  head()
A data.frame: 6 × 26
bogusTimestampYearDayMonthCaptureTimeSpeciesNameSexAgeBandNumberConditionReleaseCommentsDataEntryPersonWeightWingChordTemperatureRecaptureOriginalRecapturePreviousTailLength
<chr><chr><dbl><dbl><dbl><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr><chr>
3.4/14/2010 13:20:56197216 77:00:00Song Sparrow UAHY (After Hatch Year)107-151187nonenonenoneJerald Dosch
4. NANANA Caitlin Baker
5.5/13/2010 16:00:30197216 77:00:00Song Sparrow UAHY (After Hatch Year)107-151187nonenonenoneCaitlin Baker
6.5/13/2010 16:02:15197216 77:00:00Field SparrowUAHY (After Hatch Year)1260-74572nonenonenoneCaitlin Baker
7.5/13/2010 16:03:18197216 77:00:00Field SparrowUAHY (After Hatch Year)1260-74541nonenonenoneCaitlin Baker
8.5/13/2010 16:04:23197216 77:00:00Song Sparrow UAHY (After Hatch Year)107-151188nonenonenoneCaitlin Baker
A data.frame: 6 × 2
WhoWhen
<chr><dttm>
3Jerald Dosch 2010-04-14 13:20:56
4Caitlin BakerNA
5Caitlin Baker2010-05-13 16:00:30
6Caitlin Baker2010-05-13 16:02:15
7Caitlin Baker2010-05-13 16:03:18
8Caitlin Baker2010-05-13 16:04:23
Warning message:
“Removed 4 rows containing missing values (`geom_point()`).”
Warning message:
“There were 2 warnings in `summarize()`.
The first warning was:
 In argument: `start = min(When, na.rm = TRUE)`.
 In group 1: `Who = ""`.
Caused by warning in `min.default()`:
! no non-missing arguments to min; returning Inf
 Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”
A tibble: 6 × 3
Whostartfinish
<chr><dttm><dttm>
Inf-Inf
Abby Colehour 2011-04-23 15:50:242011-04-23 15:50:24
Brennan Panzarella2010-09-13 10:48:122011-04-10 21:58:56
Caitlin Baker 2010-05-13 16:00:302010-05-28 19:41:52
Emily Merrill 2010-06-08 09:10:012010-06-08 14:47:21
Jerald Dosch 2010-04-14 13:20:562010-04-14 13:20:56
../../../_images/da2fc2283f03596019963d311c54f11231eb1c53205c6568bdda953177e106bb.png

Exercises#

[16.6.1]#

Here are some character strings containing times or dates written in different formats. Your task it twofold: (A) for each, choose an appropriate function from the lubridate package to translate the character string into a datetime object in R and then (B) use R to calculate the number of days between that date and your birthday.

'April 30, 1777'              # Johann Carl Friedrich Gauss
'06-23-1912'                  # Alan Turing
'3 March 1847'                # Alexander Graham Bell
'Nov. 11th, 1918 at 11:00 am' # the armistice ending WWI on the western front
'July 20, 1969'               # the first manned moon landing
lubridate::mdy('April 30, 1777')
lubridate::mdy('06-23-1912')
lubridate::dmy('3 March 1847')
lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am')
lubridate::mdy('July 20, 1969')

birthday <- lubridate::mdy('February 4, 1992')
birthday
[1] "1918-11-11 11:00:00 UTC"
birthday - lubridate::mdy('April 30, 1777')
birthday - lubridate::mdy('06-23-1912')
birthday - lubridate::dmy('3 March 1847')
birthday - lubridate::as_date(lubridate::mdy_hm('Nov. 11th, 1918 at 11:00 am'))
birthday - lubridate::mdy('July 20, 1969')
Time difference of 78441 days
Time difference of 29080 days
Time difference of 52933 days
Time difference of 26748 days
Time difference of 8234 days

[16.6.2]#

Here are some strings containing numerical amounts. For each one, say whether as.numeric() or readr::parse_number() (or both or neither) properly converts the given string to a numeric value.

'42,659.30'
'17%'
'Nineteen'
'£100'
'9.8 m/seconds-square'
'6.62606957 x 10^-34 m2 kg / s'
'6.62606957e-34'
'42.659,30' # a European style
as.numeric('42,659.30')                     # incorrect
as.numeric('17%')                           # incorrect
as.numeric('Nineteen')                      # incorrect
as.numeric('£100')                          # incorrect
as.numeric('9.8 m/seconds-square')          # incorrect
as.numeric('6.62606957 x 10^-34 m2 kg / s') # incorrect
as.numeric('6.62606957e-34')                #   correct
as.numeric('42.659,30')                     # incorrect
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
6.62606957e-34
Warning message in eval(expr, envir, enclos):
“NAs introduced by coercion”
<NA>
readr::parse_number('42,659.30')                     #   correct
readr::parse_number('17%')                           #   correct
readr::parse_number('Nineteen')                      # incorrect
readr::parse_number('£100')                          #   correct
readr::parse_number('9.8 m/seconds-square')          #   correct
readr::parse_number('6.62606957 x 10^-34 m2 kg / s') # incorrect
readr::parse_number('6.62606957e-34')                #   correct
readr::parse_number('42.659,30')                     # incorrect
42659.3
17
Warning message:
“1 parsing failure.
row col expected   actual
  1  -- a number Nineteen
”
<NA>
100
9.8
6.62606957
6.62606957e-34
42.6593

[16.6.3]#

Grab Table 4 (or another similar table) from the Wikipedia page on world records in the mile (or some similar event). Make a plot of the record time versus the date in which it occurred. Also, mark each point with the name of the athlete written above the point. (Hint: Use geom_text().) To convert time entries such as '4:20.5' into seconds, use the lubridate package’s as.duration(ms('40:20.5')). You can get rid of the footnote markers such as [5] in the dates using the gsub() transformation function which replaces the characters identified in the first argument with those in the second argument. The string '\\[.\\]$' is an example of a regular expression which identifies a pattern of characters, in this case a single character in square brackets just before the end of the string.

Table4 %>%
  mutate(
    Date = lubridate::dmy(gsub('\\[.\\]$', '', Date)),
    Time = lubridate::as.duration(lubridate::ms(Time))
  ) %>%
  ggplot(aes(x = Date, y = Time)) +
    geom_point() +
    geom_text(aes(label = Athlete, hjust = -0.1, vjust = -1)) +
    labs(x = 'Year', y = 'Time (s)')
Error in `mutate()`:
 In argument: `Date = lubridate::dmy(gsub("\\[.\\]$", "", Date))`.
Caused by error in `as.character()`:
! cannot coerce type 'closure' to vector of type 'character'
Traceback:

1. Table4 %>% mutate(Date = lubridate::dmy(gsub("\\[.\\]$", "", 
 .     Date)), Time = lubridate::as.duration(lubridate::ms(Time))) %>% 
 .     ggplot(aes(x = Date, y = Time))
2. ggplot(., aes(x = Date, y = Time))
3. mutate(., Date = lubridate::dmy(gsub("\\[.\\]$", "", Date)), 
 .     Time = lubridate::as.duration(lubridate::ms(Time)))
4. mutate.data.frame(., Date = lubridate::dmy(gsub("\\[.\\]$", "", 
 .     Date)), Time = lubridate::as.duration(lubridate::ms(Time)))
5. mutate_cols(.data, dplyr_quosures(...), by)
6. withCallingHandlers(for (i in seq_along(dots)) {
 .     poke_error_context(dots, i, mask = mask)
 .     context_poke("column", old_current_column)
 .     new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
 . }, error = dplyr_error_handler(dots = dots, mask = mask, bullets = mutate_bullets, 
 .     error_call = error_call, error_class = "dplyr:::mutate_error"), 
 .     warning = dplyr_warning_handler(state = warnings_state, mask = mask, 
 .         error_call = error_call))
7. mutate_col(dots[[i]], data, mask, new_columns)
8. mask$eval_all_mutate(quo)
9. eval()
10. lubridate::dmy(gsub("\\[.\\]$", "", Date))
11. .parse_xxx(..., orders = "dmy", quiet = quiet, tz = tz, locale = locale, 
  .     truncated = truncated)
12. unlist(lapply(list(...), .num_to_date), use.names = FALSE)
13. lapply(list(...), .num_to_date)
14. gsub("\\[.\\]$", "", Date)
15. .handleSimpleError(function (cnd) 
  . {
  .     local_error_context(dots, i = frame[[i_sym]], mask = mask)
  .     if (inherits(cnd, "dplyr:::internal_error")) {
  .         parent <- error_cnd(message = bullets(cnd))
  .     }
  .     else {
  .         parent <- cnd
  .     }
  .     message <- c(cnd_bullet_header(action), i = if (has_active_group_context(mask)) cnd_bullet_cur_group_label())
  .     abort(message, class = error_class, parent = parent, call = error_call)
  . }, "cannot coerce type 'closure' to vector of type 'character'", 
  .     base::quote(as.character(x)))
16. h(simpleError(msg, call))
17. abort(message, class = error_class, parent = parent, call = error_call)
18. signal_abort(cnd, .file)


Project: Bicycle Sharing#

Project: Bicycle Sharing from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.


# information about the location of each of the stations in the system
station_url <- 'https://mdbeckman.github.io/dcSupplement/data/DC-Stations.csv'
Stations    <- readr::read_csv(station_url)
# the rental history from 2014 Q4
#   the `Trips` data table is a random subset of 10,000 trips from the full quarterly data
#   the full data set of more than 600,000 trips can be accessed by removing `-Small` from the url
trip_url    <- 'https://mdbeckman.github.io/dcSupplement/data/Trips-History-Data-2014-Q4-Small.rds'
Trips       <- readRDS(gzcon(url(trip_url)))

source('https://mdbeckman.github.io/dcSupplement/R/haversine.R')
Rows: 347 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): name
dbl (4): lat, long, nbBikes, nbEmptyDocks

 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stations <-
#   Stations %>%
#     select(lat, long)
# Stations %>%
#   head()
# Trips <-
#   Trips %>%
#     select(sstation, estation, client, sdate, edate)
# Trips %>%
#   head()
# the distribution of times that bikes were checked out
Trips %>%
  ggplot(aes(x = sdate)) +
    geom_density(fill = 'gray', color = NA)
../../../_images/86889dae0003983d59f37a462d0aaa564cb7e10ce06ee705585a34c1eec87491.png

How long?#

Make a box-and-whisker plot showing the distribution of the duration of rental events broken down by client. The duration of the rental can be calculated as as.numeric(edate - sdate). The units will be in either hours, minutes, or seconds. It should not be much trouble for you to figure out which one. When you make your plot, you will likely find that the axis range is being set by a few outliers. These may be bikes that were forgotten. Arrange your scale to ignore these outliers, or filter them out.

Trips %>%
  mutate(duration = as.numeric(edate - sdate)) %>%
  filter(duration <= 3600) %>%
  ggplot() +
    geom_boxplot(
      mapping = aes(x = client, y = duration / 60)
    ) +
    labs(x = 'Type of Client', y = 'Trip Duration [min]')
../../../_images/9347d63885270a62df7d10cb7217044d9107ecd2a6a89cbacb1794d6ab071f93.png

When are bikes used?#

[1] Variable Trips$sdate indicates the datetime that the bicycle was checked out of the station. Make histograms or density plots of each of these discrete components. Explain what each plot is showing about how bikes are checked out. For example, the distribution by hour of the day shows that few bikes are checked out before 5am, and that there are busy times around rush hour: 8am and 5pm.

  • Day of the year: bike rentals are highest in October; decline in November; and pick up again in the first half of December.

  • Day of the week: bike rentals are fairly stable over the course of the week; Thursday and Friday see the most rentals, and Saturday and Sunday see the least. Monday sees a little bit more than Tuesday and Wednesday.

  • Hour of the day: bike rentals are low before 5:00am; spike up during morning rush hour; decrease a bit throughout the day; spike back up during evening rush hour; and rapidly decline to early morning levels into the night.

  • Minute of the hour: I’m not sure there’s much of a trend along this scale, other than that the numbers of rentals oscillate a bit (i.e., it is not fixed).

[2] A similar sort of display of events per hour can be accomplished by calculating and displaying each hour’s count. The graphic shows a lot of variation of bike use over the course of the day. Now consider two additional variables: the day of the week and the client type. Group the bike rentals by three variables: hour of the day, day of the week, and client type. Find the total number of events in each grouping and plot this count versus hour. Use the group aesthetic to represent one of the other variables and faceting to represent the other. (Hint: utilize facet_wrap() in the plotting commands.) Make the same sort of display of how bike rental vary of hour, day of the week, and client type, but use geom_density() rather than grouping and counting. Compare the two displays–one of discrete counts and one of density–and describe any major differences.

Generally, registered clients rent more bikes than casual clients across hour of the day and across day of the week. On an hourly basis, however, casual client rentals usually reach a maximum in the afternoon, when registered client rentals dip in between morning and evening rush hour. Also, casual client rentals peak over the weekend, when registered client rentals drop the most.

Trips$sdate[1]
lubridate::yday(Trips$sdate)[1] # day of the year from 1 to 365
lubridate::wday(Trips$sdate)[1] # day of the week from Sunday to Saturday
lubridate::hour(Trips$sdate)[1] # hour of the day
lubridate::minute(Trips$sdate)[1] # minute of the hour
[1] "2014-10-15 08:58:00 UTC"
288
4
8
58
Trips %>%
  ggplot(aes(x = lubridate::yday(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Day of the Year')
../../../_images/04756986fc22cb17998db839f9c171ee6d44fda3311a326799d4a336d8872e18.png
Trips %>%
  ggplot(aes(x = lubridate::wday(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Day of the Week') +
    scale_x_continuous(breaks = 0:8)
../../../_images/b32b4203a0a59bff834f80110fe3eee8aeffebc27d13d5200d585cd1c6a3ae51.png
Trips %>%
  ggplot(aes(x = lubridate::hour(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Hour of the Day') +
    scale_x_continuous(breaks = 0:24)
../../../_images/abc0534248a19b4e2753be734e5c1ab24dd0865cc2e0ef1399989d03a3b5ff7b.png
Trips %>%
  ggplot(aes(x = lubridate::minute(sdate))) +
    geom_density(fill = 'gray', color = NA) +
    xlab('Minute of the Hour') +
    scale_x_continuous(breaks = 0:60)
../../../_images/53a72525b478bbebbc620527772174573b492da9790110d1bae47f403eeff942.png
Trips %>%
  mutate(
    HOD = lubridate::hour(sdate),
    DOW = lubridate::wday(sdate),
  ) %>%
  group_by(HOD, DOW, client) %>%
  summarize(total = n()) %>%
  head()
`summarise()` has grouped output by 'HOD', 'DOW'. You can override using the
`.groups` argument.
A grouped_df: 6 × 4
HODDOWclienttotal
<int><dbl><chr><int>
01Casual 1
01Registered30
02Registered 5
03Casual 2
03Registered 8
04Registered 7
options(repr.plot.width = 20)

Trips %>%
  mutate(
    HOD = lubridate::hour(sdate),
    DOW = lubridate::wday(sdate),
  ) %>%
  group_by(HOD, DOW, client) %>%
  summarize(total = n()) %>%
  ggplot(mapping = aes(x = HOD, y = total)) +
    geom_line(mapping = aes(color = client)) + # the `group` aesthetic is unnecessary when using the `color` aesthetic
    geom_point() +
    facet_wrap(. ~ DOW, nrow = 1) +
    xlab('H') +
    scale_x_continuous(breaks = 0:23)

options(repr.plot.width =  7)
`summarise()` has grouped output by 'HOD', 'DOW'. You can override using the
`.groups` argument.
../../../_images/31e2b7bdbaf2a9acd089a8aa52e9b86e792275cae5405954b5cf226931e109d5.png

How far?#

Find the distance between each pair of stations. You know the position from the lat and long variables in Stations. This is enough information to find the distance. The calculation has been implemented in the haversine() function. haversine() is a transformation function. To use it, create a data table where a case is a pair of stations and there are variables for the latitude and longitude of the starting station and the ending station. To do this, join the Station data to itself. The statements Simple and Simple2 show how to create appropriately named variables for joining. Look the head() of Simple and Simple2 and make sure you understand how they are related to Stations. The joining of Simple and Simple2 should match every station to every other station. (Since a ride can start and end at the same station, it also makes sense to match each station to itself.) This sort of matching does not make use of any matching variables; everything is matched to everything else. This is called a full outer join. (A full outer join matches every case in the left table to each and every case in the right table.) Make sure you understand what the full outer join does before proceeding. For instance, you should be able to predict how many cases the output will have when the left input has n cases and the right has m cases.

[1] There are 347 cases in the Stations data table. How many cases will there be in a full outer join of Simple to Simple2?

\(120409 = 347 \times 347\)

It’s often impractical to carry out a full outer join. For example, joining BabyNames to itself with a full outer join will generate a result with more than three trillion cases. (Three trillion cases from the BabyNames data is the equivalent of about 5 million hours of MP3 compressed music. A typical human lifespan is about 0.6 million hours.) Perform the full outer join and then use haversine() to compute the distance between each pair of stations. Check your result for sensibility. Make a histogram of the station-to-station distances and explain where it looks like what you would expect. (Hint: you could use the Internet to look up the distance from one end of Washington, D.C. to the other.) Once you have PairDistances, you can join it with Trips to calculate the start-to-end distance of each trip. (Of course, a rider may not go directly from one station to another.)

[2] Look at the variables in Stations and Trips and explain why Simple and Simple2 were given different variable names for the station.

Simple and Simple2 were given different variable names for the station because we want start station-end station pairs, which serves as the composite key on which we join the resulting table to the Trips table. If we don’t explicitly change the name of the name variable, then we won’t have a composite key on which to join the resulting table to the Trips table.

An inner_join() is appropriate for finding the distance of each ride. (Watch out! The Trips data and the PairDistances data are large enough that the join is expensive: it takes about a minute.) Display the distribution of the ride distances of the ride.

[3] Compare it to the distances between pairs of stations. Are they similar? Why or why not?

The visualization shows that the pairwise distance between stations has a larger spread than trip distance, which is skewed to the right (i.e., most trips are short, whereas a fair number of station-station distances are quite large: these routes are probably not used very often).

Simple <-
  Stations %>%
    select(name, lat, long) %>%
    rename(sstation = name)
Simple2 <-
  Simple %>%
    rename(estation = sstation, lat2 = lat, long2 = long)
StationPairs <-
  merge(Simple, Simple2, by = NULL)
PairDistances <-
  StationPairs %>%
    mutate(distance = haversine(lat, long, lat2, long2)) %>%
    select(sstation, estation, distance)
PairDistances %>%
  ggplot() +
    geom_density(aes(x = distance))
../../../_images/395d7d39ee42f66906df24fa14f93e5d64bb01cf6b0f391873dece38bc85ae14.png
Trips %>%
  inner_join(PairDistances, by = c('sstation', 'estation')) %>%
  ggplot(aes(x = distance)) +
    geom_density()
../../../_images/ccd9a9e21adf1880bfacd3ebf8cd04282172d9673f484e5ffa897503988b658a.png
Trips %>%
  inner_join(PairDistances, by = c('sstation', 'estation')) %>%
  ggplot(aes(x = distance)) +
    geom_density(
      color = 'blue'
    ) +
    geom_density(
      alpha = 0.2,
      color = NA,
      data  = PairDistances,
      fill  = 'red'
    ) +
    xlab('Trip Distance (km)')
../../../_images/5e04897419fc4e58d288084565f46b1a0c02c48c84d25e93f900fe0d82269a1d.png

Mapping the stations#

You can draw detailed maps with the leaflet package. You may need to install it. leaflet works much like ggplot() but provides special facilities for maps.

stationMap <-
  leaflet(Stations) %>%
    addTiles() %>%
    addCircleMarkers(radius = 2, color = 'red')
    #setView(-77.04, 38.9, zoom = 12)
stationMap
Assuming "long" and "lat" are longitude and latitude, respectively

Long-distance stations#

Around each station on the map, draw a circle whose radius reflects the median distance covered by rentals starting at that station. To draw the circles, use the same leaflet commands as before, but add in a line like this: addCircles(radius = ~ mid, color = "blue", opacity = 0.0001). For addCircles() to draw circles at the right scale, the units of the median distance should be presented in meters rather than kilometers. This will create too much overlap, unfortunately. So, set the radius to be half or one-third the median distance in meters. From your map, explain the pattern you see in the relationship between station location and median distance.


Project: Bird Species#

Project: Bird Species from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.


?dcData::OrdwayBirds
OrdwayBirds               package:dcData               R Documentation

_B_i_r_d_s _c_a_p_t_u_r_e_d _a_n_d _r_e_l_e_a_s_e_d _a_t _O_r_d_w_a_y, _c_o_m_p_l_e_t_e _a_n_d _u_n_c_l_e_a_n_e_d

_D_e_s_c_r_i_p_t_i_o_n:

     The historical record of birds captured and released at the
     Katharine Ordway Natural History Study Area, a 278-acre preserve
     in Inver Grove Heights, Minnesota, owned and managed by Macalester
     College.

_U_s_a_g_e:

     data("OrdwayBirds")
     
_F_o_r_m_a_t:

     A data frame with 15,829 observations on the following 26
     variables:

     bogus

     Timestamp indicates when the data were entered into an electronic
          record, not anything about the bird being described

     Year year of capture

     Day day of capture

     Month month of capture

     CaptureTime time of capture

     SpeciesName

     Sex

     Age

     BandNumber

     TrapID

     Weather

     BandingReport

     RecaptureYN

     RecaptureMonth

     RecaptureDay

     Condition

     Release

     Comments

     DataEntryPerson

     Weight

     WingChord

     Temperature

     RecaptureOriginal

     RecapturePrevious

     TailLength

_D_e_t_a_i_l_s:

     There are many extraneous levels of variables such as species.
     Part of the purpose of this data set is to teach about data
     cleaning.

_S_o_u_r_c_e:

     Jerald Dosch, Dept. of Biology, Macalester College: the manager of
     the Study Area.

_S_e_e _A_l_s_o:

     `OrdwaySpeciesNames`

_E_x_a_m_p_l_e_s:

     data(OrdwayBirds)
     str(OrdwayBirds)
     
?dcData::OrdwaySpeciesNames
OrdwaySpeciesNames           package:dcData            R Documentation

_C_o_r_r_e_c_t_e_d _S_p_e_c_i_e_s _N_a_m_e_s _f_o_r _t_h_e _O_r_d_w_a_y _B_i_r_d_s

_D_e_s_c_r_i_p_t_i_o_n:

     This data frame lists all the species name that appear in
     `OrdwayBirds`. In many cases, the species name was mis-spelled in
     the original. As a result, many birds are listed as separate
     species even though, in reality, they all belong to the same
     species. For each potentially mis-spelled species name, this table
     gives a standardized name.

_U_s_a_g_e:

     data("OrdwaySpeciesNames")
     
_F_o_r_m_a_t:

     A data frame with 265 observations on the following 2 variables:

     SpeciesName The original spelling, or misspelling, of a bird
          species.

     SpeciesNameCleaned Corrected spelling (or NA if the original was
          not identifiable.)

_S_o_u_r_c_e:

     Daniel Kaplan and students in a 2013 Data and Computing
     Fundamentals class at Macalester College (Saint Paul, MN) read
     through original names in `OrdwayBirds` and typed corrected
     spelling shown in `SpeciesNameCleaned`.

_S_e_e _A_l_s_o:

     `OrdwayBirds`

_E_x_a_m_p_l_e_s:

     data(OrdwaySpeciesNames)
     str(OrdwaySpeciesNames)
     
OrdwayBirds %>%
  select(Month, Day) %>%
  head()
A data.frame: 6 × 2
MonthDay
<dbl><dbl>
3 716
4NANA
5 716
6 716
7 716
8 716
OrdwayBirds <-
  OrdwayBirds %>%
  select(SpeciesName, Month, Day) %>%
  mutate(
    Month = as.numeric(as.character(Month)),
    Day   = as.numeric(as.character(Day))
  )
OrdwayBirds %>%
  head()
A data.frame: 6 × 3
SpeciesNameMonthDay
<chr><dbl><dbl>
3Song Sparrow 716
4 NANA
5Song Sparrow 716
6Field Sparrow 716
7Field Sparrow 716
8Song Sparrow 716
OrdwaySpeciesNames %>%
  filter(is.na(SpeciesNameCleaned))
A data.frame: 50 × 2
SpeciesNameSpeciesNameCleaned
<chr><chr>
NA
-lost- NA
-missing- NA
[Nothing, just dashes]NA
13:00:00 NA
Bank Swallow NA
Barn Swallow NA
Bay-breasted Warbler NA
Blackpoll Warbler NA
Blue Jay NA
Blue-headed Vireo NA
Blue-winged Warbler NA
Bluebird NA
Boreal Chickadee NA
Brewer's Sparrow NA
Brown Creeper NA
Brown Thrasher NA
Brown Towhee NA
Cactus Wren NA
Common Crow NA
Common Grackle NA
Common Nighthawk NA
Common Redpoll NA
Common Yellowthroat NA
Connecticut Warbler NA
Downy Woodpecker NA
E Bluebird NA
Eastern Bluebird NA
Eastern Kingbird NA
Eastern Meadowlark NA
Eastern Robin NA
Flicker NA
Fox Sparrow NA
Goldfinch NA
Grackle NA
Green Heron NA
Ground Dove NA
Hairy Woodpecker NA
Hermit Thrush NA
Horned Lark NA
House Finch NA
House Sparrow NA
Inca Dove NA
Indigo Bunting NA
Killdeer NA
Kingbird NA
Kiskadee F.C. NA
Magnolia Warbler NA
Mockingbird NA
Rough-winged Swallow NA

Task 1#

[1] Including misspellings, how many different species are there in the OrdwayBirds data?

There are 275 unique values of the variable SpeciesName. This reduces to 268 after dropping the following invalid values:

  • ''

  • '-lost-'

  • '-missing-'

  • '13:00:00'

  • '[Nothing, just dashes]'

  • 'lost'

  • 'none'

[2] Consider the OrdwaySpeciesNames data frame also found in the dcData package as well. How many distinct species are there in the SpeciesNameCleaned variable in OrdwaySpeciesNames? You will find it helpful to use n_distinct() a reduction function, which counts the number of unique values in a variable.

There are 108 unique values of the variable SpeciesNameCleaned after accounting for the value NA.

OrdwayBirds %>%
  count(SpeciesName)
A data.frame: 275 × 2
SpeciesNamen
<chr><int>
4
-lost- 1
-missing- 1
13:00:00 1
Acadian Flycatcher 1
American Gold Finch 50
American Goldfinch 1153
American Golf Finch 1
American Redfinch 1
American Redstart 3
American Robin 4
Arkansas Kingbird 1
Baltimore Oriole 206
Bank Swallow 21
Barn Swallow 23
Batimore Oriole 1
Bay-breasted Warbler 2
Blac-capped Chickadee 1
Black and White Warbler 9
Black-Capped Chickadee 13
Black-and-white Warbler 1
Black-billed Cookoo 1
Black-billed Cuckoo 15
Black-capeed Chickadee 1
Black-capped Chicakdee 1
Black-capped Chickadee 1110
Black-capped Chikadee 1
Black-capped chickadee 187
Black-throat Sparrow 31
Black-throat-Sparrow 1
White-breast Nuthatch 23
White-breasted Nuthatch 236
White-crown Sparrow 17
White-crowned Sparrow 78
White-eyed Vireo 1
White-thorat Sparrow 1
White-throat Sparrow 86
White-throated Sparrow 229
White-winged Junco 2
Wht-brstd Nuthatch 1
Wilson Warbler 4
Wilson's Warbler 22
Winter Wren 1
Wood Pewee 37
Wood Thrush 3
Woodcock 1
Wren 2
Yellow Flicker 1
Yellow Shafted Flicker 4
Yellow Warbler 19
Yellow-bellied Flycatcher 7
Yellow-bellied Sapsucker 3
Yellow-shaft Flicker 6
Yellow-shafted Flicker 34
Yellow-shafted flicker 6
Yellow-tailed Oriole 1
Yellowthroat 107
[Nothing, just dashes] 1
lost 1
none 2
OrdwayBirds %>%
  select(SpeciesName) %>%
    n_distinct()
275
OrdwaySpeciesNames %>%
  count(SpeciesNameCleaned)
A data.frame: 109 × 2
SpeciesNameCleanedn
<chr><int>
Acadian Flycatcher 1
American Goldfinch 3
American Redfinch 1
American Redstart 1
American Robin 1
Arkansas Kingbird 1
Baltimore Oriole 3
Black and White Warbler 2
Black-billed Cookoo 2
Black-capped Chickadee 8
Black-throat Sparrow 2
Brown-headed Cowbird 2
Cardinal 2
Carolina Chickadee 1
Catbird 4
Cedar Waxwing 2
Chestnut-backed Chickadee1
Chestnut-sided Warbler 1
Chickadee 2
Chipping Sparrow 4
Clay-colored Sparrow 3
Cowbird 1
Curve-billed Thrasher 2
Eastern Phoebe 2
Eastern Wood Pewee 2
Field Sparrow 2
Golden-Crowned Kinglet 3
Gray - cheeked Thrush 4
Great Crested Flycatcher 3
Harris's Sparrow 3
Tennessee Warbler 2
Traill's Flycatcher 1
Tree L 1
Tree Swallow 4
Tufted Titmouse 1
Unknown 1
Varied Thrush 1
Veery 1
Vesper Sparrow 1
Warbling Vireo 1
White-Crested Sparrow 1
White-Fronted Dove 1
White-breasted Nuthatch 5
White-crowned Sparrow 2
White-eyed Vireo 1
White-throat Sparrow 5
White-winged Junco 1
Wilson's Warbler 2
Winter Wren 1
Wood Pewee 1
Wood Thrush 1
Woodcock 1
Wren 1
Yellow Shafted Flicker 5
Yellow Warbler 1
Yellow-bellied Flycatcher 1
Yellow-bellied Sapsucker 1
Yellow-tailed Oriole 1
Yellowthroat 1
NA 50
OrdwaySpeciesNames %>%
  select(SpeciesNameCleaned) %>%
    n_distinct()
109

Task 2#

Use the OrdwaySpeciesNames table to create a new data frame that corrects the misspellings in SpeciesNames. This can be done easily using the inner_join() data verb. Look at the names of the variables in OrdwaySpeciesNames and OrdwayBirds.

[1] Which variable(s) was used for matching cases?

The variable SpeciesName was used for matching cases.

[2] What were the variable(s) that will be added?

The variables SpeciesNameCleaned (renamed to Species), Month, and Day will be added.

Corrected <-
  OrdwayBirds %>%
    inner_join(y = OrdwaySpeciesNames) %>%
    select(Species = SpeciesNameCleaned, Month, Day) %>%
    na.omit()
Corrected %>%
  head()
Joining with `by = join_by(SpeciesName)`
Warning message in inner_join(., y = OrdwaySpeciesNames):
“Detected an unexpected many-to-many relationship between `x` and `y`.
 Row 4 of `x` matches multiple rows in `y`.
 Row 211 of `y` matches multiple rows in `x`.
 If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.”
A data.frame: 6 × 3
SpeciesMonthDay
<chr><dbl><dbl>
1Song Sparrow 716
3Song Sparrow 716
4Field Sparrow716
5Field Sparrow716
6Field Sparrow716
7Field Sparrow716

Task 3#

Call the variable that contains the total count. Arrange this into descending order from the species with the most birds, and look through the list. (Hint: Remember n(). Also, one of the arguments to one of the data verbs will be desc(count) to arrange the cases into descending order. Display the top 10 species in terms of the number of bird captures.) Define for yourself a “major species” as a species with more than a particular threshold count. Set your threshold so that there are 5 or 6 species designated as major. Filter to produce a data frame with only the birds that belong to a major species. Save the output in a table called Majors. (Hint: Remember that summary functions can be used case-by-case when filtering or mutating a data frame that has been grouped.)

[1] How many bird captures are reported for each of the corrected species?

See below for the result (major species threshold >= 1000).

Corrected %>%
  count(Species) %>%
  arrange(desc(n)) %>%
  head(n = 10)
A data.frame: 10 × 2
Speciesn
<chr><int>
1Slate-colored Junco 2732
2Tree Swallow 1537
3Black-capped Chickadee1327
4American Goldfinch 1204
5Field Sparrow 1164
6Lincoln's Sparrow 790
7Robin 608
8Catbird 554
9Song Sparrow 512
10House Wren 460
Corrected %>%
  group_by(Species) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(n = 10)
A tibble: 10 × 2
Speciescount
<chr><int>
Slate-colored Junco 2732
Tree Swallow 1537
Black-capped Chickadee1327
American Goldfinch 1204
Field Sparrow 1164
Lincoln's Sparrow 790
Robin 608
Catbird 554
Song Sparrow 512
House Wren 460
Majors <-
  Corrected %>%
    group_by(Species) %>%
    summarize(count = n()) %>%
    arrange(desc(count)) %>%
    filter(count >= 1000)
Majors
A tibble: 5 × 2
Speciescount
<chr><int>
Slate-colored Junco 2732
Tree Swallow 1537
Black-capped Chickadee1327
American Goldfinch 1204
Field Sparrow 1164

Task 4#

When you have correctly produced Majors, write a command that produces the month-by-month count of each of the major species. Call this table ByMonth. Display this month-by-month count with a bar chart arranged in a way that you think tells the story of what time of year the various species appear. You can use mplot() to explore different possibilies. (Warning: mplot() and similar interactive functions should not appear in your Rmd file, it needs to be used interactively from the console. Use the “Show Expression” button in mplot() to create an expression that you can cut and paste into a chunk in your Rmd document, so that the graph gets created when you compile it.) Once you have the graph, use it to answer these questions:

[1] Which species are present year-round?

  • American Goldfinch (11-12 mo)

  • Black-capped Chickadee (12 mo)

[2] Which species are migratory, that is, primarily present in one or two seasons?

  • Field Sparrow (6 mo)

  • Slate-colored Junco (8-9 mo)

  • Tree Swallow (3-5 mo)

[3] What is the peak month for each major species?

  • 10 American Goldfinch

  • 11 Black-capped Chickadee

  • 05 Field Sparrow

  • 10 Slate-colored Junco

  • 06 Tree Swallow

[4] Which major species are seen in good numbers for at least 6 months of the year? (Hint: n_distinct() and >= 6.)

Arguably, the only species that is not seen in good numbers for at least 6 months of the year is the tree swallow.

ByMonth <-
  OrdwayBirds %>%
    group_by(SpeciesName, Month = as.integer(Month)) %>%
    summarize(count = n()) %>%
    filter(SpeciesName %in% Majors$Species)
ByMonth
`summarise()` has grouped output by 'SpeciesName'. You can override using the
`.groups` argument.
A grouped_df: 47 × 3
SpeciesNameMonthcount
<chr><int><int>
American Goldfinch 1 10
American Goldfinch 2 51
American Goldfinch 3 48
American Goldfinch 4 21
American Goldfinch 5 125
American Goldfinch 6 63
American Goldfinch 7 67
American Goldfinch 8 70
American Goldfinch 9 151
American Goldfinch 10 364
American Goldfinch 11 180
American Goldfinch 12 3
Black-capped Chickadee 1 56
Black-capped Chickadee 2 140
Black-capped Chickadee 3 96
Black-capped Chickadee 4 51
Black-capped Chickadee 5 48
Black-capped Chickadee 6 20
Black-capped Chickadee 7 13
Black-capped Chickadee 8 11
Black-capped Chickadee 9 66
Black-capped Chickadee10 173
Black-capped Chickadee11 271
Black-capped Chickadee12 165
Field Sparrow 4 83
Field Sparrow 5 197
Field Sparrow 6 15
Field Sparrow 7 79
Field Sparrow 8 64
Field Sparrow 9 74
Field Sparrow 10 69
Field Sparrow 11 1
Slate-colored Junco 1 113
Slate-colored Junco 2 61
Slate-colored Junco 3 188
Slate-colored Junco 4 694
Slate-colored Junco 5 1
Slate-colored Junco 8 1
Slate-colored Junco 9 35
Slate-colored Junco 101178
Slate-colored Junco 11 272
Slate-colored Junco 12 174
Tree Swallow 4 2
Tree Swallow 5 11
Tree Swallow 6 171
Tree Swallow 7 16
Tree Swallow 11 1
ByMonth %>%
  group_by(SpeciesName) %>%
  summarize(
    MonthsPerYear   = n(),
    SixMonthsOrMore = n_distinct(Month) >= 6
  )
A tibble: 5 × 3
SpeciesNameMonthsPerYearSixMonthsOrMore
<chr><int><lgl>
American Goldfinch 12 TRUE
Black-capped Chickadee12 TRUE
Field Sparrow 8 TRUE
Slate-colored Junco 10 TRUE
Tree Swallow 5FALSE
ByMonth %>%
  ggplot() +
    geom_bar(
      mapping     = aes(x = Month, y = count, fill = SpeciesName),
      na.rm       = FALSE,
      position    = 'stack',
      show.legend = TRUE,
      stat        = 'identity'
    ) +
    scale_x_continuous(breaks = 1:12)
../../../_images/1bedb06c18461fcfb7f548b06bae4b55139fb3c83d148d96ee5c5b47b35daa60.png

Project: Scraping Nuclear Reactors#

Project: Scraping Nuclear Reactors from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.


In what ways is the table tidy? How is it not tidy? What’s different about it from a tidy table?

Tidy Data Criteria

  • (i) rows (or cases, observations) must each represent the same underlying attribute (i.e., each observation must have its own row)

  • (ii) columns (or variables, values) must each contain the same type of value for each row (i.e., each variable must have its own column)

  • (iii) each value must have its own cell

  • It’s impossible to only satisfy two of the three criteria. This implies the following.

    • (i) put each dataset into a tibble

    • (ii) put each variable into a column

There is at least one row that does not represent a typical case (that is, the header row(s)). Certain columns are blank and need to be removed. The remaining columns may contain heterogeneous data formats or data types. And missing values must be addressed. But with some cleaning, the table hints at what a typical case should look like.

page      <- 'https://en.wikipedia.org/wiki/List_of_commerical_nuclear_reactors'
tableList <-
  page %>%
    read_html() %>%
    html_nodes(css = 'table') %>%
    html_table(fill = TRUE)
length(tableList)

Japan <-
  tableList[[21]] %>%
    select(1:9)
#names(Japan)[c(3, 6)] <- c('type', 'grossMW')
head(Japan)
55
A tibble: 6 × 9
PlantnameUnitNo.TypeModelStatusCapacity(MW)BeginbuildingCommercialoperationClosed
<chr><chr><chr><chr><chr><chr><chr><chr><chr>
Plantname UnitNo.TypeModelStatus Capacity(MW)BeginbuildingCommercialoperationClosed
Fukushima Daiichi1 BWR BWR-3Inoperable439 25 Jul 1967 26 Mar 1971 19 May 2011
Fukushima Daiichi2 BWR BWR-4Inoperable760 9 Jun 1969 18 Jul 1974 19 May 2011
Fukushima Daiichi3 BWR BWR-4Inoperable760 28 Dec 1970 27 Mar 1976 19 May 2011
Fukushima Daiichi4 BWR BWR-4Inoperable760 12 Feb 1973 12 Oct 1978 19 May 2011
Fukushima Daiichi5 BWR BWR-4Shut down 760 22 May 1972 18 Apr 1978 17 Dec 2013

Among other things, some of the variable names appear redundant and others have multiple words separated by spaces. You can rename variables using the data verb rename(), finding appropriate names from the Wikipedia table. Another problem is that the first row is not data but a continuation of the variable names. So row number 1 should be dropped.

Japan <-
  Japan %>%
    filter(row_number() > 1) %>%
    rename(
      name         = Plantname,
      reactor      = `UnitNo.`,
      type         = Type,
      model        = Model,
      status       = Status,
      netMW        = `Capacity(MW)`,
      construction = Beginbuilding,
      operation    = Commercialoperation,
      closure      = Closed
    )
head(Japan)
A tibble: 6 × 9
namereactortypemodelstatusnetMWconstructionoperationclosure
<chr><chr><chr><chr><chr><chr><chr><chr><chr>
Fukushima Daiichi1BWRBWR-3Inoperable439 25 Jul 196726 Mar 197119 May 2011
Fukushima Daiichi2BWRBWR-4Inoperable760 9 Jun 1969 18 Jul 197419 May 2011
Fukushima Daiichi3BWRBWR-4Inoperable760 28 Dec 197027 Mar 197619 May 2011
Fukushima Daiichi4BWRBWR-4Inoperable760 12 Feb 197312 Oct 197819 May 2011
Fukushima Daiichi5BWRBWR-4Shut down 760 22 May 197218 Apr 197817 Dec 2013
Fukushima Daiichi6BWRBWR-5Shut down 106726 Oct 197324 Oct 197917 Dec 2013
str(Japan)
tibble [68 × 9] (S3: tbl_df/tbl/data.frame)
 $ name        : chr [1:68] "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" "Fukushima Daiichi" ...
 $ reactor     : chr [1:68] "1" "2" "3" "4" ...
 $ type        : chr [1:68] "BWR" "BWR" "BWR" "BWR" ...
 $ model       : chr [1:68] "BWR-3" "BWR-4" "BWR-4" "BWR-4" ...
 $ status      : chr [1:68] "Inoperable" "Inoperable" "Inoperable" "Inoperable" ...
 $ netMW       : chr [1:68] "439" "760" "760" "760" ...
 $ construction: chr [1:68] "25 Jul 1967" "9 Jun 1969" "28 Dec 1970" "12 Feb 1973" ...
 $ operation   : chr [1:68] "26 Mar 1971" "18 Jul 1974" "27 Mar 1976" "12 Oct 1978" ...
 $ closure     : chr [1:68] "19 May 2011" "19 May 2011" "19 May 2011" "19 May 2011" ...

Using your cleaned data, make a plot of net generation capacity versus date of construction. Color the points by the type of reactor (for example, BWR, PWR, etc.) In addition to your plot, give a sentence or two of interpretation; what patterns do you see?

# BWR boiling water reactor
# FBR
# GCR
# PWR pressurized water reactor

Japan <-
  Japan %>%
    mutate(
      netMW        = as.integer(netMW),
      construction = lubridate::dmy(construction),
      operation    = lubridate::dmy(operation),
      closure      = lubridate::dmy(closure)
    )
head(Japan)

Japan %>%
  ggplot(mapping = aes(x = construction, y = netMW)) +
    geom_point(aes(color = type))
Warning message:
“There was 1 warning in `mutate()`.
 In argument: `closure = lubridate::dmy(closure)`.
Caused by warning:
!  6 failed to parse.”
A tibble: 6 × 9
namereactortypemodelstatusnetMWconstructionoperationclosure
<chr><chr><chr><chr><chr><int><date><date><date>
Fukushima Daiichi1BWRBWR-3Inoperable 4391967-07-251971-03-262011-05-19
Fukushima Daiichi2BWRBWR-4Inoperable 7601969-06-091974-07-182011-05-19
Fukushima Daiichi3BWRBWR-4Inoperable 7601970-12-281976-03-272011-05-19
Fukushima Daiichi4BWRBWR-4Inoperable 7601973-02-121978-10-122011-05-19
Fukushima Daiichi5BWRBWR-4Shut down 7601972-05-221978-04-182013-12-17
Fukushima Daiichi6BWRBWR-5Shut down 10671973-10-261979-10-242013-12-17
Warning message:
“Removed 7 rows containing missing values (`geom_point()`).”
../../../_images/b62e597410a4fcd77faf67a9b810925e9a47f75d382c3b0cc93be60dae6226e2.png

Carry out the same cleaning process for the China reactor table, and then append it with the Japan data. Use mutate() to add a variable that has the name of the country. (Hint: functions such as bind_cols() or bind_rows() form the dplyr package are helpful for appending data frames.) Collating the data for all countries is a matter of repeating this process over and over. Inevitably, there are inconsistencies. For example, the US data had been organized in a somewhat different format when compared to the Japan and China data for many years until Wikipedia editors decided to reconcile them.

China <-
  tableList[[10]] %>%
    select(1:9)
China <-
  China %>%
    filter(row_number() > 2) %>%
    rename(
      name         = Plantname,
      reactor      = `UnitNo.`,
      type         = Type,
      model        = Model,
      status       = Status,
      netMW        = `Capacity(MW)`,
      construction = Beginbuilding,
      operation    = Commercialoperation,
      closure      = Closed
    )
China <-
  China %>%
    mutate(
      netMW        = as.integer(netMW),
      construction = lubridate::dmy(construction),
      operation    = lubridate::dmy(operation),
      closure      = lubridate::dmy(closure)
    )
China %>%
  ggplot(mapping = aes(x = construction, y = netMW)) +
    geom_point(aes(color = type))
Warning message:
“There were 2 warnings in `mutate()`.
The first warning was:
 In argument: `construction = lubridate::dmy(construction)`.
Caused by warning:
!  15 failed to parse.
 Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.”
Warning message:
“Removed 47 rows containing missing values (`geom_point()`).”
../../../_images/37043f5cace06ab2c8957024e65610096536b8cc276dabba2c49fb70a2a4b39d.png
bind_rows(Japan, China) %>%
  mutate(Country = ifelse(name %in% unique(Japan$name), 'Japan', 'China')) %>%
  head()
A tibble: 6 × 10
namereactortypemodelstatusnetMWconstructionoperationclosureCountry
<chr><chr><chr><chr><chr><int><date><date><date><chr>
Fukushima Daiichi1BWRBWR-3Inoperable 4391967-07-251971-03-262011-05-19Japan
Fukushima Daiichi2BWRBWR-4Inoperable 7601969-06-091974-07-182011-05-19Japan
Fukushima Daiichi3BWRBWR-4Inoperable 7601970-12-281976-03-272011-05-19Japan
Fukushima Daiichi4BWRBWR-4Inoperable 7601973-02-121978-10-122011-05-19Japan
Fukushima Daiichi5BWRBWR-4Shut down 7601972-05-221978-04-182013-12-17Japan
Fukushima Daiichi6BWRBWR-5Shut down 10671973-10-261979-10-242013-12-17Japan

Make an informative graphic that shows how long it took between start of construction and commissioning for operation of each nuclear reactor in Japan (or another country of your choice). One possibility: use reactor name vs date as the frame. For each reactor, set the glyph to be a line extending from start of construction to commissioning. You can do this with geom_segment() using name as the y coordinate and time as the x coordinate. (Tip: use the paste() function to create the reactorID on the vertical axis.)

options(warn = -1)

Japan %>%
  mutate(
    reactorID = paste(name, reactor),
    time      = operation - construction
  ) %>%
  ggplot() +
    geom_segment(
      linewidth = 2,
      mapping = aes(
        x     = operation,
        xend  = construction,
        y     = reactorID,
        yend  = reactorID,
        color = type
      )
    )
../../../_images/cb842782173f3b67767816cc811fa9f056637bc6fdb668883964a35844d31018.png

Project: Street or Road?#

Project: Street or Road? from Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.


Example#

People’s addresses involve streets, lanes, courts, avenues, and so on. How many such road-related words are in common use? In answering this question, you would presumably want to look at lots of addresses and extract the road-related term. You could do this by eye, reading down a list of a few hundred or thousand addresses. But if you want to do it on a really large scale, a city or state or country, you would want some automated help, for instance, a computer program that discards the sorts of entries you have already identified to give a greater concentration of unidentified terms. In this activity, you’re going to build such a program.

# about 15,000 street addresses of registered voters in Wake County, NC
Addresses <- read_csv('https://mdbeckman.github.io/dcSupplement/data/street-addresses.csv')
head(Addresses)
Rows: 15483 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): address

 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
PO BOX 58592
5102 ECHO RIDGE RD
PO BOX 37218
PO BOX 37218
# about 900,000 medicare service provider street addresses
download.file(url      = 'https://mdbeckman.github.io/dcSupplement/data/CMS_ProvidersSimple.rds',
              destfile = 'CMS_ProvidersSimple.rds')
DataTable <- readRDS('CMS_ProvidersSimple.rds')
head(DataTable)
A data.frame: 6 × 3
addressfirst_namesex
<chr><chr><chr>
1900 SETON DR ARDALAN M
22650 RIDGE AVE THOMAS M
34126 N HOLLAND SYLVANIA RDRASHID M
4456 MAGEE AVE DAVID M
511100 EUCLID AVE JENNIFERF
612605 E 16TH AVE KEVIN M

To solve such problems, start by looking at a few dozen of the addresses to familiarize yourself with common patterns. Suppose you wanted to extract the PO Box number from an address. Read the street address data and pull out a sample of a few dozen cases.

  1. In everyday langauge, describe a pattern that you think will identify the information you are looking for.

    • The PO Box cases tend to have a substring ‘PO’.

  2. Translate (1) into the form of a regular expression.

    • The regular expression for ‘PO’ is simply ‘PO’.

  3. Filter to retain the cases that match the expression. Hint: filter() and grepl() are useful for this.

  4. Filter to retain the cases that do not match the expression.

  5. Examine the results of (3) and (4) to identify shortcomings in your patterns.

  6. Improve or extend the pattern to deal with the mistaken cases.

  7. Repeat until satisfied.

  8. Put extraction parentheses around the parts of the regular expression that contain the info you want.

Sample <-
  Addresses %>%
    sample_n(size = 50)
head(Sample)
A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
PO BOX 2973
PO BOX 99181
NCSU BOX 22323
3585 E 10TH ST
PO BOX 96
pattern <- 'PO'

Matches <-
  Sample %>%
    filter(grepl(pattern = pattern, address))
head(Matches)
A tibble: 6 × 1
address
<chr>
PO BOX 2973
PO BOX 99181
PO BOX 96
PO BOX 28454
PO BOX 37297
PO BOX 26761
Dont <-
  Sample %>%
    filter(!grepl(pattern = 'PO', address))
head(Dont)
A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
NCSU BOX 22323
3585 E 10TH ST
NCSU BOX 15330
NCSU BOX 04082
251 N MAIN STREET 3786
pattern <- 'BOX\\s+(\\d+)'

Matches <-
  Sample %>%
    filter( grepl(pattern, address))
head(Matches)
A tibble: 6 × 1
address
<chr>
PO BOX 2973
PO BOX 99181
NCSU BOX 22323
PO BOX 96
PO BOX 28454
NCSU BOX 15330
Dont <-
  Sample %>%
    filter(!grepl(pattern, address))
head(Dont)
A tibble: 6 × 1
address
<chr>
1921 MCKINNEY STREET
3585 E 10TH ST
251 N MAIN STREET 3786
3307 BROOKLINE COURT
1703 PATTON ROAD
3231 WALNUT CREEK PKWY
BoxNumbers <-
  Sample %>%
    filter(grepl(pattern, address)) %>%
    tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)
A tibble: 6 × 1
boxnum
<chr>
2973
99181
22323
96
28454
15330
pattern <- 'PO'

Matches <-
  Addresses %>%
    filter(grepl(pattern = pattern, address))
head(Matches)
A tibble: 6 × 1
address
<chr>
PO BOX 58592
PO BOX 37218
PO BOX 37218
PO BOX 1953
PO BOX 132
PO BOX 360
Dont <-
  Addresses %>%
    filter(!grepl(pattern = 'PO', address))
head(Dont)
A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
5102 ECHO RIDGE RD
5007 PURITAN RD
04-I ROBIN CIRCLE
4800 WESTERN BLVD
pattern <- 'BOX\\s+(\\d+)'

Matches <-
  Addresses %>%
    filter( grepl(pattern, address))
head(Matches)
A tibble: 6 × 1
address
<chr>
PO BOX 58592
PO BOX 37218
PO BOX 37218
PO BOX 1953
PO BOX 132
PO BOX 360
Dont <-
  Addresses %>%
    filter(!grepl(pattern, address))
head(Dont)
A tibble: 6 × 1
address
<chr>
2117 MARINER CIRCLE
101 EPPING WAY
5102 ECHO RIDGE RD
5007 PURITAN RD
04-I ROBIN CIRCLE
4800 WESTERN BLVD
BoxNumbers <-
  Addresses %>%
    filter(grepl(pattern, address)) %>%
    tidyr::extract(address, into = 'boxnum', regex = pattern)
head(BoxNumbers)
A tibble: 6 × 1
boxnum
<chr>
58592
37218
37218
1953
132
360

Back to the Streets#

Street endings (e.g., ‘ST’, ‘LANE’) are often found at the end of the address string. Use this as a starting point to find the most common endings. Once you have a set of specific street endings, you can use the regex “or” symbol, e.g., ‘(ST|RD|ROAD)’. The parentheses are not incidental. They are there to mark a pattern that you want to extract. In this case, in addition to knowing that there is a ST or RD or ROAD in an address, you want to know which one of those possibilities it is so that you can count the occurrence of each of the possibilities. To find street endings that aren’t in your set, you can filter out the street endings or non street addresses you already know about.

[1] Read the following R statements. Next to each line, give a short explanation of what the line contributes to the task. For each of the regexes, explain in simple everyday language what pattern is being matched.

pattern <- '(ST|RD|ROAD)'

LeftOvers <-
  Addresses %>%
    filter(
      !grepl(pattern, address),                 # matches other than: 'ST' or 'RD' or 'ROAD'
      !grepl('\\sAPT|UNIT\\s[\\d]+$', address), # matches other than: (1 whitespace, 'APT') or ('UNIT', 1 whitespace, 1 or more digits at the end of the string)
      !grepl(' BOX ', address)                  # matches other than: 1 space, 'BOX', 1 space
    )
LeftOvers

[2] For each set of patterns that you identify, compute the LeftOvers. Examine them visually to find new street endings to add to the pattern, e.g., LANE. When you have this working on the small sample, use a larger sample and, eventually, the whole data set. It’s practically impossible to find a method that will work perfectly on new data, but do the best you can. In your report, implement your method and explain how it works, line by line. Present your result: how many addresses there are of each kind of road word?

‘DR’, ‘RD’, and ‘ST’ are the most common street types. In general, abbrviated street types are more common than their longform counterparts.

Breaking addresses into their components is a common task. People who work on this problem intensively sometimes publish their regular expressions. Here’s one from Ross Hammer published at https://regexlib.com/Search.aspx?k=street:

^\s*((?:(?:\d+(?:\x20+\w+\.?)+(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT
|CT|CIRCLE|LANE|LN|BOULEVARD|BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|
(?:General\x20+Delivery)|(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|
FL|HNGR|LOT|PIER|RM|S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|
(?:BSMT|FRNT|LBBY|LOWR|OFC|PH|REAR|SIDE|UPPR))?)\,?\s+((?:(?:\d+(?:\x20+\w+\.?)+
(?:(?:\x20+STREET|ST|DRIVE|DR|AVENUE|AVE|ROAD|RD|LOOP|COURT|CT|CIRCLE|LANE|LN|BOULEVARD|
BLVD)\.?)?)|(?:(?:P\.\x20?O\.|P\x20?O)\x20*Box\x20+\d+)|(?:General\x20+Delivery)|
(?:C[\\\/]O\x20+(?:\w+\x20*)+))\,?\x20*(?:(?:(?:APT|BLDG|DEPT|FL|HNGR|LOT|PIER|RM|
S(?:LIP|PC|T(?:E|OP))|TRLR|UNIT|\x23)\.?\x20*(?:[a-zA-Z0-9\-]+))|(?:BSMT|FRNT|LBBY|
LOWR|OFC|PH|REAR|SIDE|UPPR))?)?\,?\s+((?:[A-Za-z]+\x20*)+)\,\s+(A[LKSZRAP]|C[AOT]|
D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|
S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s+(\d+(?:-\d+)?)\s*$
Addresses %>%
  filter(
    !grepl(pattern = ' AVENUE |AVENUE$', x = address),       # matches other than: 'AVENUE' flanked by whitespace anywhere in the string, or 'AVENUE' at the end of the string
    !grepl(pattern = ' AVE |AVE$', x = address),             # matches other than: 'AVE' flanked by whitespace anywhere in the string, or 'AVE' at the end of the string
    !grepl(pattern = ' BLVD |BLVD$', x = address),           # matches other than: 'BLVD' flanked by whitespace anywhere in the string, or 'BLVD' at the end of the string
    !grepl(pattern = ' BOULEVARD |BOULEVARD$', x = address), # matches other than: 'BOULEVARD' flanked by whitespace anywhere in the string, or 'BOULEVARD' at the end of the string
    !grepl(pattern = ' CIR |CIR$', x = address),             # matches other than: 'CIR' flanked by whitespace anywhere in the string, or 'CIR' at the end of the string
    !grepl(pattern = ' CIRCLE |CIRCLE$', x = address),       # matches other than: 'CIRCLE' flanked by whitespace anywhere in the string, or 'CIRCLE' at the end of the string
    !grepl(pattern = ' CT |CT$', x = address),               # matches other than: 'CT' flanked by whitespace anywhere in the string, or 'CT' at the end of the string
    !grepl(pattern = ' COURT |COURT$', x = address),         # matches other than: 'COURT' flanked by whitespace anywhere in the string, or 'COURT' at the end of the string
    !grepl(pattern = ' DR |DR$', x = address),               # matches other than: 'DR' flanked by whitespace anywhere in the string, or 'DR' at the end of the string
    !grepl(pattern = ' DRIVE |DRIVE$', x = address),         # matches other than: 'DRIVE' flanked by whitespace anywhere in the string, or 'DRIVE' at the end of the string
    !grepl(pattern = ' LN |LN$', x = address),               # matches other than: 'LN' flanked by whitespace anywhere in the string, or 'LN' at the end of the string
    !grepl(pattern = ' LANE |LANE$', x = address),           # matches other than: 'LANE' flanked by whitespace anywhere in the string, or 'LANE' at the end of the string
    !grepl(pattern = ' PL |PL$', x = address),               # matches other than: 'PL' flanked by whitespace anywhere in the string, or 'PL' at the end of the string
    !grepl(pattern = ' PLACE |PLACE$', x = address),         # matches other than: 'PLACE' flanked by whitespace anywhere in the string, or 'PLACE' at the end of the string
    !grepl(pattern = ' RD |RD$', x = address),               # matches other than: 'RD' flanked by whitespace anywhere in the string, or 'RD' at the end of the string
    !grepl(pattern = ' ROAD |ROAD$', x = address),           # matches other than: 'ROAD' flanked by whitespace anywhere in the string, or 'ROAD' at the end of the string
    !grepl(pattern = ' ST |ST$', x = address),               # matches other than: 'ST' flanked by whitespace anywhere in the string, or 'ST' at the end of the string
    !grepl(pattern = ' STREET |STREET$', x = address),       # matches other than: 'STREET' flanked by whitespace anywhere in the string, or 'STREET' at the end of the string
    !grepl(pattern = ' TRAIL |TRAIL$', x = address),         # matches other than: 'TRAIL' flanked by whitespace anywhere in the string, or 'TRAIL' at the end of the string
    !grepl(pattern = ' WAY |WAY$', x = address),             # matches other than: 'WAY' flanked by whitespace anywhere in the string, or 'WAY' at the end of the string
    !grepl(pattern = ' WINERY |WINERY$', x = address),       # matches other than: 'WINERY' flanked by whitespace anywhere in the string, or 'WINERY' at the end of the string
    !grepl(pattern = 'BOX\\s+(\\d+)', x = address)           # filters out all the 'BOX' from above
  ) %>%
  head()
A tibble: 6 × 1
address
<chr>
NCSU B0X 15637
HMIA-267
1104 PULLEN HALL
311 BAREFOOT
512 LIVE OAK
1706 WINGATE UNIVERSITY
pattern <- '\\s+('
pattern <- str_c(pattern, 'AVE|AVENUE')
pattern <- str_c(pattern, '|BLVD|BOULEVARD')
pattern <- str_c(pattern, '|CIR|CIRCLE')
pattern <- str_c(pattern, '|CT|COURT')
pattern <- str_c(pattern, '|DR|DRIVE')
pattern <- str_c(pattern, '|LN|LANE')
pattern <- str_c(pattern, '|PL|PLACE')
pattern <- str_c(pattern, '|RD|ROAD')
pattern <- str_c(pattern, '|ST|STREET')
pattern <- str_c(pattern, '|TRAIL')
pattern <- str_c(pattern, '|WAY')
pattern <- str_c(pattern, '|WINERY')
pattern <- str_c(pattern, ')(\\s+|$)?')
pattern

streets <-
  Addresses %>%
    filter(
      grepl(x = address, pattern = pattern, ignore.case = TRUE)
    ) %>%
    tidyr::extract(
      col   = address,
      regex = pattern,
      into  = c(
        'street',
        ' '
      )
    ) %>%
    group_by(street) %>%
    summarize(total = n()) %>%
    arrange(desc(total))
streets
'\\s+(AVE|AVENUE|BLVD|BOULEVARD|CIR|CIRCLE|CT|COURT|DR|DRIVE|LN|LANE|PL|PLACE|RD|ROAD|ST|STREET|TRAIL|WAY|WINERY)(\\s+|$)?'
A tibble: 16 × 2
streettotal
<chr><int>
DR 888
RD 688
ST 676
AVE 259
CT 229
ROAD 188
LN 152
PL 145
LANE 141
CIR 120
WAY 113
COURT 90
BLVD 50
TRAIL 26
BOULEVARD 4
WINERY 1
streets %>%
  ggplot(aes(x = reorder(street, -total), y = total)) +
    geom_col() +
    xlab('street')
../../../_images/867724b761323912e73e277a1261147ba62ea3fe7088b452ab5f032bd2815fc5.png

Bibliography#

Kaplan, Daniel & Matthew Beckman. (2021). Data Computing. 2nd Ed. Home.