R for Data Science 1#

Wickham, Hadley; Mine Çetinkaya-Rundel; & Garrett Grolemund. R for Data Science: Import, Tidy, Transform, Visualize, and Model Data. 1st Ed. O’Reilly. Home.


Revised

08 Jun 2023


Programming Environment#

# Packages this notebook attaches, in attach order (packages attached later
# mask same-named functions from packages attached earlier).
packages <- c(
  'hexbin',       # library(hexbin)
  'lubridate',    # library(lubridate)
  'maps',         # library(maps)
  'modelr',       # library(modelr)
  'mosaic',       # library(mosaic)
  'mosaicData',   # library(mosaicData)
  'nycflights13', # library(nycflights13)
  'pryr',         # library(pryr)
  'purrr',        # library(purrr)
  'tidyverse',    # library(tidyverse)
  'RcppRoll'      # library(RcppRoll)
)

# Install packages not yet installed.
# The documentation of `installed.packages()` advises against using it to
# test for a named package (it scans every installed package), so each
# package is probed with `system.file()`, which returns '' when not found.
installed_packages <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1),
  USE.NAMES = FALSE
)
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}

# Load packages (`invisible()` hides the list that `lapply()` returns)
invisible(lapply(packages, library, character.only = TRUE))

# Record when the notebook was executed and the session it ran in
str_c('EXECUTED : ', now())
sessionInfo()
# R.version.string # R.Version()
# .libPaths()
# installed.packages()
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
Registered S3 method overwritten by 'mosaic':
  method                           from   
  fortify.SpatialPolygonsDataFrame ggplot2
The 'mosaic' package masks several functions from core packages in order to add 
additional features.  The original behavior of these functions should not be affected by this.
Attaching package: ‘mosaic’
The following objects are masked from ‘package:dplyr’:

    count, do, tally
The following object is masked from ‘package:Matrix’:

    mean
The following object is masked from ‘package:ggplot2’:

    stat
The following object is masked from ‘package:modelr’:

    resample
The following objects are masked from ‘package:stats’:

    binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
    quantile, sd, t.test, var
The following objects are masked from ‘package:base’:

    max, mean, min, prod, range, sample, sum
Attaching package: ‘pryr’
The following object is masked from ‘package:mosaic’:

    inspect
The following object is masked from ‘package:dplyr’:

    where
Attaching package: ‘purrr’
The following objects are masked from ‘package:pryr’:

    compose, partial
The following object is masked from ‘package:mosaic’:

    cross
The following object is masked from ‘package:maps’:

    map
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
 forcats 1.0.0      tibble  3.2.1
 readr   2.1.4      tidyr   1.3.0
 stringr 1.5.0     
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 purrr::compose()     masks pryr::compose()
 mosaic::count()      masks dplyr::count()
 purrr::cross()       masks mosaic::cross()
 mosaic::do()         masks dplyr::do()
 tidyr::expand()      masks Matrix::expand()
 dplyr::filter()      masks stats::filter()
 dplyr::lag()         masks stats::lag()
 purrr::map()         masks maps::map()
 ggformula::na.warn() masks modelr::na.warn()
 tidyr::pack()        masks Matrix::pack()
 purrr::partial()     masks pryr::partial()
 mosaic::resample()   masks modelr::resample()
 mosaic::stat()       masks ggplot2::stat()
 mosaic::tally()      masks dplyr::tally()
 tidyr::unpack()      masks Matrix::unpack()
 pryr::where()        masks dplyr::where()
 Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
'EXECUTED : 2024-11-21 01:48:09.626914'
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 15.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] RcppRoll_0.3.0     forcats_1.0.0      stringr_1.5.0      readr_2.1.4       
 [5] tidyr_1.3.0        tibble_3.2.1       tidyverse_2.0.0    purrr_1.0.2       
 [9] pryr_0.1.6         nycflights13_1.0.2 mosaic_1.8.4.2     mosaicData_0.20.3 
[13] ggformula_0.10.4   dplyr_1.1.2        Matrix_1.5-4       ggplot2_3.4.3     
[17] lattice_0.21-8     modelr_0.1.11      maps_3.4.1         lubridate_1.9.2   
[21] hexbin_1.28.3     

loaded via a namespace (and not attached):
 [1] utf8_1.2.3         generics_0.1.3     stringi_1.7.12     hms_1.1.3         
 [5] digest_0.6.31      magrittr_2.0.3     evaluate_0.21      grid_4.3.0        
 [9] timechange_0.2.0   pbdZMQ_0.3-9       fastmap_1.1.1      jsonlite_1.8.5    
[13] backports_1.4.1    fansi_1.0.4        scales_1.2.1       tweenr_2.0.2      
[17] codetools_0.2-19   cli_3.6.1          labelled_2.11.0    rlang_1.1.1       
[21] crayon_1.5.2       polyclip_1.10-4    munsell_0.5.0      base64enc_0.1-3   
[25] withr_2.5.0        repr_1.1.6         tools_4.3.0        tzdb_0.4.0        
[29] uuid_1.1-0         colorspace_2.1-0   mosaicCore_0.9.2.1 broom_1.0.5       
[33] IRdisplay_1.1      vctrs_0.6.3        R6_2.5.1           ggridges_0.5.4    
[37] lifecycle_1.0.3    ggstance_0.3.6     MASS_7.3-58.4      pkgconfig_2.0.3   
[41] pillar_1.9.0       gtable_0.3.3       glue_1.6.2         Rcpp_1.0.10       
[45] ggforce_0.4.1      haven_2.5.2        tidyselect_1.2.0   IRkernel_1.3.2    
[49] farver_2.1.1       htmltools_0.5.5    compiler_4.3.0    

03 - Data Visualization#

# Preview the first five rows of the mpg data set
mpg %>% head(n = 5)
A tibble: 5 × 11
manufacturermodeldisplyearcyltransdrvctyhwyflclass
<chr><chr><dbl><int><int><chr><chr><int><int><chr><chr>
audia41.819994auto(l5) f1829pcompact
audia41.819994manual(m5)f2129pcompact
audia42.020084manual(m6)f2031pcompact
audia42.020084auto(av) f2130pcompact
audia42.819996auto(l5) f1626pcompact
# Scatterplots of hwy against displ; after the plain version, `class` is
# mapped to color, size, alpha and shape in turn.
mpg %>% ggplot() + geom_point(aes(displ, hwy))
mpg %>% ggplot() + geom_point(aes(displ, hwy, color = class))
mpg %>% ggplot() + geom_point(aes(displ, hwy, size = class))
mpg %>% ggplot() + geom_point(aes(displ, hwy, alpha = class))
mpg %>% ggplot() + geom_point(aes(displ, hwy, shape = class))
../../../_images/62185af54332fefaa8009b117b95ccc240e087919871ebb5f551aafa6a43b784.png
Warning message:
“Using size for a discrete variable is not advised.”
../../../_images/6c119391fccc56a6293da32159f0a1aad7f647207d0a9f8ef66344d49af564ca.png
Warning message:
“Using alpha for a discrete variable is not advised.”
../../../_images/8c272dd9a365bcf50c00cd3ebde1df842bfcb45c62df148b966b5466defb5e9c.png
Warning message:
“The shape palette can deal with a maximum of 6 discrete values because
more than 6 becomes difficult to discriminate; you have 7. Consider
specifying shapes manually if you must have them.”
Warning message:
“Removed 62 rows containing missing values (`geom_point()`).”
../../../_images/ac5a59bc1b808cf93da314c9be809d9756b5a3fa62cc873d9824388c90159eb4.png ../../../_images/5fe7fe563a7d9998f5817f88d41d62dc62d182c8b2d23d74b916324c80ddec27.png
# Color the points by the logical expression displ < 5
ggplot(data = ggplot2::mpg) +
  geom_point(aes(displ, hwy, color = displ < 5))
../../../_images/ae59301642c3db9ce59bf11a5d1c849c0fbaa4394d304c024f3575c1dbfdd9aa.png
# Facet the displ/hwy scatterplot by one variable (class), wrapped onto 2 rows
ggplot(data = ggplot2::mpg) +
  geom_point(aes(displ, hwy)) +
  facet_wrap(~ class, nrow = 2)
# Facet by two variables: drv on rows, cyl on columns
ggplot(data = ggplot2::mpg) +
  geom_point(aes(displ, hwy)) +
  facet_grid(drv ~ cyl)
../../../_images/da949d4139ff9b7aa9671eb8b311283b9b96f861420aafc50c143fdd70f8b3d7.png ../../../_images/0f3090a844c7c7c771f3cb4dc170608dec64cff3229bb1454bd24e9161f88ec3.png
# Points plus a smoother per drv (color shared by both layers, linetype on
# the smoother only); legends suppressed on both layers.
ggplot(ggplot2::mpg, aes(displ, hwy, color = drv)) +
  geom_point(show.legend = FALSE) +
  geom_smooth(aes(linetype = drv), show.legend = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
../../../_images/861240d5fa2a42234c4de7d2712ba8825187c929eeaed938db67f9a40bd1f11b.png
# All cars as points colored by class; the smoother layer overrides the plot
# data with the subcompact rows only and omits the confidence band.
ggplot(ggplot2::mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(data = mpg %>% filter(class == 'subcompact'), se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
../../../_images/9320f8eb71984e237d09a79f446da650b6f4eff3802310cc26a575706ab9fcd6.png
# LINE PLOT
# Cumulative sum of 10 standard-normal draws; no seed is set, so the curve
# differs from run to run.
x  <- 1:10
y  <- cumsum(rnorm(10))
df <- data.frame(x, y)
ggplot(df, mapping = aes(x = x, y = y)) +
  # `linewidth` replaces `size` for lines (`size` deprecated in ggplot2 3.4.0)
  geom_line(linewidth = 0.8) +
  ggtitle('Evolution')
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
 Please use `linewidth` instead.”
../../../_images/f543c8a3a3da0f26670253036132ba9ffa33e77c5406a4881aa786bf14c9ee27.png
# BOX PLOT
# mpg distribution per cylinder count; cyl is converted to a factor so each
# value gets its own box, and the x axis is relabelled to plain 'cyl'.
ggplot(mtcars, aes(as.factor(cyl), mpg)) +
  geom_boxplot(alpha = 0.2, fill = 'slateblue') +
  xlab('cyl')
../../../_images/51dd87f8e5503987fddfebc4a5c9fed05bd195e9d688f5df993a94dd4b4caa0e.png
# Points and smoothers colored by drv, without confidence bands
mpg %>%
  ggplot(aes(displ, hwy, color = drv)) +
    geom_point() +
    geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
../../../_images/352f4ad902d6a5e102c7804b9a21f83eb6be0a7718b1d586c9343d4ae815ec72.png
# Data and mapping declared once on the plot and inherited by both layers
mpg %>%
  ggplot(aes(displ, hwy)) +
    geom_point() +
    geom_smooth()

# The same data and mapping repeated on each layer instead
ggplot() +
  geom_point(aes(displ, hwy), data = mpg) +
  geom_smooth(aes(displ, hwy), data = mpg)
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
../../../_images/c9ab4b35d51ea6c6dbf91c37ba68d50d118bfca480d66d6dfd89886a9a3b7e18.png ../../../_images/c9ab4b35d51ea6c6dbf91c37ba68d50d118bfca480d66d6dfd89886a9a3b7e18.png
# Bar chart of cut, written once through the geom and once through the stat
diamonds %>% ggplot() + geom_bar(aes(cut))
diamonds %>% ggplot() + stat_count(aes(cut))
../../../_images/d11e0e33f0dc911cb6a0a617a55010a29043c71d0b50aa99ef0290c0ed947a4f.png ../../../_images/d11e0e33f0dc911cb6a0a617a55010a29043c71d0b50aa99ef0290c0ed947a4f.png
# Pre-computed counts per cut, built column-wise
demo <- tibble(
  cut  = c('Fair', 'Good', 'Very Good', 'Premium', 'Ideal'),
  freq = c(1610, 4906, 12082, 13791, 21551)
)
# stat = 'identity' draws the stored freq values as bar heights
demo %>%
  ggplot() +
    geom_bar(aes(cut, freq), stat = 'identity')

# Bar chart of proportions rather than counts. `after_stat()` replaces the
# `stat()` helper deprecated in ggplot2 3.4.0; group = 1 puts every cut in a
# single group.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x     = cut,
                         y     = after_stat(prop),
                         group = 1))

# Depth summarised per cut: median as the point, min and max as the range
diamonds %>%
  ggplot() +
    stat_summary(aes(cut, depth), fun = median, fun.min = min, fun.max = max)
# The same summary requested through the geom with stat = 'summary'
diamonds %>%
  ggplot() +
    geom_pointrange(
      aes(cut, depth),
      stat = 'summary', fun = median, fun.min = min, fun.max = max
    )
Warning message:
“`stat(prop)` was deprecated in ggplot2 3.4.0.
 Please use `after_stat(prop)` instead.”
../../../_images/5066383e316b451e8b8363a2c977c86458565e31a76fe46356dc3b36b87c1428.png ../../../_images/d5c36c31ae02ec2f10b1054f617e13852432abcf50d71b5c865c5f0b4d9ed0db.png ../../../_images/e0bb4fc0d12c2bf5abfc5dc2b60388c9394ede2c9d391ba79044aa7674b905a2.png ../../../_images/e0bb4fc0d12c2bf5abfc5dc2b60388c9394ede2c9d391ba79044aa7674b905a2.png
# Shared base plot of mpg with class on the x axis, reused below
g  <- ggplot(mpg, aes(x = class))

# Numeric x: bars at the raw values, then a histogram with wide bins
df <- data.frame(x = rep(c(2.9, 3.1, 4.5), times = c(5, 10, 4)))
ggplot(df, aes(x)) + geom_bar()
ggplot(df, aes(x)) + geom_histogram(binwidth = 2.5)
# One outcome value per treatment: points versus columns
df <- data.frame(trt = c('a', 'b', 'c'), outcome = c(2.3, 1.9, 3.2))
ggplot(df, aes(trt, outcome)) + geom_point()
ggplot(df, aes(trt, outcome)) + geom_col()
# Horizontal bars stacked by drv in reversed order, legend on top
ggplot(mpg, aes(y = class)) +
  geom_bar(aes(fill = drv), position = position_stack(reverse = TRUE)) +
  theme(legend.position = 'top')
g + geom_bar(aes(fill = drv))
ggplot(mpg, aes(y = class)) + geom_bar()
g + geom_bar(aes(weight = displ))
g + geom_bar()
../../../_images/8408bf2fa6496b0fee934431b6cf8470a5d3212a85674376fc9f05e883df9407.png ../../../_images/0720f9c46c587ad2125bcc5eb3a8c1ec2ad1e7885197f10028a96ab100fc3ece.png ../../../_images/9483188b889dd00096587834695d391c5b01499d014a88dc0d6db02f49cca401.png ../../../_images/865aa51c0943fe091d54d014c1c215b854a697a083abc9224eb9c6f26d7772c9.png ../../../_images/5451f283dca86460be7fef26acdb272eb977ced2a71d6b219012566c77b97873.png ../../../_images/36270d7a6a234da4a83df2b6aaec31b3849055d11188bb1a4da0b464ccce8b5a.png ../../../_images/773a0a41dbc24cc71eb7f4f6b3870d904775f0f6be12d4ac4e95cbbc3fa393c7.png ../../../_images/25288fb488c755ce8ba7f4d69a114aa474a3011728fba97751252d8c9cf73d85.png ../../../_images/562610a5795c5ccac4ea4c7537d5590e07f70585e0a8f49de55ad1e8fefb5453.png
# 3.7.1 [5]

# Proportion of each cut, with all rows placed in a single group
diamonds %>%
  ggplot() +
    geom_bar(aes(cut, after_stat(prop), group = 1))
# NOTE(review): no `group` is set here, unlike the plot above - presumably
# kept on purpose to show the problem the exercise asks about; confirm.
diamonds %>%
  ggplot() +
    geom_bar(aes(cut, after_stat(prop), fill = color))
../../../_images/d5c36c31ae02ec2f10b1054f617e13852432abcf50d71b5c865c5f0b4d9ed0db.png ../../../_images/3ad7439ccae15c370195f31c2a82e4f82af5bbfb00f4edc17e2c542e1030b7d4.png
# Position adjustments for bars filled by clarity: dodge, fill, identity
diamonds %>% ggplot() + geom_bar(aes(cut, fill = clarity), position = 'dodge')
diamonds %>% ggplot() + geom_bar(aes(cut, fill = clarity), position = 'fill')
# position = 'identity' overlaps the bars, so they are drawn hollow or
# semi-transparent
diamonds %>%
  ggplot(aes(cut, color = clarity)) +
    geom_bar(position = 'identity', fill = NA)
diamonds %>%
  ggplot(aes(cut, fill = clarity)) +
    geom_bar(position = 'identity', alpha = 1/5)
# Default position, then cut itself mapped to fill and to color
diamonds %>% ggplot() + geom_bar(aes(cut, fill = clarity))
diamonds %>% ggplot() + geom_bar(aes(cut, fill = cut))
diamonds %>% ggplot() + geom_bar(aes(cut, color = cut))
../../../_images/5fe3e55b3d1be67faa8f26fefb905b2653c30d2c2ba934739b908ad1d113aa01.png ../../../_images/5886e79758fb3f25ad2de1c1c3bad62576407f9033214c23afcb9f9c0be0208d.png ../../../_images/924f853b8dff603d607f255a60fca71ddfdbe4cf7446922fcf07196b2faf2fab.png ../../../_images/f58683caf1a596a12868d2ef25611895a4cd86d541a122074e0bb3f2bd0d2b7d.png ../../../_images/bc6097db8f36e66f1a7edf5256c862f24c442be569b35918ff454879d2d20f5f.png ../../../_images/21b9b6ad7076690f4e8c8aa5226cf87393ae8b3fbf5a9d2e728e91a7736f525a.png ../../../_images/bd805cb664e03b540b199099bd3e97142612254636764d9d576efa0d37ece658.png
# Scatterplot with the 'jitter' position adjustment applied to the points
mpg %>%
  ggplot() +
    geom_point(aes(displ, hwy), position = 'jitter')
../../../_images/abb0483f6137c34b17defaf57340639ca1f48ba2b95efb9599412fb09eeb1066.png
# Three views of cty against hwy: jittered points, counted points, raw points
mpg %>% ggplot(aes(cty, hwy)) + geom_jitter()
mpg %>% ggplot(aes(cty, hwy)) + geom_count()
mpg %>% ggplot(aes(cty, hwy)) + geom_point()
../../../_images/7bcd3078228e261ad6b9823a0768c947cb908fb3ff7ccaea052830bce6ce3dfd.png ../../../_images/2276ac1fdef37b1fa4d58c3945ed731c7596a2590491b24ca9efa1ee2f41b21e.png ../../../_images/53b3d5bf668de9ee640447562fd7b393b1fe2d868151f09d3634a533e7a12250.png
# Preview the first rows of mpg (default number of rows)
mpg %>% head()
A tibble: 6 × 11
manufacturermodeldisplyearcyltransdrvctyhwyflclass
<chr><chr><dbl><int><int><chr><chr><int><int><chr><chr>
audia41.819994auto(l5) f1829pcompact
audia41.819994manual(m5)f2129pcompact
audia42.020084manual(m6)f2031pcompact
audia42.020084auto(av) f2130pcompact
audia42.819996auto(l5) f1626pcompact
audia42.819996manual(m5)f1826pcompact
# Boxplots of displ, one per manufacturer via the fill aesthetic
mpg %>%
  ggplot() +
    geom_boxplot(aes(displ, fill = manufacturer))
../../../_images/e1fe6d0e624efc14c2e7931911afc8260ff7c58a67a2165b22c39253f74e1db0.png
# hwy by class with flipped coordinates, then in the default orientation
mpg %>%
  ggplot(aes(class, hwy)) +
    geom_boxplot() +
    coord_flip()
mpg %>%
  ggplot(aes(class, hwy)) +
    geom_boxplot()
../../../_images/29b6b32edd0375763b99c7e0813fe358473cc84b94debc7df0ffe8b9bdd4d7d7.png ../../../_images/c95785ff59d270955569a119905a610da90549a4d9765d82f4852e1f9b77303f.png
# Outline data for the 'nz' map, drawn as polygons grouped by `group`
nz <- map_data('nz')
# With coord_quickmap() applied
nz %>%
  ggplot(aes(long, lat, group = group)) +
    geom_polygon(color = 'black', fill = 'white') +
    coord_quickmap()
# With the default coordinate system
nz %>%
  ggplot(aes(long, lat, group = group)) +
    geom_polygon(color = 'black', fill = 'white')
../../../_images/b608c81ae62396d2e034310bb441110663faa4c112ad9d1c1cbbb27b2952c9dc.png ../../../_images/b973bb32c75f8718584744eaa379dda98b27e02fa6ed5febaedf22ffdd7db9fa.png
# Base bar chart of cut: full-width bars, no legend, no axis titles,
# square aspect ratio
bar <- diamonds %>%
  ggplot() +
    geom_bar(aes(cut, fill = cut), width = 1, show.legend = FALSE) +
    labs(x = NULL, y = NULL) +
    theme(aspect.ratio = 1)

# The same chart under flipped and polar coordinates
bar + coord_flip()
bar + coord_polar()
../../../_images/01f0687ad40da84da74fbdc652f85ec1b991b47119bf63b2625b648371b76144.png ../../../_images/322431e65dc5a705d1a86b5f0a8def52bd6b358b0658941fa28938ee56227dee.png
# 3.9.1 [4]

# cty against hwy with geom_abline()'s default reference line, drawn with
# fixed coordinates
mpg %>%
  ggplot(aes(cty, hwy)) +
    geom_point() +
    geom_abline() +
    coord_fixed()
../../../_images/a6cf85314459bcf9555f86c106a9eb96dcb9b37ef4b05609abdf98e8d3a27b9b.png

04 - Workflow: basics#

# Calculator-style calls: a function call, sequences built with seq(),
# and the `:` shorthand
sin(pi / 2)
seq(from = 1, to = 10)
typeof(seq(from = 1, to = 10))
seq(from = 1, to = 10, length.out = 5)
1:10
1
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
'integer'
  1. 1
  2. 3.25
  3. 5.5
  4. 7.75
  5. 10
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10

05 - Data Transformation#

# Show the airlines table (columns: carrier, name)
airlines
A tibble: 16 × 2
carriername
<chr><chr>
9EEndeavor Air Inc.
AAAmerican Airlines Inc.
ASAlaska Airlines Inc.
B6JetBlue Airways
DLDelta Air Lines Inc.
EVExpressJet Airlines Inc.
F9Frontier Airlines Inc.
FLAirTran Airways Corporation
HAHawaiian Airlines Inc.
MQEnvoy Air
OOSkyWest Airlines Inc.
UAUnited Air Lines Inc.
USUS Airways Inc.
VXVirgin America
WNSouthwest Airlines Co.
YVMesa Airlines Inc.
# Show the airports table (columns: faa, name, lat, lon, alt, tz, dst, tzone)
airports
A tibble: 1458 × 8
faanamelatlonalttzdsttzone
<chr><chr><dbl><dbl><dbl><dbl><chr><chr>
04GLansdowne Airport 41.13047 -80.619581044-5AAmerica/New_York
06AMoton Field Municipal Airport 32.46057 -85.68003 264-6AAmerica/Chicago
06CSchaumburg Regional 41.98934 -88.10124 801-6AAmerica/Chicago
06NRandall Airport 41.43191 -74.39156 523-5AAmerica/New_York
09JJekyll Island Airport 31.07447 -81.42778 11-5AAmerica/New_York
0A9Elizabethton Municipal Airport 36.37122 -82.173421593-5AAmerica/New_York
0G6Williams County Airport 41.46731 -84.50678 730-5AAmerica/New_York
0G7Finger Lakes Regional Airport 42.88356 -76.78123 492-5AAmerica/New_York
0P2Shoestring Aviation Airfield 39.79482 -76.647191000-5UAmerica/New_York
0S9Jefferson County Intl 48.05381-122.81064 108-8AAmerica/Los_Angeles
0W3Harford County Airport 39.56684 -76.20240 409-5AAmerica/New_York
10CGalt Field Airport 42.40289 -88.37511 875-6UAmerica/Chicago
17GPort Bucyrus-Crawford County Airport40.78156 -82.974811003-5AAmerica/New_York
19AJackson County Airport 34.17586 -83.56160 951-5UAmerica/New_York
1A3Martin Campbell Field Airport 35.01581 -84.346831789-5AAmerica/New_York
1B9Mansfield Municipal 42.00013 -71.19677 122-5AAmerica/New_York
1C9Frazier Lake Airpark 54.01333-124.76833 152-8AAmerica/Vancouver
1CSClow International Airport 41.69597 -88.12923 670-6UAmerica/Chicago
1G3Kent State Airport 41.15139 -81.415111134-5AAmerica/New_York
1G4Grand Canyon West Airport 35.89990-113.815674813-7AAmerica/Phoenix
1H2Effingham Memorial Airport 39.07000 -88.53400 585-6AAmerica/Chicago
1OHFortman Airport 40.55533 -84.38662 885-5UAmerica/New_York
1RLPoint Roberts Airpark 48.97972-123.07889 10-8AAmerica/Los_Angeles
23MClarke CO 32.05170 -88.44340 320-6AAmerica/Chicago
24CLowell City Airport 42.95392 -85.34391 681-5AAmerica/New_York
24JSuwannee County Airport 30.30013 -83.02469 104-5AAmerica/New_York
25DForest Lake Airport 45.24775 -92.99439 925-6AAmerica/Chicago
29DGrove City Airport 41.14603 -80.167751371-5AAmerica/New_York
2A0Mark Anton Airport 35.48625 -84.93108 718-5AAmerica/New_York
2B2Plum Island Airport 42.79536 -70.83944 11-5AAmerica/New_York
X59Valkaria Municipal 27.96086 -80.55833 26-5AAmerica/New_York
XFLFlagler County Airport 29.28210 -81.12120 33-5AAmerica/New_York
XNANW Arkansas Regional 36.28187 -94.306811287-6AAmerica/Chicago
XZKAmherst Amtrak Station AMM 42.37500 -72.51139 258-5AAmerica/New_York
Y51Municipal Airport 43.57936 -90.896471292-6AAmerica/Chicago
Y72Bloyer Field 43.97622 -90.48061 966-6AAmerica/Chicago
YAKYakutat 59.30120-139.39370 33-9ANA
YIPWillow Run 42.23793 -83.53041 716-5AAmerica/New_York
YKMYakima Air Terminal McAllister Field46.56820-120.544001095-8AAmerica/Los_Angeles
YKNChan Gurney 42.87110 -97.396901200-6AAmerica/Chicago
YNGYoungstown Warren Rgnl 41.26074 -80.679101196-5AAmerica/New_York
YUMYuma Mcas Yuma Intl 32.65658-114.60598 216-7NAmerica/Phoenix
Z84Clear 64.30120-149.12014 552-9AAmerica/Anchorage
ZBPPenn Station 39.30722 -76.61556 66-5AAmerica/New_York
ZFVPhiladelphia 30th St Station 39.95570 -75.18200 0-5AAmerica/New_York
ZPHMunicipal Airport 28.22806 -82.15583 90-5AAmerica/New_York
ZRAAtlantic City Rail Terminal 39.36650 -74.44200 8-5AAmerica/New_York
ZRDTrain Station 37.53430 -77.42945 26-5AAmerica/New_York
ZRPNewark Penn Station 40.73472 -74.16417 0-5AAmerica/New_York
ZRTHartford Union Station 41.76888 -72.68150 0-5AAmerica/New_York
ZRZNew Carrollton Rail Station 38.94800 -76.87190 39-5AAmerica/New_York
ZSFSpringfield Amtrak Station 42.10600 -72.59305 65-5AAmerica/New_York
ZSYScottsdale Airport 33.62289-111.910531519-7AAmerica/Phoenix
ZTFStamford Amtrak Station 41.04694 -73.54149 0-5AAmerica/New_York
ZTYBoston Back Bay Station 42.34780 -71.07500 20-5AAmerica/New_York
ZUNBlack Rock 35.08323-108.791786454-7AAmerica/Denver
ZVENew Haven Rail Station 41.29867 -72.92599 7-5AAmerica/New_York
ZWIWilmington Amtrak Station 39.73667 -75.55167 0-5AAmerica/New_York
ZWUWashington Union Station 38.89746 -77.00643 76-5AAmerica/New_York
ZYPPenn Station 40.75050 -73.99350 35-5AAmerica/New_York
# Show the flights table (one row per flight; 19 columns)
flights
A tibble: 336776 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311517515 2 830 819 11UA1545N14228EWRIAH22714005152013-01-01 05:00:00
201311533529 4 850 830 20UA1714N24211LGAIAH22714165292013-01-01 05:00:00
201311542540 2 923 850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:00
201311544545-110041022-18B6 725N804JBJFKBQN18315765452013-01-01 05:00:00
201311554600-6 812 837-25DL 461N668DNLGAATL116 7626 02013-01-01 06:00:00
201311554558-4 740 728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:00
201311555600-5 913 854 19B6 507N516JBEWRFLL15810656 02013-01-01 06:00:00
201311557600-3 709 723-14EV5708N829ASLGAIAD 53 2296 02013-01-01 06:00:00
201311557600-3 838 846 -8B6 79N593JBJFKMCO140 9446 02013-01-01 06:00:00
201311558600-2 753 745 8AA 301N3ALAALGAORD138 7336 02013-01-01 06:00:00
201311558600-2 849 851 -2B6 49N793JBJFKPBI14910286 02013-01-01 06:00:00
201311558600-2 853 856 -3B6 71N657JBJFKTPA15810056 02013-01-01 06:00:00
201311558600-2 924 917 7UA 194N29129JFKLAX34524756 02013-01-01 06:00:00
201311558600-2 923 937-14UA1124N53441EWRSFO36125656 02013-01-01 06:00:00
201311559600-1 941 910 31AA 707N3DUAALGADFW25713896 02013-01-01 06:00:00
201311559559 0 702 706 -4B61806N708JBJFKBOS 44 1875592013-01-01 05:00:00
201311559600-1 854 902 -8UA1187N76515EWRLAS33722276 02013-01-01 06:00:00
201311600600 0 851 858 -7B6 371N595JBLGAFLL15210766 02013-01-01 06:00:00
201311600600 0 837 825 12MQ4650N542MQLGAATL134 7626 02013-01-01 06:00:00
201311601600 1 844 850 -6B6 343N644JBEWRPBI14710236 02013-01-01 06:00:00
201311602610-8 812 820 -8DL1919N971DLLGAMSP17010206102013-01-01 06:00:00
201311602605-3 821 805 16MQ4401N730MQLGADTW105 5026 52013-01-01 06:00:00
201311606610-4 858 910-12AA1895N633AAEWRMIA15210856102013-01-01 06:00:00
201311606610-4 837 845 -8DL1743N3739PJFKATL128 7606102013-01-01 06:00:00
201311607607 0 858 915-17UA1077N53442EWRMIA15710856 72013-01-01 06:00:00
201311608600 8 807 735 32MQ3768N9EAMQEWRORD139 7196 02013-01-01 06:00:00
20131161160011 945 931 14UA 303N532UAJFKSFO36625866 02013-01-01 06:00:00
201311613610 3 925 921 4B6 135N635JBJFKRSW17510746102013-01-01 06:00:00
201311615615 010391100-21B6 709N794JBJFKSJU18215986152013-01-01 06:00:00
201311615615 0 833 842 -9DL 575N326NBEWRATL120 7466152013-01-01 06:00:00
201393021232125 -222232247-24EV5489N712EVLGACHO 45 30521252013-09-30 21:00:00
201393021272129 -223142323 -9EV3833N16546EWRCLT 72 52921292013-09-30 21:00:00
201393021282130 -223282359-31B6 97N807JBJFKDEN213162621302013-09-30 21:00:00
201393021292059 3022302232 -2EV5048N751EVLGARIC 45 29220592013-09-30 20:00:00
201393021312140 -922252255-30MQ3621N807MQJFKDCA 36 21321402013-09-30 21:00:00
201393021402140 0 10 40-30AA 185N335AAJFKLAX298247521402013-09-30 21:00:00
201393021422129 1322502239 11EV4509N12957EWRPWM 47 28421292013-09-30 21:00:00
201393021452145 0 115 140-25B61103N633JBJFKSJU192159821452013-09-30 21:00:00
201393021472137 10 30 27 3B61371N627JBLGAFLL139107621372013-09-30 21:00:00
201393021492156 -722452308-23UA 523N813UAEWRBOS 37 20021562013-09-30 21:00:00
201393021502159 -922502306-16EV3842N10575EWRMHT 39 20921592013-09-30 21:00:00
201393021591845194234420301949E3320N906XJJFKBUF 50 30118452013-09-30 18:00:00
201393022032205 -223392331 8EV5311N722EVLGABGR 61 37822 52013-09-30 22:00:00
201393022072140 2722572250 7MQ3660N532MQLGABNA 97 76421402013-09-30 21:00:00
201393022112059 7223392242 57EV4672N12145EWRSTL120 87220592013-09-30 20:00:00
201393022312245-1423352356-21B6 108N193JBJFKPWM 48 27322452013-09-30 22:00:00
201393022332113 80 112 30 42UA 471N578UAEWRSFO318256521132013-09-30 21:00:00
201393022352001154 592249130B61083N804JBJFKMCO123 94420 12013-09-30 20:00:00
201393022372245 -823452353 -8B6 234N318JBJFKBTV 43 26622452013-09-30 22:00:00
201393022402245 -523342351-17B61816N354JBJFKSYR 41 20922452013-09-30 22:00:00
201393022402250-102347 7-20B62002N281JBJFKBUF 52 30122502013-09-30 22:00:00
201393022412246 -52345 1-16B6 486N346JBJFKROC 47 26422462013-09-30 22:00:00
201393023072255 1223592358 1B6 718N565JBJFKBOS 33 18722552013-09-30 22:00:00
201393023492359-10 325 350-25B6 745N516JBJFKPSE196161723592013-09-30 23:00:00
2013930 NA1842 NA NA2019 NAEV5274N740EVLGABNA NA 76418422013-09-30 18:00:00
2013930 NA1455 NA NA1634 NA9E3393NA JFKDCA NA 21314552013-09-30 14:00:00
2013930 NA2200 NA NA2312 NA9E3525NA LGASYR NA 19822 02013-09-30 22:00:00
2013930 NA1210 NA NA1330 NAMQ3461N535MQLGABNA NA 76412102013-09-30 12:00:00
2013930 NA1159 NA NA1344 NAMQ3572N511MQLGACLE NA 41911592013-09-30 11:00:00
2013930 NA 840 NA NA1020 NAMQ3531N839MQLGARDU NA 431 8402013-09-30 08:00:00
# Show the weather table (rows keyed by origin and time_hour; 15 columns)
weather
A tibble: 26115 × 15
originyearmonthdayhourtempdewphumidwind_dirwind_speedwind_gustprecippressurevisibtime_hour
<chr><int><int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dttm>
EWR201311 139.0226.0659.3727010.35702 NA01012.0102013-01-01 01:00:00
EWR201311 239.0226.9661.63250 8.05546 NA01012.3102013-01-01 02:00:00
EWR201311 339.0228.0464.4324011.50780 NA01012.5102013-01-01 03:00:00
EWR201311 439.9228.0462.2125012.65858 NA01012.2102013-01-01 04:00:00
EWR201311 539.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
EWR201311 637.9428.0467.2124011.50780 NA01012.4102013-01-01 06:00:00
EWR201311 739.0228.0464.4324014.96014 NA01012.2102013-01-01 07:00:00
EWR201311 839.9228.0462.2125010.35702 NA01012.2102013-01-01 08:00:00
EWR201311 939.9228.0462.2126014.96014 NA01012.7102013-01-01 09:00:00
EWR2013111041.0028.0459.6526013.80936 NA01012.4102013-01-01 10:00:00
EWR2013111141.0026.9657.0626014.96014 NA01011.4102013-01-01 11:00:00
EWR2013111339.2028.4069.6733016.11092 NA0 NA102013-01-01 13:00:00
EWR2013111439.0224.0854.6828013.80936 NA01010.8102013-01-01 14:00:00
EWR2013111537.9424.0857.04290 9.20624 NA01011.9102013-01-01 15:00:00
EWR2013111637.0419.9449.6230013.8093620.7140401012.1102013-01-01 16:00:00
EWR2013111735.9619.0449.8333011.50780 NA01013.2102013-01-01 17:00:00
EWR2013111833.9815.0845.4331012.6585825.3171601014.1102013-01-01 18:00:00
EWR2013111933.0812.9242.8432010.35702 NA01014.4102013-01-01 19:00:00
EWR2013112032.0015.0849.1931014.96014 NA01015.2102013-01-01 20:00:00
EWR2013112130.0212.9248.4832018.4124826.4679401016.0102013-01-01 21:00:00
EWR2013112228.9412.0248.6932018.4124825.3171601016.5102013-01-01 22:00:00
EWR2013112328.0410.9448.1531016.11092 NA01016.4102013-01-01 23:00:00
EWR201312 026.9610.9450.3431014.9601425.3171601016.3102013-01-02 00:00:00
EWR201312 126.0610.9452.2533012.6585824.1663801016.3102013-01-02 01:00:00
EWR201312 224.9810.9454.6533013.80936 NA01017.0102013-01-02 02:00:00
EWR201312 324.08 8.9651.9332014.96014 NA01016.6102013-01-02 03:00:00
EWR201312 424.08 8.9651.9333012.65858 NA01016.9102013-01-02 04:00:00
EWR201312 524.08 8.9651.93330 6.90468 NA01016.9102013-01-02 05:00:00
EWR201312 624.08 8.9651.93310 3.45234 NA01017.2102013-01-02 06:00:00
EWR201312 724.9810.0452.50300 6.90468 NA01017.6102013-01-02 07:00:00
LGA201312291342.8037.9488.76 7012.65858 NA0.19 NA 2.502013-12-29 13:00:00
LGA201312291441.0037.9493.19 6018.41248 NA0.21 NA 1.752013-12-29 14:00:00
LGA201312291541.0039.0292.59 4013.80936 NA0.37 999.9 1.502013-12-29 15:00:00
LGA201312291641.0037.9488.76350 8.0554623.015600.28 998.7 1.502013-12-29 16:00:00
LGA201312291744.0641.0093.2435020.71404 NA0.04 NA 5.002013-12-29 17:00:00
LGA201312291842.0839.0288.8133014.96014 NA0.00 997.2 3.002013-12-29 18:00:00
LGA201312291942.8037.9485.1332017.26170 NA0.00 NA 8.002013-12-29 19:00:00
LGA201312292042.0837.9486.8932019.56326 NA0.00 NA10.002013-12-29 20:00:00
LGA201312292142.8037.4082.1732016.11092 NA0.00 NA10.002013-12-29 21:00:00
LGA201312292242.9837.0479.38300 9.20624 NA0.001003.810.002013-12-29 22:00:00
LGA201312292342.9835.0673.3931017.2617024.166380.001005.110.002013-12-29 23:00:00
LGA20131230 042.0833.9872.7832011.50780 NA0.001005.910.002013-12-30 00:00:00
LGA20131230 142.0833.9872.78250 9.20624 NA0.001007.610.002013-12-30 01:00:00
LGA20131230 241.0033.9875.88240 8.05546 NA0.001008.310.002013-12-30 02:00:00
LGA20131230 342.9833.9870.30270 9.20624 NA0.001008.210.002013-12-30 03:00:00
LGA20131230 441.0033.0873.19 0 0.00000 NA0.001008.910.002013-12-30 04:00:00
LGA20131230 542.9833.0867.8125010.35702 NA0.001009.210.002013-12-30 05:00:00
LGA20131230 642.9833.9870.30230 6.90468 NA0.001010.810.002013-12-30 06:00:00
LGA20131230 744.0635.0670.4224011.50780 NA0.001011.910.002013-12-30 07:00:00
LGA20131230 844.0633.9867.4526011.50780 NA0.001012.910.002013-12-30 08:00:00
LGA20131230 944.0633.0865.0726013.80936 NA0.001013.710.002013-12-30 09:00:00
LGA201312301042.9833.8070.2833016.11092 NA0.00 NA10.002013-12-30 10:00:00
LGA201312301141.0028.4062.2134013.8093623.015600.00 NA10.002013-12-30 11:00:00
LGA201312301237.9423.0054.5133021.8648227.618720.001015.710.002013-12-30 12:00:00
LGA201312301337.0421.9253.9734017.2617020.714040.001016.510.002013-12-30 13:00:00
LGA201312301435.9619.9451.7834013.8093621.864820.001017.110.002013-12-30 14:00:00
LGA201312301533.9817.0649.5133017.2617021.864820.001018.810.002013-12-30 15:00:00
LGA201312301632.0015.0849.1934014.9601423.015600.001019.510.002013-12-30 16:00:00
LGA201312301730.9212.9246.7432017.26170 NA0.001019.910.002013-12-30 17:00:00
LGA201312301828.9410.9446.4133018.41248 NA0.001020.910.002013-12-30 18:00:00
# Number of flights in each month
count(flights, month)
A tibble: 12 × 2
monthn
<int><int>
127004
224951
328834
428330
528796
628243
729425
829327
927574
1028889
1127268
1228135
vars <- c('year','month','day','dep_delay','arr_delay')
flights %>%
  #filter(arr_delay >= 120)                           # 1.1. Find all flights that had an arrival delay of two or more hours
  #filter(dest %in% c('IAH','HOU'))                   # 1.2. Find all flights that flew to Houston (IAH or HOU)
  #filter(carrier %in% c('AA','DL','UA'))             # 1.3. Find all flights that were operated by United, American, or Delta
  #filter(month %in% c(7,8,9))                        # 1.4. Find all flights that departed in summer (July, August, and September)
  #filter(arr_delay > 120 & dep_delay <= 0)           # 1.5. Find all flights that arrived more than two hours late, but didn't leave late.
  #filter(dep_delay >= 60 & arr_delay < dep_delay-30) # 1.6. Find all flights that were delayed by at least an hour, but made up over 30 minutes in flight.
  #filter(dep_time >= 1 & dep_time <= 600)            # 1.7. Find all flights that departed between midnight and 6am (inclusive).
  #filter(between(dep_time, 1, 600))                  # 2.
  #filter(is.na(dep_time))                            # 3.
  #arrange(desc(is.na(dep_delay)))                    # 5.3.1 [1] How could you use arrange() to sort all missing values to the start? (Hint: use is.na())
  #arrange(desc(dep_delay))                           # 5.3.1 [2] Sort flights to find the most delayed flights. Find the flights that left earliest.
  #arrange(desc(distance/air_time))                   # 5.3.1 [3] Sort flights to find the fastest (highest speed) flights.
  #arrange(desc(distance))                            # 5.3.1 [4] Which flights travelled the farthest? Which travelled the shortest?
  #select(year,month,day)                             # Select columns by name
  #select(year:day)                                   # Select all columns between year and day (inclusive)
  #select(-(year:day))                                # Select all columns except those from year to day (inclusive)
  #rename(tail_num = tailnum)
  #select(time_hour, air_time, everything())          # Move a handful of variables to the start of the data frame.
  #select(dep_delay,dep_delay,dep_delay,arr_delay)    # 5.4.1 [2] What happens if you include the name of a variable multiple times in a select() call?
  #select(any_of(vars))                               # 5.4.1 [3] What does the any_of() function do? Why might it be helpful in conjunction with vector `vars`?
  select(contains('TIME'))                           # 5.4.1 [4] Does the result of running this code surprise you? How do the select helpers deal with case by default? How can you change that default?
A tibble: 336776 × 6
dep_timesched_dep_timearr_timesched_arr_timeair_timetime_hour
<int><int><int><int><dbl><dttm>
517515 830 8192272013-01-01 05:00:00
533529 850 8302272013-01-01 05:00:00
542540 923 8501602013-01-01 05:00:00
544545100410221832013-01-01 05:00:00
554600 812 8371162013-01-01 06:00:00
554558 740 7281502013-01-01 05:00:00
555600 913 8541582013-01-01 06:00:00
557600 709 723 532013-01-01 06:00:00
557600 838 8461402013-01-01 06:00:00
558600 753 7451382013-01-01 06:00:00
558600 849 8511492013-01-01 06:00:00
558600 853 8561582013-01-01 06:00:00
558600 924 9173452013-01-01 06:00:00
558600 923 9373612013-01-01 06:00:00
559600 941 9102572013-01-01 06:00:00
559559 702 706 442013-01-01 05:00:00
559600 854 9023372013-01-01 06:00:00
600600 851 8581522013-01-01 06:00:00
600600 837 8251342013-01-01 06:00:00
601600 844 8501472013-01-01 06:00:00
602610 812 8201702013-01-01 06:00:00
602605 821 8051052013-01-01 06:00:00
606610 858 9101522013-01-01 06:00:00
606610 837 8451282013-01-01 06:00:00
607607 858 9151572013-01-01 06:00:00
608600 807 7351392013-01-01 06:00:00
611600 945 9313662013-01-01 06:00:00
613610 925 9211752013-01-01 06:00:00
615615103911001822013-01-01 06:00:00
615615 833 8421202013-01-01 06:00:00
2123212522232247 452013-09-30 21:00:00
2127212923142323 722013-09-30 21:00:00
21282130232823592132013-09-30 21:00:00
2129205922302232 452013-09-30 20:00:00
2131214022252255 362013-09-30 21:00:00
21402140 10 402982013-09-30 21:00:00
2142212922502239 472013-09-30 21:00:00
21452145 115 1401922013-09-30 21:00:00
21472137 30 271392013-09-30 21:00:00
2149215622452308 372013-09-30 21:00:00
2150215922502306 392013-09-30 21:00:00
2159184523442030 502013-09-30 18:00:00
2203220523392331 612013-09-30 22:00:00
2207214022572250 972013-09-30 21:00:00
22112059233922421202013-09-30 20:00:00
2231224523352356 482013-09-30 22:00:00
22332113 112 303182013-09-30 21:00:00
22352001 5922491232013-09-30 20:00:00
2237224523452353 432013-09-30 22:00:00
2240224523342351 412013-09-30 22:00:00
224022502347 7 522013-09-30 22:00:00
224122462345 1 472013-09-30 22:00:00
2307225523592358 332013-09-30 22:00:00
23492359 325 3501962013-09-30 23:00:00
NA1842 NA2019 NA2013-09-30 18:00:00
NA1455 NA1634 NA2013-09-30 14:00:00
NA2200 NA2312 NA2013-09-30 22:00:00
NA1210 NA1330 NA2013-09-30 12:00:00
NA1159 NA1344 NA2013-09-30 11:00:00
NA 840 NA1020 NA2013-09-30 08:00:00
# Narrow working copy of `flights`: the date columns, both delay columns,
# the distance and the air time.
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time)

# mutate() appends the derived columns after the existing ones. Later
# expressions may refer to columns created earlier in the same call.
mutate(
  flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours,
  speed = distance / air_time * 60
)

# transmute() keeps only the columns that are computed in the call.
transmute(
  flights,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
A tibble: 336776 × 11
yearmonthdaydep_delayarr_delaydistanceair_timegainhoursgain_per_hourspeed
<int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
201311 2 111400227 -93.7833333 -2.3788546370.0441
201311 4 201416227-163.7833333 -4.2290749374.2731
201311 2 331089160-312.6666667-11.6250000408.3750
201311-1-181576183 173.0500000 5.5737705516.7213
201311-6-25 762116 191.9333333 9.8275862394.1379
201311-4 12 719150-162.5000000 -6.4000000287.6000
201311-5 191065158-242.6333333 -9.1139241404.4304
201311-3-14 229 53 110.8833333 12.4528302259.2453
201311-3 -8 944140 52.3333333 2.1428571404.5714
201311-2 8 733138-102.3000000 -4.3478261318.6957
201311-2 -21028149 02.4833333 0.0000000413.9597
201311-2 -31005158 12.6333333 0.3797468381.6456
201311-2 72475345 -95.7500000 -1.5652174430.4348
201311-2-142565361 126.0166667 1.9944598426.3158
201311-1 311389257-324.2833333 -7.4708171324.2802
201311 0 -4 187 44 40.7333333 5.4545455255.0000
201311-1 -82227337 75.6166667 1.2462908396.4985
201311 0 -71076152 72.5333333 2.7631579424.7368
201311 0 12 762134-122.2333333 -5.3731343341.1940
201311 1 -61023147 72.4500000 2.8571429417.5510
201311-8 -81020170 02.8333333 0.0000000360.0000
201311-3 16 502105-191.7500000-10.8571429286.8571
201311-4-121085152 82.5333333 3.1578947428.2895
201311-4 -8 760128 42.1333333 1.8750000356.2500
201311 0-171085157 172.6166667 6.4968153414.6497
201311 8 32 719139-242.3166667-10.3597122310.3597
20131111 142586366 -36.1000000 -0.4918033423.9344
201311 3 41074175 -12.9166667 -0.3428571368.2286
201311 0-211598182 213.0333333 6.9230769526.8132
201311 0 -9 746120 92.0000000 4.5000000373.0000
2013930 -2-24 305 45 220.750000029.333333406.6667
2013930 -2 -9 529 72 71.2000000 5.833333440.8333
2013930 -2-311626213 293.5500000 8.169014458.0282
2013930 30 -2 292 45 320.750000042.666667389.3333
2013930 -9-30 213 36 210.600000035.000000355.0000
2013930 0-302475298 304.9666667 6.040268498.3221
2013930 13 11 284 47 20.7833333 2.553191362.5532
2013930 0-251598192 253.2000000 7.812500499.3750
2013930 10 31076139 72.3166667 3.021583464.4604
2013930 -7-23 200 37 160.616666725.945946324.3243
2013930 -9-16 209 39 70.650000010.769231321.5385
2013930194194 301 50 00.8333333 0.000000361.2000
2013930 -2 8 378 61-101.0166667-9.836066371.8033
2013930 27 7 764 97 201.616666712.371134472.5773
2013930 72 57 872120 152.0000000 7.500000436.0000
2013930-14-21 273 48 70.8000000 8.750000341.2500
2013930 80 422565318 385.3000000 7.169811483.9623
2013930154130 944123 242.050000011.707317460.4878
2013930 -8 -8 266 43 00.7166667 0.000000371.1628
2013930 -5-17 209 41 120.683333317.560976305.8537
2013930-10-20 301 52 100.866666711.538462347.3077
2013930 -5-16 264 47 110.783333314.042553337.0213
2013930 12 1 187 33 110.550000020.000000340.0000
2013930-10-251617196 153.2666667 4.591837495.0000
2013930 NA NA 764 NA NA NA NA NA
2013930 NA NA 213 NA NA NA NA NA
2013930 NA NA 198 NA NA NA NA NA
2013930 NA NA 764 NA NA NA NA NA
2013930 NA NA 419 NA NA NA NA NA
2013930 NA NA 431 NA NA NA NA NA
A tibble: 336776 × 3
gainhoursgain_per_hour
<dbl><dbl><dbl>
-93.7833333 -2.3788546
-163.7833333 -4.2290749
-312.6666667-11.6250000
173.0500000 5.5737705
191.9333333 9.8275862
-162.5000000 -6.4000000
-242.6333333 -9.1139241
110.8833333 12.4528302
52.3333333 2.1428571
-102.3000000 -4.3478261
02.4833333 0.0000000
12.6333333 0.3797468
-95.7500000 -1.5652174
126.0166667 1.9944598
-324.2833333 -7.4708171
40.7333333 5.4545455
75.6166667 1.2462908
72.5333333 2.7631579
-122.2333333 -5.3731343
72.4500000 2.8571429
02.8333333 0.0000000
-191.7500000-10.8571429
82.5333333 3.1578947
42.1333333 1.8750000
172.6166667 6.4968153
-242.3166667-10.3597122
-36.1000000 -0.4918033
-12.9166667 -0.3428571
213.0333333 6.9230769
92.0000000 4.5000000
220.750000029.333333
71.2000000 5.833333
293.5500000 8.169014
320.750000042.666667
210.600000035.000000
304.9666667 6.040268
20.7833333 2.553191
253.2000000 7.812500
72.3166667 3.021583
160.616666725.945946
70.650000010.769231
00.8333333 0.000000
-101.0166667-9.836066
201.616666712.371134
152.0000000 7.500000
70.8000000 8.750000
385.3000000 7.169811
242.050000011.707317
00.7166667 0.000000
120.683333317.560976
100.866666711.538462
110.783333314.042553
110.550000020.000000
153.2666667 4.591837
NA NA NA
NA NA NA
NA NA NA
NA NA NA
NA NA NA
NA NA NA
# Show the first two rows as a reminder of the column layout.
head(flights, n = 2)
A tibble: 2 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311517515283081911UA1545N14228EWRIAH22714005152013-01-01 05:00:00
201311533529485083020UA1714N24211LGAIAH22714165292013-01-01 05:00:00
# Split the HHMM-encoded departure time into hour and minute parts with
# integer division (%/%) and remainder (%%).
transmute(
  flights,
  dep_time,
  hour = dep_time %/% 100,
  minute = dep_time %% 100
)
A tibble: 336776 × 3
dep_timehourminute
<int><dbl><dbl>
517517
533533
542542
544544
554554
554554
555555
557557
557557
558558
558558
558558
558558
558558
559559
559559
559559
6006 0
6006 0
6016 1
6026 2
6026 2
6066 6
6066 6
6076 7
6086 8
611611
613613
615615
615615
21232123
21272127
21282128
21292129
21312131
21402140
21422142
21452145
21472147
21492149
21502150
21592159
220322 3
220722 7
22112211
22312231
22332233
22352235
22372237
22402240
22402240
22412241
230723 7
23492349
NANANA
NANANA
NANANA
NANANA
NANANA
NANANA
# Demo vector 1..10, assigned and then displayed.
x <- seq_len(10)
x

# Offsets: shift the vector by one position; the vacated slot becomes NA.
lag(x)   # previous value
lead(x)  # next value

# Cumulative (running) aggregates.
cumsum(x)   # running sum
cumprod(x)  # running product
cummin(x)   # running minimum
cummax(x)   # running maximum
cummean(x)  # running mean
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
  1. <NA>
  2. 1
  3. 2
  4. 3
  5. 4
  6. 5
  7. 6
  8. 7
  9. 8
  10. 9
  1. 2
  2. 3
  3. 4
  4. 5
  5. 6
  6. 7
  7. 8
  8. 9
  9. 10
  10. <NA>
  1. 1
  2. 3
  3. 6
  4. 10
  5. 15
  6. 21
  7. 28
  8. 36
  9. 45
  10. 55
  1. 1
  2. 2
  3. 6
  4. 24
  5. 120
  6. 720
  7. 5040
  8. 40320
  9. 362880
  10. 3628800
  1. 1
  2. 1
  3. 1
  4. 1
  5. 1
  6. 1
  7. 1
  8. 1
  9. 1
  10. 1
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
  1. 1
  2. 1.5
  3. 2
  4. 2.5
  5. 3
  6. 3.5
  7. 4
  8. 4.5
  9. 5
  10. 5.5
# Ranking helpers applied to a vector with one tie and one missing value.
# The missing value stays NA in every result.
y <- c(1, 2, 2, NA, 3, 4)

min_rank(y)        # tied values share the smallest rank; the next rank is skipped
min_rank(desc(y))  # same, ranking from largest to smallest
row_number(y)      # tied values get consecutive ranks
dense_rank(y)      # tied values share a rank; no rank is skipped
percent_rank(y)    # ranks rescaled to [0, 1]
cume_dist(y)       # proportion of values less than or equal to each value
  1. 1
  2. 2
  3. 2
  4. <NA>
  5. 4
  6. 5
  1. 5
  2. 3
  3. 3
  4. <NA>
  5. 2
  6. 1
  1. 1
  2. 2
  3. 3
  4. <NA>
  5. 4
  6. 5
  1. 1
  2. 2
  3. 2
  4. <NA>
  5. 3
  6. 4
  1. 0
  2. 0.25
  3. 0.25
  4. <NA>
  5. 0.75
  6. 1
  1. 0.2
  2. 0.6
  3. 0.6
  4. <NA>
  5. 0.8
  6. 1
# 5.5.2 [1] Currently `dep_time` and `sched_dep_time` are convenient to look at, but hard to compute with because they're not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.

# Convert a clock time encoded as HHMM (e.g. 517 means 5:17) to the number of
# minutes since midnight.
#
# hhmm: numeric vector of HHMM-encoded times; NA is propagated.
# Returns a numeric vector in [0, 1440). The final `%% 1440` wraps a time of
# 2400 (midnight) back to 0 instead of reporting it as minute 1440.
hhmm_to_minutes <- function(hhmm) {
  (hhmm %/% 100 * 60 + hhmm %% 100) %% 1440
}

flights %>%
  transmute(
    sched_dep_time,
    sched_dep_time_min = hhmm_to_minutes(sched_dep_time),
    dep_time,
    dep_time_min = hhmm_to_minutes(dep_time)
  )
A tibble: 336776 × 4
sched_dep_timesched_dep_time_mindep_timedep_time_min
<int><dbl><int><dbl>
515315517317
529329533333
540340542342
545345544344
600360554354
558358554354
600360555355
600360557357
600360557357
600360558358
600360558358
600360558358
600360558358
600360558358
600360559359
559359559359
600360559359
600360600360
600360600360
600360601361
610370602362
605365602362
610370606366
610370606366
607367607367
600360608368
600360611371
610370613373
615375615375
615375615375
2125128521231283
2129128921271287
2130129021281288
2059125921291289
2140130021311291
2140130021401300
2129128921421302
2145130521451305
2137129721471307
2156131621491309
2159131921501310
1845112521591319
2205132522031323
2140130022071327
2059125922111331
2245136522311351
2113127322331353
2001120122351355
2245136522371357
2245136522401360
2250137022401360
2246136622411361
2255137523071387
2359143923491429
18421122 NA NA
1455 895 NA NA
22001320 NA NA
1210 730 NA NA
1159 719 NA NA
840 520 NA NA
# 5.5.2 [2] Compare `air_time` with `arr_time - dep_time`. What do you expect to see? What do you see? What do you need to do to fix it?
#
# Both clock times are converted from HHMM to minutes since midnight before
# subtracting. `TZdelta` is the clock-based duration minus the recorded
# `air_time`. It is rarely zero, and arrivals after midnight produce large
# negative durations because arr_time_min is then smaller than dep_time_min.
# NOTE(review): the column name suggests the remaining gap is attributed to
# time-zone differences between origin and destination; confirm.

flights %>%
  mutate(
    dep_time_min = dep_time %/% 100 * 60 + dep_time %% 100,
    arr_time_min = arr_time %/% 100 * 60 + arr_time %% 100,
    air_time_min = arr_time_min - dep_time_min,
    TZdelta = air_time_min - air_time
  ) %>%
  select(air_time, dep_time_min, arr_time_min, air_time_min, TZdelta)
A tibble: 336776 × 5
air_timedep_time_minarr_time_minair_time_minTZdelta
<dbl><dbl><dbl><dbl><dbl>
227317510193 -34
227333530197 -30
160342563221 61
183344604260 77
116354492138 22
150354460106 -44
158355553198 40
53357429 72 19
140357518161 21
138358473115 -23
149358529171 22
158358533175 17
345358564206-139
361358563205-156
257359581222 -35
44359422 63 19
337359534175-162
152360531171 19
134360517157 23
147361524163 16
170362492130 -40
105362501139 34
152366538172 20
128366517151 23
157367538171 14
139368487119 -20
366371585214-152
175373565192 17
182375639264 82
120375513138 18
4512831343 60 15
7212871394 107 35
21312881408 120 -93
4512891350 61 16
3612911345 54 18
2981300 10-1290-1588
4713021370 68 21
1921305 75-1230-1422
1391307 30-1277-1416
3713091365 56 19
3913101370 60 21
5013191424 105 55
6113231419 96 35
9713271377 50 -47
12013311419 88 -32
4813511415 64 16
3181353 72-1281-1599
1231355 59-1296-1419
4313571425 68 25
4113601414 54 13
5213601427 67 15
4713611425 64 17
3313871439 52 19
1961429 205-1224-1420
NA NA NA NA NA
NA NA NA NA NA
NA NA NA NA NA
NA NA NA NA NA
NA NA NA NA NA
NA NA NA NA NA
# 5.5.2 [3] Compare `dep_time`, `sched_dep_time`, and `dep_delay`. How would you expect those three numbers to be related?
#
# Expectation: actual minus scheduled departure (both in minutes since
# midnight) equals `dep_delay`. The difference column keeps its expression
# as its name so it can be read side by side with `dep_delay`.

transmute(
  flights,
  dep_time_min = dep_time %/% 100 * 60 + dep_time %% 100,
  sched_dep_time_min = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100,
  `dep_time_min - sched_dep_time_min` = dep_time_min - sched_dep_time_min,
  dep_delay
)
A tibble: 336776 × 4
dep_time_minsched_dep_time_mindep_time_min - sched_dep_time_mindep_delay
<dbl><dbl><dbl><dbl>
317315 2 2
333329 4 4
342340 2 2
344345-1-1
354360-6-6
354358-4-4
355360-5-5
357360-3-3
357360-3-3
358360-2-2
358360-2-2
358360-2-2
358360-2-2
358360-2-2
359360-1-1
359359 0 0
359360-1-1
360360 0 0
360360 0 0
361360 1 1
362370-8-8
362365-3-3
366370-4-4
366370-4-4
367367 0 0
368360 8 8
3713601111
373370 3 3
375375 0 0
375375 0 0
12831285 -2 -2
12871289 -2 -2
12881290 -2 -2
12891259 30 30
12911300 -9 -9
13001300 0 0
13021289 13 13
13051305 0 0
13071297 10 10
13091316 -7 -7
13101319 -9 -9
13191125194194
13231325 -2 -2
13271300 27 27
13311259 72 72
13511365-14-14
13531273 80 80
13551201154154
13571365 -8 -8
13601365 -5 -5
13601370-10-10
13611366 -5 -5
13871375 12 12
14291439-10-10
NA1122 NA NA
NA 895 NA NA
NA1320 NA NA
NA 730 NA NA
NA 719 NA NA
NA 520 NA NA
# 5.5.2 [4] Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().
#
# min_rank() gives tied delays the same rank, so selecting on the rank
# (instead of cutting the sorted table with head(10)) keeps every flight tied
# for a top-10 place. Flights with a missing delay have an NA rank and are
# dropped by filter().

flights %>%
  mutate(dep_delay_rank = min_rank(desc(dep_delay))) %>%
  filter(dep_delay_rank <= 10) %>%
  arrange(dep_delay_rank) %>%
  select(dep_delay, dep_delay_rank, everything())
A tibble: 10 × 20
dep_delaydep_delay_rankyearmonthdaydep_timesched_dep_timearr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<dbl><int><int><int><int><int><int><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
1301 12013 1 9 641 900124215301272HA 51N384HAJFKHNL6404983 9 02013-01-09 09:00:00
1137 22013 61514321935160721201127MQ3535N504MQJFKCMH 74 48319352013-06-15 19:00:00
1126 32013 11011211635123918101109MQ3695N517MQEWRORD111 71916352013-01-10 16:00:00
1014 42013 92011391845145722101007AA 177N338AAJFKSFO354258618452013-09-20 18:00:00
1005 52013 722 845160010441815 989MQ3075N665MQJFKCVG 96 58916 02013-07-22 16:00:00
960 62013 4101100190013422211 931DL2391N959DLJFKTPA139100519 02013-04-10 19:00:00
911 72013 3172321 810 1351020 915DL2119N927DALGAMSP1671020 8102013-03-17 08:00:00
899 82013 627 959190012362226 850DL2007N3762YJFKPDX313245419 02013-06-27 19:00:00
898 92013 7222257 759 1211026 895DL2047N6716CLGAATL109 762 7592013-07-22 07:00:00
89610201312 5 756170010582020 878AA 172N5DMAAEWRMIA149108517 02013-12-05 17:00:00
# 5.5.2 [5] What does `1:3 + 1:10` return? Why?
#
# The shorter vector is recycled along the longer one (1, 2, 3, 1, 2, 3, ...),
# so the result has 10 elements. Because 10 is not a multiple of 3, the last
# recycling pass is incomplete and R emits a warning.

1:3 + 1:10
Warning message in 1:3 + 1:10:
“longer object length is not a multiple of shorter object length”
  1. 2
  2. 4
  3. 6
  4. 5
  5. 7
  6. 9
  7. 8
  8. 10
  9. 12
  10. 11

07 - Exploratory Data Analysis#

# Distribution of a categorical variable: the counts as a table ...
count(diamonds, cut)

# ... and the same counts as a bar chart.
diamonds %>%
  ggplot(mapping = aes(x = cut)) +
  geom_bar() +
  theme(text = element_text(size = 20))
A tibble: 5 × 2
cutn
<ord><int>
Fair 1610
Good 4906
Very Good12082
Premium 13791
Ideal 21551
../../../_images/8586e83af2fef79db67ea7a2b567732905fff3c81af47f8395d5f897522ea0f6.png
# Distribution of a continuous variable: counts per 0.5-carat bin as a table ...
count(diamonds, cut_width(x = carat, width = 0.5))

# ... and as a histogram with the same bin width.
diamonds %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.5) +
  theme(text = element_text(size = 20))
A tibble: 11 × 2
cut_width(x = carat, width = 0.5)n
<fct><int>
[-0.25,0.25] 785
(0.25,0.75] 29498
(0.75,1.25] 15977
(1.25,1.75] 5313
(1.75,2.25] 2002
(2.25,2.75] 322
(2.75,3.25] 32
(3.25,3.75] 5
(3.75,4.25] 4
(4.25,4.75] 1
(4.75,5.25] 1
../../../_images/2955c29419ba38eef4b22b77a656bd2248566173a002ac81105e7198e24ef5ca.png
# Restrict to diamonds under 3 carats and use narrower 0.1-carat bins.
smaller <- filter(diamonds, carat < 3)

count(smaller, cut_width(x = carat, width = 0.1))

smaller %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.1) +
  theme(text = element_text(size = 20))
A tibble: 27 × 2
cut_width(x = carat, width = 0.1)n
<fct><int>
[0.15,0.25] 785
(0.25,0.35]10273
(0.35,0.45] 6231
(0.45,0.55] 5417
(0.55,0.65] 2328
(0.65,0.75] 5249
(0.75,0.85] 1725
(0.85,0.95] 2656
(0.95,1.05] 6258
(1.05,1.15] 2687
(1.15,1.25] 2651
(1.25,1.35] 1063
(1.35,1.45] 325
(1.45,1.55] 2556
(1.55,1.65] 738
(1.65,1.75] 631
(1.75,1.85] 140
(1.85,1.95] 57
(1.95,2.05] 1173
(2.05,2.15] 407
(2.15,2.25] 225
(2.25,2.35] 135
(2.35,2.45] 69
(2.45,2.55] 81
(2.55,2.65] 21
(2.65,2.75] 16
(2.75,2.85] 3
../../../_images/6963f7d6fb7e6a761b0c3a76dda2db4c01b653ea60adb4e67c9d513a4c5eca77.png
# Overlaid frequency polygons of carat, one line per cut, 0.1-carat bins.
smaller %>%
  ggplot(aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1) +
  theme(text = element_text(size = 20))
../../../_images/66018a7b8ca2ed2259ea772f67ea72b4bad05f3e247e42fce5931cb78bdeeafe.png
# Very fine bins (0.01 carat) for a detailed view of the carat distribution.
ggplot(smaller) +
  geom_histogram(aes(x = carat), binwidth = 0.01) +
  theme(text = element_text(size = 20))
../../../_images/09eb2c97e5eceb75c65ab39d66169b156160a41e1f66c9af55a4e20662e72bef.png
# Histogram of eruption lengths in the `faithful` data, 0.25-wide bins.
faithful %>%
  ggplot(aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25) +
  theme(text = element_text(size = 20))
../../../_images/8e3e3f5ac3bbfd088f90cb799dc33bc10e52ca4b33c463c8008e8a8d1fdec4a3.png
# Histogram of the `y` dimension of each diamond, 0.5-wide bins.
diamonds %>%
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  theme(text = element_text(size = 20))
../../../_images/7bd5c4cab0ccefe66efcc4bda2fa138cf1a4f7942f615b3858a801082c4d84f1.png
# Same histogram, zoomed to counts 0-50 so that rare bins become visible.
# coord_cartesian() only changes the view; it does not drop any data.
diamonds %>%
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50)) +
  theme(text = element_text(size = 20))
../../../_images/565a82546323709987c5bd285bc622b9a83e0d0df565d35cd685ae1e7e725762.png
# Diamonds whose `y` value lies outside [3, 20], reduced to price and the
# three dimensions and sorted by `y`.
unusual <- diamonds %>%
  select(price, x, y, z) %>%
  filter(y < 3 | y > 20) %>%
  arrange(y)

unusual
A tibble: 9 × 4
pricexyz
<int><dbl><dbl><dbl>
51390.00 0.00.00
63810.00 0.00.00
128000.00 0.00.00
156860.00 0.00.00
180340.00 0.00.00
21300.00 0.00.00
21300.00 0.00.00
20755.1531.85.12
122108.0958.98.06

7.4 - Missing values#

Drop the unusual values#

# Option 1: drop every row whose `y` lies outside the closed range [3, 20].
diamonds2 <- filter(diamonds, y >= 3, y <= 20)
head(diamonds2)
A tibble: 6 × 10
caratcutcolorclaritydepthtablepricexyz
<dbl><ord><ord><ord><dbl><dbl><int><dbl><dbl><dbl>
0.23Ideal ESI2 61.5553263.953.982.43
0.21Premium ESI1 59.8613263.893.842.31
0.23Good EVS1 56.9653274.054.072.31
0.29Premium IVS2 62.4583344.204.232.63
0.31Good JSI2 63.3583354.344.352.75
0.24Very GoodJVVS262.8573363.943.962.48

Replace the unusual values with missing values#

# Option 2: keep every row but blank out implausible `y` values with NA.
# The condition is stated positively: keep `y` when it lies within [3, 20].
diamonds2 <- mutate(diamonds, y = ifelse(y >= 3 & y <= 20, y, NA))
head(diamonds2)
A tibble: 6 × 10
caratcutcolorclaritydepthtablepricexyz
<dbl><ord><ord><ord><dbl><dbl><int><dbl><dbl><dbl>
0.23Ideal ESI2 61.5553263.953.982.43
0.21Premium ESI1 59.8613263.893.842.31
0.23Good EVS1 56.9653274.054.072.31
0.29Premium IVS2 62.4583344.204.232.63
0.31Good JSI2 63.3583354.344.352.75
0.24Very GoodJVVS262.8573363.943.962.48
# Same replacement written with case_when(): out-of-range values become NA,
# everything else falls through to the original `y`.
diamonds2 <- diamonds %>%
  mutate(y = case_when(y < 3 | y > 20 ~ NA, .default = y))

head(diamonds2)
A tibble: 6 × 10
caratcutcolorclaritydepthtablepricexyz
<dbl><ord><ord><ord><dbl><dbl><int><dbl><dbl><dbl>
0.23Ideal ESI2 61.5553263.953.982.43
0.21Premium ESI1 59.8613263.893.842.31
0.23Good EVS1 56.9653274.054.072.31
0.29Premium IVS2 62.4583344.204.232.63
0.31Good JSI2 63.3583354.344.352.75
0.24Very GoodJVVS262.8573363.943.962.48
# Scatterplot of x against y; na.rm = TRUE silences the warning about the
# rows whose `y` was set to NA.
diamonds2 %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(na.rm = TRUE) +
  theme(text = element_text(size = 20))
../../../_images/7d9e821b9c469372fa6b167a2840c8c5d1444b11fb29878e2433b38541d80113.png
# Compare scheduled departure times of cancelled and non-cancelled flights.
# A flight counts as cancelled when its actual departure time is missing.
# The HHMM schedule is converted to fractional hours and binned by quarter hour.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1 / 4) +
  theme(text = element_text(size = 20))
../../../_images/d51c1dae934921c0e41bed5730c9c9c5e2bee4f795105b3acf3ade1045843de6.png

7.5 - Covariation#

7.5.1 - One categorical variable and one continuous variable#

# Price distribution per cut as overlaid frequency polygons (counts),
# 500-wide price bins.
diamonds %>%
  ggplot(aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 500) +
  theme(text = element_text(size = 20))
../../../_images/1229d2ea2973bff191a7aa863617d835c8a1d5ff0ac5b62871d2bd46ec319266.png
# Number of diamonds per cut; the group sizes differ a lot, which makes the
# count-based polygons hard to compare.
diamonds %>%
  ggplot(aes(x = cut)) +
  geom_bar() +
  theme(text = element_text(size = 20))
../../../_images/8586e83af2fef79db67ea7a2b567732905fff3c81af47f8395d5f897522ea0f6.png
# Price distribution per cut on a density scale, so that groups of different
# size can be compared. after_stat(density) replaces the `..density..`
# notation, which is deprecated since ggplot2 3.4.0.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500) +
  theme(text = element_text(size = 20))
Warning message:
“The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
 Please use `after_stat(density)` instead.”
../../../_images/bdec9f62f5143512ed145bf27c830e41cdba17ee7aff6596b1ff08f69a87116a.png
# Boxplot of price for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot() +
  theme(text = element_text(size = 20))
../../../_images/6dc6d189d574f906e27ed0646e50781638146ad77cf79c8a151bf719b4e2df84.png
# Boxplot of highway mileage for each vehicle class, in default class order.
mpg %>%
  ggplot(aes(x = class, y = hwy)) +
  geom_boxplot() +
  theme(text = element_text(size = 20))
../../../_images/0316634c75b2e49652a6eac6a7a612004aca1255487f234eac01f91f2270d5a3.png
# Same boxplot with the classes ordered by their median highway mileage.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  theme(text = element_text(size = 20))
../../../_images/cc3381012eb320f475b2edf233eba04e93d9bb9420dc50a493cff2e47030ac25.png
# Reordered boxplot with the axes flipped.
ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot() +
  coord_flip() +
  theme(text = element_text(size = 20))
../../../_images/bb4d5af2e3d80ea37778e39886b20766ee4301f1b8228428180a6638121e6827.png

7.5.2 - Two categorical variables#

# To visualize the covariation between two categorical variables, count the
# number of observations for each combination of their values. Covariation
# appears as a strong correlation between specific x values and specific y
# values.

# Approach 1: the built-in `geom_count()`. The size of each circle shows how
# many observations occurred at that combination of values.
diamonds %>%
  ggplot(aes(x = cut, y = color)) +
  geom_count() +
  theme(text = element_text(size = 20))

# Approach 2: compute the counts with dplyr ...
count(diamonds, color, cut)

# ... and visualize them with `geom_tile()` and the fill aesthetic.
# If the categorical variables are unordered, the seriation package can reorder
# rows and columns simultaneously to reveal interesting patterns more clearly.
# For larger plots, try the d3heatmap or heatmaply packages, which create
# interactive plots.
count(diamonds, color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile() +
  theme(text = element_text(size = 20))
A tibble: 35 × 3
colorcutn
<ord><ord><int>
DFair 163
DGood 662
DVery Good1513
DPremium 1603
DIdeal 2834
EFair 224
EGood 933
EVery Good2400
EPremium 2337
EIdeal 3903
FFair 312
FGood 909
FVery Good2164
FPremium 2331
FIdeal 3826
GFair 314
GGood 871
GVery Good2299
GPremium 2924
GIdeal 4884
HFair 303
HGood 702
HVery Good1824
HPremium 2360
HIdeal 3115
IFair 175
IGood 522
IVery Good1204
IPremium 1428
IIdeal 2093
JFair 119
JGood 307
JVery Good 678
JPremium 808
JIdeal 896
../../../_images/5c9d951aa22dbd782cf5770c33964875d504e3d7cacea917bc046e5b9ec3205b.png ../../../_images/651f63353854b8281d2a7d4e514de3d7e94f5812c705ac36f79d155d5d6bed44.png

7.5.3 - Two continuous variables#

# One way to visualize the covariation between two continuous variables is a
# scatterplot drawn with `geom_point()`; covariation shows up as a pattern in
# the points. Here, for example, an exponential relationship between the carat
# size and the price of a diamond can be seen.

diamonds %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point() +
  theme(text = element_text(size = 20))
../../../_images/6ea853c16bdc2584d5bad5ae79d21513704537d946975624b5e5a277ebecbe97.png
# Same scatterplot with nearly transparent points to reduce overplotting.
diamonds %>%
  ggplot(aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100) +
  theme(text = element_text(size = 20))
../../../_images/fe2c1b7d88ee9a42ebf1377c38677220d1df7f9836c68abb6f18bb167d2c12c1.png
# Bin carat and price in two dimensions (rectangular bins); fill shows the count.
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_bin2d() +
  theme(text = element_text(size = 20))
../../../_images/65b85acb03e02ec868e3a9727d390d57121dbb505546748724545750afdd9a98.png
# Same two-dimensional binning with hexagonal bins.
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_hex() +
  theme(text = element_text(size = 20))
../../../_images/f3503763f930329bf1e5baae0b33dcd3c48ca134c6a34002dfadf6291175eb58.png
# Treat carat as categorical: one price boxplot per 0.1-carat-wide bin.
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(x = carat, width = 0.1))) +
  theme(text = element_text(size = 20))
../../../_images/81a14924916b96ba5feca662902489316666a97016a52d1795261ca1268ff8aa.png
# Alternative binning: 20 bins holding roughly the same number of diamonds each.
smaller %>%
  ggplot(aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(x = carat, n = 20))) +
  theme(text = element_text(size = 20))
../../../_images/4e18973d3e08d9cd5965cc3cb989222c956cc671eb41f775be862ef0ce7a38c1.png

7.6 - Patterns and models#

# Eruption length against waiting time in the `faithful` data.
faithful %>%
  ggplot(aes(x = eruptions, y = waiting)) +
  geom_point() +
  theme(text = element_text(size = 20))
../../../_images/50b396023dd7669e457fc10b426e8fe29b0c71c34cc50e17d2bdaf782ebbf11d.png
# Linear model of log(price) on log(carat).
mod <- lm(formula = log(x = price) ~ log(x = carat), data = diamonds)

# Attach the model residuals as column `resid` and back-transform them from
# the log scale with exp().
diamonds2 <- add_residuals(diamonds, mod) %>%
  mutate(resid = exp(resid))

# Residuals against carat.
diamonds2 %>%
  ggplot(aes(x = carat, y = resid)) +
  geom_point() +
  theme(text = element_text(size = 20))
../../../_images/a1efa2577afeefe88af4db9e339fa700dbea7021d97dc66de34e1213faa14d97.png
# Residuals (price with the carat effect removed by the model) for each cut.
diamonds2 %>%
  ggplot(aes(x = cut, y = resid)) +
  geom_boxplot() +
  theme(text = element_text(size = 20))
../../../_images/b9009d739006fdb9c06ad63d6c737349b35ef6e383e6e470c1105194330af6aa.png

12 - Tidy Data#

# The same data (country, year, cases, population) in five different layouts.
# One row per country and year, with `cases` and `population` as columns.
table1
# One row per country, year and measure: `type` names the measure, `count` holds its value.
table2
# Cases and population combined into one character column `rate` ("cases/population").
table3
# Cases only, spread across one column per year (`1999`, `2000`).
table4a
# Population only, spread across one column per year (`1999`, `2000`).
table4b
A tibble: 6 × 4
countryyearcasespopulation
<chr><dbl><dbl><dbl>
Afghanistan1999 745 19987071
Afghanistan2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 19992122581272915272
China 20002137661280428583
A tibble: 12 × 4
countryyeartypecount
<chr><dbl><chr><dbl>
Afghanistan1999cases 745
Afghanistan1999population 19987071
Afghanistan2000cases 2666
Afghanistan2000population 20595360
Brazil 1999cases 37737
Brazil 1999population 172006362
Brazil 2000cases 80488
Brazil 2000population 174504898
China 1999cases 212258
China 1999population1272915272
China 2000cases 213766
China 2000population1280428583
A tibble: 6 × 3
countryyearrate
<chr><dbl><chr>
Afghanistan1999745/19987071
Afghanistan20002666/20595360
Brazil 199937737/172006362
Brazil 200080488/174504898
China 1999212258/1272915272
China 2000213766/1280428583
A tibble: 3 × 3
country19992000
<chr><dbl><dbl>
Afghanistan 745 2666
Brazil 37737 80488
China 212258213766
A tibble: 3 × 3
country19992000
<chr><dbl><dbl>
Afghanistan 19987071 20595360
Brazil 172006362 174504898
China 12729152721280428583
# Cases per 10,000 inhabitants, added as a new column `rate`.
mutate(table1, rate = cases / population * 10000)

# Total number of cases per year (rows weighted by `cases`).
count(table1, year, wt = cases)

# Cases over time: one grey line per country, points coloured by country.
table1 %>%
  ggplot(aes(x = year, y = cases)) +
  geom_line(aes(group = country), color = "grey50") +
  geom_point(aes(color = country))
A tibble: 6 × 5
countryyearcasespopulationrate
<chr><dbl><dbl><dbl><dbl>
Afghanistan1999 745 199870710.372741
Afghanistan2000 2666 205953601.294466
Brazil 1999 37737 1720063622.193930
Brazil 2000 80488 1745048984.612363
China 199921225812729152721.667495
China 200021376612804285831.669488
A tibble: 2 × 2
yearn
<dbl><dbl>
1999250740
2000296920
../../../_images/064c99a4d0d24feadaf03fdd79611b164732df7af602b3d56de52b72d8a6c98a.png
# Tidy the two wide tables and combine them into one tibble.
# Each wide table is shown first, followed by its long form, in which the year
# column headers have moved into a `year` column.
table4a
tidy4a <- table4a %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4a

table4b
tidy4b <- table4b %>%
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
tidy4b

# Join on explicit keys instead of relying on the implicit natural join,
# which guesses the keys and reports them in a message.
left_join(tidy4a, tidy4b, by = join_by(country, year))
A tibble: 3 × 3
country19992000
<chr><dbl><dbl>
Afghanistan 745 2666
Brazil 37737 80488
China 212258213766
A tibble: 6 × 3
countryyearcases
<chr><chr><dbl>
Afghanistan1999 745
Afghanistan2000 2666
Brazil 1999 37737
Brazil 2000 80488
China 1999212258
China 2000213766
A tibble: 3 × 3
country19992000
<chr><dbl><dbl>
Afghanistan 19987071 20595360
Brazil 172006362 174504898
China 12729152721280428583
A tibble: 6 × 3
countryyearpopulation
<chr><chr><dbl>
Afghanistan1999 19987071
Afghanistan2000 20595360
Brazil 1999 172006362
Brazil 2000 174504898
China 19991272915272
China 20001280428583
Joining with `by = join_by(country, year)`
A tibble: 6 × 4
countryyearcasespopulation
<chr><chr><dbl><dbl>
Afghanistan1999 745 19987071
Afghanistan2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 19992122581272915272
China 20002137661280428583

13 - Relational data#

# How `flights` relates to the other nycflights13 tibbles:
#   flights -> planes    via `tailnum`
#   flights -> airlines  via `carrier`
#   flights -> airports  via `origin` and `dest`
#   flights -> weather   via `origin`, `year`, `month`, `day`, and `hour`

# List the datasets that ship with the package.
data(package = "nycflights13")

# `airlines` maps each abbreviated carrier code to the full carrier name.
nycflights13::airlines %>%
  head(n = 6)
A tibble: 6 × 2
carriername
<chr><chr>
9EEndeavor Air Inc.
AAAmerican Airlines Inc.
ASAlaska Airlines Inc.
B6JetBlue Airways
DLDelta Air Lines Inc.
EVExpressJet Airlines Inc.
# tibble `airports` gives info about each airport each of which is identified by the faa airport code
head(x = nycflights13::airports, n = 6)
A tibble: 6 × 8
faanamelatlonalttzdsttzone
<chr><chr><dbl><dbl><dbl><dbl><chr><chr>
04GLansdowne Airport 41.13047-80.619581044-5AAmerica/New_York
06AMoton Field Municipal Airport 32.46057-85.68003 264-6AAmerica/Chicago
06CSchaumburg Regional 41.98934-88.10124 801-6AAmerica/Chicago
06NRandall Airport 41.43191-74.39156 523-5AAmerica/New_York
09JJekyll Island Airport 31.07447-81.42778 11-5AAmerica/New_York
0A9Elizabethton Municipal Airport36.37122-82.173421593-5AAmerica/New_York
# tibble `flights`
#   foreign key - tailnum

head(x = nycflights13::flights, n = 6)

# not quite the primary key
flights %>%
  count(year, month, day, flight, tailnum) %>%
    filter(n > 1)
A tibble: 6 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311517515 2 830 819 11UA1545N14228EWRIAH22714005152013-01-01 05:00:00
201311533529 4 850 830 20UA1714N24211LGAIAH22714165292013-01-01 05:00:00
201311542540 2 923 850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:00
201311544545-110041022-18B6 725N804JBJFKBQN18315765452013-01-01 05:00:00
201311554600-6 812 837-25DL 461N668DNLGAATL116 7626 02013-01-01 06:00:00
201311554558-4 740 728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:00
A tibble: 11 × 6
yearmonthdayflighttailnumn
<int><int><int><int><chr><int>
2013 2 9 303NA 2
2013 2 9 655NA 2
2013 2 91623NA 2
2013 6 82269N487WN2
2013 6152269N230WN2
2013 6222269N440LV2
2013 6292269N707SA2
2013 7 62269N259WN2
2013 8 32269N446WN2
2013 8102269N478WN2
20131215 398NA 2
# tibble `planes` gives info about planes each of which is identified by its tailnum
#   primary key - variable `tailnum`

head(x = nycflights13::planes, n = 6)

planes %>%
  count(tailnum) %>%
    filter(n > 1)
A tibble: 6 × 9
tailnumyeartypemanufacturermodelenginesseatsspeedengine
<chr><int><chr><chr><chr><int><int><int><chr>
N101562004Fixed wing multi engineEMBRAER EMB-145XR2 55NATurbo-fan
N102UW1998Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N103US1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N104UW1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
N105752002Fixed wing multi engineEMBRAER EMB-145LR2 55NATurbo-fan
N105UW1999Fixed wing multi engineAIRBUS INDUSTRIEA320-214 2182NATurbo-fan
A tibble: 0 × 2
tailnumn
<chr><int>
# tibble `weather` gives the weather at each NYC airport for each hour
#   primary key? - variables `year`, `month`, `day`, `hour`, `origin`

head(x = nycflights13::weather, n = 6)

# not quite the primary key
weather %>%
  count(year, month, day, hour, origin) %>%
    filter(n > 1)
A tibble: 6 × 15
originyearmonthdayhourtempdewphumidwind_dirwind_speedwind_gustprecippressurevisibtime_hour
<chr><int><int><int><int><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dttm>
EWR201311139.0226.0659.3727010.35702NA01012.0102013-01-01 01:00:00
EWR201311239.0226.9661.63250 8.05546NA01012.3102013-01-01 02:00:00
EWR201311339.0228.0464.4324011.50780NA01012.5102013-01-01 03:00:00
EWR201311439.9228.0462.2125012.65858NA01012.2102013-01-01 04:00:00
EWR201311539.0228.0464.4326012.65858NA01011.9102013-01-01 05:00:00
EWR201311637.9428.0467.2124011.50780NA01012.4102013-01-01 06:00:00
A tibble: 3 × 6
yearmonthdayhouroriginn
<int><int><int><int><chr><int>
20131131EWR2
20131131JFK2
20131131LGA2
flights2 <-
  flights %>%
    select(year:day, hour, origin, dest, tailnum, carrier)
head(x = flights2, n = 6)
A tibble: 6 × 8
yearmonthdayhourorigindesttailnumcarrier
<int><int><int><dbl><chr><chr><chr><chr>
2013115EWRIAHN14228UA
2013115LGAIAHN24211UA
2013115JFKMIAN619AAAA
2013115JFKBQNN804JBB6
2013116LGAATLN668DNDL
2013115EWRORDN39463UA
flights2 %>%
  select(-origin, -dest) %>%
    left_join(y = airlines, by = 'carrier') %>%
      head(n = 6)
A tibble: 6 × 7
yearmonthdayhourtailnumcarriername
<int><int><int><dbl><chr><chr><chr>
2013115N14228UAUnited Air Lines Inc.
2013115N24211UAUnited Air Lines Inc.
2013115N619AAAAAmerican Airlines Inc.
2013115N804JBB6JetBlue Airways
2013116N668DNDLDelta Air Lines Inc.
2013115N39463UAUnited Air Lines Inc.
flights2 %>%
  select(-origin, -dest) %>%
    mutate(
      name = airlines$name[match(carrier, airlines$carrier)]
    ) %>%
      head(n = 6)
A tibble: 6 × 7
yearmonthdayhourtailnumcarriername
<int><int><int><dbl><chr><chr><chr>
2013115N14228UAUnited Air Lines Inc.
2013115N24211UAUnited Air Lines Inc.
2013115N619AAAAAmerican Airlines Inc.
2013115N804JBB6JetBlue Airways
2013116N668DNDLDelta Air Lines Inc.
2013115N39463UAUnited Air Lines Inc.
x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     3,   'x3'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     4,   'y3'
)

inner_join(x = x, y = y, by = 'key')

base::merge(x = x, y = y)
A tibble: 2 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
A data.frame: 2 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
# ONE TABLE HAS DUPLICATE KEYS
#   this is useful when you want to add in additional info (there is typically a one-to-many relationship)

x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     2,   'x3',
     1,   'x4'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2'
)

left_join(x = x, y = y, by = 'key')

base::merge(x = x, y = y)
A tibble: 4 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
2x3y2
1x4y1
A data.frame: 4 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
1x4y1
2x2y2
2x3y2
# BOTH TABLES HAVE DUPLICATE KEYS
#   this is usually an error because in neither table do the keys uniquely identify an observation
#   when you join duplicated keys you get all possible combinations, the Cartesian product

x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     2,   'x3',
     3,   'x4'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     2,   'y3',
     3,   'y4'
)

left_join(x = x, y = y, by = 'key')

base::merge(x = x, y = y)
Warning message in left_join(x = x, y = y, by = "key"):
“Detected an unexpected many-to-many relationship between `x` and `y`.
 Row 2 of `x` matches multiple rows in `y`.
 Row 2 of `y` matches multiple rows in `x`.
 If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.”
A tibble: 6 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
2x2y3
2x3y2
2x3y3
3x4y4
A data.frame: 6 × 3
keyval_xval_y
<dbl><chr><chr>
1x1y1
2x2y2
2x2y3
2x3y2
2x3y3
3x4y4
# NATURAL JOIN
#   by = c('year', 'month', 'day', 'hour', 'origin')

flights2 %>%
  left_join(y = weather) %>%
    head()
Joining with `by = join_by(year, month, day, hour, origin)`
A tibble: 6 × 18
yearmonthdayhourorigindesttailnumcarriertempdewphumidwind_dirwind_speedwind_gustprecippressurevisibtime_hour
<int><int><int><dbl><chr><chr><chr><chr><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dttm>
2013115EWRIAHN14228UA39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
2013115LGAIAHN24211UA39.9224.9854.8125014.9601421.8648201011.4102013-01-01 05:00:00
2013115JFKMIAN619AAAA39.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
2013115JFKBQNN804JBB639.0226.9661.6326014.96014 NA01012.1102013-01-01 05:00:00
2013116LGAATLN668DNDL39.9224.9854.8126016.1109223.0156001011.7102013-01-01 06:00:00
2013115EWRORDN39463UA39.0228.0464.4326012.65858 NA01011.9102013-01-01 05:00:00
flights2 %>%
  left_join(y = planes, by = 'tailnum') %>%
    head()
A tibble: 6 × 16
year.xmonthdayhourorigindesttailnumcarrieryear.ytypemanufacturermodelenginesseatsspeedengine
<int><int><int><dbl><chr><chr><chr><chr><int><chr><chr><chr><int><int><int><chr>
2013115EWRIAHN14228UA1999Fixed wing multi engineBOEING737-824 2149NATurbo-fan
2013115LGAIAHN24211UA1998Fixed wing multi engineBOEING737-824 2149NATurbo-fan
2013115JFKMIAN619AAAA1990Fixed wing multi engineBOEING757-223 2178NATurbo-fan
2013115JFKBQNN804JBB62012Fixed wing multi engineAIRBUSA320-232 2200NATurbo-fan
2013116LGAATLN668DNDL1991Fixed wing multi engineBOEING757-232 2178NATurbo-fan
2013115EWRORDN39463UA2012Fixed wing multi engineBOEING737-924ER2191NATurbo-fan
flights2 %>%
  left_join(y = airports, by = c('dest' = 'faa')) %>%
    head()
A tibble: 6 × 15
yearmonthdayhourorigindesttailnumcarriernamelatlonalttzdsttzone
<int><int><int><dbl><chr><chr><chr><chr><chr><dbl><dbl><dbl><dbl><chr><chr>
2013115EWRIAHN14228UAGeorge Bush Intercontinental 29.98443-95.34144 97-6A America/Chicago
2013115LGAIAHN24211UAGeorge Bush Intercontinental 29.98443-95.34144 97-6A America/Chicago
2013115JFKMIAN619AAAAMiami Intl 25.79325-80.29056 8-5A America/New_York
2013115JFKBQNN804JBB6NA NA NA NANANANA
2013116LGAATLN668DNDLHartsfield Jackson Atlanta Intl33.63672-84.428071026-5A America/New_York
2013115EWRORDN39463UAChicago Ohare Intl 41.97860-87.90484 668-6A America/Chicago
flights2 %>%
  left_join(y = airports, by = c('origin' = 'faa')) %>%
    head()
A tibble: 6 × 15
yearmonthdayhourorigindesttailnumcarriernamelatlonalttzdsttzone
<int><int><int><dbl><chr><chr><chr><chr><chr><dbl><dbl><dbl><dbl><chr><chr>
2013115EWRIAHN14228UANewark Liberty Intl40.69250-74.1686718-5AAmerica/New_York
2013115LGAIAHN24211UALa Guardia 40.77725-73.8726122-5AAmerica/New_York
2013115JFKMIAN619AAAAJohn F Kennedy Intl40.63975-73.7789313-5AAmerica/New_York
2013115JFKBQNN804JBB6John F Kennedy Intl40.63975-73.7789313-5AAmerica/New_York
2013116LGAATLN668DNDLLa Guardia 40.77725-73.8726122-5AAmerica/New_York
2013115EWRORDN39463UANewark Liberty Intl40.69250-74.1686718-5AAmerica/New_York
options(repr.plot.width=20, repr.plot.height=10)

# Map every airport that appears as a destination in `flights`.
# `semi_join()` keeps only the `airports` rows whose `faa` code matches some
# `flights$dest`; the points are then drawn over the US state borders.
# Only `lon` and `lat` are mapped: no variable is mapped to point size.
airports %>%
  semi_join(y = flights, by = c('faa' = 'dest')) %>%
    ggplot(mapping = aes(x = lon, y = lat)) +
      borders('state') +
      geom_point() +
      coord_quickmap() +
      theme(
        text=element_text(size=20)
      )
../../../_images/6a8425bdd191dc907148db99e2093ee665e8d06b1bc20a439441d8481dfe04c5.png

Filtering Joins#

Semi Join#

x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     3,   'x3'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     4,   'y3'
)

semi_join(x = x, y = y)
Joining with `by = join_by(key)`
A tibble: 2 × 2
keyval_x
<dbl><chr>
1x1
2x2
x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     2,   'x3',
     3,   'x4'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     2,   'y3',
     3,   'y4'
)

semi_join(x = x, y = y)
Joining with `by = join_by(key)`
A tibble: 4 × 2
keyval_x
<dbl><chr>
1x1
2x2
2x3
3x4
top_dest <-
  flights %>%
    count(dest, sort = TRUE) %>%
      head(n = 10)
top_dest

flights %>%
  filter(dest %in% top_dest$dest) %>%
    head()

flights %>%
  semi_join(y = top_dest) %>%
    head()
A tibble: 10 × 2
destn
<chr><int>
ORD17283
ATL17215
LAX16174
BOS15508
MCO14082
CLT14064
SFO13331
FLL12055
MIA11728
DCA 9705
A tibble: 6 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311542540 2923850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:00
201311554600-6812837-25DL 461N668DNLGAATL116 7626 02013-01-01 06:00:00
201311554558-4740728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:00
201311555600-5913854 19B6 507N516JBEWRFLL15810656 02013-01-01 06:00:00
201311557600-3838846 -8B6 79N593JBJFKMCO140 9446 02013-01-01 06:00:00
201311558600-2753745 8AA 301N3ALAALGAORD138 7336 02013-01-01 06:00:00
Joining with `by = join_by(dest)`
A tibble: 6 × 19
yearmonthdaydep_timesched_dep_timedep_delayarr_timesched_arr_timearr_delaycarrierflighttailnumorigindestair_timedistancehourminutetime_hour
<int><int><int><int><int><dbl><int><int><dbl><chr><int><chr><chr><chr><dbl><dbl><dbl><dbl><dttm>
201311542540 2923850 33AA1141N619AAJFKMIA16010895402013-01-01 05:00:00
201311554600-6812837-25DL 461N668DNLGAATL116 7626 02013-01-01 06:00:00
201311554558-4740728 12UA1696N39463EWRORD150 7195582013-01-01 05:00:00
201311555600-5913854 19B6 507N516JBEWRFLL15810656 02013-01-01 06:00:00
201311557600-3838846 -8B6 79N593JBJFKMCO140 9446 02013-01-01 06:00:00
201311558600-2753745 8AA 301N3ALAALGAORD138 7336 02013-01-01 06:00:00

Anti Join#

x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     3,   'x3'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     4,   'y3'
)

anti_join(x = x, y = y)
Joining with `by = join_by(key)`
A tibble: 1 × 2
keyval_x
<dbl><chr>
3x3
x <- tribble(
  ~key, ~val_x,
     1,   'x1',
     2,   'x2',
     2,   'x3',
     3,   'x4'
)

y <- tribble(
  ~key, ~val_y,
     1,   'y1',
     2,   'y2',
     2,   'y3',
     3,   'y4'
)

anti_join(x = x, y = y)
Joining with `by = join_by(key)`
A tibble: 0 × 2
keyval_x
<dbl><chr>
# anti joins are useful for diagnosing join mismatches
# for example, you might be interested to know that there are many `flights` that don't have a match in `planes`

flights %>%
  anti_join(y = planes, by = 'tailnum') %>%
    count(tailnum, sort = TRUE) %>%
      head()
A tibble: 6 × 2
tailnumn
<chr><int>
NA 2512
N725MQ 575
N722MQ 513
N723MQ 507
N713MQ 483
N735MQ 396

13.7 - Set operations#

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)

df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)

intersect(x = df1, y = df2)

union(x = df1, y = df2)

setdiff(x = df1, y = df2)

setdiff(x = df2, y = df1)
A tibble: 1 × 2
xy
<dbl><dbl>
11
A tibble: 3 × 2
xy
<dbl><dbl>
11
21
12
A tibble: 1 × 2
xy
<dbl><dbl>
21
A tibble: 1 × 2
xy
<dbl><dbl>
12

14 - Strings#

string1      <- "This is a string"
string2      <- 'If I want to include a "quote" inside a string, I use single quotes'
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"

# the printed representation of a string is not the same as the string itself
x <- c("\"", "\\")
x
writeLines(x)
  1. '"'
  2. '\\'
"
\
?'"'
x <- "\u00b5"
x

c('one', 'two', 'three')
'µ'
  1. 'one'
  2. 'two'
  3. 'three'
str_length(c('a', 'R for data science', NA))
  1. 1
  2. 18
  3. <NA>
str_c('x', 'y', 'z')
'xyz'
x <- c('abc', NA)

str_c('|-', x, '-|')

str_c('|-', str_replace_na(x), '-|')
  1. '|-abc-|'
  2. NA
  1. '|-abc-|'
  2. '|-NA-|'
str_c('prefix-', c('a', 'b', 'c'), '-suffix')
  1. 'prefix-a-suffix'
  2. 'prefix-b-suffix'
  3. 'prefix-c-suffix'
name        <- 'Hadley'
time_of_day <- 'morning'
birthday    <- FALSE

str_c('Good ', time_of_day, ' ', name, if (birthday) ' and HAPPY BIRTHDAY', '.')
'Good morning Hadley.'
str_c(c('x', 'y', 'z'), collapse = ', ')
'x, y, z'
x <- c('Apple', 'Banana', 'Pear')

str_sub(x,  1,  3)
str_sub(x, -3, -1)
str_sub(x,  1, 10)
  1. 'App'
  2. 'Ban'
  3. 'Pea'
  1. 'ple'
  2. 'ana'
  3. 'ear'
  1. 'Apple'
  2. 'Banana'
  3. 'Pear'
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))

x
  1. 'apple'
  2. 'banana'
  3. 'pear'
str_to_upper(c('i', 'ı'))
str_to_upper(c('i', 'ı'), locale = 'tr')
  1. 'I'
  2. 'I'
  1. 'İ'
  2. 'I'
x <- c('apple', 'eggplant', 'banana')

str_sort(x, locale = 'en')  # English
str_sort(x, locale = 'haw') # Hawaiian
  1. 'apple'
  2. 'banana'
  3. 'eggplant'
  1. 'apple'
  2. 'eggplant'
  3. 'banana'
x <- c('apple', 'banana', 'pear')

str_view(x, 'an')
[2] │ b<an><an>a
str_view(x, '.a.')
[2] │ <ban>ana
[3] │ p<ear>
dot <- '\\.'

dot

writeLines(dot)
'\\.'
\.
str_view(c('abc', 'a.c', 'bef'), 'a\\.c')
[2] │ <a.c>
x <- 'a\\b'

x

writeLines(x)

str_view(x, '\\\\')
'a\\b'
a\b
[1] │ a<\>b
x <- 
Error in parse(text = x, srcfile = src): <text>:2:0: unexpected end of input
1: x <- 
   ^
Traceback:

16 - Dates and times#

today()
now()
[1] "2023-06-19 21:40:12 EDT"
lubridate::ymd('2017-01-31')
lubridate::mdy('January 31st, 2017')
lubridate::dmy('31-Jan-2017')
lubridate::ymd(20170131)
lubridate::ymd_hms('2017-01-31 20:11:59')
lubridate::mdy_hm('01/31/2017 08:01')
lubridate::ymd(20170131, tz = 'UTC')
[1] "2017-01-31 20:11:59 UTC"
[1] "2017-01-31 08:01:00 UTC"
[1] "2017-01-31 UTC"

19 - Functions#

df <- tibble::tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10),
)
# Rescale each column of `df` to the interval [0, 1] by hand:
#   (value - column minimum) / (column maximum - column minimum)
# Every reference on a line must name the same column; copy-and-paste code
# like this is easy to get wrong, which is the motivation for wrapping the
# computation in a function such as `rescale01()`.
df$a <- (df$a - min(df$a, na.rm = TRUE)) / (max(df$a, na.rm = TRUE) - min(df$a, na.rm = TRUE))
df$b <- (df$b - min(df$b, na.rm = TRUE)) / (max(df$b, na.rm = TRUE) - min(df$b, na.rm = TRUE))
df$c <- (df$c - min(df$c, na.rm = TRUE)) / (max(df$c, na.rm = TRUE) - min(df$c, na.rm = TRUE))
df$d <- (df$d - min(df$d, na.rm = TRUE)) / (max(df$d, na.rm = TRUE) - min(df$d, na.rm = TRUE))
df
A tibble: 10 × 4
abcd
<dbl><dbl><dbl><dbl>
0.26034530.00000000.522913660.17190835
0.44286650.39117150.000000000.34864602
0.76797510.29588800.613012330.00000000
0.00000000.32565570.038463921.00000000
0.95977660.34090090.303982130.38970973
0.75160230.26711971.000000000.14085417
1.00000000.43525290.351955090.08903446
0.31994400.53704800.574221990.77900010
0.71300830.07144320.378545170.07351166
0.16292080.12354440.532721030.50560502
# Linearly rescale `x` so that its smallest non-missing value becomes 0 and
# its largest becomes 1. Missing values are ignored when finding the
# extremes and stay missing in the result.
rescale01 <- function (x) {
  lowest  <- min(x, na.rm = TRUE)
  highest <- max(x, na.rm = TRUE)
  (x - lowest) / (highest - lowest)
}

rescale01(c(0, 5, 10))
rescale01(c(-10, 0, 10))
rescale01(c(1, 2, 3, NA, 5))
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.25
  3. 0.5
  4. <NA>
  5. 1
df <- tibble::tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10),
)
df$a <- rescale01(df$a)
df$b <- rescale01(df$b)
df$c <- rescale01(df$c)
df$d <- rescale01(df$d)
df
A tibble: 10 × 4
abcd
<dbl><dbl><dbl><dbl>
0.56204210.47726770.000000000.2779846
0.00000000.14814860.200800520.2611846
0.31201530.83966130.045487430.4313317
0.53195230.62284010.470508750.9074133
0.45308020.34826870.333158230.1256405
0.43333811.00000000.331407200.5639065
0.47211710.57060660.144474560.7455952
0.14155880.00000000.154907171.0000000
1.00000000.39487591.000000000.0000000
0.32310580.60696660.738183490.9683428
x <- c(1:10, Inf)

rescale01(x)
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. NaN
# Linearly rescale `x` to [0, 1] using only its finite values to find the
# extremes (`finite = TRUE` drops NA, NaN and +/-Inf from the range).
# Infinite inputs therefore stay infinite in the result.
rescale01 <- function (x) {
  bounds <- range(x, na.rm = TRUE, finite = TRUE)
  span   <- bounds[2] - bounds[1]
  (x - bounds[1]) / span
}

x <- c(1:10, Inf)

rescale01(x)
  1. 0
  2. 0.111111111111111
  3. 0.222222222222222
  4. 0.333333333333333
  5. 0.444444444444444
  6. 0.555555555555556
  7. 0.666666666666667
  8. 0.777777777777778
  9. 0.888888888888889
  10. 1
  11. Inf

Exercise 19.2.1#

1#

[1] Why is TRUE not a parameter to rescale01? What would happen if x contained a single missing value, and na.rm was FALSE?

# Linearly rescale `x` to [0, 1], exposing the two switches of `range()`.
#   remove_missing_data - passed to `range()` as `na.rm`
#   finite_data         - passed to `range()` as `finite`; when TRUE the range
#                         is taken over finite values only
# The calls below exercise all four combinations of the two switches.
rescale01 <- function (x, remove_missing_data = TRUE, finite_data = TRUE) {
  limits <- range(x, na.rm = remove_missing_data, finite = finite_data)
  lower  <- limits[1]
  width  <- limits[2] - lower
  (x - lower) / width
}

rescale01(c(0, 5, 10),       FALSE, FALSE)
rescale01(c(-10, 0, 10),     FALSE, FALSE)
rescale01(c(1, 2, 3, NA, 5), FALSE, FALSE)
rescale01(c(1:10, Inf),      FALSE, FALSE)

rescale01(c(0, 5, 10),       FALSE, TRUE)
rescale01(c(-10, 0, 10),     FALSE, TRUE)
rescale01(c(1, 2, 3, NA, 5), FALSE, TRUE)
rescale01(c(1:10, Inf),      FALSE, TRUE)

rescale01(c(0, 5, 10),       TRUE, FALSE)
rescale01(c(-10, 0, 10),     TRUE, FALSE)
rescale01(c(1, 2, 3, NA, 5), TRUE, FALSE)
rescale01(c(1:10, Inf),      TRUE, FALSE)

rescale01(c(0, 5, 10),       TRUE, TRUE)
rescale01(c(-10, 0, 10),     TRUE, TRUE)
rescale01(c(1, 2, 3, NA, 5), TRUE, TRUE)
rescale01(c(1:10, Inf),      TRUE, TRUE)
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.5
  3. 1
  1. <NA>
  2. <NA>
  3. <NA>
  4. <NA>
  5. <NA>
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. NaN
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.25
  3. 0.5
  4. <NA>
  5. 1
  1. 0
  2. 0.111111111111111
  3. 0.222222222222222
  4. 0.333333333333333
  5. 0.444444444444444
  6. 0.555555555555556
  7. 0.666666666666667
  8. 0.777777777777778
  9. 0.888888888888889
  10. 1
  11. Inf
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.25
  3. 0.5
  4. <NA>
  5. 1
  1. 0
  2. 0
  3. 0
  4. 0
  5. 0
  6. 0
  7. 0
  8. 0
  9. 0
  10. 0
  11. NaN
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.5
  3. 1
  1. 0
  2. 0.25
  3. 0.5
  4. <NA>
  5. 1
  1. 0
  2. 0.111111111111111
  3. 0.222222222222222
  4. 0.333333333333333
  5. 0.444444444444444
  6. 0.555555555555556
  7. 0.666666666666667
  8. 0.777777777777778
  9. 0.888888888888889
  10. 1
  11. Inf

2#

[2] In the second variant of rescale01(), infinite values are left unchanged. Rewrite rescale01() so that -Inf is mapped to 0, and Inf is mapped to 1.

# Linearly rescale `x` to [0, 1] based on its finite values, then pin the
# infinite results to the ends of the interval: -Inf becomes 0, Inf becomes 1.
rescale01 <- function (x) {
  finite_rng <- range(x, na.rm = TRUE, finite = TRUE)
  scaled     <- (x - finite_rng[1]) / (finite_rng[2] - finite_rng[1])
  # `is.infinite()` is FALSE for NA/NaN, so missing results are left alone
  scaled[is.infinite(scaled) & scaled < 0] <- 0
  scaled[is.infinite(scaled) & scaled > 0] <- 1
  scaled
}

x <- c(1:10, Inf)

rescale01(x)
  1. 0
  2. 0.111111111111111
  3. 0.222222222222222
  4. 0.333333333333333
  5. 0.444444444444444
  6. 0.555555555555556
  7. 0.666666666666667
  8. 0.777777777777778
  9. 0.888888888888889
  10. 1
  11. 1

3#

[3] Practice turning the following code snippets into functions. Think about what each function does. What would you call it? How many arguments does it need? Can you rewrite it to be more expressive or less duplicative?

x <- c(1:10, NA, NA)
x

# proportion of elements of `x` that are missing (`NA`)
prop_na <- function (x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x)
}

prop_na(x)
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
  11. <NA>
  12. <NA>
0.166666666666667
x <- c(1:10, NA, NA)
x

# standardization: divide every element by the total so the result sums to one
#   na.rm - forwarded to `sum()`; when FALSE a single NA makes every result NA
sum_to_one <- function (x, na.rm = FALSE) {
  total <- sum(x, na.rm = na.rm)
  x / total
}

sum_to_one(x)
sum_to_one(x, TRUE)
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
  11. <NA>
  12. <NA>
  1. <NA>
  2. <NA>
  3. <NA>
  4. <NA>
  5. <NA>
  6. <NA>
  7. <NA>
  8. <NA>
  9. <NA>
  10. <NA>
  11. <NA>
  12. <NA>
  1. 0.0181818181818182
  2. 0.0363636363636364
  3. 0.0545454545454545
  4. 0.0727272727272727
  5. 0.0909090909090909
  6. 0.109090909090909
  7. 0.127272727272727
  8. 0.145454545454545
  9. 0.163636363636364
  10. 0.181818181818182
  11. <NA>
  12. <NA>
# coefficient of variation: standard deviation divided by the mean
#   na.rm - forwarded to both `sd()` and `mean()`
coef_variation <- function (x, na.rm = FALSE) {
  centre <- mean(x, na.rm = na.rm)
  spread <- sd(x, na.rm = na.rm)
  spread / centre
}

x <- c(1:10, NA, NA)

coef_variation(x)
coef_variation(x, TRUE)
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
  11. <NA>
  12. <NA>
<NA>
0.55048188256318

4#

[4] Write your own functions to compute the variance and skewness of a numeric vector.

\( \begin{aligned} \text{Var}(x) = \frac{1}{n - 1} \sum_{i=1}^n (x_i - \bar{x})^2 \end{aligned} \)

where

\( \begin{aligned} \bar{x} = \frac{1}{n} \sum_i^n x_i \end{aligned} \)

\( \begin{aligned} \text{Skew}(x) = \frac{\frac{1}{n - 2} (\sum_{i=1}^n (x_i - \bar{x})^3)}{\text{Var}(x)^\frac{3}{2}} \end{aligned} \)

# Sample variance of a numeric vector: sum of squared deviations from the
# mean divided by (n - 1).
#   x     - numeric vector
#   na.rm - when TRUE, missing values are dropped before anything is computed,
#           so they affect neither the count `n` nor the sum; when FALSE any
#           missing value makes the result NA
variance <- function (x, na.rm = FALSE) {
  if (na.rm) {
    # drop NAs up front so `n`, the mean and the sum all agree
    x <- x[!is.na(x)]
  }
  n      <- length(x)
  m      <- mean(x)
  sq_err <- (x - m)^2
  sum(sq_err) / (n - 1)
}

var(1:10)
variance(1:10)
9.16666666666667
9.16666666666667
# Skewness of a numeric vector, following the formula stated above:
#   (sum((x - mean)^3) / (n - 2)) / Var(x)^(3/2)
#   x     - numeric vector
#   na.rm - when TRUE, missing values are dropped before anything is computed,
#           so they affect neither the count `n` nor the sums; when FALSE any
#           missing value makes the result NA
skewness <- function (x, na.rm = FALSE) {
  if (na.rm) {
    # drop NAs up front so `n`, the mean, the variance and the sum all agree
    x <- x[!is.na(x)]
  }
  n <- length(x)
  m <- mean(x)
  v <- var(x)
  (sum((x - m)^3) / (n - 2)) / v^(3/2)
}

skewness(c(1, 2, 3, 100))
1.49875099748886

5#

[5] Write both_na(), a function that takes two vectors of the same length and returns the number of positions that have an NA in both vectors.

# Count the positions at which `x` and `y` are both missing.
both_na <- function (x, y) {
  missing_in_both <- is.na(x) & is.na(y)
  length(which(missing_in_both))
}
x <- c(NA, NA,  1, 2)
y <- c(NA,  1, NA, 2)

both_na(x, y)
1
x <- c(NA, NA,  1, 2, NA, NA, 1)
y <- c(NA,  1, NA, 2, NA, NA, 1)

both_na(x, y)
3

6#

[6] What do the following functions do? Why are they useful even though they are so short?

is_directory <- function (x) file.info(x)$isdir
is_readable  <- function (x) file.access(x, 4) == 0
# `is_directory()` answers "is this path a directory?" by returning the `isdir`
# field that `file.info()` reports for each path in `x`.
is_directory <- function (x) {
  file.info(x)[["isdir"]]
}
# `is_readable()` answers "can this path be read?": `file.access()` with mode 4
# tests read permission and returns 0 on success.
is_readable <- function (x) {
  file.access(x, mode = 4) == 0
}

7#

[7] Read the complete lyrics to “Little Bunny Foo Foo”. There’s a lot of duplication in this song. Extend the initial piping example to recreate the complete song, and use functions to reduce the duplication.

19.4 Conditional execution#

# Here's a simple function that uses an `if` statement.
# The goal of this function is to return a logical vector describing whether or not each element of a vector is named.
# This function takes advantage of the standard return rule: a function returns the last value that it computed. Here that is either one of the two branches of the `if` statement.

# For each element of `x`, report whether it carries a usable name.
# An object with no names attribute yields all FALSE; otherwise an element
# counts as named when its name is neither NA nor the empty string.
has_name <- function (x) {
  labels <- names(x)
  if (!is.null(labels)) {
    return(!(is.na(labels) | labels == ""))
  }
  logical(length(x))
}

19.5 Function arguments#

# Compute confidence interval around mean using normal approximation

# Two-sided confidence interval for the mean of `x` (normal approximation).
#   x    - numeric vector
#   conf - confidence level; the default 0.95 gives a 95% interval
# Returns a length-2 vector: lower bound, upper bound.
mean_ci <- function (x, conf = 0.95) {
  alpha     <- 1 - conf
  std_error <- sd(x) / sqrt(length(x))
  z_scores  <- qnorm(c(alpha / 2, 1 - alpha / 2))
  mean(x) + z_scores * std_error
}

x <- runif(100)

mean_ci(x)
mean_ci(x, conf = 0.99)
  1. 0.460097864269066
  2. 0.574181602919688
  1. 0.44217400988909
  2. 0.592105457299664

19.5.2 Checking values#

# weighted mean: sum of the values times their weights, divided by the sum of the weights
# (no length check here, so `x` and `w` of different lengths are silently recycled)
wt_mean <- function (x, w) {
  weighted_total <- sum(x * w)
  weighted_total / sum(w)
}

# weighted variance: weighted average of the squared deviations from the weighted mean
wt_var  <- function (x, w) {
  deviation <- x - wt_mean(x, w)
  sum(w * deviation^2) / sum(w)
}

# weighted standard deviation: square root of the weighted variance
wt_sd   <- function (x, w) {
  weighted_variance <- wt_var(x, w)
  sqrt(weighted_variance)
}
# What happens if `x` and `w` are not the same length?
#   In this case, we don't get an error because of R's recycling rules.

x <- 1:6
w <- 1:3

x
w

wt_mean(x, w)
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  1. 1
  2. 2
  3. 3
7.66666666666667
# weighted mean: sum of the values times their weights, divided by the sum of the weights
# stops with an error when `x` and `w` differ in length, instead of letting R recycle
wt_mean <- function (x, w) {
  same_length <- length(x) == length(w)
  if (!same_length) {
    stop("`x` and `w` must be the same length", call. = FALSE)
  }
  weighted_total <- sum(x * w)
  weighted_total / sum(w)
}

# weighted variance: weighted average of the squared deviations from the weighted mean
wt_var  <- function (x, w) {
  centre      <- wt_mean(x, w)
  sq_distance <- (x - centre)^2
  sum(w * sq_distance) / sum(w)
}

# weighted standard deviation: square root of the weighted variance
wt_sd   <- function (x, w) {
  spread_squared <- wt_var(x, w)
  sqrt(spread_squared)
}

20 - Vectors#

letters
  1. 'a'
  2. 'b'
  3. 'c'
  4. 'd'
  5. 'e'
  6. 'f'
  7. 'g'
  8. 'h'
  9. 'i'
  10. 'j'
  11. 'k'
  12. 'l'
  13. 'm'
  14. 'n'
  15. 'o'
  16. 'p'
  17. 'q'
  18. 'r'
  19. 's'
  20. 't'
  21. 'u'
  22. 'v'
  23. 'w'
  24. 'x'
  25. 'y'
  26. 'z'
typeof(letters)
'character'
1:10
  1. 1
  2. 2
  3. 3
  4. 4
  5. 5
  6. 6
  7. 7
  8. 8
  9. 9
  10. 10
typeof(1:10)
'integer'
x <- list('a', 'b', 1:10)
x
  1. 'a'
  2. 'b'
    1. 1
    2. 2
    3. 3
    4. 4
    5. 5
    6. 6
    7. 7
    8. 8
    9. 9
    10. 10
length(x)
3

Logical#

1:10 %% 3 == 0
  1. FALSE
  2. FALSE
  3. TRUE
  4. FALSE
  5. FALSE
  6. TRUE
  7. FALSE
  8. FALSE
  9. TRUE
  10. FALSE
c(TRUE, TRUE, FALSE, NA)
  1. TRUE
  2. TRUE
  3. FALSE
  4. <NA>

Numeric#

typeof(1)
'double'
typeof(1L)
'integer'
1.5L
1.5
x <- sqrt(x = 2) ^ 2
x
x - 2
x == 2
near(x = x, y = 2)
2
4.44089209850063e-16
FALSE
TRUE
c(-1, 0, 1) / 0
  1. -Inf
  2. NaN
  3. Inf
is.finite(0)
is.finite(Inf)
is.finite(-Inf)
is.finite(NA)
is.finite(NaN)
TRUE
FALSE
FALSE
FALSE
FALSE
is.infinite(0)
is.infinite(Inf)
is.infinite(-Inf)
is.infinite(NA)
is.infinite(NaN)
FALSE
TRUE
TRUE
FALSE
FALSE
is.na(0)
is.na(Inf)
is.na(-Inf)
is.na(NA)
is.na(NaN)
FALSE
FALSE
FALSE
TRUE
TRUE
is.nan(0)
is.nan(Inf)
is.nan(-Inf)
is.nan(NA)
is.nan(NaN)
FALSE
FALSE
FALSE
FALSE
TRUE

Character#

# `y` doesn't take up 1000x as much memory as `x`, because each element of `y` is just a pointer to that same string
# a pointer is 8 bytes, so 1000 pointers to a 152 B string is 8 x 1000 + 152 = 8.14 kB

x <- 'This is a reasonably long string.'
pryr::object_size(x)

y <- rep(x, 1000)
pryr::object_size(y)
152 B
8.14 kB

Missing values#

# each type of atomic vector has its own missing value

NA            # logical
NA_integer_   # integer
NA_real_      # double
NA_character_ # character
<NA>
<NA>
<NA>
NA

20.4 - Using atomic vectors#

20.4.1 - Coercion#

x <- sample(20, 100, replace = TRUE)
x
  1. 16
  2. 20
  3. 11
  4. 5
  5. 18
  6. 19
  7. 6
  8. 17
  9. 2
  10. 12
  11. 12
  12. 12
  13. 5
  14. 9
  15. 17
  16. 18
  17. 9
  18. 19
  19. 3
  20. 11
  21. 4
  22. 3
  23. 1
  24. 12
  25. 11
  26. 7
  27. 6
  28. 13
  29. 10
  30. 14
  31. 15
  32. 9
  33. 4
  34. 2
  35. 9
  36. 7
  37. 20
  38. 11
  39. 16
  40. 9
  41. 14
  42. 18
  43. 9
  44. 19
  45. 16
  46. 19
  47. 7
  48. 16
  49. 4
  50. 18
  51. 16
  52. 14
  53. 1
  54. 14
  55. 4
  56. 10
  57. 6
  58. 5
  59. 17
  60. 11
  61. 7
  62. 9
  63. 17
  64. 15
  65. 2
  66. 18
  67. 4
  68. 11
  69. 9
  70. 6
  71. 4
  72. 12
  73. 11
  74. 9
  75. 19
  76. 8
  77. 13
  78. 5
  79. 6
  80. 4
  81. 6
  82. 5
  83. 2
  84. 20
  85. 4
  86. 18
  87. 11
  88. 2
  89. 10
  90. 16
  91. 19
  92. 4
  93. 11
  94. 15
  95. 3
  96. 20
  97. 19
  98. 13
  99. 10
  100. 9
y <- x > 10
y
  1. TRUE
  2. TRUE
  3. TRUE
  4. FALSE
  5. TRUE
  6. TRUE
  7. FALSE
  8. TRUE
  9. FALSE
  10. TRUE
  11. TRUE
  12. TRUE
  13. FALSE
  14. FALSE
  15. TRUE
  16. TRUE
  17. FALSE
  18. TRUE
  19. FALSE
  20. TRUE
  21. FALSE
  22. FALSE
  23. FALSE
  24. TRUE
  25. TRUE
  26. FALSE
  27. FALSE
  28. TRUE
  29. FALSE
  30. TRUE
  31. TRUE
  32. FALSE
  33. FALSE
  34. FALSE
  35. FALSE
  36. FALSE
  37. TRUE
  38. TRUE
  39. TRUE
  40. FALSE
  41. TRUE
  42. TRUE
  43. FALSE
  44. TRUE
  45. TRUE
  46. TRUE
  47. FALSE
  48. TRUE
  49. FALSE
  50. TRUE
  51. TRUE
  52. TRUE
  53. FALSE
  54. TRUE
  55. FALSE
  56. FALSE
  57. FALSE
  58. FALSE
  59. TRUE
  60. TRUE
  61. FALSE
  62. FALSE
  63. TRUE
  64. TRUE
  65. FALSE
  66. TRUE
  67. FALSE
  68. TRUE
  69. FALSE
  70. FALSE
  71. FALSE
  72. TRUE
  73. TRUE
  74. FALSE
  75. TRUE
  76. FALSE
  77. TRUE
  78. FALSE
  79. FALSE
  80. FALSE
  81. FALSE
  82. FALSE
  83. FALSE
  84. TRUE
  85. FALSE
  86. TRUE
  87. TRUE
  88. FALSE
  89. FALSE
  90. TRUE
  91. TRUE
  92. FALSE
  93. TRUE
  94. TRUE
  95. FALSE
  96. TRUE
  97. TRUE
  98. TRUE
  99. FALSE
  100. FALSE
# How many are greater than 10?

sum(y)
51
# What proportion are greater than 10?

mean(y)
0.51
typeof(c(TRUE, 1L))
'integer'
typeof(c(1L, 1.5))
'double'
typeof(c(1.5, 'a'))
'character'

20.4.2 - Test functions#

purrr::is_logical(TRUE)
purrr::is_logical(1L)
purrr::is_logical(1.5)
purrr::is_logical('a')
TRUE
FALSE
FALSE
FALSE
purrr::is_integer(TRUE)
purrr::is_integer(1L)
purrr::is_integer(1.5)
purrr::is_integer('a')
FALSE
TRUE
FALSE
FALSE
purrr::is_double(TRUE)
purrr::is_double(1L)
purrr::is_double(1.5)
purrr::is_double('a')
FALSE
FALSE
TRUE
FALSE
purrr::is_character(TRUE)
purrr::is_character(1L)
purrr::is_character(1.5)
purrr::is_character('a')
FALSE
FALSE
FALSE
TRUE
purrr::is_atomic(TRUE)
purrr::is_atomic(1L)
purrr::is_atomic(1.5)
purrr::is_atomic('a')
TRUE
TRUE
TRUE
TRUE
purrr::is_list(TRUE)
purrr::is_list(1L)
purrr::is_list(1.5)
purrr::is_list('a')
FALSE
FALSE
FALSE
FALSE
purrr::is_vector(TRUE)
purrr::is_vector(1L)
purrr::is_vector(1.5)
purrr::is_vector('a')
TRUE
TRUE
TRUE
TRUE
sample(x = 10) + 100
  1. 109
  2. 106
  3. 102
  4. 108
  5. 110
  6. 101
  7. 103
  8. 105
  9. 104
  10. 107
runif(n = 10) > 0.5
  1. TRUE
  2. TRUE
  3. FALSE
  4. FALSE
  5. FALSE
  6. TRUE
  7. FALSE
  8. FALSE
  9. FALSE
  10. FALSE
1:10 + 1:2
  1. 2
  2. 4
  3. 4
  4. 6
  5. 6
  6. 8
  7. 8
  8. 10
  9. 10
  10. 12
1:10 + 1:3
Warning message in 1:10 + 1:3:
"longitud de objeto mayor no es m'ultiplo de la longitud de uno menor"
  1. 2
  2. 4
  3. 6
  4. 5
  5. 7
  6. 9
  7. 8
  8. 10
  9. 12
  10. 11
tibble(x = 1:4, y = 1:2)
Error in `tibble()`:
! Tibble columns must have compatible sizes.
* Size 4: Existing data.
* Size 2: Column `y`.
i Only values of size one are recycled.
Traceback:

1. tibble(x = 1:4, y = 1:2)
2. tibble_quos(xs, .rows, .name_repair)
3. vectbl_recycle_rows(res, first_size, j, given_col_names[[j]], 
 .     call)
4. abort_incompatible_size(n, name, size, "Existing data", call)
5. tibble_abort(call = call, bullets("Tibble columns must have compatible sizes:", 
 .     if (!is.null(.rows)) paste0("Size ", .rows, ": ", rows_source), 
 .     problems, info = "Only values of size one are recycled."))
6. abort(x, class, ..., call = call, parent = parent, use_cli_format = TRUE)
7. signal_abort(cnd, .file)
tibble(x = 1:4, y = rep(1:2, 2))
A tibble: 4 x 2
xy
<int><int>
11
22
31
42
tibble(x = 1:4, y = rep(1:2, each = 2))
A tibble: 4 x 2
xy
<int><int>
11
21
32
42

20.4.4 - Naming vectors#

c(x = 1, y = 2, z = 4)
x
1
y
2
z
4
purrr::set_names(1:3, c('a', 'b', 'c'))
a
1
b
2
c
3

20.4.5 - Subsetting#

x <- c('one', 'two', 'three', 'four', 'five')
x
x[c(3,2,5)]
x[c(1,1,5,5,5,2)]
x[c(-1,-3,-5)]
x[0]
  1. 'one'
  2. 'two'
  3. 'three'
  4. 'four'
  5. 'five'
  1. 'three'
  2. 'two'
  3. 'five'
  1. 'one'
  2. 'one'
  3. 'five'
  4. 'five'
  5. 'five'
  6. 'two'
  1. 'two'
  2. 'four'
x <- c(10, 3, NA, 5, 8, 1, NA)
x
x[!is.na(x)]
x[x %% 2 == 0]
  1. 10
  2. 3
  3. <NA>
  4. 5
  5. 8
  6. 1
  7. <NA>
  1. 10
  2. 3
  3. 5
  4. 8
  5. 1
  1. 10
  2. <NA>
  3. 8
  4. <NA>
x <- c(abc = 1, def = 2, xyz = 5)
x
x[c('xyz', 'def')]
abc
1
def
2
xyz
5
xyz
5
def
2

20.5 - Recursive vectors (lists)#

x <- list(1, 2, 3)
x
  1. 1
  2. 2
  3. 3
str(object = x)
List of 3
 $ : num 1
 $ : num 2
 $ : num 3
x_named <- list(a = 1, b = 2, c = 3)
x_named
$a
1
$b
2
$c
3
str(object = x_named)
List of 3
 $ a: num 1
 $ b: num 2
 $ c: num 3
y <- list('a', 1L, 1.5, TRUE)
y
  1. 'a'
  2. 1
  3. 1.5
  4. TRUE
str(object = y)
List of 4
 $ : chr "a"
 $ : int 1
 $ : num 1.5
 $ : logi TRUE

20.5.1 - Visualizing lists#

x1 <- list(c(1, 2), c(3, 4))
x2 <- list(list(1, 2), list(3, 4))
x3 <- list(1, list(2, list(3)))

x1
x2
x3

str(x1)
str(x2)
str(x3)
    1. 1
    2. 2
    1. 3
    2. 4
    1. 1
    2. 2
    1. 3
    2. 4
  1. 1
    1. 2
      1. 3
List of 2
 $ : num [1:2] 1 2
 $ : num [1:2] 3 4
List of 2
 $ :List of 2
  ..$ : num 1
  ..$ : num 2
 $ :List of 2
  ..$ : num 3
  ..$ : num 4
List of 2
 $ : num 1
 $ :List of 2
  ..$ : num 2
  ..$ :List of 1
  .. ..$ : num 3

20.5.2 - Subsetting#

a <- list(a = 1:3, b = 'a string', c = pi, d = list(-1, -5))
a
$a
  1. 1
  2. 2
  3. 3
$b
'a string'
$c
3.14159265358979
$d
  1. -1
  2. -5
a[1:2]
$a
  1. 1
  2. 2
  3. 3
$b
'a string'
a[4]
$d =
  1. -1
  2. -5
a[[4]]
  1. -1
  2. -5
a[[4]][1]
  1. -1
a[[4]][[1]]
-1
a$a
  1. 1
  2. 2
  3. 3
a[['a']]
  1. 1
  2. 2
  3. 3

20.6 - Attributes#

x <- 1:10

attr(x, 'greeting')
attr(x, 'greeting') <- 'Hi!'
attr(x, 'farewell') <- 'Bye!'
attributes(x)
NULL
$greeting
'Hi!'
$farewell
'Bye!'
# the call to "UseMethod" means that this is a generic function
# and it will call a specific method based on the class of the first argument

as.Date
function (x, ...) 
UseMethod("as.Date")
# list all the methods for a generic function with `methods()`

methods(generic.function = 'as.Date')
[1] as.Date.POSIXct*    as.Date.POSIXlt*    as.Date.character* 
[4] as.Date.default*    as.Date.factor*     as.Date.numeric*   
[7] as.Date.vctrs_sclr* as.Date.vctrs_vctr*
see '?methods' for accessing help and source code
# see the specific implementation of a method with `getS3method()`

getS3method(f = 'as.Date', class = 'default')
getS3method(f = 'as.Date', class = 'numeric')
function (x, ...) 
{
    if (inherits(x, "Date")) 
        x
    else if (is.null(x)) 
        .Date(numeric())
    else if (is.logical(x) && all(is.na(x))) 
        .Date(as.numeric(x))
    else stop(gettextf("do not know how to convert '%s' to class %s", 
        deparse1(substitute(x)), dQuote("Date")), domain = NA)
}
function (x, origin, ...) 
if (missing(origin)) .Date(x) else as.Date(origin, ...) + x

20.7 - Augmented vectors#

20.7.1 - Factors#

x <- factor(c('ab', 'cd', 'ab'), levels = c('ab', 'cd', 'ef'))
x
  1. ab
  2. cd
  3. ab
Levels:
  1. 'ab'
  2. 'cd'
  3. 'ef'
typeof(x)
'integer'
attributes(x)
$levels
  1. 'ab'
  2. 'cd'
  3. 'ef'
$class
'factor'

20.7.2 Dates and datetimes#

x <- as.Date('1971-01-01')
x

unclass(x = x)
typeof(x = x)
attributes(x = x)
365
'double'
$class = 'Date'
x <- lubridate::ymd_hm('1970-01-01 01:00')
x
[1] "1970-01-01 01:00:00 UTC"

Bibliography#

Wickham, Hadley; Mine Çetinkaya-Rundel; & Garrett Grolemund. R for Data Science: Import, Tidy, Transform, Visualize, and Model Data. 1st Ed. O’Reilly. Home.