Silge & Robinson’s Text Mining with R#

Silge, Julia & David Robinson. Text Mining with R: A Tidy Approach. O'Reilly. Book home page: https://www.tidytextmining.com/


Programming Environment#

base::library(package = forcats)
base::library(package = ggraph)
base::library(package = gutenbergr)
base::library(package = janeaustenr)
base::library(package = reshape2)
base::library(package = scales)
base::library(package = tidytext)
base::library(package = tidyverse)
base::library(package = widyr)
base::library(package = wordcloud)

utils::sessionInfo()
Loading required package: ggplot2
── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2      ✔ stringr   1.5.0
✔ lubridate 1.9.2      ✔ tibble    3.2.1
✔ purrr     1.0.2      ✔ tidyr     1.3.0
✔ readr     2.1.4     
── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ readr::col_factor() masks scales::col_factor()
✖ purrr::discard()    masks scales::discard()
✖ dplyr::filter()     masks stats::filter()
✖ dplyr::lag()        masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Loading required package: RColorBrewer
R version 4.3.0 (2023-04-21)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS 14.4.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] wordcloud_2.6      RColorBrewer_1.1-3 widyr_0.1.5        lubridate_1.9.2   
 [5] stringr_1.5.0      dplyr_1.1.2        purrr_1.0.2        readr_2.1.4       
 [9] tidyr_1.3.0        tibble_3.2.1       tidyverse_2.0.0    tidytext_0.4.1    
[13] scales_1.2.1       reshape2_1.4.4     janeaustenr_1.0.0  gutenbergr_0.2.3  
[17] ggraph_2.1.0       ggplot2_3.4.3      forcats_1.0.0     

loaded via a namespace (and not attached):
 [1] gtable_0.3.3       ggrepel_0.9.3      lattice_0.21-8     tzdb_0.4.0        
 [5] vctrs_0.6.3        tools_4.3.0        generics_0.1.3     fansi_1.0.4       
 [9] pkgconfig_2.0.3    tokenizers_0.3.0   Matrix_1.5-4       uuid_1.1-0        
[13] lifecycle_1.0.3    compiler_4.3.0     farver_2.1.1       munsell_0.5.0     
[17] ggforce_0.4.1      repr_1.1.6         graphlayouts_1.0.0 htmltools_0.5.5   
[21] SnowballC_0.7.1    pillar_1.9.0       crayon_1.5.2       MASS_7.3-58.4     
[25] viridis_0.6.3      tidyselect_1.2.0   digest_0.6.31      stringi_1.7.12    
[29] polyclip_1.10-4    fastmap_1.1.1      grid_4.3.0         colorspace_2.1-0  
[33] cli_3.6.1          magrittr_2.0.3     base64enc_0.1-3    tidygraph_1.2.3   
[37] utf8_1.2.3         IRdisplay_1.1      broom_1.0.5        withr_2.5.0       
[41] backports_1.4.1    IRkernel_1.3.2     timechange_0.2.0   igraph_1.5.0      
[45] gridExtra_2.3      pbdZMQ_0.3-9       hms_1.1.3          evaluate_0.21     
[49] viridisLite_0.4.2  rlang_1.1.1        Rcpp_1.0.10        glue_1.6.2        
[53] tweenr_2.0.2       jsonlite_1.8.5     R6_2.5.1           plyr_1.8.8        

1 - The tidy text format#

text <- c('Because I could not stop for Death -',
          'He kindly stopped for me -',
          'The Carriage held but just Ourselves -',
          'and Immortality')
base::print(text)
[1] "Because I could not stop for Death -"  
[2] "He kindly stopped for me -"            
[3] "The Carriage held but just Ourselves -"
[4] "and Immortality"                       
text_df <- tibble::tibble(line = 1:4, text = text)
text_df
A tibble: 4 × 2
  line  text
 <int>  <chr>
     1  Because I could not stop for Death -
     2  He kindly stopped for me -
     3  The Carriage held but just Ourselves -
     4  and Immortality
text_df %>%
  tidytext::unnest_tokens(output = word, input = text, to_lower = TRUE)
A tibble: 20 × 2
  line  word
 <int>  <chr>
     1  because
     1  i
     1  could
     1  not
     1  stop
     1  for
     1  death
     2  he
     2  kindly
     2  stopped
     2  for
     2  me
     3  the
     3  carriage
     3  held
     3  but
     3  just
     3  ourselves
     4  and
     4  immortality
original_books <- janeaustenr::austen_books() %>%
  dplyr::group_by(book) %>%
  dplyr::mutate(linenumber = dplyr::row_number(),
                chapter    = base::cumsum(stringr::str_detect(string  = text,
                                                              pattern = stringr::regex(pattern     = '^chapter [\\divxlc]',
                                                                                       ignore_case = TRUE)))) %>%
  dplyr::ungroup()

head(original_books)
A tibble: 6 × 4
  text                   book                 linenumber  chapter
  <chr>                  <fct>                     <int>    <int>
  SENSE AND SENSIBILITY  Sense & Sensibility           1        0
  ""                     Sense & Sensibility           2        0
  by Jane Austen         Sense & Sensibility           3        0
  ""                     Sense & Sensibility           4        0
  (1811)                 Sense & Sensibility           5        0
  ""                     Sense & Sensibility           6        0
tidy_books <-
  original_books %>%
    tidytext::unnest_tokens(output = word, input = text)

head(tidy_books)
A tibble: 6 × 4
  book                 linenumber  chapter  word
  <fct>                     <int>    <int>  <chr>
  Sense & Sensibility           1        0  sense
  Sense & Sensibility           1        0  and
  Sense & Sensibility           1        0  sensibility
  Sense & Sensibility           3        0  by
  Sense & Sensibility           3        0  jane
  Sense & Sensibility           3        0  austen
tidy_books <-
  tidy_books %>%
    dplyr::anti_join(y = tidytext::stop_words, by = 'word')

head(tidy_books)
A tibble: 6 × 4
  book                 linenumber  chapter  word
  <fct>                     <int>    <int>  <chr>
  Sense & Sensibility           1        0  sense
  Sense & Sensibility           1        0  sensibility
  Sense & Sensibility           3        0  jane
  Sense & Sensibility           3        0  austen
  Sense & Sensibility           5        0  1811
  Sense & Sensibility          10        1  chapter
tidy_books %>%
  dplyr::count(word, sort = TRUE) %>%
  head()
A tibble: 6 × 2
  word       n
  <chr>  <int>
  miss    1855
  time    1337
  fanny    862
  dear     822
  lady     817
  sir      806
tidy_books %>%
  dplyr::count(word, sort = TRUE) %>%
  dplyr::filter(n > 600) %>%
  dplyr::mutate(word = stats::reorder(x = word, X = n)) %>%
  ggplot2::ggplot(ggplot2::aes(x = n, y = word)) +
    ggplot2::geom_col() +
    ggplot2::labs(y = NULL)
[Figure: bar chart of the most common words (n > 600) in Jane Austen's novels]
hgwells <- gutenbergr::gutenberg_download(c(35, 36, 5230, 159))
Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
Using mirror http://aleph.gutenberg.org
Warning message:
“! Could not download a book at http://aleph.gutenberg.org/1/5/159/159.zip.
 The book may have been archived.
 Alternatively, You may need to select a different mirror.
→ See https://www.gutenberg.org/MIRRORS.ALL for options.”
tidy_hgwells <-
  hgwells %>%
    tidytext::unnest_tokens(output = word, input = text) %>%
    dplyr::anti_join(y = stop_words, by = 'word')

head(tidy_hgwells)
A tibble: 6 × 2
  gutenberg_id  word
         <int>  <chr>
            35  time
            35  machine
            35  invention
            35  contents
            35  introduction
            35  ii
tidy_hgwells %>%
  dplyr::count(word, sort = TRUE) %>%
  head()
A tibble: 6 × 2
  word           n
  <chr>      <int>
  time         396
  people       249
  door         224
  kemp         213
  invisible    197
  black        178
bronte <- gutenbergr::gutenberg_download(c(1260, 768, 969, 9182, 767))
tidy_bronte <-
  bronte %>%
    tidytext::unnest_tokens(output = word, input = text) %>%
    dplyr::anti_join(y = tidytext::stop_words, by = 'word')

head(tidy_bronte)
A tibble: 6 × 2
  gutenberg_id  word
         <int>  <chr>
           767  agnes
           767  grey
           767  acton
           767  bell
           767  london
           767  thomas
tidy_bronte %>%
  dplyr::count(word, sort = TRUE) %>%
  head()
A tibble: 6 × 2
  word       n
  <chr>  <int>
  time    1065
  miss     854
  day      825
  don’t    780
  hand     767
  eyes     714
frequency <- dplyr::bind_rows(dplyr::mutate(tidy_bronte,  author = 'Brontë Sisters'),
                              dplyr::mutate(tidy_hgwells, author = 'H. G. Wells'),
                              dplyr::mutate(tidy_books,   author = 'Jane Austen')) %>%
  dplyr::mutate(word = stringr::str_extract(string = word, pattern = "[a-z']+")) %>%
  dplyr::count(author, word) %>%
  dplyr::group_by(author) %>%
  dplyr::mutate(proportion = n / base::sum(n)) %>%
  dplyr::select(-n) %>%
  tidyr::pivot_wider(names_from = author, values_from = proportion) %>%
  tidyr::pivot_longer(`Brontë Sisters`:`H. G. Wells`, names_to = 'author', values_to = 'proportion')

head(frequency)
A tibble: 6 × 4
  word   `Jane Austen`  author          proportion
  <chr>          <dbl>  <chr>                <dbl>
  a       9.190796e-06  Brontë Sisters  6.645869e-05
  a       9.190796e-06  H. G. Wells     1.911571e-05
  aback             NA  Brontë Sisters  3.909335e-06
  aback             NA  H. G. Wells     1.911571e-05
  abaht             NA  Brontë Sisters  3.909335e-06
  abaht             NA  H. G. Wells               NA
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = 'gray40', lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = 'darkslategray4', high = 'gray75') +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position = 'none') +
  labs(y = 'Jane Austen', x = NULL)
Warning message:
“Removed 39790 rows containing missing values (`geom_point()`).”
Warning message:
“Removed 39792 rows containing missing values (`geom_text()`).”
[Figure: word-frequency comparison of the Brontë sisters and H. G. Wells against Jane Austen, on log-log percent scales]
stats::cor.test(data = frequency[frequency$author == 'Brontë Sisters',], ~proportion + `Jane Austen`)
	Pearson's product-moment correlation

data:  proportion and Jane Austen
t = 110.73, df = 10275, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.7286645 0.7462983
sample estimates:
      cor 
0.7376071 
stats::cor.test(data = frequency[frequency$author == 'H. G. Wells',], ~proportion + `Jane Austen`)
	Pearson's product-moment correlation

data:  proportion and Jane Austen
t = 33.563, df = 5457, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3914204 0.4354038
sample estimates:
      cor 
0.4136534 

2 - Sentiment analysis with tidy data#

tidytext::get_sentiments('afinn') %>%
  utils::head()
A tibble: 6 × 2
  word        value
  <chr>       <dbl>
  abandon        -2
  abandoned      -2
  abandons       -2
  abducted       -2
  abduction      -2
  abductions     -2
tidytext::get_sentiments('bing') %>%
  utils::head()
A tibble: 6 × 2
  word        sentiment
  <chr>       <chr>
  2-faces     negative
  abnormal    negative
  abolish     negative
  abominable  negative
  abominably  negative
  abominate   negative
tidytext::get_sentiments('nrc') %>%
  utils::head()
A tibble: 6 × 2
  word       sentiment
  <chr>      <chr>
  abacus     trust
  abandon    fear
  abandon    negative
  abandon    sadness
  abandoned  anger
  abandoned  fear
tidy_books <-
  original_books %>%
    tidytext::unnest_tokens(output = word, input = text)

utils::head(tidy_books)
A tibble: 6 × 4
  book                 linenumber  chapter  word
  <fct>                     <int>    <int>  <chr>
  Sense & Sensibility           1        0  sense
  Sense & Sensibility           1        0  and
  Sense & Sensibility           1        0  sensibility
  Sense & Sensibility           3        0  by
  Sense & Sensibility           3        0  jane
  Sense & Sensibility           3        0  austen
nrc_joy <-
  tidytext::get_sentiments('nrc') %>%
    dplyr::filter(sentiment == 'joy')

utils::head(nrc_joy)
A tibble: 6 × 2
  word           sentiment
  <chr>          <chr>
  absolution     joy
  abundance      joy
  abundant       joy
  accolade       joy
  accompaniment  joy
  accomplish     joy
tidy_books %>%
  dplyr::filter(book == 'Emma') %>%
  dplyr::inner_join(y = nrc_joy, by = 'word') %>%
  dplyr::count(word, sort = TRUE) %>%
  utils::head()
A tibble: 6 × 2
  word        n
  <chr>   <int>
  good      359
  friend    166
  hope      143
  happy     125
  love      117
  deal       92
jane_austen_sentiment <-
  tidy_books %>%
    dplyr::inner_join(y = tidytext::get_sentiments('bing'), by = 'word') %>%
    dplyr::count(book, index = linenumber %/% 80, sentiment) %>%
    tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
    dplyr::mutate(sentiment = positive - negative)

utils::head(jane_austen_sentiment)
Warning message in dplyr::inner_join(., y = tidytext::get_sentiments("bing"), by = "word"):
“Detected an unexpected many-to-many relationship between `x` and `y`.
 Row 435434 of `x` matches multiple rows in `y`.
 Row 5051 of `y` matches multiple rows in `x`.
 If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.”
A tibble: 6 × 5
  book                 index  negative  positive  sentiment
  <fct>                <dbl>     <int>     <int>      <int>
  Sense & Sensibility      0        16        32         16
  Sense & Sensibility      1        19        53         34
  Sense & Sensibility      2        12        31         19
  Sense & Sensibility      3        15        31         16
  Sense & Sensibility      4        16        34         18
  Sense & Sensibility      5        16        51         35
ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = 'free_x')
[Figure: sentiment (positive minus negative) across the narrative of each Austen novel, in 80-line chunks, faceted by book]
pride_prejudice <-
  tidy_books %>%
    dplyr::filter(book == 'Pride & Prejudice')

utils::head(pride_prejudice)
A tibble: 6 × 4
  book               linenumber  chapter  word
  <fct>                   <int>    <int>  <chr>
  Pride & Prejudice           1        0  pride
  Pride & Prejudice           1        0  and
  Pride & Prejudice           1        0  prejudice
  Pride & Prejudice           3        0  by
  Pride & Prejudice           3        0  jane
  Pride & Prejudice           3        0  austen
afinn <-
  pride_prejudice %>%
    dplyr::inner_join(y = tidytext::get_sentiments('afinn'), by = 'word') %>%
    dplyr::group_by(index = linenumber %/% 80) %>%
    dplyr::summarize(sentiment = base::sum(value)) %>%
    dplyr::mutate(method = 'AFINN')

utils::head(afinn)
A tibble: 6 × 3
  index  sentiment  method
  <dbl>      <dbl>  <chr>
      0         29  AFINN
      1          0  AFINN
      2         20  AFINN
      3         30  AFINN
      4         62  AFINN
      5         66  AFINN
bing_and_nrc <-
  dplyr::bind_rows(
    pride_prejudice %>%
      dplyr::inner_join(y = tidytext::get_sentiments('bing'), by = 'word') %>%
      dplyr::mutate(method = 'Bing et al.'),
    pride_prejudice %>%
      dplyr::inner_join(y = tidytext::get_sentiments('nrc') %>%
                              dplyr::filter(sentiment %in% c('positive', 'negative')), by = 'word', relationship = 'many-to-many') %>%
      dplyr::mutate(method = 'NRC')
  ) %>%
  dplyr::count(method, index = linenumber %/% 80, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)

utils::head(bing_and_nrc)
A tibble: 6 × 5
  method       index  negative  positive  sentiment
  <chr>        <dbl>     <int>     <int>      <int>
  Bing et al.      0         7        21         14
  Bing et al.      1        20        19         -1
  Bing et al.      2        16        20          4
  Bing et al.      3        19        31         12
  Bing et al.      4        23        47         24
  Bing et al.      5        15        49         34
dplyr::bind_rows(afinn, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~method, ncol = 1, scales = 'free_y')
[Figure: AFINN, Bing, and NRC sentiment trajectories for Pride & Prejudice, faceted by method]
tidytext::get_sentiments('nrc') %>%
  dplyr::filter(sentiment %in% c('positive', 'negative')) %>%
  dplyr::count(sentiment)
A tibble: 2 × 2
  sentiment      n
  <chr>      <int>
  negative    3316
  positive    2308
tidytext::get_sentiments('bing') %>%
  dplyr::count(sentiment)
A tibble: 2 × 2
  sentiment      n
  <chr>      <int>
  negative    4781
  positive    2005
bing_word_counts <-
  tidy_books %>%
    dplyr::inner_join(tidytext::get_sentiments('bing'), by = 'word', relationship = 'many-to-many') %>%
    dplyr::count(word, sentiment, sort = TRUE) %>%
    dplyr::ungroup()

utils::head(bing_word_counts)
A tibble: 6 × 3
  word    sentiment      n
  <chr>   <chr>      <int>
  miss    negative    1855
  well    positive    1523
  good    positive    1380
  great   positive     981
  like    positive     725
  better  positive     639
bing_word_counts %>%
  dplyr::group_by(sentiment) %>%
  dplyr::slice_max(n, n = 10) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(word = stats::reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = 'free_y') +
    labs(x = 'Contribution to sentiment', y = NULL)
[Figure: top words contributing to positive and negative sentiment (Bing lexicon)]
custom_stop_words <-
  dplyr::bind_rows(tibble::tibble(word = c('miss'), lexicon = c('custom')), tidytext::stop_words)

head(custom_stop_words)
A tibble: 6 × 2
  word   lexicon
  <chr>  <chr>
  miss   custom
  a      SMART
  a's    SMART
  able   SMART
  about  SMART
  above  SMART
tidy_books %>%
  dplyr::anti_join(y = tidytext::stop_words, by = 'word') %>%
  dplyr::count(word) %>%
  base::with(wordcloud::wordcloud(words = word, freq = n, max.words = 100))
[Figure: word cloud of the 100 most common non-stop words in Austen's novels]
tidy_books %>%
  dplyr::inner_join(y = tidytext::get_sentiments('bing'), by = 'word', relationship = 'many-to-many') %>%
  dplyr::count(word, sentiment, sort = TRUE) %>%
  reshape2::acast(formula = word ~ sentiment, value.var = 'n', fill = 0) %>%
  wordcloud::comparison.cloud(colors = c('gray20', 'gray80'), max.words = 100)
[Figure: comparison cloud of the most common positive and negative words (Bing lexicon)]
p_and_p_sentences <-
  tibble::tibble(text = prideprejudice) %>%
    tidytext::unnest_tokens(sentence, text, token = 'sentences')

p_and_p_sentences$sentence[2]
'by jane austen'
austen_chapters <-
  janeaustenr::austen_books() %>%
    dplyr::group_by(book) %>%
    tidytext::unnest_tokens(chapter, text, token = 'regex', pattern = 'Chapter|CHAPTER [\\dIVXLC]') %>%
    dplyr::ungroup()

austen_chapters %>%
  dplyr::group_by(book) %>%
  dplyr::summarize(chapters = n())
A tibble: 6 × 2
  book                 chapters
  <fct>                   <int>
  Sense & Sensibility        51
  Pride & Prejudice          62
  Mansfield Park             49
  Emma                       56
  Northanger Abbey           32
  Persuasion                 25
bingnegative <-
  tidytext::get_sentiments('bing') %>%
    dplyr::filter(sentiment == 'negative')

wordcounts <-
  tidy_books %>%
    dplyr::group_by(book, chapter) %>%
    dplyr::summarize(words = n())

tidy_books %>%
  dplyr::semi_join(y = bingnegative, by = 'word') %>%
  dplyr::group_by(book, chapter) %>%
  dplyr::summarize(negativewords = n()) %>%
  dplyr::left_join(y = wordcounts, by = c('book', 'chapter')) %>%
  dplyr::mutate(ratio = negativewords / words) %>%
  dplyr::filter(chapter != 0) %>%
  dplyr::slice_max(ratio, n = 1) %>%
  dplyr::ungroup()
`summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
`summarise()` has grouped output by 'book'. You can override using the `.groups` argument.
A tibble: 6 × 5
  book                 chapter  negativewords  words       ratio
  <fct>                  <int>          <int>  <int>       <dbl>
  Sense & Sensibility       43            161   3405  0.04728341
  Pride & Prejudice         34            111   2104  0.05275665
  Mansfield Park            46            173   3685  0.04694708
  Emma                      15            151   3340  0.04520958
  Northanger Abbey          21            149   2982  0.04996647
  Persuasion                 4             62   1807  0.03431101

3 - Analyzing word and document frequency: tf-idf#

book_words <-
  janeaustenr::austen_books() %>%
    tidytext::unnest_tokens(output = word, input = text) %>%
    dplyr::count(book, word, sort = TRUE)

utils::head(book_words)
A tibble: 6 × 3
  book            word       n
  <fct>           <chr>  <int>
  Mansfield Park  the     6206
  Mansfield Park  to      5475
  Mansfield Park  and     5438
  Emma            to      5239
  Emma            the     5201
  Emma            and     4896
total_words <-
  book_words %>%
    dplyr::group_by(book) %>%
    dplyr::summarize(total = base::sum(n))

total_words
A tibble: 6 × 2
  book                  total
  <fct>                 <int>
  Sense & Sensibility  119957
  Pride & Prejudice    122204
  Mansfield Park       160460
  Emma                 160996
  Northanger Abbey      77780
  Persuasion            83658
book_words <-
  dplyr::left_join(x = book_words, y = total_words)

utils::head(book_words)
Joining with `by = join_by(book)`
A tibble: 6 × 4
  book            word       n   total
  <fct>           <chr>  <int>   <int>
  Mansfield Park  the     6206  160460
  Mansfield Park  to      5475  160460
  Mansfield Park  and     5438  160460
  Emma            to      5239  160996
  Emma            the     5201  160996
  Emma            and     4896  160996
# TERM FREQUENCY

book_words %>%
  ggplot(aes(n / total, fill = book)) +
    geom_histogram(show.legend = FALSE) +
    xlim(NA, 0.0009) +
    facet_wrap(~book, ncol = 2, scales = 'free_y')
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning message:
“Removed 896 rows containing non-finite values (`stat_bin()`).”
Warning message:
“Removed 6 rows containing missing values (`geom_bar()`).”
[Figure: histograms of term frequency (n/total) for each Austen novel]
freq_by_rank <-
  book_words %>%
    dplyr::group_by(book) %>%
    dplyr::mutate(rank = dplyr::row_number(), `term frequency` = n / total) %>%
    dplyr::ungroup()

utils::head(freq_by_rank)
A tibble: 6 × 6
  book            word       n   total  rank  `term frequency`
  <fct>           <chr>  <int>   <int> <int>             <dbl>
  Mansfield Park  the     6206  160460     1        0.03867631
  Mansfield Park  to      5475  160460     2        0.03412065
  Mansfield Park  and     5438  160460     3        0.03389007
  Emma            to      5239  160996     1        0.03254118
  Emma            the     5201  160996     2        0.03230515
  Emma            and     4896  160996     3        0.03041069
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
 Please use `linewidth` instead.”
[Figure: term frequency versus rank on log-log scales (Zipf's law), colored by novel]
rank_subset <-
  freq_by_rank %>%
    dplyr::filter(rank < 500, rank > 10)

utils::head(rank_subset)
A tibble: 6 × 6
  book            word       n   total  rank  `term frequency`
  <fct>           <chr>  <int>   <int> <int>             <dbl>
  Mansfield Park  she     2246  160460    11        0.01399726
  Emma            in      2189  160996    11        0.01359661
  Emma            not     2140  160996    12        0.01329226
  Emma            you     1980  160996    13        0.01229844
  Emma            be      1975  160996    14        0.01226739
  Mansfield Park  be      1904  160460    12        0.01186589
stats::lm(base::log10(`term frequency`) ~ base::log10(rank), data = rank_subset)
Call:
stats::lm(formula = base::log10(`term frequency`) ~ base::log10(rank), 
    data = rank_subset)

Coefficients:
      (Intercept)  base::log10(rank)  
          -0.6226            -1.1125  
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_abline(intercept = -0.62, slope = -1.1, color = 'gray50', linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
[Figure: the Zipf plot with the fitted power law (slope ≈ -1.1) overlaid as a dashed line]
# input to the function `tidytext::bind_tf_idf()`
#
# case: token-document
# variables:
#   word - tokens
#   book - documents
#      n - tokens per document counts

head(book_words)
A tibble: 6 × 4
  book            word       n   total
  <fct>           <chr>  <int>   <int>
  Mansfield Park  the     6206  160460
  Mansfield Park  to      5475  160460
  Mansfield Park  and     5438  160460
  Emma            to      5239  160996
  Emma            the     5201  160996
  Emma            and     4896  160996
book_tf_idf <-
  book_words %>%
  tidytext::bind_tf_idf(term = word, document = book, n = n)

head(book_tf_idf)
A tibble: 6 × 7
  book            word       n   total          tf   idf  tf_idf
  <fct>           <chr>  <int>   <int>       <dbl> <dbl>   <dbl>
  Mansfield Park  the     6206  160460  0.03867631     0       0
  Mansfield Park  to      5475  160460  0.03412065     0       0
  Mansfield Park  and     5438  160460  0.03389007     0       0
  Emma            to      5239  160996  0.03254118     0       0
  Emma            the     5201  160996  0.03230515     0       0
  Emma            and     4896  160996  0.03041069     0       0
book_tf_idf %>%
  dplyr::select(-total) %>%
  dplyr::arrange(dplyr::desc(tf_idf)) %>%
  head()
A tibble: 6 × 6
  book                 word          n           tf       idf       tf_idf
  <fct>                <chr>     <int>        <dbl>     <dbl>        <dbl>
  Sense & Sensibility  elinor      623  0.005193528  1.791759  0.009305552
  Sense & Sensibility  marianne    492  0.004101470  1.791759  0.007348847
  Mansfield Park       crawford    493  0.003072417  1.791759  0.005505032
  Pride & Prejudice    darcy       373  0.003052273  1.791759  0.005468939
  Persuasion           elliot      254  0.003036171  1.791759  0.005440088
  Emma                 emma        786  0.004882109  1.098612  0.005363545
book_tf_idf %>%
  dplyr::group_by(book) %>%
  dplyr::slice_max(order_by = tf_idf, n = 15) %>%
  dplyr::ungroup() %>%
  ggplot(aes(tf_idf, forcats::fct_reorder(.f = word, .x = tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = 'free') +
  labs(x = 'tf-idf', y = NULL)
[Figure: words with the highest tf-idf in each of Jane Austen's novels]
physics <-
  gutenbergr::gutenberg_download(c(37729, 14725, 13476, 30155), meta_fields = 'author')

head(physics)
A tibble: 6 × 3
  gutenberg_id  text                                                                         author
         <int>  <chr>                                                                        <chr>
         13476  EXPERIMENTS WITH ALTERNATE CURRENTS OF HIGH POTENTIAL AND HIGH FREQUENCY     Tesla, Nikola
         13476  ""                                                                           Tesla, Nikola
         13476  A Lecture Delivered before the Institution of Electrical Engineers, London   Tesla, Nikola
         13476  ""                                                                           Tesla, Nikola
         13476  by                                                                           Tesla, Nikola
         13476  ""                                                                           Tesla, Nikola
physics_words <-
  physics %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  dplyr::count(author, word, sort = TRUE)

head(physics_words)
A tibble: 6 × 3
  author               word       n
  <chr>                <chr>  <int>
  Galilei, Galileo     the     3760
  Tesla, Nikola        the     3604
  Huygens, Christiaan  the     3553
  Einstein, Albert     the     2993
  Galilei, Galileo     of      2049
  Einstein, Albert     of      2029
plot_physics <-
  physics_words %>%
  tidytext::bind_tf_idf(term = word, document = author, n = n) %>%
  dplyr::mutate(author = base::factor(author, levels = c('Galilei, Galileo', 'Huygens, Christiaan', 'Tesla, Nikola', 'Einstein, Albert')))

head(plot_physics)
A tibble: 6 × 6
  author               word       n          tf   idf  tf_idf
  <fct>                <chr>  <int>       <dbl> <dbl>   <dbl>
  Galilei, Galileo     the     3760  0.09354165     0       0
  Tesla, Nikola        the     3604  0.09125206     0       0
  Huygens, Christiaan  the     3553  0.09280155     0       0
  Einstein, Albert     the     2993  0.09516391     0       0
  Galilei, Galileo     of      2049  0.05097522     0       0
  Einstein, Albert     of      2029  0.06451305     0       0
plot_physics %>%
  dplyr::group_by(author) %>%
  dplyr::slice_max(order_by = tf_idf, n = 15) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(word = stats::reorder(x = word, X = tf_idf)) %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = 'tf-idf', y = NULL) +
  facet_wrap(~author, ncol = 2, scales = 'free')
[Figure: words with the highest tf-idf for each physics author]
physics %>%
  dplyr::filter(stringr::str_detect(text, '_k_')) %>%
  dplyr::select(text)
A tibble: 7 × 1
text
<chr>
surface AB at the points AK_k_B. Then instead of the hemispherical
would needs be that from all the other points K_k_B there should
necessarily be equal to CD, because C_k_ is equal to CK, and C_g_ to
the crystal at K_k_, all the points of the wave CO_oc_ will have
O_o_ has reached K_k_. Which is easy to comprehend, since, of these
CO_oc_ in the crystal, when O_o_ has arrived at K_k_, because it forms
ρ is the average density of the matter and _k_ is a constant connected
physics %>%
  dplyr::filter(stringr::str_detect(text, 'RC')) %>%
  dplyr::select(text) %>%
  head()
A tibble: 6 × 1
text
<chr>
line RC, parallel and equal to AB, to be a portion of a wave of light,
represents the partial wave coming from the point A, after the wave RC
be the propagation of the wave RC which fell on AB, and would be the
transparent body; seeing that the wave RC, having come to the aperture
incident rays. Let there be such a ray RC falling upon the surface
CK. Make CO perpendicular to RC, and across the angle KCO adjust OK,
mystopwords <-
  tibble::tibble(word = c('eq', 'co', 'rc', 'ac', 'ak', 'bn', 'fig', 'file', 'cg', 'cb', 'cm', 'ab', '_k', '_k_', '_x'))

physics_words <-
  dplyr::anti_join(x = physics_words, y = mystopwords, by = 'word')

plot_physics <-
  physics_words %>%
  tidytext::bind_tf_idf(term = word, document = author, n = n) %>%
  dplyr::mutate(word = stringr::str_remove_all(word, '_')) %>%
  dplyr::group_by(author) %>%
  dplyr::slice_max(order_by = tf_idf, n = 15) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(word = forcats::fct_reorder(.f = word, .x = tf_idf)) %>%
  dplyr::mutate(author = base::factor(author, levels = c('Galilei, Galileo', 'Huygens, Christiaan', 'Tesla, Nikola', 'Einstein, Albert')))

plot_physics %>%
  ggplot(aes(tf_idf, word, fill = author)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~author, ncol = 2, scales = 'free') +
  labs(x = 'tf-idf', y = NULL)
[Figure: highest tf-idf words per physics author after removing the custom stop words]

4 - Relationships between words: n-grams and correlations#

# TIDY
#
#     case: one token per row (where a token is a bigram)
# metadata: variable `book`

austen_bigrams <-
  janeaustenr::austen_books() %>%
  tidytext::unnest_tokens(output = bigram, input = text, token = 'ngrams', n = 2) %>%
  dplyr::filter(!is.na(bigram))

head(austen_bigrams)
A tibble: 6 × 2
  book                 bigram
  <fct>                <chr>
  Sense & Sensibility  sense and
  Sense & Sensibility  and sensibility
  Sense & Sensibility  by jane
  Sense & Sensibility  jane austen
  Sense & Sensibility  chapter 1
  Sense & Sensibility  the family
# the most common bigrams

austen_bigrams %>%
  dplyr::count(bigram, sort = TRUE) %>%
  head()
A tibble: 6 × 2
  bigram       n
  <chr>    <int>
  of the    2853
  to be     2670
  in the    2221
  it was    1691
  i am      1485
  she had   1405
# remove stop words

bigrams_separated <-
  austen_bigrams %>%
  tidyr::separate(col = bigram, into = c('word1', 'word2'), sep = ' ')

bigrams_filtered <-
  bigrams_separated %>%
  dplyr::filter(!word1 %in% tidytext::stop_words$word) %>%
  dplyr::filter(!word2 %in% tidytext::stop_words$word)

bigram_counts <-
  bigrams_filtered %>%
  dplyr::count(word1, word2, sort = TRUE)

head(bigram_counts)
A tibble: 6 × 3
  word1    word2          n
  <chr>    <chr>      <int>
  sir      thomas       266
  miss     crawford     196
  captain  wentworth    143
  miss     woodhouse    143
  frank    churchill    114
  lady     russell      110
bigrams_united <-
  bigrams_filtered %>%
  tidyr::unite(col = bigram, word1, word2, sep = ' ')

head(bigrams_united)
A tibble: 6 × 2
  book                 bigram
  <fct>                <chr>
  Sense & Sensibility  jane austen
  Sense & Sensibility  chapter 1
  Sense & Sensibility  norland park
  Sense & Sensibility  surrounding acquaintance
  Sense & Sensibility  late owner
  Sense & Sensibility  advanced age
janeaustenr::austen_books() %>%
  tidytext::unnest_tokens(output = trigram, input = text, token = 'ngrams', n = 3) %>%
  dplyr::filter(!is.na(trigram)) %>%
  tidyr::separate(col = trigram, into = c('word1', 'word2', 'word3'), sep = ' ') %>%
  dplyr::filter(!word1 %in% stop_words$word,
                !word2 %in% stop_words$word,
                !word3 %in% stop_words$word) %>%
  dplyr::count(word1, word2, word3, sort = TRUE) %>%
  head()
A tibble: 6 × 4
  word1      word2      word3          n
  <chr>      <chr>      <chr>      <int>
  dear       miss       woodhouse     20
  miss       de         bourgh        17
  lady       catherine  de            11
  poor       miss       taylor        11
  sir        walter     elliot        10
  catherine  de         bourgh         9
bigrams_filtered %>%
  dplyr::filter(word2 == 'street') %>%
  dplyr::count(book, word1, sort = TRUE) %>%
  head()
A tibble: 6 × 3
  book                 word1            n
  <fct>                <chr>        <int>
  Sense & Sensibility  harley          16
  Sense & Sensibility  berkeley        15
  Northanger Abbey     milsom          10
  Northanger Abbey     pulteney        10
  Mansfield Park       wimpole          9
  Pride & Prejudice    gracechurch      8
bigram_tf_idf <-
  bigrams_united %>%
  dplyr::count(book, bigram) %>%
  tidytext::bind_tf_idf(term = bigram, document = book, n = n) %>%
  dplyr::arrange(desc(tf_idf))

head(bigram_tf_idf)
A tibble: 6 × 6
  book            bigram                 n          tf       idf      tf_idf
  <fct>           <chr>              <int>       <dbl>     <dbl>       <dbl>
  Mansfield Park  sir thomas           266  0.03043478  1.791759  0.05453181
  Persuasion      captain wentworth    143  0.02896496  1.791759  0.05189824
  Mansfield Park  miss crawford        196  0.02242563  1.791759  0.04018133
  Persuasion      lady russell         110  0.02228074  1.791759  0.03992172
  Persuasion      sir walter           108  0.02187563  1.791759  0.03919587
  Emma            miss woodhouse       143  0.01726219  1.791759  0.03092970
bigram_tf_idf %>%
  dplyr::group_by(book) %>%
  dplyr::slice_max(order_by = tf_idf, n = 15) %>%
  dplyr::ungroup() %>%
  ggplot(aes(tf_idf, forcats::fct_reorder(.f = bigram, .x = tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = 'free') +
  labs(x = 'tf-idf', y = NULL)
[Figure: bigrams with the highest tf-idf in each Austen novel]
bigrams_separated %>%
  dplyr::filter(word1 == 'not') %>%
  dplyr::count(word1, word2, sort = TRUE) %>%
  head()
A tibble: 6 × 3
  word1  word2      n
  <chr>  <chr>  <int>
  not    be       580
  not    to       335
  not    have     307
  not    know     237
  not    a        184
  not    think    162
AFINN <- tidytext::get_sentiments('afinn')
head(AFINN)
A tibble: 6 × 2
  word        value
  <chr>       <dbl>
  abandon        -2
  abandoned      -2
  abandons       -2
  abducted       -2
  abduction      -2
  abductions     -2
not_words <-
  bigrams_separated %>%
  dplyr::filter(word1 == 'not') %>%
  dplyr::inner_join(y = AFINN, by = c(word2 = 'word')) %>%
  dplyr::count(word2, value, sort = TRUE)

head(not_words)
A tibble: 6 × 3
  word2  value      n
  <chr>  <dbl>  <int>
  like       2     95
  help       2     77
  want       1     41
  wish       1     39
  allow      1     30
  care       2     21
not_words %>%
  dplyr::mutate(contribution = n * value) %>%
  dplyr::arrange(dplyr::desc(base::abs(contribution))) %>%
  utils::head(20) %>%
  dplyr::mutate(word2 = stats::reorder(x = word2, X = contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = 'Sentiment value * number of occurrences', y = 'Words preceded by \"not\"')
[Figure: words preceded by "not" with the largest sentiment contribution (value × occurrences)]
negation_words <- c('not', 'no', 'never', 'without')

negated_words <-
  bigrams_separated %>%
  dplyr::filter(word1 %in% negation_words) %>%
  dplyr::inner_join(y = AFINN, by = c(word2 = 'word')) %>%
  dplyr::count(word1, word2, value, sort = TRUE)

head(negated_words)
A tibble: 6 × 4
  word1  word2  value      n
  <chr>  <chr>  <dbl>  <int>
  no     doubt     -1     96
  not    like       2     95
  not    help       2     77
  no     no        -1     58
  not    want       1     41
  not    wish       1     39
negated_words %>%
  dplyr::mutate(contribution = n * value, word2 = stats::reorder(x = base::paste(word2, word1, sep = "__"), X = contribution)) %>%
  dplyr::group_by(word1) %>%
  dplyr::slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  xlab("Words preceded by negation term") +
  ylab("Sentiment value * # of occurrences") +
  coord_flip()
[Figure: words preceded by "not", "no", "never", or "without", faceted by negation word, with sentiment contribution]
bigram_graph <-
  bigram_counts %>%
  dplyr::filter(n > 20) %>%
  igraph::graph_from_data_frame()

head(bigram_graph)
  [[ suppressing 85 column names ‘sir’, ‘miss’, ‘captain’ ... ]]
6 x 85 sparse Matrix of class "dgCMatrix"
                                                                               
sir     . . . . . . . . . . 1 . . . . . . . . . . . . . . . 1 . . . . . 1 . . .
miss    . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . 1 . . . 1 1 .
captain . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . . . . .
frank   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . . .
lady    . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . 1 . .
colonel . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
                                                                               
sir     . . . . . . . . . . . . . . . . . 1 1 . . . . . . . . . . . . . . . . .
miss    1 . . 1 1 . 1 1 1 . 1 . 1 . 1 1 1 . . . . . 1 . . . . . . . . . . 1 . .
captain . . . . . . . . . 1 . . . . . . . . . . . . . . . 1 . . . . . . . . 1 .
frank   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
lady    . 1 1 . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . . . . .
colonel . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
                                 
sir     . . . . . . . . . . . . .
miss    . . . . 1 1 1 1 . . . 1 .
captain . . . . . . . . . . . . .
frank   . . . . . . . . . . . . .
lady    . . . . . . . . . . 1 . .
colonel . 1 . . . . . . . 1 . . .
set.seed(2017)

ggraph(bigram_graph, layout = 'fr') +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
Warning message:
“Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
 Please use `linewidth` in the `default_aes` field and elsewhere instead.”
[Figure: network graph of bigrams occurring more than 20 times]
set.seed(2020)

a <- grid::arrow(type = 'closed', length = ggplot2::unit(.15, 'inches'))

ggraph(bigram_graph, layout = 'fr') +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = ggraph::circle(.07, 'inches')) +
  geom_node_point(color = 'lightblue', size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()
[Figure: directed bigram network with edge transparency mapped to bigram count]
count_bigrams <- function (dataset) {
  dataset %>%
    tidytext::unnest_tokens(output = bigram, input = text, token = 'ngrams', n = 2) %>%
    tidyr::separate(col = bigram, into = c('word1', 'word2'), sep = ' ') %>%
    dplyr::filter(!word1 %in% tidytext::stop_words$word,
                  !word2 %in% tidytext::stop_words$word) %>%
    dplyr::count(word1, word2, sort = TRUE)
}

visualize_bigrams <- function (bigrams) {
  set.seed(2016)

  a <- grid::arrow(type = 'closed', length = ggplot2::unit(.15, 'inches'))

  bigrams %>%
    # convert the edge list to an igraph object so the count `n` is carried as an edge attribute
    igraph::graph_from_data_frame() %>%
    ggraph(layout = 'fr') +
    geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a) +
    geom_node_point(color = 'lightblue', size = 5) +
    geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
    theme_void()
}
kjv <- gutenberg_download(10)
Warning message:
“! Could not download a book at http://aleph.gutenberg.org/1/10/10.zip.
 The book may have been archived.
 Alternatively, You may need to select a different mirror.
→ See https://www.gutenberg.org/MIRRORS.ALL for options.”
Warning message:
“Unknown or uninitialised column: `text`.”
kjv_bigrams <-
  kjv %>%
  count_bigrams()

kjv_bigrams %>%
  dplyr::filter(n > 40, !stringr::str_detect(word1, '\\d'), !stringr::str_detect(word2, '\\d')) %>%
  visualize_bigrams()
Don't know how to automatically pick scale for object of type <function>. Defaulting to continuous.
ERROR while rich displaying an object: Error in `geom_edge_link()`:
! Problem while computing aesthetics.
ℹ Error occurred in the 1st layer.
Caused by error in `compute_aesthetics()`:
! Aesthetics are not valid data columns.
✖ The following aesthetics are invalid:
✖ `edge_alpha = n`
ℹ Did you mistype the name of a data column or forget to add `after_stat()`?

(R traceback omitted.)
austen_section_words <-
  janeaustenr::austen_books() %>%
  dplyr::filter(book == 'Pride & Prejudice') %>%
  dplyr::mutate(section = dplyr::row_number() %/% 10) %>%
  dplyr::filter(section > 0) %>%
  tidytext::unnest_tokens(output = word, input = text) %>%
  dplyr::filter(!word %in% tidytext::stop_words$word)

head(austen_section_words)
A tibble: 6 × 3
  book               section  word
  <fct>                <dbl>  <chr>
  Pride & Prejudice        1  truth
  Pride & Prejudice        1  universally
  Pride & Prejudice        1  acknowledged
  Pride & Prejudice        1  single
  Pride & Prejudice        1  possession
  Pride & Prejudice        1  fortune
# count words cooccurring within sections
word_pairs <-
  austen_section_words %>%
  widyr::pairwise_count(item = word, feature = section, sort = TRUE)

head(word_pairs)
A tibble: 6 × 3
  item1      item2          n
  <chr>      <chr>      <dbl>
  darcy      elizabeth    144
  elizabeth  darcy        144
  miss       elizabeth    110
  elizabeth  miss         110
  elizabeth  jane         106
  jane       elizabeth    106
word_pairs %>%
  dplyr::filter(item1 == 'darcy') %>%
  head()
A tibble: 6 × 3
  item1  item2          n
  <chr>  <chr>      <dbl>
  darcy  elizabeth    144
  darcy  miss          92
  darcy  bingley       86
  darcy  jane          46
  darcy  bennet        45
  darcy  sister        45
word_cors <-
  austen_section_words %>%
  dplyr::group_by(word) %>%
  dplyr::filter(n() >= 20) %>%
  widyr::pairwise_cor(item = word, feature = section, sort = TRUE)

head(word_cors)
A tibble: 6 × 3
  item1     item2     correlation
  <chr>     <chr>           <dbl>
  bourgh    de          0.9508501
  de        bourgh      0.9508501
  pounds    thousand    0.7005808
  thousand  pounds      0.7005808
  william   sir         0.6644719
  sir       william     0.6644719
word_cors %>%
  dplyr::filter(item1 == 'pounds') %>%
  head()
A tibble: 6 × 3
  item1   item2      correlation
  <chr>   <chr>            <dbl>
  pounds  thousand     0.7005808
  pounds  ten          0.2305758
  pounds  fortune      0.1638626
  pounds  settled      0.1494605
  pounds  wickham's    0.1415240
  pounds  children     0.1290001
word_cors %>%
  dplyr::filter(item1 %in% c('elizabeth', 'pounds', 'married', 'pride')) %>%
  dplyr::group_by(item1) %>%
  dplyr::slice_max(correlation, n = 6) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(item2 = stats::reorder(x = item2, X = correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = 'identity') +
  facet_wrap(~item1, scales = 'free') +
  coord_flip()
[Figure: words most correlated with "elizabeth", "pounds", "married", and "pride"]
# pairs of words in Pride & Prejudice that show at least a .15 correlation of appearing within the same 10-line section

set.seed(2016)

word_cors %>%
  dplyr::filter(correlation > .15) %>%
  igraph::graph_from_data_frame() %>%
  ggraph(layout = 'fr') +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = 'lightblue', size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
[Figure: network of word pairs in Pride & Prejudice with correlation above .15]

  • tidytext::unnest_tokens() tokenizes by word (the default), by sentence (token = 'sentences'), or by sequences of words (token = 'ngrams', n = 2); see the sketch after this list

  • tidytext::bind_tf_idf(term = word, document = author, n = n) computes tf, idf, and tf-idf from one row per term per document with its count n
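
A minimal sketch of those tokenization options, reusing the `text_df` tibble built in Chapter 1 above (output omitted; the only assumption is that tidytext and the tidyverse are still attached):

# single words — the default token = 'words'
text_df %>%
  tidytext::unnest_tokens(output = word, input = text)

# whole sentences
text_df %>%
  tidytext::unnest_tokens(output = sentence, input = text, token = 'sentences')

# bigrams: consecutive pairs of words
text_df %>%
  tidytext::unnest_tokens(output = bigram, input = text, token = 'ngrams', n = 2)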

Relationships

  • word-document (tf-idf and frequency analysis)

  • word-sentiment (sentiment analysis)

  • word-word (n-gram)
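
As a code recap of these three relationships, a minimal sketch reusing objects defined earlier in this notebook (`book_words`, `tidy_books`, `austen_section_words`); each call mirrors one made in the chapters above:

# word-document: tf-idf from per-book word counts
book_words %>%
  tidytext::bind_tf_idf(term = word, document = book, n = n)

# word-sentiment: join tokens against a sentiment lexicon
tidy_books %>%
  dplyr::inner_join(y = tidytext::get_sentiments('bing'), by = 'word', relationship = 'many-to-many')

# word-word: co-occurrence correlation within 10-line sections
austen_section_words %>%
  dplyr::group_by(word) %>%
  dplyr::filter(dplyr::n() >= 20) %>%
  widyr::pairwise_cor(item = word, feature = section, sort = TRUE)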

How do we quantify what a document is about? Look at the words that make it up.

How important is a word?

  • term frequency (TF)

    • how frequently a word occurs in a document

  • inverse document frequency (IDF)

    • decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents

  • TF-IDF

    • the product of TF and IDF: the frequency of a term adjusted for how rarely it is used

    • measures how important a word is to a document in a collection of documents

\( \begin{aligned} \text{idf}(\text{term}) =\ln \left( \frac{n_\text{documents}}{n_\text{documents containing term}} \right) \end{aligned} \)
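
A worked check against the six-novel Austen corpus used above (hand computations only, no new results): a term that appears in just one of the six books gets idf = ln(6/1), the value shown for 'elinor' and 'darcy' in book_tf_idf, while a term that appears in every book gets idf = ln(6/6) = 0, which is why 'the', 'to', and 'and' end up with a tf-idf of zero.

base::log(6 / 1)   # 1.791759 — term found in one novel (e.g. 'elinor', 'darcy')
base::log(6 / 2)   # 1.098612 — term found in two novels (e.g. 'emma')
base::log(6 / 6)   # 0        — term found in all six novels (e.g. 'the', 'and'), so tf-idf = 0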

Zipf’s Empirical Law: the frequency that a word appears is inversely proportional to its rank.

\( \begin{aligned} \text{frequency} \propto \frac{1}{\text{rank}} \end{aligned} \)
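
Taking logarithms of both sides turns the proportionality into a straight line on log-log axes, which is what the rank versus term-frequency plots above show; the slope fitted over ranks 10–500 with `lm()` was about -1.11, close to the classic Zipf exponent of 1 (here s is the exponent and c a constant):

\( \begin{aligned} \text{frequency} \propto \text{rank}^{-s} \;\Longrightarrow\; \log_{10}(\text{frequency}) = c - s \, \log_{10}(\text{rank}), \qquad \hat{s} \approx 1.11 \end{aligned} \)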

  • [W] Clause

  • [W] Comment (Rheme)

  • [W] Corpus

  • [W] Document

  • [W] Document Classification

  • [W] Document Clustering

  • [W] English article

  • [W] Lexicon

  • [W] Named Entity

  • [W] Named-Entity Recognition

  • [W] Natural Language

  • [W] Rank-Size Distribution

  • [W] Sentiment Analysis

  • [W] Stop Word

  • [W] Text Mining

  • [W] Text Summarization

  • [W] Topic (Theme)

  • [W] Topic-Prominent Language

  • [W] TF-IDF

  • [W] Word Count

  • [W] Word List

  • [W] Zipf’s Law

  • [W] Spärck Jones, Karen (1935-2007)

  • [W] Luhn, Hans (1896-1964)

  • [W] Zipf, George (1902-1950)