tidytext/ 0000755 0001762 0000144 00000000000 15040607522 012131 5 ustar ligges users tidytext/tests/ 0000755 0001762 0000144 00000000000 15040555021 013267 5 ustar ligges users tidytext/tests/testthat/ 0000755 0001762 0000144 00000000000 15040607522 015133 5 ustar ligges users tidytext/tests/testthat/test-tf-idf.R 0000644 0001762 0000144 00000004766 15040537137 017425 0 ustar ligges users w <- tibble(
document = rep(1:2, each = 5),
word = c(
"the",
"quick",
"brown",
"fox",
"jumped",
"over",
"the",
"lazy",
"brown",
"dog"
),
frequency = c(
1,
1,
1,
1,
2,
1,
1,
1,
1,
2
)
)
test_that("Can calculate TF-IDF", {
result <- w %>%
bind_tf_idf(word, document, frequency)
result2 <- w %>%
bind_tf_idf("word", "document", "frequency")
expect_equal(result, result2)
expect_equal(
select(w, document, word, frequency),
select(result, document, word, frequency)
)
expect_s3_class(result, "tbl_df")
expect_type(result$tf, "double")
expect_type(result$idf, "double")
expect_type(result$tf_idf, "double")
expect_equal(result$tf, rep(c(1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 3), 2))
expect_equal(result$idf[1:4], c(0, log(2), 0, log(2)))
expect_equal(result$tf_idf, result$tf * result$idf)
# preserves but ignores groups
result2 <- w %>%
group_by(document) %>%
bind_tf_idf(word, document, frequency)
expect_equal(length(groups(result2)), 1)
expect_equal(as.character(groups(result2)[[1]]), "document")
})
test_that("TF-IDF works when the document ID is a number", {
# example thanks to https://github.com/juliasilge/tidytext/issues/31
my_corpus <- dplyr::tibble(
id = rep(c(2, 3), each = 3),
word = c("an", "interesting", "text", "a", "boring", "text"),
n = c(1, 1, 3, 1, 2, 1)
)
tf_idf <- bind_tf_idf(my_corpus, word, id, n)
expect_false(any(is.na(tf_idf)))
expect_equal(tf_idf$tf_idf[c(3, 6)], c(0, 0))
})
test_that("tf-idf with tidyeval works", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
termvar <- quo("word")
documentvar <- quo("document")
countvar <- quo("frequency")
result <- w %>%
bind_tf_idf(!!termvar, !!documentvar, !!countvar)
expect_equal(
select(w, document, word, frequency),
select(result, document, word, frequency)
)
expect_s3_class(result, "tbl_df")
expect_type(result$tf, "double")
expect_type(result$idf, "double")
expect_type(result$tf_idf, "double")
expect_equal(result$tf, rep(c(1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 3), 2))
expect_equal(result$idf[1:4], c(0, log(2), 0, log(2)))
expect_equal(result$tf_idf, result$tf * result$idf)
result2 <- w %>%
group_by(document) %>%
bind_tf_idf(!!termvar, !!documentvar, !!countvar)
expect_equal(length(groups(result2)), 1)
expect_equal(as.character(groups(result2)[[1]]), "document")
})
tidytext/tests/testthat/test-dictionary-tidiers.R 0000644 0001762 0000144 00000001131 15040537137 022041 0 ustar ligges users if (requireNamespace("quanteda", quietly = TRUE)) {
test_that("can tidy a quanteda dictionary", {
lst <- list(
terror = c("terrorism", "terrorists", "threat"),
economy = c("jobs", "business", "grow", "work")
)
d <- quanteda::dictionary(lst)
td <- tidy(d)
expect_s3_class(td, "tbl_df")
expect_type(td$category, "character")
expect_type(td$word, "character")
expect_equal(nrow(td), 7)
expect_equal(sort(unique(td$category)), c("economy", "terror"))
expect_equal(
sort(unique(td$word)),
sort(unique(c(lst[[1]], lst[[2]])))
)
})
}
tidytext/tests/testthat/test-lda-tidiers.R 0000644 0001762 0000144 00000006630 15040537137 020445 0 ustar ligges users suppressPackageStartupMessages(library(dplyr))
if (require("topicmodels", quietly = TRUE)) {
data(AssociatedPress)
ap <- AssociatedPress[1:100, ]
lda <- LDA(ap, control = list(alpha = 0.1), k = 4)
ctm <- LDA(ap, k = 4)
test_that("can tidy beta matrix", {
td <- tidy.LDA(lda, matrix = "beta")
td2 <- tidy.CTM(ctm, matrix = "beta")
expect_s3_class(td, "tbl_df")
expect_s3_class(td2, "tbl_df")
expect_equal(colnames(td), c("topic", "term", "beta"))
expect_equal(colnames(td2), c("topic", "term", "beta"))
expect_type(td$term, "character")
expect_type(td$beta, "double")
expect_equal(unique(td$topic), 1:4)
expect_type(td2$term, "character")
expect_type(td2$beta, "double")
expect_equal(unique(td2$topic), 1:4)
expect_gt(nrow(td), 10000)
expect_gt(nrow(td2), 10000)
expect_true(all(c("united", "states", "president") %in% td$term))
# all betas sum to 1
summ <- td %>%
count(topic, wt = beta)
expect_lt(max(abs(summ$n - 1)), .000001)
td_log <- tidy(lda, matrix = "beta", log = TRUE)
expect_true(all(td_log$beta < 0))
td_log2 <- tidy(ctm, matrix = "beta", log = TRUE)
expect_true(all(td_log2$beta < 0))
})
test_that("can tidy gamma matrix", {
td <- tidy.LDA(lda, matrix = "gamma")
expect_s3_class(td, "tbl_df")
td2 <- tidy.CTM(ctm, matrix = "gamma")
expect_s3_class(td2, "tbl_df")
expect_equal(colnames(td), c("document", "topic", "gamma"))
expect_equal(colnames(td2), c("document", "topic", "gamma"))
expect_type(td$document, "integer")
expect_type(td$gamma, "double")
expect_type(td2$document, "integer")
expect_type(td2$gamma, "double")
expect_equal(nrow(td), 400)
expect_equal(unique(td$topic), 1:4)
expect_equal(unique(td$document), 1:100)
expect_equal(nrow(td2), 400)
expect_equal(unique(td2$topic), 1:4)
expect_equal(unique(td2$document), 1:100)
# all gammas sum to 1
summ <- td %>%
count(document, wt = gamma)
expect_lt(max(abs(summ$n - 1)), 1e-6)
td_log <- tidy(lda, matrix = "gamma", log = TRUE)
expect_true(all(td_log$gamma < 0))
})
test_that("can augment an LDA output", {
au <- augment.LDA(lda)
expect_s3_class(au, "tbl_df")
au2 <- augment.CTM(ctm)
expect_s3_class(au2, "tbl_df")
expect_equal(colnames(au), c("document", "term", ".topic"))
expect_equal(sort(unique(au$.topic)), 1:4)
# augment output should have same document-term combinations
ap_tidied <- tidy(ap)
s <- arrange(au, document, term)
s2 <- ap_tidied %>%
arrange(document, term)
expect_equal(s$term, s2$term)
expect_equal(s$document, s2$document)
# can include extra columns
ap_tidied2 <- ap_tidied %>%
mutate(starts_a = stringr::str_detect(term, "^a"))
au2 <- augment.LDA(lda, data = ap_tidied2)
expect_equal(au$document, au2$document)
expect_equal(au$term, au2$term)
expect_type(au2$starts_a, "logical")
expect_equal(stringr::str_detect(au2$term, "^a"), au2$starts_a)
# can give document term matrix
au3 <- augment.LDA(lda, data = ap)
expect_equal(au$document, au3$document)
expect_equal(au$term, au3$term)
expect_equal(au$.topic, au3$.topic)
})
test_that("can glance an LDA output", {
g <- glance.LDA(lda)
expect_s3_class(g, "tbl_df")
expect_equal(nrow(g), 1)
expect_equal(g$terms, 19253)
g2 <- glance.CTM(lda)
expect_s3_class(g2, "tbl_df")
})
}
tidytext/tests/testthat/test-sparse-tidiers.R 0000644 0001762 0000144 00000002332 15040537137 021175 0 ustar ligges users test_that("Can tidy DocumentTermMatrices and TermDocumentMatrices", {
if (require("tm", quietly = TRUE)) {
txt <- system.file("texts", "txt", package = "tm")
ovid <- VCorpus(
DirSource(txt, encoding = "UTF-8"),
readerControl = list(language = "lat")
)
ovid_dtm <- DocumentTermMatrix(ovid)
ovid_dtm_td <- tidy(ovid_dtm)
expect_s3_class(ovid_dtm_td, "tbl_df")
expect_equal(sort(unique(ovid_dtm_td$document)), sort(rownames(ovid_dtm)))
expect_equal(sort(unique(ovid_dtm_td$term)), sort(colnames(ovid_dtm)))
ovid_tdm <- TermDocumentMatrix(ovid)
ovid_tdm_td <- tidy(ovid_tdm)
expect_s3_class(ovid_tdm_td, "tbl_df")
expect_equal(sort(unique(ovid_tdm_td$document)), sort(colnames(ovid_tdm)))
expect_equal(sort(unique(ovid_tdm_td$term)), sort(rownames(ovid_tdm)))
}
})
test_that("Can tidy dfm from quanteda", {
if (requireNamespace("quanteda", quietly = TRUE)) {
dfm_obj <- quanteda::dfm(quanteda::tokens(quanteda::data_corpus_inaugural))
dfm_obj_td <- tidy(dfm_obj)
expect_s3_class(dfm_obj_td, "tbl_df")
expect_equal(sort(unique(dfm_obj_td$document)), sort(rownames(dfm_obj)))
expect_equal(sort(unique(dfm_obj_td$term)), sort(colnames(dfm_obj)))
}
})
tidytext/tests/testthat/test-unnest-char.R 0000644 0001762 0000144 00000001477 15040537137 020477 0 ustar ligges users test_that("unnest_characters works", {
d <- tibble(txt = "Emily Dickinson")
r <- unnest_characters(d, out, txt)
s <- unnest_tokens(d, out, txt, token = "characters")
expect_equal(r, s)
})
test_that("unnest_character_shingles works", {
d <- tibble(txt = "tidytext is the best")
r <- unnest_character_shingles(d, out, txt)
s <- d %>% unnest_tokens(out, txt, token = "character_shingles")
expect_equal(r, s)
r <- unnest_character_shingles(d, out, txt, n = 3, n_min = 3)
s <- d %>%
unnest_tokens(out, txt, token = "character_shingles", n = 3, n_min = 3)
expect_equal(r, s)
d <- tibble(txt = "Emily Dickinson")
r <- unnest_character_shingles(d, out, txt, to_lower = FALSE)
s <- unnest_tokens(
d,
out,
txt,
token = "character_shingles",
to_lower = FALSE
)
expect_equal(r, s)
})
tidytext/tests/testthat/test-unnest_regex.R 0000644 0001762 0000144 00000000445 14040415714 020743 0 ustar ligges users test_that("unnest_regex works", {
r <- unnest_regex(skspr, out, txt)
s <- unnest_tokens(skspr, out, txt, token = "regex")
expect_equal(r, s)
r <- unnest_regex(skspr, out, txt, pattern = "a")
s <- unnest_tokens(skspr, out, txt, token = "regex", pattern = "a")
expect_equal(r, s)
})
tidytext/tests/testthat/test-stop-words.R 0000644 0001762 0000144 00000000532 14125371266 020362 0 ustar ligges users suppressPackageStartupMessages(library(dplyr))
test_that("get_stopwords works for multiple languages", {
skip_if_not_installed("stopwords")
de <- get_stopwords("de")
ru <- get_stopwords("ru")
expect_s3_class(de, "tbl_df")
expect_s3_class(ru, "tbl_df")
expect_gt(nrow(de), nrow(ru))
expect_equal(unique(de$lexicon), "snowball")
})
tidytext/tests/testthat/helper-funs.R 0000644 0001762 0000144 00000001641 15040537137 017514 0 ustar ligges users expect_nrow <- function(tbl, n) {
expect_s3_class(tbl, "data.frame")
expect_equal(nrow(tbl), n)
}
expect_first_row <- function(tbl, col, text) {
ct <- tbl %>%
pull(!!enquo(col)) %>%
purrr::pluck(1)
expect_match(ct, text)
}
skspr <- data.frame(
id = 1:4,
txt = c(
"Now is the winter of our discontent",
"Made glorious summer by this sun of York;",
"And all the clouds that lour'd upon our house",
"In the deep bosom of the ocean buried."
),
stringsAsFactors = FALSE
)
song_df <- data.frame(
id = 1:8,
txt = c(
"How many roads must a man walk down",
"Before you call him a man?",
"How many seas must a white dove sail",
"Before she sleeps in the sand?",
"How many times must the cannonballs fly",
"Before they're forever banned?",
"The answer, my friend, is blowin' in the wind.",
"The answer is blowin' in the wind."
),
stringsAsFactors = FALSE
)
tidytext/tests/testthat/test-unnest_ngrams.R 0000644 0001762 0000144 00000001120 14125371357 021117 0 ustar ligges users test_that("unnest_ngrams works", {
r <- unnest_ngrams(skspr, out, txt)
expect_nrow(r, 24)
expect_first_row(r, out, "now is the")
r <- unnest_ngrams(skspr, out, txt, n = 4)
expect_nrow(r, 20)
expect_first_row(r, out, "now is the winter")
})
test_that("unnest_skip_ngrams works", {
r <- unnest_skip_ngrams(song_df, out, txt, n = 4)
s <- unnest_tokens(song_df, out, txt, n = 4, token = "skip_ngrams")
expect_equal(r, s)
r <- unnest_skip_ngrams(skspr, out, txt, n = 3, k = 2)
s <- unnest_tokens(skspr, out, txt, token = "skip_ngrams", n = 3, k = 2)
expect_equal(r, s)
})
tidytext/tests/testthat/test-unnest_ptb.R 0000644 0001762 0000144 00000000446 14040415714 020417 0 ustar ligges users test_that("unnest_ptb works", {
d <- tibble(txt = janeaustenr::prideprejudice)
r <- unnest_ptb(d, out, txt)
s <- unnest_tokens(d, out, txt, token = "ptb")
expect_equal(r, s)
r <- unnest_ptb(skspr, out, txt)
s <- unnest_tokens(skspr, out, txt, token = "ptb")
expect_equal(r, s)
})
tidytext/tests/testthat/test-corpus-tidiers.R 0000644 0001762 0000144 00000003074 15040537226 021216 0 ustar ligges users test_that("Can tidy corpus from tm package", {
skip_if_not_installed("tm")
# tm package examples
txt <- system.file("texts", "txt", package = "tm")
ovid <- tm::VCorpus(
tm::DirSource(txt, encoding = "UTF-8"),
readerControl = list(language = "lat")
)
td <- tidy(ovid, collapse = " ")
expect_equal(length(ovid), nrow(td))
expect_equal(
paste(as.character(ovid[[1]]), collapse = " "),
unname(td$text[1])
)
})
test_that("Can tidy corpus from quanteda package", {
skip_if_not_installed("quanteda")
data("data_corpus_inaugural", package = "quanteda")
texts <- as.character(data_corpus_inaugural)
td <- tidy(data_corpus_inaugural)
expect_equal(length(texts), nrow(td))
expect_true(all(td$text == texts))
})
test_that("Can tidy corpus from quanteda using accessor functions", {
skip_if_not_installed("quanteda")
x <- quanteda::data_corpus_inaugural
## similar to old method
ret_old <- as_tibble(quanteda::docvars(x)) %>%
mutate(text = unname(as.character(x))) %>%
select(text, everything())
## new method
ret_new <- tidy(x)
expect_identical(ret_old, ret_new)
})
test_that("Can glance a corpus from quanteda using accessor functions", {
skip_if_not_installed("quanteda")
x <- quanteda::data_corpus_inaugural
## old method
glance_old <- function(x, ...) {
md <- purrr::compact(quanteda::meta(x))
# turn vectors into list columns
md <- purrr::map_if(md, ~ length(.) > 1, list)
as_tibble(md)
}
ret_old <- glance_old(x)
## new method
ret_new <- glance(x)
expect_identical(ret_old, ret_new)
})
tidytext/tests/testthat/test-unnest_tweet.R 0000644 0001762 0000144 00000000320 14350414266 020756 0 ustar ligges users test_that("unnest_tweet works", {
tweets <- dplyr::tibble(
id = 1,
txt = "@rOpenSci and #rstats see: https://cran.r-project.org"
)
expect_snapshot_error(r <- unnest_tweets(tweets, out, txt))
})
tidytext/tests/testthat/reordered-boxplot.svg 0000644 0001762 0000144 00000025506 14625411010 021315 0 ustar ligges users
tidytext/tests/testthat/_snaps/ 0000755 0001762 0000144 00000000000 14450653344 016425 5 ustar ligges users tidytext/tests/testthat/_snaps/unnest-tokens.md 0000644 0001762 0000144 00000001005 14666153064 021563 0 ustar ligges users # tokenizing errors with appropriate error message
Token must be a supported type, or a function that takes a character vector as input
i Did you mean `token = "words"`?
# unnest_tokens raises an error if custom tokenizer gives bad output
Expected output of tokenizing function to be a list of length 1
---
Expected output of tokenizing function to be a list of length 1
# tokenizing tweets is deprecated
Support for `token = "tweets"` was deprecated in tidytext 0.4.0 and is now defunct.
tidytext/tests/testthat/_snaps/reorder-within/ 0000755 0001762 0000144 00000000000 14701055375 021366 5 ustar ligges users tidytext/tests/testthat/_snaps/reorder-within/reordered-multi-facet-boxplot.svg 0000644 0001762 0000144 00000050646 14343470404 027766 0 ustar ligges users
tidytext/tests/testthat/_snaps/reorder-within/custom-label-boxplot.svg 0000644 0001762 0000144 00000025543 14625411010 026157 0 ustar ligges users
tidytext/tests/testthat/_snaps/reorder-within/reordered-boxplot.svg 0000644 0001762 0000144 00000025524 14625411010 025542 0 ustar ligges users
tidytext/tests/testthat/_snaps/unnest_tweet.md 0000644 0001762 0000144 00000000142 14666153064 021473 0 ustar ligges users # unnest_tweet works
`unnest_tweets()` was deprecated in tidytext 0.4.0 and is now defunct.
tidytext/tests/testthat/test-sparse-casters.R 0000644 0001762 0000144 00000003760 15040537137 021204 0 ustar ligges users library(Matrix)
dat <- tibble(
a = c("row1", "row1", "row2", "row2", "row2"),
b = c("col1", "col2", "col1", "col3", "col4"),
val = 1:5
)
test_that("Can cast tables into a sparse Matrix", {
m <- cast_sparse(dat, a, b)
m2 <- cast_sparse(dat, "a", "b")
expect_s4_class(m, "dgCMatrix")
expect_equal(m, m2)
expect_equal(sum(m), 5)
expect_equal(nrow(m), length(unique(dat$a)))
expect_equal(ncol(m), length(unique(dat$b)))
m3 <- cast_sparse(dat, a, b, val)
expect_equal(sum(m3), sum(dat$val))
expect_equal(m3["row2", "col3"], 4)
})
test_that("cast_sparse ignores groups", {
m <- cast_sparse(dat, a, b)
m2 <- cast_sparse(group_by(dat, a), a, b)
expect_identical(m, m2)
})
test_that("Can cast_sparse with tidyeval", {
m <- cast_sparse(dat, a, b)
rowvar <- quo("a")
m2 <- cast_sparse(dat, !!rowvar, b)
expect_identical(m, m2)
})
test_that("Can cast tables into a sparse DocumentTermMatrix", {
skip_if_not_installed("tm")
d <- cast_dtm(dat, a, b, val)
d2 <- cast_dtm(dat, "a", "b", "val")
expect_equal(d, d2)
expect_s3_class(d, "DocumentTermMatrix")
expect_equal(dim(d), c(2, 4))
expect_equal(sort(tm::Docs(d)), sort(unique(dat$a)))
expect_equal(sort(tm::Terms(d)), sort(unique(dat$b)))
expect_equal(as.numeric(as.matrix(d[1:2, 1:2])), c(1, 3, 2, 0))
expect_equal(as.numeric(as.matrix(d[2, 3])), 4)
})
test_that("Can cast tables into a sparse TermDocumentMatrix", {
skip_if_not_installed("tm")
d <- cast_tdm(dat, b, a, val)
d2 <- cast_tdm(dat, "b", "a", "val")
expect_equal(d, d2)
expect_s3_class(d, "TermDocumentMatrix")
expect_equal(dim(d), c(4, 2))
expect_equal(sort(tm::Terms(d)), sort(unique(dat$b)))
})
test_that("Can cast tables into a sparse dfm", {
skip_if_not_installed("quanteda")
library(methods)
d <- cast_dfm(dat, a, b, val)
d2 <- cast_dfm(dat, "a", "b", "val")
expect_equal(d, d2)
expect_true(quanteda::is.dfm(d))
expect_equal(dim(d), c(2, 4))
expect_equal(as.numeric(d[1, 1]), 1)
expect_equal(as.numeric(d[2, 3]), 4)
})
tidytext/tests/testthat/test-unnest-tokens.R 0000644 0001762 0000144 00000030634 15040537137 021062 0 ustar ligges users suppressPackageStartupMessages(library(dplyr))
test_that("tokenizing by character works", {
d <- tibble(txt = "Emily Dickinson")
d <- d %>% unnest_tokens(char, txt, token = "characters")
expect_equal(nrow(d), 14)
expect_equal(ncol(d), 1)
expect_equal(d$char[1], "e")
})
test_that("tokenizing by character shingles works", {
d <- tibble(txt = "tidytext is the best")
d <- d %>% unnest_tokens(char_ngram, txt, token = "character_shingles", n = 4)
expect_equal(nrow(d), 14)
expect_equal(ncol(d), 1)
expect_equal(d$char_ngram[1], "tidy")
})
test_that("tokenizing by character shingles can include whitespace/punctuation", {
d <- tibble(txt = "tidytext is the best!")
d <- d %>%
unnest_tokens(
char_ngram,
txt,
token = "character_shingles",
strip_non_alphanum = FALSE
)
expect_equal(nrow(d), 19)
expect_equal(ncol(d), 1)
expect_equal(d$char_ngram[1], "tid")
})
test_that("tokenizing by word works", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
line = 1:2
)
d1 <- d %>% unnest_tokens(word, txt)
expect_equal(nrow(d1), 12)
expect_equal(ncol(d1), 2)
expect_equal(d1$word[1], "because")
d2 <- d %>% unnest_tokens(.data$word, .data$txt)
expect_equal(d1, d2)
d3 <- d %>% group_by(line) %>% unnest_tokens(word, txt)
expect_equal(d1, ungroup(d3))
})
test_that("tokenizing errors with appropriate error message", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
expect_snapshot_error(
d %>% unnest_tokens(word, txt, token = "word")
)
})
test_that("tokenizing by sentence works", {
orig <- tibble(
txt = c(
"I'm Nobody! Who are you?",
"Are you - Nobody - too?",
"Then there’s a pair of us!",
"Don’t tell! they’d advertise - you know!"
)
)
d <- orig %>% unnest_tokens(sentence, txt, token = "sentences")
expect_equal(nrow(d), 6)
expect_equal(ncol(d), 1)
expect_equal(d$sentence[1], "i'm nobody!")
# check it works when there are multiple columns
orig$line <- c(1, 1, 2, 2)
orig$other_line <- c("a", "a", "b", "b")
d <- orig %>% unnest_tokens(sentence, txt, token = "sentences")
expect_type(d$sentence, "character")
expect_equal(d$sentence[1], "i'm nobody!")
})
test_that("tokenizing by ngram and skip ngram works", {
d <- tibble(
txt = c(
"Hope is the thing with feathers",
"That perches in the soul",
"And sings the tune without the words",
"And never stops at all ",
"And sweetest in the Gale is heard ",
"And sore must be the storm ",
"That could abash the little Bird",
"That kept so many warm ",
"I’ve heard it in the chillest land ",
"And on the strangest Sea ",
"Yet never in Extremity,",
"It asked a crumb of me."
),
line = c(rep(1, 6), rep(2, 6))
)
# tokenize by ngram
d1 <- d %>% unnest_tokens(ngram, txt, token = "ngrams", n = 2)
# expect_equal(nrow(d), 68) does not pass on appveyor
expect_equal(ncol(d1), 2)
expect_equal(d1$ngram[1], "hope is")
expect_equal(d1$ngram[10], "and sings")
d2 <- d %>%
unnest_tokens(ngram, txt, token = "ngrams", n = 2, collapse = "line")
d3 <- d %>%
group_by(line) %>%
unnest_tokens(ngram, txt, token = "ngrams", n = 2)
expect_equal(d2, ungroup(d3))
expect_equal(ncol(d2), 2)
expect_equal(d2$ngram[4], "thing with")
expect_equal(d2$ngram[40], "little bird")
expect_error(
d %>%
group_by(line) %>%
unnest_tokens(ngram, txt, token = "ngrams", n = 2, collapse = "line"),
"Use the `collapse` argument"
)
# tokenize by skip_ngram
d2 <- d %>% unnest_tokens(ngram, txt, token = "skip_ngrams", n = 4, k = 2)
# expect_equal(nrow(d), 189) does not pass on appveyor
expect_equal(ncol(d2), 2)
expect_equal(d2$ngram[30], "is thing with")
expect_equal(d2$ngram[300], "sore must storm")
})
test_that("tokenizing with a custom function works", {
orig <- tibble(
txt = c(
"I'm Nobody! Who are you?",
"Are you - Nobody - too?",
"Then there’s a pair of us!",
"Don’t tell! they’d advertise - you know!"
),
group = "all"
)
d <- orig %>%
unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ")
expect_equal(nrow(d), 7)
expect_equal(d$unit[3], "nobody")
expect_equal(d$unit[4], "too?")
d2 <- orig %>%
unnest_tokens(
unit,
txt,
token = stringr::str_split,
pattern = " - ",
collapse = "group"
)
expect_equal(nrow(d2), 4)
expect_equal(d2$unit[2], "nobody")
expect_equal(d2$unit[4], "you know!")
})
test_that("tokenizing with standard evaluation works", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
d <- d %>% unnest_tokens("word", "txt")
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "because")
})
test_that("tokenizing with tidyeval works", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
outputvar <- quo(word)
inputvar <- quo(txt)
d <- d %>% unnest_tokens(!!outputvar, !!inputvar)
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "because")
})
test_that("tokenizing with to_lower = FALSE works", {
orig <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
d <- orig %>% unnest_tokens(word, txt, to_lower = FALSE)
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "Because")
d2 <- orig %>%
unnest_tokens(ngram, txt, token = "ngrams", n = 2, to_lower = FALSE)
expect_equal(nrow(d2), 10)
expect_equal(ncol(d2), 1)
expect_equal(d2$ngram[1], "Because I")
d <- tibble(txt = "Emily Dickinson")
d <- unnest_tokens(
d,
char_ngram,
txt,
token = "character_shingles",
to_lower = FALSE,
n = 5
)
expect_equal(nrow(d), 10)
expect_equal(ncol(d), 1)
expect_equal(d$char_ngram[1], "Emily")
})
test_that("unnest_tokens raises an error if custom tokenizer gives bad output", {
d <- tibble(txt = "Emily Dickinson")
expect_snapshot_error(
unnest_tokens(d, word, txt, token = function(e) c("a", "b"))
)
expect_snapshot_error(
unnest_tokens(d, word, txt, token = function(e) list("a", "b"))
)
})
test_that("tokenizing HTML works", {
skip_if_not_installed("hunspell")
h <- tibble(
row = 1:2,
text = c("
Text is", "here")
)
res1 <- unnest_tokens(h, word, text)
expect_gt(nrow(res1), 3)
expect_equal(res1$word[1], "h1")
res2 <- unnest_tokens(h, word, text, format = "html")
expect_equal(nrow(res2), 3)
expect_equal(res2$word, c("text", "is", "here"))
expect_equal(res2$row, c(1, 1, 2))
})
test_that("tokenizing LaTeX works", {
skip_if_not_installed("hunspell")
h <- tibble(
row = 1:4,
text = c(
"\\textbf{text} \\emph{is}",
"\\begin{itemize}",
"\\item here",
"\\end{itemize}"
)
)
res1 <- unnest_tokens(h, word, text)
expect_gt(nrow(res1), 3)
expect_equal(res1$word[1], "textbf")
res2 <- unnest_tokens(h, word, text, format = "latex")
expect_equal(nrow(res2), 3)
expect_equal(res2$word, c("text", "is", "here"))
expect_equal(res2$row, c(1, 1, 3))
})
test_that("Tokenizing a one-column data.frame works", {
text <- data.frame(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
stringsAsFactors = FALSE
)
d <- unnest_tokens(text, word, txt)
expect_s3_class(d, "data.frame")
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "because")
})
test_that("Tokenizing a two-column data.frame with one non-text column works", {
text <- data.frame(
line = 1:2,
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
stringsAsFactors = FALSE
)
d <- unnest_tokens(text, word, txt)
expect_s3_class(d, "data.frame")
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 2)
expect_equal(d$word[1], "because")
expect_equal(d$line[1], 1)
})
test_that("Tokenizing with NA values in columns behaves as expected", {
text <- tibble(
line = c(1:2, NA),
txt = c(
NA,
"Because I could not stop for Death -",
"He kindly stopped for me -"
)
)
d <- unnest_tokens(text, word, txt)
expect_s3_class(d, "data.frame")
expect_equal(nrow(d), 13)
expect_equal(ncol(d), 2)
expect_equal(d$word[2], "because")
expect_equal(d$line[1], 1)
expect_true(is.na(d$line[10]))
expect_true(is.na(d$word[1]))
})
test_that("Trying to tokenize a non-text format with words raises an error", {
d <- tibble(txt = "Emily Dickinson")
expect_error(
unnest_tokens(d, word, txt, token = "sentences", format = "latex"),
"except words"
)
})
test_that("unnest_tokens keeps top-level attributes", {
# first check data.frame
d <- data.frame(
row = 1:2,
txt = c("Call me Ishmael.", "OK, I will."),
stringsAsFactors = FALSE
)
lst <- list(1, 2, 3, 4)
attr(d, "custom") <- lst
result <- unnest_tokens(d, word, txt)
expect_equal(attr(result, "custom"), lst)
# now tibble
d2 <- tibble::as_tibble(d)
attr(d2, "custom") <- list(1, 2, 3, 4)
result <- unnest_tokens(d2, word, txt)
expect_equal(attr(result, "custom"), lst)
})
test_that("Trying to tokenize a data.table works", {
skip_if_not_installed("data.table")
text <- data.table::data.table(
txt = "Write till my fingers look like a bouquet of roses",
author = "Watsky"
)
output <- unnest_tokens(text, word, txt)
expect_equal(ncol(output), 2)
expect_equal(nrow(output), 10)
expect_equal(output$word[1], "write")
expect_equal(output$author[1], "Watsky")
})
test_that("Can tokenize a data.table work when the input has only one column", {
skip_if_not_installed("data.table")
text <- data.table::data.table(
txt = "You gotta bring yourself your flowers now in showbiz"
)
output <- unnest_tokens(text, word, txt)
expect_equal(ncol(output), 1)
expect_equal(nrow(output), 9)
expect_equal(output$word[1], "you")
})
test_that("custom attributes are preserved for a data.table", {
skip_if_not_installed("data.table")
text <- data.table::data.table(
txt = "You gotta bring yourself your flowers now in showbiz"
)
attr(text, "testattr") <- list(1, 2, 3, 4)
output <- unnest_tokens(text, word, txt)
expect_equal(ncol(output), 1)
expect_equal(nrow(output), 9)
expect_equal(output$word[1], "you")
expect_equal(attr(output, "testattr"), list(1, 2, 3, 4))
})
test_that("Tokenizing a data frame with list columns works", {
df <- data.frame(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
line = 1L:2L,
stringsAsFactors = FALSE
)
df$list_col <- list(1L:3L, c("a", "b"))
ret <- unnest_tokens(df, word, txt)
expect_s3_class(ret, "data.frame")
expect_type(ret$line, "integer")
expect_type(ret$list_col, "list")
expect_type(ret$list_col[[1]], "integer")
# 7 items of length 3, 5 items of length 2
expect_equal(lengths(ret$list_col), rep(c(3, 2), c(7, 5)))
})
test_that("Tokenizing a tbl_df with list columns works", {
df <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
line = 1L:2L,
list_col = list(1L:3L, c("a", "b"))
)
ret <- unnest_tokens(df, word, txt)
expect_s3_class(ret, "tbl_df")
expect_type(ret$line, "integer")
expect_type(ret$list_col, "list")
expect_type(ret$list_col[[1]], "integer")
# 7 items of length 3, 5 items of length 2
expect_equal(lengths(ret$list_col), rep(c(3, 2), c(7, 5)))
})
test_that("Can't tokenize with list columns with collapse = TRUE", {
df <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
line = 1L:2L,
list_col = list(1L:3L, c("a", "b"))
)
expect_error(
unnest_tokens(df, word, txt, token = "sentences", collapse = "line"),
"to be atomic vectors"
)
# Can tokenize by sentence without collapsing
# though it sort of defeats the purpose
ret <- unnest_tokens(df, word, txt, token = "sentences", collapse = NULL)
expect_equal(nrow(ret), 2)
})
test_that("tokenizing tweets is deprecated", {
d <- tibble(
txt = c(
"Because I could not stop for Death -",
"He kindly stopped for me -"
),
line = 1:2
)
expect_snapshot_error(d1 <- d %>% unnest_tokens(word, txt, token = "tweets"))
})
tidytext/tests/testthat/test-unnest-sentence.R 0000644 0001762 0000144 00000002441 15040537137 021356 0 ustar ligges users test_that("unnest_sentences works", {
orig <- tibble(
txt = c(
"I'm Nobody! Who are you?",
"Are you - Nobody - too?",
"Then there’s a pair of us!",
"Don’t tell! they’d advertise - you know!"
)
)
d <- orig %>% unnest_tokens(sentence, txt, token = "sentences")
r <- orig %>% unnest_sentences(sentence, txt)
expect_equal(d, r)
d <- orig %>%
unnest_tokens(sentence, txt, token = "sentences", strip_punct = TRUE)
r <- orig %>% unnest_sentences(sentence, txt, strip_punct = TRUE)
expect_equal(d, r)
})
test_that("unnest_lines works", {
orig <- tibble(
txt = c(
"I'm Nobody! Who are you?",
"Are you - Nobody - too?",
"Then there’s a pair of us!",
"Don’t tell! they’d advertise - you know!"
)
)
d <- orig %>% unnest_tokens(sentence, txt, token = "lines")
r <- orig %>% unnest_lines(sentence, txt)
expect_equal(d, r)
})
test_that("unnest_paragraphs works", {
orig <- tibble(
txt = c(
"I'm Nobody! \n\nWho are you?",
"Are you - \n\nNobody - too?",
"Then there’s \n\na pair of us!",
"Don’t tell! \n\nthey’d advertise - you know!"
)
)
d <- orig %>% unnest_tokens(sentence, txt, token = "paragraphs")
r <- orig %>% unnest_paragraphs(sentence, txt)
expect_equal(d, r)
})
tidytext/tests/testthat/test-sentiments.R 0000644 0001762 0000144 00000001014 14125370165 020423 0 ustar ligges users suppressPackageStartupMessages(library(dplyr))
test_data <- tibble(
line = 1:2,
text = c(
"I am happy and joyful",
"I am sad and annoyed"
)
)
test_tokens <- unnest_tokens(test_data, word, text)
test_that("get_sentiments works for bing data", {
bing_joined <- test_tokens %>%
inner_join(get_sentiments("bing"), by = "word")
expect_equal(bing_joined$word, c("happy", "joyful", "sad", "annoyed"))
expect_equal(
bing_joined$sentiment,
c("positive", "positive", "negative", "negative")
)
})
tidytext/tests/testthat/test-stm-tidiers.R 0000644 0001762 0000144 00000014146 15040537137 020511 0 ustar ligges users skip_if_not_installed("stm")
suppressPackageStartupMessages(library(dplyr))
library(stm)
dat <- tibble(
document = c("row1", "row1", "row2", "row2", "row2"),
term = c("col1", "col2", "col1", "col3", "col4"),
n = 1:5
)
m <- cast_sparse(dat, document, term)
stm_model <- stm(m, seed = 1234, K = 3, verbose = FALSE)
temp <- textProcessor(
documents = gadarian[1:10, ]$open.ended.response,
metadata = gadarian[1:10, ],
verbose = FALSE
)
out <- prepDocuments(temp$documents, temp$vocab, temp$meta, verbose = F)
stm_model_cov <- stm(
out$documents,
out$vocab,
K = 3,
content = out$meta$treatment,
seed = 123,
max.em.its = 3,
verbose = FALSE
)
test_that("can tidy beta matrix", {
td <- tidy(stm_model, matrix = "beta")
td_cov <- tidy(stm_model_cov, matrix = "beta")
expect_s3_class(td, "tbl_df")
expect_s3_class(td_cov, "tbl_df")
expect_equal(colnames(td), c("topic", "term", "beta"))
expect_equal(colnames(td_cov), c("topic", "term", "beta", "y.level"))
expect_type(td$term, "character")
expect_type(td$beta, "double")
expect_type(td_cov$y.level, "character")
expect_equal(unique(td$topic), 1:3)
expect_equal(unique(td_cov$y.level), c("0", "1"))
expect_gt(nrow(td), 10)
expect_true(all(c("col1", "col2", "col3") %in% td$term))
# all betas sum to 1
summ <- td %>%
count(topic, wt = beta)
expect_lt(max(abs(summ$n - 1)), .000001)
summ_cov <- td_cov %>%
count(topic, y.level, wt = beta)
expect_lt(max(abs(summ_cov$n - 1)), .000001)
td_log <- tidy(stm_model, matrix = "beta", log = TRUE)
expect_true(all(td_log$beta <= 0))
td_cov_log <- tidy(stm_model_cov, matrix = "beta", log = TRUE)
expect_true(all(td_cov_log$beta <= 0))
})
test_that("can tidy gamma matrix", {
td <- tidy(stm_model, matrix = "gamma")
expect_s3_class(td, "tbl_df")
expect_equal(colnames(td), c("document", "topic", "gamma"))
expect_type(td$document, "integer")
expect_type(td$gamma, "double")
expect_equal(nrow(td), 6)
expect_equal(unique(td$topic), 1:3)
expect_equal(unique(td$document), 1:2)
# all gammas sum to 1
summ <- td %>%
count(document, wt = gamma)
expect_lt(max(abs(summ$n - 1)), 1e-6)
td_log <- tidy(stm_model, matrix = "gamma", log = TRUE)
expect_true(all(td_log$gamma <= 0))
})
test_that("can tidy frex + lift matrix", {
td <- tidy(stm_model_cov, matrix = "frex")
expect_s3_class(td, "tbl_df")
expect_equal(colnames(td), c("topic", "term"))
expect_type(td$term, "character")
expect_equal(nrow(td), 60)
expect_equal(unique(td$topic), 1:3)
logbeta <- stm_model_cov$beta$logbeta[[1]]
word_counts <- stm_model_cov$settings$dim$wcounts$x
vocab <- stm_model_cov$vocab
td2 <- tidy(stm_model_cov, matrix = "frex", w = 1)
frex_stm <- stm::calcfrex(logbeta, w = 1, word_counts)
expect_equal(td2, tidytext:::pivot_stm_longer(frex_stm, vocab))
td3 <- tidy(stm_model_cov, matrix = "lift")
expect_equal(colnames(td3), c("topic", "term"))
lift_stm <- stm::calclift(logbeta, word_counts)
expect_equal(td3, tidytext:::pivot_stm_longer(lift_stm, vocab))
})
test_that("can augment an stm output", {
skip_if_not_installed("quanteda")
au <- augment(stm_model, dat)
expect_s3_class(au, "tbl_df")
expect_equal(colnames(au), c(colnames(dat), ".topic"))
expect_equal(sort(unique(au$.topic)), 1:3)
# augment output should have same document-term combinations
s <- arrange(au, document, term)
s2 <- dat %>%
arrange(document, term)
expect_equal(s$term, s2$term)
expect_equal(s$document, s2$document)
# can include extra columns
inaug_tidied2 <- dat %>%
mutate(starts_c = stringr::str_detect(term, "^c"))
au2 <- augment(stm_model, data = inaug_tidied2)
expect_equal(au$document, au2$document)
expect_equal(au$term, au2$term)
expect_type(au2$starts_c, "logical")
expect_equal(stringr::str_detect(au2$term, "^c"), au2$starts_c)
})
test_that("can glance an stm output", {
g <- glance(stm_model)
expect_s3_class(g, "tbl_df")
expect_equal(nrow(g), 1)
expect_equal(g$terms, 4)
})
stm_estimate_one_topic <- estimateEffect(
c(1) ~ treatment,
gadarianFit,
gadarian
)
test_that("can tidy estimateEffect object with one topic", {
td <- tidy(stm_estimate_one_topic)
expect_s3_class(td, "tbl_df")
expect_equal(
colnames(td),
c("topic", "term", "estimate", "std.error", "statistic", "p.value")
)
expect_type(td$topic, "integer")
expect_type(td$term, "character")
expect_type(td$estimate, "double")
expect_type(td$std.error, "double")
expect_type(td$statistic, "double")
expect_type(td$p.value, "double")
expect_equal(unique(td$topic), 1)
expect_equal(nrow(td), 2)
expect_true(all(c("(Intercept)", "treatment") %in% td$term))
})
test_that("can glance estimateEffect object with one topic", {
gla <- glance(stm_estimate_one_topic)
expect_s3_class(gla, "tbl_df")
expect_equal(colnames(gla), c("k", "docs", "uncertainty"))
expect_type(gla$k, "integer")
expect_type(gla$docs, "integer")
expect_type(gla$uncertainty, "character")
expect_equal(nrow(gla), 1)
})
stm_estimate_three_topic_interaction <- estimateEffect(
c(1:3) ~ treatment * s(pid_rep),
gadarianFit,
gadarian
)
test_that("can tidy estimateEffect object with three topics and an interaction term", {
td <- tidy(stm_estimate_three_topic_interaction)
expect_s3_class(td, "tbl_df")
expect_equal(
colnames(td),
c("topic", "term", "estimate", "std.error", "statistic", "p.value")
)
expect_type(td$topic, "integer")
expect_type(td$term, "character")
expect_type(td$estimate, "double")
expect_type(td$std.error, "double")
expect_type(td$statistic, "double")
expect_type(td$p.value, "double")
expect_equal(unique(td$topic), c(1:3))
expect_equal(nrow(td), 42) # 14 term combinations for 3 topics
expect_true(
all(
c(
"(Intercept)",
"treatment",
"s(pid_rep)1",
"s(pid_rep)2",
"s(pid_rep)3",
"s(pid_rep)4",
"s(pid_rep)5",
"s(pid_rep)6",
"treatment:s(pid_rep)1",
"treatment:s(pid_rep)2",
"treatment:s(pid_rep)3",
"treatment:s(pid_rep)4",
"treatment:s(pid_rep)5",
"treatment:s(pid_rep)6"
) %in%
td$term
)
)
})
tidytext/tests/testthat/test-reorder-within.R 0000644 0001762 0000144 00000003306 15040537137 021203 0 ustar ligges users skip_if_not_installed("ggplot2")
suppressPackageStartupMessages(library(ggplot2))
test_that("Can reorder within", {
mtcars_reordered <- reorder_within(mtcars$cyl, mtcars$mpg, mtcars$vs)
expect_s3_class(mtcars_reordered, "factor")
expect_equal(length(levels(mtcars_reordered)), 5)
})
test_that("Can reorder within multiple variables", {
mtcars_reordered <- reorder_within(
mtcars$cyl,
mtcars$mpg,
list(mtcars$vs, mtcars$am)
)
expect_s3_class(mtcars_reordered, "factor")
expect_equal(length(levels(mtcars_reordered)), 7)
})
test_that("Can make a plot", {
p <- ggplot(mtcars, aes(reorder_within(vs, mpg, cyl), mpg)) +
geom_boxplot() +
scale_x_reordered() +
facet_wrap(~cyl, scales = "free_x")
expect_s3_class(p, "ggplot")
vdiffr::expect_doppelganger("reordered boxplot", p)
})
test_that("Can make a plot with custom labels", {
custom_labeler <- function(x) {
x %>%
stringr::str_replace("___[0-9]+$", "") %>%
stringr::str_replace("0", "ZERO")
}
p <- ggplot(mtcars, aes(reorder_within(vs, mpg, cyl), mpg)) +
geom_boxplot() +
scale_x_reordered(labels = custom_labeler) +
facet_wrap(~cyl, scales = "free_x")
expect_s3_class(p, "ggplot")
vdiffr::expect_doppelganger("custom label boxplot", p)
})
test_that("Can make a multi-facet plot", {
expect_doppelganger <- function(title, fig, ...) {
testthat::skip_if_not_installed("vdiffr")
vdiffr::expect_doppelganger(title, fig, ...)
}
p <- ggplot(mtcars, aes(reorder_within(carb, mpg, list(vs, am)), mpg)) +
geom_boxplot() +
scale_x_reordered() +
facet_wrap(vs ~ am, scales = "free_x")
expect_s3_class(p, "ggplot")
expect_doppelganger("reordered multi-facet boxplot", p)
})
tidytext/tests/testthat.R 0000644 0001762 0000144 00000000074 14040415714 015256 0 ustar ligges users library(testthat)
library(tidytext)
test_check("tidytext")
tidytext/tests/figs/ 0000755 0001762 0000144 00000000000 14040415714 014222 5 ustar ligges users tidytext/tests/figs/reorder-within/ 0000755 0001762 0000144 00000000000 14040415714 017164 5 ustar ligges users tidytext/tests/figs/reorder-within/reordered-multi-facet-boxplot.svg 0000644 0001762 0000144 00000070470 14040415714 025565 0 ustar ligges users
tidytext/tests/figs/reorder-within/reordered-boxplot.svg 0000644 0001762 0000144 00000032745 14040415714 023360 0 ustar ligges users
tidytext/tests/figs/deps.txt 0000644 0001762 0000144 00000000103 14040415714 015710 0 ustar ligges users - vdiffr-svg-engine: 1.0
- vdiffr: 0.3.1
- freetypeharfbuzz: 0.2.5
tidytext/MD5 0000644 0001762 0000144 00000014445 15040607522 012451 0 ustar ligges users 36992ba444566dddce18763e39a094d3 *DESCRIPTION
60be9711aa00f169af385937d886c2d4 *LICENSE
9300e27fe53d9f9b4b6765ee26046c6e *NAMESPACE
8fbab3390ac5909ffb537aea3792174e *NEWS.md
92e419a5f7ff36bda83f5cc9424ce6a4 *R/bind_tf_idf.R
1966f6b5a179cacc1fec240533ded131 *R/compat_lazyeval.R
252cfe7d7d0e6e79cf2e4a087eb963b1 *R/corpus_tidiers.R
f432841e6f47586a8fed1a661c9a41bd *R/dictionary_tidiers.R
d5faff098a19a8c3396d50517732aab4 *R/globals.R
cb92a785f3a454ae286f1ec5bdbea49d *R/lda_tidiers.R
f794e1ff3b0fff1b83f20b9946e3bf1a *R/mallet_tidiers.R
cdf1e39e258b54793e1bbe7b1bbf8608 *R/nma_words.R
c097b36d16384f063504cc1019cd9fc8 *R/parts_of_speech.R
069f8943c27594fc32c8f1c247a54143 *R/reorder_within.R
ca8adef3dba68210b5724734f7cceaa0 *R/sentiments.R
65a6f308ea791ac95c6d3e28e78bf70a *R/sparse_casters.R
7d81a9317f810fceff22c6c3be453fa4 *R/sparse_tidiers.R
501dbfa91b7d2eaf86faf83e0fd4db2e *R/stm_tidiers.R
7ec036bac773ec0f1c991fd52cd755bd *R/stop_words.R
b4fe12876c37f8a22cde06e9701eecf0 *R/tidytext.R
1b54ce94688717cfc7d4e4ec0bc252c7 *R/unnest_char.R
8aa79581477109844387dfe6f4d72578 *R/unnest_ngrams.R
fc95c6d1051005f86deb35fd2a530b6e *R/unnest_ptb.R
05401cb40cf5e51d0001091908c35ef1 *R/unnest_regex.R
4206f1f3c7ccc21b51c40d86df26ef97 *R/unnest_sentence.R
630a37b7306b8d4d6c717f7e7542df37 *R/unnest_tokens.R
af40adb02e58929170c3ddf03f531380 *R/unnest_tweets.R
092367125c027ea0d59abf088ade4986 *README.md
b40cf7ec671ff1a993917486f998e9be *build/vignette.rds
dffbb0fe70f8b13246e7a5ca78c17780 *data/nma_words.rda
f30278ae776165a5ef03124f7fa2323e *data/parts_of_speech.rda
01b0bbb85f2aed8ea8c7b7d9c091ef3d *data/sentiments.rda
fb19b7e42bcbff432db19d6f1ba11426 *data/stop_words.rda
33f63d59a42106e6a7e32ec946ebb92a *inst/CITATION
9fae95ea7d1e732cb8b800903280c1b8 *inst/doc/tf_idf.R
575f3091a2ced9448f4d2729f7c8b1da *inst/doc/tf_idf.Rmd
d09505e6dc72fcc710f5737559ddcbca *inst/doc/tf_idf.html
d42b3f744753a5c9e06faf5fecd9e463 *inst/doc/tidying_casting.R
4f025b0d0e7162893f17c14e74de38cd *inst/doc/tidying_casting.Rmd
e291ab89188271596c4d06023462d298 *inst/doc/tidying_casting.html
41f7e5e8b602d577de0bfbcf08332e05 *inst/doc/tidytext.R
0417e4fb22b6da67c8e8c1d8a299371d *inst/doc/tidytext.Rmd
0a5d71ef955112365e09433718c974e2 *inst/doc/tidytext.html
9321a877f1adad7b317b01c13a055885 *inst/extdata/books.rda
ed20d65427b30e3e08373bcedf22723a *man/bind_tf_idf.Rd
a60452d9ad778987a816e67222d0d3bc *man/cast_sparse.Rd
f608e65cf5d9ef815eb5e09a1ea3abe3 *man/corpus_tidiers.Rd
514e30c2a28edae3d2d3ca5b411b7a5b *man/dictionary_tidiers.Rd
452e7cfb9239c4459ae15261f6f3de27 *man/document_term_casters.Rd
63c88f6d3a87d9900f5b99c2796d9f04 *man/figures/README-unnamed-chunk-13-1.png
8448afe1ed27dad50b2d718b3493049d *man/figures/README-unnamed-chunk-9-1.png
4609d1cbcdf21b226da16fa22083929e *man/figures/logo.png
4609d1cbcdf21b226da16fa22083929e *man/figures/tidytext.png
55aee80c1cd4ff6eb58ec65a38f37283 *man/get_sentiments.Rd
e5923391bb08be5e8f20f7ded63ac44d *man/get_stopwords.Rd
c87a8a2fc840ff09be9c790356536398 *man/lda_tidiers.Rd
726966b65918bc6b2449de0815bf1a28 *man/mallet_tidiers.Rd
bd7b27d67e222f37ea76d9d9eda4547a *man/nma_words.Rd
c1b14aa4882883762ff5d8f740660fdf *man/parts_of_speech.Rd
d257672b36790f46d9f0ca7d1af74965 *man/reexports.Rd
453458aa885bd744717e5c777dcdc110 *man/reorder_within.Rd
94e52c0200accdb3e393a473e8aafe91 *man/sentiments.Rd
5f76560fcedb346fcb812a0659717828 *man/stm_tidiers.Rd
e3b28f2b572a8f8d68f6d520d05ba4f1 *man/stop_words.Rd
a6137ec5cc62317c5672da2772db29eb *man/tdm_tidiers.Rd
c96d971bca4c762574c202d632235314 *man/tidy.Corpus.Rd
752c94c4e9dc1fa5c81e6aabf978c880 *man/tidy_triplet.Rd
0548a9073b3b99a54ef4885192d374b9 *man/tidytext-package.Rd
0606da3e4b505e7f50aa13e2cca2306e *man/unnest_character.Rd
6fee6b39cafe63b969e518fce3a169d2 *man/unnest_ngrams.Rd
8f86c0ba1650b33d899f1a78eae7dfee *man/unnest_ptb.Rd
73cbd9e84733407105b1ee7df93dd3ca *man/unnest_regex.Rd
6f3d50c26903ade7750a52a528b67cdb *man/unnest_sentences.Rd
8e21d610a673c1795c3f02f885250a0c *man/unnest_tokens.Rd
9ccbd53daeade9a0c592c89c2a234506 *man/unnest_tweets.Rd
37455b724fc4126f1b8305cc5c05f444 *tests/figs/deps.txt
7eb533dd73c7a6f648a4fcf7b415ac4c *tests/figs/reorder-within/reordered-boxplot.svg
8588888689f0bb9348fcfc6fa4d15135 *tests/figs/reorder-within/reordered-multi-facet-boxplot.svg
7adbc153be7478e81835769469351ee3 *tests/testthat.R
67aa1e2384ece4669363c4bf8da563f7 *tests/testthat/_snaps/reorder-within/custom-label-boxplot.svg
129bfe96e3e4d82a0555a19de64876ff *tests/testthat/_snaps/reorder-within/reordered-boxplot.svg
986b3f8a541ed644281f10d90af891f5 *tests/testthat/_snaps/reorder-within/reordered-multi-facet-boxplot.svg
141d20379cda632276b8f398084a2f04 *tests/testthat/_snaps/unnest-tokens.md
57991b54d912130b9e08d31ae3b0d345 *tests/testthat/_snaps/unnest_tweet.md
99be3ff2b5065e7af276502d9e7741dd *tests/testthat/helper-funs.R
b928b2f180449879fe00f939fdc298f8 *tests/testthat/reordered-boxplot.svg
e671cedecbe23e5aba87cdad7405753f *tests/testthat/test-corpus-tidiers.R
ac45fd55143efb505d381c6abf3ce16d *tests/testthat/test-dictionary-tidiers.R
fe5aa100d1aa2c5db2e620b5d24296e3 *tests/testthat/test-lda-tidiers.R
801ab823a9f67d0a51afc8a7bacb2d10 *tests/testthat/test-reorder-within.R
e9e41a257db1f3c2667ebe120bb7bf0e *tests/testthat/test-sentiments.R
a0314dadfb62dee876849c0fed9c9815 *tests/testthat/test-sparse-casters.R
1d05674e95fb0f3e658a6321966aec51 *tests/testthat/test-sparse-tidiers.R
4688151ae7317873cc5a0039d28e3f3a *tests/testthat/test-stm-tidiers.R
799f908a8327feb63f0176d8aba9e6d7 *tests/testthat/test-stop-words.R
50bea9416212f11df7a34560f2ee241c *tests/testthat/test-tf-idf.R
1385e257835821730c7818be488694b8 *tests/testthat/test-unnest-char.R
601652b93d47d9c83218d0a2a1d45779 *tests/testthat/test-unnest-sentence.R
c2a23aa36d84059cb1d21e26c9936ce4 *tests/testthat/test-unnest-tokens.R
a4bbfcd4c3170776d6993704fc82be29 *tests/testthat/test-unnest_ngrams.R
9f1e9280d7c1f3d20b95f280d3a4cc37 *tests/testthat/test-unnest_ptb.R
ee2d7dace4d87c3f4e2495c9bd8081d2 *tests/testthat/test-unnest_regex.R
619db5d3ab96b94633bf0c83dd5df1e0 *tests/testthat/test-unnest_tweet.R
d286cb85fbaedc9d40dd4ed8ca9c92b3 *tools/README-unnamed-chunk-13-1.png
350d9d5e91c9ba6658df2bd2d73de535 *tools/README-unnamed-chunk-9-1.png
4609d1cbcdf21b226da16fa22083929e *tools/tidytext.png
575f3091a2ced9448f4d2729f7c8b1da *vignettes/tf_idf.Rmd
4f025b0d0e7162893f17c14e74de38cd *vignettes/tidying_casting.Rmd
0417e4fb22b6da67c8e8c1d8a299371d *vignettes/tidytext.Rmd
tidytext/R/ 0000755 0001762 0000144 00000000000 14475733203 012341 5 ustar ligges users tidytext/R/unnest_sentence.R 0000644 0001762 0000144 00000004211 15040537071 015654 0 ustar ligges users #' Wrapper around unnest_tokens for sentences, lines, and paragraphs
#'
#' These functions are wrappers around `unnest_tokens( token = "sentences" )`
#' `unnest_tokens( token = "lines" )` and `unnest_tokens( token = "paragraphs" )`.
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams unnest_tokens
#' @inheritParams tokenizers::tokenize_sentences
#' @inheritParams tokenizers::tokenize_lines
#' @inheritParams tokenizers::tokenize_paragraphs
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @rdname unnest_sentences
#' @importFrom dplyr enquo
#'
#' @examples
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#'
#' d %>%
#' unnest_sentences(word, txt)
#'
unnest_sentences <- function(
tbl,
output,
input,
strip_punct = FALSE,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- match.arg(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "sentences",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
strip_punct = strip_punct,
...
)
}
#' @export
#' @rdname unnest_sentences
#' @importFrom dplyr enquo
unnest_lines <- function(
tbl,
output,
input,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- match.arg(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "lines",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
...
)
}
#' @export
#' @rdname unnest_sentences
#' @importFrom dplyr enquo
unnest_paragraphs <- function(
tbl,
output,
input,
paragraph_break = "\n\n",
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- match.arg(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "paragraphs",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
paragraph_break = paragraph_break,
...
)
}
tidytext/R/parts_of_speech.R 0000644 0001762 0000144 00000001553 14324024522 015622 0 ustar ligges users #' Parts of speech for English words from the Moby Project
#'
#' Parts of speech for English words from the Moby Project by Grady Ward.
#' Words with non-ASCII characters and items with a space have been removed.
#'
#' @format A data frame with 205,985 rows and 2 variables:
#' \describe{
#' \item{word}{An English word}
#' \item{pos}{The part of speech of the word. One of 13 options, such as
#' "Noun", "Adverb", "Adjective"}
#' }
#'
#' @details Another dataset of English parts of speech, available only for
#' non-commercial use, is available as part of SUBTLEXus at
#' .
#'
#' @examples
#'
#' library(dplyr)
#'
#' parts_of_speech
#'
#' parts_of_speech %>%
#' count(pos, sort = TRUE)
#'
#' @source
"parts_of_speech"
tidytext/R/bind_tf_idf.R 0000644 0001762 0000144 00000003746 15040537071 014717 0 ustar ligges users #' Bind the term frequency and inverse document frequency of a tidy text
#' dataset to the dataset
#'
#' Calculate and bind the term frequency and inverse document frequency of a
#' tidy text dataset, along with the product, tf-idf, to the dataset. Each of
#' these values are added as columns. This function supports non-standard
#' evaluation through the tidyeval framework.
#'
#' @param tbl A tidy text dataset with one-row-per-term-per-document
#' @param term Column containing terms as string or symbol
#' @param document Column containing document IDs as string or symbol
#' @param n Column containing document-term counts as string or symbol
#'
#' @details The arguments `term`, `document`, and `n`
#' are passed by expression and support [quasiquotation][rlang::quasiquotation];
#' you can unquote strings and symbols.
#'
#' If the dataset is grouped, the groups are ignored but are
#' retained.
#'
#' The dataset must have exactly one row per document-term combination
#' for this to work.
#'
#' @examples
#'
#' library(dplyr)
#' library(janeaustenr)
#'
#' book_words <- austen_books() %>%
#' unnest_tokens(word, text) %>%
#' count(book, word, sort = TRUE)
#'
#' book_words
#'
#' # find the words most distinctive to each document
#' book_words %>%
#' bind_tf_idf(word, book, n) %>%
#' arrange(desc(tf_idf))
#'
#' @export
bind_tf_idf <- function(tbl, term, document, n) {
term <- quo_name(enquo(term))
document <- quo_name(enquo(document))
n_col <- quo_name(enquo(n))
terms <- as.character(tbl[[term]])
documents <- as.character(tbl[[document]])
n <- tbl[[n_col]]
doc_totals <- tapply(n, documents, sum)
idf <- log(length(doc_totals) / table(terms))
tbl$tf <- n / as.numeric(doc_totals[documents])
tbl$idf <- as.numeric(idf[terms])
tbl$tf_idf <- tbl$tf * tbl$idf
if (any(tbl$idf < 0, na.rm = TRUE)) {
rlang::warn(
paste(
"A value for tf_idf is negative:\n",
"Input should have exactly one row per document-term combination."
)
)
}
tbl
}
tidytext/R/nma_words.R 0000644 0001762 0000144 00000000717 14324024522 014450 0 ustar ligges users #' English negators, modals, and adverbs
#'
#' English negators, modals, and adverbs, as a data frame. A few of these
#' entries are two-word phrases instead of single words.
#'
#' @format A data frame with 44 rows and 2 variables:
#' \describe{
#' \item{word}{An English word or bigram}
#' \item{modifier}{The modifier type for `word`, either "negator",
#' "modal", or "adverb"}
#' }
#'
#' @source
"nma_words"
tidytext/R/stm_tidiers.R 0000644 0001762 0000144 00000015435 15040537226 015016 0 ustar ligges users #' Tidiers for Structural Topic Models from the stm package
#'
#' Tidy topic models fit by the stm package. The arguments and return values
#' are similar to [lda_tidiers()].
#'
#' @param x An STM fitted model object from either [stm::stm()] or
#' [stm::estimateEffect()]
#' @param matrix Which matrix to tidy:
#' - the beta matrix (per-term-per-topic, default)
#' - the gamma/theta matrix (per-document-per-topic); the stm package calls
#' this the theta matrix, but other topic modeling packages call this gamma
#' - the FREX matrix, for words with high frequency and exclusivity
#' - the lift matrix, for words with high lift
#' @param data For `augment`, the data given to the stm function, either
#' as a `dfm` from quanteda or as a tidied table with "document" and
#' "term" columns
#' @param log Whether beta/gamma/theta should be on a log scale, default FALSE
#' @param document_names Optional vector of document names for use with
#' per-document-per-topic tidying
#' @param ... Extra arguments for tidying, such as `w` as used in
#' [stm::calcfrex()]
#'
#' @seealso [lda_tidiers()], [stm::calcfrex()], [stm::calclift()]
#' @return `tidy` returns a tidied version of either the beta, gamma, FREX, or
#' lift matrix if called on an object from [stm::stm()], or a tidied version of
#' the estimated regressions if called on an object from [stm::estimateEffect()].
#'
#' @examplesIf interactive() || identical(Sys.getenv("IN_PKGDOWN"), "true")
#' library(dplyr)
#' library(ggplot2)
#' library(stm)
#' library(janeaustenr)
#'
#' austen_sparse <- austen_books() %>%
#' unnest_tokens(word, text) %>%
#' anti_join(stop_words) %>%
#' count(book, word) %>%
#' cast_sparse(book, word, n)
#' topic_model <- stm(austen_sparse, K = 12, verbose = FALSE)
#'
#' # tidy the word-topic combinations
#' td_beta <- tidy(topic_model)
#' td_beta
#'
#' # Examine the topics
#' td_beta %>%
#' group_by(topic) %>%
#' slice_max(beta, n = 10) %>%
#' ungroup() %>%
#' ggplot(aes(beta, term)) +
#' geom_col() +
#' facet_wrap(~ topic, scales = "free")
#'
#' # high FREX words per topic
#' tidy(topic_model, matrix = "frex")
#'
#' # high lift words per topic
#' tidy(topic_model, matrix = "lift")
#'
#' # tidy the document-topic combinations, with optional document names
#' td_gamma <- tidy(topic_model, matrix = "gamma",
#' document_names = rownames(austen_sparse))
#' td_gamma
#'
#' # using stm's gardarianFit, we can tidy the result of a model
#' # estimated with covariates
#' effects <- estimateEffect(1:3 ~ treatment, gadarianFit, gadarian)
#' glance(effects)
#' td_estimate <- tidy(effects)
#' td_estimate
#'
#' @name stm_tidiers
#'
#' @export
tidy.STM <- function(
x,
matrix = c("beta", "gamma", "theta", "frex", "lift"),
log = FALSE,
document_names = NULL,
...
) {
matrix <- rlang::arg_match(matrix)
switch(
matrix,
"beta" = tidy_stm_beta(x, log),
"frex" = tidy_stm_frex(x, ...),
"lift" = tidy_stm_lift(x),
tidy_stm_gamma(x, log, document_names)
)
}
tidy_stm_beta <- function(x, log) {
logbeta <- x$beta$logbeta
ret <- reshape2::melt(logbeta) %>%
tibble::as_tibble()
ret <- transmute(
ret,
topic = Var1,
term = x$vocab[Var2],
beta = value,
y.level = x$settings$covariates$yvarlevels[as.integer(L1)]
)
if (!log) {
ret$beta <- exp(ret$beta)
}
ret
}
tidy_stm_gamma <- function(x, log, document_names) {
mat <- x$theta
ret <- reshape2::melt(mat) %>%
tibble::as_tibble()
ret <- transmute(ret, document = Var1, topic = Var2, gamma = value)
if (!is.null(document_names)) {
ret$document <- document_names[ret$document]
}
if (log) {
ret$gamma <- log(ret$gamma)
}
ret
}
tidy_stm_frex <- function(x, ...) {
logbeta <- x$beta$logbeta[[1]]
word_counts <- x$settings$dim$wcounts$x
vocab <- x$vocab
frex <- stm::calcfrex(logbeta, ..., wordcounts = word_counts)
pivot_stm_longer(frex, vocab)
}
tidy_stm_lift <- function(x) {
logbeta <- x$beta$logbeta[[1]]
word_counts <- x$settings$dim$wcounts$x
vocab <- x$vocab
lift <- stm::calclift(logbeta, word_counts)
pivot_stm_longer(lift, vocab)
}
pivot_stm_longer <- function(x, vocab) {
rlang::check_installed("tidyr")
seq_ncol <- seq_len(ncol(x))
tibble::as_tibble(x, .name_repair = ~ paste0("___", seq_ncol)) %>%
tidyr::pivot_longer(
everything(),
names_to = "topic",
values_to = "term"
) %>%
transmute(
topic = as.integer(stringr::str_remove_all(topic, "___")),
term = vocab[term]
) %>%
arrange(topic)
}
#' @rdname stm_tidiers
#'
#' @export
tidy.estimateEffect <- function(x, ...) {
s <- summary(x)
topics <- s$topics
names(s$tables) <- s$topics
ret <- purrr::map_dfr(
s$tables,
dplyr::as_tibble,
rownames = "term",
.id = "topic"
)
ret$topic <- as.integer(ret$topic)
colnames(ret) <- c(
"topic",
"term",
"estimate",
"std.error",
"statistic",
"p.value"
)
ret
}
#' @rdname stm_tidiers
#' @return `glance` returns a tibble with exactly one row of model summaries.
#' @export
glance.estimateEffect <- function(x, ...) {
ret <- tibble(
k = length(x[['topics']]),
docs = nrow(x[['modelframe']]),
uncertainty = x[['uncertainty']]
)
ret
}
#' @rdname stm_tidiers
#'
#' @return `augment` must be provided a data argument, either a
#' `dfm` from quanteda or a table containing one row per original
#' document-term pair, such as is returned by [tdm_tidiers], containing
#' columns `document` and `term`. It returns that same data with an additional
#' column `.topic` with the topic assignment for that document-term combination.
#'
#' @importFrom generics augment
#'
#' @export
augment.STM <- function(x, data, ...) {
if (missing(data)) {
stop("data argument must be provided in order to augment a stm model")
}
if (
inherits(data, "data.frame") &&
(all(c("document", "term") %in% colnames(data)))
) {
data$value <- 1
mat <- cast_dfm(data, document, term, value)
data$value <- NULL
} else if (inherits(data, "dfm")) {
mat <- data
data <- tidy(mat)
} else {
stop(
"data argument must either be a dfm ",
"(from quanteda) or a table with document and term columns"
)
}
beta <- t(as.matrix(x$beta$logbeta[[1]]))
theta <- x$theta
term_indices <- match(data$term, x$vocab)
doc_indices <- match(data$document, rownames(mat))
products <- exp(beta[term_indices, ]) * theta[doc_indices, ]
keep <- !is.na(term_indices) & !is.na(doc_indices)
data$.topic <- NA
data$.topic[keep] <- apply(products[keep, ], 1, which.max)
data
}
#' @rdname stm_tidiers
#' @export
glance.STM <- function(x, ...) {
ret <- tibble(
k = as.integer(x$settings$dim$K),
docs = x$settings$dim$N,
terms = x$settings$dim$V,
iter = length(x$convergence$bound),
alpha = x$settings$init$alpha
)
ret
}
#' @export
generics::augment
tidytext/R/unnest_tokens.R 0000644 0001762 0000144 00000015733 15040537226 015370 0 ustar ligges users #' Split a column into tokens
#'
#' Split a column into tokens, flattening the table into one-token-per-row.
#' This function supports non-standard evaluation through the tidyeval framework.
#'
#' @param tbl A data frame
#'
#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
#' options are "words" (default), "characters", "character_shingles", "ngrams",
#' "skip_ngrams", "sentences", "lines", "paragraphs", "regex", and
#' "ptb" (Penn Treebank). If a function, should take a character vector and
#' return a list of character vectors of the same length.
#'
#' @param format Either "text", "man", "latex", "html", or "xml". When the
#' format is "text", this function uses the tokenizers package. If not "text",
#' this uses the hunspell tokenizer, and can tokenize only by "word".
#'
#' @param to_lower Whether to convert tokens to lowercase.
#'
#' @param drop Whether original input column should get dropped. Ignored
#' if the original input and new output column have the same name.
#'
#' @param output Output column to be created as string or symbol.
#'
#' @param input Input column that gets split as string or symbol.
#'
#' The output/input arguments are passed by expression and support
#' [quasiquotation][rlang::quasiquotation]; you can unquote strings and symbols.
#'
#' @param collapse A character vector of variables to collapse text across,
#' or `NULL`.
#'
#' For tokens like n-grams or sentences, text can be collapsed across rows
#' within variables specified by `collapse` before tokenization. At tidytext
#' 0.2.7, the default behavior for `collapse = NULL` changed to be more
#' consistent. The new behavior is that text is _not_ collapsed for `NULL`.
#'
#' Grouping data specifies variables to collapse across in the same way as
#' `collapse` but you **cannot** use both the `collapse` argument and
#' grouped data. Collapsing applies mostly to `token` options of "ngrams",
#' "skip_ngrams", "sentences", "lines", "paragraphs", or "regex".
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers], such
#' as `strip_punct` for "words", `n` and `k` for "ngrams" and "skip_ngrams",
#' and `pattern` for "regex".
#'
#' @details If format is anything other than "text", this uses the
#' [hunspell::hunspell_parse()] tokenizer instead of the tokenizers package.
#' This does not yet have support for tokenizing by any unit other than words.
#'
#' Support for `token = "tweets"` was removed in tidytext 0.4.0 because of
#' changes in upstream dependencies.
#'
#' @import dplyr
#' @import rlang
#' @import tokenizers
#' @import janeaustenr
#' @importFrom vctrs vec_rep_each
#' @importFrom vctrs vec_slice
#' @export
#'
#' @name unnest_tokens
#'
#' @examplesIf rlang::is_installed("hunspell")
#'
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#' d
#'
#' d %>%
#' unnest_tokens(output = word, input = txt)
#'
#' d %>%
#' unnest_tokens(output = sentence, input = txt, token = "sentences")
#'
#' d %>%
#' unnest_tokens(output = ngram, input = txt, token = "ngrams", n = 2)
#'
#' d %>%
#' unnest_tokens(chapter, txt, token = "regex", pattern = "Chapter [\\\\d]")
#'
#' d %>%
#' unnest_tokens(shingle, txt, token = "character_shingles", n = 4)
#'
#' # custom function
#' d %>%
#' unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")
#'
#' # tokenize HTML
#' h <- tibble(row = 1:2,
#' text = c("
Text is", "here"))
#'
#' h %>%
#' unnest_tokens(word, text, format = "html")
#'
unnest_tokens <- function(
tbl,
output,
input,
token = "words",
format = c(
"text",
"man",
"latex",
"html",
"xml"
),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
output <- enquo(output)
input <- enquo(input)
format <- arg_match(format)
tokenfunc <- find_function(token, format, to_lower, ...)
if (!is_null(collapse)) {
if (is_logical(collapse)) {
rlang::abort("`collapse` must be `NULL` or a character vector")
}
if (is_grouped_df(tbl)) {
rlang::abort("Use the `collapse` argument or grouped data, but not both.")
}
if (any(!purrr::map_lgl(tbl, is_atomic))) {
rlang::abort(
paste0(
"If collapse != NULL (such as for unnesting by sentence or paragraph),\n",
"unnest_tokens needs all input columns to be atomic vectors (not lists)"
)
)
}
tbl <- group_by(tbl, !!!syms(collapse))
}
if (is_grouped_df(tbl)) {
tbl <- tbl %>%
ungroup() %>%
mutate(new_groups = cumsum(c(1, diff(group_indices(tbl)) != 0))) %>%
group_by(new_groups, !!!groups(tbl)) %>%
summarise(!!input := stringr::str_c(!!input, collapse = "\n")) %>%
group_by(!!!groups(tbl)) %>%
dplyr::select(-new_groups)
if (!is_null(collapse)) {
tbl <- ungroup(tbl)
}
}
col <- pull(tbl, !!input)
output_lst <- tokenfunc(col, ...)
if (!(is.list(output_lst) && length(output_lst) == nrow(tbl))) {
rlang::abort(
sprintf(
"Expected output of tokenizing function to be a list of length %d",
nrow(tbl)
)
)
}
output <- quo_name(output)
input <- quo_name(input)
tbl_indices <- vec_rep_each(seq_len(nrow(tbl)), lengths(output_lst))
ret <- vec_slice(tbl, tbl_indices)
ret[[output]] <- purrr::list_c(output_lst)
if (to_lower) {
ret[[output]] <- stringr::str_to_lower(ret[[output]])
}
# For data.tables we want this to hit the result and be after the result
# has been assigned, just to make sure that we don't reduce the data.table
# to 0 rows before inserting the output.
if (drop && output != input) {
ret[[input]] <- NULL
}
ret
}
find_function <- function(token, format, to_lower, ...) {
if (is_function(token)) {
tokenfunc <- token
return(tokenfunc)
}
if (token %in% c("tweets", "tweet")) {
lifecycle::deprecate_stop(
"0.4.0",
I('Support for `token = "tweets"`')
)
}
if (
token %in%
c(
"word",
"character",
"character_shingle",
"ngram",
"skip_ngram",
"sentence",
"line",
"paragraph",
"tweet"
)
) {
cli::cli_abort(
c(
"Token must be a supported type, or a function that takes a character vector as input",
i = 'Did you mean `token = "{token}s"`?'
)
)
}
if (format != "text") {
if (token != "words") {
rlang::abort(
"Cannot tokenize by any unit except words when format is not text"
)
}
rlang::check_installed("hunspell")
tokenfunc <- function(col, ...) {
hunspell::hunspell_parse(
col,
format = format
)
}
} else {
tf <- get(paste0("tokenize_", token))
if (
token %in%
c(
"characters",
"character_shingles",
"words",
"ngrams",
"skip_ngrams",
"ptb"
)
) {
tokenfunc <- function(col, ...) tf(col, lowercase = to_lower, ...)
} else {
tokenfunc <- tf
}
}
tokenfunc
}
tidytext/R/tidytext.R 0000644 0001762 0000144 00000000041 14225646515 014337 0 ustar ligges users #' @keywords internal
"_PACKAGE"
tidytext/R/unnest_tweets.R 0000644 0001762 0000144 00000000641 15040537071 015366 0 ustar ligges users #' Wrapper around unnest_tokens for tweets
#'
#' `r lifecycle::badge("deprecated")`
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams unnest_tokens
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @keywords internal
#' @importFrom dplyr enquo
#'
unnest_tweets <- function(tbl, output, input, ...) {
lifecycle::deprecate_stop("0.4.0", "unnest_tweets()")
}
tidytext/R/sparse_tidiers.R 0000644 0001762 0000144 00000004542 15040537071 015503 0 ustar ligges users #' Tidy DocumentTermMatrix, TermDocumentMatrix, and related objects
#' from the tm package
#'
#' Tidy a DocumentTermMatrix or TermDocumentMatrix into
#' a three-column data frame: \code{term{}}, and value (with
#' zeros missing), with one-row-per-term-per-document.
#'
#' @importFrom generics tidy
#'
#' @param x A DocumentTermMatrix or TermDocumentMatrix object
#' @param row_names Specify row names
#' @param col_names Specify column names
#' @param ... Extra arguments, not used
#'
#' @name tdm_tidiers
#'
#' @examples
#'
#' if (requireNamespace("topicmodels", quietly = TRUE)) {
#' data("AssociatedPress", package = "topicmodels")
#' AssociatedPress
#'
#' tidy(AssociatedPress)
#' }
#'
#' @export
tidy.DocumentTermMatrix <- function(x, ...) {
ret <- tidy.simple_triplet_matrix(x, x$dimnames$Docs, x$dimnames$Terms)
colnames(ret) <- c("document", "term", "count")
ret
}
#' @rdname tdm_tidiers
#' @export
tidy.TermDocumentMatrix <- function(x, ...) {
ret <- tidy.simple_triplet_matrix(x, x$dimnames$Terms, x$dimnames$Docs)
colnames(ret) <- c("term", "document", "count")
ret
}
#' @rdname tdm_tidiers
#' @export
tidy.dfm <- tidy.dfmSparse <- function(x, ...) {
triplets <- Matrix::summary(methods::as(x, "TsparseMatrix"))
ret <- tidy_triplet(x, triplets)
colnames(ret) <- c("document", "term", "count")
ret
}
#' @rdname tdm_tidiers
#' @export
tidy.dfmSparse <- tidy.dfm
#' @rdname tdm_tidiers
#' @export
tidy.simple_triplet_matrix <- function(
x,
row_names = NULL,
col_names = NULL,
...
) {
triplets <- unclass(x)[c("i", "j", "v")]
names(triplets) <- c("i", "j", "x")
tidy_triplet(x, triplets, row_names, col_names)
}
#' Utility function to tidy a simple triplet matrix
#'
#' @param x Object with rownames and colnames
#' @param triplets A data frame or list of i, j, x
#' @param row_names rownames, if not gotten from rownames(x)
#' @param col_names colnames, if not gotten from colnames(x)
tidy_triplet <- function(x, triplets, row_names = NULL, col_names = NULL) {
row <- triplets$i
if (!is.null(row_names)) {
row <- row_names[row]
} else if (!is.null(rownames(x))) {
row <- rownames(x)[row]
}
col <- triplets$j
if (!is.null(col_names)) {
col <- col_names[col]
} else if (!is.null(colnames(x))) {
col <- colnames(x)[col]
}
ret <- tibble(row = row, column = col, value = triplets$x)
ret
}
#' @export
generics::tidy
tidytext/R/unnest_char.R 0000644 0001762 0000144 00000003451 15040537071 014772 0 ustar ligges users #' Wrapper around unnest_tokens for characters and character shingles
#'
#' These functions are a wrapper around `unnest_tokens( token = "characters" )`
#' and `unnest_tokens( token = "character_shingles" )`.
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams tokenizers::tokenize_characters
#' @inheritParams tokenizers::tokenize_character_shingles
#' @inheritParams unnest_tokens
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @importFrom dplyr enquo
#' @rdname unnest_character
#'
#' @examples
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#'
#' d %>%
#' unnest_characters(word, txt)
#'
#' d %>%
#' unnest_character_shingles(word, txt, n = 3)
#'
unnest_characters <- function(
tbl,
output,
input,
strip_non_alphanum = TRUE,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "characters",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
strip_non_alphanum = strip_non_alphanum,
...
)
}
#' @export
#' @importFrom dplyr enquo
#' @rdname unnest_character
#'
unnest_character_shingles <- function(
tbl,
output,
input,
n = 3L,
n_min = n,
strip_non_alphanum = TRUE,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "character_shingles",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
n = n,
n_min = n_min,
strip_non_alphanum = strip_non_alphanum,
...
)
}
tidytext/R/unnest_regex.R 0000644 0001762 0000144 00000001720 15040537071 015164 0 ustar ligges users #' Wrapper around unnest_tokens for regular expressions
#'
#' This function is a wrapper around `unnest_tokens( token = "regex" )`.
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams unnest_tokens
#' @inheritParams tokenizers::tokenize_regex
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @importFrom dplyr enquo
#'
#' @examples
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#'
#' d %>%
#' unnest_regex(word, txt, pattern = "Chapter [\\\\d]")
#'
unnest_regex <- function(
tbl,
output,
input,
pattern = "\\s+",
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
token = "regex",
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
pattern = pattern,
...
)
}
tidytext/R/mallet_tidiers.R 0000644 0001762 0000144 00000011113 15040537071 015454 0 ustar ligges users #' Tidiers for Latent Dirichlet Allocation models from the mallet package
#'
#' Tidy LDA models fit by the mallet package, which wraps the Mallet topic
#' modeling package in Java. The arguments and return values
#' are similar to [lda_tidiers()].
#'
#' @param x A jobjRef object, of type RTopicModel, such as created
#' by [mallet::MalletLDA()].
#' @param matrix Whether to tidy the beta (per-term-per-topic, default)
#' or gamma (per-document-per-topic) matrix.
#' @param data For `augment`, the data given to the LDA function, either
#' as a DocumentTermMatrix or as a tidied table with "document" and "term"
#' columns.
#' @param log Whether beta/gamma should be on a log scale, default FALSE
#' @param normalized If true (default), normalize so that each
#' document or word sums to one across the topics. If false, values will
#' be integers representing the actual number of word-topic or document-topic
#' assignments.
#' @param smoothed If true (default), add the smoothing parameter to each
#' to avoid any values being zero. This smoothing parameter is initialized
#' as `alpha.sum` in [mallet::MalletLDA()].
#' @param ... Extra arguments, not used
#'
#' @details Note that the LDA models from [mallet::MalletLDA()]
#' are technically a special case of S4 objects with class `jobjRef`.
#' These are thus implemented as `jobjRef` tidiers, with a check for
#' whether the `toString` output is as expected.
#'
#' @seealso [lda_tidiers()], [mallet::mallet.doc.topics()],
#' [mallet::mallet.topic.words()]
#'
#' @name mallet_tidiers
#'
#' @examples
#'
#' \dontrun{
#' library(mallet)
#' library(dplyr)
#'
#' data("AssociatedPress", package = "topicmodels")
#' td <- tidy(AssociatedPress)
#'
#' # mallet needs a file with stop words
#' tmp <- tempfile()
#' writeLines(stop_words$word, tmp)
#'
#' # two vectors: one with document IDs, one with text
#' docs <- td %>%
#' group_by(document = as.character(document)) %>%
#' summarize(text = paste(rep(term, count), collapse = " "))
#'
#' docs <- mallet.import(docs$document, docs$text, tmp)
#'
#' # create and run a topic model
#' topic_model <- MalletLDA(num.topics = 4)
#' topic_model$loadDocuments(docs)
#' topic_model$train(20)
#'
#' # tidy the word-topic combinations
#' td_beta <- tidy(topic_model)
#' td_beta
#'
#' # Examine the four topics
#' td_beta %>%
#' group_by(topic) %>%
#' top_n(8, beta) %>%
#' ungroup() %>%
#' mutate(term = reorder(term, beta)) %>%
#' ggplot(aes(term, beta)) +
#' geom_col() +
#' facet_wrap(~ topic, scales = "free") +
#' coord_flip()
#'
#' # find the assignments of each word in each document
#' assignments <- augment(topic_model, td)
#' assignments
#' }
#'
#' @export
tidy.jobjRef <- function(
x,
matrix = c("beta", "gamma"),
log = FALSE,
normalized = TRUE,
smoothed = TRUE,
...
) {
s <- x$toString()
if (!stringr::str_detect(s, "TopicModel")) {
stop("Don't know how to tidy jobjRef ", s)
}
matrix <- match.arg(matrix)
if (matrix == "beta") {
func <- mallet::mallet.topic.words
} else {
func <- mallet::mallet.doc.topics
}
m <- func(x, normalized = normalized, smoothed = smoothed)
ret <- dplyr::tbl_df(reshape2::melt(m))
if (matrix == "beta") {
# per term per topic
colnames(ret) <- c("topic", "term", "beta")
ret$term <- x$getVocabulary()[ret$term]
} else {
# per document per topic
colnames(ret) <- c("document", "topic", "gamma")
ret$document <- x$getDocumentNames()[ret$document]
}
if (log) {
ret[[matrix]] <- log(ret[[matrix]])
}
ret
}
#' @rdname mallet_tidiers
#'
#' @return `augment` must be provided a data argument containing
#' one row per original document-term pair, such as is returned by
#' [tdm_tidiers], containing columns `document` and `term`.
#' It returns that same data with an additional column
#' `.topic` with the topic assignment for that document-term combination.
#'
#' @importFrom generics augment
#'
#' @export
augment.jobjRef <- function(x, data, ...) {
s <- x$toString()
if (!stringr::str_detect(s, "TopicModel")) {
stop("Don't know how to augment jobjRef ", s)
}
if (missing(data)) {
stop("data argument must be provided in order to augment a mallet model")
}
beta <- t(mallet::mallet.topic.words(x, normalized = TRUE, smoothed = TRUE))
gamma <- mallet::mallet.doc.topics(x, normalized = TRUE, smoothed = TRUE)
term_indices <- match(data$term, x$getVocabulary())
doc_indices <- match(data$document, x$getDocumentNames())
products <- beta[term_indices, ] * gamma[doc_indices, ]
keep <- !is.na(term_indices) & !is.na(doc_indices)
data$.topic <- NA
data$.topic[keep] <- apply(products[keep, ], 1, which.max)
data
}
tidytext/R/corpus_tidiers.R 0000644 0001762 0000144 00000007621 15040537226 015524 0 ustar ligges users #' Tidy a Corpus object from the tm package
#'
#' Tidy a Corpus object from the tm package. Returns a data frame
#' with one-row-per-document, with a `text` column containing
#' the document's text, and one column for each local (per-document)
#' metadata tag. For corpus objects from the quanteda package,
#' see [tidy.corpus()].
#'
#' @param x A Corpus object, such as a VCorpus or PCorpus
#' @param collapse A string that should be used to
#' collapse text within each corpus (if a document has multiple lines).
#' Give NULL to not collapse strings, in which case a corpus
#' will end up as a list column if there are multi-line documents.
#' @param ... Extra arguments, not used
#'
#' @examples
#'
#' library(dplyr) # displaying tbl_dfs
#'
#' if (requireNamespace("tm", quietly = TRUE)) {
#' library(tm)
#' #' # tm package examples
#' txt <- system.file("texts", "txt", package = "tm")
#' ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
#' readerControl = list(language = "lat"))
#'
#' ovid
#' tidy(ovid)
#'
#' # choose different options for collapsing text within each
#' # document
#' tidy(ovid, collapse = "")$text
#' tidy(ovid, collapse = NULL)$text
#'
#' # another example from Reuters articles
#' reut21578 <- system.file("texts", "crude", package = "tm")
#' reuters <- VCorpus(DirSource(reut21578),
#' readerControl = list(reader = readReut21578XMLasPlain))
#' reuters
#'
#' tidy(reuters)
#' }
#'
#' @export
tidy.Corpus <- function(x, collapse = "\n", ...) {
local_meta <- NLP::meta(x, type = "local") %>%
purrr::transpose()
columns <- purrr::map(local_meta, function(m) {
lengths <- purrr::map_dbl(m, length)
if (any(lengths > 1)) {
# keep as a list column
return(m)
}
m <- purrr::map_at(m, which(lengths == 0), ~NA)
ret <- unname(do.call(c, m))
## tbl_df() doesn't support POSIXlt format
## https://github.com/hadley/dplyr/issues/1382
if (inherits(ret, "POSIXlt")) {
ret <- as.POSIXct(ret)
}
ret
})
ret <- as_tibble(columns)
# most importantly, add text
text <- purrr::map(as.list(x), as.character)
if (all(purrr::map(text, length) == 1)) {
text <- unlist(text)
} else if (!is.null(collapse)) {
text <- purrr::map_chr(text, stringr::str_c, collapse = collapse)
}
ret$text <- text
ret
}
#' Tidiers for a corpus object from the quanteda package
#'
#' Tidy a corpus object from the quanteda package. `tidy` returns a
#' tbl_df with one-row-per-document, with a `text` column containing
#' the document's text, and one column for each document-level metadata.
#' `glance` returns a one-row tbl_df with corpus-level metadata,
#' such as source and created. For Corpus objects from the tm package,
#' see [tidy.Corpus()].
#'
#' @param x A Corpus object, such as a VCorpus or PCorpus
#' @param ... Extra arguments, not used
#'
#' @importFrom generics glance
#'
#' @details For the most part, the `tidy` output is equivalent to the
#' "documents" data frame in the corpus object, except that it is converted
#' to a tbl_df, and `texts` column is renamed to `text`
#' to be consistent with other uses in tidytext.
#'
#' Similarly, the `glance` output is simply the "metadata" object,
#' with NULL fields removed and turned into a one-row tbl_df.
#'
#' @examples
#'
#' if (requireNamespace("quanteda", quietly = TRUE)) {
#' data("data_corpus_inaugural", package = "quanteda")
#'
#' data_corpus_inaugural
#'
#' tidy(data_corpus_inaugural)
#' }
#'
#' @name corpus_tidiers
#'
#' @export
tidy.corpus <- function(x, ...) {
tibble::as_tibble(
data.frame(
text = as.character(x),
quanteda::docvars(x),
stringsAsFactors = FALSE
)
)
}
#' @rdname corpus_tidiers
#' @export
glance.corpus <- function(x, ...) {
md <- purrr::compact(quanteda::meta(x))
# turn vectors into list columns
md <- purrr::map_if(md, ~ length(.) > 1, list)
as_tibble(md)
}
#' @export
generics::glance
tidytext/R/unnest_ngrams.R 0000644 0001762 0000144 00000003305 15040537071 015342 0 ustar ligges users #' Wrapper around unnest_tokens for n-grams
#'
#' These functions are wrappers around `unnest_tokens( token = "ngrams" )`
#' and `unnest_tokens( token = "skip_ngrams" )` .
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams tokenizers::tokenize_ngrams
#' @inheritParams tokenizers::tokenize_skip_ngrams
#' @inheritParams unnest_tokens
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @rdname unnest_ngrams
#' @importFrom dplyr enquo
#'
#' @examples
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#'
#' d %>%
#' unnest_ngrams(word, txt, n = 2)
#'
#' d %>%
#' unnest_skip_ngrams(word, txt, n = 3, k = 1)
#'
unnest_ngrams <- function(
tbl,
output,
input,
n = 3L,
n_min = n,
ngram_delim = " ",
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
token = "ngrams",
n = n,
n_min = n_min,
ngram_delim = ngram_delim,
...
)
}
#' @export
#' @rdname unnest_ngrams
#' @importFrom dplyr enquo
unnest_skip_ngrams <- function(
tbl,
output,
input,
n_min = 1,
n = 3,
k = 1,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
token = "skip_ngrams",
n = n,
n_min = n_min,
k = k,
...
)
}
tidytext/R/compat_lazyeval.R 0000644 0001762 0000144 00000004366 15040537226 015663 0 ustar ligges users # nocov start - compat-lazyeval (last updated: rlang 0.3.0)
# This file serves as a reference for compatibility functions for lazyeval.
# Please find the most recent version in rlang's repository.
warn_underscored <- function() {
return(NULL)
warn(
paste(
"The underscored versions are deprecated in favour of",
"tidy evaluation idioms. Please see the documentation",
"for `quo()` in rlang"
)
)
}
warn_text_se <- function() {
return(NULL)
warn("Text parsing is deprecated, please supply an expression or formula")
}
compat_lazy <- function(lazy, env = caller_env(), warn = TRUE) {
if (warn) {
warn_underscored()
}
if (missing(lazy)) {
return(quo())
}
if (is_quosure(lazy)) {
return(lazy)
}
if (is_formula(lazy)) {
return(as_quosure(lazy, env))
}
out <- switch(
typeof(lazy),
symbol = ,
language = new_quosure(lazy, env),
character = {
if (warn) {
warn_text_se()
}
parse_quo(lazy[[1]], env)
},
logical = ,
integer = ,
double = {
if (length(lazy) > 1) {
warn("Truncating vector to length 1")
lazy <- lazy[[1]]
}
new_quosure(lazy, env)
},
list = if (inherits(lazy, "lazy")) {
lazy = new_quosure(lazy$expr, lazy$env)
}
)
if (is_null(out)) {
abort(sprintf("Can't convert a %s to a quosure", typeof(lazy)))
} else {
out
}
}
compat_lazy_dots <- function(dots, env, ..., .named = FALSE) {
if (missing(dots)) {
dots <- list()
}
if (inherits(dots, c("lazy", "formula"))) {
dots <- list(dots)
} else {
dots <- unclass(dots)
}
dots <- c(dots, list(...))
warn <- TRUE
for (i in seq_along(dots)) {
dots[[i]] <- compat_lazy(dots[[i]], env, warn)
warn <- FALSE
}
named <- have_name(dots)
if (.named && any(!named)) {
nms <- vapply(
dots[!named],
function(x) expr_text(get_expr(x)),
character(1)
)
names(dots)[!named] <- nms
}
names(dots) <- names2(dots)
dots
}
compat_as_lazy <- function(quo) {
structure(
class = "lazy",
list(
expr = get_expr(quo),
env = get_env(quo)
)
)
}
compat_as_lazy_dots <- function(...) {
structure(class = "lazy_dots", lapply(quos(...), compat_as_lazy))
}
# nocov end
tidytext/R/sentiments.R 0000644 0001762 0000144 00000004612 15040537071 014652 0 ustar ligges users #' Sentiment lexicon from Bing Liu and collaborators
#'
#' Lexicon for opinion and sentiment analysis in a tidy data frame. This
#' dataset is included in this package with permission of the creators, and
#' may be used in research, commercial, etc. contexts with attribution, using
#' either the paper or URL below.
#'
#' This lexicon was first published in:
#'
#' Minqing Hu and Bing Liu, ``Mining and summarizing customer reviews.'',
#' Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
#' & Data Mining (KDD-2004), Seattle, Washington, USA, Aug 22-25, 2004.
#'
#' Words with non-ASCII characters were removed.
#'
#' @format A data frame with 6,786 rows and 2 variables:
#' \describe{
#' \item{word}{An English word}
#' \item{sentiment}{A sentiment for that word, either positive or negative.}
#' }
#'
#'
#' @source
"sentiments"
#' Get a tidy data frame of a single sentiment lexicon
#'
#' Get specific sentiment lexicons in a tidy format, with one row per word,
#' in a form that can be joined with a one-word-per-row dataset.
#' The `"bing"` option comes from the included [sentiments()]
#' data frame, and others call the relevant function in the \pkg{textdata}
#' package.
#'
#' @param lexicon The sentiment lexicon to retrieve;
#' either "afinn", "bing", "nrc", or "loughran"
#'
#' @return A tbl_df with a `word` column, and either a `sentiment`
#' column (if `lexicon` is not "afinn") or a numeric `value` column
#' (if `lexicon` is "afinn").
#'
#' @examples
#'
#' library(dplyr)
#'
#' get_sentiments("bing")
#'
#' \dontrun{
#' get_sentiments("afinn")
#' get_sentiments("nrc")
#' }
#'
#' @export
get_sentiments <- function(lexicon = c("bing", "afinn", "loughran", "nrc")) {
lexicon <- match.arg(lexicon)
lexicon_names <- list(
afinn = "AFINN",
loughran = "Loughran-McDonald",
nrc = "NRC word-emotion association"
)
if (lexicon != "bing" && !requireNamespace("textdata", quietly = TRUE)) {
msg <- "The textdata package is required to download the {lexicon_names[[lexicon]]} lexicon.\nInstall the textdata package to access this dataset."
stop(stringr::str_glue(msg), call. = FALSE)
}
switch(
lexicon,
afinn = textdata::lexicon_afinn(),
nrc = textdata::lexicon_nrc(),
loughran = textdata::lexicon_loughran(),
bing = tidytext::sentiments,
stop("Unexpected lexicon", call. = FALSE)
)
}
tidytext/R/lda_tidiers.R 0000644 0001762 0000144 00000012451 15040537071 014744 0 ustar ligges users #' Tidiers for LDA and CTM objects from the topicmodels package
#'
#' Tidy the results of a Latent Dirichlet Allocation or Correlated Topic Model.
#'
#' @param x An LDA or CTM (or LDA_VEM/CTA_VEM) object from the topicmodels package
#' @param matrix Whether to tidy the beta (per-term-per-topic, default)
#' or gamma (per-document-per-topic) matrix
#' @param data For `augment`, the data given to the LDA or CTM function, either
#' as a DocumentTermMatrix or as a tidied table with "document" and "term"
#' columns
#' @param log Whether beta/gamma should be on a log scale, default FALSE
#' @param ... Extra arguments, not used
#'
#' @return `tidy` returns a tidied version of either the beta or gamma matrix.
#'
#' If `matrix == "beta"` (default), returns a table with one row per topic and term,
#' with columns
#' \describe{
#' \item{topic}{Topic, as an integer}
#' \item{term}{Term}
#' \item{beta}{Probability of a term generated from a topic according to
#' the multinomial model}
#' }
#'
#' If `matrix == "gamma"`, returns a table with one row per topic and document,
#' with columns
#' \describe{
#' \item{topic}{Topic, as an integer}
#' \item{document}{Document name or ID}
#' \item{gamma}{Probability of topic given document}
#' }
#'
#' @examples
#'
#' if (requireNamespace("topicmodels", quietly = TRUE)) {
#' set.seed(2016)
#' library(dplyr)
#' library(topicmodels)
#'
#' data("AssociatedPress", package = "topicmodels")
#' ap <- AssociatedPress[1:100, ]
#' lda <- LDA(ap, control = list(alpha = 0.1), k = 4)
#'
#' # get term distribution within each topic
#' td_lda <- tidy(lda)
#' td_lda
#'
#' library(ggplot2)
#'
#' # visualize the top terms within each topic
#' td_lda_filtered <- td_lda %>%
#' filter(beta > .004) %>%
#' mutate(term = reorder(term, beta))
#'
#' ggplot(td_lda_filtered, aes(term, beta)) +
#' geom_bar(stat = "identity") +
#' facet_wrap(~ topic, scales = "free") +
#' theme(axis.text.x = element_text(angle = 90, size = 15))
#'
#' # get classification of each document
#' td_lda_docs <- tidy(lda, matrix = "gamma")
#' td_lda_docs
#'
#' doc_classes <- td_lda_docs %>%
#' group_by(document) %>%
#' top_n(1) %>%
#' ungroup()
#'
#' doc_classes
#'
#' # which were we most uncertain about?
#' doc_classes %>%
#' arrange(gamma)
#' }
#'
#' @name lda_tidiers
#'
#' @export
tidy.LDA <- function(x, matrix = c("beta", "gamma"), log = FALSE, ...) {
tidy_topicmodels(x = x, matrix = matrix, log = log, ...)
}
#' @name lda_tidiers
#'
#' @export
tidy.CTM <- function(x, matrix = c("beta", "gamma"), log = FALSE, ...) {
tidy_topicmodels(x = x, matrix = matrix, log = log, ...)
}
tidy_topicmodels <- function(x, matrix = c("beta", "gamma"), log = FALSE, ...) {
matrix <- match.arg(matrix)
if (matrix == "gamma") {
mat <- x@gamma
} else {
mat <- x@beta
}
ret <- reshape2::melt(mat) %>%
tibble::as_tibble()
if (matrix == "beta") {
ret <- transmute(ret, topic = Var1, term = x@terms[Var2], beta = value)
} else {
ret <- transmute(ret, document = Var1, topic = Var2, gamma = value)
if (!is.null(x@documents)) {
ret$document <- x@documents[ret$document]
}
}
if (matrix == "beta" && !log) {
ret[[matrix]] <- exp(ret[[matrix]])
} else if (matrix == "gamma" && log) {
ret[[matrix]] <- log(ret[[matrix]])
}
ret
}
#' @rdname lda_tidiers
#'
#' @return `augment` returns a table with one row per original
#' document-term pair, such as is returned by [tdm_tidiers]:
#' \describe{
#' \item{document}{Name of document (if present), or index}
#' \item{term}{Term}
#' \item{.topic}{Topic assignment}
#' }
#'
#' If the `data` argument is provided, any columns in the original
#' data are included, combined based on the `document` and `term`
#' columns.
#'
#' @importFrom generics augment
#'
#' @export
augment.LDA <- function(x, data, ...) {
augment_topicmodels(x, data, ...)
}
#' @name lda_tidiers
#'
#' @export
augment.CTM <- function(x, data, ...) {
augment_topicmodels(x, data, ...)
}
augment_topicmodels <- function(x, data, ...) {
word_assignments <- x@wordassignments
rownames(word_assignments) <- x@documents
colnames(word_assignments) <- x@terms
ret <- tidy.simple_triplet_matrix(word_assignments)
colnames(ret) <- c("document", "term", ".topic")
if (!missing(data)) {
if (inherits(data, "simple_triplet_matrix")) {
data <- tidy(data)
} else if (
!inherits(data, "data.frame") &&
!(all(c("document", "term") %in% colnames(data)))
) {
stop(
"data argument must either be a simple_triplet_matrix (such as ",
"a DocumentTermMatrix) or a table with document and term columns"
)
}
ret <- left_join(data, ret, by = c("document", "term"))
}
ret
}
#' @rdname lda_tidiers
#'
#' @return `glance` always returns a one-row table, with columns
#' \describe{
#' \item{iter}{Number of iterations used}
#' \item{terms}{Number of terms in the model}
#' \item{alpha}{If an LDA_VEM, the parameter of the Dirichlet distribution
#' for topics over documents}
#' }
#'
#' @export
glance.LDA <- function(x, ...) {
ret <- tibble(iter = x@iter, terms = x@n)
if (!is.null(x@alpha)) {
ret$alpha <- x@alpha
}
ret
}
#' @name lda_tidiers
#'
#' @export
glance.CTM <- function(x, ...) {
tibble(iter = x@iter, terms = x@n)
}
#' @export
generics::augment
tidytext/R/stop_words.R 0000644 0001762 0000144 00000003627 14324024522 014665 0 ustar ligges users #' Various lexicons for English stop words
#'
#' English stop words from three lexicons, as a data frame.
#' The snowball and SMART sets are pulled from the tm package. Note
#' that words with non-ASCII characters have been removed.
#'
#' @format A data frame with 1149 rows and 2 variables:
#' \describe{
#' \item{word}{An English word}
#' \item{lexicon}{The source of the stop word. Either "onix", "SMART", or "snowball"}
#' }
#'
#' @source \itemize{
#' \item
#' \item
#' \item
#' }
"stop_words"
#' Get a tidy data frame of a single stopword lexicon
#'
#' Get a specific stop word lexicon via the \pkg{stopwords} package's
#' [stopwords][stopwords::stopwords] function, in a tidy format with one word per row.
#'
#' @param language The language of the stopword lexicon specified as a
#' two-letter ISO code, such as `"es"`, `"de"`, or `"fr"`.
#' Default is `"en"` for English. Use
#' [stopwords_getlanguages][stopwords::stopwords_getlanguages] from \pkg{stopwords} to see available
#' languages.
#' @param source The source of the stopword lexicon specified. Default is
#' `"snowball"`. Use [stopwords_getsources][stopwords::stopwords_getsources] from
#' \pkg{stopwords} to see available sources.
#'
#' @return A tibble with two columns, `word` and `lexicon`. The
#' parameter `lexicon` is "quanteda" in this case.
#'
#' @examplesIf rlang::is_installed("stopwords")
#'
#' library(dplyr)
#' get_stopwords()
#' get_stopwords(source = "smart")
#' get_stopwords("es", "snowball")
#' get_stopwords("ru", "snowball")
#'
#' @export
#'
get_stopwords <- function(language = "en", source = "snowball") {
rlang::check_installed("stopwords", "to use this function.")
tibble(
word = stopwords::stopwords(language = language, source = source),
lexicon = source
)
}
tidytext/R/unnest_ptb.R 0000644 0001762 0000144 00000001602 15040537071 014636 0 ustar ligges users #' Wrapper around unnest_tokens for Penn Treebank Tokenizer
#'
#' This function is a wrapper around `unnest_tokens( token = "ptb" )`.
#'
#' @seealso
#' + [unnest_tokens()]
#'
#' @inheritParams unnest_tokens
#' @inheritParams tokenizers::tokenize_ptb
#'
#' @param ... Extra arguments passed on to [tokenizers][tokenizers::tokenizers]
#'
#' @export
#' @importFrom dplyr enquo
#'
#' @examples
#' library(dplyr)
#' library(janeaustenr)
#'
#' d <- tibble(txt = prideprejudice)
#'
#' d %>%
#' unnest_ptb(word, txt)
#'
unnest_ptb <- function(
tbl,
output,
input,
format = c("text", "man", "latex", "html", "xml"),
to_lower = TRUE,
drop = TRUE,
collapse = NULL,
...
) {
format <- arg_match(format)
unnest_tokens(
tbl,
!!enquo(output),
!!enquo(input),
format = format,
to_lower = to_lower,
drop = drop,
collapse = collapse,
token = "ptb",
...
)
}
tidytext/R/sparse_casters.R 0000644 0001762 0000144 00000010305 15040537071 015476 0 ustar ligges users #' Create a sparse matrix from row names, column names, and values
#' in a table.
#'
#' This function supports non-standard evaluation through the tidyeval framework.
#'
#' @param data A tbl
#' @param row Column name to use as row names in sparse matrix, as string or symbol
#' @param column Column name to use as column names in sparse matrix, as string or symbol
#' @param value Column name to use as sparse matrix values (default 1) as string or symbol
#' @param ... Extra arguments to pass on to [Matrix::sparseMatrix()]
#'
#' @return A sparse Matrix object, with one row for each unique value in
#' the `row` column, one column for each unique value in the `column`
#' column, and with as many non-zero values as there are rows in `data`.
#'
#' @details Note that cast_sparse ignores groups in a grouped tbl_df. The arguments
#' `row`, `column`, and `value` are passed by expression and support
#' [quasiquotation][rlang::quasiquotation]; you can unquote strings and symbols.
#'
#' @examples
#'
#' dat <- data.frame(a = c("row1", "row1", "row2", "row2", "row2"),
#' b = c("col1", "col2", "col1", "col3", "col4"),
#' val = 1:5)
#'
#' cast_sparse(dat, a, b)
#'
#' cast_sparse(dat, a, b, val)
#'
#' @import Matrix
#' @export
cast_sparse <- function(data, row, column, value, ...) {
row_col <- quo_name(enquo(row))
column_col <- quo_name(enquo(column))
value_col <- enquo(value)
if (quo_is_missing(value_col)) {
value_col <- 1
}
data <- ungroup(data)
data <- distinct(data, !!sym(row_col), !!sym(column_col), .keep_all = TRUE)
row_names <- data[[row_col]]
col_names <- data[[column_col]]
if (is.numeric(value_col)) {
values <- value_col
} else {
value_col <- quo_name(value_col)
values <- data[[value_col]]
}
# if it's a factor, preserve ordering
if (is.factor(row_names)) {
row_u <- levels(row_names)
i <- as.integer(row_names)
} else {
row_u <- unique(row_names)
i <- match(row_names, row_u)
}
if (is.factor(col_names)) {
col_u <- levels(col_names)
j <- as.integer(col_names)
} else {
col_u <- unique(col_names)
j <- match(col_names, col_u)
}
ret <- Matrix::sparseMatrix(
i = i,
j = j,
x = values,
dimnames = list(row_u, col_u),
...
)
ret
}
#' Casting a data frame to
#' a DocumentTermMatrix, TermDocumentMatrix, or dfm
#'
#' This turns a "tidy" one-term-per-document-per-row data frame into a
#' DocumentTermMatrix or TermDocumentMatrix from the tm package, or a
#' dfm from the quanteda package. These functions support non-standard
#' evaluation through the tidyeval framework. Groups are ignored.
#'
#' @param data Table with one-term-per-document-per-row
#' @param term Column containing terms as string or symbol
#' @param document Column containing document IDs as string or symbol
#' @param value Column containing values as string or symbol
#' @param weighting The weighting function for the DTM/TDM
#' (default is term-frequency, effectively unweighted)
#' @param ... Extra arguments passed on to
#' [Matrix::sparseMatrix()]
#'
#' @details The arguments `term`, `document`, and `value`
#' are passed by expression and support [quasiquotation][rlang::quasiquotation];
#' you can unquote strings and symbols.
#'
#' @rdname document_term_casters
#' @export
cast_tdm <- function(
data,
term,
document,
value,
weighting = tm::weightTf,
...
) {
term <- quo_name(enquo(term))
document <- quo_name(enquo(document))
value <- quo_name(enquo(value))
m <- cast_sparse(data, !!term, !!document, !!value, ...)
tm::as.TermDocumentMatrix(m, weighting = weighting)
}
#' @rdname document_term_casters
#' @export
cast_dtm <- function(
data,
document,
term,
value,
weighting = tm::weightTf,
...
) {
document <- quo_name(enquo(document))
term <- quo_name(enquo(term))
value <- quo_name(enquo(value))
m <- cast_sparse(data, !!document, !!term, !!value, ...)
tm::as.DocumentTermMatrix(m, weighting = weighting)
}
#' @rdname document_term_casters
#' @export
cast_dfm <- function(data, document, term, value, ...) {
document <- quo_name(enquo(document))
term <- quo_name(enquo(term))
value <- quo_name(enquo(value))
m <- cast_sparse(data, !!document, !!term, !!value, ...)
quanteda::as.dfm(m)
}
tidytext/R/reorder_within.R 0000644 0001762 0000144 00000006436 15040537071 015513 0 ustar ligges users #' Reorder an x or y axis within facets
#'
#' Reorder a column before plotting with faceting, such that the values are
#' ordered within each facet. This requires two functions: `reorder_within`
#' applied to the column, then either `scale_x_reordered` or
#' `scale_y_reordered` added to the plot.
#' This is implemented as a bit of a hack: it appends ___ and then the facet
#' at the end of each string.
#'
#' @param x Vector to reorder.
#' @param by Vector of the same length, to use for reordering.
#' @param within Vector or list of vectors of the same length that will later
#' be used for faceting. A list of vectors will be used to facet within multiple
#' variables.
#' @param fun Function to perform within each subset to determine the resulting
#' ordering. By default, mean.
#' @param labels Function to transform the labels of
#' [ggplot2::scale_x_discrete()], by default `reorder_func`.
#' @param sep Separator to distinguish `by` and `within`. You may want to set this
#' manually if ___ can exist within one of your labels.
#' @param ... In `reorder_within` arguments passed on to
#' [reorder()]. In the scale functions, extra arguments passed on to
#' [ggplot2::scale_x_discrete()] or [ggplot2::scale_y_discrete()].
#'
#' @source "Ordering categories within ggplot2 Facets" by Tyler Rinker:
#'
#'
#' @examplesIf rlang::is_installed(c("stopwords", "tidyr"))
#'
#' library(tidyr)
#' library(ggplot2)
#'
#' iris_gathered <- gather(iris, metric, value, -Species)
#'
#' # reordering doesn't work within each facet (see Sepal.Width):
#' ggplot(iris_gathered, aes(reorder(Species, value), value)) +
#' geom_boxplot() +
#' facet_wrap(~ metric)
#'
#' # reorder_within and scale_x_reordered work.
#' # (Note that you need to set scales = "free_x" in the facet)
#' ggplot(iris_gathered, aes(reorder_within(Species, value, metric), value)) +
#' geom_boxplot() +
#' scale_x_reordered() +
#' facet_wrap(~ metric, scales = "free_x")
#'
#' # to reorder within multiple variables, set within to the list of
#' # facet variables.
#' ggplot(mtcars, aes(reorder_within(carb, mpg, list(vs, am)), mpg)) +
#' geom_boxplot() +
#' scale_x_reordered() +
#' facet_wrap(vs ~ am, scales = "free_x")
#'
#' @importFrom lifecycle deprecated
#' @export
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
if (!is.list(within)) {
within <- list(within)
}
new_x <- do.call(paste, c(list(x, sep = sep), within))
stats::reorder(new_x, by, FUN = fun)
}
#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., labels = reorder_func, sep = deprecated()) {
if (lifecycle::is_present(sep)) {
lifecycle::deprecate_warn(
"0.3.3",
"scale_x_reordered(sep = )",
"reorder_func(sep = )"
)
}
ggplot2::scale_x_discrete(labels = labels, ...)
}
#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., labels = reorder_func, sep = deprecated()) {
if (lifecycle::is_present(sep)) {
lifecycle::deprecate_warn(
"0.3.3",
"scale_y_reordered(sep = )",
"reorder_func(sep = )"
)
}
ggplot2::scale_y_discrete(labels = labels, ...)
}
#' @rdname reorder_within
#' @export
reorder_func <- function(x, sep = "___") {
reg <- paste0(sep, ".+$")
gsub(reg, "", x)
}
tidytext/R/dictionary_tidiers.R 0000644 0001762 0000144 00000001114 15040537071 016343 0 ustar ligges users #' Tidy dictionary objects from the quanteda package
#'
#' @importFrom generics tidy
#'
#' @param x A dictionary object
#' @param regex Whether to turn dictionary items from a glob to a regex
#' @param ... Extra arguments, not used
#'
#' @return A data frame with two columns: category and word.
#'
#' @name dictionary_tidiers
#'
#' @export
tidy.dictionary2 <- function(x, regex = FALSE, ...) {
ret <- purrr::map_df(x, function(e) tibble(word = e), .id = "category") %>%
mutate(category = as.character(category))
if (regex) {
ret$word <- utils::glob2rx(ret$word)
}
ret
}
tidytext/R/globals.R 0000644 0001762 0000144 00000000530 15040537071 014077 0 ustar ligges users globalVariables(
c(
"X1",
"X2",
"X3",
"word",
"code",
"category",
"texts",
"i",
"j",
"x",
"Var1",
"Var2",
"value",
"out",
"topic",
".ndocs",
".nterm",
".document_total",
"new_groups",
"tf",
"idf",
"sentiments",
"document",
"term",
"L1"
)
)
tidytext/vignettes/ 0000755 0001762 0000144 00000000000 15040555021 014135 5 ustar ligges users tidytext/vignettes/tf_idf.Rmd 0000644 0001762 0000144 00000013142 14627353210 016043 0 ustar ligges users ---
title: "Term Frequency and Inverse Document Frequency (tf-idf) Using Tidy Data Principles"
author: "Julia Silge and David Robinson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Tidy Term Frequency and Inverse Document Frequency (tf-idf)}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r}
#| label = "setup",
#| echo = FALSE
library(knitr)
opts_chunk$set(
warning = FALSE, message = FALSE,
eval = requireNamespace("ggplot2", quietly = TRUE)
)
```
```{r}
#| echo = FALSE
library(ggplot2)
theme_set(theme_light())
```
A central question in text mining and natural language processing is how to quantify what a document is about. Can we do this by looking at the words that make up the document? One measure of how important a word may be is its *term frequency* (tf), how frequently a word occurs in a document. There are words in a document, however, that occur many times but may not be important; in English, these are probably words like "the", "is", "of", and so forth. We might take the approach of adding words like these to a list of stop words and removing them before analysis, but it is possible that some of these words might be more important in some documents than others. A list of stop words is not a sophisticated approach to adjusting term frequency for commonly used words.
Another approach is to look at a term's *inverse document frequency* (idf), which decreases the weight for commonly used words and increases the weight for words that are not used very much in a collection of documents. This can be combined with term frequency to calculate a term's *tf-idf*, the frequency of a term adjusted for how rarely it is used. It is intended to measure how important a word is to a document in a collection (or corpus) of documents. It is a rule-of-thumb or heuristic quantity; while it has proved useful in text mining, search engines, etc., its theoretical foundations are considered less than firm by information theory experts. The inverse document frequency for any given term is defined as
$$idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}$$
We can use tidy data principles, as described in [the main vignette](tidytext.html), to approach tf-idf analysis and use consistent, effective tools to quantify how important various terms are in a document that is part of a collection.
Let's look at the published novels of Jane Austen and examine first term frequency, then tf-idf. We can start just by using dplyr verbs such as `group_by` and `join`. What are the most commonly used words in Jane Austen's novels? (Let's also calculate the total words in each novel here, for later use.)
```{r}
library(dplyr)
library(janeaustenr)
library(tidytext)
book_words <- austen_books() %>%
unnest_tokens(word, text) %>%
count(book, word, sort = TRUE)
total_words <- book_words %>% group_by(book) %>% summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
book_words
```
The usual suspects are here, "the", "and", "to", and so forth. Let's look at the distribution of `n/total` for each novel, the number of times a word appears in a novel divided by the total number of terms (words) in that novel. This is exactly what term frequency is.
```{r}
#| fig.height = 7,
#| fig.width = 7,
#| fig.alt = "Histograms for word counts in Jane Austen's novels"
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
scale_x_continuous(limits = c(NA, 0.0009)) +
facet_wrap(vars(book), ncol = 2, scales = "free_y")
```
There are very long tails to the right for these novels (those extremely common words!) that we have not shown in these plots. These plots exhibit similar distributions for all the novels, with many words that occur rarely and fewer words that occur frequently. The idea of tf-idf is to find the important words for the content of each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection or corpus of documents, in this case, the group of Jane Austen's novels as a whole. Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not *too* common. Let's do that now.
```{r}
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
```
Notice that idf and thus tf-idf are zero for these extremely common words. These are all words that appear in all six of Jane Austen's novels, so the idf term (which will then be the natural log of 1) is zero. The inverse document frequency (and thus tf-idf) is very low (near zero) for words that occur in many of the documents in a collection; this is how this approach decreases the weight for common words. The inverse document frequency will be a higher number for words that occur in fewer of the documents in the collection. Let's look at terms with high tf-idf in Jane Austen's works.
```{r}
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
```
Here we see all proper nouns, names that are in fact important in these novels. None of them occur in all of novels, and they are important, characteristic words for each text. Some of the values for idf are the same for different terms because there are 6 documents in this corpus and we are seeing the numerical value for $\ln(6/1)$, $\ln(6/2)$, etc. Let's look specifically at *Pride and Prejudice*.
```{r}
book_words %>%
filter(book == "Pride & Prejudice") %>%
select(-total) %>%
arrange(desc(tf_idf))
```
These words are, as measured by tf-idf, the most important to *Pride and Prejudice* and most readers would likely agree.
tidytext/vignettes/tidying_casting.Rmd 0000644 0001762 0000144 00000014521 14625411010 017760 0 ustar ligges users ---
title: "Converting to and from Document-Term Matrix and Corpus objects"
author: "Julia Silge and David Robinson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Converting to and from Document-Term Matrix and Corpus objects}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r}
#| echo = FALSE
knitr::opts_chunk$set(
message = FALSE, warning = FALSE,
eval = requireNamespace("tm", quietly = TRUE) && requireNamespace("quanteda", quietly = TRUE) && requireNamespace("topicmodels", quietly = TRUE) && requireNamespace("ggplot2", quietly = TRUE)
)
```
```{r}
#| echo = FALSE
library(ggplot2)
theme_set(theme_bw())
```
### Tidying document-term matrices
Many existing text mining datasets are in the form of a `DocumentTermMatrix` class (from the tm package). For example, consider the corpus of 2246 Associated Press articles from the topicmodels package:
```{r}
library(tm)
data("AssociatedPress", package = "topicmodels")
AssociatedPress
```
If we want to analyze this with tidy tools, we need to turn it into a one-term-per-document-per-row data frame first. The `tidy` function does this. (For more on the tidy verb, [see the broom package](https://github.com/dgrtwo/broom)).
```{r}
library(dplyr)
library(tidytext)
ap_td <- tidy(AssociatedPress)
```
Just as shown in [this vignette](tidytext.html), having the text in this format is convenient for analysis with the tidytext package. For example, you can perform sentiment analysis on these newspaper articles.
```{r}
ap_sentiments <- ap_td %>%
inner_join(get_sentiments("bing"), join_by(term == word))
ap_sentiments
```
We can find the most negative documents:
```{r}
library(tidyr)
ap_sentiments %>%
count(document, sentiment, wt = count) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative) %>%
arrange(sentiment)
```
Or visualize which words contributed to positive and negative sentiment:
```{r}
#| fig.width = 7,
#| fig.height = 4,
#| fig.alt = 'Bar charts for the contribution of words to sentiment scores. The words "like" and "work" contribute the most to positive sentiment, and the words "killed" and "death" contribute the most to negative sentiment'
library(ggplot2)
ap_sentiments %>%
count(sentiment, term, wt = count) %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
mutate(term = reorder(term, n)) %>%
ggplot(aes(n, term, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(sentiment), scales = "free_y") +
labs(x = "Contribution to sentiment", y = NULL)
```
Note that a tidier is also available for the `dfm` class from the quanteda package:
```{r}
library(methods)
data("data_corpus_inaugural", package = "quanteda")
d <- quanteda::tokens(data_corpus_inaugural) %>%
quanteda::dfm()
d
tidy(d)
```
### Casting tidy text data into a DocumentTermMatrix
Some existing text mining tools or algorithms work only on sparse document-term matrices. Therefore, tidytext provides `cast_` verbs for converting from a tidy form to these matrices.
```{r}
ap_td
# cast into a Document-Term Matrix
ap_td %>%
cast_dtm(document, term, count)
# cast into a Term-Document Matrix
ap_td %>%
cast_tdm(term, document, count)
# cast into quanteda's dfm
ap_td %>%
cast_dfm(term, document, count)
# cast into a Matrix object
m <- ap_td %>%
cast_sparse(document, term, count)
class(m)
dim(m)
```
This allows for easy reading, filtering, and processing to be done using dplyr and other tidy tools, after which the data can be converted into a document-term matrix for machine learning applications.
### Tidying corpus data
You can also tidy Corpus objects from the tm package. For example, consider a Corpus containing 20 documents, one for each
```{r}
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578),
readerControl = list(reader = readReut21578XMLasPlain))
reuters
```
The `tidy` verb creates a table with one row per document:
```{r}
reuters_td <- tidy(reuters)
reuters_td
```
Similarly, you can `tidy` a `corpus` object from the quanteda package:
```{r}
library(quanteda)
data("data_corpus_inaugural")
data_corpus_inaugural
inaug_td <- tidy(data_corpus_inaugural)
inaug_td
```
This lets us work with tidy tools like `unnest_tokens` to analyze the text alongside the metadata.
```{r}
inaug_words <- inaug_td %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
inaug_words
```
We could then, for example, see how the appearance of a word changes over time:
```{r}
inaug_freq <- inaug_words %>%
count(Year, word) %>%
complete(Year, word, fill = list(n = 0)) %>%
group_by(Year) %>%
mutate(year_total = sum(n), percent = n / year_total) %>%
ungroup()
inaug_freq
```
For example, we can use the broom package to perform logistic regression on each word.
```{r}
library(broom)
models <- inaug_freq %>%
group_by(word) %>%
filter(sum(n) > 50) %>%
group_modify(
~ tidy(glm(cbind(n, year_total - n) ~ Year, ., family = "binomial"))
) %>%
ungroup() %>%
filter(term == "Year")
models
models %>%
filter(term == "Year") %>%
arrange(desc(abs(estimate)))
```
You can show these models as a volcano plot, which compares the effect size with the significance:
```{r}
#| fig.width = 7,
#| fig.height = 5,
#| fig.alt = 'Volcano plot showing that words like "america" and "world" have increased over time with small p-values, while words like "public" and "institution" have decreased'
library(ggplot2)
models %>%
mutate(adjusted.p.value = p.adjust(p.value)) %>%
ggplot(aes(estimate, adjusted.p.value)) +
geom_point() +
scale_y_log10() +
geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
labs(x = "Estimated change over time", y = "Adjusted p-value")
```
We can also use the ggplot2 package to display the top 6 terms that have changed in frequency over time.
```{r}
#| fig.width = 7,
#| fig.height = 6,
#| fig.alt = 'Scatterplot with LOESS smoothing lines showing that the words "america", "americans", "century", "children", "democracy", and "god" have increased over time'
library(scales)
models %>%
slice_max(abs(estimate), n = 6) %>%
inner_join(inaug_freq) %>%
ggplot(aes(Year, percent)) +
geom_point() +
geom_smooth() +
facet_wrap(vars(word)) +
scale_y_continuous(labels = percent_format()) +
labs(y = "Frequency of word in speech")
```
tidytext/vignettes/tidytext.Rmd 0000644 0001762 0000144 00000025203 14475733203 016474 0 ustar ligges users ---
title: "Introduction to tidytext"
author: "Julia Silge and David Robinson"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Introduction to tidytext}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r}
#| label = "setup",
#| echo = FALSE
library(knitr)
opts_chunk$set(
warning = FALSE, message = FALSE,
eval = requireNamespace("wordcloud", quietly = TRUE) && requireNamespace("ggplot2", quietly = TRUE)
)
```
```{r}
#| echo = FALSE
library(ggplot2)
theme_set(theme_light())
```
### The Life-Changing Magic of Tidying Text
Using [tidy data principles](https://doi.org/10.18637/jss.v059.i10) can make many text mining tasks easier, more effective, and consistent with tools already in wide use. Much of the infrastructure needed for text mining with tidy data frames already exists in packages like [dplyr](https://cran.r-project.org/package=dplyr), [broom](https://cran.r-project.org/package=broom), [tidyr](https://cran.r-project.org/package=tidyr) and [ggplot2](https://cran.r-project.org/package=ggplot2). In this package, we provide functions and supporting data sets to allow conversion of text to and from tidy formats, and to switch seamlessly between tidy tools and existing text mining packages. Check out [our book](https://www.tidytextmining.com/) to learn more about text mining using tidy data principles.
### A few first tidy text mining examples
The novels of Jane Austen can be so tidy! Let's use the text of Jane Austen's 6 completed, published novels from the [janeaustenr](https://cran.r-project.org/package=janeaustenr) package, and transform them into a tidy format. janeaustenr provides them as a one-row-per-line format:
```{r}
library(janeaustenr)
library(dplyr)
library(stringr)
original_books <- austen_books() %>%
group_by(book) %>%
mutate(line = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
original_books
```
To work with this as a tidy dataset, we need to restructure it as **one-token-per-row** format. The `unnest_tokens` function is a way to convert a dataframe with a text column to be one-token-per-row. Here let's tokenize to a new `word` column from the existing `text` column:
```{r}
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(output = word, input = text)
tidy_books
```
This function uses the [tokenizers package](https://github.com/ropensci/tokenizers) to separate each line into words. The default tokenizing is for words, but other options include characters, ngrams, sentences, lines, paragraphs, or separation around a regex pattern.
Now that the data is in one-word-per-row format, we can manipulate it with tidy tools like dplyr. We can remove stop words (accessible in a tidy form with the function `get_stopwords()`) with an `anti_join`.
```{r}
cleaned_books <- tidy_books %>%
anti_join(get_stopwords())
```
We can also use `count` to find the most common words in all the books as a whole.
```{r}
cleaned_books %>%
count(word, sort = TRUE)
```
Sentiment analysis can be done as an inner join. Sentiment lexicons are available via the `get_sentiments()` function. Let's look at the words with a positive score from the lexicon of Bing Liu and collaborators. What are the most common positive words in *Emma*?
```{r}
positive <- get_sentiments("bing") %>%
filter(sentiment == "positive")
tidy_books %>%
filter(book == "Emma") %>%
semi_join(positive) %>%
count(word, sort = TRUE)
```
Or instead we could examine how sentiment changes during each novel. Let's find a sentiment score for each word using the same lexicon, then count the number of positive and negative words in defined sections of each novel.
```{r}
library(tidyr)
bing <- get_sentiments("bing")
janeaustensentiment <- tidy_books %>%
inner_join(bing, relationship = "many-to-many") %>%
count(book, index = line %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
```
Now we can plot these sentiment scores across the plot trajectory of each novel.
```{r}
#| fig.width = 7,
#| fig.height = 7,
#| fig.alt = "Sentiment scores across the trajectories of Jane Austen's six published novels",
#| warning = FALSE
library(ggplot2)
ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
geom_bar(stat = "identity", show.legend = FALSE) +
facet_wrap(vars(book), ncol = 2, scales = "free_x")
```
### Most common positive and negative words
One advantage of having the data frame with both sentiment and word is that we can analyze word counts that contribute to each sentiment.
```{r}
bing_word_counts <- tidy_books %>%
inner_join(bing, relationship = "many-to-many") %>%
count(word, sentiment, sort = TRUE)
bing_word_counts
```
This can be shown visually, and we can pipe straight into ggplot2 because of the way we are consistently using tools built for handling tidy data frames.
```{r}
#| fig.width = 7,
#| fig.height = 4,
#| fig.alt = 'Bar charts for the contribution of words to sentiment scores. The words "well" and "good" contribute the most to positive sentiment, and the word "miss" contributes the most to negative sentiment'
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(vars(sentiment), scales = "free_y") +
labs(x = "Contribution to sentiment", y = NULL)
```
This lets us spot an anomaly in the sentiment analysis; the word "miss" is coded as negative but it is used as a title for young, unmarried women in Jane Austen's works. If it were appropriate for our purposes, we could easily add "miss" to a custom stop-words list using `bind_rows`.
### Wordclouds
We've seen that this tidy text mining approach works well with ggplot2, but having our data in a tidy format is useful for other plots as well.
For example, consider the wordcloud package. Let's look at the most common words in Jane Austen's works as a whole again.
```{r}
#| fig.height = 6,
#| fig.width = 6,
#| fig.alt = 'Wordcloud showing that words like "mr", "miss", and "one" are most common in the works of Jane Austen'
library(wordcloud)
cleaned_books %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
```
In other functions, such as `comparison.cloud`, you may need to turn it into a matrix with reshape2's `acast`. Let's do the sentiment analysis to tag positive and negative words using an inner join, then find the most common positive and negative words. Until the step where we need to send the data to `comparison.cloud`, this can all be done with joins, piping, and dplyr because our data is in tidy format.
```{r}
#| label = "wordcloud",
#| fig.height = 5,
#| fig.width = 5,
#| fig.alt = 'Wordcloud showing that "well" and "good" are the most common positive sentiment words while "miss" is the most common negative sentiment word'
library(reshape2)
tidy_books %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("#F8766D", "#00BFC4"),
max.words = 100)
```
### Looking at units beyond just words
Lots of useful work can be done by tokenizing at the word level, but sometimes it is useful or necessary to look at different units of text. For example, some sentiment analysis algorithms look beyond only unigrams (i.e. single words) to try to understand the sentiment of a sentence as a whole. These algorithms try to understand that
> I am not having a good day.
is a sad sentence, not a happy one, because of negation. The [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) tools and the [sentimentr R package](https://cran.r-project.org/package=sentimentr) are examples of such sentiment analysis algorithms. For these, we may want to tokenize text into sentences.
```{r}
PandP_sentences <- tibble(text = prideprejudice) %>%
unnest_tokens(output = sentence, input = text, token = "sentences")
```
Let's look at just one.
```{r}
PandP_sentences$sentence[2]
```
The sentence tokenizing does seem to have a bit of trouble with UTF-8 encoded text, especially with sections of dialogue; it does much better with punctuation in ASCII.
Another option in `unnest_tokens` is to split into tokens using a regex pattern. We could use this, for example, to split the text of Jane Austen's novels into a data frame by chapter.
```{r}
austen_chapters <- austen_books() %>%
group_by(book) %>%
unnest_tokens(chapter, text, token = "regex", pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
ungroup()
austen_chapters %>%
group_by(book) %>%
summarise(chapters = n())
```
We have recovered the correct number of chapters in each novel (plus an "extra" row for each novel title). In this data frame, each row corresponds to one chapter.
Near the beginning of this vignette, we used a similar regex to find where all the chapters were in Austen's novels for a tidy data frame organized by one-word-per-row. We can use tidy text analysis to ask questions such as what are the most negative chapters in each of Jane Austen's novels? First, let's get the list of negative words from the Bing lexicon. Second, let's make a dataframe of how many words are in each chapter so we can normalize for the length of chapters. Then, let's find the number of negative words in each chapter and divide by the total words in each chapter. Which chapter has the highest proportion of negative words?
```{r}
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter != 0) %>%
slice_max(ratio, n = 1)
```
These are the chapters with the most negative words in each book, normalized for number of words in the chapter. What is happening in these chapters? In Chapter 43 of *Sense and Sensibility* Marianne is seriously ill, near death, and in Chapter 34 of *Pride and Prejudice* Mr. Darcy proposes for the first time (so badly!). Chapter 46 of *Mansfield Park* is almost the end, when everyone learns of Henry's scandalous adultery, Chapter 15 of *Emma* is when horrifying Mr. Elton proposes, and in Chapter 21 of *Northanger Abbey* Catherine is deep in her Gothic faux fantasy of murder, etc. Chapter 4 of *Persuasion* is when the reader gets the full flashback of Anne refusing Captain Wentworth and how sad she was and what a terrible mistake she realized it to be.
tidytext/data/ 0000755 0001762 0000144 00000000000 14040415714 013041 5 ustar ligges users tidytext/data/stop_words.rda 0000644 0001762 0000144 00000006640 14040415714 015742 0 ustar ligges users BZh91AY&SYL &H $@H >w P_>hnx{{GfK=z M A14&ST