# Install non-CRAN packages
# remotes::install_github("macmillancontentscience/wikimorphemes")
# remotes::install_github("macmillancontentscience/wordpiece.data")
library(morphemepiece)
library(wikimorphemes)
library(wordpiece.data)
library(dplyr)
library(ggplot2)
library(purrr)
This vignette shows how to use the processed words from the {wikimorphemes} package to create a morphemepiece vocabulary and lookup table.
To make a morphemepiece vocabulary, start with…
# load functions to make the vocab + lookup
source(here::here("vignettes", "make_vocab_and_lookup.R"))
# Load data from various packages related to this task.
original_lookup <- readRDS(wikimorphemes::download_wikimorphemes_lookup())
# TODO: Add something to reproduce this or host it somewhere.
word_frequency_table <- readRDS(
  fs::path(
    morphemepiece_cache_dir(),
    "word_frequency_table.rds"
  )
)
# Not all wiktionary words are in the wiktionary lookup. Use the full word list
# to add short words back in.
full_lookup <- .add_words_to_lookup(
  original_lookup,
  wikimorphemes::wiktionary_word_list()
)
Before we make a specific vocabulary, we can look at which morphemes are most common over all wiktionary words to get a prioritized list.
# for much of this process, it's more convenient to have the processed words
# unnested, with one morpheme per row. This takes a few minutes.
# Currently, we're considering only words with pure lowercase latin characters.
# We likely will want to also include the simplified version of words with
# accented characters in this list.
unnested_lookup <- .unnest_lookup(full_lookup, clean = TRUE)
# count how many wiktionary words each token appears in.
token_counts <- count_tokens(unnested_lookup)

utils::head(token_counts)
Unsurprisingly, the top token is “##s”, the “s” inflection at the ends of words.
We rank tokens in order of frequency. We can also rank words in order of the maximum rank that their component tokens have. This shows which words would be covered by a token vocabulary including tokens up to some rank.
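For reference, a frequency rank like the one count_tokens() is assumed to attach could be derived from raw counts roughly as follows (a sketch only; the count column name n is an assumption, not part of the package API):
# Sketch: rank tokens from most to least frequent (assumes a count column `n`).
token_ranks <- token_counts %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::mutate(rank = dplyr::row_number())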
# Some words process into "non-clean" tokens (diacrits, etc.).
# Those tokens are excluded in token_counts, so will get an NA here.
# Deal with this better later, but for now, just remove those words.
# (only about 0.01% of words)
# Find the highest-rank (rarest) token within each word.
words_with_max_token_ranks <- dplyr::left_join(
  unnested_lookup,
  token_counts,
  by = "token"
) %>%
  dplyr::group_by(word) %>%
  dplyr::summarize(max_rank = max(rank)) %>%
  dplyr::filter(!is.na(max_rank))
# Count how many total words are covered by tokens up to some rank:
words_vs_tokens <- words_with_max_token_ranks %>%
  dplyr::group_by(max_rank) %>%
  dplyr::summarize(n_words = dplyr::n_distinct(word)) %>%
  dplyr::arrange(max_rank) %>%
  dplyr::mutate(n_words = cumsum(n_words))
There are 194,347 distinct tokens and 643,745 distinct words.
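Those totals can be read directly off the intermediate tables built above (a quick sketch):
# Distinct clean tokens, and distinct words with a computed max_rank (sketch).
dplyr::n_distinct(token_counts$token)
dplyr::n_distinct(words_with_max_token_ranks$word)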
# plot!
words_vs_tokens %>%
  dplyr::mutate(frac_words = n_words/max(n_words)) %>%
  dplyr::mutate(frac_tokens = max_rank/max(max_rank)) %>%
  ggplot2::ggplot(ggplot2::aes(x = frac_tokens, y = frac_words)) +
  ggplot2::geom_point()
Note that only about 5% of the morpheme tokens (out of about 194K) are needed to cover about 50% of the words (out of about 644K).
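That crossover point can also be pulled out of the table rather than read off the plot; a sketch using the same columns:
# Smallest token fraction whose cumulative word coverage reaches 50% (sketch).
words_vs_tokens %>%
  dplyr::mutate(
    frac_words = n_words / max(n_words),
    frac_tokens = max_rank / max(max_rank)
  ) %>%
  dplyr::filter(frac_words >= 0.5) %>%
  dplyr::slice_min(frac_tokens, n = 1)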
The plot above weights each word equally, no matter how frequently that word occurs. To get a sense of word coverage by actual usage, we can weight each word by how often it occurs in some corpus. For example, weighting words by occurrence in the wikitext-103 corpus:
# passing a word frequency table in (columns: word, word_count) applies weights
# to the token counts.
token_counts_weighted <- count_tokens(unnested_lookup, word_frequency_table)

utils::head(token_counts_weighted)
Now the top token is “the”, just because of how common that word is.
Instead of counting how many words are covered by tokens up to some rank, we should count the total weight of words covered. For this, we join onto word_frequency_table again (giving words not found in the corpus a count of one).
words_with_max_token_ranks_weighted <- dplyr::left_join(
  unnested_lookup,
  token_counts_weighted,
  by = "token"
) %>%
  dplyr::group_by(word) %>%
  dplyr::summarize(max_rank = max(rank)) %>%
  dplyr::filter(!is.na(max_rank))
weighted_tokens_and_words <- dplyr::left_join(
  words_with_max_token_ranks_weighted,
  word_frequency_table,
  by = "word"
) %>%
  dplyr::mutate(
    word_count = ifelse(
      test = is.na(word_count),
      yes = 1L,
      no = word_count
    )
  )
words_vs_tokens_weighted <- weighted_tokens_and_words %>%
  dplyr::group_by(max_rank) %>%
  dplyr::summarize(n_words = sum(word_count)) %>%
  dplyr::arrange(max_rank) %>%
  dplyr::mutate(n_words = cumsum(n_words))
# plot!
words_vs_tokens_weighted %>%
  dplyr::mutate(frac_words = n_words/max(n_words)) %>%
  dplyr::mutate(frac_tokens = max_rank/max(max_rank)) %>%
  ggplot2::ggplot(ggplot2::aes(x = frac_tokens, y = frac_words)) +
  ggplot2::geom_point()
The top 5% of tokens would cover over 98% of words by usage.
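Again, that figure can be checked directly from the weighted table (sketch):
# Weighted word coverage reached by the top 5% of tokens (sketch).
words_vs_tokens_weighted %>%
  dplyr::mutate(
    frac_words = n_words / max(n_words),
    frac_tokens = max_rank / max(max_rank)
  ) %>%
  dplyr::filter(frac_tokens <= 0.05) %>%
  dplyr::slice_max(frac_words, n = 1)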
Now we make an actual vocabulary and lookup.
We construct a morphemepiece vocabulary, starting from a wordpiece vocabulary. The constructed vocabulary contains the same “fancy” tokens as wordpiece (punctuation, etc.), but replaces most (non-short, non-proper-noun) words with their morpheme tokens. This process generally results in a core vocabulary with about 16K tokens.
If our target vocabulary size is larger than this, additional tokens are added from the (ranked) list of tokens.
The lookup contains every word in the Wiktionary word list which is covered by (processes into) the tokens in the vocabulary.
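In other words, a word belongs in the lookup exactly when all of its morpheme tokens are in the vocabulary. A minimal illustrative check (this helper is hypothetical, not the package's internal implementation):
# Hypothetical helper: does a word's full morpheme breakdown fall inside a vocab?
is_covered <- function(word, unnested, vocab_tokens) {
  tokens <- unnested$token[unnested$word == word]
  length(tokens) > 0 && all(tokens %in% vocab_tokens)
}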
Here, let’s make two vocabularies: one “small” (minimum to roughly cover the wordpiece vocabulary) and one “large” (comparable to the size of the wordpiece vocabulary).
vandl_small <- make_vocab_and_lookup(
  full_lookup = original_lookup,
  full_vocabulary = wikimorphemes::wiktionary_word_list(),
  wordpiece_vocab = wordpiece.data::wordpiece_vocab(),
  target_vocab_size = 0, # no extra tokens
  word_frequency_table = word_frequency_table
)
vandl_large <- make_vocab_and_lookup(
  full_lookup = original_lookup,
  full_vocabulary = wikimorphemes::wiktionary_word_list(),
  wordpiece_vocab = wordpiece.data::wordpiece_vocab(),
  target_vocab_size = 30000L,
  word_frequency_table = word_frequency_table
)
The vocabulary and lookup aren’t yet in our standardized forms. First, we save them as standardized text files.
# TODO: Make this save and reload stuff unnecessary!
text_lookup_small <- .make_text_lookup(
  voc = vandl_small$vocab,
  lu = vandl_small$lookup,
  word_freq_tab = word_frequency_table
)

writeLines(
  text_lookup_small,
  file.path(morphemepiece_cache_dir(), "mp_lookup_small.txt")
)

# vocab is already just a character vector
writeLines(
  vandl_small$vocab,
  file.path(morphemepiece_cache_dir(), "mp_vocab_small.txt")
)
# now do large
text_lookup_large <- .make_text_lookup(
  voc = vandl_large$vocab,
  lu = vandl_large$lookup,
  word_freq_tab = word_frequency_table
)

readr::write_lines(
  text_lookup_large,
  file.path(morphemepiece_cache_dir(), "mp_lookup_large.txt")
)

# vocab is already just a character vector
readr::write_lines(
  vandl_large$vocab,
  file.path(morphemepiece_cache_dir(), "mp_vocab_large.txt")
)
# Read back from text files to process as standard {morphemepiece} files:
vocab <- load_or_retrieve_vocab(
  file.path(morphemepiece_cache_dir(), "mp_vocab_large.txt")
)
lookup <- load_or_retrieve_lookup(
  file.path(morphemepiece_cache_dir(), "mp_lookup_large.txt")
)
morphemepiece_tokenize("Surprisingly easy", vocab, lookup)
morphemepiece_tokenize("'Twas brillig, and the slithy toves", vocab, lookup)
With these vocabularies, we can look at various measures of coverage. For example, what (weighted) fraction of words from our corpus are covered by the vocab/lookup?
corpus_coverage_small <- dplyr::left_join(
  word_frequency_table,
  vandl_small$lookup,
  by = "word"
) %>%
  dplyr::mutate(covered_lookup = !is.na(tokenization)) %>%
  # not every word in the vocab is in the lookup; check vocab too
  dplyr::mutate(covered_vocab = word %in% vandl_small$vocab) %>%
  dplyr::mutate(covered = covered_lookup | covered_vocab) %>%
  dplyr::mutate(covered_weighted = covered * word_count)

corpus_coverage_small %>%
  dplyr::summarize(sum(covered_weighted) / sum(word_count))
The “small” vocabulary covers 95.1% of words from the corpus, weighted by usage.
corpus_coverage_large <- dplyr::left_join(
  word_frequency_table,
  vandl_large$lookup,
  by = "word"
) %>%
  dplyr::mutate(covered_lookup = !is.na(tokenization)) %>%
  # not every word in the vocab is in the lookup; check vocab too
  dplyr::mutate(covered_vocab = word %in% vandl_large$vocab) %>%
  dplyr::mutate(covered = covered_lookup | covered_vocab) %>%
  dplyr::mutate(covered_weighted = covered * word_count)

corpus_coverage_large %>%
  dplyr::summarize(sum(covered_weighted) / sum(word_count))
The “large” vocabulary covers 96.6% of words by usage, 1.5 percentage points more than the “small” vocabulary.
It is useful to look at the most common words that are not covered by the vocabularies:
# large
uncovered <- corpus_coverage_large %>%
  dplyr::filter(!.data$covered) %>%
  dplyr::arrange(dplyr::desc(.data$word_count)) %>%
  dplyr::select(.data$word, .data$word_count) %>%
  head(100) %>%
  dplyr::mutate(
    tokenization = morphemepiece_tokenize(
      .data$word,
      vocab = vocab,
      lookup = lookup
    )
  ) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(
    tokenization = paste(names(.data$tokenization), collapse = " ")
  ) %>%
  dplyr::ungroup()

head(uncovered, 10)
Categories of uncovered words include some proper nouns and partial contractions (like “didn”).
The word “los” in this list raises an interesting question for future consideration: should a small set of very common words in languages besides English be included? After all, a significant number of tokens in both wordpiece and morphemepiece are currently used for Asian language characters.
Some other checks we will try include:
(These checks have not been re-run yet with the updated data.)
# just for fun :-D
xkcd_words_url <- "https://xkcd.com/simplewriter/words.js"
raw_words <- readr::read_lines(xkcd_words_url)

raw_words <- raw_words[grepl("WORDS", raw_words)]
raw_words <- stringr::str_split(raw_words, '"')[[1]]
raw_words <- raw_words[grepl("\\|", raw_words)]
words <- dplyr::tibble(top_words = stringr::str_split(raw_words, "\\|")[[1]])
# I feel lied to. There are more than 3k words in this list.

words <- words %>%
  dplyr::mutate(
    tokenized = morphemepiece_tokenize(.data$top_words, vocab, lookup)
  ) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(
    tokenized = paste(names(.data$tokenized), collapse = " ")
  ) %>%
  dplyr::ungroup()
# ss_url <- "url of mp_scratch google sheet"
# already authorized
# googlesheets4::write_sheet(words, ss_url)
# manual check, add column "is_ok"
# checked_words <- googlesheets4::read_sheet(ss_url, sheet = "check common words")
# if breakdown is ok, value is "y"
# mean(checked_words$is_ok == "y")
# [1] 0.9711062
# many of the exceptions can/should be fixed in wiktionary
# These have not been checked in a while.
Fewer than 3% of these words (about 100 total) were judged to be problematic. For these common words, it makes sense for us to address the issues directly in Wiktionary where possible.
For the next check, randomly pick some words of at least length 4 without a breakdown:
all_words <- unique(unnested_lookup$word)
unbroken_vocab_words <- intersect(names(vocab), all_words)
unbroken_vocab_words <- unbroken_vocab_words[nchar(unbroken_vocab_words) > 3]

# sample a few hundred
unbroken_sample <- dplyr::tibble(
  unbroken_word = sample(unbroken_vocab_words, size = 300)
)
# send to google sheet for manual check
# googlesheets4::write_sheet(unbroken_sample, ss_url, sheet = "unbroken_check")
# manual check, add column "is_ok"
# checked_unbroken_words <- googlesheets4::read_sheet(
# ss_url,
# sheet = "check unbroken words"
# )
# if breakdown is ok, value is "y"
# table(checked_unbroken_words$is_ok)
# ? n y
# 24 32 244
# many of the exceptions can/should be fixed in wiktionary
About 10% of this sample was judged to be wrong (definitely should have a breakdown), and about 10% was judged to be questionable (maybe should have a breakdown).
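Those fractions follow directly from the tallies in the commented output above (a quick sketch; labeling "?" as questionable and "n" as wrong):
# Manual-check tallies out of the 300 sampled words (sketch).
round(c(questionable = 24, wrong = 32, ok = 244) / 300, 3)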