Author: S. Wu
Prediction Algorithm - A 4-gram model with backoff smoothing. The algorithm first searches the highest-order N-gram table available (e.g., the 4-gram table if the input phrase contains 3 or more words) and returns the top 3 next-word candidates by frequency. If the evidence is insufficient, it backs off to the next lower-order N-gram table, and so on (Jurafsky & Martin, 2014). A toy sketch of this lookup order follows the overview below.
Data Sampling and Cleaning - A 60% random sample of the English blogs, news, and Twitter corpora from HC Corpora was preprocessed and converted into N-gram frequency tables with R's quanteda package. Features that occurred only once in each N-gram table were removed, since they add little predictive value; this decision greatly improved app speed.
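The sketch below illustrates this backoff lookup on made-up toy tables (the tables, the helper backoffDemo(), and its inputs are invented purely for demonstration; the real tables tb1-tb4 and the nextWord() function are built later in this document).
library(data.table);
# Toy 3-gram and 2-gram frequency tables; words are joined with "_" as in the real tables
toy3<- data.table(feature=c("one_of_the", "one_of_my"), freq=c(12, 4));
toy2<- data.table(feature=c("of_the", "of_my", "of_course"), freq=c(30, 8, 5));
backoffDemo<- function(words, k=3) {
  pred<- character(0);
  for (tb in list(toy3, toy2)) {                        # highest-order table first
    contextLen<- lengths(strsplit(tb$feature[1], "_")) - 1;
    if (length(words) < contextLen) next;               # not enough context for this order
    key<- paste0("^", paste(tail(words, contextLen), collapse="_"), "_");
    hits<- tb[grep(key, feature)][order(-freq)];        # matching features, most frequent first
    if (nrow(hits) > 0) {
      lastWords<- sapply(strsplit(hits$feature, split="_"), function(x) x[length(x)]);
      pred<- unique(c(pred, lastWords));
    }
    if (length(pred) >= k) break;                       # enough candidates, stop backing off
  }
  return(head(pred, k));
}
backoffDemo(c("one", "of"));  # searches toy3 for "one_of_", then backs off to toy2 for "of_"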
pkg<-c("knitr", "quanteda", "data.table", "plyr", "dplyr")
pkgCheck<-pkg %in% rownames(installed.packages())
for(i in 1:length(pkg)) {
if(pkgCheck[i]==FALSE) {
install.packages(pkg[i])
}
library(pkg[i],character.only = TRUE, quietly = TRUE)
}
options(scipen=999); #remove scientific notation in printing
getSample <- function(x, p) {
  sample <- sample(x, size = as.integer(length(x)*p), replace = FALSE)
  return(sample)
};
# 60% of the total corpus is the training set; the rest is set aside as a validation set.
trainP<- 0.60; #testP<- 0.2;
##blogs
con<- file('en_US.blogs.txt', 'rb');
blog<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con);
set.seed(123);
blogTrain<- getSample(blog, trainP); blogVal<- blog[!(blog %in% blogTrain)];
# set.seed(1234);
saveRDS(blogTrain, 'rData/blogTrain.rds');
saveRDS(blogVal, 'rData/blogVal.rds');
rm(blog, blogVal);
##news
con<- file('en_US.news.txt', 'rb');
news<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(456);
newsTrain<- getSample(news, trainP); newsVal<- news[!(news %in% newsTrain)];
# set.seed(4567);
saveRDS(newsTrain, 'rData/newsTrain.rds');
saveRDS(newsVal, 'rData/newsVal.rds');
rm(news, newsVal);
##twitter
con<- file('en_US.twitter.txt', 'rb');
twitter<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(789);
twitterTrain<- getSample(twitter, trainP); twitterVal<- twitter[!(twitter %in% twitterTrain)];
saveRDS(twitterTrain, 'rData/twitterTrain.rds');
saveRDS(twitterVal, 'rData/twitterVal.rds');
rm(twitter, twitterVal);
allTrain<- c(blogTrain, newsTrain, twitterTrain);
rm(blogTrain, newsTrain, twitterTrain, trainP, getSample);
saveRDS(allTrain, 'rData/allTrain.rds');
N-Gram Models:
use dfm() defaults: toLower = TRUE, removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE
no stopword removal, no stemming
remove profanity using a word list from a public website, including variations (e.g., -ed, -ing forms)
transform selected contractions (e.g., do not -> don't, i am -> i'm)
remove Twitter symbols (# and @)
remove unnecessary whitespace
N-Gram Tables:
Remove features that occur only once (frequency == 1) in the training corpus:
1-Gram: removes 58.9% of the features (types), 0.6% of the frequency (counts)
2-Gram: removes 72.3% of the features (types), 13.3% of the frequency (counts)
3-Gram: removes 85.5% of the features (types), 48.1% of the frequency (counts)
4-Gram: removes 93.6% of the features (types), 78.9% of the frequency (counts)
# convert to ASCII and lower case
allTrain<- iconv(allTrain, "latin1", "ASCII", sub="");
allTrain<- tolower(allTrain);
# transform selective contractions
from<- c("do not", "does not", "did not", "have not", "has not", "had not",
"cannot", "would not", "could not", "should not", "must not", "will not",
"are not", "is not", "was not", "were not",
"i am ", "i have ", "i will ", "you are ", "youre ", "you have ", "you will ",
"he will ", "she will ", "they are ", "they have ", "they will ",
"we are ", "we have ", "we will ", "it is ", "it will "); #"he is ", "she is ",
to<- c("don't", "doesn't", "didn't", "haven't", "hasn't", "hadn't",
"can't", "wouldn't", "couldn't", "shouldn't", "mustn't", "won't",
"aren't", "isn't", "wasn't", "weren't",
"i'm ", "i've ", "i'll ", "you're ", "you're ", "you've ", "you'll ",
"he'll ", "she'll ", "they're ", "they've ", "they'll ",
"we're ", "we've ", "we'll ", "it's ", "it'll "); #"he's ", "she's ",
saveRDS(from, 'rData/contractionsFrom.rds');
saveRDS(to, 'rData/contractionsTo.rds');
for (i in 1:length(to)) {
allTrain<- gsub(from[i], to[i], allTrain)
};
rm(i);
saveRDS(allTrain, 'rData/allTrainClean.rds');
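As a quick sanity check, the same loop applied to a hypothetical sample string (invented here purely for illustration) behaves as expected:
s<- "i am sure i do not know";  # hypothetical sample string
for (i in 1:length(to)) { s<- gsub(from[i], to[i], s) };
s;  # "i'm sure i don't know"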
## Functions for generating N-Gram dfm and frequency table
# Function: Generate N-Gram dfm (note: the profanity list 'badword' must already be loaded, e.g. from rData/badword.rds)
dfmGenerate<- function(trainSet, n) {
  dfm<- dfm(paste(trainSet, collapse = " "), ignoredFeatures=badword, removeTwitter = TRUE, ngrams=n);
  return(dfm);
}
# Function: Generate N-Gram frequency table
tbGenerate<- function(dfm) {
  tb<- data.table(feature=features(dfm), freq=colSums(dfm), key = "feature");
  tb<- tb[order(-freq)];
  # collapse repeated/leading underscores, then re-aggregate any features that became duplicates
  tb$feature<- gsub('_+', '_', tb$feature);
  tb$feature<- gsub('^_', '', tb$feature);
  if (length(unique(tb$feature)) < nrow(tb)) {
    tb<- tb[, list(freq=sum(freq)), by=feature];
  }
  return(tb);
}
# Function: Summarize a frequency table (singleton share and coverage)
showFreq<- function(tb) {
  once<- sum(tb$freq<=1);
  freqCum<- cumsum(tb$freq);
  cat(sprintf(paste0('feature number: %d\n',
                     'freq=1 features percentage: %.1f %%\n',
                     'freq=1 frequency percentage: %.1f %%\n',
                     'fifty percent coverage features: %d\n',
                     'ninety percent coverage features: %d\n'),
              nrow(tb),
              once/nrow(tb)*100,
              once/sum(tb$freq)*100,
              head(which(freqCum>=sum(tb$freq)*0.5),1),
              head(which(freqCum>=sum(tb$freq)*0.9),1)
  ));
}
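For instance, on a tiny made-up frequency table (illustrative data only, not drawn from the corpus), showFreq() reports:
showFreq(data.table(feature=c("the", "to", "of", "xyzzy"), freq=c(9, 6, 4, 1)));
# feature number: 4
# freq=1 features percentage: 25.0 %
# freq=1 frequency percentage: 5.0 %
# fifty percent coverage features: 2
# ninety percent coverage features: 3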
# Function: Rank features within each "starter" (the feature minus its last word)
checkFreq<- function (tb) {
  # extract the "starter": everything up to and including the last underscore
  tb[, starter:= sapply(feature, function(x) {
    underscore<- gregexpr('_', x)[[1]];
    n<- underscore[length(underscore)];
    substr(x, 1, n);
  }, USE.NAMES = FALSE)];
  # n = rank of each feature within its starter group (the table is already sorted by frequency)
  tb[, n:= 1L];
  tb<- ddply(tb, .(starter), function(x) {
    if (nrow(x) > 1) {
      for (i in 2:nrow(x)) {
        x$n[i]<- x$n[i-1]+1;
      }
    }
    return(x);
  })
  tb<- as.data.table(tb)[order(-freq)]
  return(tb)
};
# Function: Keep only the top k (default 5) features with the same "starter" (ignore the last word)
chooseTops<- function(tb, k=5) {
  tb<- as.data.table(tb)[n<=k];
  tb[, starter:= NULL];
  tb[, n:= NULL];
  tb<- tb[order(-freq)]
  return(tb)
};
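A toy run on made-up bigrams (illustrative only) shows how checkFreq() ranks features within each "starter" and how chooseTops() then keeps only the top k:
toy<- data.table(feature=c("of_the", "of_my", "of_course", "in_the"), freq=c(30, 8, 5, 12));
toy<- checkFreq(toy);  # adds starter ("of_", "in_") and the within-starter rank n
chooseTops(toy, 2);    # keeps at most 2 features per starter, so "of_course" is dropped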
## 1-Gram
dfm1<- dfmGenerate(allTrain, 1);
saveRDS(dfm1, 'rData/dfm1.rds');
tb1<- tbGenerate(dfm1);
rm(dfm1);
saveRDS(tb1, 'rData/tb1.rds');
showFreq(tb1);
# remove features that occur only once(freq=1): 58.9% of the features(type), 0.6% of the frequency(count)
pattern<- "-*[a-z0-9'.,]+[-a-z0-9'.,]*";
tb1<- subset(tb1, freq>1 & grepl(pattern, feature));
rm(pattern);
saveRDS(tb1, 'rData/tb1Trim.rds');
# keep only the top 20 unigrams to save file size
tb1<- tb1[1:20,];
saveRDS(tb1, 'rData/tb1TrimTops.rds');
gc();
## 2-Gram
dfm2<- dfmGenerate(allTrain, 2);
saveRDS(dfm2, 'rData/dfm2.rds');
tb2<- tbGenerate(dfm2);
rm(dfm2);
saveRDS(tb2, 'rData/tb2.rds');
showFreq(tb2);
# cut out features that occur only once(freq=1): 71.3% of the features(type), 11.6% of the frequency(count)
pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
tb2<- subset(tb2, freq>1 & grepl(pattern, feature));
rm(pattern);
saveRDS(tb2, 'rData/tb2Trim.rds');
# get the "starter" of each feature (excluding the last word)
tb2<- checkFreq(tb2);
saveRDS(tb2, 'rData/tb2Trim2.rds');
# keep only top 5 of each unique "starter" to save file size
tb2<- chooseTops(tb2, 5);
saveRDS(tb2, 'rData/tb2TrimTops.rds');
rm(tb1, tb2)
gc();
## 3-Gram
dfm3<- dfmGenerate(allTrain, 3);
saveRDS(dfm3, 'rData/dfm3.rds');
tb3<- tbGenerate(dfm3);
rm(dfm3);
saveRDS(tb3, 'rData/tb3.rds');
showFreq(tb3);
# cut out features that occur only once(freq=1): 84.2% the features(type), 43.8% of the frequency(count)
pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
"-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
"-*[a-z0-9'.,]+[-a-z0-9'.,]*");
tb3<- subset(tb3, freq>1 & grepl(pattern, feature));
rm(pattern);
saveRDS(tb3, 'rData/tb3Trim.rds');
# get the "starter" of each feature (excluding the last word)
tb3<- checkFreq(tb3);
saveRDS(tb3, 'rData/tb3Trim2.rds');
# keep only top 5 of each unique "starter" to save file size
tb3<- chooseTops(tb3, 5);
saveRDS(tb3, 'rData/tb3TrimTops.rds');
rm(tb3)
gc();
## 4-Gram
dfm4<- dfmGenerate(allTrain, 4);
saveRDS(dfm4, 'rData/dfm4.rds');
tb4<- tbGenerate(dfm4);
rm(dfm4);
saveRDS(tb4, 'rData/tb4.rds');
showFreq(tb4);
# cut out features that occur only once(freq=1): 93.6% the features(type), 78.9% of the frequency(count)
pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
"-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
"-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
"-*[a-z0-9'.,]+[-a-z0-9'.,]*");
tb4<- subset(tb4, freq>1 & grepl(pattern, feature));
rm(pattern);
saveRDS(tb4, 'rData/tb4Trim.rds');
# get the "starter" of each feature (excluding the last word)
tb4<- checkFreq(tb4);
saveRDS(tb4, 'rData/tb4Trim2.rds');
# keep only top 5 of each unique "starter" to save file size
tb4<- chooseTops(tb4, 5);
saveRDS(tb4, 'rData/tb4TrimTops.rds');
gc();
rm(allTrain);
badword<- readRDS('rData/badword.rds'); badword2<- readRDS('rData/badword2.rds');
from<- readRDS('rData/contractionsFrom.rds'); to<- readRDS('rData/contractionsTo.rds');
cleanInputStr <- function(inputStr) {
  # convert to lower case & ASCII
  inputStr<- tolower(inputStr)
  inputStr<- iconv(inputStr, "latin1", "ASCII", sub="")
  # remove profanity
  # inputStr<- gsub(paste(gsub("\\(|\\)","",badword), collapse = "|"), " ", inputStr)
  inputStr<- gsub(paste(badword2, collapse = "|"), " ", inputStr)
  # remove everything except alphabets/numbers/apostrophes/intra-word dashes
  inputStr<- gsub("[^a-z0-9 '-]", " ", inputStr)
  # convert contractions
  for (i in 1:length(to)) {
    inputStr<- gsub(from[i], to[i], inputStr)
  }
  # remove numbers that are not part of a word (e.g. the number in '2day' will be kept)
  inputStr<- gsub(" [0-9]* ", " ", inputStr)
  # remove unnecessary whitespace
  inputStr<- gsub("\\s+", " ", inputStr)
  # remove leading and trailing white spaces
  inputStr<- gsub("(^\\s+|\\s+$)", "", inputStr)
  # return the cleaned sentence; if the result is empty, return an empty string
  if (nchar(inputStr) > 0) {
    return(inputStr);
  } else {
    return("");
  }
}
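A quick example of the cleaning steps (illustrative; the exact output assumes none of these words match the profanity list badword2):
cleanInputStr("I am NOT sure, it is 100% FINE!!");
# [1] "i'm not sure it's fine"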
tb1<- readRDS('rData/tb1TrimTops.rds');
tb2<- readRDS('rData/tb2TrimTops.rds');
tb3<- readRDS('rData/tb3TrimTops.rds');
tb4<- readRDS('rData/tb4TrimTops.rds');
# Use "Trim" 4-gram to 1-gram tables from 60% training set corpus
nextWord<- function(inputStr, k=3) {
  # inputStr = a phrase (multiple words) typed in the input box
  # k = number of next-word predictions displayed
  # Preprocess the input string
  inputStr<- cleanInputStr(inputStr);
  inputStr<- unlist(strsplit(inputStr, split=" "));
  inputStrLen<- length(inputStr);
  nxtWordFound<- FALSE;
  nxtWordPred<- as.character(NULL);
  # 1. First search the 4-Gram table
  if (inputStrLen >= 3 & !nxtWordFound) {
    # Construct the search string, with one underscore "_" between words
    searchStr<- paste0("^", paste(inputStr[(inputStrLen-2):inputStrLen], collapse="_"), "_");
    # Subset the 4-Gram table with the search string
    temp<- tb4[grep(searchStr, feature)];
    # Collect the last word of each matching feature
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  # 2. Next search the 3-Gram table
  if (inputStrLen >= 2 & !nxtWordFound) {
    # Construct the search string, with one underscore "_" between words
    searchStr<- paste0("^", paste(inputStr[(inputStrLen-1):inputStrLen], collapse="_"), "_");
    # Subset the 3-Gram table with the search string
    temp<- tb3[grep(searchStr, feature)];
    # Collect the last word of each matching feature
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  # 3. Next search the 2-Gram table
  if (inputStrLen >= 1 & !nxtWordFound) {
    # Construct the search string, with one underscore "_" after the last word
    searchStr<- paste0("^", paste(inputStr[inputStrLen], collapse="_"), "_");
    # Subset the 2-Gram table with the search string
    temp<- tb2[grep(searchStr, feature)];
    # Collect the last word of each matching feature
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  # 4. If no match/not enough candidates found in the 4-, 3- and 2-Gram tables,
  #    fill in with the most frequently used word(s) from the 1-Gram table
  if (inputStrLen >= 0 & !nxtWordFound) {
    tempWord<- head(tb1$feature[!(tb1$feature %in% nxtWordPred)], (k-length(nxtWordPred)));
    nxtWordPred<- unique(c(nxtWordPred, tempWord));
    temp<- NULL; tempWord<- NULL;
  }
  # nxtWordPred<- paste(nxtWordPred, collapse=', ');
  return(nxtWordPred);
}
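Example calls (for illustration only; the actual predictions depend on the training sample, so no output is shown):
nextWord("I am going to the", k=3);
nextWord("thanks for", k=3);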
source('benchmark.r');
predict.baseline <- nextWord
benchmark(predict.baseline,
          # additional parameters to be passed to the prediction function can be inserted here
          sent.list = list('tweets' = tweets[c(4:10, 61:100)],
                           'blogs' = blogs[c(4:10, 61:100)]),
          ext.output = T)
Test Results:
Overall top-3 score: 17.90 %
Overall top-1 precision: 13.49 %
Overall top-3 precision: 21.38 %
Average runtime: 1133.63 msec
Number of predictions: 2003
Total memory used: 350.33 MB
Dataset details
Dataset "blogs" (47 lines, 1172 words)
Score: 16.41 %, Top-1 precision: 12.2 %, Top-3 precision: 19.76 %
Dataset "Tweets" (47 lines, 848 words)
Score: 19.39 %, Top-1 precision: 14.78 %, Top-3 precision: 23.00 %