Word Prediction Using N-Grams

Author: S. Wu


Prediction Algorithm - A 4-gram model with a backoff strategy. The app first searches the highest-order N-gram table available (e.g., the 4-gram table if the input phrase contains three or more words) and returns the top 3 "next word" candidates by frequency. If the evidence is insufficient, it backs off to the next lower-order N-gram table, and so on down to the unigram table (Jurafsky & Martin, 2014).
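
For illustration, a minimal sketch of this backoff order is shown below. It assumes frequency tables tb1 through tb4 whose features are underscore-joined N-grams sorted by descending frequency, as built later in this document; the full nextWord() function (see the Prediction Function section) accumulates candidates across orders rather than returning at the first match.

# Illustrative sketch only, not the app's actual prediction function
# (assumes data.tables tb1..tb4 as described above).
backoffSketch <- function(words, k = 3) {
  n <- length(words)
  for (order in 4:2) {                      # try the 4-gram, then 3-gram, then 2-gram table
    if (n < order - 1) next
    prefix <- paste0("^", paste(tail(words, order - 1), collapse = "_"), "_")
    tb <- list(tb2, tb3, tb4)[[order - 1]]
    hits <- tb[grep(prefix, feature)]       # rows whose feature starts with the prefix
    if (nrow(hits) >= k) return(head(sub(".*_", "", hits$feature), k))
  }
  head(tb1$feature, k)                      # fall back to the most frequent unigrams
}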

Data Sampling and Cleaning - A 60% random sample of the English blogs, news, and Twitter corpora from HC Corpora was preprocessed and converted into N-gram frequency tables using R's quanteda package. Features that occurred only once in each N-gram table were removed as minimally useful to prediction, a decision that greatly improved app speed.

# Check for required packages; install any that are missing, then load them all
pkg<- c("knitr", "quanteda", "data.table", "plyr", "dplyr")
pkgCheck<- pkg %in% rownames(installed.packages())
for(i in 1:length(pkg)) {
    if(pkgCheck[i]==FALSE) {
        install.packages(pkg[i])
    }
    library(pkg[i], character.only = TRUE, quietly = TRUE)
}
options(scipen=999); #remove scientific notation in printing

Data Processing

Load Data and Sample Training Set

# Function: randomly sample a proportion p of the elements of x
getSample <- function(x, p) {
  sample <- sample(x, size = as.integer(length(x)*p), replace = FALSE)
  return(sample)
};

# 60% of the total corpus is the training set; the remainder is set aside as the validation set.
trainP<- 0.60; #testP<- 0.2; 
##blogs
con<- file('en_US.blogs.txt', 'rb');
blog<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con);
set.seed(123);
blogTrain<- getSample(blog, trainP); blogVal<- blog[!(blog %in% blogTrain)];
# set.seed(1234);
saveRDS(blogTrain, 'rData/blogTrain.rds');
saveRDS(blogVal, 'rData/blogVal.rds');
rm(blog, blogVal);

##news
con<- file('en_US.news.txt', 'rb');
news<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(456);
newsTrain<- getSample(news, trainP); newsVal<- news[!(news %in% newsTrain)];
# set.seed(4567);
saveRDS(newsTrain, 'rData/newsTrain.rds');
saveRDS(newsVal, 'rData/newsVal.rds');
rm(news, newsVal);

##twitter
con<- file('en_US.twitter.txt', 'rb');
twitter<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(789);
twitterTrain<- getSample(twitter, trainP); twitterVal<- twitter[!(twitter %in% twitterTrain)];
saveRDS(twitterTrain, 'rData/twitterTrain.rds');
saveRDS(twitterVal, 'rData/twitterVal.rds');
rm(twitter, twitterVal);

allTrain<- c(blogTrain, newsTrain, twitterTrain);
rm(blogTrain, newsTrain, twitterTrain, trainP, getSample);
saveRDS(allTrain, 'rData/allTrain.rds');

Create N-Gram Document-Feature Matrix (DFM)

N-Gram Models:

  1. use dfm() defaults: toLower = TRUE, removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE

  2. no stopword removal, no stemming

  3. remove profanity, using a list from the web, including variations (e.g., -ed, -ing endings)

  4. transform selected contractions (e.g., do not -> don't, i am -> i'm)

  5. remove Twitter symbols (# and @)

  6. remove unnecessary whitespace

N-Gram Tables:

Remove features that occur only once (frequency == 1) in the training corpus; the percentages below are computed as in the short sketch after this list:

  1. 1-Gram: removes 58.9% of the features (types), 0.6% of the frequency (counts)

  2. 2-Gram: removes 72.3% of the features (types), 13.3% of the frequency (counts)

  3. 3-Gram: removes 85.5% of the features (types), 48.1% of the frequency (counts)

  4. 4-Gram: removes 93.6% of the features (types), 78.9% of the frequency (counts)
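
The percentages above are reported by the showFreq() helper defined later; as a minimal reference, for an N-gram frequency table tb (a data.table with columns feature and freq) they amount to:

  # illustrative names; showFreq() below performs the same calculation
  singletonTypePct  <- sum(tb$freq == 1) / nrow(tb) * 100     # share of features (types)
  singletonCountPct <- sum(tb$freq == 1) / sum(tb$freq) * 100 # share of total counts

Since each singleton contributes exactly one count, its share of the total frequency is simply the number of singletons divided by the total count.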

# convert to ASCII and lower case
  allTrain<- iconv(allTrain, "latin1", "ASCII", sub="");
  allTrain<- tolower(allTrain);

# transform selective contractions
  from<- c("do not", "does not", "did not", "have not", "has not", "had not", 
           "cannot", "would not", "could not", "should not", "must not", "will not", 
           "are not", "is not", "was not", "were not", 
           "i am ", "i have ", "i will ", "you are ", "youre ", "you have ", "you will ", 
           "he will ", "she will ", "they are ", "they have ", "they will ", 
           "we are ", "we have ", "we will ", "it is ", "it will "); #"he is ", "she is ", 
  to<- c("don't", "doesn't", "didn't", "haven't", "hasn't", "hadn't", 
         "can't", "wouldn't", "couldn't", "shouldn't", "mustn't", "won't", 
         "aren't", "isn't", "wasn't", "weren't", 
         "i'm ", "i've ", "i'll ", "you're ", "you're ", "you've ", "you'll ", 
         "he'll ", "she'll ", "they're ", "they've ", "they'll ", 
         "we're ", "we've ", "we'll ", "it's ", "it'll "); #"he's ", "she's ", 
  saveRDS(from, 'rData/contractionsFrom.rds');
  saveRDS(to, 'rData/contractionsTo.rds');

  for (i in 1:length(to)) {
    allTrain<- gsub(from[i], to[i], allTrain)
  };
  rm(i);

  saveRDS(allTrain, 'rData/allTrainClean.rds');
## Functions for generating N-Gram dfm and frequency table

# Function: Generate an N-Gram dfm with quanteda
# (requires the profanity list `badword`, e.g. badword<- readRDS('rData/badword.rds'))
dfmGenerate<- function(trainSet, n) {
  dfm<- dfm(paste(trainSet, collapse = " "), ignoredFeatures=badword, removeTwitter = TRUE, ngrams=n);
  return(dfm);
}

#Function: Generate N-Gram frequency table
tbGenerate<- function(dfm) {
  tb<- data.table(feature=features(dfm), freq=colSums(dfm), key = "feature");
  tb<- tb[order(-freq)];
  tb$feature<- gsub('_+', '_', tb$feature);
  tb$feature<- gsub('^_', '', tb$feature);
  if (length(unique(tb$feature)) < nrow(tb)) {
    tb<- tb[, list(freq=sum(freq)), by=feature];
  }
  return(tb);
}

# Function: Report feature counts, singleton percentages, and coverage statistics
showFreq<- function(tb) {
  once<- sum(tb$freq<=1);
  freqCum<- cumsum(tb$freq);
  cat(sprintf(paste0('feature number:                   %d\n',
                     'freq=1 features percentage:       %.1f %%\n',
                     'freq=1 frequency percentage:      %.1f %%\n',
                     'fifty percent coverage features:  %d\n',
                     'ninety percent coverage features: %d\n'),
                nrow(tb),
                once/nrow(tb)*100,
                once/sum(tb$freq)*100,
                head(which(freqCum>=sum(tb$freq)*0.5),1),
                head(which(freqCum>=sum(tb$freq)*0.9),1)
                ));
}

#Function: Rank features by frequency within each "starter" (the feature minus its last word)
checkFreq<- function (tb) {
  tb[, starter:= sapply(feature, function(x) {
    underscore<- gregexpr('_', x)[[1]];
    n<-underscore[length(underscore)];
    substr(x, 1, n);
  }, USE.NAMES = FALSE)];
  
  tb[, n:= 1L];
  tb<- ddply(tb, .(starter), function(x) {
    if (nrow(x) > 1) {
      for (i in 2:nrow(x)) {
        x$n[i]<- x$n[i-1]+1;
      }
    }
    return(x);
  })
  tb<- as.data.table(tb)[order(-freq)]
  return(tb)
};

#Function: keep only the top k (default 5) features for each "starter" (ignore the last word)
chooseTops<- function(tb, k=5) {
  tb<- as.data.table(tb)[n<=k];
  tb[, starter:= NULL];
  tb[, n:= NULL];
  tb<- tb[order(-freq)]
  return(tb)
};
## 1-Gram
  dfm1<- dfmGenerate(allTrain, 1);
  saveRDS(dfm1, 'rData/dfm1.rds');  
  tb1<- tbGenerate(dfm1);
  rm(dfm1);
  saveRDS(tb1, 'rData/tb1.rds');

  showFreq(tb1);

  # remove features that occur only once(freq=1): 58.9% of the features(type), 0.6% of the frequency(count)
  pattern<- "-*[a-z0-9'.,]+[-a-z0-9'.,]*";
  tb1<- subset(tb1, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb1, 'rData/tb1Trim.rds');
  
  # keep only top 20 unigram to save file size
  tb1<- tb1[1:20,];
  saveRDS(tb1, 'rData/tb1TrimTops.rds');
  
  gc();
  
## 2-Gram
  dfm2<- dfmGenerate(allTrain, 2);
  saveRDS(dfm2, 'rData/dfm2.rds');
  tb2<- tbGenerate(dfm2);

  rm(dfm2);
  saveRDS(tb2, 'rData/tb2.rds');

  showFreq(tb2);

  # cut out features that occur only once(freq=1): 71.3% of the features(type), 11.6% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb2<- subset(tb2, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb2, 'rData/tb2Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb2<- checkFreq(tb2);
  saveRDS(tb2, 'rData/tb2Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb2<- chooseTops(tb2, 5);
  saveRDS(tb2, 'rData/tb2TrimTops.rds');
  rm(tb1, tb2)

  gc();

## 3-Gram
  dfm3<- dfmGenerate(allTrain, 3);
  saveRDS(dfm3, 'rData/dfm3.rds');
  tb3<- tbGenerate(dfm3);
  rm(dfm3);
  saveRDS(tb3, 'rData/tb3.rds');
  
  showFreq(tb3);

  # cut out features that occur only once(freq=1): 84.2% the features(type), 43.8% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb3<- subset(tb3, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb3, 'rData/tb3Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb3<- checkFreq(tb3);
  saveRDS(tb3, 'rData/tb3Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb3<- chooseTops(tb3, 5);
  saveRDS(tb3, 'rData/tb3TrimTops.rds');
  rm(tb3)

  gc();

## 4-Gram
  dfm4<- dfmGenerate(allTrain, 4);
  saveRDS(dfm4, 'rData/dfm4.rds');
  tb4<- tbGenerate(dfm4);
  rm(dfm4);
  saveRDS(tb4, 'rData/tb4.rds');
  
  showFreq(tb4);

  # cut out features that occur only once(freq=1): 93.6% the features(type), 78.9% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                         "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                         "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb4<- subset(tb4, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb4, 'rData/tb4Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb4<- checkFreq(tb4);
  saveRDS(tb4, 'rData/tb4Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb4<- chooseTops(tb4, 5);
  saveRDS(tb4, 'rData/tb4TrimTops.rds');

  gc();
  rm(allTrain);

Clean String Function

badword<- readRDS('rData/badword.rds'); badword2<- readRDS('rData/badword2.rds');
from<- readRDS('rData/contractionsFrom.rds'); to<- readRDS('rData/contractionsTo.rds');

cleanInputStr <- function(inputStr) {
  # convert to lower case & ASCII
  inputStr<- tolower(inputStr)
  inputStr<- iconv(inputStr, "latin1", "ASCII", sub="")
  # remove profanity
  # inputStr<- gsub(paste(gsub("\\(|\\)","",badword), collapse = "|"), " ", inputStr)
  inputStr<- gsub(paste(badword2, collapse = "|"), " ", inputStr)
  # remove non alphabets/numbers/apostrophes/intra-words dashes
  inputStr<- gsub("[^a-z0-9 '-]", " ", inputStr) 
  # convert contractions
  for (i in 1:length(to)) {
    inputStr<- gsub(from[i], to[i], inputStr)
  }  
  # remove standalone numbers (e.g., the number in '2day' will be kept)
  inputStr<- gsub(" [0-9]* ", " ", inputStr)
  # remove unnecessary whitespace
  inputStr<- gsub("\\s+", " ", inputStr) 
  # remove leading and trailing white spaces
  inputStr<- gsub("(^\\s+|\\s+$)", "", inputStr)
  
  # Return the cleaned string.
  # If the resulting string is empty, return an empty string.
  if (nchar(inputStr) > 0) {
    return(inputStr); 
  } else {
    return("");
  }
}
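
A quick illustration of the cleaning step (a hedged example, assuming none of these words match the profanity patterns loaded above):

cleanInputStr("Hello!! I am going to the #store, you are coming??")
# expected result: "hello i'm going to the store you're coming"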

tb1<- readRDS('rData/tb1TrimTops.rds');
tb2<- readRDS('rData/tb2TrimTops.rds');
tb3<- readRDS('rData/tb3TrimTops.rds');
tb4<- readRDS('rData/tb4TrimTops.rds');

Prediction Function: N-Gram backoff

# Use "Trim" 4-gram to 1-gram tables from 60% training set corpus

nextWord<- function(inputStr, k=3) {
  # inputStr = a phrase (multiple words) input in the input box
  # k = number of next-word prediction displayed
  
  # Preprocess the input string
  inputStr <- cleanInputStr(inputStr);
  inputStr<- unlist(strsplit(inputStr, split=" "));
  inputStrLen<- length(inputStr);
  
  nxtWordFound<- FALSE;
  nxtWordPred<- as.character(NULL);
  
  # 1. First search for 4-Gram
  if (inputStrLen >= 3 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[(inputStrLen-2):inputStrLen], collapse="_"),"_");
    
    # Subset the 4-Gram table with the search string
    temp<- tb4[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 2. Next search for 3-Gram
  if (inputStrLen >= 2 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[(inputStrLen-1):inputStrLen], collapse="_"),"_");
    
    # Subset the 3-Gram table with the search string
    temp<- tb3[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 3. Next search for 2-Gram
  if (inputStrLen >= 1 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[inputStrLen], collapse="_"),"_");
    
    # Subset the 2-Gram table with the search string
    temp<- tb2[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 4. If no matching/not enough features found in 4, 3 and 2 Gram tables, return the most frequently used word(s) from the 1-Gram table
  if (inputStrLen >= 0 & !nxtWordFound) {
    tempWord<- head(tb1$feature[!(tb1$feature %in% nxtWordPred)], (k-length(nxtWordPred)));
    nxtWordPred<- unique(c(nxtWordPred, tempWord));
    temp<- NULL; tempWord<- NULL;
  }
  # nxtWordPred<- paste(nxtWordPred, collapse=', ');
  return(nxtWordPred);
}
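
A hedged usage example (the actual candidates returned depend on the trimmed N-gram tables loaded above):

nextWord("I could not live without my", k = 3)
# returns a character vector of up to k = 3 predicted next words, drawn from the
# highest-order N-gram table(s) matching the cleaned input and ordered by frequency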

Benchmark Test - Validation Set

source('benchmark.r');
predict.baseline <- nextWord
benchmark(predict.baseline, 
          # additional parameters to be passed to the prediction function can be inserted here
          sent.list = list('tweets' = tweets[c(4:10, 61:100)], 
                           'blogs' = blogs[c(4:10, 61:100)]), 
          ext.output = T)

Test Results:

Overall top-3 score:     17.90 %
Overall top-1 precision: 13.49 %
Overall top-3 precision: 21.38 %
Average runtime:         1133.63 msec
Number of predictions:   2003
Total memory used:       350.33 MB

Dataset details
  Dataset "blogs" (47 lines, 1172 words)
    Score: 16.41 %, Top-1 precision: 12.2 %, Top-3 precision: 19.76 %
  Dataset "Tweets" (47 lines, 848 words)
    Score: 19..39 %, Top-1 precision: 14.78 %, Top-3 precision: 23.00 %