Word Prediction Using N-Grams

Author: S. Wu


Prediction Algorithm - A 4-gram model with a backoff strategy. The app first searches the highest-order N-gram table available (e.g., the 4-gram table if the input phrase contains three or more words) and returns the top 3 "next word" candidates by frequency. If the evidence is insufficient, it backs off to the next lower-order N-gram table, and so on down to the unigram table (Jurafsky & Martin, 2014).
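
For illustration, a minimal sketch of this backoff order is shown below. It assumes frequency tables tb1 through tb4 whose features are underscore-joined N-grams sorted by descending frequency, as built later in this document; the full nextWord() function (see the Prediction Function section) accumulates candidates across orders rather than returning at the first match.

# Illustrative sketch only, not the app's actual prediction function
# (assumes data.tables tb1..tb4 as described above).
backoffSketch <- function(words, k = 3) {
  n <- length(words)
  for (order in 4:2) {                      # try the 4-gram, then 3-gram, then 2-gram table
    if (n < order - 1) next
    prefix <- paste0("^", paste(tail(words, order - 1), collapse = "_"), "_")
    tb <- list(tb2, tb3, tb4)[[order - 1]]
    hits <- tb[grep(prefix, feature)]       # rows whose feature starts with the prefix
    if (nrow(hits) >= k) return(head(sub(".*_", "", hits$feature), k))
  }
  head(tb1$feature, k)                      # fall back to the most frequent unigrams
}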

Data Sampling and Cleaning - A 60% random sample of the English blogs, news, and Twitter corpora from HC Corpora was preprocessed and converted into N-gram frequency tables using R's quanteda package. Features that occurred only once in each N-gram table were removed as minimally useful to prediction, a decision that greatly improved app speed.

# Check for required packages; install any that are missing, then load them all
pkg<- c("knitr", "quanteda", "data.table", "plyr", "dplyr")
pkgCheck<- pkg %in% rownames(installed.packages())
for(i in 1:length(pkg)) {
    if(pkgCheck[i]==FALSE) {
        install.packages(pkg[i])
    }
    library(pkg[i], character.only = TRUE, quietly = TRUE)
}
options(scipen=999); #remove scientific notation in printing

Data Processing

Load Data and Sample Training Set

# Function: randomly sample a proportion p of the elements of x
getSample <- function(x, p) {
  sample <- sample(x, size = as.integer(length(x)*p), replace = FALSE)
  return(sample)
};

# 60% of the total corpus is the training set; the remainder is set aside as the validation set.
trainP<- 0.60; #testP<- 0.2; 
##blogs
con<- file('en_US.blogs.txt', 'rb');
blog<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con);
set.seed(123);
blogTrain<- getSample(blog, trainP); blogVal<- blog[!(blog %in% blogTrain)];
# set.seed(1234);
saveRDS(blogTrain, 'rData/blogTrain.rds');
saveRDS(blogVal, 'rData/blogVal.rds');
rm(blog, blogVal);

##news
con<- file('en_US.news.txt', 'rb');
news<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(456);
newsTrain<- getSample(news, trainP); newsVal<- news[!(news %in% newsTrain)];
# set.seed(4567);
saveRDS(newsTrain, 'rData/newsTrain.rds');
saveRDS(newsVal, 'rData/newsVal.rds');
rm(news, newsVal);

##twitter
con<- file('en_US.twitter.txt', 'rb');
twitter<- readLines(con, encoding = 'UTF-8', skipNul = TRUE);
close(con); rm(con);
set.seed(789);
twitterTrain<- getSample(twitter, trainP); twitterVal<- twitter[!(twitter %in% twitterTrain)];
saveRDS(twitterTrain, 'rData/twitterTrain.rds');
saveRDS(twitterVal, 'rData/twitterVal.rds');
rm(twitter, twitterVal);

allTrain<- c(blogTrain, newsTrain, twitterTrain);
rm(blogTrain, newsTrain, twitterTrain, trainP, getSample);
saveRDS(allTrain, 'rData/allTrain.rds');

Create N-Gram Document-Feature Matrix (DFM)

N-Gram Models:

  1. use dfm() defaults: toLower = TRUE, removeNumbers = TRUE, removePunct = TRUE, removeSeparators = TRUE

  2. no stopword removal, no stemming

  3. remove profanity, using a list from the web, including variations (e.g., -ed, -ing endings)

  4. transform selected contractions (e.g., do not -> don't, i am -> i'm)

  5. remove Twitter symbols (# and @)

  6. remove unnecessary whitespace

N-Gram Tables:

Remove features that occur only once (frequency == 1) in the training corpus; the percentages below are computed as in the short sketch after this list:

  1. 1-Gram: removes 58.9% of the features (types), 0.6% of the frequency (counts)

  2. 2-Gram: removes 72.3% of the features (types), 13.3% of the frequency (counts)

  3. 3-Gram: removes 85.5% of the features (types), 48.1% of the frequency (counts)

  4. 4-Gram: removes 93.6% of the features (types), 78.9% of the frequency (counts)
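
The percentages above are reported by the showFreq() helper defined later; as a minimal reference, for an N-gram frequency table tb (a data.table with columns feature and freq) they amount to:

  # illustrative names; showFreq() below performs the same calculation
  singletonTypePct  <- sum(tb$freq == 1) / nrow(tb) * 100     # share of features (types)
  singletonCountPct <- sum(tb$freq == 1) / sum(tb$freq) * 100 # share of total counts

Since each singleton contributes exactly one count, its share of the total frequency is simply the number of singletons divided by the total count.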

# convert to ASCII and lower case
  allTrain<- iconv(allTrain, "latin1", "ASCII", sub="");
  allTrain<- tolower(allTrain);

# transform selective contractions
  from<- c("do not", "does not", "did not", "have not", "has not", "had not", 
           "cannot", "would not", "could not", "should not", "must not", "will not", 
           "are not", "is not", "was not", "were not", 
           "i am ", "i have ", "i will ", "you are ", "youre ", "you have ", "you will ", 
           "he will ", "she will ", "they are ", "they have ", "they will ", 
           "we are ", "we have ", "we will ", "it is ", "it will "); #"he is ", "she is ", 
  to<- c("don't", "doesn't", "didn't", "haven't", "hasn't", "hadn't", 
         "can't", "wouldn't", "couldn't", "shouldn't", "mustn't", "won't", 
         "aren't", "isn't", "wasn't", "weren't", 
         "i'm ", "i've ", "i'll ", "you're ", "you're ", "you've ", "you'll ", 
         "he'll ", "she'll ", "they're ", "they've ", "they'll ", 
         "we're ", "we've ", "we'll ", "it's ", "it'll "); #"he's ", "she's ", 
  saveRDS(from, 'rData/contractionsFrom.rds');
  saveRDS(to, 'rData/contractionsTo.rds');

  for (i in 1:length(to)) {
    allTrain<- gsub(from[i], to[i], allTrain)
  };
  rm(i);

  saveRDS(allTrain, 'rData/allTrainClean.rds');
## Functions for generating N-Gram dfm and frequency table

# Function: Generate an N-Gram dfm with quanteda
# (requires the profanity list `badword`, e.g. badword<- readRDS('rData/badword.rds'))
dfmGenerate<- function(trainSet, n) {
  dfm<- dfm(paste(trainSet, collapse = " "), ignoredFeatures=badword, removeTwitter = TRUE, ngrams=n);
  return(dfm);
}

#Function: Generate N-Gram frequency table
tbGenerate<- function(dfm) {
  tb<- data.table(feature=features(dfm), freq=colSums(dfm), key = "feature");
  tb<- tb[order(-freq)];
  tb$feature<- gsub('_+', '_', tb$feature);
  tb$feature<- gsub('^_', '', tb$feature);
  if (length(unique(tb$feature)) < nrow(tb)) {
    tb<- tb[, list(freq=sum(freq)), by=feature];
  }
  return(tb);
}

# Function: Report feature counts, singleton percentages, and coverage statistics
showFreq<- function(tb) {
  once<- sum(tb$freq<=1);
  freqCum<- cumsum(tb$freq);
  cat(sprintf(paste0('feature number:                   %d\n',
                     'freq=1 features percentage:       %.1f %%\n',
                     'freq=1 frequency percentage:      %.1f %%\n',
                     'fifty percent coverage features:  %d\n',
                     'ninety percent coverage features: %d\n'),
                nrow(tb),
                once/nrow(tb)*100,
                once/sum(tb$freq)*100,
                head(which(freqCum>=sum(tb$freq)*0.5),1),
                head(which(freqCum>=sum(tb$freq)*0.9),1)
                ));
}

#Function: Rank features by frequency within each "starter" (the feature minus its last word)
checkFreq<- function (tb) {
  tb[, starter:= sapply(feature, function(x) {
    underscore<- gregexpr('_', x)[[1]];
    n<-underscore[length(underscore)];
    substr(x, 1, n);
  }, USE.NAMES = FALSE)];
  
  tb[, n:= 1L];
  tb<- ddply(tb, .(starter), function(x) {
    if (nrow(x) > 1) {
      for (i in 2:nrow(x)) {
        x$n[i]<- x$n[i-1]+1;
      }
    }
    return(x);
  })
  tb<- as.data.table(tb)[order(-freq)]
  return(tb)
};

#Function: keep only the top k (default 5) features for each "starter" (ignore the last word)
chooseTops<- function(tb, k=5) {
  tb<- as.data.table(tb)[n<=k];
  tb[, starter:= NULL];
  tb[, n:= NULL];
  tb<- tb[order(-freq)]
  return(tb)
};
## 1-Gram
  dfm1<- dfmGenerate(allTrain, 1);
  saveRDS(dfm1, 'rData/dfm1.rds');  
  tb1<- tbGenerate(dfm1);
  rm(dfm1);
  saveRDS(tb1, 'rData/tb1.rds');

  showFreq(tb1);

  # remove features that occur only once(freq=1): 58.9% of the features(type), 0.6% of the frequency(count)
  pattern<- "-*[a-z0-9'.,]+[-a-z0-9'.,]*";
  tb1<- subset(tb1, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb1, 'rData/tb1Trim.rds');
  
  # keep only top 20 unigram to save file size
  tb1<- tb1[1:20,];
  saveRDS(tb1, 'rData/tb1TrimTops.rds');
  
  gc();
  
## 2-Gram
  dfm2<- dfmGenerate(allTrain, 2);
  saveRDS(dfm2, 'rData/dfm2.rds');
  tb2<- tbGenerate(dfm2);

  rm(dfm2);
  saveRDS(tb2, 'rData/tb2.rds');

  showFreq(tb2);

  # cut out features that occur only once(freq=1): 71.3% of the features(type), 11.6% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb2<- subset(tb2, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb2, 'rData/tb2Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb2<- checkFreq(tb2);
  saveRDS(tb2, 'rData/tb2Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb2<- chooseTops(tb2, 5);
  saveRDS(tb2, 'rData/tb2TrimTops.rds');
  rm(tb1, tb2)

  gc();

## 3-Gram
  dfm3<- dfmGenerate(allTrain, 3);
  saveRDS(dfm3, 'rData/dfm3.rds');
  tb3<- tbGenerate(dfm3);
  rm(dfm3);
  saveRDS(tb3, 'rData/tb3.rds');
  
  showFreq(tb3);

  # cut out features that occur only once(freq=1): 84.2% the features(type), 43.8% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb3<- subset(tb3, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb3, 'rData/tb3Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb3<- checkFreq(tb3);
  saveRDS(tb3, 'rData/tb3Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb3<- chooseTops(tb3, 5);
  saveRDS(tb3, 'rData/tb3TrimTops.rds');
  rm(tb3)

  gc();

## 4-Gram
  dfm4<- dfmGenerate(allTrain, 4);
  saveRDS(dfm4, 'rData/dfm4.rds');
  tb4<- tbGenerate(dfm4);
  rm(dfm4);
  saveRDS(tb4, 'rData/tb4.rds');
  
  showFreq(tb4);

  # cut out features that occur only once(freq=1): 93.6% the features(type), 78.9% of the frequency(count)
  pattern<- paste0("-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                         "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_", 
                         "-*[a-z0-9'.,]+[-a-z0-9'.,]*", "_",
                          "-*[a-z0-9'.,]+[-a-z0-9'.,]*");
  tb4<- subset(tb4, freq>1 & grepl(pattern, feature));
  rm(pattern);
  saveRDS(tb4, 'rData/tb4Trim.rds');
  
  # get the "starter" of each feature (excluding the last word)
  tb4<- checkFreq(tb4);
  saveRDS(tb4, 'rData/tb4Trim2.rds');
  
  # keep only top 5 of each unique "starter" to save file size
  tb4<- chooseTops(tb4, 5);
  saveRDS(tb4, 'rData/tb4TrimTops.rds');

  gc();
  rm(allTrain);

Clean String Function

badword<- readRDS('rData/badword.rds'); badword2<- readRDS('rData/badword2.rds');
from<- readRDS('rData/contractionsFrom.rds'); to<- readRDS('rData/contractionsTo.rds');

cleanInputStr <- function(inputStr) {
  # convert to lower case & ASCII
  inputStr<- tolower(inputStr)
  inputStr<- iconv(inputStr, "latin1", "ASCII", sub="")
  # remove profanity
  # inputStr<- gsub(paste(gsub("\\(|\\)","",badword), collapse = "|"), " ", inputStr)
  inputStr<- gsub(paste(badword2, collapse = "|"), " ", inputStr)
  # remove non alphabets/numbers/apostrophes/intra-words dashes
  inputStr<- gsub("[^a-z0-9 '-]", " ", inputStr) 
  # convert contractions
  for (i in 1:length(to)) {
    inputStr<- gsub(from[i], to[i], inputStr)
  }  
  # remove standalone numbers (e.g., the number in '2day' will be kept)
  inputStr<- gsub(" [0-9]* ", " ", inputStr)
  # remove unnecessary whitespace
  inputStr<- gsub("\\s+", " ", inputStr) 
  # remove leading and trailing white spaces
  inputStr<- gsub("(^\\s+|\\s+$)", "", inputStr)
  
  # Return the cleaned string.
  # If the resulting string is empty, return an empty string.
  if (nchar(inputStr) > 0) {
    return(inputStr); 
  } else {
    return("");
  }
}
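
A quick illustration of the cleaning step (a hedged example, assuming none of these words match the profanity patterns loaded above):

cleanInputStr("Hello!! I am going to the #store, you are coming??")
# expected result: "hello i'm going to the store you're coming"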

tb1<- readRDS('rData/tb1TrimTops.rds');
tb2<- readRDS('rData/tb2TrimTops.rds');
tb3<- readRDS('rData/tb3TrimTops.rds');
tb4<- readRDS('rData/tb4TrimTops.rds');

Prediction Function: N-Gram backoff

# Use "Trim" 4-gram to 1-gram tables from 60% training set corpus

nextWord<- function(inputStr, k=3) {
  # inputStr = a phrase (multiple words) input in the input box
  # k = number of next-word prediction displayed
  
  # Preprocess the input string
  inputStr <- cleanInputStr(inputStr);
  inputStr<- unlist(strsplit(inputStr, split=" "));
  inputStrLen<- length(inputStr);
  
  nxtWordFound<- FALSE;
  nxtWordPred<- as.character(NULL);
  
  # 1. First search for 4-Gram
  if (inputStrLen >= 3 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[(inputStrLen-2):inputStrLen], collapse="_"),"_");
    
    # Subset the 4-Gram table with the search string
    temp<- tb4[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 2. Next search for 3-Gram
  if (inputStrLen >= 2 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[(inputStrLen-1):inputStrLen], collapse="_"),"_");
    
    # Subset the 3-Gram table with the search string
    temp<- tb3[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 3. Next search for 2-Gram
  if (inputStrLen >= 1 & !nxtWordFound) {
    # Construct search string, insert one underscore "_" between words
    searchStr<- paste0("^",paste(inputStr[inputStrLen], collapse="_"),"_");
    
    # Subset the 2-Gram table with the search string
    temp<- tb2[grep(searchStr, feature)];
    
    # Match the search string
    if ( nrow(temp) > 0 ) {
      tempWord<- strsplit(temp$feature, split="_");
      tempWord<- sapply(tempWord, function(x) x[length(x)]);
      tempWord<- unique(c(nxtWordPred, tempWord));
      nxtWordPred<- tempWord[1:min(k, length(tempWord))];
      if (length(nxtWordPred)==k) {nxtWordFound <- TRUE};
    }
    temp<- NULL; tempWord<- NULL;
  }
  
  # 4. If no matching/not enough features found in 4, 3 and 2 Gram tables, return the most frequently used word(s) from the 1-Gram table
  if (inputStrLen >= 0 & !nxtWordFound) {
    tempWord<- head(tb1$feature[!(tb1$feature %in% nxtWordPred)], (k-length(nxtWordPred)));
    nxtWordPred<- unique(c(nxtWordPred, tempWord));
    temp<- NULL; tempWord<- NULL;
  }
  # nxtWordPred<- paste(nxtWordPred, collapse=', ');
  return(nxtWordPred);
}
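
A hedged usage example (the actual candidates returned depend on the trimmed N-gram tables loaded above):

nextWord("I could not live without my", k = 3)
# returns a character vector of up to k = 3 predicted next words, drawn from the
# highest-order N-gram table(s) matching the cleaned input and ordered by frequency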

Benchmark Test - Validation Set

source('benchmark.r');
predict.baseline <- nextWord
benchmark(predict.baseline, 
          # additional parameters to be passed to the prediction function can be inserted here
          sent.list = list('tweets' = tweets[c(4:10, 61:100)], 
                           'blogs' = blogs[c(4:10, 61:100)]), 
          ext.output = T)

Test Results:

Overall top-3 score:     17.90 %
Overall top-1 precision: 13.49 %
Overall top-3 precision: 21.38 %
Average runtime:         1133.63 msec
Number of predictions:   2003
Total memory used:       350.33 MB

Dataset details
  Dataset "blogs" (47 lines, 1172 words)
    Score: 16.41 %, Top-1 precision: 12.2 %, Top-3 precision: 19.76 %
  Dataset "Tweets" (47 lines, 848 words)
    Score: 19..39 %, Top-1 precision: 14.78 %, Top-3 precision: 23.00 %