#################################################################################################
################################ Name: Text Statistics Functions ################################
#### Creator: Joan Huang ####
#### Date: 14 May 2019 ####
#### Purpose:   
########### 1) tokenize each paragraph into words; 
########### 2) map each word to a POS tag; 
########### 3) calculate the % of each POS tag; 
########### 4) check the POS tag of the starting word in each sentence of a paragraph
#### Requirement: a dataset with clean paragraphs, such as text_data ####
#### Output: text_stats <- word_pos_prop_function(text_data), please note that: ####
#### 1) in text_data, the text to process must be in a column named "paragraph" (it can hold sentences or paragraphs), alongside a "question_index" column;


#### edited on 10/7/2019: individual words should not be passed to the POS tagger, as the tagger is designed to work on sentences; therefore,
#### this statement needed to change:     word_pos <- tagPOS(corpus_word$word)
#### it was changed to use sentence-based POS tagging


#################################################################################################



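## load the needed libraries - please note that you may need to install these packages if they are not installed yet
## (word_pos_prop_function also uses %>%, unnest_tokens, group_by, count, spread and plyr::rbind.fill, which are not loaded here -
##  e.g. from dplyr, tidyr, tidytext and plyr - so those packages must be attached/installed by the calling script)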
library(NLP)
library(openNLP)


##the pos tagger mapping function

## Tag every word of `x` with its part of speech.
##
## Arguments:
##   x    character vector (e.g. the sentences of one paragraph). The elements
##        are joined with newlines and tagged as a single span of text.
##   ...  currently unused; kept so existing calls keep working.
##
## Returns a list with two elements:
##   POStagged  one string of "word/TAG" pairs separated by spaces
##   POStags    character vector with one POS tag per token
##
## Input that contains no text at all (zero length, only NA, or only
## whitespace) returns list(POStagged = "", POStags = character(0)) instead
## of being handed to the annotators, which have nothing to tag in that case.
tagPOS <- function(x, ...) {
  text <- as.character(x)
  # Drop missing values: pasting NA would inject the literal word "NA".
  text <- text[!is.na(text)]

  # Nothing to tokenize -> return an empty result with the usual shape.
  if (!any(grepl("[^[:space:]]", text))) {
    return(list(POStagged = "", POStags = character(0)))
  }

  s <- as.String(text)

  # Treat the whole string as one "sentence" span, then find the word tokens.
  whole_span <- Annotation(1L, "sentence", 1L, nchar(s))
  word_token_annotator <- Maxent_Word_Token_Annotator()
  tokens <- NLP::annotate(s, word_token_annotator, whole_span)

  # Maximum Entropy model for part-of-speech tagging.
  tagged <- NLP::annotate(s, Maxent_POS_Tag_Annotator(), tokens)

  # Keep only the word annotations (drops the sentence annotation).
  words <- tagged[tagged$type == "word"]
  POStags <- unlist(lapply(words$features, `[[`, "POS"))
  POStagged <- paste(sprintf("%s/%s", s[words], POStags), collapse = " ")

  list(POStagged = POStagged, POStags = POStags)
}


## map each word to a POS tag (the output is the number of each POS tag for each paragraph)
## edited on 10/7/2019: individual words should not be passed to the POS tagger, as the tagger is designed to work on sentences

  
  
## Count the POS tags used in each row (paragraph) of `text_input`.
##
## Arguments:
##   text_input  data frame with (at least) the columns `paragraph` (the text
##               to tag) and `question_index` (the row identifier).
##
## Returns a data frame with one row per input row: `index` (row number),
## `question_index`, and one column per POS tag holding how many tokens of
## that row received the tag (0 when the tag does not occur). Columns whose
## name contains no letter or digit (punctuation tags) are removed.
## `index` and `question_index` are returned as character, as before.
## Zero-row input returns an empty data frame.
##
## Relies on %>%, unnest_tokens(), group_by(), count() and spread() being
## attached by the caller, on plyr being installed, and on tagPOS().
##
## NOTE(review): unnest_tokens() is called with its defaults; if those
## lowercase the text, tagging of proper nouns may be affected - confirm
## whether to_lower = FALSE is wanted.
word_pos_prop_function <- function(text_input) {

  n_rows <- NROW(text_input)
  if (n_rows == 0L) {
    # 1:nrow() would iterate over c(1, 0) here; there is nothing to tag.
    return(data.frame())
  }

  # One result per input row, bound together once after the loop instead of
  # growing a data frame on every iteration.
  per_row <- vector("list", n_rows)

  for (i in seq_len(n_rows)) {

    # Split one paragraph at a time into sentences (the POS tagger is
    # sentence based; row-by-row processing avoids the memory limit error).
    corpus_sentence <- text_input[i, ] %>%
      unnest_tokens(sentence, paragraph, token = "sentences") %>%
      group_by(question_index) %>%
      count(sentence)

    # count() collapses repeated sentences into one row with n > 1; expand
    # them again so the tags of a repeated sentence are counted every time.
    sentences <- rep(corpus_sentence$sentence, corpus_sentence$n)
    # Missing or blank sentences carry no tokens.
    sentences <- sentences[!is.na(sentences) &
                             grepl("[^[:space:]]", sentences)]

    tags <- if (length(sentences) > 0L) tagPOS(sentences)$POStags else NULL

    row_index <- as.character(i)
    # as.character() keeps the label of a factor question_index (cbind()
    # would have stored its integer code).
    row_question <- as.character(text_input$question_index[i])

    if (length(tags) == 0L) {
      # No tokens: keep the row; its tag columns are filled with 0 below.
      per_row[[i]] <- data.frame(index = row_index,
                                 question_index = row_question,
                                 stringsAsFactors = FALSE)
      next
    }

    word_pos_index <- data.frame(index = row_index,
                                 question_index = row_question,
                                 word_pos = tags,
                                 stringsAsFactors = FALSE)

    ## count each POS tag and spread the counts into one column per tag
    per_row[[i]] <- word_pos_index %>%
      group_by(index, question_index) %>%
      count(word_pos) %>%
      spread(word_pos, n) %>%
      as.data.frame()
  }

  # Bind all rows; tags missing from a row come back as NA ...
  word_pos_sum_base <- plyr::rbind.fill(per_row)
  # ... and are replaced with 0s.
  word_pos_sum_base[is.na(word_pos_sum_base)] <- 0

  ## remove the POS tag columns for punctuation
  keep_cols <- grepl('[[:alnum:]]', names(word_pos_sum_base))
  word_pos_sum_base[, keep_cols, drop = FALSE]
}





# word_pos$POStagged
# 
# 
# 
# corpus_word <- text_input %>% mutate(text = tolower(text)) %>% 
#   mutate(text = removeNumbers(text)) %>%
#   # mutate(text=removePunctuation(text)) %>% #remove punctuations
#   unnest_tokens(word, text, token="ngrams", n = 1) %>%
#   group_by(question_index) %>% count(word) 
# 
# word_pos <- tagPOS(corpus_word$word[1:10])
# 
# word_pos_index <- cbind(question_index=text_input$question_index, word_pos = word_pos$POStags) %>%
#   as.data.frame()
# ##calculate the percentage of each pos tag
# word_pos_sum_temp <- word_pos_index %>% group_by(index, question_index) %>% count(word_pos) %>% spread(word_pos, n) %>%
#   as.data.frame()
# #insert the temp to the base table (which was set up as NULL before the loop)
# word_pos_sum_base <- plyr::rbind.fill(word_pos_sum_base, word_pos_sum_temp)
# #replace NA with 0s
# word_pos_sum_base[is.na(word_pos_sum_base)] <- 0
# 
# 
# 
# 
# 
# 
# 
# 










# 
# 
# tagged_first_word <-  tagPOS(first3words$start_word)
# tagged_second_word <-  tagPOS(first3words$second_word)
# tagged_third_word <-  tagPOS(first3words$third_word)
# 
# df_word_tag <- cbind(first3words, first_word_tag=tagged_first_word$POStags,second_word_tag=tagged_second_word$POStags,third_word_tag=tagged_third_word$POStags)
# head(df_word_tag)
# 
# #POS tagger for all single words
# 
# corpus_1gram_nopunc <- corpus_1gram[!str_detect(corpus_1gram$word, '\''),]
# 
# # corpus_1gram_nopunc$word2 <- removePunctuation(corpus_1gram_nopunc$word)
# # 
# # corpus_1gram_nopunc %>% filter(word2!=word)
# 
# 
# tagged_all_word <- tagPOS(corpus_1gram_nopunc$word)
# corpus_1gram_pos <- cbind(corpus_1gram_nopunc, word_pos=tagged_all_word$POStags) %>% as.data.frame()
# 
# 
# corpus_1gram_punc <- corpus_1gram[str_detect(corpus_1gram$word, '\''),] %>% as.data.frame()
# corpus_1gram_punc$word_pos <- "NN_POS"
# 
# corpus_1gram_pos <- rbind(corpus_1gram_pos, corpus_1gram_punc) %>% as.data.frame() %>% arrange(question_index)
# 
# corpus_1gram_pos %>% filter(question_index=="g1_q7")
# 
# ## calculate the pct of each pos in the paragraph
# 
# corpus_1gram_pos_sum <- corpus_1gram_pos %>% select(-word) %>% group_by(question_index, word_pos) %>% summarise(n=sum(n)) %>% spread(word_pos, n)
# 
# corpus_1gram_pos_sum <- corpus_1gram_pos_sum[,-2:-3] #remove the first 2 columns because they are punctuations tag
# #replace NAs with 0
# corpus_1gram_pos_sum[is.na(corpus_1gram_pos_sum)] <- 0
# 
# corpus_1gram_pos_sum_prop <-  round(corpus_1gram_pos_sum[-1]/rowSums(corpus_1gram_pos_sum[-1]),4)
# corpus_1gram_pos_sum_prop$question_index <- corpus_1gram_pos_sum$question_index
# head(corpus_1gram_pos_sum_prop)
