
########################################
###### Sentence related features #######
########################################




## Compute part-of-speech (POS) tag counts and percentages for every sentence.
##
## Args:
##   sentence_text: data frame with one row per sentence and a `sentence`
##     column. Each row is handed to word_pos_prop_function() (defined
##     elsewhere), which reads the text from a `paragraph` column.
##
## Returns:
##   A data frame with the columns produced by word_pos_prop_function()
##   (tags missing for a sentence are filled with 0), followed by one
##   `prop_<column>` column for every column after the first two. Each
##   `prop_` value is that column's share of the row total, in percent,
##   rounded to two decimals. An input without rows yields an empty data
##   frame.
sentence_pos_function <- function(sentence_text) {

  # Nothing to tag: return early instead of looping over the indices 1 and 0,
  # which is what `1:nrow(x)` produces for an empty input.
  if (is.null(sentence_text) || nrow(sentence_text) == 0) {
    return(data.frame())
  }

  # word_pos_prop_function() expects the text in a `paragraph` column, so the
  # sentence is copied there once, before iterating.
  sentence_text$paragraph <- sentence_text$sentence

  # Tag each sentence separately, then bind all results in one call rather
  # than growing the data frame inside the loop.
  per_sentence <- lapply(seq_len(nrow(sentence_text)), function(i) {
    word_pos_prop_function(sentence_text[i, , drop = FALSE])
  })
  sentence_text_pos <- plyr::rbind.fill(per_sentence)

  # A tag that never occurs in a sentence comes back as NA after the fill.
  sentence_text_pos[is.na(sentence_text_pos)] <- 0

  # The first two columns are skipped when computing shares; presumably they
  # are identifier columns (TODO confirm against word_pos_prop_function()).
  pos_counts <- sentence_text_pos[-1:-2]
  sentence_text_pos_prop <- round(pos_counts / rowSums(pos_counts), 4) * 100
  colnames(sentence_text_pos_prop) <-
    paste0("prop_", colnames(sentence_text_pos_prop))

  cbind(sentence_text_pos, sentence_text_pos_prop)
}


## Extract the first three words of every sentence and tag each with its POS.
##
## Args:
##   sentence_text: data frame with `question_index` and `sentence` columns.
##
## Returns:
##   A data frame with `question_index`, the punctuation-stripped
##   `start_word`, `second_word` and `third_word`, and the matching
##   `*_pos` columns holding the `POStags` element returned by tagPOS().
##   cbind() builds a matrix first, so every column shares a single type.
first3words_pos_function <- function(sentence_text) {

  sentences <- sentence_text$sentence

  leading_words <- as.data.frame(cbind(
    question_index = sentence_text$question_index,
    start_word = word(sentences, 1),
    second_word = word(sentences, 2),
    third_word = word(sentences, 3)
  ))

  word_cols <- c("start_word", "second_word", "third_word")

  # Strip punctuation from all three words before any tagging happens.
  for (col in word_cols) {
    leading_words[[col]] <-
      removePunctuation(as.character(leading_words[[col]]))
  }

  # Tag the cleaned words; the tag columns are appended in word order.
  for (col in word_cols) {
    leading_words[[paste0(col, "_pos")]] <-
      tagPOS(leading_words[[col]])$POStags
  }

  leading_words
}



## Extract the first word of every sentence and tag it with its POS.
##
## Args:
##   sentence_text: data frame with `question_index`, `rank` and `sentence`
##     columns.
##
## Returns:
##   A data frame with `question_index`, `rank`, the punctuation-stripped
##   `start_word`, and `start_word_pos` holding the `POStags` element
##   returned by tagPOS(). cbind() builds a matrix first, so every column
##   shares a single type.
firstwords_pos_function <- function(sentence_text) {

  opening_word <- word(sentence_text$sentence, 1)

  firstwords <- as.data.frame(cbind(
    question_index = sentence_text$question_index,
    rank = sentence_text$rank,
    start_word = opening_word
  ))

  # Clean the word first so the tagger never sees punctuation.
  firstwords[["start_word"]] <-
    removePunctuation(as.character(firstwords[["start_word"]]))
  firstwords[["start_word_pos"]] <-
    tagPOS(firstwords[["start_word"]])$POStags

  firstwords
}



## Count clue-word hits per question, split by sentence position.
##
## Args:
##   text_data: data frame with one row per sentence and the columns
##     `question_index`, `rank` (position of the sentence within its
##     paragraph, starting at 1) and `sentence`.
##
## Reads the data frame `clue_words` (columns `words` and `category`) from
## the calling environment; it is not passed as an argument. The clue words
## "further" and "as well" are ignored.
##
## Returns:
##   One row per question that has at least one clue-word hit, with the
##   columns `sent1st_clue_<category>`, `sentlast_clue_<category>` and
##   `sentmiddle_clue_<category>` holding hit counts for the first, last and
##   in-between sentences (0 when there is no hit). A paragraph with a single
##   sentence counts that sentence as both first and last. Rows are ordered
##   by `question_index`.
clue_words_feature_function <- function(text_data) {

  # Count hits per question and clue category, one column per category,
  # named `<prefix>_<category>`.
  count_by_category <- function(clue_rows, prefix) {
    if (nrow(clue_rows) == 0) {
      return(dplyr::tibble(question_index = clue_rows$question_index))
    }
    wide <- clue_rows %>%
      group_by(question_index) %>%
      dplyr::count(clue_cat) %>%
      ungroup() %>%
      spread(clue_cat, n)
    if (ncol(wide) > 1) {
      colnames(wide)[-1] <- paste(prefix, colnames(wide)[-1], sep = "_")
    }
    wide
  }

  if (!exists("clue_words")) {
    stop("`clue_words` (columns `words`, `category`) must be defined ",
         "before calling clue_words_feature_function()", call. = FALSE)
  }

  no_features <- dplyr::tibble(question_index = text_data$question_index[0])
  if (nrow(text_data) == 0) {
    return(no_features)
  }

  # Drop the excluded clue words up front instead of filtering matches later.
  active_clues <- clue_words %>%
    filter(!is.na(words), !words %in% c("further", "as well"))

  # Work with a numeric rank so that multi-digit ranks stored as text or
  # factor still compare by value.
  text_data$clue_rank_num <- as.numeric(as.character(text_data$rank))

  # The last sentence of a paragraph must come from ALL of its sentences, not
  # only from those containing a clue word; otherwise a matched middle
  # sentence is mistaken for the last one.
  last_rank_by_question <- tapply(
    text_data$clue_rank_num, text_data$question_index, max
  )
  text_data$clue_last_rank <- as.numeric(
    last_rank_by_question[as.character(text_data$question_index)]
  )

  # Collect, for every clue word, the sentences that contain it as a whole
  # word; a sentence appears once per clue word it contains.
  matches <- lapply(seq_len(nrow(active_clues)), function(i) {
    whole_word <- paste0("\\<", active_clues$words[i], "\\>")
    hits <- text_data[grepl(whole_word, text_data$sentence), , drop = FALSE]
    if (nrow(hits) == 0) {
      return(NULL)
    }
    hits$clue <- active_clues$words[i]
    hits$clue_cat <- active_clues$category[i]
    hits
  })
  matches <- Filter(Negate(is.null), matches)
  if (length(matches) == 0) {
    return(no_features)
  }
  clue_hits <- plyr::rbind.fill(matches)

  sentence_rank <- clue_hits$clue_rank_num
  last_rank <- clue_hits$clue_last_rank

  # which() drops rows whose rank could not be determined.
  first_rows <- clue_hits[which(sentence_rank == 1), , drop = FALSE]
  last_rows <- clue_hits[which(sentence_rank == last_rank), , drop = FALSE]
  middle_rows <- clue_hits[
    which(sentence_rank > 1 & sentence_rank < last_rank), , drop = FALSE
  ]

  # Full joins keep questions whose first sentence has no clue word but whose
  # later sentences do.
  sentence_clue <- count_by_category(first_rows, "sent1st_clue") %>%
    full_join(count_by_category(last_rows, "sentlast_clue"),
              by = "question_index") %>%
    full_join(count_by_category(middle_rows, "sentmiddle_clue"),
              by = "question_index") %>%
    arrange(question_index)

  # A missing position/category combination means zero hits.
  sentence_clue[is.na(sentence_clue)] <- 0

  sentence_clue
}







#combine the sentence pos and clue features together





