### Note: this code extracts the parse features for each sentence in a paragraph.


# Set the JVM heap size. This option is read when the Java VM is started, so it
# must be set before the Java-backed packages below are loaded.
# Changing this heap size may cause the script to crash.
options(java.parameters = "-Xmx1024M")
library(NLP)
library(openNLP)
library(dplyr)
library(textclean)
library(stringr)

# Install the English openNLP models only when they are not already available,
# so that re-running the script does not trigger a download and source build
# every time (and does not fail when offline with the models already present).
if (!requireNamespace("openNLPmodels.en", quietly = TRUE)) {
  install.packages("openNLPmodels.en", repos ="http://datacube.wu.ac.at/", type = "source")
}

## TODO: check the code here to make sure that each sentence is separated properly.

# Normalise sentence endings in one pass: first handle incomplete end marks,
# then append "." to any sentence that does not end in "?", "." or "!".
text_sentence$sentence <- text_sentence$sentence %>%
  replace_incomplete() %>%
  add_missing_endmark(replacement = ".", endmarks = c("?", ".", "!"))

nrow(text_sentence)
prase_sum_p2 <- NULL

# Range of rows of text_sentence to parse (inclusive).
start <- 1
end <- nrow(text_sentence)

# Guard against an empty range: with zero rows `start:end` would be `1:0`,
# which selects an NA sentence and ends up parsing the literal text "NA".
if (length(end) != 1L || end < start) {
  stop("text_sentence has no rows to parse in the range start:end.",
       call. = FALSE)
}

# Collapse the selected sentences into a single String object for annotation.
text_data <- text_sentence$sentence[start:end]
s <- as.String(text_data)

## Sentence and word token annotations are needed before parsing.
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))

## Compute the parse annotations only.
# NOTE(review): sentence boundaries are re-detected by the annotator, so the
# number of parsed sentences is not guaranteed to equal length(text_data).
parse_annotator <- Parse_Annotator()
p <- parse_annotator(s, a2)

## Extract the formatted parse trees, one string per parsed sentence.
# vapply() enforces exactly one character string per annotation, so a missing
# or malformed "parse" feature fails here instead of silently changing the
# type of the result.
ptexts <- vapply(
  p$features,
  function(feature) feature[["parse"]],
  character(1)
)
# Split on newlines and flatten back into a plain character vector.
ptexts <- unlist(strsplit(ptexts, "\n"))

library(tidyverse)
# Reference list of tags to count; only its `tagset` column is used below.
tree_tagset <- read.csv("./data_input/peen_treebank_syntactic_tagset.csv")
tree_tagset$tagset <- as.character(tree_tagset$tagset)

# Pull every token that is not a bracket or a space out of each parse tree.
# This yields both the tree labels and the leaf words of each sentence.
ms <- gregexpr("[^() ]+", ptexts)
words <- regmatches(ptexts, ms)

# Long format: one row per token, where `name` is the index of the parse tree
# in `ptexts`. The list column is named explicitly because calling unnest()
# with no column relies on a deprecated interface.
words_data <- enframe(words) %>% unnest(value)
words_data$value <- as.character(words_data$value)

# Keep only tokens that appear in the tagset. A leaf word that is spelled
# exactly like a tag is counted too, and parse trees with no matching token
# drop out of the result entirely.
clause_detect <- dplyr::inner_join(
  words_data,
  tree_tagset,
  by = c("value" = "tagset")
)

# Count each tag per parse tree, then reshape to one row per tree and one
# column per tag.
tagset_count <- clause_detect %>%
  dplyr::group_by(name, value) %>%
  dplyr::summarise(tagset_count = dplyr::n())
tagset_count <- tidyr::spread(tagset_count, value, tagset_count)

# A tag that never occurs in a given tree is NA after reshaping; treat as 0.
tagset_count[is.na(tagset_count)] <- 0
  
  
# Attach question_index and rank from text_sentence to each row of the counts.
#
# `tagset_count$name` is the index of the parse tree in `ptexts`. Parse trees
# without any matching tag are absent from `tagset_count`, so the metadata has
# to be looked up by that index rather than attached by row position.
# The lookup is only valid when the parser produced exactly one tree per input
# row, so fail loudly when that is not the case.
if (length(ptexts) != length(text_data)) {
  stop(
    "Number of parsed sentences (", length(ptexts),
    ") does not match the number of input rows (", length(text_data),
    "); cannot align question_index and rank.",
    call. = FALSE
  )
}
source_rows <- (start:end)[tagset_count$name]
tagset_count$question_index <- text_sentence$question_index[source_rows]
tagset_count$rank <- text_sentence$rank[source_rows]

# Drop the grouping and the helper index column.
tagset_count1 <- tagset_count %>% ungroup() %>% select(-name)

library(plyr)
# Final feature table: a copy of tagset_count1 with any remaining NA set to 0.
parse_feature <- replace(tagset_count1, is.na(tagset_count1), 0)
  

  
  
  
  