# Title: Build Regression Data
# Creator: Calvin He
# Date Created: 01 July 2019
# Description:
# Outputs: 

build_regression_data <- function(reg_data_pre, opts ){
  
  reg_data_pre <- reg_data_pre %>% 
    arrange(region_name, Date) %>%
    add_dummy_mp_interactions(., opts, opts$mp_interaction_var) %>%
    add_controls(., opts) %>%   # add in controls based on options %>% 
    add_agg_controls(., opts ) %>% 
    add_ar_y(., opts) %>% 
    {if(opts$remove_gfc){ . %>% filter(Date < "2007-12-31"| Date > "2010-01-01") }else{.} } %>% 
    mutate(calculation_value = replace(calculation_value, calculation_value == 0 , 1)) %>% 
    arrange(region_name, Date) %>% 
    group_by(region_name) %>%
    mutate(obs = sum(!is.na(calculation_value))) %>% 
    filter(obs > 30) 
  
  return(reg_data_pre)
}




# Add in decile interactions --------------------------------------------------------------------------------
add_dummy_mp_interactions <- function(reg_data_pre, opts, var){
  # Create decile interaction with cash rate
  var_dummies <- dummy(reg_data_pre[[var]])
  
  # Lag of shock -------------------------------------------------------------------------------------------------
  # Support multiple shocks
  for (s in grep("mp_shock", colnames(reg_data_pre), value=T)){
    
    # Name of interaction
    if (opts$mp_shock_type=="diebold_li"){
      interaction_name <- paste0("_mp_interaction_",gsub("mp_shock_", "", s),"_")
    }else{
      interaction_name <- "_mp_interaction_"
    }
    # extract MP shock variable name
    shock_column_name <- parse_quo(s, caller_env())
    
    # for loop to implement shock_lag
    for (l in 0:opts$shock_lag){
      # extract shock
      mp_shock <- 
        reg_data_pre %>% arrange(region_name, Date) %>% group_by(region_name) %>% 
        mutate(mp_shock_lagged = dplyr::lag(!!  shock_column_name, n= l)) %>% ungroup %>%
        .$mp_shock_lagged
      
      # Linear model
      interaction <- var_dummies * mp_shock  # interact with cash rate shocks
      colnames(interaction) <- paste0("l", l,interaction_name , 1:max(reg_data_pre[[var]]))
      
      
      # bind the decile interactions to the input dataframe
      reg_data_pre <- bind_cols(reg_data_pre, data.frame(interaction))
    }
    
  }
  
  return(reg_data_pre)
}



# Title: Add controls to reg_data_pre
# Creator: Calvin He
# Date Created: 11 March 2019
# Description: calculates and joins specified controls in opts to reg_data 


add_controls <- function(reg_data_pre,  opts){
  # Core logic controls !!! ------------------------------------------------------
  if (grepl("rental|vacancy_rate|dwelling_investment_ratio" , paste(opts$controls, collapse =" " ) )  ){
    load(here::here("Data", "raw_corelogic_data.RData"))
  }
  
  # Hedonic Rent
  if ("rental_yield" %in% opts$controls){
    core_logic_data_hedonic_rent <- cl_raw_data %>%
      filter(metric_name == "hedonic_rental_yield", 
             property_type == opts$prop_type ) %>%
      rename(control_rental_yield =  calculation_value) %>%
      select(Date, r_sa3, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre,  core_logic_data_hedonic_rent, by = c("Date" ,"r_sa3"))
  }
  
  # Hedonic Rental growth
  if ("rental_growth_annual" %in% opts$controls){
    core_logic_data_hedonic_rent <- cl_raw_data %>%
      filter(metric_name == "median_rental_rate", 
             property_type == opts$prop_type ,
             Date >= opts$start_date, Date <= opts$end_date, 
             month(Date) %% opts$frequency ==0) %>%
      {if (opts$metric_deflate == TRUE) {
        deflate_series(. , "calculation_value") %>%  # deflate calculation value
          mutate(calculation_value_orig = calculation_value,
                 calculation_value = real_calculation_value) # keep orig cal value, and replace if metric name matches options
      }else {.} } %>% 
      arrange(r_sa3, Date) %>% 
      group_by(r_sa3) %>% 
      mutate(control_rental_growth_annual = 100* ( calculation_value/dplyr::lag(calculation_value, 12/opts$frequency) -1 )) %>% 
      select(Date, r_sa3, contains("control")) %>% ungroup
    
    reg_data_pre <- left_join(reg_data_pre,  core_logic_data_hedonic_rent, by = c("Date" ,"r_sa3"))
  }
  
  
  # average_vacancy_rate
  if ("vacancy_rate" %in% opts$controls){
    core_logic_data_average_vacancy_rate <- cl_raw_data %>%
      filter(metric_name == "average_vacancy_rate", 
             property_type == case_when(opts$prop_type=="U" ~ "U" ,
                                        TRUE ~ "H") ) %>%
      rename(control_vacancy_rate = calculation_value) %>%
      select(Date, r_sa3, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre,  core_logic_data_average_vacancy_rate, by = c("Date" ,"r_sa3"))
    
  }
  
  # dwelling_investment_ratio
  if("dwelling_investment_ratio" %in% opts$controls){
    core_logic_data_dwelling_investment <- cl_raw_data %>%
      filter(metric_name == "dwelling_investment_ratio", 
             property_type == case_when(opts$prop_type=="U" ~ "U" ,
                                        TRUE ~ "H") ) %>%
      rename(control_dwelling_investment_ratio = calculation_value) %>%
      select(Date, r_sa3, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre,  core_logic_data_dwelling_investment, by = c("Date" ,"r_sa3"))
  }

  
  #---------------------------------------------------------------------------------------------------------------
  # Check if local is in the controls
  if (grepl("local" , paste(opts$controls, collapse =" " ) )  ){
    local_lfs_temp <- local_unemployment_data(sa= TRUE)
  }
  
  # SA4 Unemployment 
  if("local_unemployment" %in% opts$controls){
    local_unemployment_data <-  local_lfs_temp  %>%
      mutate(control_local_unemployment = 100*unemp/lf)  %>% 
      rename(SA4_CODE_2016 = sa4_asgs11) %>% 
      select(Date, SA4_CODE_2016, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre, local_unemployment_data, by = c("Date" ,"SA4_CODE_2016"))
  }
  
  # SA4 log of labour force size
  if("local_labour_force_size" %in% opts$controls){
    local_unemployment_data <-  local_lfs_temp %>%
      group_by(month, sa4_asgs11) %>% select(-gender, -age) %>%
      summarise_all(., sum) %>% ungroup %>% zap_formats %>%
      rename(Date= month,
             SA4_CODE_2016 = sa4_asgs11) %>%
      mutate(Date = last_day_month(Date),
             control_ln_labour_force = log(lf),
             SA4_CODE_2016 = as.numeric(SA4_CODE_2016))  %>%
      select(Date, SA4_CODE_2016, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre, local_unemployment_data, by = c("Date" ,"SA4_CODE_2016"))
  }
  
  
  if("local_labour_force_size_growth" %in% opts$controls){
    local_unemployment_data <-  local_lfs_temp %>%
      rename( SA4_CODE_2016 = sa4_asgs11) %>%
      mutate( control_labour_force_annual_growth = 100* (log(lf) - log(dplyr::lag(lf, 12))))  %>%
      select(Date, SA4_CODE_2016, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre, local_unemployment_data, by = c("Date" ,"SA4_CODE_2016"))
  }
  
  
  if("local_employment_growth" %in% opts$controls){
    local_unemployment_data <-  local_lfs_temp %>%
      rename( SA4_CODE_2016 = sa4_asgs11) %>%
      mutate( control_local_employment_growth = 100* (log(emp) - log(dplyr::lag(emp, 12))))  %>%
      select(Date, SA4_CODE_2016, contains("control"))
    
    reg_data_pre <- left_join(reg_data_pre, local_unemployment_data, by = c("Date" ,"SA4_CODE_2016"))
  }
  
  
  if("local_sfd" %in% opts$controls){
    local_sfd <-  rGertrude::xlgertrude_timeseries(tibble::tribble(
      ~`Gertrude series code`,                      ~Mnemonic,
      "AUN1R105E00.0S0",              "New South Wales",
      "AUN1R205E00.0S0",                     "Victoria",
      "AUN1R305E00.0S0",                   "Queensland",
      "AUN1R405E00.0S0",              "South Australia",
      "AUN1R505E00.0S0",            "Western Australia",
      "AUN1R605E00.0S0",                     "Tasmania",
      "AUN1R705E00.0S0", "Australian Capital Territory",
      "AUN1R805E00.0S0",           "Northern Territory"
    )) %>% 
      fortify.zoo() %>% 
      rename(Date = Index) %>% 
      gather( -Date, key = "STATE_NAME_2016", value = "control_local_sfd") %>% 
      arrange(STATE_NAME_2016, Date) %>% 
      group_by(STATE_NAME_2016) %>% 
      mutate(control_local_sfd = 100* (log(control_local_sfd) - log(dplyr::lag(control_local_sfd, 4)) )) %>%  # year ended changes
      ungroup
    
    
    
    reg_data_pre <- left_join(reg_data_pre, local_sfd, by = c("Date" ,"STATE_NAME_2016"))
  }
  
 
  # Return
  return(reg_data_pre)
  
}



# Add in AR terms of  variable-------------------------------------------------------------------------------------------
add_ar_y <- function(reg_data_pre, opts){
  
  if (opts$ar_lag >0 ){
    temp <- reg_data_pre %>% arrange(region_name, Date) %>%
      group_by(region_name) %>%
      mutate(y = 100*( log(calculation_value) - dplyr::lag(log(calculation_value) , 1 ))) %>%
      ungroup  
    
    # allocate space and take in lagged ar terms
    reg_data_pre[, paste0("control_ar_", 1:opts$ar_lag)] <- NA
    reg_data_pre[, paste0("control_ar_", 1:opts$ar_lag)] <- data.frame(sapply(1:opts$ar_lag, function(x) temp %>% group_by(region_name) %>% 
                                                                                mutate(lag_y = dplyr::lag(y, x)) %>% 
                                                                                .$lag_y)) # quartery change in hedonic index
  }
  return(reg_data_pre)
}



add_agg_controls <- function(reg_data_pre,  opts){
  
  # Aggregate variables ---------------------------------------------------------------------
  if ("gdp" %in% opts$agg_controls){
    gdp <- aus_gdp() %>% 
      transmute(Date, agg_gdp = 100*(ln_aus_real_gdp - dplyr::lag(ln_aus_real_gdp, 4)))
    
    reg_data_pre <- left_join(reg_data_pre, gdp)
    
  }
  
  if("tot" %in% opts$agg_controls){
    tot <- gertrude_timeseries( "AUNF0000002.IS0", mnemonic = "agg_tot") %>% fortify.zoo() %>% 
      rename(Date = Index) %>% 
      mutate(Date = as.Date(Date), agg_tot =  100*(log(agg_tot) - log(dplyr::lag(agg_tot, 4))))
    
    reg_data_pre <- left_join(reg_data_pre, tot)
  }
  
  if("realxrate" %in% opts$agg_controls){
    realxrate <- rba_stat_table("f15", series_id = "FRERTWI", add_log = T ) %>% 
      arrange(date) %>%  
      filter(month(date) %% opts$frequency ==0) %>% 
      transmute(Date = date, agg_realxrate = 100*(ln_value - dplyr::lag(ln_value, 4))) 
    
    reg_data_pre <- left_join(reg_data_pre, realxrate)
  }
  
  if("unemp" %in% opts$agg_controls){
    unemp <- aus_unemployment() %>% rename(agg_unemp = unemploy) %>% 
      mutate(agg_unemp = agg_unemp - dplyr::lag(agg_unemp,4))
    reg_data_pre <- left_join(reg_data_pre, unemp)
    
  }
  
  if("mortgage_spr2ags" %in% opts$agg_controls){
    mortgage_vars <- mortgage_spr2ags() %>% transmute(Date, agg_mortgage_spr2ags = mortgage_spr2ags)
    reg_data_pre <- left_join(reg_data_pre,  mortgage_vars)
  }
  
  if (length(opts$agg_controls) >= 1 & opts$agg_control_interaction){
    # ------------------------------------------------------------------------------------------
    # Split aggregate variables by groups -----------------------------------------------------
    # Create decile interaction with cash rate
    var <- opts$mp_interaction_var
    var_dummies <- dummy(reg_data_pre[[var]])
    
    # Lag of agg_controls --------------------------------------------------------------------------------
    # Support multiple agg_controls
    for (s in opts$agg_controls){
      
      # Name of interaction
      interaction_name <- glue::glue("_agg_control_{s}_interaction_")
      
      # extract MP shock variable name
      control_column_name <- parse_quo( glue::glue("agg_{s}"), caller_env())
      
      # for loop to lags
      for (l in 0:opts$agg_control_lags){
        # extract shock
        agg_control <- 
          reg_data_pre %>% arrange(region_name, Date) %>% 
          group_by(region_name) %>% 
          mutate(agg_control_lagged = dplyr::lag(!! control_column_name, n= l)) %>% ungroup %>%
          .$agg_control_lagged
        
        # Linear model
        interaction <- var_dummies *  agg_control  # interact with cash rate shocks
        colnames(interaction) <- paste0("l", l, interaction_name , 1:max(reg_data_pre[[var]]))
        
        
        # bind the decile interactions to the input dataframe
        reg_data_pre <- bind_cols(reg_data_pre, data.frame(interaction))
      }
    }
  }
  # Return
  return(reg_data_pre)
}







# Add lag of mp_shock ---------------------------------------------------------------------------------------
add_shock_lag <- function(reg_data_pre, opts){
  # Remove GFC?
  if (opts$remove_gfc){
    date_index <- reg_data_pre$Date > "2007-12-31" & reg_data_pre$Date < "2010-01-01"
    reg_data_pre[ date_index ,grep("mp_shock", colnames(reg_data_pre))] <- NA # NA out mp_shock variable (when interacted it will be NA)
  }
  
  # Lag of shock -------------------------------------------------------------------------------------------------
  # extract MP shock variable name
  shock_column_name <- parse_quo(grep("mp_shock", colnames(reg_data_pre), value=T), caller_env())
  if(opts$shock_lag > 0){
    # for loop to implement shock_lag
    for (l in 1:opts$shock_lag){
      # extract shock
      mp_shock <- 
        reg_data_pre %>% arrange(region_name, Date) %>% group_by(region_name) %>% 
        mutate(mp_shock_lagged = dplyr::lag(!!  shock_column_name, n= l)) %>% ungroup %>%
        .$mp_shock_lagged
      
      # Linear model
      if (opts$asymmetry == FALSE){
        mp_shock_lag <-  data.frame(mp_shock)  # interact with cash rate shocks
        colnames( mp_shock_lag) <- paste0("l", l,"_mp_shock")
        
      } else if (opts$asymmetry ==TRUE){
        # Asymmetry
        mp_shock_lag_pos <- data.frame(mp_shock * (reg_data_pre[, gsub("~","",deparse(shock_column_name))] >= 0))
        colnames( mp_shock_lag_pos) <- paste0("l", l, "_mp_shock_pos")
        
        mp_shock_lag_neg <- data.frame( mp_shock * (reg_data_pre[, gsub("~","",deparse(shock_column_name))] < 0))
        colnames( mp_shock_lag_neg) <- paste0("l", l,"_mp_shock_neg")
        
        # combine the two
        mp_shock_lag <- bind_cols(data.frame( mp_shock_lag_pos), data.frame(mp_shock_lag_neg))
      }
      
      # bind the decile interactions to the input dataframe
      reg_data_pre <- bind_cols(reg_data_pre, data.frame(mp_shock_lag))
    }
  }
  
  return(reg_data_pre)
}

