# Title: Build all grouping variables which are used for model selection
# Creator: Calvin He


# Assemble the table of SA3-level grouping variables used for model selection.
#
# Takes no arguments. Starting from abs_sa_mapping(), the function left-joins
# (in this order) the static density/census tables, the ATO table, the ABS
# "data by region" extracts (income 2016, family & community 2016, welfare
# payments 2018, age pension 2018), SA4-level mean household net worth, adds
# randomly generated placeholder columns, derives a handful of ratios, and
# finally left-joins the historical census tables.
#
# Returns the combined table. Because the placeholder columns come from
# runif(), their values differ between calls unless the caller fixes the RNG
# seed beforehand.
build_grouping_variables <- function() {

  # Re-key an ABS "data by region" extract so it joins on SA3_CODE_2016:
  # drop region_name (and year when drop_year is TRUE), discard rows whose
  # region_code is NA, then rename region_code to SA3_CODE_2016.
  key_by_sa3 <- function(region_data, drop_year = FALSE) {
    if (drop_year) {
      region_data <- select(region_data, -region_name, -year)
    } else {
      region_data <- select(region_data, -region_name)
    }
    region_data %>%
      filter(!is.na(region_code)) %>%
      rename(SA3_CODE_2016 = region_code)
  }

  # Step 5: Static variables ---------------------------------------------------
  # The first element is the base table; every later element is left-joined
  # onto it by SA3_CODE_2016, in list order.
  static_tables <- list(
    abs_sa_mapping(),
    pop_density(),
    dwelling_density(),
    census_median_age_sa3() %>%
      select(-description) %>%
      rename(median_age = value),
    census_population_sa3() %>%
      select(-description) %>%
      rename(population = value),
    census_owner_concentration_sa3(),
    census_industry_share_sa3(),
    census_cyclical_industry_sa3(),
    census_apartment_dwelling_ratio_sa3(year = 2016),
    census_population_by_age_sa3(min_age = 15, max_age = 64) %>%
      select(-description) %>%
      rename(working_age_pop = value),
    census_population_growth(),
    census_income_growth(),
    census_housing_servicing_ratio(year = 2016)
  )
  combined <- plyr::join_all(static_tables, by = "SA3_CODE_2016", type = "left")

  # ATO variables --------------------------------------------------------------
  combined <- left_join(combined, ato_table_6_lazy(), by = "SA3_CODE_2016")

  # ABS data by region ---------------------------------------------------------
  # NOTE(review): keep_region_codes is read from a `region_code` column of the
  # table built so far. No step visible in this function creates that column
  # (the joins key on SA3_CODE_2016), so this may evaluate to NULL -- confirm
  # against the upstream loaders and the filter_data_by_region_*() helpers.
  # NOTE(review): the two 2016 extracts keep their `year` column; check whether
  # that yields suffixed duplicate columns after the joins.

  # 2016 income variables
  income_2016 <- filter_data_by_region_income(
    keep_years = 2016,
    keep_region_codes = unique(combined$region_code)
  )
  combined <- left_join(combined, key_by_sa3(income_2016),
                        by = "SA3_CODE_2016")

  # 2016 family and community variables
  fam_com_2016 <- filter_data_by_region_fam_com(
    keep_years = 2016,
    keep_region_codes = unique(combined$region_code)
  )
  combined <- left_join(combined, key_by_sa3(fam_com_2016),
                        by = "SA3_CODE_2016")

  # Welfare payments, 2018 only
  welfare_2018 <- filter_data_by_region_income(
    keep_years = 2018,
    keep_vars = c("Newstart Allowance", "Parenting Payment - Single", "Parenting Payment - Partnered"),
    keep_region_codes = unique(combined$region_code)
  )
  combined <- left_join(combined, key_by_sa3(welfare_2018, drop_year = TRUE),
                        by = "SA3_CODE_2016")

  # Age pension, 2018 only
  age_pension_2018 <- filter_data_by_region_income(
    keep_years = 2018,
    keep_vars = c("Age Pension - Centrelink"),
    keep_region_codes = unique(combined$region_code)
  )
  combined <- left_join(combined, key_by_sa3(age_pension_2018, drop_year = TRUE),
                        by = "SA3_CODE_2016")

  # Mean household net worth, joined at SA4 level ------------------------------
  # Only region codes in [100, 1000] are kept before joining on SA4_CODE_2016.
  sa4_net_worth <- filter_data_by_region_economy_asgs() %>%
    select(-region_name, -year) %>%
    rename(
      SA4_CODE_2016 = region_code,
      hh_net_worth = meanhouseholdnetworth_meanhouseholdnetworth
    ) %>%
    filter(SA4_CODE_2016 >= 100, SA4_CODE_2016 <= 1000)
  combined <- left_join(combined, sa4_net_worth, by = c("SA4_CODE_2016"))

  # Placeholder and derived columns --------------------------------------------
  # The HILDA (hand-to-mouth / net worth) and zoning columns are filled with
  # uniform random draws, one per row. The draw order below is kept fixed so
  # the columns are reproducible for a given RNG state.
  combined <- combined %>%
    mutate(
      htm = runif(nrow(.)),
      whtm = runif(nrow(.)),
      phtm = runif(nrow(.)),
      nworth = runif(nrow(.), min = 0, max = 5e7),
      liqnworth = runif(nrow(.), min = 0, max = 5e7),
      illiqnworth = runif(nrow(.), min = 0, max = 5e7)
    ) %>%
    mutate(
      prop_zoning = runif(nrow(.), min = 0, max = 100),
      prop_physical = runif(nrow(.), min = 0, max = 100),
      prop_land = runif(nrow(.), min = 0, max = 100),
      prop_structure = runif(nrow(.), min = 0, max = 100)
    ) %>%
    # Extra ratios. age_pension_centrelink is overwritten in place with its
    # value divided by population; htm_income_working_age_pop is the plain sum
    # of the four income-share columns.
    mutate(
      investor_density_working_age_pop =
        investment_income_earners / working_age_pop,
      investor_density_pop = investment_income_earners / population,
      age_pension_centrelink = age_pension_centrelink / population,
      investment_to_total_income =
        median_investment_income / median_total_income_excl_government_pensions,
      govt_benefits_working_age_pop =
        (newstart_allowance + parenting_payment_single +
           parenting_payment_partnered) / working_age_pop,
      htm_income_working_age_pop =
        (share_earning_1_499_per_week + share_earning_nil_income +
           share_with_a_negative_income + share_earning_500_999_per_week)
    )

  # Historical 1991 census data ------------------------------------------------
  hist_census <- historical_census_import_clean()

  # Income and labour-force tables are combined on region_code first, then
  # re-keyed to SA3_CODE_2016 to match the main table.
  hist_income_lfs <- historical_census_income(hist_census) %>%
    left_join(historical_census_lfs(hist_census), by = "region_code") %>%
    rename(SA3_CODE_2016 = region_code)

  plyr::join_all(
    list(
      combined,
      hist_income_lfs,
      historical_census_dwelling_density(hist_census),
      historical_census_population_density(hist_census)
    ),
    by = "SA3_CODE_2016", type = "left"
  )
}


