# Title: ATO data collection
# Creator: Calvin He
# Date Created: 30 July 2019

#' Load an ABS ASGS correspondence table.
#'
#' Downloads the ASGS 2016 grid-correspondence archive from data.gov.au (only
#' when the zip is not already on disk), extracts the workbook for the
#' requested pair of region types (only when it has not been extracted yet)
#' and reads its "Table 3" sheet.
#'
#' @param region_type_one Name of the first region type, as it appears in the
#'   workbook file name (e.g. "POSTCODE").
#' @param region_type_one_year Year of the first region type (e.g. "2017").
#' @param region_type_two Name of the second region type (e.g. "SA3").
#' @param region_type_two_year Year of the second region type (e.g. "2016").
#' @return The result of `readxl::read_excel()` on sheet "Table 3" of
#'   `CG_<one>_<one_year>_<two>_<two_year>.xlsx`, skipping the first 5 rows.
asgs_mapping <- function(region_type_one = "POSTCODE",
                         region_type_one_year = "2017",
                         region_type_two = "SA3",
                         region_type_two_year = "2016") {

  # All files live in one directory; create it so download/unzip cannot fail
  # merely because the folder is missing.
  data_dir <- here::here("data", "ato-data")
  if (!dir.exists(data_dir)) {
    dir.create(data_dir, recursive = TRUE)
  }

  # Zip filename
  zip_file <- here::here("data", "ato-data", "asgs_correspondence.zip")

  # Download if it doesn't exist
  if (!file.exists(zip_file)) {
    download.file(
      "https://data.gov.au/dataset/23fe168c-09a7-42d2-a2f9-fd08fbd0a4ce/resource/951e18c7-f187-4c86-a73f-fcabcd19af16/download/asgs2016_2016gridcorrespondences.zip",
      destfile = zip_file,
      mode = "wb"
    )
  }

  # Find specific file
  filename <- glue::glue(
    "CG_{region_type_one}_{region_type_one_year}_{region_type_two}_{region_type_two_year}.xlsx"
  )
  xlsx_file <- here::here("data", "ato-data", filename)

  # Extract into the SAME directory that is checked and read from. The
  # directory name must match exactly ("data", not "Data"), otherwise the
  # workbook is never found on case-sensitive filesystems.
  if (!file.exists(xlsx_file)) {
    unzip(zip_file, files = filename, exdir = data_dir)
  }

  if (!file.exists(xlsx_file)) {
    stop(
      "Correspondence file '", filename, "' was not found in ", zip_file,
      call. = FALSE
    )
  }

  # extract table 3
  readxl::read_excel(xlsx_file, sheet = "Table 3", skip = 5)
}


#' Import ATO Taxation Statistics individuals Table 6.
#'
#' Downloads the Table 6 workbook from data.gov.au (only when it is not
#' already on disk, matching how `asgs_mapping()` caches its archive) and
#' reads every sheet, skipping the first 2 rows of each.
#'
#' @return A list with one element per sheet, each the result of
#'   `readxl::read_excel()`. Element names are the sheet names lower-cased,
#'   with "&" removed and every space replaced by "_".
ato_table_6 <- function() {

  file_name <- here::here("data", "ato-data", "ato_table_6.xlsx")

  # download.file() cannot create the destination directory itself.
  if (!dir.exists(dirname(file_name))) {
    dir.create(dirname(file_name), recursive = TRUE)
  }

  # Download if it doesn't exist
  if (!file.exists(file_name)) {
    download.file(
      "https://data.gov.au/data/dataset/540e3eac-f2df-48d1-9bc0-fbe8dfec641f/resource/41d19b90-1984-46be-b82c-0a51efc3983d/download/ts17individual06taxablestatusstatepostcode.xlsx",
      mode = "wb",
      destfile = file_name
    )
  }

  # Read the sheet names once and reuse them for both reading and naming.
  sheet_names <- readxl::excel_sheets(file_name)

  # read in files
  output <- map(
    sheet_names,
    ~ readxl::read_excel(file_name, sheet = .x, skip = 2)
  )

  names(output) <- sheet_names %>%
    str_to_lower() %>%
    str_replace_all("&", "") %>%
    str_replace_all(" ", "_")

  output
}


#' Combine ATO Table 6B with the postcode-to-SA3 mapping.
#'
#' Takes the `individuals_table_6b` sheet from `ato_table_6()`, left-joins the
#' SA3 code for each postcode from `asgs_mapping()`, and rewrites the column
#' names into lower-case, underscore-separated names prefixed with "ato_"
#' (except `postcode`, `state` and `SA3_CODE_2016`).
#'
#' @return The joined data with cleaned column names.
ato_table_6_sa3 <- function() {

  # import both data tables
  # distinct() keeps the first row per postcode, so each postcode maps to a
  # single SA3 code and the join below cannot duplicate ATO rows.
  postcode_sa3 <- asgs_mapping() %>%
    select(POSTCODE = POSTCODE_2017...1, SA3_CODE_2016) %>%
    distinct(POSTCODE, .keep_all = TRUE)
  ato_data <- ato_table_6()$individuals_table_6b

  # combine data
  ato_data_combine <- left_join(
    ato_data,
    postcode_sa3,
    by = c("Postcode" = "POSTCODE")
  )

  # modify column names to be sensible
  colnames(ato_data_combine) <- colnames(ato_data_combine) %>%
    # separators and line breaks become underscores
    str_replace_all("/|\r|\n|-", "_") %>%
    str_replace_all("&", "_") %>%
    str_to_lower() %>%
    str_replace_all(" ", "_") %>%
    # "no." is the unit suffix for counts. The dot is escaped so that only the
    # literal "no." is rewritten; an unescaped dot also matches "non", "not",
    # "no_" etc. and corrupts unrelated column names.
    str_replace_all("no\\.", "count") %>%
    str_replace_all("\\$", "dollars") %>%
    str_replace_all("state__territory1", "state") %>%
    paste0("ato_", .) %>%
    # collapse the runs of underscores produced by the steps above
    str_replace_all("___|__", "_") %>%
    # identifier columns keep their un-prefixed names
    str_replace_all("ato_sa3_code_2016", "SA3_CODE_2016") %>%
    str_replace_all("ato_postcode", "postcode") %>%
    str_replace_all("ato_state", "state")

  ato_data_combine
}


#' Aggregate the ATO table to one geography.
#'
#' Drops the identifier columns that are not the requested grouping column,
#' then sums every remaining column within each group (ignoring `NA`s).
#'
#' @param df Data with the identifier columns `state`, `postcode` and
#'   `SA3_CODE_2016`; all other columns must be summable.
#' @param group Unquoted name of the identifier column to group by
#'   (e.g. `SA3_CODE_2016`).
#' @return An ungrouped data frame with one row per value of `group`.
ato_table_6_sa3_aggregate <- function(df, group) {

  group <- enquo(group)

  group_vars <- c("state", "postcode", "SA3_CODE_2016")
  remove_groups <- setdiff(group_vars, as_name(group))

  df %>%
    # all_of() makes it explicit that `remove_groups` is an external character
    # vector of column names, not a column of `df` called "remove_groups".
    select(-all_of(remove_groups)) %>%
    group_by(!!group) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    ungroup()
}



#' Express count columns as a percentage of the number of individuals.
#'
#' For every column whose name ends in "count", other than
#' `ato_number_of_individuals_count` itself, adds a new column with the suffix
#' `_atoshare` holding `100 * column / ato_number_of_individuals_count`.
#'
#' @param df Data containing `ato_number_of_individuals_count`.
#' @return `df` with the additional `*_atoshare` columns.
ato_table_6_sa3_shares <- function(df) {
  denominator_name <- "ato_number_of_individuals_count"

  count_cols <- df %>%
    select(ends_with("count")) %>%
    colnames()

  # The denominator is not expressed as a share of itself.
  numerator_cols <- count_cols[count_cols != denominator_name]

  share_fn <- list(atoshare = ~ 100 * . / ato_number_of_individuals_count)

  mutate_at(df, numerator_cols, share_fn)
}


#' Express dollar columns per individual.
#'
#' For every column whose name ends in "dollars", adds a new column with the
#' suffix `_per_capita` holding `column / ato_number_of_individuals_count`.
#'
#' @param df Data containing `ato_number_of_individuals_count`.
#' @return `df` with the additional `*_per_capita` columns.
ato_table_6_sa3_per_capita <- function(df) {
  dollar_cols <- df %>%
    select(ends_with("dollars")) %>%
    colnames()

  per_capita_fn <- list(per_capita = ~ . / ato_number_of_individuals_count)

  mutate_at(df, dollar_cols, per_capita_fn)
}


#' Keep only the variables of interest.
#'
#' Selects `SA3_CODE_2016` plus every column whose name contains one of the
#' `keep_patterns`, then removes every column whose name contains one of the
#' `drop_patterns`.
#'
#' @param df Data with an `SA3_CODE_2016` column.
#' @return `df` restricted to the selected columns.
ato_table_6_keep_vars <- function(df) {

  # Literal substrings; columns are kept in the order these patterns match.
  keep_patterns <- c(
    "rent_interest_deductions",
    "net_rent_profit",
    "gross_interest",
    "capital_gains",
    "dividend",
    "salary_or_wages",
    "taxable_income",
    "low_income",
    "allowances_earnings_tips_directors_fees"
  )

  # Columns that the patterns above pick up but which are not wanted.
  drop_patterns <- c(
    "tax_on_taxable_income_count",
    "capital_gains_net_capital_losses_carried_forward",
    "estimated_tax_on_net_capital_gains",
    "share_of_franking_credit_from_franked_dividends",
    "tax_withheld_from_salary_or_wages",
    "taxable_income_or_loss3_count",
    "tax_on_taxable_income_dollars"
  )

  select(
    df,
    SA3_CODE_2016,
    contains(keep_patterns),
    -contains(drop_patterns)
  )
}

#' Run the whole ATO Table 6 pipeline.
#'
#' Imports and joins the data, aggregates it by `SA3_CODE_2016`, adds the
#' share and per-capita columns, and finally keeps the variables of interest.
#'
#' @return The result of `ato_table_6_keep_vars()` on the processed data.
ato_table_6_lazy <- function() {
  joined <- ato_table_6_sa3()
  by_sa3 <- ato_table_6_sa3_aggregate(joined, SA3_CODE_2016)
  with_shares <- ato_table_6_sa3_shares(by_sa3)
  with_per_capita <- ato_table_6_sa3_per_capita(with_shares)

  ato_table_6_keep_vars(with_per_capita)
}