# Import ABS data

# Title: Import ABS table
# Creator: Calvin He
# Date Created: 19 September 2018
# Description: Given an ABS webpage link, this function scrapes all links on the page that match the file extension (xls by default).
# table_no is the index (into the matching links) of the file that will be downloaded and read in.
# The downloaded file is deleted by this code and only the dataframe is output.
# Inputs:
#     link: the webpage link that will be scraped
#     table_no: the table_no that will be downloaded
#     file_ext: the file extension to look for
#     sheet_name: the sheet name(s) to read
#     skip_rows: number of rows to skip when reading each sheet
# Requires: xml2, dplyr, rvest, stringr, readxl

import_abs_table <- function(link, table_no, file_ext = "xls", sheet_name = "Data1", skip_rows = 0) {
  # Returns a single data frame when one sheet is requested, otherwise a list
  # holding one data frame per entry of sheet_name.

  # Read the page and collect the href of every anchor
  webpage <- xml2::read_html(url(paste0(link)))
  hrefs <- rvest::html_attr(rvest::html_nodes(webpage, "a"), "href")

  # Keep only the hrefs that contain the requested extension
  links_mod <- hrefs[grep(sprintf("\\.(%s)", file_ext), hrefs)]
  if (length(links_mod) == 0) {
    stop("No links with extension '", file_ext, "' found at ", link, call. = FALSE)
  }

  # Fail early with a readable message instead of handing an NA URL to
  # download.file()
  if (!is.numeric(table_no) || length(table_no) != 1 || is.na(table_no) ||
      table_no < 1 || table_no > length(links_mod)) {
    stop(
      "table_no must be a single number between 1 and ", length(links_mod),
      call. = FALSE
    )
  }

  # File name = run of legal file-name characters that ends in the extension
  filenames <- stringr::str_match(links_mod, sprintf("([^\\<\\>\\:\\\"\\/\\\\\\|\\?\\*\\&]*)\\.(%s)", file_ext))[, 1]

  # Relative hrefs need the ABS host in front; absolute ones are used as they are
  urls <- ifelse(
    grepl("^https?://", links_mod),
    links_mod,
    paste0("http://www.abs.gov.au", links_mod)
  )

  # Download into the working directory. The cleanup is registered before the
  # download so the file is removed even if downloading or reading fails.
  dest_file <- filenames[table_no]
  on.exit(unlink(dest_file), add = TRUE)
  download.file(urls[table_no], dest_file, mode = "wb")
  cat(paste0("\nFile:", dest_file, " has been downloaded\n"))

  # One data frame per requested sheet
  rawdata <- lapply(sheet_name, function(sheet) {
    readxl::read_excel(dest_file, sheet = sheet, skip = skip_rows)
  })

  # A single sheet is returned bare rather than wrapped in a list
  if (length(sheet_name) == 1) {
    rawdata <- rawdata[[1]]
  }

  rawdata
}

# ---------------------------------------------------------------------------------------------------------------------
# Import ABS Zip
# Scrapes `link` for zip downloads, downloads the one archive whose file name
# matches `table_name`, extracts it into the working directory and reads one
# sheet of the Excel workbook it contains.
# Inputs:
#     link: the webpage link that will be scraped
#     table_name: regular expression matched against the scraped file names
#                 (both sides are lower-cased first); must match exactly one file
#     file_ext: the file extension to look for
#     sheet_name: the sheet to read from the extracted workbook
# Returns: the sheet as returned by read_excel()
# The downloaded archive and the workbook that was read are deleted on exit,
# including when a step fails part-way through.
import_abs_zip <- function(link, table_name, file_ext = "zip", sheet_name = "Data 1") {

  # Read webpage and collect the href of every anchor
  webpage <- read_html(url(paste0(link)))
  hrefs <- html_attr(html_nodes(webpage, "a"), "href")
  links_mod <- hrefs[grep(sprintf("\\.(%s)", file_ext), hrefs)]

  # File name = run of legal file-name characters that ends in the extension
  filenames <- str_match(links_mod, sprintf("([^\\<\\>\\:\\\"\\/\\\\\\|\\?\\*\\&]*)\\.(%s)", file_ext))[, 1]

  # Relative hrefs need the ABS host in front; absolute ones are used as they are
  urls <- ifelse(
    grepl("^https?://", links_mod),
    links_mod,
    paste0("http://www.abs.gov.au", links_mod)
  )

  # String search (case-insensitive); exactly one archive must match
  index <- grep(str_to_lower(table_name), str_to_lower(filenames))
  if (length(index) == 0) {
    stop("No '", file_ext, "' file matching '", table_name, "' found at ", link, call. = FALSE)
  }
  if (length(index) > 1) {
    stop(
      "'", table_name, "' matches more than one file: ",
      paste(filenames[index], collapse = ", "),
      call. = FALSE
    )
  }

  # Download; cleanup is registered first so a failed step leaves nothing behind
  zip_file <- filenames[index]
  on.exit(unlink(zip_file), add = TRUE)
  download.file(urls[index], zip_file, mode = "wb")
  cat(paste0("\nFile:", zip_file, " has been downloaded\n"))

  # unzip() returns the paths it extracted
  extracted <- unzip(zip_file)

  # First choice: the upper-cased archive name with an xlsx extension.
  # If no such file exists (e.g. different letter case on a case-sensitive
  # file system), fall back to the first Excel file that was extracted.
  workbook <- gsub("ZIP", "xlsx", str_to_upper(zip_file))
  if (!file.exists(workbook)) {
    excel_files <- extracted[grepl("\\.xlsx?$", extracted, ignore.case = TRUE)]
    if (length(excel_files) == 0) {
      stop("No Excel workbook found after extracting ", zip_file, call. = FALSE)
    }
    workbook <- excel_files[1]
  }
  on.exit(unlink(workbook), add = TRUE)

  read_excel(workbook, sheet = sheet_name)
}



# Import an ABS zip then extract a CSV file
# Scrapes `link` for zip downloads, downloads the one archive whose file name
# matches `table_name`, extracts it into the working directory and reads the
# CSV called `filename` with fread().
# Inputs:
#     link: the webpage link that will be scraped
#     table_name: regular expression matched against the scraped file names
#                 (both sides are lower-cased first); must match exactly one file
#     filename: path of the CSV, relative to the working directory, once the
#               archive has been extracted
#     file_ext: the file extension to look for
# Returns: the table returned by fread()
# The downloaded archive and `filename` are deleted on exit, including when a
# step fails part-way through.
import_abs_zip_csv <- function(link, table_name, filename, file_ext = "zip") {

  # Read webpage and collect the href of every anchor
  webpage <- read_html(url(paste0(link)))
  hrefs <- html_attr(html_nodes(webpage, "a"), "href")
  links_mod <- hrefs[grep(sprintf("\\.(%s)", file_ext), hrefs)]

  # File name = run of legal file-name characters that ends in the extension
  filenames <- str_match(links_mod, sprintf("([^\\<\\>\\:\\\"\\/\\\\\\|\\?\\*\\&]*)\\.(%s)", file_ext))[, 1]

  # Relative hrefs need the ABS host in front; absolute ones are used as they are
  urls <- ifelse(
    grepl("^https?://", links_mod),
    links_mod,
    paste0("http://www.abs.gov.au", links_mod)
  )

  # String search (case-insensitive); exactly one archive must match
  index <- grep(str_to_lower(table_name), str_to_lower(filenames))
  if (length(index) == 0) {
    stop("No '", file_ext, "' file matching '", table_name, "' found at ", link, call. = FALSE)
  }
  if (length(index) > 1) {
    stop(
      "'", table_name, "' matches more than one file: ",
      paste(filenames[index], collapse = ", "),
      call. = FALSE
    )
  }

  # Download; cleanup is registered first so a failed step leaves nothing behind
  zip_file <- filenames[index]
  on.exit(unlink(zip_file), add = TRUE)
  download.file(urls[index], zip_file, mode = "wb")
  cat(paste0("\nFile:", zip_file, " has been downloaded\n"))

  unzip(zip_file)
  on.exit(unlink(filename), add = TRUE) # unlink csv

  # Give a readable error when the requested CSV is not part of the archive
  if (!file.exists(filename)) {
    stop("'", filename, "' was not found after extracting ", zip_file, call. = FALSE)
  }

  fread(filename)
}




# -------------------------------------------------------------------------------------------
# Takes a dataframe holding a raw ABS data sheet and tidies it:
#   1. optionally keeps only the columns of one series type
#   2. renames the first column to 'Date'
#   3. optionally converts the Date column to a date class
#   4. optionally coerces every other column to numeric
#   5. drops rows whose non-Date values are all NA, non-Date columns that are
#      all NA, and rows whose Date is NA
# Inputs:
#     df: the dataframe to tidy; its first column holds the row labels / dates
#     excel_numeric: if TRUE the Date column is made numeric and passed to
#                    date_excel_number(), otherwise as.Date() is applied directly
#     date_conversion: whether to convert the Date column at all
#     numeric_conversion: whether to coerce columns 2 onwards to numeric
#     keep: "all" keeps every column; "sa", "trend" or "orig" keep the columns
#           whose "Series Type" row matches "seasonally", "trend" or "original"
#           (plus the label column). Pass a single value; the default vector
#           contains "all" and therefore keeps everything.
# NOTE(review): date_excel_number() is defined outside this block; presumably it
# turns Excel serial numbers into dates - confirm.
abs_data_column_modifications <- function(df, excel_numeric = TRUE, date_conversion = FALSE, numeric_conversion = TRUE, keep = c("all", "sa", "trend", "orig")) {

  # Keep all or one series type -------------------------------------------------------
  # Checked in priority order; an unrecognised `keep` leaves df untouched
  series_pattern <- if ("all" %in% keep) {
    NULL
  } else if ("sa" %in% keep) {
    "seasonally|type"
  } else if ("trend" %in% keep) {
    "trend|type"
  } else if ("orig" %in% keep) {
    "original|type"
  } else {
    NULL
  }

  if (!is.null(series_pattern)) {
    series_type <- which(df[[1]] == "Series Type")
    if (length(series_type) == 0) {
      stop("No 'Series Type' row found in the first column of df", call. = FALSE)
    }
    # "type" in the pattern keeps the label column, whose entry in this row is
    # "Series Type"; drop = FALSE keeps a data frame when one column survives
    df <- df[, grepl(series_pattern, df[series_type, ], ignore.case = TRUE), drop = FALSE]
  }

  # Replace first column name with Date (safe for a single-column df)
  colnames(df)[1] <- "Date"

  # Date conversion
  if (date_conversion) {
    # coerce Date column to be class date based on excel_numeric argument
    if (excel_numeric == TRUE) {
      df$Date <- as.numeric(df$Date) %>% date_excel_number
    } else {
      df$Date <- df$Date %>% as.Date
    }
  }

  # Numeric conversion of columns 2 onwards -----------------------------------------
  if (numeric_conversion && ncol(df) > 1) {
    # lapply() always yields one vector per column, whatever the number of rows
    value_cols <- seq.int(2, ncol(df))
    df[value_cols] <- lapply(df[value_cols], as.numeric)
  }

  # remove row or column if all NA (except Date)
  df <- df %>% select(Date, everything())

  has_value <- rowSums(!is.na(df %>% select(-Date))) > 0 # row all NA
  df <- df[has_value, , drop = FALSE]

  non_empty <- unname(colSums(!is.na(df)) > 0) # column all NA
  non_empty[colnames(df) == "Date"] <- TRUE # Date is never dropped
  df <- df[, non_empty, drop = FALSE]

  df <- df %>% filter(!is.na(Date))
  return(df)
}

# ----------------------------------------------------------------------------------------------------
# Takes a dataframe holding a raw ABS data sheet, tidies it and uses the series
# IDs as column names:
#   1. optionally keeps only the columns of one series type
#   2. renames the first column to 'Date'
#   3. optionally converts the Date column to a date class
#   4. renames columns 2 onwards after the series-ID row, taken to be the row
#      directly above the first non-NA Date; when there is no such row the row
#      labelled "Series ID" in the first column is used instead
#   5. optionally coerces every other column to numeric
#   6. drops rows whose non-Date values are all NA, non-Date columns that are
#      all NA, and rows whose Date is NA
# Inputs:
#     df: the dataframe to tidy; its first column holds the row labels / dates
#     excel_numeric: if TRUE the Date column is made numeric and passed to
#                    date_excel_number(), otherwise as.Date() is applied directly
#     date_conversion: whether to convert the Date column at all
#     numeric_conversion: whether to coerce columns 2 onwards to numeric
#     keep: "all" keeps every column; "sa", "trend" or "orig" keep the columns
#           whose "Series Type" row matches "seasonally", "trend" or "original"
#           (plus the label column). Pass a single value; the default vector
#           contains "all" and therefore keeps everything.
# NOTE(review): date_excel_number() is defined outside this block; presumably it
# turns Excel serial numbers into dates - confirm.
abs_data_column_modifications_seriesid <- function(df, excel_numeric = TRUE, date_conversion = FALSE, numeric_conversion = TRUE, keep = c("all", "sa", "trend", "orig")) {

  # Keep all or one series type -------------------------------------------------------
  # Checked in priority order; an unrecognised `keep` leaves df untouched
  series_pattern <- if ("all" %in% keep) {
    NULL
  } else if ("sa" %in% keep) {
    "seasonally|type"
  } else if ("trend" %in% keep) {
    "trend|type"
  } else if ("orig" %in% keep) {
    "original|type"
  } else {
    NULL
  }

  if (!is.null(series_pattern)) {
    series_type <- which(df[[1]] == "Series Type")
    if (length(series_type) == 0) {
      stop("No 'Series Type' row found in the first column of df", call. = FALSE)
    }
    # "type" in the pattern keeps the label column, whose entry in this row is
    # "Series Type"; drop = FALSE keeps a data frame when one column survives
    df <- df[, grepl(series_pattern, df[series_type, ], ignore.case = TRUE), drop = FALSE]
  }

  # Remember the row labels before the Date column is converted
  row_labels <- as.character(df[[1]])

  # Replace first column name with Date (safe for a single-column df)
  colnames(df)[1] <- "Date"
  df <- df %>% select(Date, everything())

  # Convert date
  if (date_conversion) {
    # coerce Date column to be class date based on excel_numeric argument
    if (excel_numeric == TRUE) {
      df$Date <- as.numeric(df$Date) %>% date_excel_number %>% as.Date
    } else {
      df$Date <- df$Date %>% as.Date
    }
  }

  # New column names are series IDs ----------------------------------------------------
  # The ID row sits directly above the first row with a non-NA Date
  dated_rows <- which(!is.na(df$Date))
  id_row <- if (length(dated_rows) > 0) dated_rows[1] - 1 else NA
  if (is.na(id_row) || id_row < 1) {
    # No usable row above the first date: fall back to the labelled row
    id_row <- which(row_labels == "Series ID")[1]
    if (is.na(id_row)) {
      stop("Could not locate the series ID row in df", call. = FALSE)
    }
  }
  # Hand colnames<- a plain character vector rather than a one-row data frame
  series_ids <- as.character(as.list(df[id_row, -1, drop = FALSE]))
  colnames(df) <- c("Date", series_ids)

  # Numeric conversion of columns 2 onwards
  if (numeric_conversion && ncol(df) > 1) {
    # lapply() always yields one vector per column, whatever the number of rows
    value_cols <- seq.int(2, ncol(df))
    df[value_cols] <- lapply(df[value_cols], as.numeric)
  }

  # remove row or column if all NA (except Date)
  has_value <- rowSums(!is.na(df %>% select(-Date))) > 0 # row all NA
  df <- df[has_value, , drop = FALSE]

  non_empty <- unname(colSums(!is.na(df)) > 0) # column all NA
  non_empty[1] <- TRUE # Date (first column) is never dropped
  df <- df[, non_empty, drop = FALSE]

  df <- df %>% filter(!is.na(Date))
  return(df)
}


# clean abs column names ----------------------------------------------------------------
# Normalises the column names of df and moves the date column to the front:
#   - removes a trailing ";..." segment (a semicolon followed by at least one
#     non-semicolon character up to the end of the name)
#   - replaces ; , : and - with _
#   - trims surrounding whitespace, removes spaces and lower-cases the result
# The column then called 'date' becomes the first column and is renamed 'Date'.
# Inputs:
#     df: a dataframe with a column whose cleaned name is 'date'
# Returns: df with cleaned column names and 'Date' as its first column
abs_clean_column_names <- function(df) {

  cleaned <- sub(";[^;]+$", "", colnames(df))
  cleaned <- gsub("[;,:-]", "_", cleaned)
  cleaned <- gsub(" ", "", str_trim(cleaned))
  colnames(df) <- str_to_lower(cleaned)

  if (!"date" %in% colnames(df)) {
    stop("df has no column named 'date' after cleaning", call. = FALSE)
  }

  df <- df %>% select(date, everything())

  # Capitalise Date; touching only the first name also works when Date is the
  # only column
  colnames(df)[1] <- "Date"
  return(df)
}



# Read in Statistical area mapping to each other (e.g. SA3 --> SA4) --------------------------------
# Reads the SA3 mapping CSV and adds two columns named to match the CoreLogic
# table: region_name (a copy of SA3_NAME_2016) and area (a copy of
# AREA_ALBERS_SQKM).
# Inputs:
#     path: location of the mapping CSV (defaults to the project's data folder)
# Returns: the mapping as a data.frame with the two extra columns
abs_sa_mapping <- function(path = "./data/sa3_mapping.csv") {

  # Read in file
  sa3_mapping <- read.csv(path, stringsAsFactors = FALSE)

  # Both source columns must be present before they can be copied
  missing_cols <- setdiff(c("SA3_NAME_2016", "AREA_ALBERS_SQKM"), names(sa3_mapping))
  if (length(missing_cols) > 0) {
    stop(
      "Mapping file is missing column(s): ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # edit dataframe to match CoreLogic table
  sa3_mapping[["region_name"]] <- sa3_mapping[["SA3_NAME_2016"]]
  sa3_mapping[["area"]] <- sa3_mapping[["AREA_ALBERS_SQKM"]]

  sa3_mapping
}
