/*******************************************************************************

	1_import_WAD.do
	
	This file contains code to extract and clean data from the Department of
	Jobs and Small Business Workplace Agreements Database (WAD) for the analysis
	in Bishop J and I Chan (2019), Is Declining Union Membership Contributing
	to Low Wages Growth?, RBA Research Discussion Paper No 2019-02.
	
*******************************************************************************/

* Start from an empty session and switch off -more- pauses in the output
clear all
set more off

* set directory 
* dir_in  : folder holding the raw WAD Excel workbooks; the intermediate
*           per-period .dta files are also written to (and erased from) it
* dir_out : folder that holds ANZSIC_subdivisions.xlsx and receives the
*           cleaned .dta files
* Both values are placeholders and must be replaced before running
local dir_in "<path for WAD data here>"
local dir_out "<path for 'data' folder here>"

* Begin in the raw-data folder; later sections switch between the two folders
cd "`dir_in'"

/********************************************************************************************
    1 - Wage increment file
*********************************************************************************************/

* Import wage increment files
* Reads the "Wage Increment" sheet of every two-year WAD workbook (start years
* 1991, 1993, ..., 2017), names its nine columns and stores the result as
* wage_increment_<start>_<end>.dta in the current folder

forvalues start_yr = 1991(2)2017 {

	local end_yr = `start_yr' + 1
	local workbook "WAD2017_MergedAgreementGeneralInfo_1_01_`start_yr'_12-00-00_AM_31_12_`end_yr'_12-00-00_AM.xlsx"

	* the import begins at cell A2 and, with no firstrow option, the variables
	* are named after their Excel column letters
	import excel "`workbook'", sheet("Wage Increment") cellrange(A2)

	rename (A      B      C         D       E           F            G           H              I) ///
	       (eba_id incr_n incr_date incr_pc incr_amount non_compound conditional incr_pc_anzsic incr_tot)

	save wage_increment_`start_yr'_`end_yr', replace
	clear
}

* Append files
* Stack the per-period files into one dataset, deleting each intermediate
* file as soon as its rows have been appended

use wage_increment_1991_1992, clear

forvalues start_yr = 1993(2)2017 {
	local end_yr = `start_yr' + 1
	local part wage_increment_`start_yr'_`end_yr'
	append using `part'
	erase `part'.dta
}

* the first period was loaded with -use- rather than appended; remove it last
erase wage_increment_1991_1992.dta

* Clean identifiers, numeric fields and date variables

* agreement ids are put in upper case (the same is done in every file that is
* later merged on eba_id)
replace eba_id = upper(eba_id)

* convert to numeric where the columns were read as text
destring incr_pc incr_amount, replace

* turn the day-month-year text date into a Stata daily date under the same name
rename incr_date incr_date_txt
gen incr_date = date(incr_date_txt, "DMY")
format incr_date %td
drop incr_date_txt

* quarter that contains incr_date
gen incr_date_qy = qofd(incr_date)
format incr_date_qy %tq

drop incr_pc_anzsic

* Identify first, second and last wage increases in EBA
* Each of the three variables is constant within eba_id

by eba_id, sort: egen last_incr = max(incr_date)
by eba_id, sort: egen firs_incr = min(incr_date)

* only rows with incr_n equal to 2 contribute a date; all other rows are
* missing and are ignored by max()
by eba_id, sort: egen secd_incr = max(cond(incr_n == 2, incr_date, .))

format last_incr firs_incr secd_incr %td

* save long file (one row per agreement and increment number)

cd "`dir_out'"
save increments_long, replace

* save wide file (one row per agreement)
* per_incr lists the variables that differ between increments; reshape gives
* each of them one column per value of incr_n. The remaining kept variables
* must be constant within eba_id for the reshape to succeed.

local per_incr incr_pc incr_amount non_compound conditional incr_date incr_date_qy

keep eba_id incr_n incr_tot last_incr firs_incr secd_incr `per_incr'

reshape wide `per_incr', i(eba_id) j(incr_n)

compress
save increments_wide, replace

clear

/********************************************************************************************
    2 - Employer file
	
	   Not strictly firm-level, because often multiple agreements per firm. 
       
	   Variables like employees_total measure the number of employees covered by the *agreement*
       
	   There are sometimes multiple firms per agreement (upwards of 300 in some cases)   
*********************************************************************************************/

* Import list of ANZSIC industry codes (ABS)
* Each of the two sheets in ANZSIC_subdivisions.xlsx becomes a lookup file in
* which the industry-name column is called anzsic_firm, the name of the
* industry column in the employer data

cd "`dir_out'"

* cell range occupied by each sheet (header row included)
local cells93 "A1:B54"
local cells06 "A1:B87"

foreach vintage in 93 06 {
	clear
	import excel using "ANZSIC_subdivisions.xlsx", sheet("anzsic`vintage'_subdivisions") cellrange(`cells`vintage'') firstrow
	rename anzsic`vintage'_name anzsic_firm
	save anzsic`vintage'_subdivisions, replace
}

clear

* back to the raw-data folder for the employer workbooks
cd "`dir_in'"

* Import employer files
* Reads the "Employee" sheet of every two-year WAD workbook, names its eight
* columns and stores the result as employer_<start>_<end>.dta

forvalues start_yr = 1991(2)2017 {

	local end_yr = `start_yr' + 1
	local workbook "WAD2017_MergedAgreementGeneralInfo_1_01_`start_yr'_12-00-00_AM_31_12_`end_yr'_12-00-00_AM.xlsx"

	import excel "`workbook'", sheet("Employee") cellrange(A2)

	rename (A      B   C         D               E                        F               G                  H) ///
	       (eba_id abn legalname employees_total employees_estimatedtotal employees_women employees_parttime anzsic_firm)

	* make all variables strings (otherwise there are conflicts during append)
	tostring _all, replace

	save employer_`start_yr'_`end_yr', replace
	clear
}
*

* Append files
* Stack the per-period employer files, deleting each intermediate file as
* soon as its rows have been appended

use employer_1991_1992, clear

forvalues start_yr = 1993(2)2017 {
	local end_yr = `start_yr' + 1
	local part employer_`start_yr'_`end_yr'
	append using `part'
	erase `part'.dta
}

* the first period was loaded with -use- rather than appended; remove it last
erase employer_1991_1992.dta

* clean data

replace eba_id = upper(eba_id)

drop employees_women employees_parttime


/* Calculate total number of employees covered by *agreement*

   There are cases of multiple firms per agreement, so add these to get total employees covered by agreement

   DJSB definitions: 
   
     - Employees total: total number of employees covered by agreement (if known)
   
     - Estimated employees total: used when the number of employees covered by an agreement is unknown. If the agreement replaces an 
       earlier agreement for which employee coverage is known, the employee coverage of the earlier agreement is used. For agreements still 
       lacking employee coverage, a modified mean is used. This is generated for each industry group by current quarter, removing the 
       largest 5% and smallest 5% of agreements and then calculating the mean of the remainder. */

* show how often each eba_id is repeated (display only, the data are unchanged)
duplicates report eba_id

* the head counts were read as text: blanks become ".", the category
* "5 or fewer" becomes 5, and the columns are then converted to numeric

foreach size_var of varlist employees_total employees_estimatedtotal {
	replace  `size_var' = "." if `size_var' == ""
	replace  `size_var' = "5" if `size_var' == "5 or fewer"
	destring `size_var', replace
}

* use the reported head count and fall back on the estimate when it is missing
gen employees_combined = cond(employees_total == ., employees_estimatedtotal, employees_total)

/* Merge in industry codes (both 93 and 06 basis)
   In cases of multiple firms per agreement, assign the code that occurs most often among the
   agreement's firm rows (mode). Rows are not weighted by employee numbers, and minmode picks
   the smallest code when several codes are tied. */

cd "`dir_out'"

foreach vintage in 93 06 {
	* keep every employer row, matched or not; lookup-only industries are discarded
	merge m:1 anzsic_firm using anzsic`vintage'_subdivisions, keep(master match) nogenerate
}

drop anzsic_firm

foreach vintage in 93 06 {
	bysort eba_id: egen anzsic`vintage'_code_mode = mode(anzsic`vintage'_code), minmode
}

* One row per agreement: head counts are added up across firms, while the
* industry codes, abn and legalname take the first non-missing value
* NOTE(review): collapse (sum) returns 0, not missing, when every value in a
* group is missing, so agreements without any reported head count end up with
* employees_total equal to 0 - confirm that downstream code expects this
collapse (sum)     employees_total employees_estimatedtotal employees_combined ///
         (firstnm) anzsic93_code_mode anzsic06_code_mode abn legalname, by(eba_id)

rename (anzsic93_code_mode anzsic06_code_mode) (anzsic93 anzsic06)

* the lookup files are no longer needed
foreach vintage in 93 06 {
	erase anzsic`vintage'_subdivisions.dta
}

compress
save employer, replace

clear

/********************************************************************************************
    3 - Agreement file (Basic info)
	
	There are no duplicates of eba_id in this file, since data are measured at the agreement level 
*********************************************************************************************/

* Import agreement files
* Reads the "Basic Info" sheet of every two-year WAD workbook, names its 49
* columns and stores the result as basic_<start>_<end>.dta

cd "`dir_in'"

* Excel column letters of the sheet and, in the same order, the variable
* names they receive (column A becomes eba_id, column AW becomes states)
local col_letters A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z ///
                  AA AB AC AD AE AF AG AH AI AJ AK AL AM AN AO AP AQ AR AS AT AU AV AW

local col_names eba_id eba_type print_no matter eba_name sector aps                            ///
                cert_date comm_date expi_date term_date term_reason duration aawi             ///
                replaced_ag_checked openduration variations singleinterestemp publicinterest  ///
                internal lowpaidbargaining hasundertaking singleissue                         ///
                firstincnotquant_otherreason firstincnotquant_classsiftruct nowagerates       ///
                minwagemovements_fwaauto minwagemovements_fwalinked                           ///
                cpieconfactors_cpiauto cpieconfactors_cpilinked cpieconfactors_othereconadj   ///
                allowancecpi                                                                  ///
                oneoffbonus_uncond oneoffbonus_uncondamount oneoffbonus_cond oneoffbonus_condamount ///
                performancepay_indiv performancepay_group performancepay_allemp               ///
                performancepay_unclear performancepay_team                                    ///
                shareownership_profitshare shareownership_sharedacquisition                   ///
                countofemployer replacedagreements awards ANZSIC_agreement unions states

forvalues start_yr = 1991(2)2017 {

	local end_yr = `start_yr' + 1
	local workbook "WAD2017_MergedAgreementGeneralInfo_1_01_`start_yr'_12-00-00_AM_31_12_`end_yr'_12-00-00_AM.xlsx"

	import excel "`workbook'", sheet("Basic Info") cellrange(A2)

	rename (`col_letters') (`col_names')

	* record the two-year period of the workbook each row was read from
	gen sourcefile="`start_yr'_`end_yr'"

	* make all variables strings (otherwise there are conflicts during append)
	tostring _all, replace

	save basic_`start_yr'_`end_yr', replace
	clear
}
*
* Append files
* Stack the per-period agreement files, deleting each intermediate file as
* soon as its rows have been appended

use basic_1991_1992, clear

forvalues start_yr = 1993(2)2017 {
	local end_yr = `start_yr' + 1
	local part basic_`start_yr'_`end_yr'
	append using `part'
	erase `part'.dta
}

* the first period was loaded with -use- rather than appended; remove it last
erase basic_1991_1992.dta

* clean data

* identifiers are put in upper case, both the agreement's own id and the
* field holding the replaced agreements
replace eba_id = upper(eba_id)
replace replacedagreements = upper(replacedagreements)

* these columns were stored as text by the import step: blanks become "."
* and the columns are then converted to numeric
foreach num_var of varlist duration aawi oneoffbonus_uncondamount oneoffbonus_condamount countofemployer {
	replace  `num_var' = "." if `num_var' == ""
	destring `num_var', replace
}

cd "`dir_out'"

compress
save basic, replace

clear

/********************************************************************************************
    4 - Replaced agreements file
*********************************************************************************************/

* Import replaced agreements files
* Reads the "Replacement" sheet of every two-year WAD workbook, names its four
* columns and stores the result as replaced_<start>_<end>.dta

cd "`dir_in'"

forvalues start_yr = 1991(2)2017 {

	local end_yr = `start_yr' + 1
	local workbook "WAD2017_MergedAgreementGeneralInfo_1_01_`start_yr'_12-00-00_AM_31_12_`end_yr'_12-00-00_AM.xlsx"

	import excel "`workbook'", sheet("Replacement") cellrange(A2)

	rename (A      B               C                   D) ///
	       (eba_id eba_id_replaced generation_replaced employeeoverlap_replaced)

	* make all variables strings (otherwise there are conflicts during append)
	tostring _all, replace

	save replaced_`start_yr'_`end_yr', replace
	clear
}

* Append files
* Stack the per-period replacement files, deleting each intermediate file as
* soon as its rows have been appended

use replaced_1991_1992, clear

forvalues start_yr = 1993(2)2017 {
	local end_yr = `start_yr' + 1
	local part replaced_`start_yr'_`end_yr'
	append using `part'
	erase `part'.dta
}

* the first period was loaded with -use- rather than appended; remove it last
erase replaced_1991_1992.dta

* clean data

* both id columns are put in upper case
foreach id_var of varlist eba_id eba_id_replaced {
	replace `id_var' = upper(`id_var')
}

* generation_replaced was stored as text by the import step: blanks become "."
* and the column is then converted to numeric
replace  generation_replaced = "." if generation_replaced == ""
destring generation_replaced, replace

cd "`dir_out'"

compress
save replaced_agreements, replace

clear

/********************************************************************************************
    5 - Merge the files 
*********************************************************************************************/

cd "`dir_out'"

* Start from the agreement-level file and attach the employer totals and the
* wide increment file by eba_id; rows without a match on either side are kept
use basic, clear

merge 1:1 eba_id using employer, nogenerate
merge 1:1 eba_id using increments_wide, nogenerate

* Clean date variables
* Each day-month-year text date is replaced by a Stata daily date under the
* same name, and a quarterly version with the suffix _qy is added

foreach dt in cert_date comm_date expi_date term_date {
	rename `dt' `dt'_txt
	gen `dt' = date(`dt'_txt, "DMY")
	format `dt' %td
	drop `dt'_txt

	gen `dt'_qy = qofd(`dt')
	format `dt'_qy %tq
}

drop print_no matter term_reason lowpaidbargaining internal hasundertaking awards

compress
save wad_extract, replace

clear

* end of do file
