/*******************************************************************************

	3_create_panel.do
	
	This file contains code to identify 'families' of agreements (i.e. agreements
	representing the same worker groups over time) from the WAD. The code
	restricts the links between agreements to cases where the replacing agreement
	covers exactly the same worker group as its predecessor agreement. These 
	agreement families are used for the baseline regression results
	and robustness tests in Section 4 of Bishop J and I Chan (2019), Is
	Declining Union Membership Contributing to Low Wages Growth?, RBA Research
	Discussion Paper No 2019-02.
	
*******************************************************************************/

* Session setup: do not pause long output, and start from an empty session
set more off
clear all

* Folder holding the input .dta files; replace the placeholder before running
local dir  "<path for 'data' folder here>"

* All file names used below are given relative to this folder
cd "`dir'"

/*********************************************************************************************
* For comparison purposes, identify all EBAs that could potentially end up in a panel 
  
  This is the 'maximum possible panel', and is defined as all agreements that replace an agreement
  or are replaced by an agreement. 
  
  The actual regression sample will be a subset of this 
*********************************************************************************************/

* Step 1: list every agreement that is *replaced* by some other agreement

use replaced_agreements, clear

keep eba_id_replaced
duplicates drop

rename eba_id_replaced eba_id
gen is_replaced = 1

sort eba_id

save replaced_any, replace
clear

* Step 2: attach the replaced flag to the cleaned WAD extract
* (agreements that are not in the extract are not added)

use wad_extract_clean, clear

merge 1:1 eba_id using replaced_any, keep(master match) nogenerate

replace is_replaced = 0 if missing(is_replaced)

* Step 3: flag agreements that *replace* (non-empty replacedagreements field)

gen replaces = (replacedagreements != "")

* Step 4: the maximum possible panel is any agreement that is replaced or that replaces

gen panel_maximum = (is_replaced == 1 | replaces == 1)

tab panel_maximum
tab is_replaced replaces

tab surveyq is_replaced
tab surveyq replaces

save wad_extract_clean_panel, replace
clear


* Build two lookup files of agreement dates from the cleaned WAD extract:
*   dates1 - keyed on eba_id (start, end and certification dates)
*   dates2 - keyed on eba_id_replaced (start and end dates of the agreement being replaced)

use eba_id start_date end_date cert_date using wad_extract_clean, clear

save dates1, replace

* dates2 is the same lookup without cert_date; the _r suffix marks the replaced agreement
drop   cert_date
rename end_date   end_date_r
rename start_date start_date_r
rename eba_id     eba_id_replaced

save dates2, replace
clear

* Each row of replaced_agreements links a replacing agreement (eba_id)
* to an agreement that it replaces (eba_id_replaced)

use replaced_agreements, clear
sort eba_id

* dates of the replacing agreement; rows with no match in dates1 are kept with missing dates
merge m:1 eba_id using dates1, keep(master match) nogenerate

* dates of the replaced agreement (start_date_r, end_date_r)
merge m:1 eba_id_replaced using dates2, keep(master match) nogenerate

sort eba_id generation

/*  Flag the 'final' (most recent) agreement of each family

    An agreement counts as replaced only when the link replacing it is coded R=P.
    A final agreement is one that is never replaced in that sense and whose own
    link to its predecessor is coded R=P  */

save working1, replace

* list of agreements that are replaced through an R=P link

keep if strpos(lower(employeeoverlap_replaced),"r equals p")

keep eba_id_replaced
duplicates drop

rename eba_id_replaced eba_id
gen replaced_flag = 1

save replaced_ag, replace

* bring the replaced flag back onto the full set of links

use working1, clear

merge m:1 eba_id using replaced_ag, keep(master match) nogenerate

* r_equals_p marks rows whose link is coded R=P

gen r_equals_p = (strpos(lower(employeeoverlap_replaced),"r equals p") > 0)

* final agreement: not replaced through any R=P link, and its own link on this row is R=P

gen final_ag = (missing(replaced_flag) & r_equals_p == 1)

save working2, replace

/* Identify all predecessors of the final agreement

   WAD coding practice changed in 2011m1. Before then, all generations of replaced
   agreements were recorded; from 2011m1 onwards only the most recently replaced
   agreement was recorded. The two periods are therefore handled separately.

   Post-2011 sample first: R=P links whose replacing agreement has cert_date on or after 1 Jan 2011.
   NOTE(review): a missing cert_date also satisfies the >= comparison in Stata, so such
   rows stay in this sample - confirm that is intended  */

keep if r_equals_p==1 & cert_date>=td(01Jan2011)

* keep only links whose replaced agreement appears exactly once; every row of a repeated
* eba_id_replaced is dropped (most such cases are already excluded by focusing on R=P)
* NOTE(review): this de-duplicates on the replaced agreement (eba_id_replaced), not on the
* replacing agreement (eba_id) - confirm that this is the intended direction

bysort eba_id_replaced: keep if _N==1

save working3, replace

* Build one row per post-2011 family, starting from its final agreement and walking back
* through working3 one predecessor at a time. Chains are followed for at most seven agreements.

keep if final_ag==1

keep eba_id eba_id_replaced start_date start_date_r

* eba_id1 and sd1 hold the final agreement and its start date; the agreement it replaces
* becomes the merge key (eba_id) so that its own predecessor can be looked up in working3

rename eba_id eba_id1
rename eba_id_replaced eba_id
rename start_date sd1
rename start_date_r sd2

merge 1:1 eba_id using working3
drop if _merge==2

* Each pass stores the current key as eba_id2 to eba_id6, promotes its predecessor to be
* the next key, stores that predecessor start date as sd3 to sd7, and looks the new key up again

forvalues x=2(1)6 {
keep eba_id* sd* start_date_r
rename eba_id eba_id`x'
rename eba_id_replaced eba_id
local i = `x'+1
rename start_date_r sd`i'
merge m:1 eba_id using working3
drop if _merge==2
}
*
* the key left over after the last pass is the seventh agreement in the chain
rename eba_id eba_id7
keep eba_id* sd*
drop eba_id_replaced
order eba_id* sd*

* eba_last is the highest-numbered non-empty eba_id in the row (the earliest agreement found);
* sd_last is the highest-numbered non-missing start date
gen     eba_last = eba_id7
gen     sd_last = sd7
forvalues x=1(1)6 {
local i = 7-`x'
replace eba_last = eba_id`i' if eba_last==""
replace sd_last = sd`i' if sd_last==.
}
*

format sd_last %td

save working4, replace
clear

* Merge on the relevant pre-2011 predecessor agreements
* If an agreement has R not equal to P, drop it *and all its predecessors*

use working2, clear

* links whose replacing agreement has cert_date on or before 31 Dec 2010 (missing cert_date is excluded)
keep if cert_date<=td(31Dec2010)

* ts is the start date of a replaced agreement whose link is not R=P;
* ts_max is the latest such date among the links of the same replacing agreement
gen    ts     = start_date_r if r_equals_p==0
bysort eba_id: egen ts_max=max(ts)

format ts     %d
format ts_max %d

* drop replaced agreements that started on or before the latest non-R=P link
drop if start_date_r<=ts_max & ts_max!=.

sort eba_id start_date_r

save working5, replace

* working6: one row per replacing agreement, with its remaining replaced agreements
* spread across eba_id_p1, eba_id_p2, and so on
keep eba_id eba_id_replaced

rename eba_id_replaced eba_id_p

* id numbers the replaced agreements within each replacing agreement, in the current sort order
bysort eba_id: gen id = _n

reshape wide eba_id_p, i(eba_id) j(id)

* renamed so that it can be matched on eba_last, the last agreement of each post-2011 chain
rename eba_id eba_last

save working6, replace

* Attach the pre-2011 predecessors (working6) to each post-2011 chain (working4),
* matching on the last agreement found in the chain

use  working4, clear

merge 1:1 eba_last using working6
drop if _merge==2

drop sd* eba_last _merge

* Renumber the predecessor columns so that they continue after eba_id1 to eba_id7
* (eba_id_p1 becomes eba_id8, eba_id_p2 becomes eba_id9, and so on).
* The number of eba_id_p columns depends on the data, so loop over whichever columns
* exist rather than assuming there are exactly 18.

unab pvars : eba_id_p*
foreach v of local pvars {
	local x : subinstr local v "eba_id_p" ""
	local i = `x'+7
	rename `v' eba_id`i'
}
*
sort eba_id1

* one family per post-2011 final agreement
gen family_id=_n

* one row per family member; unused slots are discarded
reshape long eba_id, i(family_id) 
drop if eba_id==""

drop _j

save working7, replace

* Families whose final agreement is in the pre-2011 sample: take each final agreement
* together with every replaced agreement still recorded against it in working5

use working5, clear

keep if final_ag==1
keep eba_id eba_id_replaced

* family ids for these families are offset by 50000
* NOTE(review): presumably to keep them distinct from the ids in working7 - confirm
egen    family_id = group(eba_id) 
replace family_id = family_id + 50000

* file1 holds the links; file2 holds the final agreements themselves
save   file1, replace
keep   eba_id family_id
duplicates drop
save   file2, replace

* the replaced agreements of each family, stacked with the final agreements
use    file1, clear
drop   eba_id
rename eba_id_replaced eba_id
duplicates drop

append using file2
sort   family_id 

* Add the families built from the post-2011 sample

append using working7

sort family_id

* an agreement that appears in more than one row is removed entirely (all of its rows are dropped)

bysort eba_id: keep if _N==1

* renumber the family ids consecutively

egen family_id2 = group(family_id)
drop   family_id
rename family_id2 family_id

sort eba_id

save family_id, replace
clear

* Attach family_id to the main dataset (agreements that are not in the main dataset are not added)

use wad_extract_clean_panel, clear
merge 1:1 eba_id using family_id, keep(master match) nogenerate

sort family_id start_date

order eba_id family_id eba_name aawi state_code employees_combined firs_incr cert_date comm_date last_incr expi_date term_date

* a family with a single member in the dataset is not a panel: blank out its family_id

bysort  family_id: egen fam_count = count(family_id)
replace family_id = . if fam_count == 1
drop    fam_count

/*********************************************************************************************
 Backcast ABN
*********************************************************************************************/

* Where the highest and lowest ABN within a family coincide, that ABN is carried to every
* member of the family (in some cases the ABN changes over time)

* blank the ABN where countofemployer exceeds 1
replace abn = "" if countofemployer>1

* hard-coded correction for one ABN value
replace abn="33370684005" if abn=="33 370 684"

destring abn, replace

bysort family_id: egen double abn_max = max(abn)
bysort family_id: egen double abn_min = min(abn)

gen abn_unique = (abn_max==abn_min)

gen double abn_backcast = abn_max if abn_unique==1 

drop abn_max abn_min

* For agreements with non-unique ABN, keep track of all possible ABNs:
* abn_wide has one row per family with its distinct ABNs in abn1, abn2, and so on

save wad_extract_clean_panel, replace

keep if abn_unique == 0

keep family_id abn

drop if missing(family_id) | missing(abn)

duplicates drop

bysort family_id: gen abn_n = _n

reshape wide abn, i(family_id) j(abn_n)

save abn_wide, replace
clear

* back on the main dataset, abn1 takes the backcast ABN wherever one exists and abn1 is empty

use wad_extract_clean_panel, clear

merge m:1 family_id using abn_wide, nogenerate

replace abn1 = abn_backcast if !missing(abn_backcast) & missing(abn1)

drop abn_backcast

save wad_extract_clean_panel, replace

* remove the intermediate files working2.dta to working7.dta
forvalues k = 2/7 {
	erase working`k'.dta
}
/*********************************************************************************************
 Examine completeness of matched sample
 
 Shows that our matched sample captures 85% of all agreements we can possibly capture, based 
 on the completeness of the underlying WAD data
*********************************************************************************************/

* Identify descendants: for each replacing agreement (eba_id),
* the share of its links that are coded R=P

use replaced_agreements, clear

gen r_equals_p_dec = (strpos(lower(employeeoverlap_replaced),"r equals p") > 0)

collapse (mean) r_equals_p_dec, by(eba_id)

save r_equals_p_dec, replace

clear

* Future agreements: for each replaced agreement (eba_id_replaced),
* the share of the links replacing it that are coded R=P

use replaced_agreements, clear

gen r_equals_p_fut = (strpos(lower(employeeoverlap_replaced),"r equals p") > 0)

collapse (mean) r_equals_p_fut, by(eba_id_replaced)

rename eba_id_replaced eba_id

save r_equals_p_fut, replace

clear

* Combine the main dataset with the R=P shares. Only agreements already in the main dataset
* are kept, as with the earlier merges onto it; without keep() the agreements found only in
* the share files would be appended as extra observations with no WAD information.

use wad_extract_clean_panel, clear

merge 1:1 eba_id using r_equals_p_dec, nogen keep(master match)
merge 1:1 eba_id using r_equals_p_fut, nogen keep(master match)

gen year = year(start_date)

* versions of the shares with missing values (agreement not present in the share file) set to zero
gen     r_equals_p_dec_z = r_equals_p_dec 
replace r_equals_p_dec_z = 0 if r_equals_p_dec == .
gen     r_equals_p_fut_z = r_equals_p_fut 
replace r_equals_p_fut_z = 0 if r_equals_p_fut == .

* 1 if at least one link replacing this agreement is not R=P, 0 if all are R=P,
* missing if the agreement never appears as a replaced agreement
gen     r_notequals_p_fut_dum = 1 if r_equals_p_fut<1
replace r_notequals_p_fut_dum = 0 if r_equals_p_fut==1

* Share of agreements signed in a given year that will one day be replaced by an agreement for which R not equal to P

tabstat r_notequals_p_fut_dum                            , by(year)
tabstat r_notequals_p_fut_dum [weight=employees_combined], by(year)

* Matched sample captures 85% of all agreements we can possibly capture, based on the completeness of the underlying data

gen     in_panel = 0
replace in_panel = 1 if family_id!=.

* share = mean of in_panel relative to mean of panel_maximum, over the same observations
sum in_panel      if aawi!=. & surveyq>=tq(1997q1)
local s = r(mean)
sum panel_maximum if aawi!=. & surveyq>=tq(1997q1)
local m = r(mean)

local share = `s' / `m' * 100
disp `share' 

* remove the remaining intermediate files

local tmpfiles replaced_any dates1 dates2 working1 replaced_ag file1 file2 family_id abn_wide r_equals_p_dec r_equals_p_fut
foreach f of local tmpfiles {
	erase `f'.dta
}

* end of do file
