/*******************************************************************************

	5_panel_setup.do
	
	This file contains code to set the cleaned data from the WAD up as panel
	data for regression analysis in Section 4 of Bishop J and I Chan (2019), Is
	Declining Union Membership Contributing to Low Wages Growth?, RBA Research
	Discussion Paper No 2019-02.
	
*******************************************************************************/
 
* Start from an empty session and switch off the -more- pager so the
* do-file runs unattended.
set more off
clear all

* Folder holding the cleaned WAD extracts; edit the placeholder before running.
local data_dir  "<path for 'data' folder here>"

* Make the data folder the working directory: the input below and the output
* saved at the end of this file are both referenced relative to it.
cd "`data_dir'"

* Load the cleaned firm-level extract, replacing anything in memory.
use "wad_extract_clean_panel_firm", clear

/*********************************************************************************************
   Deal with cases where agreements with the same family_id start on the same day
 *******************************************************************************************/ 
 
* Purpose: make (family_id, start_date) unique among agreements that belong to
* a family, so that agreements can later be ordered within each family.
* Steps, applied in turn to whatever duplicates remain:
*   1. drop pure duplicates;
*   2. fall back to cert_date as the start date;
*   3. prefer the agreement(s) with a non-missing AAWI;
*   4. prefer the largest agreement by employees_combined;
*   5. average AAWI over the remaining ties and keep one record.
* All grouping is by (family_id, start_date), i.e. by duplicate group, so a
* family with more than one same-day clash has each clash resolved separately.

* Remember the incoming row order. egen and bysort may leave tied observations
* in arbitrary order, so wherever the first record of a group is kept we sort
* on this variable first to make the choice reproducible.
tempvar row_order
gen long `row_order' = _n

duplicates tag family_id start_date if family_id!=., gen(dup)

count if dup!=0 & dup!=.

* drop pure duplicates

sort family_id start_date `row_order'
duplicates drop family_id aawi state_code employees_combined firs_incr cert_date comm_date last_incr expi_date term_date if dup!=0 & family_id!=., force

* for non-pure duplicates, replace start_date with cert_date

replace start_date = cert_date if dup!=0 & family_id!=.

drop dup

duplicates tag family_id start_date if family_id!=., gen(dup)

* some of the remaining cases can be handled by dropping the EBA in the pair with missing AAWI (or keeping only the first agreement
* in the pair if both are missing AAWI)

* aawi_nm flags duplicate records with a non-missing AAWI; aawi_nm_max is 1
* when at least one record in the duplicate group has one, missing otherwise.
gen aawi_nm = 1 if aawi!=. & dup!=0 & family_id!=.

bysort family_id start_date: egen aawi_nm_max = max(aawi_nm) if dup!=0 & family_id!=.

drop if aawi_nm==. & aawi_nm_max==1

sort family_id start_date `row_order'
duplicates drop family_id start_date if dup!=0 & family_id!=. & aawi_nm_max==., force

drop dup

duplicates tag family_id start_date if family_id!=., gen(dup)

* In other cases, use the largest agreement in each non-unique pair

bysort family_id start_date: egen emp_max = max(employees_combined) if dup!=0 & family_id!=.

* A missing employees_combined compares as larger than any number in Stata,
* so records with missing employment are not dropped here.
drop if (employees_combined < emp_max) & dup!=0 & family_id!=.

drop dup emp_max

duplicates tag family_id start_date if family_id!=., gen(dup)

* In final few cases, replace AAWI with the average and drop all but the first occurrence

bysort family_id start_date: egen aawi_av = mean(aawi) if dup!=0 & family_id!=.

replace aawi = aawi_av if dup!=0 & family_id!=.

sort family_id start_date `row_order'
duplicates drop family_id start_date if dup!=0 & family_id!=., force

* Drop the helper variables by name rather than by a position-dependent range.
drop dup aawi_nm aawi_nm_max aawi_av `row_order'

/*********************************************************************************************
   Set up panel
 *******************************************************************************************/ 
  
* Purpose: build a panel identifier (id) and a within-panel sequence number
* (fam_count), declare the data as a panel with xtset, and save the result.

* Families with a single agreement are not panel members: blank out their
* family_id so they are treated like agreements that never had one.
bysort family_id: egen family_id_count = count(family_id) if family_id!=.

replace family_id = . if family_id_count==1

* create a new family_id variable that has IDs for non-panel members too

* One sequential id per eba_id among agreements without a family. Stored as
* long because the default float cannot hold integers above 16,777,216 exactly.
egen long id1 = group(eba_id) if family_id==.

* Offset the generated ids so they cannot collide with a real family_id. The
* offset is 300000 unless some family_id is larger, in which case the largest
* family_id is used. max() ignores a missing r(max) when no families remain.
summarize family_id, meanonly
local id_offset = max(300000, r(max))

gen double family_id2 = family_id
replace family_id2 = id1 + `id_offset' if family_id==.

drop id1

* Compact identifier 1..N over family_id2, used as the panel variable.
egen long id = group(family_id2)

* Number the agreements within each family in start_date order; eba_id breaks
* any remaining ties so the numbering is reproducible from run to run.
bysort family_id2 (start_date eba_id): gen fam_count = _n if family_id2!=.

xtset id fam_count

xtdes

save wad_extract_clean_panel_firm_setup, replace

clear

* end of do file
