/*******************************************************************************

	7_combined.do
	
	This program combines the wages and unemployment data into a single panel ('master').
	
	Separate classifications are developed for different ways of defining local labour markets.

	Last edited: 19 August 2021 

*******************************************************************************/	

clear

* Every dataset below is opened and saved by bare file name, so the working
* directory must be the output folder held in the global macro dir_out.
* Stop early if that global has not been defined, rather than silently
* reading and writing in whatever directory happens to be current.

if `"$dir_out"' == "" {
	display as error "global macro dir_out is not defined; set it before running 7_combined.do"
	exit 198
}

cd "$dir_out"

/*******************************************************************************
	1 - Loop over the different geographic classifications
		*See Section 8.1 of the RDP
*******************************************************************************/

foreach clus in c_pr980_11c c_pr980_16c c_ferco_11c c_ferpc_11c c_gccsa_16c c_state_16c c_sa3_16c c_sa4_16c {

* Two-digit census year taken from the classification name (11 or 16).
* It selects the population weight variable (pop2011 or pop2016) used in the collapse.

local census = substr("`clus'", -3, 2)

* Combine the datafiles for wages and unemployment
* (one row per SA2 and financial year)

use wages_historical_sa2_2016, clear

drop source_file*

merge 1:1 sa2_2016 finyear using wages_sa2_2016, keepusing(ws_no ws_val) nogen

merge 1:1 sa2_2016 finyear using salm_recent, nogen

rename (no_UN no_LF) (u lf)

* The unemployment rate is recomputed from u and lf after aggregation

drop urate

merge 1:1 sa2_2016 finyear using salm_historical_sa2_2016, nogen

* awd_share is merged on SA2 only, so its values repeat across years

merge m:1 sa2_2016         using awd_share, nogen

drop if sa2_2016==.

* Create state dummies (fraction of LLM in a given state)
* The state is the first digit of the SA2 code; after the weighted mean in the
* collapse below each dummy becomes the population share of the LLM in that state

tostring sa2_2016, gen(e)
gen state = substr(e,1,1)
tab state, gen(s_)

/*******************************************************************************
	2 - Collapse SA2 data to the chosen level of geography
*******************************************************************************/

merge m:1 sa2_2016 using cluster_preferred_combined_asgs2016, keep(match) nogen

drop if `clus'==.

* Counts are summed without weights (rawsum); state shares and awdreliant are
* population-weighted means. NOTE: the range ws_no_A-u_C relies on the variable
* order produced by the merges above, so do not reorder those merges.

collapse (rawsum) ws_no_A-u_C (mean) s_1-s_9 awdreliant [weight=pop20`census'], by(finyear `clus')

rename `clus' llm

sort llm finyear

* Recode zero totals to missing (presumably because a sum over all-missing
* inputs is reported as zero; confirm against the collapse documentation).
* The panel identifiers finyear and llm are excluded from the recode so that
* a region or year coded 0 keeps its identifier.

ds finyear llm s_1-s_9 awdreliant, not
foreach v of var `r(varlist)' {
replace `v'=. if `v'==0
}

* mainstate is the index of the state dummy whose population share exceeds one half;
* it stays missing when no single state holds a majority of the LLM

gen mainstate=.

forvalues i = 1(1)9 {
replace   mainstate=`i' if s_`i'>0.5
}


/*******************************************************************************
	3 - Calculate wage growth by year 
		*use most recently published data for any year
*******************************************************************************/

xtset llm finyear

* Growth in average wages (total value / number of earners), in per cent

gen ws_gr = (ws_val/ws_no)/(L1.ws_val/L1.ws_no) * 100-100

foreach i in A B C {
gen ws_gr`i' = (ws_value_`i'/ws_no_`i')/(L1.ws_value_`i'/L1.ws_no_`i') * 100-100
}

* Growth that bridges vintages: current year from B, previous year from C

gen ws_grBC  = (ws_value_B/ws_no_B)/(L1.ws_value_C/L1.ws_no_C) * 100-100

* Fill gaps in order of preference: current data, then A, B, the B/C bridge, and C

replace ws_gr = ws_grA  if ws_gr==.
replace ws_gr = ws_grB  if ws_gr==.
replace ws_gr = ws_grBC if ws_gr==.
replace ws_gr = ws_grC  if ws_gr==.

* Calculate wage and salary earners per year (not spliced)

replace        ws_no = ws_no_A if ws_no==.
replace        ws_no = ws_no_B if ws_no==.
replace        ws_no = ws_no_C if ws_no==.

/*******************************************************************************
	4 - Calculate unemployment rates  by year 
		*Use most recently published data for any year
		*Backcast the unemployment rate for the historical period by splicing 
		*Splicing on the first-differences in the urate (rather than % changes), which means it is possible to have a -ve urate. Set these to zero
		*Using the historical data for 2010 because the current vintage was based on an incomplete FY 2010
*******************************************************************************/

gen double urate_rece = u   / lf   * 100
gen double urate_hist = u_A / lf_A * 100
replace    urate_hist = u_B / lf_B * 100 if urate_hist==.
replace    urate_hist = u_C / lf_C * 100 if urate_hist==.

gen double urate =    urate_rece

* Work backwards one year at a time from 2010 to 1999: each year is set to the
* following year (already final) minus the change in the historical series.
* The order matters because each step reads the value written by the previous one.

forvalues i=0(1)11 {
local j = 2010-`i'
replace   urate = F1.urate - (F1.urate_hist-urate_hist) if finyear==`j'
}

replace urate=0 if urate<0

/*******************************************************************************
	5 - Construct other useful variables
*******************************************************************************/

* Drop variables not used in regression analysis 

keep finyear llm ws_no ws_gr urate s_* mainstate awdreliant

* Compute time-invariant region population counts (average over entire sample)

bysort llm: egen wgt = mean(ws_no)
replace          wgt = round(wgt)

* Compute lagged wages growth
* (re-sort first: the bysort above ordered the data by llm only)

sort  llm finyear

gen double L1_ws_gr = L1.ws_gr

* Unemployment rate * 100 (rounded) ... used later
* Left missing when the unemployment rate is above 20 or missing

gen urate_100 = round(urate*100) if urate<=20

/*******************************************************************************
	6 - Construct spline variables for the regression analysis
*******************************************************************************/

* Linear spline with single kink at 4% 

mkspline urate1 4.0 urate2 = urate, marginal 

* Linear spline with kinks at 4%, 5.5% and 7.5% 

mkspline u1 4 u2 5.5 u3 7.5 u4 = urate, marginal 

* Cubic spline, setting the knots at 4%, 5.5% and 7.5%

mkspline spline =urate, cubic knots(4 5.5 7.5) displayknots
				
* Save separate master file for each geographic classification

save master_`clus', replace

}

*end of do file 