/**************************************************************/
/*  Program created by James Bishop during RBA inposting in  **/
/*  April 2016. It creates a STATA database of the WPI micro **/
/*  data.                                                    **/

/*  NB: Run extract_data.sas first                           **/
/**************************************************************/

* Start from an empty session and switch off the -more- pager
clear all
set more off

* Working directory holding the job-level input and output files
cd "\\sasnasprd\SASData\W386\RBA\dta_files\jobdata"

/**************************************************************/
* Stack the four separate files (Mar, Jun, Sep, Dec) into one master
* job-level file, ordered by job identifier and survey quarter
use jobdata_Mar_temp, clear
append using jobdata_Jun_temp jobdata_Sep_temp jobdata_Dec_temp

sort unitid surveyq

save jobdata_master, replace

/**************************************************************/

use jobdata_master, clear

* Bring in the edited variables m_changeei and m_temp_dead from mujaa90_master.
* The using file sits in a different directory, so change there for the merge
* and change back afterwards.

cd "\\sasnasprd\SASData\W386\RBA\dta_files\mujaa90"
merge 1:1 merge_id surveyq using mujaa90_master, keepusing(m_changeei m_temp_dead)
drop _merge
cd "\\sasnasprd\SASData\W386\RBA\dta_files\jobdata"

* Clean data

* Numeric ID (one value per unitid) so the data can be declared a panel
* NB: merge on unitid, not id

egen id = group(unitid)
sort id surveyq

xtset id surveyq

* No. quarters since the job's previous survey record (missing on its first record)
by id: gen surveyq_dif = surveyq - surveyq[_n-1]

* No. quarters the job has been in the sample (running count of its records)
sort unitid surveyq
by unitid: gen jbcount = _n

* Quarter first selected in sample, built from the year and quarter components
gen surveystsmp = yq(yr1stsmp, qt1stsmp)
format surveystsmp %tq
drop yr1stsmp qt1stsmp

* Dummy=1 if job occupied (JB2): recode Y/N text to 1/0, then convert to numeric
replace  joboccpd = "1" if inlist(joboccpd, "Y", "y")
replace  joboccpd = "0" if inlist(joboccpd, "N", "n")
destring joboccpd, replace

* Dummy=1 if job has same employee as previous quarter (JB2)
* Left missing when changeei is neither Y nor N
gen  same_emp = cond(inlist(changeei, "Y", "y"), 1, cond(inlist(changeei, "N", "n"), 0, .))
drop changeei

* Dummy=1 if employee paid at the same grade/class/level/age scale as previous quarter (JB2)
* 'Not applicable' responses (X) are counted as 'yes'
gen  same_lev = cond(inlist(lvlchang, "Y", "y", "X", "x"), 1, cond(inlist(lvlchang, "N", "n"), 0, .))
drop lvlchang

* Dummy=1 if job casual
* The zero rule takes precedence: casload of N, or a non-blank ftptflg other than C,
* gives 0 even when the other variable indicates casual
gen  casual = cond(casload=="N" | !inlist(ftptflg, "C", ""), 0, cond(casload=="Y" | ftptflg=="C", 1, .))
drop casload

format psi wkstdhrs wkstdmin ordhrs ordmin vhrs vmins anzsco asco4 %10.0g

/**************************************************************
 Identifiers
 
 In jobdata 'unitid' is a 19 character ID for each job. The first 16 characters denote the 'firm' (or the payroll 
 split within a given firm in a given state). The last 3 characters denote a job within the 'firm', which is 
 allocated by the WPI analyst. 

 Furthermore:
 - The first 10 characters (e.g. MU01324171) are the primary identifying number from the sample frame e.g. Kmart
 - The next character (position 11) denotes the state, e.g. Kmart in NSW. With no split, characters 11-16 
   read 100000 for NSW.
 - The following five characters (positions 12-16) indicate cases where a 'split' reporting unit has been created 
   (when the WPI team have identified a business that has very distinct payrolls), e.g. 00001 or 00002. 
   If no split, then 00000.
 
 The primary identifying number comes from the ABS's sampling frame, so if a unitid appears at different dates
 (e.g. MU01681869100000283 was present 2001q3 to 2004q2 and then 2010q4 to 2015q4) it is not necessarily the same 
 job, but will be the same business entity. Each later spell is therefore given a new job code by appending a 
 letter to the 3-character job code: B for the first recycle, C for the second, and so on. A new spell starts 
 whenever the gap to the job's previous record exceeds one quarter.
 
 In empdata 'unitid' is the first 16 characters of 'unitid' in jobdata. 

 **************************************************************/
gen firmid = substr(unitid,1,16)
gen jobid  = substr(unitid,-3,.)

gen firmid_primr = substr(unitid,1 ,10) 
gen firmid_state = substr(unitid,11, 1) 
* The split code is five characters long: 10 (primary) + 1 (state) + 5 (split) = 16 = length of firmid.
* Taking only four characters would drop the last digit, which is the one that distinguishes 00001 from 00002.
gen firmid_split = substr(unitid,12, 5) 

* Unit type, taken from the presence of "MU" or "AT" anywhere in unitid (AT wins if both occur)
gen unit_type=""
replace unit_type="MU" if strpos(unitid, "MU")
replace unit_type="AT" if strpos(unitid, "AT")

* Flag=1 when a record follows a gap of more than one quarter, 0 on the first record
* of a job or when the record directly follows the previous quarter
gen     surveyq_difflg = 1 if surveyq_dif>1 & surveyq_dif!=.
replace surveyq_difflg = 0 if jbcount==1    | surveyq_dif==1

sort id surveyq

* Spell counter within unitid: 1 on the first record, incremented at each gap.
* Filled one record position at a time because each value depends on the previous row.
gen       recycle = 1 if jbcount==1
summarize jbcount
forvalues b = 2(1)`r(max)' {
replace   recycle = recycle[_n-1] + surveyq_difflg if jbcount==`b'
}
*
tab recycle

* Append a letter to the job code for every spell after the first: char(66) is "B", so
* recycle 2 -> B, 3 -> C, ... 26 -> Z. Spells 2-5 get exactly the letters used previously;
* later spells are no longer left unsuffixed (which would merge them with the first spell).
replace jobid = jobid + char(64 + recycle) if recycle>=2 & recycle<=26

replace unitid = firmid+jobid

* Rebuild the panel identifier and the derived counters on the revised unitid
drop  id surveyq_dif jbcount recycle surveyq_difflg

egen  id = group(unitid)
sort  id surveyq

xtset id surveyq

* No. quarters since the previous record of the (revised) job
by id: gen surveyq_dif=surveyq-surveyq[_n-1]

* Running count of records within the (revised) job
sort unitid surveyq
by   unitid:  gen jbcount = _n

/**************************************************************
 Employee ID
 
 The job ID will not change when a new employee comes into a job role. The only time a job ID is changed 
 is when the analyst determines the duties of a new employee have changed significantly from those of the 
 previous person in the job.   
 
 Want to identify cases where the employee has changed and create a unique 'employeeid' (an additional 
 3 characters appended to the end of unitid).
 
 Two situations in which we can identify a change of employee:
 
        1: The change occurs in between two consecutive quarterly surveys. Use firms' report of whether 
           the job has the same employee (changeei, recoded to same_emp). If this was not reported, 
           assume no change in employee.
           
        2: The job transitions from active at t, to 'temporarily dead' (defnilqt=T) at t+1 and then back 
           to active at t+2. Exclude cases of temporarily dead where it is clear that the employee is not 
           being replaced e.g. they are simply on leave and are due to return. 
           
           A gap of more than one quarter between two records is also treated as a new employee.
           
 A report of 'same employee' (same_emp==1) overrides the temporarily-dead and gap rules.
 
 **************************************************************/

* Lower-case copy of the nil-quarter comment so each keyword is tested against one variable
tempvar cmt
gen `cmt' = lower(dfnilcmt)

* Temporarily dead, unless the comment suggests the same employee is coming back
gen     temp_dead = 1 if defnilqt=="T"
replace temp_dead = . if strpos(`cmt', "leave")          | strpos(`cmt', "did not work") | strpos(`cmt', "did not wrk") | ///
                         strpos(`cmt', "still employed") | strpos(`cmt', "didnt work")   | strpos(`cmt', "didn't work") | ///
                         strpos(`cmt', "return")         | strpos(`cmt', "no hours")     | strpos(`cmt', "holiday")     | ///
                         strpos(`cmt', "lwop")           | strpos(`cmt', "lsl")          | strpos(`cmt', "no hrs")      | ///
                         strpos(`cmt', "not required")
drop `cmt'

* Dummy=1 if a new employee is assumed on this record: never when same_emp==1,
* otherwise when any of the change indicators is on; 0 in all remaining cases
gen diff_emp = same_emp!=1 & ///
               (same_emp==0 | temp_dead==1 | m_temp_dead==1 | (surveyq_dif>=2 & surveyq_dif!=.))

sort id surveyq

* Employee counter within job: 1 on the first record, +1 at each assumed change.
* Filled one record position at a time because each value depends on the previous row.
gen       employee = 1 if jbcount==1
summarize jbcount
forvalues pos = 2(1)`r(max)' {
    replace employee = employee[_n-1] + diff_emp if jbcount==`pos'
}

* Three-character, zero-padded text version of the counter
tostring employee, gen(employee_string)
replace  employee_string = "00" + employee_string if inrange(employee, 1, 9)
replace  employee_string = "0"  + employee_string if inrange(employee, 10, 99)

gen employeeid = unitid + employee_string

drop temp_dead diff_emp employee employee_string

/**************************************************************
 Hours worked 
 
 Build a pay frequency divisor giving the number of weeks per pay period. It is used to put 
 ordinary hours on a weekly basis, because the data record them per pay period.
 
 **************************************************************/

* Pay frequency divisor (weeks per pay period)
* The rules are applied for payfreq first and then ordgefq, so a later match overwrites an earlier one.
* Code "O" (other) is resolved by keyword searches in the matching free-text variable (payfreqo / ordgefqo).
gen pay_divisor = .
foreach f in payfreq ordgefq {

    tempvar txt
    gen `txt' = lower(`f'o)

    replace pay_divisor = 1      if `f'=="W"
    replace pay_divisor = 2      if `f'=="F"
    replace pay_divisor = 4.333  if `f'=="M"

    foreach kw in "4 weekly" "four weekly" "mid month" "mid-monthly" "4 w" "4w" "every 28 days" {
        replace pay_divisor = 4 if `f'=="O" & strpos(`txt', "`kw'")
    }

    foreach kw in "twice monthly" "twice per month" "twice a month" "half-monthly"  ///
                  "half monthly"  "semi-monthly"    "1/2 monthly"   "semimonthly"   ///
                  "half month"    "twice-monthly"   "twice month"   "semi monthly"  ///
                  "twice mthly"   "bi-mthly"        "bi-monthly"    "bi monthly"    ///
                  "bi - monthly"  "b-monthly"       "bimonthly"     "15th & 30th mthly" {
        replace pay_divisor = 2.1665 if `f'=="O" & strpos(`txt', "`kw'")
    }

    replace pay_divisor = 2 if `f'=="O" & strpos(`txt', "2 weeks")

    foreach kw in "quarterly" "qrtly" "qtly" {
        replace pay_divisor = 12.999 if `f'=="O" & strpos(`txt', "`kw'")
    }

    replace pay_divisor = 1/7 if `f'=="O" & strpos(`txt', "daily")

    drop `txt'
}

* List the free-text responses that no rule could classify
tab payfreqo if pay_divisor==.
tab ordgefqo if pay_divisor==.
drop payfreq ordgefq payfreqo ordgefqo 

* Weekly standard hours: fold the minutes into the hours variable
replace wkstdhrs = wkstdhrs + (wkstdmin/60)

* Weekly ordinary hours: hours plus minutes per pay period, then divided by weeks per pay period
gen     wkordhrs = ordhrs + (ordmin/60)
replace wkordhrs = wkordhrs/pay_divisor  



/**************************************************************
 Hourly wage 
   
  Prior to 2004q2 the survey only asked for ordinary time gross earnings for the pay period (ordge), with no
  option to report an hourly wage. To calculate an hourly wage based on weekly standard hours, we need to convert 
  ordge to a weekly basis using the pay frequency divisor. However, to calculate an hourly wage based on ordinary
  hours (our preferred approach), this is not necessary given that ordinary hours are also in terms of the pay 
  period. 
   
  From 2004q2, the survey asked for either an hourly wage (payhour) or an annual salary (payann). For 95% of award 
  jobs, an hourly rate is reported.
  
  Questions on allowances (othalw1-3) and shift/penalty payments (shfalw) asked 1998q2 to 2004q2.
  
  Both measures are filled by the same cascade, each step applying only where the measure is still missing:
    1. reported hourly rate (payhour), if non-missing and non-zero
    2. annual salary converted to an hourly rate (payann/52 divided by weekly hours)
    3. pay-period earnings over pay-period ordinary hours (ordge/ordhrs)
  
 **************************************************************/

* Weekly ordinary time gross earnings (pre 2004q2)
gen     wkordge=ordge/pay_divisor

* Preferred measure of hourly wage
* 1- Post 2004q2:
gen     hrwage = .
replace hrwage =  payhour             if (payhour !=. & payhour!=0)
replace hrwage = (payann/52)/wkordhrs if (hrwage  ==. & payann !=0)
* 2-  Pre 2004q2:
replace hrwage =  ordge/ordhrs        if (hrwage  ==. & ordge  !=0)
   
* Alternative measure of hourly wages that uses standard hours to convert annual salary after 2004q2
* The fallbacks must test hrwage_std itself. Testing hrwage (already filled above) would stop the
* standard-hours conversion from ever applying to jobs whose hrwage came from the annual salary.
gen     hrwage_std = .
replace hrwage_std =  payhour             if (payhour    !=. & payhour!=0)
replace hrwage_std = (payann/52)/wkstdhrs if (hrwage_std ==. & payann !=0)
replace hrwage_std =  ordge/ordhrs        if (hrwage_std ==. & ordge  !=0)
 
* Flag for hourly pay or annual salary
* BOTH is applied last, so it overrides HOUR whenever payhour and payann are both reported
gen     pay_report="HOUR" if payhour!=. & (payann==. | ordge==.)
replace pay_report="ANNL" if payhour==. & (payann!=. | ordge!=.)
replace pay_report="BOTH" if payhour!=. &  payann!=. 

tab year pay_report
 
/**************************************************************
 Hourly wage: base wage
   
  hrwage is the 'loaded' wage, rather than the 'base' wage that would be in the award. It includes:
   1. casual loading
   2. public holiday and weekend penalties and shift allowances/penalties 
   3. taxable allowances
  
  Absent further information, assume casual loading = 20%. Explicit questions on (2) and (3) were asked 
  between 1998q2 to 2004q2. 
  
  Absent further information on how the casual loading and (2) and (3) interact, assume that both the
  casual loading and penalties are calculated on the base rate:
  
    base rate + (base rate x casual loading) + (base rate x penalty)
  
  This is the FWC's 'default method': https://www.fairwork.gov.au/about-us/news-and-media-releases/
  newsletter/august-2014/understanding-casual-penalty-rates  
  
  If data on (2) and (3) are missing, assume zero.
 **************************************************************/ 
 
* Taxable allowances, weekly (1998q2 to 2004q2)
* Each allowance is divided by the number of weeks implied by its own frequency code
forvalues k = 1/3 {
    gen othalw`k'_wk = othalw`k' if othalw`k'f=="W"
    foreach pair in "F 2" "M 4.333" "Q 12.999" "A 52" "B 26" {
        local code  : word 1 of `pair'
        local weeks : word 2 of `pair'
        replace othalw`k'_wk = othalw`k'/`weeks' if othalw`k'f=="`code'"
    }
    replace othalw`k'_wk = 0 if othalw`k' == 0
    * AD and O stand for ad hoc and other, respectively. Set pay frequency to base pay frequency. 
    replace othalw`k'_wk = othalw`k'/pay_divisor if inlist(othalw`k'f, "AD", "O")
}
*
* Sum of the three allowances; stays missing only when all three are missing
egen othalw_wk = rowtotal(othalw1_wk othalw2_wk othalw3_wk), missing

* Taxable allowances, pay period (1998q2 to 2004q2)
* Missing values inside the question window are set to zero when earnings were reported
gen     othalw_pp = othalw_wk*pay_divisor
replace othalw_pp = 0 if othalw_pp==. & inrange(surveyq, tq(1998q2), tq(2004q2)) & ordge!=.
 
* Shift/penalty payments, pay period (1998q2 to 2004q2): same zero-fill rule
replace shfalw = 0    if shfalw==.    & inrange(surveyq, tq(1998q2), tq(2004q2)) & ordge!=.

* Base wage: loaded wage less (1), (2) and (3)
* NOTE(review): the 20% loading is removed from every job here, not only those with casual==1 - confirm intended
local casual_loading 0.2
gen ordge_base  = (ordge - othalw_pp - shfalw)/(1+`casual_loading') 
gen hrwage_base = ordge_base/ordhrs   
 

drop payhour payann  ordhrs ordmin wkstdmin
* ordge wkordge

/**************************************************************
 Award-wage jobs
 
1- Pay setting mechanism (PMI)
 
 Generate a dummy for award-based pay using pmekflg. Data only collected systematically from
 2002q1, so backcast prior to this. 
 
 Method 1: If the PMI for a given job was award in 2002-2003 then assume it was also award prior 
 to this. Assumes that the job did not transition onto an award prior to 2002q1.
 
 Method 2: Use string searches to identify award-pay jobs prior to 2002q1. 
 
2- Pay treatment indicator (PTI)

 PTI identifies jobs that are 'award-influenced', including over-award and EBAs mechanically 
 linked to awards. 
 
 'pti' available from 2004q3
 'chg' available from 1998q2
 '_ps' available from 2010q2
 
***************************************************************/
* PMI
* Strip the trailing N or P from the two-letter codes
foreach z in AW CA IA OA OT SA {
    replace pmekflg = "`z'" if inlist(pmekflg, "`z'N", "`z'P")
}
* Unclassifiable
replace pmekflg = "UNC" if inlist(pmekflg, "XX", "XXN", "YY", "ZZ")

* Collapse the detailed codes into five groups; codes not listed leave pmi blank
gen     pmi = ""
replace pmi = "AWD" if inlist(pmekflg, "AW", "PS", "MA", "TA")
replace pmi = "EBA" if pmekflg=="CA"
replace pmi = "IND" if inlist(pmekflg, "IA", "OA", "SA", "IF", "RE")
replace pmi = "OTH" if pmekflg=="OT"
replace pmi = "UNC" if pmekflg=="UNC"

* Method 1: Dummy=1 if Award, backcast prior to 2002

gen awardm1 = 1 if pmi=="AWD"

* On each job's 2002q1 record, flag award pay if that record or any of the job's next
* seven records is award. 
* NOTE(review): the look-ahead is by record position, so it lines up with 2002q2-2003q4 
* only for jobs surveyed in every one of those quarters - confirm this is acceptable.
sort unitid surveyq
gen awd_2002 = 1 if pmi=="AWD" & surveyq==tq(2002q1)
forvalues ahead = 1/7 {
    by unitid: replace awd_2002 = 1 if awd_2002==. & surveyq==tq(2002q1) & pmi[_n+`ahead']=="AWD"
}

* Spread the flag to every record of the job and apply it to the pre-2002q1 quarters
bysort  unitid: egen awd_2002_all = max(awd_2002)
replace awardm1 = 1 if surveyq<tq(2002q1) & awd_2002_all==1

drop awd_2002 awd_2002_all

tab surveyq awardm1

replace awardm1 = 0 if awardm1==.

* Method 2: Dummy=1 if Award, use string searches prior to 2002

gen awardm2 = 1 if pmi=="AWD"

* The using file supplies award_pre2002 (used here); eba_pre2002 is used further below
merge 1:1 merge_id surveyq using award_pre2002
drop _merge

replace awardm2 = 1 if surveyq<tq(2002q1) & award_pre2002==1
replace awardm2 = 0 if awardm2==.

drop award_pre2002

tab surveyq awardm2

/**************************************************************
 Identify the DDD control group of EBA jobs
 
 The full sample of EBA jobs that have never received a pay change in 
 response to an AIRC/FWC decision.
 
 * note that the pre-2002 string-based data are initially below the level 
   of the post-2002 PMI-based data because they already implicitly screen
   out many EBAs that are also adjusted according to awards (since string
   searches exclude cases where the word 'award' is also in the string). 
 
***************************************************************/

* Dummy=1 if EBA: PMI group is EBA, or (before 2002q1) the string-based flag is on
gen ebam2 = pmi=="EBA" | (surveyq<tq(2002q1) & eba_pre2002==1)

* Dummy=1 if job ever received a pay change due to AIRC/FWC decision (including overaward)

gen award_chg = .
foreach reason of varlist chgpsafe chgpfair chgpstat chgpfairwork chgpmodaward chgpoawd {
    replace award_chg = 1 if `reason'=="Y"
}
* chgpfairwork is additionally accepted with a lower-case x
replace award_chg = 1 if chgpfairwork=="x"

* Spread the flag to every record of the job; jobs with no such change get 0
bysort  unitid: egen award_chg_all = max(award_chg)
replace award_chg_all = 0 if award_chg_all==.

* Control group: EBA jobs with no award-related pay change in any quarter
gen eba_ddd = cond(award_chg_all==1, 0, ebam2)

drop award_chg award_chg_all eba_pre2002

tab ebam2 eba_ddd

/**************************************************************
 Junior, apprentice and trainee
 
 Identified using text searches within strings e.g. offcmt and rspcmt
 
 Create i_apprentice, j_apprentice etc to indicate whether the employee/job was 'ever' an apprentice. 
 For example, strings may only mention 'apprentice' when a wage change is observed.
 
***************************************************************/
* The using file supplies the apprentice, trainee and junior dummies.
* _merge is not dropped at this point.
merge 1:1 merge_id surveyq using dummy_strings

* Job-level 'ever' indicators: maximum of each dummy over all records of the job
foreach flag in apprentice trainee junior {
    bysort unitid: egen j_`flag' = max(`flag')
}

* Employee-level 'ever' indicators: maximum over all records of the employee
foreach flag in apprentice trainee junior {
    bysort employeeid: egen i_`flag' = max(`flag')
}

drop apprentice trainee junior
sort unitid surveyq

/**************************************************************
 Occupation of job
 
 asco4 used exclusively between 1998q2 and 2007q2, and was continued for all firms until
 2010q1. 
 
 anzsco was first introduced in 2007q3, with data available for 1/2 of all firms. From 
 2007q4 onwards, anzsco is available for all firms.
 
 In ASCO, occupations are classified according to two main criteria: skill level
 and skill specialisation. 
 
 The skill level of an occupation is defined as a function of the range and
 complexity of the set of tasks involved: the greater the range and
 complexity of the set of tasks, the greater the skill level of the occupation.
***************************************************************/
* Text copy of the 4-digit code, from which the leading digits are cut
tostring asco4, gen(asco4str)

* asco1, asco2, asco3: first one, two and three digits of asco4 (missing when asco4 is missing)
forvalues digits = 1/3 {
    gen asco`digits' = substr(asco4str,1,`digits') if asco4str!="."
    destring asco`digits', replace
}
*

* Variables retained in the final file, in their final column order
local finalvars id unitid surveyq jbcount same_emp same_lev casual wkstdhrs wkordhrs hrwage hrwage_std ///
        hrwage_base pay_divisor pay_report pmi awardm1 awardm2 ebam2 eba_ddd pmekflg pmekcmt defnilqt asco* anzsco ///
        i_apprentice j_apprentice i_trainee j_trainee i_junior j_junior ///
        quarter year firmid jobid employeeid firmid_primr firmid_state firmid_split vpayflg unit_type merge_id

keep  `finalvars'
order `finalvars'
		
		
* Give all job-level variables a j_ prefix
* surveyq and merge_id are left unprefixed
quietly ds surveyq merge_id, not
foreach v in `r(varlist)' {
    rename `v' j_`v'
}
*
 
save jobdata_master_final, replace

* end of do file
