/*******************************************************************************

	4_firm_level.do
	
	This file contains code to extract and clean firm-level data from Dun &
	Bradstreet for the robustness tests in Section 4.4 of Bishop J and
	I Chan (2019), Is Declining Union Membership Contributing to Low Wages
	Growth?, RBA Research Discussion Paper No 2019-02.
	
*******************************************************************************/

* Start from an empty session and stop output from pausing at -more- prompts.
clear all
set more off

* Root of the 'data' folder; the relative file names used further down
* (use, merge and save) are resolved against this working directory.
local dir "<path for 'data' folder here>"
cd "`dir'"

/*********************************************************************************************
 D&B data
 
 The data cleaning that follows is from code used for Rose Kenney, Gianni La Cava and David
 Rodgers, 'Why do Companies Fail?', RBA Research Discussion Paper No 2016-09.
*********************************************************************************************/

use "<path for Dun & Bradstreet data here>", clear

* BALANCE SHEET ITEMS

* Total liabilities, taken from the first source that is available:
*   1. current + non-current liabilities
*   2. (total liabilities and equity) - total shareholder equity
*   3. current liabilities on their own
*   4. non-current liabilities on their own
* Whatever is still missing, or comes out negative, is set to zero.

generate double TotalLiabilities = current_liabilities + noncurrent_liabilities

replace TotalLiabilities = total_liabilities__equity - totalshareholderequity ///
    if TotalLiabilities == . & total_liabilities__equity != . & totalshareholderequity != .

foreach part in current_liabilities noncurrent_liabilities {
    replace TotalLiabilities = `part' if TotalLiabilities == . & `part' != .
}

replace TotalLiabilities = 0 if TotalLiabilities == . | TotalLiabilities < 0

* Total assets
*
* Fill total_assets only where it is missing or zero, trying each source in
* turn: reported liabilities and equity, profit combined with return on
* assets, derived liabilities plus shareholder equity, and finally current
* assets.
*
* In Stata the & operator is evaluated before the | operator, so the
* missing-or-zero test is parenthesised. Without the parentheses the
* source-availability checks guard only the "missing" branch and a zero
* total_assets is overwritten even when the source itself is missing. The
* last fallback still applies to every missing or zero value, so the values
* left at the end of this section are the same as before.
*
* NOTE(review): the second rule multiplies profit by return on assets. If
* return_on_assets is a profit-to-assets ratio, backing out assets would need
* a division instead. Left unchanged here; confirm against the D&B variable
* definition before altering it.

replace total_assets = total_liabilities__equity ///
    if total_liabilities__equity != . & (total_assets == . | total_assets == 0)

replace total_assets = profit_before_tax * return_on_assets ///
    if profit_before_tax != . & return_on_assets != . & (total_assets == . | total_assets == 0)

replace total_assets = TotalLiabilities + totalshareholderequity ///
    if TotalLiabilities != . & totalshareholderequity != . & (total_assets == . | total_assets == 0)

replace total_assets = total_current_assets if total_assets == . | total_assets == 0

* Debt

* Components that are not reported are treated as zero before being combined.
foreach item in current_liabilities noncurrent_liabilities cash_at_bank deposits_shortterm provisions {
    replace `item' = 0 if `item' == .
}

* Non-current liabilities net of provisions, plus trade creditors. When
* provisions are at least as large as non-current liabilities, only trade
* creditors are counted.
generate double debt = cond(noncurrent_liabilities <= provisions, ///
    current_trade_creditors, ///
    noncurrent_liabilities - provisions + current_trade_creditors)
replace debt = 0 if debt < 0 | debt == .

* Debt is floored at trade creditors. A missing current_trade_creditors
* compares as larger than any number, so this step can put missing values
* back into debt; they are zeroed again immediately afterwards.
replace debt = current_trade_creditors if debt < current_trade_creditors
replace debt = 0 if debt == .

rename total_assets         assets
rename total_liabilities    liabilities
rename current_liabilities  curliabilities
rename total_current_assets curassets

* cash_at_bank was zero-filled above, so the first replace cannot change
* anything. NOTE(review): deposits presumably resolves to deposits_shortterm
* through variable abbreviation, in which case the second replace is also a
* no-op; confirm against the source data.
replace cash_at_bank = 0 if cash_at_bank == . & deposits ~= .
replace deposits     = 0 if deposits == . & cash_at_bank ~= .
generate double cash = cash_at_bank + deposits

* P&L ITEMS
rename current_sales sales

* Borrowing costs and depreciation have their sign flipped.
generate double intexp = -borrowing_costs
generate double deprec = -depreciation_and_amortisation_ex

* EBIT backed out from return on assets; the reported profit is kept as ebit2.
generate double ebit = assets * return_on_assets
rename current_profit ebit2

* Property, plant and equipment: non-positive values are treated as missing.
generate double ppe = property_plant_and_equip if property_plant_and_equip > 0

* Computed before the rescaling below, so it stays in the source units.
generate double ppe_peremp = ppe / employees

* Convert to millions
* NOTE(review): deprec is not in this list although intexp is, so deprec
* stays in the source units; confirm that this is intended.
foreach v of varlist assets liabilities cash curliab curasset debt ebit ebit2 sales intexp ppe {
    replace `v' = `v' / 1000000
}
*
* Flag records whose assets, cash, sales or debt are exactly equal to those of
* the preceding record, where that record is in the same year and belongs to a
* different dbid. Records are ordered by year and then by descending assets.
gsort year -assets

local k = 0
foreach v in assets cash sales {
    local ++k
    generate same`k' = (`v' == `v'[_n-1] & year == year[_n-1] & dbid ~= dbid[_n-1])
    * two zeros next to each other are not counted as a match
    replace  same`k' = 0 if `v' == 0
}

* NOTE(review): unlike the three flags above, the debt flag is not reset when
* debt is zero; confirm that this is intended.
generate same4 = (debt == debt[_n-1] & year == year[_n-1] & dbid ~= dbid[_n-1])

generate same_sum = same1 + same2 + same3 + same4
*edit year assets sales debt cash dbid CompanyName same*

* Largest number of matching items found on any record of the firm.
bysort dbid: egen copycat = max(same_sum)
tabulate copycat

*Drop companies that appear to be duplicates of other companies!
* drop if copycat == 4

* Variables carried forward. The list is written once so that keep and order
* always use exactly the same variables.
local firmvars dbid date ebit ebit2 debt assets cash intexp deprec sales acn ///
    CompanyName postcode State Import_Ind Export_Ind IM_PARENT_IND IM_State ///
    IM_Country ULT_PARENT_IND ULT_State ULT_Country employees ppe_peremp ppe copycat

keep  `firmvars'
order `firmvars'

* Drop firms with assets above 200 billion dollars (assets are in millions).
* Stata treats a missing value as larger than any number, so an unguarded
* "assets > 200000" also removes every record whose assets are missing.
* The missing() guard restricts the drop to firms that really exceed the
* threshold. To reproduce the earlier sample exactly, remove the guard.
drop if assets > 200000 & !missing(assets)

cd "`dir'"

* Map ACNs to ABNs using company registration data.
* Rows that exist only in acn_abn_map are discarded; every firm record is
* kept whether or not an ABN was found for it.
merge m:1 acn using acn_abn_map, keep(master match) nogenerate

order dbid date abn
drop  acn

compress
save db, replace

* Write three copies of the firm data (db_panel1 to db_panel3). The copies
* differ only in the db<i>_ prefix on the firm variables and in the name of
* the ABN key (abn1, abn2, abn3).
forvalues i = 1/3 {

    use db, clear

    * Prefix every variable with db<i>_ and lower-case the resulting name.
    foreach var of varlist _all {
        rename `var' `=lower("db`i'_`var'")'
    }

    * The identifier, date and ABN go back to their plain names.
    foreach key in dbid date abn {
        rename db`i'_`key' `key'
    }
    *

    sort dbid date

    * merging variable will be the middle of the relevant financial year
    generate date_merge = date - 182

    drop if abn == .

    * Where several records share an ABN and merge date, none of them is kept.
    bysort abn date_merge: drop if _N > 1

    rename abn abn`i'

    save db_panel`i', replace
    clear
}
*

/*********************************************************************************************
 Merge D&B data to EBA data
 
 There are up to 3 possible ABNs available for each agreement, i.e. the ABN for a given family can
 change over time
*********************************************************************************************/

cd "`dir'"

use wad_extract_clean_panel, clear

* Agreements are matched to firm records using their start date.
generate date_merge = start_date

* One pass per ABN slot. nearmrg is a user-written command; presumably it
* attaches the firm record whose date_merge is closest and stores the matched
* value in date_mid_fy<i> (confirm against the nearmrg help file). Firm
* records without an agreement are discarded.
forvalues i = 1/3 {
    nearmrg abn`i' using db_panel`i', nearvar(date_merge) genmatch(date_mid_fy`i')
    drop if _merge == 2
    drop _merge dbid date
    format date_mid_fy`i' %td
}
*
* Only keep firm data if the mid-point of the financial year is within 182
* days of the start_date of at least one agreement in the family.
forvalues i = 1/3 {
    * distance in days between the agreement start and the matched firm date
    generate date_dif`i' = abs(start_date - date_mid_fy`i')

    * smallest distance achieved by any agreement of the same family
    bysort family_id: egen date_dif`i'_min = min(date_dif`i')

    replace date_mid_fy`i' = . if date_dif`i'_min > 182
}
*
* A. Cases where a unique set of company data are available

* Number of ABN slots for which a firm record was matched.
egen db_count = rownonmiss(date_mid_fy*)
tabulate db_count

* db_use records which slot (1, 2 or 3) supplies the firm data.
generate db_use = .
forvalues i = 1/3 {
    replace db_use = `i' if db_count == 1 & date_mid_fy`i' != .
}

* B. Cases where more than one set of company data are available

*       1: use the company for which the largest set of observations are available

* For each slot, count the agreements in the family that have a firm record
* within 182 days of their start date.
forvalues i = 1/3 {
    generate db`i'_nm = 1 if date_mid_fy`i' != . & date_dif`i' <= 182
    bysort family_id: egen db`i'_nm_count = count(db`i'_nm)
}
*
* A slot is chosen only when its count is strictly larger than both others;
* ties are left for the rules that follow.
replace db_use = 1 if db_use == . & date_mid_fy1 != . ///
    & (db1_nm_count > db2_nm_count) & (db1_nm_count > db3_nm_count)
replace db_use = 2 if db_use == . & date_mid_fy2 != . ///
    & (db2_nm_count > db1_nm_count) & (db2_nm_count > db3_nm_count)
replace db_use = 3 if db_use == . & date_mid_fy3 != . ///
    & (db3_nm_count > db1_nm_count) & (db3_nm_count > db2_nm_count)

*       2: use the modal ABN

bysort family_id: egen abn_mode = mode(abn)

* Slots are tried in order, so the lowest-numbered matching slot is used.
forvalues i = 1/3 {
    replace db_use = `i' if db_use == . & abn`i' == abn_mode & date_mid_fy`i' != .
}

*       3: use the most recent ABN

* Latest start date in the family on which an ABN is recorded.
bysort family_id: egen lasttime = max(cond(abn != ., start_date, .))

* ABN recorded on that date, spread to every agreement of the family.
generate double abn_mostrecent = abn if lasttime == start_date
bysort family_id: egen double abn_mostrecent_max = max(abn_mostrecent)

* Slots are tried in order, so the lowest-numbered matching slot is used.
forvalues i = 1/3 {
    replace db_use = `i' if db_use == . & abn`i' == abn_mostrecent_max & date_mid_fy`i' != .
}


* Collapse the three slot-specific copies of each firm variable into a single
* db_<name> variable, taken from the slot selected in db_use and only where
* that slot's firm record lies within 182 days of the agreement start date.
local numvars ebit ebit2 debt assets cash intexp deprec sales postcode employees ppe_peremp ppe copycat
local strvars companyname state import_ind export_ind im_parent_ind im_state im_country ult_parent_ind ult_state ult_country

* Create the targets first: numeric ones as missing, string ones as empty.
foreach j of local numvars {
    generate double db_`j' = .
}
foreach j of local strvars {
    generate db_`j' = ""
}

* Then fill them slot by slot.
forvalues i = 1/3 {
    foreach j in `numvars' `strvars' {
        replace db_`j' = db`i'_`j' if db_use == `i' & date_dif`i' <= 182
    }
}
*

* Remove the slot-specific copies and the helper variables used for matching.
drop db1* db2* db3*
drop date_merge date_mid_fy* date_dif* db_count db_use
drop abn_mode lasttime abn_mostrecent abn_mostrecent_max abn1 abn2 abn3 abn_unique

* Remove agreement fields that are not carried into the output file.
drop states sourcefile replacedagreements

compress
sort family_id start_date
save wad_extract_clean_panel_firm, replace

clear

* end of do file
