This notebook explores Granger causality between sentiment indices derived from earnings calls and comparable indices from the RBA's Business Liaison and NAB's business surveys. The procedure used to test for Granger causality follows Toda, H.Y. and Yamamoto, T. (1995), "Statistical inference in vector autoregressions with possibly integrated processes", Journal of Econometrics, 66, 225–250. Specifically, the steps below are followed:
# Packages
from bs4 import BeautifulSoup
import requests
from bs4 import NavigableString
import queue
from concurrent.futures import ThreadPoolExecutor
import itertools
import threading
import csv
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import re
import urllib.request
import seaborn as sns
import statsmodels
import statsmodels.api
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.stats.stattools import durbin_watson
/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
# This is the data for testing only the March and September quarters.
# The path is handed straight to pandas so that read_excel opens and closes
# the workbook itself (no file handle is left open).
testing_mq_sq = pd.read_excel('granger_mq_sq_new.xlsx', sheet_name='levels')
testing_mq_sq = testing_mq_sq.drop(['date'], axis=1)
testing_mq_sq.set_index('date_dummy', inplace=True)
# This is the data for testing all quarters
testing = pd.read_excel('granger_new.xlsx', sheet_name='levels')
testing.set_index('date', inplace=True)
# Use this to drop some of the variables (the same columns are removed from both samples)
drop = ['prices_ml_f', 'inputcosts_ml_f', 'demand_ml_f', 'wages_ml_f', 'wpi_inflation', 'gdp_growth']
testing = testing.drop(drop, axis=1)
testing_mq_sq = testing_mq_sq.drop(drop, axis=1)
# Use this to change the sample window
#testing_mq_sq = testing_mq_sq[:-5]
#testing = testing[:-9]
def adfuller_test(series, signif=0.1, name='', verbose=False):
    """Run an Augmented Dickey-Fuller test on a series and record the verdict.

    Nothing is printed or returned: a one-line summary is appended to the
    module-level list ``outcome``, which must exist before this is called.

    Parameters
    ----------
    series : values handed to ``statsmodels.tsa.stattools.adfuller``
    signif : float, p-value threshold; a p-value above it is reported as
        non-stationary
    name : str, label used at the start of the summary line
    verbose : bool, accepted for backward compatibility; currently unused
    """
    # The number of lags in the test regression is chosen by AIC.
    adf_result = statsmodels.tsa.stattools.adfuller(series, autolag='AIC')
    # Element 1 of the result is the p-value; it is reported to 4 decimals.
    p_value = round(adf_result[1], 4)
    if p_value > signif:
        outcome.append(f"{name}, P-Value = {p_value} => non-stationary, m=1")
    else:
        outcome.append(f"{name}, P-Value = {p_value} => stationary, m=0")
def crosscorr(datax, datay, lag=0):
    """Lag-N cross correlation.

    ``datay`` is shifted by ``lag`` periods and then correlated with
    ``datax``; the missing values introduced by the shift are ignored by
    ``Series.corr``.

    Parameters
    ----------
    datax, datay : pandas.Series objects of equal length
    lag : int, default 0
        Number of periods ``datay`` is shifted before correlating.

    Returns
    ----------
    crosscorr : float
    """
    shifted = datay.shift(lag)
    return datax.corr(shifted)
# Every pair is tested in both directions: [first, second] is immediately
# followed by [second, first], grouped by topic (input costs, demand, wages,
# prices).
variables = [
    list(direction)
    for first, second in (
        ('nab_purchase', 'inputcosts_ml'),
        ('inputcosts_rd', 'inputcosts_ml'),
        ('ppi_inflation', 'inputcosts_ml'),
        ('forward_orders', 'demand_ml'),
        ('demand_rd', 'demand_ml'),
        ('dfd_growth', 'demand_ml'),
        ('nab_labour', 'wages_ml'),
        ('wages_rd', 'wages_ml'),
        ('coe_inflation', 'wages_ml'),
        ('nab_selling', 'prices_ml'),
        ('prices_rd', 'prices_ml'),
        ('cpi_inflation', 'prices_ml'),
    )
    for direction in ((first, second), (second, first))
]
# ADF test on the first difference of each column of the all-quarters data.
# `outcome` is the list adfuller_test appends its summary lines to.
outcome = []
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
for name, column in testing.items():
    adfuller_test(column.diff().dropna(), name=name)
outcome
['prices_d, P-Value = 0.0 => stationary, m=0', 'inputcosts_d, P-Value = 0.0 => stationary, m=0', 'demand_d, P-Value = 0.0133 => stationary, m=0', 'wages_d, P-Value = 0.0 => stationary, m=0', 'prices_ml, P-Value = 0.0023 => stationary, m=0', 'inputcosts_ml, P-Value = 0.0001 => stationary, m=0', 'demand_ml, P-Value = 0.0 => stationary, m=0', 'wages_ml, P-Value = 0.3975 => non-stationary, m=1', 'prices_rd, P-Value = 0.0015 => stationary, m=0', 'inputcosts_rd, P-Value = 0.0001 => stationary, m=0', 'demand_rd, P-Value = 0.0 => stationary, m=0', 'wages_rd, P-Value = 0.0007 => stationary, m=0', 'ppi_inflation, P-Value = 0.0 => stationary, m=0', 'cpi_inflation, P-Value = 0.0 => stationary, m=0', 'dfd_growth, P-Value = 0.0 => stationary, m=0', 'coe_inflation, P-Value = 0.0 => stationary, m=0', 'nab_labour, P-Value = 0.0005 => stationary, m=0', 'nab_purchase, P-Value = 0.0 => stationary, m=0', 'nab_selling, P-Value = 0.0025 => stationary, m=0', 'forward_orders, P-Value = 0.0 => stationary, m=0']
# Per-pair results, filled in the same order as `variables`
aic_lag = []
DW_test = []
granger = []
max_corr = []
max_corr_lag = []
correlations = []
for i in variables:
    # build the two-variable VAR once and reuse it for lag selection and fitting
    model = VAR(testing[i].asfreq(freq='Q'))
    # determine the optimal number of lags in the VAR (AIC, at most 8 lags)
    lag = model.select_order(maxlags=8).aic
    aic_lag.append(lag)
    # DW statistics (one per equation) to check that no autocorrelation is left in the residuals
    DW = durbin_watson(model.fit(lag).resid)
    DW_test.append(DW)
    # the information criterion may select 0 lags, but the Granger test and the
    # correlation scan below both need at least one
    test_lag = max(lag, 1)
    # examine granger causality (second column of the pair -> first column).
    # NOTE(review): the original comment said the test is run at the optimal
    # lag + 1 (as per Toda H.Y.; Yamamoto T. (1995)), but the call uses the
    # selected lag itself; that behaviour is kept here - confirm which is intended.
    test_result = grangercausalitytests(testing[i], maxlag=test_lag, verbose=False)
    p_value = test_result[test_lag][0]['ssr_chi2test'][1]
    if p_value <= 0.1:
        result = 'causation'
    else:
        result = 'non-causation'
    granger.append(result)
    # cross-correlations with the second series shifted by 0 .. test_lag - 1 quarters
    lagged_corr = [crosscorr(testing[i[0]], testing[i[1]], lag=v) for v in range(test_lag)]
    correlations.append(lagged_corr)
    # extract the maximum correlation (largest in absolute value, sign kept)
    maxi = max(lagged_corr, key=abs)
    max_corr.append(maxi)
    # extract the lag of the maximum correlation
    maxi_lag = lagged_corr.index(maxi)
    max_corr_lag.append(maxi_lag)
results = pd.DataFrame(variables, columns = ['Y','X'])
results['aic_lag'] = aic_lag
results['DW_test'] = DW_test
results['granger'] = granger
results['max_corr'] = max_corr
results['max_corr_lag'] = max_corr_lag
results['correlations'] = correlations
results
Y | X | aic_lag | DW_test | granger | max_corr | max_corr_lag | correlations | |
---|---|---|---|---|---|---|---|---|
0 | nab_purchase | inputcosts_ml | 3 | [1.9530205995476964, 1.9338980734784175] | causation | 0.798814 | 1 | [0.7961508233627325, 0.7988137377676833, 0.731... |
1 | inputcosts_ml | nab_purchase | 3 | [1.9338980734784184, 1.9530205995476968] | non-causation | 0.796151 | 0 | [0.7961508233627326, 0.7290960901915199, 0.620... |
2 | inputcosts_rd | inputcosts_ml | 2 | [2.0668916914508824, 2.0567664406003567] | causation | 0.867367 | 0 | [0.8673669988428903, 0.7909911292754309] |
3 | inputcosts_ml | inputcosts_rd | 2 | [2.0567664406003567, 2.0668916914508824] | causation | 0.886471 | 1 | [0.8673669988428903, 0.8864707906867787] |
4 | ppi_inflation | inputcosts_ml | 3 | [2.1207876032979858, 1.9767434209242634] | causation | 0.552886 | 0 | [0.5528862795939039, 0.5412919873877672, 0.440... |
5 | inputcosts_ml | ppi_inflation | 3 | [1.976743420924263, 2.120787603297986] | non-causation | 0.552886 | 0 | [0.5528862795939038, 0.48694409789907045, 0.53... |
6 | forward_orders | demand_ml | 2 | [2.076854501469768, 1.748666583626669] | causation | 0.644444 | 0 | [0.644443751200247, 0.47907779846417553] |
7 | demand_ml | forward_orders | 2 | [1.7486665836266695, 2.076854501469767] | causation | 0.644444 | 0 | [0.644443751200247, 0.4636860745150332] |
8 | demand_rd | demand_ml | 2 | [1.903514539984112, 1.7585354377077504] | causation | 0.682982 | 1 | [0.6503511286692799, 0.6829819894030982] |
9 | demand_ml | demand_rd | 2 | [1.7585354377077504, 1.9035145399841118] | causation | 0.650351 | 0 | [0.6503511286692799, 0.6018249163107213] |
10 | dfd_growth | demand_ml | 2 | [1.9972575908581778, 1.6854825794294424] | non-causation | 0.227721 | 0 | [0.22772056457763992, -0.04041095863699422] |
11 | demand_ml | dfd_growth | 2 | [1.685482579429442, 1.997257590858178] | non-causation | 0.227721 | 0 | [0.22772056457763992, 0.050862364487632865] |
12 | nab_labour | wages_ml | 2 | [2.044435302193597, 1.91430122520752] | causation | 0.779825 | 0 | [0.7798254582584527, 0.6742169079659909] |
13 | wages_ml | nab_labour | 2 | [1.9143012252075209, 2.0444353021935977] | non-causation | 0.779825 | 0 | [0.7798254582584527, 0.6720521750339608] |
14 | wages_rd | wages_ml | 2 | [2.064768621012604, 1.9472216306223809] | non-causation | 0.809471 | 0 | [0.8094711593852189, 0.718537425479361] |
15 | wages_ml | wages_rd | 2 | [1.9472216306223813, 2.064768621012603] | causation | 0.811021 | 1 | [0.8094711593852189, 0.8110212551251655] |
16 | coe_inflation | wages_ml | 2 | [1.9549830282029341, 1.8413725566701646] | non-causation | 0.582688 | 0 | [0.5826875217047367, 0.291930016200963] |
17 | wages_ml | coe_inflation | 2 | [1.8413725566701664, 1.9549830282029361] | non-causation | 0.582688 | 0 | [0.5826875217047366, 0.4960287159761049] |
18 | nab_selling | prices_ml | 2 | [2.0327817522398077, 1.8619335203451455] | causation | 0.696994 | 0 | [0.6969944892299049, 0.6788533974792019] |
19 | prices_ml | nab_selling | 2 | [1.8619335203451455, 2.032781752239809] | causation | 0.696994 | 0 | [0.6969944892299048, 0.629995602420824] |
20 | prices_rd | prices_ml | 2 | [2.0063764174461896, 1.8772948323502003] | non-causation | 0.681792 | 0 | [0.6817917064560491, 0.6222649205642552] |
21 | prices_ml | prices_rd | 2 | [1.8772948323501986, 2.0063764174461896] | causation | 0.681792 | 0 | [0.6817917064560491, 0.6491417939345004] |
22 | cpi_inflation | prices_ml | 2 | [1.8377703110477657, 1.9355808403032249] | causation | 0.571754 | 0 | [0.5717536563681193, 0.5542202030699015] |
23 | prices_ml | cpi_inflation | 2 | [1.9355808403032249, 1.8377703110477666] | non-causation | 0.571754 | 0 | [0.5717536563681191, 0.48014785309058433] |
qui var nab_labour wages_ml, lags(1/2) exog(l3.wages_ml l3.nab_labour)
vargranger
Equation | Excluded | chi2 | df | Prob > chi2 |
---|---|---|---|---|
nab_labour | wages_ml | 21.093 | 2 | 0.000 |
nab_labour | ALL | 21.093 | 2 | 0.000 |
wages_ml | nab_labour | 2.5652 | 2 | 0.277 |
wages_ml | ALL | 2.5652 | 2 | 0.277 |
qui var wages_rd wages_ml, lags(1/2) exog(l3.wages_ml l3.wages_rd)
vargranger
Equation | Excluded | chi2 | df | Prob > chi2 |
---|---|---|---|---|
wages_rd | wages_ml | .31648 | 2 | 0.854 |
wages_rd | ALL | .31648 | 2 | 0.854 |
wages_ml | wages_rd | 32.454 | 2 | 0.000 |
wages_ml | ALL | 32.454 | 2 | 0.000 |
qui var coe_inflation wages_ml, lags(1/2) exog(l3.wages_ml l3.coe_inflation)
vargranger
Equation | Excluded | chi2 | df | Prob > chi2 |
---|---|---|---|---|
coe_inflation | wages_ml | 2.621 | 2 | 0.270 |
coe_inflation | ALL | 2.621 | 2 | 0.270 |
wages_ml | coe_inflation | 4.413 | 2 | 0.110 |
wages_ml | ALL | 4.413 | 2 | 0.110 |
# ADF test on the first difference of each column of the March/September-quarter data.
# `outcome` is the list adfuller_test appends its summary lines to.
outcome = []
# DataFrame.items() replaces iteritems(), which was removed in pandas 2.0
for name, column in testing_mq_sq.items():
    adfuller_test(column.diff().dropna(), name=name)
outcome
['prices_d, P-Value = 0.001 => stationary, m=0', 'inputcosts_d, P-Value = 0.0006 => stationary, m=0', 'demand_d, P-Value = 0.0 => stationary, m=0', 'wages_d, P-Value = 0.0019 => stationary, m=0', 'prices_ml, P-Value = 0.0 => stationary, m=0', 'inputcosts_ml, P-Value = 0.0135 => stationary, m=0', 'demand_ml, P-Value = 0.0244 => stationary, m=0', 'wages_ml, P-Value = 0.0 => stationary, m=0', 'prices_rd, P-Value = 0.0 => stationary, m=0', 'inputcosts_rd, P-Value = 0.0006 => stationary, m=0', 'demand_rd, P-Value = 0.0074 => stationary, m=0', 'wages_rd, P-Value = 0.0015 => stationary, m=0', 'ppi_inflation, P-Value = 0.0079 => stationary, m=0', 'cpi_inflation, P-Value = 0.0217 => stationary, m=0', 'dfd_growth, P-Value = 0.0008 => stationary, m=0', 'coe_inflation, P-Value = 0.0 => stationary, m=0', 'nab_labour, P-Value = 0.0 => stationary, m=0', 'nab_purchase, P-Value = 0.2438 => non-stationary, m=1', 'nab_selling, P-Value = 0.0004 => stationary, m=0', 'forward_orders, P-Value = 0.0 => stationary, m=0']
# Per-pair results for the March/September-quarter sample, filled in the same
# order as `variables`
bic_lag = []
DW_test = []
granger = []
max_corr = []
max_corr_lag = []
correlations = []
for i in variables:
    # build the two-variable VAR once and reuse it for lag selection and fitting
    model = VAR(testing_mq_sq[i].asfreq(freq='Q'))
    # determine the optimal number of lags in the VAR (BIC, at most 4 lags)
    lag = model.select_order(maxlags=4).bic
    bic_lag.append(lag)
    # DW statistics (one per equation) to check that no autocorrelation is left in the residuals
    DW = durbin_watson(model.fit(lag).resid)
    DW_test.append(DW)
    # the information criterion may select 0 lags, but the Granger test needs at least one
    test_lag = max(lag, 1)
    # examine granger causality (second column of the pair -> first column).
    # NOTE(review): the original comment said the test is run at the optimal
    # lag + 1 (as per Toda H.Y.; Yamamoto T. (1995)), but the call uses the
    # selected lag itself; that behaviour is kept here - confirm which is intended.
    test_result = grangercausalitytests(testing_mq_sq[i], maxlag=test_lag, verbose=False)
    p_value = test_result[test_lag][0]['ssr_chi2test'][1]
    if p_value <= 0.1:
        result = 'causation'
    else:
        result = 'non-causation'
    granger.append(result)
    # cross-correlations with the second series shifted by 0 and 1 periods
    # (fixed window, independent of the selected lag)
    lagged_corr = [crosscorr(testing_mq_sq[i[0]], testing_mq_sq[i[1]], lag=v) for v in range(2)]
    correlations.append(lagged_corr)
    # extract the maximum correlation (largest in absolute value, sign kept)
    maxi = max(lagged_corr, key=abs)
    max_corr.append(maxi)
    # extract the lag of the maximum correlation
    maxi_lag = lagged_corr.index(maxi)
    max_corr_lag.append(maxi_lag)
results = pd.DataFrame(variables, columns = ['Y','X'])
results['bic_lag'] = bic_lag
results['DW_test'] = DW_test
results['granger'] = granger
results['max_corr'] = max_corr
results['max_corr_lag'] = max_corr_lag
results['correlations'] = correlations
results
Y | X | bic_lag | DW_test | granger | max_corr | max_corr_lag | correlations | |
---|---|---|---|---|---|---|---|---|
0 | nab_purchase | inputcosts_ml | 1 | [1.538157718187615, 1.689031399522777] | non-causation | 0.822312 | 0 | [0.8223122301743604, 0.6704137296294945] |
1 | inputcosts_ml | nab_purchase | 1 | [1.6890313995227766, 1.5381577181876154] | non-causation | 0.822312 | 0 | [0.8223122301743604, 0.6259646832127302] |
2 | inputcosts_rd | inputcosts_ml | 1 | [2.164382647900841, 1.8075988040375832] | causation | 0.852904 | 0 | [0.8529035726656659, 0.5824556264618963] |
3 | inputcosts_ml | inputcosts_rd | 1 | [1.8075988040375826, 2.164382647900841] | causation | 0.896921 | 1 | [0.8529035726656659, 0.8969207331428606] |
4 | ppi_inflation | inputcosts_ml | 1 | [1.4561443938748286, 1.5936842421171535] | causation | 0.710131 | 0 | [0.7101308273920196, 0.5138329504281973] |
5 | inputcosts_ml | ppi_inflation | 1 | [1.5936842421171549, 1.4561443938748286] | non-causation | 0.710131 | 0 | [0.7101308273920195, 0.6379558371206847] |
6 | forward_orders | demand_ml | 1 | [1.810501050614225, 1.7283243929545429] | causation | 0.715031 | 0 | [0.7150305580493861, 0.5202793762206624] |
7 | demand_ml | forward_orders | 1 | [1.728324392954541, 1.8105010506142265] | non-causation | 0.715031 | 0 | [0.7150305580493862, 0.4358835598401134] |
8 | demand_rd | demand_ml | 1 | [1.7851646677046504, 1.5884764711144312] | causation | 0.784262 | 0 | [0.7842620944185977, 0.7434270910806663] |
9 | demand_ml | demand_rd | 1 | [1.5884764711144301, 1.7851646677046493] | non-causation | 0.784262 | 0 | [0.7842620944185977, 0.6255826025461292] |
10 | dfd_growth | demand_ml | 1 | [1.9047583940262522, 1.768154815042844] | non-causation | 0.260248 | 0 | [0.2602481993068639, 0.1536656010650877] |
11 | demand_ml | dfd_growth | 1 | [1.7681548150428437, 1.9047583940262522] | non-causation | 0.260248 | 0 | [0.2602481993068639, 0.13045725899266247] |
12 | nab_labour | wages_ml | 1 | [1.5898662259400587, 1.5329403055689483] | causation | 0.820710 | 0 | [0.8207096761075757, 0.6563352387232075] |
13 | wages_ml | nab_labour | 1 | [1.5329403055689483, 1.5898662259400596] | non-causation | 0.820710 | 0 | [0.8207096761075757, 0.6454737859057776] |
14 | wages_rd | wages_ml | 1 | [1.674761659011718, 1.2440301026974778] | non-causation | 0.870849 | 0 | [0.8708490257976954, 0.6384872807863448] |
15 | wages_ml | wages_rd | 1 | [1.2440301026974776, 1.6747616590117185] | causation | 0.870849 | 0 | [0.8708490257976954, 0.8421334012229343] |
16 | coe_inflation | wages_ml | 1 | [1.8329332636087854, 1.5998245683500665] | non-causation | 0.570241 | 0 | [0.5702409499515773, 0.39475216287369813] |
17 | wages_ml | coe_inflation | 1 | [1.5998245683500676, 1.8329332636087854] | non-causation | 0.570241 | 0 | [0.5702409499515773, 0.4939805618659968] |
18 | nab_selling | prices_ml | 1 | [1.5733488606236041, 1.8213581139134571] | non-causation | 0.727158 | 0 | [0.7271583236294156, 0.6277115560718672] |
19 | prices_ml | nab_selling | 1 | [1.8213581139134578, 1.5733488606236041] | non-causation | 0.727158 | 0 | [0.7271583236294156, 0.5500885806470774] |
20 | prices_rd | prices_ml | 1 | [1.8644902518962685, 1.6408878232621436] | non-causation | 0.771602 | 0 | [0.7716018580834252, 0.6389296131504291] |
21 | prices_ml | prices_rd | 1 | [1.6408878232621422, 1.8644902518962687] | causation | 0.771602 | 0 | [0.7716018580834252, 0.6788006294114362] |
22 | cpi_inflation | prices_ml | 1 | [1.537845435718938, 1.8738460800595065] | causation | 0.592353 | 1 | [0.571740114633368, 0.5923534990250259] |
23 | prices_ml | cpi_inflation | 1 | [1.8738460800595076, 1.537845435718938] | non-causation | 0.571740 | 0 | [0.571740114633368, 0.36778761591975295] |
# Spot check: correlation of cpi_inflation with prices_ml shifted by 0 and by 1 period
[crosscorr(testing_mq_sq['cpi_inflation'], testing_mq_sq['prices_ml'], lag=shift) for shift in (0, 1)]
[0.571740114633368, 0.5923534990250259]
qui var nab_purchase inputcosts_ml, lags(1/1) exog(l2.nab_purchase l2.inputcosts_ml)
vargranger
Equation | Excluded | chi2 | df | Prob > chi2 |
---|---|---|---|---|
nab_purchase | inputcosts_ml | 4.859 | 1 | 0.028 |
nab_purchase | ALL | 4.859 | 1 | 0.028 |
inputcosts_ml | nab_purchase | .51721 | 1 | 0.472 |
inputcosts_ml | ALL | .51721 | 1 | 0.472 |