import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import functools as ft
import numpy as np
import datetime
from tqdm.notebook import tqdm
from RefinitivConnection import RefinitivConnection
from RefinitivEarningsFetcher import RefinitivEarningsFetcher
from RefinitivEarningsParser import RefinitivEarningsParser
from RefinitivListingFetcher import RefinitivListingFetcher
from RefinitivTranscriptAnalyser import RefinitivTranscriptAnalyser
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
# Loading earnings call events meta-data
# NOTE(review): path casing here ('events/events.csv') differs from the later
# reload ('Events/events.csv') — on a case-sensitive filesystem only one of
# the two spellings can exist; confirm which is correct.
events = pd.read_csv('events/events.csv')
# Parse timestamps so a calendar-month column can be derived below.
events['EventTime'] = pd.to_datetime(events['EventTime'])
# NOTE(review): `events` is re-read from disk further down in this script,
# which discards this derived month column before it is ever used — verify
# whether the column (and this early load) is still needed.
events['month'] = events['EventTime'].dt.month
#### Initializing transcript_analyser object for keyword counting
# The analyser scans the XML transcripts under ./Transcripts for keyword hits.
transcript_analyser = RefinitivTranscriptAnalyser(transcripts_dir_path='./Transcripts')
# '''Generating Cost Index by counting input cost keywords'''
# Overall cost index: sentence-level counts of input-cost keywords per transcript.
df = transcript_analyser.get_keyword_counts_from_transcripts(input_words_path='Keywords/input_cost_words_v2.csv',
paras=False) # sentence level counting
# transcript_analyser.get_keyword_counts_from_transcripts() # paragraph level counting
## '''Sub-index: remember to set path for new keyword csv file counting can occur only at sentence level'''
# Per-subindex breakdown of the cost counts (sentence level only, per note above).
df2 = transcript_analyser.get_counts_by_subindex_sector(input_words_path='Keywords/subindices_keys_quals.csv')
# '''Cost Index (Negative Qualifiers)'''
# Same cost keywords combined with negative qualifiers (cost-decrease phrasing).
df_neg = transcript_analyser.get_keyword_counts_from_transcripts(
input_words_path='Keywords/input_cost_words_v2_neg.csv', paras=False) # sentence level counting
#'''Sub-index (Negative Qualifiers) '''
# Per-subindex breakdown for the negative-qualifier keyword list.
df2_neg = transcript_analyser.get_counts_by_subindex_sector(input_words_path='Keywords/subindices_keys_quals_neg.csv')
##### Demand / Final Price / Margin / Outlook keyword counts #####
def _sentence_counts(keyword_csv):
    """Sentence-level keyword counts for one keyword list under Keywords/.

    Thin wrapper that removes the eightfold repetition of the same call;
    behaviour per call is identical to the original inline invocations.
    """
    return transcript_analyser.get_keyword_counts_from_transcripts(
        input_words_path='Keywords/' + keyword_csv, paras=False)

##### Demand #####
df_demand = _sentence_counts('consumer_demand_words.csv')
##### Demand (negative qualifiers) #####
df_demand_neg = _sentence_counts('consumer_demand_words_neg.csv')
##### Final Price #####
df_price = _sentence_counts('final_price_words.csv')
##### Final Price (negative qualifiers) #####
df_price_neg = _sentence_counts('final_price_words_neg.csv')
##### margin #####
df_margin = _sentence_counts('margin_words.csv')
##### margin (negative qualifiers) #####
df_margin_neg = _sentence_counts('margin_words_neg.csv')
##### outlook #####
df_outlook = _sentence_counts('outlook_words.csv')
##### outlook (negative qualifiers) #####
df_outlook_neg = _sentence_counts('outlook_words_neg.csv')
# Keep only the id + match count per theme, renaming 'matches' to a
# theme-specific aggregate column so the frames can be merged side by side.
cols2keep = ['TranscriptId', 'matches']

def _slim(frame, aggregate_name):
    """Reduce a raw count frame to TranscriptId + a renamed match count."""
    return frame.reset_index()[cols2keep].rename(columns={'matches': aggregate_name})

df_price = _slim(df_price, 'price_aggregate')
df_price_neg = _slim(df_price_neg, 'price_aggregate_neg')
df_demand = _slim(df_demand, 'demand_aggregate')
df_demand_neg = _slim(df_demand_neg, 'demand_aggregate_neg')
df_margin = _slim(df_margin, 'margin_aggregate')
df_margin_neg = _slim(df_margin_neg, 'margin_aggregate_neg')
df_outlook = _slim(df_outlook, 'outlook_aggregate')
df_outlook_neg = _slim(df_outlook_neg, 'outlook_aggregate_neg')
## loading raw transcripts
# Extract the plain-text episode body from every XML transcript file.
# The prolog being stripped declares ISO-8859-1, so the files are opened with
# that encoding explicitly — the previous default-encoding read could raise
# UnicodeDecodeError on non-ASCII bytes (silently caught by the old bare except).
path = './Transcripts'
transcripts = [f for f in os.listdir(path) if isfile(join(path, f))]
raw_transc = {}
for transc in tqdm(transcripts):
    try:
        with open(join(path, transc), encoding='ISO-8859-1') as fp:
            xml_text = fp.read().replace('<?xml version="1.0" encoding="ISO-8859-1"?>\n', '')
        root = ET.fromstring(xml_text)
        # method='text' concatenates all text content under Trans/Episode;
        # encoding='unicode' returns str (encoding='utf-8' returned bytes,
        # which leaked into the DataFrame and the final Excel export).
        out = ET.tostring(root.find('Trans').find('Episode'), encoding='unicode', method='text')
        raw_transc[transc[:-4]] = out  # key = filename without its 4-char extension
    except Exception as exc:  # narrowed from bare except: keep best-effort scan, but say why it failed
        print('failed to extract from: ', transc, '-', exc)
# One row per transcript: TranscriptId + extracted raw text.
raw_transc_df = pd.DataFrame(pd.Series(raw_transc)).reset_index().\
    rename(columns={'index': 'TranscriptId', 0: 'raw_transcript'})
##### Sub-index counts ######
# Pivot subindex match counts to one column per subindex, keyed by transcript.
pivoted = pd.pivot_table(df2.reset_index(), index=['TranscriptId'],
                         columns=['subindex'], values=['matches'])
# Drop the constant 'matches' level so columns are the bare subindex names.
# (Replaces the earlier to_records()/string-replace hack, which depended on
# the exact repr of the tuple column names and broke on quotes in names.)
flattened = pivoted.copy()
flattened.columns = flattened.columns.droplevel(0)
flattened.columns.name = None
flattened = flattened.reset_index()
##### Merging overall index counts and sub_index counts #####
merged = pd.merge(df[['TranscriptId', 'matches', 'total_words']], flattened,
                  how='inner', on='TranscriptId')
merged = merged.rename(columns={'matches': 'cost_aggregate'})
##### Sub-index counts (NEGATIVE) ######
pivoted_neg = pd.pivot_table(df2_neg.reset_index(), index=['TranscriptId'],
                             columns=['subindex'], values=['matches'])
# Flatten the ('matches', <subindex>) MultiIndex and tag every subindex column
# with a '_neg' suffix so the negative-qualifier counts stay distinct from the
# positive ones after merging. (Replaces the fragile to_records()/string-replace
# hack with an explicit droplevel.)
flattened_neg = pivoted_neg.copy()
flattened_neg.columns = [str(col) + '_neg' for col in flattened_neg.columns.droplevel(0)]
flattened_neg = flattened_neg.reset_index()
##### Merging overall index counts and sub_index counts (Negative) #####
merged_neg = pd.merge(df_neg[['TranscriptId', 'matches']], flattened_neg,
                      how='inner', on='TranscriptId')
merged_neg = merged_neg.rename(columns={'matches': 'cost_aggregate_neg'})
# ##### Merging Negative and Positive cost counts + demand counts + price counts ######
# Successive pairwise joins on TranscriptId collapse every count frame into one
# wide frame (pd.merge defaults to how='inner', so only ids present in ALL
# frames survive).
dfs = [merged, merged_neg, df_price, df_price_neg, df_demand, df_demand_neg,
df_margin, df_margin_neg, df_outlook, df_outlook_neg]
merge_all = ft.reduce(lambda left, right: pd.merge(left, right, on='TranscriptId'), dfs)
##### Adding Events metadata, also adding raw transcripts #####
# NOTE(review): path casing ('Events/events.csv') differs from the earlier load
# ('events/events.csv'); this reload also discards the month column derived at
# the top of the script — confirm both are intentional.
events = pd.read_csv('Events/events.csv')[['TranscriptId', 'EventTime', 'Sector', 'Company']]
# Inner join to events metadata, then left join so rows are kept even when no
# raw transcript text could be extracted for that id.
analytical_db = pd.merge(
pd.merge(merge_all, events, how='inner', on = 'TranscriptId'),
raw_transc_df,
how='left', on = 'TranscriptId')
##### Join 0Shot Prediction Results "Increase" Topics #####
# One CSV per transcript of zero-shot topic scores (rows presumably correspond
# to paragraphs, one column per topic — confirm against the upstream producer).
mypath = './preds_pos'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# All "increase"-direction topics scored by the zero-shot classifier.
topic_list = ['input cost increase',
'transportation cost increase',
'supply shortages',
'labour cost increase',
'import cost increase',
'hiring difficulties',
'downstream product price increase',
'consumer final price increase',
'consumer demand increase']
# Topics rolled up into the aggregate cost count.
cost_topics = ['input cost increase',
'transportation cost increase',
'supply shortages',
'labour cost increase',
'import cost increase',
'hiring difficulties']
# Topics rolled up into the aggregate price count.
price_topics = ['downstream product price increase',
'consumer final price increase']
def count_topics(doc, path, topic_list, cost_topics, price_topics, threshold):
    """Count paragraphs whose zero-shot topic score exceeds *threshold*.

    Parameters
    ----------
    doc : str
        CSV file name; its last 4 characters (the '.csv' suffix) are stripped
        to form the transcript id.
    path : str
        Directory containing *doc*.
    topic_list, cost_topics, price_topics : list of str
        Columns to count individually / as the cost group / as the price group.
    threshold : float
        Strict lower bound a score must exceed to count as a hit.

    Returns
    -------
    dict
        transcript_id, para_count (rows in the file), costs_aggregate,
        price_aggregate, plus one per-topic count for every topic_list entry.
    """
    df = pd.read_csv(join(path, doc))
    out = {'transcript_id': doc[:-4]}
    out['para_count'] = df.shape[0]
    # A paragraph counts once toward an aggregate if ANY topic of the group
    # fires. Vectorized boolean reductions replace the original per-row
    # iterrows()/apply(lambda) loops, which were needlessly slow.
    out['costs_aggregate'] = int((df[cost_topics] > threshold).any(axis=1).sum())
    out['price_aggregate'] = int((df[price_topics] > threshold).any(axis=1).sum())
    for topic in topic_list:
        out[topic] = int((df[topic] > threshold).sum())
    return out
# Score every "increase" prediction file; collect row dicts and build the
# frame once at the end — DataFrame.append was removed in pandas 2.0 and the
# old per-row append was quadratic anyway.
rows = []
for file in tqdm(files):
    try:
        rows.append(count_topics(file, mypath, topic_list, cost_topics, price_topics, 0.70))
    except Exception as exc:  # narrowed from bare except: keep the best-effort scan, but report why
        print('Failed to analyze: ', file, '-', exc)
dfout = pd.DataFrame(rows)
# Prefix every zero-shot column, then restore the join-key name.
dfout.columns = ['zsl_' + colname for colname in dfout.columns]
dfout = dfout.rename(columns={'zsl_transcript_id': 'TranscriptId'})
# dfout = pd.read_csv('counts/0shot_count_all_samples.csv').drop(columns = 'Unnamed: 0')
# Left join keeps analytical_db rows even when no predictions exist for an id.
analytical_db = pd.merge(analytical_db, dfout, how='left', on='TranscriptId')
##### Join 0Shot Prediction Results "Decrease" Topics #####
# Same join procedure as the "increase" section, over the ./preds_neg scores.
mypath = './preds_neg'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# All "decrease"-direction topics (note: no supply-shortage / hiring topics
# exist in this direction).
topic_list = ['consumer final price decrease',
'import cost decrease',
'consumer demand decrease',
'input cost decrease',
'downstream product price decrease',
'transportation cost decrease',
'labour cost decrease']
# Topics rolled up into the negative cost aggregate.
cost_topics = ['input cost decrease',
'transportation cost decrease',
'labour cost decrease',
'import cost decrease',]
# Topics rolled up into the negative price aggregate.
price_topics = ['downstream product price decrease',
'consumer final price decrease']
def count_topics(doc, path, topic_list, cost_topics, price_topics, threshold):
    """Count paragraphs whose "decrease" topic score exceeds *threshold*.

    NOTE(review): this intentionally redefines (shadows) the earlier
    count_topics — this variant uses *_neg aggregate keys and emits no
    para_count. Consider distinct names if both are ever needed at once.

    Returns
    -------
    dict
        transcript_id, costs_aggregate_neg, price_aggregate_neg, plus one
        per-topic count for every entry of topic_list.
    """
    df = pd.read_csv(join(path, doc))
    out = {'transcript_id': doc[:-4]}  # strips the 4-char '.csv' suffix
    # Vectorized paragraph counts replace the original per-row
    # iterrows()/apply(lambda) loops; a paragraph counts once toward an
    # aggregate if ANY topic of the group exceeds the threshold.
    out['costs_aggregate_neg'] = int((df[cost_topics] > threshold).any(axis=1).sum())
    out['price_aggregate_neg'] = int((df[price_topics] > threshold).any(axis=1).sum())
    for topic in topic_list:
        out[topic] = int((df[topic] > threshold).sum())
    return out
# Score the "decrease" prediction files, merge into the analytical DB and
# export. Row dicts are collected and turned into a frame in one shot —
# DataFrame.append was removed in pandas 2.0 and was quadratic per-row.
rows_neg = []
for file in tqdm(files):
    try:
        rows_neg.append(count_topics(file, mypath, topic_list, cost_topics, price_topics, 0.70))
    except Exception as exc:  # narrowed from bare except: best-effort scan, with the failure reason
        print('Failed to analyze: ', file, '-', exc)
dfout_neg = pd.DataFrame(rows_neg)
dfout_neg.columns = ['zsl_' + colname for colname in dfout_neg.columns]
dfout_neg = dfout_neg.rename(columns={'zsl_transcript_id': 'TranscriptId'})
# dfout_neg = pd.read_csv('counts/0shot_count_all_samples_neg.csv').drop(columns = 'Unnamed: 0')
analytical_db = pd.merge(analytical_db, dfout_neg, how='left', on='TranscriptId')
# Final export of the assembled analytical database.
analytical_db.to_excel('analytical_database.xlsx', index=False)