'''
Iterates through each earnings call transcript and:
  1. runs zero-shot classification against a given set of "positive" labels
     and saves the prediction outcomes to a file under the positive predictions folder;
  2. runs zero-shot classification against a given set of "negative" labels
     and saves the prediction outcomes to a file under the negative predictions folder.
'''
from transformers import pipeline
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
from tqdm.notebook import tqdm
import time
# Initialize the zero-shot classifier (BART large fine-tuned on MNLI)
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
def labelFromPreds(preds, threshold=0.5, dropna=True):
    '''Takes classifier predictions and keeps the predicted labels whose
    scores exceed the given threshold. Returns a DataFrame with one row per
    sentence and one column per label that cleared the threshold.'''
    outs = []
    for pred in preds:
        out = {'sentence': pred['sequence']}
        # Pair each label with its score directly; looking scores up with
        # scores.index() would misattribute labels whenever two labels
        # happen to share the same score.
        for label, score in zip(pred['labels'], pred['scores']):
            if score > threshold:
                out[label] = score
        outs.append(out)
    df = pd.DataFrame(outs)
    if dropna:
        # Drop sentences for which no label cleared the threshold.
        df = df.dropna(subset=df.columns[1:], how='all')
    return df
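# A minimal usage sketch (the sentence and scores below are made up for
# illustration). With multi_label=True, the pipeline returns one dict per
# input sentence containing 'sequence', 'labels' and 'scores':
#   example_preds = [{'sequence': 'Freight rates rose sharply this quarter.',
#                     'labels': ['transportation cost increase', 'consumer demand increase'],
#                     'scores': [0.93, 0.04]}]
#   labelFromPreds(example_preds, threshold=0.5)
# yields a one-row DataFrame with the sentence and a single
# 'transportation cost increase' column holding 0.93.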
# Transcript loading
def load_paras(transcript_id, how='local', drop_len=0):
    '''
    Read and clean an earnings call transcript given its transcript ID.
    Outputs a list of paragraphs, split on line breaks.
    Arguments: how = 'local' reads from txt files under a local directory;
               how = 'web' fetches the transcript XML from the web.
               drop_len is the minimum number of words a paragraph must
               contain to be kept.
    '''
    if how == 'web':
        transcriptUrl = f'https://api.rkd.refinitiv.com/api/streetevents/documents/{transcript_id}/Transcript/Xml.ashx'
        res = requests.get(url=transcriptUrl)
        # Use the public .content attribute, not the private ._content
        soup = BeautifulSoup(res.content, "xml")
    else:
        # The local files declare ISO-8859-1 in their XML header
        with open(f"./Transcripts/{transcript_id}.txt", encoding='ISO-8859-1') as fp:
            soup = BeautifulSoup(fp.read().replace('<?xml version="1.0" encoding="ISO-8859-1"?>\n', ''), 'xml')
    # Mark line breaks with a sentinel so paragraphs can be split apart
    # after the text is extracted
    for br in soup.find_all("br"):
        br.replace_with(">*<")
    article = [i.getText(separator=" ") for i in soup.find_all('Turn')]
    cleaned = []
    for sent in article:
        cleaned.extend(sent.strip().split('>*<'))
    out = [i.strip() for i in cleaned if i.strip() != '']
    return [i for i in out if len(i.split(' ')) > drop_len]
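# Example (assuming a file ./Transcripts/<id>.txt exists locally):
#   paras = load_paras('<id>', how='local', drop_len=20)
# returns the transcript's paragraphs as a list of strings, keeping only
# paragraphs longer than 20 words.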
# File paths
mypath = './Transcripts' # Location of Transcripts
pos_pred_path = './preds_pos' # Location where prediction outcomes (of "increasing" topics) are saved
neg_pred_path = './preds_neg' # Location where prediction outcomes (of "decreasing" topics) are saved
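# Create the prediction folders if they do not exist yet (assumes it is fine
# to create them; listdir below would fail on a missing directory)
os.makedirs(pos_pred_path, exist_ok=True)
os.makedirs(neg_pred_path, exist_ok=True)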
allfiles = [f[:-4] for f in listdir(mypath) if isfile(join(mypath, f))] # all transcripts mined
predicted_pos = [f[:-4] for f in listdir(pos_pred_path) if isfile(join(pos_pred_path, f))] # transcripts already classified
predicted_neg = [f[:-4] for f in listdir(neg_pred_path) if isfile(join(neg_pred_path, f))] # transcripts already classified on "decreasing" topics
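# Candidate "increasing" topic labels for the positive pass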
candidate_labels = ['input cost increase',
'transportation cost increase',
'supply shortages',
'labour cost increase',
'import cost increase',
'hiring difficulties',
'downstream product price increase',
'consumer final price increase',
'consumer demand increase']
# Classify "increasing" topics for every transcript not yet predicted
for transcript_id in tqdm(set(allfiles) - set(predicted_pos)):
    start = time.time()
    print('Predicting: ', transcript_id)
    cleaned = load_paras(transcript_id, how='local', drop_len=20)
    preds = classifier(cleaned, candidate_labels, multi_label=True)
    # threshold=0 and dropna=False keep every score, so filtering can
    # happen later when the saved CSVs are analysed
    df = labelFromPreds(preds, threshold=0, dropna=False)
    df.to_csv(f'{pos_pred_path}/{transcript_id}.csv')
    end = time.time()
    print('time taken: ', end - start)
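# Candidate "decreasing" topic labels for the negative pass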
candidate_labels_neg = ['consumer final price decrease',
                        'import cost decrease',
                        'input cost decrease',
                        'consumer demand decrease',
                        'downstream product price decrease',
                        'labour cost decrease',
                        'transportation cost decrease']
# Classify "decreasing" topics for every transcript not yet predicted
for transcript_id in tqdm(set(allfiles) - set(predicted_neg)):
    start = time.time()
    print('Predicting: ', transcript_id)
    cleaned = load_paras(transcript_id, how='local', drop_len=20)
    preds = classifier(cleaned, candidate_labels_neg, multi_label=True)
    df = labelFromPreds(preds, threshold=0, dropna=False)
    df.to_csv(f'{neg_pred_path}/{transcript_id}.csv')
    end = time.time()
    print('time taken: ', end - start)