import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import xml.etree.ElementTree as ET
import re
from bs4.element import NavigableString
from tqdm import tqdm
from RefinitivUtils import update_progress
from RefinitivUtils import print_centre_hashed
# Gets all text from xml .txt transcript
def string_from_xml(doc):
    """Return the plain text contained in a transcript's Episode element.

    The document is parsed from the string ``doc``; the root's first
    ``Trans`` child is located, then that element's first ``Episode``
    child, and the Episode subtree is serialised with the ``text`` method
    (text content only, no markup).

    Args:
        doc: XML document as a string.

    Returns:
        The Episode text as a ``str``.

    Raises:
        AttributeError: if the root has no ``Trans`` child.
    """
    episode = ET.fromstring(doc).find('Trans').find('Episode')
    raw_text = ET.tostring(episode, encoding='utf-8', method='text')
    return raw_text.decode('utf-8')
# Given an XML transcript as text, gets the speaker information and returns it as a DataFrame
def get_people(xml) -> pd.DataFrame:
    """Collect every ``Speaker`` element of a transcript into a DataFrame.

    Args:
        xml: Transcript XML as text.

    Returns:
        A DataFrame indexed by the speaker ``id`` attribute with the columns
        ``speakerName``, ``speakerCompany``, ``position`` and ``organizer``.
        Attributes read with ``.get`` are ``None`` when absent. The frame is
        empty when the document has no ``Speaker`` elements.

    Raises:
        KeyError: if a speaker has no (non-empty) ``name`` attribute and also
            lacks ``firstName`` or ``lastName``.
    """
    soup = BeautifulSoup(xml, 'xml')
    columns = ['speakerName', 'id', 'speakerCompany', 'position', 'organizer']

    # Accumulate plain rows and build the frame once; concatenating inside
    # the loop re-copies the whole frame for every speaker.
    rows = []
    for speaker in soup.find_all('Speaker'):
        sname = speaker.get('name')
        if not sname:
            # Fall back to the split name attributes when 'name' is missing/empty.
            sname = speaker['firstName'] + ' ' + speaker['lastName']
        rows.append([
            sname,
            speaker.get('id'),
            speaker.get('companyName'),
            speaker.get('position'),
            speaker.get('organizer'),
        ])

    people = pd.DataFrame(rows, columns=columns)
    return people.set_index('id')
# Given an XML transcript as text, fetches the quotes and speaker info for all sections of the transcript
def get_quotes_from_xml(xml: str) -> pd.DataFrame:
    """Split a transcript into quotes and attach speaker information.

    Every ``Turn`` inside every ``Section`` is walked child by child: text
    nodes are accumulated, and each ``br`` element closes the current quote
    (even if it is empty). Text remaining after the last ``br`` is emitted
    as a final quote when it is non-empty. Children that are neither text
    nodes nor ``br`` elements are ignored.

    Args:
        xml: Transcript XML as text, or ``None``.

    Returns:
        ``None`` when ``xml`` is ``None``. Otherwise a DataFrame with one row
        per quote (``section``, ``quote``) left-joined on the turn's
        ``speaker`` attribute with the speaker table from ``get_people``;
        the temporary ``sid`` join column is dropped.

    Raises:
        KeyError: if a ``Section`` lacks ``name`` or a ``Turn`` lacks
            ``speaker``.
    """
    if xml is None:
        return None

    soup = BeautifulSoup(xml, 'xml')
    people = get_people(xml)
    columns = ['section', 'sid', 'quote']

    # Collect rows in a list and build the frame once instead of growing a
    # DataFrame row by row with .loc, which copies on every enlargement.
    rows = []
    for section in soup.find_all('Section'):
        sname = section['name']
        for turn in section.find_all('Turn'):
            sid = turn['speaker']
            fragments = []
            for item in turn.contents:
                if getattr(item, 'name', None) == 'br':
                    # A line break terminates the quote collected so far.
                    rows.append([sname, sid, ''.join(fragments).strip()])
                    fragments = []
                elif isinstance(item, NavigableString):
                    fragments.append(str(item))
            trailing = ''.join(fragments)
            if trailing:
                rows.append([sname, sid, trailing.strip()])

    quotes = pd.DataFrame(rows, columns=columns)
    # 'id' is the index level of the people frame; merge accepts index level
    # names as join keys.
    quotes = quotes.merge(people, how='left', left_on='sid', right_on='id').drop('sid', axis=1)
    return quotes
class RefinitivEarningsParser:
    """Downloads StreetEvents transcripts and turns them into DataFrames.

    Attributes:
        rconn: Connection object; only ``rconn.session.proxies`` (a mapping
            with ``'http'`` and ``'https'`` entries) is read by this class.
    """

    def __init__(self, rconn):
        self.rconn = rconn

    def _proxies(self) -> dict:
        """Return the proxy mapping for ``requests``, taken from the session."""
        session_proxies = self.rconn.session.proxies
        # Each scheme uses its own proxy entry (the two were previously crossed).
        return {
            'http': session_proxies['http'],
            'https': session_proxies['https'],
        }

    def get_transcript_xml(self, transcriptId) -> str:
        """Return the XML string for a given document if it can be found.

        The request is attempted at most twice; the second attempt follows a
        short pause in case the first one was rate throttled.

        Args:
            transcriptId: Identifier interpolated into the document URL.

        Returns:
            The response body as text, or ``None`` if neither attempt
            produced an OK response with status code 200.
        """
        transcript_url = f'https://api.rkd.refinitiv.com/api/streetevents/documents/{transcriptId}/Transcript/Xml.ashx'
        for attempt in range(2):
            if attempt:
                # Double check to see if result was rate throttled
                time.sleep(0.125)
            transcript_result = requests.get(
                url=transcript_url,
                proxies=self._proxies(),
            )
            if transcript_result and transcript_result.status_code == 200:
                return transcript_result.text
        return None

    def get_full_text_from_events(self, events: pd.DataFrame, events_data=False) -> pd.DataFrame:
        """Return the full transcript text for every event.

        Transcripts that cannot be downloaded are skipped instead of
        aborting the whole run.

        Args:
            events: DataFrame with a ``TranscriptId`` column.
            events_data: When true, the result is inner-merged with
                ``events`` on ``TranscriptId``.

        Returns:
            A DataFrame with the columns ``TranscriptId`` and ``Full Text``
            (plus the columns of ``events`` when ``events_data`` is true).
        """
        rows = []
        for transcriptId in tqdm(events['TranscriptId']):
            transcript_xml = self.get_transcript_xml(transcriptId)
            if transcript_xml is None:
                # Download failed twice; nothing to parse for this event.
                continue
            # NOTE(review): this pattern removes every space character from
            # the text as written — confirm that is the intended character.
            xml_string = re.sub(' ', '', (string_from_xml(transcript_xml)))
            rows.append([transcriptId, xml_string])

        # Build the frame once rather than appending row by row.
        text = pd.DataFrame(rows, columns=['TranscriptId', 'Full Text'])
        if events_data:
            text = events.merge(text, on='TranscriptId', how='inner')
        print(f'Parsed {len(text)} transcripts')
        return text

    def get_quotes_from_events(self, events: pd.DataFrame, events_data=False) -> pd.DataFrame:
        """Return quotes with speaker info for each event passed in.

        Transcripts that cannot be downloaded are skipped.

        Args:
            events: DataFrame with a ``TranscriptId`` column.
            events_data: When true, the quotes are inner-merged with
                ``events`` on ``TranscriptId``; otherwise the
                ``TranscriptId`` column is dropped from the result.

        Returns:
            A DataFrame with one row per quote; empty when no transcript
            yielded any quotes.
        """
        frames = []
        for transcriptId in tqdm(events['TranscriptId']):
            transcript_xml = self.get_transcript_xml(transcriptId)
            # get_quotes_from_xml is a module-level function, not a method.
            transcript_quotes = get_quotes_from_xml(transcript_xml)
            if transcript_quotes is None:
                continue
            transcript_quotes['TranscriptId'] = transcriptId
            frames.append(transcript_quotes)

        if frames:
            # Single concat instead of re-concatenating inside the loop.
            quotes = pd.concat(frames, ignore_index=True)
        else:
            # Keep the key column so the merge/drop below still works.
            quotes = pd.DataFrame(columns=['TranscriptId'])

        if events_data:
            quotes = events.merge(quotes, on='TranscriptId', how='inner')
        else:
            quotes = quotes.drop(columns=['TranscriptId'])
        print(f'Parsed {len(quotes)} quotes from {len(events)} transcripts')
        return quotes