In [ ]:
 
In [ ]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import xml.etree.ElementTree as ET
import re
from bs4.element import NavigableString
from tqdm import tqdm
In [ ]:
from RefinitivUtils import update_progress
from RefinitivUtils import print_centre_hashed
In [ ]:
# Gets all text from xml .txt transcript
def string_from_xml(doc):
    """Return the character data of a transcript's ``Trans``/``Episode`` element.

    ``doc`` is parsed as XML. The first ``Trans`` child of the root is looked
    up, then its first ``Episode`` child, and that element is serialised with
    the ``text`` method so that markup is dropped and only text remains.

    Returns the text as a ``str`` (decoded from UTF-8).
    Raises ``AttributeError`` when the root has no ``Trans`` child.
    """
    root_node = ET.fromstring(doc)
    trans_node = root_node.find('Trans')
    episode_node = trans_node.find('Episode')

    # method='text' emits only text content, no tags or attributes.
    raw_bytes = ET.tostring(episode_node, encoding='utf-8', method='text')
    return raw_bytes.decode('utf-8')
In [ ]:
# Given a transcript XML string, gets the speaker information and returns it as a DataFrame
def get_people(xml) -> pd.DataFrame:
    """Collect the attributes of every ``Speaker`` element into a DataFrame.

    Parameters
    ----------
    xml : str
        Transcript XML text; parsed with BeautifulSoup's ``'xml'`` parser.

    Returns
    -------
    pd.DataFrame
        One row per ``Speaker`` element, indexed by the speaker's ``id``
        attribute, with columns ``speakerName``, ``speakerCompany``,
        ``position`` and ``organizer``. Attributes that are absent on a
        speaker are stored as ``None``. When the document has no speakers
        the frame is empty but still carries those columns.
    """
    soup = BeautifulSoup(xml, 'xml')

    columns = ['speakerName', 'id', 'speakerCompany', 'position', 'organizer']
    rows = []

    for speaker in soup.find_all('Speaker'):
        sname = speaker.get('name')

        if not sname:
            # Fall back to "<firstName> <lastName>". Use .get() so a speaker
            # missing one (or both) of the attributes does not raise KeyError;
            # whichever parts exist are joined, and None is used if neither does.
            name_parts = (speaker.get('firstName'), speaker.get('lastName'))
            sname = ' '.join(part for part in name_parts if part) or None

        rows.append([
            sname,
            speaker.get('id'),
            speaker.get('companyName'),
            speaker.get('position'),
            speaker.get('organizer'),
        ])

    # Build the frame once instead of concatenating inside the loop, which
    # re-copies every previously collected row on each iteration.
    people = pd.DataFrame(rows, columns=columns)

    return people.set_index('id')
In [ ]:
# Given a transcript XML string, fetches the quotes and speaker info for all sections of the transcript
def get_quotes_from_xml(xml: str) -> pd.DataFrame:
    """Split every ``Turn`` of a transcript into quotes joined with speaker info.

    Parameters
    ----------
    xml : str or None
        Transcript XML text. ``None`` is passed straight through.

    Returns
    -------
    pd.DataFrame or None
        ``None`` when ``xml`` is ``None``. Otherwise one row per quote with the
        ``section`` name, the stripped ``quote`` text and the speaker columns
        produced by ``get_people`` (left-joined on the turn's ``speaker``
        attribute, so turns with an unknown speaker keep their row).

    Notes
    -----
    Within a ``Turn`` the text accumulated so far is emitted as a quote at
    each ``br`` element (even if that text is empty), and any non-empty text
    left after the last ``br`` is emitted as a final quote. Only direct text
    children of the ``Turn`` contribute; nested elements other than ``br``
    are ignored.
    """
    if xml is None:
        return None

    soup = BeautifulSoup(xml, 'xml')
    people = get_people(xml)

    columns = ['section', 'sid', 'quote']
    # Rows are gathered in a plain list and turned into a DataFrame once;
    # enlarging a DataFrame with .loc per row copies the frame every time.
    rows = []

    for section in soup.find_all('Section'):
        sname = section['name']

        for turn in section.find_all('Turn'):
            sid = turn['speaker']
            quote = ''

            for item in turn.contents:
                if getattr(item, 'name', None) == 'br':
                    # A <br> closes the current quote.
                    rows.append([sname, sid, quote.strip()])
                    quote = ''
                elif isinstance(item, NavigableString):
                    quote += str(item)

            # Flush trailing text that was not followed by a <br>.
            if quote:
                rows.append([sname, sid, quote.strip()])

    quotes = pd.DataFrame(rows, columns=columns)
    quotes = quotes.merge(people, how='left', left_on='sid', right_on='id').drop('sid', axis=1)

    return quotes
In [ ]:
class RefinitivEarningsParser:
    """Fetches StreetEvents transcript XML and turns it into DataFrames.

    ``rconn`` must expose ``session.proxies`` with ``'http'`` and ``'https'``
    entries; those proxies are used for every transcript request.
    """

    # Pause, in seconds, before the single retry of a failed request.
    RETRY_DELAY_SECONDS = 0.125

    def __init__(self, rconn):
        self.rconn = rconn

    def _request_transcript(self, transcript_url):
        """Issue one GET for ``transcript_url`` through the connection's proxies."""
        session_proxies = self.rconn.session.proxies

        return requests.get(
            url=transcript_url,
            # Each scheme is routed through its own proxy entry.
            proxies={
                'http': session_proxies['http'],
                'https': session_proxies['https'],
            },
        )

    # Returns XML string for a given document if it can be found
    def get_transcript_xml(self, transcriptId) -> str:
        """Download the transcript XML for ``transcriptId``.

        The request is attempted once and, if the response is not OK or its
        status is not 200, retried one time after a short pause (the first
        failure may have been rate throttling).

        Returns the response body as text, or ``None`` when both attempts fail.
        """
        transcript_url = f'https://api.rkd.refinitiv.com/api/streetevents/documents/{transcriptId}/Transcript/Xml.ashx'

        transcript_result = self._request_transcript(transcript_url)

        if not transcript_result or transcript_result.status_code != 200:
            # Double check to see if result was rate throttled
            time.sleep(self.RETRY_DELAY_SECONDS)

            transcript_result = self._request_transcript(transcript_url)

            if not transcript_result or transcript_result.status_code != 200:
                return None

        return transcript_result.text

    # For every event returns a df with its transcriptId and full transcript text
    def get_full_text_from_events(self, events: pd.DataFrame, events_data=False) -> pd.DataFrame:
        """Build a frame of ``TranscriptId`` / ``Full Text`` for ``events``.

        Parameters
        ----------
        events : pd.DataFrame
            Must contain a ``TranscriptId`` column.
        events_data : bool, default False
            When true, the result is ``events`` inner-merged with the text on
            ``TranscriptId``.

        Transcripts that cannot be downloaded are skipped rather than
        aborting the whole run.
        """
        rows = []

        for transcriptId in tqdm(events['TranscriptId']):
            transcript_xml = self.get_transcript_xml(transcriptId)

            if transcript_xml is None:
                # Download failed twice; nothing to parse for this event.
                continue

            # Deletes every pair of consecutive spaces from the extracted text.
            xml_string = re.sub('  ', '', (string_from_xml(transcript_xml)))
            rows.append([transcriptId, xml_string])

        # Single construction instead of enlarging a DataFrame row by row.
        text = pd.DataFrame(rows, columns=['TranscriptId', 'Full Text'])

        if events_data:
            text = events.merge(text, on='TranscriptId', how='inner')

        print(f'Parsed {len(text)} transcripts')
        return text

    # Get quotes with speaker info for each event passed in
    def get_quotes_from_events(self, events: pd.DataFrame, events_data=False) -> pd.DataFrame:
        """Collect the quotes of every transcript referenced by ``events``.

        Parameters
        ----------
        events : pd.DataFrame
            Must contain a ``TranscriptId`` column.
        events_data : bool, default False
            When true, the result is ``events`` inner-merged with the quotes
            on ``TranscriptId``; otherwise the ``TranscriptId`` column is
            dropped from the quotes.

        Transcripts that cannot be downloaded are skipped. If no transcript
        yields quotes the result is an empty frame.
        """
        frames = []

        for transcriptId in tqdm(events['TranscriptId']):
            transcript_xml = self.get_transcript_xml(transcriptId)
            # get_quotes_from_xml is a module-level function, not a method.
            transcript_quotes = get_quotes_from_xml(transcript_xml)

            if transcript_quotes is None:
                # No XML could be fetched for this event.
                continue

            transcript_quotes['TranscriptId'] = transcriptId
            frames.append(transcript_quotes)

        if frames:
            # One concat at the end instead of one per transcript.
            quotes = pd.concat(frames, ignore_index=True)
        else:
            # Keep the key column so the merge / drop below still work.
            quotes = pd.DataFrame(columns=['TranscriptId'])

        if events_data:
            quotes = events.merge(quotes, on='TranscriptId', how='inner')
        else:
            quotes = quotes.drop(columns=['TranscriptId'])

        print(f'Parsed {len(quotes)} quotes from {len(frames)} transcripts')
        return quotes