Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Adding Earning Call transcripts of US based companies #658

Merged
merged 36 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c127691
Add files IMDB
Athe-kunal Oct 7, 2023
dad61e1
Add files IMDB
Athe-kunal Oct 7, 2023
678acf1
Add to library json
Athe-kunal Oct 7, 2023
4d30b77
Linting checks
Athe-kunal Oct 7, 2023
5be6ac7
Add to linting
Athe-kunal Oct 7, 2023
37c1970
Black linting
Athe-kunal Oct 7, 2023
6b50a75
Import fixes
Athe-kunal Oct 7, 2023
ca8c63a
Readme and import os changes
Athe-kunal Oct 7, 2023
c17bd94
linting via black
Athe-kunal Oct 7, 2023
5b41cdb
match id
Athe-kunal Oct 7, 2023
d1b3f07
dataframe to docs
Athe-kunal Oct 8, 2023
74cd66f
Remove extra files
Athe-kunal Oct 8, 2023
223a580
make dataframe optional
Athe-kunal Oct 8, 2023
af55ce6
Merge branch 'main' into main
Athe-kunal Oct 8, 2023
a8554cc
Merge branch 'main' into main
Athe-kunal Oct 9, 2023
aa6010d
Merge branch 'run-llama:main' into main
Athe-kunal Oct 9, 2023
62f8ea6
Add links for IMDB reviews
Athe-kunal Nov 12, 2023
95cde87
Merge branch 'run-llama:main' into main
Athe-kunal Nov 12, 2023
87e2807
Merge branch 'run-llama:main' into main
Athe-kunal Nov 14, 2023
96f4582
Bug fix for spoilers
Athe-kunal Nov 15, 2023
ae1c352
Merge branch 'run-llama:main' into main
Athe-kunal Nov 15, 2023
e2d1ac7
expected conditions and voting features
Athe-kunal Nov 15, 2023
623c83a
linting
Athe-kunal Nov 15, 2023
da41407
liniting checks
Athe-kunal Nov 15, 2023
89c5d72
readme updates
Athe-kunal Nov 15, 2023
d89043a
readme updates on metadata
Athe-kunal Nov 15, 2023
d2aeb96
Merge branch 'run-llama:main' into main
Athe-kunal Nov 26, 2023
a5d1722
Add earnings call transcript and update to imdb review
Athe-kunal Nov 26, 2023
779f181
satisfy the linting gods
Athe-kunal Nov 26, 2023
dad76e0
add to library.json
Athe-kunal Nov 26, 2023
29ef08c
Update library.json
Athe-kunal Nov 27, 2023
a467ac8
Merge branch 'run-llama:main' into main
Athe-kunal Nov 27, 2023
a47b364
Update library.json
Athe-kunal Nov 27, 2023
ad463c5
Merge branch 'run-llama:main' into main
Athe-kunal Nov 28, 2023
7adc270
name change for the nit
Athe-kunal Nov 28, 2023
a2a0a1a
Merge branch 'run-llama:main' into main
Athe-kunal Nov 28, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions llama_hub/earnings_call_transcript/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# EARNINGS CALL TRANSCRIPTS LOADER

This loader fetches the earnings call transcripts of US-based companies from the website [discountingcashflows.com](https://discountingcashflows.com/). It is not available for commercial purposes.

Install the required dependencies

```
pip install -r requirements.txt
```

The earnings call transcript loader takes in three arguments:

* Year
* Ticker symbol
* Quarter name from the list ["Q1","Q2","Q3","Q4"]

## Usage

```python
from llama_index import download_loader

EarningsCallTranscript = download_loader('EarningsCallTranscript')

loader = EarningsCallTranscript(2023,'AAPL','Q3')
docs = loader.load_data()
```

The metadata of the transcripts are the following

* ticker
* quarter
* date_time
* speakers_list

## Examples

#### Llama Index
```python
from llama_index import download_loader
from llama_index import VectorStoreIndex, download_loader

EarningsCallTranscript = download_loader('EarningsCallTranscript')

loader = EarningsCallTranscript(2023,'AAPL','Q3')
docs = loader.load_data()

index = VectorStoreIndex.from_documents(docs)
query_engine = index.as_query_engine()

response = query_engine.query(
"What was discussed about Generative AI?",
)
print(response)

```

#### Langchain

```python
from llama_index import download_loader
from llama_index import VectorStoreIndex
from langchain.agents import Tool
from langchain.agents import initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

EarningsCallTranscript = download_loader('EarningsCallTranscript')

loader = EarningsCallTranscript(2023,'AAPL','Q3')
docs = loader.load_data()
index = VectorStoreIndex.from_documents(docs)

tools = [
Tool(
name="LlamaIndex",
func=lambda q: str(index.as_query_engine().query(q)),
description="useful for questions about investor transcripts calls for a company. The input to this tool should be a complete english sentence.",
return_direct=True,
),
]
llm = ChatOpenAI(temperature=0)
agent = initialize_agent(
tools, llm, agent="conversational-react-description"
)
agent.run("What was discussed about Generative AI?")
```

14 changes: 14 additions & 0 deletions llama_hub/earnings_call_transcript/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from llama_hub.earnings_call_transcript.base import EarningsCallTranscript

from llama_hub.earnings_call_transcript.utils import (
get_earnings_transcript,
extract_speakers,
correct_date,
)

__all__ = [
"EarningsCallTranscript",
"get_earnings_transcript",
"extract_speakers",
"correct_date",
]
46 changes: 46 additions & 0 deletions llama_hub/earnings_call_transcript/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
from datetime import datetime
from typing import List

try:
from llama_hub.earnings_call_transcript.utils import get_earnings_transcript
except ImportError:
from utils import get_earnings_transcript


class EarningsCallTranscript(BaseReader):
    """Reader for earnings call transcripts of US-based companies.

    Fetches a single transcript from discountingcashflows.com for a given
    company ticker, year, and quarter, and wraps it in a llama-index
    ``Document`` with ticker/quarter/date/speaker metadata.
    """

    def __init__(self, year: int, ticker: str, quarter: str):
        """Get the earning call transcripts for a given company, in a given year and quarter

        Args:
            year (int): Year of the transcript
            ticker (str): ticker symbol of the stock
            quarter (str): quarter, one of "Q1", "Q2", "Q3", "Q4"

        Raises:
            AssertionError: if ``year`` is in the future or ``quarter`` is invalid.
        """
        # NOTE(review): assert is stripped under `python -O`; consider raising
        # ValueError instead for validation that must always run.
        curr_year = datetime.now().year
        assert year <= curr_year, "The year should be less than current year"

        assert quarter in [
            "Q1",
            "Q2",
            "Q3",
            "Q4",
        ], 'The quarter should from the list ["Q1","Q2","Q3","Q4"]'
        self.year = year
        self.ticker = ticker
        self.quarter = quarter

    def load_data(self) -> List[Document]:
        """Fetch the transcript and return it as a one-element document list.

        Returns:
            List[Document]: a single-document list; the document text is the
            transcript content and ``extra_info`` carries ticker, quarter,
            corrected date, and the list of speakers.
        """
        resp_dict, speakers_list = get_earnings_transcript(
            self.quarter, self.ticker, self.year
        )
        # Bug fix: the annotation promises List[Document] (and README callers
        # pass the result to VectorStoreIndex.from_documents), but the original
        # returned a bare Document. Wrap it in a list.
        return [
            Document(
                text=resp_dict["content"],
                extra_info={
                    "ticker": resp_dict["symbol"],
                    "quarter": "Q" + str(resp_dict["quarter"]),
                    "date_time": resp_dict["date"],
                    "speakers_list": speakers_list,
                },
            )
        ]
3 changes: 3 additions & 0 deletions llama_hub/earnings_call_transcript/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# API-calling dependencies
tenacity
requests
58 changes: 58 additions & 0 deletions llama_hub/earnings_call_transcript/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from tenacity import retry, stop_after_attempt, wait_random_exponential
import requests
import json
from datetime import datetime
import re
from typing import List


def correct_date(yr, dt):
    """Force the year component of a transcript timestamp to the actual year.

    Some transcripts carry an incorrect year in their date field; this
    normalizes it while leaving month/day/time untouched.

    Args:
        yr (int): actual year of the transcript
        dt (str): timestamp in "%Y-%m-%d %H:%M:%S" format

    Returns:
        str: the timestamp string with the year corrected
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.strptime(dt, fmt)
    corrected = parsed if parsed.year == yr else parsed.replace(year=yr)
    return corrected.strftime(fmt)


def extract_speakers(cont: str) -> List[str]:
    """Extract the unique list of speakers from a transcript.

    Speaker names appear at the start of a line followed by a colon
    (e.g. "\\nTim Cook: ...").

    Args:
        cont (str): transcript content

    Returns:
        List[str]: unique speaker names in order of first appearance
    """
    pattern = re.compile(r"\n(.*?):")
    matches = pattern.findall(cont)
    # dict.fromkeys dedupes while preserving first-occurrence order;
    # the original list(set(...)) produced a nondeterministic order
    # across runs due to hash randomization.
    return list(dict.fromkeys(matches))


@retry(wait=wait_random_exponential(min=1, max=5), stop=stop_after_attempt(2))
def get_earnings_transcript(quarter: str, ticker: str, year: int):
    """Fetch an earnings call transcript from discountingcashflows.com.

    Retried up to 2 attempts with random exponential backoff on any raised
    exception (including HTTP errors surfaced by ``raise_for_status``).

    Args:
        quarter (str): quarter, one of "Q1", "Q2", "Q3", "Q4"
        ticker (str): ticker symbol of the stock
        year (int): year of the transcript

    Returns:
        tuple: (transcript dict with a corrected "date" field,
                list of speaker names)

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    response = requests.get(
        f"https://discountingcashflows.com/api/transcript/{ticker}/{quarter}/{year}/",
        auth=("user", "pass"),
        # requests has no default timeout; without one a stalled connection
        # hangs forever instead of triggering the retry decorator.
        timeout=30,
    )
    # Fail fast on HTTP errors (4xx/5xx) with a clear exception rather than
    # letting the JSON parse below fail with a confusing decode error.
    response.raise_for_status()

    resp_text = response.json()
    speakers_list = extract_speakers(resp_text[0]["content"])
    corrected_date = correct_date(resp_text[0]["year"], resp_text[0]["date"])
    resp_text[0]["date"] = corrected_date
    return resp_text[0], speakers_list
3 changes: 3 additions & 0 deletions llama_hub/imdb_review/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def __init__(
generate_csv: bool = False,
multithreading: bool = False,
max_workers: int = 0,
reviews_folder: str = "movie_reviews",
):
"""Get the IMDB reviews of a movie

Expand All @@ -35,6 +36,7 @@ def __init__(
self.generate_csv = generate_csv
self.multithreading = multithreading
self.max_workers = max_workers
self.reviews_folder = reviews_folder

def load_data(self) -> List[Document]:
"""scrapes the data from the IMDB website movie reviews
Expand All @@ -57,6 +59,7 @@ def load_data(self) -> List[Document]:
self.generate_csv,
self.multithreading,
self.max_workers,
self.reviews_folder,
)

all_docs = []
Expand Down
5 changes: 3 additions & 2 deletions llama_hub/imdb_review/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ def main_scraper(
generate_csv: bool = False,
multithreading: bool = False,
max_workers: int = 0,
reviews_folder: str = "movie_reviews",
):
"""The main helper function to scrape data

Expand Down Expand Up @@ -225,7 +226,7 @@ def main_scraper(

print(f"Number of reviews scraped: {len(reviews_date)}")
if generate_csv:
os.makedirs("movie_reviews", exist_ok=True)
os.makedirs(reviews_folder, exist_ok=True)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to make this change in this PR? if there's imdb changes let's make that in a sep PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I contributed imdb_scraper sometime back, but this was just a minor change that does not change much. For making it a separate PR, do I have to undo the changes and commit again, or is there any other way? Sorry if this is a stupid question

df = pd.DataFrame(
columns=[
"review_date",
Expand All @@ -246,7 +247,7 @@ def main_scraper(
df["review_helpful"] = reviews_found_helpful
df["review_total_votes"] = reviews_total_votes
df["reviews_if_spoiler"] = reviews_if_spoiler
df.to_csv(f"movie_reviews/{movie_name}.csv", index=False)
df.to_csv(f"{reviews_folder}/{movie_name}.csv", index=False)

return (
reviews_date,
Expand Down
11 changes: 10 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -1086,5 +1086,14 @@
"web",
"web reader"
]
},
"EarningsCallTranscript":{
"id":"earnings_call_transcript",
"author": "Athe-kunal",
"keywords": [
"Finance",
"Investor",
"Earning calls"
]
}
}
}