This notebook contains material from CBE40455-2020; content is available on Github.
This notebook presents methods for modeling a financial time series as geometric Brownian motion. The basic outline is to:
The name Brownian motion (or Brownian movement) is a tribute to Sir Robert Brown, the Scottish botanist who, in 1827, reported the random motion of pollen grains on the surface of water when viewed under a microscope.
The explanation of that behavior waited for the genius of Albert Einstein. In the Annus mirabilis of 1905, while employed as a patent clerk and living in a modest apartment in Bern, Einstein published papers describing Special Relativity, laid the foundation for quantum theory with a paper on the photoelectric effect, and demonstrated the existence of atoms and molecules with a paper on Brownian Motion.
Remarkably, five earlier Louis Bachelier published his Master's thesis on the "Theory of Speculation". While this study was limited to the dynamics of prices on the Paris Bourse, and therefore didn't have the profound implications for Physics of Einstein's forthcoming work, nevertheless Bachelier should be credited with introducing random motion to describe price dynamics. Unfortunately, this work laid in relative obscurity for decades.
Other figures in this intellectual history include the Japanese Kiyosi Ito whose work in the difficult circumstances of the second World War laid a foundation for stochastic calculus. Later, the eccentric Norbert Weiner established a [theory for random motion -- the Wiener process -- now widely used in engineering and finance.
The colorful history of individual genius and iconclastic research doesn't end there, but it is enough to provide some understanding behind the terminology that will be introduced below.
The pandas-datareader
package provides a utility for accessing on-line data sources of data. Since the interfaces to those data sources are constantly changing, the next cell updates any current installation of the data reader to the latest available version.
%%capture
#!pip install pandas_datareader --upgrade
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import datetime
import pandas as pd
import pandas_datareader as pdr
# python libraray for accessing internet resources
import requests
def lookup_yahoo(symbol):
"""Return a list of all matches for a symbol on Yahoo Finance."""
url = f"http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={symbol}®ion=1&lang=en"
return requests.get(url).json()["ResultSet"]["Result"]
def get_symbol(symbol):
"""Return exact match for a symbol."""
result = [r for r in lookup_yahoo(symbol) if symbol == r['symbol']]
return result[0] if len(result) > 0 else None
symbol = 'AAPL'
# get symbol data
symbol_data = get_symbol(symbol)
print(symbol_data)
assert symbol_data, f"Symbol {symbol} wasn't found."
# end date is today
end = datetime.datetime(2018, 8, 30).date()
start = end-datetime.timedelta(3*365)
# get stock price data
S = pdr.data.DataReader(symbol, "yahoo", start, end)['Adj Close']
rlin = (S - S.shift(1))/S.shift(1)
rlog = np.log(S/S.shift(1))
# clean up data
rlin = rlin.dropna()
rlog = rlog.dropna()
# plot data
plt.figure(figsize=(10,6))
plt.subplot(3,1,1)
title = f"{symbol_data['name']} ({symbol_data['exchDisp']} {symbol_data['typeDisp']} {symbol_data['symbol']})"
S.plot(title=title)
plt.ylabel('Adjusted Close')
plt.grid()
plt.subplot(3,1,2)
rlin.plot()
plt.title('Linear Returns (daily)')
plt.grid()
plt.tight_layout()
plt.subplot(3,1,3)
rlog.plot()
plt.title('Log Returns (daily)')
plt.grid()
plt.tight_layout()
A basic assumption in developing developing stochastic price models is that the residuals are indepdendent and identically distributed (i.i.d.) random variates. Here we show the results of several common statistical tests that would screen out non-i.i.d. random variates.
bins = np.linspace(-0.12,0.10,50)
plt.figure(figsize=(10,5))
plt.subplot(2,2,1)
rlog.hist(bins=bins, density=True, color='b', alpha=0.5)
plt.xlim(bins.min(),bins.max())
plt.title('Log Returns')
plt.subplot(2,2,3)
rlog.hist(bins=bins, density=True, cumulative=True, color='b',alpha=0.5)
plt.xlim(bins.min(),bins.max())
plt.subplot(2,2,2)
rlin.hist(bins=bins, density=True, color='y', alpha=0.5)
plt.xlim(bins.min(),bins.max())
plt.title('Linear Returns')
plt.subplot(2,2,4)
rlin.hist(bins=bins, density=True, cumulative=True, color='y',alpha=0.5)
plt.xlim(bins.min(),bins.max())
plt.tight_layout()
from scipy.stats import norm
k = int(len(rlog)/2)
r = np.linspace(rlog.min(),rlog.max())
plt.figure()
param = norm.fit(rlog[:k])
rlog[:k].hist(bins=r, density=True, alpha=0.35, color='r')
plt.plot(r,norm.pdf(r,loc=param[0],scale=param[1]),'r-',lw=3);
rlog[k:].hist(bins=r, density=True, alpha=0.35, color='c')
param = norm.fit(rlog[k:])
plt.plot(r,norm.pdf(r, loc=param[0], scale=param[1]), 'c-',lw=3);
plt.legend(['rLog[:k]', 'rLog[k:]'])
plt.title('Change in Distribution of Log Returns')
norm.fit(rlog[:k].dropna())
plt.plot(rlog[0:-1], rlog[1:],'.')
plt.axis('equal');
plt.xlabel('$r^{log}_{t}$')
plt.ylabel('$r^{log}_{t+1}$')
plt.grid()
plt.title('Lag Plot for Log Returns');
i.i.d. ==> Independent and Identically Distributed
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(rlog, lags=min(30, len(rlog)));
plt.xlabel('Lag');
plot_pacf(rlog, lags=min(30, len(rlog)));
plt.xlabel('Lag');
from scipy.stats import norm
from statsmodels.graphics.gofplots import qqplot
r = np.linspace(rlog.min(), rlog.max())
plt.figure()
param = norm.fit(rlog)
nu = param[0]
sigma = param[1]
print(nu, sigma)
rlog.hist(bins=int(1.5*np.sqrt(len(rlog))), density=True,alpha=0.4)
plt.plot(r, norm.pdf(r, loc=param[0], scale=param[1]), 'r-', lw=3)
plt.figure()
qqplot(rlog, line='q');
from scipy.stats import t
from statsmodels.graphics.gofplots import qqplot
r = np.linspace(rlog.min(), rlog.max())
plt.figure()
param = t.fit(rlog)
print(param)
dof = param[0]
nu = param[1]
sigma = param[2]
rlog.hist(bins=int(1.5*np.sqrt(len(rlog))), density=True, alpha=0.4)
#plt.plot(r, t.pdf(r, loc=param[0], scale=param[1]), 'r-', lw=3)
plt.figure()
qqplot(rlog, t, distargs=(4,), loc=nu, scale=sigma, line='q');
The basic notion behind this class of models is to recognize the return at each point in time, for example,
$$\frac{S_{k+1} - S_k}{S_k} = r^{lin}_{k+1}$$can be expressed as the result of a random process.
$$r^{lin}_{k+1} = \mu\Delta t + \sigma \sqrt{\Delta t}Z_{k+1}$$where $Z_{k+1}$ comes from a Normal distribution with zero mean and a standard deviation of 1.
A discrete-time model for prices modeled as geometric Brownian motion is given by
$$S_{k+1} = S_k + \mu S_k \Delta t + \sigma S_k \sqrt{\Delta t} Z_k$$where $Z_k \sim N(0,1)$ and $\Delta t$ corresponds to a sampling period, typically a trading period. There are normally 252 trading days in a calendar year, 63 trading days in a quarter, and 21 trading days in a month.
Defining the linear return as
$$r^{lin}_{k} = \frac{S_k - S_{k-1}}{S_{k-1}} = \mu \Delta t + \sigma \sqrt{\Delta t} Z_k$$then the statistical model for linear returns becomes
$$r^{lin}_{k} = \mu \Delta t + \sigma \sqrt{\Delta t} Z_k$$This shows, for the case of Geometric Brownian Motion, $r^{lin}_k$ is a random variable drawn from a the normal distribution
$$r^{lin}_k \sim N(\mu \Delta t, \sigma\sqrt{\Delta t})$$Alternatively, geometric Brownian motion for prices can be modeled using the natural logarithm of price,
$$\ln S_{k+1} = \ln S_k + \nu \Delta t + \sigma \sqrt{\Delta t} Z_k$$where, as for linear returns, $Z_k \sim N(0,1)$ and $\Delta t$ corresponds to a sampling period. The relationship between linear and log returns is given by
$$\nu \approx \mu - \frac{\sigma^2}{2}$$where $\frac{\sigma^2}{2}$ is the 'volatility drag' on linear returns. Defining log return as
$$r^{log}_k = \ln S_k - \ln S_{k-1} = \nu \Delta t + \sigma \sqrt{\Delta t} Z_k$$the statistical model for log returns becomes
\begin{align*} r^{log}_{k} & = \nu \Delta t + \sigma \sqrt{\Delta t} Z_k \\ & \sim N(\nu \Delta t, \sigma\sqrt{\Delta t}) \end{align*}This shows, for the case of Geometric Brownian Motion, $r^{log}_k$ is a random variable drawn from a the normal distribution. The following cells is a complete self-contained demonstration of downloading a data series, fitting a GBM price model, and performing simulations. The first cell loads a data series, computes linear and log returns, and estimates values for $\mu$, $\nu$, and $\sigma$.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from pandas_datareader import data, wb
import requests
def get_symbol(symbol):
"""
get_symbol(symbol) uses Yahoo to look up a stock trading symbol and
return a description.
"""
url = "http://d.yimg.com/autoc.finance.yahoo.com/autoc?query={}®ion=1&lang=en".format(symbol)
result = requests.get(url).json()
for x in result['ResultSet']['Result']:
if x['symbol'] == symbol:
return x['name']
symbol = 'X'
# end date is today
end = datetime.datetime.today().date()
start = end-datetime.timedelta(3*365)
# get stock price data
S = data.DataReader(symbol,"yahoo",start,end)['Adj Close']
rlin = (S - S.shift(1))/S.shift(1)
rlog = np.log(S/S.shift(1))
rlin = rlin.dropna()
rlog = rlog.dropna()
print('Linear Returns')
mu,sigma = norm.fit(rlin)
print(f' mu = {mu:12.8f} (annualized = {100*252*mu:.2f}%)')
print(f'sigma = {0:12.8f} (annualized = {1:.2f}%)'.format(sigma,100*np.sqrt(252)*sigma))
print()
print('Log Returns')
nu,sigma = norm.fit(rlog)
print(' nu = {0:12.8f} (annualized = {1:.2f}%)'.format(nu,100*252*nu))
print('sigma = {0:12.8f} (annualized = {1:.2f}%)'.format(sigma,100*np.sqrt(252)*sigma))
The second cell performs $N$ simulations over a time period $T$, and plots the results with the historical data.
from scipy.stats import norm
N = 1000
T = 63
dt = 1
plt.figure(figsize=(10,4))
plt.plot(S.values)
plt.title(get_symbol(symbol))
plt.xlabel('Trading Days')
Slog = [] # log of final values
for n in range(0,N):
P = S[-1] # returns the last price in the sequence
k = len(S)
Plog = []
tlog = []
for t in range(len(S)+1,len(S)+T+1):
Z = norm.rvs()
P += P*(mu*dt + sigma*np.sqrt(dt)*Z)
Plog.append(P)
tlog.append(t)
plt.plot(tlog,Plog,'b.',ms=0.4,alpha=0.5)
Slog.append(P)
plt.grid()
from scipy.stats import lognorm
plt.figure(figsize=(10, 4))
nbins = min(100, int(1.5*np.sqrt(N)))
plt.hist(Slog, bins=nbins, density=True, alpha=0.4, color='b');
shape, loc, scale = lognorm.fit(Slog, floc=0)
print(shape, loc, scale)
x=np.linspace(0, max(Slog), 100)
pdf_fitted = lognorm.pdf(x, shape, loc=loc, scale=scale) # fitted distribution
plt.plot(x, pdf_fitted, 'b-', lw=3)
plt.xlabel('Final Price')
plt.ylabel('Probability');
plt.title(get_symbol(symbol))
plt.grid()