Import Packages

In [7]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import norm
from urllib.request import urlopen
from scipy.optimize import minimize
from sklearn.neighbors import KernelDensity
from mpl_toolkits.mplot3d import Axes3D

import warnings
warnings.filterwarnings('ignore')# Presentation - OLS

Declare Functions

In [8]:
def yahoo_parser(url):
    response = urlopen(url)
    data = json.loads(response.read().decode("utf-8"))['chart']['result'][0]
    t = pd.to_datetime(data['timestamp'], unit='s').date
    c = data['indicators']['quote'][0]['close']
    return pd.DataFrame({'close': c}, index=t)

def default_parser(url):
    return pd.read_csv(url, index_col=0, parse_dates=True)

def retrieve(source, code, label, column=None):
    query = {
        'yahoo': 'https://query1.finance.yahoo.com/v8/finance/chart/{}?period1=0&period2=5000000000&interval=1d',
    }
    parser = {
        'yahoo': yahoo_parser,
    }
    url = query.get(source).format(code)
    df = parser.get(source, default_parser)(url)
    df.index.name = None
    df.columns = [col.lower() for col in df.columns] if df.shape[1] > 1 else [label]
    if not column: column = df.columns
    all_dates = pd.date_range(df.index.min(), df.index.max(), freq='D')
    df = df.replace('.', None).reindex(all_dates).fillna(method='ffill')
    return df.astype('float')[column].squeeze()

def calc_call_premium(K, T, sigma):
    sigma = np.where(sigma>0, sigma, 1e-4)
    d1 = (np.log(1/K) + sigma**2 / 2 * T) / (sigma * np.sqrt(T))
    d2 = (np.log(1/K) - sigma**2 / 2 * T) / (sigma * np.sqrt(T))
    c = norm.cdf(d1) - K * norm.cdf(d2)
    return c
In [9]:
# load yahoo data
raw_df = retrieve('yahoo', '^GSPC', 'SPX').to_frame()
In [10]:
# remove all weekends
df = raw_df.copy()
df = df.reindex(pd.date_range(start="1970-01-02", end="2018-12-31", freq='B'))

# take the log price
df['log_SPX'] = np.log(df["SPX"])
# compute daily log return
df['r'] = df['log_SPX'].diff()

term_days = [5] + list(range(20, 521, 20))
term_name = [str(i) + "D" for i in term_days]
terms = dict(zip(term_name, term_days))

# calculate rolling period return
for k, v in terms.items():
    df[k + '_ret'] = df["r"].rolling(v).sum()
    df['vol_' + k] = df["r"].rolling(v).std() * np.sqrt(260)
    df[k + '_ret'] = df[k + '_ret'] - np.log(np.exp(df[k + '_ret']).mean())

Fit Volatility Surface - Least Sequare Method

In [11]:
def calc_density(x, r, T, params, delta_x=0.001, method='linear'):
    """
    Approximate the density of x = log(K) using butterfly
    :param x: float / np.array(dtype=float), log return
    :param r: float
    :param params: array-like object, 
        linear: a, b
        quad: a, b, c
        quad_diff: a, b, c, a2
    :delta_x: float, default 0.001
    :method: str, the analytical form of volatility function, can be 'linear'/'quad'/'quad_diff'/'svi'
    
    :return: float / np.array(dtype=float), a series of density
    """
    # params: a, b, c, a2
    if method == 'linear' and len(params) == 2:
        x1, x2, a, b = x + np.log(1 + delta_x), x + np.log(1 - delta_x), params[0], params[1]
        sigma1, sigma, sigma2 = a * x1 + b,\
                                a * x + b,\
                                a * x2 + b
    elif method == 'quad' and len(params) == 3:
        x1, x2, a, b, c = x + np.log(1 + delta_x), x + np.log(1 - delta_x), params[0], params[1], params[2]
        sigma1, sigma, sigma2 = a * x1**2 + b * x1 + c,\
                                a * x**2 + b * x + c,\
                                a * x2**2 + b * x2 + c
    elif method == 'quad_diff' and len(params) == 4:
        x1, x2, a, b, c, a2 = x + np.log(1 + delta_x), x + np.log(1 - delta_x), params[0], params[1], params[2], params[3]
        sigma1, sigma, sigma2 = a * x1**2 * (x1 >= 0) + a2 * x1**2 * (x1 < 0) + b * x1 + c,\
                                a * x**2 * (x >= 0) + a2 * x**2 * (x < 0) + b * x + c,\
                                a * x2**2 * (x2 >= 0) + a2 * x2**2 * (x2 < 0) + b * x2 + c
    else:
        raise Exception("Wrong param dimension or fit method!")

    f = np.exp(r * T) * (calc_call_premium(np.exp(x1), T, sigma1)
                         - 2 * calc_call_premium(np.exp(x), T, sigma)
                         + calc_call_premium(np.exp(x2), T, sigma2)) / (np.exp(x) * delta_x**2)
    return f

def obj_func_least_square(params, r_series, r_density, term_day, method):
    """
    Compute mean of squared errors (avg(SSE)) given the true density
    :param params: parameters in volatility function
    :r_series: array-like, the input in the volatility function, used to predict density
    :r_density: array-like, true density
    :term_day: int, T
    :method: str, the analytical form of volatility function
    """
    density = calc_density(r_series, r=0, T=term_day/260, params=params, delta_x=0.001, method=method)
    return np.mean((r_density - density)**2)
In [12]:
def full_model(terms, df, plot=True, method='ols'):
    """
    Fit the vol curve for all terms
    :param terms: dict, {'term_str': term_day}
    :param df: pd.DataFrame, index: dates; columns: rolling period return, columns names ["20D_ret", ...]
    :param plot: bool, whether to plot the fitted density of x=logK
    :param method: 'ols' / 'mle'
    :return result: pd.DataFrame, index: terms.keys, values, optimal parameters in volatility function
    """
    coef_linear_list = []
    coef_quad_list = []
    coef_quad_diff_list = []
    coef_svi_list = []
    if plot:
        fig = plt.figure(figsize=(18, len(terms) * 6))
        index = 1
    for term_str, term_day in terms.items():
        raw_series = df[term_str+'_ret'].dropna()
        r_series = raw_series.copy()  # fit kernel density
        raw_series = np.array(sorted(raw_series))

        avg, std = r_series.mean(), r_series.std()
        bandwidth = np.round(1.06 * std * (len(r_series))**(-0.2), 4)
        kde = KernelDensity(bandwidth = bandwidth)
        kde.fit(np.array(r_series).reshape(-1, 1))
        r_series = np.arange(avg - 5 * std, avg + 5 * std, 0.2 * std)
        r_density = np.exp(kde.score_samples(r_series.reshape(-1, 1)))
        tmp = minimize(obj_func_least_square, (-1, 1), args=(r_series, r_density, term_day, 'linear'), method="BFGS")
        coef_linear_list.append(tmp.x)
        linear_fit = np.round(calc_density(raw_series, r=0, T=term_day/260, params=tmp.x, delta_x=0.001,
                                           method='linear'), 4)
        tmp = minimize(obj_func_least_square, (1, -1, 1), args=(r_series, r_density, term_day, 'quad'), method="BFGS")
        coef_quad_list.append(tmp.x)
        quad_fit = np.round(calc_density(raw_series, r=0, T=term_day/260, params=tmp.x, delta_x=0.001,
                                         method='quad'), 4)
        tmp = minimize(obj_func_least_square, (1, -1, 1, 1), args=(r_series, r_density, term_day, 'quad_diff'),
                       method="BFGS")
        coef_quad_diff_list.append(tmp.x)
        quad_diff_fit = np.round(calc_density(raw_series, r=0, T=term_day/260, params=tmp.x, delta_x=0.001,
                                              method='quad_diff'), 4)
        if plot:  # plotting results
            plt.subplot(len(terms), 3, index)
            plt.hist(raw_series, bins=300, density=True, color='grey')
            plt.plot(raw_series, linear_fit, linewidth=2, color='salmon')
            plt.plot(raw_series, quad_fit, linewidth=2, color='skyblue')
            plt.plot(raw_series, quad_diff_fit, linewidth=2, color='burlywood')
            plt.legend(['linear fit', 'quad fit', 'quad_diff fit', 'empirical density'], loc='upper left')
            plt.title(term_str+" fitted density")
            index += 1

    result = pd.concat([pd.DataFrame(coef_linear_list), pd.DataFrame(coef_quad_list),
                        pd.DataFrame(coef_quad_diff_list)], axis=1)
    result.columns=['linear_a', 'linear_b', 'quad_a', 'quad_b', 'quad_c', 'quad_diff_a',
                    'quad_diff_b', 'quad_diff_c', 'quad_diff_a2']
    result.index = list(terms.keys())
    return result
In [13]:
ols_result = full_model(terms, df, method='ols')
In [14]:
ols_result
Out[14]:
linear_a linear_b quad_a quad_b quad_c quad_diff_a quad_diff_b quad_diff_c quad_diff_a2
5D -0.384758 0.136671 17.139927 -0.471025 0.147748 22.628546 -0.484432 0.147474 11.008452
20D -0.275135 0.129910 4.340637 -0.337145 0.139920 6.185233 -0.348405 0.140706 3.251778
40D -0.186361 0.131342 2.015341 -0.228069 0.140659 1.924107 -0.227141 0.140432 2.012410
60D -0.167698 0.128794 1.510926 -0.205523 0.138557 1.486953 -0.205046 0.138452 1.508781
80D -0.115145 0.131771 1.051253 -0.143463 0.141839 0.801218 -0.139618 0.140849 1.104362
100D -0.066928 0.129410 0.986380 -0.087550 0.140559 0.760719 -0.084565 0.139564 1.027468
120D -0.093013 0.131620 0.801029 -0.118347 0.142057 1.086626 -0.123450 0.143141 0.689021
140D -0.103167 0.135051 0.648174 -0.129623 0.145092 0.961173 -0.135407 0.146501 0.509531
160D -0.126710 0.135048 0.552229 -0.156170 0.144891 0.902756 -0.163932 0.146491 0.340358
180D -0.136048 0.134943 0.490000 -0.166140 0.144471 0.795912 -0.175711 0.147043 0.364189
200D -0.152693 0.135995 0.411233 -0.183025 0.144967 0.743591 -0.192843 0.147483 0.205254
220D -0.153306 0.135832 0.377625 -0.184945 0.144608 0.704113 -0.194929 0.146872 0.142553
240D -0.161936 0.136732 0.329470 -0.192691 0.145179 0.623727 -0.204787 0.148568 0.172904
260D -0.157064 0.137885 0.308360 -0.187172 0.146230 0.590038 -0.200464 0.150050 0.181977
280D -0.155566 0.139030 0.269324 -0.183774 0.147166 0.494934 -0.196109 0.150965 0.194725
300D -0.152914 0.139729 0.249631 -0.181678 0.148027 0.471397 -0.193989 0.151875 0.158885
320D -0.148770 0.141261 0.223580 -0.175895 0.149468 0.409354 -0.186999 0.152958 0.146455
340D -0.138989 0.143015 0.203267 -0.164246 0.151092 0.301910 -0.170956 0.153179 0.168636
360D -0.132516 0.145082 0.173954 -0.154477 0.152595 0.171216 -0.154288 0.152551 0.176651
380D -0.126110 0.145845 0.156008 -0.145764 0.153000 0.098585 -0.141238 0.151766 0.180744
400D -0.112800 0.145669 0.157720 -0.130467 0.153138 0.033660 -0.120156 0.150132 0.196417
420D -0.098359 0.145004 0.166167 -0.114116 0.153022 -0.032628 -0.096958 0.147934 0.217661
440D -0.088220 0.143348 0.170898 -0.102314 0.151725 -0.070142 -0.081066 0.145311 0.227623
460D -0.080602 0.140426 0.180126 -0.093342 0.149221 -0.056917 -0.072535 0.142461 0.229911
480D -0.073698 0.137796 0.186277 -0.085300 0.146717 -0.020837 -0.067833 0.140687 0.226454
500D -0.072350 0.135853 0.185735 -0.083703 0.144777 0.011833 -0.068722 0.139391 0.218251
520D -0.069407 0.133100 0.188556 -0.081356 0.141961 0.073295 -0.071619 0.138367 0.208655
In [15]:
def plot_result(terms, full_model_output):
    df = full_model_output
    # plot the vol curve seperately, x_axis: K
    col_names = full_model_output.columns.tolist()

    fig = plt.figure(2, figsize=(18, len(terms) * 6))
    index = 1
    for term_str, term_day in terms.items():
        plt.subplot(len(terms), 3, index)
        x = np.linspace(-0.8, 0.7, 41)
        K = np.exp(x)
        params = df.loc[term_str, :].values.tolist()
        if 'linear_a' in col_names:
            a, b = params[0:2]
            plt.plot(K, a*x+b, color='salmon')
            a, b, c = params[2:5]
            plt.plot(K, a*x**2 + b*x + c, color='skyblue')
            a, b, c, a2 = params[5:9]
            plt.plot(K, (a*(x>=0)+a2*(x<0))*x**2+b*x+c, color='burlywood')
            plt.legend(['linear', 'quad', 'quad diff'])
            plt.xlabel('moneyness(K/S)')
            plt.ylabel('volatility')
            plt.title(term_str+" volatility plot against moneyness")
        index += 1
    return
In [16]:
plot_result(terms, ols_result)