
Bitcoin Inter-exchange Price Spread Prediction


2017-05-21

Price prediction is thought to be very difficult for any asset; the price spread between exchanges, however, may not be. We look into the daily spread between the highest and lowest close prices across eleven of the largest Bitcoin exchanges, including Bitstamp and BTC-E.
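As a minimal sketch of the quantity we will model (made-up numbers here; the real data are pulled from Quandl below), the spread on each day is simply the gap between the highest and lowest close across exchanges:

import pandas as pd

close = pd.DataFrame({'BITSTAMP': [230.1, 231.5, 229.8],  # hypothetical USD closes
                      'BTCE': [228.9, 230.2, 230.4]})
spread = close.max(axis=1) - close.min(axis=1)  # daily inter-exchange price spread
print(spread)  # 1.2, 1.3, 0.6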

Step 1: Import All Packages

import quandl
import cmocean
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from pytrends.request import TrendReq
from scipy import stats as ss
from datetime import datetime as dt
from arch.unitroot import PhillipsPerron, ADF
from matplotlib import pyplot as plt
from matplotlib import dates as mdates
from statsmodels.graphics.api import qqplot
pytrend = TrendReq('[email protected]', 'aaaaaaaaaa', hl='en-US', tz=360, custom_useragent=None)
warnings.filterwarnings('ignore')
quandl.ApiConfig.api_key = 'aaaaaaaaaaaaaaa'
pd.set_option("display.width",999)
plt.rcdefaults()
plt.style.use('seaborn-paper')
plt.rcParams['axes.facecolor']='w'
plt.rcParams['axes.grid']=False
plt.rcParams['font.family'] = 'serif'
plt.rcParams['figure.figsize'] = [12, 4]

Step 2: Data Retrieval and Cleaning

Data source: https://www.quandl.com/data/.

exchangelist = [
    'BTCE',
    'KRAKEN',
    'BITFINEX',
    'BITSTAMP',
#     'COINBASE',
    'ROCK',
    'HITBTC',
    'ITBIT',
    'BITBAY',
    'BITKONAN',
    'CBX',
    'VCX'
]
n = len(exchangelist)
print('there are {} exchanges in total'.format(n))
there are 11 exchanges in total
exchanges = [quandl.get('BCHARTS/' + e + 'USD') for e in exchangelist]
for i in range(n):
    daterange = exchanges[i].index.tolist()
    print('{:>8}'.format(exchangelist[i]), daterange[0].date(), daterange[-1].date())
    BTCE 2011-08-14 2017-05-19
  KRAKEN 2014-01-07 2017-05-19
BITFINEX 2013-03-31 2016-12-22
BITSTAMP 2011-09-13 2017-05-19
    ROCK 2011-11-12 2017-05-19
  HITBTC 2013-12-27 2017-05-19
   ITBIT 2013-08-25 2017-05-19
  BITBAY 2014-05-16 2017-05-19
BITKONAN 2013-07-02 2017-05-19
     CBX 2011-07-05 2017-04-16
     VCX 2011-12-25 2017-05-19
start_date = pd.to_datetime('2014-07-01')
mid_date = pd.to_datetime('2016-01-01')
end_date = pd.to_datetime('2016-07-01')
price = pd.concat([e.ix[start_date:end_date,'Close'] for e in exchanges], join='outer', axis=1)
volume = pd.concat([e.ix[start_date:end_date,'Volume (Currency)'] for e in exchanges], join='outer', axis=1)
price.columns = exchangelist
volume.columns = exchangelist
price = price[price > 0].fillna(method='ffill').fillna(method='bfill')  # treat non-positive values as missing, then fill
volume = volume[volume > 0].fillna(method='ffill').fillna(method='bfill')
print(price.describe())
             BTCE      KRAKEN    BITFINEX    BITSTAMP        ROCK      HITBTC       ITBIT      BITBAY    BITKONAN         CBX         VCX
count  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000  732.000000
mean   359.045682  363.799398  363.647027  363.204795  363.963907  373.581913  362.888860  367.052036  369.847240  367.836366  388.325161
std    119.611681  122.473245  122.182031  122.020091  121.643524  115.354405  121.729241  125.084653  123.984738  119.057448   94.745641
min    170.000000  175.000000  182.000000  171.410000  185.000000  186.760000  183.160000  186.890000  171.030000  200.040000  210.000001
25%    245.123250  249.669390  249.235000  249.047500  249.887500  272.900000  249.252500  253.750000  254.972500  254.757500  312.249915
50%    349.933000  358.429990  353.825000  356.495000  355.015000  357.610000  353.475000  359.995000  360.765000  365.005000  380.000000
75%    429.812500  430.136262  433.410000  432.535000  433.282500  438.695000  432.302500  435.082500  442.192500  432.992500  425.000000
max    729.855000  765.490000  769.500000  766.620000  767.800000  768.590000  767.180000  999.000000  771.670000  690.000000  658.000000

Step 3: Data Analysis

%config InlineBackend.figure_format = 'retina'
# idxmin/idxmax give the name of the cheapest/dearest exchange each day
min_exchange = pd.Series([price.iloc[i].idxmin() for i in range(len(price))],
                         index=price.index).apply(lambda fn: exchangelist.index(fn))
max_exchange = pd.Series([price.iloc[i].idxmax() for i in range(len(price))],
                         index=price.index).apply(lambda fn: exchangelist.index(fn))
dmin = price.min(axis=1)
dmax = price.max(axis=1)

thermal = cmocean.cm.thermal

plt.close()
fig = plt.figure()

ax = fig.add_subplot(111)
for i in range(n):
    # zero-length stub so every exchange gets a legend entry
    ax.plot((price.index[0], price.index[0]), (0, 0), c=thermal(i/n), label=exchangelist[i])
for i in range(len(price)):
    # draw the day's lowest price downwards and highest upwards, colored by exchange
    ax.plot((price.index[i], price.index[i]), (0, -dmin[i]), lw=1.5, c=thermal(min_exchange[i]/n))
    ax.plot((price.index[i], price.index[i]), (0, dmax[i]), lw=1.5, c=thermal(max_exchange[i]/n))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
ax.set_ylim(-1500,1500)

plt.legend(loc='lower center', frameon=0, ncol=n//2)
plt.tight_layout()
plt.savefig('price1.pdf')

plt.show()
active_price = price * (price.diff() != 0)  # zero out days where the quote did not move at all
active_price = active_price[active_price > 0]  # treat those stale quotes as missing
err_count = np.isnan(active_price).sum()  # number of "error days" (stale quotes) per exchange
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()

ax1 = fig.add_subplot(111)
(100*max_exchange.value_counts().sort_index().reindex(range(len(exchangelist))).fillna(0)/len(price)).plot.bar(ax=ax1, color='c', align='center', alpha=.5, label='% being highest')
(100*min_exchange.value_counts().sort_index().reindex(range(len(exchangelist))).fillna(0)/len(price)).plot.bar(ax=ax1, color='r', align='center', alpha=.5, label='% being lowest')
ax1.set_ylim(-5,80)
ax1.set_ylabel('%')
plt.xticks(range(len(err_count)), err_count.index.tolist(), rotation=0)
ax1.legend(loc='upper left', frameon=0)

ax2 = ax1.twinx()
err_count.plot(ax=ax2, marker='o', color='r', alpha=.5, label='total number of error days')
ax2.set_xlim(-.5,len(err_count)-.5)
ax2.set_xticks(range(len(err_count)))
ax2.set_xticklabels(err_count.index.tolist())
ax2.legend(loc='upper right', frameon=0)
ax2.set_ylim(-35,500)

# plt.grid(None)
plt.tight_layout()
plt.show()
irr_exchanges = [10,9,8,7,5,0]  # drop exchanges with over 200 error days, or that dominate the highest/lowest price for over 10% of the sample
n -= len(irr_exchanges)
active_price.drop(active_price.columns[irr_exchanges], axis=1, inplace=True)
for i in irr_exchanges:  # indices in descending order, so earlier pops don't shift later ones
    exchangelist.pop(i)
%config InlineBackend.figure_format = 'retina'
min_exchange = pd.Series([active_price.iloc[i].idxmin() for i in range(len(active_price))], index=active_price.index).apply(lambda fn: exchangelist.index(fn))
dmin = active_price.min(axis=1)
max_exchange = pd.Series([active_price.iloc[i].idxmax() for i in range(len(active_price))], index=active_price.index).apply(lambda fn: exchangelist.index(fn))
dmax = active_price.max(axis=1)

thermal = cmocean.cm.thermal

plt.close()
fig = plt.figure()

ax = fig.add_subplot(111)
for i in range(n):
    ax.plot((active_price.index[0], active_price.index[0]), (0, 0), c=thermal(i/n), label=exchangelist[i])
for i in range(len(active_price)):
    ax.plot((active_price.index[i], active_price.index[i]), (0, -dmin[i]), lw=1.5, c=thermal(min_exchange[i]/n))
    ax.plot((active_price.index[i], active_price.index[i]), (0, dmax[i]), lw=1.5, c=thermal(max_exchange[i]/n))
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
ax.set_ylim(-1500,1500)

plt.legend(loc='lower center', frameon=0, ncol=n)
plt.tight_layout()
plt.savefig('price2.pdf')

plt.show()
%config InlineBackend.figure_format = 'retina'

err_count = np.isnan(active_price).sum()

plt.close()
fig = plt.figure()

ax1 = fig.add_subplot(111)
(100*max_exchange.value_counts().sort_index().reindex(range(len(exchangelist))).fillna(0)/len(price)).plot.bar(ax=ax1, color='c', align='center', alpha=.5, label='% being highest')
(100*min_exchange.value_counts().sort_index().reindex(range(len(exchangelist))).fillna(0)/len(price)).plot.bar(ax=ax1, color='r', align='center', alpha=.5, label='% being lowest')
ax1.set_ylim(-5,80)
ax1.set_ylabel('%')
plt.xticks(range(len(err_count)), err_count.index.tolist(), rotation=0)
ax1.legend(loc='upper left', frameon=0)

ax2 = ax1.twinx()
err_count.plot(ax=ax2, marker='o', color='r', alpha=.5, label='total number of error days')
ax2.set_xlim(-.5,len(err_count)-.5)
ax2.set_xticks(range(len(err_count)))
ax2.set_xticklabels(err_count.index.tolist())
ax2.legend(loc='upper right', frameon=0)
ax2.set_ylim(-7,100)

# plt.grid(None)
plt.tight_layout()
plt.show()
%config InlineBackend.figure_format = 'retina'
plt.close()

fig = plt.figure()

ax = fig.add_subplot(111)
for i in range(n):
    ax.plot(active_price.ix[:,i], c=thermal(i/n), label=exchangelist[i])
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.set_xlim(start_date, end_date)
ax.set_ylim(0,1000)

plt.legend(loc='lower center', frameon=0, ncol=n)
plt.tight_layout()
# plt.title('Historical Prices')
plt.savefig('price3.pdf')
plt.show()
price_spread = active_price.max(axis=1) - active_price.min(axis=1)
price_spread.describe()
count    732.000000
mean       7.418801
std        7.705870
min        0.150000
25%        2.430000
50%        4.914920
75%        9.461703
max       78.191470
dtype: float64
%config InlineBackend.figure_format = 'retina'
plt.close()

fig = plt.figure()

ax = fig.add_subplot(111)
ax.plot(price_spread, lw=1,c='k')
ax.axhline(price_spread.mean(), alpha=0.5, ls='--', c='c', label='$\mu$')
ax.axhline(price_spread.mean()+price_spread.std()*3, alpha=0.5, ls='--', c='r', label='$\pm3\sigma$')
ax.axhline(price_spread.mean()-price_spread.std()*3, alpha=0.5, ls='--', c='r')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.set_xlim(start_date, end_date)
ax.set_ylim(-60,100)

plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=2)
# plt.title('Price Spreads')
plt.savefig('spread.pdf')
plt.show()
log_price_spread = np.log(price_spread)
log_price_spread.describe()
count    732.000000
mean       1.584788
std        0.939405
min       -1.897120
25%        0.887891
50%        1.592275
75%        2.247244
max        4.359161
dtype: float64
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1,label='')
ax.axhline(log_price_spread.mean(), alpha=0.5, ls='--', c='c', label='$\mu$')
ax.axhline(log_price_spread.mean()+log_price_spread.std()*3, alpha=0.5, ls='--', c='r',label='$\pm3\sigma$')
ax.axhline(log_price_spread.mean()-log_price_spread.std()*3, alpha=0.5, ls='--', c='r')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.set_ylim(-3,6)
ax.set_xlim(start_date, end_date)
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=2)
# plt.title('Log Price Spreads')
plt.savefig('logspread.pdf')
plt.show()
print(ADF(log_price_spread, trend='c'), '\n')
print(PhillipsPerron(log_price_spread, trend='c'), '\n')
   Augmented Dickey-Fuller Results   
=====================================
Test Statistic                 -2.346
P-value                         0.158
Lags                               17
-------------------------------------

Trend: Constant
Critical Values: -3.44 (1%), -2.87 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary. 

     Phillips-Perron Test (Z-tau)    
=====================================
Test Statistic                -21.389
P-value                         0.000
Lags                               20
-------------------------------------

Trend: Constant
Critical Values: -3.44 (1%), -2.87 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary.

The ADF test fails to reject a unit root (p = 0.158) while Phillips-Perron rejects one strongly, so we smooth the log spread with a 5-day rolling mean, z-normalize it, and test again.

def z_norm(s):
    return (s - s.mean(axis=0)) / s.std(axis=0)
log_price_spread = z_norm(log_price_spread.rolling(5).mean().fillna(method='bfill'))
log_price_spread.describe()
count    7.320000e+02
mean    -4.284976e-15
std      1.000000e+00
min     -1.902323e+00
25%     -8.414662e-01
50%     -1.734537e-02
75%      7.996859e-01
max      2.645812e+00
Name: None, dtype: float64
print(ADF(log_price_spread, trend='nc'), '\n')
print(PhillipsPerron(log_price_spread, trend='nc'), '\n')
   Augmented Dickey-Fuller Results   
=====================================
Test Statistic                 -2.229
P-value                         0.025
Lags                               20
-------------------------------------

Trend: No Trend
Critical Values: -2.57 (1%), -1.94 (5%), -1.62 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary. 

     Phillips-Perron Test (Z-tau)    
=====================================
Test Statistic                 -3.321
P-value                         0.001
Lags                               20
-------------------------------------

Trend: No Trend
Critical Values: -2.57 (1%), -1.94 (5%), -1.62 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary. 
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1,label='')
ax.axhline(log_price_spread.mean(), alpha=0.5, ls='--', c='c', label='$\mu$')
ax.axhline(log_price_spread.mean()+log_price_spread.std()*3, alpha=0.5, ls='--', c='r',label='$\pm3\sigma$')
ax.axhline(log_price_spread.mean()-log_price_spread.std()*3, alpha=0.5, ls='--', c='r')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.set_ylim(-5,5)
ax.set_yticks(range(-5,6))
ax.set_xlim(start_date, end_date)
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=2)
# plt.title('Log Price Spreads')
plt.savefig('logspreadnorm.pdf')
plt.show()

Step 4: Univariate Model - ARMA(p,q)

train = log_price_spread.loc[:mid_date]
test = log_price_spread.loc[mid_date:]
train.name = 'LogPs'
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure(figsize=(8,4))

ax1 = fig.add_subplot(211)
sm.graphics.tsa.plot_acf(train, lags=40, ax=ax1, label='ACF')
ax1.set_ylim(-.55,1.25)
ax1.set_yticks(np.arange(-.5,1.5,.5))
ax1.set_xticks(range(0,40,5))
ax1.set_title('')
handles, labels= ax1.get_legend_handles_labels()
handles = handles[:-len(handles)//3][1::2]
labels = labels[:-len(handles)//3][1::2]
ax1.legend(handles=handles, labels=labels,loc='best', frameon=0, numpoints=2)

ax2 = fig.add_subplot(212)
sm.graphics.tsa.plot_pacf(train, lags=40, ax=ax2, label='PACF')
ax2.set_ylim(-.55,1.25)
ax2.set_yticks(np.arange(-.5,1.5,.5))
ax2.set_xticks(range(0,40,5))
ax2.set_title('')
handles, labels= ax2.get_legend_handles_labels()
handles = handles[:-len(handles)//3][1::2]
labels = labels[:-len(handles)//3][1::2]
ax2.legend(handles=handles, labels=labels,loc='best', frameon=0, numpoints=2)

plt.tight_layout()
plt.savefig('uniacfpacf.pdf')
plt.show()
min_aic = min_bic = 1e5
min_aic_order = min_bic_order = (0,0)
for p in range(1,10):
    for q in range(6):
        model = sm.tsa.ARMA(train, (p, q))
        try:
            result = model.fit(trend='nc')
            # stars grade how close the Durbin-Watson statistic is to 2 (i.e. no residual autocorrelation)
            stat = abs(sm.stats.durbin_watson(result.resid)-2)
            stat_score = (stat <= .01) * 3 + (.01 < stat <= .05) * 2 + (.05 < stat <= .10) * 1
            # keep only models whose AR/MA roots lie comfortably outside the unit circle
            if np.abs(np.concatenate([result.arroots, result.maroots])).min() > 1.05:
                aic = result.aic
                bic = result.bic
                if aic < min_aic: 
                    min_aic = aic
                    min_aic_order = (p,q)
                if bic < min_bic: 
                    min_bic = bic
                    min_bic_order = (p,q)
                print('({},{}) AIC = {:>7.2f}, BIC = {:>7.2f}, |DW-2| = {:.8f} {}'.format(p,q,aic,bic,stat,stat_score*'*'))
        except Exception:
            continue
print()
print('Minimum AIC = {:.2f} at {}, minimum BIC = {:.2f} at {}'.format(min_aic, min_aic_order, min_bic, min_bic_order))
(1,1) AIC =  113.26, BIC =  126.19, |DW-2| = 0.07497044 *
(1,2) AIC =  113.04, BIC =  130.28, |DW-2| = 0.06298702 *
(2,0) AIC =  111.43, BIC =  124.36, |DW-2| = 0.03723533 **
(2,1) AIC =  112.01, BIC =  129.25, |DW-2| = 0.04461551 **
(2,2) AIC =  113.29, BIC =  134.84, |DW-2| = 0.06107177 *
(3,0) AIC =  111.72, BIC =  128.96, |DW-2| = 0.05226283 *
(3,1) AIC =  113.64, BIC =  135.19, |DW-2| = 0.05401722 *
(4,0) AIC =  113.37, BIC =  134.92, |DW-2| = 0.05788130 *
(4,1) AIC =  115.31, BIC =  141.17, |DW-2| = 0.05672657 *
(5,0) AIC =  113.33, BIC =  139.19, |DW-2| = 0.00835075 ***

Minimum AIC = 111.43 at (2, 0), minimum BIC = 124.36 at (2, 0)
model = sm.tsa.ARMA(train, (2,0))
result = model.fit(trend='nc')
print(result.summary())
                              ARMA Model Results                              
==============================================================================
Dep. Variable:                  LogPs   No. Observations:                  550
Model:                     ARMA(2, 0)   Log Likelihood                 -52.713
Method:                       css-mle   S.D. of innovations              0.266
Date:                Sun, 21 May 2017   AIC                            111.426
Time:                        01:10:53   BIC                            124.355
Sample:                    07-01-2014   HQIC                           116.478
                         - 01-01-2016                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
ar.L1.LogPs     1.1236      0.042     26.763      0.000       1.041       1.206
ar.L2.LogPs    -0.1672      0.042     -3.976      0.000      -0.250      -0.085
                                    Roots                                    
=============================================================================
                 Real           Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1            1.0559           +0.0000j            1.0559            0.0000
AR.2            5.6627           +0.0000j            5.6627            0.0000
-----------------------------------------------------------------------------

Residual diagnostics: Jarque-Bera strongly rejects normality of the residuals, while the Ljung-Box test finds no significant autocorrelation at lag 1.

sm.stats.jarque_bera(result.resid)[1]
2.6180738344018359e-23
sm.stats.acorr_ljungbox(result.resid, lags=1)[1][0]
0.81633813674691746
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure(figsize=(4,4))
ax = fig.add_subplot(111)
fig = qqplot(result.resid, line='45', ax=ax, fit=True)
lines = fig.findobj(lambda x: hasattr(x, 'get_color') and x.get_color() == 'r')
dots = fig.findobj(lambda x: hasattr(x, 'get_color') and x.get_color() == 'b')
[l.set_linewidth(2) for l in lines]
[d.set_alpha(0.3) for d in dots]
# [d.set_color('c') for d in dots]
[d.set_markersize(5) for d in dots]
ax.set_ylim(-3,3)
ax.set_yticks(range(-3,4))
ax.set_xlim(-3,3)
ax.set_xticks(range(-3,4))
plt.tight_layout()
# plt.title('p-value = {}'.format(ss.normaltest(result.resid)[1]))
plt.savefig('uniqqplot.pdf')
plt.show()
phi = result.arparams * (result.pvalues[:result.k_ar]<=0.05).values  # keep only AR coefficients significant at the 5% level

def yhat(y):  # y holds the last k_ar observations, oldest first
    ylag = y[::-1]  # reverse so that lag 1 comes first, matching phi
    return ylag.T @ phi

fit = result.fittedvalues

# one-step-ahead point forecasts over the test set; the first len(phi) are seeded from the tail of the training sample
pred = test * np.nan
for i in range(len(phi)):
    pred[i] = yhat(train[-1-i-len(phi):-1-i])
for i in range(len(phi), len(test)):
    pred[i] = yhat(test[i-len(phi):i])

fcst = result.forecast(len(test))  # returns (mean forecast, stderr, conf. interval)
forecast = pd.Series(fcst[0], index=test.index)
conf_interval = pd.DataFrame(fcst[2], index=test.index)
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1, label='true data')
ax.plot(fit, 'ro-', markevery=10, alpha=0.5, label='insample fitted data')
# ax.plot(result.fittedvalues, 'b-')
ax.plot(pred, 'co-', markevery=10, alpha=0.5, label='outsample point forecast')
ax.plot(forecast, 'c--', label='outsample forecast')
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.fill_between(conf_interval.index, conf_interval.ix[:,0], conf_interval.ix[:,1], color='k', alpha=.1, label='95% conf. interval')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
ax.set_ylim(-5,5)
ax.set_yticks(range(-5,6))
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=5)
# plt.title('First Difference Log Price Spreads')
plt.savefig('unifit1.pdf')
plt.show()
temp_date = pd.to_datetime('2015-12-01')
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()

ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1, label='true data')
ax.plot(fit, 'ro-', markevery=5, alpha=0.5, label='insample fitted data')
ax.plot(pred, 'co-', markevery=5, alpha=0.5, label='outsample point forecast')
ax.plot(forecast, 'c--', alpha=0.5, label='outsample forecast')
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.fill_between(conf_interval.index, conf_interval.ix[:,0], conf_interval.ix[:,1], color='k', alpha=.1, label='95% conf. interval')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(temp_date, end_date)
ax.set_ylim(-5,5)
ax.set_yticks(range(-5,6))
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=5)
# plt.title('First Difference Log Price Spreads')
plt.savefig('unifit2.pdf')
plt.show()

We score the forecasts with the MASE (Mean Absolute Scaled Error, https://en.wikipedia.org/wiki/Mean_absolute_scaled_error): a value below 1 means the forecast beats the naive previous-value forecast.
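In symbols, as implemented below, with the naive one-step error y_t - y_{t-1} as the scale:

MASE = mean(|y_t - yhat_t|) / mean(|y_t - y_{t-1}|)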

def mase(data, pred):
    error = data - pred
    ydiff = data.diff().dropna()
    return error.abs().mean() / ydiff.abs().mean()
mase_uni_is = mase(train, fit)
mase_uni_osp = mase(test, pred)
mase_uni_os = mase(test, forecast)
print('MASE insample is {:.3f}'.format(mase_uni_is))
print('MASE outsample point is {:.3f}'.format(mase_uni_osp))
print('MASE outsample is {:.3f}'.format(mase_uni_os))
MASE insample is 0.996
MASE outsample point is 0.991
MASE outsample is 4.704

Step 5: Multivariate Model - ARIMAX(p,d,q)

data = [
    'BCHAIN/ATRCT',     # Bitcoin Median Transaction Confirmation Time
    'BCHAIN/CPTRA',     # Bitcoin Cost Per Transaction
    'BCHAIN/DIFF',      # Bitcoin Mining Difficulty
    'BCHAIN/TVTVR',     # Bitcoin Trade Volume vs Transaction Volume Ratio
]
ind = [quandl.get(d)[start_date:end_date] for d in data]
# USD volume traded on the exchanges quoting the highest and lowest prices each day
max_volume = ((volume.apply(lambda v:v.index, axis=1).T == max_exchange.apply(lambda i:exchangelist[i])).T * volume).sum(axis=1)
min_volume = ((volume.apply(lambda v:v.index, axis=1).T == min_exchange.apply(lambda i:exchangelist[i])).T * volume).sum(axis=1)
trade_volume = max_volume + min_volume
transaction_volume = (trade_volume * ind[-1].T).T  # via the BCHAIN trade-vs-transaction volume ratio
ind[-1] = transfer_volume = transaction_volume.Value - trade_volume  # volume moved beyond exchange trading
dataset = pd.concat([price_spread, trade_volume] + ind, axis=1, join='outer')
dataset.columns = ['Ps', 'Tr', 'Ct', 'Co', 'Di', 'Tf']
print(dataset.describe())
               Ps           Tr          Ct          Co            Di             Tf
count  732.000000   732.000000  732.000000  732.000000  7.320000e+02     732.000000
mean     7.418801   727.610416    8.056266   12.379825  7.563867e+10   22286.774041
std      7.705870   243.943093    1.111682    8.532508  5.491585e+10   23071.094095
min      0.150000   356.410000    5.883333    3.442916  1.681846e+10    1057.026608
25%      2.430000   499.281875    7.333333    7.574733  4.055572e+10    7382.009772
50%      4.914920   713.810010    7.875000    8.890402  4.969239e+10   12993.766448
75%      9.461703   864.820000    8.600000   13.035012  9.605659e+10   29144.912952
max     78.191470  1532.508000   14.130000   49.667654  2.094532e+11  136972.126998
kw_list = ['bitcoin robot', 'bitcoin arbitrage', 'bitcoin spread']
pytrend.build_payload(kw_list=kw_list)
googletrends = pytrend.interest_over_time().ix[start_date:end_date, :] + 1  # +1 since trends can be 0, which is troublesome when taking logs
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
for i in range(len(kw_list)):
    ax.plot(googletrends.ix[:,i], c=cmocean.cm.algae(i/len(kw_list)), label=kw_list[i])
ax.plot(googletrends.sum(axis=1), c='c', label='all')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
plt.tight_layout()
plt.legend(loc='lower center', frameon=0, ncol=len(kw_list)+1)
plt.ylim(-10,100)
plt.yticks(range(0,101,10))
plt.savefig('multrend.pdf')
plt.show()
kw_list = np.setdiff1d(exchangelist, ['ROCK'])
pytrend.build_payload(kw_list=kw_list)
exchangetrends = pytrend.interest_over_time().ix[start_date:end_date, :] + 1
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
for i in range(len(kw_list)):
    ax.plot(exchangetrends.ix[:,i], color=cmocean.cm.algae(i/len(kw_list)), label=kw_list[i])
ax.plot(exchangetrends.sum(axis=1), c='c', label='all')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
plt.tight_layout()
plt.legend(loc='lower center', frameon=0, ncol=len(kw_list)+1)
plt.ylim(-10,100)
plt.yticks(range(0,101,10))
plt.savefig('multrend2.pdf')  # avoid overwriting the previous figure
plt.show()
dataset = pd.concat([dataset, googletrends.sum(axis=1), exchangetrends.sum(axis=1)], axis=1, join='outer').fillna(method='ffill').fillna(method='bfill')
dataset.columns = list(dataset.columns[:-2]) + ['Gt1', 'Gt2']
dataset.Gt1.interpolate(inplace=True)  # trends are sampled weekly; interpolate to daily
dataset.Gt2.interpolate(inplace=True)
log_dataset = z_norm(np.log(dataset).rolling(5).mean().fillna(method='bfill'))
log_dataset.columns = ['Log'+c for c in log_dataset.columns]
print(log_dataset.describe())
              LogPs         LogTr         LogCt         LogCo         LogDi         LogTf        LogGt1        LogGt2
count  7.320000e+02  7.320000e+02  7.320000e+02  7.320000e+02  7.320000e+02  7.320000e+02  7.320000e+02  7.320000e+02
mean  -4.284976e-15  1.526951e-14  2.660744e-14 -2.311827e-16  5.078330e-14 -1.000566e-14  4.753939e-15  9.735989e-15
std    1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00
min   -1.902323e+00 -1.600349e+00 -2.681378e+00 -1.602305e+00 -1.918163e+00 -2.449733e+00 -2.868102e+00 -1.541291e+00
25%   -8.414662e-01 -1.003732e+00 -6.189083e-01 -6.491453e-01 -5.965383e-01 -7.432149e-01 -6.731405e-01 -6.369187e-01
50%   -1.734537e-02  8.771289e-02 -2.819185e-02 -4.128926e-01 -2.796970e-01 -1.754303e-01  4.339693e-02 -2.954394e-01
75%    7.996859e-01  6.973955e-01  5.295034e-01  4.580167e-01  6.834602e-01  6.506610e-01  7.555832e-01  4.566839e-01
max    2.645812e+00  2.439251e+00  3.834172e+00  2.765270e+00  1.896075e+00  2.321454e+00  2.077746e+00  4.822614e+00
%config InlineBackend.figure_format = 'retina'
plt.close()

fig = plt.figure(figsize=(5,4))

ax = fig.add_subplot(111)
idx = log_dataset.corr().sort_values(by='LogPs', ascending=0).index  # order columns by correlation with LogPs
log_dataset = log_dataset.ix[:,idx]
corr = log_dataset.corr()
sns.heatmap(corr, square=1, ax=ax)

plt.yticks(rotation=0)
plt.xticks(rotation=90)

plt.tight_layout()
plt.savefig('mulcorr.pdf')
plt.show()
fig = plt.figure(figsize=(12,4*len(corr)))
for i in range(len(corr)-1):
    ax = fig.add_subplot(len(corr)-1,1,i+1)
    ax.plot(log_price_spread, c='r', label='LogPs', alpha=.5)
    ax.plot(log_dataset.ix[:,i+1], c='c', alpha=.5, label=log_dataset.columns[i+1])
    ax.legend(frameon=0,loc='lower center', ncol=2)
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
    ax.set_xlim(start_date, end_date)
    ax.set_ylim(-5,5)
    ax.set_yticks(range(-5,6))
plt.tight_layout()
plt.show()
endog = log_dataset.LogPs
exog = log_dataset.ix[:,np.setdiff1d(log_dataset.columns,['LogPs'])]
train_endog = endog.ix[start_date:mid_date]
train_exog = exog.ix[start_date:mid_date]
test_endog = endog.ix[mid_date:end_date]
test_exog = exog.ix[mid_date:end_date]
min_aic = min_bic = 1e5
min_aic_order = min_bic_order = (0,0,0)
for p in range(1,10):
    for q in range(3):
        for d in range(1):  # d = 0 only: the normalized series is already stationary
            model = sm.tsa.SARIMAX(train_endog, train_exog, (p, d, q))
            try:
                result = model.fit()
                result.summary()
            except Exception:
                continue
            stat = abs(sm.stats.durbin_watson(result.resid)-2)
            stat_score = (stat <= 0.01) * 3 + (0.01 < stat <= 0.05) * 2 + (0.05 < stat <= 0.10) * 1
            if np.abs(np.concatenate([result.arroots, result.maroots])).min() > 1.05:
                aic = result.aic
                bic = result.bic
                if aic < min_aic: 
                    min_aic = aic
                    min_aic_order = (p,d,q)
                if bic < min_bic: 
                    min_bic = bic
                    min_bic_order = (p,d,q)
                print('({},{},{}) AIC = {:>7.2f}, BIC = {:>7.2f}, |DW-2| = {:.8f} {}'.format(p,d,q,aic,bic,stat,stat_score*'*'))
print()
print('Minimum AIC = {:.2f} at {}, minimum BIC = {:.2f} at {}'.format(min_aic, min_aic_order, min_bic, min_bic_order))
(1,0,0) AIC =   95.23, BIC =  134.02, |DW-2| = 0.32750528 
(1,0,1) AIC =   82.47, BIC =  125.56, |DW-2| = 0.03040248 **
(1,0,2) AIC =   81.51, BIC =  128.92, |DW-2| = 0.01859667 **
(2,0,0) AIC =   79.67, BIC =  122.77, |DW-2| = 0.02601878 **
(2,0,1) AIC =   77.37, BIC =  124.78, |DW-2| = 0.02101876 **
(2,0,2) AIC =   77.46, BIC =  129.18, |DW-2| = 0.02477762 **
(3,0,0) AIC =   77.69, BIC =  125.10, |DW-2| = 0.00997169 ***
(3,0,1) AIC =   75.14, BIC =  126.85, |DW-2| = 0.00109986 ***
(4,0,0) AIC =   75.70, BIC =  127.42, |DW-2| = 0.00565012 ***
(4,0,1) AIC =   77.69, BIC =  133.72, |DW-2| = 0.00489150 ***
(5,0,0) AIC =   77.64, BIC =  133.67, |DW-2| = 0.00382267 ***
(5,0,1) AIC =   56.67, BIC =  117.01, |DW-2| = 0.11066651 
(6,0,0) AIC =  -11.64, BIC =   48.69, |DW-2| = 0.10243475 
(6,0,1) AIC =  -16.11, BIC =   48.54, |DW-2| = 0.02155382 **
(6,0,2) AIC =  -16.40, BIC =   52.56, |DW-2| = 0.00558554 ***
(7,0,0) AIC =  -18.40, BIC =   46.25, |DW-2| = 0.00766101 ***
(7,0,1) AIC =  -16.75, BIC =   52.20, |DW-2| = 0.00203650 ***
(7,0,2) AIC =  -15.06, BIC =   58.20, |DW-2| = 0.00505927 ***
(8,0,0) AIC =  -17.85, BIC =   51.11, |DW-2| = 0.00599037 ***
(8,0,1) AIC =  -21.57, BIC =   51.70, |DW-2| = 0.01490724 **
(8,0,2) AIC =  -18.89, BIC =   58.69, |DW-2| = 0.02765866 **
(9,0,0) AIC =  -15.87, BIC =   57.40, |DW-2| = 0.00530673 ***
(9,0,1) AIC =  -20.02, BIC =   57.56, |DW-2| = 0.00183866 ***
(9,0,2) AIC =  -17.87, BIC =   64.01, |DW-2| = 0.01105891 **

Minimum AIC = -21.57 at (8, 0, 1), minimum BIC = 46.25 at (7, 0, 0)
model = sm.tsa.SARIMAX(train_endog, train_exog, (8,0,1), freq='D')
result = model.fit()
print(result.summary())
                           Statespace Model Results                           
==============================================================================
Dep. Variable:                  LogPs   No. Observations:                  550
Model:               SARIMAX(8, 0, 1)   Log Likelihood                  27.783
Date:                Sun, 21 May 2017   AIC                            -21.566
Time:                        01:13:14   BIC                             51.703
Sample:                    07-01-2014   HQIC                             7.066
                         - 01-01-2016                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
LogCo          0.3659      0.181      2.024      0.043       0.012       0.720
LogCt          0.0089      0.039      0.224      0.823      -0.069       0.086
LogDi         -0.0414      0.266     -0.156      0.876      -0.563       0.480
LogGt1         0.0069      0.061      0.113      0.910      -0.113       0.127
LogGt2        -0.0132      0.084     -0.157      0.875      -0.178       0.152
LogTf         -0.3987      0.087     -4.557      0.000      -0.570      -0.227
LogTr          0.3262      0.180      1.815      0.069      -0.026       0.678
ar.L1          0.3475      0.168      2.063      0.039       0.017       0.678
ar.L2          0.7677      0.183      4.200      0.000       0.409       1.126
ar.L3         -0.1169      0.054     -2.171      0.030      -0.222      -0.011
ar.L4         -0.0721      0.051     -1.402      0.161      -0.173       0.029
ar.L5         -0.4598      0.047     -9.752      0.000      -0.552      -0.367
ar.L6          0.2162      0.088      2.468      0.014       0.045       0.388
ar.L7          0.3528      0.077      4.600      0.000       0.202       0.503
ar.L8         -0.1733      0.044     -3.953      0.000      -0.259      -0.087
ma.L1          0.7612      0.174      4.369      0.000       0.420       1.103
sigma2         0.0526      0.003     18.715      0.000       0.047       0.058
===================================================================================
Ljung-Box (Q):                      138.21   Jarque-Bera (JB):                28.29
Prob(Q):                              0.00   Prob(JB):                         0.00
Heteroskedasticity (H):               1.54   Skew:                            -0.30
Prob(H) (two-sided):                  0.00   Kurtosis:                         3.93
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
sm.stats.durbin_watson(result.resid)
1.9750927630586519
sm.stats.jarque_bera(result.resid)[1]
7.8859408081166919e-07
sm.stats.acorr_ljungbox(result.resid, lags=1)[1][0]
0.87455062641970893
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure(figsize=(4,4))
ax = fig.add_subplot(111)
fig = qqplot(result.resid, line='45', ax=ax, fit=True)
lines = fig.findobj(lambda x: hasattr(x, 'get_color') and x.get_color() == 'r')
dots = fig.findobj(lambda x: hasattr(x, 'get_color') and x.get_color() == 'b')
[l.set_linewidth(2) for l in lines]
[d.set_alpha(0.3) for d in dots]
[d.set_markersize(5) for d in dots]
ax.set_ylim(-3,3)
ax.set_yticks(range(-3,4))
ax.set_xlim(-3,3)
ax.set_xticks(range(-3,4))
plt.tight_layout()
# plt.title('p-value = {}'.format(ss.normaltest(result.resid)[1]))
plt.savefig('mulqqplot.pdf')
plt.show()
forecast = result.forecast(steps=len(test_endog), exog=test_exog)  # dynamic multi-step forecast
fit = result.fittedvalues

# re-filter the full sample at the trained parameters to get one-step-ahead predictions over the test period
model = sm.tsa.statespace.SARIMAX(endog, exog, model.order)
result = model.filter(result.params)
predict = result.get_prediction()
pred = predict.predicted_mean[mid_date:]
conf_interval = [forecast+predict.se_mean[len(train_endog):]*1.96, forecast-predict.se_mean[len(train_endog):]*1.96]  # approximate 95% band
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1, label='true data')
ax.plot(fit, 'ro-', markevery=10, alpha=0.5, label='insample fitted data')
ax.plot(pred, 'co-', markevery=10, alpha=0.5, label='outsample point forecast')
ax.plot(forecast, 'c--', label='outsample forecast')
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.fill_between(test_endog.index, conf_interval[0], conf_interval[1], color='k', alpha=.1, label='95% conf. interval')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(start_date, end_date)
ax.set_ylim(-5,5)
ax.set_yticks(range(-5,6))
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=5)
# plt.title('First Difference Log Price Spreads')
plt.savefig('mulfit1.pdf')
plt.show()
%config InlineBackend.figure_format = 'retina'
plt.close()
fig = plt.figure()

ax = fig.add_subplot(111)
ax.plot(log_price_spread, c='k',lw=1, label='true data')
ax.plot(fit, 'ro-', markevery=5, alpha=0.5, label='insample fitted data')
ax.plot(pred, 'co-', markevery=5, alpha=0.5, label='outsample point forecast')
ax.plot(forecast, 'c--', alpha=0.5, label='outsample forecast')
ax.axvspan(mid_date, end_date, color='c', alpha=0.05)
ax.fill_between(test_endog.index, conf_interval[0], conf_interval[1], color='k', alpha=.1, label='95% conf. interval')
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y\n %b'))
ax.set_xlim(temp_date, end_date)
ax.set_ylim(-5,5)
ax.set_yticks(range(-5,6))
plt.tight_layout()
plt.legend(loc='lower center', framealpha=0, ncol=5)
# plt.title('First Difference Log Price Spreads')
plt.savefig('mulfit2.pdf')

plt.show()
mase_mul_is = mase(train_endog, fit)
mase_mul_osp = mase(test_endog, pred)
mase_mul_os = mase(test_endog, forecast)
print('MASE insample is {:.3f}'.format(mase_mul_is))
print('MASE outsample point is {:.3f}'.format(mase_mul_osp))
print('MASE outsample is {:.3f}'.format(mase_mul_os))
MASE insample is 0.858
MASE outsample point is 0.857
MASE outsample is 2.688
uni = np.array([mase_uni_is, mase_uni_osp, mase_uni_os])
mul = np.array([mase_mul_is, mase_mul_osp, mase_mul_os])
print(uni)
print(mul)
[ 0.99605648  0.99134227  4.70378576]
[ 0.8576784   0.85705027  2.68798458]
plt.figure(figsize=(8,4))
plt.plot((0,0),(0,5),'#eeeeee',ls='--')
plt.plot((1,1),(0,5),'#eeeeee',ls='--')
plt.plot((2,2),(0,5),'#eeeeee',ls='--')
plt.plot(uni, 'ro-', label='univariate model')
plt.plot(mul, 'ko-', label='multivariate model')
plt.legend(loc='lower center', frameon=0, ncol=2)
plt.xticks(range(3),['insample', 'outsample point', 'outsample'])
# plt.title('MASE Scores')
plt.yticks(np.arange(0,6))
plt.ylim(0,5)
plt.tight_layout()
plt.savefig('mase.pdf')
plt.show()

Step 6: Conclusion

So our exogenous indicators do work: the multivariate model lowers the MASE both in-sample and out-of-sample relative to the univariate ARMA model. We'll see what else we can do with the multivariate model next time.