I read Ernest Chan's "Machine Trading", and in his chapter on Factor Analysis, he introduced the idea of using Principal Component Analysis (PCA) to get the statistical factors and then regressing them against next day's returns to get buy/sell signals.
I translated his MatLab code into Python as best as I could, but the backtesting results so far have been dismal, compared to the results in his book.
I was wondering whether anyone has tried something similar, or whether there is something clearly wrong with my code?
Appreciate any comments or help from you guys.
Thanks!
Yi Peng
Clone Algorithm
10
Backtest from
to
with
initial capital
Cumulative performance:
Algorithm
Benchmark
Custom data:
Total Returns
--
Alpha
--
Beta
--
Sharpe
--
Sortino
--
Max Drawdown
--
Benchmark Returns
--
Volatility
--
Returns | 1 Month | 3 Month | 6 Month | 12 Month |
Alpha | 1 Month | 3 Month | 6 Month | 12 Month |
Beta | 1 Month | 3 Month | 6 Month | 12 Month |
Sharpe | 1 Month | 3 Month | 6 Month | 12 Month |
Sortino | 1 Month | 3 Month | 6 Month | 12 Month |
Volatility | 1 Month | 3 Month | 6 Month | 12 Month |
Max Drawdown | 1 Month | 3 Month | 6 Month | 12 Month |
"""PCA v6 documentation:

Using PCA to find the statistical factors that drive returns. Out of the
statistical factors, assuming 45% to 55% of the factors are systematic
factors and the rest are specific, regress against next day's returns,
and use it to predict next day's winners.
"""
import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import Q1500US
import numpy as np
import math
import statsmodels.api as smapi
import statsmodels as sm
from sklearn import linear_model
import pandas as pd
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.preprocessing import StandardScaler


#######################################################
def initialize(context):
    """Called once at the start of the algorithm.

    Sets the model parameters, schedules the daily trade, and attaches
    the stock-selection pipeline.
    """
    context.lookback = 60         # days of price history fed to PCA
    context.n_components = 15     # starting number of principal components
    context.longnum = 20          # number of long candidates
    context.shortnum = 40         # number of short candidates (spread wider)
    context.highvarthres = 0.60   # upper bound on total explained variance
    context.lowvarthres = 0.40    # lower bound on total explained variance
    print("lookback days: %s, PCA: %s" % (context.lookback, context.n_components))
    # Longs the top context.longnum stocks and shorts the bottom
    # context.shortnum stocks, one minute after the open each day.
    algo.schedule_function(
        trade,
        algo.date_rules.every_day(),
        algo.time_rules.market_open(minutes=1),
    )
    # Create our dynamic stock selector.
    algo.attach_pipeline(make_pipeline(), 'pipeline')


def make_pipeline():
    """A function to create our dynamic stock selector (pipeline).

    Documentation on pipeline can be found here:
    https://www.quantopian.com/help#pipeline-title
    """
    # Base universe set to the Q1500US.
    base_universe = Q1500US()
    # Factor of today's open price.
    day_open = USEquityPricing.open.latest
    pipe = Pipeline(
        screen=base_universe,
        columns={
            'open': day_open,
        }
    )
    return pipe


def before_trading_start(context, data):
    """Called every day before market open."""
    # Calls the pipeline and drops rows with NaN.
    context.output = algo.pipeline_output('pipeline').dropna()
    # These are the securities that we are interested in trading each day.
    context.security_list = context.output.index


def handle_data(context, data):
    """Called every minute."""
    # record(leverage=context.account.leverage,
    #        exposure=context.account.net_leverage)


def trade(context, data):
    """Execute orders according to our schedule_function() timing.

    Uses PCA to find the statistical factors, regresses each stock's
    next-day return on the factor scores with a Lasso, and orders the
    predicted winners/losers.
    """
    # data.history includes data for today as well.
    price_history = data.history(context.security_list, fields="open",
                                 bar_count=context.lookback, frequency="1d")

    # pct_change leaves a NaN first row; drop it, then drop any stock
    # (column) that still has missing data.
    # BUGFIX: the original loop `for idx in returns.count():` iterated over
    # the per-column non-NaN *counts* and used those values as column
    # indices, filtering rows against the wrong columns.
    returns = price_history.pct_change().iloc[1:].dropna(axis=1, how='any')

    # Standardize each stock's return series before PCA so no single
    # stock's variance dominates the components.
    scaled = StandardScaler().fit_transform(returns)
    returns = pd.DataFrame(data=scaled, columns=returns.columns,
                           index=returns.index)

    pca = PCA(n_components=context.n_components, whiten=True)
    pca.fit(returns)
    var = pca.explained_variance_ratio_

    # Shrink the component count until the explained variance is at most
    # the upper threshold.  BUGFIX: bounded below at 1 component so
    # PCA(n_components=0) can never be requested.
    highcount = 1
    while sum(var) > context.highvarthres and context.n_components - highcount >= 1:
        new_components = context.n_components - highcount
        pca = PCA(n_components=new_components, whiten=True)
        pca.fit(returns)
        var = pca.explained_variance_ratio_
        highcount += 1

    # Grow the component count until the explained variance reaches the
    # lower threshold.  BUGFIX: bounded above by the number of stocks,
    # which is the maximum number of components PCA can extract here.
    lowcount = 1
    while sum(var) < context.lowvarthres and context.n_components + lowcount <= returns.shape[1]:
        new_components = context.n_components + lowcount
        pca = PCA(n_components=new_components, whiten=True)
        pca.fit(returns)
        var = pca.explained_variance_ratio_
        lowcount += 1

    # Factor scores: one row per day, one column per component.
    factors = pd.DataFrame(pca.transform(returns))
    X = factors.iloc[0:-1, :]                    # factors for days 0 .. T-1
    lastday = factors.iloc[-1, :].to_frame().T   # today's factors -> predict tomorrow

    pred_ret = pd.Series(index=returns.columns)
    print("variance is: %s" % sum(var))

    # Regress each stock's next-day return (days 1 .. T) on the previous
    # day's factor scores, then predict tomorrow's return from today's.
    for stock in returns.columns:
        Y = returns.iloc[1:, :][stock]
        LR = linear_model.Lasso(alpha=0.1)
        LR.fit(X, Y)
        # BUGFIX: predict() returns a length-1 array; store the scalar.
        pred_ret.loc[stock] = LR.predict(lastday)[0]

    # Hoist the candidate sets and the variance-band test out of the order
    # loop (the original recomputed nlargest/nsmallest for every stock).
    longs = pred_ret.nlargest(context.longnum).index
    shorts = pred_ret.nsmallest(context.shortnum).index
    var_out_of_band = (sum(var) > context.highvarthres or
                       sum(var) < context.lowvarthres)

    for stock in context.security_list:
        if var_out_of_band or (stock not in longs and stock not in shorts):
            # Not selected today, or the factor model's explained variance
            # is out of band: hold nothing.
            order_target_percent(stock, 0)
        elif stock in longs and pred_ret[stock] > 0:
            order_target_percent(stock, 0.025)
        elif stock in shorts and pred_ret[stock] < 0:
            # Spreading the short positions over twice the number of
            # stocks, as drops are sharper than rises.
            order_target_percent(stock, -0.0125)
        else:
            # BUGFIX: a selected stock whose prediction has the wrong sign
            # previously fell through and silently kept its stale position;
            # flatten it instead.
            order_target_percent(stock, 0)