Using PCA for Statistical Factors Regression

I read Ernest Chan's "Machine Trading", and in his chapter on Factor Analysis he introduces the idea of using Principal Component Analysis (PCA) to extract statistical factors and then regressing them against the next day's returns to generate buy/sell signals.
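For context, the core of the approach: extract principal-component factor scores from a panel of returns, regress each stock's next-day return on the prior day's scores, and rank the predictions. Here is a minimal sketch of that idea (my own paraphrase, not Chan's code), assuming a dates-by-stocks DataFrame of daily returns with no NaNs:

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def pca_factor_predictions(returns, n_components=5):
    # Factor scores: each row is one day's exposure to the principal
    # components of the cross-section of returns (T x n_components).
    scores = PCA(n_components=n_components).fit_transform(returns.values)

    preds = {}
    for stock in returns.columns:
        # Regress the stock's next-day return on today's factor scores...
        model = LinearRegression().fit(scores[:-1], returns[stock].values[1:])
        # ...then forecast tomorrow's return from the latest scores.
        preds[stock] = model.predict(scores[-1:])[0]

    # Rank these predictions to pick longs (top) and shorts (bottom).
    return pd.Series(preds)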

I translated his MATLAB code into Python as best I could, but the backtest results so far have been dismal compared to those in his book.

I was wondering: has anyone tried something similar? Or is there something clearly wrong with my code?

Appreciate any comments or help from you guys.

Thanks!
Yi Peng

"""
PCA v6 documentation:

Using PCA to find the statistical factors that drive returns. Assuming the systematic factors explain roughly 40% to 60% of the return variance (the highvarthres/lowvarthres band below) and the rest is stock-specific, regress the factors against the next day's returns and use the fit to predict the next day's winners.

"""
import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import Q1500US

import numpy as np
import math
import statsmodels.api as smapi
import statsmodels as sm
from sklearn import linear_model
import pandas as pd
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.preprocessing import StandardScaler

#######################################################

def initialize(context):
    """
    Called once at the start of the algorithm.
    """
    context.lookback = 60
    context.n_components = 15
    context.longnum = 20
    context.shortnum = 40
    context.highvarthres = 0.60
    context.lowvarthres = 0.40
    
    print ("lookback days: %s, PCA: %s" %(context.lookback, context.n_components))

    # Long the top context.longnum stocks and short the bottom context.shortnum stocks.
    algo.schedule_function(
        trade,
        algo.date_rules.every_day(),
        algo.time_rules.market_open(minutes=1),
    )

    # Create our dynamic stock selector.
    algo.attach_pipeline(make_pipeline(), 'pipeline')
    
def make_pipeline():
    """
    A function to create our dynamic stock selector (pipeline). Documentation
    on pipeline can be found here:
    https://www.quantopian.com/help#pipeline-title
    """
    # Base universe set to the Q1500US
    base_universe = Q1500US()

    # Factor of today's open price.
    day_open = USEquityPricing.open.latest

    pipe = Pipeline(
        screen=base_universe,
        columns={
            'open': day_open,
        }
    )
    
    return pipe

def before_trading_start(context, data):
    """
    Called every day before market open.
    """
    #calls pipe and drops NaN
    context.output = algo.pipeline_output('pipeline').dropna()

    # These are the securities that we are interested in trading each day.
    context.security_list = context.output.index

def handle_data(context, data):
    """
    Called every minute.
    """
    #record(leverage=context.account.leverage,
           #exposure=context.account.net_leverage)

def trade(context, data):
    """
    Execute orders according to our schedule_function() timing.

    Uses PCA to find the statistical factors, and fit it to next day's returns.
    """
    
    #data.history includes data for today as well.
    price_history = data.history(context.security_list, fields="open", bar_count=context.lookback, frequency="1d")
    #print price_history.index
    
    # Clearing all the NaNs in returns: drop the first row (all NaN from
    # pct_change), drop stocks with incomplete histories, then fill any
    # stragglers as a safety net.
    returns = price_history.pct_change().iloc[1:]
    returns = returns.dropna(axis=1)
    returns = returns.bfill().ffill()
    # Standardize each stock's return series before PCA.
    returns_np = StandardScaler().fit_transform(returns)
    returns = pd.DataFrame(data=returns_np, columns=returns.columns,
                           index=returns.index)
    
    pca = PCA(n_components=context.n_components, whiten=True)
    pca.fit(returns)
    var = pca.explained_variance_ratio_
    
    # Adjust the number of components until the explained variance falls
    # inside the [lowvarthres, highvarthres] band.
    highcount = 1
    while sum(var) > context.highvarthres:
        new_components = context.n_components - highcount
        if new_components < 1:
            break  # guard: can't go below one component
        pca = PCA(n_components=new_components, whiten=True)
        pca.fit(returns)
        var = pca.explained_variance_ratio_
        highcount += 1
    
    lowcount = 1
    while sum(var) < context.lowvarthres:
        new_components = context.n_components + lowcount
        if new_components >= min(returns.shape):
            break  # guard: PCA needs n_components <= min(n_samples, n_stocks)
        pca = PCA(n_components=new_components, whiten=True)
        pca.fit(returns)
        var = pca.explained_variance_ratio_
        lowcount += 1
    
    #print new_components
    
    pca_returns = pca.transform(returns)
    factors = pd.DataFrame(pca_returns)
    #print factors.head()
    
    X = factors.iloc[0:-1,:]
    #print ('shape of X is', X.shape)
    
    lastday = factors.iloc[-1,:] 
    lastday = lastday.to_frame().T
    #lastday.head()
    #print ("lastday shape:", lastday.shape)
    pred_ret = pd.Series(index=returns.columns)
    print ("variance is: %s" %sum(var))
    
    # Regress each stock's next-day return on the prior day's factor scores.
    next_day = returns.iloc[1:, :]
    for stock in returns.columns:
        Y = next_day[stock]
        LR = linear_model.Lasso(alpha=0.1)
        LR.fit(X, Y)
        pred = LR.predict(lastday)
        pred_ret.loc[stock] = pred[0]  # predict() returns a length-1 array
    
    for stock in context.security_list:
        if stock not in pred_ret.nlargest(context.longnum).index and stock not in pred_ret.nsmallest(context.shortnum).index:
            order_target_percent(stock, 0)
        elif sum(var) > context.highvarthres or sum(var) < context.lowvarthres:
            order_target_percent(stock, 0)
        elif stock in pred_ret.nlargest(context.longnum).index and pred_ret[stock] > 0:
            order_target_percent(stock, 0.025)
        # Spread the short book over twice as many stocks, since drops are sharper than rallies.
        elif stock in pred_ret.nsmallest(context.shortnum).index  and pred_ret[stock] < 0:
            order_target_percent(stock, -0.0125)

I would suggest initially running your backtests with commissions and slippage set to 0, just to see whether the strategy has any alpha. With the new slippage model I have seen good algorithms (they used to be good, at least) perform poorly, so watch out for that.

Here's the algo (Backtest ID: 5ad9230ba7eb4e43d5833bac) with:

    set_commission(commission.PerShare(cost=0, min_trade_cost=0))  
    set_slippage(slippage.FixedSlippage(spread=0))  
(The attached algorithm is identical to the original above, except for those two zero-cost lines added to initialize().)

Would you be willing to share the new code? What did you fix?

This is my implementation of Ernest Chan's statistical factor loadings algo. In addition, I have added a couple of ways to trade the OLS results; they are commented out in the code. I have also implemented a reduction of features from 10 to 5 using scikit-learn's RFE, which seems to work pretty well.

Now, if someone wants to contribute, please try to fix the high turnover rate (one idea is sketched just below).
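One standard way to damp turnover (a sketch only, not something the algorithm below does) is to blend each day's target weights with the previous day's before ordering:

import pandas as pd

def smooth_targets(new_targets, old_targets, decay=0.8):
    """Blend yesterday's weights with today's signal to damp turnover.

    new_targets, old_targets: pd.Series of target weights indexed by asset.
    """
    # Assets present on only one side are treated as weight 0 on the other.
    return old_targets.mul(decay).add(new_targets.mul(1.0 - decay), fill_value=0)

With decay=0.8, only 20% of the gap to the new signal is traded each day; a higher decay cuts turnover further at the cost of a staler portfolio.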

import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import Q500US, QTradableStocksUS
from sklearn.feature_selection import RFE

import quantopian.optimize as opt

import numpy as np
import math
import statsmodels.api as smapi
import statsmodels as sm
from sklearn import linear_model
import pandas as pd
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.preprocessing import StandardScaler

#######################################################

LOOKBACK = 256
N_COMPONENTS = 10


def initialize(context):
    set_slippage(slippage.FixedBasisPointsSlippage(basis_points=0.0, volume_limit=1.0))
    set_commission(commission.PerShare(cost=0.00, min_trade_cost=0.00))
    
    
    print ("lookback days: %s, PCA: %s" %(LOOKBACK, N_COMPONENTS))

    # Rebalance daily, one minute after the open.
    algo.schedule_function(
        trade,
        algo.date_rules.every_day(),
        algo.time_rules.market_open(hours=0, minutes=1),
    )

    # Create our dynamic stock selector.
    algo.attach_pipeline(make_pipeline(), 'pipeline')
    
def make_pipeline():
    base_universe = QTradableStocksUS()


    pipe = Pipeline(
        screen=base_universe,
        columns={
            'open': USEquityPricing.open.latest,
        }
    )
   
    return pipe


def before_trading_start(context, data):
    context.output = algo.pipeline_output('pipeline')
    context.security_list = context.output.index

    
def trade(context, data):
    prices = data.history(context.security_list, fields='price', bar_count=LOOKBACK, frequency="1d")

    rets = np.log(prices).diff()[1:]
    rets.dropna(inplace=True, axis=1) # remove stocks with incomplete histories.
        
    stocks = rets.columns
        
    rets = StandardScaler().fit_transform(rets)
    
    pca = PCA(n_components=N_COMPONENTS, whiten=True)
    pca_rets = pca.fit_transform(rets)
    
    X_train = pca_rets[:-1,:]
    X_test = pca_rets[-1:,:]
    
    df = pd.DataFrame(rets, columns=stocks)
    df = df[1:]  # this was df[:-1] but from what I understand, it should be df[1:]
        
    predictions = []
        
    for stock in stocks:
        y = df[stock]
        m = linear_model.LinearRegression()
        # m.fit(X_train, y)        
        # pred = m.predict(X_test)[0]
        # score = m.score(X_train, y)
        
        rfe = RFE(estimator=m, step=1, n_features_to_select=5)
        rfe.fit(X_train, y)
        pred = rfe.predict(X_test)[0]
        score = rfe.score(X_train, y)

        predictions.append({'stock':stock, 'pred': pred, 'score': score})

        
    df = pd.DataFrame(predictions)
    df.index = df['stock']
    
    # rank by prediction 
    # df.sort_values('pred', ascending=False, inplace=True)
    # longs = df['stock'][:50].tolist()
    # shorts = df['stock'][-50:].tolist()

    
    # rank by fit score.
    # df.sort_values('score', ascending=False, inplace=True)
    # df = df[:100] # best scores
    # longs = df[df['pred'] > 0]['stock']
    # shorts = df[df['pred'] < 0]['stock']
    
    
    # another technique: take the 200 strongest predictions on each side,
    # then keep the 50 best-fitting (highest R^2) names from each.
    df.sort_values('pred', ascending=False, inplace=True)
    longs = df[:200].copy()
    shorts = df[-200:].copy()
    
    longs.sort_values('score', ascending=False, inplace=True)
    longs = longs['stock'][:50].tolist()  # best scores
    
    shorts.sort_values('score', ascending=False, inplace=True)
    shorts = shorts['stock'][:50].tolist()  # best scores
    
    # record(dollard_exposure=float(len(longs)-len(shorts))/(len(longs)+len(shorts)))
    
    # alphas = df['pred']
    
    # objective = opt.MaximizeAlpha(alphas)
    # # #Set Constraints
    # constrain_gross_leverage = opt.MaxGrossExposure(1.0)
    # market_neutral = opt.DollarNeutral(tolerance=0.50)
    # constrain_pos_size = opt.PositionConcentration.with_equal_bounds(
    #     -0.02,
    #      0.02,
    # )
    
    # # #Place Orders based on our objective and constraints
    # try:        
    #     algo.order_optimal_portfolio(
    #         objective=objective,
    #         constraints=[
    #             constrain_gross_leverage,
    #             constrain_pos_size,
    #             market_neutral,
    #         ],
    #     )
        
    # except:
    #     pass
    
    
    for p in context.portfolio.positions:
        if context.portfolio.positions[p].amount > 0 and p in shorts:
            order_target_percent(p, -0.01)            
        elif context.portfolio.positions[p].amount < 0 and p in longs:
            order_target_percent(p, 0.01)
        elif context.portfolio.positions[p].amount > 0 and p in longs:
            pass
        elif context.portfolio.positions[p].amount < 0 and p in shorts:
            pass
        elif context.portfolio.positions[p].amount != 0:
            order_target_percent(p, 0.00)

            
    for l in longs:
        if context.portfolio.positions[l].amount == 0:
            order_target_percent(l, 0.01)
    
    for s in shorts:
        if context.portfolio.positions[s].amount == 0:
            order_target_percent(s, -0.01)

Here is pretty much the same as the above, but without RFE feature selection. From the resulting OLS fits, we pick the stocks with the best OLS scores and split them into top/bottom to create the long-short portfolio.

(The attached code is identical to the previous algorithm, except that N_COMPONENTS = 5 and the RFE block is replaced by a plain LinearRegression fit/predict/score.)

This is not meant to address turnover, although it might help there; I didn't check.
Just offering some extras/options; an occasional use of weighting by score can be informative.

import numpy  as np
import pandas as pd
import quantopian.algorithm as algo
import quantopian.optimize  as opt
from quantopian.pipeline              import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters      import Q500US, QTradableStocksUS
from sklearn                   import linear_model
from sklearn.decomposition     import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing     import StandardScaler

#######################################################

LOOKBACK     = 256
N_COMPONENTS = 10

def initialize(context):
    set_slippage(slippage.FixedBasisPointsSlippage(basis_points=0.0, volume_limit=1.0))
    set_commission(commission.PerShare(cost=0.00, min_trade_cost=0.00))

    print ('lookback days: %s, PCA: %s' %(LOOKBACK, N_COMPONENTS))

    schedule_function(metrics, date_rules.every_day(), time_rules.market_open())
    schedule_function(closes,  date_rules.every_day(), time_rules.market_open())
    schedule_function(trade,   date_rules.every_day(), time_rules.market_open(hours=0, minutes=5))

    context.mxlv = 0
    for i in range(1, 391):
        schedule_function(mxlv, date_rules.every_day(), time_rules.market_open(minutes=i))

    algo.attach_pipeline(make_pipeline(), 'pipeline')

def make_pipeline():
    return Pipeline(
        screen  = QTradableStocksUS(),
        columns = {
            'open': USEquityPricing.open.latest,
        }
    )

def before_trading_start(context, data):
    context.out = algo.pipeline_output('pipeline')

    record(pos  = len(context.portfolio.positions))
    record(cash = context.portfolio.cash)
    record(lv   = context.account.leverage)
    record(MxLv = context.mxlv)  # every minute every day, maybe more strict than the contest (merely EOD?)

def closes(context, data):
    for s in context.portfolio.positions:
        if s not in context.alpha:
            order_target(s, 0)

def trade(context, data):
    do_opt = 1

    if do_opt:
        try:
            algo.order_optimal_portfolio(
                #objective   = opt.MaximizeAlpha(context.alpha),
                objective   = opt.TargetWeights(context.alpha),
                constraints = [
                    opt.MaxGrossExposure(1.0),
                    opt.DollarNeutral(tolerance=0.50),
                    #opt.PositionConcentration.with_equal_bounds(-0.02, 0.02),
                ],
            )
        except Exception as e:
            print(e)
    else:
        for s in context.alpha.index:
            order_target_percent(s, context.alpha[s])

def metrics(context, data):
    prices = data.history(context.out.index, 'price', LOOKBACK, '1d').ffill().bfill()

    rets = np.log(prices).diff()[1:]
    #rets.dropna(inplace=True, axis=1) # remove stocks with incomplete histories.

    stocks = rets.columns

    rets = StandardScaler().fit_transform(rets)

    pca = PCA(n_components=N_COMPONENTS, whiten=True)
    pca_rets = pca.fit_transform(rets)

    X_train = pca_rets[:-1,:]
    X_test  = pca_rets[-1:,:]

    df = pd.DataFrame(rets, columns=stocks)
    df = df[1:]  # this was df[:-1] but from what I understand, it should be df[1:]

    predictions = []

    for stock in stocks:
        y = df[stock]
        m = linear_model.LinearRegression()
        m.fit(X_train, y)
        pred  = m.predict(X_test)[0]
        score = m.score(X_train, y)

        #rfe = RFE(estimator=m, step=1, n_features_to_select=5)
        #rfe.fit(X_train, y)
        #pred  = rfe.predict(X_test)[0]
        #score = rfe.score(X_train, y)

        predictions.append({'stock':stock, 'pred': pred, 'score': score})

    df = pd.DataFrame(predictions)
    df.index = df['stock']
    del df['stock']

    try: context.log_data_done
    except:
        log_data(context, data, df, 4)        # show df info once

    df.sort_values('score', ascending=False, inplace=True)
    alpha = df['score'][:50].add(df['score'][-50:], fill_value=0)
    context.alpha = norm(context, alpha)
    context.alpha *= .5

    return

    # rank by prediction
    # df.sort_values('pred', ascending=False, inplace=True)
    # longs = df['stock'][:50] .tolist()
    # shrts = df['stock'][-50:].tolist()

    # rank by fit score.
    # df.sort_values('score', ascending=False, inplace=True)
    # df = df[:100] # best scores
    #longs = df[df['pred'] > 0]
    #shrts = df[df['pred'] < 0]

    # another technique
    df.sort_values('pred', ascending=False, inplace=True)
    longs = df[:200]
    shrts = df[-200:]

    longs.sort_values('score', ascending=False, inplace=True)
    context.longs = longs[:50].index.tolist() # best scores

    shrts.sort_values('score', ascending=True, inplace=True)
    context.shrts = shrts[:50].index.tolist()

    # record(dollard_exposure=float(len(longs)-len(shrts))/(len(longs)+len(shrts)))

    # alphas = df['pred']

    # objective = opt.MaximizeAlpha(alphas)
    # # #Set Constraints
    # constrain_gross_leverage = opt.MaxGrossExposure(1.0)
    # market_neutral = opt.DollarNeutral(tolerance=0.50)
    # constrain_pos_size = opt.PositionConcentration.with_equal_bounds(
    #     -0.02,
    #      0.02,
    # )

    # # #Place Orders based on the objective and constraints
    # try:
    #     algo.order_optimal_portfolio(
    #         objective=objective,
    #         constraints=[
    #             constrain_gross_leverage,
    #             constrain_pos_size,
    #             market_neutral,
    #         ],
    #     )

    # except:
    #     pass

def norm(c, d):    # d data, it's a series, normalize it pos, neg separately
    # https://www.quantopian.com/posts/normalizing-positive-and-negative-values-separately
    # Normalizing positive and negative values separately, recombining for input to optimize.
    # Debated whether to include this part. If all pos or neg, shift for pos & neg.
    if not len(d): return d   # In case None
    d = d[ d == d ]           # ensure no NaNs
    if d.min() >= 0 or d.max() <= 0:
        d -= d.mean()
    pos  = d[ d > 0 ]
    neg  = d[ d < 0 ]

    # same number of stocks for positive & negative
    num  = min(len(pos), len(neg))
    #num  = max(len(pos), len(neg))   # making this block useless, a lazy way to.
    pos  = pos.sort_values(ascending=False).head(num)
    neg  = neg.sort_values(ascending=False).tail(num)

    pos /=   pos.sum()
    neg  = -(neg / neg.sum())
    return pos.append(neg)

def log_data(context, data, z, num, fields=None):
    ''' Log info about pipeline output or, z can be any DataFrame or Series
    https://www.quantopian.com/posts/overview-of-pipeline-content-easy-to-add-to-your-backtest
    '''
    try: context.log_data_done
    except:
        # {:,} is magic for adding commas
        log.info('starting_cash ${:,}   portfolio ${:,}     {} positions ...'.format(
            int(context.portfolio.cash),
            int(context.portfolio.portfolio_value),
            len(context.portfolio.positions),
        ))
        context.log_data_done = 1

    if not len(z):
        log.info('Empty')
        return

    # Options
    log_nan_only = 0          # Only log if nans are present
    show_sectors = 0          # If sectors, do you want to see them or not
    show_sorted_details = 1   # [num] high & low securities sorted, each column
    padmax = 6                # num characters for each field, starting point

    # Series ......
    if 'Series' in str(type(z)):    # is Series, not DataFrame
        nan_count = len(z[z != z])
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        if (log_nan_only and nan_count) or not log_nan_only:
            pad = max( padmax, len('%.5f' % z.max()) )
            log.info('{}{}{}   Series  len {}'.format('min'.rjust(pad+5),
                'mean'.rjust(pad+5), 'max'.rjust(pad+5), len(z)))
            log.info('{}{}{} {}'.format(
                ('%.5f' % z.min()) .rjust(pad+5),
                ('%.5f' % z.mean()).rjust(pad+5),
                ('%.5f' % z.max()) .rjust(pad+5),
                nan_count
            ))
            log.info('High\n{}'.format(z.sort_values(ascending=False).head(num)))
            log.info('Low\n{}' .format(z.sort_values(ascending=False).tail(num)))
        return

    # DataFrame ......
    content_min_max = [ ['','min','mid','max',''] ] ; content = ''
    for col in z.columns:
        #try: z[col].max()
        #except:
        #    log.info('{} non-numeric'.format(col))
        #    #continue   # skip non-numeric
        if col == 'sector' and not show_sectors: continue
        nan_count = len(z[col][z[col] != z[col]])
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        # known bug, not always sorting strings alphabetically ...
        srt       = z[col].sort_values() if type(z[col][0]) != str else z.iloc[z[col].str.lower().argsort()]
        padmax    = max( padmax, len(str(srt[-1])) )
        content_min_max.append([col, str(srt[0]), str(srt[len(srt)//2]), str(srt[-1]), nan_count])
    if log_nan_only and nan_count or not log_nan_only:
        if len(z.columns) == 1: content = 'Stocks: {}'.format(z.shape[0])
        if len(z.columns)  > 1: content = 'Stocks: {}  Columns: {}'.format(z.shape[0], z.shape[1])
        if len(z.columns):
            paddings = [6 for i in range(4)]
            for lst in content_min_max:    # set max lengths
                i = 0
                for val in lst[:4]:        # value in each sub-list
                    paddings[i] = max(paddings[i], len(str(val)))
                    i += 1
            headr = content_min_max[0]
            content += ('\n{}{}{}{}{}'.format(
                 headr[0] .rjust(paddings[0]),
                (headr[1]).rjust(paddings[1]+5),
                (headr[2]).rjust(paddings[2]+5),
                (headr[3]).rjust(paddings[3]+5),
                ''
            ))
            for lst in content_min_max[1:]:    # populate content using max lengths
                content += ('\n{}{}{}{}     {}'.format(
                    lst[0].rjust(paddings[0]),
                    lst[1].rjust(paddings[1]+5),
                    lst[2].rjust(paddings[2]+5),
                    lst[3].rjust(paddings[3]+5),
                    lst[4],
                ))
        log.info(content)

    if not show_sorted_details: return
    if len(z.columns) == 1:     return     # skip detail if only 1 column
    details = z.columns if fields == None else fields
    for detail in details:
        if detail == 'sector' and not show_sectors: continue
        hi = z[details].sort_values(by=detail, ascending=False).head(num)
        lo = z[details].sort_values(by=detail, ascending=False).tail(num)
        content  = ''
        content += ('_ _ _   {}   _ _ _'  .format(detail))
        content += ('\n\t... {} highs\n{}'.format(detail, str(hi)))
        content += ('\n\t... {} lows \n{}'.format(detail, str(lo)))
        if log_nan_only and not len(lo[lo[detail] != lo[detail]]):
            continue  # skip if no nans
        log.info(content)

def mxlv(context, data):
    if context.account.leverage > context.mxlv:
        context.mxlv = context.account.leverage

It does not matter how much alpha you get in a backtest if a trading strategy cannot at least survive its frictional costs.

The first acid test for any trading strategy is to find out whether it can survive these frictional costs (commissions, slippage, and other fees). The second is to see whether it breaks down going forward (give it more time). And the third is to figure out whether it is scalable (give it more money to manage).
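For reference, restoring realistic costs in the algos above just means deleting the zero-cost overrides in initialize(), or setting something like the following, where the per-share and basis-point values are assumptions close to Quantopian's defaults:

def initialize(context):
    # Assumed values, close to Quantopian's default cost models:
    set_commission(commission.PerShare(cost=0.001, min_trade_cost=0))
    set_slippage(slippage.FixedBasisPointsSlippage(basis_points=5.0, volume_limit=0.1))
    # ... rest of initialize() as in the algos above ...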

All 3 tests were done simultaneously in the attached algo using Luc's version (Backtest ID: 5c62b8dee310ed49b6a0c97e).

I have not read the program in detail; however, I have no motivation to go any further.

(The attached code is Luc's RFE algorithm from above, unchanged except that the zero-cost set_slippage and set_commission overrides are commented out, restoring the default commission and slippage models.)

@Guy, thank you for your contribution. Yes, it was obvious to me that an algo with a turnover of 100%+ will fail given a standard slippage of 5 bps. That is why I posted my results, asking the community for ideas on how to reduce the turnover, if at all possible.

Thanks @Blue for posting your code. Your norm() function has decreased turnover (TO) to 18%; I have yet to understand why. I will study your code carefully.

Is there any filter that could be applied to shrink the universe and, in turn, reduce the TO?

/Luc
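One untested possibility is to shrink the pipeline universe itself, e.g. from QTradableStocksUS to the smaller, more liquid Q500US (already imported in the algos above); fewer and more stable names should mean less daily membership churn:

def make_pipeline():
    # Q500US is a smaller, more liquid universe than QTradableStocksUS.
    # Whether this actually reduces turnover here is untested.
    return Pipeline(
        screen=Q500US(),
        columns={
            'open': USEquityPricing.open.latest,
        }
    )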

You might try lengthening the lookback window, using exponentially weighted scores, and shrinking the covariance matrix used to build the PCA.
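For illustration, here is a minimal sketch of the last two suggestions, exponentially weighted observations plus a Ledoit-Wolf-shrunk covariance in place of plain PCA (the 60-day half-life and the function name are assumptions, not from the algos above):

import numpy as np
from sklearn.covariance import LedoitWolf

def shrunk_pca_factors(rets, n_components=10, halflife=60):
    """rets: T x N array of standardized daily returns, oldest row first."""
    # Exponential weights: the most recent day gets weight 1.
    w = 0.5 ** (np.arange(len(rets))[::-1] / float(halflife))
    weighted = rets * np.sqrt(w)[:, None]

    # Ledoit-Wolf shrinks the sample covariance toward a structured target,
    # which stabilizes the eigenvectors from day to day (less factor churn).
    cov = LedoitWolf().fit(weighted).covariance_

    # PCA on the shrunk covariance = projection onto its top eigenvectors.
    evals, evecs = np.linalg.eigh(cov)        # eigenvalues ascending
    top = np.argsort(evals)[::-1][:n_components]
    return rets.dot(evecs[:, top])            # T x n_components factor scores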

Your algorithm might be better used as a short-term signal for long-term trading. See the paper "To Trade or Not to Trade? Informed Trading with Short-Term Signals for Long-Term Investors".