Back to Community
Efficient N Days Ago Factor Research and Algo Templates

I spent the last day or so working to implement a research notebook and a corresponding algo inspired by Grant's clustering notebook and David's contribution regarding a more efficient pipeline that avoids recalculating values.

In the templates, I specified arbitrary factors: two custom factors, and one fundamental factor (that I implemented as a CustomFactor to work with the code setup). I then combined the factors by scaling all features and simply adding them. In the algo specifically, I did my best to mimic Grant's method of meeting the contest criteria while using the Optimize API. There are arbitrary parameters there that you can feel free to change.

There is no hypothesis for the template (i.e. there's no reason why I chose the factors that I did or how I chose to combine them). Rather, it's a demonstration of how you can implement your own alpha factors and include previous days' factor data. It's up to you to find meaningful signals. :)

In theory, you should be able to simply edit the make_factors() function to your liking and copy the changes into the algo version. The same goes for how you combine your alpha factors.

Loading notebook preview...
Notebook previews are currently unavailable.
7 responses

EDIT: See a few posts below for a slightly changed template. I adjusted the two lines that David points out in the next post to support more combinations of factor names. Basically, it should work provided your factor name doesn't have a hyphen in it.

And here's the corresponding algo, run for the same time period. Not the best returns! ;)

Clone Algorithm
20
Loading...
Backtest from to with initial capital
Total Returns
--
Alpha
--
Beta
--
Sharpe
--
Sortino
--
Max Drawdown
--
Benchmark Returns
--
Volatility
--
Returns 1 Month 3 Month 6 Month 12 Month
Alpha 1 Month 3 Month 6 Month 12 Month
Beta 1 Month 3 Month 6 Month 12 Month
Sharpe 1 Month 3 Month 6 Month 12 Month
Sortino 1 Month 3 Month 6 Month 12 Month
Volatility 1 Month 3 Month 6 Month 12 Month
Max Drawdown 1 Month 3 Month 6 Month 12 Month
"""
This is a template algorithm on Quantopian for you to adapt and fill in.
"""
import quantopian.algorithm as algo
from quantopian.algorithm import attach_pipeline, pipeline_output
import quantopian.optimize as opt
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.experimental import risk_loading_pipeline
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors import CustomFactor, SimpleBeta

from sklearn import preprocessing
from scipy.stats.mstats import winsorize

import pandas as pd
import numpy as np

WINSORIZE_LIMIT = 0  # fraction trimmed at each tail by preprocess()'s winsorize (0 disables winsorizing)
WINDOW_LENGTH = 5  # trailing window (in days) of alpha factors exported to before_trading_start

# Optimize API constraints
MAX_POSITION_SIZE = 0.01  # per-asset weight bound; 0.01 yields roughly 100 positions
USE_MaxTurnover = True  # set to True to use Optimize API MaxTurnover constraint
MIN_TURN = 0.06  # starting MaxTurnover bound (if optimize fails, incrementally looser bounds are attempted)


def preprocess(a):
    """Demean, winsorize, and standardize a raw factor vector.

    Infinities are treated as missing; missing values become zero after
    demeaning (i.e. the cross-sectional mean); the result is scaled to
    zero mean / unit variance.
    """
    values = a.astype('float64')  # always copies, so the caller's array is untouched
    values[np.isinf(values)] = np.nan
    values = np.nan_to_num(values - np.nanmean(values))
    values = winsorize(values, limits=[WINSORIZE_LIMIT, WINSORIZE_LIMIT])
    return preprocessing.scale(values)


def make_factors():
    """Build the alpha factor library.

    Returns a dict mapping factor name -> CustomFactor subclass (not an
    instance).  Edit this function to add or remove factors; the rest of
    the template discovers them dynamically via the dict keys.
    """

    class Direction(CustomFactor):
        # Contrarian intraday signal: sum of negated (close-open)/close
        # moves over the window, so assets that closed below their open
        # score higher.
        inputs = [USEquityPricing.open, USEquityPricing.close]
        window_length = 21
        window_safe = True  # allows use as an input to Factor_N_Days_Ago

        def compute(self, today, assets, out, open, close):
            # Per-day intraday return, normalized by the close.
            p = (close - open) / close

            out[:] = preprocess(np.nansum(-p, axis=0))

    class mean_rev(CustomFactor):
        # Mean-reversion signal built from the "typical price" (H+L+C)/3.
        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close]
        window_length = 30
        window_safe = True  # allows use as an input to Factor_N_Days_Ago

        def compute(self, today, assets, out, high, low, close):
            # Typical price, per day per asset.
            p = (high + low + close) / 3

            m = len(close[0, :])  # number of assets
            n = len(close[:, 0])  # number of days in the window

            b = np.zeros(m)  # accumulates "trades below trailing mean" evidence
            a = np.zeros(m)  # accumulates the inverse evidence

            # Blend lookbacks from 10 days up to the full window.
            for k in range(10, n + 1):
                # k-day mean typical price relative to the latest price:
                # > 1 when the asset trades below its trailing average.
                price_rel = np.nanmean(p[-k:, :], axis=0) / p[-1, :]
                # NOTE(review): wt sums price_rel across ALL assets, so each
                # term is scaled by a cross-sectional aggregate — confirm
                # this is intended rather than a per-asset weight.
                wt = np.nansum(price_rel)
                b += wt * price_rel
                price_rel = 1.0 / price_rel
                wt = np.nansum(price_rel)
                a += wt * price_rel

            out[:] = preprocess(b - a)

    class fcf(CustomFactor):
        # Most recent free-cash-flow yield, standardized cross-sectionally.
        inputs = [Fundamentals.fcf_yield]
        window_length = 1
        window_safe = True  # allows use as an input to Factor_N_Days_Ago
        def compute(self, today, assets, out, fcf_yield):
            out[:] = preprocess(np.nan_to_num(fcf_yield[-1, :]))
            
    # Keys become the pipeline column-name prefixes ('<name>-<days_ago>').
    factors = {
        'Direction': Direction,
        'mean_rev': mean_rev,
        'fcf': fcf
    }

    return factors


class Factor_N_Days_Ago(CustomFactor):
    """Surface another factor's value as it was N days ago.

    Instantiate with ``[some_factor]`` as the inputs list and
    ``window_length=N + 1``; ``compute`` then emits the oldest row of the
    trailing window, i.e. the factor's value N trading days in the past.
    """

    def compute(self, today, assets, out, input_factor):
        # Row 0 is the oldest row of the trailing window.
        out[:] = input_factor[0]


def init_pipeline():
    """Warm-up pipeline emitting every factor at lags 0..WINDOW_LENGTH-1.

    Columns are named '<factor>-<days_ago>' so before_trading_start can
    parse the factor name and lag back out of the column label.
    """
    universe = QTradableStocksUS()
    columns = {}
    for name, factor_cls in make_factors().items():
        for lag in range(WINDOW_LENGTH - 1, -1, -1):
            columns['%s-%d' % (name, lag)] = Factor_N_Days_Ago(
                [factor_cls(mask=universe)],
                window_length=lag + 1,
                mask=universe,
            )

    return Pipeline(columns=columns, screen=universe)


def factor_pipeline():
    """Daily pipeline with one column per factor (today's value only)."""
    columns = {name: factor() for name, factor in make_factors().items()}

    return Pipeline(columns=columns, screen=QTradableStocksUS())


def beta_pipeline():
    """Pipeline exposing each stock's trailing 260-day beta to sid 8554."""
    market_beta = SimpleBeta(
        target=sid(8554),
        regression_length=260,
        allowed_missing_percentage=1.0,
    )

    return Pipeline(columns={'beta': market_beta}, screen=QTradableStocksUS())


def initialize(context):
    """Called once at the start of the algorithm: set state, schedules, pipelines."""
    # Rolling store of per-stock, per-factor alphas for the last WINDOW_LENGTH days.
    context.alphas = pd.DataFrame()
    context.first_trading_day = True
    context.init = True

    # Trade once a day, one hour into the session.
    algo.schedule_function(
        rebalance,
        algo.date_rules.every_day(),
        algo.time_rules.market_open(hours=1),
    )

    # Log tracking variables at the close.
    algo.schedule_function(
        record_vars,
        algo.date_rules.every_day(),
        algo.time_rules.market_close(),
    )

    # Risk model, beta, the multi-lag warm-up pipeline, and the daily
    # single-lag factor pipeline.
    attach_pipeline(risk_loading_pipeline(), 'risk_loading_pipeline')
    attach_pipeline(beta_pipeline(), 'beta_pipeline')
    attach_pipeline(init_pipeline(), 'pipeinit')
    attach_pipeline(factor_pipeline(), 'pipeline')


def before_trading_start(context, data):
    """Daily prep: pull pipeline outputs and maintain the rolling alpha table.

    On the first trading day the 'pipeinit' pipeline supplies all
    WINDOW_LENGTH lags at once; afterwards only today's factor values are
    computed and the stored table is aged by one day.
    """
    # Extract the risk and beta pipelines.
    risk_loadings = pipeline_output('risk_loading_pipeline')
    risk_loadings.fillna(risk_loadings.median(), inplace=True)
    context.risk_loadings = risk_loadings
    context.beta_pipeline = pipeline_output('beta_pipeline')

    if context.first_trading_day:
        # Seed the alpha table from the lagged warm-up pipeline.
        df = pipeline_output("pipeinit").dropna().astype('float64')  # use init_pipe for research
        df = df.stack().to_frame()
        df.index.names = ['stock', 'alphas']  # exclude 'date' for the algo environ, include for research
        df = df.reset_index(level=['alphas', 'stock'])

        # Column labels look like '<factor>-<days_ago>'.  Anchoring both
        # patterns on the '-' separator keeps factor names that contain
        # digits (e.g. 'momentum10') from being misparsed, and raw strings
        # avoid invalid-escape warnings for '\w'/'\d'.
        df['factor'] = df['alphas'].str.extract(r'(\w+\s*\w+)-')
        df['day'] = df['alphas'].str.extract(r'-(\d+)').astype('int32')
        df = df.drop('alphas', axis=1)
        df = df.reset_index().set_index(['stock', 'factor', 'day'])  # exclude 'date' for the algo environ

        # Pivot the day level into columns; column 0 holds the alpha values.
        df = df[0]
        df = df.unstack(level=2)  # use 2 for the algo environ, 3 for research

        context.alphas = df
        context.first_trading_day = False
    else:
        # Today's single-day factor values.
        df = pipeline_output("pipeline").dropna().astype('float64')  # factor_pipe.xs(date) for research
        df = df.stack().to_frame()
        df.index.names = ['stock', 'factor']

        # Age the stored alphas by one day and prepend today's values.
        context.alphas = context.alphas.drop([WINDOW_LENGTH - 1], axis=1)  # drop the oldest lag
        context.alphas.columns = range(1, WINDOW_LENGTH)  # remaining lags are now one day older
        context.alphas = pd.concat([df, context.alphas], axis=1)


def rebalance(context, data):
    """Combine the alphas and order the optimal portfolio via the Optimize API.

    Constraints: gross exposure, dollar neutrality, position concentration,
    risk-model exposure, and beta neutrality.  After the initial order, a
    MaxTurnover constraint is added and progressively loosened from
    MIN_TURN until the optimizer finds a feasible solution.
    """
    # One row per stock, one column per (factor, day) pair.
    alphas_flattened = context.alphas.unstack().dropna()

    # Standardize every column so the factors combine on equal footing.
    scaler = preprocessing.StandardScaler()
    alphas_flattened.loc[:, :] = scaler.fit_transform(alphas_flattened)

    # Naive combination: equal-weighted sum of the scaled factors.
    context.combined_alpha = alphas_flattened.sum(axis=1)

    objective = opt.MaximizeAlpha(context.combined_alpha)

    constraints = [
        opt.MaxGrossExposure(1.0),
        opt.DollarNeutral(),
        opt.PositionConcentration.with_equal_bounds(
            min=-MAX_POSITION_SIZE, max=MAX_POSITION_SIZE),
        opt.experimental.RiskModelExposure(
            context.risk_loadings, version=opt.Newest),
        opt.FactorExposure(
            loadings=context.beta_pipeline[['beta']],
            min_exposures={'beta': 0},
            max_exposures={'beta': 0}),
    ]

    # The first order is placed without a turnover constraint — there is no
    # existing portfolio to turn over yet.
    if context.init:
        order_optimal_portfolio(objective=objective, constraints=constraints)
        if USE_MaxTurnover:
            context.init = False
        return

    # Try progressively looser turnover bounds until the optimizer succeeds.
    # Passing a fresh list each attempt avoids the fragile append/pop dance
    # on the shared constraints list.
    for max_turnover in np.linspace(MIN_TURN, 0.65, num=100):
        try:
            order_optimal_portfolio(
                objective=objective,
                constraints=constraints + [opt.MaxTurnover(max_turnover)],
            )
            record(max_turnover=max_turnover)
            return
        except Exception:
            # Infeasible at this bound; loosen and retry.  (A bare 'except'
            # would also swallow KeyboardInterrupt/SystemExit.)
            continue


def record_vars(context, data):
    """
    Plot variables at the end of each day.
    """
    # Track position count to confirm MAX_POSITION_SIZE yields ~100 names.
    record(num_positions=len(context.portfolio.positions))


def handle_data(context, data):
    """
    Called every minute.
    """
    # All trading happens in the scheduled rebalance(); nothing to do here.
    pass
There was a runtime error.

Hi Kyle,
Could you explain these two lines to me:

175 df['factor'] = df['alphas'].str.extract('(\w+\s*\w+)')
176 df['day'] = df['alphas'].str.extract('(\d+)').astype('int32')

Then one could use a uint8 for the days counter (though that really saves almost nothing :-).

Thanks,
David

Hi David,

I'm using string pattern matching to dynamically pick out the factor names and dates associated with them before pivoting into the final table format.

The first line you reference would match any of these factors (as specified by the dictionary key):

factors = {  
        'Direction': Direction,  
        'mean_rev': mean_rev,  
        'fcf': fcf  
    }  

The second line picks out the days ago associated with the factor. Because the initial pipeline spits out columns named "fcf-0", "fcf-1", etc., I needed to pick out the days ago number so that the data could be aligned in-time and combined with the second pipeline.

You're right that I could have used uint8 for the days counter. I don't foresee anyone running the pipeline with more than 255 days ago of data.

Hope that helps!

I understood that you were picking out the name and days...

But then I don't understand the \w+\s*\w+ and \d+.

In my version, I "stupidly" rebuild the series by searching for the '-' character and taking what comes before as the factor name and what comes after as the day. I made it that way to be robust to factor names such as momentum10, momentum20, or worse, "mom20 var2"... Ok, I know spaces should be avoided, but I like white spaces ;-).

I don't think your way is "stupid". It's probably more robust than what I'm doing here...

Regarding the \d+, that just matches the day number. In the way I've set it up, it's equivalent to just taking what's after the '-', provided there are no other hyphens in the factor name.

Regarding the \w+\s*\w+, I found that it encompasses the factor names I was using. The \w+ will match all letters, numbers and '_'. The \s* is an optional space in the factor name. My setup is limited in that it cannot support more than one space in the factor name. But for control's sake, I felt safer specifying something to match than taking whatever is before the '-' as you did. I don't think either way is necessarily better, but I wanted to have a little more control/filtering. :)

Here's the updated algo with slightly different pattern matching. I updated the notebook in the original post as well.

Clone Algorithm
20
Loading...
Backtest from to with initial capital
Total Returns
--
Alpha
--
Beta
--
Sharpe
--
Sortino
--
Max Drawdown
--
Benchmark Returns
--
Volatility
--
Returns 1 Month 3 Month 6 Month 12 Month
Alpha 1 Month 3 Month 6 Month 12 Month
Beta 1 Month 3 Month 6 Month 12 Month
Sharpe 1 Month 3 Month 6 Month 12 Month
Sortino 1 Month 3 Month 6 Month 12 Month
Volatility 1 Month 3 Month 6 Month 12 Month
Max Drawdown 1 Month 3 Month 6 Month 12 Month
"""
This is a template algorithm on Quantopian for you to adapt and fill in.
"""
import quantopian.algorithm as algo
from quantopian.algorithm import attach_pipeline, pipeline_output
import quantopian.optimize as opt
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.experimental import risk_loading_pipeline
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors import CustomFactor, SimpleBeta

from sklearn import preprocessing
from scipy.stats.mstats import winsorize

import pandas as pd
import numpy as np

WINSORIZE_LIMIT = 0  # fraction trimmed at each tail by preprocess()'s winsorize (0 disables winsorizing)
WINDOW_LENGTH = 5  # trailing window (in days) of alpha factors exported to before_trading_start

# Optimize API constraints
MAX_POSITION_SIZE = 0.01  # per-asset weight bound; 0.01 yields roughly 100 positions
USE_MaxTurnover = True  # set to True to use Optimize API MaxTurnover constraint
MIN_TURN = 0.06  # starting MaxTurnover bound (if optimize fails, incrementally looser bounds are attempted)


def preprocess(a):
    """Demean, winsorize, and standardize a raw factor vector.

    Infinities are treated as missing; missing values become zero after
    demeaning (i.e. the cross-sectional mean); the result is scaled to
    zero mean / unit variance.
    """
    values = a.astype('float64')  # always copies, so the caller's array is untouched
    values[np.isinf(values)] = np.nan
    values = np.nan_to_num(values - np.nanmean(values))
    values = winsorize(values, limits=[WINSORIZE_LIMIT, WINSORIZE_LIMIT])
    return preprocessing.scale(values)


def make_factors():
    """Build the alpha factor library.

    Returns a dict mapping factor name -> CustomFactor subclass (not an
    instance).  Edit this function to add or remove factors; the rest of
    the template discovers them dynamically via the dict keys.
    """

    class Direction(CustomFactor):
        # Contrarian intraday signal: sum of negated (close-open)/close
        # moves over the window, so assets that closed below their open
        # score higher.
        inputs = [USEquityPricing.open, USEquityPricing.close]
        window_length = 21
        window_safe = True  # allows use as an input to Factor_N_Days_Ago

        def compute(self, today, assets, out, open, close):
            # Per-day intraday return, normalized by the close.
            p = (close - open) / close

            out[:] = preprocess(np.nansum(-p, axis=0))

    class mean_rev(CustomFactor):
        # Mean-reversion signal built from the "typical price" (H+L+C)/3.
        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close]
        window_length = 30
        window_safe = True  # allows use as an input to Factor_N_Days_Ago

        def compute(self, today, assets, out, high, low, close):
            # Typical price, per day per asset.
            p = (high + low + close) / 3

            m = len(close[0, :])  # number of assets
            n = len(close[:, 0])  # number of days in the window

            b = np.zeros(m)  # accumulates "trades below trailing mean" evidence
            a = np.zeros(m)  # accumulates the inverse evidence

            # Blend lookbacks from 10 days up to the full window.
            for k in range(10, n + 1):
                # k-day mean typical price relative to the latest price:
                # > 1 when the asset trades below its trailing average.
                price_rel = np.nanmean(p[-k:, :], axis=0) / p[-1, :]
                # NOTE(review): wt sums price_rel across ALL assets, so each
                # term is scaled by a cross-sectional aggregate — confirm
                # this is intended rather than a per-asset weight.
                wt = np.nansum(price_rel)
                b += wt * price_rel
                price_rel = 1.0 / price_rel
                wt = np.nansum(price_rel)
                a += wt * price_rel

            out[:] = preprocess(b - a)

    class fcf(CustomFactor):
        # Most recent free-cash-flow yield, standardized cross-sectionally.
        inputs = [Fundamentals.fcf_yield]
        window_length = 1
        window_safe = True  # allows use as an input to Factor_N_Days_Ago
        def compute(self, today, assets, out, fcf_yield):
            out[:] = preprocess(np.nan_to_num(fcf_yield[-1, :]))
            
    # Keys become the pipeline column-name prefixes ('<name>-<days_ago>').
    factors = {
        'Direction': Direction,
        'mean_rev': mean_rev,
        'fcf': fcf
    }

    return factors


class Factor_N_Days_Ago(CustomFactor):
    """Surface another factor's value as it was N days ago.

    Instantiate with ``[some_factor]`` as the inputs list and
    ``window_length=N + 1``; ``compute`` then emits the oldest row of the
    trailing window, i.e. the factor's value N trading days in the past.
    """

    def compute(self, today, assets, out, input_factor):
        # Row 0 is the oldest row of the trailing window.
        out[:] = input_factor[0]


def init_pipeline():
    """Warm-up pipeline emitting every factor at lags 0..WINDOW_LENGTH-1.

    Columns are named '<factor>-<days_ago>' so before_trading_start can
    parse the factor name and lag back out of the column label.
    """
    universe = QTradableStocksUS()
    columns = {}
    for name, factor_cls in make_factors().items():
        for lag in range(WINDOW_LENGTH - 1, -1, -1):
            columns['%s-%d' % (name, lag)] = Factor_N_Days_Ago(
                [factor_cls(mask=universe)],
                window_length=lag + 1,
                mask=universe,
            )

    return Pipeline(columns=columns, screen=universe)


def factor_pipeline():
    """Daily pipeline with one column per factor (today's value only)."""
    columns = {name: factor() for name, factor in make_factors().items()}

    return Pipeline(columns=columns, screen=QTradableStocksUS())


def beta_pipeline():
    """Pipeline exposing each stock's trailing 260-day beta to sid 8554."""
    market_beta = SimpleBeta(
        target=sid(8554),
        regression_length=260,
        allowed_missing_percentage=1.0,
    )

    return Pipeline(columns={'beta': market_beta}, screen=QTradableStocksUS())


def initialize(context):
    """Called once at the start of the algorithm: set state, schedules, pipelines."""
    # Rolling store of per-stock, per-factor alphas for the last WINDOW_LENGTH days.
    context.alphas = pd.DataFrame()
    context.first_trading_day = True
    context.init = True

    # Trade once a day, one hour into the session.
    algo.schedule_function(
        rebalance,
        algo.date_rules.every_day(),
        algo.time_rules.market_open(hours=1),
    )

    # Log tracking variables at the close.
    algo.schedule_function(
        record_vars,
        algo.date_rules.every_day(),
        algo.time_rules.market_close(),
    )

    # Risk model, beta, the multi-lag warm-up pipeline, and the daily
    # single-lag factor pipeline.
    attach_pipeline(risk_loading_pipeline(), 'risk_loading_pipeline')
    attach_pipeline(beta_pipeline(), 'beta_pipeline')
    attach_pipeline(init_pipeline(), 'pipeinit')
    attach_pipeline(factor_pipeline(), 'pipeline')


def before_trading_start(context, data):
    """Daily prep: pull pipeline outputs and maintain the rolling alpha table.

    On the first trading day the 'pipeinit' pipeline supplies all
    WINDOW_LENGTH lags at once; afterwards only today's factor values are
    computed and the stored table is aged by one day.
    """
    # Extract the risk and beta pipelines.
    risk_loadings = pipeline_output('risk_loading_pipeline')
    risk_loadings.fillna(risk_loadings.median(), inplace=True)
    context.risk_loadings = risk_loadings
    context.beta_pipeline = pipeline_output('beta_pipeline')

    if context.first_trading_day:
        # Seed the alpha table from the lagged warm-up pipeline.
        df = pipeline_output("pipeinit").dropna().astype('float64')  # use init_pipe for research
        df = df.stack().to_frame()
        df.index.names = ['stock', 'alphas']  # exclude 'date' for the algo environ, include for research
        df = df.reset_index(level=['alphas', 'stock'])

        # Column labels look like '<factor>-<days_ago>'.  Both patterns are
        # anchored on the '-' separator so digit-bearing factor names
        # (e.g. 'momentum10') parse correctly; raw strings avoid
        # invalid-escape warnings for '\w'/'\d'.
        df['factor'] = df['alphas'].str.extract(r'(\w+\s*\w+)-')
        df['day'] = df['alphas'].str.extract(r'-(\d+)').astype('int32')
        df = df.drop('alphas', axis=1).reset_index()

        # Pivot: rows indexed by (stock, factor), one column per days-ago
        # lag; the values live in the column labeled 0 after stacking.
        df = df.pivot_table(index=['stock', 'factor'], columns='day', values=0)  # exclude 'date' for the algo environ

        context.alphas = df
        context.first_trading_day = False
    else:
        # Today's single-day factor values.
        df = pipeline_output("pipeline").dropna().astype('float64')  # factor_pipe.xs(date) for research
        df = df.stack().to_frame()
        df.index.names = ['stock', 'factor']

        # Age the stored alphas by one day and prepend today's values.
        context.alphas = context.alphas.drop([WINDOW_LENGTH - 1], axis=1)  # drop the oldest lag
        context.alphas.columns = range(1, WINDOW_LENGTH)  # remaining lags are now one day older
        context.alphas = pd.concat([df, context.alphas], axis=1)


def rebalance(context, data):
    """Combine the alphas and order the optimal portfolio via the Optimize API.

    Constraints: gross exposure, dollar neutrality, position concentration,
    risk-model exposure, and beta neutrality.  After the initial order, a
    MaxTurnover constraint is added and progressively loosened from
    MIN_TURN until the optimizer finds a feasible solution.
    """
    # One row per stock, one column per (factor, day) pair.
    alphas_flattened = context.alphas.unstack().dropna()

    # Standardize every column so the factors combine on equal footing.
    scaler = preprocessing.StandardScaler()
    alphas_flattened.loc[:, :] = scaler.fit_transform(alphas_flattened)

    # Naive combination: equal-weighted sum of the scaled factors.
    context.combined_alpha = alphas_flattened.sum(axis=1)

    objective = opt.MaximizeAlpha(context.combined_alpha)

    constraints = [
        opt.MaxGrossExposure(1.0),
        opt.DollarNeutral(),
        opt.PositionConcentration.with_equal_bounds(
            min=-MAX_POSITION_SIZE, max=MAX_POSITION_SIZE),
        opt.experimental.RiskModelExposure(
            context.risk_loadings, version=opt.Newest),
        opt.FactorExposure(
            loadings=context.beta_pipeline[['beta']],
            min_exposures={'beta': 0},
            max_exposures={'beta': 0}),
    ]

    # The first order is placed without a turnover constraint — there is no
    # existing portfolio to turn over yet.
    if context.init:
        order_optimal_portfolio(objective=objective, constraints=constraints)
        if USE_MaxTurnover:
            context.init = False
        return

    # Try progressively looser turnover bounds until the optimizer succeeds.
    # Passing a fresh list each attempt avoids the fragile append/pop dance
    # on the shared constraints list.
    for max_turnover in np.linspace(MIN_TURN, 0.65, num=100):
        try:
            order_optimal_portfolio(
                objective=objective,
                constraints=constraints + [opt.MaxTurnover(max_turnover)],
            )
            record(max_turnover=max_turnover)
            return
        except Exception:
            # Infeasible at this bound; loosen and retry.  (A bare 'except'
            # would also swallow KeyboardInterrupt/SystemExit.)
            continue


def record_vars(context, data):
    """
    Plot variables at the end of each day.
    """
    # Track position count to confirm MAX_POSITION_SIZE yields ~100 names.
    record(num_positions=len(context.portfolio.positions))


def handle_data(context, data):
    """
    Called every minute.
    """
    # All trading happens in the scheduled rebalance(); nothing to do here.
    pass
There was a runtime error.

Thanks for the details!