Fundamental data KMeans clustering

This is an algorithm that clusters stocks based on a set of fundamentals (I used Max's set of 4 fundamental ratios as a basis). It then tries to rank the clusters and take long-short positions in the top and bottom clusters, and the ranking is where this needs improving: for now it is done on past performance, which is somewhat naive. I am calling on Q's best and brightest to suggest better ways to rank the clusters. Or maybe this is just a dead end.
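To make the ranking step concrete, here is a toy sketch of the cluster-then-rank logic outside the Quantopian environment (the stock names, data, and KMeans settings below are made up purely for illustration):

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Toy inputs: z-scored fundamentals and trailing returns per stock.
rng = np.random.default_rng(0)
stocks = ["STOCK_%d" % i for i in range(300)]
fundamentals = pd.DataFrame(
    rng.standard_normal((300, 4)),
    index=stocks,
    columns=["debt_asset", "capex_vol", "fcf_ev", "revenue_ltm"],
)
past_return = pd.Series(rng.standard_normal(300) * 0.05, index=stocks)

# 1. Cluster stocks on fundamentals only.
labels = pd.Series(
    KMeans(n_clusters=20, n_init=50, random_state=1234).fit_predict(fundamentals),
    index=stocks,
)

# 2. Rank clusters by the mean past return of their members (the naive step
#    I would like to improve on).
cluster_perf = past_return.groupby(labels).mean().sort_values(ascending=False)

# 3. Go long the members of the top two clusters, short the bottom two.
longs = labels[labels.isin(cluster_perf.index[:2])].index
shorts = labels[labels.isin(cluster_perf.index[-2:])].index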

Some of the code for cluster manipulation comes from a post by Jonathan Larkin.

/Luc

import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import QTradableStocksUS, Q500US

from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.decomposition import PCA
from sklearn import preprocessing

from quantopian.pipeline.data import factset, Fundamentals, morningstar
from quantopian.pipeline.factors import CustomFactor

import numpy as np
import pandas as pd

def initialize(context):
    algo.schedule_function(
        rebalance,
        algo.date_rules.month_start(days_offset=1),
        algo.time_rules.market_open(hours=3),
    )

    algo.schedule_function(
        record_vars,
        algo.date_rules.every_day(),
        algo.time_rules.market_close(),
    )

    algo.attach_pipeline(make_pipeline(), 'pipeline')

    
class DebtToTotalAssets(CustomFactor):
    """
    Net debt to total assets:
    (long-term debt + max(short-term debt - cash, 0)) / total assets
    """
    inputs = [Fundamentals.long_term_debt,
              Fundamentals.current_debt,
              Fundamentals.cash_and_cash_equivalents,
              Fundamentals.total_assets]
    window_length = 1

    def compute(self, today, assets, out, ltd, std, cce, ta):
        # Short-term debt net of cash, floored at zero.
        std_part = np.maximum(std - cce, np.zeros(std.shape))

        out[:] = np.divide(ltd + std_part, ta)

        
class TEM(CustomFactor):
    """
    TEM = standard deviation of capex / total assets over the past
    6 quarterly reports (a capex volatility measure).
    """
    window_length = 390  # roughly 6 quarters of daily rows

    def compute(self, today, assets, out, asof_date, capex, total_assets):
        values = capex / total_assets
        for column_ix in range(asof_date.shape[1]):
            # Keep one row per quarterly report (unique asof dates).
            _, unique_indices = np.unique(asof_date[:, column_ix], return_index=True)
            quarterly_values = values[unique_indices, column_ix]
            # Pad with NaN if fewer than 6 reports are available.
            if len(quarterly_values) < 6:
                quarterly_values = np.hstack([
                    np.repeat([np.nan], 6 - len(quarterly_values)),
                    quarterly_values,
                ])
            out[column_ix] = np.std(quarterly_values[-6:])

    

def make_pipeline():
    universe = QTradableStocksUS()

    # Negate so that lower leverage scores higher.
    debt_asset = -DebtToTotalAssets(mask=QTradableStocksUS())
    debt_asset = debt_asset.winsorize(min_percentile=0.02,
                                     max_percentile=0.98,
                                     mask=universe)
    debt_asset = debt_asset.zscore()
    
    capex_vol = TEM(
        inputs=[factset.Fundamentals.capex_qf_asof_date,
                factset.Fundamentals.capex_qf,
                factset.Fundamentals.assets],
        mask=QTradableStocksUS()
    )
    # Negate so that stable (low-volatility) capex scores higher.
    capex_vol = -capex_vol.winsorize(min_percentile=0.02,
                                     max_percentile=0.98,
                                     mask=capex_vol.isfinite())
    capex_vol = capex_vol.zscore()
    
    
    fcf_ev = factset.Fundamentals.free_cf_fcfe_qf.latest / \
               factset.Fundamentals.entrpr_val_qf.latest
    fcf_ev = fcf_ev.winsorize(min_percentile=0.02,
                         max_percentile=0.98,
                         mask=universe & fcf_ev.isfinite())
    fcf_ev = fcf_ev.zscore()
    
    revenue_ltm = factset.Fundamentals.sales_ltm.latest/\
              factset.Fundamentals.mkt_val_public.latest
    revenue_ltm = revenue_ltm.log()
    revenue_ltm = revenue_ltm.winsorize(
        min_percentile=0.02,
        max_percentile=0.98,
        mask=universe & (revenue_ltm > 0) & revenue_ltm.isfinite()
    )
    revenue_ltm = revenue_ltm.zscore()
    
    industry = morningstar.asset_classification.morningstar_industry_group_code.latest
    
    pipe = Pipeline(
        columns={
            'capex_vol': capex_vol,
            'fcf_ev': fcf_ev,
            'debt_asset': debt_asset,
            'revenue_ltm': revenue_ltm,
        },
        screen=universe & (industry != 31055)
    )
    return pipe


def before_trading_start(context, data):
    context.output = algo.pipeline_output('pipeline')
    context.security_list = context.output.index


def rebalance(context, data):
    # Scheduled monthly, but only rebalance at the start of each quarter.
    now = get_datetime()
    if now.month not in [1, 4, 7, 10]:
        return
    
    context.output.dropna(inplace=True)
    
    # Past returns are not used as clustering features; the returns DataFrame
    # is only used below to align the fundamental factors on a common index.
    returns = data.history(context.output.index, 'close', 60, '1d').pct_change().dropna()
    # N_PRIN_COMPONENTS = 10
    # pca = PCA(n_components=N_PRIN_COMPONENTS)
    # pca.fit(returns)

    # Prepare the feature matrix for clustering: one row per stock,
    # one column per fundamental factor.
    X = np.hstack((
        # pca.components_.T,
        context.output['debt_asset'][returns.columns].values[:, np.newaxis],
        context.output['capex_vol'][returns.columns].values[:, np.newaxis],
        context.output['fcf_ev'][returns.columns].values[:, np.newaxis],
        context.output['revenue_ltm'][returns.columns].values[:, np.newaxis],
    ))
     
    X = preprocessing.StandardScaler().fit_transform(X)
    
    # Fit KMeans clusters.
    clf = KMeans(n_clusters=20, n_init=500, random_state=1234, precompute_distances=True)
    clf.fit(X)
    labels = clf.labels_
    # KMeans never assigns the -1 (noise) label; this bookkeeping only matters
    # if you swap in DBSCAN.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("Clusters discovered: %d" % n_clusters_)

    clustered_series = pd.Series(index=returns.columns, data=labels.flatten())
    clustered_series = clustered_series[clustered_series != -1]
    
    counts = clustered_series.value_counts()
    counts = counts.sort_index()
    
    res = []

    # For each cluster, compute the average performance of its members over
    # the past 60 trading days.
    for c in counts.index:
        df = clustered_series[clustered_series == c]
        prices = data.history(df.index, 'close', 60, '1d')
        mean = prices.pct_change().dropna().mean().mean()
        std = prices.pct_change().dropna().mean().std()
        res.append({'c': c, 'mean': mean, 'std': std, 'count': counts[c]})

    res = pd.DataFrame(res)    
    res.sort_values('mean', ascending=False, inplace=True)
    # Filter out very small and very large clusters.
    res = res[res['count'] > 30]
    res = res[res['count'] < 100]
    # Pick the two best and the two worst performing clusters.
    top = res['c'][:2].tolist()
    bot = res['c'][-2:].tolist()
    top_stocks = clustered_series[clustered_series.isin(top)].index
    bot_stocks = clustered_series[clustered_series.isin(bot)].index
    
    
    # Liquidate everything first. Not efficient, but since we only trade once
    # a quarter this keeps things simple.
    for p in context.portfolio.positions:
        if context.portfolio.positions[p] != 0:
            order_target_percent(p, 0.0)

    # If either side is empty, stop here rather than dividing by zero.
    if len(top_stocks) == 0 or len(bot_stocks) == 0:
        return
    l_w = 0.5 / len(top_stocks)
    s_w = 0.5 / len(bot_stocks)
    
    # Go long the best performing clusters and short the worst performing ones.
    for s in top_stocks:
        order_target_percent(s, l_w)
        
    for s in bot_stocks:
        order_target_percent(s, -s_w)
    
    
def record_vars(context, data):
    """
    Plot variables at the end of each day (left empty for now).
    """
    pass
2 responses

Hi Luc,

Grant Kiehne has contributed an algo for alpha combination using spectral clustering that follows a pipeline format and conforms to contest specs. Here's the link: alpha-combination-via-clustering. You can substitute your choice of clustering routine. Hope this helps.
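If you try that substitution inside Luc's rebalance(), a minimal sketch could look like this (assuming the same standardized feature matrix X; the affinity and neighbor settings here are arbitrary choices, not Grant's):

from sklearn.cluster import SpectralClustering

# Drop-in replacement for the KMeans step; X is the standardized
# stock-by-factor matrix built in rebalance().
clf = SpectralClustering(n_clusters=20,
                         affinity='nearest_neighbors',
                         n_neighbors=10,
                         random_state=1234)
labels = clf.fit_predict(X)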

Thanks, James. I looked it up and it is quite an impressive piece of code. I have yet to understand the clustering bit, as it clusters 20 factors (samples), each having 5*1715 features. I will look further into it. But really, Grant did a fantastic job.

Mine works differently: it clusters stocks rather than factors. It groups roughly 1,500 stocks on 4 fundamental factors, creating clusters of stocks with similar fundamentals.
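Concretely, the difference is just which axis of the data you treat as samples. A toy sketch of the two orientations (shapes made up from the numbers above, and KMeans used for both just to show the matrix shapes):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)

# My setup: samples are stocks, features are the 4 fundamental factors.
stocks_by_factors = rng.standard_normal((1500, 4))
stock_labels = KMeans(n_clusters=20, n_init=10).fit_predict(stocks_by_factors)

# Grant's setup, as I understand it: samples are the 20 factors, features
# are their per-stock values, i.e. the matrix is transposed relative to mine.
factors_by_values = rng.standard_normal((20, 5 * 1715))
factor_labels = KMeans(n_clusters=5, n_init=10).fit_predict(factors_by_values)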

/Luc