Back to Community
Help on figuring out a NaN issue

Hello,

I'm trying to replicate the results from "The Little Book that Still Beats the Market". The strategy is simple: it ranks stocks on earnings yield and return on invested capital, then buys and holds the best-ranked stocks for one year. I rank earning_yield from lowest to highest and assign a ranking score, and do the same for ROIC. Once both sets of rankings are complete, I add the two scores together, assign a final rank in the original pandas DataFrame, sort it in descending order, and pick the top 60 stocks to hold.
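In pandas terms, the ranking logic is roughly this (a minimal sketch with made-up values; the column names match my pipeline output):

import pandas as pd

df = pd.DataFrame({
    'earning_yield': [0.10, 0.05, None, 0.20],
    'roic':          [0.30, None, 0.15, 0.25],
})

# rank each factor low-to-high, then sum the two ranks into one score
df['score'] = df['earning_yield'].rank() + df['roic'].rank()
top = df.sort_values(by='score', ascending=False).head(60)

Any row with a NaN in either factor ends up with a NaN score, which I suspect is where things go wrong.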

I am having issues with NaNs. Some symbols have NaN values that are throwing off my rankings. Any ideas on troubleshooting the NaNs? I would like to completely remove those tickers from all ranking lists.

Also, any help with filtering stocks by market cap would be great as well!

"""
This is a template algorithm on Quantopian for you to adapt and fill in.
"""
import quantopian.algorithm as algo
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters import QTradableStocksUS

from quantopian.pipeline.data.morningstar import Fundamentals
import pandas as pd
import numpy as np

def initialize(context):
    """
    Called once at the start of the algorithm.
    """
    # Rebalance at the start of each month, 3 hours after market open.
    algo.schedule_function(
        rebalance,
        algo.date_rules.month_start(),
        algo.time_rules.market_open(hours=3),
    )
    '''
    algo.schedule_function(
        re_try,
        algo.date_rules.month_start(days_offset = 1),
        algo.time_rules.market_open(hours=3),
    )
    
    algo.schedule_function(
        exit,
        algo.date_rules.month_end(),
        algo.time_rules.market_open(hours=3),
    )
    '''
    # Record tracking variables at the end of each day.
    algo.schedule_function(
        record_vars,
        algo.date_rules.every_day(),
        algo.time_rules.market_close(),
    )

    # Create our dynamic stock selector.
    algo.attach_pipeline(make_pipeline(), 'pipeline')


def make_pipeline():
    # Base universe set to the QTradableStocksUS
    base_universe = QTradableStocksUS()

    # Latest fundamentals values (as of yesterday's close).
    market_cap = Fundamentals.market_cap.latest
    pe_ratio = Fundamentals.pe_ratio.latest 
    pb_ratio = Fundamentals.pb_ratio.latest
    value_score = Fundamentals.value_score.latest
    earning_yield = Fundamentals.earning_yield.latest
    roic = Fundamentals.roic.latest
        
    pipe = Pipeline(
        columns={
            'earning_yield': earning_yield,
            'roic' : roic
        },
        screen=base_universe
    )
    return pipe

def before_trading_start(context, data):
    """
    Called every day before market open.
    """
    if len(context.portfolio.positions) < 60:
        context.output = algo.pipeline_output('pipeline')
        context.output['score'] = 0.0
        context.my_list = []
        df = context.output
    
        # These are the securities that we are interested in trading each day.
        context.security_list = context.output.index
        df_1 = context.output.sort_values(by=['earning_yield'], ascending = True)
        df_2 = context.output.sort_values(by=['roic'], ascending = True)
    
        df_1['score_1'] = 0.0 
        df_2['score_2'] = 0.0
        
        for i in range(len(df_2)):
            if df_2['roic'][i] == np.NaN:
                print "mistake found"
             
        for i in range(1,len(df_1)):
            df_1['score_1'][i] = i 
    
        for i in range(1,len(df_2)):
            df_2['score_2'][i] = i 
            
        for i in range(len(df)):
            for j in range(len(df_1)):
                if df.index[i] == df_1.index[j]:
                    for k in range(len(df_2)):
                        if df.index[i] == df_2.index[k]:
                            if df_1['earning_yield'][j] == np.nan or df_2['roic'][k] == np.nan:
                                print "we have an NaN error"
                                df['score'][i] = 0.0 
                            else:    
                                df['score'][i] = float(df_1['score_1'][j]) + float(df_2['score_2'][k])
                    
        df = context.output.sort_values(by=['score'], ascending = False)
        print df.head(30)
        for i in range(60):
            context.my_list.append(df.index[i])
        
        #print "Third Filtration:", context.my_list, len(context.my_list)
    else: 
        pass
    
def rebalance(context, data):
    
    print "Final Filtered LONG List:", len(context.my_list)
    #Long Positioning
    a = len(context.portfolio.positions)
    
    for i in range(len(context.my_list)):
        if data.can_trade(context.my_list[i]) and a < 60:
            if context.my_list[i] not in context.portfolio.positions:
                order_target_percent(context.my_list[i], 0.015)
  
    pass

def re_try(context, data):
    #Retry for Long positions
    for i in context.portfolio.positions:
        a = (context.portfolio.positions[i].amount*context.portfolio.positions[i].cost_basis)/context.portfolio.portfolio_value
        if a < 0.030 and a > 0:
            order_target_percent(i, 0.015) 
    
    pass

def exit(context, data):
    
    #print "Final Filtered List:", context.my_list
    for i in context.portfolio.positions:
        order_target_percent(i, 0.0)             
            
    pass

def record_vars(context, data):
    """
    Plot variables at the end of each day.
    """
    record(leverage=context.account.leverage)
    long_ = 0 
    short = 0
    for i in context.portfolio.positions:
        if context.portfolio.positions[i].amount > 0:
            long_ += 1
        if context.portfolio.positions[i].amount < 0:
            short +=1
    record(longs = long_, shorts= short) 
    pass


def handle_data(context, data):
    """
    Called every minute.
    """
    pass
1 response

dropna() is one route.
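Either drop the rows after the fact, or keep them out of the pipeline in the first place (a minimal sketch of both routes; the second is what the attached algo does):

context.output = pipeline_output('pipeline').dropna()   # route 1: drop any row with a NaN

m &= Fundamentals.roic.latest.notnull()                  # route 2: mask NaN rows in the pipeline screen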

This shows min/mean/max and the number of NaNs for each of the factors, abbreviated:

                   min              mean                max  
   mkt     312392342.0     7774684074.94     311065842800.0  
   pbr          0.1515          5.603815           769.2308     NaNs 11/1553  
   per          1.0059         65.838372            10000.0     NaNs 58/1553  
  roic       -9.660113          0.010075           0.981729     NaNs 6/1553  
   scr             nan               nan                nan     NaNs 1553/1553     # value_score  
   yld          -0.758          0.027015             0.9941  

Using dropna() alone, returns are around 1/3 for that time frame (2004).

It's not uncommon. NaNs produce unpredictable, false results, and often people don't realize it's happening. Good for you. You noticed.
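A quick illustration of why equality checks never catch them (NaN is never equal to NaN, so the == np.NaN tests in your loops are always False):

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan])
print s == np.nan    # False, False -- equality never matches a NaN
print s.isnull()     # False, True  -- this is the check that works
print s.dropna()     # keeps only the 1.0 row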

Another thing to know: after you clone, you can click a line number in the margin to set a breakpoint and then run; that can be useful, it just starts more slowly.

This will hopefully give you more avenues to work with and help bring the returns back, without the NaNs, plus lots of flexibility. It's a run of just a few weeks, so there's less to wait for here. Good luck.
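On your market cap question: in this pipeline style it's just another mask term. A sketch, assuming a $200M floor (pick whatever threshold you want):

market_cap = Fundamentals.market_cap.latest
m &= market_cap.notnull()
m &= (market_cap > 200e6)    # comparing a factor to a scalar yields a pipeline Filter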

from quantopian.algorithm import attach_pipeline, pipeline_output
from quantopian.pipeline  import Pipeline
from quantopian.pipeline.data    import Fundamentals
from quantopian.pipeline.filters import QTradableStocksUS
import numpy as np

def initialize(context):
    # Rebalance at the start of each month, 3 hours after market open.
    schedule_function(trade, date_rules.month_start(), time_rules.market_open(hours=3))

    attach_pipeline(make_pipeline(), 'pipeline')

def make_pipeline():
    m = QTradableStocksUS()  # initial mask

    # Factors from yesterday's close.
    pe_ratio      = Fundamentals.pe_ratio     .latest
    pb_ratio      = Fundamentals.pb_ratio     .latest
    earning_yield = Fundamentals.earning_yield.latest
    roic          = Fundamentals.roic         .latest
    #market_cap    = Fundamentals.market_cap   .latest
    #value_score   = Fundamentals.value_score  .latest

    screen_nans = 1

    if screen_nans:   # adding to mask
        m &= pe_ratio     .notnull()
        m &= pb_ratio     .notnull()
        m &= earning_yield.notnull()
        m &= roic         .notnull()
        #m &= market_cap   .notnull()
        #m &= value_score  .notnull()

    # Combining or isolating factors, with the returns relevant to each shown. Later, other changes were made in trade() etc.
    #alpha = pe_ratio.zscore(mask=m) + pb_ratio.zscore(mask=m) + earning_yield.zscore(mask=m) + roic.zscore(mask=m) # 14.24
    #alpha = earning_yield.zscore(mask=m)  # 14.24   # same as above, why?
    #alpha = roic.zscore(mask=m)           #  5.84
    #alpha = pb_ratio.zscore(mask=m)       #  2.55
    #alpha = pe_ratio.zscore(mask=m)       #  2.38
        
    alpha = earning_yield.zscore(mask=m)  # 14.24
    
    return Pipeline(
        columns = {
            'per'  : pe_ratio,
            'pbr'  : pb_ratio,
            'yld'  : earning_yield,
            'roic' : roic,
            'alpha': alpha,
            #'mkt'  : market_cap,
            #'scr'  : value_score,
        },
        screen = m
    )

def before_trading_start(context, data):
    # Recording here is same as end of day in most cases
    record(pos  = len(context.portfolio.positions))
    record(lvrg = context.account.leverage)
    long_ = 0 
    short = 0
    for i in context.portfolio.positions:
        if context.portfolio.positions[i].amount > 0: long_ += 1
        if context.portfolio.positions[i].amount < 0: short += 1
    record(longs = long_, shrts = short)

    #a = len(context.portfolio.positions)
    #if a >= 60:
    #    return
    
    context.output = pipeline_output('pipeline').dropna()

    context.output['score'] = 0.0
    context.lst = []
    df = context.output.copy()

    do_log_preview = 0    # a way to toggle this off when it becomes annoying
    if do_log_preview:
        try:    
            context.log_data_done
        except: 
            log_data(context, context.output, 9)        # show pipe info once
            '''
                               min              mean                max
             alpha       -36.93942              -0.0          25.212712     
               mkt     312392342.0     8042677166.39     311065842800.0     
               pbr          0.1515          4.850602           666.6667     
               per          1.0059         65.861074            10000.0     
              roic       -9.660113          0.015443           0.981729     
               yld          -0.758          0.031238             0.9941   
           '''

    # all of those with nan values because nan is always not-equal to nan in pandas
    # context.output[context.output != context.output]

    # A bolt from the Blue: I'm not sure what this section is doing, but, 
    #   there is likely to be a way to use pandas methods to accomplish it in one or two lines.

    # securities to trade each day.
    #context.security_list = context.output.index
    df_1 = df.sort_values(by=['yld'],  ascending = True)
    df_2 = df.sort_values(by=['roic'], ascending = True)
    df_1['score_1'] = 0.0
    df_2['score_2'] = 0.0

    for i in range(len(df_2)):
        if df_2['roic'][i] == np.NaN:   # never True: NaN is never equal to NaN (see note above)
            log.info("mistake found")

    for i in range(1,len(df_1)):
        df_1['score_1'][i] = i

    for i in range(1,len(df_2)):
        df_2['score_2'][i] = i

    for i in range(len(df)):
        for j in range(len(df_1)):
            if df.index[i] == df_1.index[j]:
                for k in range(len(df_2)):
                    if df.index[i] == df_2.index[k]:
                        if df_1['yld'][j] == np.nan or df_2['roic'][k] == np.nan:
                            log.info("a NaN error")
                            df['score'][i] = 0.0
                        else:
                            df['score'][i] = float(df_1['score_1'][j]) + float(df_2['score_2'][k])

    #df = context.output.sort_values(by=['score'], ascending = False) # all zero by the time I was done
    df = context.output.sort_values(by=['alpha'], ascending = False)
    
    try:    # To only log these df values one time
        context.df_glimpse_done
    except: 
        log.info('.\n{}'.format(df.head(5)))
        log.info('.\n{}'.format(df.tail(5)))
        context.df_glimpse_done = 1
    
    #for i in range(60):
    #    context.lst.append(df.index[i])
    context.lst = df.index.tolist()
    
    #log.info("Third Filtration:", context.lst, len(context.lst))

def trade(context, data):
    a = len(context.portfolio.positions)
    log.info("Final Trade List Length: {}  Positions: {}".format(len(context.lst), a))
    #if a >= 60: return
    
    route = 'proportional'
    
    if route == 'proportional':   # Weights for each stock proportional to pipeline values.
        num   = 30                # Number each positive and negative. There's surely a better way ...
        alpha = context.output.alpha.sort_values(ascending=True)[-num:]
        alpha = alpha.append( context.output.alpha.sort_values(ascending=True)[:num] )   # Series.append returns a new Series; reassign it
    
        # Have to normalize alpha values in this case to keep leverage in check.
        alpha = norm( context, alpha.copy() )
        
        for s in alpha.index:
            if s in context.portfolio.positions: continue  # already held; skip to the next security
            if not data.can_trade(s): continue

            order_target_percent(s, alpha[s])

    else:
        alpha = context.output.alpha.sort_values(ascending=True)[-50:]    # long only
        
        # set logic aside sometimes and experiment
        collection = alpha.index     #context.output.index     #context.lst
        
        for s in collection:
            if s in context.portfolio.positions: continue  # already held; skip to the next security
            if not data.can_trade(s): continue

            order_target_percent(s, 1.0 / len(collection))
        
def log_data(context, z, num, fields=None):  # Click the tiny dim triangle left of the line number here to collapse this section out of the way
    ''' Log info about pipeline output or, z can be any DataFrame or Series
    https://quantopian.com/posts/overview-of-pipeline-content-easy-to-add-to-your-backtest
    '''
    if not len(z):
        log.info('Empty pipe')
        return

    try: 
        context.log_data_done
    except:
        context.log_data_done = 1
        log.info('Pipe preview')

    # Options
    log_nan_only = 0          # Only log if nans are present.
    show_sectors = 0          # If sectors, see them or not.
    show_sorted_details = 1   # [num] high & low securities sorted, each column.
    padmax = 6                # num characters for each field, starting point.

    def out(lines):  # log data lines of output efficiently
        buffer_len = 1024   # each group
        chunk = ':'
        for line in lines:
            if line is None or not len(line):
                continue    # skip if empty string for example
            if len(chunk) + len(line) < buffer_len:
                # Add to chunk if will still be under buffer_len
                chunk += '\n{}'.format(line)
            else:  # Or log chunk and start over with new line.
                log.info(chunk)
                chunk = ':\n{}'.format(line)
        if len(chunk) > 2:       # if anything remaining
            log.info(chunk)

    if 'dict' in str(type(z)):
        log.info('Not set up to handle a dictionary, only dataframe & series, bailing out of log_data()')
        return
    elif 'MultiIndex' in str(type(z.index)):
        log.info('Found MultiIndex, not set up to handle it, bailing out of log_data()')
        return
    # Change index to just symbols for readability, meanwhile, right-aligned
    z = z.rename(index=dict(zip(z.index.tolist(), [i.symbol.rjust(6) for i in z.index.tolist()])))

    # Series ......
    if 'Series' in str(type(z)):    # is Series, not DataFrame
        nan_count = len(z[z != z])
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        if (log_nan_only and nan_count) or not log_nan_only:
            pad = max( padmax, len('%.5f' % z.max()) )
            log.info('{}{}{}   Series  len {}'.format('min'.rjust(pad+5),
                'mean'.rjust(pad+5), 'max'.rjust(pad+5), len(z)))
            log.info('{}{}{} {}'.format(
                ('%.5f' % z.round(6). min()).rjust(pad+5),
                ('%.5f' % z.round(6).mean()).rjust(pad+5),
                ('%.5f' % z.round(6). max()).rjust(pad+5),
                nan_count
            ))
            log.info('High\n{}'.format(z.sort_values(ascending=False).head(num)))
            log.info('Low\n{}' .format(z.sort_values(ascending=False).tail(num)))
        return

    # DataFrame ......
    content_min_max = [ ['','min','mean','max',''] ] ; content = []
    for col in z.columns:
        try: z[col].max()
        except: continue   # skip non-numeric
        if col == 'sector' and not show_sectors: continue
        nan_count = len(z[col][z[col] != z[col]])
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        padmax    = max( padmax, len(str(z[col].max())) ) ; mean_ = ''
        if len(str(z[col].max())) > 8 and 'float' in str(z[col].dtype):
            z[col] = z[col].round(6)   # Reduce number of decimal places for floating point values
        if 'float' in str(z[col].dtype): mean_ = str(round(z[col].mean(), 6))
        elif 'int' in str(z[col].dtype): mean_ = str(round(z[col].mean(), 1))
        content_min_max.append([col, str(z[col] .min()), mean_, str(z[col] .max()), nan_count])
    if log_nan_only and nan_count or not log_nan_only:
        log.info('Rows: {}  Columns: {}'.format(z.shape[0], z.shape[1]))
        if len(z.columns) == 1: content.append('Rows: {}'.format(z.shape[0]))

        paddings = [6 for i in range(4)]
        for lst in content_min_max:    # set max lengths
            i = 0
            for val in lst[:4]:    # value in each sub-list
                paddings[i] = max(paddings[i], len(str(val)))
                i += 1
        headr = content_min_max[0]
        content.append(('{}{}{}{}{}'.format(
             headr[0] .rjust(paddings[0]),
            (headr[1]).rjust(paddings[1]+5),
            (headr[2]).rjust(paddings[2]+5),
            (headr[3]).rjust(paddings[3]+5),
            ''
        )))
        for lst in content_min_max[1:]:    # populate content using max lengths
            content.append(('{}{}{}{}     {}'.format(
                lst[0].rjust(paddings[0]),
                lst[1].rjust(paddings[1]+5),
                lst[2].rjust(paddings[2]+5),
                lst[3].rjust(paddings[3]+5),
                lst[4],
            )))
    out(content)

    if not show_sorted_details: return
    if len(z.columns) == 1:     return     # skip detail if only 1 column
    details = z.columns if fields is None else fields
    content = []
    for detail in details:
        if detail == 'sector' and not show_sectors: continue
        hi = z[details].sort_values(by=detail, ascending=False).head(num)
        lo = z[details].sort_values(by=detail, ascending=False).tail(num)
        content.append(('_ _ _   {}   _ _ _'  .format(detail)))
        content.append(('{} highs ...\n{}'.format(detail, str(hi))))
        content.append(('{} lows  ...\n{}'.format(detail, str(lo))))
        if log_nan_only and not len(lo[lo[detail] != lo[detail]]):
            continue  # skip if no nans
    out(content)

    

def norm(c, d):    # d is a Series; normalize its positive and negative values separately
    # https://www.quantopian.com/posts/normalizing-positive-and-negative-values-separately
    do_demean = 0                           # centering all values around 0
    preserve_zero_values = 0                # change to 1 to keep incoming zero-weights instead of dropping them
    trim_pos_neg_to_same_number_each = 0    # same number of stocks for positive & negative

    if not len(d): return d   # In case empty.
    d = d[ d == d ]           # Ensure no NaNs (NaN is never equal to itself).

    if do_demean:   # If all pos or neg, shift for both pos & neg.
        if d.min() >= 0 or d.max() <= 0:
            d -= d.mean()

    zeros = None
    if preserve_zero_values:
        zeros = d[ d == 0 ]

    pos = d[ d > 0 ]
    neg = d[ d < 0 ]

    if trim_pos_neg_to_same_number_each:
        num  = min(len(pos), len(neg))
        pos  = pos.sort_values(ascending=False).head(num)
        neg  = neg.sort_values(ascending=False).tail(num)

    pos /=   pos.sum()
    neg  = -(neg / neg.sum())
    ret  = pos.append(neg)

    if preserve_zero_values and zeros is not None:
        ret = ret.append(zeros)

    return ret
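
# A hypothetical example of what norm() returns with the defaults above:
#   norm(context, pd.Series([3.0, 1.0, -2.0, -2.0]))
#   -> positives scaled to sum to 1 (0.75, 0.25), negatives to sum to -1 (-0.5, -0.5),
#      so as target weights the net exposure is ~0 and gross leverage is ~2.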
    
        
        
        