Tear sheet: non-unique MultiIndex error after concatenating pipeline output

Hi all,

I'm trying to feed some data into alphalens. Since I can't run the pipeline over a longer time period in one go, I call the pipeline every 6 months and then concatenate the outputs into a single dataframe. I then get the prices for the securities in the concatenated dataframe, but when I plug the factor and pricing data into the alphalens functions, I get an error.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import CustomFactor
from quantopian.pipeline.filters import Q1500US

class Quality(CustomFactor):
    # EBIT scaled by capital employed (working capital + gross PP&E)
    inputs = [morningstar.income_statement.ebit, morningstar.balance_sheet.working_capital, morningstar.balance_sheet.gross_ppe]
    window_length = 1

    def compute(self, today, assets, out, ebit, wc, ppe):
        out[:] = ebit / (wc + ppe)

def make_pipeline():
    universe = Q1500US()
    pipe = Pipeline(screen=universe)  # restrict output to the Q1500US universe
    quality = Quality()
    pipe.add(quality, 'quality')
    return pipe

from quantopian.research import run_pipeline

start_dates = pd.date_range(start='2011-01-01', periods=4, freq='2QS')  
end_dates = pd.date_range(start='2011-07-01', periods=4, freq='2QS')

# pre-allocate one slot per pipeline chunk
pipes = [None] * (len(start_dates) - 1)
prices = [None] * (len(start_dates) - 1)

for i in range(0,len(start_dates)-1): 

    pipe = make_pipeline()  
    results = run_pipeline(pipe, start_dates[i], end_dates[i])  
    pipes[i] = results  
    print("Iteration " + str(i+1) +" of " + str(len(start_dates)-1) + " complete.")  

The next cell is where I get the error message:

import alphalens  
results = pd.concat(pipes)  
securities = results['quality'].index.levels[1].unique()  
pricing = get_pricing(securities,  

factor_data = alphalens.utils.get_clean_factor_and_forward_returns(

Exception Traceback (most recent call last)
<ipython-input-23-6d8e42bfbdcb> in <module>()  
     12 pricing_test,  
     13 quantiles=5,  
---> 14 periods=(15,20,30))  
     15 alphalens.tears.create_full_tear_sheet(factor_data)

/usr/local/lib/python2.7/dist-packages/alphalens/utils.pyc in get_clean_factor_and_forward_returns(factor, prices, groupby, by_group, quantiles, bins, periods, filter_zscore, groupby_labels)
    301                                        "the pandas methods tz_localize and tz_convert.")  
--> 303     merged_data = compute_forward_returns(prices, periods, filter_zscore)  
    305     factor = factor.copy()

/usr/local/lib/python2.7/dist-packages/alphalens/utils.pyc in compute_forward_returns(prices, periods, filter_zscore)
    136             delta[mask] = np.nan  
--> 138         forward_returns[period] = delta.stack()  
    140     forward_returns.index = forward_returns.index.rename(['date', 'asset'])

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __setitem__(self, key, value)
   2355         else:  
   2356             # set column  
-> 2357             self._set_item(key, value)  
   2359     def _setitem_slice(self, key, value):

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _set_item(self, key, value)
   2422         self._ensure_valid_index(value)  
-> 2423         value = self._sanitize_column(key, value)  
   2424         NDFrame._set_item(self, key, value)  

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _sanitize_column(self, key, value)
   2556         if isinstance(value, Series):  
-> 2557             value = reindexer(value)  
   2559         elif isinstance(value, DataFrame):

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in reindexer(value)
   2547                     # duplicate axis  
   2548                     if not value.index.is_unique:  
-> 2549                         raise e  
   2551                     # other

Exception: cannot handle a non-unique multi-index!  

Does anyone have an idea what I'm doing wrong here? Any help would be appreciated!

2 responses


The code below may streamline the process for you.

Also, the issue is that there can be duplicate entries in the date-time index, due to how the data gets pulled. I found that this fixes the problem:

result = result[~result.index.duplicated(keep='first')]
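To make the failure mode concrete, here is a minimal sketch with toy data (the tickers and values are made up, not real pipeline output): when two chunks share a boundary date, pd.concat produces duplicate (date, asset) pairs, which is exactly the non-unique MultiIndex alphalens complains about, and the line above removes them.

import pandas as pd

# two "chunks" that both contain 2011-07-01, like overlapping run_pipeline calls
dates1 = pd.to_datetime(['2011-06-30', '2011-07-01'])
dates2 = pd.to_datetime(['2011-07-01', '2011-07-05'])
assets = ['AAPL', 'MSFT']
idx1 = pd.MultiIndex.from_product([dates1, assets], names=['date', 'asset'])
idx2 = pd.MultiIndex.from_product([dates2, assets], names=['date', 'asset'])
chunk1 = pd.DataFrame({'quality': [1., 2., 3., 4.]}, index=idx1)
chunk2 = pd.DataFrame({'quality': [3., 4., 5., 6.]}, index=idx2)

result = pd.concat([chunk1, chunk2])
print result.index.is_unique    # False: the 2011-07-01 rows appear in both chunks

# keep the first occurrence of each (date, asset) pair
result = result[~result.index.duplicated(keep='first')]
print result.index.is_unique    # True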

def run_pipeline_chunks(pipe, start_date, end_date, chunks_len=None):
    """
    Drop-in replacement for run_pipeline.
    run_pipeline fails over a very long period of time (memory usage),
    so we need to split the pipeline into chunks and concatenate the results.
    """
    chunks = []
    current = pd.Timestamp(start_date)
    end = pd.Timestamp(end_date)
    step = pd.Timedelta(weeks=26) if chunks_len is None else chunks_len

    while current <= end:

        current_end = current + step
        if current_end > end:
            current_end = end

        print 'Running pipeline:', current, ' - ', current_end
        results = run_pipeline(pipe, current.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d"))
        chunks.append(results)

        # pipeline bug: it returns more days than requested, so take the last date
        # actually returned as the starting point for the next chunk
        last_date_returned = results.index.get_level_values(0)[-1].tz_localize(None)
        current = last_date_returned + pd.Timedelta(days=1)

        if last_date_returned > current_end:
            print 'pipeline bug, it returned more days than requested: last date returned', last_date_returned

    return pd.concat(chunks)
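For example, using make_pipeline from the post above and the same overall date range (the 26-week default step matches your 6-month splits), something like this should replace the manual loop, with the de-duplication applied afterwards:

pipe = make_pipeline()
results = run_pipeline_chunks(pipe, '2011-01-01', '2013-01-01')
# drop the duplicated boundary dates before handing the factor to alphalens
results = results[~results.index.duplicated(keep='first')]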