Source code for rapa.base
from . import utils
from . import config
from . import _var
import time
from sklearn.feature_selection import f_regression, f_classif
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils import check_array
from typing import List
from typing import Callable
from typing import Tuple
from typing import Union
import pandas as pd
import numpy as np
from statistics import mean
from statistics import stdev
from math import ceil
import matplotlib.pyplot as plt
import seaborn as sns
import datarobot as dr
import logging
import warnings
try: # check if in jupyter notebook
get_ipython
from tqdm.notebook import tqdm
except (NameError, ImportError):
from tqdm import tqdm
class RAPABase():
"""
The base of regression and classification RAPA analysis
"""
POSSIBLE_TARGET_TYPES = [x for x in dir(dr.enums.TARGET_TYPE) if not x.startswith('__')] # List of DR TARGET_TYPES
"""
* _classification = None # Set by child classes
* target_type = None # Set at initialization
* project = None # Set at initialization or with 'perform_parsimony()'"""
def __init__(self):
if self.__class__.__name__ == "RAPABase":
raise RuntimeError("Do not instantiate the RAPABase class directly; use RAPAClassif or RAPARegress")
@staticmethod
def _wait_for_jobs(project: dr.Project, progress_bar: bool = True, sleep_time: int = 5, pbar = None, pbar_prefix: str = '', job_type: str = '', timeout = 21600):
"""Gets all the jobs for a project, and if there are more than 0 current jobs,
sleeps for 5 seconds and checks again.
:Parameters:
------------
project: datarobot.Project
The datarobot.Project that will be probed for current jobs
progress_bar: bool, optional (default = True)
If True, a print statement and a progress bar will appear
sleep_time: int, optional (default = 5)
The time to sleep between datarobot.Project.get_all_jobs()
(avoid sending too many api requests) TODO: warning or check for max api requests
pbar: tqdm.tqdm, optional (default = None)
A progress bar object from tqdm
pbar_prefix: str, optional (default = '')
The prefix to put in front of the progress bar message
job_type: str, optional (default = '')
A string to put in front of the jobs left (after pbar_prefix)
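timeout: int, optional (default = 21600)
The maximum number of seconds to wait for the project's jobs to finish
before a TimeoutError is raised
:Example:
------------
A minimal sketch of how this helper is called internally (assumes an
existing datarobot.Project object named `project`; the name is a placeholder):

>>> RAPABase._wait_for_jobs(project, job_type='Feature Impact ')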
"""
start_time = time.time()
if len(project.get_all_jobs()) > 0:
if progress_bar:
if pbar:
pbar.set_description(f'{pbar_prefix}{job_type}job(s) remaining ({len(project.get_all_jobs())})')
else:
tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='')
time.sleep(sleep_time)
while len(project.get_all_jobs()) > 0:
while len(project.get_all_jobs()) > 0: # double check
if progress_bar: # PROGRESS BAR
if pbar:
pbar.set_description(f'{pbar_prefix}{job_type}job(s) remaining ({len(project.get_all_jobs())})')
else:
tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='')
time.sleep(sleep_time)
if time.time()-start_time >= timeout:
raise TimeoutError(f'Exceeded the timeout of {timeout} seconds while waiting for DataRobot jobs to complete')
time.sleep(sleep_time+5) # sometimes all jobs will be complete and no jobs will be in the queue, but then more jobs will be created
tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='\n')
return None
@staticmethod
def _check_lives(lives: int,
project: dr.Project,
previous_best_model: dr.Model,
featurelist_prefix: str = None,
starred: bool = False,
metric: str = 'AUC',
verbose: bool = False) -> Tuple[int, dr.Model]:
"""Finds the 'best' model of a project/featurelist of a project and returns the new
`lives` count (decreased by 1 if the model doesn't change) and the 'best' model
Uses `rapa.utils.get_best_model` to find the current best model, and decides
if the model has changed by equating `datarobot.Model.id`. Returns a tuple with
the number of 'lives' left in the first position, and the current 'best' model
in the second position.
:Parameters:
------------
lives: int
The current number of 'lives' remaining in parsimony analysis
project: datarobot.Project
The datarobot.Project parsimony analysis is being performed in
previous_best_model: datarobot.Model
The previously 'best' model in the datarobot.Project before
a round of parsimony analysis
featurelist_prefix: str, optional (default = None)
The desired prefix for the featurelists that will be used for searching
for the 'best' model. If None, will search the entire datarobot.Project
starred: bool, optional (default = False)
If True, searching the project's starred models. If False, searches
all of the project's models
metric: str, optional (default = 'AUC')
What model cross validation metric to use when averaging scores to
find the 'best' model
verbose: bool, optional (default = False)
If True, prints previous and current best model information
before returning
:Returns:
----------
Tuple(int, datarobot.Model)
A tuple with the new `lives` count in the first position, and the new
'best' model after one round of parsimony analysis in the second position
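:Example:
------------
A minimal sketch, assuming `project` is a datarobot.Project and
`previous_best_model` is the datarobot.Model that was 'best' before the
current round (both names are placeholders):

>>> lives, best_model = RAPABase._check_lives(lives=3,
...                                           project=project,
...                                           previous_best_model=previous_best_model,
...                                           featurelist_prefix='RAPA Reduced to',
...                                           metric='AUC')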
"""
# check for the best model (supplied metric of cv)
current_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=featurelist_prefix, starred=starred)
if current_best_model.id == previous_best_model.id:
lives -= 1
current_best_model_score = mean(current_best_model.get_cross_validation_scores()['cvScores'][metric].values())
last_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
if verbose:
tqdm.write(f'Current model performance: \'{current_best_model_score}\'. Previous best model performance: \'{last_best_model_score}\'\nNo change in the best model, so a life was lost.\nLives remaining: \'{lives}\'')
else:
tqdm.write(f'Lives left: \'{lives}\'')
return (lives, current_best_model)
@staticmethod
def _feature_performances_hbar(stat_feature_importances, featurelist_name, metric='', stacked=False, colormap='tab20'):
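"""Plots a horizontal bar chart of each feature's normalized impact for a featurelist.
Only the top `config.MAX_FEATURES_TO_LABEL` features of `stat_feature_importances`
(a pandas.Series of normalized impact scores indexed by feature name) are plotted.
`featurelist_name` and `metric` appear only in the plot title, `colormap` is the
seaborn palette, and `stacked` is currently unused.
"""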
feature_performances = pd.DataFrame(stat_feature_importances.rename(len(stat_feature_importances)))
warnings.filterwarnings('ignore', message='The handle <BarContainer object of 1 artists>')
feature_performances_to_plot = feature_performances.iloc[:config.MAX_FEATURES_TO_LABEL].T.set_axis(list(feature_performances.iloc[:config.MAX_FEATURES_TO_LABEL].T.columns), axis=1)
plt.figure(figsize=(config.FIG_SIZE[0], config.FIG_SIZE[1]/2))
ax = sns.barplot(data=feature_performances_to_plot, orient='h', palette=colormap)
ax.set(xlabel='Normalized Impact of Features', title=f'{min([config.MAX_FEATURES_TO_LABEL, len(feature_performances)])} {metric} Impact Normalized Feature Performances\nFeaturelist: {featurelist_name}', ylabel='Features')
warnings.filterwarnings('default')
plt.show()
return None
def create_submittable_dataframe(self,
input_data_df: pd.DataFrame,
target_name: str,
n_features: int = 19990,
n_splits: int = 6,
filter_function: Callable[[pd.DataFrame, np.ndarray], List[np.ndarray]] = None,
random_state: int = None) -> pd.DataFrame: #TODO: change return type
"""Prepares the input data for submission as either a regression or classification problem on DataRobot.
Creates pre-determined k-fold cross-validation splits and filters the feature
set down to a size that DataRobot can receive as input, if necessary. TODO: private function submit_datarobot_project explanation
:Parameters:
------------
input_data_df: pandas.DataFrame
pandas DataFrame containing the feature set and prediction target.
target_name: str
Name of the prediction target column in `input_data_df`.
n_features: int, optional (default: 19990)
The number of features to reduce the feature set in `input_data_df`
down to. DataRobot's maximum feature set size is 20,000.
If `n_features` is greater than or equal to the number of features in `input_data_df`,
NaN values are allowed because no feature filtering will occur
n_splits: int, optional (default: 6)
The number of cross-validation splits to create. One of the splits
will be retained as a holdout split, so by default this function
sets up the dataset for 5-fold cross-validation with a holdout.
NOTE: `CV Fold 0` is the holdout set by default.
filter_function: callable, optional (default: None)
The function used to calculate the importance of each feature in
the initial filtering step that reduces the feature set down to
`n_features`.
This filter function must take a feature matrix as the first input
and the target array as the second input, then return two separate
arrays containing the feature importance of each feature and the
P-value for that correlation, in that order.
When None, the filter function is determined by child class.
If an instance of `RAPAClassif()`, sklearn.feature_selection.f_classif is used.
If `RAPARegress()`, sklearn.feature_selection.f_regression is used.
See scikit-learn's f_regression function for an example:
https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
random_state: int, optional (default: None)
The random number generator seed for RAPA. Use this parameter to make sure
that RAPA will give you the same results each time you run it on the
same input data set with that seed.
:Returns:
------------
pandas.DataFrame
A DataFrame holding the original values from the input DataFrame, but with
pre-determined k-fold cross-validation splits in a 'partition' column, and with the
feature set filtered down to at most `n_features` features using the `filter_function`
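:Example:
------------
A minimal sketch, assuming `df` is a pandas.DataFrame that contains the
feature set plus a prediction target column named 'target', and that
`RAPAClassif` is the classification child class (names are placeholders):

>>> model = RAPAClassif()
>>> dr_upload_df = model.create_submittable_dataframe(input_data_df=df,
...                                                   target_name='target',
...                                                   n_features=2000,
...                                                   random_state=42)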
"""
#TODO: make private function?
# Check dataframe has 'target_name' columns
if target_name not in input_data_df.columns:
raise KeyError(f'{target_name} is not a column in the input DataFrame')
# Copy the dataframe and remove the target_name column
input_data_df = input_data_df.copy()
only_features_df = input_data_df.drop(columns=[target_name])
# Check if the requested number of features is equal to the number of features provided
# If True, skip feature filtering and allow NaNs
if n_features >= only_features_df.shape[1]:
feature_filter = False
else:
feature_filter = True
# Set target_type, kfold_type, and filter_function based on type of classification/regression problem
if self._classification:
# Check if binary or multi classification problem
if len(np.unique(input_data_df[target_name].values)) == 2:
self.target_type = dr.enums.TARGET_TYPE.BINARY
else:
self.target_type = dr.enums.TARGET_TYPE.MULTICLASS
kfold_type = StratifiedKFold
filter_function = f_classif
else:
# Check array for infinite values/NaNs
if feature_filter:
check_array(input_data_df)
kfold_type = KFold
self.target_type = dr.enums.TARGET_TYPE.REGRESSION
filter_function = f_regression
# Create 'partition' column and set all values to 'train'
input_data_df['partition'] = 'train'
train_feature_importances = []
# Make cross validation folds
fold_name_prefix = 'CV Fold'
for fold_num, (_, fold_indices) in enumerate(kfold_type(n_splits=n_splits, random_state=random_state, shuffle=True).split(only_features_df.values,
input_data_df[target_name].values)):
# Replace the values in the partition column with their true CV fold, removing all 'train' entries
input_data_df.iloc[fold_indices, input_data_df.columns.get_loc('partition')] = f'{fold_name_prefix} {fold_num}'
# Fold 0 is the holdout set, so don't calculate feature importances using that fold
if feature_filter and fold_num > 0:
feature_importances, _ = filter_function(only_features_df.iloc[fold_indices].values, input_data_df[target_name].iloc[fold_indices].values)
train_feature_importances.append(feature_importances)
if feature_filter:
# We calculate the overall feature importance scores by averaging the feature importance scores across all of the training folds
avg_train_feature_importances = np.mean(train_feature_importances, axis=0)
# Change partition 'CV Fold 0' to 'Holdout'
input_data_df.loc[input_data_df['partition'] == f'{fold_name_prefix} 0', 'partition'] = 'Holdout'
# Gets the top `n_features` correlated features as a list
most_correlated_features = only_features_df.columns.values[np.argsort(avg_train_feature_importances)[::-1][:n_features]].tolist()
# put target_name, partition, and most correlated features columns in dr_upload_df
datarobot_upload_df = input_data_df[[target_name, 'partition'] + most_correlated_features]
else:
# put target_name, partition, and all remaining feature columns in dr_upload_df
datarobot_upload_df = input_data_df[[target_name, 'partition'] + only_features_df.columns.values.tolist()]
return datarobot_upload_df
def submit_datarobot_project(self,
input_data_df: pd.DataFrame,
target_name: str,
project_name: str,
target_type: str = None,
worker_count: int = -1,
metric: str = None,
mode: str = dr.AUTOPILOT_MODE.FULL_AUTO,
random_state: int = None) -> dr.Project: #TODO check input df for partition, target_name (data-robotified df), logger.warning
"""Submits the input data to DataRobot as a new modeling project.
It is suggested to prepare the `input_data_df` using the
'create_submittable_dataframe' function first with an instance of
either RAPAClassif or RAPARegress.
:Parameters:
------------
input_data_df: pandas.DataFrame
pandas DataFrame containing the feature set and prediction target.
target_name: str
Name of the prediction target column in `input_data_df`.
project_name: str
Name of the project in DataRobot.
target_type: str (enum)
Indicator to DataRobot of whether the new modeling project should be
a binary classification, multiclass classification, or regression project.
Options:
* datarobot.TARGET_TYPE.BINARY
* datarobot.TARGET_TYPE.REGRESSION
* datarobot.TARGET_TYPE.MULTICLASS
worker_count: int, optional (default: -1)
The number of worker engines to assign to the DataRobot project.
By default, -1 tells DataRobot to use all available worker engines.
metric: str, optional (default: None)
Name of the metric to use for evaluating models. You can query the metrics
available for the target by way of Project.get_metrics. If none is specified,
then the default recommended by DataRobot is used.
mode: str (enum), optional (default: datarobot.AUTOPILOT_MODE.FULL_AUTO)
The modeling mode to start the DataRobot project in.
Options:
* datarobot.AUTOPILOT_MODE.FULL_AUTO
* datarobot.AUTOPILOT_MODE.QUICK
* datarobot.AUTOPILOT_MODE.MANUAL
* datarobot.AUTOPILOT_MODE.COMPREHENSIVE: Runs all blueprints in the repository (this may be extremely slow).
random_state: int, optional (default: None)
The random number generator seed for DataRobot. Use this parameter to make sure
that DataRobot will give you the same results each time you run it on the
same input data set with that seed.
:Returns:
------------
datarobot.Project
The DataRobot project submitted
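:Example:
------------
A minimal sketch, continuing from the `create_submittable_dataframe` example
(`model` and `dr_upload_df` are placeholders from that example; the project
name is also a placeholder):

>>> project = model.submit_datarobot_project(input_data_df=dr_upload_df,
...                                          target_name='target',
...                                          project_name='rapa_parsimony_example',
...                                          random_state=42)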
"""
# TODO: provide an option for columns to disregard
# Check for a target_type
if target_type == None or target_type not in self.POSSIBLE_TARGET_TYPES:
target_type = self.target_type
if target_type == None:
raise Exception('No target type was provided or set.')
project = dr.Project.create(sourcedata=input_data_df, project_name=project_name)
project.analyze_and_model(target=target_name, target_type=target_type,
worker_count=worker_count, mode=mode, metric=metric,
advanced_options=dr.AdvancedOptions(seed=random_state, accuracy_optimized_mb=False,
prepare_model_for_deployment=False, blend_best_models=False),
partitioning_method=dr.UserCV(user_partition_col='partition', cv_holdout_level='Holdout'))
return project
def perform_parsimony(self, feature_range: List[Union[int, float]],
project: Union[dr.Project, str] = None,
starting_featurelist_name: str = 'Informative Features',
featurelist_prefix: str = 'RAPA Reduced to',
mode: str = dr.AUTOPILOT_MODE.FULL_AUTO,
lives: int = None,
cv_average_mean_error_limit: float = None,
feature_impact_metric: str = 'median',
progress_bar: bool = True,
to_graph: List[str] = None,
metric: str = None,
verbose: bool = True):
"""Performs parsimony analysis by repetatively extracting feature importance from
DataRobot models and creating new models with reduced features (smaller feature lists). # TODO take a look at featurelist_prefix for running multiple RAPA
NOTICE: Feature impact scores are only gathered from models that have had their
**cross-validation accuracy** tested!
:Parameters:
------------
feature_range: list[int] | list[float]
Either a list containing integers representing desired featurelist lengths,
or a list containing floats representing desired featurelist percentages (of the original featurelist size)
project: datarobot.Project | str, optional (default = None)
Either a datarobot project, or a string of its id or name. If None,
uses the project that was provided to create the rapa class
starting_featurelist_name: str, optional (default = 'Informative Features')
The name or id of the featurelist that rapa will start parsimony analysis with
featurelist_prefix: str, optional (default = 'RAPA Reduced to')
The desired prefix for the featurelists that rapa creates in datarobot. Each featurelist
will start with the prefix, include a space, and then end with the number of features in that featurelist
mode: str (enum), optional (default: datarobot.AUTOPILOT_MODE.FULL_AUTO)
The modeling mode to start the DataRobot project in.
Options:
* datarobot.AUTOPILOT_MODE.FULL_AUTO
* datarobot.AUTOPILOT_MODE.QUICK
* datarobot.AUTOPILOT_MODE.MANUAL
* datarobot.AUTOPILOT_MODE.COMPREHENSIVE: Runs all blueprints in the repository (warning: this may be extremely slow).
lives: int, optional (default = None)
The number of times allowed for reducing the featurelist and obtaining a worse model. By default,
'lives' are off, and the entire 'feature_range' will be run, but if supplied a number >= 0, then
that is the number of 'lives' there are.
Ex: lives = 0, feature_range = [100, 90, 80, 50]
RAPA finds that after making all the models for the length 80 featurelist, the 'best' model was created with the length
90 featurelist, so it stops and doesn't make a featurelist of length 50.
Similar to datarobot's Feature Importance Rank Ensembling for advanced feature selection (FIRE) package's 'lifes'
https://www.datarobot.com/blog/using-feature-importance-rank-ensembling-fire-for-advanced-feature-selection/
cv_average_mean_error_limit: float, optional (default = None)
The limit on the cross-validation average error from the mean, used to help avoid overfitting.
By default, the limit is off and each entry in 'feature_range' will be run.
The limit exists only if supplied a number >= 0.0
Ex: cv_average_mean_error_limit = 2.5, feature_range = [100, 90, 80, 50]
RAPA finds that the average AUC for each CV fold is [.8, .6, .9, .5] respectively;
the mean of these is 0.7, so the average error from the mean is 0.15. If 0.15 >= cv_average_mean_error_limit,
the training stops.
feature_impact_metric: str, optional (default = 'median')
How RAPA aggregates each feature's importance across every model in a featurelist
Options:
* median
* mean
* cumulative
progress_bar: bool, optional (default = True)
If True, a simple progress bar displaying complete and incomplete featurelists.
If False, provides updates in stdout (Ex: current worker count, current featurelist, etc.)
to_graph: List[str], optional (default = None)
A list of keys choosing which graphs to produce. Possible Keys:
* 'models': `seaborn` boxplot with model performances with provided metric
* 'feature_performance': `matplotlib.pyplot` stackplot of feature performances
metric: str, optional (default = None)
The metric used for scoring models, when finding the 'best' model, and when
plotting model performance
When None, the metric is determined by what class inherits from base. For instance,
a `RAPAClassif` instance's default is 'AUC', and `RAPARegress` is 'R Squared'
verbose: bool, optional (default = True)
If True, prints updates from DataRobot and rapa during parsimonious feature reduction
:Returns:
------------
None
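:Example:
------------
A minimal sketch, assuming `model` is a RAPAClassif instance and `project` is
the datarobot.Project returned by `submit_datarobot_project` (or that project's
name or id as a string):

>>> model.perform_parsimony(feature_range=[100, 90, 80, 50],
...                         project=project,
...                         featurelist_prefix='RAPA Reduced to',
...                         lives=5,
...                         to_graph=['models', 'feature_performance'])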
"""
# TODO: return a dictionary of values? {"time_taken": 2123, "cv_mean_error": list[floats], ""}
# TODO: graph cv performance boxplots
# TODO: graph pareto front of median model performance vs feature list size
# TODO: support more scoring metrics
# TODO: CHECK FOR FEATURE/PARTITIONS INSTEAD OF JUST SUBTRACTING 2
start_time = time.time()
# check project
if project == None:
project = self.project
if project == None:
raise Exception('No provided datarobot.Project()')
# check scoring metric
if metric == None:
if self._classification: # classification
metric = 'AUC'
else: # regression
metric = 'R Squared'
# check if project is a string, and if it is, find it
if type(project) == str:
project = utils.find_project(project)
if project == None:
raise Exception(f'Could not find the project.')
# get starting featurelist
starting_featurelist = utils.get_featurelist(starting_featurelist_name, project)
# check feature_range size
if len(feature_range) == 0:
raise Exception('The provided feature_range is empty.')
original_featurelist_size = len(starting_featurelist.features)-2 # -2 because of target feature and partitions
# feature_range logic for sizes (ints) / ratios (floats)
if np.array(feature_range).dtype.kind in np.typecodes['AllInteger']:
feature_range_check = [x for x in feature_range if x < len(starting_featurelist.features)-2 and x > 0] # -2 because of target feature and partitions
if len(feature_range_check) != len(feature_range): # check to see if values are < 0 or > the length of the original featurelist
raise Exception('The provided feature_range integer values have to be: 0 < feature_range < original featurelist length')
elif np.array(feature_range).dtype.kind in np.typecodes['AllFloat']:
feature_range_check = [x for x in feature_range if x > 0 and x < 1]
if len(feature_range_check) != len(feature_range):
raise Exception(f'The provided feature_range ratio values have to be: 0 < feature_range < 1')
# convert ratios to featurelist sizes
feature_range = [ceil(original_featurelist_size * feature_pct) for feature_pct in feature_range] # multiply by feature_pct and take ceil
feature_range = pd.Series(feature_range).drop_duplicates() # drop duplicates
feature_range = list(feature_range[feature_range < original_featurelist_size]) # keep all values that are less than the original featurelist size
feature_range.sort(reverse=True) # sort descending
else:
raise TypeError('Provided \'feature_range\' is not all Int or all Float.')
# ---------------------------------------------------------------------------------------------
if verbose:
tqdm.write(f"---------- {starting_featurelist_name} ({original_featurelist_size}) ----------")
# ---------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------
if config.DEBUG_STATEMENTS:
logging.debug(f'project:{project}| starting_featurelist:{starting_featurelist}| metric:{metric}| feature_range:{feature_range}')
# ----------------------------------------------------------------------------------
# waiting for any started before rapa
tqdm.write(f'{starting_featurelist_name}: Waiting for previous jobs to complete...')
self._wait_for_jobs(project, job_type='Previous ')
# waiting for feature impact message
temp_start = time.time()
tqdm.write(f'{starting_featurelist_name}: Waiting for feature impact...')
# get the models from starting featurelist
datarobot_project_models = project.get_models()
if len(datarobot_project_models) == 0:
raise Exception(f'There are no models in the datarobot project "{project}"')
for model in datarobot_project_models: # for each model
if model.featurelist_id == starting_featurelist.id: # if the model uses the starting featurelist, request the feature impact
if model.metrics[metric]['crossValidation'] != None:
try:
model.request_feature_impact()
except dr.errors.JobAlreadyRequested:
continue
# wait a bit for jobs to start
time.sleep(5)
# TODO request_featureimpact returns a job indicator?
self._wait_for_jobs(project, job_type='Feature Impact ')
tqdm.write(f'Feature Impact: ({time.time()-temp_start:.{config.TIME_DECIMALS}f}s)')
# get feature impact/importances of original featurelist
all_feature_importances = []
for model in datarobot_project_models:
if model.featurelist_id == starting_featurelist.id: # request feature impact for starting featurelist models
if model.metrics[metric]['crossValidation'] != None:
all_feature_importances.extend(model.get_or_request_feature_impact())
# sort features by the feature importance statistic
stat_feature_importances = pd.DataFrame(all_feature_importances).groupby('featureName')['impactNormalized']
if feature_impact_metric.lower() == 'median':
stat_feature_importances = stat_feature_importances.median().sort_values(ascending=False)
elif feature_impact_metric.lower() == 'mean':
stat_feature_importances = stat_feature_importances.mean().sort_values(ascending=False)
elif feature_impact_metric.lower() == 'cumulative':
stat_feature_importances = stat_feature_importances.sum().sort_values(ascending=False)
else: # feature_impact_metric isn't one of the provided statistics
raise ValueError(f'The provided feature_impact_metric:{feature_impact_metric} is not one of the provided:{_var.FEATURE_IMPACT_STATISTICS}')
# retain feature performance for each round, and plot a horizontal bar plot of the original featurelist's feature performances
if to_graph and 'feature_performance' in to_graph:
tqdm.write('Graphing feature performance...')
self._feature_performances_hbar(stat_feature_importances=stat_feature_importances, featurelist_name=starting_featurelist_name, metric=feature_impact_metric)
plt.show()
plt.close()
# waiting for DataRobot projects
self._wait_for_jobs(project, job_type='DataRobot ')
# get the best performing model of this iteration
previous_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=starting_featurelist_name)
tqdm.write(f'Project: {project.project_name} | Featurelist Prefix: {featurelist_prefix} | Feature Range: {feature_range}')
if verbose:
tqdm.write(f'Feature Importance Metric: {feature_impact_metric} | Model Performance Metric: {metric}')
if lives:
tqdm.write(f'Lives: {lives}')
if cv_average_mean_error_limit:
tqdm.write(f'CV Mean Error Limit: {cv_average_mean_error_limit}')
# perform parsimony
pbar = tqdm(feature_range, disable = not progress_bar)
for featurelist_length in pbar:
# ---------------------------------------------------------------------------------------------
if verbose:
tqdm.write(f"---------- {featurelist_prefix} ({featurelist_length}) ----------")
# ---------------------------------------------------------------------------------------------
try:
# get shortened featurelist
desired_reduced_featurelist_size = featurelist_length
reduced_features = stat_feature_importances.head(desired_reduced_featurelist_size).index.values.tolist()
# ----- create new featurelist in datarobot -----
new_featurelist_name = '{} ({})'.format(featurelist_prefix, len(reduced_features)) # TODO have some suffix added, move try except
reduced_featurelist = project.create_featurelist(name=new_featurelist_name, features=reduced_features)
# make the progress bar prefix
pbar_prefix = f'{new_featurelist_name} - '
# ----- submit new featurelist and create models -----
pbar.set_description(f'{pbar_prefix}Starting autopilot')
temp_start = time.time()
project.start_autopilot(featurelist_id=reduced_featurelist.id, mode=mode, blend_best_models=False, prepare_model_for_deployment=False)
pbar.set_description(f'{pbar_prefix}Waiting for autopilot')
project.wait_for_autopilot(verbosity=dr.VERBOSITY_LEVEL.SILENT) #TODO some kind of spinning bar (threading)
tqdm.write(f'Autopilot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Waiting for feature impact')
datarobot_project_models = project.get_models()
for model in datarobot_project_models:
if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] != None:
try:
model.request_feature_impact()
except dr.errors.JobAlreadyRequested:
pass
# API note: Is there a project-level wait function for all jobs, regardless of AutoPilot status?
self._wait_for_jobs(project, pbar=pbar, pbar_prefix=pbar_prefix, job_type='Feature Impact ')
tqdm.write(f'Feature Impact: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Waiting for DataRobot')
all_feature_importances = []
while(len(all_feature_importances) == 0):
for model in datarobot_project_models:
if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] != None:
all_feature_importances.extend(model.get_or_request_feature_impact())
time.sleep(5)
tqdm.write(f'Waiting for DataRobot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
# sort features by the feature importance statistic
stat_feature_importances = pd.DataFrame(all_feature_importances).groupby('featureName')['impactNormalized']
if feature_impact_metric.lower() == 'median':
stat_feature_importances = stat_feature_importances.median().sort_values(ascending=False)
elif feature_impact_metric.lower() == 'mean':
stat_feature_importances = stat_feature_importances.mean().sort_values(ascending=False)
elif feature_impact_metric.lower() == 'cumulative':
stat_feature_importances = stat_feature_importances.sum().sort_values(ascending=False)
# ----- Graphing -----
if to_graph:
if 'feature_performance' in to_graph:
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Graphing feature performance stackplot')
utils.feature_performance_stackplot(project=project,
featurelist_prefix=featurelist_prefix,
starting_featurelist=starting_featurelist,
feature_impact_metric=feature_impact_metric,
metric=metric)
plt.show()
plt.close()
print(f'Performance Stackplot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
if 'models' in to_graph:
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Graphing model performance boxplots')
utils.parsimony_performance_boxplot(project,
featurelist_prefix=featurelist_prefix,
starting_featurelist=starting_featurelist)
plt.show()
plt.close()
print(f'Model Performance Boxplot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
# ----- LIVES -----
# check for the best model (supplied metric of cv)
if lives != None:
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Checking lives')
if featurelist_length == feature_range[0]: # for the first time, check model scores instead of making sure the model id doesn't change (what _check_lives does)
current_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=featurelist_prefix)
previous_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
current_best_model_score = mean(current_best_model.get_cross_validation_scores()['cvScores'][metric].values())
if previous_best_model_score > current_best_model_score:
lives -= 1
tqdm.write(f'Current model performance: \'{current_best_model_score}\'. Last best model performance: \'{previous_best_model_score}\'\nNo change in the best model, so a life was lost.\nLives remaining: \'{lives}\'')
logging.debug(f'{current_best_model_score}, {previous_best_model_score}, {lives}')
previous_best_model = current_best_model
else: # get the best model and check their id
lives, previous_best_model = self._check_lives(lives=lives,
project=project,
previous_best_model=previous_best_model,
featurelist_prefix=featurelist_prefix,
metric=metric,
verbose=verbose)
tqdm.write(f'Checking lives: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
if lives < 0:
current_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
tqdm.write(f'Ran out of lives.\nBest model: \'{previous_best_model}\'\nAccuracy ({metric}):\'{current_best_model_score}\'')
break
# ----------------------------------------------------------------------------------
if verbose:
tqdm.write(f'Lives left: {lives} | Previous Model Best Score: {previous_best_model_score} | Current Best Model Score: {current_best_model_score}')
# ----------------------------------------------------------------------------------
# ----- cv_average_mean_error_limit -----
# for the current featurelist, check the cv metric for all models and get the standard deviation of the metric among the cv folds for each model.
# Then, take the average of those standard deviation values and check that it is below the cv_average_mean_error_limit
if cv_average_mean_error_limit != None:
temp_start = time.time()
pbar.set_description(f'{pbar_prefix}Checking mean error limit')
cv_metrics_dict = {}
for model in datarobot_project_models:
if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] != None:
cv_metrics_dict[model] = stdev(model.get_cross_validation_scores()['cvScores'][metric].values())
error_from_mean = mean(cv_metrics_dict.values())
print(f'Mean Error Limit: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
if error_from_mean > cv_average_mean_error_limit:
tqdm.write(f'Error from the mean over the limit! Stopping parsimony analysis.\nError from the mean: \'{error_from_mean}\'\nLimit set: \'{cv_average_mean_error_limit}\'')
logging.debug(f'{error_from_mean}, {cv_average_mean_error_limit}')
break
# ----------------------------------------------------------------------------------
if verbose:
tqdm.write(f'CV Error From the Mean: {error_from_mean} | CV Mean Error Limit: {cv_average_mean_error_limit} | CV Model Performance Metric: {metric}')
# ----------------------------------------------------------------------------------
except dr.errors.ClientError as e: # TODO flesh out exceptions logger option/verbose
if 'Feature list named' in str(e) and 'already exists' in str(e):
pass
else:
raise e
temp_start = time.time()
pbar.set_description(f'Graphing final feature performances')
self._feature_performances_hbar(stat_feature_importances=stat_feature_importances, featurelist_name=new_featurelist_name, metric=feature_impact_metric)
tqdm.write(f'Finished Parsimony Analysis in {time.time()-start_time:.{config.TIME_DECIMALS}f}s.')