Source code for rapa.base

from . import utils
from . import config
from . import _var

import time

from sklearn.feature_selection import f_regression, f_classif
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils import check_array

from typing import List
from typing import Callable
from typing import Tuple
from typing import Union

import pandas as pd
import numpy as np
from statistics import mean
from statistics import stdev
from math import ceil

import matplotlib.pyplot as plt
import seaborn as sns

import datarobot as dr

import logging
import warnings

try: # use the notebook-friendly progress bar when running inside Jupyter
    get_ipython
    from tqdm.notebook import tqdm
except (NameError, ImportError):
    from tqdm import tqdm


class RAPABase():
    """
    The base of regression and classification RAPA analysis
    """

    POSSIBLE_TARGET_TYPES = [x for x in dir(dr.enums.TARGET_TYPE) if not x.startswith('__')] # List of DR TARGET_TYPES
    """
    * _classification = None # Set by child classes
    * target_type = None # Set at initialization
    * project = None # Set at initialization or with 'perform_parsimony()'
    """

    def __init__(self):
        if self.__class__.__name__ == "RAPABase":
            raise RuntimeError("Do not instantiate the RAPABase class directly; use RAPAClassif or RAPARegress")

    @staticmethod
    def _wait_for_jobs(project: dr.Project, progress_bar: bool = True, sleep_time: int = 5, pbar = None, pbar_prefix: str = '', job_type: str = '', timeout = 21600):
        """Gets all the jobs for a project, and while there are more than 0 current jobs, sleeps for `sleep_time` seconds and checks again.

        :Parameters:
        ------------
            project: datarobot.Project
                The datarobot.Project that will be probed for current jobs

            progress_bar: bool, optional (default = True)
                If True, progress updates are written to the progress bar (or to stdout through tqdm)

            sleep_time: int, optional (default = 5)
                The time to sleep between datarobot.Project.get_all_jobs() calls (avoids sending too many api requests)
                TODO: warning or check for max api requests

            pbar: tqdm.tqdm, optional (default = None)
                A progress bar object from tqdm

            pbar_prefix: str, optional (default = '')
                The prefix to put in front of the progress bar message

            job_type: str, optional (default = '')
                A string to put in front of the jobs left (after pbar_prefix)

            timeout: int, optional (default = 21600)
                The maximum number of seconds to wait for jobs before raising a TimeoutError
        """
        start_time = time.time()
        if len(project.get_all_jobs()) > 0:
            if progress_bar:
                if pbar:
                    pbar.set_description(f'{pbar_prefix}{job_type}job(s) remaining ({len(project.get_all_jobs())})')
                else:
                    tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='')
            time.sleep(sleep_time)

        while len(project.get_all_jobs()) > 0:
            while len(project.get_all_jobs()) > 0: # double check
                if progress_bar: # PROGRESS BAR
                    if pbar:
                        pbar.set_description(f'{pbar_prefix}{job_type}job(s) remaining ({len(project.get_all_jobs())})')
                    else:
                        tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='')
                time.sleep(sleep_time)
                if time.time()-start_time >= timeout:
                    raise TimeoutError(f'Waited more than {timeout} seconds for jobs to finish.')
            time.sleep(sleep_time+5) # sometimes all jobs will be complete and no jobs will be in queue, but then more jobs will be created

        tqdm.write(f'\r{job_type}job(s) remaining ({len(project.get_all_jobs())})', end='\n')
        return None

    @staticmethod
    def _check_lives(lives: int, project: dr.Project, previous_best_model: dr.Model, featurelist_prefix: str = None, starred: bool = False, metric: str = 'AUC', verbose: bool = False) -> Tuple[int, dr.Model]:
        """Finds the 'best' model of a project (or of a project's featurelists) and returns the new `lives` count (decreased by 1 if the best model doesn't change) along with that 'best' model.

        Uses `rapa.utils.get_best_model` to find the current best model, and decides whether the model has changed by comparing `datarobot.Model.id`.

        Returns a tuple with the number of 'lives' left in the first position, and the current 'best' model in the second position.

        :Parameters:
        ------------
            lives: int
                The current number of 'lives' remaining in parsimony analysis

            project: datarobot.Project
                The datarobot.Project parsimony analysis is being performed in

            previous_best_model: datarobot.Model
                The previously 'best' model in the datarobot.Project before a round of parsimony analysis

            featurelist_prefix: str, optional (default = None)
                The desired prefix for the featurelists that will be used for searching for the 'best' model.
                If None, will search the entire datarobot.Project

            starred: bool, optional (default = False)
                If True, searches only the project's starred models. If False, searches all of the project's models

            metric: str, optional (default = 'AUC')
                What model cross-validation metric to use when averaging scores to find the 'best' model

            verbose: bool, optional (default = False)
                If True, prints previous and current best model information before returning

        :Returns:
        ----------
            Tuple(int, datarobot.Model)
                A tuple with the new `lives` in the first position, and the new 'best' model after one round of parsimony analysis
        """
        # check for the best model (supplied metric of cv)
        current_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=featurelist_prefix, starred=starred)

        if current_best_model.id == previous_best_model.id:
            lives -= 1
            current_best_model_score = mean(current_best_model.get_cross_validation_scores()['cvScores'][metric].values())
            last_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
            if verbose:
                tqdm.write(f'Current model performance: \'{current_best_model_score}\'. Previous best model performance: \'{last_best_model_score}\'\nNo change in the best model, so a life was lost.\nLives remaining: \'{lives}\'')
            else:
                tqdm.write(f'Lives left: \'{lives}\'')

        return (lives, current_best_model)

    @staticmethod
    def _feature_performances_hbar(stat_feature_importances, featurelist_name, metric='', stacked=False, colormap='tab20'):
        # Plot a horizontal bar chart of the top (at most config.MAX_FEATURES_TO_LABEL) normalized feature impact scores for a featurelist
        feature_performances = pd.DataFrame(stat_feature_importances.rename(len(stat_feature_importances)))
        warnings.filterwarnings('ignore', message='The handle <BarContainer object of 1 artists>')
        feature_performances_to_plot = feature_performances.iloc[:config.MAX_FEATURES_TO_LABEL].T.set_axis(list(feature_performances.iloc[:config.MAX_FEATURES_TO_LABEL].T.columns), axis=1, inplace=False)

        plt.figure(figsize=(config.FIG_SIZE[0], config.FIG_SIZE[1]/2))
        ax = sns.barplot(data=feature_performances_to_plot, orient='h', palette=colormap)
        ax.set(xlabel='Normalized Impact of Features',
               title=f'{min([config.MAX_FEATURES_TO_LABEL, len(feature_performances)])} {metric} Impact Normalized Feature Performances\nFeaturelist: {featurelist_name}',
               ylabel='Features')
        warnings.filterwarnings('default')
        plt.show()
        return None
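
    # Illustrative sketch (not part of the original source): how the lives bookkeeping in
    # `_check_lives` behaves. `some_project` and `best_model` stand in for an existing
    # datarobot.Project and datarobot.Model, and the prefix/metric values are example
    # arguments only.
    #
    #     lives, best_model = RAPABase._check_lives(lives=3, project=some_project,
    #                                               previous_best_model=best_model,
    #                                               featurelist_prefix='RAPA Reduced to',
    #                                               metric='AUC')
    #     # lives is now 2 if the 'best' model id did not change, otherwise it is still 3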

    def create_submittable_dataframe(self, input_data_df: pd.DataFrame, target_name: str, n_features: int = 19990,
                                     n_splits: int = 6, filter_function: Callable[[pd.DataFrame, np.ndarray], List[np.ndarray]] = None,
                                     random_state: int = None) -> pd.DataFrame: # TODO: change return type
        """Prepares the input data for submission as either a regression or classification problem on DataRobot.

        Creates pre-determined k-fold cross-validation splits and filters the feature set down to a size that DataRobot can receive as input, if necessary.
        TODO: private function submit_datarobot_project explanation

        :Parameters:
        ------------
            input_data_df: pandas.DataFrame
                pandas DataFrame containing the feature set and prediction target.

            target_name: str
                Name of the prediction target column in `input_data_df`.

            n_features: int, optional (default: 19990)
                The number of features to reduce the feature set in `input_data_df` down to.
                DataRobot's maximum feature set size is 20,000.
                If `n_features` is at least the number of features in `input_data_df`, NaN values are allowed because no feature filtering will occur.

            n_splits: int, optional (default: 6)
                The number of cross-validation splits to create. One of the splits will be retained as a holdout split,
                so by default this function sets up the dataset for 5-fold cross-validation with a holdout.
                NOTE: `CV Fold 0` is the holdout set by default.

            filter_function: callable, optional (default: None)
                The function used to calculate the importance of each feature in the initial filtering step that reduces
                the feature set down to `n_features`.

                This filter function must take a feature matrix as the first input and the target array as the second input,
                then return two separate arrays containing the feature importance of each feature and the P-value for that
                correlation, in that order.

                When None, the filter function is determined by child class. If an instance of `RAPAClassif()`,
                sklearn.feature_selection.f_classif is used. If `RAPARegress()`, sklearn.feature_selection.f_regression is used.
                See scikit-learn's f_regression function for an example:
                https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html

            random_state: int, optional (default: None)
                The random number generator seed for RAPA. Use this parameter to make sure that RAPA will give you
                the same results each time you run it on the same input data set with that seed.

        :Returns:
        ------------
            pandas.DataFrame
                DataFrame containing the original values from `input_data_df`, but with pre-determined k-fold
                cross-validation splits, and filtered down to at most `n_features` features using `filter_function`
        """
        # TODO: make private function?

        # Check dataframe has the 'target_name' column
        if target_name not in input_data_df.columns:
            raise KeyError(f'{target_name} is not a column in the input DataFrame')

        # Copy the dataframe and remove the target_name column from the feature set
        input_data_df = input_data_df.copy()
        only_features_df = input_data_df.drop(columns=[target_name])

        # Check if the requested number of features covers the number of features provided
        # If it does, skip feature filtering and allow NaNs
        if n_features >= only_features_df.shape[1]:
            feature_filter = False
        else:
            feature_filter = True

        # Set target_type, kfold_type, and filter_function based on the type of classification/regression problem
        if self._classification:
            # Check if binary or multiclass classification problem
            if len(np.unique(input_data_df[target_name].values)) == 2:
                self.target_type = dr.enums.TARGET_TYPE.BINARY
            else:
                self.target_type = dr.enums.TARGET_TYPE.MULTICLASS
            kfold_type = StratifiedKFold
            if filter_function is None:
                filter_function = f_classif
        else:
            # Check array for infinite values/NaNs
            if feature_filter:
                check_array(input_data_df)
            kfold_type = KFold
            self.target_type = dr.enums.TARGET_TYPE.REGRESSION
            if filter_function is None:
                filter_function = f_regression

        # Create 'partition' column and set all values to 'train'
        input_data_df['partition'] = 'train'
        train_feature_importances = []

        # Make cross validation folds
        fold_name_prefix = 'CV Fold'
        for fold_num, (_, fold_indices) in enumerate(kfold_type(n_splits=n_splits, random_state=random_state, shuffle=True).split(only_features_df.values, input_data_df[target_name].values)):
            # Replace the values in the partition column with their true CV fold, removing all 'train' entries
            input_data_df.iloc[fold_indices, input_data_df.columns.get_loc('partition')] = f'{fold_name_prefix} {fold_num}'

            # Fold 0 is the holdout set, so don't calculate feature importances using that fold
            if feature_filter and fold_num > 0:
                feature_importances, _ = filter_function(only_features_df.iloc[fold_indices].values, input_data_df[target_name].iloc[fold_indices].values)
                train_feature_importances.append(feature_importances)

        # Change partition 0 name to 'Holdout'
        input_data_df.loc[input_data_df['partition'] == f'{fold_name_prefix} 0', 'partition'] = 'Holdout'

        if feature_filter:
            # We calculate the overall feature importance scores by averaging the feature importance scores across all of the training folds
            avg_train_feature_importances = np.mean(train_feature_importances, axis=0)

            # Get the top `n_features` correlated features as a list
            most_correlated_features = only_features_df.columns.values[np.argsort(avg_train_feature_importances)[::-1][:n_features]].tolist()

            # Put the target_name, partition, and most correlated feature columns in the DataRobot upload dataframe
            datarobot_upload_df = input_data_df[[target_name, 'partition'] + most_correlated_features]
        else:
            # Put the target_name, partition, and all feature columns in the DataRobot upload dataframe
            datarobot_upload_df = input_data_df[[target_name, 'partition'] + only_features_df.columns.values.tolist()]

        return datarobot_upload_df
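
    # Illustrative usage sketch (not part of the original source): preparing a submittable
    # dataframe with `create_submittable_dataframe`. The CSV path and target column name are
    # placeholders, and `RAPAClassif` is the classification child class referenced by this
    # module's docstrings (its exact import path is an assumption).
    #
    #     import pandas as pd
    #     from rapa import RAPAClassif  # assumed import path
    #
    #     rapa_clf = RAPAClassif()
    #     df = pd.read_csv('my_data.csv')
    #     submittable_df = rapa_clf.create_submittable_dataframe(df, target_name='disease_status',
    #                                                            n_features=2000, random_state=42)
    #     # submittable_df now holds a 'partition' column ('Holdout', 'CV Fold 1', ...) and at
    #     # most 2000 features, which is what submit_datarobot_project expects.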

    def submit_datarobot_project(self, input_data_df: pd.DataFrame, target_name: str, project_name: str,
                                 target_type: str = None, worker_count: int = -1, metric: str = None,
                                 mode: str = dr.AUTOPILOT_MODE.FULL_AUTO, random_state: int = None) -> dr.Project:
        # TODO: check input df for partition, target_name (data-robotified df), logger.warning
        """Submits the input data to DataRobot as a new modeling project.

        It is suggested to prepare the `input_data_df` using the 'create_submittable_dataframe' function first with an instance of either RAPAClassif or RAPARegress.

        :Parameters:
        ------------
            input_data_df: pandas.DataFrame
                pandas DataFrame containing the feature set and prediction target.

            target_name: str
                Name of the prediction target column in `input_data_df`.

            project_name: str
                Name of the project in DataRobot.

            target_type: str (enum), optional (default: None)
                Indicator to DataRobot of whether the new modeling project should be a binary classification,
                multiclass classification, or regression project.

                Options:
                    * datarobot.TARGET_TYPE.BINARY
                    * datarobot.TARGET_TYPE.REGRESSION
                    * datarobot.TARGET_TYPE.MULTICLASS

            worker_count: int, optional (default: -1)
                The number of worker engines to assign to the DataRobot project.
                By default, -1 tells DataRobot to use all available worker engines.

            metric: str, optional (default: None)
                Name of the metric to use for evaluating models. You can query the metrics available for the target
                by way of Project.get_metrics. If None is specified, then the default recommended by DataRobot is used.

            mode: str (enum), optional (default: datarobot.AUTOPILOT_MODE.FULL_AUTO)
                The modeling mode to start the DataRobot project in.

                Options:
                    * datarobot.AUTOPILOT_MODE.FULL_AUTO
                    * datarobot.AUTOPILOT_MODE.QUICK
                    * datarobot.AUTOPILOT_MODE.MANUAL
                    * datarobot.AUTOPILOT_MODE.COMPREHENSIVE: Runs all blueprints in the repository (this may be extremely slow).

            random_state: int, optional (default: None)
                The random number generator seed for DataRobot. Use this parameter to make sure that DataRobot will give you
                the same results each time you run it on the same input data set with that seed.

        :Returns:
        ------------
            datarobot.Project
                The DataRobot project submitted
        """
        # TODO: provide an option for columns to disregard

        # Check for a target_type
        if target_type is None or target_type not in self.POSSIBLE_TARGET_TYPES:
            target_type = self.target_type
            if target_type is None:
                raise Exception('No target type was provided or previously set (e.g. by create_submittable_dataframe).')

        project = dr.Project.create(sourcedata=input_data_df, project_name=project_name)

        project.analyze_and_model(target=target_name, target_type=target_type, worker_count=worker_count, mode=mode, metric=metric,
                                  advanced_options=dr.AdvancedOptions(seed=random_state, accuracy_optimized_mb=False,
                                                                      prepare_model_for_deployment=False, blend_best_models=False),
                                  partitioning_method=dr.UserCV(user_partition_col='partition', cv_holdout_level='Holdout'))

        return project
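
    # Illustrative usage sketch (not part of the original source): submitting the prepared
    # dataframe from the previous sketch as a new DataRobot project. Assumes a DataRobot API
    # client has already been configured (for example through rapa.utils or datarobot.Client),
    # and the project name is a placeholder.
    #
    #     project = rapa_clf.submit_datarobot_project(submittable_df, target_name='disease_status',
    #                                                 project_name='RAPA disease_status demo',
    #                                                 random_state=42)
    #     rapa_clf.project = project  # optional: store it so perform_parsimony() can default to it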

    def perform_parsimony(self, feature_range: List[Union[int, float]], project: Union[dr.Project, str] = None,
                          starting_featurelist_name: str = 'Informative Features', featurelist_prefix: str = 'RAPA Reduced to',
                          mode: str = dr.AUTOPILOT_MODE.FULL_AUTO, lives: int = None, cv_average_mean_error_limit: float = None,
                          feature_impact_metric: str = 'median', progress_bar: bool = True, to_graph: List[str] = None,
                          metric: str = None, verbose: bool = True):
        """Performs parsimony analysis by iteratively extracting feature importance from DataRobot models and creating new models with reduced features (smaller featurelists).
        # TODO take a look at featurelist_prefix for running multiple RAPA

        NOTICE: Feature impact scores are only gathered from models that have had their **cross-validation accuracy** tested!

        :Parameters:
        ------------
            feature_range: list[int] | list[float]
                Either a list containing integers representing desired featurelist lengths,
                or a list containing floats representing desired featurelist percentages (of the original featurelist size)

            project: datarobot.Project | str, optional (default = None)
                Either a datarobot project, or a string of its id or name. If None, uses the project that was provided to create the rapa class

            starting_featurelist_name: str, optional (default = 'Informative Features')
                The name or id of the featurelist that rapa will start parsimony analysis with

            featurelist_prefix: str, optional (default = 'RAPA Reduced to')
                The desired prefix for the featurelists that rapa creates in datarobot. Each featurelist will start with the prefix,
                include a space, and then end with the number of features in that featurelist

            mode: str (enum), optional (default: datarobot.AUTOPILOT_MODE.FULL_AUTO)
                The modeling mode to start the DataRobot project in.

                Options:
                    * datarobot.AUTOPILOT_MODE.FULL_AUTO
                    * datarobot.AUTOPILOT_MODE.QUICK
                    * datarobot.AUTOPILOT_MODE.MANUAL
                    * datarobot.AUTOPILOT_MODE.COMPREHENSIVE: Runs all blueprints in the repository (warning: this may be extremely slow).

            lives: int, optional (default = None)
                The number of times allowed for reducing the featurelist and obtaining a worse model. By default, 'lives' are off,
                and the entire 'feature_range' will be run, but if supplied a number >= 0, then that is the number of 'lives' there are.

                Ex: lives = 0, feature_range = [100, 90, 80, 50]
                RAPA finds that after making all the models for the length 80 featurelist, the 'best' model was created with the
                length 90 featurelist, so it stops and doesn't make a featurelist of length 50.

                Similar to DataRobot's Feature Importance Rank Ensembling for advanced feature selection (FIRE) package's 'lives':
                https://www.datarobot.com/blog/using-feature-importance-rank-ensembling-fire-for-advanced-feature-selection/

            cv_average_mean_error_limit: float, optional (default = None)
                The limit of cross validation mean error to help avoid overfitting. By default, the limit is off,
                and each 'feature_range' will be run. The limit exists only if supplied a number >= 0.0

                Ex: cv_average_mean_error_limit = 0.1, feature_range = [100, 90, 80, 50]
                RAPA finds that the average AUC for each CV fold is [.8, .6, .9, .5] respectively; the mean of these is 0.7,
                and the average error from that mean is 0.15. Since 0.15 >= cv_average_mean_error_limit, the training stops.

            feature_impact_metric: str, optional (default = 'median')
                How RAPA will decide each feature's importance over every model in a featurelist

                Options:
                    * median
                    * mean
                    * cumulative

            progress_bar: bool, optional (default = True)
                If True, a simple progress bar displaying complete and incomplete featurelists.
                If False, provides updates in stdout (e.g. current worker count, current featurelist, etc.)

            to_graph: List[str], optional (default = None)
                A list of keys choosing which graphs to produce. Possible keys:

                    * 'models': `seaborn` boxplot with model performances with provided metric
                    * 'feature_performance': `matplotlib.pyplot` stackplot of feature performances

            metric: str, optional (default = None)
                The metric used for scoring models, when finding the 'best' model, and when plotting model performance.
                When None, the metric is determined by what class inherits from base. For instance, a `RAPAClassif`
                instance's default is 'AUC', and `RAPARegress` is 'R Squared'

            verbose: bool, optional (default = True)
                If True, prints updates from DataRobot and rapa during parsimonious feature reduction

        :Returns:
        ------------
            None
        """
        # TODO: return a dictionary of values? {"time_taken": 2123, "cv_mean_error": list[floats], ""}
        # TODO: graph cv performance boxplots
        # TODO: graph pareto front of median model performance vs feature list size
        # TODO: support more scoring metrics
        # TODO: CHECK FOR FEATURE/PARTITIONS INSTEAD OF JUST SUBTRACTING 2
        start_time = time.time()

        # check project
        if project is None:
            project = self.project
            if project is None:
                raise Exception('No provided datarobot.Project()')

        # check scoring metric
        if metric is None:
            if self._classification: # classification
                metric = 'AUC'
            else: # regression
                metric = 'R Squared'

        # check if project is a string, and if it is, find it
        if isinstance(project, str):
            project = utils.find_project(project)
            if project is None:
                raise Exception('Could not find the project.')

        # get starting featurelist
        starting_featurelist = utils.get_featurelist(starting_featurelist_name, project)

        # check feature_range size
        if len(feature_range) == 0:
            raise Exception('The provided feature_range is empty.')

        original_featurelist_size = len(starting_featurelist.features)-2 # -2 because of target feature and partitions

        # feature_range logic for sizes (ints) / ratios (floats)
        if np.array(feature_range).dtype.kind in np.typecodes['AllInteger']:
            feature_range_check = [x for x in feature_range if x < len(starting_featurelist.features)-2 and x > 0] # -2 because of target feature and partitions
            if len(feature_range_check) != len(feature_range): # check to see if values are <= 0 or >= the length of the original featurelist
                raise Exception('The provided feature_range integer values have to be: 0 < feature_range < original featurelist length')
        elif np.array(feature_range).dtype.kind in np.typecodes['AllFloat']:
            feature_range_check = [x for x in feature_range if x > 0 and x < 1]
            if len(feature_range_check) != len(feature_range):
                raise Exception('The provided feature_range ratio values have to be: 0 < feature_range < 1')
            # convert ratios to featurelist sizes
            feature_range = [ceil(original_featurelist_size * feature_pct) for feature_pct in feature_range] # multiply by feature_pct and take ceil
            feature_range = pd.Series(feature_range).drop_duplicates() # drop duplicates
            feature_range = list(feature_range[feature_range < original_featurelist_size]) # take all values that are less than the original featurelist size
            feature_range.sort(reverse=True) # sort descending
        else:
            raise TypeError('Provided \'feature_range\' is not all Int or all Float.')

        # ---------------------------------------------------------------------------------------------
        if verbose:
            tqdm.write(f"---------- {starting_featurelist_name} ({original_featurelist_size}) ----------")
        # ---------------------------------------------------------------------------------------------

        # ----------------------------------------------------------------------------------
        if config.DEBUG_STATEMENTS:
            logging.debug(f'project:{project}| starting_featurelist:{starting_featurelist}| metric:{metric}| feature_range:{feature_range}')
        # ----------------------------------------------------------------------------------

        # wait for any jobs started before rapa
        tqdm.write(f'{starting_featurelist_name}: Waiting for previous jobs to complete...')
        self._wait_for_jobs(project, job_type='Previous ')

        # waiting for feature impact message
        temp_start = time.time()
        tqdm.write(f'{starting_featurelist_name}: Waiting for feature impact...')

        # get the models from the starting featurelist
        datarobot_project_models = project.get_models()
        if len(datarobot_project_models) == 0:
            raise Exception(f'There are no models in the datarobot project "{project}"')

        for model in datarobot_project_models: # for each model
            if model.featurelist_id == starting_featurelist.id: # if the model uses the starting featurelist, request the feature impact
                if model.metrics[metric]['crossValidation'] is not None:
                    try:
                        model.request_feature_impact()
                    except dr.errors.JobAlreadyRequested:
                        continue

        # wait a bit for jobs to start
        time.sleep(5)

        # TODO request_feature_impact returns a job indicator?
        self._wait_for_jobs(project, job_type='Feature Impact ')
        tqdm.write(f'Feature Impact: ({time.time()-temp_start:.{config.TIME_DECIMALS}f}s)')

        # get feature impact/importances of the original featurelist
        all_feature_importances = []
        for model in datarobot_project_models:
            if model.featurelist_id == starting_featurelist.id: # gather feature impact for starting featurelist models
                if model.metrics[metric]['crossValidation'] is not None:
                    all_feature_importances.extend(model.get_or_request_feature_impact())

        # sort features by feature importance statistic
        stat_feature_importances = pd.DataFrame(all_feature_importances).groupby('featureName')['impactNormalized']
        if feature_impact_metric.lower() == 'median':
            stat_feature_importances = stat_feature_importances.median().sort_values(ascending=False)
        elif feature_impact_metric.lower() == 'mean':
            stat_feature_importances = stat_feature_importances.mean().sort_values(ascending=False)
        elif feature_impact_metric.lower() == 'cumulative':
            stat_feature_importances = stat_feature_importances.sum().sort_values(ascending=False)
        else: # feature_impact_metric isn't one of the provided statistics
            raise ValueError(f'The provided feature_impact_metric:{feature_impact_metric} is not one of the provided:{_var.FEATURE_IMPACT_STATISTICS}')

        # retain feature performance for each round, and plot the original feature performances
        if to_graph and 'feature_performance' in to_graph:
            tqdm.write('Graphing feature performance...')
            self._feature_performances_hbar(stat_feature_importances=stat_feature_importances, featurelist_name=starting_featurelist_name, metric=feature_impact_metric)
            plt.show()
            plt.close()

        # wait for DataRobot jobs
        self._wait_for_jobs(project, job_type='DataRobot ')

        # get the best performing model of this iteration
        previous_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=starting_featurelist_name)

        tqdm.write(f'Project: {project.project_name} | Featurelist Prefix: {featurelist_prefix} | Feature Range: {feature_range}')
        if verbose:
            tqdm.write(f'Feature Importance Metric: {feature_impact_metric} | Model Performance Metric: {metric}')
            if lives:
                tqdm.write(f'Lives: {lives}')
            if cv_average_mean_error_limit:
                tqdm.write(f'CV Mean Error Limit: {cv_average_mean_error_limit}')

        # perform parsimony
        pbar = tqdm(feature_range, disable = not progress_bar)
        for featurelist_length in pbar:
            # ---------------------------------------------------------------------------------------------
            if verbose:
                tqdm.write(f"---------- {featurelist_prefix} ({featurelist_length}) ----------")
            # ---------------------------------------------------------------------------------------------
            try:
                # get shortened featurelist
                desired_reduced_featurelist_size = featurelist_length
                reduced_features = stat_feature_importances.head(desired_reduced_featurelist_size).index.values.tolist()

                # ----- create new featurelist in datarobot -----
                new_featurelist_name = '{} ({})'.format(featurelist_prefix, len(reduced_features)) # TODO have some suffix added, move try except
                reduced_featurelist = project.create_featurelist(name=new_featurelist_name, features=reduced_features)

                # make the progress bar prefix
                pbar_prefix = f'{new_featurelist_name} - '

                # ----- submit new featurelist and create models -----
                pbar.set_description(f'{pbar_prefix}Starting autopilot')
                temp_start = time.time()
                project.start_autopilot(featurelist_id=reduced_featurelist.id, mode=mode, blend_best_models=False, prepare_model_for_deployment=False)

                pbar.set_description(f'{pbar_prefix}Waiting for autopilot')
                project.wait_for_autopilot(verbosity=dr.VERBOSITY_LEVEL.SILENT) # TODO some kind of spinning bar (threading)
                tqdm.write(f'Autopilot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                temp_start = time.time()
                pbar.set_description(f'{pbar_prefix}Waiting for feature impact')
                datarobot_project_models = project.get_models()
                for model in datarobot_project_models:
                    if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] is not None:
                        try:
                            model.request_feature_impact()
                        except dr.errors.JobAlreadyRequested:
                            pass

                # API note: Is there a project-level wait function for all jobs, regardless of AutoPilot status?
                self._wait_for_jobs(project, pbar=pbar, pbar_prefix=pbar_prefix, job_type='Feature Impact ')
                tqdm.write(f'Feature Impact: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                temp_start = time.time()
                pbar.set_description(f'{pbar_prefix}Waiting for DataRobot')
                all_feature_importances = []
                while len(all_feature_importances) == 0:
                    for model in datarobot_project_models:
                        if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] is not None:
                            all_feature_importances.extend(model.get_or_request_feature_impact())
                    time.sleep(5)
                tqdm.write(f'Waiting for DataRobot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                # sort features by feature importance statistic
                stat_feature_importances = pd.DataFrame(all_feature_importances).groupby('featureName')['impactNormalized']
                if feature_impact_metric.lower() == 'median':
                    stat_feature_importances = stat_feature_importances.median().sort_values(ascending=False)
                elif feature_impact_metric.lower() == 'mean':
                    stat_feature_importances = stat_feature_importances.mean().sort_values(ascending=False)
                elif feature_impact_metric.lower() == 'cumulative':
                    stat_feature_importances = stat_feature_importances.sum().sort_values(ascending=False)

                # ----- Graphing -----
                if to_graph:
                    if 'feature_performance' in to_graph:
                        temp_start = time.time()
                        pbar.set_description(f'{pbar_prefix}Graphing feature performance stackplot')
                        utils.feature_performance_stackplot(project=project, featurelist_prefix=featurelist_prefix,
                                                            starting_featurelist=starting_featurelist,
                                                            feature_impact_metric=feature_impact_metric, metric=metric)
                        plt.show()
                        plt.close()
                        print(f'Performance Stackplot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')
                    if 'models' in to_graph:
                        temp_start = time.time()
                        pbar.set_description(f'{pbar_prefix}Graphing model performance boxplots')
                        utils.parsimony_performance_boxplot(project, featurelist_prefix=featurelist_prefix, starting_featurelist=starting_featurelist)
                        plt.show()
                        plt.close()
                        print(f'Model Performance Boxplot: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                # ----- LIVES -----
                # check for the best model (supplied metric of cv)
                if lives is not None:
                    temp_start = time.time()
                    pbar.set_description(f'{pbar_prefix}Checking lives')
                    if featurelist_length == feature_range[0]:
                        # for the first featurelist, compare model scores instead of making sure the model id doesn't change (what _check_lives does)
                        current_best_model = utils.get_best_model(project, metric=metric, featurelist_prefix=featurelist_prefix)
                        previous_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
                        current_best_model_score = mean(current_best_model.get_cross_validation_scores()['cvScores'][metric].values())
                        if previous_best_model_score > current_best_model_score:
                            lives -= 1
                            tqdm.write(f'Current model performance: \'{current_best_model_score}\'. Last best model performance: \'{previous_best_model_score}\'\nNo change in the best model, so a life was lost.\nLives remaining: \'{lives}\'')
                            logging.debug(f'{current_best_model_score}, {previous_best_model_score}, {lives}')
                        previous_best_model = current_best_model
                    else:
                        # get the best model and check its id
                        lives, previous_best_model = self._check_lives(lives=lives, project=project, previous_best_model=previous_best_model,
                                                                       featurelist_prefix=featurelist_prefix, metric=metric, verbose=verbose)
                    tqdm.write(f'Checking lives: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                    if lives < 0:
                        current_best_model_score = mean(previous_best_model.get_cross_validation_scores()['cvScores'][metric].values())
                        tqdm.write(f'Ran out of lives.\nBest model: \'{previous_best_model}\'\nAccuracy ({metric}): \'{current_best_model_score}\'')
                        break

                    # ----------------------------------------------------------------------------------
                    if verbose:
                        tqdm.write(f'Lives left: {lives} | Previous Model Best Score: {previous_best_model_score} | Current Best Model Score: {current_best_model_score}')
                    # ----------------------------------------------------------------------------------

                # ----- cv_average_mean_error_limit -----
                # for the current featurelist, check the cv metric for all models and get the standard deviation of the metric among the cv folds for each model.
                # Then, take the average of those standard deviation values and check that it is below the cv_average_mean_error_limit
                if cv_average_mean_error_limit is not None:
                    temp_start = time.time()
                    pbar.set_description(f'{pbar_prefix}Checking mean error limit')
                    cv_metrics_dict = {}
                    for model in datarobot_project_models:
                        if model.featurelist_id == reduced_featurelist.id and model.metrics[metric]['crossValidation'] is not None:
                            cv_metrics_dict[model] = stdev(model.get_cross_validation_scores()['cvScores'][metric].values())
                    error_from_mean = mean(cv_metrics_dict.values())
                    print(f'Mean Error Limit: {time.time()-temp_start:.{config.TIME_DECIMALS}f}s')

                    if error_from_mean > cv_average_mean_error_limit:
                        tqdm.write(f'Error from the mean over the limit! Stopping parsimony analysis.\nError from the mean: \'{error_from_mean}\'\nLimit set: \'{cv_average_mean_error_limit}\'')
                        logging.debug(f'{error_from_mean}, {cv_average_mean_error_limit}')
                        break

                    # ----------------------------------------------------------------------------------
                    if verbose:
                        tqdm.write(f'CV Error From the Mean: {error_from_mean} | CV Mean Error Limit: {cv_average_mean_error_limit} | CV Model Performance Metric: {metric}')
                    # ----------------------------------------------------------------------------------

            except dr.errors.ClientError as e:
                # TODO flesh out exceptions logger option/verbose
                if 'Feature list named' in str(e) and 'already exists' in str(e):
                    pass
                else:
                    raise e

        temp_start = time.time()
        pbar.set_description('Graphing final feature performances')
        self._feature_performances_hbar(stat_feature_importances=stat_feature_importances, featurelist_name=new_featurelist_name, metric=feature_impact_metric)

        tqdm.write(f'Finished Parsimony Analysis in {time.time()-start_time:.{config.TIME_DECIMALS}f}s.')
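
# Illustrative usage sketch (not part of the original module): running parsimony analysis on the
# project created in the earlier sketches. The feature_range ratios, lives, error limit, and graph
# keys are example values only; `rapa_clf` and `project` come from the sketches above.
#
#     rapa_clf.perform_parsimony(feature_range=[0.5, 0.25, 0.1, 0.05],
#                                project=project,
#                                featurelist_prefix='RAPA Reduced to',
#                                feature_impact_metric='median',
#                                lives=2,                          # stop after 2 rounds with no better model
#                                cv_average_mean_error_limit=0.1,  # stop if CV fold scores spread too widely
#                                to_graph=['models', 'feature_performance'],
#                                metric='AUC')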