Source code for rapa.utils
from . import config
import datarobot as dr
from datarobot.errors import ClientError
import pickle
import logging
from warnings import warn
from warnings import catch_warnings
from datarobot.models import featurelist
import pandas as pd
import numpy as np
from statistics import mean
from statistics import median
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sb
LOGGER = logging.getLogger(__name__)

def find_project(project: str) -> dr.Project:
"""Uses the DataRobot api to find a current project.
    Uses datarobot.Project.get() and datarobot.Project.list() to test whether 'project' is an id
    or a name of a project in DataRobot, then returns the project found.
:Parameters:
----------
project: str
Either a project id or a search term for project name
:Returns:
----------
datarobot.Project
        A datarobot project object that is either the project with the id provided, or the
        first/only project returned by searching by project name. Raises an exception if no
        project is found.
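    :Examples:
    ----------
    Illustrative usage only; 'my_project' is a hypothetical project name, and the
    DataRobot API is assumed to already be initialized (see initialize_dr_api)

    >>> from rapa import utils
    >>> project = utils.find_project('my_project')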
"""
project = str(project) # make sure the project id/name provided is a string
return_project = None
try: # try finding project with id
return_project = dr.Project.get(project_id=project)
return return_project
except ClientError: # id was not provided, most likely a name
project_list = dr.Project.list(search_params={'project_name': project})
        if len(project_list) == 0: # no project was found with that name either
raise Exception(f"No projects found with id/string of \'{project}\'")
elif len(project_list) == 1: # found one project with search, good
return project_list[0]
else: # more than one project was found
warn(f"Returning the first of multiple projects with \'{project}\': {project_list}")
return project_list[0]

def get_best_model(project: dr.Project,
                   featurelist_prefix: str = None,
                   starred: bool = False,
                   metric: str = None,
                   fold: str = 'crossValidation',
                   highest: bool = None) -> dr.Model:
    """Attempts to find the 'best' model in a DataRobot project by searching the cross validation
    scores of all the models in the supplied project. # TODO make dictionary for minimize/maximize
    By default, higher scores are treated as better for classification and lower scores as better for regression; use `highest` to override.
.. warning::
        Actually finding the 'best' model takes more than averaging cross validation
        scores, and it is suggested that the 'best' model be decided on and starred in DataRobot.
        (Make sure 'starred = True' if the starred model should be returned)
.. note::
Some models may not have scores for the supplied fold because they were not run. These
models are ignored by this function. Make sure all models of interest have scores for
the fold being provided if those models should be considered.
:Parameters:
----------
project: datarobot.Project
The project object that will be searched for the 'best' model
featurelist_prefix: str, optional (default = None)
        The desired featurelist prefix used to search for models built with specific
        rapa featurelists
starred: bool, optional (default = False)
        If True, return the starred model. If more than one model is starred,
        warn the user and return the 'best' of the starred models
    metric: str, optional (default = 'AUC' for classification, 'RMSE' for regression)
What model metric to use when finding the 'best'
fold: str, optional (default = 'crossValidation')
The fold of data used in DataRobot. Options are as follows:
['validation',
'crossValidation',
'holdout',
'training',
'backtestingScores',
'backtesting']
    highest: bool, optional (default for classification = True, default for regression = False)
        Whether to take the highest value (highest = True) or the lowest value (highest = False)
        as the 'best'. Set this explicitly if the assumed direction is wrong for the chosen metric.
:Returns:
----------
datarobot.Model
A datarobot model that is either the 'best', starred, or the 'best' of the starred models
from the provided datarobot project
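    :Examples:
    ----------
    Illustrative sketch; 'my_project' is a hypothetical project that already contains
    cross-validated models built on 'RAPA Reduced to' featurelists

    >>> from rapa import utils
    >>> project = utils.find_project('my_project')
    >>> best = utils.get_best_model(project, featurelist_prefix='RAPA Reduced to', metric='AUC')
    >>> starred = utils.get_starred_model(project)  # same search, but restricted to starred models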
"""
# if metric is missing, assume a metric
    if metric is None:
if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS:
# classification
metric = 'AUC'
elif project.target_type == dr.TARGET_TYPE.REGRESSION:
# regression
metric = 'RMSE'
# if highest is missing, assume a direction
    if highest is None:
if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS:
# classification
highest = True
elif project.target_type == dr.TARGET_TYPE.REGRESSION:
highest = False
scores = []
#### get scores
# set featurelist_prefix to '' for ease of use in code
    if featurelist_prefix is None:
        featurelist_prefix = ''
    for model in project.get_models():
        current_model_score = model.metrics[metric][fold] # get the score for the metric and fold
        # keep the model if it is scored in this fold and was created with the featurelist prefix
        # we are looking for; when `starred` is True, additionally require the model to be starred
        if model.featurelist_name.startswith(featurelist_prefix) and current_model_score and (not starred or model.is_starred):
            scores.append((current_model_score, model)) # add the model score and model object to a list
#### find the best scores
# check that there are any models
if len(scores) > 1: # multiple models
return sorted(scores, key=lambda tup: tup[0], reverse=highest)[0][1] # sort by first item in the tuples
elif len(scores) == 1: # one model
return scores[0][1]
else: # no models
raise Exception(f"No models found. \n Parameters: project=`{project}`, metric=`{metric}`, fold=`{fold}`, featurelist_prefix=`{featurelist_prefix}`, starred=`{starred}`, highest=`{highest}`")
# alias for get_best_model

def get_starred_model(project: dr.Project,
metric: str = None,
featurelist_prefix: str = None) -> dr.Model:
"""Alias for rapa.utils.get_best_model() but makes starred = True
"""
return get_best_model(project, starred = True, metric = metric, featurelist_prefix = featurelist_prefix)

def initialize_dr_api(token_key: str = None,
file_path: str = 'data/dr-tokens.pkl',
endpoint: str = 'https://app.datarobot.com/api/v2'):
"""Initializes the DataRobot API with a pickled dictionary created by the user.
    .. warning::
        It is advised that the user keep the pickled dictionary in a directory that is
        ignored by git (add the file to the .gitignore)
    Accesses a file that should be a pickled dictionary. This dictionary has the API token
    as the value for the provided token_key. Ex: {token_key: 'API_TOKEN'}
:Parameters:
----------
token_key: str
The API token's key in the pickled dictionary located in file_path
file_path: str, optional (default = 'data/dr-tokens.pkl')
Path to the pickled dictionary containing the API token
endpoint: str, optional (default = 'https://app.datarobot.com/api/v2')
The endpoint is usually the URL you would use to log into the DataRobot Web User Interface
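    :Examples:
    ----------
    Illustrative setup; the token value, key, and file path below are placeholders.
    First pickle a dictionary that maps a key of your choosing to your API token:

    >>> import pickle
    >>> pickle.dump({'tutorial': 'YOUR_API_TOKEN'}, open('data/dr-tokens.pkl', 'wb'))

    Then initialize the API using that key:

    >>> from rapa import utils
    >>> utils.initialize_dr_api('tutorial')
    DataRobot API initiated with endpoint 'https://app.datarobot.com/api/v2'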
"""
# load pickled dictionary and initialize api, catching FileNotFound, KeyError, and failed authentication warning
try:
datarobot_tokens = pickle.load(open(file_path, 'rb'))
with catch_warnings(record=True) as w: # appends warning to w if warning occurs
dr.Client(endpoint=endpoint, token=datarobot_tokens[token_key])
            if w: # w is non-empty if a warning was caught (e.g. failed authentication)
                raise Exception(w[0].message)
except FileNotFoundError:
raise FileNotFoundError(f'The file {file_path} does not exist.') # TODO: Make a tutorial on how to create the pickled dictionary with api tokens and link here
except KeyError:
raise KeyError(f'\'{token_key}\' is not in the dictionary at \'{file_path}\'')
# TODO: I probably didn't catch all errors, make tests for this
print(f'DataRobot API initiated with endpoint \'{endpoint}\'')

def get_featurelist(featurelist: str,
project: dr.Project) -> dr.Featurelist:
"""Uses the DataRobot api to search for a desired featurelist.
Uses datarobot.Project.get_featurelists() to retrieve all the featurelists in
    the project. Then, it searches the list for ids, and if it doesn't find any,
    it searches the list again for names. Returns the first featurelist it finds.
:Parameters:
----------
featurelist: str
Either a featurelist id or a search term for featurelist name
project: datarobot.Project
The project that is being searched for the featurelist
:Returns:
----------
datarobot.Featurelist
        The featurelist that was found. Raises an exception if no featurelist is found
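    :Examples:
    ----------
    Illustrative usage; `project` is a hypothetical datarobot.Project, and
    'Informative Features' stands in for any featurelist name or id in that project

    >>> from rapa import utils
    >>> project = utils.find_project('my_project')
    >>> featurelist = utils.get_featurelist('Informative Features', project)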
"""
featurelist = str(featurelist) # cast to string just in case id is an int or something
featurelists = project.get_featurelists()
dr_featurelist = [x for x in featurelists if featurelist == x.id] # loop over all the featurelists and get all that match featurelist (assuming it is an id)
if dr_featurelist: # if dr_featurelist is not empty
return dr_featurelist[0] # there should only be one id
else: # if dr_featurelist is empty
dr_featurelist = [x for x in featurelists if featurelist.lower() in str(x.name).lower()] # use python's `in` to search strings
if not dr_featurelist: # if dr_featurelist is empty
raise Exception(f"No featurelists were found with the id/name of \'{featurelist}\'")
elif len(dr_featurelist) > 1: # if dr_featurelist has more than 1
warn(f'More than one featurelist was found: \'{dr_featurelist}\', returning the first.')
return dr_featurelist[0]
        else: # dr_featurelist has exactly one entry
return dr_featurelist[0]

def parsimony_performance_boxplot(project: dr.Project,
featurelist_prefix: str = 'RAPA Reduced to',
starting_featurelist: str = None,
metric: str = None,
split: str = 'crossValidation',
featurelist_lengths: list = None):
"""Uses `seaborn`'s `boxplot` function to plot featurelist size vs performance
    for all models that use featurelists with the given prefix. There is a different boxplot for
    each featurelist length. # TODO warn about multiple prefixes, try to use new prefixes
    :Parameters:
----------
project: datarobot.Project
        Either a datarobot project, or a string of its id or name
featurelist_prefix: str, optional (default = 'RAPA Reduced to')
The desired prefix for the featurelists that will be used for plotting parsimony performance. Each featurelist
will start with the prefix, include a space, and then end with the number of features in that featurelist
starting_featurelist: str, optional (default = None)
The starting featurelist used for parsimony analysis. If None, only
the featurelists with the desired prefix in `featurelist_prefix` will be plotted
    metric: str, optional (default = 'AUC' for classification, 'RMSE' for regression)
The metric used for plotting accuracy of models
split: str, optional (default = 'crossValidation')
        Which data split's performance to plot.
        Can be: ['crossValidation', 'holdout'] TODO: I think it can be more, double check
featurelist_lengths: list, optional (default = None)
A list of featurelist lengths to plot
:Returns:
----------
    pandas.DataFrame
        The model performances used for the plot, with one column per featurelist length (TODO: return plot?)
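    :Examples:
    ----------
    Illustrative sketch; assumes a hypothetical project that has completed a rapa
    parsimony analysis using the default 'RAPA Reduced to' featurelist prefix

    >>> from rapa import utils
    >>> project = utils.find_project('my_parsimony_project')
    >>> performance_df = utils.parsimony_performance_boxplot(project, featurelist_prefix='RAPA Reduced to')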
"""
# if `project` is a string, find the project
if type(project) is str:
project = find_project(project)
# if metric is missing, assume a metric
    if metric is None:
if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS:
# classification
metric = 'AUC'
elif project.target_type == dr.TARGET_TYPE.REGRESSION:
# regression
metric = 'RMSE'
datarobot_project_models = project.get_models() # get all the models in the provided project
if starting_featurelist:
        if isinstance(starting_featurelist, str):
starting_featurelist = get_featurelist(starting_featurelist, project)
num_starting_featurelist_features = len(starting_featurelist.features)
featurelist_performances = defaultdict(list)
    for model in datarobot_project_models: # for every model, if the model has the prefix, then add its performance
        if model.featurelist_name is not None and featurelist_prefix in model.featurelist_name:
            num_features = int(model.featurelist_name.split(' ')[-1].strip('()')) # parse the number of features from the featurelist name
            if model.metrics[metric][split] is not None: # if the model has no score for this split, don't add the metric
                if featurelist_lengths and num_features in featurelist_lengths:
                    featurelist_performances[num_features].append(model.metrics[metric][split])
                elif not featurelist_lengths:
                    featurelist_performances[num_features].append(model.metrics[metric][split])
        elif starting_featurelist and model.featurelist_id == starting_featurelist.id: # the model was built with the starting featurelist
            if model.metrics[metric][split] is not None: # if the model has no score for this split, don't add the metric
                if featurelist_lengths and num_starting_featurelist_features in featurelist_lengths:
                    featurelist_performances[num_starting_featurelist_features].append(model.metrics[metric][split])
                elif not featurelist_lengths:
                    featurelist_performances[num_starting_featurelist_features].append(model.metrics[metric][split])
    # pad the performance lists with Nones so that they are all the same length
    max_len = max((len(v) for v in featurelist_performances.values()), default=0)
    for key in featurelist_performances:
        featurelist_performances[key].extend([None] * (max_len - len(featurelist_performances[key])))
featurelist_performances_df = pd.DataFrame(featurelist_performances)[sorted(featurelist_performances.keys())[::-1]]
with plt.style.context('tableau-colorblind10'):
plt.ylabel(f'{split} {metric}')
plt.xlabel('Number of Features')
plt.title(f'{project.project_name} - {featurelist_prefix}\nParsimonious Model Performance')
sb.boxplot(data=featurelist_performances_df)
return featurelist_performances_df

def feature_performance_stackplot(project: dr.Project,
featurelist_prefix: str = 'RAPA Reduced to',
starting_featurelist: str = None,
feature_impact_metric: str = 'median',
metric: str = None,
vlines: bool = False):
"""Utilizes `matplotlib.pyplot.stackplot` to show feature performance during
parsimony analysis.
:Parameters:
----------
project: datarobot.Project
        Either a datarobot project, or a string of its id or name
featurelist_prefix: str, optional (default = 'RAPA Reduced to')
The desired prefix for the featurelists that will be used for plotting feature performance. Each featurelist
will start with the prefix, include a space, and then end with the number of features in that featurelist
starting_featurelist: str, optional (default = None)
The starting featurelist used for parsimony analysis. If None, only
the featurelists with the desired prefix in `featurelist_prefix` will be plotted
    feature_impact_metric: str, optional (default = 'median')
Which metric to use when finding the most representative feature importance of all models in the featurelist
Options:
* median
* mean
* cumulative
    metric: str, optional (default = 'AUC' for classification, 'RMSE' for regression)
Which metric to use when finding feature importance of each model
vlines: bool, optional (default = False)
        Whether to add vertical lines at the featurelist lengths
:Returns:
----------
None TODO: return plot?
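    :Examples:
    ----------
    Illustrative sketch; assumes the same hypothetical, completed parsimony analysis
    as in parsimony_performance_boxplot above

    >>> from rapa import utils
    >>> project = utils.find_project('my_parsimony_project')
    >>> utils.feature_performance_stackplot(project, feature_impact_metric='median', vlines=True)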
"""
# if `project` is a string, find the project
if type(project) is str:
project = find_project(project)
# if metric is missing, assume a metric
    if metric is None:
if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS:
# classification
metric = 'AUC'
elif project.target_type == dr.TARGET_TYPE.REGRESSION:
# regression
metric = 'RMSE'
if starting_featurelist:
        if isinstance(starting_featurelist, str):
starting_featurelist = get_featurelist(starting_featurelist, project)
datarobot_project_models = project.get_models() # get all the models in the provided project
    # collect feature impact values for every model built on a matching featurelist
    # (one with the featurelist prefix, or the starting featurelist if one was provided)
    all_feature_importances = {}
    for model in datarobot_project_models:
        if model.featurelist_name is None: # skip models that have no featurelist name
            continue
        uses_prefix = model.featurelist_name.startswith(featurelist_prefix)
        uses_starting = starting_featurelist is not None and model.featurelist_id == starting_featurelist.id
        if (uses_prefix or uses_starting) and model.metrics[metric]['crossValidation'] is not None:
            featurelist_impacts = all_feature_importances.setdefault(model.featurelist_name, {})
            for x in model.get_or_request_feature_impact():
                featurelist_impacts.setdefault(x['featureName'], []).append(x['impactNormalized'])
for featurelist_name in all_feature_importances.keys():
for feature in all_feature_importances[featurelist_name].keys():
if feature_impact_metric.lower() == 'median':
all_feature_importances[featurelist_name][feature] = median(all_feature_importances[featurelist_name][feature])
elif feature_impact_metric.lower() == 'mean':
all_feature_importances[featurelist_name][feature] = mean(all_feature_importances[featurelist_name][feature])
elif feature_impact_metric.lower() == 'cumulative':
all_feature_importances[featurelist_name][feature] = sum(all_feature_importances[featurelist_name][feature])
else:
raise Exception(f'`feature_impact_metric` provided ({feature_impact_metric}) not accepted.\nOptions: \'median\', \'mean\', or \'cumulative\'')
# create 1d array of dimension N (x), and 2d array of dimension MxN (y) for stackplot
df = pd.DataFrame(all_feature_importances).replace({np.nan: 0})
    if starting_featurelist is not None: # rename starting_featurelist column to have the number of features
df = df.rename(columns={starting_featurelist.name: f'{starting_featurelist.name} {len(starting_featurelist.features)}'})
df = df/df.sum()
cols = [(int(x.split(' ')[-1].strip('()')), x) for x in list(df.columns)] # get a list of tuples where (# of features, column name)
cols = sorted(cols)[::-1] # sorted descending by first object in tuple (featurelist size)
x = []
y = []
for col in cols:
x.append(str(col[0]))
y.append(list(df[col[1]]))
y = np.array(y)
y = y.T
featurelist_lengths = sorted([int(x.split(' ')[-1].strip('()')) for x in df.columns])[::-1] # descending list of featurelist lengths
len_smallest_featurelist = min(featurelist_lengths)
smallest_featurelist = featurelist_prefix + ' (' + str(len_smallest_featurelist) + ')'
# if the length of the smallest featurelist is less than the number of features to label
# get a featurelist that has a length higher than the minimum features to label for labeling purposes
if len_smallest_featurelist < config.MIN_FEATURES_TO_LABEL:
len_smallest_featurelist = config.MIN_FEATURES_TO_LABEL
last_length = np.inf
for length in featurelist_lengths:
if length < len_smallest_featurelist:
break
last_length = length
smallest_featurelist = featurelist_prefix + ' (' + str(last_length) + ')'
    # build a mapping from old feature names (the DataFrame index) to new names: features ranked past the
    # labeling limits get an underscore prefix so that matplotlib leaves them out of the legend, while at
    # least the configured minimum number of features keeps its original name
    label_map = {x: '_' + str(x) if (i > config.MAX_FEATURES_TO_LABEL or i >= len_smallest_featurelist) else x
                 for i, x in enumerate(df.loc[:, smallest_featurelist].sort_values(ascending=False).index)}
    df = df.rename(index=label_map)
_, ax = plt.subplots(figsize=(config.FIG_SIZE[0], config.FIG_SIZE[1]/2))
plt.xlabel('Feature List Length')
plt.ylabel('Normalized Feature Impact\n(Normalized Impact Normalized)')
plt.title(f'{project.project_name} - {featurelist_prefix}\nFeature Impact Stackplot')
if vlines:
plt.vlines([z for z in range(1,len(x)-1)], ymin=0, ymax=1, linestyles='dashed')
ax.stackplot(x, y, labels=list(df.index), colors=plt.cm.tab20.colors)
ax.legend(loc='upper left')
return None