Source code for rapa.utils

from . import config

import datarobot as dr
from datarobot.errors import ClientError

import pickle

import logging
from warnings import warn
from warnings import catch_warnings
from datarobot.models import featurelist

import pandas as pd
import numpy as np
from statistics import mean
from statistics import median

from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sb

LOGGER = logging.getLogger(__name__)
    

def find_project(project: str) -> dr.Project:
    """Uses the DataRobot API to find a current project.

    Uses datarobot.Project.get() and dr.Project.list() to test if 'project' is either an id
    or possibly a name of a project in DataRobot, then returns the project found.

    :Parameters:
    ----------
        project: str
            Either a project id or a search term for project name

    :Returns:
    ----------
        datarobot.Project
            A datarobot project object that is either the project with the id provided, or the
            first/only project returned by searching by project name. Raises an Exception if no
            project is found.
    """
    project = str(project) # make sure the project id/name provided is a string
    return_project = None
    try: # try finding project with id
        return_project = dr.Project.get(project_id=project)
        return return_project
    except ClientError: # an id was not provided, most likely a name
        project_list = dr.Project.list(search_params={'project_name': project})
        if len(project_list) == 0: # no projects found, possibly a wrong id/name
            raise Exception(f"No projects found with id/string of \'{project}\'")
        elif len(project_list) == 1: # found exactly one project with the search, good
            return project_list[0]
        else: # more than one project was found
            warn(f"Returning the first of multiple projects with \'{project}\': {project_list}")
            return project_list[0]
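
# Example usage (illustrative sketch, not part of the original module): assuming the DataRobot
# client is already initialized, `find_project` accepts either a project id or a name search
# term. The id and name below are hypothetical placeholders.
#
#     project = find_project('62f5b2f7a3d8e1c9b4a0d123')      # by project id
#     project = find_project('Breast Cancer Classification')  # by project name search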
def get_best_model(project: dr.Project,
                   featurelist_prefix: str = None,
                   starred: bool = False,
                   metric: str = None,
                   fold: str = 'crossValidation',
                   highest: bool = None) -> dr.Model:
    """Attempts to find the 'best' model in a DataRobot project by searching the cross
    validation scores of all the models in the supplied project.

    # TODO: make a dictionary of which metrics to minimize/maximize; currently assumes higher = better unless `highest` is provided

    .. warning::
        Actually finding the 'best' model takes more than averaging cross validation scores,
        and it is suggested that the 'best' model is decided and starred in DataRobot.
        (Make sure 'starred = True' if starring the 'best' model)

    .. note::
        Some models may not have scores for the supplied fold because they were not run.
        These models are ignored by this function. Make sure all models of interest have
        scores for the fold being provided if those models should be considered.

    :Parameters:
    ----------
        project: datarobot.Project
            The project object that will be searched for the 'best' model

        featurelist_prefix: str, optional (default = None)
            The desired featurelist prefix used to search for models created with specific rapa featurelists

        starred: bool, optional (default = False)
            If True, return the starred model. If there is more than one starred model,
            warn the user and return the 'best' one

        metric: str, optional (default = 'AUC' or 'RMSE') [classification and regression]
            What model metric to use when finding the 'best' model

        fold: str, optional (default = 'crossValidation')
            The fold of data used in DataRobot. Options are as follows:
            ['validation', 'crossValidation', 'holdout', 'training', 'backtestingScores', 'backtesting']

        highest: bool, optional (default for classification = True, default for regression = False)
            Whether to take the highest value (highest = True) or the lowest value (highest = False).
            Change this when the direction assumed for the chosen metric is incorrect.

    :Returns:
    ----------
        datarobot.Model
            A datarobot model that is either the 'best', starred, or the 'best' of the starred
            models from the provided datarobot project
    """
    # if metric is missing, assume a metric
    if metric == None:
        if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS: # classification
            metric = 'AUC'
        elif project.target_type == dr.TARGET_TYPE.REGRESSION: # regression
            metric = 'RMSE'

    # if highest is missing, assume a direction
    if highest == None:
        if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS: # classification
            highest = True
        elif project.target_type == dr.TARGET_TYPE.REGRESSION: # regression
            highest = False

    scores = []

    #### get scores
    # set featurelist_prefix to '' for ease of use in code
    if featurelist_prefix == None:
        featurelist_prefix = ''

    if not starred: # the model(s) is/are not starred
        for model in project.get_models():
            current_model_score = model.metrics[metric][fold] # get the score for the metric and fold
            if model.featurelist_name.startswith(featurelist_prefix) and current_model_score: # if the model is scored in this fold and was created with the featurelist we are looking at
                scores.append((current_model_score, model)) # add the model score and model object to a list
    else: # the model(s) is/are starred
        for model in project.get_models():
            current_model_score = model.metrics[metric][fold] # get the score for the metric and fold
            if model.is_starred and model.featurelist_name.startswith(featurelist_prefix) and current_model_score: # if the model is starred, scored in this fold, and created with the featurelist we are looking at
                scores.append((current_model_score, model)) # add the model score and model object to a list

    #### find the best scores
    # check that there are any models
    if len(scores) > 1: # multiple models
        return sorted(scores, key=lambda tup: tup[0], reverse=highest)[0][1] # sort by the first item in the tuples (the score)
    elif len(scores) == 1: # one model
        return scores[0][1]
    else: # no models
        raise Exception(f"No models found. \n Parameters: project=`{project}`, metric=`{metric}`, fold=`{fold}`, featurelist_prefix=`{featurelist_prefix}`, starred=`{starred}`, highest=`{highest}`")
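
# Example usage (illustrative sketch, not part of the original module): pick the 'best'
# cross-validated model among models built from rapa featurelists. The prefix and metric
# below mirror common defaults and are assumptions about a particular project.
#
#     best = get_best_model(project, featurelist_prefix='RAPA Reduced to',
#                           metric='AUC', fold='crossValidation', highest=True)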
# alias for get_best_model
def get_starred_model(project: dr.Project,
                      metric: str = None,
                      featurelist_prefix: str = None) -> dr.Model:
    """Alias for rapa.utils.get_best_model(), but with starred = True
    """
    return get_best_model(project, starred=True, metric=metric, featurelist_prefix=featurelist_prefix)
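
# Example usage (illustrative sketch, not part of the original module): if the 'best' model
# was already starred by hand in DataRobot, retrieve it directly.
#
#     starred = get_starred_model(project, featurelist_prefix='RAPA Reduced to')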
def initialize_dr_api(token_key: str = None,
                      file_path: str = 'data/dr-tokens.pkl',
                      endpoint: str = 'https://app.datarobot.com/api/v2'):
    """Initializes the DataRobot API with a pickled dictionary created by the user.

    .. warning::
        It is advised that the user keeps the pickled dictionary in an ignored directory
        if using GitHub (put the file in the .gitignore)

    Accesses a file that should be a pickled dictionary. This dictionary has the API token
    as the value to the provided token_key. Ex: {token_key: 'API_TOKEN'}

    :Parameters:
    ----------
        token_key: str
            The API token's key in the pickled dictionary located in file_path

        file_path: str, optional (default = 'data/dr-tokens.pkl')
            Path to the pickled dictionary containing the API token

        endpoint: str, optional (default = 'https://app.datarobot.com/api/v2')
            The endpoint is usually the URL you would use to log into the DataRobot Web User Interface
    """
    # load the pickled dictionary and initialize the API, catching FileNotFoundError, KeyError, and failed authentication warnings
    try:
        datarobot_tokens = pickle.load(open(file_path, 'rb'))
        with catch_warnings(record=True) as w: # appends warning to w if a warning occurs
            dr.Client(endpoint=endpoint, token=datarobot_tokens[token_key])
            if w: # a warning was caught (e.g., failed authentication)
                raise Exception(w[0].message)
    except FileNotFoundError:
        raise FileNotFoundError(f'The file {file_path} does not exist.') # TODO: Make a tutorial on how to create the pickled dictionary with api tokens and link here
    except KeyError:
        raise KeyError(f'\'{token_key}\' is not in the dictionary at \'{file_path}\'')
    # TODO: I probably didn't catch all errors, make tests for this

    print(f'DataRobot API initialized with endpoint \'{endpoint}\'')
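
# Example usage (illustrative sketch, not part of the original module): the pickled dictionary
# maps a user-chosen key to the API token. It could be created once, e.g.
#
#     pickle.dump({'tutorial': 'YOUR_API_TOKEN'}, open('data/dr-tokens.pkl', 'wb'))
#
# and then loaded on each run with
#
#     initialize_dr_api('tutorial')
#
# The key name 'tutorial' and the token value are placeholders, not part of the library.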
def get_featurelist(featurelist: str, project: dr.Project) -> dr.Featurelist:
    """Uses the DataRobot API to search for a desired featurelist.

    Uses datarobot.Project.get_featurelists() to retrieve all the featurelists in the project.
    Then, it searches the list for ids, and if it doesn't find any, it searches the list again
    for names. Returns the first featurelist it finds.

    :Parameters:
    ----------
        featurelist: str
            Either a featurelist id or a search term for featurelist name

        project: datarobot.Project
            The project that is being searched for the featurelist

    :Returns:
    ----------
        datarobot.Featurelist
            The featurelist that was found. Raises an Exception if no featurelist is found
    """
    featurelist = str(featurelist) # cast to string in case the id is an int or something
    featurelists = project.get_featurelists()
    dr_featurelist = [x for x in featurelists if featurelist == x.id] # get all featurelists that match `featurelist` (assuming it is an id)
    if dr_featurelist: # if dr_featurelist is not empty
        return dr_featurelist[0] # there should only be one id
    else: # if dr_featurelist is empty
        dr_featurelist = [x for x in featurelists if featurelist.lower() in str(x.name).lower()] # use python's `in` to search names
        if not dr_featurelist: # if dr_featurelist is empty
            raise Exception(f"No featurelists were found with the id/name of \'{featurelist}\'")
        elif len(dr_featurelist) > 1: # if dr_featurelist has more than one match
            warn(f'More than one featurelist was found: \'{dr_featurelist}\', returning the first.')
            return dr_featurelist[0]
        else: # dr_featurelist has exactly one match
            return dr_featurelist[0]
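
# Example usage (illustrative sketch, not part of the original module): search a project's
# featurelists by id or by case-insensitive name substring. The name below is hypothetical.
#
#     fl = get_featurelist('Informative Features', project)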
def parsimony_performance_boxplot(project: dr.Project,
                                  featurelist_prefix: str = 'RAPA Reduced to',
                                  starting_featurelist: str = None,
                                  metric: str = None,
                                  split: str = 'crossValidation',
                                  featurelist_lengths: list = None):
    """Uses `seaborn`'s `boxplot` function to plot featurelist size vs. performance for all
    models that use the given featurelist prefix. There is a different boxplot for each
    featurelist length. # TODO: warn about multiple prefixes, try to use new prefixes

    :Parameters:
    ----------
        project: datarobot.Project
            Either a datarobot project, or a string of its id or name

        featurelist_prefix: str, optional (default = 'RAPA Reduced to')
            The desired prefix for the featurelists that will be used for plotting parsimony
            performance. Each featurelist will start with the prefix, include a space, and then
            end with the number of features in that featurelist

        starting_featurelist: str, optional (default = None)
            The starting featurelist used for parsimony analysis. If None, only the featurelists
            with the desired prefix in `featurelist_prefix` will be plotted

        metric: str, optional (default = 'AUC' or 'RMSE') [classification and regression]
            The metric used for plotting accuracy of models

        split: str, optional (default = 'crossValidation')
            Which split's performance to take from. Can be: ['crossValidation', 'holdout']
            TODO: i think it can be more, double check

        featurelist_lengths: list, optional (default = None)
            A list of featurelist lengths to plot

    :Returns:
    ----------
        pandas.DataFrame
            The featurelist length vs. performance dataframe used for the boxplot
            (TODO: also return the plot?)
    """
    # if `project` is a string, find the project
    if type(project) is str:
        project = find_project(project)

    # if metric is missing, assume a metric
    if metric == None:
        if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS: # classification
            metric = 'AUC'
        elif project.target_type == dr.TARGET_TYPE.REGRESSION: # regression
            metric = 'RMSE'

    datarobot_project_models = project.get_models() # get all the models in the provided project

    if starting_featurelist:
        if type(starting_featurelist) == str:
            starting_featurelist = get_featurelist(starting_featurelist, project)
        num_starting_featurelist_features = len(starting_featurelist.features)

    featurelist_performances = defaultdict(list)
    for model in datarobot_project_models: # for every model, if the model has the prefix, add its performance
        if model.featurelist_name != None and featurelist_prefix in model.featurelist_name:
            num_features = int(model.featurelist_name.split(' ')[-1].strip('()')) # parse the number of features from the featurelist name
            if model.metrics[metric][split] != None: # if there is no score for the model/split, don't add the metric
                if featurelist_lengths and num_features in featurelist_lengths:
                    featurelist_performances[num_features].append(model.metrics[metric][split])
                elif not featurelist_lengths:
                    featurelist_performances[num_features].append(model.metrics[metric][split])
        elif starting_featurelist and model.featurelist_id == starting_featurelist.id: # starting featurelist
            if model.metrics[metric][split] != None: # if there is no score for the model/split, don't add the metric
                if featurelist_lengths and num_starting_featurelist_features in featurelist_lengths:
                    featurelist_performances[num_starting_featurelist_features].append(model.metrics[metric][split])
                elif not featurelist_lengths:
                    featurelist_performances[num_starting_featurelist_features].append(model.metrics[metric][split])

    # pad the shorter lists with Nones so that all the arrays are the same length
    m = 0
    for key in featurelist_performances:
        m = max(m, len(featurelist_performances[key]))
    for key in featurelist_performances:
        temp_len = len(featurelist_performances[key])
        for _ in range(m - temp_len):
            featurelist_performances[key].append(None)

    featurelist_performances_df = pd.DataFrame(featurelist_performances)[sorted(featurelist_performances.keys())[::-1]]

    with plt.style.context('tableau-colorblind10'):
        plt.ylabel(f'{split} {metric}')
        plt.xlabel('Number of Features')
        plt.title(f'{project.project_name} - {featurelist_prefix}\nParsimonious Model Performance')
        sb.boxplot(data=featurelist_performances_df)

    return featurelist_performances_df
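
# Example usage (illustrative sketch, not part of the original module): plot cross-validation
# performance for each reduced featurelist size, optionally restricting to specific lengths.
# The lengths shown are hypothetical.
#
#     perf_df = parsimony_performance_boxplot(project,
#                                             featurelist_prefix='RAPA Reduced to',
#                                             featurelist_lengths=[25, 20, 15, 10, 5])
#     plt.show()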
def feature_performance_stackplot(project: dr.Project,
                                  featurelist_prefix: str = 'RAPA Reduced to',
                                  starting_featurelist: str = None,
                                  feature_impact_metric: str = 'median',
                                  metric: str = None,
                                  vlines: bool = False):
    """Utilizes `matplotlib.pyplot.stackplot` to show feature performance during parsimony analysis.

    :Parameters:
    ----------
        project: datarobot.Project
            Either a datarobot project, or a string of its id or name

        featurelist_prefix: str, optional (default = 'RAPA Reduced to')
            The desired prefix for the featurelists that will be used for plotting feature
            performance. Each featurelist will start with the prefix, include a space, and then
            end with the number of features in that featurelist

        starting_featurelist: str, optional (default = None)
            The starting featurelist used for parsimony analysis. If None, only the featurelists
            with the desired prefix in `featurelist_prefix` will be plotted

        feature_impact_metric: str, optional (default = 'median')
            Which metric to use when finding the most representative feature importance of all
            models in the featurelist

            Options:
                * median
                * mean
                * cumulative

        metric: str, optional (default = 'AUC' or 'RMSE') [classification and regression]
            Which metric to use when finding feature importance of each model

        vlines: bool, optional (default = False)
            Whether to add vertical lines at the featurelist lengths or not, False by default

    :Returns:
    ----------
        None
        TODO: return plot?
    """
    # if `project` is a string, find the project
    if type(project) is str:
        project = find_project(project)

    # if metric is missing, assume a metric
    if metric == None:
        if project.target_type == dr.TARGET_TYPE.BINARY or project.target_type == dr.TARGET_TYPE.MULTICLASS: # classification
            metric = 'AUC'
        elif project.target_type == dr.TARGET_TYPE.REGRESSION: # regression
            metric = 'RMSE'

    if starting_featurelist:
        if type(starting_featurelist) == str:
            starting_featurelist = get_featurelist(starting_featurelist, project)

    datarobot_project_models = project.get_models() # get all the models in the provided project

    if starting_featurelist != None: # include the starting featurelist as well
        all_feature_importances = {}
        for model in datarobot_project_models:
            if model.featurelist_name != None and (model.featurelist_name.startswith(featurelist_prefix) or model.featurelist_id == starting_featurelist.id): # if the model uses the starting featurelist/featurelist prefix
                if model.metrics[metric]['crossValidation'] != None:
                    if model.featurelist_name in all_feature_importances.keys():
                        for x in model.get_or_request_feature_impact():
                            if x['featureName'] in all_feature_importances[model.featurelist_name].keys():
                                all_feature_importances[model.featurelist_name][x['featureName']].append(x['impactNormalized'])
                            else:
                                all_feature_importances[model.featurelist_name][x['featureName']] = [x['impactNormalized']]
                    else:
                        all_feature_importances[model.featurelist_name] = {}
                        for x in model.get_or_request_feature_impact():
                            all_feature_importances[model.featurelist_name][x['featureName']] = [x['impactNormalized']]
    else: # same as above, but without the starting featurelist
        all_feature_importances = {}
        for model in datarobot_project_models:
            if model.featurelist_name.startswith(featurelist_prefix): # if the model's featurelist starts with the featurelist prefix
                if model.metrics[metric]['crossValidation'] != None:
                    if model.featurelist_name in all_feature_importances.keys():
                        for x in model.get_or_request_feature_impact():
                            if x['featureName'] in all_feature_importances[model.featurelist_name].keys():
                                all_feature_importances[model.featurelist_name][x['featureName']].append(x['impactNormalized'])
                            else:
                                all_feature_importances[model.featurelist_name][x['featureName']] = [x['impactNormalized']]
                    else:
                        all_feature_importances[model.featurelist_name] = {}
                        for x in model.get_or_request_feature_impact():
                            all_feature_importances[model.featurelist_name][x['featureName']] = [x['impactNormalized']]

    # reduce each feature's list of impacts to a single representative value
    for featurelist_name in all_feature_importances.keys():
        for feature in all_feature_importances[featurelist_name].keys():
            if feature_impact_metric.lower() == 'median':
                all_feature_importances[featurelist_name][feature] = median(all_feature_importances[featurelist_name][feature])
            elif feature_impact_metric.lower() == 'mean':
                all_feature_importances[featurelist_name][feature] = mean(all_feature_importances[featurelist_name][feature])
            elif feature_impact_metric.lower() == 'cumulative':
                all_feature_importances[featurelist_name][feature] = sum(all_feature_importances[featurelist_name][feature])
            else:
                raise Exception(f'`feature_impact_metric` provided ({feature_impact_metric}) not accepted.\nOptions: \'median\', \'mean\', or \'cumulative\'')

    # create a 1d array of dimension N (x), and a 2d array of dimension MxN (y) for stackplot
    df = pd.DataFrame(all_feature_importances).replace({np.nan: 0})
    if starting_featurelist != None: # rename the starting_featurelist column to include the number of features
        df = df.rename(columns={starting_featurelist.name: f'{starting_featurelist.name} {len(starting_featurelist.features)}'})
    df = df/df.sum()
    cols = [(int(x.split(' ')[-1].strip('()')), x) for x in list(df.columns)] # get a list of tuples of (# of features, column name)
    cols = sorted(cols)[::-1] # sorted descending by the first object in the tuple (featurelist size)
    x = []
    y = []
    for col in cols:
        x.append(str(col[0]))
        y.append(list(df[col[1]]))
    y = np.array(y)
    y = y.T

    featurelist_lengths = sorted([int(x.split(' ')[-1].strip('()')) for x in df.columns])[::-1] # descending list of featurelist lengths
    len_smallest_featurelist = min(featurelist_lengths)
    smallest_featurelist = featurelist_prefix + ' (' + str(len_smallest_featurelist) + ')'
    # if the length of the smallest featurelist is less than the number of features to label,
    # get a featurelist that has a length higher than the minimum features to label for labeling purposes
    if len_smallest_featurelist < config.MIN_FEATURES_TO_LABEL:
        len_smallest_featurelist = config.MIN_FEATURES_TO_LABEL
        last_length = np.inf
        for length in featurelist_lengths:
            if length < len_smallest_featurelist:
                break
            last_length = length
        smallest_featurelist = featurelist_prefix + ' (' + str(last_length) + ')'

    # build a dictionary that maps old column names (features) to new column names (prefixed with
    # an underscore) for features past the labeling cutoff, so that the underscored names
    # are not shown in the legend
    labels = [{x: '_' + str(x)} if i > config.MAX_FEATURES_TO_LABEL or i >= len_smallest_featurelist else {x: x} for i, x in enumerate(df.loc[:, smallest_featurelist].sort_values(ascending=False).index)]
    l = {}
    for label in labels:
        l.update(label)
    df = df.rename(index=l)

    _, ax = plt.subplots(figsize=(config.FIG_SIZE[0], config.FIG_SIZE[1]/2))
    plt.xlabel('Feature List Length')
    plt.ylabel('Normalized Feature Impact\n(Normalized Impact Normalized)')
    plt.title(f'{project.project_name} - {featurelist_prefix}\nFeature Impact Stackplot')
    if vlines:
        plt.vlines([z for z in range(1, len(x)-1)], ymin=0, ymax=1, linestyles='dashed')
    ax.stackplot(x, y, labels=list(df.index), colors=plt.cm.tab20.colors)
    ax.legend(loc='upper left')
    return None
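
# Example usage (illustrative sketch, not part of the original module): visualize how each
# feature's normalized impact changes as the featurelist shrinks, using the median impact
# across models at each featurelist size.
#
#     feature_performance_stackplot(project,
#                                   featurelist_prefix='RAPA Reduced to',
#                                   feature_impact_metric='median',
#                                   vlines=True)
#     plt.show()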