# Source code for mlflow.shap

import os
import tempfile
import types
import warnings
from contextlib import contextmanager
from typing import Any, Optional

import numpy as np
import yaml

import mlflow
import mlflow.utils.autologging_utils
from mlflow import pyfunc
from mlflow.models import Model, ModelInputExample, ModelSignature
from mlflow.models.model import MLMODEL_FILE_NAME
from mlflow.models.utils import _save_example
from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
from mlflow.tracking.artifact_utils import _download_artifact_from_uri
from mlflow.utils.docstring_utils import LOG_MODEL_PARAM_DOCS, format_docstring
from mlflow.utils.environment import (
    _CONDA_ENV_FILE_NAME,
    _CONSTRAINTS_FILE_NAME,
    _PYTHON_ENV_FILE_NAME,
    _REQUIREMENTS_FILE_NAME,
    _get_pip_deps,
    _mlflow_conda_env,
    _process_conda_env,
    _process_pip_requirements,
    _PythonEnv,
    _validate_env_arguments,
)
from mlflow.utils.file_utils import write_to
from mlflow.utils.model_utils import (
    _add_code_from_conf_to_system_path,
    _get_flavor_configuration,
    _validate_and_copy_code_paths,
    _validate_and_prepare_target_save_path,
)
from mlflow.utils.requirements_utils import _get_package_name
from mlflow.utils.uri import append_to_uri_path

# Key under which this flavor's configuration is added to, and read back from, the MLflow model.
FLAVOR_NAME = "shap"

# Upper bound on the number of background rows requested from ``shap.kmeans`` in
# ``log_explanation``.
_MAXIMUM_BACKGROUND_DATA_SIZE = 100
# Artifact directory used by ``log_explanation`` when the caller does not supply one.
_DEFAULT_ARTIFACT_PATH = "model_explanations_shap"
# File names of the three artifacts written by ``log_explanation``.
_SUMMARY_BAR_PLOT_FILE_NAME = "summary_bar_plot.png"
_BASE_VALUES_FILE_NAME = "base_values.npy"
_SHAP_VALUES_FILE_NAME = "shap_values.npy"
# Value returned by ``get_underlying_model_flavor`` when the wrapped model is not recognised.
_UNKNOWN_MODEL_FLAVOR = "unknown"
# Sub-directory of a saved explainer that holds the MLflow-serialized underlying model.
_UNDERLYING_MODEL_SUBPATH = "underlying_model"


def get_underlying_model_flavor(model):
    """
    Find the underlying model's flavor.

    Args:
        model: Underlying model of the explainer (callers pass ``explainer.model``).

    Returns:
        ``mlflow.sklearn.FLAVOR_NAME`` when ``model.inner_model`` is a bound method of a
        scikit-learn estimator, ``mlflow.pytorch.FLAVOR_NAME`` when it is a ``torch.nn.Module``,
        and ``"unknown"`` in every other case (including when the relevant library is not
        installed).
    """
    # Only wrapped models, i.e. those exposing ``inner_model``, can be inspected.
    if not hasattr(model, "inner_model"):
        return _UNKNOWN_MODEL_FLAVOR

    unwrapped_model = model.inner_model

    # A bound method (e.g. ``estimator.predict``): inspect the object it is bound to.
    if isinstance(unwrapped_model, types.MethodType):
        try:
            from sklearn.base import BaseEstimator

            if isinstance(unwrapped_model.__self__, BaseEstimator):
                return mlflow.sklearn.FLAVOR_NAME
        except ImportError:
            # Without scikit-learn installed the model cannot be a scikit-learn estimator.
            pass

    try:
        import torch

        if isinstance(unwrapped_model, torch.nn.Module):
            return mlflow.pytorch.FLAVOR_NAME
    except ImportError:
        # Without torch installed the model cannot be a torch module.
        pass

    return _UNKNOWN_MODEL_FLAVOR


def get_default_pip_requirements():
    """
    A list of default pip requirements for MLflow Models produced by this flavor. Calls to
    :func:`save_explainer()` and :func:`log_explainer()` produce a pip environment that, at
    minimum, contains these requirements.

    Returns:
        A single-element list pinning ``shap`` to the version installed in the current
        environment.
    """
    # Imported lazily so that importing this module does not require shap.
    import shap

    return [f"shap=={shap.__version__}"]


def get_default_conda_env():
    """
    Returns:
        The default Conda environment for MLflow Models produced by calls to
        :func:`save_explainer()` and :func:`log_explainer()`. It is built by
        ``_mlflow_conda_env`` with :func:`get_default_pip_requirements()` as the additional
        pip dependencies.
    """
    return _mlflow_conda_env(additional_pip_deps=get_default_pip_requirements())


def _load_pyfunc(path):
    """
    Build the PyFunc implementation for a saved explainer. Called by ``pyfunc.load_model``.

    Args:
        path: Location handed to ``_SHAPWrapper``, which reads the ``shap`` flavor
            configuration from it.

    Returns:
        A ``_SHAPWrapper`` instance.
    """
    wrapper = _SHAPWrapper(path)
    return wrapper


@contextmanager
def _log_artifact_contextmanager(out_file, artifact_path=None):
    """
    Context manager that hands out a temporary file path and logs that file on exit.

    The caller writes to the yielded path inside the ``with`` body; once the body finishes
    without raising, the file is logged with ``mlflow.log_artifact`` and the temporary
    directory is removed.

    Args:
        out_file: File name to use inside the temporary directory.
        artifact_path: Forwarded to ``mlflow.log_artifact`` as the artifact path.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        local_file = os.path.join(scratch_dir, out_file)
        yield local_file
        # Only reached when the ``with`` body completed normally.
        mlflow.log_artifact(local_file, artifact_path)


def _log_numpy(numpy_obj, out_file, artifact_path=None):
    """
    Log a numpy object as an artifact.

    Args:
        numpy_obj: Object to serialize with ``np.save``.
        out_file: File name of the logged artifact.
        artifact_path: Forwarded to ``mlflow.log_artifact`` as the artifact path.
    """
    with _log_artifact_contextmanager(out_file, artifact_path) as tmp_path:
        # Write through an open handle: when given a path, ``np.save`` appends ".npy" to names
        # that lack the suffix, which would leave ``tmp_path`` itself missing when it is logged.
        with open(tmp_path, "wb") as tmp_file:
            np.save(tmp_file, numpy_obj)


def _log_matplotlib_figure(fig, out_file, artifact_path=None):
    """
    Save a matplotlib figure to a temporary file and log it as an artifact.

    Args:
        fig: Figure object; its ``savefig`` method is called with the temporary path.
        out_file: File name of the logged artifact.
        artifact_path: Forwarded to ``mlflow.log_artifact`` as the artifact path.
    """
    with _log_artifact_contextmanager(out_file, artifact_path) as figure_file:
        fig.savefig(figure_file)


def _get_conda_env_for_underlying_model(underlying_model_path):
    """
    Read the ``conda.yaml`` stored next to the underlying model.

    Args:
        underlying_model_path: Directory containing the saved underlying model.

    Returns:
        The parsed content of ``conda.yaml`` as returned by ``yaml.safe_load``.
    """
    conda_yaml_path = os.path.join(underlying_model_path, "conda.yaml")
    with open(conda_yaml_path) as conda_yaml:
        parsed_env = yaml.safe_load(conda_yaml)
    return parsed_env


def log_explanation(predict_function, features, artifact_path=None):
    r"""
    Given a ``predict_function`` capable of computing ML model output on the provided ``features``,
    computes and logs explanations of an ML model's output. Explanations are logged as a directory
    of artifacts containing the following items generated by `SHAP`_ (SHapley Additive
    exPlanations).

        - Base values
        - SHAP values (computed using `shap.KernelExplainer`_)
        - Summary bar plot (shows the average impact of each feature on model output)

    Args:
        predict_function:
            A function to compute the output of a model (e.g. ``predict_proba`` method of
            scikit-learn classifiers). Must have the following signature:

            .. code-block:: python

                def predict_function(X) -> pred: ...

            - ``X``: An array-like object whose shape should be (# samples, # features).
            - ``pred``: An array-like object whose shape should be (# samples) for a regressor or
              (# samples, # classes) for a classifier. For a classifier, the values in ``pred``
              should correspond to the predicted probability of each class.

            Acceptable array-like object types:

                - ``numpy.array``
                - ``pandas.DataFrame``
                - ``shap.common.DenseData``
                - ``scipy.sparse matrix``

        features:
            A matrix of features to compute SHAP values with. The provided features should
            have shape (# samples, # features), and can be either of the array-like object
            types listed above.

            .. note::
                Background data for `shap.KernelExplainer`_ is generated by subsampling ``features``
                with `shap.kmeans`_. The background data size is limited to 100 rows for performance
                reasons.

        artifact_path:
            The run-relative artifact path to which the explanation is saved.
            If unspecified, defaults to "model_explanations_shap".

    Returns:
        Artifact URI of the logged explanations.

    .. _SHAP: https://github.com/slundberg/shap

    .. _shap.KernelExplainer: https://shap.readthedocs.io/en/latest/generated
        /shap.KernelExplainer.html#shap.KernelExplainer

    .. _shap.kmeans: https://github.com/slundberg/shap/blob/v0.36.0/shap/utils/_legacy.py#L9

    .. code-block:: python
        :caption: Example

        import os

        import numpy as np
        import pandas as pd
        from sklearn.datasets import load_diabetes
        from sklearn.linear_model import LinearRegression

        import mlflow
        from mlflow import MlflowClient

        # prepare training data
        dataset = load_diabetes()
        X = pd.DataFrame(dataset.data[:50, :8], columns=dataset.feature_names[:8])
        y = dataset.target[:50]

        # train a model
        model = LinearRegression()
        model.fit(X, y)

        # log an explanation
        with mlflow.start_run() as run:
            mlflow.shap.log_explanation(model.predict, X)

        # list artifacts
        client = MlflowClient()
        artifact_path = "model_explanations_shap"
        artifacts = [x.path for x in client.list_artifacts(run.info.run_id, artifact_path)]
        print("# artifacts:")
        print(artifacts)

        # load back the logged explanation
        dst_path = client.download_artifacts(run.info.run_id, artifact_path)
        base_values = np.load(os.path.join(dst_path, "base_values.npy"))
        shap_values = np.load(os.path.join(dst_path, "shap_values.npy"))

        print("\n# base_values:")
        print(base_values)
        print("\n# shap_values:")
        print(shap_values[:3])

    .. code-block:: text
        :caption: Output

        # artifacts:
        ['model_explanations_shap/base_values.npy',
         'model_explanations_shap/shap_values.npy',
         'model_explanations_shap/summary_bar_plot.png']

        # base_values:
        20.502000000000002

        # shap_values:
        [[ 2.09975523  0.4746513   7.63759026  0.        ]
         [ 2.00883109 -0.18816665 -0.14419184  0.        ]
         [ 2.00891772 -0.18816665 -0.14419184  0.        ]]

    .. figure:: ../_static/images/shap-ui-screenshot.png

        Logged artifacts
    """
    # Heavy optional dependencies are imported lazily, only when an explanation is requested.
    import matplotlib.pyplot as plt
    import shap

    artifact_path = _DEFAULT_ARTIFACT_PATH if artifact_path is None else artifact_path
    # Autologging is switched off while the explainer runs and the artifacts are produced.
    with mlflow.utils.autologging_utils.disable_autologging():
        # Never ask k-means for more clusters than there are rows.
        background_data = shap.kmeans(features, min(_MAXIMUM_BACKGROUND_DATA_SIZE, len(features)))
        explainer = shap.KernelExplainer(predict_function, background_data)
        shap_values = explainer.shap_values(features)

        _log_numpy(explainer.expected_value, _BASE_VALUES_FILE_NAME, artifact_path)
        _log_numpy(shap_values, _SHAP_VALUES_FILE_NAME, artifact_path)

        # ``show=False`` keeps the plot on the current figure so it can be saved below.
        shap.summary_plot(shap_values, features, plot_type="bar", show=False)
        fig = plt.gcf()
        try:
            fig.tight_layout()
            _log_matplotlib_figure(fig, _SUMMARY_BAR_PLOT_FILE_NAME, artifact_path)
        finally:
            # Always release the figure, even when layout or logging fails.
            plt.close(fig)

    return append_to_uri_path(mlflow.active_run().info.artifact_uri, artifact_path)


@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
def log_explainer(
    explainer,
    artifact_path,
    serialize_model_using_mlflow=True,
    conda_env=None,
    code_paths=None,
    registered_model_name=None,
    signature: Optional[ModelSignature] = None,
    input_example: Optional[ModelInputExample] = None,
    await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
    pip_requirements=None,
    extra_pip_requirements=None,
    metadata=None,
):
    """
    Log an SHAP explainer as an MLflow artifact for the current run.

    Args:
        explainer: SHAP explainer to be saved.
        artifact_path: Run-relative artifact path.
        serialize_model_using_mlflow: When set to True, MLflow will extract the underlying
            model and serialize it as an MLmodel, otherwise it uses SHAP's internal serialization.
            Defaults to True. Currently MLflow serialization is only supported for models of
            'sklearn' or 'pytorch' flavors.
        conda_env: {{ conda_env }}
        code_paths: {{ code_paths }}
        registered_model_name: If given, create a model version under ``registered_model_name``,
            also creating a registered model if one with the given name does not exist.
        signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>` describes model input
            and output :py:class:`Schema <mlflow.types.Schema>`. The model signature can be
            :py:func:`inferred <mlflow.models.infer_signature>` from datasets with valid model input
            (e.g. the training dataset with target column omitted) and valid model output
            (e.g. model predictions generated on the training dataset), for example:

            .. code-block:: python

                from mlflow.models import infer_signature

                train = df.drop_column("target_label")
                predictions = ...  # compute model predictions
                signature = infer_signature(train, predictions)
        input_example: {{ input_example }}
        await_registration_for: Number of seconds to wait for the model version to finish
            being created and is in ``READY`` status. By default, the function waits for five
            minutes. Specify 0 or None to skip waiting.
        pip_requirements: {{ pip_requirements }}
        extra_pip_requirements: {{ extra_pip_requirements }}
        metadata: {{ metadata }}

    Returns:
        Whatever ``Model.log`` returns for the logged explainer.
    """
    # ``Model.log`` saves through ``flavor.save_model``, i.e. this module's ``save_explainer``.
    return Model.log(
        artifact_path=artifact_path,
        flavor=mlflow.shap,
        explainer=explainer,
        conda_env=conda_env,
        code_paths=code_paths,
        serialize_model_using_mlflow=serialize_model_using_mlflow,
        registered_model_name=registered_model_name,
        signature=signature,
        input_example=input_example,
        await_registration_for=await_registration_for,
        pip_requirements=pip_requirements,
        extra_pip_requirements=extra_pip_requirements,
        metadata=metadata,
    )


@format_docstring(LOG_MODEL_PARAM_DOCS.format(package_name=FLAVOR_NAME))
def save_explainer(
    explainer,
    path,
    serialize_model_using_mlflow=True,
    conda_env=None,
    code_paths=None,
    mlflow_model=None,
    signature: Optional[ModelSignature] = None,
    input_example: Optional[ModelInputExample] = None,
    pip_requirements=None,
    extra_pip_requirements=None,
    metadata=None,
):
    """
    Save a SHAP explainer to a path on the local file system. Produces an MLflow Model
    containing the following flavors:

        - :py:mod:`mlflow.shap`
        - :py:mod:`mlflow.pyfunc`

    Args:
        explainer: SHAP explainer to be saved.
        path: Local path where the explainer is to be saved.
        serialize_model_using_mlflow: When set to True, MLflow will extract the underlying
            model and serialize it as an MLmodel, otherwise it uses SHAP's internal serialization.
            Defaults to True. Currently MLflow serialization is only supported for models of
            'sklearn' or 'pytorch' flavors.
        conda_env: {{ conda_env }}
        code_paths: {{ code_paths }}
        mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
        signature: :py:class:`ModelSignature <mlflow.models.ModelSignature>` describes model input
            and output :py:class:`Schema <mlflow.types.Schema>`. The model signature can be
            :py:func:`inferred <mlflow.models.infer_signature>` from datasets with valid model input
            (e.g. the training dataset with target column omitted) and valid model output (e.g.
            model predictions generated on the training dataset), for example:

            .. code-block:: python

                from mlflow.models import infer_signature

                train = df.drop_column("target_label")
                predictions = ...  # compute model predictions
                signature = infer_signature(train, predictions)
        input_example: {{ input_example }}
        pip_requirements: {{ pip_requirements }}
        extra_pip_requirements: {{ extra_pip_requirements }}
        metadata: {{ metadata }}
    """
    import shap

    _validate_env_arguments(conda_env, pip_requirements, extra_pip_requirements)

    _validate_and_prepare_target_save_path(path)
    code_dir_subpath = _validate_and_copy_code_paths(code_paths, path)

    if mlflow_model is None:
        mlflow_model = Model()
    if signature is not None:
        mlflow_model.signature = signature
    if input_example is not None:
        _save_example(mlflow_model, input_example, path)
    if metadata is not None:
        mlflow_model.metadata = metadata

    underlying_model_flavor = None
    underlying_model_path = None
    serializable_by_mlflow = False

    # saving the underlying model if required
    if serialize_model_using_mlflow:
        underlying_model_flavor = get_underlying_model_flavor(explainer.model)

        if underlying_model_flavor == _UNKNOWN_MODEL_FLAVOR:
            warnings.warn(
                "Unable to serialize underlying model using MLflow, will use SHAP serialization"
            )
        else:
            serializable_by_mlflow = True  # prevents SHAP from serializing the underlying model
            underlying_model_path = os.path.join(path, _UNDERLYING_MODEL_SUBPATH)
            if underlying_model_flavor == mlflow.sklearn.FLAVOR_NAME:
                # ``inner_model`` is a bound method here; the estimator is the object it is
                # bound to.
                mlflow.sklearn.save_model(
                    explainer.model.inner_model.__self__, underlying_model_path
                )
            elif underlying_model_flavor == mlflow.pytorch.FLAVOR_NAME:
                mlflow.pytorch.save_model(explainer.model.inner_model, underlying_model_path)

    # saving the explainer object
    explainer_data_subpath = "explainer.shap"
    explainer_output_path = os.path.join(path, explainer_data_subpath)
    with open(explainer_output_path, "wb") as explainer_output_file_handle:
        if serializable_by_mlflow:
            # The underlying model was already saved by MLflow above, so SHAP must skip it.
            explainer.save(explainer_output_file_handle, model_saver=False)
        else:
            explainer.save(explainer_output_file_handle)

    pyfunc.add_to_model(
        mlflow_model,
        loader_module="mlflow.shap",
        model_path=explainer_data_subpath,
        underlying_model_flavor=underlying_model_flavor,
        conda_env=_CONDA_ENV_FILE_NAME,
        python_env=_PYTHON_ENV_FILE_NAME,
        code=code_dir_subpath,
    )

    mlflow_model.add_flavor(
        FLAVOR_NAME,
        shap_version=shap.__version__,
        serialized_explainer=explainer_data_subpath,
        underlying_model_flavor=underlying_model_flavor,
        code=code_dir_subpath,
    )

    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

    if conda_env is None:
        if pip_requirements is None:
            default_reqs = get_default_pip_requirements()
            # To ensure `_load_pyfunc` can successfully load the model during the dependency
            # inference, `mlflow_model.save` must be called beforehand to save an MLmodel file.
            inferred_reqs = mlflow.models.infer_pip_requirements(
                path,
                FLAVOR_NAME,
                fallback=default_reqs,
            )
            default_reqs = sorted(set(inferred_reqs).union(default_reqs))
        else:
            default_reqs = None
        conda_env, pip_requirements, pip_constraints = _process_pip_requirements(
            default_reqs,
            pip_requirements,
            extra_pip_requirements,
        )
    else:
        conda_env, pip_requirements, pip_constraints = _process_conda_env(conda_env)

    # When the underlying model was saved by MLflow, fold its environment into the explainer's
    # and recompute the pip requirements from the merged result.
    if underlying_model_path is not None:
        underlying_model_conda_env = _get_conda_env_for_underlying_model(underlying_model_path)
        conda_env = _merge_environments(conda_env, underlying_model_conda_env)
        pip_requirements = _get_pip_deps(conda_env)

    with open(os.path.join(path, _CONDA_ENV_FILE_NAME), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # Save `constraints.txt` if necessary
    if pip_constraints:
        write_to(os.path.join(path, _CONSTRAINTS_FILE_NAME), "\n".join(pip_constraints))

    # Save `requirements.txt`
    write_to(os.path.join(path, _REQUIREMENTS_FILE_NAME), "\n".join(pip_requirements))

    _PythonEnv.current().to_yaml(os.path.join(path, _PYTHON_ENV_FILE_NAME))


# Alias required by `Model.log`: `log_explainer` passes this module as the `flavor`, so
# `save_model` must exist here and refer to `save_explainer`.
save_model = save_explainer


def _get_conda_and_pip_dependencies(conda_env):
    """
    Extract conda and pip dependencies from conda environments

    Args:
        conda_env: Conda environment
    """

    conda_deps = []
    # NB: Set operations are required in case there are multiple references of MLflow as a
    # dependency to ensure that duplicate entries are not present in the final consolidated
    # dependency list.
    pip_deps_set = set()

    for dependency in conda_env["dependencies"]:
        if isinstance(dependency, dict) and dependency["pip"]:
            for pip_dependency in dependency["pip"]:
                if pip_dependency != "mlflow":
                    pip_deps_set.add(pip_dependency)
        else:
            package_name = _get_package_name(dependency)
            if package_name is not None and package_name not in ["python", "pip"]:
                conda_deps.append(dependency)

    return conda_deps, sorted(pip_deps_set)


def _union_lists(l1, l2):
    """
    Returns the union of two lists as a new list.
    """
    return list(dict.fromkeys(l1 + l2))


def _merge_environments(shap_environment, model_environment):
    """
    Combine the conda environment of the SHAP explainer with that of the underlying model.

    Args:
        shap_environment: SHAP conda environment.
        model_environment: Underlying model conda environment.

    Returns:
        The environment produced by ``_mlflow_conda_env`` from the merged channels, conda
        dependencies and pip dependencies of both inputs.
    """
    # Combine the channels of both environments, then drop "conda-forge": the original note
    # here says the default channel is added later in `_mlflow_conda_env`.
    all_channels = _union_lists(shap_environment["channels"], model_environment["channels"])
    extra_channels = [channel for channel in all_channels if channel != "conda-forge"]

    shap_conda, shap_pip = _get_conda_and_pip_dependencies(shap_environment)
    model_conda, model_pip = _get_conda_and_pip_dependencies(model_environment)

    return _mlflow_conda_env(
        additional_conda_deps=_union_lists(shap_conda, model_conda),
        additional_pip_deps=_union_lists(shap_pip, model_pip),
        additional_conda_channels=extra_channels,
    )


def load_explainer(model_uri):
    """
    Load a SHAP explainer from a local file or a run.

    Args:
        model_uri: The location, in URI format, of the MLflow model. For example:

            - ``/Users/me/path/to/local/model``
            - ``relative/path/to/local/model``
            - ``s3://my_bucket/path/to/model``
            - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
            - ``models:/<model_name>/<model_version>``
            - ``models:/<model_name>/<stage>``

            For more information about supported URI schemes, see
            `Referencing Artifacts <https://www.mlflow.org/docs/latest/concepts.html#
            artifact-locations>`_.

    Returns:
        A SHAP explainer.
    """
    explainer_path = _download_artifact_from_uri(artifact_uri=model_uri)
    flavor_conf = _get_flavor_configuration(model_path=explainer_path, flavor_name=FLAVOR_NAME)
    _add_code_from_conf_to_system_path(explainer_path, flavor_conf)
    explainer_artifacts_path = os.path.join(explainer_path, flavor_conf["serialized_explainer"])
    underlying_model_flavor = flavor_conf["underlying_model_flavor"]

    # ``model`` stays ``None`` unless the underlying model was saved in a flavor handled below;
    # in that case SHAP restores the model from the explainer file itself.
    model = None
    if underlying_model_flavor != _UNKNOWN_MODEL_FLAVOR:
        underlying_model_path = os.path.join(explainer_path, _UNDERLYING_MODEL_SUBPATH)
        if underlying_model_flavor == mlflow.sklearn.FLAVOR_NAME:
            # The explainer wraps the estimator's prediction method, not the estimator.
            model = mlflow.sklearn._load_pyfunc(underlying_model_path).predict
        elif underlying_model_flavor == mlflow.pytorch.FLAVOR_NAME:
            model = mlflow.pytorch._load_model(os.path.join(underlying_model_path, "data"))

    return _load_explainer(explainer_file=explainer_artifacts_path, model=model)


def _load_explainer(explainer_file, model=None):
    """
    Deserialize a SHAP explainer from a file on the local file system.

    Args:
        explainer_file: Local filesystem path of the serialized explainer file.
        model: Model to override underlying explainer model. When given, it is returned by the
            ``model_loader`` handed to ``shap.Explainer.load``; when ``None``, SHAP loads the
            explainer with its default model loading.

    Returns:
        The object returned by ``shap.Explainer.load``.
    """
    import shap

    with open(explainer_file, "rb") as serialized:
        if model is not None:
            # The loader ignores the stream and hands back the supplied model.
            return shap.Explainer.load(serialized, model_loader=lambda _in_file: model)
        return shap.Explainer.load(serialized)


class _SHAPWrapper:
    def __init__(self, path):
        flavor_conf = _get_flavor_configuration(model_path=path, flavor_name=FLAVOR_NAME)
        shap_explainer_artifacts_path = os.path.join(path, flavor_conf["serialized_explainer"])
        underlying_model_flavor = flavor_conf["underlying_model_flavor"]
        model = None
        if underlying_model_flavor != _UNKNOWN_MODEL_FLAVOR:
            underlying_model_path = os.path.join(path, _UNDERLYING_MODEL_SUBPATH)
            if underlying_model_flavor == mlflow.sklearn.FLAVOR_NAME:
                model = mlflow.sklearn._load_pyfunc(underlying_model_path).predict
            elif underlying_model_flavor == mlflow.pytorch.FLAVOR_NAME:
                model = mlflow.pytorch._load_model(os.path.join(underlying_model_path, "data"))

        self.explainer = _load_explainer(explainer_file=shap_explainer_artifacts_path, model=model)

    def get_raw_model(self):
        """
        Returns the underlying model.
        """
        return self.explainer

    def predict(
        self,
        dataframe,
        params: Optional[dict[str, Any]] = None,
    ):
        """
        Args:
            dataframe: Model input data.
            params: Additional parameters to pass to the model for inference.

        Returns:
            Model predictions.
        """
        return self.explainer(dataframe.values).values