Source code for mlflow.models.evaluation.validation

import logging
import operator
import os
from decimal import Decimal
from typing import Dict, Optional

from mlflow.exceptions import MlflowException
from mlflow.models.evaluation import EvaluationResult
from mlflow.protos.databricks_pb2 import BAD_REQUEST, INVALID_PARAMETER_VALUE
from mlflow.utils.annotations import deprecated

_logger = logging.getLogger(__name__)


class MetricThreshold:
    """
    This class allows you to define metric thresholds for model validation.
    Allowed thresholds are: threshold, min_absolute_change, min_relative_change.

    Args:
        threshold: (Optional) A number representing the value threshold for the metric.

            - If higher is better for the metric, the metric value has to be
              >= threshold to pass validation.
            - Otherwise, the metric value has to be <= threshold to pass the validation.

        min_absolute_change: (Optional) A positive number representing the minimum absolute
            change required for the candidate model to pass validation with the baseline model.

            - If higher is better for the metric, metric value has to be
              >= baseline model metric value + min_absolute_change to pass the validation.
            - Otherwise, metric value has to be
              <= baseline model metric value - min_absolute_change to pass the validation.

        min_relative_change: (Optional) A floating point number between 0 and 1 representing
            the minimum relative change (in percentage of baseline model metric value)
            for the candidate model to pass the comparison with the baseline model.

            - If higher is better for the metric, metric value has to be
              >= baseline model metric value * (1 + min_relative_change)
            - Otherwise, metric value has to be
              <= baseline model metric value * (1 - min_relative_change)
            - Note that if the baseline model metric value is equal to 0, the threshold falls
              back to performing a simple verification that the candidate metric value is
              better than the baseline metric value, i.e.
              metric value >= baseline model metric value + 1e-10 if higher is better;
              metric value <= baseline model metric value - 1e-10 if lower is better.

        greater_is_better: A required boolean representing whether a higher value is better
            for the metric.

        higher_is_better:
            .. deprecated:: 2.3.0
                Use ``greater_is_better`` instead.

            A required boolean representing whether a higher value is better for the metric.
    """

    def __init__(
        self,
        threshold=None,
        min_absolute_change=None,
        min_relative_change=None,
        greater_is_better=None,
        higher_is_better=None,
    ):
        if threshold is not None and type(threshold) not in {int, float}:
            raise MetricThresholdClassException("`threshold` parameter must be a number.")
        if min_absolute_change is not None and (
            type(min_absolute_change) not in {int, float} or min_absolute_change <= 0
        ):
            raise MetricThresholdClassException(
                "`min_absolute_change` parameter must be a positive number."
            )
        if min_relative_change is not None:
            if not isinstance(min_relative_change, float):
                raise MetricThresholdClassException(
                    "`min_relative_change` parameter must be a floating point number."
                )
            if min_relative_change < 0 or min_relative_change > 1:
                raise MetricThresholdClassException(
                    "`min_relative_change` parameter must be between 0 and 1."
                )
        if higher_is_better is None and greater_is_better is None:
            raise MetricThresholdClassException("`greater_is_better` parameter must be defined.")
        if higher_is_better is not None and greater_is_better is not None:
            raise MetricThresholdClassException(
                "`higher_is_better` parameter must be None when `greater_is_better` is defined."
            )
        if greater_is_better is None:
            greater_is_better = higher_is_better
        if not isinstance(greater_is_better, bool):
            raise MetricThresholdClassException("`greater_is_better` parameter must be a boolean.")
        if threshold is None and min_absolute_change is None and min_relative_change is None:
            raise MetricThresholdClassException("no threshold was specified.")
        self._threshold = threshold
        self._min_absolute_change = min_absolute_change
        self._min_relative_change = min_relative_change
        self._greater_is_better = greater_is_better

    @property
    def threshold(self):
        """
        Value of the threshold.
        """
        return self._threshold

    @property
    def min_absolute_change(self):
        """
        Value of the minimum absolute change required to pass model comparison with
        the baseline model.
        """
        return self._min_absolute_change

    @property
    def min_relative_change(self):
        """
        Float value of the minimum relative change required to pass model comparison with
        the baseline model.
        """
        return self._min_relative_change

    @property
    @deprecated("The attribute `higher_is_better` is deprecated. Use `greater_is_better` instead.")
    def higher_is_better(self):
        """
        Boolean value representing whether a higher value is better for the metric.
        """
        return self._greater_is_better

    @property
    def greater_is_better(self):
        """
        Boolean value representing whether a higher value is better for the metric.
        """
        return self._greater_is_better

    def __str__(self):
        """
        Returns a human-readable string consisting of all specified thresholds.
        """
        threshold_strs = []
        if self._threshold is not None:
            threshold_strs.append(f"Threshold: {self._threshold}.")
        if self._min_absolute_change is not None:
            threshold_strs.append(f"Minimum Absolute Change: {self._min_absolute_change}.")
        if self._min_relative_change is not None:
            threshold_strs.append(f"Minimum Relative Change: {self._min_relative_change}.")
        if self._greater_is_better is not None:
            if self._greater_is_better:
                threshold_strs.append("Higher value is better.")
            else:
                threshold_strs.append("Lower value is better.")
        return " ".join(threshold_strs)
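

# Illustrative sketch: how a MetricThreshold might be configured for an accuracy-style
# metric. The numbers and the helper name `_example_metric_threshold` are invented for
# illustration and are not part of the MLflow API.
def _example_metric_threshold():
    accuracy_threshold = MetricThreshold(
        threshold=0.8,  # candidate metric value must be >= 0.8
        min_absolute_change=0.05,  # and exceed the baseline by at least 0.05
        min_relative_change=0.05,  # and exceed the baseline by at least 5 percent
        greater_is_better=True,
    )
    # __str__ joins the configured thresholds into a human-readable summary:
    # "Threshold: 0.8. Minimum Absolute Change: 0.05. Minimum Relative Change: 0.05.
    #  Higher value is better."
    return str(accuracy_threshold)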

class MetricThresholdClassException(MlflowException):
    def __init__(self, _message, **kwargs):
        message = "Could not instantiate MetricThreshold class: " + _message
        super().__init__(message, error_code=INVALID_PARAMETER_VALUE, **kwargs)


class _MetricValidationResult:
    """
    Internal class for representing validation result per metric.
    Not user facing, used for organizing metric failures and generating failure messages
    more conveniently.

    Args:
        metric_name: String representing the metric name
        candidate_metric_value: value of metric for candidate model
        metric_threshold: :py:class:`MetricThreshold <mlflow.models.validation.MetricThreshold>`
            The MetricThreshold for the metric.
        baseline_metric_value: value of metric for baseline model
    """

    missing_candidate = False
    missing_baseline = False
    threshold_failed = False
    min_absolute_change_failed = False
    min_relative_change_failed = False

    def __init__(
        self,
        metric_name,
        candidate_metric_value,
        metric_threshold,
        baseline_metric_value=None,
    ):
        self.metric_name = metric_name
        self.candidate_metric_value = candidate_metric_value
        self.baseline_metric_value = baseline_metric_value
        self.metric_threshold = metric_threshold

    def __str__(self):
        """
        Returns a human-readable string representing the validation result for the metric.
        """
        if self.is_success():
            return f"Metric {self.metric_name} passed the validation."

        if self.missing_candidate:
            return (
                f"Metric validation failed: metric {self.metric_name} was missing from the "
                f"evaluation result of the candidate model."
            )

        result_strs = []

        if self.threshold_failed:
            result_strs.append(
                f"Metric {self.metric_name} value threshold check failed: "
                f"candidate model {self.metric_name} = {self.candidate_metric_value}, "
                f"{self.metric_name} threshold = {self.metric_threshold.threshold}."
            )

        if self.missing_baseline:
            result_strs.append(
                f"Model comparison failed: metric {self.metric_name} was missing from "
                f"the evaluation result of the baseline model."
            )
        else:
            if self.min_absolute_change_failed:
                result_strs.append(
                    f"Metric {self.metric_name} minimum absolute change check failed: "
                    f"candidate model {self.metric_name} = {self.candidate_metric_value}, "
                    f"baseline model {self.metric_name} = {self.baseline_metric_value}, "
                    f"{self.metric_name} minimum absolute change threshold = "
                    f"{self.metric_threshold.min_absolute_change}."
                )

            if self.min_relative_change_failed:
                result_strs.append(
                    f"Metric {self.metric_name} minimum relative change check failed: "
                    f"candidate model {self.metric_name} = {self.candidate_metric_value}, "
                    f"baseline model {self.metric_name} = {self.baseline_metric_value}, "
                    f"{self.metric_name} minimum relative change threshold = "
                    f"{self.metric_threshold.min_relative_change}."
                )

        return " ".join(result_strs)

    def is_success(self):
        return (
            not self.missing_candidate
            and not self.missing_baseline
            and not self.threshold_failed
            and not self.min_absolute_change_failed
            and not self.min_relative_change_failed
        )


class ModelValidationFailedException(MlflowException):
    def __init__(self, message, **kwargs):
        super().__init__(message, error_code=BAD_REQUEST, **kwargs)
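

# Illustrative sketch: how _MetricValidationResult turns a failed threshold check into a
# failure message. The metric name and values are invented; `_example_validation_result`
# is a hypothetical helper, not part of the module's API.
def _example_validation_result():
    threshold = MetricThreshold(threshold=0.8, greater_is_better=True)
    result = _MetricValidationResult("accuracy_score", 0.75, threshold)
    # Mark the value-threshold check as failed, as _validate would for 0.75 < 0.8.
    result.threshold_failed = True
    # str(result) then reads roughly:
    # "Metric accuracy_score value threshold check failed: candidate model
    #  accuracy_score = 0.75, accuracy_score threshold = 0.8."
    return str(result)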

def validate_evaluation_results(
    validation_thresholds: Dict[str, MetricThreshold],
    candidate_result: EvaluationResult,
    baseline_result: Optional[EvaluationResult] = None,
):
    """
    Validate the evaluation result from one model (candidate) against another model (baseline).
    If the candidate results do not meet the validation thresholds, a
    ModelValidationFailedException will be raised.

    .. note::

        This API is a replacement for the deprecated model validation functionality in the
        :py:func:`mlflow.evaluate` API.

    Args:
        validation_thresholds: A dictionary of metric name to
            :py:class:`mlflow.models.MetricThreshold` used for model validation. Each metric
            name must either be the name of a builtin metric or the name of a metric defined
            in the ``extra_metrics`` parameter.
        candidate_result: The evaluation result of the candidate model. Returned by the
            :py:func:`mlflow.evaluate` API.
        baseline_result: The evaluation result of the baseline model. Returned by the
            :py:func:`mlflow.evaluate` API. If set to None, the candidate model result will be
            compared against the threshold values directly.

    Code Example:

    .. code-block:: python
        :caption: Example of Model Validation

        import mlflow
        from mlflow.models import MetricThreshold

        thresholds = {
            "accuracy_score": MetricThreshold(
                # accuracy should be >=0.8
                threshold=0.8,
                # accuracy should be at least 0.05 greater than baseline model accuracy
                min_absolute_change=0.05,
                # accuracy should be at least 5 percent greater than baseline model accuracy
                min_relative_change=0.05,
                greater_is_better=True,
            ),
        }

        # Get evaluation results for the candidate model
        candidate_result = mlflow.evaluate(
            model="<YOUR_CANDIDATE_MODEL_URI>",
            data=eval_dataset,
            targets="ground_truth",
            model_type="classifier",
        )

        # Get evaluation results for the baseline model
        baseline_result = mlflow.evaluate(
            model="<YOUR_BASELINE_MODEL_URI>",
            data=eval_dataset,
            targets="ground_truth",
            model_type="classifier",
        )

        # Validate the results
        mlflow.validate_evaluation_results(
            thresholds,
            candidate_result,
            baseline_result,
        )

    See :ref:`the Model Validation documentation <model-validation>` for more details.
    """
    try:
        assert type(validation_thresholds) is dict
        for key in validation_thresholds.keys():
            assert type(key) is str
        for threshold in validation_thresholds.values():
            assert isinstance(threshold, MetricThreshold)
    except AssertionError:
        raise MlflowException(
            message="The validation thresholds argument must be a dictionary that maps strings "
            "to MetricThreshold objects.",
            error_code=INVALID_PARAMETER_VALUE,
        )

    _logger.info("Validating candidate model metrics against baseline")
    _validate(
        validation_thresholds,
        candidate_result.metrics,
        baseline_result.metrics if baseline_result else {},
    )
    _logger.info("Model validation passed!")
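

# Illustrative sketch: validating a candidate result against value thresholds only
# (baseline_result=None). A types.SimpleNamespace stands in for the EvaluationResult
# returned by mlflow.evaluate(), since only its `.metrics` attribute is read here; the
# metric values are invented. `_example_validate_without_baseline` is a hypothetical
# helper, not part of the module's API.
def _example_validate_without_baseline():
    from types import SimpleNamespace

    candidate = SimpleNamespace(metrics={"accuracy_score": 0.85})
    thresholds = {"accuracy_score": MetricThreshold(threshold=0.8, greater_is_better=True)}
    # Passes (only emitting info logs) because 0.85 >= 0.8; with a candidate value below
    # 0.8 this would raise ModelValidationFailedException.
    validate_evaluation_results(thresholds, candidate, baseline_result=None)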

def _validate(
    validation_thresholds: Dict[str, MetricThreshold],
    candidate_metrics: Dict[str, float],
    baseline_metrics: Dict[str, float],
):
    """
    Validate the model based on validation_thresholds by metric value and by metric comparison
    between the candidate model's metrics (candidate_metrics) and the baseline model's metrics
    (baseline_metrics).

    Args:
        validation_thresholds: A dictionary from metric_name to MetricThreshold.
        candidate_metrics: The metric evaluation result of the candidate model.
        baseline_metrics: The metric evaluation result of the baseline model.

    Raises:
        ModelValidationFailedException: If the validation does not pass, with a detailed
            failure message for every metric that failed.
    """
    validation_results = {
        metric_name: _MetricValidationResult(
            metric_name,
            candidate_metrics.get(metric_name),
            threshold,
            baseline_metrics.get(metric_name),
        )
        for (metric_name, threshold) in validation_thresholds.items()
    }

    for metric_name in validation_thresholds.keys():
        metric_threshold = validation_thresholds[metric_name]
        validation_result = validation_results[metric_name]

        if metric_name not in candidate_metrics:
            validation_result.missing_candidate = True
            continue

        candidate_metric_value = candidate_metrics[metric_name]
        # Use .get() so that a baseline result missing this particular metric is reported
        # as a missing baseline below rather than raising a KeyError here.
        baseline_metric_value = baseline_metrics.get(metric_name) if baseline_metrics else None

        # If higher is better for the metric, >= is used, otherwise <= is used
        # for thresholding the metric value and for model comparison
        comparator_fn = operator.__ge__ if metric_threshold.greater_is_better else operator.__le__
        operator_fn = operator.add if metric_threshold.greater_is_better else operator.sub

        if metric_threshold.threshold is not None:
            # metric threshold fails
            # - if not (metric_value >= threshold) for higher is better
            # - if not (metric_value <= threshold) for lower is better
            validation_result.threshold_failed = not comparator_fn(
                candidate_metric_value, metric_threshold.threshold
            )

        if (
            metric_threshold.min_relative_change or metric_threshold.min_absolute_change
        ) and metric_name not in baseline_metrics:
            validation_result.missing_baseline = True
            continue

        if metric_threshold.min_absolute_change is not None:
            # metric comparison absolute change fails
            # - if not (metric_value >= baseline + min_absolute_change) for higher is better
            # - if not (metric_value <= baseline - min_absolute_change) for lower is better
            validation_result.min_absolute_change_failed = not comparator_fn(
                Decimal(candidate_metric_value),
                Decimal(operator_fn(baseline_metric_value, metric_threshold.min_absolute_change)),
            )

        if metric_threshold.min_relative_change is not None:
            # If the baseline metric value equals 0, fall back to a simple comparison check
            if baseline_metric_value == 0:
                _logger.warning(
                    f"Cannot perform relative model comparison for metric {metric_name} as "
                    "baseline metric value is 0. Falling back to simple comparison: verifying "
                    "that candidate metric value is better than the baseline metric value."
                )
                validation_result.min_relative_change_failed = not comparator_fn(
                    Decimal(candidate_metric_value),
                    Decimal(operator_fn(baseline_metric_value, 1e-10)),
                )
                continue

            # metric comparison relative change fails
            # - if (metric_value - baseline) / baseline < min_relative_change for higher is better
            # - if (baseline - metric_value) / baseline < min_relative_change for lower is better
            if metric_threshold.greater_is_better:
                relative_change = (
                    candidate_metric_value - baseline_metric_value
                ) / baseline_metric_value
            else:
                relative_change = (
                    baseline_metric_value - candidate_metric_value
                ) / baseline_metric_value
            validation_result.min_relative_change_failed = (
                relative_change < metric_threshold.min_relative_change
            )

    failure_messages = []

    for metric_validation_result in validation_results.values():
        if metric_validation_result.is_success():
            continue
        failure_messages.append(str(metric_validation_result))

    if not failure_messages:
        return

    raise ModelValidationFailedException(message=os.linesep.join(failure_messages))
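

# Illustrative sketch: the relative-change arithmetic in _validate, using invented numbers.
# With greater_is_better=True, a baseline accuracy of 0.80 and min_relative_change=0.05,
# the candidate must improve by at least 5% of 0.80, i.e. reach 0.84:
#   relative_change = (candidate - baseline) / baseline = (0.83 - 0.80) / 0.80 = 0.0375 < 0.05
# so the check fails and ModelValidationFailedException is raised.
# `_example_relative_change_failure` is a hypothetical helper, not part of the module's API.
def _example_relative_change_failure():
    thresholds = {
        "accuracy_score": MetricThreshold(min_relative_change=0.05, greater_is_better=True)
    }
    try:
        _validate(
            thresholds,
            candidate_metrics={"accuracy_score": 0.83},
            baseline_metrics={"accuracy_score": 0.80},
        )
    except ModelValidationFailedException as e:
        # The message names the metric, both metric values, and the 0.05 relative-change
        # threshold that was not met.
        return str(e)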