Source code for mlflow.metrics.genai.metric_definitions

from typing import Any, Optional

from mlflow.exceptions import MlflowException
from mlflow.metrics.genai.base import EvaluationExample
from mlflow.metrics.genai.genai_metric import make_genai_metric
from mlflow.metrics.genai.utils import _get_latest_metric_version
from mlflow.models import EvaluationMetric
from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE
from mlflow.utils.annotations import experimental
from mlflow.utils.class_utils import _get_class_from_string


@experimental
def answer_similarity(
    model: Optional[str] = None,
    metric_version: Optional[str] = None,
    examples: Optional[list[EvaluationExample]] = None,
    metric_metadata: Optional[dict[str, Any]] = None,
    parameters: Optional[dict[str, Any]] = None,
    extra_headers: Optional[dict[str, str]] = None,
    proxy_url: Optional[str] = None,
    max_workers: int = 10,
) -> EvaluationMetric:
    """
    Create a genai metric that evaluates the answer similarity of an LLM using a judge model.

    Answer similarity is assessed by the semantic similarity of the output to the
    ``ground_truth``, which should be specified in the ``targets`` column. High scores mean
    that your model outputs contain similar information as the ground_truth, while low scores
    mean that outputs may disagree with the ground_truth.

    The ``targets`` eval_arg must be provided as part of the input dataset or output
    predictions. This can be mapped to a column of a different name using ``col_mapping``
    in the ``evaluator_config`` parameter, or using the ``targets`` parameter in
    mlflow.evaluate().

    An MlflowException will be raised if the specified version for this metric does not exist.

    Args:
        model: (Optional) Model uri of the judge model that will be used to compute the metric,
            e.g., ``openai:/gpt-4``. Refer to the `LLM-as-a-Judge Metrics
            <https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model>`_
            documentation for the supported model types and their URI format.
        metric_version: (Optional) The version of the answer similarity metric to use.
            Defaults to the latest version.
        examples: (Optional) Provide a list of examples to help the judge model evaluate the
            answer similarity. It is highly recommended to add examples to be used as a
            reference to evaluate the new results.
        metric_metadata: (Optional) Dictionary of metadata to be attached to the
            EvaluationMetric object. Useful for model evaluators that require additional
            information to determine how to evaluate this metric.
        parameters: (Optional) Dictionary of parameters to be passed to the judge model,
            e.g., {"temperature": 0.5}. When specified, these parameters will override
            the default parameters defined in the metric implementation.
        extra_headers: (Optional) Dictionary of extra headers to be passed to the judge model.
        proxy_url: (Optional) Proxy URL to be used for the judge model. This is useful when the
            judge model is served via a proxy endpoint, not directly via LLM provider services.
            If not specified, the default URL for the LLM provider will be used
            (e.g., https://api.openai.com/v1/chat/completions for OpenAI chat models).
        max_workers: (Optional) The maximum number of workers to use for judge scoring.
            Defaults to 10 workers.

    Returns:
        A metric object
    """
    # Fall back to the latest prompt version when the caller did not pin one.
    version = _get_latest_metric_version() if metric_version is None else metric_version

    # The versioned prompt class is looked up by its dotted path.
    prompt_class_path = f"mlflow.metrics.genai.prompts.{version}.AnswerSimilarityMetric"
    try:
        prompt_cls = _get_class_from_string(prompt_class_path)
    except ModuleNotFoundError:
        # No prompts module for this version: report it as a bad argument.
        raise MlflowException(
            f"Failed to find answer similarity metric for version {version}."
            f" Please check the version",
            error_code=INVALID_PARAMETER_VALUE,
        ) from None
    except Exception as e:
        # Any other lookup failure is surfaced as an internal error.
        raise MlflowException(
            f"Failed to construct answer similarity metric {version}. Error: {e!r}",
            error_code=INTERNAL_ERROR,
        ) from None

    # Caller-supplied values take precedence over the defaults shipped with the prompt class.
    judge_examples = prompt_cls.default_examples if examples is None else examples
    judge_model = prompt_cls.default_model if model is None else model

    metric_kwargs = {
        "name": "answer_similarity",
        "definition": prompt_cls.definition,
        "grading_prompt": prompt_cls.grading_prompt,
        "include_input": False,
        "examples": judge_examples,
        "version": version,
        "model": judge_model,
        "grading_context_columns": prompt_cls.grading_context_columns,
        "parameters": parameters or prompt_cls.parameters,
        "aggregations": ["mean", "variance", "p90"],
        "greater_is_better": True,
        "metric_metadata": metric_metadata,
        "extra_headers": extra_headers,
        "proxy_url": proxy_url,
        "max_workers": max_workers,
    }
    return make_genai_metric(**metric_kwargs)
@experimental
def answer_correctness(
    model: Optional[str] = None,
    metric_version: Optional[str] = None,
    examples: Optional[list[EvaluationExample]] = None,
    metric_metadata: Optional[dict[str, Any]] = None,
    parameters: Optional[dict[str, Any]] = None,
    extra_headers: Optional[dict[str, str]] = None,
    proxy_url: Optional[str] = None,
    max_workers: int = 10,
) -> EvaluationMetric:
    """
    This function will create a genai metric used to evaluate the answer correctness of an LLM
    using the model provided. Answer correctness will be assessed by the accuracy of the provided
    output based on the ``ground_truth``, which should be specified in the ``targets`` column.
    High scores mean that your model outputs contain similar information as the ground_truth and
    that this information is correct, while low scores mean that outputs may disagree with the
    ground_truth or that the information in the output is incorrect. Note that this builds onto
    answer_similarity.

    The ``targets`` eval_arg must be provided as part of the input dataset or output
    predictions. This can be mapped to a column of a different name using ``col_mapping``
    in the ``evaluator_config`` parameter, or using the ``targets`` parameter in
    mlflow.evaluate().

    An MlflowException will be raised if the specified version for this metric does not exist.

    Args:
        model: (Optional) Model uri of the judge model that will be used to compute the metric,
            e.g., ``openai:/gpt-4``. Refer to the `LLM-as-a-Judge Metrics
            <https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model>`_
            documentation for the supported model types and their URI format.
        metric_version: (Optional) The version of the answer correctness metric to use.
            Defaults to the latest version.
        examples: (Optional) Provide a list of examples to help the judge model evaluate the
            answer correctness. It is highly recommended to add examples to be used as a
            reference to evaluate the new results.
        metric_metadata: (Optional) Dictionary of metadata to be attached to the
            EvaluationMetric object. Useful for model evaluators that require additional
            information to determine how to evaluate this metric.
        parameters: (Optional) Dictionary of parameters to be passed to the judge model,
            e.g., {"temperature": 0.5}. When specified, these parameters will override
            the default parameters defined in the metric implementation.
        extra_headers: (Optional) Dictionary of extra headers to be passed to the judge model.
        proxy_url: (Optional) Proxy URL to be used for the judge model. This is useful when the
            judge model is served via a proxy endpoint, not directly via LLM provider services.
            If not specified, the default URL for the LLM provider will be used
            (e.g., https://api.openai.com/v1/chat/completions for OpenAI chat models).
        max_workers: (Optional) The maximum number of workers to use for judge scoring.
            Defaults to 10 workers.

    Returns:
        A metric object

    Raises:
        MlflowException: With ``INVALID_PARAMETER_VALUE`` if no prompt module exists for
            ``metric_version``, or with ``INTERNAL_ERROR`` if the metric class lookup fails
            for any other reason.
    """
    if metric_version is None:
        metric_version = _get_latest_metric_version()
    class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerCorrectnessMetric"
    try:
        answer_correctness_class_module = _get_class_from_string(class_name)
    except ModuleNotFoundError:
        # The second fragment starts with a space so the two sentences do not run together.
        raise MlflowException(
            f"Failed to find answer correctness metric for version {metric_version}."
            f" Please check the version",
            error_code=INVALID_PARAMETER_VALUE,
        ) from None
    except Exception as e:
        raise MlflowException(
            f"Failed to construct answer correctness metric {metric_version}. Error: {e!r}",
            error_code=INTERNAL_ERROR,
        ) from None

    # Caller-supplied examples/model win; otherwise use the defaults of the prompt class.
    if examples is None:
        examples = answer_correctness_class_module.default_examples
    if model is None:
        model = answer_correctness_class_module.default_model

    return make_genai_metric(
        name="answer_correctness",
        definition=answer_correctness_class_module.definition,
        grading_prompt=answer_correctness_class_module.grading_prompt,
        examples=examples,
        version=metric_version,
        model=model,
        grading_context_columns=answer_correctness_class_module.grading_context_columns,
        parameters=parameters or answer_correctness_class_module.parameters,
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        metric_metadata=metric_metadata,
        extra_headers=extra_headers,
        proxy_url=proxy_url,
        max_workers=max_workers,
    )
@experimental
def faithfulness(
    model: Optional[str] = None,
    metric_version: Optional[str] = _get_latest_metric_version(),
    examples: Optional[list[EvaluationExample]] = None,
    metric_metadata: Optional[dict[str, Any]] = None,
    parameters: Optional[dict[str, Any]] = None,
    extra_headers: Optional[dict[str, str]] = None,
    proxy_url: Optional[str] = None,
    max_workers: int = 10,
) -> EvaluationMetric:
    """
    This function will create a genai metric used to evaluate the faithfulness of an LLM using
    the model provided. Faithfulness will be assessed based on how factually consistent the
    output is to the ``context``. High scores mean that the outputs contain information that is
    in line with the context, while low scores mean that outputs may disagree with the context
    (input is ignored).

    The ``context`` eval_arg must be provided as part of the input dataset or output
    predictions. This can be mapped to a column of a different name using ``col_mapping``
    in the ``evaluator_config`` parameter.

    An MlflowException will be raised if the specified version for this metric does not exist.

    Args:
        model: (Optional) Model uri of the judge model that will be used to compute the metric,
            e.g., ``openai:/gpt-4``. Refer to the `LLM-as-a-Judge Metrics
            <https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model>`_
            documentation for the supported model types and their URI format.
        metric_version: (Optional) The version of the faithfulness metric to use.
            Defaults to the latest version. Passing ``None`` also selects the latest version.
        examples: (Optional) Provide a list of examples to help the judge model evaluate the
            faithfulness. It is highly recommended to add examples to be used as a reference to
            evaluate the new results.
        metric_metadata: (Optional) Dictionary of metadata to be attached to the
            EvaluationMetric object. Useful for model evaluators that require additional
            information to determine how to evaluate this metric.
        parameters: (Optional) Dictionary of parameters to be passed to the judge model,
            e.g., {"temperature": 0.5}. When specified, these parameters will override
            the default parameters defined in the metric implementation.
        extra_headers: (Optional) Dictionary of extra headers to be passed to the judge model.
        proxy_url: (Optional) Proxy URL to be used for the judge model. This is useful when the
            judge model is served via a proxy endpoint, not directly via LLM provider services.
            If not specified, the default URL for the LLM provider will be used
            (e.g., https://api.openai.com/v1/chat/completions for OpenAI chat models).
        max_workers: (Optional) The maximum number of workers to use for judge scoring.
            Defaults to 10 workers.

    Returns:
        A metric object

    Raises:
        MlflowException: With ``INVALID_PARAMETER_VALUE`` if no prompt module exists for
            ``metric_version``, or with ``INTERNAL_ERROR`` if the metric class lookup fails
            for any other reason.
    """
    # The signature default is computed once at definition time; an explicit ``None`` (allowed
    # by the ``Optional[str]`` annotation) is resolved here so it does not end up formatted
    # into the class path as the literal text "None".
    if metric_version is None:
        metric_version = _get_latest_metric_version()
    class_name = f"mlflow.metrics.genai.prompts.{metric_version}.FaithfulnessMetric"
    try:
        faithfulness_class_module = _get_class_from_string(class_name)
    except ModuleNotFoundError:
        raise MlflowException(
            f"Failed to find faithfulness metric for version {metric_version}."
            f" Please check the version",
            error_code=INVALID_PARAMETER_VALUE,
        ) from None
    except Exception as e:
        raise MlflowException(
            f"Failed to construct faithfulness metric {metric_version}. Error: {e!r}",
            error_code=INTERNAL_ERROR,
        ) from None

    # Caller-supplied examples/model win; otherwise use the defaults of the prompt class.
    if examples is None:
        examples = faithfulness_class_module.default_examples
    if model is None:
        model = faithfulness_class_module.default_model

    return make_genai_metric(
        name="faithfulness",
        definition=faithfulness_class_module.definition,
        grading_prompt=faithfulness_class_module.grading_prompt,
        include_input=False,
        examples=examples,
        version=metric_version,
        model=model,
        # Context columns always come from the prompt class; the user-supplied ``parameters``
        # dict holds judge-model parameters and belongs in ``parameters`` below.
        grading_context_columns=faithfulness_class_module.grading_context_columns,
        parameters=parameters or faithfulness_class_module.parameters,
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        metric_metadata=metric_metadata,
        extra_headers=extra_headers,
        proxy_url=proxy_url,
        max_workers=max_workers,
    )
@experimental
def answer_relevance(
    model: Optional[str] = None,
    metric_version: Optional[str] = _get_latest_metric_version(),
    examples: Optional[list[EvaluationExample]] = None,
    metric_metadata: Optional[dict[str, Any]] = None,
    parameters: Optional[dict[str, Any]] = None,
    extra_headers: Optional[dict[str, str]] = None,
    proxy_url: Optional[str] = None,
    max_workers: int = 10,
) -> EvaluationMetric:
    """
    This function will create a genai metric used to evaluate the answer relevance of an LLM
    using the model provided. Answer relevance will be assessed based on the appropriateness and
    applicability of the output with respect to the input. High scores mean that your model
    outputs are about the same subject as the input, while low scores mean that outputs may
    be non-topical.

    An MlflowException will be raised if the specified version for this metric does not exist.

    Args:
        model: (Optional) Model uri of the judge model that will be used to compute the metric,
            e.g., ``openai:/gpt-4``. Refer to the `LLM-as-a-Judge Metrics
            <https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model>`_
            documentation for the supported model types and their URI format.
        metric_version: (Optional) The version of the answer relevance metric to use.
            Defaults to the latest version. Passing ``None`` also selects the latest version.
        examples: (Optional) Provide a list of examples to help the judge model evaluate the
            answer relevance. It is highly recommended to add examples to be used as a
            reference to evaluate the new results.
        metric_metadata: (Optional) Dictionary of metadata to be attached to the
            EvaluationMetric object. Useful for model evaluators that require additional
            information to determine how to evaluate this metric.
        parameters: (Optional) Dictionary of parameters to be passed to the judge model,
            e.g., {"temperature": 0.5}. When specified, these parameters will override
            the default parameters defined in the metric implementation.
        extra_headers: (Optional) Dictionary of extra headers to be passed to the judge model.
        proxy_url: (Optional) Proxy URL to be used for the judge model. This is useful when the
            judge model is served via a proxy endpoint, not directly via LLM provider services.
            If not specified, the default URL for the LLM provider will be used
            (e.g., https://api.openai.com/v1/chat/completions for OpenAI chat models).
        max_workers: (Optional) The maximum number of workers to use for judge scoring.
            Defaults to 10 workers.

    Returns:
        A metric object

    Raises:
        MlflowException: With ``INVALID_PARAMETER_VALUE`` if no prompt module exists for
            ``metric_version``, or with ``INTERNAL_ERROR`` if the metric class lookup fails
            for any other reason.
    """
    # The signature default is computed once at definition time; an explicit ``None`` (allowed
    # by the ``Optional[str]`` annotation) is resolved here so it does not end up formatted
    # into the class path as the literal text "None".
    if metric_version is None:
        metric_version = _get_latest_metric_version()
    class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerRelevanceMetric"
    try:
        answer_relevance_class_module = _get_class_from_string(class_name)
    except ModuleNotFoundError:
        raise MlflowException(
            f"Failed to find answer relevance metric for version {metric_version}."
            f" Please check the version",
            error_code=INVALID_PARAMETER_VALUE,
        ) from None
    except Exception as e:
        raise MlflowException(
            f"Failed to construct answer relevance metric {metric_version}. Error: {e!r}",
            error_code=INTERNAL_ERROR,
        ) from None

    # Caller-supplied examples/model win; otherwise use the defaults of the prompt class.
    if examples is None:
        examples = answer_relevance_class_module.default_examples
    if model is None:
        model = answer_relevance_class_module.default_model

    # No grading_context_columns are passed for this metric.
    return make_genai_metric(
        name="answer_relevance",
        definition=answer_relevance_class_module.definition,
        grading_prompt=answer_relevance_class_module.grading_prompt,
        examples=examples,
        version=metric_version,
        model=model,
        parameters=parameters or answer_relevance_class_module.parameters,
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        metric_metadata=metric_metadata,
        extra_headers=extra_headers,
        proxy_url=proxy_url,
        max_workers=max_workers,
    )
@experimental
def relevance(
    model: Optional[str] = None,
    metric_version: Optional[str] = None,
    examples: Optional[list[EvaluationExample]] = None,
    metric_metadata: Optional[dict[str, Any]] = None,
    parameters: Optional[dict[str, Any]] = None,
    extra_headers: Optional[dict[str, str]] = None,
    proxy_url: Optional[str] = None,
    max_workers: int = 10,
) -> EvaluationMetric:
    """
    This function will create a genai metric used to evaluate the relevance of an LLM using the
    model provided. Relevance will be assessed by the appropriateness, significance, and
    applicability of the output with respect to the input and ``context``. High scores mean that
    the model has understood the context and correctly extracted relevant information from the
    context, while low scores mean that the output has completely ignored the question and the
    context and could be hallucinating.

    The ``context`` eval_arg must be provided as part of the input dataset or output
    predictions. This can be mapped to a column of a different name using ``col_mapping``
    in the ``evaluator_config`` parameter.

    An MlflowException will be raised if the specified version for this metric does not exist.

    Args:
        model: (Optional) Model uri of the judge model that will be used to compute the metric,
            e.g., ``openai:/gpt-4``. Refer to the `LLM-as-a-Judge Metrics
            <https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model>`_
            documentation for the supported model types and their URI format.
        metric_version: (Optional) The version of the relevance metric to use.
            Defaults to the latest version.
        examples: (Optional) Provide a list of examples to help the judge model evaluate the
            relevance. It is highly recommended to add examples to be used as a reference to
            evaluate the new results.
        metric_metadata: (Optional) Dictionary of metadata to be attached to the
            EvaluationMetric object. Useful for model evaluators that require additional
            information to determine how to evaluate this metric.
        parameters: (Optional) Dictionary of parameters to be passed to the judge model,
            e.g., {"temperature": 0.5}. When specified, these parameters will override
            the default parameters defined in the metric implementation.
        extra_headers: (Optional) Dictionary of extra headers to be passed to the judge model.
        proxy_url: (Optional) Proxy URL to be used for the judge model. This is useful when the
            judge model is served via a proxy endpoint, not directly via LLM provider services.
            If not specified, the default URL for the LLM provider will be used
            (e.g., https://api.openai.com/v1/chat/completions for OpenAI chat models).
        max_workers: (Optional) The maximum number of workers to use for judge scoring.
            Defaults to 10 workers.

    Returns:
        A metric object

    Raises:
        MlflowException: With ``INVALID_PARAMETER_VALUE`` if no prompt module exists for
            ``metric_version``, or with ``INTERNAL_ERROR`` if the metric class lookup fails
            for any other reason.
    """
    if metric_version is None:
        metric_version = _get_latest_metric_version()
    class_name = f"mlflow.metrics.genai.prompts.{metric_version}.RelevanceMetric"
    try:
        relevance_class_module = _get_class_from_string(class_name)
    except ModuleNotFoundError:
        # The second fragment starts with a space so the two sentences do not run together.
        raise MlflowException(
            f"Failed to find relevance metric for version {metric_version}."
            f" Please check the version",
            error_code=INVALID_PARAMETER_VALUE,
        ) from None
    except Exception as e:
        raise MlflowException(
            f"Failed to construct relevance metric {metric_version}. Error: {e!r}",
            error_code=INTERNAL_ERROR,
        ) from None

    # Caller-supplied examples/model win; otherwise use the defaults of the prompt class.
    if examples is None:
        examples = relevance_class_module.default_examples
    if model is None:
        model = relevance_class_module.default_model

    return make_genai_metric(
        name="relevance",
        definition=relevance_class_module.definition,
        grading_prompt=relevance_class_module.grading_prompt,
        examples=examples,
        version=metric_version,
        model=model,
        grading_context_columns=relevance_class_module.grading_context_columns,
        parameters=parameters or relevance_class_module.parameters,
        aggregations=["mean", "variance", "p90"],
        greater_is_better=True,
        metric_metadata=metric_metadata,
        extra_headers=extra_headers,
        proxy_url=proxy_url,
        max_workers=max_workers,
    )