from dataclasses import dataclass
from typing import Optional, Union
from mlflow.metrics.genai.prompt_template import PromptTemplate
from mlflow.utils.annotations import experimental
@experimental
@dataclass
class EvaluationExample:
    """
    Stores one sample example used for few-shot learning during LLM evaluation.

    Args:
        input: The input provided to the model. Optional, defaults to ``None``.
        output: The output generated by the model.
        score: The score given by the evaluator.
        justification: The justification given by the evaluator.
        grading_context: The grading_context provided to the evaluator for evaluation. Either
            a dictionary of grading context column names and grading context strings
            or a single grading context string. Optional, defaults to ``None``.

    .. code-block:: python
        :caption: Example for creating an EvaluationExample

        from mlflow.metrics.base import EvaluationExample

        example = EvaluationExample(
            input="What is MLflow?",
            output="MLflow is an open-source platform for managing machine "
            "learning workflows, including experiment tracking, model packaging, "
            "versioning, and deployment, simplifying the ML lifecycle.",
            score=4,
            justification="The definition effectively explains what MLflow is, "
            "its purpose, and its developer. It could be more concise for a 5-score.",
            grading_context={
                "ground_truth": "MLflow is an open-source platform for managing "
                "the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, "
                "a company that specializes in big data and machine learning solutions. MLflow is "
                "designed to address the challenges that data scientists and machine learning "
                "engineers face when developing, training, and deploying machine learning models."
            },
        )
        print(str(example))

    ``str(example)`` renders the labelled sections defined in ``__str__`` through
    ``PromptTemplate``. The listing below shows the expected shape, assuming
    ``PromptTemplate.format`` substitutes the values and concatenates the sections as
    written (verify against ``PromptTemplate``); long strings are abbreviated with ``...``.

    .. code-block:: text
        :caption: Output

        Example Input:
        What is MLflow?

        Example Output:
        MLflow is an open-source platform for managing machine learning workflows, ...

        Additional information used by the model:
        key: ground_truth
        value:
        MLflow is an open-source platform for managing the end-to-end machine learning ...

        Example score: 4
        Example justification: The definition effectively explains what MLflow is, ...
    """

    # Required fields come first; dataclass fields with defaults must follow them.
    output: str
    score: float
    justification: str
    input: Optional[str] = None
    grading_context: Optional[Union[dict[str, str], str]] = None

    def _format_grading_context(self) -> Optional[str]:
        """
        Flatten ``grading_context`` into the text placed in the prompt.

        Returns:
            For a dictionary, one ``key: <name>`` / ``value:`` / ``<text>`` group per
            entry, joined by newlines. Any other value (a plain string or ``None``)
            is returned unchanged.
        """
        context = self.grading_context
        if not isinstance(context, dict):
            return context
        return "\n".join(f"key: {key}\nvalue:\n{value}" for key, value in context.items())

    def __str__(self) -> str:
        """
        Render the example as prompt text via ``PromptTemplate``.

        Returns:
            The result of ``PromptTemplate.format`` applied to the sections below.
        """
        # Newlines are spelled out explicitly so the template text does not depend on
        # how this method happens to be indented in the source file.
        sections = [
            "\nExample Input:\n{input}\n",
            "\nExample Output:\n{output}\n",
            "\nAdditional information used by the model:\n{grading_context}\n",
            "\nExample score: {score}\nExample justification: {justification}\n",
        ]
        # NOTE(review): ``input`` and the formatted grading context may be None here;
        # how such values are rendered is decided by PromptTemplate.format, which is
        # defined outside this module - confirm there.
        return PromptTemplate(sections).format(
            input=self.input,
            output=self.output,
            grading_context=self._format_grading_context(),
            score=self.score,
            justification=self.justification,
        )