Source code for mlflow.projects

"""
The ``mlflow.projects`` module provides an API for running MLflow projects locally or remotely.
"""

import json
import logging
import os

import yaml

import mlflow.projects.databricks
import mlflow.utils.uri
from mlflow import tracking
from mlflow.entities import RunStatus
from mlflow.exceptions import ExecutionException, MlflowException
from mlflow.projects.backend import loader
from mlflow.projects.submitted_run import SubmittedRun
from mlflow.projects.utils import (
    MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG,
    PROJECT_BUILD_IMAGE,
    PROJECT_DOCKER_ARGS,
    PROJECT_DOCKER_AUTH,
    PROJECT_ENV_MANAGER,
    PROJECT_STORAGE_DIR,
    PROJECT_SYNCHRONOUS,
    fetch_and_validate_project,
    get_entry_point_command,
    get_or_create_run,
    get_run_env_vars,
    load_project,
)
from mlflow.tracking.fluent import _get_experiment_id
from mlflow.utils import env_manager as _EnvManager
from mlflow.utils.mlflow_tags import (
    MLFLOW_DOCKER_IMAGE_ID,
    MLFLOW_PROJECT_BACKEND,
    MLFLOW_PROJECT_ENV,
    MLFLOW_RUN_NAME,
)

_logger = logging.getLogger(__name__)


def _resolve_experiment_id(experiment_name=None, experiment_id=None):
    """
    Resolve experiment.

    Verifies either one or other is specified - cannot be both selected.

    If ``experiment_name`` is provided and does not exist, an experiment
    of that name is created and its id is returned.

    Args:
        experiment_name: Name of experiment under which to launch the run.
        experiment_id: ID of experiment under which to launch the run.

    Returns:
        str
    """

    if experiment_name and experiment_id:
        raise MlflowException("Specify only one of 'experiment_name' or 'experiment_id'.")

    if experiment_id:
        return str(experiment_id)

    if experiment_name:
        client = tracking.MlflowClient()
        exp = client.get_experiment_by_name(experiment_name)
        if exp:
            return exp.experiment_id
        else:
            _logger.info("'%s' does not exist. Creating a new experiment", experiment_name)
            return client.create_experiment(experiment_name)

    return _get_experiment_id()


def _run(
    uri,
    experiment_id,
    entry_point,
    version,
    parameters,
    docker_args,
    backend_name,
    backend_config,
    storage_dir,
    env_manager,
    synchronous,
    run_name,
    build_image,
    docker_auth,
):
    """
    Helper that delegates to the project-running method corresponding to the passed-in backend.
    Returns a ``SubmittedRun`` corresponding to the project run.
    """
    tracking_store_uri = tracking.get_tracking_uri()
    backend_config[PROJECT_ENV_MANAGER] = env_manager
    backend_config[PROJECT_SYNCHRONOUS] = synchronous
    backend_config[PROJECT_DOCKER_ARGS] = docker_args
    backend_config[PROJECT_STORAGE_DIR] = storage_dir
    backend_config[PROJECT_BUILD_IMAGE] = build_image
    backend_config[PROJECT_DOCKER_AUTH] = docker_auth
    # TODO: remove this check once kubernetes execution has been refactored
    if backend_name not in {"databricks", "kubernetes"}:
        backend = loader.load_backend(backend_name)
        if backend:
            submitted_run = backend.run(
                uri,
                entry_point,
                parameters,
                version,
                backend_config,
                tracking_store_uri,
                experiment_id,
            )
            tracking.MlflowClient().set_tag(
                submitted_run.run_id, MLFLOW_PROJECT_BACKEND, backend_name
            )
            if run_name is not None:
                tracking.MlflowClient().set_tag(submitted_run.run_id, MLFLOW_RUN_NAME, run_name)
            return submitted_run

    work_dir = fetch_and_validate_project(uri, version, entry_point, parameters)
    project = load_project(work_dir)
    _validate_execution_environment(project, backend_name)

    active_run = get_or_create_run(
        None, uri, experiment_id, work_dir, version, entry_point, parameters
    )

    if run_name is not None:
        tracking.MlflowClient().set_tag(active_run.info.run_id, MLFLOW_RUN_NAME, run_name)

    if backend_name == "databricks":
        tracking.MlflowClient().set_tag(
            active_run.info.run_id, MLFLOW_PROJECT_BACKEND, "databricks"
        )
        from mlflow.projects.databricks import run_databricks, run_databricks_spark_job

        if project.databricks_spark_job_spec is not None:
            return run_databricks_spark_job(
                remote_run=active_run,
                uri=uri,
                work_dir=work_dir,
                experiment_id=experiment_id,
                cluster_spec=backend_config,
                project_spec=project,
                entry_point=entry_point,
                parameters=parameters,
            )

        return run_databricks(
            remote_run=active_run,
            uri=uri,
            entry_point=entry_point,
            work_dir=work_dir,
            parameters=parameters,
            experiment_id=experiment_id,
            cluster_spec=backend_config,
            env_manager=env_manager,
        )

    elif backend_name == "kubernetes":
        from mlflow.projects import kubernetes as kb
        from mlflow.projects.docker import (
            build_docker_image,
            validate_docker_env,
            validate_docker_installation,
        )

        tracking.MlflowClient().set_tag(active_run.info.run_id, MLFLOW_PROJECT_ENV, "docker")
        tracking.MlflowClient().set_tag(
            active_run.info.run_id, MLFLOW_PROJECT_BACKEND, "kubernetes"
        )
        validate_docker_env(project)
        validate_docker_installation()
        kube_config = _parse_kubernetes_config(backend_config)
        image = build_docker_image(
            work_dir=work_dir,
            repository_uri=kube_config["repository-uri"],
            base_image=project.docker_env.get("image"),
            run_id=active_run.info.run_id,
            build_image=build_image,
            docker_auth=docker_auth,
        )
        image_digest = kb.push_image_to_registry(image.tags[0])
        tracking.MlflowClient().set_tag(
            active_run.info.run_id, MLFLOW_DOCKER_IMAGE_ID, image_digest
        )
        return kb.run_kubernetes_job(
            project.name,
            active_run,
            image.tags[0],
            image_digest,
            get_entry_point_command(project, entry_point, parameters, storage_dir),
            get_run_env_vars(
                run_id=active_run.info.run_uuid, experiment_id=active_run.info.experiment_id
            ),
            kube_config.get("kube-context", None),
            kube_config["kube-job-template"],
        )

    supported_backends = ["databricks", "kubernetes"] + list(loader.MLFLOW_BACKENDS.keys())
    raise ExecutionException(
        f"Got unsupported execution mode {backend_name}. Supported values: {supported_backends}"
    )


[docs]def run(
    uri,
    entry_point="main",
    version=None,
    parameters=None,
    docker_args=None,
    experiment_name=None,
    experiment_id=None,
    backend="local",
    backend_config=None,
    storage_dir=None,
    synchronous=True,
    run_id=None,
    run_name=None,
    env_manager=None,
    build_image=False,
    docker_auth=None,
):
    """
    Run an MLflow project. The project can be local or stored at a Git URI.

    MLflow provides built-in support for running projects locally or remotely on a Databricks or
    Kubernetes cluster. You can also run projects against other targets by installing an appropriate
    third-party plugin. See `Community Plugins <../plugins.html#community-plugins>`_ for more
    information.

    For information on using this method in chained workflows, see `Building Multistep Workflows
    <../projects.html#building-multistep-workflows>`_.

    Raises:
        :py:class:`mlflow.exceptions.ExecutionException` If a run launched in blocking mode
            is unsuccessful.

    Args:
        uri: URI of project to run. A local filesystem path
            or a Git repository URI (e.g. https://github.com/mlflow/mlflow-example)
            pointing to a project directory containing an MLproject file.
        entry_point: Entry point to run within the project. If no entry point with the specified
            name is found, runs the project file ``entry_point`` as a script,
            using "python" to run ``.py`` files and the default shell (specified by
            environment variable ``$SHELL``) to run ``.sh`` files.
        version: For Git-based projects, either a commit hash or a branch name.
        parameters: Parameters (dictionary) for the entry point command.
        docker_args: Arguments (dictionary) for the docker command.
        experiment_name: Name of experiment under which to launch the run.
        experiment_id: ID of experiment under which to launch the run.
        backend: Execution backend for the run: MLflow provides built-in support for "local",
            "databricks", and "kubernetes" (experimental) backends. If running against
            Databricks, will run against a Databricks workspace determined as follows:
            if a Databricks tracking URI of the form ``databricks://profile`` has been set
            (e.g. by setting the MLFLOW_TRACKING_URI environment variable), will run
            against the workspace specified by <profile>. Otherwise, runs against the
            workspace specified by the default Databricks CLI profile.
        backend_config: A dictionary, or a path to a JSON file (must end in '.json'), which will
            be passed as config to the backend. The exact content which should be
            provided is different for each execution backend and is documented
            at https://www.mlflow.org/docs/latest/projects.html.
        storage_dir: Used only if ``backend`` is "local". MLflow downloads artifacts from
            distributed URIs passed to parameters of type ``path`` to subdirectories of
            ``storage_dir``.
        synchronous: Whether to block while waiting for a run to complete. Defaults to True.
            Note that if ``synchronous`` is False and ``backend`` is "local", this
            method will return, but the current process will block when exiting until
            the local run completes. If the current process is interrupted, any
            asynchronous runs launched via this method will be terminated. If
            ``synchronous`` is True and the run fails, the current process will
            error out as well.
        run_id: Note: this argument is used internally by the MLflow project APIs and should
            not be specified. If specified, the run ID will be used instead of
            creating a new run.
        run_name: The name to give the MLflow Run associated with the project execution.
            If ``None``, the MLflow Run name is left unset.
        env_manager: Specify an environment manager to create a new environment for the run and
            install project dependencies within that environment. The following values
            are supported:

            - local: use the local environment
            - virtualenv: use virtualenv (and pyenv for Python version management)
            - conda: use conda

            If unspecified, MLflow automatically determines the environment manager to
            use by inspecting files in the project directory. For example, if
            ``python_env.yaml`` is present, virtualenv will be used.
        build_image: Whether to build a new docker image of the project or to reuse an existing
            image. Default: False (reuse an existing image)
        docker_auth: A dictionary representing information to authenticate with a Docker
            registry. See `docker.client.DockerClient.login
            <https://docker-py.readthedocs.io/en/stable/client.html#docker.client.DockerClient.login>`_
            for available options.

    Returns:
        :py:class:`mlflow.projects.SubmittedRun` exposing information (e.g. run ID)
        about the launched run.

    .. code-block:: python
        :caption: Example

        import mlflow

        project_uri = "https://github.com/mlflow/mlflow-example"
        params = {"alpha": 0.5, "l1_ratio": 0.01}

        # Run MLflow project and create a reproducible conda environment
        # on a local host
        mlflow.run(project_uri, parameters=params)

    .. code-block:: text
        :caption: Output

        ...
        ...
        Elasticnet model (alpha=0.500000, l1_ratio=0.010000):
        RMSE: 0.788347345611717
        MAE: 0.6155576449938276
        R2: 0.19729662005412607
        ... mlflow.projects: === Run (ID '6a5109febe5e4a549461e149590d0a7c') succeeded ===
    """
    backend_config_dict = backend_config if backend_config is not None else {}
    if (
        backend_config
        and type(backend_config) != dict
        and os.path.splitext(backend_config)[-1] == ".json"
    ):
        with open(backend_config) as handle:
            try:
                backend_config_dict = json.load(handle)
            except ValueError:
                _logger.error(
                    "Error when attempting to load and parse JSON cluster spec from file %s",
                    backend_config,
                )
                raise

    if env_manager is not None:
        _EnvManager.validate(env_manager)

    if backend == "databricks":
        mlflow.projects.databricks.before_run_validations(mlflow.get_tracking_uri(), backend_config)
    elif backend == "local" and run_id is not None:
        backend_config_dict[MLFLOW_LOCAL_BACKEND_RUN_ID_CONFIG] = run_id

    experiment_id = _resolve_experiment_id(
        experiment_name=experiment_name, experiment_id=experiment_id
    )

    submitted_run_obj = _run(
        uri=uri,
        experiment_id=experiment_id,
        entry_point=entry_point,
        version=version,
        parameters=parameters,
        docker_args=docker_args,
        backend_name=backend,
        backend_config=backend_config_dict,
        env_manager=env_manager,
        storage_dir=storage_dir,
        synchronous=synchronous,
        run_name=run_name,
        build_image=build_image,
        docker_auth=docker_auth,
    )
    if synchronous:
        _wait_for(submitted_run_obj)
    return submitted_run_obj


def _wait_for(submitted_run_obj):
    """Wait on the passed-in submitted run, reporting its status to the tracking server."""
    run_id = submitted_run_obj.run_id
    active_run = None
    # Note: there's a small chance we fail to report the run's status to the tracking server if
    # we're interrupted before we reach the try block below
    try:
        active_run = tracking.MlflowClient().get_run(run_id) if run_id is not None else None
        if submitted_run_obj.wait():
            _logger.info("=== Run (ID '%s') succeeded ===", run_id)
            _maybe_set_run_terminated(active_run, "FINISHED")
        else:
            _maybe_set_run_terminated(active_run, "FAILED")
            raise ExecutionException(f"Run (ID '{run_id}') failed")
    except KeyboardInterrupt:
        _logger.error("=== Run (ID '%s') interrupted, cancelling run ===", run_id)
        submitted_run_obj.cancel()
        _maybe_set_run_terminated(active_run, "FAILED")
        raise


def _maybe_set_run_terminated(active_run, status):
    """
    If the passed-in active run is defined and still running (i.e. hasn't already been terminated
    within user code), mark it as terminated with the passed-in status.
    """
    if active_run is None:
        return
    run_id = active_run.info.run_id
    cur_status = tracking.MlflowClient().get_run(run_id).info.status
    if RunStatus.is_terminated(cur_status):
        return
    tracking.MlflowClient().set_terminated(run_id, status)


def _validate_execution_environment(project, backend):
    if project.docker_env and backend == "databricks":
        raise ExecutionException(
            "Running docker-based projects on Databricks is not yet supported."
        )


def _parse_kubernetes_config(backend_config):
    """
    Creates build context tarfile containing Dockerfile and project code, returning path to tarfile
    """
    if not backend_config:
        raise ExecutionException("Backend_config file not found.")
    kube_config = backend_config.copy()
    if "kube-job-template-path" not in backend_config.keys():
        raise ExecutionException(
            "'kube-job-template-path' attribute must be specified in backend_config."
        )
    kube_job_template = backend_config["kube-job-template-path"]
    if os.path.exists(kube_job_template):
        with open(kube_job_template) as job_template:
            yaml_obj = yaml.safe_load(job_template.read())
        kube_job_template = yaml_obj
        kube_config["kube-job-template"] = kube_job_template
    else:
        raise ExecutionException(f"Could not find 'kube-job-template-path': {kube_job_template}")
    if "kube-context" not in backend_config.keys():
        _logger.debug(
            "Could not find kube-context in backend_config."
            " Using current context or in-cluster config."
        )
    if "repository-uri" not in backend_config.keys():
        raise ExecutionException("Could not find 'repository-uri' in backend_config.")
    return kube_config


__all__ = ["run", "SubmittedRun"]