import json
from abc import abstractmethod
from typing import Any
class DatasetSource:
    """
    Represents the source of a dataset used in MLflow Tracking, providing information such as
    cloud storage location, delta table name / version, etc.

    Concrete source types subclass this and implement the methods marked with
    ``@abstractmethod``. ``to_json`` and ``from_json`` are provided here and are built on top of
    ``to_dict`` and ``from_dict``.

    Note: this class does not derive from ``abc.ABC``, so the ``@abstractmethod`` markers are
    not enforced when instantiating; calling an unimplemented method simply returns ``None``.
    """

    @staticmethod
    @abstractmethod
    def _get_source_type() -> str:
        """Obtains a string representing the source type of the dataset.

        Returns:
            A string representing the source type of the dataset, e.g. "s3", "delta_table", ...
        """

    @abstractmethod
    def load(self) -> Any:
        """
        Loads files / objects referred to by the DatasetSource. For example, depending on the type
        of :py:class:`DatasetSource <mlflow.data.dataset_source.DatasetSource>`, this may download
        source CSV files from S3 to the local filesystem, load a source Delta Table as a Spark
        DataFrame, etc.

        Returns:
            The downloaded source, e.g. a local filesystem path, a Spark DataFrame, etc.
        """

    @staticmethod
    @abstractmethod
    def _can_resolve(raw_source: Any) -> bool:
        """Determines whether this type of DatasetSource can be resolved from a specified raw
        source object. For example, an S3DatasetSource can be resolved from an S3 URI like
        "s3://mybucket/path/to/iris/data" but not from an Azure Blob Storage URI like
        "wasbs://account@host.blob.core.windows.net".

        Args:
            raw_source: The raw source, e.g. a string like "s3://mybucket/path/to/iris/data".

        Returns:
            True if this DatasetSource can resolve the raw source, False otherwise.
        """

    @classmethod
    @abstractmethod
    def _resolve(cls, raw_source: Any) -> "DatasetSource":
        """Constructs an instance of the DatasetSource from a raw source object, such as a
        string URI like "s3://mybucket/path/to/iris/data" or a delta table identifier
        like "my.delta.table@2".

        Args:
            raw_source: The raw source, e.g. a string like "s3://mybucket/path/to/iris/data".

        Returns:
            A DatasetSource instance derived from the raw_source.
        """

    @abstractmethod
    def to_dict(self) -> dict[str, Any]:
        """Obtains a JSON-compatible dictionary representation of the DatasetSource.

        Returns:
            A JSON-compatible dictionary representation of the DatasetSource.
        """

    def to_json(self) -> str:
        """
        Obtains a JSON string representation of the
        :py:class:`DatasetSource <mlflow.data.dataset_source.DatasetSource>`.

        Returns:
            A JSON string representation of the
            :py:class:`DatasetSource <mlflow.data.dataset_source.DatasetSource>`.
        """
        # Serialization is delegated to the subclass-provided to_dict().
        return json.dumps(self.to_dict())

    @classmethod
    @abstractmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "DatasetSource":
        """Constructs an instance of the DatasetSource from a dictionary representation.

        Args:
            source_dict: A dictionary representation of the DatasetSource.

        Returns:
            A DatasetSource instance.
        """

    @classmethod
    def from_json(cls, source_json: str) -> "DatasetSource":
        """Constructs an instance of the DatasetSource from a JSON string representation.

        Args:
            source_json: A JSON string representation of the DatasetSource.

        Returns:
            A DatasetSource instance.
        """
        # Dispatches through cls so a subclass's from_dict() builds the instance.
        return cls.from_dict(json.loads(source_json))