Source code for mlflow.data.spark_dataset_source

from typing import Any, Optional

from mlflow.data.dataset_source import DatasetSource
from mlflow.exceptions import MlflowException
from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE


class SparkDatasetSource(DatasetSource):
    """
    Represents the source of a dataset stored in a spark table.

    Exactly one of ``path``, ``table_name``, or ``sql`` identifies the source;
    the other two must be ``None``.
    """

    def __init__(
        self,
        path: Optional[str] = None,
        table_name: Optional[str] = None,
        sql: Optional[str] = None,
    ):
        """
        Args:
            path: Location passed to ``spark.read.parquet`` when loading.
            table_name: Table name passed to ``spark.read.table`` when loading.
            sql: Query string passed to ``spark.sql`` when loading.

        Raises:
            MlflowException: With ``INVALID_PARAMETER_VALUE`` unless exactly one
                of the three arguments is not ``None``.
        """
        # Exactly one non-None argument means exactly two of the three are None.
        if (path, table_name, sql).count(None) != 2:
            raise MlflowException(
                'Must specify exactly one of "path", "table_name", or "sql"',
                INVALID_PARAMETER_VALUE,
            )
        self._path = path
        self._table_name = table_name
        self._sql = sql

    @staticmethod
    def _get_source_type() -> str:
        """Return the source type identifier, ``"spark"``."""
        return "spark"

    def load(self, **kwargs):
        """Loads the dataset source as a Spark Dataset Source.

        The field that was set at construction time decides how the data is
        read: ``path`` via ``spark.read.parquet``, ``table_name`` via
        ``spark.read.table``, and ``sql`` via ``spark.sql``.

        Args:
            kwargs: Accepted but not used by this implementation.

        Returns:
            An instance of ``pyspark.sql.DataFrame``.
        """
        # Imported lazily so this module can be imported without pyspark.
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()

        # Compare against None (as ``__init__`` and ``to_dict`` do) rather than
        # relying on truthiness: an empty string is accepted by the constructor
        # and must not make this method fall through and silently return None.
        if self._path is not None:
            return spark.read.parquet(self._path)
        if self._table_name is not None:
            return spark.read.table(self._table_name)
        # ``__init__`` guarantees exactly one field is set, so only the SQL
        # case remains here.
        return spark.sql(self._sql)

    @staticmethod
    def _can_resolve(raw_source: Any) -> bool:
        """Always return ``False``; this class does not resolve raw sources."""
        return False

    @classmethod
    def _resolve(cls, raw_source: str) -> "SparkDatasetSource":
        """Not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def to_dict(self) -> dict[Any, Any]:
        """Return a dictionary holding the single field that is set.

        Returns:
            A dict with one of the keys ``"path"``, ``"table_name"``, or
            ``"sql"``, mapped to the corresponding value.
        """
        info = {}
        if self._path is not None:
            info["path"] = self._path
        elif self._table_name is not None:
            info["table_name"] = self._table_name
        elif self._sql is not None:
            info["sql"] = self._sql
        return info

    @classmethod
    def from_dict(cls, source_dict: dict[Any, Any]) -> "SparkDatasetSource":
        """Build an instance from a dictionary such as ``to_dict`` produces.

        Args:
            source_dict: Mapping that may contain the keys ``"path"``,
                ``"table_name"``, and ``"sql"``; missing keys are treated as
                ``None``.

        Returns:
            A new ``SparkDatasetSource``.

        Raises:
            MlflowException: If the mapping does not yield exactly one
                non-``None`` value among the three keys.
        """
        return cls(
            path=source_dict.get("path"),
            table_name=source_dict.get("table_name"),
            sql=source_dict.get("sql"),
        )