org.apache.spark.sql.execution.datasources
The main class responsible for representing a pluggable Data Source in Spark SQL. In addition to acting as the canonical set of parameters that can describe a Data Source, this class is used to resolve a description to a concrete implementation that can be used in a query plan (either batch or streaming) or to write out data using an external library.

From an end user's perspective, a DataSource description can be created explicitly using org.apache.spark.sql.DataFrameReader or CREATE TABLE USING DDL. Additionally, this class is used when resolving a description from a metastore to a concrete implementation.

Many of the arguments to this class are optional, though depending on the specific API being used these optional arguments might be filled in during resolution using either inference or external metadata. For example, when reading a partitioned table from a file system, partition columns will be inferred from the directory layout even if they are not specified. The arguments are described below; a usage sketch follows the list.

A list of file system paths that hold data. These will be globbed and qualified before use. This option only works when reading from a FileFormat.

An optional specification of the schema of the data. When present we skip attempting to infer the schema.

A list of column names that the relation is partitioned by. This list is generally empty during the read path, unless this DataSource is managed by Hive, in which case resolveRelation calls getOrInferFileFormatSchema for file-based DataSources to infer the partitioning. In other cases, if this list is empty, then the table is unpartitioned.

An optional specification for bucketing (hash-partitioning) of the data.

Optional catalog table reference that can be used to push down operations over the data source to the catalog service.
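As a usage sketch of the end-user entry points mentioned above, the snippet below builds a data source description through DataFrameReader, supplying an explicit schema (so inference is skipped) and a glob path (which is expanded and qualified), and then does the same through CREATE TABLE USING DDL. Paths, table name, and column names are illustrative assumptions, not taken from this page.

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

  val spark = SparkSession.builder().appName("datasource-sketch").getOrCreate()

  // Explicit schema: with a user-specified schema the DataSource skips inference.
  val schema = StructType(Seq(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true)))

  // DataFrameReader builds the DataSource description; the glob pattern is
  // expanded and qualified before any files are read (FileFormat sources only).
  val events = spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("/data/events/2024-*/part-*.csv")   // hypothetical path

  // The same kind of description can be created with CREATE TABLE ... USING DDL.
  spark.sql(
    """CREATE TABLE events_tbl (id BIGINT, name STRING)
      |USING parquet
      |LOCATION '/warehouse/events'""".stripMargin)   // hypothetical location

The descriptions that follow cover the class's individual members: streaming sink and source creation, metadata-log detection, relation resolution, and the write path.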
Returns a sink that can be used to continually write data.
Returns a source that can be used to continually read data.
Returns true if there is a single path that has a metadata log indicating which files should be read.
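The streaming source, streaming sink, and metadata-log behaviour described above are reached through the public readStream/writeStream API. A minimal sketch, assuming hypothetical input, output, and checkpoint paths: the file sink maintains a metadata log (the _spark_metadata directory) in its output path, and a later batch read of that single path detects the log and reads only the files it lists.

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.streaming.Trigger

  val spark = SparkSession.builder().appName("streaming-sketch").getOrCreate()

  // Source side: continually read new files appearing under the input directory.
  val incoming = spark.readStream
    .format("text")
    .load("/data/incoming")                      // hypothetical input directory

  // Sink side: continually write data; the file sink records committed files
  // in a metadata log inside the output directory.
  val query = incoming.writeStream
    .format("parquet")
    .option("path", "/data/output")              // hypothetical output directory
    .option("checkpointLocation", "/data/chk")   // hypothetical checkpoint location
    .trigger(Trigger.ProcessingTime("1 minute"))
    .start()

  // A later batch read of the same single path finds the metadata log and reads
  // only the files the log marks as committed.
  val committed = spark.read.parquet("/data/output")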
Create a resolved BaseRelation that can be used to read data from or write data into this DataSource.

Whether to confirm that the files exist when generating a non-streaming, file-based data source. Structured Streaming jobs already list the files and verify that they exist, and when generating incremental jobs each batch is treated as a non-streaming file-based data source; since the files are known to exist, they do not need to be checked again.
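For reference, the sketch below resolves a relation by constructing a DataSource directly, mirroring what DataFrameReader does for non-streaming reads. This class lives in an internal package, so the constructor parameters shown (className, paths, options) and the default for checkFilesExist are assumptions that may differ between Spark versions; the path is hypothetical.

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.execution.datasources.DataSource
  import org.apache.spark.sql.sources.BaseRelation

  val spark = SparkSession.builder().appName("resolve-sketch").getOrCreate()

  // Assumed internal constructor parameters; verify against your Spark version.
  val ds = DataSource(
    sparkSession = spark,
    className = "parquet",
    paths = Seq("/data/output"),                // hypothetical path
    options = Map("mergeSchema" -> "false"))

  // checkFilesExist = true verifies the globbed paths up front; batches derived
  // from a streaming job pass false because the files are already known to exist.
  val relation: BaseRelation = ds.resolveRelation(checkFilesExist = true)
  val df = spark.baseRelationToDataFrame(relation)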
Writes the given DataFrame out to this DataSource.
Writes the given DataFrame out to this DataSource and returns a BaseRelation that can be used for subsequent reads.
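The write path above corresponds to the public DataFrameWriter API. A minimal sketch, assuming a hypothetical output path: it writes a partitioned dataset through the DataSource write path and reads it back, illustrating the partition-column inference from the directory layout mentioned in the class description.

  import org.apache.spark.sql.{SaveMode, SparkSession}

  val spark = SparkSession.builder().appName("write-sketch").getOrCreate()
  import spark.implicits._

  val sales = Seq((1L, "2024-01-01", 9.99), (2L, "2024-01-02", 4.50))
    .toDF("id", "day", "amount")

  // DataFrameWriter routes the write through the DataSource; partitionBy lays the
  // files out as day=<value>/ subdirectories under the target path.
  sales.write
    .format("parquet")
    .mode(SaveMode.Overwrite)
    .partitionBy("day")
    .save("/data/sales")                        // hypothetical path

  // Reading the location back infers the 'day' partition column from the
  // directory layout, even though no schema or partition columns are given.
  val roundTrip = spark.read.parquet("/data/sales")
  roundTrip.printSchema()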