Source code for intake_spark.spark_sources

from intake.source.base import DataSource, Schema
from .base import SparkHolder
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions


class SparkRDD(DataSource):
    """A reference to an RDD definition in Spark

    RDDs are list-of-things objects, evaluated lazily in Spark.

    Examples
    --------
    >>> args = [('textFile', ('text.*.files', )),
    ...         ('map', (len,))]
    >>> context = {'master': 'spark://master.node:7077'}
    >>> source = SparkRDD(args, context)

    The output of `source.to_spark()` is an RDD object holding the lengths
    of the lines of the input files.
    """
    container = 'python'
    version = __version__
    name = 'spark_rdd'
    partition_access = True

    def __init__(self, args, context_kwargs=None, metadata=None):
        """
        Parameters
        ----------
        args, context_kwargs:
            Passed on to SparkHolder, see its docstrings and the examples.
        metadata: dict
            Arbitrary data to associate with this source.
        """
        super(SparkRDD, self).__init__(metadata)
        self.holder = SparkHolder(False, args, context_kwargs)
        self.ref = None

    def _get_schema(self):
        if self.ref is None:
            self.ref = self.holder.setup()
            self.npartitions = self.ref.getNumPartitions()
        return Schema(npartitions=self.npartitions,
                      extra_metadata=self.metadata)

    def read_partition(self, i):
        """Returns one of the partitions of the RDD as a list of objects"""
        self._get_schema()
        sc = self.holder.sc[0]
        return sc.runJob(self.ref, lambda x: x, partitions=[i])

    def to_spark(self):
        """Return the spark object for this data, an RDD"""
        self._get_schema()
        return self.ref

    def read(self):
        """Materialise the whole RDD into a list of objects"""
        self._get_schema()
        return self.ref.collect()

    def _close(self):
        self.ref = None
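
# Usage sketch (not part of the original module): how a SparkRDD source might
# be driven from user code. The file glob and the 'local[2]' master URL are
# illustrative assumptions, not values defined by this package; discover(),
# read_partition(), read() and to_spark() are the methods shown above.
#
# >>> from intake_spark.spark_sources import SparkRDD
# >>> args = [('textFile', ('text.*.files', )),
# ...         ('map', (len,))]
# >>> source = SparkRDD(args, {'master': 'local[2]'})
# >>> source.discover()                  # triggers _get_schema(), sets npartitions
# >>> part0 = source.read_partition(0)   # one partition as a list of line lengths
# >>> everything = source.read()         # collect() the whole RDD into a list
# >>> rdd = source.to_spark()            # the underlying lazy pyspark RDD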


class SparkDataFrame(DataSource):
    """A reference to a DataFrame definition in Spark

    DataFrames are tabular spark objects containing a heterogeneous set of
    columns and potentially a large number of rows. They are similar in
    concept to Pandas or Dask data-frames. The Spark variety produced by
    this driver will be a handle to a lazy object, where computation will
    be managed by Spark.

    Examples
    --------
    >>> args = [
    ...     ('read', ),
    ...     ('format', ('csv', )),
    ...     ('option', ('header', 'true')),
    ...     ('load', ('data.*.csv', ))]
    >>> context = {'master': 'spark://master.node:7077'}
    >>> source = SparkDataFrame(args, context)

    The output of `source.to_spark()` contains a spark object pointing to
    the parsed contents of the indicated CSV files.
    """
    container = 'dataframe'
    version = __version__
    name = 'spark_dataframe'
    partition_access = True

    def __init__(self, args, context_kwargs=None, metadata=None):
        """
        Parameters
        ----------
        args, context_kwargs:
            Passed on to SparkHolder, see its docstrings and the examples.
        metadata: dict
            Arbitrary data to associate with this source.
        """
        super(SparkDataFrame, self).__init__(metadata)
        self.holder = SparkHolder(True, args, context_kwargs)
        self.ref = None

    def _get_schema(self):
        if self.ref is None:
            self.ref = self.holder.setup()
            self.npartitions = self.ref.rdd.getNumPartitions()
            rows = self.ref.take(10)
            self.dtype = pandas_dtypes(self.ref.schema, rows)
            self.shape = (None, len(self.dtype))
        return Schema(npartitions=self.npartitions,
                      extra_metadata=self.metadata,
                      dtype=self.dtype,
                      shape=self.shape)

    def read_partition(self, i):
        """Returns one partition of the data as a pandas data-frame"""
        import pandas as pd
        self._get_schema()
        sc = self.holder.sc[0]
        out = sc.runJob(self.ref.rdd, lambda x: x, partitions=[i])
        df = pd.DataFrame.from_records(out)
        df.columns = list(self.dtype)
        return df

    def to_spark(self):
        """Return the Spark object for this data, a DataFrame"""
        self._get_schema()
        return self.ref

    def read(self):
        """Read all of the data into an in-memory Pandas data-frame"""
        self._get_schema()
        return self.ref.toPandas()

    def _close(self):
        self.ref = None
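
# Usage sketch (not part of the original module): reading a CSV-backed
# SparkDataFrame source. The args chain mirrors the docstring example above;
# the file pattern and 'local[2]' master URL are illustrative assumptions.
#
# >>> from intake_spark.spark_sources import SparkDataFrame
# >>> args = [('read', ),
# ...         ('format', ('csv', )),
# ...         ('option', ('header', 'true')),
# ...         ('load', ('data.*.csv', ))]
# >>> source = SparkDataFrame(args, {'master': 'local[2]'})
# >>> schema = source.discover()       # dtype/shape inferred from 10 sample rows
# >>> df0 = source.read_partition(0)   # one partition as a pandas DataFrame
# >>> sdf = source.to_spark()          # the lazy pyspark.sql.DataFrame
# >>> pdf = source.read()              # full materialisation via toPandas()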


def pandas_dtypes(schema, rows):
    """Rough dtype for the given pyspark schema"""
    import pandas as pd
    # _to_corrected_pandas_type is a private helper of Spark 2.x;
    # IntegralType is defined in pyspark.sql.types
    from pyspark.sql.dataframe import _to_corrected_pandas_type
    from pyspark.sql.types import IntegralType
    # copied from toPandas() method
    df = pd.DataFrame.from_records(rows)
    df.columns = [s.name for s in schema]
    for field in schema:
        pandas_type = _to_corrected_pandas_type(field.dataType)
        if pandas_type is not None and not (
                isinstance(field.dataType, IntegralType) and field.nullable):
            df[field.name] = df[field.name].astype(pandas_type)
    return {k: str(v) for k, v in df.dtypes.to_dict().items()}
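
# Sketch of the result (assuming a hypothetical schema with an IntegerType
# column "a" and a StringType column "b", both populated in the sampled rows):
# the function returns a plain dict of column name to dtype string, for
# example {'a': 'int64', 'b': 'object'}, which SparkDataFrame._get_schema()
# stores as self.dtype.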