Source code for intake_elasticsearch.elasticsearch_table

from intake.source import base
import pandas as pd
import time

try:
    # json.decoder.JSONDecodeError only exists on Python 3.5+; older
    # interpreters raise its base class, ValueError, instead
    from json.decoder import JSONDecodeError
except ImportError:
    JSONDecodeError = ValueError

from .elasticsearch_seq import ElasticSearchSeqSource


class ElasticSearchTableSource(ElasticSearchSeqSource):
    """
    Data source which executes arbitrary queries on ElasticSearch

    This is the tabular reader: it will return dataframes. Nested return
    items will become dict-like objects in the output.

    Parameters
    ----------
    query: str
        Query to execute. Can either be in Lucene single-line format, or a
        JSON structured query (presented as text)
    qargs: dict
        Further parameters to pass to the query, such as the set of indexes
        to consider, filtering and ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may
        be ``{'host': 'localhost', 'port': 9200}``. Other keywords to the
        Plugin that end up here and are material:

        scroll: str
            how long the query is live for, default ``'100m'``
        size: int
            the paging size when downloading, default 1000
    metadata: dict
        Extra information for this source.
    """
    _dataframe = None
    container = 'dataframe'

    def __init__(self, *args, **kwargs):
        ElasticSearchSeqSource.__init__(self, *args, **kwargs)

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits, or from the cached dataframe"""
        if self._dataframe is not None:
            return base.Schema(datashape=None,
                               dtype=self._dataframe[:0],
                               shape=self._dataframe.shape,
                               npartitions=1,
                               extra_metadata=self._extra_metadata)
        else:
            while True:
                results = self._run_query(10)
                if 'hits' in results and results['hits']['hits']:
                    # ES likes to return empty result-sets while indexing
                    break
                # ``retry`` acts as a time budget in seconds, spent in
                # 0.2s polling increments
                retry -= 0.2
                time.sleep(0.2)
                if retry < 0:
                    raise IOError('No results arrived')
            df = pd.DataFrame([r['_source'] for r in
                               results['hits']['hits']])
            results.pop('hits')
            self._extra_metadata = results
            return base.Schema(datashape=None,
                               dtype=df[:0],
                               shape=(None, df.shape[1]),
                               npartitions=1,
                               extra_metadata=self._extra_metadata)
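    # Hypothetical usage sketch for the parameters documented above; the
    # index name 'test', the localhost address and the match-all query are
    # assumptions, with extra keywords passed through to the ES connection:
    #
    #     source = ElasticSearchTableSource('*:*',
    #                                       qargs={'index': 'test'},
    #                                       host='localhost', port=9200)
    #     info = source.discover()  # triggers _get_schema's 10-hit probe
    #     info['dtype']             # zero-length dataframe carrying dtypes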
    def to_dask(self):
        """Make single-partition lazy dask data-frame"""
        import dask.dataframe as dd
        from dask import delayed
        self.discover()
        # Wrap the bound method so the query only executes when the dask
        # graph is computed; calling self._get_partition(0) here would
        # fetch the data eagerly
        part = delayed(self._get_partition)(0)
        return dd.from_delayed([part], meta=self.dtype)
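    # Sketch of the intended laziness (names continue the sketch above):
    #
    #     ddf = source.to_dask()  # builds the graph; nothing downloaded yet
    #     df = ddf.compute()      # _get_partition(0) runs, fetching the data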
    def _get_partition(self, _):
        """Downloads all data

        ES has a hard maximum of 10000 items to fetch; beyond that, paging
        must be implemented, known to ES as "scroll":
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        if self._dataframe is None:
            results = self._run_query()
            df = pd.DataFrame([r['_source'] for r in
                               results['hits']['hits']])
            self._dataframe = df
            self._schema = None
            self.discover()
        return self._dataframe

    def _close(self):
        self._dataframe = None
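
if __name__ == '__main__':
    # Minimal smoke-test sketch, assuming an Elasticsearch server reachable
    # at localhost:9200 holding a populated index named 'test'; the address,
    # index name and match-all query are illustrative assumptions.
    source = ElasticSearchTableSource('*:*',
                                      qargs={'index': 'test'},
                                      host='localhost', port=9200)
    print(source.discover()['shape'])  # (None, ncols): row count not known
    print(source.read().head())        # the full, one-shot download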