from intake.source import base
import pandas as pd
import time

from .elasticsearch_seq import ElasticSearchSeqSource


class ElasticSearchTableSource(ElasticSearchSeqSource):
    """
    Data source which executes arbitrary queries on ElasticSearch.

    This is the tabular reader: it returns dataframes. Nested return items
    become dict-like objects in the output.

    Parameters
    ----------
    query: str
        Query to execute. Can either be in Lucene single-line format, or a
        JSON structured query (presented as text)
    qargs: dict
        Further parameters to pass to the query, such as the set of indexes
        to consider, filtering, ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may
        be ``{'host': 'localhost', 'port': 9200}``.
        Other keywords to the Plugin that end up here and are material:

        scroll: str
            How long the query stays live, default ``'100m'``
        size: int
            The paging size when downloading, default 1000
    metadata: dict
        Extra information for this source.
    """
    _dataframe = None
    container = 'dataframe'

    def __init__(self, *args, **kwargs):
        ElasticSearchSeqSource.__init__(self, *args, **kwargs)

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits or cached dataframe"""
        if self._dataframe is not None:
            return base.Schema(datashape=None,
                               dtype=self._dataframe[:0],
                               shape=self._dataframe.shape,
                               npartitions=1,
                               extra_metadata=self._extra_metadata)
        else:
            while True:
                results = self._run_query(10)
                if 'hits' in results and results['hits']['hits']:
                    # ES likes to return empty result-sets while indexing
                    break
                # ``retry`` is a time budget in seconds, spent in 0.2s
                # sleeps between attempts
                retry -= 0.2
                time.sleep(0.2)
                if retry < 0:
                    raise IOError('No results arrived')
            df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
            results.pop('hits')
            self._extra_metadata = results
            return base.Schema(datashape=None,
                               dtype=df[:0],
                               shape=(None, df.shape[1]),
                               npartitions=1,
                               extra_metadata=self._extra_metadata)
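
    # What ``discover()`` then exposes (a sketch; the column set depends
    # entirely on the documents in the index):
    #
    #     info = source.discover()
    #     info['shape']        # (None, ncols): row count unknown pre-download
    #     info['npartitions']  # 1: the whole result set is one partition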

    def to_dask(self):
        """Make a single-partition lazy dask data-frame"""
        import dask.dataframe as dd
        from dask import delayed
        self.discover()
        # wrap the method, not its result, so the download stays lazy
        part = delayed(self._get_partition)(0)
        return dd.from_delayed([part], meta=self.dtype)
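
    # Hypothetical follow-on: the frame holds one delayed partition, so
    # nothing is downloaded until a computation asks for it:
    #
    #     ddf = source.to_dask()
    #     ddf.head()  # triggers the single-partition download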

    def _get_partition(self, _):
        """Download all of the data

        ES has a hard maximum of 10000 items to fetch in one query; getting
        more requires paging, known to ES as "scroll":
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        if self._dataframe is None:
            results = self._run_query()
            df = pd.DataFrame([r['_source'] for r in results['hits']['hits']])
            self._dataframe = df
            # invalidate the estimated schema and rebuild it from the
            # now-complete dataframe
            self._schema = None
            self.discover()
        return self._dataframe
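
    # For result sets beyond the 10000-hit cap, elasticsearch-py supports
    # scroll paging. A minimal sketch with the raw client (assumes a client
    # ``es`` and a hypothetical index name; illustration only, not this
    # class's code path):
    #
    #     resp = es.search(index='my-index', scroll='100m', size=1000,
    #                      body={'query': {'match_all': {}}})
    #     hits = resp['hits']['hits']
    #     while resp['hits']['hits']:
    #         resp = es.scroll(scroll_id=resp['_scroll_id'], scroll='100m')
    #         hits.extend(resp['hits']['hits'])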

    def _close(self):
        self._dataframe = None