import json
import time

import pandas as pd
from elasticsearch import Elasticsearch

from intake.source import base

try:
    from json.decoder import JSONDecodeError
except ImportError:
    # Python 2 has no JSONDecodeError; json raises ValueError instead
    JSONDecodeError = ValueError

__version__ = '0.0.1'


class ElasticSearchSeqSource(base.DataSource):
"""
Data source which executes arbitrary queries on ElasticSearch
This is the tabular reader: will return dataframes. Nested return items
will become dict-like objects in the output.
Parameters
----------
query: str
Query to execute. Can either be in Lucene single-line format, or a
JSON structured query (presented as text)
qargs: dict
Further parameters to pass to the query, such as set of indexes to
consider, filtering, ordering. See
http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
es_kwargs: dict
Settings for the ES connection, e.g., a simple local connection may be
``{'host': 'localhost', 'port': 9200}``.
Other keywords to the Plugin that end up here and are material:
scroll: str
how long the query is live for, default ``'100m'``
size: int
the paging size when downloading, default 1000.
metadata: dict
Extra information for this source.
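
    Examples
    --------
    A minimal sketch of direct use, assuming intake's standard ``read``
    method; the host, port and index name below are illustrative
    assumptions, not defaults::

        source = ElasticSearchSeqSource(
            query='state:Texas',                # Lucene single-line syntax
            qargs={'index': 'my-index'},        # hypothetical index name
            es_kwargs={'host': 'localhost', 'port': 9200},
            metadata={})
        records = source.read()                 # the hits' ``_source`` dicts

    A JSON structured query is passed as text, e.g.,
    ``query='{"match": {"state": "Texas"}}'``.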
"""
container = 'python'
    def __init__(self, query, qargs, es_kwargs, metadata):
        self._query = query
        self._qargs = qargs
        # scroll/size are download options, popped out of the connection kwargs
        self._scroll = es_kwargs.pop('scroll', '100m')
        self._size = es_kwargs.pop('size', 1000)  # default page size
        self._es_kwargs = es_kwargs
        self._dataframe = None
        # elasticsearch-py takes a list of host dicts;
        # maybe the client should be (more) global?
        self.es = Elasticsearch([es_kwargs])
        super(ElasticSearchSeqSource, self).__init__(container=self.container,
                                                     metadata=metadata)
    def _run_query(self, size=None):
        if size is None:
            size = self._size
        try:
            # if the query text parses as JSON, treat it as a structured
            # query body, wrapping it in {'query': ...} if necessary
            q = json.loads(self._query)
            if 'query' not in q:
                q = {'query': q}
            s = self.es.search(body=q, size=size, scroll=self._scroll,
                               **self._qargs)
        except (JSONDecodeError, TypeError):
            # otherwise pass it through as a Lucene query string
            s = self.es.search(q=self._query, size=size, scroll=self._scroll,
                               **self._qargs)
        sid = s['_scroll_id']
        # total number of hits matched (an int in the ES versions targeted)
        scroll_size = s['hits']['total']
        # page through the remaining results using the scroll API
        while scroll_size > len(s['hits']['hits']):
            page = self.es.scroll(scroll_id=sid, scroll=self._scroll)
            if not page['hits']['hits']:
                break  # empty page: avoid looping forever
            sid = page['_scroll_id']
            s['hits']['hits'].extend(page['hits']['hits'])
        self.es.clear_scroll(scroll_id=sid)
        return s
    def _get_schema(self, retry=2):
        """Return a minimal schema; ES result shapes are not known up-front"""
        # retry is accepted for API compatibility but unused here
        return base.Schema(datashape=None,
                           dtype=None,
                           shape=None,
                           npartitions=1,
                           extra_metadata={})
    def _get_partition(self, _):
        """Download all data.

        ES has a hard maximum of 10000 items per fetch, so larger result
        sets need paging, known to ES as "scroll":
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        results = self._run_query()
        return [r['_source'] for r in results['hits']['hits']]
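

# A small self-test sketch, not part of the plugin API: it exercises the
# scroll-based download end-to-end. The host, port and index name below are
# assumptions for illustration and require a running ES instance.
if __name__ == '__main__':
    source = ElasticSearchSeqSource(
        query='*:*',                            # match-all Lucene query
        qargs={'index': 'test-index'},          # hypothetical index name
        es_kwargs={'host': 'localhost', 'port': 9200},
        metadata={})
    records = source._get_partition(0)          # list of the hits' _source dicts
    print('fetched %d records' % len(records))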