+class MGScrollSetupError(Exception):
# self._doc_type = 'gene'
self._index = ES_INDEX_NAME_ALL
self._doc_type = ES_INDEX_TYPE
+ self._scroll_time = '1m'
+ self._total_scroll_size = 1000
+ if self._total_scroll_size % self.get_number_of_shards() == 0:
+ self._scroll_size = int(self._total_scroll_size / self.get_number_of_shards())
+ raise MGScrollSetupError("_total_scroll_size of {} can't be ".format(self._total_scroll_size) +
+ "divided evenly among {} shards.".format(self.get_number_of_shards()))
# self._doc_type = 'gene_sample'
self._default_fields = ['name', 'symbol', 'taxid', 'entrezgene']
self._default_species = [9606, 10090, 10116] # human, mouse, rat
self._tier_1_species = set(taxid_d.values())
- def _search(self, q, species='all'):
+ def _search(self, q, species='all', scroll_options={}):
# body = '{"query" : {"term" : { "_all" : ' + q + ' }}}'
res = self.conn.search(index=self._index, doc_type=self._doc_type,
+ body=q, **scroll_options)
self._index = ES_INDEX_NAME_ALL # reset self._index
return [self._get_genedoc(hit, dotfield=dotfield) for hit in hits['hits']]
+ def _clean_res2(self, res):
+ ''' res is the dictionary returned from a query.
+ do some reformatting of raw ES results before returning.
+ This method is used for self.query method.
+ for attr in ['took', 'facets', 'aggregations', '_scroll_id']:
+ _res['hits'] = [self._get_genedoc(hit) for hit in _res['hits']]
def _cleaned_res_2(self, res, empty=[], error={'error': True},
single_hit=False, dotfield=True, fields=None):
options.rawquery = kwargs.pop('rawquery', False)
# if dotfield is false, returned fields containing dot notation will be restored as objects.
options.dotfield = kwargs.pop('dotfield', True) not in [False, 'false']
+ options.fetch_all = kwargs.pop('fetch_all', False)
scopes = kwargs.pop('scopes', None)
options.scopes = self._cleaned_scopes(scopes)
+ def get_number_of_shards(self):
+ r = self.conn.indices.get_settings(self._index)
+ n_shards = r[list(r.keys())[0]]['settings']['index']['number_of_shards']
+ n_shards = int(n_shards)
def get_gene(self, geneid, fields='all', **kwargs):
kwargs['fields'] = self._cleaned_fields(fields)
raw = kwargs.pop('raw', False)
q = re.sub(u'[\t\n\x0b\x0c\r\x00]+', ' ', q)
+ scroll_options.update({'search_type': 'scan', 'size': self._scroll_size, 'scroll': self._scroll_time})
# Check if special interval query pattern exists
interval_query = self._parse_interval_query(q)
- res = self._search(_q, species=kwargs['species'])
+ res = self._search(_q, species=kwargs['species'], scroll_options=scroll_options)
return {'success': False, 'error': msg}
- _res['took'] = res['took']
- _res['facets'] = res['facets']
- for attr in ['fields', '_source']:
- if not options.dotfield:
+ res = self._clean_res2(res)
+ #_res['took'] = res['took']
+ # _res['facets'] = res['facets']
+ #for v in _res['hits']:
+ # for attr in ['fields', '_source']:
+ # if not options.dotfield:
'error': "Invalid query. Please check parameters."}
+ def scroll(self, scroll_id, fields=None, **kwargs):
+ options = self._get_cleaned_query_options(fields, kwargs)
+ r = self.conn.scroll(scroll_id, scroll=self._scroll_time)
+ scroll_id = r.get('_scroll_id')
+ if scroll_id is None or not r['hits']['hits']:
+ return {'success': False, 'error': 'No results to return.'}
+ res = self._clean_res2(r)
+ #res.update({'_scroll_id': scroll_id})
+ if r['_shards']['failed']:
+ res.update({'_warning': 'Scroll request has failed on {} shards out of {}.'.format(r['_shards']['failed'], r['_shards']['total'])})
def query_interval(self, taxid, chr, gstart, gend, **kwargs):
'''deprecated! Use query method with interval query string.'''
kwargs.setdefault('fields', ['symbol', 'name', 'taxid'])