Commit 64e58cf2 authored by Markus Scheidgen
Browse files

Refactored search and domain quantities.

parent 5b8641d5
...@@ -97,9 +97,10 @@ def add_common_parameters(request_parser): ...@@ -97,9 +97,10 @@ def add_common_parameters(request_parser):
'until_time', type=lambda x: rfc3339DateTime.parse(x), 'until_time', type=lambda x: rfc3339DateTime.parse(x),
help='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)') help='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)')
for search_quantity in search.search_quantities.keys(): for quantity in search.search_quantities.values():
_, _, description = search.search_quantities[search_quantity] request_parser.add_argument(
request_parser.add_argument(search_quantity, type=str, help=description) quantity.name, type=str, help=quantity.description,
action='append' if quantity.multi else None)
repo_request_parser = pagination_request_parser.copy() repo_request_parser = pagination_request_parser.copy()
......
...@@ -168,13 +168,18 @@ class DomainQuantity: ...@@ -168,13 +168,18 @@ class DomainQuantity:
0 (the default) means no aggregations. 0 (the default) means no aggregations.
metric: Indicates that this quantity should be used as search metric. Values need metric: Indicates that this quantity should be used as search metric. Values need
to be tuples with metric name and elastic aggregation (e.g. sum, cardinality) to be tuples with metric name and elastic aggregation (e.g. sum, cardinality)
zero_aggs: Whether to return aggregation values for values with zero hits in the search.
Default is ``True``, i.e. values with zero hits are included.
elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``. elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``.
elastic_search_type: An optional elasticsearch search type. Default is ``term``.
elastic_field: An optional elasticsearch key. Default is the name of the quantity.
""" """
def __init__( def __init__(
self, description: str = None, multi: bool = False, aggregations: int = 0, self, description: str = None, multi: bool = False, aggregations: int = 0,
order_default: bool = False, metric: Tuple[str, str] = None, order_default: bool = False, metric: Tuple[str, str] = None,
elastic_mapping=None): zero_aggs: bool = True, elastic_mapping: str = None,
elastic_search_type: str = 'term', elastic_field: str = None):
self.name: str = None self.name: str = None
self.description = description self.description = description
...@@ -182,11 +187,18 @@ class DomainQuantity: ...@@ -182,11 +187,18 @@ class DomainQuantity:
self.order_default = order_default self.order_default = order_default
self.aggregations = aggregations self.aggregations = aggregations
self.metric = metric self.metric = metric
self.zero_aggs = zero_aggs
self.elastic_mapping = elastic_mapping self.elastic_mapping = elastic_mapping
self.elastic_search_type = elastic_search_type
self._elastic_key = elastic_field
if self.elastic_mapping is None: if self.elastic_mapping is None:
self.elastic_mapping = Keyword(multi=self.multi) self.elastic_mapping = Keyword(multi=self.multi)
# Elasticsearch field key for this quantity: the explicit ``elastic_field``
# constructor argument if one was given, otherwise the quantity's own name.
@property
def elastic_field(self) -> str:
return self._elastic_key if self._elastic_key is not None else self.name
class Domain: class Domain:
""" """
...@@ -216,6 +228,32 @@ class Domain: ...@@ -216,6 +228,32 @@ class Domain:
instance: 'Domain' = None instance: 'Domain' = None
instances: Dict[str, 'Domain'] = {} instances: Dict[str, 'Domain'] = {}
# Search quantities that are not specific to a single domain, keyed by the
# name of the search parameter. Each entry states how the quantity is queried
# (``elastic_search_type``, default ``term``), which elastic field it targets
# (``elastic_field``, default: the quantity name) and whether it accepts
# multiple values (``multi``).
base_quantities = dict(
authors=DomainQuantity(
elastic_field='authors.name.keyword', multi=True,
description=(
'Search for the given author. Exact keyword matches in the form "Lastname, '
'Firstname".')),
comment=DomainQuantity(
elastic_search_type='match', multi=True,
description='Search within the comments. This is a text search ala google.'),
paths=DomainQuantity(
elastic_search_type='match', elastic_field='files', multi=True,
description='Search for elements in one of the file paths. The paths are split at all "/".'),
files=DomainQuantity(
elastic_field='files.keyword', multi=True,
description='Search for exact file name with full path.'),
quantities=DomainQuantity(
multi=True,
description='Search for the existence of a certain meta-info quantity'),
upload_id=DomainQuantity(description='Search for the upload_id.'),
calc_id=DomainQuantity(description='Search for the calc_id.'),
pid=DomainQuantity(description='Search for the pid.'),
mainfile=DomainQuantity(description='Search for the mainfile.'),
datasets=DomainQuantity(
elastic_field='datasets.name', multi=True,
description='Search for a particular dataset by name.'))
def __init__( def __init__(
self, name: str, domain_entry_class: Type[CalcWithMetadata], self, name: str, domain_entry_class: Type[CalcWithMetadata],
quantities: Dict[str, DomainQuantity], quantities: Dict[str, DomainQuantity],
...@@ -239,9 +277,11 @@ class Domain: ...@@ -239,9 +277,11 @@ class Domain:
for quantity_name, value in reference_domain_calc.__dict__.items(): for quantity_name, value in reference_domain_calc.__dict__.items():
if not hasattr(reference_general_calc, quantity_name): if not hasattr(reference_general_calc, quantity_name):
quantity = quantities.get(quantity_name, None) quantity = quantities.get(quantity_name, None)
if quantity is None: if quantity is None:
quantity = DomainQuantity() quantity = DomainQuantity()
quantities[quantity_name] = quantity quantities[quantity_name] = quantity
quantity.name = quantity_name quantity.name = quantity_name
quantity.multi = isinstance(value, list) quantity.multi = isinstance(value, list)
self.quantities[quantity.name] = quantity self.quantities[quantity.name] = quantity
...@@ -253,6 +293,11 @@ class Domain: ...@@ -253,6 +293,11 @@ class Domain:
assert any(quantity.order_default for quantity in Domain.instances[name].quantities.values()), \ assert any(quantity.order_default for quantity in Domain.instances[name].quantities.values()), \
'you need to define a order default quantity' 'you need to define a order default quantity'
self.search_quantities = dict(**Domain.base_quantities)
for name, quantity in self.search_quantities.items():
quantity.name = name
self.search_quantities.update(self.quantities)
@property @property
def metrics(self) -> Dict[str, Tuple[str, str]]: def metrics(self) -> Dict[str, Tuple[str, str]]:
""" """
......
...@@ -19,6 +19,7 @@ DFT specific metadata ...@@ -19,6 +19,7 @@ DFT specific metadata
from typing import List from typing import List
import re import re
from elasticsearch_dsl import Integer from elasticsearch_dsl import Integer
import ase.data
from nomad import utils, config from nomad import utils, config
...@@ -161,8 +162,7 @@ Domain('DFT', DFTCalcWithMetadata, quantities=dict( ...@@ -161,8 +162,7 @@ Domain('DFT', DFTCalcWithMetadata, quantities=dict(
order_default=True), order_default=True),
atoms=DomainQuantity( atoms=DomainQuantity(
'The atom labels of all atoms in the simulated system.', 'The atom labels of all atoms in the simulated system.',
# aggregations=len(ase.data.chemical_symbols)), aggregations=len(ase.data.chemical_symbols), multi=True, zero_aggs=False),
aggregations=200, multi=True), # quickfix for bad atom labels
basis_set=DomainQuantity( basis_set=DomainQuantity(
'The used basis set functions.', aggregations=10), 'The used basis set functions.', aggregations=10),
xc_functional=DomainQuantity( xc_functional=DomainQuantity(
......
...@@ -195,31 +195,8 @@ def refresh(): ...@@ -195,31 +195,8 @@ def refresh():
aggregations = datamodel.Domain.instance.aggregations aggregations = datamodel.Domain.instance.aggregations
""" The available aggregations in :func:`aggregate_search` and their maximum aggregation size """ """ The available aggregations in :func:`aggregate_search` and their maximum aggregation size """
search_quantities = { search_quantities = datamodel.Domain.instance.search_quantities
'authors': ('term', 'authors.name.keyword', ( """The available search quantities """
'Search for the given author. Exact keyword matches in the form "Lastname, Firstname".')),
'comment': ('match', 'comment', 'Search within the comments. This is a text search ala google.'),
'paths': ('match', 'files', (
'Search for elements in one of the file paths. The paths are split at all "/".')),
'files': ('term', 'files.keyword', 'Search for exact file name with full path.'),
'quantities': ('term', 'quantities', 'Search for the existence of a certain meta-info quantity'),
'upload_id': ('term', 'upload_id', 'Search for the upload_id.'),
'calc_id': ('term', 'calc_id', 'Search for the calc_id.'),
'pid': ('term', 'pid', 'Search for the pid.'),
'mainfile': ('term', 'mainfile', 'Search for the mainfile.'),
'datasets': ('term', 'datasets.name', 'Search for a particular dataset by name.')
}
"""
The available search quantities in :func:`aggregate_search` as tuples with *search type*,
elastic field and description.
"""
for quantity in datamodel.Domain.instance.quantities.values():
search_spec = ('term', quantity.name, quantity.description)
search_quantities[quantity.name] = search_spec
metrics = { metrics = {
'datasets': ('cardinality', 'datasets.id'), 'datasets': ('cardinality', 'datasets.id'),
...@@ -231,8 +208,7 @@ be used in aggregations, e.g. the sum of all total energy calculations or cardin ...@@ -231,8 +208,7 @@ be used in aggregations, e.g. the sum of all total energy calculations or cardin
all unique geometries. all unique geometries.
""" """
for key, value in datamodel.Domain.instance.metrics.items(): metrics.update(**datamodel.Domain.instance.metrics)
metrics[key] = value
metrics_names = list(metric for metric in metrics.keys()) metrics_names = list(metric for metric in metrics.keys())
...@@ -256,8 +232,8 @@ def _construct_search( ...@@ -256,8 +232,8 @@ def _construct_search(
search = search.query('range', upload_time=dict(gte=time_range[0], lte=time_range[1])) search = search.query('range', upload_time=dict(gte=time_range[0], lte=time_range[1]))
for key, value in search_parameters.items(): for key, value in search_parameters.items():
query_type, field, _ = search_quantities.get(key, (None, None, None)) quantity = search_quantities.get(key, None)
if query_type is None: if quantity is None:
if key in ['page', 'per_page', 'order', 'order_by']: if key in ['page', 'per_page', 'order', 'order_by']:
continue continue
else: else:
...@@ -269,17 +245,9 @@ def _construct_search( ...@@ -269,17 +245,9 @@ def _construct_search(
values = [value] values = [value]
for item in values: for item in values:
quantity = datamodel.Domain.instance.quantities.get(key) search = search.query(Q(quantity.elastic_search_type, **{quantity.elastic_field: item}))
if quantity is not None and quantity.multi:
items = item.split(',')
else:
items = [item]
for item in items:
search = search.query(Q(query_type, **{field: item}))
search = search.source(exclude=['quantities']) search = search.source(exclude=['quantities'])
return search return search
...@@ -292,9 +260,12 @@ def _execute_paginated_search( ...@@ -292,9 +260,12 @@ def _execute_paginated_search(
if order_by not in search_quantities: if order_by not in search_quantities:
raise KeyError('Unknown order quantity %s' % order_by) raise KeyError('Unknown order quantity %s' % order_by)
_, order_by, _ = search_quantities[order_by] order_by_quantity = search_quantities[order_by]
search = search.sort(order_by if order == 1 else '-%s' % order_by) if order == 1:
search = search.sort(order_by_quantity.elastic_field)
else:
search = search.sort('-%s' % order_by_quantity.elastic_field)
paginated_search = search[(page - 1) * per_page: page * per_page] paginated_search = search[(page - 1) * per_page: page * per_page]
response = paginated_search.execute() # pylint: disable=E1101 response = paginated_search.execute() # pylint: disable=E1101
...@@ -443,15 +414,15 @@ def quantity_search( ...@@ -443,15 +414,15 @@ def quantity_search(
""" """
search = _construct_search(**kwargs) search = _construct_search(**kwargs)
for quantity, after in quantities.items(): for quantity_name, after in quantities.items():
_, field, _ = search_quantities[quantity] quantity = search_quantities[quantity_name]
terms = A('terms', field=field) terms = A('terms', field=quantity.elastic_field)
composite = dict(sources={quantity: terms}, size=size) composite = dict(sources={quantity_name: terms}, size=size)
if after is not None: if after is not None:
composite['after'] = {quantity: after} composite['after'] = {quantity_name: after}
search.aggs.bucket(quantity, 'composite', **composite) search.aggs.bucket(quantity_name, 'composite', **composite)
response, entry_results = _execute_paginated_search(search, **kwargs) response, entry_results = _execute_paginated_search(search, **kwargs)
...@@ -524,15 +495,18 @@ def metrics_search( ...@@ -524,15 +495,18 @@ def metrics_search(
metric_kind, field = metrics[metric] metric_kind, field = metrics[metric]
parent.metric(metric, A(metric_kind, field=field)) parent.metric(metric, A(metric_kind, field=field))
for quantity, size in quantities.items(): for quantity_name, size in quantities.items():
# We are using elastic searchs 'composite aggregations' here. We do not really # We are using elastic searchs 'composite aggregations' here. We do not really
# compose aggregations, but only those pseudo composites allow us to use the # compose aggregations, but only those pseudo composites allow us to use the
# 'after' feature that allows to scan through all aggregation values. # 'after' feature that allows to scan through all aggregation values.
terms: Dict[str, Any] = None terms: Dict[str, Any] = None
_, field, _ = search_quantities[quantity] quantity = search_quantities[quantity_name]
terms = A('terms', field=field, size=size, min_doc_count=0, order=dict(_key='asc')) min_doc_count = 0 if quantity.zero_aggs else 1
terms = A(
'terms', field=quantity.elastic_field, size=size, min_doc_count=min_doc_count,
order=dict(_key='asc'))
buckets = search.aggs.bucket(quantity, terms) buckets = search.aggs.bucket(quantity_name, terms)
add_metrics(buckets) add_metrics(buckets)
add_metrics(search.aggs) add_metrics(search.aggs)
...@@ -548,12 +522,12 @@ def metrics_search( ...@@ -548,12 +522,12 @@ def metrics_search(
return result return result
metrics_results = { metrics_results = {
quantity: { quantity_name: {
bucket.key: get_metrics(bucket, bucket.doc_count) bucket.key: get_metrics(bucket, bucket.doc_count)
for bucket in getattr(response.aggregations, quantity).buckets for bucket in getattr(response.aggregations, quantity_name).buckets
} }
for quantity in quantities.keys() for quantity_name in quantities.keys()
if quantity not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side
} }
total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total']) total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total'])
......
...@@ -23,6 +23,7 @@ import inspect ...@@ -23,6 +23,7 @@ import inspect
from passlib.hash import bcrypt from passlib.hash import bcrypt
import datetime import datetime
import os.path import os.path
from urllib.parse import urlencode
from nomad.api.app import rfc3339DateTime from nomad.api.app import rfc3339DateTime
from nomad import coe_repo, search, parsing, files, config from nomad import coe_repo, search, parsing, files, config
...@@ -754,7 +755,7 @@ class TestRepo(): ...@@ -754,7 +755,7 @@ class TestRepo():
(0, 'quantities', 'dos') (0, 'quantities', 'dos')
]) ])
def test_search_quantities(self, client, example_elastic_calcs, no_warn, test_user_auth, calcs, quantity, value): def test_search_quantities(self, client, example_elastic_calcs, no_warn, test_user_auth, calcs, quantity, value):
query_string = '%s=%s' % (quantity, ','.join(value) if isinstance(value, list) else value) query_string = urlencode({quantity: value}, doseq=True)
rv = client.get('/repo/?%s' % query_string, headers=test_user_auth) rv = client.get('/repo/?%s' % query_string, headers=test_user_auth)
data = self.assert_search(rv, calcs) data = self.assert_search(rv, calcs)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment