Commit 64e58cf2 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Refactored search and domain quantities.

parent 5b8641d5
......@@ -97,9 +97,10 @@ def add_common_parameters(request_parser):
'until_time', type=lambda x: rfc3339DateTime.parse(x),
help='A yyyy-MM-ddTHH:mm:ss (RFC3339) maximum entry time (e.g. upload time)')
for search_quantity in search.search_quantities.keys():
_, _, description = search.search_quantities[search_quantity]
request_parser.add_argument(search_quantity, type=str, help=description)
for quantity in search.search_quantities.values():
request_parser.add_argument(
quantity.name, type=str, help=quantity.description,
action='append' if quantity.multi else None)
repo_request_parser = pagination_request_parser.copy()
......
......@@ -168,13 +168,18 @@ class DomainQuantity:
0 (the default) means no aggregations.
metric: Indicates that this quantity should be used as search metric. Values need
to be tuples with metric name and elastic aggregation (e.g. sum, cardinality)
zero_aggs: Return aggregation values for values with zero hits in the search. Default
is with zero aggregations.
elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``.
elastic_search_type: An optional elasticsearch search type. Default is ``term``.
elastic_field: An optional elasticsearch key. Default is the name of the quantity.
"""
def __init__(
self, description: str = None, multi: bool = False, aggregations: int = 0,
order_default: bool = False, metric: Tuple[str, str] = None,
elastic_mapping=None):
zero_aggs: bool = True, elastic_mapping: str = None,
elastic_search_type: str = 'term', elastic_field: str = None):
self.name: str = None
self.description = description
......@@ -182,11 +187,18 @@ class DomainQuantity:
self.order_default = order_default
self.aggregations = aggregations
self.metric = metric
self.zero_aggs = zero_aggs
self.elastic_mapping = elastic_mapping
self.elastic_search_type = elastic_search_type
self._elastic_key = elastic_field
if self.elastic_mapping is None:
self.elastic_mapping = Keyword(multi=self.multi)
@property
def elastic_field(self) -> str:
    """The elasticsearch field to query; defaults to the quantity's own name
    when no explicit ``elastic_field`` was passed to the constructor."""
    if self._elastic_key is None:
        return self.name
    return self._elastic_key
class Domain:
"""
......@@ -216,6 +228,32 @@ class Domain:
# Singleton-style registry: the currently active domain and all registered domains by name.
instance: 'Domain' = None
instances: Dict[str, 'Domain'] = {}
# Search quantities shared by every domain; individual domains extend these
# with their own quantities (see DomainQuantity for the keyword semantics:
# elastic_field overrides the ES field name, multi allows repeated values,
# elastic_search_type picks the ES query type, default 'term').
base_quantities = dict(
    authors=DomainQuantity(
        elastic_field='authors.name.keyword', multi=True,
        description=(
            'Search for the given author. Exact keyword matches in the form "Lastname, '
            'Firstname".')),
    comment=DomainQuantity(
        # 'match' gives full-text (analyzed) search rather than exact keyword matching
        elastic_search_type='match', multi=True,
        description='Search within the comments. This is a text search ala google.'),
    paths=DomainQuantity(
        elastic_search_type='match', elastic_field='files', multi=True,
        description='Search for elements in one of the file paths. The paths are split at all "/".'),
    files=DomainQuantity(
        # .keyword sub-field: exact, non-analyzed match on the full path
        elastic_field='files.keyword', multi=True,
        description='Search for exact file name with full path.'),
    quantities=DomainQuantity(
        multi=True,
        description='Search for the existence of a certain meta-info quantity'),
    upload_id=DomainQuantity(description='Search for the upload_id.'),
    calc_id=DomainQuantity(description='Search for the calc_id.'),
    pid=DomainQuantity(description='Search for the pid.'),
    mainfile=DomainQuantity(description='Search for the mainfile.'),
    datasets=DomainQuantity(
        elastic_field='datasets.name', multi=True,
        description='Search for a particular dataset by name.'))
def __init__(
self, name: str, domain_entry_class: Type[CalcWithMetadata],
quantities: Dict[str, DomainQuantity],
......@@ -239,9 +277,11 @@ class Domain:
for quantity_name, value in reference_domain_calc.__dict__.items():
if not hasattr(reference_general_calc, quantity_name):
quantity = quantities.get(quantity_name, None)
if quantity is None:
quantity = DomainQuantity()
quantities[quantity_name] = quantity
quantity.name = quantity_name
quantity.multi = isinstance(value, list)
self.quantities[quantity.name] = quantity
......@@ -253,6 +293,11 @@ class Domain:
assert any(quantity.order_default for quantity in Domain.instances[name].quantities.values()), \
'you need to define a order default quantity'
self.search_quantities = dict(**Domain.base_quantities)
for name, quantity in self.search_quantities.items():
quantity.name = name
self.search_quantities.update(self.quantities)
@property
def metrics(self) -> Dict[str, Tuple[str, str]]:
"""
......
......@@ -19,6 +19,7 @@ DFT specific metadata
from typing import List
import re
from elasticsearch_dsl import Integer
import ase.data
from nomad import utils, config
......@@ -161,8 +162,7 @@ Domain('DFT', DFTCalcWithMetadata, quantities=dict(
order_default=True),
atoms=DomainQuantity(
'The atom labels of all atoms in the simulated system.',
# aggregations=len(ase.data.chemical_symbols)),
aggregations=200, multi=True), # quickfix for bad atom labels
aggregations=len(ase.data.chemical_symbols), multi=True, zero_aggs=False),
basis_set=DomainQuantity(
'The used basis set functions.', aggregations=10),
xc_functional=DomainQuantity(
......
......@@ -195,31 +195,8 @@ def refresh():
aggregations = datamodel.Domain.instance.aggregations
""" The available aggregations in :func:`aggregate_search` and their maximum aggregation size """
search_quantities = {
'authors': ('term', 'authors.name.keyword', (
'Search for the given author. Exact keyword matches in the form "Lastname, Firstname".')),
'comment': ('match', 'comment', 'Search within the comments. This is a text search ala google.'),
'paths': ('match', 'files', (
'Search for elements in one of the file paths. The paths are split at all "/".')),
'files': ('term', 'files.keyword', 'Search for exact file name with full path.'),
'quantities': ('term', 'quantities', 'Search for the existence of a certain meta-info quantity'),
'upload_id': ('term', 'upload_id', 'Search for the upload_id.'),
'calc_id': ('term', 'calc_id', 'Search for the calc_id.'),
'pid': ('term', 'pid', 'Search for the pid.'),
'mainfile': ('term', 'mainfile', 'Search for the mainfile.'),
'datasets': ('term', 'datasets.name', 'Search for a particular dataset by name.')
}
"""
The available search quantities in :func:`aggregate_search` as tuples with *search type*,
elastic field and description.
"""
for quantity in datamodel.Domain.instance.quantities.values():
search_spec = ('term', quantity.name, quantity.description)
search_quantities[quantity.name] = search_spec
search_quantities = datamodel.Domain.instance.search_quantities
"""The available search quantities """
metrics = {
'datasets': ('cardinality', 'datasets.id'),
......@@ -231,8 +208,7 @@ be used in aggregations, e.g. the sum of all total energy calculations or cardin
all unique geometries.
"""
for key, value in datamodel.Domain.instance.metrics.items():
metrics[key] = value
metrics.update(**datamodel.Domain.instance.metrics)
metrics_names = list(metric for metric in metrics.keys())
......@@ -256,8 +232,8 @@ def _construct_search(
search = search.query('range', upload_time=dict(gte=time_range[0], lte=time_range[1]))
for key, value in search_parameters.items():
query_type, field, _ = search_quantities.get(key, (None, None, None))
if query_type is None:
quantity = search_quantities.get(key, None)
if quantity is None:
if key in ['page', 'per_page', 'order', 'order_by']:
continue
else:
......@@ -269,17 +245,9 @@ def _construct_search(
values = [value]
for item in values:
quantity = datamodel.Domain.instance.quantities.get(key)
if quantity is not None and quantity.multi:
items = item.split(',')
else:
items = [item]
for item in items:
search = search.query(Q(query_type, **{field: item}))
search = search.query(Q(quantity.elastic_search_type, **{quantity.elastic_field: item}))
search = search.source(exclude=['quantities'])
return search
......@@ -292,9 +260,12 @@ def _execute_paginated_search(
if order_by not in search_quantities:
raise KeyError('Unknown order quantity %s' % order_by)
_, order_by, _ = search_quantities[order_by]
order_by_quantity = search_quantities[order_by]
search = search.sort(order_by if order == 1 else '-%s' % order_by)
if order == 1:
search = search.sort(order_by_quantity.elastic_field)
else:
search = search.sort('-%s' % order_by_quantity.elastic_field)
paginated_search = search[(page - 1) * per_page: page * per_page]
response = paginated_search.execute() # pylint: disable=E1101
......@@ -443,15 +414,15 @@ def quantity_search(
"""
search = _construct_search(**kwargs)
for quantity, after in quantities.items():
_, field, _ = search_quantities[quantity]
terms = A('terms', field=field)
for quantity_name, after in quantities.items():
quantity = search_quantities[quantity_name]
terms = A('terms', field=quantity.elastic_field)
composite = dict(sources={quantity: terms}, size=size)
composite = dict(sources={quantity_name: terms}, size=size)
if after is not None:
composite['after'] = {quantity: after}
composite['after'] = {quantity_name: after}
search.aggs.bucket(quantity, 'composite', **composite)
search.aggs.bucket(quantity_name, 'composite', **composite)
response, entry_results = _execute_paginated_search(search, **kwargs)
......@@ -524,15 +495,18 @@ def metrics_search(
metric_kind, field = metrics[metric]
parent.metric(metric, A(metric_kind, field=field))
for quantity, size in quantities.items():
for quantity_name, size in quantities.items():
# We are using elastic searchs 'composite aggregations' here. We do not really
# compose aggregations, but only those pseudo composites allow us to use the
# 'after' feature that allows to scan through all aggregation values.
terms: Dict[str, Any] = None
_, field, _ = search_quantities[quantity]
terms = A('terms', field=field, size=size, min_doc_count=0, order=dict(_key='asc'))
quantity = search_quantities[quantity_name]
min_doc_count = 0 if quantity.zero_aggs else 1
terms = A(
'terms', field=quantity.elastic_field, size=size, min_doc_count=min_doc_count,
order=dict(_key='asc'))
buckets = search.aggs.bucket(quantity, terms)
buckets = search.aggs.bucket(quantity_name, terms)
add_metrics(buckets)
add_metrics(search.aggs)
......@@ -548,12 +522,12 @@ def metrics_search(
return result
metrics_results = {
quantity: {
quantity_name: {
bucket.key: get_metrics(bucket, bucket.doc_count)
for bucket in getattr(response.aggregations, quantity).buckets
for bucket in getattr(response.aggregations, quantity_name).buckets
}
for quantity in quantities.keys()
if quantity not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side
for quantity_name in quantities.keys()
if quantity_name not in metrics_names # ES aggs for total metrics, and aggs for quantities stand side by side
}
total_metrics_result = get_metrics(response.aggregations, entry_results['pagination']['total'])
......
......@@ -23,6 +23,7 @@ import inspect
from passlib.hash import bcrypt
import datetime
import os.path
from urllib.parse import urlencode
from nomad.api.app import rfc3339DateTime
from nomad import coe_repo, search, parsing, files, config
......@@ -754,7 +755,7 @@ class TestRepo():
(0, 'quantities', 'dos')
])
def test_search_quantities(self, client, example_elastic_calcs, no_warn, test_user_auth, calcs, quantity, value):
    """Query the /repo/ endpoint with a single search quantity and check the result count.

    Uses urlencode(doseq=True) so list values become repeated query parameters
    and special characters in values are properly URL-escaped. (The previous
    hand-built '%s=%s' string was dead code — immediately overwritten — and
    did not escape values.)
    """
    query_string = urlencode({quantity: value}, doseq=True)
    rv = client.get('/repo/?%s' % query_string, headers=test_user_auth)
    data = self.assert_search(rv, calcs)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment