Commit c728cf4b authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Completed optimade filterparser and search tests.

parent 76190167
Subproject commit 47394ea099e95d8ee19f1f1a0b0f6d26aea33036 Subproject commit 58d6b6ea63a4758f719c73466a749eae2cd012e6
...@@ -15,6 +15,8 @@ ...@@ -15,6 +15,8 @@
from flask import Blueprint from flask import Blueprint
from flask_restplus import Api from flask_restplus import Api
from .filterparser import parse_filter
""" """
The optimade implementation of NOMAD. The optimade implementation of NOMAD.
""" """
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from optimade.filterparser import LarkParser
import lark
from elasticsearch_dsl import Q, Text, Keyword, Integer
import ase.data
from nomad.metainfo.optimade import OptimadeStructureEntry
from nomad.metainfo import Quantity
class FilterException(Exception):
    """ Raised on parsing a filter expression with syntactic or semantic errors. """
# Maps the comparison operators of the filter language to the keys of an elastic
# search range query.
_cmp_operators = {'>': 'gt', '>=': 'gte', '<': 'lt', '<=': 'lte'}

# Maps each comparison operator to its mirrored counterpart. It is used when the
# operands of a comparison are swapped, e.g. ``2 <= nelements`` is turned into
# ``nelements >= 2``. Every value has to be a key of _cmp_operators.
_rev_cmp_operators = {'>': '<', '>=': '<=', '<': '>', '<=': '>='}

# Maps the qualifiers of HAS expressions to the clause names of an elastic search
# bool query.
_has_operators = {'ALL': 'must', 'ANY': 'should'}

# Maps the name of a quantity that LENGTH can be applied to onto the name of the
# quantity that is queried instead.
_length_quantities = {
    'elements': 'nelements',
    'elements_ratios': 'nelements',
    'dimension_types': 'dimension_types'}
class Transformer(lark.Transformer):
    """ Transformer for the Lark parser generator used for the filterparser.

    It translates the parse tree into an elastic search query. The public methods are
    named after the grammar rules they handle; each receives the already transformed
    children of the respective rule as ``args``.
    """

    def _field(self, quantity, nested=None):
        """ Returns the elastic search field name for the given quantity definition.

        Arguments:
            quantity: The quantity definition, only its ``name`` is used.
            nested: Optional name that is put in front of the quantity name, separated
                by a dot.
        """
        optimade_field_name = quantity.name
        if nested is not None:
            optimade_field_name = '%s.%s' % (nested, optimade_field_name)

        return 'optimade.%s' % optimade_field_name

    def _order_terms(self, l, o, r):
        """ Brings a comparison into the order (quantity, operator, value).

        Raises:
            Exception: If both or none of the two operands are quantities.
        """
        if isinstance(l, Quantity):
            if isinstance(r, Quantity):
                raise Exception('Cannot compare two quantities: %s, %s' % (l.name, r.name))

            return l, o, r

        if isinstance(r, Quantity):
            # The operands get swapped, therefore the operator has to be mirrored.
            # Operators without a mirrored counterpart (e.g. =, !=) stay as they are.
            o = _rev_cmp_operators.get(o, o)
            return r, o, l

        raise Exception('Cannot compare two values: %s, %s' % (str(l), str(r)))

    def _query(self, quantity, o, value, nested=None):
        """ Creates the elastic search query for a single quantity, operator, value triple.

        Comparison operators become range queries. Equality and inequality become match
        queries for Text fields and term queries for Keyword and Integer fields.

        Raises:
            NotImplementedError: If the elastic field type of the quantity is not supported.
            Exception: If the quantity has no elastic annotation or the operator is unknown.
        """
        field = self._field(quantity, nested=nested)

        if o in _cmp_operators:
            return Q('range', **{field: {_cmp_operators[o]: value}})

        elastic_annotation = quantity.m_annotations.get('elastic', None)
        if elastic_annotation is None:
            # Fail with a readable message instead of a TypeError on the subscript below.
            raise Exception('%s is not supported in queries' % quantity.name)

        if elastic_annotation['type'] == Text:
            query_type = 'match'
        elif elastic_annotation['type'] in [Keyword, Integer]:
            query_type = 'term'
        else:
            raise NotImplementedError('Quantity has unsupported ES field type')

        if o in ['=', '']:
            return Q(query_type, **{field: value})

        if o == '!=':
            return ~Q(query_type, **{field: value})  # pylint: disable=invalid-unary-operand-type

        raise Exception('Unknown operator %s' % o)

    def _has_query(self, quantities, predicates):
        """ Creates the query for one tuple of quantities and one tuple of predicates.

        A single quantity results in a plain query. Multiple quantities are only
        supported for elements and elements_ratios and result in a nested query.

        Raises:
            Exception: If both tuples differ in length or if tuples are used with
                unsupported quantities.
        """
        if len(quantities) != len(predicates):
            # The tuples contain quantity definitions and (operator, value) pairs; both
            # have to be converted to str before they can be joined.
            raise Exception(
                'Tuple length does not match: %s <o> %s ' % (
                    ':'.join(str(getattr(quantity, 'name', quantity)) for quantity in quantities),
                    ':'.join('%s%s' % (o, value) for o, value in predicates)))

        if len(quantities) == 1:
            o, value = predicates[0]
            return self._query(quantities[0], o, value)

        if any(quantity.name not in ['elements', 'elements_ratios'] for quantity in quantities):
            raise Exception('Expression with tuples are only supported for elements and elements_ratios')

        queries = [
            self._query(field, o, value, nested='elements_ratios')
            for field, (o, value) in zip(quantities, predicates)]

        return Q('nested', path='optimade.elements_ratios', query=dict(bool=dict(must=queries)))

    def _wildcard_query(self, quantity, wildcard):
        """ Creates a wildcard query with the given pattern on the field of the quantity. """
        return Q('wildcard', **{self._field(quantity): wildcard})

    def __default__(self, tree, children, *args, **kwargs):
        """ Default behavior for rules that only replace one symbol with another """
        return children[0]

    def and_expr(self, args):
        """ Combines two queries with AND; a single child is passed through. """
        if len(args) == 1:
            return args[0]

        l, r = args
        return l & r

    def or_expr(self, args):
        """ Combines two queries with OR; a single child is passed through. """
        if len(args) == 1:
            return args[0]

        l, r = args
        return l | r

    def not_expr(self, args):
        """ Negates the single child query. """
        o, = args
        return ~o

    def cmp_op(self, args):
        """ Creates the query for a comparison of a quantity with a value. """
        l, o, r = args
        field, o, value = self._order_terms(l, o, r)
        return self._query(field, o, value)

    def has_op(self, args):
        """ Creates the query for a HAS expression with one tuple of predicates. """
        quantities, predicates = args
        return self._has_query(quantities, predicates)

    def has_list_op(self, args):
        """ Creates the query for a HAS ALL or HAS ANY expression.

        One query is created for each entry of the predicates list. The queries are
        combined in a bool query with the clause given by _has_operators.

        Raises:
            Exception: If the qualifier is not a key of _has_operators.
        """
        quantities, o, predicates_list = args
        queries = [
            self._has_query(quantities, predicates)
            for predicates in predicates_list]

        if o in _has_operators:
            return Q('bool', **{_has_operators[o]: queries})

        raise Exception('Unknown operator %s' % o)

    def has_only_op(self, args):
        """ Creates the query for a HAS ONLY expression.

        The given chemical symbols are sorted by atomic number and concatenated. The
        result is used in a term query on only_atoms.

        Raises:
            Exception: If the quantity is not elements, if tuples, operators, or
                non-string values are used, or if a value is not a chemical symbol.
        """
        quantity, lst = args

        if quantity.name != 'elements':
            raise Exception('HAS ONLY is only supported for elements')

        def values():
            for predicates in lst:
                if len(predicates) != 1:
                    raise Exception('Tuples not supported in HAS ONLY')
                op, value = predicates[0]
                if op != '':
                    raise Exception('Predicated not supported in HAS ONLY')
                if not isinstance(value, str):
                    raise Exception('Only strings supported in HAS ONLY')
                yield value

        try:
            order_numbers = sorted(ase.data.atomic_numbers[element] for element in values())
            value = ''.join(ase.data.chemical_symbols[number] for number in order_numbers)
        except KeyError as e:
            raise Exception('Not a chemical symbol: %s' % str(e)) from e

        return Q('term', only_atoms=value)

    def length(self, args):
        """ Replaces LENGTH of a quantity with the quantity given by _length_quantities.

        Raises:
            Exception: If LENGTH is not supported for the given quantity.
        """
        quantity, = args
        if quantity.name not in _length_quantities:
            raise Exception('LENGTH is not supported for %s' % quantity.name)

        return OptimadeStructureEntry.m_section.quantities[_length_quantities[quantity.name]]

    def known_op(self, args):
        """ Creates an exists query for KNOWN and its negation for UNKNOWN. """
        quantity, qualifier = args
        query = Q('exists', field=self._field(quantity))
        if qualifier == 'KNOWN':
            return query

        if qualifier == 'UNKNOWN':
            return ~query  # pylint: disable=invalid-unary-operand-type

        raise NotImplementedError

    def contains_op(self, args):
        """ Creates a wildcard query that matches the value anywhere in the field. """
        quantity, value = args
        return self._wildcard_query(quantity, '*%s*' % value)

    def starts_op(self, args):
        """ Creates a wildcard query that matches the value at the start of the field. """
        quantity, value = args
        return self._wildcard_query(quantity, '%s*' % value)

    def ends_op(self, args):
        """ Creates a wildcard query that matches the value at the end of the field. """
        quantity, value = args
        return self._wildcard_query(quantity, '*%s' % value)

    # The method name shadows the builtin only as a class attribute. Within the method
    # bodies of this class, ``list`` still refers to the builtin.
    def list(self, args):
        """ Returns the children as a python list. """
        return list(args)

    def quantity_tuple(self, args):
        """ Returns the children as a python list. """
        return list(args)

    def predicate_tuple(self, args):
        """ Returns the children as a python list. """
        return list(args)

    def predicate(self, args):
        """ Returns an (operator, value) pair; the operator is '' if only a value is given. """
        if len(args) == 1:
            return '', args[0]

        return args[0], args[1]

    def quantity(self, args):
        """ Resolves a quantity name to its definition in OptimadeStructureEntry.

        Raises:
            Exception: If the quantity is unknown or has no elastic annotation.
        """
        quantity_name = args[0]
        quantity_def = OptimadeStructureEntry.m_section.quantities.get(quantity_name, None)

        if quantity_def is None:
            raise Exception('%s is not a known quantity' % quantity_name)

        elastic_annotation = quantity_def.m_annotations.get('elastic', None)
        if elastic_annotation is None:
            raise Exception('%s is not supported in queries' % quantity_name)

        return quantity_def

    def int_literal(self, args):
        """ Converts the literal to int. """
        return int(args[0])

    def float_literal(self, args):
        """ Converts the literal to float. """
        return float(args[0])

    def string_literal(self, args):
        """ Removes the surrounding double quotes from the literal. """
        return args[0].strip('"')
# Module-level parser and transformer instances. They are created once when the
# module is imported and are used by every call of parse_filter.
_parser = LarkParser(version=(0, 10, 0))
_transformer = Transformer()
def parse_filter(filter_str: str) -> Q:
    """ Parses the given optimade filter str and returns a suitable elastic search query.

    Arguments:
        filter_str: Can be direct user input with no prior processing.

    Raises:
        FilterException: If the given str cannot be parsed, or if there are any semantic
            errors in the given expression. The original exception is kept as the
            cause of the FilterException.
    """
    try:
        parse_tree = _parser.parse(filter_str)
    except Exception as e:
        raise FilterException('Syntax error: %s' % str(e)) from e

    try:
        query = _transformer.transform(parse_tree)
    except Exception as e:
        raise FilterException('Semantic error: %s' % str(e)) from e

    return query
from ase.data import chemical_symbols from ase.data import chemical_symbols
from elasticsearch_dsl import Keyword, Integer, Float, Text, InnerDoc, Nested from elasticsearch_dsl import Keyword, Integer, Float, InnerDoc, Nested
import numpy as np import numpy as np
from nomad.metainfo import MObject, Section, Quantity, Enum, units from nomad.metainfo import MObject, Section, Quantity, Enum, units
...@@ -63,7 +63,7 @@ class OptimadeStructureEntry(MObject): ...@@ -63,7 +63,7 @@ class OptimadeStructureEntry(MObject):
chemical_formula_descriptive = Quantity( chemical_formula_descriptive = Quantity(
type=str, type=str,
links=optimade_links('h.6.2.4'), links=optimade_links('h.6.2.4'),
a_elastic=dict(type=Text), a_elastic=dict(type=Keyword),
a_optimade=Optimade(query=True, entry=True), a_optimade=Optimade(query=True, entry=True),
description=''' description='''
The chemical formula for a structure as a string in a form chosen by the API The chemical formula for a structure as a string in a form chosen by the API
...@@ -73,7 +73,7 @@ class OptimadeStructureEntry(MObject): ...@@ -73,7 +73,7 @@ class OptimadeStructureEntry(MObject):
chemical_formula_reduced = Quantity( chemical_formula_reduced = Quantity(
type=str, type=str,
links=optimade_links('h.6.2.5'), links=optimade_links('h.6.2.5'),
a_elastic=dict(type=Text), a_elastic=dict(type=Keyword),
a_optimade=Optimade(query=True, entry=True), a_optimade=Optimade(query=True, entry=True),
description=''' description='''
The reduced chemical formula for a structure as a string with element symbols and The reduced chemical formula for a structure as a string with element symbols and
...@@ -83,7 +83,7 @@ class OptimadeStructureEntry(MObject): ...@@ -83,7 +83,7 @@ class OptimadeStructureEntry(MObject):
chemical_formula_hill = Quantity( chemical_formula_hill = Quantity(
type=str, type=str,
links=optimade_links('h.6.2.6'), links=optimade_links('h.6.2.6'),
a_elastic=dict(type=Text), a_elastic=dict(type=Keyword),
a_optimade=Optimade(query=True, entry=False), a_optimade=Optimade(query=True, entry=False),
description=''' description='''
The chemical formula for a structure in Hill form with element symbols followed by The chemical formula for a structure in Hill form with element symbols followed by
...@@ -93,7 +93,7 @@ class OptimadeStructureEntry(MObject): ...@@ -93,7 +93,7 @@ class OptimadeStructureEntry(MObject):
chemical_formula_anonymous = Quantity( chemical_formula_anonymous = Quantity(
type=str, type=str,
links=optimade_links('h.6.2.7'), links=optimade_links('h.6.2.7'),
a_elastic=dict(type=Text), a_elastic=dict(type=Keyword),
a_optimade=Optimade(query=True, entry=True), a_optimade=Optimade(query=True, entry=True),
description=''' description='''
The anonymous formula is the chemical_formula_reduced, but where the elements are The anonymous formula is the chemical_formula_reduced, but where the elements are
......
...@@ -56,6 +56,7 @@ class OptimadeNormalizer(SystemBasedNormalizer): ...@@ -56,6 +56,7 @@ class OptimadeNormalizer(SystemBasedNormalizer):
# elements # elements
atoms = normalized_atom_labels(nomad_species) atoms = normalized_atom_labels(nomad_species)
atom_count = len(atoms)
atom_counts: Dict[str, int] = {} atom_counts: Dict[str, int] = {}
for atom in atoms: for atom in atoms:
current = atom_counts.setdefault(atom, 0) current = atom_counts.setdefault(atom, 0)
...@@ -66,7 +67,7 @@ class OptimadeNormalizer(SystemBasedNormalizer): ...@@ -66,7 +67,7 @@ class OptimadeNormalizer(SystemBasedNormalizer):
optimade.elements.sort() optimade.elements.sort()
optimade.nelements = len(optimade.elements) optimade.nelements = len(optimade.elements)
optimade.elements_ratios = [ optimade.elements_ratios = [
atom_counts[element] / optimade.nelements atom_counts[element] / atom_count
for element in optimade.elements] for element in optimade.elements]
# formulas # formulas
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import json import json
from optimade.filterparser import LarkParser import numpy as np
from lark import Transformer import pytest
from elasticsearch_dsl import Q, Text, Keyword, Integer
import ase.data
from nomad.processing import Upload from nomad.processing import Upload
from nomad.search import SearchRequest from nomad import search
from nomad.metainfo.optimade import OptimadeStructureEntry from nomad.parsing import LocalBackend
from nomad.metainfo import Quantity from nomad.datamodel import CalcWithMetadata
from nomad.app.optimade import parse_filter
from tests.test_normalizing import run_normalize
from tests.conftest import clear_elastic
def test_get_entry(published: Upload): def test_get_entry(published: Upload):
...@@ -17,223 +35,92 @@ def test_get_entry(published: Upload): ...@@ -17,223 +35,92 @@ def test_get_entry(published: Upload):
data = json.load(f) data = json.load(f)
assert 'OptimadeStructureEntry' in data assert 'OptimadeStructureEntry' in data
search_result = SearchRequest().search_parameter('calc_id', calc_id).execute_paginated()['results'][0] search_result = search.SearchRequest().search_parameter('calc_id', calc_id).execute_paginated()['results'][0]
assert 'optimade' in search_result assert 'optimade' in search_result
class ESTransformer(Transformer): def create_test_structure(meta_info, id: int, h: int, o: int, extra: List[str], periodicity: int):
atom_labels = ['H' for i in range(0, h)] + ['O' for i in range(0, o)] + extra
cmp_operators = {'>': 'gt', '>=': 'gte', '<': 'lt', '<=': 'lte'} test_vector = np.array([0, 0, 0])
has_operators = {'ALL': 'must', 'ANY': 'should'}
length_quantities = {'elements': 'nelements', 'elements_rations': 'nelements', 'dimension_types': 'dimension_types'} backend = LocalBackend(meta_info, False, True) # type: ignore
backend.openSection('section_run')
def _field(self, quantity, nested=None): backend.addValue('program_name', 'test_code')
optimade_field_name = quantity backend.openSection('section_system')
if nested is not None:
optimade_field_name = '%s.%s' % (nested, optimade_field_name) backend.addArrayValues('atom_labels', np.array(atom_labels))
return 'optimade.%s' % optimade_field_name backend.addArrayValues(
'atom_positions', np.array([test_vector for i in range(0, len(atom_labels))]))
def _order_terms(self, l, r): backend.addArrayValues(
if isinstance(l, Quantity): 'lattice_vectors', np.array([test_vector, test_vector, test_vector]))
if isinstance(r, Quantity): backend.addArrayValues(
raise Exception('Cannot compare two quantities: %s, %s' % (l.name, r.name)) 'configuration_periodic_dimensions',
else: np.array([True for _ in range(0, periodicity)] + [False for _ in range(periodicity, 3)]))
return l, r
else: backend.closeSection('section_system', 0)
if isinstance(r, Quantity): backend.closeSection('section_run', 0)
return r, l
else: backend = run_normalize(backend)
raise Exception('Cannot compare two values: %s, %s' % (str(l), str(l))) calc = CalcWithMetadata(
upload_id='test_uload_id', calc_id='test_calc_id_%d' % id, mainfile='test_mainfile')
def __default__(self, tree, children, *args, **kwargs): calc.apply_domain_metadata(backend)
return children[0] search.Entry.from_calc_with_metadata(calc).save()
def and_expr(self, args):
if len(args) == 1: @pytest.fixture(scope='module')
return args[0] def example_structures(meta_info, elastic_infra):
l, r = args clear_elastic(elastic_infra)
return l & r create_test_structure(meta_info, 1, 2, 1, [], 0)
create_test_structure(meta_info, 2, 2, 1, ['C'], 0)
def or_expr(self, args): create_test_structure(meta_info, 3, 2, 1, [], 1)
if len(args) == 1: create_test_structure(meta_info, 4, 1, 1, [], 0)
return args[0] search.refresh()
l, r = args
return l | r yield
clear_elastic(elastic_infra)
def not_expr(self, args):
if len(args) == 1:
return args[0] @pytest.mark.parametrize('query, results', [
o, = args ('nelements > 1', 4),
return ~o ('nelements >= 2', 4),
('nelements > 2', 1),
def _query(self, quantity, o, value, nested=None): ('nelements < 4', 4),
field = self._field(quantity, nested=nested) ('nelements < 3', 3),
if o in ESTransformer.cmp_operators: ('nelements <= 3', 4),
return Q('range', **{field: {ESTransformer.cmp_operators[o]: value}}) ('nelements != 2', 1),
('1 < nelements', 4),
elastic_annotation = quantity.m_annotations.get('elastic', None) ('elements HAS "H"', 4),
if elastic_annotation['type'] == Text: ('elements HAS ALL "H", "O"', 4),
query_type = 'match' ('elements HAS ALL "H", "C"', 1),
elif elastic_annotation['type'] in [Keyword, Integer]: ('elements HAS ANY "H", "C"', 4),
query_type = 'term' ('elements HAS ANY "C"', 1),
else: ('elements HAS ONLY "C"', 0),
raise NotImplementedError('Quantity has unsupported ES field type') ('elements HAS ONLY "H", "O"', 3),
('elements:elements_ratios HAS "H":>0.66', 2),
if o in ['=', '']: ('elements:elements_ratios HAS ALL "O":>0.33', 3),
return Q(query_type, **{field: value}) ('elements:elements_ratios HAS ALL "O":>0.33,"O":<0.34', 2),
('elements IS KNOWN', 4),
if o == '!=': ('elements IS UNKNOWN', 0),
return ~Q(query_type, **{field: value}) # pylint: disable=invalid-unary-operand-type ('chemical_formula_reduced = "H2O"', 2),
('chemical_formula_reduced CONTAINS "H2"', 3),
raise Exception('Unknown operator %s' % o) ('chemical_formula_reduced CONTAINS "H"', 4),
('chemical_formula_reduced CONTAINS "C"', 1),
def cmp_op(self, args): ('chemical_formula_reduced STARTS "H2"', 3),
l, o, r = args ('chemical_formula_reduced STARTS WITH "H2"', 3),
field, value = self._order_terms(l, r) ('chemical_formula_reduced ENDS WITH "C"', 1),
return self._query(field, o, value) ('chemical_formula_reduced ENDS "C"', 1),
('LENGTH elements = 2', 3),
def has_op(self, args): ('LENGTH elements = 3', 1),
quantities, predicates = args ('LENGTH dimension_types = 0', 3),
return self._has_query(quantities, predicates) ('LENGTH dimension_types = 1', 1),
('nelements = 2 AND LENGTH dimension_types = 1', 1),
def _has_query(self, quantities, predicates): ('nelements = 3 AND LENGTH dimension_types = 1', 0),
if len(quantities) != len(predicates): ('nelements = 3 OR LENGTH dimension_types = 1', 2),
raise Exception( ('nelements > 1 OR LENGTH dimension_types = 1 AND nelements = 2', 4),
'Tuple length does not match: %s <o> %s ' % ('(nelements > 1 OR LENGTH dimension_types = 1) AND nelements = 2', 3),
(':'.join(quantities), ':'.join(predicates))) ('NOT LENGTH dimension_types = 1', 3)
])
if len(quantities) == 1: def test_optimade_parser(example_structures, query, results):
o, value = predicates[0] query = parse_filter(query)
return self._query(quantities[0], o, value) result = search.SearchRequest(query=query).execute_paginated()
assert result['pagination']['total'] == results
if any(quantity.name not in ['elements', 'elements_ratios'] for quantity in quantities):
raise Exception('Expression with tuples are only supported for elements and elements_positions')
queries = [
self._query(field, o, value, nested='elements_ratios')
for field, (o, value) in zip(quantities, predicates)]
return Q('nested', path='elements_ratios', query=dict(bool=dict(must=queries)))
def has_list_op(self, args):
quantities, o, predicates_list = args
queries = [
self._has_query(quantities, predicates)
for predicates in predicates_list]
if o in ESTransformer.has_operators:
return Q('bool', **{ESTransformer.has_operators[o]: queries})
raise Exception('Unknown operator %s' % o)
def has_only_op(self, args):
quantity, lst = args
if quantity.name != 'elements':