def optimade_chemical_formula_anonymous(formula: str):
    '''
    Convert a chemical formula into its anonymous form.

    The element counts are sorted in descending order and the elements are
    replaced, left to right, by the labels ``A``, ``B``, ``C``, ... A count of
    one is omitted, e.g. ``Na3Cl2H`` becomes ``A3B2C``.

    Arguments:
        formula: A formula string that ``ase.formula.Formula`` can parse, or None.

    Returns:
        The anonymous formula. ``None`` is passed through unchanged. If the
        formula cannot be converted, the input is returned unchanged (best effort).
    '''
    if formula is None:
        return formula

    try:
        counts = sorted(ase.formula.Formula(formula).count().values(), reverse=True)
        # NOTE: only 26 labels are available. More distinct elements raise an
        # IndexError here, which ends up in the best-effort fallback below.
        return ''.join(
            ascii_uppercase[index] + (str(count) if count > 1 else '')
            for index, count in enumerate(counts))
    except Exception:
        # Deliberate best effort: unparsable input is handed back as it came in.
        return formula


def optimade_chemical_formula_hill(formula: str):
    '''
    Convert a chemical formula into Hill notation.

    If the formula contains carbon, ``C`` comes first, followed by ``H`` (if
    present) and then all other elements in alphabetical order. Without carbon,
    all elements (including ``H``) are ordered alphabetically. A count of one
    is omitted, e.g. ``NaClHC`` becomes ``CHClNa`` and ``NaClH2`` becomes ``ClH2Na``.

    Arguments:
        formula: A formula string that ``ase.formula.Formula`` can parse, or None.

    Returns:
        The formula in Hill notation. ``None`` is passed through unchanged. If
        the formula cannot be converted, the input is returned unchanged (best effort).
    '''
    if formula is None:
        return formula

    try:
        counts = ase.formula.Formula(formula).count()
        # Carbon and hydrogen only get their special leading position when
        # carbon is actually part of the formula.
        leading = [symbol for symbol in 'CH' if symbol in counts] if 'C' in counts else []
        trailing = sorted(symbol for symbol in counts if symbol not in leading)
        return ''.join(
            symbol + (str(counts[symbol]) if counts[symbol] > 1 else '')
            for symbol in leading + trailing)
    except Exception:
        # Deliberate best effort: unparsable input is handed back as it came in.
        return formula
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest

from nomad.normalizing import optimade


@pytest.mark.parametrize('formula, expected', [
    # with carbon: C first, then H, then the rest alphabetically
    ('NaClHC', 'CHClNa'),
    # without carbon: everything alphabetically, including H
    ('NaClH2', 'ClH2Na'),
    # None is passed through unchanged
    (None, None)
])
def test_chemical_formula_hill(formula, expected):
    ''' Checks the element order and count formatting of the Hill formula. '''
    assert optimade.optimade_chemical_formula_hill(formula) == expected


@pytest.mark.parametrize('formula, expected', [
    ('Na3Cl2H', 'A3B2C'),
    ('NaNaNaClClHH', 'A3B2C2'),
    # counts of one are omitted for every label
    ('NaCl', 'AB'),
    # None is passed through unchanged
    (None, None)
])
def test_chemical_formula_anonymous(formula, expected):
    ''' Checks that labels are assigned by descending element count. '''
    assert optimade.optimade_chemical_formula_anonymous(formula) == expected


@pytest.mark.parametrize('formula, expected', [
    ('Na3Cl2H', 'Cl2HNa3'),
    ('NaNaNaClClHH', 'Cl2H2Na3')
])
def test_chemical_formula_reduced(formula, expected):
    ''' Checks the expected output of the reduced formula for two inputs. '''
    assert optimade.optimade_chemical_formula_reduced(formula) == expected