Commit c61ff1f2 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added configurability of domain specific metadata.

parent 9eaca8b5
Pipeline #45922 failed with stages
in 22 minutes and 35 seconds
......@@ -25,6 +25,7 @@ from nomad import config, utils
from nomad.files import ArchiveBasedStagingUploadFiles
from nomad.parsing import parser_dict, LocalBackend, match_parser
from nomad.normalizing import normalizers
from nomad.datamodel import CalcWithMetadata
from .main import cli
......@@ -173,5 +174,6 @@ def local(archive_id, show_backend=False, show_metadata=False, **kwargs):
if show_backend:
backend.write_json(sys.stdout, pretty=True)
if show_metadata:
metadata = backend.to_calc_with_metadata()
metadata = CalcWithMetadata()
metadata.apply_domain_metadata(backend)
ujson.dump(metadata.to_dict(), sys.stdout, indent=4)
......@@ -20,7 +20,7 @@ from sqlalchemy.sql.expression import literal
from datetime import datetime
from nomad import infrastructure, utils
from nomad.datamodel import CalcWithMetadata
from nomad.datamodel import DFTCalcWithMetadata
from . import base
from .user import User
......@@ -180,7 +180,7 @@ class Calc(Base):
_dataset_cache: dict = {}
def apply_calc_with_metadata(self, calc: CalcWithMetadata, context: PublishContext) -> None:
def apply_calc_with_metadata(self, calc: DFTCalcWithMetadata, context: PublishContext) -> None:
"""
Applies the data from ``source`` to this coe Calc object.
"""
......@@ -309,14 +309,14 @@ class Calc(Base):
coe_calc.citations.append(citation)
def to_calc_with_metadata(self) -> CalcWithMetadata:
def to_calc_with_metadata(self) -> DFTCalcWithMetadata:
"""
Creates a :class:`CalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Creates a :class:`DFTCalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Be aware that ``upload_id`` and ``calc_id``, might be old coe repository
``upload_name`` and calculation ``checksum`` depending on the context, i.e. used
database.
"""
result = CalcWithMetadata(
result = DFTCalcWithMetadata(
upload_id=self.upload.upload_id if self.upload else None,
calc_id=self.checksum)
......
......@@ -42,7 +42,7 @@ This module also provides functionality to add parsed calculation data to the db
:undoc-members:
"""
from typing import Type
from typing import Type, cast
import datetime
from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
from sqlalchemy.orm import relationship
......@@ -52,7 +52,7 @@ import warnings
from sqlalchemy import exc as sa_exc
from nomad import utils, infrastructure, config
from nomad.datamodel import UploadWithMetadata
from nomad.datamodel import UploadWithMetadata, DFTCalcWithMetadata
from .calc import Calc, PublishContext
from .base import Base
......@@ -194,7 +194,8 @@ class Upload(Base): # type: ignore
upload=coe_upload)
repo_db.add(coe_calc)
coe_calc.apply_calc_with_metadata(calc, context=context)
coe_calc.apply_calc_with_metadata(
cast(DFTCalcWithMetadata, calc), context=context)
logger.debug(
'added calculation, not yet committed', calc_id=coe_calc.calc_id)
......
......@@ -108,7 +108,8 @@ services = NomadConfig(
api_secret='defaultApiSecret',
admin_password='password',
disable_reset=True,
not_processed_value='not processed'
not_processed_value='not processed',
unavailable_value='unavailable'
)
tests = NomadConfig(
......@@ -148,11 +149,12 @@ client = NomadConfig(
url='http://localhost:8000/nomad/api'
)
console_log_level = logging.WARNING
service = 'unknown nomad service'
version = '4.3' # TODO replace with git hash?
release = 'devel'
domain = 'DFT'
service = 'unknown nomad service'
auxfile_cutoff = 30
version = '4.3' # TODO replace with git hash?
console_log_level = logging.WARNING
def get_loglevel_from_env(key, default_level=logging.INFO):
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains classes that allow to represent the core
nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction
independent from their representation in the different modules
:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.parsing`,
:py:mod:`nomad.search`, :py:mod:`nomad.api`, :py:mod:`nomad.migration`.
It is not about representing every detail, but those parts that are directly involved in
api, processing, migration, mirroring, or other 'infrastructure' operations.
Transformations between different implementations of the same entity can be build
and used. To ease the number of necessary transformations the classes
:class:`UploadWithMetadata` and :class:`CalcWithMetadata` can act as intermediate
representations. Therefore, implement only transformation from and to these
classes. These are the implemented transformations:
.. image:: datamodel_transformations.png
"""
import sys
from nomad.datamodel.base import UploadWithMetadata, CalcWithMetadata, Domain
from nomad.datamodel.dft import DFTCalcWithMetadata
# Override the CalcWithMetadata with the domain specific descendant, so that the name
# ``CalcWithMetadata`` in this module refers to the class stored in ``Domain.domain_class``.
# NOTE(review): ``Domain.register_domain`` only sets ``Domain.domain_class`` when the
# registered domain name equals ``config.domain``; otherwise it is still ``None`` here and
# ``CalcWithMetadata`` is rebound to ``None`` -- confirm that this is intended.
setattr(sys.modules['nomad.datamodel'], 'CalcWithMetadata', Domain.domain_class)
......@@ -12,29 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains classes that allow to represent the core
nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction
independent from their representation in the different modules
:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.parsing`,
:py:mod:`nomad.search`, :py:mod:`nomad.api`, :py:mod:`nomad.migration`.
It is not about representing every detail, but those parts that are directly involved in
api, processing, migration, mirroring, or other 'infrastructure' operations.
Transformations between different implementations of the same entity can be build
and used. To ease the number of necessary transformations the classes
:class:`UploadWithMetadata` and :class:`CalcWithMetadata` can act as intermediate
representations. Therefore, implement only transformation from and to these
classes. These are the implemented transformations:
.. image:: datamodel_transformations.png
"""
from typing import Iterable, List, Dict
from typing import Iterable, List, Dict, Type
import datetime
from elasticsearch_dsl import Keyword
from nomad import utils
from nomad import utils, config
class UploadWithMetadata():
......@@ -60,7 +42,7 @@ class UploadWithMetadata():
class CalcWithMetadata():
"""
A dict/POPO class that can be used for mapping calc representations with calc metadata.
We have many representations of calcs and their calc metadata. To avoid implement
We have many representations of calcs and their calc metadata. To avoid implementing
mappings between all combinations, just implement mappings with the class and use
mapping transitivity. E.g. instead of A -> B, A -> this -> B.
......@@ -87,16 +69,6 @@ class CalcWithMetadata():
references: Objects describing user provided references, keys are ``id`` and ``value``.
datasets: Objects describing the datasets, keys are ``id``, ``name``, ``doi``.
DOI is optional, is an object with key ``id``, ``value``.
formula: The chemical formula
atoms: A list of all atoms, as labels. All atoms means the whole composition, with atom labels repeated.
basis_set: The basis set type of this calculation.
xc_functional: The class of functional used.
system: The system type, e.g. Atom/Molecule, 2D, Bulk(3D)
crystal_system: The symmetry describing crystal_system type.
spacegroup: The spacegroup, as spacegroup number.
code_name: The name of the used code.
code_version: The version of the used code.
"""
def __init__(self, **kwargs):
# id relevant metadata
......@@ -123,18 +95,6 @@ class CalcWithMetadata():
self.references: List[utils.POPO] = []
self.datasets: List[utils.POPO] = []
# DFT specific calc metadata, derived from raw data through successful processing
self.formula: str = None
self.atoms: List[str] = []
self.basis_set: str = None
self.xc_functional: str = None
self.system: str = None
self.crystal_system: str = None
self.spacegroup: str = None
self.spacegroup_symbol: str = None
self.code_name: str = None
self.code_version: str = None
# temporary reference to the backend after successful processing
self.backend = None
......@@ -175,3 +135,56 @@ class CalcWithMetadata():
self.datasets = [
utils.POPO(id=int(ds['id']), doi=utils.POPO(value=ds.get('_doi')), name=ds.get('_name'))
for ds in metadata.get('datasets', [])]
def apply_domain_metadata(self, backend):
    """
    Hook for filling in domain specific metadata from ``backend``.

    This general implementation provides no domain metadata and always raises.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()
class DomainQuantity:
    """
    Describes one domain specific metadata quantity.

    Attributes:
        name: The name of the quantity; ``None`` until it is assigned from outside.
        description: The description given on construction.
        multi: The ``multi`` flag given on construction.
        aggregate: The ``aggregate`` flag given on construction.
        elastic_mapping: The given mapping, or a ``Keyword`` created with the ``multi``
            flag when no mapping was given.
    """

    def __init__(
            self, description: str = None, multi: bool = False, aggregate: bool = False,
            elastic_mapping=None):
        # NOTE(review): the default mapping captures ``multi`` as it is passed here; assigning
        # a different ``multi`` to the instance later does not update the mapping.
        if elastic_mapping is None:
            elastic_mapping = Keyword(multi=multi)

        self.name: str = None
        self.description = description
        self.multi = multi
        self.aggregate = aggregate
        self.elastic_mapping = elastic_mapping
class Domain:
    """
    Holds the one domain that is registered for this installation.

    Attributes:
        domain_class: The registered :class:`CalcWithMetadata` subclass. Stays ``None``
            until a domain whose name equals ``config.domain`` is registered.
        quantities: The :class:`DomainQuantity` objects of the registered domain.
    """
    domain_class: Type[CalcWithMetadata] = None
    quantities: List[DomainQuantity] = []

    @classmethod
    def register_domain(cls, domain_class: type, domain_name: str, quantities: Dict[str, DomainQuantity]):
        """
        Registers ``domain_class`` as the domain, if ``domain_name`` is the configured domain.

        Every instance attribute of a fresh ``domain_class()`` that a fresh
        :class:`CalcWithMetadata` does not have is treated as a domain quantity. Attributes
        without an entry in ``quantities`` get a default :class:`DomainQuantity`, which is
        also added to the passed ``quantities`` dict.

        Arguments:
            domain_class: The class to register; it is instantiated without arguments.
            domain_name: The name of the domain, compared to ``config.domain``.
            quantities: The explicitly described quantities, keyed by attribute name.

        Raises:
            AssertionError: If a domain is already registered, or if a key in ``quantities``
                is not an attribute of the domain class or also exists on the general class.
        """
        assert cls.domain_class is None, 'you can only define one domain.'

        # Domains other than the configured one are ignored.
        if domain_name != config.domain:
            return

        cls.domain_class = domain_class

        domain_calc = domain_class()
        general_calc = CalcWithMetadata()

        for attr_name, attr_value in vars(domain_calc).items():
            if hasattr(general_calc, attr_name):
                # Not domain specific, the general class has it as well.
                continue

            domain_quantity = quantities.get(attr_name)
            if domain_quantity is None:
                domain_quantity = DomainQuantity()
                quantities[attr_name] = domain_quantity

            domain_quantity.name = attr_name
            # The initial attribute value decides whether the quantity is multi valued.
            domain_quantity.multi = isinstance(attr_value, list)
            cls.quantities.append(domain_quantity)

        for quantity_name in quantities:
            is_domain_only = \
                hasattr(domain_calc, quantity_name) and not hasattr(general_calc, quantity_name)
            assert is_domain_only, \
                'quantity does not exist or overrides general non domain quantity'
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
DFT specific metadata
"""
from typing import List
import re
from elasticsearch_dsl import Integer
from nomad import utils, config
from .base import CalcWithMetadata, DomainQuantity, Domain
# XC treatment labels, keyed by the lower-cased first three characters of a functional name.
# See https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-meta-info/wikis/metainfo/XC-functional
xc_treatments = {
    'gga': 'GGA',
    'hf_': 'HF',
    'oep': 'OEP',
    'hyb': 'hybrid',
    'mgg': 'meta-GGA',
    'vdw': 'vdW',
    'lda': 'LDA',
}

# Basis set labels, keyed by the lower-cased basis set type without '_', '-', and ' '.
basis_sets = {
    'gaussians': 'gaussians',
    'realspacegrid': 'real-space grid',
    'planewaves': 'plane waves'
}

# Matches a number with up to two further dot separated numbers, e.g. '1', '1.2', '1.2.3'.
version_re = re.compile(r'(\d+(\.\d+(\.\d+)?)?)')
def map_functional_name_to_xc_treatment(name):
    """
    Maps an XC functional name to its XC treatment label.

    The lower-cased first three characters of ``name`` are looked up in ``xc_treatments``.

    Arguments:
        name: The functional name, or the configured placeholder for unavailable values.

    Returns:
        The treatment label. ``name`` itself is returned if it is the placeholder for
        unavailable values or if there is no treatment for its first three characters.
    """
    # The services config names this setting ``unavailable_value``.
    if name == config.services.unavailable_value:
        return name

    return xc_treatments.get(name[:3].lower(), name)
def map_basis_set_to_basis_set_label(name):
    """
    Maps a basis set type name to its label.

    The lookup in ``basis_sets`` ignores case and the characters '_', '-', and ' '.

    Arguments:
        name: The basis set type name.

    Returns:
        The label from ``basis_sets``, or ``name`` itself if there is no label for it.
    """
    lookup_key = name
    for separator in ('_', '-', ' '):
        lookup_key = lookup_key.replace(separator, '')
    lookup_key = lookup_key.lower()

    return basis_sets.get(lookup_key, name)
def simplify_version(version):
    """
    Reduces a version string to the first part of it that ``version_re`` matches.

    Arguments:
        version: The version string.

    Returns:
        The first match of ``version_re`` in ``version``, or ``version`` itself if there
        is no match.
    """
    found = version_re.search(version)
    return version if found is None else found.group(0)
class DFTCalcWithMetadata(CalcWithMetadata):
    """
    The :class:`CalcWithMetadata` for the DFT domain.

    The attributes below are filled by :func:`apply_domain_metadata`. Unless noted
    otherwise, a value that no section provides is replaced by
    ``config.services.unavailable_value``.

    Attributes:
        formula: The ``chemical_composition_bulk_reduced`` of ``section_system``.
        atoms: The sorted, distinct ``atom_labels`` of ``section_system``; empty if unavailable.
        n_atoms: The number of ``atom_labels`` before removing duplicates; 0 if unavailable.
        basis_set: The label for the ``program_basis_set_type`` of ``section_run``.
        xc_functional: The XC treatment for the ``XC_functional_name`` of ``section_method``.
        system: The ``system_type`` of ``section_system``.
        crystal_system: The ``crystal_system`` of ``section_symmetry``.
        spacegroup: The ``space_group_number`` of ``section_symmetry``; 0 if unavailable.
        spacegroup_symbol: The ``international_short_symbol`` of ``section_symmetry``;
            0 if unavailable.
        code_name: The ``program_name``.
        code_version: The simplified ``program_version``.
        n_total_energies: The number of ``energy_total`` entries in the backend results.
        n_geometries: The number of ``section_system`` entries in the backend results.
        quantities: The distinct meta-info names in the backend results.
        geometries: The distinct ``configuration_raw_gid`` values in the backend results.
        group_hash: A hash over formula, spacegroup, basis set, xc functional, code name,
            code version, and the embargo, comment, references, uploader, and coauthors.
    """

    def __init__(self, **kwargs):
        # The domain attributes are created before the base initializer runs.
        # NOTE(review): presumably so that the base initializer can apply ``kwargs`` to
        # them -- confirm against CalcWithMetadata.__init__.
        self.formula: str = None
        self.atoms: List[str] = []
        self.n_atoms: int = 0
        self.basis_set: str = None
        self.xc_functional: str = None
        self.system: str = None
        self.crystal_system: str = None
        self.spacegroup: str = None
        self.spacegroup_symbol: str = None
        self.code_name: str = None
        self.code_version: str = None

        self.n_total_energies = 0
        self.n_geometries = 0
        self.quantities = []
        self.geometries = []
        self.group_hash: str = None

        super().__init__(**kwargs)

    def apply_domain_metadata(self, backend):
        """
        Fills the DFT specific metadata, and missing ids, from the given ``backend``.

        ``calc_id``, ``upload_id``, and ``mainfile`` are only read from the backend if
        they are not set yet.

        Arguments:
            backend: The backend with the parsed and normalized data of the calculation.
        """
        logger = utils.get_logger(__name__).bind(
            upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile)

        def get_optional_value(key, section, unavailable_value=None):
            """
            Returns the last non ``None`` value of ``key`` over all sections ``section``.

            Values that differ between sections and values that are missing in all
            sections are logged as warnings. A missing value is replaced by
            ``unavailable_value``, or ``config.services.unavailable_value`` if that is ``None``.
            """
            # Section is section_system, section_symmetry, etc...
            val = None  # Initialize to None, so we can compare section values.
            # Loop over the sections with the name section in the backend.
            for section_index in backend.get_sections(section):
                try:
                    new_val = backend.get_value(key, section_index)
                except KeyError:
                    new_val = None

                # Compare values from iterations via their repr, which also works for
                # values that do not compare to a single bool.
                if val is not None and new_val is not None:
                    if repr(val) != repr(new_val):
                        logger.warning(
                            'The values for %s differ between different %s: %s vs %s' %
                            (key, section, str(val), str(new_val)))

                val = new_val if new_val is not None else val

            if val is None:
                logger.warning(
                    'The values for %s where not available in any %s' % (key, section))
                # The services config names this setting ``unavailable_value``.
                if unavailable_value is not None:
                    return unavailable_value
                return config.services.unavailable_value

            return val

        if self.calc_id is None:
            self.calc_id = backend.get_value('calc_id')

        if self.upload_id is None:
            self.upload_id = backend.get_value('upload_id')

        if self.mainfile is None:
            self.mainfile = backend.get_value('main_file')

        self.code_name = backend.get_value('program_name', 0)
        self.code_version = simplify_version(backend.get_value('program_version', 0))

        # Fall back to an empty list: the string placeholder would be counted and split
        # into its characters by the ``len`` and ``set`` calls below.
        self.atoms = get_optional_value('atom_labels', 'section_system', [])
        if hasattr(self.atoms, 'tolist'):
            self.atoms = self.atoms.tolist()
        self.n_atoms = len(self.atoms)
        self.atoms = list(set(self.atoms))
        self.atoms.sort()

        self.crystal_system = get_optional_value('crystal_system', 'section_symmetry')
        self.spacegroup = get_optional_value('space_group_number', 'section_symmetry', 0)
        self.spacegroup_symbol = get_optional_value('international_short_symbol', 'section_symmetry', 0)
        self.basis_set = map_basis_set_to_basis_set_label(
            get_optional_value('program_basis_set_type', 'section_run'))
        self.system = get_optional_value('system_type', 'section_system')
        self.formula = get_optional_value('chemical_composition_bulk_reduced', 'section_system')
        self.xc_functional = map_functional_name_to_xc_treatment(
            get_optional_value('XC_functional_name', 'section_method'))

        self.group_hash = utils.hash(
            self.formula,
            self.spacegroup,
            self.basis_set,
            self.xc_functional,
            self.code_name,
            self.code_version,
            self.with_embargo,
            self.comment,
            self.references,
            self.uploader,
            self.coauthors)

        # Collect statistics with a single traversal over the backend results.
        quantities = set()
        geometries = set()
        n_total_energies = 0
        n_geometries = 0

        for meta_info, _, value in backend._delegate.results.traverse():
            quantities.add(meta_info)
            if meta_info == 'energy_total':
                n_total_energies += 1
            if meta_info == 'section_system':
                n_geometries += 1
            if meta_info == 'configuration_raw_gid':
                geometries.add(value)

        self.quantities = list(quantities)
        self.geometries = list(geometries)
        self.n_total_energies = n_total_energies
        self.n_geometries = n_geometries
# Register the DFT domain with the quantities that need a description or a non default
# elastic mapping.
Domain.register_domain(
    DFTCalcWithMetadata, 'DFT',
    quantities={
        'formula': DomainQuantity('The chemical (hill) formula of the simulated system.'),
        'atoms': DomainQuantity('The atom labels of all atoms in the simulated system.'),
        'basis_set': DomainQuantity('The used basis set functions.'),
        'n_total_energies': DomainQuantity(
            'Number of total energy calculations', elastic_mapping=Integer()),
        'n_geometries': DomainQuantity(
            'Number of unique geometries', elastic_mapping=Integer()),
        'n_atoms': DomainQuantity(
            'Number of atoms in the simulated system', elastic_mapping=Integer()),
    })
......@@ -531,7 +531,8 @@ class NomadCOEMigration:
def check_mismatch() -> bool:
# some exceptions
if source_value in NomadCOEMigration.expected_differences and \
if isinstance(source_value, str) and \
source_value in NomadCOEMigration.expected_differences and \
target_value == NomadCOEMigration.expected_differences.get(source_value):
return True
......@@ -550,7 +551,9 @@ class NomadCOEMigration:
if isinstance(target_value, list):
source_list = list(to_comparable_list(source_value))
target_list = list(to_comparable_list(target_value))
if len(set(source_list).intersection(target_list)) != len(target_list):
if len(source_list) != len(target_list):
is_valid &= check_mismatch()
elif any(a != b for a, b in zip(source_list, target_list)):
is_valid &= check_mismatch()
continue
......
......@@ -36,11 +36,9 @@ from typing import List, Any
from .normalizer import Normalizer
from .system import SystemNormalizer
from .fhiaims import FhiAimsBaseNormalizer
from .repository import RepositoryNormalizer
normalizers: List[Any] = [
SystemNormalizer,
FhiAimsBaseNormalizer,
RepositoryNormalizer
FhiAimsBaseNormalizer
]
# Copyright 2018 Fawzi Mohamed, Danio Brambila, Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from nomad.parsing import BadContextURI
from .normalizer import Normalizer
# Placeholder that is returned for values that are not available in any section.
unavailable_label = 'unavailable'
class RepositoryNormalizer(Normalizer):
"""
The normalizer that turnes normalized parse results into a set of metadata
quantities for the repository.
"""
# XC treatment labels, keyed by the lower-cased first three characters of a functional name.
xc_treatments = {
'gga': 'GGA',
'hf_': 'HF',
'oep': 'OEP',
'hyb': 'hybrid',
'mgg': 'meta-GGA',
'vdw': 'vdW',
'lda': 'LDA',
}
""" https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-meta-info/wikis/metainfo/XC-functional """
# Basis set labels, keyed by the lower-cased basis set type without '_', '-', and ' '.
basis_sets = {
'gaussians': 'gaussians',
'realspacegrid': 'real-space grid',
'planewaves': 'plane waves'
}
# Matches a number with up to two further dot separated numbers, e.g. '1', '1.2', '1.2.3'.
version_re = re.compile(r'(\d+(\.\d+(\.\d+)?)?)')
def map_functional_name_to_xc_treatment(self, name):
    """
    Maps an XC functional name to its XC treatment label.

    Returns ``name`` itself if it equals ``unavailable_label`` or if there is no
    treatment for its lower-cased first three characters.
    """
    is_placeholder = name == unavailable_label
    if is_placeholder:
        return name

    prefix = name[:3].lower()
    return RepositoryNormalizer.xc_treatments.get(prefix, name)
def map_basis_set_to_basis_set_label(self, name):
    """
    Maps a basis set type name to its label, ignoring case and '_', '-', and ' '.

    Returns ``name`` itself if there is no label for it.
    """
    lookup_key = name
    for separator in ('_', '-', ' '):
        lookup_key = lookup_key.replace(separator, '')
    lookup_key = lookup_key.lower()

    return RepositoryNormalizer.basis_sets.get(lookup_key, name)
def simplify_version(self, version):
    """
    Reduces a version string to the first part of it that ``version_re`` matches.

    Returns ``version`` itself if there is no match.
    """
    found = RepositoryNormalizer.version_re.search(version)
    return version if found is None else found.group(0)
def get_optional_value(self, key, section, unavailable_value=None):
# Reads ``key`` from all sections ``section`` of the backend and returns the value that
# was read last. Sections where reading the value raises a KeyError are skipped. If no
# value was read, ``unavailable_value`` is returned, or ``unavailable_label`` if
# ``unavailable_value`` is None. Warnings are logged if the values were flagged as
# different between sections, and if no value is available.
# NOTE(review): the indentation of this listing is lost, so it cannot be determined
# whether the ``else`` below belongs to the ``len`` comparison or to the ``if``/``elif``
# chain. With either pairing there are inputs for which ``diff_bool`` is not assigned in
# the current iteration -- verify against the original file before changing this code.
# Section is section_system, section_symmetry, etc...
val = None # Initialize to None, so we can compare section values.
diff_flag = False # Flag to store whether vals differ between sections.
# Loop over the sections with the name section in the backend.
for section_index in self._backend.get_sections(section):
try:
new_val = self._backend.get_value(key, section_index)
except KeyError:
continue
# Compare values from iterations.
# We can't compare numpy arrays of different lengths.
if val is None: # We also can't check the length of none-type objects.
diff_bool = True
elif type(new_val) != int: # Type int doesn't have a len() operator.
if len(new_val) != len(val):
diff_bool = False
else: # If the first value wasn't none and the lengths are the same.
diff_bool = new_val != val
if type(diff_bool) is bool:
if diff_bool and val is not None:
diff_flag = True
elif diff_bool.all() and (val is not None):
# Then we have an array, and diff bool has multiple values since
# each item in array has been compared item for item.
diff_flag = True
val = new_val
if diff_flag is True:
self.logger.warning(
'The values for %s differ between different %s' % (key, section))
if val is None:
self.logger.warning(
'The values for %s where not available in any %s' % (key, section))
# An explicit ``unavailable_value`` such as 0 is honored, only None selects the label.
return unavailable_value if unavailable_value is not None else unavailable_label
else:
return val
def normalize(self, logger=None) -> None: