Commit 9cffe14b authored by Markus Scheidgen

Added the EMS domain and a new parser skeleton.

parent 1105db4c
Subproject commit 8c917126279cbbecb7adb7e30fc7d641d9a31a78
Subproject commit 0d50b13f66f45891bc441a82af73bb425167ee71
Subproject commit d4adc7e3266d420ffe12ed43cb1a6d87e8bd4df6
Subproject commit 51ad1335acc2ab3cffac57117a7cfaa63e092103
Subproject commit d8414907d150ee677d4b7ff89bbfcc7e39b159c8
Subproject commit 7d911fa1a65885efb73381b77ee2dd975845c899
Subproject commit d8ee02926a9f37317c880ba3caa31f5140e7730e
Subproject commit 567a96be45394556d94c61cb29e345ebb31a2f9f
......@@ -140,13 +140,13 @@ class CalcProcReproduction:
if parser_backend.status[0] != 'ParseSuccess':
self.logger.error('parsing was not successful', status=parser_backend.status)
parser_backend.openNonOverlappingSection('section_calculation_info')
parser_backend.openNonOverlappingSection('section_entry_info')
parser_backend.addValue('upload_id', self.upload_id)
parser_backend.addValue('calc_id', self.calc_id)
parser_backend.addValue('calc_hash', "no hash")
parser_backend.addValue('main_file', self.mainfile)
parser_backend.addValue('mainfile', self.mainfile)
parser_backend.addValue('parser_name', parser.__class__.__name__)
parser_backend.closeNonOverlappingSection('section_calculation_info')
parser_backend.closeNonOverlappingSection('section_entry_info')
self.logger.info('ran parser')
return parser_backend
......
......@@ -48,7 +48,9 @@ about ids, user metadata, etc. To define domain specific quantities the classes
import sys
from nomad.datamodel.base import UploadWithMetadata, CalcWithMetadata, Domain
from nomad.datamodel import ems, dft
from nomad.datamodel.dft import DFTCalcWithMetadata
from nomad.datamodel.ems import EMSEntryWithMetadata
# Override CalcWithMetadata with the domain-specific descendant
setattr(sys.modules['nomad.datamodel'], 'CalcWithMetadata', Domain.instance.domain_entry_class)
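For illustration only (not part of this commit): with the override above, any later import of CalcWithMetadata resolves to the active domain's entry class:

    # sketch, assuming config.domain selected the EMS domain at first import
    from nomad.datamodel import CalcWithMetadata

    entry = CalcWithMetadata(upload_id='example_upload', calc_id='example_calc')
    assert isinstance(entry, EMSEntryWithMetadata)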
......@@ -212,38 +212,40 @@ class Domain:
instances of :class:`DomainQuantity`.
"""
instance: 'Domain' = None
instances: Dict[str, 'Domain'] = {}
def __init__(
self, name: str, domain_entry_class: Type[CalcWithMetadata],
quantities: Dict[str, DomainQuantity]) -> None:
assert Domain.instance is None, 'you can only define one domain.'
quantities: Dict[str, DomainQuantity], root_sections=['section_run', 'section_entry_info']) -> None:
if name == config.domain:
assert Domain.instance is None, 'you can only define one domain.'
Domain.instance = self
Domain.instances[name] = self
self.name = name
self.domain_entry_class = domain_entry_class
self.quantities: List[DomainQuantity] = []
self.root_sections = root_sections
reference_domain_calc = domain_entry_class()
reference_general_calc = CalcWithMetadata()
for name, value in reference_domain_calc.__dict__.items():
if not hasattr(reference_general_calc, name):
quantity = quantities.get(name, None)
for quantity_name, value in reference_domain_calc.__dict__.items():
if not hasattr(reference_general_calc, quantity_name):
quantity = quantities.get(quantity_name, None)
if quantity is None:
quantity = DomainQuantity()
quantities[name] = quantity
quantity.name = name
quantities[quantity_name] = quantity
quantity.name = quantity_name
quantity.multi = isinstance(value, list)
self.quantities.append(quantity)
for name in quantities.keys():
assert hasattr(reference_domain_calc, name) and not hasattr(reference_general_calc, name), \
for quantity_name in quantities.keys():
assert hasattr(reference_domain_calc, quantity_name) and not hasattr(reference_general_calc, quantity_name), \
'quantity does not exist or overrides a general non-domain quantity'
assert any(quantity.order_default for quantity in Domain.instance.quantities), \
assert any(quantity.order_default for quantity in Domain.instances[name].quantities), \
'you need to define an order_default quantity'
@property
......@@ -278,3 +280,30 @@ class Domain:
def aggregations_names(self) -> Iterable[str]:
""" Just the names of all metrics. """
return list(self.aggregations.keys())
def get_optional_backend_value(backend, key, section, unavailable_value=None, logger=None):
    # section is e.g. 'section_system', 'section_symmetry', ...
    val = None  # initialize to None, so we can compare values across sections
    # Loop over all sections with the given name in the backend.
    for section_index in backend.get_sections(section):
        try:
            new_val = backend.get_value(key, section_index)
        except KeyError:
            new_val = None

        # Compare values from the iterations.
        if val is not None and new_val is not None:
            if repr(val) != repr(new_val) and logger:
                logger.warning(
                    'The values for %s differ between different %s: %s vs %s' %
                    (key, section, str(val), str(new_val)))

        val = new_val if new_val is not None else val

    if val is None:
        if logger:
            logger.warning(
                'The values for %s were not available in any %s' % (key, section))
        return unavailable_value if unavailable_value is not None else config.services.unavailable_value

    return val
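A usage sketch (mirroring the call sites in dft.py below; backend and logger as in apply_domain_metadata):

    formula = get_optional_backend_value(
        backend, 'chemical_composition_bulk_reduced', 'section_system', logger=logger)
    spacegroup = get_optional_backend_value(
        backend, 'space_group_number', 'section_symmetry', 0, logger=logger)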
......@@ -22,7 +22,7 @@ from elasticsearch_dsl import Integer
from nomad import utils, config
from .base import CalcWithMetadata, DomainQuantity, Domain
from .base import CalcWithMetadata, DomainQuantity, Domain, get_optional_backend_value
xc_treatments = {
......@@ -92,58 +92,30 @@ class DFTCalcWithMetadata(CalcWithMetadata):
logger = utils.get_logger(__name__).bind(
upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile)
def get_optional_value(key, section, unavailable_value=None):
# Section is section_system, section_symmetry, etc...
val = None # Initialize to None, so we can compare section values.
# Loop over the sections with the name section in the backend.
for section_index in backend.get_sections(section):
try:
new_val = backend.get_value(key, section_index)
except KeyError:
new_val = None
# Compare values from iterations.
if val is not None and new_val is not None:
if val.__repr__() != new_val.__repr__():
logger.warning(
'The values for %s differ between different %s: %s vs %s' %
(key, section, str(val), str(new_val)))
val = new_val if new_val is not None else val
if val is None:
logger.warning(
'The values for %s were not available in any %s' % (key, section))
return unavailable_value if unavailable_value is not None else config.services.unavailable_value
else:
return val
if self.calc_id is None:
self.calc_id = backend.get_value('calc_id')
if self.upload_id is None:
self.upload_id = backend.get_value('upload_id')
if self.mainfile is None:
self.mainfile = backend.get_value('main_file')
self.code_name = backend.get_value('program_name', 0)
self.code_version = simplify_version(backend.get_value('program_version', 0))
self.atoms = get_optional_value('atom_labels', 'section_system')
self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', logger=logger)
if hasattr(self.atoms, 'tolist'):
self.atoms = self.atoms.tolist()
self.n_atoms = len(self.atoms)
self.atoms = list(set(self.atoms))
self.atoms.sort()
self.crystal_system = get_optional_value('crystal_system', 'section_symmetry')
self.spacegroup = get_optional_value('space_group_number', 'section_symmetry', 0)
self.spacegroup_symbol = get_optional_value('international_short_symbol', 'section_symmetry', 0)
self.crystal_system = get_optional_backend_value(
backend, 'crystal_system', 'section_symmetry', logger=logger)
self.spacegroup = get_optional_backend_value(
backend, 'space_group_number', 'section_symmetry', 0, logger=logger)
self.spacegroup_symbol = get_optional_backend_value(
backend, 'international_short_symbol', 'section_symmetry', 0, logger=logger)
self.basis_set = map_basis_set_to_basis_set_label(
get_optional_value('program_basis_set_type', 'section_run'))
self.system = get_optional_value('system_type', 'section_system')
self.formula = get_optional_value('chemical_composition_bulk_reduced', 'section_system')
get_optional_backend_value(backend, 'program_basis_set_type', 'section_run', logger=logger))
self.system = get_optional_backend_value(
backend, 'system_type', 'section_system', logger=logger)
self.formula = get_optional_backend_value(
backend, 'chemical_composition_bulk_reduced', 'section_system', logger=logger)
self.xc_functional = map_functional_name_to_xc_treatment(
get_optional_value('XC_functional_name', 'section_method'))
get_optional_backend_value(backend, 'XC_functional_name', 'section_method', logger=logger))
self.group_hash = utils.hash(
self.formula,
......@@ -202,6 +174,10 @@ Domain('DFT', DFTCalcWithMetadata, quantities=dict(
'Hashes that describe unique geometries simulated by this code run.',
metric=('geometries', 'cardinality')
),
quantities=DomainQuantity(
'All quantities that are used by this calculation',
metric=('quantities', 'value_count')
),
n_total_energies=DomainQuantity(
'Number of total energy calculations',
metric=('total_energies', 'sum'),
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Experimental materials science specific metadata
"""
from typing import List
import ase.data
from nomad import utils
from .base import CalcWithMetadata, DomainQuantity, Domain, get_optional_backend_value
class EMSEntryWithMetadata(CalcWithMetadata):
def __init__(self, **kwargs):
# sample quantities
self.formula: str = None
self.atoms: List[str] = []
self.n_atoms: int = 0
self.chemical: str = None
# general metadata
self.location: str = None
self.experiment_time: str = None
self.method: str = None
self.quantities = []
self.group_hash: str = None
super().__init__(**kwargs)
def apply_domain_metadata(self, backend):
logger = utils.get_logger(__name__).bind(
upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile)
self.formula = get_optional_backend_value(
backend, 'sample_formula', 'section_sample', logger=logger)
self.atoms = get_optional_backend_value(
backend, 'sample_atoms', 'section_sample', logger=logger)
if hasattr(self.atoms, 'tolist'):
self.atoms = self.atoms.tolist()
self.n_atoms = len(self.atoms)
self.atoms = list(set(self.atoms))
self.atoms.sort()
self.chemical = get_optional_backend_value(
backend, 'sample_formula', 'section_sample', logger=logger)
self.location = get_optional_backend_value(
backend, 'experiment_location', 'section_experiment', logger=logger)
self.experiment_time = get_optional_backend_value(
backend, 'experiment_time', 'section_experiment', logger=logger)
self.method = get_optional_backend_value(
backend, 'experiment_method', 'section_experiment', logger=logger)
self.group_hash = utils.hash(
self.formula,
self.method,
self.location,
self.with_embargo,
self.comment,
self.references,
self.uploader,
self.coauthors)
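# collect the names of all metainfo quantities present below section_experiment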
quantities = set()
for meta_info, _, _ in backend._delegate.results.traverse(root_section='section_experiment'):
quantities.add(meta_info)
self.quantities = list(quantities)
Domain(
    'EMS', EMSEntryWithMetadata,
    root_sections=['section_experiment', 'section_entry_info'],
    quantities=dict(
        formula=DomainQuantity(
            'The chemical (Hill) formula of the sample.',
            order_default=True),
        atoms=DomainQuantity(
            'The atom labels of all atoms in the sample.',
            aggregations=len(ase.data.chemical_symbols)),
        method=DomainQuantity(
            'The experimental method used.', aggregations=20),
        location=DomainQuantity(
            'The location where the experiment was conducted.', aggregations=10),
        quantities=DomainQuantity(
            'All quantities that are used by this entry.',
            metric=('quantities', 'value_count')
        )))
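Illustration (mirrors test_ems_data further below): a test can activate the EMS domain like this:

    monkeypatch.setattr('nomad.config.domain', 'EMS')
    monkeypatch.setattr('nomad.datamodel.Domain.instance', datamodel.Domain.instances['EMS'])
    monkeypatch.setattr('nomad.datamodel.CalcWithMetadata', datamodel.Domain.instance.domain_entry_class)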
......@@ -31,10 +31,13 @@ class Normalizer(metaclass=ABCMeta):
for read and write. Normalizer instances are reused.
Arguments:
backend: the backend used to read and write data from and to
backend: The backend used to read and write data from and to.
"""
def __init__(self, backend: AbstractParserBackend) -> None:
domain = 'DFT'
""" The domain this normalizer should be used in. Default for all normalizer is 'DFT'. """
def __init__(self, backend: AbstractParserBackend) -> None:
self._backend = backend
self.logger = get_logger(__name__)
......
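A hypothetical sketch of a domain-specific normalizer (the class name and the normalize signature are illustrative assumptions, not part of this commit):

    class ExampleEMSNormalizer(Normalizer):
        domain = 'EMS'  # overrides the 'DFT' default; only run for EMS entries

        def normalize(self, logger=None) -> None:
            # read from and write to self._backend here
            pass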
......@@ -99,9 +99,10 @@ def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Pars
mime_type = magic.from_buffer(buffer, mime=True)
for parser in parsers:
if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
# TODO: deal with multiple possible parser specs
return parser
if parser.domain == config.domain:
if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
# TODO: deal with multiple possible parser specs
return parser
return None
......@@ -358,6 +359,12 @@ parsers = [
mainfile_contents_re=(
r'\s*(?P<progr>[a-zA-Z0-9_]+)\s*(?:\([^()]+\))\s*:\s*TURBOMOLE\s*(?P<version>.*)'
r'\s*Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe')
),
LegacyParser(
name='parsers/skeleton', code_name='skeleton', domain='EMS',
parser_class_name='skeletonparser.SkeletonParserInterface',
mainfile_mime_re=r'(application/json)|(text/.*)',
mainfile_contents_re=(r'skeleton experimental metadata format')
)
]
......
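For illustration, a hypothetical mainfile that the new skeleton parser spec above would match; both the JSON mime type and the literal phrase from mainfile_contents_re are required:

    # hypothetical example contents; only the quoted phrase is mandated by the regexp
    example_mainfile = '{"description": "skeleton experimental metadata format"}'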
......@@ -34,6 +34,9 @@ class Parser(metaclass=ABCMeta):
and extracted files. Further, allows to run the parser on those 'main files'.
"""
def __init__(self):
self.domain = 'DFT'
@abstractmethod
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
"""
......@@ -76,6 +79,7 @@ class LegacyParser(Parser):
mainfile_contents_re: A regexp that is used to match the first 1024 bytes of a
potential mainfile.
mainfile_name_re: A regexp that is used to match the paths of potential mainfiles.
domain: The domain that this parser should be used for. Default is 'DFT'.
supported_compressions: A list of [gz, bz2], if the parser supports compressed files
"""
def __init__(
......@@ -83,10 +87,14 @@ class LegacyParser(Parser):
mainfile_contents_re: str = None,
mainfile_mime_re: str = r'text/.*',
mainfile_name_re: str = r'.*',
domain: str = 'DFT',
supported_compressions: List[str] = []) -> None:
super().__init__()
self.name = name
self.parser_class_name = parser_class_name
self.domain = domain
self._mainfile_mime_re = re.compile(mainfile_mime_re)
self._mainfile_name_re = re.compile(mainfile_name_re)
# Assign private variable this way to avoid static check issue.
......
......@@ -38,7 +38,7 @@ from nomad.files import PathObject, UploadFiles, ExtractError, ArchiveBasedStagi
from nomad.processing.base import Proc, process, task, PENDING, SUCCESS, FAILURE
from nomad.parsing import parser_dict, match_parser, LocalBackend
from nomad.normalizing import normalizers
from nomad.datamodel import UploadWithMetadata, CalcWithMetadata
from nomad.datamodel import UploadWithMetadata, CalcWithMetadata, Domain
class Calc(Proc):
......@@ -213,6 +213,7 @@ class Calc(Proc):
context = dict(parser=self.parser, step=self.parser)
logger = self.get_logger(**context)
parser = parser_dict[self.parser]
self.metadata['parser_name'] = self.parser
with utils.timer(logger, 'parser executed', input_size=self.mainfile_file.size):
try:
......@@ -226,29 +227,29 @@ class Calc(Proc):
# add the non code specific calc metadata to the backend
# all other quantities have been determined by parsers/normalizers
self._parser_backend.openNonOverlappingSection('section_calculation_info')
self._parser_backend.openNonOverlappingSection('section_entry_info')
self._parser_backend.addValue('upload_id', self.upload_id)
self._parser_backend.addValue('calc_id', self.calc_id)
self._parser_backend.addValue('calc_hash', self.metadata['calc_hash'])
self._parser_backend.addValue('main_file', self.mainfile)
self._parser_backend.addValue('mainfile', self.mainfile)
self._parser_backend.addValue('parser_name', self.parser)
filepaths = self.metadata['files']
self._parser_backend.addValue('number_of_files', len(filepaths))
self._parser_backend.addValue('filepaths', filepaths)
uploader = self.upload.uploader
self._parser_backend.addValue(
'entry_uploader_name', '%s, %s' % (uploader.first_name, uploader.last_name))
self._parser_backend.addValue(
'entry_uploader_id', str(uploader.user_id))
self._parser_backend.addValue('entry_upload_time', int(self.upload.upload_time.timestamp()))
self._parser_backend.closeNonOverlappingSection('section_entry_info')
self.add_processor_info(self.parser)
if self._parser_backend.status[0] != 'ParseSuccess':
logger.error(self._parser_backend.status[1])
error = self._parser_backend.status[1]
self._parser_backend.addValue('parse_status', 'ParseFailure')
self.fail(error, level=logging.INFO, **context)
else:
self._parser_backend.addValue('parse_status', 'ParseSuccess')
self._parser_backend.closeNonOverlappingSection('section_calculation_info')
self._parser_backend.openNonOverlappingSection('section_repository_info')
self._parser_backend.addValue('repository_archive_gid', '%s/%s' % (self.upload_id, self.calc_id))
self._parser_backend.addValue('repository_filepaths', self.metadata['files'])
self._parser_backend.closeNonOverlappingSection('section_repository_info')
self.add_processor_info(self.parser)
@contextmanager
def use_parser_backend(self, processor_name):
......@@ -257,7 +258,7 @@ class Calc(Proc):
self.add_processor_info(processor_name)
def add_processor_info(self, processor_name: str) -> None:
self._parser_backend.openContext('/section_calculation_info/0')
self._parser_backend.openContext('/section_entry_info/0')
self._parser_backend.openNonOverlappingSection('section_archive_processing_info')
self._parser_backend.addValue('archive_processor_name', processor_name)
......@@ -265,7 +266,7 @@ class Calc(Proc):
warnings = getattr(self._parser_backend, '_warnings', [])
if len(warnings) > 0:
self._parser_backend.addValue('archive_processor_status', 'WithWarnings')
self._parser_backend.addValue('archive_processor_warning_number', len(warnings))
self._parser_backend.addValue('number_of_archive_processor_warnings', len(warnings))
self._parser_backend.addArrayValues('archive_processor_warnings', [str(warning) for warning in warnings])
else:
self._parser_backend.addValue('archive_processor_status', 'Success')
......@@ -274,11 +275,14 @@ class Calc(Proc):
self._parser_backend.addValue('archive_processor_error', str(errors))
self._parser_backend.closeNonOverlappingSection('section_archive_processing_info')
self._parser_backend.closeContext('/section_calculation_info/0')
self._parser_backend.closeContext('/section_entry_info/0')
@task
def normalizing(self):
for normalizer in normalizers:
if normalizer.domain != config.domain:
continue
normalizer_name = normalizer.__name__
context = dict(normalizer=normalizer_name, step=normalizer_name)
logger = self.get_logger(**context)
......@@ -310,7 +314,6 @@ class Calc(Proc):
calc_with_metadata = CalcWithMetadata(**self.metadata)
calc_with_metadata.apply_domain_metadata(self._parser_backend)
calc_with_metadata.parser_name = self.parser
calc_with_metadata.processed = True
# persist the calc metadata
......@@ -326,7 +329,7 @@ class Calc(Proc):
logger, 'archived', step='archive',
input_size=self.mainfile_file.size) as log_data:
with self.upload_files.archive_file(self.calc_id, 'wt') as out:
self._parser_backend.write_json(out, pretty=True)
self._parser_backend.write_json(out, pretty=True, root_sections=Domain.instance.root_sections)
log_data.update(archive_size=self.upload_files.archive_file_object(self.calc_id).size)
......
......@@ -29,7 +29,7 @@ Content-Type: application/json
{
"query": {
"match": {
"published": true
"published": false
}
}
}
......
......@@ -19,7 +19,7 @@ import os.path
import json
import re
from nomad import utils, infrastructure, config
from nomad import utils, infrastructure, config, datamodel
from nomad.files import UploadFiles, StagingUploadFiles, PublicUploadFiles
from nomad.processing import Upload, Calc
from nomad.processing.base import task as task_decorator, FAILURE, SUCCESS
......@@ -82,7 +82,7 @@ def assert_processing(upload: Upload):
with upload_files.archive_file(calc.calc_id) as archive_json:
archive = json.load(archive_json)
assert 'section_run' in archive
assert 'section_calculation_info' in archive
assert 'section_entry_info' in archive
with upload_files.archive_log_file(calc.calc_id) as f:
assert 'a test' in f.read()
......@@ -233,3 +233,17 @@ def test_malicious_parser_task_failure(proc_infra, failure, test_user):
assert not calc.tasks_running
assert calc.tasks_status == FAILURE
assert len(calc.errors) == 1
def test_ems_data(proc_infra, test_user, monkeypatch):
monkeypatch.setattr('nomad.config.domain', 'EMS')
monkeypatch.setattr('nomad.datamodel.Domain.instance', datamodel.Domain.instances['EMS'])
monkeypatch.setattr('nomad.datamodel.CalcWithMetadata', datamodel.Domain.instance.domain_entry_class)
upload = run_processing(('test_ems_upload', 'tests/data/proc/example_ems.zip'), test_user)
additional_keys = ['method', 'location', 'experiment_time']
assert upload.total_calcs == 1
assert_upload_files(upload, StagingUploadFiles, additional_keys, published=False)
assert_search_upload(upload, additional_keys, published=False)
......@@ -554,6 +554,7 @@ class TestRepo():
clear_elastic(elastic_infra)
calc_with_metadata = CalcWithMetadata(upload_id=0, calc_id=0)
calc_with_metadata.files = ['test/mainfile.txt']
calc_with_metadata.apply_domain_metadata(normalized)
calc_with_metadata.update(calc_id='1', uploader=test_user.to_popo(), published=True, with_embargo=False)
......
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A generator for random test calculations.
"""
import names
import random
from essential_generators import DocumentGenerator
......
......@@ -119,19 +119,19 @@ class TestLocalBackend(object):
backend.addValue('program_name', 't0')
backend.closeSection('section_run', 0)
g_index = backend.openSection('section_calculation_info')
g_index = backend.openSection('section_entry_info')
assert g_index == 0
backend.addValue('parser_name', 'p0')
backend.closeSection('section_calculation_info', 0)
backend.closeSection('section_entry_info', 0)
assert backend.get_sections('section_run') == [0]
assert backend.get_sections('section_calculation_info') == [0]
assert backend.get_sections('section_entry_info') == [0]
output = StringIO()
backend.write_json(output)
archive = json.loads(output.getvalue())
assert 'section_run' in archive
assert 'section_calculation_info' in archive
assert 'section_entry_info' in archive