Commit 4b280809 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Completed the refactor.

parent 817f2d4a
Pipeline #80729 passed with stages
in 40 minutes and 22 seconds
......@@ -32,7 +32,7 @@ pip install nomad-lab
To **use the NOMAD parsers for example**, install the `parsing` extra:
```
pip install nomad-lab[parsing]
nomad parse --show-backend <your-file-to-parse>
nomad parse --show-archive <your-file-to-parse>
```
### For NOMAD developer
......
......@@ -42,7 +42,7 @@ To parse code input/output from the command line, you can use NOMAD's command li
interface (CLI) and print the processing results output to stdout:
```
nomad parse --show-backend <path-to-file>
nomad parse --show-archive <path-to-file>
```
To parse a file in Python, you can program something like this:
......
Subproject commit afc55d917505d8a882611ca27ef91f0cc3ac11f6
Subproject commit f41d95aa9bf238dbcf2258eba82c87ecdb491cd1
Subproject commit 5b1305cb24a7ec2806a94b2b5192ab6ec9a7d0d2
Subproject commit 41bc37d7d165f671de427ab25bda17d110e22e38
Subproject commit 0a9bb17150428c5c86115091aed58a1ae502d96b
Subproject commit bd9c04e281aa42010d3e57310f1106680a332763
......@@ -4,11 +4,11 @@ Using the NOMAD parsers
To use the NOMAD parsers from the command line, you can use the ``parse`` command. The
parse command will automatically *match* the right parser to your code output file and
run the parser. There are two output formats, ``--show-metadata`` (a JSON representation
of the repository metadata), ``--show-backend`` (a JSON representation of the archive data).
of the repository metadata), ``--show-archive`` (a JSON representation of the archive data).
.. code-block:: sh
nomad parser --show-backend <path-to-your-mainfile-code-output-file>
nomad parser --show-archive <path-to-your-mainfile-code-output-file>
You can also use the NOMAD parsers from within Python. This will give you the parse
results as metainfo objects to conveniently analyse the results in Python. See :ref:`metainfo <metainfo-label>`
......
......@@ -284,10 +284,9 @@ like all others: add __usrMyCodeLength to the group name.
## Backend
The backend is an object that stores parsed data according to its meta-info. The
class :py:class:`nomad.parsing.AbstractParserBackend` provides the basic backend interface.
class :py:class:`nomad.parsing.Backend` provides the basic backend interface.
It allows one to open and close sections, add values, arrays, and values to arrays.
In nomad@FAIRDI, we practically only use the :py:class:`nomad.parsing.LocalBackend`. In
NOMAD-coe multiple backend implementations existed to facilitate the communication of
In NOMAD-coe multiple backend implementations existed to facilitate the communication of
python parsers with the scala infrastructure, including caching and streaming.
## Triggers
......
......@@ -23,7 +23,6 @@ import json
from nomad import config, utils
from nomad import files
from nomad import datamodel
from nomad.cli import parse as cli_parse
from .client import client
......@@ -131,31 +130,31 @@ class CalcProcReproduction:
self.upload_files.raw_file_object(self.mainfile).os_path,
parser_name=parser_name, logger=self.logger, **kwargs)
def normalize(self, normalizer: typing.Union[str, typing.Callable], parser_backend=None):
def normalize(self, normalizer: typing.Union[str, typing.Callable], entry_archive=None):
'''
Parse the downloaded calculation and run the given normalizer.
'''
if parser_backend is None:
parser_backend = self.parse()
if entry_archive is None:
entry_archive = self.parse()
return cli_parse.normalize(parser_backend=parser_backend, normalizer=normalizer, logger=self.logger)
return cli_parse.normalize(entry_archive=entry_archive, normalizer=normalizer, logger=self.logger)
def normalize_all(self, parser_backend=None):
def normalize_all(self, entry_archive=None):
'''
Parse the downloaded calculation and run the whole normalizer chain.
'''
return cli_parse.normalize_all(parser_backend=parser_backend, logger=self.logger)
return cli_parse.normalize_all(entry_archive=entry_archive, logger=self.logger)
@client.command(help='Run processing locally.')
@click.argument('CALC_ID', nargs=1, required=True, type=str)
@click.option('--override', is_flag=True, help='Override existing local calculation data.')
@click.option('--show-backend', is_flag=True, help='Print the backend data.')
@click.option('--show-archive', is_flag=True, help='Print the archive data.')
@click.option('--show-metadata', is_flag=True, help='Print the extracted repo metadata.')
@click.option('--mainfile', default=None, type=str, help='Use this mainfile (in case mainfile cannot be retrived via API.')
@click.option('--skip-normalizers', is_flag=True, help='Do not normalize.')
@click.option('--not-strict', is_flag=True, help='Also match artificial parsers.')
def local(calc_id, show_backend, show_metadata, skip_normalizers, not_strict, **kwargs):
def local(calc_id, show_archive, show_metadata, skip_normalizers, not_strict, **kwargs):
utils.get_logger(__name__).info('Using %s' % config.client.url)
with CalcProcReproduction(calc_id, **kwargs) as local:
......@@ -163,15 +162,15 @@ def local(calc_id, show_backend, show_metadata, skip_normalizers, not_strict, **
print(
'Data being saved to .volumes/fs/tmp/repro_'
'%s if not already there' % local.upload_id)
backend = local.parse(strict=not not_strict)
entry_archive = local.parse(strict=not not_strict)
if not skip_normalizers:
local.normalize_all(parser_backend=backend)
local.normalize_all(entry_archive=entry_archive)
if show_backend:
json.dump(backend.resource.m_to_dict(), sys.stdout, indent=2)
if show_archive:
json.dump(entry_archive.m_to_dict(), sys.stdout, indent=2)
if show_metadata:
metadata = datamodel.EntryMetadata(domain='dft') # TODO take domain from matched parser
metadata.apply_domain_metadata(backend.entry_archive)
metadata = entry_archive.section_metadata
metadata.apply_domain_metadata(entry_archive)
json.dump(metadata.m_to_dict(), sys.stdout, indent=4)
......@@ -41,18 +41,20 @@ def parse(
if hasattr(parser, 'backend_factory'):
setattr(parser, 'backend_factory', backend_factory)
parser_backend = parser.run(mainfile_path, logger=logger)
if not parser_backend.status[0] == 'ParseSuccess':
logger.error('parsing was not successful', status=parser_backend.status)
entry_archive = datamodel.EntryArchive()
metadata = entry_archive.m_create(datamodel.EntryMetadata)
metadata.domain = parser.domain
try:
parser.parse(mainfile_path, entry_archive, logger=logger)
except Exception as e:
logger.error('parsing was not successful', exc_info=e)
logger.info('ran parser')
return parser_backend
return entry_archive
def normalize(
normalizer: typing.Union[str, typing.Callable], parser_backend=None,
logger=None):
normalizer: typing.Union[str, typing.Callable], entry_archive, logger=None):
if logger is None:
logger = utils.get_logger(__name__)
......@@ -63,50 +65,46 @@ def normalize(
if normalizer_instance.__class__.__name__ == normalizer)
assert normalizer is not None, 'there is no normalizer %s' % str(normalizer)
normalizer_instance = typing.cast(typing.Callable, normalizer)(parser_backend.entry_archive)
normalizer_instance = typing.cast(typing.Callable, normalizer)(entry_archive)
logger = logger.bind(normalizer=normalizer_instance.__class__.__name__)
logger.info('identified normalizer')
normalizer_instance.normalize(logger=logger)
logger.info('ran normalizer')
return parser_backend
def normalize_all(parser_backend=None, logger=None):
def normalize_all(entry_archive, logger=None):
'''
Parse the downloaded calculation and run the whole normalizer chain.
'''
for normalizer in normalizing.normalizers:
if normalizer.domain == parser_backend.domain:
parser_backend = normalize(
normalizer, parser_backend=parser_backend, logger=logger)
return parser_backend
if normalizer.domain == entry_archive.section_metadata.domain:
normalize(normalizer, entry_archive, logger=logger)
@cli.command(help='Run parsing and normalizing locally.', name='parse')
@click.argument('MAINFILE', nargs=1, required=True, type=str)
@click.option('--show-backend', is_flag=True, default=False, help='Print the backend data.')
@click.option('--show-archive', is_flag=True, default=False, help='Print the archive data.')
@click.option('--show-metadata', is_flag=True, default=False, help='Print the extracted repo metadata.')
@click.option('--skip-normalizers', is_flag=True, default=False, help='Do not run the normalizer.')
@click.option('--not-strict', is_flag=True, help='Do also match artificial parsers.')
@click.option('--parser', help='Skip matching and use the provided parser')
@click.option('--annotate', is_flag=True, help='Sub-matcher based parsers will create a .annotate file.')
def _parse(
mainfile, show_backend, show_metadata, skip_normalizers, not_strict, parser,
mainfile, show_archive, show_metadata, skip_normalizers, not_strict, parser,
annotate):
nomadcore.simple_parser.annotate = annotate
kwargs = dict(strict=not not_strict, parser_name=parser)
backend = parse(mainfile, **kwargs)
entry_archive = parse(mainfile, **kwargs)
if not skip_normalizers:
normalize_all(backend)
normalize_all(entry_archive)
if show_backend:
json.dump(backend.resource.m_to_dict(), sys.stdout, indent=2)
if show_archive:
json.dump(entry_archive.m_to_dict(), sys.stdout, indent=2)
if show_metadata:
metadata = datamodel.EntryMetadata(domain='dft') # TODO take domain from matched parser
metadata.apply_domain_metadata(backend.entry_archive)
metadata = entry_archive.section_metadata
metadata.apply_domain_metadata(entry_archive)
json.dump(metadata.m_to_dict(), sys.stdout, indent=4)
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from nomad import config
def get_optional_backend_value(backend, key, section, unavailable_value=None, logger=None):
    '''
    Return the value stored under ``key`` in the sections named ``section``.

    Iterates over all sections with the given name in the backend (for
    ``section_system`` only representative systems are considered), warns via
    ``logger`` when different sections disagree, and converts numpy scalars to
    plain Python values.

    Returns ``unavailable_value`` (or the configured service default when it is
    ``None``) if the value is not present in any section.
    '''
    # Section is section_system, section_symmetry, etc...
    val = None  # Initialize to None, so we can compare section values.

    # Loop over the sections with the name section in the backend.
    for section_index in backend.get_sections(section):
        if section == 'section_system':
            # Only consider representative systems.
            try:
                if not backend.get_value('is_representative', section_index):
                    continue
            except (KeyError, IndexError):
                continue
        try:
            new_val = backend.get_value(key, section_index)
        except (KeyError, IndexError):
            new_val = None

        # Compare values from iterations.
        if val is not None and new_val is not None:
            if val.__repr__() != new_val.__repr__() and logger:
                logger.warning(
                    'The values for %s differ between different %s: %s vs %s' %
                    (key, section, str(val), str(new_val)))

        val = new_val if new_val is not None else val

    if val is None:
        # Bug fix: the fallback was previously only returned when a logger was
        # given (the return sat inside `if val is None and logger:`); now the
        # unavailable value is returned unconditionally.
        if logger:
            logger.warning(
                'The values for %s where not available in any %s' % (key, section))
        return unavailable_value if unavailable_value is not None else config.services.unavailable_value

    # Convert numpy scalars to native Python values.
    if isinstance(val, np.generic):
        return val.item()
    return val
......@@ -50,7 +50,7 @@ class BasisSet(ABC):
@abstractmethod
def to_dict(self) -> RestrictedDict:
"""Used to extract basis set settings from the backend and returning
"""Used to extract basis set settings from the archive and returning
them as a RestrictedDict.
"""
pass
......
......@@ -106,7 +106,7 @@ class EncyclopediaNormalizer(Normalizer):
except (AttributeError, KeyError):
pass
else:
# Try to find system type information from backend for the selected system.
# Try to find system type information from archive for the selected system.
try:
system = self.section_run.section_system[system_idx]
stype = system.system_type
......@@ -278,7 +278,7 @@ class EncyclopediaNormalizer(Normalizer):
representative_scc_idx=representative_scc_idx,
)
# Put the encyclopedia section into backend
# Put the encyclopedia section into archive
self.fill(context)
# Check that the necessary information is in place
......
......@@ -33,8 +33,8 @@ class OptimadeNormalizer(SystemBasedNormalizer):
This normalizer produces a section with all data necessary for the Optimade API.
It assumes that the :class:`SystemNormalizer` was run before.
'''
def __init__(self, backend):
super().__init__(backend, only_representatives=True)
def __init__(self, archive):
super().__init__(archive, only_representatives=True)
def add_optimade_data(self, index) -> OptimadeEntry:
'''
......
......@@ -23,9 +23,6 @@ class WorkflowNormalizer(Normalizer):
This normalizer produces a section with all data necessary for the Optimade API.
It assumes that the :class:`SystemNormalizer` was run before.
'''
def __init__(self, backend):
super().__init__(backend)
def _get_relaxation_type(self):
sec_system = self.section_run.section_system
if not sec_system:
......
......@@ -64,12 +64,10 @@ backends. In nomad@FAIRDI, we currently only use a single backend. The follow
classes provide an interface definition for *backends* as an ABC and a concrete implementation
based on nomad@fairdi's metainfo:
.. autoclass:: nomad.parsing.AbstractParserBackend
:members:
.. autoclass:: nomad.parsing.Backend
:members:
'''
from nomad.parsing.legacy import AbstractParserBackend, Backend, BackendError, LegacyParser
from nomad.parsing.legacy import Backend, BackendError, LegacyParser
from nomad.parsing.parser import Parser, BrokenParser, MissingParser, MatchingParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser, EmptyParser
......@@ -29,7 +29,6 @@ import signal
from nomad import metainfo
from nomad.datamodel import EntryArchive
from nomad.datamodel.metainfo import m_env as general_nomad_metainfo_env
from nomad.datamodel.metainfo.common import section_run as Run
from .legacy import Backend
......@@ -52,13 +51,6 @@ class EmptyParser(MatchingParser):
'''
name = "parsers/empty"
def run(self, mainfile: str, logger=None) -> Backend:
backend = Backend(metainfo=general_nomad_metainfo_env, domain=self.domain, logger=logger)
backend.openSection('section_run')
backend.addValue('program_name', self.code_name)
backend.closeSection('section_run', 0)
return backend
def parse(self, mainfile: str, archive: EntryArchive, logger=None) -> None:
run = archive.m_create(Run)
run.program_name = self.code_name
......@@ -115,24 +107,6 @@ class TemplateParser(ArtificalParser):
self.backend.closeSection(name, index)
def run(self, mainfile: str, logger=None) -> Backend:
# tell tests about received logger
if logger is not None:
logger.debug('received logger')
self.init_backend()
if 'warning' in mainfile:
self.backend.pwarn('A test warning.')
template_json = json.load(open(mainfile, 'r'))
self.add_section(template_json['section_run'][0])
if 'section_workflow' in template_json:
self.add_section(template_json['section_workflow'])
self.backend.finishedParsingSession('ParseSuccess', [])
logger.debug('a test log entry')
return self.backend
def parse(self, mainfile: str, archive: EntryArchive, logger=None) -> None:
# tell tests about received logger
if logger is not None:
......@@ -171,13 +145,7 @@ class ChaosParser(ArtificalParser):
def parse(self, mainfile: str, archive: EntryArchive, logger=None) -> None:
self.init_backend(entry_archive=archive)
self.do_chaos(mainfile, logger)
def run(self, mainfile: str, logger=None) -> Backend:
self.init_backend()
return self.do_chaos(mainfile, logger)
def do_chaos(self, mainfile: str, logger=None):
chaos_json = json.load(open(mainfile, 'r'))
if isinstance(chaos_json, str):
chaos = chaos_json
......@@ -271,20 +239,6 @@ class GenerateRandomParser(TemplateParser):
else:
return value
def run(self, mainfile: str, logger=None) -> Backend:
# tell tests about received logger
if logger is not None:
logger.debug('received logger')
self.init_backend()
seed = int(os.path.basename(mainfile).split('_')[1])
random.seed(seed)
numpy.random.seed(seed)
section = self.template['section_run'][0]
self.add_section(section)
self.backend.finishedParsingSession('ParseSuccess', [])
return self.backend
def parse(self, mainfile: str, archive: EntryArchive, logger=None) -> None:
# tell tests about received logger
if logger is not None:
......
......@@ -14,7 +14,7 @@
'''
This module contains functionality to use old 'legacy' NOMAD CoE parsers with the
new nomad@fairdi infrastructure. This covers aspects like the new metainfo, a unifying
new nomad@fairdi infrastructure. This covers aspects like the old metainfo, a unifying
wrapper for parsers, parser logging, and a parser backend.
'''
......@@ -41,136 +41,7 @@ class BackendError(Exception):
pass
class AbstractParserBackend(metaclass=ABCMeta):
    '''
    This ABC provides the parser backend interface used by the NOMAD-coe parsers.
    '''

    @abstractmethod
    def metaInfoEnv(self):
        ''' Returns the meta info used by this backend. '''
        pass

    @abstractmethod
    def startedParsingSession(
            self, mainFileUri, parserInfo, parserStatus=None, parserErrors=None):
        '''
        Should be called when the parsing starts.
        ParserInfo should be a valid json dictionary.
        '''
        pass

    @abstractmethod
    def finishedParsingSession(
            self, parserStatus, parserErrors, mainFileUri=None, parserInfo=None,
            parsingStats=None):
        ''' Called when the parsing finishes. '''
        pass

    @abstractmethod
    def openSection(self, metaName, parent_index=-1):
        ''' Opens a new section and returns its new unique gIndex. '''
        pass

    @abstractmethod
    def closeSection(self, metaName, gIndex):
        '''
        Closes the section with the given meta name and index. After this, no more
        values can be added to this section.
        '''
        pass

    @abstractmethod
    def openNonOverlappingSection(self, metaName):
        ''' Opens a new non overlapping section. '''
        pass

    @abstractmethod
    def setSectionInfo(self, metaName, gIndex, references):
        '''
        Sets info values of an open section. References should be a dictionary with
        the gIndexes of the root sections this section refers to.
        '''
        pass

    @abstractmethod
    def closeNonOverlappingSection(self, metaName):
        '''
        Closes the current non overlapping section for the given meta name. After
        this, no more values can be added to this section.
        '''
        pass

    @abstractmethod
    def openSections(self):
        ''' Returns the sections that are still open as metaName, gIndex tuples. '''
        pass

    @abstractmethod
    def addValue(self, metaName, value, gIndex=-1):
        '''
        Adds a json value for the given metaName. The gIndex is used to identify
        the right parent section.
        '''
        pass

    @abstractmethod
    def addRealValue(self, metaName, value, gIndex=-1):
        '''
        Adds a float value for the given metaName. The gIndex is used to identify
        the right parent section.
        '''
        pass

    @abstractmethod
    def addArray(self, metaName, shape, gIndex=-1):
        '''
        Adds an uninitialized array of the given shape for the given metaName.
        The gIndex is used to identify the right parent section.
        This is necessary before array values can be set with :func:`setArrayValues`.
        '''

    @abstractmethod
    def setArrayValues(self, metaName, values, offset=None, gIndex=-1):
        '''
        Adds values of the given numpy array to the last array added for the given
        metaName and parent gIndex.
        '''
        pass

    @abstractmethod
    def addArrayValues(self, metaName, values, gIndex=-1, override: bool = False):
        '''
        Adds an array with the given numpy array values for the given metaName and
        parent section gIndex. Override determines whether to rewrite existing values
        in the backend.
        '''
        pass

    @abstractmethod
    def pwarn(self, msg):
        ''' Used to catch parser warnings. '''
        pass

    @abstractmethod
    def get_sections(self, meta_name: str, g_index: int = -1) -> List[int]:
        ''' Return all gIndices for existing sections of the given meta_name and parent section index. '''
        pass

    @abstractmethod
    def get_value(self, metaName: str, g_index=-1) -> Any:
        '''
        Return the value set to the given meta_name in its parent section of the given index.
        An index of -1 (default) is only allowed if there is exactly one parent section.
        '''
        pass

    @abstractmethod
    def __getitem__(self, key):
        pass
class Backend(AbstractParserBackend):
class Backend():
'''
A backend that uses the new metainfo to store all data.
......@@ -255,12 +126,14 @@ class Backend(AbstractParserBackend):
return section.m_get_sub_sections(property_def)
def metaInfoEnv(self):
''' Returns the meta info used by this backend. '''
if self.__legacy_env is None:
self.__legacy_env = self.env.legacy_info_env()
return self.__legacy_env
def resolve_definition(self, name, section_cls: Type[MSectionBound]) -> MSectionBound:
definition = self.env.from_legacy_name(name, section_cls)
if definition:
return definition
......@@ -269,6 +142,8 @@ class Backend(AbstractParserBackend):
def openSection(self, name, parent_index: int = -1, return_section=False):
'''
Opens a new section and returns its new unique gIndex.
It will assume that there is a sub-section def with the given name.
It will use the latest opened section of the sub-sections parent as the parent
for the new section.
......@@ -311,9 +186,14 @@ class Backend(AbstractParserBackend):
return section, quantity_def
def closeSection(self, name, g_index):
'''
Closes the section with the given meta name and index. After this, no more
value can be added to this section.
'''
pass
def openNonOverlappingSection(self, metaName):
''' Opens a new non overlapping section. '''
return self.openSection(metaName)
def setSectionInfo(self, metaName, gInd