Commit 5787da77 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added EmptyParser and strict parsing to deal with PID entries without actual parsers.

parent c0187fbb
Pipeline #53899 passed with stages
in 22 minutes and 5 seconds
......@@ -123,12 +123,12 @@ class CalcProcReproduction:
def __exit__(self, *args):
    # Context-manager exit: remove the downloaded staging files so repeated
    # reproductions do not accumulate local data.
    self.upload_files.delete()
def parse(self, parser_name: str = None, **kwargs) -> LocalBackend:
    """
    Run the given parser on the downloaded calculation. If no parser is given,
    do parser matching and use the respective parser.

    Arguments:
        parser_name: Optional name of a specific parser to use instead of
            performing parser matching.
        **kwargs: Forwarded to the module-level :func:`parse`
            (e.g. ``strict`` to exclude artificial parsers from matching).

    Returns: The :class:`LocalBackend` holding the parsed data.
    """
    # Diff-residue cleanup: the scraped span contained both the pre- and
    # post-change versions of the signature and return line; this is the
    # post-commit version (with **kwargs pass-through).
    return parse(
        self.mainfile, self.upload_files, parser_name=parser_name,
        logger=self.logger, **kwargs)
def normalize(self, normalizer: Union[str, Callable], parser_backend: LocalBackend = None):
"""
......@@ -153,7 +153,8 @@ class CalcProcReproduction:
@click.option('--show-metadata', is_flag=True, help='Print the extracted repo metadata.')
@click.option('--mainfile', default=None, type=str, help='Use this mainfile (in case mainfile cannot be retrived via API.')
@click.option('--skip-normalizers', is_flag=True, help='Do not normalize.')
def local(calc_id, show_backend, show_metadata, skip_normalizers, **kwargs):
@click.option('--not-strict', is_flag=True, help='Also match artificial parsers.')
def local(calc_id, show_backend, show_metadata, skip_normalizers, not_strict, **kwargs):
utils.configure_logging()
utils.get_logger(__name__).info('Using %s' % config.client.url)
......@@ -162,7 +163,7 @@ def local(calc_id, show_backend, show_metadata, skip_normalizers, **kwargs):
print(
'Data being saved to .volumes/fs/tmp/repro_'
'%s if not already there' % local.upload_id)
backend = local.parse()
backend = local.parse(strict=not not_strict)
if not skip_normalizers:
local.normalize_all(parser_backend=backend)
......
......@@ -5,7 +5,7 @@ import click
import sys
from nomad import config, utils, files
from nomad.parsing import LocalBackend, parser_dict, match_parser
from nomad.parsing import LocalBackend, parser_dict, match_parser, MatchingParser
from nomad.normalizing import normalizers
from nomad.datamodel import CalcWithMetadata
......@@ -14,7 +14,7 @@ from .cli import cli
def parse(
mainfile: str, upload_files: Union[str, files.StagingUploadFiles],
parser_name: str = None, logger=None) -> LocalBackend:
parser_name: str = None, strict: bool = True, logger=None) -> LocalBackend:
"""
Run the given parser on the downloaded calculation. If no parser is given,
do parser matching and use the respective parser.
......@@ -24,7 +24,11 @@ def parse(
if parser_name is not None:
parser = parser_dict.get(parser_name)
else:
parser = match_parser(mainfile, upload_files)
parser = match_parser(mainfile, upload_files, strict=strict)
if isinstance(parser, MatchingParser):
parser_name = parser.name
else:
parser_name = parser.__class__.__name__
assert parser is not None, 'there is not parser matching %s' % mainfile
logger = logger.bind(parser=parser.name) # type: ignore
......@@ -45,7 +49,7 @@ def parse(
parser_backend.addValue('calc_id', config.services.unavailable_value)
parser_backend.addValue('calc_hash', "no hash")
parser_backend.addValue('mainfile', mainfile)
parser_backend.addValue('parser_name', parser.__class__.__name__)
parser_backend.addValue('parser_name', parser_name)
parser_backend.closeNonOverlappingSection('section_entry_info')
logger.info('ran parser')
......@@ -89,10 +93,11 @@ def normalize_all(parser_backend: LocalBackend = None, logger=None) -> LocalBack
@click.option('--show-backend', is_flag=True, default=False, help='Print the backend data.')
@click.option('--show-metadata', is_flag=True, default=False, help='Print the extracted repo metadata.')
@click.option('--skip-normalizers', is_flag=True, default=False, help='Do not run the normalizer.')
def _parse(mainfile, show_backend, show_metadata, skip_normalizers):
@click.option('--not-strict', is_flag=True, help='Do also match artificial parsers.')
def _parse(mainfile, show_backend, show_metadata, skip_normalizers, not_strict):
utils.configure_logging()
backend = parse(mainfile, '.')
backend = parse(mainfile, '.', strict=not not_strict)
if not skip_normalizers:
normalize_all(backend)
......
......@@ -110,7 +110,7 @@ class DFTCalcWithMetadata(CalcWithMetadata):
except KeyError:
self.code_version = config.services.unavailable_value
self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', logger=logger)
self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', [], logger=logger)
if hasattr(self.atoms, 'tolist'):
self.atoms = self.atoms.tolist()
self.n_atoms = len(self.atoms)
......
......@@ -68,7 +68,7 @@ from nomad import files, config
from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser, BrokenParser, MissingParser, MatchingParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser, EmptyParser
_compressions = {
......@@ -77,7 +77,7 @@ _compressions = {
}
def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles]) -> 'Parser':
def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles], strict=True) -> 'Parser':
"""
Performs parser matching. This means it take the given mainfile and potentially
opens it with the given callback and tries to identify a parser that can parse
......@@ -90,6 +90,7 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
mainfile: The upload relative path to the mainfile
upload_files: Either a :class:`files.StagingUploadFiles` object or a directory name.
Directory name + mainfile needs to point to the file.
strict: Only match strict parsers, e.g. no artificial parsers for missing or empty entries.
Returns: The parser, or None if no parser could be matched.
"""
......@@ -106,10 +107,15 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
mime_type = magic.from_buffer(buffer, mime=True)
for parser in parsers:
if parser.domain == config.domain:
if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
# TODO: deal with multiple possible parser specs
return parser
if strict and (isinstance(parser, MissingParser) or isinstance(parser, EmptyParser)):
continue
if parser.domain != config.domain:
continue
if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
# TODO: deal with multiple possible parser specs
return parser
return None
......@@ -402,28 +408,23 @@ parsers = [
parser_class_name='onetepparser.OnetepParser',
mainfile_contents_re=r'####### # # ####### ####### ####### ######'
),
# These are supposedly octopus files, but they do not look like octopus files at all
# TODO We have migrated the wrong octopus mainfiles .. this should be removed now
# MissingParser(
# name='parser/octopus', code_name='Octopus', domain='DFT',
# mainfile_name_re=r'(inp)|(.*/inp)'
# ),
# We already have crystal with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/crystal', code_name='Crystal',
parser_class_name='crystalparser.CrystalParser',
# There are some entries with PIDs that have mainfiles which do not match what
# the actual parsers expect. We use the EmptyParser to produce placeholder entries
# to keep the PIDs. These parsers will not match for new, non migrated data.
EmptyParser(
name='missing/octopus', code_name='Octopus', domain='DFT',
mainfile_name_re=r'(inp)|(.*/inp)'
),
EmptyParser(
name='missing/crystal', code_name='Crystal',
mainfile_name_re=r'.*\.cryst\.out'
),
# We already have wien2k with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/wien2k', code_name='WIEN2k',
parser_class_name='wien2kparser.Wien2kParser',
EmptyParser(
name='missing/wien2k', code_name='WIEN2k',
mainfile_name_re=r'.*\.scf'
),
# We already have fhi-aims with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/fhi-aims', code_name='FHI-aims',
parser_class_name='fhiaimsparser.FHIaimsParser',
EmptyParser(
name='missing/fhi-aims', code_name='FHI-aims', domain='DFT',
mainfile_name_re=r'.*\.fhiaims'
),
BrokenParser()
......
......@@ -31,27 +31,40 @@ from nomadcore.local_meta_info import loadJsonFile, InfoKindEl
import nomad_meta_info
from nomad.parsing.backend import LocalBackend
from nomad.parsing.parser import Parser
from nomad.parsing.parser import Parser, MatchingParser
file_dir = os.path.dirname(os.path.abspath(nomad_meta_info.__file__))
meta_info_path = os.path.normpath(os.path.join(file_dir, 'vasp.nomadmetainfo.json'))
meta_info_env, _ = loadJsonFile(filePath=meta_info_path, dependencyLoader=None, extraArgsHandling=InfoKindEl.ADD_EXTRA_ARGS, uri=None)
class ArtificalParser(Parser):
    """ Base class for artifical parsers based on VASP metainfo. """

    def __init__(self):
        super().__init__()
        # Backend is created lazily via :meth:`init_backend`; parsers reset it
        # per run.
        self.backend = None

    def init_backend(self):
        # Create a fresh backend from the module-level vasp metainfo
        # environment — not because vasp metainfo is semantically right here,
        # but because it works for artificial data.
        self.backend = LocalBackend(metaInfoEnv=meta_info_env, debug=False)

    @property
    def name(self):
        # NOTE(review): returns the class attribute ``name`` (not the class
        # name itself) — confirm subclasses define a ``name`` attribute.
        return self.__class__.name
class EmptyParser(MatchingParser):
    """
    Implementation that produces an empty code_run

    Used as a placeholder parser: it emits a single ``section_run`` that
    carries only the code name, without reading the mainfile contents.
    """
    def run(self, mainfile: str, logger=None) -> LocalBackend:
        # ``mainfile`` and ``logger`` are accepted for interface compatibility
        # but never read here.
        backend = LocalBackend(metaInfoEnv=meta_info_env, debug=False)  # type: ignore
        backend.openSection('section_run')
        # ``self.code_name`` — presumably set by MatchingParser's constructor;
        # verify against the parser base class.
        backend.addValue('program_name', self.code_name)
        backend.closeSection('section_run', 0)
        return backend
class TemplateParser(ArtificalParser):
"""
A parser that generates data based on a template given via the
......@@ -85,7 +98,7 @@ class TemplateParser(ArtificalParser):
else:
value = self.transform_value(key, value)
if isinstance(value, list):
shape = self.meta_info_env[key].get('shape')
shape = meta_info_env[key].get('shape')
if shape is None or len(shape) == 0:
for single_value in value:
self.backend.addValue(key, single_value, index)
......
......@@ -660,10 +660,10 @@ class Upload(Proc):
calc.reset(worker_hostname=self.worker_hostname)
parser = match_parser(calc.mainfile, staging_upload_files)
parser = match_parser(calc.mainfile, staging_upload_files, strict=False)
if parser is None:
logger.warn(
'no parser matches during re-process, use old parser',
logger.error(
'no parser matches during re-process, use the old parser',
calc_id=calc.calc_id)
elif calc.parser != parser.name:
calc.parser = parser.name
......
......@@ -112,11 +112,14 @@ def assert_normalized(backend: LocalBackend):
assert metadata.xc_functional is not None
assert metadata.system is not None
assert metadata.crystal_system is not None
assert len(metadata.atoms) > 0
assert len(metadata.atoms) is not None
assert metadata.spacegroup is not None
exceptions = parser_exceptions.get(backend.get_value('parser_name'), [])
if metadata.formula != config.services.unavailable_value:
assert len(metadata.atoms) > 0
for key in calc_metadata_keys:
if key not in exceptions:
assert getattr(metadata, key) != config.services.unavailable_value
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment