From 38032188f3e6373bd14acbe837e882593fc75897 Mon Sep 17 00:00:00 2001 From: Markus Scheidgen <markus.scheidgen@gmail.com> Date: Tue, 16 Jul 2019 15:19:16 +0200 Subject: [PATCH] Fixed atom_labels normalization. Added parse command to client cli. --- nomad/admin/__init__.py | 2 +- nomad/admin/__main__.py | 4 +- nomad/client/__init__.py | 2 +- nomad/client/local.py | 50 +++-------------- nomad/client/parse.py | 105 ++++++++++++++++++++++++++++++++++++ nomad/datamodel/dft.py | 9 +++- nomad/normalizing/system.py | 41 ++++++++++---- nomad/parsing/__init__.py | 21 +++++--- tests/test_normalizing.py | 15 ++++-- 9 files changed, 177 insertions(+), 72 deletions(-) create mode 100644 nomad/client/parse.py diff --git a/nomad/admin/__init__.py b/nomad/admin/__init__.py index a3f3f14323..bc1477afad 100644 --- a/nomad/admin/__init__.py +++ b/nomad/admin/__init__.py @@ -23,4 +23,4 @@ from .__main__ import cli as cli_main def cli(): - cli_main(obj=POPO()) + cli_main(obj=POPO()) # pylint: disable=E1120,E1123 diff --git a/nomad/admin/__main__.py b/nomad/admin/__main__.py index 1ec8f3ba57..240b10cdd9 100644 --- a/nomad/admin/__main__.py +++ b/nomad/admin/__main__.py @@ -20,7 +20,7 @@ import shutil from tabulate import tabulate from elasticsearch_dsl import A -from nomad import config as nomad_config, infrastructure, processing, utils +from nomad import config as nomad_config, infrastructure, processing from nomad.search import Search @@ -145,4 +145,4 @@ def clean(dry, skip_calcs, skip_fs, skip_es): if __name__ == '__main__': - cli(obj={}) # pylint: disable=E1120 + cli(obj={}) # pylint: disable=E1120,E1123 diff --git a/nomad/client/__init__.py b/nomad/client/__init__.py index a2fc9fc68c..12266f4124 100644 --- a/nomad/client/__init__.py +++ b/nomad/client/__init__.py @@ -16,6 +16,6 @@ Swagger/bravado based python client library for the API and various usefull shell commands. """ -from . import local, migration, upload, integrationtests +from . 
import local, migration, upload, integrationtests, parse from .__main__ import cli, create_client from .upload import stream_upload_with_client diff --git a/nomad/client/local.py b/nomad/client/local.py index 66c1beed38..e6982bb427 100644 --- a/nomad/client/local.py +++ b/nomad/client/local.py @@ -17,16 +17,16 @@ import os import io import requests import click -from typing import Union, Callable, cast +from typing import Union, Callable import sys import ujson import bravado.exception from nomad import config, utils from nomad.files import ArchiveBasedStagingUploadFiles -from nomad.parsing import parser_dict, LocalBackend, match_parser -from nomad.normalizing import normalizers from nomad.datamodel import CalcWithMetadata +from nomad.parsing import LocalBackend +from nomad.client.parse import parse, normalize, normalize_all from .__main__ import cli @@ -126,30 +126,7 @@ class CalcProcReproduction: Run the given parser on the downloaded calculation. If no parser is given, do parser matching and use the respective parser. 
""" - if parser_name is not None: - parser = parser_dict.get(parser_name) - else: - parser = match_parser(self.mainfile, self.upload_files) - - assert parser is not None, 'there is not parser matching %s' % self.mainfile - self.logger = self.logger.bind(parser=parser.name) # type: ignore - self.logger.info('identified parser') - - parser_backend = parser.run(self.upload_files.raw_file_object(self.mainfile).os_path, logger=self.logger) - - if not parser_backend.status[0] == 'ParseSuccess': - self.logger.error('parsing was not successful', status=parser_backend.status) - - parser_backend.openNonOverlappingSection('section_entry_info') - parser_backend.addValue('upload_id', self.upload_id) - parser_backend.addValue('calc_id', self.calc_id) - parser_backend.addValue('calc_hash', "no hash") - parser_backend.addValue('mainfile', self.mainfile) - parser_backend.addValue('parser_name', parser.__class__.__name__) - parser_backend.closeNonOverlappingSection('section_entry_info') - - self.logger.info('ran parser') - return parser_backend + return parse(self.mainfile, self.upload_files, parser_name=parser_name, logger=self.logger) def normalize(self, normalizer: Union[str, Callable], parser_backend: LocalBackend = None): """ @@ -158,28 +135,13 @@ class CalcProcReproduction: if parser_backend is None: parser_backend = self.parse() - if isinstance(normalizer, str): - normalizer = next( - normalizer_instance for normalizer_instance in normalizers - if normalizer_instance.__class__.__name__ == normalizer) - - assert normalizer is not None, 'there is no normalizer %s' % str(normalizer) - normalizer_instance = cast(Callable, normalizer)(parser_backend) - logger = self.logger.bind(normalizer=normalizer_instance.__class__.__name__) - self.logger.info('identified normalizer') - - normalizer_instance.normalize(logger=logger) - self.logger.info('ran normalizer') - return parser_backend + return normalize(parser_backend=parser_backend, normalizer=normalizer, logger=self.logger) def 
normalize_all(self, parser_backend: LocalBackend = None): """ Parse the downloaded calculation and run the whole normalizer chain. """ - for normalizer in normalizers: - parser_backend = self.normalize(normalizer, parser_backend=parser_backend) - - return parser_backend + return normalize_all(parser_backend=parser_backend, logger=self.logger) @cli.command(help='Run processing locally.') diff --git a/nomad/client/parse.py b/nomad/client/parse.py new file mode 100644 index 0000000000..e917a10ef4 --- /dev/null +++ b/nomad/client/parse.py @@ -0,0 +1,105 @@ +from typing import Union, Callable, cast +import os.path +import ujson +import click +import sys + +from nomad import config, utils, files +from nomad.parsing import LocalBackend, parser_dict, match_parser +from nomad.normalizing import normalizers +from nomad.datamodel import CalcWithMetadata + +from .__main__ import cli + + +def parse( + mainfile: str, upload_files: Union[str, files.StagingUploadFiles], + parser_name: str = None, logger=None) -> LocalBackend: + """ + Run the given parser on the downloaded calculation. If no parser is given, + do parser matching and use the respective parser. 
+ """ + if logger is None: + logger = utils.get_logger(__name__) + if parser_name is not None: + parser = parser_dict.get(parser_name) + else: + parser = match_parser(mainfile, upload_files) + + assert parser is not None, 'there is not parser matching %s' % mainfile + logger = logger.bind(parser=parser.name) # type: ignore + logger.info('identified parser') + + if isinstance(upload_files, str): + mainfile_path = os.path.join(upload_files, mainfile) + else: + mainfile_path = upload_files.raw_file_object(mainfile).os_path + + parser_backend = parser.run(mainfile_path, logger=logger) + + if not parser_backend.status[0] == 'ParseSuccess': + logger.error('parsing was not successful', status=parser_backend.status) + + parser_backend.openNonOverlappingSection('section_entry_info') + parser_backend.addValue('upload_id', config.services.unavailable_value) + parser_backend.addValue('calc_id', config.services.unavailable_value) + parser_backend.addValue('calc_hash', "no hash") + parser_backend.addValue('mainfile', mainfile) + parser_backend.addValue('parser_name', parser.__class__.__name__) + parser_backend.closeNonOverlappingSection('section_entry_info') + + logger.info('ran parser') + return parser_backend + + +def normalize( + normalizer: Union[str, Callable], parser_backend: LocalBackend = None, + logger=None) -> LocalBackend: + + if logger is None: + logger = utils.get_logger(__name__) + + if isinstance(normalizer, str): + normalizer = next( + normalizer_instance for normalizer_instance in normalizers + if normalizer_instance.__class__.__name__ == normalizer) + + assert normalizer is not None, 'there is no normalizer %s' % str(normalizer) + normalizer_instance = cast(Callable, normalizer)(parser_backend) + logger = logger.bind(normalizer=normalizer_instance.__class__.__name__) + logger.info('identified normalizer') + + normalizer_instance.normalize(logger=logger) + logger.info('ran normalizer') + return parser_backend + + +def normalize_all(parser_backend: LocalBackend = 
None, logger=None) -> LocalBackend: + """ + Parse the downloaded calculation and run the whole normalizer chain. + """ + for normalizer in normalizers: + parser_backend = normalize(normalizer, parser_backend=parser_backend, logger=logger) + + return parser_backend + + +@cli.command(help='Run parsing and normalizing locally.', name='parse') +@click.argument('MAINFILE', nargs=1, required=True, type=str) +@click.option('--show-backend', is_flag=True, default=False, help='Print the backend data.') +@click.option('--show-metadata', is_flag=True, default=False, help='Print the extracted repo metadata.') +@click.option('--skip-normalizers', is_flag=True, default=False, help='Do not run the normalizer.') +def _parse(mainfile, show_backend, show_metadata, skip_normalizers): + utils.configure_logging() + + backend = parse(mainfile, '.') + + if not skip_normalizers: + normalize_all(backend) + + if show_backend: + backend.write_json(sys.stdout, pretty=True) + if show_metadata: + metadata = CalcWithMetadata() + metadata.apply_domain_metadata(backend) + ujson.dump(metadata.to_dict(), sys.stdout, indent=4) diff --git a/nomad/datamodel/dft.py b/nomad/datamodel/dft.py index 0a9c25b591..37315ceee0 100644 --- a/nomad/datamodel/dft.py +++ b/nomad/datamodel/dft.py @@ -89,17 +89,22 @@ class DFTCalcWithMetadata(CalcWithMetadata): super().__init__(**kwargs) def apply_domain_metadata(self, backend): + from nomad.normalizing.system import normalized_atom_labels + logger = utils.get_logger(__name__).bind( upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile) self.code_name = backend.get_value('program_name', 0) - self.code_version = simplify_version(backend.get_value('program_version', 0)) + try: + self.code_version = simplify_version(backend.get_value('program_version', 0)) + except KeyError: + self.code_version = config.services.unavailable_value self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', logger=logger) if hasattr(self.atoms, 
'tolist'): self.atoms = self.atoms.tolist() self.n_atoms = len(self.atoms) - self.atoms = list(set(self.atoms)) + self.atoms = list(set(normalized_atom_labels(set(self.atoms)))) self.atoms.sort() self.crystal_system = get_optional_backend_value( diff --git a/nomad/normalizing/system.py b/nomad/normalizing/system.py index ab5b0ddba3..e7895a72ff 100644 --- a/nomad/normalizing/system.py +++ b/nomad/normalizing/system.py @@ -16,6 +16,7 @@ from typing import Any import ase import numpy as np import json +import re from matid import SymmetryAnalyzer from matid.geometry import get_dimensionality @@ -24,6 +25,23 @@ from nomad import utils, config from nomad.normalizing.normalizer import SystemBasedNormalizer +# use a regular expression to check atom labels; expression is build from list of +# all labels sorted desc to find Br and not B when searching for Br. +atom_label_re = re.compile('|'.join( + sorted(ase.data.chemical_symbols, key=lambda x: len(x), reverse=True))) + + +def normalized_atom_labels(atom_labels): + """ + Normalizes the given atom labels: they either are labels right away, or contain + additional numbers (to distinguish same species but different labels, see meta-info), + or we replace them with ase placeholder atom for unknown elements 'X'. + """ + return [ + ase.data.chemical_symbols[0] if match is None else match.group(0) + for match in [re.search(atom_label_re, atom_label) for atom_label in atom_labels]] + + class SystemNormalizer(SystemBasedNormalizer): """ @@ -66,20 +84,25 @@ class SystemNormalizer(SystemBasedNormalizer): # analyze atoms labels atom_labels = get_value('atom_labels', nonp=True) + if atom_labels is not None: + atom_labels = normalized_atom_labels(atom_labels) + atom_species = get_value('atom_species', nonp=True) if atom_labels is None and atom_species is None: self.logger.error('calculation has neither atom species nor labels') return + # If there are no atom labels we create them from atom species data. 
if atom_labels is None: - atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species) - # At this point we should have atom labels. Check that each atom label in the atom - # labels list is a true atom label by checking if it is in the ASE list of atom labels. - if not all(label in ase.data.chemical_symbols for label in atom_labels): - # Throw an error that the atom labels are poorly formated or there are unknown - # labels. Save first ten elemenets in logged error. - self.logger.error('Atom labels cannot be recognized.', atom_labels=atom_labels[:10]) - return + try: + atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species) + except IndexError: + self.logger.error('calculation has atom species that are out of range') + return + + self._backend.addArrayValues('atom_labels', atom_labels) + + # At this point we should have atom labels. try: atoms = ase.Atoms(symbols=atom_labels) chemical_symbols = list(atoms.get_chemical_symbols()) @@ -91,8 +114,6 @@ class SystemNormalizer(SystemBasedNormalizer): 'cannot build ase atoms from atom labels', atom_labels=atom_labels[:10], exc_info=e, error=str(e)) raise e - # Write labels. Rewrite if labels exist in backend already from parser. - self._backend.addArrayValues('atom_labels', atom_labels) if atom_species is None: atom_species = atoms.get_atomic_numbers().tolist() diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py index abca23f6aa..7b505df631 100644 --- a/nomad/parsing/__init__.py +++ b/nomad/parsing/__init__.py @@ -58,10 +58,11 @@ based on NOMAD-coe's *python-common* module. 
:members: """ -from typing import Callable, IO +from typing import Callable, IO, Union import magic import gzip import bz2 +import os.path from nomad import files, config @@ -76,7 +77,7 @@ _compressions = { } -def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser': +def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles]) -> 'Parser': """ Performs parser matching. This means it take the given mainfile and potentially opens it with the given callback and tries to identify a parser that can parse @@ -87,15 +88,21 @@ def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Pars Arguments: mainfile: The upload relative path to the mainfile - open: A function that allows to open a stream to the file + upload_files: Either a :class:`files.StagingUploadFiles` object or a directory name. + Directory name + mainfile needs to point to the file. Returns: The parser, or None if no parser could be matched. """ - with upload_files.raw_file(mainfile, 'rb') as f: + if isinstance(upload_files, str): + mainfile_path = os.path.join(upload_files, mainfile) + else: + mainfile_path = upload_files.raw_file_object(mainfile).os_path + + with open(mainfile_path, 'rb') as f: compression, open_compressed = _compressions.get(f.read(3), (None, open)) - mainfile_path = upload_files.raw_file_object(mainfile).os_path - with open_compressed(mainfile_path, 'rb') as f: - buffer = f.read(2048) + + with open_compressed(mainfile_path, 'rb') as cf: + buffer = cf.read(2048) mime_type = magic.from_buffer(buffer, mime=True) for parser in parsers: diff --git a/tests/test_normalizing.py b/tests/test_normalizing.py index 78166f43a8..3c6fd708c4 100644 --- a/tests/test_normalizing.py +++ b/tests/test_normalizing.py @@ -131,17 +131,22 @@ def test_normalizer_faulty_matid(caplog): def test_normalizer_single_string_atom_labels(caplog): - """ Runs normalizer on ['Br1SiSiK'] expects error that it is formatted wrong.""" + """ + Runs normalizer 
on ['Br1SiSiK'] expects error. Should replace the label with 'X' and + the number of positions should not match the labels. + """ backend = parse_file(single_string_atom_labels) run_normalize(backend) - assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.') + assert_log(caplog, 'ERROR', 'len of atom position does not match number of atoms') -def test_normalizer_unknown_atom_label(caplog): - """ Runs normalizer on ['Br','Si','Si','Za'], expects Za throws an error""" +def test_normalizer_unknown_atom_label(caplog, no_warn): + """ Runs normalizer on ['Br','Si','Si','Za'], for normalization Za will be replaced, + but stays in the labels. + """ backend = parse_file(unknown_atom_label) run_normalize(backend) - assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.') + assert backend.get_value('atom_labels')[3] == 'Za' def test_symmetry_classification_fcc(): -- GitLab