Commit 38032188 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Fixed atom_labels normalization. Added parse command to client cli.

parent 8a4b0a14
Pipeline #51998 passed with stages
in 19 minutes and 46 seconds
......@@ -23,4 +23,4 @@ from .__main__ import cli as cli_main
def cli():
cli_main(obj=POPO())
cli_main(obj=POPO()) # pylint: disable=E1120,E1123
......@@ -20,7 +20,7 @@ import shutil
from tabulate import tabulate
from elasticsearch_dsl import A
from nomad import config as nomad_config, infrastructure, processing, utils
from nomad import config as nomad_config, infrastructure, processing
from nomad.search import Search
......@@ -145,4 +145,4 @@ def clean(dry, skip_calcs, skip_fs, skip_es):
if __name__ == '__main__':
cli(obj={}) # pylint: disable=E1120
cli(obj={}) # pylint: disable=E1120,E1123
......@@ -16,6 +16,6 @@
Swagger/bravado based python client library for the API and various useful shell commands.
"""
from . import local, migration, upload, integrationtests
from . import local, migration, upload, integrationtests, parse
from .__main__ import cli, create_client
from .upload import stream_upload_with_client
......@@ -17,16 +17,16 @@ import os
import io
import requests
import click
from typing import Union, Callable, cast
from typing import Union, Callable
import sys
import ujson
import bravado.exception
from nomad import config, utils
from nomad.files import ArchiveBasedStagingUploadFiles
from nomad.parsing import parser_dict, LocalBackend, match_parser
from nomad.normalizing import normalizers
from nomad.datamodel import CalcWithMetadata
from nomad.parsing import LocalBackend
from nomad.client.parse import parse, normalize, normalize_all
from .__main__ import cli
......@@ -126,30 +126,7 @@ class CalcProcReproduction:
Run the given parser on the downloaded calculation. If no parser is given,
do parser matching and use the respective parser.
"""
if parser_name is not None:
parser = parser_dict.get(parser_name)
else:
parser = match_parser(self.mainfile, self.upload_files)
assert parser is not None, 'there is not parser matching %s' % self.mainfile
self.logger = self.logger.bind(parser=parser.name) # type: ignore
self.logger.info('identified parser')
parser_backend = parser.run(self.upload_files.raw_file_object(self.mainfile).os_path, logger=self.logger)
if not parser_backend.status[0] == 'ParseSuccess':
self.logger.error('parsing was not successful', status=parser_backend.status)
parser_backend.openNonOverlappingSection('section_entry_info')
parser_backend.addValue('upload_id', self.upload_id)
parser_backend.addValue('calc_id', self.calc_id)
parser_backend.addValue('calc_hash', "no hash")
parser_backend.addValue('mainfile', self.mainfile)
parser_backend.addValue('parser_name', parser.__class__.__name__)
parser_backend.closeNonOverlappingSection('section_entry_info')
self.logger.info('ran parser')
return parser_backend
return parse(self.mainfile, self.upload_files, parser_name=parser_name, logger=self.logger)
def normalize(self, normalizer: Union[str, Callable], parser_backend: LocalBackend = None):
"""
......@@ -158,28 +135,13 @@ class CalcProcReproduction:
if parser_backend is None:
parser_backend = self.parse()
if isinstance(normalizer, str):
normalizer = next(
normalizer_instance for normalizer_instance in normalizers
if normalizer_instance.__class__.__name__ == normalizer)
assert normalizer is not None, 'there is no normalizer %s' % str(normalizer)
normalizer_instance = cast(Callable, normalizer)(parser_backend)
logger = self.logger.bind(normalizer=normalizer_instance.__class__.__name__)
self.logger.info('identified normalizer')
normalizer_instance.normalize(logger=logger)
self.logger.info('ran normalizer')
return parser_backend
return normalize(parser_backend=parser_backend, normalizer=normalizer, logger=self.logger)
def normalize_all(self, parser_backend: LocalBackend = None):
"""
Parse the downloaded calculation and run the whole normalizer chain.
"""
for normalizer in normalizers:
parser_backend = self.normalize(normalizer, parser_backend=parser_backend)
return parser_backend
return normalize_all(parser_backend=parser_backend, logger=self.logger)
@cli.command(help='Run processing locally.')
......
from typing import Union, Callable, cast
import os.path
import ujson
import click
import sys
from nomad import config, utils, files
from nomad.parsing import LocalBackend, parser_dict, match_parser
from nomad.normalizing import normalizers
from nomad.datamodel import CalcWithMetadata
from .__main__ import cli
def parse(
        mainfile: str, upload_files: Union[str, files.StagingUploadFiles],
        parser_name: str = None, logger=None) -> LocalBackend:
    """
    Run a parser on the given mainfile. If no parser is given, do parser
    matching and use the respective parser.

    Arguments:
        mainfile: The upload-relative path to the mainfile.
        upload_files: Either a :class:`files.StagingUploadFiles` object or a
            directory name. Directory name + mainfile needs to point to the file.
        parser_name: Optional name of the parser to use; when omitted the
            parser is determined via :func:`match_parser`.
        logger: Optional structlog-style logger; a default module logger is
            created when omitted.

    Returns: The :class:`LocalBackend` holding the parsing results.
    """
    if logger is None:
        logger = utils.get_logger(__name__)

    if parser_name is not None:
        parser = parser_dict.get(parser_name)
    else:
        parser = match_parser(mainfile, upload_files)
    assert parser is not None, 'there is no parser matching %s' % mainfile

    logger = logger.bind(parser=parser.name)  # type: ignore
    logger.info('identified parser')

    # Resolve the on-disk path of the mainfile for both supported
    # upload_files representations (plain directory vs. StagingUploadFiles).
    if isinstance(upload_files, str):
        mainfile_path = os.path.join(upload_files, mainfile)
    else:
        mainfile_path = upload_files.raw_file_object(mainfile).os_path

    parser_backend = parser.run(mainfile_path, logger=logger)
    if not parser_backend.status[0] == 'ParseSuccess':
        logger.error('parsing was not successful', status=parser_backend.status)

    # Fill section_entry_info so downstream consumers find the expected
    # metadata even for local runs without a real upload/calc id.
    parser_backend.openNonOverlappingSection('section_entry_info')
    parser_backend.addValue('upload_id', config.services.unavailable_value)
    parser_backend.addValue('calc_id', config.services.unavailable_value)
    parser_backend.addValue('calc_hash', "no hash")
    parser_backend.addValue('mainfile', mainfile)
    parser_backend.addValue('parser_name', parser.__class__.__name__)
    parser_backend.closeNonOverlappingSection('section_entry_info')

    logger.info('ran parser')
    return parser_backend
def normalize(
        normalizer: Union[str, Callable], parser_backend: LocalBackend = None,
        logger=None) -> LocalBackend:
    """
    Run a single normalizer on the given parsed backend.

    Arguments:
        normalizer: Either the class name of a registered normalizer (looked up
            in :data:`normalizers`) or a normalizer class/callable itself.
        parser_backend: The backend with parsing results to normalize.
        logger: Optional structlog-style logger; a default module logger is
            created when omitted.

    Returns: The given backend with the normalization results added.
    """
    if logger is None:
        logger = utils.get_logger(__name__)

    if isinstance(normalizer, str):
        normalizer_name = normalizer
        # Use a None default so a missing normalizer trips the assert below
        # with a clear message instead of raising a bare StopIteration.
        normalizer = next(
            (normalizer_instance for normalizer_instance in normalizers
             if normalizer_instance.__class__.__name__ == normalizer_name),
            None)
        assert normalizer is not None, 'there is no normalizer %s' % normalizer_name

    assert normalizer is not None, 'there is no normalizer %s' % str(normalizer)
    normalizer_instance = cast(Callable, normalizer)(parser_backend)
    logger = logger.bind(normalizer=normalizer_instance.__class__.__name__)
    logger.info('identified normalizer')

    normalizer_instance.normalize(logger=logger)
    logger.info('ran normalizer')
    return parser_backend
def normalize_all(parser_backend: LocalBackend = None, logger=None) -> LocalBackend:
    """
    Run the whole chain of registered normalizers on the given backend.

    Arguments:
        parser_backend: The backend with parsing results; parsed lazily by
            :func:`normalize` if None.
        logger: Optional logger passed through to each normalizer run.

    Returns: The backend with all normalization results added.
    """
    # Each normalizer operates on the backend produced by the previous one.
    for normalizer_class in normalizers:
        parser_backend = normalize(
            normalizer_class, parser_backend=parser_backend, logger=logger)

    return parser_backend
@cli.command(help='Run parsing and normalizing locally.', name='parse')
@click.argument('MAINFILE', nargs=1, required=True, type=str)
@click.option('--show-backend', is_flag=True, default=False, help='Print the backend data.')
@click.option('--show-metadata', is_flag=True, default=False, help='Print the extracted repo metadata.')
@click.option('--skip-normalizers', is_flag=True, default=False, help='Do not run the normalizer.')
def _parse(mainfile, show_backend, show_metadata, skip_normalizers):
    utils.configure_logging()

    # Parse the mainfile relative to the current working directory.
    backend = parse(mainfile, '.')
    if not skip_normalizers:
        normalize_all(parser_backend=backend)

    # Optionally dump the raw backend and/or the extracted repo metadata.
    if show_backend:
        backend.write_json(sys.stdout, pretty=True)
    if show_metadata:
        metadata = CalcWithMetadata()
        metadata.apply_domain_metadata(backend)
        ujson.dump(metadata.to_dict(), sys.stdout, indent=4)
......@@ -89,17 +89,22 @@ class DFTCalcWithMetadata(CalcWithMetadata):
super().__init__(**kwargs)
def apply_domain_metadata(self, backend):
from nomad.normalizing.system import normalized_atom_labels
logger = utils.get_logger(__name__).bind(
upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile)
self.code_name = backend.get_value('program_name', 0)
self.code_version = simplify_version(backend.get_value('program_version', 0))
try:
self.code_version = simplify_version(backend.get_value('program_version', 0))
except KeyError:
self.code_version = config.services.unavailable_value
self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', logger=logger)
if hasattr(self.atoms, 'tolist'):
self.atoms = self.atoms.tolist()
self.n_atoms = len(self.atoms)
self.atoms = list(set(self.atoms))
self.atoms = list(set(normalized_atom_labels(set(self.atoms))))
self.atoms.sort()
self.crystal_system = get_optional_backend_value(
......
......@@ -16,6 +16,7 @@ from typing import Any
import ase
import numpy as np
import json
import re
from matid import SymmetryAnalyzer
from matid.geometry import get_dimensionality
......@@ -24,6 +25,23 @@ from nomad import utils, config
from nomad.normalizing.normalizer import SystemBasedNormalizer
# Regular expression matching any known chemical symbol. Symbols are sorted by
# descending length so that e.g. 'Br' is matched before 'B'.
atom_label_re = re.compile('|'.join(
    sorted(ase.data.chemical_symbols, key=lambda x: len(x), reverse=True)))


def normalized_atom_labels(atom_labels):
    """
    Map each given atom label to a proper chemical symbol.

    A label may already be a symbol, or a symbol with additional numbers (to
    distinguish same species but different labels, see meta-info). Labels with
    no recognizable symbol are replaced by ase's placeholder element 'X'.
    """
    placeholder = ase.data.chemical_symbols[0]  # ase's unknown element 'X'
    result = []
    for label in atom_labels:
        match = atom_label_re.search(label)
        result.append(placeholder if match is None else match.group(0))
    return result
class SystemNormalizer(SystemBasedNormalizer):
"""
......@@ -66,20 +84,25 @@ class SystemNormalizer(SystemBasedNormalizer):
# analyze atoms labels
atom_labels = get_value('atom_labels', nonp=True)
if atom_labels is not None:
atom_labels = normalized_atom_labels(atom_labels)
atom_species = get_value('atom_species', nonp=True)
if atom_labels is None and atom_species is None:
self.logger.error('calculation has neither atom species nor labels')
return
# If there are no atom labels we create them from atom species data.
if atom_labels is None:
atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species)
# At this point we should have atom labels. Check that each atom label in the atom
# labels list is a true atom label by checking if it is in the ASE list of atom labels.
if not all(label in ase.data.chemical_symbols for label in atom_labels):
# Throw an error that the atom labels are poorly formated or there are unknown
# labels. Save first ten elemenets in logged error.
self.logger.error('Atom labels cannot be recognized.', atom_labels=atom_labels[:10])
return
try:
atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species)
except IndexError:
self.logger.error('calculation has atom species that are out of range')
return
self._backend.addArrayValues('atom_labels', atom_labels)
# At this point we should have atom labels.
try:
atoms = ase.Atoms(symbols=atom_labels)
chemical_symbols = list(atoms.get_chemical_symbols())
......@@ -91,8 +114,6 @@ class SystemNormalizer(SystemBasedNormalizer):
'cannot build ase atoms from atom labels',
atom_labels=atom_labels[:10], exc_info=e, error=str(e))
raise e
# Write labels. Rewrite if labels exist in backend already from parser.
self._backend.addArrayValues('atom_labels', atom_labels)
if atom_species is None:
atom_species = atoms.get_atomic_numbers().tolist()
......
......@@ -58,10 +58,11 @@ based on NOMAD-coe's *python-common* module.
:members:
"""
from typing import Callable, IO
from typing import Callable, IO, Union
import magic
import gzip
import bz2
import os.path
from nomad import files, config
......@@ -76,7 +77,7 @@ _compressions = {
}
def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser':
def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles]) -> 'Parser':
"""
Performs parser matching. This means it take the given mainfile and potentially
opens it with the given callback and tries to identify a parser that can parse
......@@ -87,15 +88,21 @@ def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Pars
Arguments:
mainfile: The upload relative path to the mainfile
open: A function that allows to open a stream to the file
upload_files: Either a :class:`files.StagingUploadFiles` object or a directory name.
Directory name + mainfile needs to point to the file.
Returns: The parser, or None if no parser could be matched.
"""
with upload_files.raw_file(mainfile, 'rb') as f:
if isinstance(upload_files, str):
mainfile_path = os.path.join(upload_files, mainfile)
else:
mainfile_path = upload_files.raw_file_object(mainfile).os_path
with open(mainfile_path, 'rb') as f:
compression, open_compressed = _compressions.get(f.read(3), (None, open))
mainfile_path = upload_files.raw_file_object(mainfile).os_path
with open_compressed(mainfile_path, 'rb') as f:
buffer = f.read(2048)
with open_compressed(mainfile_path, 'rb') as cf:
buffer = cf.read(2048)
mime_type = magic.from_buffer(buffer, mime=True)
for parser in parsers:
......
......@@ -131,17 +131,22 @@ def test_normalizer_faulty_matid(caplog):
def test_normalizer_single_string_atom_labels(caplog):
""" Runs normalizer on ['Br1SiSiK'] expects error that it is formatted wrong."""
"""
Runs normalizer on ['Br1SiSiK'] and expects an error. It should replace the label with 'X' and
the number of positions should not match the labels.
"""
backend = parse_file(single_string_atom_labels)
run_normalize(backend)
assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.')
assert_log(caplog, 'ERROR', 'len of atom position does not match number of atoms')
def test_normalizer_unknown_atom_label(caplog):
""" Runs normalizer on ['Br','Si','Si','Za'], expects Za throws an error"""
def test_normalizer_unknown_atom_label(caplog, no_warn):
""" Runs normalizer on ['Br','Si','Si','Za'], for normalizeation Za will be replaced,
but stays int the labels.
"""
backend = parse_file(unknown_atom_label)
run_normalize(backend)
assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.')
assert backend.get_value('atom_labels')[3] == 'Za'
def test_symmetry_classification_fcc():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment