From 38032188f3e6373bd14acbe837e882593fc75897 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Tue, 16 Jul 2019 15:19:16 +0200
Subject: [PATCH] Fixed atom_labels normalization. Added parse command to
 client cli.

---
 nomad/admin/__init__.py     |   2 +-
 nomad/admin/__main__.py     |   4 +-
 nomad/client/__init__.py    |   2 +-
 nomad/client/local.py       |  50 +++--------------
 nomad/client/parse.py       | 105 ++++++++++++++++++++++++++++++++++++
 nomad/datamodel/dft.py      |   9 +++-
 nomad/normalizing/system.py |  41 ++++++++++----
 nomad/parsing/__init__.py   |  21 +++++---
 tests/test_normalizing.py   |  15 ++++--
 9 files changed, 177 insertions(+), 72 deletions(-)
 create mode 100644 nomad/client/parse.py

diff --git a/nomad/admin/__init__.py b/nomad/admin/__init__.py
index a3f3f14323..bc1477afad 100644
--- a/nomad/admin/__init__.py
+++ b/nomad/admin/__init__.py
@@ -23,4 +23,4 @@ from .__main__ import cli as cli_main
 
 
 def cli():
-    cli_main(obj=POPO())
+    cli_main(obj=POPO())  # pylint: disable=E1120,E1123
diff --git a/nomad/admin/__main__.py b/nomad/admin/__main__.py
index 1ec8f3ba57..240b10cdd9 100644
--- a/nomad/admin/__main__.py
+++ b/nomad/admin/__main__.py
@@ -20,7 +20,7 @@ import shutil
 from tabulate import tabulate
 from elasticsearch_dsl import A
 
-from nomad import config as nomad_config, infrastructure, processing, utils
+from nomad import config as nomad_config, infrastructure, processing
 from nomad.search import Search
 
 
@@ -145,4 +145,4 @@ def clean(dry, skip_calcs, skip_fs, skip_es):
 
 
 if __name__ == '__main__':
-    cli(obj={})  # pylint: disable=E1120
+    cli(obj={})  # pylint: disable=E1120,E1123
diff --git a/nomad/client/__init__.py b/nomad/client/__init__.py
index a2fc9fc68c..12266f4124 100644
--- a/nomad/client/__init__.py
+++ b/nomad/client/__init__.py
@@ -16,6 +16,6 @@
 Swagger/bravado based python client library for the API and various usefull shell commands.
 """
 
-from . import local, migration, upload, integrationtests
+from . import local, migration, upload, integrationtests, parse
 from .__main__ import cli, create_client
 from .upload import stream_upload_with_client
diff --git a/nomad/client/local.py b/nomad/client/local.py
index 66c1beed38..e6982bb427 100644
--- a/nomad/client/local.py
+++ b/nomad/client/local.py
@@ -17,16 +17,16 @@ import os
 import io
 import requests
 import click
-from typing import Union, Callable, cast
+from typing import Union, Callable
 import sys
 import ujson
 import bravado.exception
 
 from nomad import config, utils
 from nomad.files import ArchiveBasedStagingUploadFiles
-from nomad.parsing import parser_dict, LocalBackend, match_parser
-from nomad.normalizing import normalizers
 from nomad.datamodel import CalcWithMetadata
+from nomad.parsing import LocalBackend
+from nomad.client.parse import parse, normalize, normalize_all
 
 from .__main__ import cli
 
@@ -126,30 +126,7 @@ class CalcProcReproduction:
         Run the given parser on the downloaded calculation. If no parser is given,
         do parser matching and use the respective parser.
         """
-        if parser_name is not None:
-            parser = parser_dict.get(parser_name)
-        else:
-            parser = match_parser(self.mainfile, self.upload_files)
-
-        assert parser is not None, 'there is not parser matching %s' % self.mainfile
-        self.logger = self.logger.bind(parser=parser.name)  # type: ignore
-        self.logger.info('identified parser')
-
-        parser_backend = parser.run(self.upload_files.raw_file_object(self.mainfile).os_path, logger=self.logger)
-
-        if not parser_backend.status[0] == 'ParseSuccess':
-            self.logger.error('parsing was not successful', status=parser_backend.status)
-
-        parser_backend.openNonOverlappingSection('section_entry_info')
-        parser_backend.addValue('upload_id', self.upload_id)
-        parser_backend.addValue('calc_id', self.calc_id)
-        parser_backend.addValue('calc_hash', "no hash")
-        parser_backend.addValue('mainfile', self.mainfile)
-        parser_backend.addValue('parser_name', parser.__class__.__name__)
-        parser_backend.closeNonOverlappingSection('section_entry_info')
-
-        self.logger.info('ran parser')
-        return parser_backend
+        return parse(self.mainfile, self.upload_files, parser_name=parser_name, logger=self.logger)
 
     def normalize(self, normalizer: Union[str, Callable], parser_backend: LocalBackend = None):
         """
@@ -158,28 +135,13 @@ class CalcProcReproduction:
         if parser_backend is None:
             parser_backend = self.parse()
 
-        if isinstance(normalizer, str):
-            normalizer = next(
-                normalizer_instance for normalizer_instance in normalizers
-                if normalizer_instance.__class__.__name__ == normalizer)
-
-        assert normalizer is not None, 'there is no normalizer %s' % str(normalizer)
-        normalizer_instance = cast(Callable, normalizer)(parser_backend)
-        logger = self.logger.bind(normalizer=normalizer_instance.__class__.__name__)
-        self.logger.info('identified normalizer')
-
-        normalizer_instance.normalize(logger=logger)
-        self.logger.info('ran normalizer')
-        return parser_backend
+        return normalize(parser_backend=parser_backend, normalizer=normalizer, logger=self.logger)
 
     def normalize_all(self, parser_backend: LocalBackend = None):
         """
         Parse the downloaded calculation and run the whole normalizer chain.
         """
-        for normalizer in normalizers:
-            parser_backend = self.normalize(normalizer, parser_backend=parser_backend)
-
-        return parser_backend
+        return normalize_all(parser_backend=parser_backend, logger=self.logger)
 
 
 @cli.command(help='Run processing locally.')
diff --git a/nomad/client/parse.py b/nomad/client/parse.py
new file mode 100644
index 0000000000..e917a10ef4
--- /dev/null
+++ b/nomad/client/parse.py
@@ -0,0 +1,105 @@
+from typing import Union, Callable, cast
+import os.path
+import ujson
+import click
+import sys
+
+from nomad import config, utils, files
+from nomad.parsing import LocalBackend, parser_dict, match_parser
+from nomad.normalizing import normalizers
+from nomad.datamodel import CalcWithMetadata
+
+from .__main__ import cli
+
+
+def parse(
+        mainfile: str, upload_files: Union[str, files.StagingUploadFiles],
+        parser_name: str = None, logger=None) -> LocalBackend:
+    """
+    Run the given parser on the given mainfile. If no parser is given,
+    do parser matching and use the respective parser.
+    """
+    if logger is None:
+        logger = utils.get_logger(__name__)
+    if parser_name is not None:
+        parser = parser_dict.get(parser_name)
+    else:
+        parser = match_parser(mainfile, upload_files)
+
+    assert parser is not None, 'there is not parser matching %s' % mainfile
+    logger = logger.bind(parser=parser.name)  # type: ignore
+    logger.info('identified parser')
+
+    if isinstance(upload_files, str):
+        mainfile_path = os.path.join(upload_files, mainfile)
+    else:
+        mainfile_path = upload_files.raw_file_object(mainfile).os_path
+
+    parser_backend = parser.run(mainfile_path, logger=logger)
+
+    if not parser_backend.status[0] == 'ParseSuccess':
+        logger.error('parsing was not successful', status=parser_backend.status)
+
+    parser_backend.openNonOverlappingSection('section_entry_info')
+    parser_backend.addValue('upload_id', config.services.unavailable_value)
+    parser_backend.addValue('calc_id', config.services.unavailable_value)
+    parser_backend.addValue('calc_hash', "no hash")
+    parser_backend.addValue('mainfile', mainfile)
+    parser_backend.addValue('parser_name', parser.__class__.__name__)
+    parser_backend.closeNonOverlappingSection('section_entry_info')
+
+    logger.info('ran parser')
+    return parser_backend
+
+
+def normalize(
+        normalizer: Union[str, Callable], parser_backend: LocalBackend = None,
+        logger=None) -> LocalBackend:
+
+    if logger is None:
+        logger = utils.get_logger(__name__)
+
+    if isinstance(normalizer, str):
+        normalizer = next(
+            normalizer_instance for normalizer_instance in normalizers
+            if normalizer_instance.__class__.__name__ == normalizer)
+
+    assert normalizer is not None, 'there is no normalizer %s' % str(normalizer)
+    normalizer_instance = cast(Callable, normalizer)(parser_backend)
+    logger = logger.bind(normalizer=normalizer_instance.__class__.__name__)
+    logger.info('identified normalizer')
+
+    normalizer_instance.normalize(logger=logger)
+    logger.info('ran normalizer')
+    return parser_backend
+
+
+def normalize_all(parser_backend: LocalBackend = None, logger=None) -> LocalBackend:
+    """
+    Run the whole normalizer chain on the given backend.
+    """
+    for normalizer in normalizers:
+        parser_backend = normalize(normalizer, parser_backend=parser_backend, logger=logger)
+
+    return parser_backend
+
+
+@cli.command(help='Run parsing and normalizing locally.', name='parse')
+@click.argument('MAINFILE', nargs=1, required=True, type=str)
+@click.option('--show-backend', is_flag=True, default=False, help='Print the backend data.')
+@click.option('--show-metadata', is_flag=True, default=False, help='Print the extracted repo metadata.')
+@click.option('--skip-normalizers', is_flag=True, default=False, help='Do not run the normalizer.')
+def _parse(mainfile, show_backend, show_metadata, skip_normalizers):
+    utils.configure_logging()
+
+    backend = parse(mainfile, '.')
+
+    if not skip_normalizers:
+        normalize_all(backend)
+
+    if show_backend:
+        backend.write_json(sys.stdout, pretty=True)
+    if show_metadata:
+        metadata = CalcWithMetadata()
+        metadata.apply_domain_metadata(backend)
+        ujson.dump(metadata.to_dict(), sys.stdout, indent=4)
diff --git a/nomad/datamodel/dft.py b/nomad/datamodel/dft.py
index 0a9c25b591..37315ceee0 100644
--- a/nomad/datamodel/dft.py
+++ b/nomad/datamodel/dft.py
@@ -89,17 +89,22 @@ class DFTCalcWithMetadata(CalcWithMetadata):
         super().__init__(**kwargs)
 
     def apply_domain_metadata(self, backend):
+        from nomad.normalizing.system import normalized_atom_labels
+
         logger = utils.get_logger(__name__).bind(
             upload_id=self.upload_id, calc_id=self.calc_id, mainfile=self.mainfile)
 
         self.code_name = backend.get_value('program_name', 0)
-        self.code_version = simplify_version(backend.get_value('program_version', 0))
+        try:
+            self.code_version = simplify_version(backend.get_value('program_version', 0))
+        except KeyError:
+            self.code_version = config.services.unavailable_value
 
         self.atoms = get_optional_backend_value(backend, 'atom_labels', 'section_system', logger=logger)
         if hasattr(self.atoms, 'tolist'):
             self.atoms = self.atoms.tolist()
         self.n_atoms = len(self.atoms)
-        self.atoms = list(set(self.atoms))
+        self.atoms = list(set(normalized_atom_labels(set(self.atoms))))
         self.atoms.sort()
 
         self.crystal_system = get_optional_backend_value(
diff --git a/nomad/normalizing/system.py b/nomad/normalizing/system.py
index ab5b0ddba3..e7895a72ff 100644
--- a/nomad/normalizing/system.py
+++ b/nomad/normalizing/system.py
@@ -16,6 +16,7 @@ from typing import Any
 import ase
 import numpy as np
 import json
+import re
 
 from matid import SymmetryAnalyzer
 from matid.geometry import get_dimensionality
@@ -24,6 +25,23 @@ from nomad import utils, config
 from nomad.normalizing.normalizer import SystemBasedNormalizer
 
 
+# use a regular expression to check atom labels; the expression is built from the
+# list of all labels sorted descending by length, so that e.g. Br is matched before B.
+atom_label_re = re.compile('|'.join(
+    sorted(ase.data.chemical_symbols, key=lambda x: len(x), reverse=True)))
+
+
+def normalized_atom_labels(atom_labels):
+    """
+    Normalizes the given atom labels: they either are labels right away, or contain
+    additional numbers (to distinguish same species but different labels, see meta-info),
+    or we replace them with ase placeholder atom for unknown elements 'X'.
+    """
+    return [
+        ase.data.chemical_symbols[0] if match is None else match.group(0)
+        for match in [re.search(atom_label_re, atom_label) for atom_label in atom_labels]]
+
+
 class SystemNormalizer(SystemBasedNormalizer):
 
     """
@@ -66,20 +84,25 @@ class SystemNormalizer(SystemBasedNormalizer):
 
         # analyze atoms labels
         atom_labels = get_value('atom_labels', nonp=True)
+        if atom_labels is not None:
+            atom_labels = normalized_atom_labels(atom_labels)
+
         atom_species = get_value('atom_species', nonp=True)
         if atom_labels is None and atom_species is None:
             self.logger.error('calculation has neither atom species nor labels')
             return
+
         # If there are no atom labels we create them from atom species data.
         if atom_labels is None:
-            atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species)
-        # At this point we should have atom labels. Check that each atom label in the atom
-        # labels list is a true atom label by checking if it is in the ASE list of atom labels.
-        if not all(label in ase.data.chemical_symbols for label in atom_labels):
-            # Throw an error that the atom labels are poorly formated or there are unknown
-            # labels. Save first ten elemenets in logged error.
-            self.logger.error('Atom labels cannot be recognized.', atom_labels=atom_labels[:10])
-            return
+            try:
+                atom_labels = list(ase.data.chemical_symbols[species] for species in atom_species)
+            except IndexError:
+                self.logger.error('calculation has atom species that are out of range')
+                return
+
+            self._backend.addArrayValues('atom_labels', atom_labels)
+
+        # At this point we should have atom labels.
         try:
             atoms = ase.Atoms(symbols=atom_labels)
             chemical_symbols = list(atoms.get_chemical_symbols())
@@ -91,8 +114,6 @@ class SystemNormalizer(SystemBasedNormalizer):
                 'cannot build ase atoms from atom labels',
                 atom_labels=atom_labels[:10], exc_info=e, error=str(e))
             raise e
-        # Write labels. Rewrite if labels exist in backend already from parser.
-        self._backend.addArrayValues('atom_labels', atom_labels)
 
         if atom_species is None:
             atom_species = atoms.get_atomic_numbers().tolist()
diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py
index abca23f6aa..7b505df631 100644
--- a/nomad/parsing/__init__.py
+++ b/nomad/parsing/__init__.py
@@ -58,10 +58,11 @@ based on NOMAD-coe's *python-common* module.
     :members:
 
 """
-from typing import Callable, IO
+from typing import Callable, IO, Union
 import magic
 import gzip
 import bz2
+import os.path
 
 from nomad import files, config
 
@@ -76,7 +77,7 @@ _compressions = {
 }
 
 
-def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser':
+def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles]) -> 'Parser':
     """
     Performs parser matching. This means it take the given mainfile and potentially
     opens it with the given callback and tries to identify a parser that can parse
@@ -87,15 +88,21 @@ def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Pars
 
     Arguments:
         mainfile: The upload relative path to the mainfile
-        open: A function that allows to open a stream to the file
+        upload_files: Either a :class:`files.StagingUploadFiles` object or a directory name.
+            Directory name + mainfile needs to point to the file.
 
     Returns: The parser, or None if no parser could be matched.
     """
-    with upload_files.raw_file(mainfile, 'rb') as f:
+    if isinstance(upload_files, str):
+        mainfile_path = os.path.join(upload_files, mainfile)
+    else:
+        mainfile_path = upload_files.raw_file_object(mainfile).os_path
+
+    with open(mainfile_path, 'rb') as f:
         compression, open_compressed = _compressions.get(f.read(3), (None, open))
-    mainfile_path = upload_files.raw_file_object(mainfile).os_path
-    with open_compressed(mainfile_path, 'rb') as f:
-        buffer = f.read(2048)
+
+    with open_compressed(mainfile_path, 'rb') as cf:
+        buffer = cf.read(2048)
 
     mime_type = magic.from_buffer(buffer, mime=True)
     for parser in parsers:
diff --git a/tests/test_normalizing.py b/tests/test_normalizing.py
index 78166f43a8..3c6fd708c4 100644
--- a/tests/test_normalizing.py
+++ b/tests/test_normalizing.py
@@ -131,17 +131,22 @@ def test_normalizer_faulty_matid(caplog):
 
 
 def test_normalizer_single_string_atom_labels(caplog):
-    """ Runs normalizer on ['Br1SiSiK'] expects error that it is formatted wrong."""
+    """
+    Runs normalizer on ['Br1SiSiK'], expects an error. Should replace the label with 'X' and
+    the number of positions should not match the labels.
+    """
     backend = parse_file(single_string_atom_labels)
     run_normalize(backend)
-    assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.')
+    assert_log(caplog, 'ERROR', 'len of atom position does not match number of atoms')
 
 
-def test_normalizer_unknown_atom_label(caplog):
-    """ Runs normalizer on ['Br','Si','Si','Za'], expects Za throws an error"""
+def test_normalizer_unknown_atom_label(caplog, no_warn):
+    """ Runs normalizer on ['Br','Si','Si','Za']; for normalization Za will be replaced,
+        but stays in the labels.
+    """
     backend = parse_file(unknown_atom_label)
     run_normalize(backend)
-    assert_log(caplog, 'ERROR', 'Atom labels cannot be recognized.')
+    assert backend.get_value('atom_labels')[3] == 'Za'
 
 
 def test_symmetry_classification_fcc():
-- 
GitLab