Commit ca7161a3 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'parser-integration' into 'master'

Parser integration

See merge request !25
parents 8148d0f4 708f5972
Pipeline #43088 passed with stages
in 29 minutes and 11 seconds
......@@ -15,3 +15,4 @@ test_*/
local/
target/
*.swp
*.vscode
.ropeproject/
*.sql
*.sql
\ No newline at end of file
......@@ -44,7 +44,27 @@
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/test_api.py::TestAuth::test_xtoken_auth_denied"
"-sv", "tests/test_api.py::TestUploads::test_put[None-multipart-tests/data/proc/examples_template.zip]"
]
},
{
"name": "Python: crystal normalizer test",
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/test_normalizing.py::test_normalizer[parsers/crystal-tests/data/parsers/crystal/si.out]"
]
},
{
"name": "Python: nwchem normalizer test h2o sp test",
"type": "python",
"request": "launch",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/test_normalizing.py::test_normalizer[parsers/nwchem-tests/data/parsers/nwchem/sp_output.out]"
]
},
{
......
......@@ -57,6 +57,9 @@ RUN \
# Second, create a slim final image
FROM final
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1
# copy the sources for tests, coverage, qa, etc.
COPY . /app
WORKDIR /app
......
......@@ -20,6 +20,11 @@ This module is used to store all configuration values. It makes use of
import os
import logging
from collections import namedtuple
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
FilesConfig = namedtuple(
'FilesConfig', ['uploads_bucket', 'raw_bucket', 'archive_bucket', 'staging_bucket', 'public_bucket'])
......
......@@ -16,8 +16,8 @@
This module allows configuring and installing all necessary legacy nomad GIT repositories
to process (parser, normalizer, etc.) uploaded calculations.
Parsers are developed as independed, individual python programs in their own GIT repositories.
They are build on a common modules called *python-common*, also in a separate GIT.
Parsers are developed as independent, individual python programs in their own GIT repositories.
They are built on a common module called *python-common*, also in a separate GIT.
All parsers depend on the *meta-info*, which is also maintained in its own GIT.
Preparing dependencies
......@@ -190,19 +190,28 @@ dependencies = [
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-fhi-aims.git',
git_branch='nomad-fair'),
PythonGit(
name='normalizers/stats',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/normalizer-stats.git',
name='parsers/cp2k',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-cp2k',
git_branch='nomad-fair'),
PythonGit(
name='normalizers/symmetry',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/normalizer-symmetry',
name='parsers/crystal',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-crystal',
git_branch='nomad-fair'),
PythonGit(
name='normalizers/system-type',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/normalizer-system-type',
name='parsers/cpmd',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-cpmd',
git_branch='nomad-fair'),
PythonGit(
name='parsers/nwchem',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-nwchem',
git_branch='nomad-fair'),
PythonGit(
name='parsers/bigdft',
git_url='https://gitlab.mpcdf.mpg.de/nomad-lab/parser-big-dft',
git_branch='nomad-fair')
]
dependencies_dict = {dependency.name: dependency for dependency in dependencies}
......
......@@ -15,8 +15,6 @@ from typing import List, Any
from .normalizer import Normalizer
from .system import SystemNormalizer
from .symmetry import SymmetryNormalizer
from .systemtype import SystemTypeNormalizer
from .fhiaims import FhiAimsBaseNormalizer
from .repository import RepositoryNormalizer
......@@ -46,7 +44,5 @@ There is one ABC for all normalizer:
normalizers: List[Any] = [
SystemNormalizer,
FhiAimsBaseNormalizer,
SymmetryNormalizer,
SystemTypeNormalizer,
RepositoryNormalizer
]
......@@ -18,6 +18,8 @@ from nomad.parsing import BadContextURI
from .normalizer import Normalizer
unavailable_label = 'unavailable'
class RepositoryNormalizer(Normalizer):
"""
......@@ -31,13 +33,16 @@ class RepositoryNormalizer(Normalizer):
'hyb': 'hybrid',
'mgga': 'meta-GGA',
'vdw': 'vdW',
'lda': 'LDA'
'lda': 'LDA',
}
""" https://gitlab.mpcdf.mpg.de/nomad-lab/nomad-meta-info/wikis/metainfo/XC-functional """
version_re = re.compile(r'(\d+(\.\d+(\.\d+)?)?)')
def map_functional_name_to_xc_treatment(self, name):
    """Translate an XC functional name into its repository xc treatment.

    The ``unavailable_label`` placeholder is passed through unchanged. Any
    other name is looked up in ``RepositoryNormalizer.xc_treatments`` by its
    lower-cased first three characters; names without an entry are returned
    as they are.
    """
    if name != unavailable_label:
        prefix = name[:3].lower()
        return RepositoryNormalizer.xc_treatments.get(prefix, name)

    return name
def simplify_version(self, version):
......@@ -47,6 +52,12 @@ class RepositoryNormalizer(Normalizer):
else:
return match.group(0)
def get_optional_value(self, key):
    """Read ``key`` from the backend, tolerating a missing value.

    Returns the result of ``self._backend.get_value(key, 0)``, or
    ``unavailable_label`` when that lookup raises ``KeyError``.
    """
    try:
        value = self._backend.get_value(key, 0)
    except KeyError:
        return unavailable_label
    else:
        return value
def normalize(self, logger=None) -> None:
super().normalize(logger)
b = self._backend
......@@ -61,22 +72,25 @@ class RepositoryNormalizer(Normalizer):
b.openNonOverlappingSection('section_repository_parserdata')
b.addValue('repository_checksum', b.get_value('calc_hash', 0))
b.addValue('repository_chemical_formula', b.get_value('chemical_composition_bulk_reduced', 0))
b.addValue('repository_parser_id', b.get_value('parser_name', 0))
atom_labels = b.get_value('atom_labels', 0)
b.addValue('repository_atomic_elements', list(set(atom_labels)))
b.addValue('repository_atomic_elements_count', len(atom_labels))
b.addValue('repository_basis_set_type', b.get_value('program_basis_set_type', 0))
b.addValue('repository_crystal_system', b.get_value('crystal_system', 0))
b.addValue('repository_program_name', b.get_value('program_name', 0))
b.addValue(
'repository_code_version',
self.simplify_version(b.get_value('program_version', 0)))
b.addValue('repository_spacegroup_nr', b.get_value('space_group_number', 0))
b.addValue('repository_parser_id', b.get_value('parser_name', 0))
b.addValue('repository_chemical_formula', b.get_value('chemical_composition_bulk_reduced', 0))
atom_labels = b.get_value('atom_labels', 0)
b.addValue('repository_atomic_elements', list(set(atom_labels)))
b.addValue('repository_atomic_elements_count', len(atom_labels))
b.addValue('repository_system_type', b.get_value('system_type', 0))
b.addValue('repository_crystal_system', self.get_optional_value('crystal_system'))
b.addValue('repository_spacegroup_nr', self.get_optional_value('space_group_number'))
b.addValue('repository_basis_set_type', self.get_optional_value('program_basis_set_type'))
b.addValue(
'repository_xc_treatment',
self.map_functional_name_to_xc_treatment(b.get_value('XC_functional_name', 0)))
self.map_functional_name_to_xc_treatment(self.get_optional_value(('XC_functional_name'))))
b.closeNonOverlappingSection('section_repository_parserdata')
if repository_info_context is None:
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an"AS IS" BASIS,
# distributed under the License is distributed on an'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ase
import numpy
import spglib
import numpy as np
import sys
import matid
from nomadcore.json_support import addShasOfJson
from statsnormalizer import stats
from statsnormalizer import classify_structure
from matid import SymmetryAnalyzer, Classifier
from nomadcore.json_support import addShasOfJson
from nomad.normalizing.normalizer import SystemBasedNormalizer
# TODO: check what is wrong, the commented meta names seem not to exist
# in the current meta info
class SystemNormalizer(SystemBasedNormalizer):
"""
This normalizer performs all system (atoms, cells, etc.) related normalizations
of the legacy NOMAD-coe *stats* normalizer.
"""
def __init__(self, backend):
super().__init__(backend, all_sections=True)
@staticmethod
def atom_label_to_num(atom_label):
......@@ -44,114 +43,268 @@ class SystemNormalizer(SystemBasedNormalizer):
return 0
def normalize_system(self, section_system) -> None:
stats.logging = self.logger
classify_structure.logger = self.logger
""" Main normalizer that runs system, syste_type and symmetry analysis."""
results = dict()
self.atom_labels = section_system['atom_labels']
self.atom_species = section_system['atom_atom_numbers']
self.atom_positions = section_system['atom_positions']
self.periodic_dirs = section_system['configuration_periodic_dimensions']
# Try to first read the cell information from the renamed metainfo
# lattice_vectors, if this doesn't work try the deprecated name
# simulation_cell. Otherwise, if neither are present, assign None.
self.cell = section_system.get(
'lattice_vectors', section_system.get('simulation_cell', None)
)
# Run a system analysis on the system.
self.system_analysis()
if self.cell is None:
# Then the parser hasn't parsed any information about periodicity.
# We therefore assume we're simulating a single cell without
# periodicity and don't try to ascertain symmetry information.
try:
self.atoms = ase.Atoms(
positions=1e10 * np.asarray(self.atom_positions),
symbols=np.asarray(self.atom_labels)
)
except Exception:
self.logger.error(
'The ASE library is unable to build an object from the parsed vars.'
)
# Classify the material's system type.
self.system_type_classification()
if self.nomad_system_type not in ['Atom', 'Molecule / Cluster']:
self.logger.error(
'Matid classified more than 1D despite having no simulation_cell')
atom_labels = section_system['atom_labels']
atom_species = section_system['atom_atom_numbers']
if atom_labels is not None and atom_species is None:
# Return w/out symmetry analysis since we don't have a sim_cell.
return None
self.pbc = section_system.get('configuration_periodic_dimensions', None)
# If no pbc is found assume there is no periodicity.
if self.pbc is None:
self.pbc = np.array([False, False, False])
# The pbc should be defined as a single-dimensional list.
if len(np.asarray(self.pbc).shape) == 2:
self.pbc = self.pbc[0, :]
# Build an ASE atoms object to feed into Matid.
try:
self.atoms = ase.Atoms(
positions=1e10 * np.asarray(self.atom_positions),
symbols=np.asarray(self.atom_labels),
cell=1e10 * np.asarray(self.cell),
pbc=self.pbc
)
except Exception:
self.logger.error(
'The ASE library is unable to build an object from the member'
'variables: atom_positions, atom_labels, simulation_cell and pbc.'
)
# Classify the material's system type.
self.system_type_classification()
# Analyze the symmetry of the material.
self.symmetry_analysis()
def system_analysis(self) -> None:
"""Analyze system properties of a simulation from parsed values."""
results = dict()
if self.atom_labels is not None and self.atom_species is None:
atom_label_to_num = SystemNormalizer.atom_label_to_num
atom_species = [atom_label_to_num(atom_label) for atom_label in atom_labels]
self.atom_species = [
atom_label_to_num(atom_label) for atom_label in self.atom_labels
]
periodic_dirs = section_system['configuration_periodic_dimensions']
formula = None
if atom_species:
results['atom_species'] = atom_species
atom_symbols = [ase.data.chemical_symbols[atom_number] for atom_number in atom_species]
if self.atom_species:
results['atom_species'] = self.atom_species
atom_symbols = [
ase.data.chemical_symbols[atom_number] for atom_number in self.atom_species
]
formula = ase.Atoms(atom_symbols).get_chemical_formula(mode='all')
formula_reduced = ase.Atoms(atom_symbols).get_chemical_formula(mode='reduce')
if periodic_dirs is not None and any(periodic_dirs):
if self.periodic_dirs is not None and any(self.periodic_dirs):
formula_bulk = formula_reduced
else:
formula_bulk = formula
cell = section_system.get('simulation_cell', None)
if cell is not None:
results['lattice_vectors'] = cell
if self.cell is not None:
results['lattice_vectors'] = self.cell
positions = section_system['atom_positions']
if positions is not None:
results['atom_positions'] = positions
if self.atom_positions is not None:
results['atom_positions'] = self.atom_positions
if not formula:
formula = 'X%d' % len(positions) if len(positions) != 1 else 'X'
if periodic_dirs is not None:
results['configuration_periodic_dimensions'] = periodic_dirs.tolist()
formula = (
'X%d' % len(self.atom_positions) if len(self.atom_positions) != 1 else 'X'
)
symm = None
if self.periodic_dirs is not None:
results['configuration_periodic_dimensions'] = self.periodic_dirs.tolist()
# TODO: @dts, might be good to clean this up so it is more readable in the
# future.
configuration_id = 's' + addShasOfJson(results).b64digests()[0][0:28]
if cell is not None and atom_labels is not None:
if cell is not None:
results['simulation_cell'] = cell
if atom_labels is not None:
results['atom_labels'] = atom_labels
results['gIndex'] = section_system['gIndex']
results['name'] = 'section_system'
structure = classify_structure.ClassifyStructure(None, jsonValue={
"sections": [{
"name": "section_run",
"gIndex": 1,
"sections": [results]
}]
})
classification = structure.classify()
if classification.get('classificationStatus', None) == 'ClassificationSuccess':
classType = classification['sections'][0]['sections'][0]['structure_kind']
else:
classType = 'NoClassification'
if classType == 'Bulk' and positions is not None and atom_species is not None and cell is not None:
acell = numpy.asarray(cell) * 1.0e10
cellInv = numpy.linalg.inv(cell)
symm = spglib.get_symmetry_dataset(
(acell, numpy.dot(positions, cellInv), atom_species),
0.002, -1) # use m instead of Angstrom?
if symm:
symm['configuration_raw_gid'] = configuration_id
self._backend.addValue("configuration_raw_gid", configuration_id)
self._backend.addValue("atom_species", atom_species)
self._backend.addValue("chemical_composition", formula)
self._backend.addValue("chemical_composition_reduced", formula_reduced)
self._backend.addValue("chemical_composition_bulk_reduced", formula_bulk)
if symm is not None:
# for quantity in ["number", "international", "hall", "choice", "pointgroup"]:
# v = symm.get(quantity)
# if v is not None:
# self._backend.addValue("spacegroup_3D_" + quantity, v)
# for quantity in ["transformation_matrix"]:
# v = symm.get(quantity)
# if v is not None:
# self._backend.addArrayValues(
# "spacegroup_3D_" + quantity, numpy.asarray(v))
n = symm.get("number")
if n:
self._backend.openNonOverlappingSection('section_symmetry')
self._backend.addValue("bravais_lattice", stats.crystalSystem(n))
self._backend.closeNonOverlappingSection('section_symmetry')
# for quantity in ["origin_shift", "std_lattice"]:
# v = symm.get(quantity)
# if v is not None:
# backend.addArrayValues(
# "spacegroup_3D_" + quantity, 1.0e-10 * numpy.asarray(v, dtype=float))
# for (r, t) in zip(symm.get("rotations", []), symm.get("translations", [])):
# self._backend.openNonOverlappingSection("section_spacegroup_3D_operation")
# self._backend.addArrayValues("spacegroup_3D_rotation", numpy.asarray(r))
# self._backend.addArrayValues(
# "spacegroup_3D_translation", 1.0e-10 * numpy.asarray(t, dtype=float))
# self._backend.closeNonOverlappingSection("section_spacegroup_3D_operation")
# v = symm.get("wyckoffs")
# if v is not None:
# for w in v:
# self._backend.addValue("spacegroup_3D_wyckoff", w)
self._backend.addValue('configuration_raw_gid', configuration_id)
self._backend.addValue('atom_species', self.atom_species)
self._backend.addValue('chemical_composition', formula)
self._backend.addValue('chemical_composition_reduced', formula_reduced)
self._backend.addValue('chemical_composition_bulk_reduced', formula_bulk)
def symmetry_analysis(self) -> None:
    """Analyze the symmetry of ``self.atoms`` with Matid and store the results.

    All quantities are read from Matid's ``SymmetryAnalyzer`` before anything
    is written. If the analyzer raises, the error is logged and the method
    returns without opening ``section_symmetry`` on the backend.

    Returns:
        None: The symmetry quantities are written to ``self._backend`` in a
            ``section_symmetry`` that contains a ``section_std_system``, a
            ``section_primitive_system`` and a ``section_original_system``.
    """
    # TODO: dts, find out what the symmetry_tol does.
    try:
        symm = SymmetryAnalyzer(self.atoms, symmetry_tol=0.1)

        space_group_number = symm.get_space_group_number()
        hall_number = symm.get_hall_number()
        hall_symbol = symm.get_hall_symbol()

        crystal_system = symm.get_crystal_system()
        bravais_lattice = symm.get_bravais_lattice()
        point_group = symm.get_point_group()

        orig_wyckoff = symm.get_wyckoff_letters_original()
        prim_wyckoff = symm.get_wyckoff_letters_primitive()
        conv_wyckoff = symm.get_wyckoff_letters_conventional()

        orig_equivalent_atoms = symm.get_equivalent_atoms_original()
        prim_equivalent_atoms = symm.get_equivalent_atoms_primitive()
        conv_equivalent_atoms = symm.get_equivalent_atoms_conventional()
        international_short = symm.get_space_group_international_short()

        conv_sys = symm.get_conventional_system()
        conv_pos = conv_sys.get_scaled_positions()
        conv_cell = conv_sys.get_cell()
        conv_num = conv_sys.get_atomic_numbers()

        prim_sys = symm.get_primitive_system()
        prim_pos = prim_sys.get_scaled_positions()
        prim_cell = prim_sys.get_cell()
        prim_num = prim_sys.get_atomic_numbers()

        # NOTE(review): these two are underscore-prefixed (private) Matid
        # methods and may change between Matid releases.
        transform = symm._get_spglib_transformation_matrix()
        origin_shift = symm._get_spglib_origin_shift()
    except Exception:
        self.logger.error(
            'The matid project symmetry analyzer fails on the ASE'
            ' object from this section.'
        )
        return None  # Without trying to write any symmetry data.

    # Write data extracted from Matid symmetry analysis to the backend.
    backend = self._backend
    sym_gid = backend.openSection('section_symmetry')
    # TODO: @dts, should we change the symmetry_method to MATID?
    backend.addValue('symmetry_method', 'Matid (spg)')
    backend.addValue('space_group_number', space_group_number)
    backend.addValue('hall_number', hall_number)
    backend.addValue('hall_symbol', hall_symbol)
    backend.addValue('international_short_symbol', international_short)
    backend.addValue('point_group', point_group)
    backend.addValue('crystal_system', crystal_system)
    backend.addValue('bravais_lattice', bravais_lattice)
    backend.addArrayValues('origin_shift', origin_shift)
    backend.addArrayValues('transformation_matrix', transform)

    std_gid = backend.openSection('section_std_system')
    backend.addArrayValues('lattice_vectors_std', conv_cell)
    backend.addArrayValues('atom_positions_std', conv_pos)
    backend.addArrayValues('atomic_numbers_std', conv_num)
    backend.addArrayValues('wyckoff_letters_std', conv_wyckoff)
    backend.addArrayValues('equivalent_atoms_std', conv_equivalent_atoms)
    backend.closeSection('section_std_system', std_gid)

    prim_gid = backend.openSection('section_primitive_system')
    backend.addArrayValues('lattice_vectors_primitive', prim_cell)
    backend.addArrayValues('atom_positions_primitive', prim_pos)
    backend.addArrayValues('atomic_numbers_primitive', prim_num)
    backend.addArrayValues('wyckoff_letters_primitive', prim_wyckoff)
    backend.addArrayValues('equivalent_atoms_primitive', prim_equivalent_atoms)
    backend.closeSection('section_primitive_system', prim_gid)

    orig_gid = backend.openSection('section_original_system')
    backend.addArrayValues('wyckoff_letters_original', orig_wyckoff)
    backend.addArrayValues('equivalent_atoms_original', orig_equivalent_atoms)
    backend.closeSection('section_original_system', orig_gid)

    backend.closeSection('section_symmetry', sym_gid)
    # nomad-xt: context already closed in nomad-xt.
    # backend.closeContext(context)
    sys.stdout.flush()
def system_type_classification(self) -> None:
    """Classify ``self.atoms`` with Matid and record the Nomad system type.

    On success the mapped classification is stored in
    ``self.nomad_system_type`` and written to the backend as ``system_type``.
    If Matid raises, the error is logged, ``self.nomad_system_type`` stays
    ``None`` and nothing is written to the backend.
    """
    # Reset first: callers read ``self.nomad_system_type`` right after this
    # method, so a failed classification must not leave the attribute
    # undefined or holding the result of an earlier call.
    self.nomad_system_type = None

    try:
        # Define the classifier as Matid's Classifier that we've imported.
        classifier = Classifier()
        # Perform classification on the atoms ASE object.
        matid_system_type = classifier.classify(self.atoms)
    except Exception:
        self.logger.error(
            'The matid project clsasification fails on the ASE'
            ' object from this section.'
        )
        return None  # Without saving any system type value.

    # Convert Matid classification to a Nomad classification.
    self.nomad_system_type = self.map_matid_to_nomad_system_types(matid_system_type)
    self._backend.addValue('system_type', self.nomad_system_type)
# Class-level lookup table from Matid classification classes to the system
# type strings used by Nomad. map_matid_to_nomad_system_types iterates over
# it and takes the first entry whose class matches via isinstance.
# NOTE(review): no entry produces 'Molecule / Cluster', although
# normalize_system tests for that value -- confirm the intended mapping.
translation_dict = {
matid.classifications.Class0D: 'Atom',
matid.classifications.Class1D: '1D',
matid.classifications.Material2D: '2D',
matid.classifications.Surface: 'Surface',
matid.classifications.Class2DWithCell: '2D',
matid.classifications.Class2D: '2D',
matid.classifications.Class3D: 'Bulk',
matid.classifications.Unknown: 'Unknown'
}
def map_matid_to_nomad_system_types(self, system_type):
""" We map the system type classification from matid to Nomad values.
Args:
system_type: Object of a matid class representing a
material classification.
Returns:
nomad_classification: String representing a material
classification that fits into Nomad's current way
of naming material classes.
"""
nomad_classification = None
for matid_class in SystemNormalizer.translation_dict:
if isinstance(system_type, matid_class):
nomad_classification = SystemNormalizer.translation_dict[matid_class]
break