Commit ce2b4e01 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'encyclopedia' into 'v0.7.10'

First version of EncyclopediaNormalizer

See merge request !102
parents 74195402 d2c1d7a9
Pipeline #71458 passed with stages
in 16 minutes and 18 seconds
......@@ -31,7 +31,7 @@ build:
stage: build
before_script:
- git submodule sync
- git submodule update --init
- git submodule update --init --jobs=4
- ./gitinfo.sh
script:
# ignore test directories of dependencies, there is a lot of data that we do not use
......
Subproject commit 36378d0cf67b46d28c1c5476c387978143d21367
Subproject commit 0fecfb3f466690e1aa45ec6f24e9a51661e3f65c
Subproject commit f9979f05e6a26e0512c26d5a316f4b0749528640
Subproject commit c776b194dd825ad7bbe81c5c2afb1745122a1dde
Subproject commit 95bb8e2a8288aac6fb8d0bfbb3e7d28db43e880b
Subproject commit d73611bc1b16ea71daa3d0fd24ee275c78853557
Subproject commit a3d0d0e99d6c17e95ac07f29e14e64e85446e720
Subproject commit 2e8b48c457eeb0a44d6b006b701203e197df2861
Subproject commit ae1f7175bea210eff43d52a30a454f4f9acce30b
Subproject commit ad121c9a1a88ac667cd9c13a27cbb941774cd871
Subproject commit fc60b6cfb902e1bf493a57d398e5b364f2153a9d
Subproject commit b3159851a99a4ff6a5ee13dbe204cc0a72aff81e
......@@ -137,7 +137,7 @@ We use the branch ``nomad-fair`` on all dependencies for nomad-FAIRDI specific c
Parsers
^^^^^^^
There are several steps to take, to wrap a NOMOAD-coe parser into a nomad@FAIRDI parser:
There are several steps to take, to wrap a NOMAD-coe parser into a nomad@FAIRDI parser:
- Implement ``nomadcore.baseclasses.ParserInterface`` or a class with a similar constructor
and `parse` method interface.
......
......@@ -85,13 +85,6 @@ conda -c conda-forge install --name nomad_env libmagic
The next steps can be done using the `setup.sh` script. If you prefer to understand all
the steps and run them manually, read on:
### Get all the submodules
We use git submodules to retrieve all the other NOMAD repositories, mainly parsers.
```
git submodules update --init
```
### Install python dependencies
We use *pip* to manage required python packages.
```
......@@ -104,7 +97,7 @@ This includes parsers, python-common and the meta-info. These modules are mainta
their own GITLab/git repositories. To clone and initialize them run:
```
git submodules update --init
git submodule update --init
```
All requirements for these submodules need to be installed and they need to be installed
......@@ -364,4 +357,4 @@ Here are some example launch configs for VSCode:
}
]
}
```
\ No newline at end of file
```
......@@ -11,24 +11,211 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import fractions
from typing import Dict
import itertools
from math import gcd as gcd
from functools import reduce
from typing import List, Dict, Tuple
import numpy as np
from scipy.spatial import Voronoi # pylint: disable=no-name-in-module
from matid.symmetry import WyckoffSet
from nomad.normalizing.data.aflow_prototypes import aflow_prototypes
from nomad import config
from nomad.constants import NUMBER_TO_MASS_MAP_KG
def get_summed_atomic_mass(atomic_numbers: np.ndarray) -> float:
    """Return the total mass of the given atoms in kilograms.

    Args:
        atomic_numbers: Array of valid atomic numbers.

    Returns:
        The summed atomic mass in kilograms.
    """
    # The atomic numbers are assumed to be valid indices into the mass map
    # at this point, so no bounds checking is performed.
    return np.sum(NUMBER_TO_MASS_MAP_KG[atomic_numbers])
def get_symmetry_string(space_group: int, wyckoff_sets: "List[WyckoffSet]") -> str:
    """Serialize symmetry information into a single string.

    The Wyckoff positions are assumed to be normalized and ordered, as is
    the case when they are produced by the matid library.

    Args:
        space_group: 3D space group number
        wyckoff_sets: Wyckoff sets that map a Wyckoff letter to related
            information

    Returns:
        A string that encodes the symmetry properties of an atomistic
        structure.
    """
    # One "element letter count" entry per Wyckoff set, sorted to make the
    # resulting string independent of the input ordering.
    entries = sorted(
        "{} {} {}".format(ws.element, ws.wyckoff_letter, len(ws.indices))
        for ws in wyckoff_sets
    )
    return "{} {}".format(space_group, ", ".join(entries))
def get_lattice_parameters(normalized_cell: np.ndarray) -> np.ndarray:
    """Calculate the lattice parameters for the normalized cell.

    Args:
        normalized_cell: The normalized cell as a 3x3 array. Each row is a
            basis vector.

    Returns:
        Six parameters a, b, c, alpha, beta, gamma (in this order) as a numpy
        array, or None if no cell is given. Here is an explanation of each
        parameter:

        a = length of first basis vector
        b = length of second basis vector
        c = length of third basis vector
        alpha = angle between b and c
        beta = angle between a and c
        gamma = angle between a and b
    """
    if normalized_cell is None:
        return None

    # Basis vector lengths
    lengths = np.linalg.norm(normalized_cell, axis=1)

    # Cosine of the angle at index i is formed by the other two basis
    # vectors (j, k).
    angles = np.zeros(3)
    for i in range(3):
        j = (i + 1) % 3
        k = (i + 2) % 3
        angles[i] = np.dot(
            normalized_cell[j],
            normalized_cell[k]) / (lengths[j] * lengths[k])

    # Guard against floating point rounding pushing the cosine slightly
    # outside the valid arccos domain [-1, 1].
    angles = np.clip(angles, -1.0, 1.0)
    angles = np.arccos(angles)

    # Fix: return a numpy array as promised by both the annotation and the
    # docstring (a plain Python list was returned before). Unpacking and
    # indexing by callers keep working.
    return np.concatenate((lengths, angles))
def get_hill_decomposition(atom_labels: np.ndarray, reduced: bool = False) -> Tuple[List[str], List[int]]:
"""Given a list of atomic labels, returns the chemical formula using the
Hill system (https://en.wikipedia.org/wiki/Hill_system) with an exception
for binary ionic compounds where the cation is always given first.
Args:
atom_labels: Atom labels.
reduced: Whether to divide the number of atoms by the greatest common
divisor
Returns:
An ordered list of chemical symbols and the corresponding counts.
"""
# Count the occurrences of each element
names = []
counts = []
unordered_names, unordered_counts = np.unique(atom_labels, return_counts=True)
element_count_map = dict(zip(unordered_names, unordered_counts))
# Apply basic Hill system:
# 1. Is Carbon part of the system?
if "C" in element_count_map:
names.append("C")
counts.append(element_count_map["C"])
del element_count_map['C']
# 1a. add hydrogen (listed directly after carbon only when carbon exists)
if "H" in element_count_map:
names.append("H")
counts.append(element_count_map["H"])
del element_count_map["H"]
# 2. all remaining elements in alphabetic order
for element in sorted(element_count_map):
names.append(element)
counts.append(element_count_map[element])
# 3. Binary ionic compounds: cation first, anion second
# If any of the most electronegative elements is first
# by alphabetic order, we move it to second
if len(counts) == 2 and names != ["C", "H"]:
# Rank roughly by electronegativity: a lower number means more
# electronegative, i.e. more likely to be the anion.
order = {
"F": 1,
"O": 2,
"N": 3,
"Cl": 4,
"Br": 5,
"C": 6,
"Se": 7,
"S": 8,
"I": 9,
"As": 10,
"H": 11,
"P": 12,
"Ge": 13,
"Te": 14,
"B": 15,
"Sb": 16,
"Po": 17,
"Si": 18,
"Bi": 19
}
if (names[0] in order):
if (names[1] in order):
if(order[names[0]] < order[names[1]]):
# For non-metals:
# Swap symbols and counts if first element
# is more electronegative than the second one,
# because the more electronegative element is the anion
names[0], names[1] = names[1], names[0]
counts[0], counts[1] = counts[1], counts[0]
else:
# Swap symbols and counts always if second element
# is any other element, i.e.,
# put non-metal last because it is the anion
names[0], names[1] = names[1], names[0]
counts[0], counts[1] = counts[1], counts[0]
# NOTE(review): the following AFLOW tolerance check looks misplaced inside
# this formula function -- it reads like module-level import-time code that
# was interleaved here by the diff rendering. Confirm its intended location
# before relying on this function.
# The AFLOW symmetry information is checked once on import
old_symmetry_tolerance = aflow_prototypes["matid_symmetry_tolerance"]
symmetry_tolerance = config.normalize.symmetry_tolerance
if old_symmetry_tolerance != symmetry_tolerance:
raise AssertionError(
"The AFLOW prototype information is outdated due to changed "
"tolerance for symmetry detection. Please update the AFLOW "
"prototype information by running once the function "
"'update_aflow_prototype_information'."
)
# TODO: implement all further exceptions regarding ordering
# in chemical formulas:
# - ionic compounds (ordering wrt to ionization)
# - oxides, acids, hydroxides...
# Reduce the counts by their greatest common divisor if requested.
if reduced:
greatest_common_divisor = reduce(gcd, counts)
counts = np.array(counts) / greatest_common_divisor
return names, counts
def get_formula_string(symbols: List[str], counts: List[int]) -> str:
    """Form a single formula string from a list of chemical species and
    their counts.

    Args:
        symbols: List of chemical species
        counts: List of chemical species occurrences

    Returns:
        The formula as a string, e.g. "H2O". Counts of one are omitted, as
        is conventional in chemical formulas.
    """
    # Build the parts in a list and join once instead of repeated string
    # concatenation; also fixes the docstring typos of the original.
    parts = []
    for symbol, count in zip(symbols, counts):
        if count > 1:
            parts.append("%s%d" % (symbol, count))
        else:
            parts.append(symbol)
    return "".join(parts)
def get_normalized_wyckoff(atomic_numbers: np.array, wyckoff_letters: np.array) -> Dict[str, Dict[str, int]]:
......@@ -130,3 +317,65 @@ def search_aflow_prototype(space_group: int, norm_wyckoff: dict) -> dict:
structure_type_info = type_description
break
return structure_type_info
def get_brillouin_zone(reciprocal_lattice: np.array) -> dict:
    """Calculates the Brillouin zone information from the given reciprocal
    lattice.

    This function uses the crystallographic definition, so there is no factor
    of 2*Pi.

    Args:
        reciprocal_lattice: The reciprocal cell as a matrix where rows are the
            cell basis vectors. (Fixed: the old docstring documented a
            non-existent "primitive_lattice" argument.)

    Returns:
        A dictionary containing:
        "vertices": The vertices of the first Brillouin zone
        "faces": The indices of the vertices that make up the faces on the
            first Brillouin zone. The order of these indices matter, because
            only when combined sequentially they form the correct face.
    """
    # Create the near lattice points that surround the origin. The points
    # are generated in lexicographic (i, j, k) order, which places the
    # origin point (0, 0, 0) at index 13 of the 27 points.
    b1 = reciprocal_lattice[0, :]
    b2 = reciprocal_lattice[1, :]
    b3 = reciprocal_lattice[2, :]
    list_k_points = []
    for i, j, k in itertools.product([-1, 0, 1], repeat=3):
        list_k_points.append(i * b1 + j * b2 + k * b3)

    # Create the first Brillouin zone by calculating a Voronoi cell starting
    # from the reciprocal cell origin.
    voronoi = Voronoi(list_k_points)
    origin_index = 13

    # Get the vertices. The regions attribute will contain a list of
    # different regions that were found during the Voronoi creation. We want
    # the Voronoi region for the point at the origin. (Fixed: use
    # origin_index instead of the duplicated magic number 13.)
    point_region = voronoi.point_region[origin_index]
    vertice_indices = voronoi.regions[point_region]
    vertices = voronoi.vertices[vertice_indices].tolist()

    # Create a mapping between the original index and an index in the new list
    index_map = {
        old_id: new_id for (new_id, old_id) in enumerate(vertice_indices)
    }

    # The ridges are the faces of a 3D Voronoi cell. Here we search for ridges
    # that are placed between the origin and some other point. These form the
    # BZ faces. (Removed the no-op "faces = faces" statement.)
    faces = []
    for key, ridge_indices in voronoi.ridge_dict.items():
        if origin_index in key:
            faces.append([index_map[i] for i in ridge_indices])

    return {
        "vertices": vertices,
        "faces": faces,
    }
......@@ -33,7 +33,7 @@ from bs4 import BeautifulSoup
from matid import SymmetryAnalyzer
from nomad import processing as proc, search, datamodel, infrastructure, utils, config
from nomad.normalizing.structure import get_normalized_wyckoff
from nomad.atomutils import get_normalized_wyckoff
from nomad.cli.cli import cli
from nomad import config
......@@ -505,9 +505,8 @@ def prototypes_update(ctx, filepath, matches_only):
)
# Try to first see if the space group can be matched with the one in AFLOW
tolerance = config.normalize.symmetry_tolerance
try:
symm = SymmetryAnalyzer(atoms, tolerance)
symm = SymmetryAnalyzer(atoms, config.normalize.prototype_symmetry_tolerance)
spg_number = symm.get_space_group_number()
wyckoff_matid = symm.get_wyckoff_letters_conventional()
norm_system = symm.get_conventional_system()
......@@ -528,7 +527,5 @@ def prototypes_update(ctx, filepath, matches_only):
.format(n_prototypes, n_unmatched, n_failed)
)
aflow_prototypes["matid_symmetry_tolerance"] = tolerance
# Write data file to the specified path
write_prototype_data_file(aflow_prototypes, filepath)
......@@ -153,20 +153,6 @@ tests = NomadConfig(
)
def api_url(ssl: bool = True):
# Builds the API base URL '<scheme>://<host>/<base-path>/api' from the
# services config; https is used only when both the services.https setting
# and the ssl argument are true.
return '%s://%s/%s/api' % (
'https' if services.https and ssl else 'http',
services.api_host.strip('/'),
services.api_base_path.strip('/'))
def gui_url():
# Derives the GUI URL from the API URL by cutting off the trailing 'api'
# (and at most one trailing slash) and appending 'gui'.
base = api_url(True)[:-3]
if base.endswith('/'):
base = base[:-1]
return '%s/gui' % base
mail = NomadConfig(
enabled=False,
with_login=False,
......@@ -181,15 +167,34 @@ mail = NomadConfig(
# NOTE(review): this hunk interleaves old and new diff lines -- the two
# system_classification_with_clusters_threshold entries below are the
# pre-change (50) and post-change (64) values of the same keyword; only one
# belongs in the real file. Confirm against the repository.
normalize = NomadConfig(
# The system size limit for running the dimensionality analysis. For very
# large systems the dimensionality analysis will get too expensive.
system_classification_with_clusters_threshold=50,
system_classification_with_clusters_threshold=64,
# Symmetry tolerance controls the precision used by spglib in order to find
# symmetries. The atoms are allowed to move 1/2*symmetry_tolerance from
# their symmetry positions in order for spglib to still detect symmetries.
# The unit is angstroms. The value of 0.1 is used e.g. by Materials Project
# according to
# https://pymatgen.org/pymatgen.symmetry.analyzer.html#pymatgen.symmetry.analyzer.SpacegroupAnalyzer
symmetry_tolerance=0.1,
# The symmetry tolerance used in aflow prototype matching. Should only be
# changed before re-running the prototype detection.
prototype_symmetry_tolerance=0.1,
# Maximum number of atoms in the single cell of a 2D material for it to be
# considered valid.
max_2d_single_cell_size=7,
# The distance tolerance between atoms for grouping them into the same
# cluster. Used in detecting system type.
cluster_threshold=3.1,
# Defines the "bin size" for rounding cell angles for the material hash
angle_rounding=float(10.0), # unit: degree
# The threshold for a system to be considered "flat". Used e.g. when
# determining if a 2D structure is purely 2-dimensional to allow extra rigid
# transformations that are improper in 3D but proper in 2D.
flat_dim_threshold=0.1,
# The threshold for point equality in k-space. Unit: 1/m.
k_space_precision=150e6,
# The energy threshold for how much a band can be on top or below the fermi
# level in order to detect a gap. k_B x T at room temperature. Unit: Joule
fermi_level_precision=300 * 1.38064852E-23
)
client = NomadConfig(
......@@ -219,11 +224,44 @@ raw_file_strip_cutoff = 1000
use_empty_parsers = False
reprocess_unmatched = True
springer_db_relative_path = 'normalizing/data/SM_all08.db'
springer_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), springer_db_relative_path)
def api_url(ssl: bool = True):
    """Return the API base URL, e.g. 'https://<host>/<base-path>/api'.

    https is used only when both the services.https setting and the ssl
    argument are true.
    """
    scheme = 'https' if services.https and ssl else 'http'
    host = services.api_host.strip('/')
    base_path = services.api_base_path.strip('/')
    return '%s://%s/%s/api' % (scheme, host, base_path)
def gui_url():
    """Return the URL under which the GUI is served."""
    # The API URL always ends in 'api'; cut it off together with at most
    # one trailing slash to obtain the service base URL.
    base = api_url(True)[:-3]
    base = base[:-1] if base.endswith('/') else base
    return '%s/gui' % base
def check_config():
    """Used to check that the current configuration is valid. Should only be
    called once after the final config is loaded.

    Raises:
        AssertionError: if there is a contradiction or invalid values in the
            config file settings.
    """
    # The cached AFLOW prototype data was generated with a specific symmetry
    # tolerance; if the configured tolerance no longer matches it, the
    # prototype information must be regenerated.
    if normalize.prototype_symmetry_tolerance != normalize.symmetry_tolerance:
        raise AssertionError(
            "The AFLOW prototype information is outdated due to changed tolerance "
            "for symmetry detection. Please update the AFLOW prototype information "
            "by running the CLI command 'nomad admin ops prototype-update "
            "--matches-only'."
        )
def normalize_loglevel(value, default_level=logging.INFO):
plain_value = value
if plain_value is None:
......@@ -352,3 +390,4 @@ def load_config(config_file: str = os.environ.get('NOMAD_CONFIG', 'nomad.yaml'))
# Load the configuration on module import and validate it immediately.
load_config()
check_config()
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an"AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
# List of atomic masses (natural isotope dist.) in order, atomic mass units.
# These custom mass definitions are used because the ones provided by ASE are not
# as complete. Origin: phonopy-1.11.2.25.tar.gz:phonopy/structure/atoms.py:atom_data
NUMBER_TO_MASS_MAP_AMU = np.array([
np.nan, # 0
1.00794, # 1
4.002602, # 2
6.941, # 3
9.012182, # 4
10.811, # 5
12.0107, # 6
14.0067, # 7
15.9994, # 8
18.9984032, # 9
20.1797, # 10
22.98976928, # 11
24.3050, # 12
26.9815386, # 13
28.0855, # 14
30.973762, # 15
32.065, # 16
35.453, # 17
39.948, # 18
39.0983, # 19
40.078, # 20
44.955912, # 21
47.867, # 22
50.9415, # 23
51.9961, # 24
54.938045, # 25
55.845, # 26
58.933195, # 27
58.6934, # 28
63.546, # 29
65.38, # 30
69.723, # 31
72.64, # 32
74.92160, # 33
78.96, # 34
79.904, # 35
83.798, # 36
85.4678, # 37
87.62, # 38
88.90585, # 39
91.224, # 40
92.90638, # 41
95.96, # 42
98.9062, # 43 - NIST
101.07, # 44
102.90550, # 45
106.42, # 46
107.8682, # 47
112.411, # 48
114.818, # 49
118.710, # 50
121.760, # 51
127.60, # 52
126.90447, # 53
131.293, # 54
132.9054519, # 55
137.327, # 56
138.90547, # 57
140.116, # 58
140.90765, # 59
144.242, # 60
145, # 61 most stable isotope
150.36, # 62
151.964, # 63
157.25, # 64
158.92535, # 65
162.500, # 66
164.93032, # 67
167.259, # 68
168.93421, # 69
173.054, # 70
174.9668, # 71
178.49, # 72
180.94788, # 73
183.84, # 74
186.207, # 75
190.23, # 76
192.217, # 77
195.084, # 78
196.966569, # 79
200.59, # 80
204.3833, # 81
207.2, # 82
208.98040, # 83
209, # 84 - NIST
210, # 85 - NIST
222, # 86 - NIST
223, # 87 - NIST
226, # 88 - NIST
227, # 89 - NIST
232.03806, # 90
231.03588, # 91
238.02891, # 92
237, # 93 - NIST
244, # 94 - NIST
243, # 95 - most stable isotope
247, # 96 - most stable isotope
247, # 97 - most stable isotope
251, # 98 - most stable isotope
252, # 99 - most stable isotope
257, # 100 - most stable isotope
258, # 101 - most stable isotope
259, # 102 - most stable isotope
262, # 103 - most stable isotope
261, # 104 - most stable isotope
262, # 105 - most stable isotope
266, # 106 - most stable isotope