Commit a0ddb4ac authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Various migration related improvements.

parent f3a53aab
......@@ -112,6 +112,7 @@ class CalcWithMetadata():
self.system: str = None
self.crystal_system: str = None
self.spacegroup: str = None
self.spacegroup_symbol: str = None
self.code_name: str = None
self.code_version: str = None
......
......@@ -613,10 +613,7 @@ class StagingUploadFiles(UploadFiles):
Raises:
KeyError: If the mainfile does not exist.
"""
hash = hashlib.sha512()
hash.update(self.upload_id.encode('utf-8'))
hash.update(mainfile.encode('utf-8'))
return utils.hash(hash.digest())
return utils.hash(self.upload_id, mainfile)
def calc_hash(self, mainfile: str) -> str:
"""
......@@ -634,7 +631,7 @@ class StagingUploadFiles(UploadFiles):
for data in iter(lambda: f.read(65536), b''):
hash.update(data)
return utils.hash(hash.digest())
return utils.make_websave(hash)
class ArchiveBasedStagingUploadFiles(StagingUploadFiles):
......
......@@ -28,6 +28,7 @@ import zipfile
import math
from mongoengine import Document, IntField, StringField, DictField, ListField
import time
import datetime
from bravado.exception import HTTPNotFound, HTTPBadRequest, HTTPGatewayTimeout
import glob
import os
......@@ -47,6 +48,9 @@ max_package_size = 16 * 1024 * 1024 * 1024 # 16 GB
""" The maximum size of a package that will be used as an upload on nomad@FAIRDI """
# Size threshold (bytes); below this, stats are presumably not consulted for
# file stats — TODO confirm exact semantics against the call sites.
use_stats_for_filestats_threshold = 1024

# Default comment applied to migrated entries with unknown provenance.
# NOTE(review): 'provernance' is a typo, but it is part of the stored value —
# fixing it would change migrated data; confirm before correcting.
default_comment = 'entry with unknown provernance'

# Fallback uploader used when no source uploader can be determined
# (user id 1 — presumably an admin/system account; verify).
default_uploader = dict(id=1)
def iterable_to_stream(iterable, buffer_size=io.DEFAULT_BUFFER_SIZE):
"""
......@@ -501,6 +505,21 @@ class NomadCOEMigration:
logger.debug('identified packages for source upload', n_packages=package_query.count())
return package_query, source_upload_id
def _surrogate_metadata(self, source: CalcWithMetadata):
    """
    Derive upload-level metadata from an existing calc's metadata. The result can
    be used as a template for new calcs of the same upload.

    Arguments:
        source: The calc metadata to copy the upload-level fields from.

    Returns:
        A new :class:`CalcWithMetadata` carrying only the upload-level fields.
    """
    # Only upload-level fields are carried over; calc-specific fields
    # (e.g. mainfile) are left unset on the surrogate.
    shared_fields = (
        'uploader', 'with_embargo', 'upload_time', 'coauthors',
        'shared_with', 'comment', 'references', 'datasets')
    return CalcWithMetadata(**{
        field: getattr(source, field) for field in shared_fields})
def set_pid_prefix(self, prefix: int = default_pid_prefix):
"""
Sets the repo db pid counter to the given values. Allows to create new calcs
......@@ -668,6 +687,7 @@ class NomadCOEMigration:
# grab source calcs
source_calcs = dict()
surrogate_source_calc_with_metadata = None
with utils.timer(logger, 'loaded source metadata'):
for source_calc in SourceCalc.objects(
upload=source_upload_id, mainfile__in=calc_mainfiles):
......@@ -677,6 +697,19 @@ class NomadCOEMigration:
source_calc_with_metadata.mainfile = source_calc.mainfile
source_calcs[source_calc.mainfile] = (source_calc, source_calc_with_metadata)
# establish a surrogate for new calcs
if surrogate_source_calc_with_metadata is None:
surrogate_source_calc_with_metadata = \
self._surrogate_metadata(source_calc_with_metadata)
# try to find a surrogate outside the package, if necessary
if surrogate_source_calc_with_metadata is None:
source_calc = SourceCalc.objects(upload=source_upload_id).first()
if source_calc is not None:
source_calc_with_metadata = CalcWithMetadata(**source_calc.metadata)
surrogate_source_calc_with_metadata = \
self._surrogate_metadata(source_calc_with_metadata)
# verify upload against source
calcs_in_search = 0
with utils.timer(logger, 'varyfied upload against source calcs'):
......@@ -699,6 +732,22 @@ class NomadCOEMigration:
else:
calc_logger.info('processed a calc that has no source')
report.new_calcs += 1
# guessing the metadata from other calcs in upload/package
if surrogate_source_calc_with_metadata is not None:
new_calc_with_metadata = CalcWithMetadata(**surrogate_source_calc_with_metadata.to_dict())
new_calc_with_metadata.mainfile = calc['mainfile']
else:
calc_logger.warning('could not determine any metadata for new calc')
create_time_epoch = os.path.getctime(package.upload_path)
new_calc_with_metadata = CalcWithMetadata(
upload_time=datetime.datetime.fromtimestamp(create_time_epoch),
with_embargo=package.restricted > 0,
comment=default_comment,
uploader=default_uploader,
mainfile=calc['mainfile'])
surrogate_source_calc_with_metadata = new_calc_with_metadata
source_calcs[calc['mainfile']] = (None, new_calc_with_metadata)
if len(calc_mainfiles) != calcs_in_search:
logger.error('missmatch between processed calcs and calcs found with search')
......
......@@ -147,5 +147,5 @@ class SystemBasedNormalizer(Normalizer, metaclass=ABCMeta):
except Exception as e:
self.logger.error(
'Unexpected error during normalizing', normalizer=self.__class__.__name__,
section='section_system', g_index=g_index, exc_info=e)
section='section_system', g_index=g_index, exc_info=e, error=str(e))
raise e
......@@ -118,6 +118,9 @@ class RepositoryNormalizer(Normalizer):
b.addValue(
'repository_spacegroup_nr',
self.get_optional_value('space_group_number', 'section_symmetry', 0))
b.addValue(
'repository_spacegroup_symbol',
self.get_optional_value('international_short_symbol', 'section_symmetry', 0))
b.addValue(
'repository_basis_set_type',
self.get_optional_value('program_basis_set_type', 'section_run'))
......
......@@ -15,10 +15,10 @@
from typing import Any
import ase
import numpy as np
import matid
import json
from matid import SymmetryAnalyzer, Classifier
from matid import SymmetryAnalyzer
from matid.geometry import get_dimensionality
from nomad import utils, config
from nomad.normalizing.normalizer import SystemBasedNormalizer
......@@ -83,7 +83,7 @@ class SystemNormalizer(SystemBasedNormalizer):
except Exception as e:
self.logger.error(
'cannot build ase atoms from atom labels',
atom_labels=atom_labels, exc_info=e)
atom_labels=atom_labels, exc_info=e, error=str(e))
return
chemical_symbols = list(atoms.get_chemical_symbols())
if atom_labels != chemical_symbols:
......@@ -110,7 +110,8 @@ class SystemNormalizer(SystemBasedNormalizer):
try:
atoms.set_pbc(pbc)
except Exception as e:
self.logger.error('cannot use pbc with ase atoms', exc_info=e, pbc=pbc)
self.logger.error(
'cannot use pbc with ase atoms', exc_info=e, pbc=pbc, error=str(e))
return
# formulas
......@@ -131,7 +132,8 @@ class SystemNormalizer(SystemBasedNormalizer):
try:
atoms.set_positions(1e10 * atom_positions)
except Exception as e:
self.logger.error('cannot use positions with ase atoms', exc_info=e)
self.logger.error(
'cannot use positions with ase atoms', exc_info=e, error=str(e))
return
# lattice vectors
......@@ -147,7 +149,8 @@ class SystemNormalizer(SystemBasedNormalizer):
try:
atoms.set_cell(1e10 * lattice_vectors)
except Exception as e:
self.logger.error('cannot use lattice_vectors with ase atoms', exc_info=e)
self.logger.error(
'cannot use lattice_vectors with ase atoms', exc_info=e, error=str(e))
return
# configuration
......@@ -164,15 +167,7 @@ class SystemNormalizer(SystemBasedNormalizer):
self.logger, 'system classification executed',
system_size=atoms.get_number_of_atoms()):
try:
classifier = Classifier()
system_type = classifier.classify(atoms)
except Exception as e:
self.logger.error('matid project system classification failed', exc_info=e)
else:
# Convert Matid classification to a Nomad classification.
system_type = self.map_matid_to_nomad_system_types(atoms, system_type)
set_value('system_type', system_type)
self.system_type_analysis(atoms)
# symmetry analysis
if atom_positions is not None and (lattice_vectors is not None or not any(pbc)):
......@@ -182,6 +177,35 @@ class SystemNormalizer(SystemBasedNormalizer):
self.symmetry_analysis(atoms)
def system_type_analysis(self, atoms) -> None:
    """
    Determine the dimensionality and hence the system type of the system with
    matid, and write the resulting system type to the backend.

    Arguments:
        atoms: The ase.Atoms representation of the system.
    """
    # Default if matid fails or cannot determine a dimensionality.
    system_type = 'unavailable'
    try:
        dimensionality = get_dimensionality(
            atoms, cluster_threshold=3.1, return_clusters=False)

        # A None dimensionality falls through and keeps 'unavailable'.
        if dimensionality == 0:
            # Zero-dimensional: distinguish a single atom from a cluster/molecule.
            if atoms.get_number_of_atoms() == 1:
                system_type = 'atom'
            else:
                system_type = 'molecule / cluster'
        elif dimensionality == 1:
            system_type = '1D'
        elif dimensionality == 2:
            system_type = '2D / surface'
        elif dimensionality == 3:
            system_type = 'bulk'
    except Exception as e:
        self.logger.error(
            'matid project system classification failed', exc_info=e, error=str(e))

    self._backend.addValue('system_type', system_type)
def symmetry_analysis(self, atoms) -> None:
"""Analyze the symmetry of the material being simulated.
......@@ -278,47 +302,3 @@ class SystemNormalizer(SystemBasedNormalizer):
self._backend.closeSection('section_original_system', origGid)
self._backend.closeSection('section_symmetry', symGid)
# Class-level (static) dictionary mapping matid classification classes to
# Nomad system-type labels. Consumed via isinstance checks in
# map_matid_to_nomad_system_types; order matters there, since the first
# matching class wins (e.g. Material2D before the more generic Class2D).
translation_dict = {
    matid.classifications.Class0D: 'Atom',
    matid.classifications.Class1D: '1D',
    matid.classifications.Material2D: '2D',
    matid.classifications.Surface: 'Surface',
    matid.classifications.Class2DWithCell: '2D',
    matid.classifications.Class2D: '2D',
    matid.classifications.Class3D: 'Bulk',
    matid.classifications.Unknown: 'Unknown'
}
def map_matid_to_nomad_system_types(self, atoms, system_type):
    """ Map the system type classification from matid to Nomad values.

    Args:
        atoms: The ase.Atoms representation of the system; only used to
            distinguish single atoms from clusters.
        system_type: Object of a matid class representing a
            material classification.
    Returns:
        nomad_classification: String representing a material
            classification that fits into Nomad's current way
            of naming material classes. May be None if the matid type
            is not in translation_dict.
    """
    nomad_classification = None
    # First matching matid class in translation_dict wins.
    for matid_class in SystemNormalizer.translation_dict:
        if isinstance(system_type, matid_class):
            nomad_classification = SystemNormalizer.translation_dict[matid_class]
            break

    # Check to make sure a match was found in translating classes.
    if nomad_classification is None:
        # Then something unexpected has happened with our system_type.
        self.logger.error(
            'Matid classfication has given us an unexpected type: %s' % system_type)

    # A 0D system with more than one atom is a cluster, not a single atom.
    if nomad_classification == 'Atom' and (atoms.get_number_of_atoms() > 1):
        nomad_classification = 'Molecule / Cluster'
    if nomad_classification == 'Unknown':
        self.logger.warning('Could not determine system type.')

    return nomad_classification
......@@ -548,6 +548,7 @@ class LocalBackend(LegacyParserBackend):
target.system = calc_data['repository_system_type']
target.atoms = calc_data['repository_atomic_elements']
target.spacegroup = calc_data['repository_spacegroup_nr']
target.spacegroup_symbol = calc_data['repository_spacegroup_symbol']
target.formula = calc_data['repository_chemical_formula']
target.code_version = calc_data['repository_code_version']
target.code_name = calc_data['repository_program_name']
......
......@@ -211,7 +211,9 @@ class Proc(Document, metaclass=ProcMetaclass):
for error in errors:
if isinstance(error, Exception):
failed_with_exception = True
Proc.log(logger, log_level, 'task failed with exception', exc_info=error)
Proc.log(
logger, log_level, 'task failed with exception',
exc_info=error, error=str(error))
self.errors = [str(error) for error in errors]
self.complete_time = datetime.now()
......
......@@ -22,7 +22,7 @@ from elasticsearch_dsl import Document, InnerDoc, Keyword, Text, Date, \
import elasticsearch.helpers
import ase.data
from nomad import config, datamodel, infrastructure, datamodel, coe_repo, parsing
from nomad import config, datamodel, infrastructure, datamodel, coe_repo, parsing, utils
path_analyzer = analyzer(
'path_analyzer',
......@@ -93,9 +93,17 @@ class Entry(Document):
system = Keyword()
crystal_system = Keyword()
spacegroup = Keyword()
spacegroup_symbol = Keyword()
code_name = Keyword()
code_version = Keyword()
group_hash = Keyword()
"""
A hash that is used to collapse results in search results. It is based on:
formula, spacegroup, basis_set, xc_functional, code_name, code_version,
with_embargo, comment, references, authors
"""
n_total_energies = Integer()
n_geometries = Integer()
geometries = Keyword(multi=True)
......@@ -142,9 +150,22 @@ class Entry(Document):
self.system = source.system
self.crystal_system = source.crystal_system
self.spacegroup = source.spacegroup
self.spacegroup_symbol = source.spacegroup_symbol
self.code_name = source.code_name
self.code_version = source.code_version
self.group_hash = utils.hash(
self.formula,
self.spacegroup,
self.basis_set,
self.xc_functional,
self.code_name,
self.code_version,
self.with_embargo,
self.comment,
self.references,
self.authors)
if source.backend is not None:
quantities = set()
geometries = set()
......@@ -202,6 +223,7 @@ aggregations = {
search_quantities = {
'formula': ('term', 'formula', 'The full reduced formula.'),
'spacegroup': ('term', 'spacegroup', 'The spacegroup as int.'),
'spacegroup_symbol': ('term', 'spacegroup', 'The spacegroup as international short symbol.'),
'basis_set': ('term', 'basis_set', 'The basis set type.'),
'atoms': ('term', 'atoms', (
'Search the given atom. This quantity can be used multiple times to search for '
......
......@@ -46,6 +46,7 @@ import uuid
import time
import re
from werkzeug.exceptions import HTTPException
import hashlib
from nomad import config
......@@ -53,12 +54,21 @@ default_hash_len = 28
""" Length of hashes and hash-based ids (e.g. calc, upload) in nomad. """
def hash(hash: bytes, length: int = default_hash_len) -> str:
""" Creates a websave hash for the given bytes of the given length. """
def hash(*args, length: int = default_hash_len) -> str:
    """ Creates a web-safe hash of the given length based on the str() of the
    given arguments.

    Arguments:
        *args: The values to hash; each is converted with str() and fed to sha512.
        length: The number of characters of the resulting hash,
            default is ``default_hash_len``.

    NOTE: this function shadows the builtin ``hash`` within this module.
    """
    # Use a distinct local name: the original reused 'hash', shadowing this
    # very function (and the builtin) inside its own body.
    digest = hashlib.sha512()
    for arg in args:
        digest.update(str(arg).encode('utf-8'))
    return make_websave(digest, length=length)
def make_websave(hash, length: int = default_hash_len) -> str:
    """ Creates a web-safe base64 string for a hashlib hash object.

    Arguments:
        hash: A hashlib hash object (e.g. sha512); its digest is encoded.
        length: Number of characters to keep. If not positive, the full
            encoding minus the trailing base64 padding is returned.
    """
    # The span contained duplicated old/new return lines (diff residue);
    # keep the new contract: encode the digest of the hash object.
    encoded = base64.b64encode(hash.digest(), altchars=b'-_')
    if length > 0:
        return encoded[:length].decode('utf-8')
    else:
        # A sha512 digest (64 bytes) base64-encodes with '==' padding;
        # [0:-2] strips exactly that padding.
        return encoded[0:-2].decode('utf-8')
def sanitize_logevent(event: str) -> str:
......
......@@ -597,8 +597,8 @@ class TestRepo(UploadFilesBasedTests):
assert key in results[0]
@pytest.mark.parametrize('calcs, quantity, value', [
(2, 'system', 'Bulk'),
(0, 'system', 'Atom'),
(2, 'system', 'bulk'),
(0, 'system', 'atom'),
(1, 'atoms', 'Br'),
(1, 'atoms', 'Fe'),
(0, 'atoms', ['Fe', 'Br']),
......
......@@ -55,8 +55,8 @@ def test_search(elastic, normalized: parsing.LocalBackend):
total, hits, aggs = aggregate_search()
assert total == 1
assert hits[0]['calc_id'] == calc_with_metadata.calc_id
assert 'Bulk' in aggs['system']
assert aggs['system']['Bulk'] == 1
assert 'bulk' in aggs['system']
assert aggs['system']['bulk'] == 1
def test_authors(elastic, normalized: parsing.LocalBackend, test_user: coe_repo.User, other_test_user: coe_repo.User):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment