Commit e1aae42b authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'migration' into 'master'

Migration

See merge request !39
parents 87351529 a663d4f6
Pipeline #46164 canceled with stages
in 26 seconds
Subproject commit 3b199d1f17f9cdf94e08fc5a3b46e2895147463d
Subproject commit 5f97f32086c281ebda5ab6084ae2c7eba16b516f
Subproject commit 7505b8b16035ec22025875bb6dfbe22f812ddcdd
Subproject commit 0feea3bb5aeb2847cde41c4d642c6b5af8e38cd5
Subproject commit 4802d6d8ab942d2431a39d37e2ffefaf5c6ef478
Subproject commit 3811ced85fb7d68ca579d5ca8d93e800f48c53a5
Subproject commit 0e0c20ab4adea61ad31f8a7e96876434008fb490
Subproject commit d8ee02926a9f37317c880ba3caa31f5140e7730e
......@@ -25,6 +25,7 @@ from nomad import config, utils
from nomad.files import ArchiveBasedStagingUploadFiles
from nomad.parsing import parser_dict, LocalBackend, match_parser
from nomad.normalizing import normalizers
from nomad.datamodel import CalcWithMetadata
from .main import cli
......@@ -173,5 +174,6 @@ def local(archive_id, show_backend=False, show_metadata=False, **kwargs):
if show_backend:
backend.write_json(sys.stdout, pretty=True)
if show_metadata:
metadata = backend.to_calc_with_metadata()
metadata = CalcWithMetadata()
metadata.apply_domain_metadata(backend)
ujson.dump(metadata.to_dict(), sys.stdout, indent=4)
......@@ -20,7 +20,7 @@ from sqlalchemy.sql.expression import literal
from datetime import datetime
from nomad import infrastructure, utils
from nomad.datamodel import CalcWithMetadata
from nomad.datamodel import DFTCalcWithMetadata
from . import base
from .user import User
......@@ -180,7 +180,7 @@ class Calc(Base):
_dataset_cache: dict = {}
def apply_calc_with_metadata(self, calc: CalcWithMetadata, context: PublishContext) -> None:
def apply_calc_with_metadata(self, calc: DFTCalcWithMetadata, context: PublishContext) -> None:
"""
Applies the data from ``source`` to this coe Calc object.
"""
......@@ -309,14 +309,14 @@ class Calc(Base):
coe_calc.citations.append(citation)
def to_calc_with_metadata(self) -> CalcWithMetadata:
def to_calc_with_metadata(self) -> DFTCalcWithMetadata:
"""
Creates a :class:`CalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Creates a :class:`DFTCalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Be aware that ``upload_id`` and ``calc_id``, might be old coe repository
``upload_name`` and calculation ``checksum`` depending on the context, i.e. used
database.
"""
result = CalcWithMetadata(
result = DFTCalcWithMetadata(
upload_id=self.upload.upload_id if self.upload else None,
calc_id=self.checksum)
......
......@@ -42,7 +42,7 @@ This module also provides functionality to add parsed calculation data to the db
:undoc-members:
"""
from typing import Type
from typing import Type, cast
import datetime
from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
from sqlalchemy.orm import relationship
......@@ -52,7 +52,7 @@ import warnings
from sqlalchemy import exc as sa_exc
from nomad import utils, infrastructure, config
from nomad.datamodel import UploadWithMetadata
from nomad.datamodel import UploadWithMetadata, DFTCalcWithMetadata
from .calc import Calc, PublishContext
from .base import Base
......@@ -194,7 +194,8 @@ class Upload(Base): # type: ignore
upload=coe_upload)
repo_db.add(coe_calc)
coe_calc.apply_calc_with_metadata(calc, context=context)
coe_calc.apply_calc_with_metadata(
cast(DFTCalcWithMetadata, calc), context=context)
logger.debug(
'added calculation, not yet committed', calc_id=coe_calc.calc_id)
......
......@@ -17,26 +17,33 @@ import os
import os.path
import yaml
import warnings
from kombu import Queue
from nomad import utils
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
class NomadConfig(utils.POPO):
pass
class NomadConfig(dict):
    """
    A dict subclass that uses attributes as key/value pairs.

    ``cfg.name`` reads ``cfg['name']``, ``cfg.name = value`` stores
    ``cfg['name'] = value`` and ``del cfg.name`` removes the key ``'name'``.
    Reading or deleting a name that is not a key raises :class:`AttributeError`.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __getattr__(self, name):
        # Python only calls __getattr__ after the regular attribute lookup failed,
        # therefore the dict's own methods (keys, items, ...) always take precedence.
        try:
            return self[name]
        except KeyError:
            # 'from None' keeps the traceback free of the internal KeyError
            raise AttributeError("No such attribute: " + name) from None

    def __setattr__(self, name, value):
        # every attribute assignment is stored as a dict item
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError("No such attribute: " + name) from None
# class ConfigProperty:
# def __init__(
# self, name: str, default_value: Union[int, str, bool], help: str = None,
# env_var: str = None) -> None:
# self.name = name
# self.default_value = default_value,
# self.help = help
# self.env_var = env_var
CELERY_WORKER_ROUTING = 'worker'
CELERY_QUEUE_ROUTING = 'queue'
......@@ -57,10 +64,11 @@ celery = NomadConfig(
timeout=1800, # 1/2 h
acks_late=True,
routing=CELERY_QUEUE_ROUTING,
task_queues=[
Queue('calcs', routing_key='calcs', queue_arguments={'x-max-priority': 10}),
Queue('uploads', routing_key='uploads', queue_arguments={'x-max-priority': 100})
]
priorities={
'Upload.process_upload': 5,
'Upload.delete_upload': 9,
'Upload.publish_upload': 10
}
)
fs = NomadConfig(
......@@ -69,7 +77,8 @@ fs = NomadConfig(
public='.volumes/fs/public',
migration_packages='.volumes/fs/migration_packages',
local_tmp='/tmp',
prefix_size=2
prefix_size=2,
working_directory=os.getcwd()
)
elastic = NomadConfig(
......@@ -109,6 +118,7 @@ services = NomadConfig(
admin_password='password',
disable_reset=True,
not_processed_value='not processed',
unavailable_value='unavailable',
https=False
)
......@@ -153,15 +163,16 @@ client = NomadConfig(
url='http://localhost:8000/nomad/api'
)
console_log_level = logging.WARNING
service = 'unknown nomad service'
version = '4.3' # TODO replace with git hash?
release = 'devel'
domain = 'DFT'
service = 'unknown nomad service'
auxfile_cutoff = 30
version = '4.3' # TODO replace with git hash?
console_log_level = logging.WARNING
def get_loglevel_from_env(key, default_level=logging.INFO):
plain_value = os.environ.get(key, None)
def normalize_loglevel(value, default_level=logging.INFO):
plain_value = value
if plain_value is None:
return default_level
else:
......@@ -172,8 +183,8 @@ def get_loglevel_from_env(key, default_level=logging.INFO):
transformations = {
'console_log_level': get_loglevel_from_env,
'logstash_level': get_loglevel_from_env
'console_log_level': normalize_loglevel,
'logstash_level': normalize_loglevel
}
......
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains classes that allow to represent the core
nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction
independent from their representation in the different modules
:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.parsing`,
:py:mod:`nomad.search`, :py:mod:`nomad.api`, :py:mod:`nomad.migration`.
It is not about representing every detail, but those parts that are directly involved in
api, processing, migration, mirroring, or other 'infrastructure' operations.
Transformations between different implementations of the same entity can be built
and used. To ease the number of necessary transformations the classes
:class:`UploadWithMetadata` and :class:`CalcWithMetadata` can act as intermediate
representations. Therefore, implement only transformation from and to these
classes. These are the implemented transformations:
.. image:: datamodel_transformations.png
.. autoclass:: nomad.datamodel.UploadWithMetadata
:members:
.. autoclass:: nomad.datamodel.CalcWithMetadata
:members:
The class :class:`CalcWithMetadata` only defines non domain specific metadata quantities
about ids, user metadata, etc. To define domain specific quantities the classes
:class:`Domain` and :class:`DomainQuantity` must be used.
.. autoclass:: nomad.datamodel.Domain
:members:
.. autoclass:: nomad.datamodel.DomainQuantity
:members:
"""
import sys
from nomad.datamodel.base import UploadWithMetadata, CalcWithMetadata, Domain
from nomad.datamodel.dft import DFTCalcWithMetadata
# Override the CalcWithMetadata with the domain specific descendant
setattr(sys.modules['nomad.datamodel'], 'CalcWithMetadata', Domain.domain_class)
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Iterable, List, Dict, Type, Tuple
import datetime
from elasticsearch_dsl import Keyword
from nomad import utils, config
class UploadWithMetadata():
    """
    Upload level counterpart of :class:`CalcWithMetadata`: a plain object holding
    the upload id, the uploader, the upload time, and the calcs of an upload.
    """
    def __init__(self, **kwargs):
        # defaults, each can be replaced (and further attributes added) via kwargs
        self.upload_id: str = None
        self.uploader: utils.POPO = None
        self.upload_time: datetime.datetime = None
        self.calcs: Iterable['CalcWithMetadata'] = []

        for attribute_name in kwargs:
            setattr(self, attribute_name, kwargs[attribute_name])

    @property
    def calcs_dict(self) -> Dict[str, 'CalcWithMetadata']:
        """ The calcs of this upload as a dict with the ``calc_id`` as keys. """
        calcs_by_id = {}
        for calc in self.calcs:
            calcs_by_id[calc.calc_id] = calc
        return calcs_by_id
class CalcWithMetadata():
    """
    A dict/POPO class that can be used for mapping calc representations with calc metadata.
    We have multiple representations of calcs and their calc metadata. To avoid implementing
    mappings between all combinations, just implement mappings with this class and use
    mapping transitivity. E.g. instead of A -> B, A -> this -> B.

    Attributes:
        upload_id: The ``upload_id`` of the calculations upload (random UUID).
        calc_id: The unique mainfile based calculation id.
        calc_hash: The raw file content based checksum/hash of this calculation.
        pid: The unique persistent id of this calculation.
        mainfile: The upload relative mainfile path.
        files: A list of all files, relative to upload.
        upload_time: The time when the calc was uploaded.
        uploader: An object describing the uploading user, has at least ``user_id``
        processed: Boolean indicating if this calc was successfully processed and archive
            data and calc metadata is available.
        last_processing: A datetime with the time of the last successful processing.
        nomad_version: A string that describes the version of the nomad software that was
            used to do the last successful processing.
        with_embargo: Show if user set an embargo on the calculation.
        coauthors: List of coauthor user objects with at least ``user_id``.
        shared_with: List of users this calcs ownership is shared with, objects with at
            least ``user_id``.
        comment: String comment.
        references: Objects describing user provided references, keys are ``id`` and ``value``.
        datasets: Objects describing the datasets, keys are ``id``, ``name``, ``doi``.
            DOI is optional, is an object with key ``id``, ``value``.
    """
    def __init__(self, **kwargs):
        # id relevant metadata
        self.upload_id: str = None
        self.calc_id: str = None
        self.calc_hash: str = None
        self.mainfile: str = None
        self.pid: int = None

        # basic upload and processing related metadata
        self.upload_time: datetime.datetime = None
        self.files: List[str] = None
        self.uploader: utils.POPO = None
        self.processed: bool = False
        self.last_processing: datetime.datetime = None
        self.nomad_version: str = None

        # user metadata, i.e. quantities given and editable by the user
        self.with_embargo: bool = None
        self.published: bool = False
        self.coauthors: List[utils.POPO] = []
        self.shared_with: List[utils.POPO] = []
        self.comment: str = None
        self.references: List[utils.POPO] = []
        self.datasets: List[utils.POPO] = []

        # parser related general (not domain specific) metadata
        self.parser_name = None

        self.update(**kwargs)

    def to_dict(self):
        """
        Returns a dict with all attributes of this calc that are not ``None``;
        an attribute called ``backend`` is always left out.
        """
        return {
            key: value for key, value in self.__dict__.items()
            if value is not None and key not in ['backend']
        }

    def update(self, **kwargs):
        """
        Sets the given keyword arguments as attributes on this calc.

        ``None`` values and empty lists are ignored, i.e. the current attribute value
        is kept. Plain dicts, and lists whose first item is a plain dict, are converted
        to :class:`utils.POPO` objects before they are stored.
        """
        for key, value in kwargs.items():
            if value is None:
                continue

            if isinstance(value, list):
                if not value:
                    continue

                # only the first item is inspected to decide about the conversion
                if isinstance(value[0], dict) and not isinstance(value[0], utils.POPO):
                    value = [utils.POPO(**item) for item in value]

            if isinstance(value, dict) and not isinstance(value, utils.POPO):
                value = utils.POPO(**value)

            setattr(self, key, value)

    def apply_user_metadata(self, metadata: dict):
        """
        Applies a user provided metadata dict to this calc.

        ``pid``, ``comment``, ``upload_time``, ``with_embargo`` and all list valued
        attributes are overwritten, also if the respective key is missing. The
        ``uploader`` is only replaced if ``_uploader`` is given. List valued keys
        (``references``, ``coauthors``, ``shared_with``, ``datasets``) that are missing
        or explicitly ``None`` result in empty lists.
        """
        self.pid = metadata.get('_pid')
        self.comment = metadata.get('comment')
        self.upload_time = metadata.get('_upload_time')

        uploader_id = metadata.get('_uploader')
        if uploader_id is not None:
            # NOTE(review): user objects created here carry ``id``, while the class
            # docstring speaks of ``user_id`` - confirm which one is canonical
            self.uploader = utils.POPO(id=int(uploader_id))

        # 'or []' also covers keys that are present with an explicit None value,
        # which would otherwise fail with a TypeError when iterated
        references = metadata.get('references') or []
        coauthors = metadata.get('coauthors') or []
        shared_with = metadata.get('shared_with') or []
        datasets = metadata.get('datasets') or []

        self.references = [utils.POPO(value=ref) for ref in references]
        self.with_embargo = metadata.get('with_embargo', False)
        self.coauthors = [utils.POPO(id=int(user)) for user in coauthors]
        self.shared_with = [utils.POPO(id=int(user)) for user in shared_with]
        self.datasets = [
            utils.POPO(id=int(ds['id']), doi=utils.POPO(value=ds.get('_doi')), name=ds.get('_name'))
            for ds in datasets]

    def apply_domain_metadata(self, backend):
        """
        Hook for filling domain specific metadata from the given ``backend``; not
        implemented in this general class and always raises :class:`NotImplementedError`.
        """
        raise NotImplementedError()
class DomainQuantity:
    """
    Describes further details of one domain specific metadata quantity.

    Attributes:
        name: The name of the quantity, also the key used to store values in
            :class:`CalcWithMetadata`. It is ``None`` after construction.
        description: A human friendly description. The description is used to define
            the swagger documentation on the relevant API endpoints.
        multi: Indicates a list of values. This is important for the elastic mapping.
        order_default: Indicates that this metric should be used for the default order of
            search results.
        aggregations: Indicates that search aggregations (and how many) should be provided.
            0 (the default) means no aggregations.
        metric: Indicates that this quantity should be used as search metric. Values need
            to be tuples with metric name and elastic aggregation (e.g. sum, cardinality)
        elastic_mapping: An optional elasticsearch_dsl mapping. Default is ``Keyword``.
    """

    def __init__(
            self, description: str = None, multi: bool = False, aggregations: int = 0,
            order_default: bool = False, metric: Tuple[str, str] = None,
            elastic_mapping=None):

        self.name: str = None
        self.description = description
        self.multi = multi
        self.order_default = order_default
        self.aggregations = aggregations
        self.metric = metric
        # without an explicit mapping, a Keyword is created from the given multi flag
        self.elastic_mapping = (
            elastic_mapping if elastic_mapping is not None else Keyword(multi=multi))
class Domain:
    """
    A domain defines all metadata quantities that are specific to a certain scientific
    domain, e.g. DFT calculations, or experimental material science.

    Each domain needs to define a subclass of :class:`CalcWithMetadata`. This
    class has to define the necessary domain specific metadata quantities and how these
    are filled from parser results (usually an instance of :class:`LocalBackend`).

    Furthermore, the class method :func:`register_domain` of this ``Domain`` class has
    to be used to register a domain with its ``domain_name``. This also allows to provide
    further descriptions on each domain specific quantity via instances of
    :class:`DomainQuantity`.

    Only one domain can be active: :func:`register_domain` ignores every domain whose
    ``domain_name`` differs from ``config.domain``.
    """
    domain_class: Type[CalcWithMetadata] = None
    quantities: List[DomainQuantity] = []

    @classmethod
    def register_domain(cls, domain_class: type, domain_name: str, quantities: Dict[str, DomainQuantity]):
        """
        Registers ``domain_class`` as the active domain, if ``domain_name`` matches
        ``config.domain``; otherwise nothing is registered.

        Every attribute that a fresh ``domain_class`` instance has and a fresh
        :class:`CalcWithMetadata` instance has not is treated as a domain quantity. Its
        :class:`DomainQuantity` is taken from ``quantities`` or created (and added to
        ``quantities``) if missing; ``name`` and ``multi`` are set on it.

        Arguments:
            domain_class: The class that is instantiated to find the domain quantities.
            domain_name: The name that is compared with ``config.domain``.
            quantities: Quantity definitions by quantity name, extended in place.
        """
        assert cls.domain_class is None, 'you can only define one domain.'

        if domain_name != config.domain:
            return

        cls.domain_class = domain_class

        # the attributes of default instances tell general and domain quantities apart
        domain_calc = domain_class()
        general_calc = CalcWithMetadata()

        for attribute, default in vars(domain_calc).items():
            if hasattr(general_calc, attribute):
                continue

            domain_quantity = quantities.get(attribute, None)
            if domain_quantity is None:
                domain_quantity = DomainQuantity()
                quantities[attribute] = domain_quantity

            domain_quantity.name = attribute
            domain_quantity.multi = isinstance(default, list)
            cls.quantities.append(domain_quantity)

        for key in quantities:
            assert hasattr(domain_calc, key) and not hasattr(general_calc, key), \
                'quantity does not exist or overrides general non domain quantity'

        assert any(registered.order_default for registered in Domain.quantities), \
            'you need to define a order default quantity'
......@@ -12,29 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module contains classes that allow to represent the core
nomad data entities :class:`Upload` and :class:`Calc` on a high level of abstraction
independent from their representation in the different modules
:py:mod:`nomad.processing`, :py:mod:`nomad.coe_repo`, :py:mod:`nomad.parsing`,
:py:mod:`nomad.search`, :py:mod:`nomad.api`, :py:mod:`nomad.migration`.
It is not about representing every detail, but those parts that are directly involved in
api, processing, migration, mirroring, or other 'infrastructure' operations.
Transformations between different implementations of the same entity can be build
and used. To ease the number of necessary transformations the classes
:class:`UploadWithMetadata` and :class:`CalcWithMetadata` can act as intermediate
representations. Therefore, implement only transformation from and to these
classes. These are the implemented transformations:
.. image:: datamodel_transformations.png
"""
from typing import Iterable, List, Dict
from typing import Iterable, List, Dict, Type
import datetime
from elasticsearch_dsl import Keyword
from nomad import utils
from nomad import utils, config
class UploadWithMetadata():
......@@ -60,7 +42,7 @@ class UploadWithMetadata():
class CalcWithMetadata():
"""
A dict/POPO class that can be used for mapping calc representations with calc metadata.
We have many representations of calcs and their calc metadata. To avoid implement
We have multi representations of calcs and their calc metadata. To avoid implement
mappings between all combinations, just implement mappings with the class and use
mapping transitivity. E.g. instead of A -> B, A -> this -> B.
......@@ -87,16 +69,6 @@ class CalcWithMetadata():
references: Objects describing user provided references, keys are ``id`` and ``value``.
datasets: Objects describing the datasets, keys are ``id``, ``name``, ``doi``.
DOI is optional, is an object with key ``id``, ``value``.
formula: The chemical formula
atoms: A list of all atoms, as labels. All atoms means the whole composition, with atom labels repeated.
basis_set: The basis set type of this calculation.
xc_functional: The class of functional used.
system: The system type, e.g. Atom/Molecule, 2D, Bulk(3D)
crystal_system: The symmetry describing crystal_system type.
spacegroup: The spacegroup, as spacegroup number.
code_name: The name of the used code.
code_version: The version of the used code.
"""
def __init__(self, **kwargs):
# id relevant metadata
......@@ -123,21 +95,11 @@ class CalcWithMetadata():
self.references: List[utils.POPO] = []
self.datasets: List[utils.POPO] = []
# DFT specific calc metadata, derived from raw data through successful processing
self.formula: str = None
self.atoms: List[str] = []
self.basis_set: str = None
self.xc_functional: str = None
self.system: str = None
self.crystal_system: str = None
self.spacegroup: str = None
self.spacegroup_symbol: str = None
self.code_name: str = None
self.code_version: str = None
# temporary reference to the backend after successful processing
self.backend = None
self.parser_name = None
self.update(**kwargs)
def to_dict(self):
......@@ -181,3 +143,56 @@ class CalcWithMetadata():
self.datasets = [
utils.POPO(id=int(ds['id']), doi=utils.POPO(value=ds.get('_doi')), name=ds.get('_name'))
for ds in metadata.get('datasets', [])]
def apply_domain_metadata(self, backend):
raise NotImplementedError()
class DomainQuantity:
    """
    Holds the details given for one domain specific metadata quantity: its
    ``description``, the ``multi`` and ``aggregate`` flags, and an ``elastic_mapping``.
    The ``name`` is ``None`` after construction.
    """

    def __init__(
            self, description: str = None, multi: bool = False, aggregate: bool = False,
            elastic_mapping=None):

        self.name: str = None
        self.description = description
        self.multi = multi
        # NOTE(review): presumably marks the quantity for search aggregations - confirm
        self.aggregate = aggregate
        # without an explicit mapping, a Keyword is created from the given multi flag
        self.elastic_mapping = (
            elastic_mapping if elastic_mapping is not None else Keyword(multi=multi))
class Domain:
domain_class: Type[CalcWithMetadata] = None
quantities: List[DomainQuantity] = []
@classmethod
def register_domain(cls, domain_class: type, domain_name: str, quantities: Dict[str, DomainQuantity]):
assert cls.domain_class is None, 'you can only define one domain.'
if not domain_name == config.domain:
return
cls.domain_class = domain_class
reference_domain_calc = domain_class()
reference_general_calc = CalcWithMetadata()
for name, value in reference_domain_calc.__dict__.items():
if not hasattr(reference_general_calc, name):
quantity = quantities.get(name, None)
if quantity is None:
quantity = DomainQuantity()
quantities[name] = quantity
quantity.name = name
quantity.multi = isinstance(value, list)