Commit 64b22e73 authored by Daniel Speckhard's avatar Daniel Speckhard
Browse files

Merged Migration with Parser Integration.

parents 89e2abf2 11cb9e74
......@@ -31,7 +31,7 @@ build:
stage: build
- git submodule sync
- git submodule update --init --depth 1
- git submodule update --init
- docker login -u gitlab-ci-token -p $CI_BUILD_TOKEN
- docker build --no-cache -t $TEST_IMAGE .
......@@ -44,7 +44,7 @@
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/[None-multipart-tests/data/proc/]"
"-sv", "tests/[parsers/vasp-tests/data/parsers/vasp_compressed/vasp.xml.gz]"
......@@ -52,7 +52,7 @@ RUN \
# Second, create a slim final image
FROM final
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1
RUN apt-get update && apt-get install -y --no-install-recommends libgomp1 && apt-get install -y libmagic-dev
# copy the sources for tests, coverage, qa, etc.
COPY . /app
Subproject commit f07c719036248b4d74bff67c118187cfda167f01
Subproject commit acdd533b20022130b7cf87c765596d1d91d3b0fe
......@@ -90,6 +90,9 @@ Terms:
- repo entry: Some quantities of a calculation that are used to represent that calculation in the repository.
- archive data: The normalized data of one calculation in nomad's meta-info-based format.
.. _id-reference-label:
......@@ -189,6 +189,27 @@ the *archive data* (a hierarchy of all parsed quantities), and the uploaded *raw
- Materials aggregate calculations based on common system properties
(e.g. system type, atoms, lattice, space group, etc.).
### Data
We distinguish various forms of calculation data:
- raw data: The raw files provided by nomad users
- archive data: The data extracted from raw files by nomad parsers and normalizers.
This data is represented in the *meta-info* format.
- materials data: Aggregated information about calculations that simulated the *same* material.
### Metadata
Metadata refers to the quantities/attributes that we use to represent,
identify, and index uploads and calculations in the API, search, GUI, etc.
There are three categories of metadata:
- ids: attributes that are necessary to uniquely identify entities. See also :ref:`id-reference-label`.
- user metadata: attributes provided by the user, e.g. comments, references, coauthors, datasets, etc.
- calculation metadata: metadata parsed from raw files that describe calculations on a high level, e.g. code name, basis set, system type, etc.
Those sets of metadata along with the actual raw and archive data are often transformed,
passed, stored, etc. by the various nomad modules.
.. figure:: datamodel_dataflow.png
:alt: nomad's data flow
### Implementation
The different entities have often multiple implementations for different storage systems.
For example, aspects of calculations are stored in files (raw files, calc metadata, archive data),
......@@ -52,4 +52,12 @@ nomad.client
.. automodule:: nomad.utils
\ No newline at end of file
.. automodule:: nomad.utils
.. automodule:: nomad.migration
.. automodule:: tests
......@@ -67,34 +67,13 @@ class RepoCalcView extends React.Component {
data(quantity) {
const path = quantity.split('.')
let data = this.state.calcData
for (let i = 0; i < path.length; i++) {
if (data) {
data = data[path[i]]
return data
renderQuantity(quantity, label, defaultValue) {
const value = || defaultValue || ''
return (
<div key={quantity}>
<Typography variant="caption">{label}</Typography>
<Typography variant="body1">{value}</Typography>
render() {
const { classes, ...calcProps } = this.props
const { uploadId, calcId } = calcProps
const calcData = this.state.calcData || {}
const filePaths ='section_repository_info.repository_filepaths') || []
const mainfile ='section_calculation_info.main_file')
const filePaths = calcData.files || []
const mainfile = calcData.mainfile
const calcPath = mainfile ? mainfile.substring(0, mainfile.lastIndexOf('/')) : null
return (
......@@ -111,48 +90,48 @@ class RepoCalcView extends React.Component {
<div className={classes.quantityRow}>
<CalcQuantity label="chemical formula" typography="h4">
<div className={classes.quantityRow}>
<CalcQuantity label='dft code'>
<CalcQuantity label='dft code version'>
<div className={classes.quantityRow}>
<CalcQuantity label='basis set'>
<CalcQuantity label='xc functional'>
<div className={classes.quantityRow}>
<CalcQuantity label='system type'>
<CalcQuantity label='crystal system'>
<CalcQuantity label='spacegroup'>
<div className={classes.quantityRow}>
<CalcQuantity label='upload id'>
<CalcQuantity label='calculation id'>
<CalcQuantity label='mainfile'>
<CalcQuantity label='calculation hash'>
<Divider />
......@@ -35,11 +35,11 @@ app = Flask(
static_folder=os.path.abspath(os.path.join(os.path.dirname(__file__), '../../docs/.build/html')))
""" The Flask app that serves all APIs. """
app.config.APPLICATION_ROOT = base_path
app.config.RESTPLUS_MASK_HEADER = False
app.config.RESTPLUS_MASK_SWAGGER = False
app.config.APPLICATION_ROOT = base_path # type: ignore
app.config.RESTPLUS_MASK_HEADER = False # type: ignore
app.config.RESTPLUS_MASK_SWAGGER = False # type: ignore
app.config.SWAGGER_UI_OPERATION_ID = True # type: ignore
app.config.SWAGGER_UI_REQUEST_DURATION = True # type: ignore
def api_base_path_response(env, resp):
......@@ -49,7 +49,7 @@ def api_base_path_response(env, resp):'utf-8')]
app.wsgi_app = DispatcherMiddleware(
app.wsgi_app = DispatcherMiddleware( # type: ignore
api_base_path_response, { app.wsgi_app})
......@@ -135,6 +135,7 @@ user_model = api.model('User', {
'last_name': fields.String(description='The user\'s last name'),
'email': fields.String(description='Guess what, the user\'s email'),
'affiliation': fields.String(description='The user\'s affiliation'),
'password': fields.String(description='The bcrypt 2y-indented password for initial and changed password'),
'token': fields.String(
description='The access token that authenticates the user with the API. '
'User the HTTP header "X-Token" to provide it in API requests.')
......@@ -161,6 +162,54 @@ class UserResource(Resource):
message='User not logged in, provide credentials via Basic HTTP authentication.')
@api.marshal_with(user_model, skip_none=True, code=200, description='User created')
def put(self):
Creates a new user account. Currently only the admin user is allows. The
NOMAD-CoE repository GUI should be used to create user accounts for now.
Passwords have to be encrypted by the client with bcrypt and 2y indent.
if not g.user.is_admin:
abort(401, message='Only the admin user can perform create user.')
data = request.get_json()
if data is None:
data = {}
for required_key in ['last_name', 'first_name', 'password', 'email']:
if required_key not in data:
abort(400, message='The %s is missing' % required_key)
user = coe_repo.User.create_user(
email=data['email'], password=data.get('password', None), crypted=True,
first_name=data['first_name'], last_name=data['last_name'],
affiliation=data.get('affiliation', None))
return user, 200
@api.marshal_with(user_model, skip_none=True, code=200, description='User updated')
def post(self):
Allows to edit the authenticated user and change his password. Password
have to be encrypted by the client with bcrypt and 2y indent.
data = request.get_json()
if data is None:
data = {}
if 'email' in data:
abort(400, message='Cannot change the users email.')
g.user.update(crypted=True, **data)
return g.user, 200
token_model = api.model('Token', {
'user': fields.Nested(user_model),
......@@ -51,20 +51,26 @@ proc_model = api.model('Processing', {
'process_running': fields.Boolean,
dataset_model = api.model('DataSet', {
'id': fields.Integer(required=True, description='The repository db dataset id'),
'_doi': fields.String(description='The DOI of the dataset'),
'_name': fields.String(description='The unique dataset name')
metadata_model = api.model('MetaData', {
'with_embargo': fields.Boolean(default=False, description='Data with embargo is only visible to the upload until the embargo period ended.'),
'comment': fields.String(description='The comment are shown in the repository for each calculation.'),
'references': fields.List(fields.String, descriptions='References allow to link calculations to external source, e.g. URLs.'),
'coauthors': fields.List(fields.String, description='A list of co-authors given by user_id.'),
'shared_with': fields.List(fields.String, description='A list of users to share calculations with given by user_id.'),
'coauthors': fields.List(fields.Integer, description='A list of co-authors given by user_id.'),
'shared_with': fields.List(fields.Integer, description='A list of users to share calculations with given by user_id.'),
'_upload_time': fields.DateTime(dt_format='iso8601', description='Overrride the upload time.'),
'_uploader': fields.String(description='Override the uploader with the given user id.')
'_uploader': fields.Integer(description='Override the uploader with the given user id.'),
'datasets': fields.List(fields.Nested(model=dataset_model), description='A list of datasets.')
calc_metadata_model = api.inherit('CalcMetaData', metadata_model, {
'mainfile': fields.String(description='The calculation main output file is used to identify the calculation in the upload.'),
'_checksum': fields.String(description='Override the calculation checksum'),
'_pid': fields.String(description='Assign a specific pid. It must be unique.')
'_pid': fields.Integer(description='Assign a specific pid. It must be unique.')
upload_metadata_model = api.inherit('UploadMetaData', metadata_model, {
......@@ -20,7 +20,7 @@ from typing import Union, Callable, cast
from nomad import config, utils
from nomad.files import ArchiveBasedStagingUploadFiles
from nomad.parsing import parsers, parser_dict, LocalBackend
from nomad.parsing import parser_dict, LocalBackend, match_parser
from nomad.normalizing import normalizers
from .main import cli, api_base
......@@ -91,11 +91,7 @@ class CalcProcReproduction:
if parser_name is not None:
parser = parser_dict.get(parser_name)
for potential_parser in parsers:
with self.upload_files.raw_file(self.mainfile) as mainfile_f:
if potential_parser.is_mainfile(self.mainfile, lambda fn: mainfile_f):
parser = potential_parser
parser = match_parser(self.mainfile, self.upload_files)
assert parser is not None, 'there is not parser matching %s' % self.mainfile
self.logger = self.logger.bind( # type: ignore
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import requests
import click
......@@ -73,6 +74,8 @@ def cli(host: str, port: int, verbose: bool, user: str, password: str):
config.console_log_level = logging.WARNING
config.service = os.environ.get('NOMAD_SERVICE', 'client')
global api_base
api_base = 'http://%s:%d/nomad/api' % (host, port)
......@@ -24,6 +24,8 @@ from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import BYTEA
from sqlalchemy.ext.declarative import declarative_base
from nomad import utils
Base = declarative_base()
......@@ -139,3 +141,6 @@ class Citation(Base): # type: ignore
citation_id = Column(Integer, primary_key=True)
value = Column(String)
kind = Column(Enum('INTERNAL', 'EXTERNAL', name='citation_kind_enum'))
def to_popo(self) -> utils.POPO:
return utils.POPO(id=self.citation_id, value=self.value)
......@@ -17,17 +17,19 @@ import json
from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import relationship, aliased
from sqlalchemy.sql.expression import literal
from datetime import datetime
from nomad import infrastructure, datamodel
from nomad import infrastructure, utils
from nomad.datamodel import CalcWithMetadata
from . import base
from .user import User
from .base import Base, calc_citation_association, ownership, co_authorship, shareship, \
Tag, Topics, CalcSet, calc_dataset_containment, Citation
Tag, Topics, CalcSet, calc_dataset_containment, Citation, Spacegroup, CalcMetaData, \
CodeVersion, StructRatio, UserMetaData
class Calc(Base, datamodel.Calc): # type: ignore
class Calc(Base):
__tablename__ = 'calculations'
coe_calc_id = Column('calc_id', Integer, primary_key=True, autoincrement=True)
......@@ -61,7 +63,7 @@ class Calc(Base, datamodel.Calc): # type: ignore
return self.calc_metadata.location
def pid(self):
def pid(self) -> int:
return self.coe_calc_id
......@@ -86,13 +88,17 @@ class Calc(Base, datamodel.Calc): # type: ignore
return self.user_metadata.permission == 1
def chemical_formula(self) -> str:
def formula(self) -> str:
return self.calc_metadata.chemical_formula
def filenames(self) -> List[str]:
filenames = self.calc_metadata.filenames.decode('utf-8')
return json.loads(filenames)
def files(self) -> List[str]:
if self.calc_metadata is not None:
if self.calc_metadata.filenames is not None:
filenames = self.calc_metadata.filenames.decode('utf-8')
return json.loads(filenames)
return []
def all_datasets(self) -> List['DataSet']:
......@@ -116,7 +122,7 @@ class Calc(Base, datamodel.Calc): # type: ignore
def direct_datasets(self) -> List['DataSet']:
return [DataSet(dataset_calc) for dataset_calc in self.parents]
def set_value(self, topic_cid: int, value: str) -> None:
def _set_value(self, topic_cid: int, value: str) -> None:
if value is None:
......@@ -131,31 +137,152 @@ class Calc(Base, datamodel.Calc): # type: ignore
_dataset_cache: dict = {}
def to_calc_with_metadata(self):
def apply_calc_with_metadata(self, calc: CalcWithMetadata) -> None:
Applies the data from ``source`` to this coe Calc object.
repo_db = infrastructure.repository_db
self.checksum = calc.calc_id
source_code_version = calc.code_version # TODO shorten version names
code_version_obj = repo_db.query(CodeVersion).filter_by(content=source_code_version).first()
if code_version_obj is None:
code_version_obj = CodeVersion(content=source_code_version)
if calc.upload_time is not None:
added_time = calc.upload_time
elif self.upload is not None and self.upload.upload_time is not None:
added_time = self.upload.upload_time
added_time =
metadata = CalcMetaData(
filenames=('[%s]' % ','.join(['"%s"' % filename for filename in calc.files])).encode('utf-8'),
struct_ratio = StructRatio(
formula_units=1, nelem=len(calc.atoms))
user_metadata = UserMetaData(
permission=(1 if calc.with_embargo else 0))
spacegroup = Spacegroup(calc=self, n=calc.spacegroup)
# topic based properties
self._set_value(base.topic_code, calc.code_name)
for atom in set(calc.atoms):
self._set_value(base.topic_atoms, str(atom))
self._set_value(base.topic_system_type, calc.system)
self._set_value(base.topic_xc_treatment, calc.xc_functional)
self._set_value(base.topic_crystal_system, calc.crystal_system)
self._set_value(base.topic_basis_set_type, calc.basis_set)
# user relations
def add_users_to_relation(source_users, relation):
for source_user in source_users:
coe_user = repo_db.query(User).get(
if calc.uploader is not None:
add_users_to_relation([calc.uploader], self.owners)
elif self.upload is not None and self.upload.user is not None:
calc.uploader = self.upload.user.to_popo()
add_users_to_relation(calc.coauthors, self.coauthors)
add_users_to_relation(calc.shared_with, self.shared_with)
# datasets
for dataset in calc.datasets:
dataset_id =
coe_dataset_calc: Calc = repo_db.query(Calc).get(dataset_id)
if coe_dataset_calc is None:
coe_dataset_calc = Calc(coe_calc_id=dataset_id)
metadata = CalcMetaData(
if dataset.doi is not None:
self._add_citation(coe_dataset_calc, dataset.doi['value'], 'INTERNAL')
# cause a flush to avoid future inconsistencies
coe_dataset_calc = repo_db.query(Calc).get(dataset_id)
coe_dataset_rel = CalcSet(parent_calc_id=dataset_id, children_calc_id=self.coe_calc_id)
# references
for reference in calc.references:
self._add_citation(self, reference['value'], 'EXTERNAL')
def _add_citation(self, coe_calc: 'Calc', value: str, kind: str) -> None:
repo_db = infrastructure.repository_db
citation = repo_db.query(Citation).filter_by(value=value, kind=kind).first()
if citation is None:
citation = Citation(value=value, kind=kind)
def to_calc_with_metadata(self) -> CalcWithMetadata:
Creates a :class:`CalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Be aware that ``upload_id`` and ``calc_id``, might be old coe repository
``upload_name`` and calculation ``checksum`` depending on the context, i.e. used
result = CalcWithMetadata(
upload_id=self.upload.upload_id if self.upload else None,
calc_id=self.checksum) =
result.mainfile = self.mainfile
result.files = self.files
for topic in [tag.topic for tag in self.tags]:
if topic.cid == base.topic_code:
result.program_name = topic.topic
result.code_name = topic.topic
elif topic.cid == base.topic_basis_set_type:
result.basis_set_type = topic.topic
result.basis_set = topic.topic
elif topic.cid == base.topic_xc_treatment:
result.XC_functional_name = topic.topic
result.xc_functional = topic.topic
elif topic.cid == base.topic_system_type:
result.system_type = topic.topic
result.system = topic.topic
elif topic.cid == base.topic_atoms:
result.setdefault('atom_labels', []).append(topic.topic)
elif topic.cid == base.topic_crystal_system:
result.crystal_system = topic.topic
elif topic.cid in [1996, 1994, 703, 702, 701, 100]:
# user/author, restriction, formulas?, another category
raise KeyError('topic cid %s.' % str(topic.cid))
result.program_version = self.calc_metadata.version.content
result.chemical_composition = self.calc_metadata.chemical_formula
result.space_group_number = self.spacegroup.n
result.setdefault('atom_labels', []).sort()
result.code_version = self.calc_metadata.version.content
result.formula = self.calc_metadata.chemical_formula
result.spacegroup = self.spacegroup.n
datasets: List[DataSet] = []
for parent in self.parents:
......@@ -167,23 +294,20 @@ class Calc(Base, datamodel.Calc): # type: ignore