Commit e26e3610 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Refactored datamodel.

parent 1bd83d7f
Pipeline #43564 failed with stages
in 18 minutes and 15 seconds
......@@ -90,6 +90,9 @@ Terms:
- repo entry: Some quantities of a calculation that are used to represent that calculation in the repository.
- archive data: The normalized data of one calculation in nomad's meta-info-based format.
.. _id-reference-label:
Ids
---
......
......@@ -189,6 +189,27 @@ the *archive data* (a hierarchy of all parsed quantities), and the uploaded *raw
- Materials aggregate calculations based on common system properties
(e.g. system type, atoms, lattice, space group, etc.).
### Data
We distinguish various forms of calculation data:
- raw data: The raw files provided by nomad users.
- archive data: The data extracted from raw files by nomad parsers and normalizers.
This data is represented in the *meta-info* format.
- materials data: Aggregated information about calculations that simulated the *same* material.
### Metadata
Metadata refers to those pieces of data (quantities/attributes) that we use
to represent, identify, and index uploads and calculations in the API, search, GUI, etc.
There are three categories of metadata:
- ids: attributes that are necessary to uniquely identify entities. See also :ref:`id-reference-label`.
- user metadata: attributes provided by the user, e.g. comments, references, coauthors, datasets, etc.
- calculation metadata: metadata parsed from raw files that describe calculations on a high level, e.g. code name, basis set, system type, etc.
These sets of metadata, along with the actual raw and archive data, are often transformed,
passed, stored, etc. by the various nomad modules.
.. figure:: datamodel_dataflow.png
:alt: nomad's data flow
### Implementation
The different entities often have multiple implementations for different storage systems.
For example, aspects of calculations are stored in files (raw files, calc metadata, archive data),
......
......@@ -52,4 +52,8 @@ nomad.client
nomad.utils
-----------
.. automodule:: nomad.utils
\ No newline at end of file
.. automodule:: nomad.utils
nomad.migration
---------------
.. automodule:: nomad.migration
......@@ -67,34 +67,13 @@ class RepoCalcView extends React.Component {
})
}
data(quantity) {
const path = quantity.split('.')
let data = this.state.calcData
for (let i = 0; i < path.length; i++) {
if (data) {
data = data[path[i]]
}
}
return data
}
renderQuantity(quantity, label, defaultValue) {
const value = this.data(quantity) || defaultValue || ''
return (
<div key={quantity}>
<Typography variant="caption">{label}</Typography>
<Typography variant="body1">{value}</Typography>
</div>
)
}
render() {
const { classes, ...calcProps } = this.props
const { uploadId, calcId } = calcProps
const calcData = this.state.calcData || {}
const filePaths = this.data('section_repository_info.repository_filepaths') || []
const mainfile = this.data('section_calculation_info.main_file')
const filePaths = calcData.files || []
const mainfile = calcData.mainfile
const calcPath = mainfile ? mainfile.substring(0, mainfile.lastIndexOf('/')) : null
return (
......@@ -111,48 +90,48 @@ class RepoCalcView extends React.Component {
</Download>
<div className={classes.quantityRow}>
<CalcQuantity label="chemical formula" typography="h4">
{this.data('section_repository_info.section_repository_parserdata.repository_chemical_formula')}
{calcData.formula}
</CalcQuantity>
</div>
<div className={classes.quantityRow}>
<CalcQuantity label='dft code'>
{this.data('section_repository_info.section_repository_parserdata.repository_program_name')}
{calcData.code_name}
</CalcQuantity>
<CalcQuantity label='dft code version'>
{this.data('section_repository_info.section_repository_parserdata.repository_code_version')}
{calcData.code_version}
</CalcQuantity>
</div>
<div className={classes.quantityRow}>
<CalcQuantity label='basis set'>
{this.data('section_repository_info.section_repository_parserdata.repository_basis_set_type')}
{calcData.basis_set}
</CalcQuantity>
<CalcQuantity label='xc functional'>
{this.data('section_repository_info.section_repository_parserdata.repository_xc_treatment')}
{calcData.xc_functional}
</CalcQuantity>
</div>
<div className={classes.quantityRow}>
<CalcQuantity label='system type'>
{this.data('section_repository_info.section_repository_parserdata.repository_system_type')}
{calcData.system}
</CalcQuantity>
<CalcQuantity label='crystal system'>
{this.data('section_repository_info.section_repository_parserdata.repository_crystal_system')}
{calcData.crystal_system}
</CalcQuantity>
<CalcQuantity label='spacegroup'>
{this.data('section_repository_info.section_repository_parserdata.repository_spacegroup_nr')}
{calcData.spacegroup}
</CalcQuantity>
</div>
<div className={classes.quantityRow}>
<CalcQuantity label='upload id'>
{this.data('section_calculation_info.upload_id')}
{calcData.upload_id}
</CalcQuantity>
<CalcQuantity label='calculation id'>
{this.data('section_calculation_info.calc_id')}
{calcData.calc_id}
</CalcQuantity>
<CalcQuantity label='mainfile'>
{mainfile}
</CalcQuantity>
<CalcQuantity label='calculation hash'>
{this.data('section_calculation_info.calc_hash')}
{calcData.calc_hash}
</CalcQuantity>
</div>
<Divider />
......
......@@ -51,20 +51,26 @@ proc_model = api.model('Processing', {
'process_running': fields.Boolean,
})
# API model describing a single dataset: its repository db id (the only field
# marked as required) plus the dataset's DOI and unique name.
dataset_model = api.model('DataSet', {
'id': fields.Integer(required=True, description='The repository db dataset id'),
'_doi': fields.String(description='The DOI of the dataset'),
'_name': fields.String(description='The unique dataset name')
})
metadata_model = api.model('MetaData', {
'with_embargo': fields.Boolean(default=False, description='Data with embargo is only visible to the upload until the embargo period ended.'),
'comment': fields.String(description='The comment are shown in the repository for each calculation.'),
'references': fields.List(fields.String, descriptions='References allow to link calculations to external source, e.g. URLs.'),
'coauthors': fields.List(fields.String, description='A list of co-authors given by user_id.'),
'shared_with': fields.List(fields.String, description='A list of users to share calculations with given by user_id.'),
'coauthors': fields.List(fields.Integer, description='A list of co-authors given by user_id.'),
'shared_with': fields.List(fields.Integer, description='A list of users to share calculations with given by user_id.'),
'_upload_time': fields.DateTime(dt_format='iso8601', description='Overrride the upload time.'),
'_uploader': fields.String(description='Override the uploader with the given user id.')
'_uploader': fields.Integer(description='Override the uploader with the given user id.'),
'datasets': fields.List(fields.Nested(model=dataset_model), description='A list of datasets.')
})
calc_metadata_model = api.inherit('CalcMetaData', metadata_model, {
'mainfile': fields.String(description='The calculation main output file is used to identify the calculation in the upload.'),
'_checksum': fields.String(description='Override the calculation checksum'),
'_pid': fields.String(description='Assign a specific pid. It must be unique.')
'_pid': fields.Integer(description='Assign a specific pid. It must be unique.')
})
upload_metadata_model = api.inherit('UploadMetaData', metadata_model, {
......
......@@ -24,6 +24,8 @@ from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.dialects.postgresql import BYTEA
from sqlalchemy.ext.declarative import declarative_base
from nomad import utils
Base = declarative_base()
......@@ -140,5 +142,5 @@ class Citation(Base): # type: ignore
value = Column(String)
kind = Column(Enum('INTERNAL', 'EXTERNAL', name='citation_kind_enum'))
def to_dict(self) -> dict:
return dict(id=self.citation_id, value=self.value)
def to_popo(self) -> utils.POPO:
    """Return the id and value of this citation wrapped in a ``utils.POPO``."""
    attributes = dict(id=self.citation_id, value=self.value)
    return utils.POPO(**attributes)
......@@ -18,16 +18,17 @@ from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import relationship, aliased
from sqlalchemy.sql.expression import literal
from nomad import infrastructure, datamodel
from nomad import infrastructure, utils
from nomad.datamodel import CalcWithMetadata
from . import base
from .user import User
from .base import Base, calc_citation_association, ownership, co_authorship, shareship, \
Tag, Topics, CalcSet, calc_dataset_containment, Citation
Tag, Topics, CalcSet, calc_dataset_containment, Citation, Spacegroup, CalcMetaData, \
CodeVersion, StructRatio, UserMetaData
class Calc(Base, datamodel.Calc): # type: ignore
class Calc(Base):
__tablename__ = 'calculations'
coe_calc_id = Column('calc_id', Integer, primary_key=True, autoincrement=True)
......@@ -61,7 +62,7 @@ class Calc(Base, datamodel.Calc): # type: ignore
return self.calc_metadata.location
@property
def pid(self):
def pid(self) -> int:
return self.coe_calc_id
@property
......@@ -86,13 +87,17 @@ class Calc(Base, datamodel.Calc): # type: ignore
return self.user_metadata.permission == 1
@property
def chemical_formula(self) -> str:
def formula(self) -> str:
return self.calc_metadata.chemical_formula
@property
def filenames(self) -> List[str]:
filenames = self.calc_metadata.filenames.decode('utf-8')
return json.loads(filenames)
def files(self) -> List[str]:
    """
    The file names stored with the calc metadata, decoded from their utf-8 encoded
    JSON representation. An empty list if there is no calc metadata or no file names.
    """
    calc_metadata = self.calc_metadata
    if calc_metadata is None or calc_metadata.filenames is None:
        return []

    return json.loads(calc_metadata.filenames.decode('utf-8'))
@property
def all_datasets(self) -> List['DataSet']:
......@@ -116,7 +121,7 @@ class Calc(Base, datamodel.Calc): # type: ignore
def direct_datasets(self) -> List['DataSet']:
return [DataSet(dataset_calc) for dataset_calc in self.parents]
def set_value(self, topic_cid: int, value: str) -> None:
def _set_value(self, topic_cid: int, value: str) -> None:
if value is None:
return
......@@ -131,24 +136,129 @@ class Calc(Base, datamodel.Calc): # type: ignore
_dataset_cache: dict = {}
def to_calc_with_metadata(self):
def apply_calc_with_metadata(self, calc: CalcWithMetadata) -> None:
    """
    Applies the data from ``calc`` to this coe Calc object.

    Creates the dependent objects (code version, calc metadata, struct ratio, user
    metadata, spacegroup, topics, user relations, dataset relations, citations) and
    adds them to the repository db session. This method does not commit.

    Arguments:
        calc: The calculation with the ids and metadata to store.
    """
    repo_db = infrastructure.repository_db

    self.checksum = calc.calc_id
    source_code_version = calc.code_version  # TODO shorten version names
    code_version_obj = repo_db.query(CodeVersion).filter_by(content=source_code_version).first()
    if code_version_obj is None:
        code_version_obj = CodeVersion(content=source_code_version)
        repo_db.add(code_version_obj)

    # json.dumps escapes quotes and backslashes in file names, which plain string
    # formatting does not; compact separators and ensure_ascii=False keep the stored
    # value unchanged for file names that need no escaping
    filenames = json.dumps(
        list(calc.files), separators=(',', ':'), ensure_ascii=False).encode('utf-8')

    metadata = CalcMetaData(
        calc=self,
        added=calc.upload_time if calc.upload_time is not None else self.upload.upload_time,
        chemical_formula=calc.formula,
        filenames=filenames,
        location=calc.mainfile,
        version=code_version_obj)
    repo_db.add(metadata)

    struct_ratio = StructRatio(
        calc=self,
        chemical_formula=calc.formula,
        formula_units=1, nelem=len(calc.atoms))
    repo_db.add(struct_ratio)

    user_metadata = UserMetaData(
        calc=self,
        label=calc.comment,
        permission=(1 if calc.with_embargo else 0))
    repo_db.add(user_metadata)

    spacegroup = Spacegroup(calc=self, n=calc.spacegroup)
    repo_db.add(spacegroup)

    # topic based properties
    self._set_value(base.topic_code, calc.code_name)
    for atom in set(calc.atoms):
        self._set_value(base.topic_atoms, str(atom))
    self._set_value(base.topic_system_type, calc.system)
    self._set_value(base.topic_xc_treatment, calc.xc_functional)
    self._set_value(base.topic_crystal_system, calc.crystal_system)
    self._set_value(base.topic_basis_set_type, calc.basis_set)

    # user relations
    if calc.uploader is not None:
        uploader = repo_db.query(User).get(calc.uploader.id)
    else:
        # fall back to the user of the upload this calc belongs to
        uploader = self.upload.user

    self.owners.append(uploader)

    for coauthor in calc.coauthors:
        self.coauthors.append(repo_db.query(User).get(coauthor.id))

    for shared_with in calc.shared_with:
        self.shared_with.append(repo_db.query(User).get(shared_with.id))

    # datasets
    for dataset in calc.datasets:
        dataset_id = dataset.id
        coe_dataset = repo_db.query(Calc).get(dataset_id)
        if coe_dataset is None:
            # datasets are stored as Calc objects; create one on first use
            coe_dataset = Calc(coe_calc_id=dataset_id)
            repo_db.add(coe_dataset)

            dataset_metadata = CalcMetaData(
                calc=coe_dataset,
                added=self.upload.upload_time,
                chemical_formula=dataset.name)
            repo_db.add(dataset_metadata)

            if dataset.doi is not None:
                self._add_citation(coe_dataset, dataset.doi['value'], 'INTERNAL')

            # cause a flush to avoid future inconsistencies
            coe_dataset = repo_db.query(Calc).get(dataset_id)

        # separate name, so that the loop variable is not rebound
        containment = CalcSet(parent_calc_id=dataset_id, children_calc_id=self.coe_calc_id)
        repo_db.add(containment)

    # references
    for reference in calc.references:
        self._add_citation(self, reference['value'], 'EXTERNAL')
def _add_citation(self, coe_calc: 'Calc', value: str, kind: str) -> None:
    """
    Appends the citation with the given ``value`` and ``kind`` to the citations of
    ``coe_calc``. A citation that is not found in the repository db is created and
    added to the session first.
    """
    session = infrastructure.repository_db
    matching = session.query(Citation).filter_by(value=value, kind=kind)

    found = matching.first()
    if found is None:
        found = Citation(value=value, kind=kind)
        session.add(found)

    coe_calc.citations.append(found)
def to_calc_with_metadata(self) -> CalcWithMetadata:
"""
Creates a :class:`CalcWithMetadata` instance with UCPM ids, and all UMD/CMD.
Be aware that ``upload_id`` and ``calc_id``, might be old coe repository
``upload_name`` and calculation ``checksum`` depending on the context, i.e. used
database.
"""
result = CalcWithMetadata(
upload_id=self.upload.upload_id if self.upload else None,
calc_id=self.calc_id)
calc_id=self.checksum)
result.calc_hash = self.checksum
result.pid = self.pid
result.mainfile = self.mainfile
result.files = self.files
for topic in [tag.topic for tag in self.tags]:
if topic.cid == base.topic_code:
result.program_name = topic.topic
result.code_name = topic.topic
elif topic.cid == base.topic_basis_set_type:
result.basis_set_type = topic.topic
result.basis_set = topic.topic
elif topic.cid == base.topic_xc_treatment:
result.XC_functional_name = topic.topic
result.xc_functional = topic.topic
elif topic.cid == base.topic_system_type:
result.system_type = topic.topic
result.system = topic.topic
elif topic.cid == base.topic_atoms:
result.setdefault('atom_labels', []).append(topic.topic)
result.atoms.append(topic.topic)
elif topic.cid == base.topic_crystal_system:
result.crystal_system = topic.topic
elif topic.cid in [1996, 1994, 703, 702, 701, 100]:
......@@ -157,10 +267,10 @@ class Calc(Base, datamodel.Calc): # type: ignore
else:
raise KeyError('topic cid %s.' % str(topic.cid))
result.program_version = self.calc_metadata.version.content
result.chemical_composition = self.calc_metadata.chemical_formula
result.space_group_number = self.spacegroup.n
result.setdefault('atom_labels', []).sort()
result.code_version = self.calc_metadata.version.content
result.formula = self.calc_metadata.chemical_formula
result.spacegroup = self.spacegroup.n
result.atoms.sort()
datasets: List[DataSet] = []
for parent in self.parents:
......@@ -172,25 +282,22 @@ class Calc(Base, datamodel.Calc): # type: ignore
datasets.extend(parents)
result.pid = self.pid
result.uploader = self.uploader.to_dict()
result.uploader = self.uploader.to_popo()
result.upload_time = self.calc_metadata.added
result.datasets = list(
dict(id=ds.id, dois=ds.dois, name=ds.name)
utils.POPO(id=ds.id, doi=ds.doi.to_popo(), name=ds.name)
for ds in datasets)
result.with_embargo = self.with_embargo
result.comment = self.comment
result.references = list(
citation.to_dict() for citation in self.citations
citation.to_popo() for citation in self.citations
if citation.kind == 'EXTERNAL')
result.coauthors = list(user.to_dict() for user in self.coauthors)
result.shared_with = list(user.to_dict() for user in self.shared_with)
result.coauthors = list(user.to_popo() for user in self.coauthors)
result.shared_with = list(user.to_popo() for user in self.shared_with)
return result
CalcWithMetadata.register_mapping(Calc, Calc.to_calc_with_metadata)
class DataSet:
def __init__(self, dataset_calc: Calc) -> None:
self._dataset_calc = dataset_calc
......@@ -200,10 +307,15 @@ class DataSet:
return self._dataset_calc.coe_calc_id
@property
def dois(self) -> List[Citation]:
return list(
citation.to_dict() for citation in self._dataset_calc.citations
if citation.kind == 'INTERNAL')
def doi(self) -> Citation:
    """
    The last citation of kind ``INTERNAL`` of the dataset calc, or ``None`` if there
    is no such citation. A warning is logged for each additional ``INTERNAL``
    citation.
    """
    internal = [
        citation for citation in self._dataset_calc.citations
        if citation.kind == 'INTERNAL']

    # one warning per citation beyond the first
    for _ in internal[1:]:
        utils.get_logger(__name__).warning(
            'dataset with multiple dois', dataset_id=self.id)

    return internal[-1] if internal else None
@property
def name(self):
......
......@@ -47,14 +47,12 @@ import datetime
from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
from sqlalchemy.orm import relationship
from nomad import utils, infrastructure, datamodel
from nomad.datamodel import CalcWithMetadata
from nomad import utils, infrastructure
from nomad.datamodel import UploadWithMetadata
from . import base
from .user import User
from .calc import Calc
from .base import Base, CalcMetaData, UserMetaData, StructRatio, CodeVersion, Spacegroup, \
CalcSet, Citation
from .base import Base
from .user import User
class UploadMetaData:
......@@ -79,7 +77,7 @@ class UploadMetaData:
return self._calc_data.get(mainfile, self._upload_data)
class Upload(Base, datamodel.Upload): # type: ignore
class Upload(Base): # type: ignore
__tablename__ = 'uploads'
coe_upload_id = Column('upload_id', Integer, primary_key=True, autoincrement=True)
......@@ -91,10 +89,6 @@ class Upload(Base, datamodel.Upload): # type: ignore
user = relationship('User')
calcs = relationship('Calc')
@classmethod
def load_from(cls, obj):
return Upload.from_upload_id(str(obj.upload_id))
@staticmethod
def from_upload_id(upload_id: str) -> 'Upload':
repo_db = infrastructure.repository_db
......@@ -107,7 +101,7 @@ class Upload(Base, datamodel.Upload): # type: ignore
return self.upload_name
@property
def uploader(self) -> 'User':
def uploader(self) -> User:
return self.user
@property
......@@ -115,21 +109,15 @@ class Upload(Base, datamodel.Upload): # type: ignore
return self.created
@staticmethod
def add(upload: datamodel.Upload, metadata: dict = {}) -> int:
def add(upload: UploadWithMetadata) -> int:
"""
Add the upload to the NOMAD-coe repository db. It creates an
uploads-entry, respective calculation and property entries. Everything in one
transaction.
Triggers and updates the NOMAD-coe repository elastic search index after
success (TODO).
Arguments:
upload: The upload to add.
upload_metadata: A dictionary with additional meta data (e.g. user provided
meta data) that should be added to upload and calculations.
upload: The upload to add, including calculations with respective IDs, UMD, CMD.
"""
upload_metadata = UploadMetaData(metadata)
assert upload.uploader is not None
repo_db = infrastructure.repository_db
......@@ -143,20 +131,24 @@ class Upload(Base, datamodel.Upload): # type: ignore
# create upload
coe_upload = Upload(
upload_name=upload.upload_id,
created=metadata.get('_upload_time', upload.upload_time),
user=upload.uploader,
created=upload.upload_time,
user_id=upload.uploader.id,
is_processed=True)
repo_db.add(coe_upload)
# add calculations and metadata
calcs = []
has_calcs = False
for calc in upload.calcs:
calcs.append(
coe_upload._add_calculation(
calc.to(CalcWithMetadata), upload_metadata.get(calc.mainfile)))
has_calcs = True
coe_calc = Calc(
coe_calc_id=calc.pid,
checksum=calc.calc_id,
upload=coe_upload)
repo_db.add(coe_calc)
coe_calc.apply_calc_with_metadata(calc)
# commit
if len(calcs) > 0:
if has_calcs:
# empty upload case
repo_db.commit()
result = coe_upload.coe_upload_id
......@@ -167,114 +159,4 @@ class Upload(Base, datamodel.Upload): # type: ignore
repo_db.rollback()
raise e
# TODO trigger index update
pass
return result
def _add_calculation(self, calc: CalcWithMetadata, calc_metadata: dict) -> Calc:
repo_db = infrastructure.repository_db
# table based properties
coe_calc_id = calc_metadata.get('_pid', None)
coe_calc = Calc(
coe_calc_id=coe_calc_id,
checksum=calc_metadata.get('_checksum', calc.calc_hash),
upload=self)
repo_db.add(coe_calc)
program_version = calc.program_version # TODO shorten version names
code_version = repo_db.query(CodeVersion).filter_by(content=program_version).first()
if code_version is None:
code_version = CodeVersion(content=program_version)
repo_db.add(code_version)
metadata = CalcMetaData(