Commit 493cb626 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Added more conceptual work for the new meta-info.

parent db8d3e37
This example shows how to read files from many sources (here .tar.gz files),
chunk the data into even sized uploads and upload/process them in parallel.
from bravado.requests_client import RequestsClient
from bravado.client import SwaggerClient
from urllib.parse import urlparse
import time
import os.path
import sys
# config
nomad_url = ''  # base URL of the target NOMAD API installation — TODO fill in
user = ''  # NOMAD account name used for basic auth — TODO fill in
password = 'password'
# Target size of each upload chunk in bytes; the commented value is the
# intended production size (32 GiB), the active one is small for testing.
approx_upload_size = 1 * 1024 # 32 * 1024^3
parallel_uploads = 6  # number of uploads processed concurrently
# create the bravado client
# Basic auth in bravado's RequestsClient is registered per host, so extract
# the bare hostname (without port) from the configured URL.
host = urlparse(nomad_url).netloc.split(':')[0]
http_client = RequestsClient()
http_client.set_basic_auth(host, user, password)
# Build the API client from the server's own swagger spec; all endpoint
# groups (client.uploads, etc.) are generated from that spec at runtime.
client = SwaggerClient.from_url('%s/swagger.json' % nomad_url, http_client=http_client)
def source_generator():
    """Yield all data sources.

    Each yielded item is the absolute path of an example ``.tar.gz``
    archive located next to this script.
    """
    # Defect fixed: the extraction stripped the docstring quotes and the
    # body indentation, leaving invalid Python; restored here.
    yield os.path.join(os.path.dirname(__file__), 'example-1.tar.gz')
    yield os.path.join(os.path.dirname(__file__), 'example-2.tar.gz')
    yield os.path.join(os.path.dirname(__file__), 'example-3.tar.gz')
def source_file_generator(source):
Yields [filepath, file] tuples from :func:`source_generator`
def calc_generator(source_file):
def upload_generator(zip_streams):
Yields nomad uploads that are already uploaded, but are still processing.
size = 0
streamed_files: Set[str] = set()
def generator():
""" Stream a zip file with all files using zipstream. """
def iterator():
Replace the directory based iter of zipstream with an iter over all given
for zipped_filename, upload_filename, upload_files in files:
if zipped_filename in streamed_files:
# Write a file to the zipstream.
with upload_files.raw_file(upload_filename, 'rb') as f:
def iter_content():
while True:
data = * 64)
if not data:
yield data
yield dict(arcname=zipped_filename, iterable=iter_content())
except KeyError:
# files that are not found, will not be returned
except Restricted:
# due to the streaming nature, we cannot raise 401 here
# we just leave it out in the download
compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
zip_stream = zipstream.ZipFile(mode='w', compression=compression, allowZip64=True)
zip_stream.paths_to_write = iterator()
for chunk in zip_stream:
yield chunk
response = Response(stream_with_context(generator()), mimetype='application/zip')
response.headers['Content-Disposition'] = 'attachment; filename={}'.format(zipfile_name)
return response
def upload_and_process():
Uses the chain of generators to upload data sequentially, awaits processing in
upload_file = os.path.join(os.path.dirname(__file__), '')
# upload data
print('uploading a file with "external_id/AcAg/vasp.xml" inside ...')
with open(upload_file, 'rb') as f:
upload = client.uploads.upload(file=f).response().result
print('processing ...')
while upload.tasks_running:
upload = client.uploads.get_upload(upload_id=upload.upload_id).response().result
print('processed: %d, failures: %d' % (upload.processed_calcs, upload.failed_calcs))
# check if processing was a success
if upload.tasks_status != 'SUCCESS':
print('something went wrong')
print('errors: %s' % str(upload.errors))
# delete the unsuccessful upload
# publish data
print('publishing ...')
client.uploads.exec_upload_operation(upload_id=upload.upload_id, payload={
'operation': 'publish',
'metadata': {
# these metadata are applied to all calcs in the upload
'comment': 'Data from a cool external project',
'references': [''],
'calculations': [
# these metadata are only applied to the calc identified by its 'mainfile'
'mainfile': 'external_id/AcAg/vasp.xml',
# 'coauthors': [''], this does not YET work with emails,
# Currently you have to use user_ids: leonard (the uploader, who is automatically an author) is 2 and sheldon is 1.
# Ask NOMAD developers about how to find out about user_ids.
'coauthors': [1],
# If users demand, we can implement a specific metadata keys (e.g. 'external_id', 'external_url') for external projects.
# This could allow to directly search for, or even have API endpoints that work with external_ids
# 'external_id': 'external_id',
# 'external_url': ''
while upload.process_running:
upload = client.uploads.get_upload(upload_id=upload.upload_id).response().result
if upload.tasks_status != 'SUCCESS' or len(upload.errors) > 0:
print('something went wrong')
print('errors: %s' % str(upload.errors))
# delete the unsuccessful upload
# search for data
result =['external_id']).response().result
if == 0:
print('not found')
elif > 1:
print('my ids are not specific enough, bummer ... or did I uploaded stuff multiple times?')
# The results key holds an array with the current page data
print('Found the following calcs for my "external_id".')
print(', '.join(calc['calc_id'] for calc in result.results))
# download data
calc = result.results[0]
client.raw.get(upload_id=calc['upload_id'], path=calc['mainfile']).response()
print('Download of first calc works.')
# download urls, e.g. for curl
print('Possible download URLs are:')
print('%s/raw/%s/%s' % (nomad_url, calc['upload_id'], calc['mainfile']))
print('%s/raw/%s/%s/*' % (nomad_url, calc['upload_id'], os.path.dirname(calc['mainfile'])))
# direct download urls without having to search before
print('%s/raw/query?paths=external_id' % nomad_url)
**! This is not yet aligned with the ideas of !**
# A new metainfo schema, interface, and *file formats* support
This is a design document (later documentation) for a re-implementation of nomad's old
# NOMAD MetaInfo
## History
The NOMAD MetaInfo was devised within the first NOMAD CoE; over 2000 quantities have
been defined in this *old MetaInfo*. The experience with this system revealed the following drawbacks:
- The Python libraries that allow to use the MetaInfo are non pythonic and incomplete.
- The MetaInfo is only used for the archive, not for the encyclopedia and repository data.
- There is no direct support to map MetaInfo definitions to DI technologies (databases, search indices, APIs).
- There is no support for namespaces. MetaInfo names are cumbersome. This will not scale
to expected levels of FAIRmat metadata.
- MetaInfo packages are not version controlled. They are part of the same git and do not belong to the independently evolving parsers. This does not allow for "external" parser development and makes it hard to keep versions consistent.
- The MetaInfo is defined in JSON. The syntax is inadequate, checks are not immediate.
Attempts to revise the MetaInfo have failed in the past.
## Goals
### Common language to define physics (meta-)data quantities and their relationships
The *physics quantities* part includes
- each quantity has a physics *unit*
- *shapes* that precisely define vectors, matrices, tensors, and their dimensions
The *relationship* parts entails:
- hierarchies for quantity *values* (e.g. *sections*)
- hierarchies for quantity *definition* (e.g. *categories*, former *abstract types*)
- *derived* quantities that can be computed from other quantities
- *synonyms* as a special trivial case for derived quantities
- *shapes* might also define a type of relationship through one quantity being the dimension of another
In addition there are the *typical* data-type definition (schema, ontology, ...) features:
- names/namespaces
- modularization (i.e. Metainfo packages)
- documentation
- basic primitive types (int, string, bool)
- simple compound types (lists, dictionaries, unions)
- references between data objects
### Complex, evolving, extendable packages of quantities
There are a lot of quantities, and they need to be organized. There are three mechanisms
to organize quantities:
- *Packages* (a.k.a modules) allow to modularize large sets of quantities, e.g. one package per code
- *Sections* allow to organize quantity values into containment (a.k.a whole-part, parent-child) hierarchies, e.g. `system` *contains* all quantity values that describe the simulated system.
- *Categories* allow to organize quantity definitions via generalization (a.k.a specialization, inheritance) relationships, e.g. `atom_labels` and `formula_hill` (*special*) both express `chemical_composition` (*general*)
Quantities and their relationships change over time. This requires (at least) a versioning mechanism to track changes and reason whether a piece of data adheres to a certain version of the MetaInfo or not.
The MetaInfo needs to be extendable. It must be possible to add *packages*, quantities in new *packages* must be addable to existing sections and categories. Existing sections must be extendable. It must be possible to develop and version packages independently.
### Mappings to DI technologies
The core of the MetaInfo is about defining data and their physics. But in the end, the data needs to be managed with DI components, such as file formats, databases, search indices, ontology tools, APIs, GUIs, programming languages, etc. While all these tools come with their own ways of defining data, it can be cumbersome to manually map the MetaInfo to the corresponding DI technology. Furthermore, this usually comprises both mapping definitions and transforming values.
The MetaInfo will allow for quantity *annotations*. Annotations allow to add additional
information to quantity definitions that carry the necessary information to automatically map/transform definitions and their values to underlying DI components. Annotations can be easily stripped/filtered to present the MetaInfo either clean or under technology specific lenses.
### Intuitive programming interface to create, access, and use (meta-)data defined with the NOMAD MetaInfo
While MetaInfo definitions and MetaInfo values should have a *native* serialization format (JSON), the primary interface to deal with definitions and data should be made from programming language (Python) primitives. By the way, both things are basically just mappings from the logical MetaInfo into concrete technologies (i.e. JSON, Python).
As a programming language, Python has a far richer set of syntax to define and use data
than JSON has. We should use this. It was not used for definitions in the NOMAD CoE, and
the *backend*s for data were designed for creating data only and not very *pythonic*.
## Concepts for a new NOMAD MetaInfo
### Definition
`Definition` is the abstract base for all definitions in the MetaInfo.
### Attributes:
- `name`, a string
- `description`, a string
- `links`, a list of URLs
- `annotations`, a list of `Annotations`
### Property
`Property` is a special `Definition` and an abstract base for section properties.
#### Attributes
- `section`, a reference to a section definition
### Quantities (incl. dimensions)
A `Quantity` definition is a special and concrete `Property` definition:
#### Attributes
- `shape`, a list of either `int`, references to a dimension (quantity definition), or limits definitions (e.g. `'1..n'`, `'0..n'`.)
- `type`, a primitive or Enum type
- `categories`, a list of references to category definitions
- `section`, a reference to the parent section definition
- `unit`, a (computed) unit, e.g. `units.F * units.m`
- `derived_from`, a list of references to other quantity definitions
- `synonym`, a reference to another quantity definition
`Quantity` definitions are mapped to Python *descriptors*. *Dimensions* are quantity definitions
with empty shape and int type.
#### Constraints
- `synonym`, `derived_from`, and dimensions come from the same section
### Sections (incl. references)
A `Section` is a special and concrete `Definition`.
#### Attributes
- `adds_to`, a reference to another section definition. All quantities of this *pseudo* section are added to the given section.
- `parent_section`, a reference to another section definition
- `repeats`, a boolean
- `extends`, list of reference to other section definitions. This section automatically inherits all quantities of the other sections. (Might not be necessary)
#### Constraints
- `parent_section` is not circular
- `extends` is not circular
- `adds_to` is not circular
- all quantities that have *this* (or an `extends`) section as `section` have unique names
`Section`s are mapped to Python classes/objects.
### Categories
A `Category` is a special `Definition`.
#### Attributes
- `super_categories`, a list of references to other category definitions
#### Constraints
- `super_categories` is not circular
### Packages
A `Package` is a special `Definition` that contains definitions. `Packages` are mapped
to Python modules.
### References
A `Reference` is a special `Property`.
#### Attributes
- `referenced_section`, reference to a section definition
### Annotations
Arbitrary serializable objects that can contain additional information.
## Examples (of the Python interface)
### Definitions
This could be code, from a python module that represents the NOMAD *common* package `nomad.metainfo.common`:
class System(MetainfoObject):
The system is ...
m_definition = Section(parent_section=Run, repeats=True)
n_atoms = Quantity(type=int, derived_from='atom_labels')
atom_labels = Quantity(
Atom labels are ...
formula_hill = Quantity(type=str, derived_from=['atom_labels'])
atom_species = Quantity(shape=['n_atoms'], type=int, derived_from='atom_labels')
atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m)
cell = Quantity(shape=[3, 3], type=float, unit=units.m)
lattice_vectors = Quantity(synonym='cell')
pbc = Quantity(shape=[3], type=bool)
# Not sure if this should be part of the definition. It will not serialize to
# JSON. It might get complex for more involved cases. In many cases, we would
# need both directions anyways. On the other hand, it allows to formally define
# the derive semantics.
def m_derive_atom_species(self) -> List[int]:
return [[label] for label in self.atom_labels]
def m_derive_n_atoms(self) -> int:
return len(self.atom_labels)
This could be part of the VASP source code:
class Method(MetainfoObject):
m_definition = Section(adds_to=nomad.metainfo.common.Method)
incar_nbands = Quantity(
type=int, links=[''])
### (Meta-)data
from nomad.metainfo.common import Run, System
run = Run()
system = run.m_create(System)
system.atom_labels = ['H', 'H', 'O']
system.atom_positions = [[0, 0, 0], [1, 0, 0], [0.5, 0.5, 0]]
system.cell = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
system.pbc = [False, False, False]
print(system.atom_species) # [1, 1, 96]
# Glossary
A list of words with very specific and precise meaning. This meaning might not yet be
fully expressed, but it's there.
- annotation
- category
- derived quantity
- dimension
- new MetaInfo
- old MetaInfo
- package
- pythonic
- quantity
- reference
- section
- shape
- synonym
- unit
Some playground to try the ideas.
class MObject:
def __init__(self, m_definition: 'MElementDef', m_section: 'MSection' = None):
Some playground to try the ideas.
from typing import Dict, List, Any, Union, Type
import json
......@@ -13,6 +17,7 @@ units = Units()
class Definition():
m_definition: Any = None
......@@ -22,17 +27,17 @@ class Property(Definition):
class Quantity(Property):
def __init__(
name: str = None,
description: str = None,
name: str = None,
description: str = None,
parent_section: 'Section' = None,
shape: List[Union[str, int]] = [],
type: type = None,
shape: List[Union[str, int]] = [],
type: Union['Enum', type] = None,
unit: str = None,
derived: bool = False,
repeats: bool = False,
synonym: str = None): = name
self.parent_section = parent_section.m_definition if parent_section is not None else None
self.derived = derived
......@@ -45,10 +50,10 @@ class Quantity(Property):
if self.derived:
derive_method = getattr(obj, 'm_derive_%s' %, None)
if derive_method is None:
raise KeyError('Derived quantity %s is not implemented' %
return derive_method()
......@@ -58,7 +63,7 @@ class Quantity(Property):
return obj.m_data.get(, None)
def __set__(self, obj: 'MetainfoObject', value: Any):
def __set__(self, obj: 'MetainfoObject', value: Any):
obj.m_data[] = value
def __delete__(self, obj: 'MetainfoObject'):
......@@ -76,10 +81,10 @@ class Section(Definition):
def __init__(
name: str = None,
parent_section: 'Section' = None,
repeats: bool = False,
extends: 'Section' = None,
adds_to: 'Section' = None):
adds_to=None): = name
self.parent_section = parent_section.m_definition if parent_section is not None else None
......@@ -112,9 +117,9 @@ class MetainfoObjectMeta(type):
if isinstance(value, Property): = name
value.parent_section = cls.m_definition
cls = super().__new__(cls, cls_name, bases, dct)
if cls.m_definition is not None:
if is None: = cls_name
......@@ -122,15 +127,16 @@ class MetainfoObjectMeta(type):
return cls
class MetainfoObject(metaclass=MetainfoObjectMeta):
class MetainfoObject(metaclass=MetainfoObjectMeta):
Base class for all
Base class for all
m_definition: Any = None
def __init__(self):
self.m_data = dict(
def m_create(self, section_definition: Type['MSection'], *args, **kwargs) -> 'MSection':
def m_create(self, section_definition: Any, *args, **kwargs) -> Any:
Creates a sub section of the given section definition.
......@@ -142,7 +148,7 @@ class MetainfoObject(metaclass=MetainfoObjectMeta):
# TODO test overwrite
self.m_data[] = sub_section
return sub_section
def m_get_definition(self, name):
......@@ -199,7 +205,7 @@ class System(MetainfoObject):
atom_species = Quantity(shape=['n_atoms'], type=int, derived=True)
atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m)
atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m)
cell = Quantity(shape=[3, 3], type=float, unit=units.m)
lattice_vectors = Quantity(synonym='cell')
......@@ -236,4 +242,4 @@ print(system.__class__.m_definition)
\ No newline at end of file
......@@ -72,18 +72,17 @@ GET http://localhost:19200/fairdi_nomad_prod/_search HTTP/1.1
Content-Type: application/json
"size": 0,
"size": 3,
"query": {
"bool": {
"must": [
{ "match": { "code_name": "Phonopy" } },
{ "match": { "published": false}}
{ "match": { "code_name": "DL_POLY" } }
"aggs": {
"upload_id": {
"sum": {
"terms": {
"field": "n_calculations"
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment