From 493cb626242ee250a00a08c41765864a77ff9be9 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus@dhcp-46-238.physik.hu-berlin.de>
Date: Mon, 16 Sep 2019 16:04:26 +0200
Subject: [PATCH] Added more conceptual work for the new meta-info.

---
 .../externalproject.py                        | 184 ++++++++++++++
 nomad/metainfo/{README.md => API_CONCEPT.md}  |   2 +
 nomad/metainfo/CONCEPT.md                     | 240 ++++++++++++++++++
 nomad/metainfo/bootstrap.py                   |   4 +
 nomad/metainfo/metainfo.py                    |  46 ++--
 ops/scripts/misc.http                         |   7 +-
 6 files changed, 459 insertions(+), 24 deletions(-)
 create mode 100644 examples/external_project_parallel_upload/externalproject.py
 rename nomad/metainfo/{README.md => API_CONCEPT.md} (99%)
 create mode 100644 nomad/metainfo/CONCEPT.md

diff --git a/examples/external_project_parallel_upload/externalproject.py b/examples/external_project_parallel_upload/externalproject.py
new file mode 100644
index 0000000000..26c5d920c9
--- /dev/null
+++ b/examples/external_project_parallel_upload/externalproject.py
@@ -0,0 +1,184 @@
+"""
+This example shows how to read files from many sources (here .tar.gz files),
+chunk the data into even sized uploads and upload/process them in parallel.
+"""
+
+from bravado.requests_client import RequestsClient
+from bravado.client import SwaggerClient
+from urllib.parse import urlparse
+import time
+import os.path
+import sys
+
+# config
+nomad_url = 'http://labdev-nomad.esc.rzg.mpg.de/fairdi/nomad/testing/api'
+user = 'leonard.hofstadter@nomad-fairdi.tests.de'
+password = 'password'
+approx_upload_size = 1 * 1024  # 32 * 1024^3
+parallel_uploads = 6
+
+# create the bravado client
+host = urlparse(nomad_url).netloc.split(':')[0]
+http_client = RequestsClient()
+http_client.set_basic_auth(host, user, password)
+client = SwaggerClient.from_url('%s/swagger.json' % nomad_url, http_client=http_client)
+
+def source_generator():
+    """
+    Yields all data sources.
+    """
+    yield os.path.join(os.path.dirname(__file__), 'example-1.tar.gz')
+    yield os.path.join(os.path.dirname(__file__), 'example-2.tar.gz')
+    yield os.path.join(os.path.dirname(__file__), 'example-3.tar.gz')
+
+def source_file_generator(source):
+    """
+    Yields [filepath, file] tuples from :func:`source_generator`
+    """
+    pass
+
+def calc_generator(source_file):
+    pass
+
+def upload_generator(zip_streams):
+    """
+    Is meant to yield nomad uploads that are already uploaded, but are still processing.
+    """
+    size = 0  # NOTE(review): not read anywhere below
+    streamed_files: Set[str] = set()
+    # NOTE(review): placeholder body; files, compress, zipfile_name, Response, ... are not defined in this example
+    def generator():
+        """ Stream a zip file with all files using zipstream. """
+        def iterator():
+            """
+            Replace the directory based iter of zipstream with an iter over all given
+            files.
+            """
+            for zipped_filename, upload_filename, upload_files in files:
+                if zipped_filename in streamed_files:
+                    continue
+                streamed_files.add(zipped_filename)
+
+                # Write a file to the zipstream.
+                try:
+                    with upload_files.raw_file(upload_filename, 'rb') as f:
+                        def iter_content():
+                            while True:
+                                data = f.read(1024 * 64)
+                                if not data:
+                                    break
+                                yield data
+
+                        yield dict(arcname=zipped_filename, iterable=iter_content())
+                except KeyError:
+                    # files that are not found, will not be returned
+                    pass
+                except Restricted:
+                    # due to the streaming nature, we cannot raise 401 here
+                    # we just leave it out in the download
+                    pass
+
+        compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
+        zip_stream = zipstream.ZipFile(mode='w', compression=compression, allowZip64=True)
+        zip_stream.paths_to_write = iterator()
+
+        for chunk in zip_stream:
+            yield chunk
+
+    response = Response(stream_with_context(generator()), mimetype='application/zip')
+    response.headers['Content-Disposition'] = 'attachment; filename={}'.format(zipfile_name)
+    return response
+
+
+def upload_and_process():
+    """
+    Uses the chain of generators to upload data sequentially, awaits processing in
+    parallel.
+    """
+
+upload_file = os.path.join(os.path.dirname(__file__), 'external_project_example.zip')
+
+
+
+# upload data
+print('uploading  a file with "external_id/AcAg/vasp.xml" inside ...')
+with open(upload_file, 'rb') as f:
+    upload = client.uploads.upload(file=f).response().result
+
+print('processing ...')
+while upload.tasks_running:
+    upload = client.uploads.get_upload(upload_id=upload.upload_id).response().result
+    time.sleep(5)
+    print('processed: %d, failures: %d' % (upload.processed_calcs, upload.failed_calcs))
+
+# check if processing was a success
+if upload.tasks_status != 'SUCCESS':
+    print('something went wrong')
+    print('errors: %s' % str(upload.errors))
+    # delete the unsuccessful upload
+    client.uploads.delete_upload(upload_id=upload.upload_id).response().result
+    sys.exit(1)
+
+# publish data
+print('publishing ...')
+client.uploads.exec_upload_operation(upload_id=upload.upload_id, payload={
+    'operation': 'publish',
+    'metadata': {
+        # these metadata are applied to all calcs in the upload
+        'comment': 'Data from a cool external project',
+        'references': ['http://external.project.eu'],
+        'calculations': [
+            {
+                # these metadata are only applied to the calc identified by its 'mainfile'
+                'mainfile': 'external_id/AcAg/vasp.xml',
+
+                # 'coauthors': ['sheldon.cooper@ucla.edu'],  this does not YET work with emails,
+                # Currently you have to use user_ids: leonard (the uploader, who is automatically an author) is 2 and sheldon is 1.
+                # Ask NOMAD developers about how to find out about user_ids.
+                'coauthors': [1],
+
+                # If users demand, we can implement a specific metadata keys (e.g. 'external_id', 'external_url') for external projects.
+                # This could allow to directly search for, or even have API endpoints that work with external_ids
+                # 'external_id': 'external_id',
+                # 'external_url': 'http://external.project.eu/data/calc/external_id/'
+            }
+        ]
+
+
+    }
+}).response().result
+
+while upload.process_running:
+    upload = client.uploads.get_upload(upload_id=upload.upload_id).response().result
+    time.sleep(1)
+if upload.tasks_status != 'SUCCESS' or len(upload.errors) > 0:
+    print('something went wrong')
+    print('errors: %s' % str(upload.errors))
+    # delete the unsuccessful upload
+    client.uploads.delete_upload(upload_id=upload.upload_id).response().result
+    sys.exit(1)
+
+
+# search for data
+result = client.repo.search(paths=['external_id']).response().result
+if result.pagination.total == 0:
+    print('not found')
+    sys.exit(1)
+elif result.pagination.total > 1:
+    print('my ids are not specific enough, bummer ... or did I uploaded stuff multiple times?')
+# The results key holds an array with the current page data
+print('Found the following calcs for my "external_id".')
+print(', '.join(calc['calc_id'] for calc in result.results))
+
+# download data
+calc = result.results[0]
+client.raw.get(upload_id=calc['upload_id'], path=calc['mainfile']).response()
+print('Download of first calc works.')
+
+# download urls, e.g. for curl
+print('Possible download URLs are:')
+print('%s/raw/%s/%s' % (nomad_url, calc['upload_id'], calc['mainfile']))
+print('%s/raw/%s/%s/*' % (nomad_url, calc['upload_id'], os.path.dirname(calc['mainfile'])))
+
+# direct download urls without having to search before
+print('%s/raw/query?paths=external_id' % nomad_url)
diff --git a/nomad/metainfo/README.md b/nomad/metainfo/API_CONCEPT.md
similarity index 99%
rename from nomad/metainfo/README.md
rename to nomad/metainfo/API_CONCEPT.md
index 67427a7143..8e58b5812d 100644
--- a/nomad/metainfo/README.md
+++ b/nomad/metainfo/API_CONCEPT.md
@@ -1,3 +1,5 @@
+**! This is not yet aligned with the ideas of CONCEPT.md !**
+
 # A new metainfo schema, interface, and *file formats* support
 
 This is a design document (later documentation) for a re-implementation of nomad's old
diff --git a/nomad/metainfo/CONCEPT.md b/nomad/metainfo/CONCEPT.md
new file mode 100644
index 0000000000..6e07ed5f46
--- /dev/null
+++ b/nomad/metainfo/CONCEPT.md
@@ -0,0 +1,240 @@
+# NOMAD MetaInfo
+
+## History
+
+The NOMAD MetaInfo was devised within the first NOMAD CoE; over 2000 quantities have
+been defined in this *old MetaInfo*. The experience with this system revealed the following drawbacks:
+
+- The Python libraries for using the MetaInfo are non-pythonic and incomplete.
+- The MetaInfo is only used for the archive, not for the encyclopedia and repository data.
+- There is no direct support for mapping MetaInfo definitions to DI technologies (databases, search indices, APIs).
+- There is no support for namespaces. MetaInfo names are cumbersome. This will not scale
+to expected levels of FAIRmat metadata.
+- MetaInfo packages are not version controlled. They are part of the same git and do not belong to the independently evolving parsers. This does not allow for "external" parser development and makes it hard to keep versions consistent.
+- The MetaInfo is defined in JSON. The syntax is inadequate, checks are not immediate.
+
+Attempts to revise the MetaInfo have failed in the past.
+
+## Goals
+
+### Common language to define physics (meta-)data quantities and their relationships
+
+The *physics quantities* part includes
+- each quantity has a physics *unit*
+- *shapes* that precisely define vectors, matrices, tensors, and their dimensions
+
+The *relationship* part entails:
+- hierarchies for quantity *values* (e.g. *sections*)
+- hierarchies for quantity *definition* (e.g. *categories*, former *abstract types*)
+- *derived* quantities that can be computed from other quantities
+- *synonyms* as a special trivial case for derived quantities
+- *shapes* might also define a type of relationship through one quantity being the dimension of another
+
+In addition there are the *typical* data-type definition (schema, ontology, ...) features:
+- names/namespaces
+- modularization (i.e. Metainfo packages)
+- documentation
+- basic primitive types (int, string, bool)
+- simple compound types (lists, dictionaries, unions)
+- references between data objects
+
+### Complex, evolving, extendable packages of quantities
+
+There are a lot of quantities, and they need to be organized. There are three mechanisms
+to organize quantities:
+- *Packages* (a.k.a modules) allow to modularize large sets of quantities, e.g. one package per code
+- *Sections* allow to organize quantity values into containment (a.k.a whole-part, parent-child) hierarchies, e.g. `system` *contains* all quantity values that describe the simulated system.
+- *Categories* allow to organize quantity definitions via generalization (a.k.a specialization, inheritance) relationships, e.g. `atom_labels` and `formula_hill` (*special*) both express `chemical_composition` (*general*)
+
+Quantities and their relationships change over time. This requires (at least) a versioning mechanism to track changes and reason about whether a piece of data adheres to a certain version of the MetaInfo or not.
+
+The MetaInfo needs to be extendable. It must be possible to add *packages*, quantities in new *packages* must be addable to existing sections and categories. Existing sections must be extendable. It must be possible to develop and version packages independently.
+
+### Mappings to DI technologies
+
+The core of the MetaInfo is about defining data and their physics. But in the end, the data needs to be managed with DI components, such as file formats, databases, search indices, ontology tools, APIs, GUIs, programming languages, etc. While all these tools come with their own ways of defining data, it can be cumbersome to manually map the MetaInfo to the corresponding DI technology. Furthermore, this usually comprises both mapping definitions and transforming values.
+
+The MetaInfo will allow for quantity *annotations*. Annotations allow to add additional
+information to quantity definitions that carry the necessary information to automatically map/transform definitions and their values to underlying DI components. Annotations can be easily stripped/filtered to present the MetaInfo either clean or under technology specific lenses.
+
+### Intuitive programming interface to create, access, and use (meta-)data defined with the NOMAD MetaInfo
+
+While MetaInfo definitions and MetaInfo values should have a *native* serialization format (JSON), the primary interface to deal with definitions and data should be made from programming language (Python) primitives. By the way, both things are basically just mappings from the logical MetaInfo into concrete technologies (i.e. JSON, Python).
+
+As a programming language, Python has a far richer set of syntax to define and use data
+than JSON has. We should use this. It was not used for definitions in the NOMAD CoE, and
+the *backend*s for data were designed for creating data only and were not very *pythonic*.
+
+## Concepts for a new NOMAD MetaInfo
+
+### Definition
+
+`Definition` is the abstract base for all definitions in the MetaInfo.
+
+#### Attributes
+- `name`, a string
+- `description`, a string
+- `links`, a list of URLs
+- `annotations`, a list of `Annotations`
+
+### Property
+
+`Property` is a special `Definition` and an abstract base for section properties.
+
+#### Attributes
+- `section`, a reference to a section definition
+
+### Quantities (incl. dimensions)
+
+A `Quantity` definition is a special and concrete `Property` definition:
+
+#### Attributes
+- `shape`, a list of either `int`, references to a dimension (quantity definition), or limits definitions (e.g. `'1..n'`, `'0..n'`.)
+- `type`, a primitive or Enum type
+- `categories`, a list of references to category definitions
+- `section`, a reference to the parent section definition
+- `unit`, a (computed) unit, e.g. `units.F * units.m`
+- `derived_from`, a list of references to other quantity definitions
+- `synonym`, a reference to another quantity definition
+
+`Quantity`s are mapped to Python *descriptors*. *Dimensions* are quantity definitions
+with empty shape and int type.
+
+#### Constraints
+- `synonym`, `derived_from`, and dimensions come from the same section
+
+### Sections (incl. references)
+
+A `Section` is a special and concrete `Definition`.
+
+#### Attributes
+- `adds_to`, a reference to another section definition. All quantities of this *pseudo* section are added to the given section.
+- `parent_section`, a reference to another section definition
+- `repeats`, a boolean
+- `extends`, a list of references to other section definitions. This section automatically inherits all quantities of the other sections. (Might not be necessary)
+
+#### Constraints
+- `parent_section` is not circular
+- `extends` is not circular
+- `adds_to` is not circular
+- all quantities that have *this* (or an `extends`) section as `section` have unique names
+
+`Section`s are mapped to Python classes/objects.
+
+### Categories
+
+A `Category` is a special `Definition`.
+
+#### Attributes
+- `super_categories`, a list of references to other category definitions
+
+#### Constraints
+- `super_categories` is not circular
+
+### Packages
+
+A `Package` is a special `Definition` that contains definitions. `Packages` are mapped
+to Python modules.
+
+### References
+
+A `Reference` is a special `Property`.
+
+#### Attributes
+- `referenced_section`, a reference to a section definition
+
+### Annotations
+
+Arbitrary serializable objects that can contain additional information.
+
+
+## Examples (of the Python interface)
+
+### Definitions
+
+This could be code, from a python module that represents the NOMAD *common* package `nomad.metainfo.common`:
+```python
+class System(MetainfoObject):
+    """
+    The system is ...
+    """
+    m_definition = Section(parent_section=Run, repeats=True)
+
+    n_atoms = Quantity(type=int, derived_from='atom_labels')
+
+    atom_labels = Quantity(
+        shape=['n_atoms'],
+        type=Enum(ase.data.chemical_symbols),
+        annotations=[ElasticSearchQuantity('keyword')])
+    """
+    Atom labels are ...
+    """
+
+    formula_hill = Quantity(type=str, derived_from=['atom_labels'])
+
+    atom_species = Quantity(shape=['n_atoms'], type=int, derived_from='atom_labels')
+
+    atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m)
+
+    cell = Quantity(shape=[3, 3], type=float, unit=units.m)
+    lattice_vectors = Quantity(synonym='cell')
+
+    pbc = Quantity(shape=[3], type=bool)
+
+    # Not sure if this should be part of the definition. It will not serialize to
+    # JSON. It might get complex for more involved cases. In many cases, we would
+    # need both directions anyways. On the other hand, it allows to formally define
+    # the derive semantics.
+    def m_derive_atom_species(self) -> List[int]:
+        return [ase.data.atomic_numbers[label] for label in self.atom_labels]
+
+    def m_derive_n_atoms(self) -> int:
+        return len(self.atom_labels)
+```
+
+This could be part of the VASP source code:
+```python
+class Method(MetainfoObject):
+    m_definition = Section(adds_to=nomad.metainfo.common.Method)
+
+    incar_nbands = Quantity(
+        type=int, links=['https://cms.mpi.univie.ac.at/wiki/index.php/NBANDS'])
+```
+
+### (Meta-)data
+
+```python
+from nomad.metainfo.common import Run, System
+
+run = Run()
+
+system = run.m_create(System)
+system.atom_labels = ['H', 'H', 'O']
+system.atom_positions = [[0, 0, 0], [1, 0, 0], [0.5, 0.5, 0]]
+system.cell = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
+system.pbc = [False, False, False]
+
+print(system.atom_species)  # [1, 1, 8]
+print(system.lattice_vectors)
+print(system.n_atoms)
+```
+
+# Glossary
+
+A list of words with very specific and precise meaning. This meaning might not yet be
+fully expressed, but it's there.
+
+- annotation
+- category
+- derived quantity
+- dimension
+- new MetaInfo
+- old MetaInfo
+- package
+- pythonic
+- quantity
+- reference
+- section
+- shape
+- synonym
+- unit
diff --git a/nomad/metainfo/bootstrap.py b/nomad/metainfo/bootstrap.py
index 4118eccf1e..0283e72a6e 100644
--- a/nomad/metainfo/bootstrap.py
+++ b/nomad/metainfo/bootstrap.py
@@ -1,3 +1,7 @@
+"""
+Some playground to try the API_CONCEPT.md ideas.
+"""
+
 
 class MObject:
     def __init__(self, m_definition: 'MElementDef', m_section: 'MSection' = None):
diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py
index 04cef7d48c..35da1f51d4 100644
--- a/nomad/metainfo/metainfo.py
+++ b/nomad/metainfo/metainfo.py
@@ -1,3 +1,7 @@
+"""
+Some playground to try the CONCEPT.md ideas.
+"""
+
 from typing import Dict, List, Any, Union, Type
 import json
 import ase.data
@@ -13,6 +17,7 @@ units = Units()
 
 
 class Definition():
+    m_definition: Any = None
     pass
 
 
@@ -22,17 +27,17 @@ class Property(Definition):
 
 class Quantity(Property):
     def __init__(
-            self, 
-            name: str = None, 
-            description: str = None, 
+            self,
+            name: str = None,
+            description: str = None,
             parent_section: 'Section' = None,
-            shape: List[Union[str, int]] = [], 
-            type: type = None, 
+            shape: List[Union[str, int]] = [],
+            type: Union['Enum', type] = None,
             unit: str = None,
             derived: bool = False,
             repeats: bool = False,
             synonym: str = None):
-        
+
         self.name = name
         self.parent_section = parent_section.m_definition if parent_section is not None else None
         self.derived = derived
@@ -45,10 +50,10 @@ class Quantity(Property):
 
         if self.derived:
             derive_method = getattr(obj, 'm_derive_%s' % self.name, None)
-            
+
             if derive_method is None:
                 raise KeyError('Derived quantity %s is not implemented' % self.name)
-            
+
             else:
                 return derive_method()
 
@@ -58,7 +63,7 @@ class Quantity(Property):
         else:
             return obj.m_data.get(self.name, None)
 
-    def __set__(self, obj: 'MetainfoObject', value: Any):        
+    def __set__(self, obj: 'MetainfoObject', value: Any):
         obj.m_data[self.name] = value
 
     def __delete__(self, obj: 'MetainfoObject'):
@@ -76,10 +81,10 @@ class Section(Definition):
     def __init__(
             self,
             name: str = None,
-            parent_section: 'Section' = None,
+            parent_section=None,
             repeats: bool = False,
-            extends: 'Section' = None,
-            adds_to: 'Section' = None):
+            extends=None,
+            adds_to=None):
 
         self.name = name
         self.parent_section = parent_section.m_definition if parent_section is not None else None
@@ -112,9 +117,9 @@ class MetainfoObjectMeta(type):
             if isinstance(value, Property):
                 value.name = name
                 value.parent_section = cls.m_definition
-                
+
         cls = super().__new__(cls, cls_name, bases, dct)
-        
+
         if cls.m_definition is not None:
             if cls.m_definition.name is None:
                 cls.m_definition.name = cls_name
@@ -122,15 +127,16 @@ class MetainfoObjectMeta(type):
         return cls
 
 
-class MetainfoObject(metaclass=MetainfoObjectMeta): 
+class MetainfoObject(metaclass=MetainfoObjectMeta):
     """
-    Base class for all 
+    Base class for all
     """
+    m_definition: Any = None
 
     def __init__(self):
         self.m_data = dict(m_defintion=self.m_definition.name)
 
-    def m_create(self, section_definition: Type['MSection'], *args, **kwargs) -> 'MSection':
+    def m_create(self, section_definition: Any, *args, **kwargs) -> Any:
         """
         Creates a sub section of the given section definition.
         """
@@ -142,7 +148,7 @@ class MetainfoObject(metaclass=MetainfoObjectMeta):
         else:
             # TODO test overwrite
             self.m_data[definition.name] = sub_section
-        
+
         return sub_section
 
     def m_get_definition(self, name):
@@ -199,7 +205,7 @@ class System(MetainfoObject):
 
     atom_species = Quantity(shape=['n_atoms'], type=int, derived=True)
 
-    atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m) 
+    atom_positions = Quantity(shape=['n_atoms', 3], type=float, unit=units.m)
 
     cell = Quantity(shape=[3, 3], type=float, unit=units.m)
     lattice_vectors = Quantity(synonym='cell')
@@ -236,4 +242,4 @@ print(system.__class__.m_definition)
 print(system.m_definition)
 print(system.m_get_definition('atom_labels'))
 
-print(run)
\ No newline at end of file
+print(run)
diff --git a/ops/scripts/misc.http b/ops/scripts/misc.http
index 024fec67d3..3cdf9b1d34 100644
--- a/ops/scripts/misc.http
+++ b/ops/scripts/misc.http
@@ -72,18 +72,17 @@ GET http://localhost:19200/fairdi_nomad_prod/_search HTTP/1.1
 Content-Type: application/json
 
 {
-    "size": 0,
+    "size": 3,
     "query": {
         "bool": {
             "must": [
-                { "match": { "code_name": "Phonopy" } },
-                { "match": { "published": false}}
+                { "match": { "code_name": "DL_POLY" } }
             ]
         }
     },
     "aggs": {
         "upload_id": {
-            "sum": {
+            "terms": {
                 "field": "n_calculations"
             }
         }
-- 
GitLab