From 436c937304e6f08e6edb97c7ce63648d035b893d Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Thu, 17 Oct 2019 16:54:22 +0200
Subject: [PATCH] Added dataset models and API. #196

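Datasets are managed through the new /datasets/ endpoints: GET and PUT on
/datasets/ list and create the current user's datasets; GET, POST, and
DELETE on /datasets/<name> retrieve a dataset, assign a DOI (not yet
implemented), and delete it. A minimal client sketch (base URL and auth
header value are hypothetical):

    import requests

    headers = {'Authorization': '...'}  # placeholder; use the API's auth scheme
    requests.put(
        'http://localhost/api/datasets/', headers=headers,
        json=dict(name='example'))
    requests.get('http://localhost/api/datasets/example', headers=headers)
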
---
 nomad/app/api/__init__.py |   2 +-
 nomad/app/api/dataset.py  | 220 ++++++++++++++++++++++++++++++++++++++
 requirements.txt          |   2 +-
 tests/app/test_api.py     |  78 ++++++++++++++
 4 files changed, 300 insertions(+), 2 deletions(-)
 create mode 100644 nomad/app/api/dataset.py

diff --git a/nomad/app/api/__init__.py b/nomad/app/api/__init__.py
index d684852c60..77c5dcef51 100644
--- a/nomad/app/api/__init__.py
+++ b/nomad/app/api/__init__.py
@@ -26,4 +26,4 @@ There is a separate documentation for the API endpoints from a client perspectiv
 """
 
 from .api import blueprint
-from . import info, auth, admin, upload, repo, archive, raw, mirror
+from . import info, auth, admin, upload, repo, archive, raw, mirror, dataset
diff --git a/nomad/app/api/dataset.py b/nomad/app/api/dataset.py
new file mode 100644
index 0000000000..09ade7601a
--- /dev/null
+++ b/nomad/app/api/dataset.py
@@ -0,0 +1,220 @@
+from typing import Dict, Any
+from flask import request, g
+from flask_restplus import Resource, fields, abort
+import mongoengine as me
+
+from nomad import utils
+from nomad.metainfo import MSection, Quantity, Section
+from nomad.app.utils import with_logger
+
+from .api import api
+from .auth import authenticate
+from .common import pagination_model, pagination_request_parser
+
+
+ns = api.namespace(
+    'datasets',
+    description='Datasets allow users to create sets of related data.')
+
+
+class Dataset(MSection):
+    """ A Dataset is attached to one or many entries to form a set of data.
+
+    Args:
+        dataset_id: The unique identifier for this dataset as a string. It should be
+            a randomly generated UUID, similar to other nomad ids.
+        name: The human-readable name of the dataset as a string. The dataset
+            name must be unique for the user.
+        user_id: The unique user_id of the owner and creator of this dataset. The owner
+            must not change after creation.
+        doi: The optional Digital Object Identifier (DOI) associated with this dataset.
+            Nomad can register DOIs that link back to the respective representation of
+            the dataset in the nomad UI. This quantity holds the string representation of
+            this DOI. There is only one per dataset.
+    """
+    dataset_id = Quantity(type=str, a_me=dict(primary_key=True))
+    name = Quantity(type=str, a_me=dict(index=True))
+    user_id = Quantity(type=str, a_me=dict(index=True))
+    doi = Quantity(type=str, a_me=dict(index=True))
+
+
+def generate_flask_restplus_model(section_def: Section):
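+    """ Generates a flask_restplus API model from a metainfo section definition.
+
+    Scalar int, float, str, and bool quantities map to the respective
+    flask_restplus fields; quantities with a one dimensional shape become
+    lists of such fields. Other types and shapes raise NotImplementedError.
+    """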
+    def generate_field(quantity: Quantity):
+        field = None
+        if quantity.type == int:
+            field = fields.Integer
+        elif quantity.type == float:
+            field = fields.Float
+        elif quantity.type == str:
+            field = fields.String
+        elif quantity.type == bool:
+            field = fields.Boolean
+        else:
+            raise NotImplementedError
+
+        result = field(description=quantity.description)
+
+        if len(quantity.shape) == 0:
+            return result
+        elif len(quantity.shape) == 1:
+            return fields.List(result)
+        else:
+            raise NotImplementedError
+
+    return api.model(section_def.name, {
+        name: generate_field(quantity)
+        for name, quantity in section_def.all_quantities.items()
+    })
+
+
+dataset_model = generate_flask_restplus_model(Dataset.m_def)
+dataset_list_model = api.model('DatasetList', {
+    'pagination': fields.Nested(model=pagination_model),
+    'results': fields.List(fields.Nested(model=dataset_model, skip_none=True))
+})
+
+
+def generate_mongoengine(section_def: Section):
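+    """ Generates a mongoengine document class from a metainfo section definition.
+
+    Quantity types map to mongoengine fields analogous to
+    generate_flask_restplus_model. All a_me annotations are passed to the
+    field constructors, except index, which is collected into the
+    document's meta indexes instead.
+    """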
+    def generate_field(quantity: Quantity):
+        annotation = quantity.m_annotations.get('me', {})
+        annotation.pop('index', None)
+
+        field = None
+        if quantity.type == int:
+            field = me.IntField
+        elif quantity.type == float:
+            field = me.FloatField
+        elif quantity.type == str:
+            field = me.StringField
+        elif quantity.type == bool:
+            field = me.BooleanField
+        else:
+            raise NotImplementedError
+
+        result = field(default=quantity.default, **annotation)
+
+        if len(quantity.shape) == 0:
+            return result
+        elif len(quantity.shape) == 1:
+            return me.ListField(result)
+        else:
+            raise NotImplementedError
+
+    indexes = [
+        quantity.name
+        for quantity in section_def.all_quantities.values()
+        if quantity.m_annotations.get('me', {}).get('index', False)]
+
+    dct: Dict[str, Any] = dict()
+    if len(indexes) > 0:
+        dct.update(meta=dict(indexes=indexes))
+    dct.update(**{
+        name: generate_field(quantity)
+        for name, quantity in section_def.all_quantities.items()
+    })
+    return type(section_def.name, (me.Document,), dct)
+
+
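+# DatasetME is a regular mongoengine document; the resources below use it for
+# queries, e.g. DatasetME.objects(user_id=...), and persistence via .save()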
+DatasetME = generate_mongoengine(Dataset.m_def)
+
+
+@ns.route('/')
+class DatasetListResource(Resource):
+    @api.doc('list_datasets')
+    @api.marshal_with(dataset_list_model, skip_none=True, code=200, description='Dataset list sent')
+    @api.expect(pagination_request_parser)
+    @authenticate(required=True)
+    def get(self):
+        """ Retrieve a list of all datasets of the authenticated user. """
+        try:
+            page = int(request.args.get('page', 1))
+            per_page = int(request.args.get('per_page', 10))
+        except Exception:
+            abort(400, message='bad parameter types')
+
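+        # mongoengine querysets are lazy; the slicing below implements the pagination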
+        result_query = DatasetME.objects(user_id=g.user.user_id)
+        return dict(
+            pagination=dict(total=result_query.count(), page=page, per_page=per_page),
+            results=result_query[(page - 1) * per_page: page * per_page]), 200
+
+    @api.doc('create_dataset')
+    @api.response(400, 'The provided data is malformed or a dataset with the given name already exists')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset created')
+    @api.expect(dataset_model)
+    @authenticate(required=True)
+    def put(self):
+        """ Creates a new dataset. """
+        data = request.get_json()
+        if data is None:
+            data = {}
+
+        # unique name
+        name = data.get('name', None)
+        if name is None:
+            abort(400, 'Must provide a dataset name.')
+
+        if DatasetME.objects(user_id=g.user.user_id, name=name).count() > 0:
+            abort(400, 'A dataset with name %s already exists for the current user.' % name)
+
+        # only admin can set user or doi
+        if any(key in data for key in ['user_id', 'doi', 'dataset_id']):
+            if not g.user.is_admin():
+                abort(400, 'The dataset contains information you are not allowed to set.')
+
+        # no other keys
+        if any(key not in Dataset.m_def.all_quantities for key in data):
+            abort(400, 'The dataset contains unknown keys.')
+
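+        # fill in defaults; only admins may set these fields explicitly (checked above)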
+        if 'user_id' not in data:
+            data['user_id'] = g.user.user_id
+        dataset_id = data.pop('dataset_id', utils.create_uuid())
+        return DatasetME(dataset_id=dataset_id, **data).save(), 200
+
+
+@ns.route('/<string:name>')
+@api.doc(params=dict(name='The name of the requested dataset.'))
+class DatasetResource(Resource):
+    @api.doc('get_dataset')
+    @api.response(404, 'The dataset does not exist')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset sent')
+    @authenticate(required=True)
+    def get(self, name: str):
+        """ Retrieve a dataset by name. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user' % name)
+
+        return result
+
+    @api.doc('assign_doi')
+    @api.response(404, 'The dataset does not exist')
+    @api.response(400, 'The dataset already has a DOI')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='DOI assigned')
+    @authenticate(required=True)
+    @with_logger
+    def post(self, name: str, logger):
+        """ Assign a DOI to the dataset. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user' % name)
+
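+        # TODO: the actual DOI registration still needs to be implemented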
+        logger.error('assign DOI is not implemented', user_id=g.user.user_id)
+
+        return result
+
+    @api.doc('delete_dataset')
+    @api.response(404, 'The dataset does not exist')
+    @api.response(400, 'The dataset has a DOI and cannot be deleted')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset deleted')
+    @authenticate(required=True)
+    def delete(self, name: str):
+        """ Assign a DOI to the dataset. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user' % name)
+        if result.doi is not None:
+            abort(400, 'Dataset with name %s has a DOI and cannot be deleted' % name)
+
+        result.delete()
+
+        return result
diff --git a/requirements.txt b/requirements.txt
index 31fd877394..14eca94949 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -57,7 +57,7 @@ sphinx
 sphinxcontrib.httpdomain
 sphinx_rtd_theme
 gitpython
-mypy
+mypy==0.730
 pylint==2.3.1
 pylint_plugin_utils==0.5
 pylint_mongoengine==0.3.3
diff --git a/tests/app/test_api.py b/tests/app/test_api.py
index 83c39206c1..7960221992 100644
--- a/tests/app/test_api.py
+++ b/tests/app/test_api.py
@@ -30,6 +30,7 @@ from nomad import search, parsing, files, config, utils, infrastructure
 from nomad.files import UploadFiles, PublicUploadFiles
 from nomad.processing import Upload, Calc, SUCCESS
 from nomad.datamodel import UploadWithMetadata, CalcWithMetadata, User
+from nomad.app.api.dataset import DatasetME
 
 from tests.conftest import create_auth_headers, clear_elastic
 from tests.test_files import example_file, example_file_mainfile, example_file_contents
@@ -1151,3 +1152,80 @@ class TestMirror:
 
         data = json.loads(rv.data)
         assert data[0]['upload_id'] == published.upload_id
+
+
+class TestDataset:
+
+    @pytest.fixture()
+    def example_datasets(self, mongo, test_user):
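+        # two datasets for test_user: ds2 carries a DOI, ds1 does not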
+        DatasetME(dataset_id='1', user_id=test_user.user_id, name='ds1').save()
+        DatasetME(dataset_id='2', user_id=test_user.user_id, name='ds2', doi='test_doi').save()
+
+    def assert_dataset(self, dataset, name: str = None, doi: bool = False):
+        assert 'dataset_id' in dataset
+        assert 'user_id' in dataset
+        assert ('doi' in dataset) == doi
+        assert dataset.get('name') is not None
+        if name is not None:
+            assert dataset.get('name') == name
+
+    def test_create_dataset(self, api, test_user_auth):
+        rv = api.put(
+            '/datasets/', headers=test_user_auth,
+            data=json.dumps(dict(name='test_dataset')),
+            content_type='application/json')
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, 'test_dataset')
+
+    @pytest.mark.parametrize('data', [
+        dict(name='test_name', doi='something'),
+        dict(name='test_name', dataset_id='something'),
+        dict(name='test_name', user_id='something'),
+        dict(name='test_name', unknown_key='something'),
+        dict()])
+    def test_create_dataset_bad_data(self, api, test_user_auth, data):
+        rv = api.put(
+            '/datasets/', headers=test_user_auth,
+            data=json.dumps(data),
+            content_type='application/json')
+        assert rv.status_code >= 400
+
+    def test_get_datasets(self, api, test_user_auth, example_datasets):
+        rv = api.get('/datasets/', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        assert 'pagination' in data
+        assert data['pagination']['total'] == 2
+        assert len(data['results']) == 2
+        for dataset in data['results']:
+            if dataset['name'] == 'ds2':
+                self.assert_dataset(dataset, doi=True)
+            else:
+                self.assert_dataset(dataset)
+
+    def test_get_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.get('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, name='ds1')
+
+    def test_get_dataset_missing(self, api, other_test_user_auth, example_datasets):
+        rv = api.get('/datasets/ds1', headers=other_test_user_auth)
+        assert rv.status_code == 404
+
+    def test_post_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.post('/datasets/ds1', headers=test_user_auth)
+        # TODO the actual DOI part needs to be implemented
+        assert rv.status_code == 200
+
+    def test_delete_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.delete('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, name='ds1')
+        assert api.get('/datasets/ds1', headers=test_user_auth).status_code == 404
+
+    def test_delete_dataset_with_doi(self, api, test_user_auth, example_datasets):
+        rv = api.delete('/datasets/ds2', headers=test_user_auth)
+        assert rv.status_code == 400
-- 
GitLab