From 436c937304e6f08e6edb97c7ce63648d035b893d Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Thu, 17 Oct 2019 16:54:22 +0200
Subject: [PATCH] Added dataset models and API. #196

---
 nomad/app/api/__init__.py |   2 +-
 nomad/app/api/dataset.py  | 220 ++++++++++++++++++++++++++++++++++++++
 requirements.txt          |   2 +-
 tests/app/test_api.py     |  78 ++++++++++++++
 4 files changed, 300 insertions(+), 2 deletions(-)
 create mode 100644 nomad/app/api/dataset.py

diff --git a/nomad/app/api/__init__.py b/nomad/app/api/__init__.py
index d684852c60..77c5dcef51 100644
--- a/nomad/app/api/__init__.py
+++ b/nomad/app/api/__init__.py
@@ -26,4 +26,4 @@ There is a separate documentation for the API endpoints from a client perspectiv
 """
 from .api import blueprint
-from . import info, auth, admin, upload, repo, archive, raw, mirror
+from . import info, auth, admin, upload, repo, archive, raw, mirror, dataset

diff --git a/nomad/app/api/dataset.py b/nomad/app/api/dataset.py
new file mode 100644
index 0000000000..09ade7601a
--- /dev/null
+++ b/nomad/app/api/dataset.py
@@ -0,0 +1,220 @@
+from typing import Dict, Any
+from flask import request, g
+from flask_restplus import Resource, fields, abort
+import mongoengine as me
+
+from nomad import utils
+from nomad.metainfo import MSection, Quantity, Section
+from nomad.app.utils import with_logger
+
+from .api import api
+from .auth import authenticate
+from .common import pagination_model, pagination_request_parser
+
+
+ns = api.namespace(
+    'datasets',
+    description='Datasets allow users to create sets of related data.')
+
+
+class Dataset(MSection):
+    """ A Dataset is attached to one or many entries to form a set of data.
+
+    Args:
+        dataset_id: The unique identifier for this dataset as a string. It should be
+            a randomly generated UUID, similar to other nomad ids.
+        name: The human-readable name of the dataset as a string. The dataset name
+            must be unique for the user.
+        user_id: The unique user_id of the owner and creator of this dataset. The owner
+            must not change after creation.
+        doi: The optional Digital Object Identifier (DOI) associated with this dataset.
+            Nomad can register DOIs that link back to the respective representation of
+            the dataset in the nomad UI. This quantity holds the string representation
+            of this DOI. There is only one per dataset.
+    """
+    dataset_id = Quantity(type=str, a_me=dict(primary_key=True))
+    name = Quantity(type=str, a_me=dict(index=True))
+    user_id = Quantity(type=str, a_me=dict(index=True))
+    doi = Quantity(type=str, a_me=dict(index=True))
+
+
+def generate_flask_restplus_model(section_def: Section):
+    def generate_field(quantity: Quantity):
+        field = None
+        if quantity.type == int:
+            field = fields.Integer
+        elif quantity.type == float:
+            field = fields.Float
+        elif quantity.type == str:
+            field = fields.String
+        elif quantity.type == bool:
+            field = fields.Boolean
+        else:
+            raise NotImplementedError
+
+        result = field(description=quantity.description)
+
+        if len(quantity.shape) == 0:
+            return result
+        elif len(quantity.shape) == 1:
+            return fields.List(result)
+        else:
+            raise NotImplementedError
+
+    return api.model(section_def.name, {
+        name: generate_field(quantity)
+        for name, quantity in section_def.all_quantities.items()
+    })
+
+
+dataset_model = generate_flask_restplus_model(Dataset.m_def)
+dataset_list_model = api.model('DatasetList', {
+    'pagination': fields.Nested(model=pagination_model),
+    'results': fields.List(fields.Nested(model=dataset_model, skip_none=True))
+})
+
+
+def generate_mongoengine(section_def: Section):
+    def generate_field(quantity: Quantity):
+        annotation = quantity.m_annotations.get('me', {})
+        annotation.pop('index', None)
+
+        field = None
+        if quantity.type == int:
+            field = me.IntField
+        elif quantity.type == float:
+            field = me.FloatField
+        elif quantity.type == str:
+            field = me.StringField
+        elif quantity.type == bool:
+            field = me.BooleanField
+        else:
+            raise NotImplementedError
+
+        result = field(default=quantity.default, **annotation)
+
+        if len(quantity.shape) == 0:
+            return result
+        elif len(quantity.shape) == 1:
+            return me.ListField(result)
+        else:
+            raise NotImplementedError
+
+    indexes = [
+        quantity.name
+        for quantity in section_def.all_quantities.values()
+        if quantity.m_annotations.get('me', {}).get('index', False)]
+
+    dct: Dict[str, Any] = dict()
+    if len(indexes) > 0:
+        dct.update(meta=dict(indexes=indexes))
+    dct.update(**{
+        name: generate_field(quantity)
+        for name, quantity in section_def.all_quantities.items()
+    })
+    return type(section_def.name, (me.Document,), dct)
+
+
+DatasetME = generate_mongoengine(Dataset.m_def)
+
+
+@ns.route('/')
+class DatasetListResource(Resource):
+    @api.doc('list_datasets')
+    @api.marshal_with(dataset_list_model, skip_none=True, code=200, description='Datasets sent')
+    @api.expect(pagination_request_parser)
+    @authenticate(required=True)
+    def get(self):
+        """ Retrieve a list of all datasets of the authenticated user. """
+        try:
+            page = int(request.args.get('page', 1))
+            per_page = int(request.args.get('per_page', 10))
+        except Exception:
+            abort(400, message='bad parameter types')
+
+        result_query = DatasetME.objects(user_id=g.user.user_id)
+        return dict(
+            pagination=dict(total=result_query.count(), page=page, per_page=per_page),
+            results=result_query[(page - 1) * per_page: page * per_page]), 200
+
+    @api.doc('create_dataset')
+    @api.response(400, 'The provided data is malformed or a dataset with the name already exists')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset created')
+    @api.expect(dataset_model)
+    @authenticate(required=True)
+    def put(self):
+        """ Create a new dataset. """
+        data = request.get_json()
+        if data is None:
+            data = {}
+
+        # unique name
+        name = data.get('name', None)
+        if name is None:
+            abort(400, 'Must provide a dataset name.')
+
+        if DatasetME.objects(user_id=g.user.user_id, name=name).count() > 0:
+            abort(400, 'A dataset with name %s already exists for the current user.' % name)
+
+        # only admins can set user_id, doi, or dataset_id
+        if any(key in data for key in ['user_id', 'doi', 'dataset_id']):
+            if not g.user.is_admin():
+                abort(400, 'The dataset contains information you are not allowed to set.')
+
+        # no other keys
+        if any(key not in Dataset.m_def.all_quantities for key in data):
+            abort(400, 'The dataset contains unknown keys.')
+
+        if 'user_id' not in data:
+            data['user_id'] = g.user.user_id
+        dataset_id = data.pop('dataset_id', utils.create_uuid())
+        return DatasetME(dataset_id=dataset_id, **data).save(), 200
+
+
+@ns.route('/<string:name>')
+@api.doc(params=dict(name='The name of the requested dataset.'))
+class DatasetResource(Resource):
+    @api.doc('get_dataset')
+    @api.response(404, 'The dataset does not exist')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset sent')
+    @authenticate(required=True)
+    def get(self, name: str):
+        """ Retrieve a dataset by name. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user.' % name)
+
+        return result
+
+    @api.doc('assign_doi')
+    @api.response(404, 'The dataset does not exist')
+    @api.response(400, 'The dataset already has a DOI')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='DOI assigned')
+    @authenticate(required=True)
+    @with_logger
+    def post(self, name: str, logger):
+        """ Assign a DOI to the dataset. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user.' % name)
+
+        logger.error('assigning a DOI is not yet implemented', user_id=g.user.user_id)
+
+        return result
+
+    @api.doc('delete_dataset')
+    @api.response(404, 'The dataset does not exist')
+    @api.response(400, 'The dataset has a DOI and cannot be deleted')
+    @api.marshal_with(dataset_model, skip_none=True, code=200, description='Dataset deleted')
+    @authenticate(required=True)
+    def delete(self, name: str):
+        """ Delete the dataset. """
+        result = DatasetME.objects(user_id=g.user.user_id, name=name).first()
+        if result is None:
+            abort(404, 'Dataset with name %s does not exist for the current user.' % name)
+        if result.doi is not None:
+            abort(400, 'Dataset with name %s has a DOI and cannot be deleted.' % name)
+
+        result.delete()
+
+        return result

diff --git a/requirements.txt b/requirements.txt
index 31fd877394..14eca94949 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -57,7 +57,7 @@ sphinx
 sphinxcontrib.httpdomain
 sphinx_rtd_theme
 gitpython
-mypy
+mypy==0.730
 pylint==2.3.1
 pylint_plugin_utils==0.5
 pylint_mongoengine==0.3.3

diff --git a/tests/app/test_api.py b/tests/app/test_api.py
index 83c39206c1..7960221992 100644
--- a/tests/app/test_api.py
+++ b/tests/app/test_api.py
@@ -30,6 +30,7 @@ from nomad import search, parsing, files, config, utils, infrastructure
 from nomad.files import UploadFiles, PublicUploadFiles
 from nomad.processing import Upload, Calc, SUCCESS
 from nomad.datamodel import UploadWithMetadata, CalcWithMetadata, User
+from nomad.app.api.dataset import DatasetME
 
 from tests.conftest import create_auth_headers, clear_elastic
 from tests.test_files import example_file, example_file_mainfile, example_file_contents
@@ -1151,3 +1152,80 @@ class TestMirror:
 
         data = json.loads(rv.data)
         assert data[0]['upload_id'] == published.upload_id
+
+
+class TestDataset:
+
+    @pytest.fixture()
+    def example_datasets(self, mongo, test_user):
+        DatasetME(dataset_id='1', user_id=test_user.user_id, name='ds1').save()
+        DatasetME(dataset_id='2', user_id=test_user.user_id, name='ds2', doi='test_doi').save()
+
+    def assert_dataset(self, dataset, name: str = None, doi: bool = False):
+        assert 'dataset_id' in dataset
+        assert 'user_id' in dataset
+        assert ('doi' in dataset) == doi
+        assert dataset.get('name') is not None
+        if name is not None:
+            assert dataset.get('name') == name
+
+    def test_create_dataset(self, api, test_user_auth):
+        rv = api.put(
+            '/datasets/', headers=test_user_auth,
+            data=json.dumps(dict(name='test_dataset')),
+            content_type='application/json')
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, 'test_dataset')
+
+    @pytest.mark.parametrize('data', [
+        dict(name='test_name', doi='something'),
+        dict(name='test_name', dataset_id='something'),
+        dict(name='test_name', user_id='something'),
+        dict(name='test_name', unknown_key='something'),
+        dict()])
+    def test_create_dataset_bad_data(self, api, test_user_auth, data):
+        rv = api.put(
+            '/datasets/', headers=test_user_auth,
+            data=json.dumps(data),
+            content_type='application/json')
+        assert rv.status_code >= 400
+
+    def test_get_datasets(self, api, test_user_auth, example_datasets):
+        rv = api.get('/datasets/', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        assert 'pagination' in data
+        assert data['pagination']['total'] == 2
+        assert len(data['results']) == 2
+        for dataset in data['results']:
+            if dataset['name'] == 'ds2':
+                self.assert_dataset(dataset, doi=True)
+            else:
+                self.assert_dataset(dataset)
+
+    def test_get_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.get('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, name='ds1')
+
+    def test_get_dataset_missing(self, api, other_test_user_auth, example_datasets):
+        rv = api.get('/datasets/ds1', headers=other_test_user_auth)
+        assert rv.status_code == 404
+
+    def test_post_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.post('/datasets/ds1', headers=test_user_auth)
+        # TODO the actual DOI part needs to be implemented
+        assert rv.status_code == 200
+
+    def test_delete_dataset(self, api, test_user_auth, example_datasets):
+        rv = api.delete('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 200
+        data = json.loads(rv.data)
+        self.assert_dataset(data, name='ds1')
+        assert api.get('/datasets/ds1', headers=test_user_auth).status_code == 404
+
+    def test_delete_dataset_with_doi(self, api, test_user_auth, example_datasets):
+        rv = api.delete('/datasets/ds2', headers=test_user_auth)
+        assert rv.status_code == 400
-- 
GitLab
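
Note: a minimal client-side sketch of how the new endpoints could be exercised
with the `requests` library. The base URL and the access token below are
placeholders, not part of this patch; the exact mount point and authentication
scheme depend on the deployment.

    import json
    import requests

    base_url = 'http://localhost:8000/nomad/api'  # placeholder deployment URL
    headers = {
        'Authorization': 'Bearer <access-token>',  # placeholder token
        'Content-Type': 'application/json'}

    # create a dataset; regular users may only set `name`, while
    # `dataset_id`, `user_id`, and `doi` are reserved for admins
    rv = requests.put(
        '%s/datasets/' % base_url, headers=headers,
        data=json.dumps(dict(name='my_dataset')))
    assert rv.status_code == 200

    # list the authenticated user's datasets, paginated
    rv = requests.get(
        '%s/datasets/' % base_url, headers=headers,
        params=dict(page=1, per_page=10))
    print(rv.json()['pagination'])

    # retrieve a single dataset by name, then delete it;
    # delete fails with 400 once a DOI has been assigned
    rv = requests.get('%s/datasets/my_dataset' % base_url, headers=headers)
    rv = requests.delete('%s/datasets/my_dataset' % base_url, headers=headers)
    assert rv.status_code == 200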