From 6eb62cd8309abc9f7f2b678ad43bb388ca0f1499 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Mon, 9 Dec 2019 13:40:08 +0100
Subject: [PATCH] Dataset edit-related checks and improvements.

---
 gui/src/components/search/EntryList.js     |   2 +-
 gui/src/components/search/SearchContext.js |   6 +-
 nomad/app/api/dataset.py                   |  13 +-
 nomad/app/api/repo.py                      | 254 ++++++++++++---------
 nomad/processing/data.py                   |   4 +-
 tests/app/test_api.py                      |  56 ++++-
 tests/conftest.py                          |  14 ++
 7 files changed, 239 insertions(+), 110 deletions(-)

diff --git a/gui/src/components/search/EntryList.js b/gui/src/components/search/EntryList.js
index 3804969535..3d18297641 100644
--- a/gui/src/components/search/EntryList.js
+++ b/gui/src/components/search/EntryList.js
@@ -301,7 +301,7 @@ export class EntryListUnstyled extends React.Component {
       {moreActions}
     </React.Fragment>
     const selectActions = createActions({query: selectQuery, buttonProps: {color: 'secondary'}})
-    const allActions = createActions({query: query}, actions)
+    const allActions = actions
 
     return (
       <div className={classes.root}>
diff --git a/gui/src/components/search/SearchContext.js b/gui/src/components/search/SearchContext.js
index 50ec415092..3c3c9b7396 100644
--- a/gui/src/components/search/SearchContext.js
+++ b/gui/src/components/search/SearchContext.js
@@ -21,7 +21,11 @@ class SearchContext extends React.Component {
   }
 
   static emptyResponse = {
-    statistics: {}
+    statistics: {
+      total: {
+        all: {}
+      }
+    }
   }
 
   static type = React.createContext()
diff --git a/nomad/app/api/dataset.py b/nomad/app/api/dataset.py
index aec01cf5fa..0876b390b7 100644
--- a/nomad/app/api/dataset.py
+++ b/nomad/app/api/dataset.py
@@ -16,7 +16,7 @@ from flask import request, g
 from flask_restplus import Resource, fields, abort
 import re
 
-from nomad import utils
+from nomad import utils, processing as proc
 from nomad.app.utils import with_logger
 from nomad.datamodel import Dataset
 from nomad.metainfo.flask_restplus import generate_flask_restplus_model
@@ -136,6 +136,15 @@ class DatasetResource(Resource):
         if result.doi is not None:
             abort(400, 'Dataset with name %s already has a DOI' % name)
 
+        # check if the DOI can be created
+        published_values = proc.Calc.objects(metadata__datasets=result.dataset_id).distinct('metadata.published')
+
+        if False in published_values:
+            abort(400, 'Dataset must not contain unpublished entries.')
+
+        if True not in published_values:
+            abort(400, 'Dataset must not be empty.')
+
         # set the DOI
         doi = DOI.create(title='NOMAD dataset: %s' % result.name, user=g.user)
         doi.create_draft()
@@ -161,7 +170,7 @@ class DatasetResource(Resource):
     @authenticate(required=True)
     @with_logger
     def delete(self, name: str, logger):
-        """ Assign a DOI to the dataset. """
+        """ Delete the dataset. """
         try:
             result = Dataset.m_def.m_x('me').get(user_id=g.user.user_id, name=name)
         except KeyError:
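
Note on the new DOI precondition above: distinct('metadata.published') collapses
the publish flags of all entries in a dataset into at most two values, so both
aborts run off a single query. A minimal sketch of the resulting decision table,
assuming a mongoengine Calc document as declared in nomad/processing/data.py:

    # published_values holds the distinct metadata.published flags of all
    # entries in the dataset; four outcomes are possible:
    #   []            -> no entries at all        -> abort 400 (empty)
    #   [False]       -> only unpublished entries -> abort 400 (unpublished)
    #   [False, True] -> mixed                    -> abort 400 (unpublished)
    #   [True]        -> all published            -> the DOI can be assigned
    published_values = proc.Calc.objects(
        metadata__datasets=result.dataset_id).distinct('metadata.published')
    can_assign_doi = published_values == [True]
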
diff --git a/nomad/app/api/repo.py b/nomad/app/api/repo.py
index 78a8bc269e..0ca5280c25 100644
--- a/nomad/app/api/repo.py
+++ b/nomad/app/api/repo.py
@@ -20,6 +20,7 @@ meta-data.
 from typing import List, Dict, Any
 from flask_restplus import Resource, abort, fields
 from flask import request, g
+from elasticsearch_dsl import Q
 from elasticsearch.exceptions import NotFoundError
 import elasticsearch.helpers
 
@@ -344,43 +345,48 @@ repo_edit_model = api.model('RepoEdit', {
             quantity.name: repo_edit_action_field(quantity)
             for quantity in UserMetadata.m_def.all_quantities.values()
         }), skip_none=True,
-        description='Each action specifies a single value (even for multi valued quantities).')
+        description='Each action specifies a single value (even for multi valued quantities).'),
+    'success': fields.Boolean(description='Whether the overall edit can/could be performed. Only set in the API response.'),
+    'message': fields.String(description='A message detailing the overall edit result. Only set in the API response.')
 })
 
 
 def edit(parsed_query: Dict[str, Any], logger, mongo_update: Dict[str, Any] = None, re_index=True) -> List[str]:
     # get all calculations that have to change
-    search_request = search.SearchRequest()
-    add_query(search_request, parsed_query)
-    upload_ids = set()
-    calc_ids = []
-    for hit in search_request.execute_scan():
-        calc_ids.append(hit['calc_id'])
-        upload_ids.add(hit['upload_id'])
+    with utils.timer(logger, 'edit query executed'):
+        search_request = search.SearchRequest()
+        add_query(search_request, parsed_query)
+        upload_ids = set()
+        calc_ids = []
+        for hit in search_request.execute_scan():
+            calc_ids.append(hit['calc_id'])
+            upload_ids.add(hit['upload_id'])
 
     # perform the update on the mongo db
-    if mongo_update is not None:
-        n_updated = proc.Calc.objects(calc_id__in=calc_ids).update(multi=True, **mongo_update)
-        if n_updated != len(calc_ids):
-            logger.error('edit repo did not update all entries', payload=mongo_update)
+    with utils.timer(logger, 'edit mongo update executed', size=len(calc_ids)):
+        if mongo_update is not None:
+            n_updated = proc.Calc.objects(calc_id__in=calc_ids).update(multi=True, **mongo_update)
+            if n_updated != len(calc_ids):
+                logger.error('edit repo did not update all entries', payload=mongo_update)
 
     # re-index the affected entries in elastic search
-    if re_index:
-        def elastic_updates():
-            for calc in proc.Calc.objects(calc_id__in=calc_ids):
-                entry = search.Entry.from_calc_with_metadata(
-                    datamodel.CalcWithMetadata(**calc['metadata']))
-                entry = entry.to_dict(include_meta=True)
-                entry['_op_type'] = 'index'
-                yield entry
-
-        _, failed = elasticsearch.helpers.bulk(
-            infrastructure.elastic_client, elastic_updates(), stats_only=True)
-        search.refresh()
-        if failed > 0:
-            logger.error(
-                'edit repo with failed elastic updates',
-                payload=mongo_update, nfailed=len(failed))
+    with utils.timer(logger, 'edit elastic update executed', size=len(calc_ids)):
+        if re_index:
+            def elastic_updates():
+                for calc in proc.Calc.objects(calc_id__in=calc_ids):
+                    entry = search.Entry.from_calc_with_metadata(
+                        datamodel.CalcWithMetadata(**calc['metadata']))
+                    entry = entry.to_dict(include_meta=True)
+                    entry['_op_type'] = 'index'
+                    yield entry
+
+            _, failed = elasticsearch.helpers.bulk(
+                infrastructure.elastic_client, elastic_updates(), stats_only=True)
+            search.refresh()
+            if failed > 0:
+                logger.error(
+                    'edit repo with failed elastic updates',
+                    payload=mongo_update, nfailed=failed)
 
     return list(upload_ids)
 
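For context on the re-index step above: with stats_only=True,
elasticsearch.helpers.bulk returns a pair of plain integers (number of
successes, number of failures), which is why the failure count can be logged
directly. A self-contained sketch of the same pattern, with a hypothetical
reindex helper name:

    import elasticsearch
    import elasticsearch.helpers

    def reindex(client: elasticsearch.Elasticsearch, entries) -> None:
        # entries is an iterable of index actions (dicts carrying '_op_type');
        # stats_only=True makes the second return value a failure count (int)
        _, failed = elasticsearch.helpers.bulk(client, entries, stats_only=True)
        if failed > 0:
            raise RuntimeError('%d elastic updates failed' % failed)
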
@@ -431,91 +437,127 @@ class EditRepoCalcsResource(Resource):
         parsed_query['owner'] = owner
 
         # checking the edit actions and preparing a mongo update on the fly
+        json_data['success'] = True
         mongo_update = {}
         uploader_ids = None
         lift_embargo = False
-        for action_quantity_name, quantity_actions in actions.items():
-            quantity = UserMetadata.m_def.all_quantities.get(action_quantity_name)
-            if quantity is None:
-                abort(400, 'Unknown quantity %s' % action_quantity_name)
-
-            quantity_flask = quantity.m_x('flask', {})
-            if quantity_flask.get('admin_only', False):
-                if not g.user.is_admin():
-                    abort(404, 'Only the admin user can set %s' % quantity.name)
-
-            if isinstance(quantity_actions, list) == quantity.is_scalar:
-                abort(400, 'Wrong shape for quantity %s' % action_quantity_name)
-
-            if not isinstance(quantity_actions, list):
-                quantity_actions = [quantity_actions]
-
-            flask_verify = quantity_flask.get('verify', None)
-            mongo_key = 'metadata__%s' % quantity.name
-            has_error = False
-            for action in quantity_actions:
-                action['success'] = True
-                action['message'] = None
-                action_value = action.get('value')
-                action_value = action_value if action_value is None else action_value.strip()
-
-                if action_value is None:
-                    mongo_value = None
-
-                elif action_value == '':
-                    mongo_value = None
-
-                elif flask_verify == datamodel.User:
-                    try:
-                        mongo_value = User.get(user_id=action_value).user_id
-                    except KeyError:
-                        action['success'] = False
-                        has_error = True
-                        action['message'] = 'User does not exist'
-                        continue
+        removed_datasets = None
+
+        with utils.timer(logger, 'edit verified'):
+            for action_quantity_name, quantity_actions in actions.items():
+                quantity = UserMetadata.m_def.all_quantities.get(action_quantity_name)
+                if quantity is None:
+                    abort(400, 'Unknown quantity %s' % action_quantity_name)
+
+                quantity_flask = quantity.m_x('flask', {})
+                if quantity_flask.get('admin_only', False):
+                    if not g.user.is_admin():
+                        abort(404, 'Only the admin user can set %s' % quantity.name)
+
+                if isinstance(quantity_actions, list) == quantity.is_scalar:
+                    abort(400, 'Wrong shape for quantity %s' % action_quantity_name)
+
+                if not isinstance(quantity_actions, list):
+                    quantity_actions = [quantity_actions]
+
+                flask_verify = quantity_flask.get('verify', None)
+                mongo_key = 'metadata__%s' % quantity.name
+                has_error = False
+                for action in quantity_actions:
+                    action['success'] = True
+                    action['message'] = None
+                    action_value = action.get('value')
+                    action_value = action_value if action_value is None else action_value.strip()
+
+                    if action_value is None:
+                        mongo_value = None
 
-                    if uploader_ids is None:
-                        uploader_ids = get_uploader_ids(parsed_query)
-                    if action_value in uploader_ids:
-                        action['success'] = False
-                        has_error = True
-                        action['message'] = 'This user is already an uploader of one entry in the query'
-                        continue
-
-                elif flask_verify == datamodel.Dataset:
-                    try:
-                        mongo_value = Dataset.m_def.m_x('me').get(
-                            user_id=g.user.user_id, name=action_value).dataset_id
-                    except KeyError:
-                        action['message'] = 'Dataset does not exist and will be created'
+                    elif action_value == '':
                         mongo_value = None
-                        if not verify:
-                            dataset = Dataset(
-                                dataset_id=utils.create_uuid(), user_id=g.user.user_id,
-                                name=action_value)
-                            dataset.m_x('me').create()
-                            mongo_value = dataset.dataset_id
-                elif action_quantity_name == 'with_embargo':
-                    # ignore the actual value ... just lift the embargo
-                    mongo_value = False
-                    lift_embargo = True
-                else:
-                    mongo_value = action_value
-
-                if len(quantity.shape) == 0:
-                    mongo_update[mongo_key] = mongo_value
-                else:
-                    mongo_values = mongo_update.setdefault(mongo_key, [])
-                    if mongo_value is not None:
-                        if mongo_value in mongo_values:
+
+                    elif flask_verify == datamodel.User:
+                        try:
+                            mongo_value = User.get(user_id=action_value).user_id
+                        except KeyError:
                             action['success'] = False
                             has_error = True
-                            action['message'] = 'Duplicate values are not allowed'
+                            action['message'] = 'User does not exist'
                             continue
-                        mongo_values.append(mongo_value)
 
-            if len(quantity_actions) == 0 and len(quantity.shape) > 0:
-                mongo_update[mongo_key] = []
+                        if uploader_ids is None:
+                            uploader_ids = get_uploader_ids(parsed_query)
+                        if action_value in uploader_ids:
+                            action['success'] = False
+                            has_error = True
+                            action['message'] = 'This user is already an uploader of one entry in the query'
+                            continue
+
+                    elif flask_verify == datamodel.Dataset:
+                        try:
+                            mongo_value = Dataset.m_def.m_x('me').get(
+                                user_id=g.user.user_id, name=action_value).dataset_id
+                        except KeyError:
+                            action['message'] = 'Dataset does not exist and will be created'
+                            mongo_value = None
+                            if not verify:
+                                dataset = Dataset(
+                                    dataset_id=utils.create_uuid(), user_id=g.user.user_id,
+                                    name=action_value)
+                                dataset.m_x('me').create()
+                                mongo_value = dataset.dataset_id
+
+                    elif action_quantity_name == 'with_embargo':
+                        # ignore the actual value ... just lift the embargo
+                        mongo_value = False
+                        lift_embargo = True
+
+                        # check whether there actually is an embargo to lift
+                        search_request = search.SearchRequest()
+                        add_query(search_request, parsed_query)
+                        search_request.q = search_request.q & Q('term', with_embargo=True)
+                        if search_request.execute()['total'] == 0:
+                            action['success'] = False
+                            has_error = True
+                            action['message'] = 'There is no embargo to lift'
+                            continue
+                    else:
+                        mongo_value = action_value
+
+                    if len(quantity.shape) == 0:
+                        mongo_update[mongo_key] = mongo_value
+                    else:
+                        mongo_values = mongo_update.setdefault(mongo_key, [])
+                        if mongo_value is not None:
+                            if mongo_value in mongo_values:
+                                action['success'] = False
+                                has_error = True
+                                action['message'] = 'Duplicate values are not allowed'
+                                continue
+                            mongo_values.append(mongo_value)
+
+                if len(quantity_actions) == 0 and len(quantity.shape) > 0:
+                    mongo_update[mongo_key] = []
+
+                if action_quantity_name == 'datasets':
+                    # check if datasets edit is allowed and if datasets have to be removed
+                    search_request = search.SearchRequest()
+                    add_query(search_request, parsed_query)
+                    search_request.quantity(name='dataset_id')
+                    old_datasets = list(
+                        search_request.execute()['quantities']['dataset_id']['values'].keys())
+
+                    removed_datasets = []
+                    for dataset_id in old_datasets:
+                        if dataset_id not in mongo_update.get(mongo_key, []):
+                            removed_datasets.append(dataset_id)
+
+                    doi_ds = Dataset.m_def.m_x('me').objects(
+                        dataset_id__in=removed_datasets, doi__ne=None).first()
+                    if doi_ds is not None:
+                        json_data['success'] = False
+                        json_data['message'] = json_data.get('message', '') + \
+                            'Edit would remove entries from a dataset with DOI (%s) ' % doi_ds.name
+                        has_error = True
 
         # stop here, if client just wants to verify its actions
         if verify:
@@ -534,6 +576,10 @@ class EditRepoCalcsResource(Resource):
                 upload = proc.Upload.get(upload_id)
                 upload.re_pack()
 
+        # remove old datasets
+        if removed_datasets is not None:
+            Dataset.m_def.m_x('me').objects(dataset_id__in=removed_datasets).delete()
+
         return json_data, 200
 
 
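The with_embargo branch above guards against no-op edits by ANDing the user's
query with an extra term filter before anything is committed. A minimal
elasticsearch_dsl sketch of that narrowing, assuming an index with a boolean
with_embargo field:

    from elasticsearch_dsl import Q, Search

    def has_embargoed_entries(search: Search) -> bool:
        # restrict the user's query to entries that actually carry an embargo;
        # zero hits means there is nothing to lift and the edit is rejected
        return search.query(Q('term', with_embargo=True)).count() > 0
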
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index 39f6950d8d..1e534737e2 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -74,7 +74,9 @@ class Calc(Proc):
             ('upload_id', 'parser'),
             ('upload_id', 'tasks_status'),
             ('upload_id', 'process_status'),
-            ('upload_id', 'metadata.nomad_version')
+            ('upload_id', 'metadata.nomad_version'),
+            'metadata.published',
+            'metadata.datasets'
         ]
     }
 
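The two new single-field indexes back the queries this patch introduces: the
DOI checks filter Calc on metadata.datasets and project metadata.published. In
mongoengine, such indexes are declared in the document's meta dict; a
trimmed-down sketch (the real Calc derives from Proc and has more fields):

    from mongoengine import Document, DictField, StringField

    class Calc(Document):
        upload_id = StringField()
        parser = StringField()
        metadata = DictField()

        meta = {
            'indexes': [
                ('upload_id', 'parser'),  # tuple -> compound index
                'metadata.published',     # string -> single-field index
                'metadata.datasets'       # backs the metadata__datasets filters
            ]
        }
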
diff --git a/tests/app/test_api.py b/tests/app/test_api.py
index 856afb3545..b4e83b0ed3 100644
--- a/tests/app/test_api.py
+++ b/tests/app/test_api.py
@@ -36,6 +36,7 @@ from tests.test_files import example_file, example_file_mainfile, example_file_c
 from tests.test_files import create_staging_upload, create_public_upload, assert_upload_files
 from tests.test_search import assert_search_upload
 from tests.processing import test_data as test_processing
+from tests.utils import assert_exception
 
 from tests.app.test_app import BlueprintClient
 
@@ -1189,6 +1190,28 @@ class TestEditRepo():
         self.assert_edit(rv, quantity='datasets', success=True, message=False)
         assert self.mongo(1, datasets=[self.example_dataset.dataset_id])
 
+    def test_edit_ds_remove_doi(self):
+        rv = self.perform_edit(
+            datasets=[self.example_dataset.name], query=dict(upload_id='upload_1'))
+        assert rv.status_code == 200
+        rv = self.api.post('/datasets/%s' % self.example_dataset.name, headers=self.test_user_auth)
+        assert rv.status_code == 200
+        rv = self.perform_edit(datasets=[], query=dict(upload_id='upload_1'))
+        assert rv.status_code == 400
+        data = json.loads(rv.data)
+        assert not data['success']
+        assert self.example_dataset.name in data['message']
+        assert Dataset.m_def.m_x('me').get(dataset_id=self.example_dataset.dataset_id) is not None
+
+    def test_edit_ds_remove(self):
+        rv = self.perform_edit(
+            datasets=[self.example_dataset.name], query=dict(upload_id='upload_1'))
+        assert rv.status_code == 200
+        rv = self.perform_edit(datasets=[], query=dict(upload_id='upload_1'))
+        assert rv.status_code == 200
+        with assert_exception(KeyError):
+            Dataset.m_def.m_x('me').get(dataset_id=self.example_dataset.dataset_id)
+
     def test_edit_ds_user_namespace(self, test_user):
         assert Dataset.m_def.m_x('me').objects(
             name=self.other_example_dataset.name).first() is not None
@@ -1246,6 +1269,24 @@ def test_edit_lift_embargo(api, published, other_test_user_auth):
         f.read()
 
 
+@pytest.mark.timeout(config.tests.default_timeout)
+def test_edit_lift_embargo_unnecessary(api, published_wo_user_metadata, other_test_user_auth):
+    example_calc = Calc.objects(upload_id=published_wo_user_metadata.upload_id).first()
+    assert not example_calc.metadata['with_embargo']
+    rv = api.post(
+        '/repo/edit', headers=other_test_user_auth, content_type='application/json',
+        data=json.dumps({
+            'actions': {
+                'with_embargo': {
+                    'value': 'lift'
+                }
+            }
+        }))
+    assert rv.status_code == 400
+    data = json.loads(rv.data)
+    assert not data['actions']['with_embargo']['success']
+
+
 class TestRaw(UploadFilesBasedTests):
 
     def assert_zip_file(self, rv, files: int = -1, basename: bool = None):
@@ -1607,7 +1648,20 @@ class TestDataset:
         self.assert_dataset(data, name='ds1', doi=True)
         self.assert_dataset_entry(api, '1', True, True, headers=test_user_auth)
 
-    def test_resolve_doi(self, api, example_dataset_with_entry):
+    def test_assign_doi_empty(self, api, test_user_auth, example_datasets):
+        rv = api.post('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 400
+
+    def test_assign_doi_unpublished(self, api, test_user_auth, example_datasets):
+        calc = CalcWithMetadata(
+            calc_id='1', upload_id='1', published=False, with_embargo=False, datasets=['1'])
+        Calc(
+            calc_id='1', upload_id='1', create_time=datetime.datetime.now(),
+            metadata=calc.to_dict()).save()
+        rv = api.post('/datasets/ds1', headers=test_user_auth)
+        assert rv.status_code == 400
+
+    def test_resolve_doi(self, api, example_datasets):
         rv = api.get('/datasets/doi/test_doi')
         assert rv.status_code == 200
         data = json.loads(rv.data)
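
test_edit_ds_remove above relies on assert_exception from tests.utils (imported
at the top of this file). The helper itself is not part of this patch;
presumably it is a small context manager along these lines (a sketch, not the
actual implementation):

    from contextlib import contextmanager

    @contextmanager
    def assert_exception(exception_cls=Exception):
        # fail the test unless the with-block raises the expected exception
        try:
            yield
        except exception_cls:
            return
        assert False, 'expected %s to be raised' % exception_cls.__name__
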
diff --git a/tests/conftest.py b/tests/conftest.py
index 96bf7d06d3..17bfbba924 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -581,6 +581,20 @@ def published(non_empty_processed: processing.Upload, example_user_metadata) ->
     return non_empty_processed
 
 
+@pytest.fixture(scope='function')
+def published_wo_user_metadata(non_empty_processed: processing.Upload) -> processing.Upload:
+    """
+    Provides a published upload without user metadata. Uploaded with test_user.
+    """
+    non_empty_processed.publish_upload()
+    try:
+        non_empty_processed.block_until_complete(interval=.01)
+    except Exception:
+        pass
+
+    return non_empty_processed
+
+
 @pytest.fixture
 def reset_config():
     """ Fixture that resets the log-level after test. """
-- 
GitLab