From 24d4f5dbad43df886bb5705f418fb3829918514f Mon Sep 17 00:00:00 2001
From: David Sikter <david.sikter@physik.hu-berlin.de>
Date: Tue, 21 Sep 2021 11:59:35 +0200
Subject: [PATCH] Remove redundant storage of the published and with_embargo fields

The published and with_embargo flags were stored per entry, in the mongo
metadata of each Calc, duplicating state that is fully determined by the
owning upload. Derive them from Upload.published and Upload.embargo_length
instead, so that the upload record is the single source of truth.
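
A sketch of the resulting derivation (names follow the
nomad/processing/data.py hunks below; derive_entry_flags itself is
illustrative and not part of the patch):

    # The upload is the single source of truth for publication/embargo state;
    # entries carry these flags only transiently, for packing and indexing.
    def derive_entry_flags(upload):
        published = upload.published
        with_embargo = upload.embargo_length > 0
        return published, with_embargo
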
---
 nomad/app/flask/api/dataset.py       |  3 ++-
 nomad/cli/admin/admin.py             |  4 ++--
 nomad/cli/client/update_database.py  |  3 +--
 nomad/datamodel/datamodel.py         |  4 ++--
 nomad/files.py                       | 19 ++++------------
 nomad/processing/data.py             | 34 +++++-----------------------
 tests/app/flask/test_api.py          | 19 +++-------------
 tests/app/v1/routers/test_uploads.py |  4 ++--
 tests/processing/test_data.py        | 14 ++++--------
 tests/test_cli.py                    | 10 ++++----
 tests/utils.py                       |  5 ++--
 11 files changed, 36 insertions(+), 83 deletions(-)

diff --git a/nomad/app/flask/api/dataset.py b/nomad/app/flask/api/dataset.py
index 7f667c800c..07fdd3d49f 100644
--- a/nomad/app/flask/api/dataset.py
+++ b/nomad/app/flask/api/dataset.py
@@ -140,7 +140,8 @@ class DatasetResource(Resource):
             abort(400, 'Dataset with name %s already has a DOI' % name)
 
         # check if the DOI can be created
-        published_values = proc.Calc.objects(metadata__datasets=result.dataset_id).distinct('metadata.published')
+        upload_ids = proc.Calc.objects(metadata__datasets=result.dataset_id).distinct('upload_id')
+        published_values = proc.Upload.objects(upload_id__in=upload_ids).distinct('published')
 
         if False in published_values:
             abort(400, 'Dataset must not contain unpublished entries.')
diff --git a/nomad/cli/admin/admin.py b/nomad/cli/admin/admin.py
index 690a70d8c7..a2f09cb1c3 100644
--- a/nomad/cli/admin/admin.py
+++ b/nomad/cli/admin/admin.py
@@ -319,8 +319,8 @@ def index_materials(threads, code, dry, in_place, n, source):
                     calc.calc_id = metadata["calc_id"]
                     calc.upload_id = metadata["upload_id"]
                     mongo_calc = proc.Calc.get(calc.calc_id)
-                    calc.published = mongo_calc["metadata"]["published"]
-                    calc.with_embargo = mongo_calc["metadata"]["with_embargo"]
+                    calc.published = mongo_calc.upload.published
+                    calc.with_embargo = mongo_calc.upload.embargo_length > 0
                     calc.owners = [mongo_calc.upload.user_id] + mongo_calc["metadata"]["shared_with"]
                     enc_idealized_structure = encyclopedia.material.idealized_structure
                     idealized_structure = IdealizedStructure()
diff --git a/nomad/cli/client/update_database.py b/nomad/cli/client/update_database.py
index 41b19f6bad..9ca21d1bda 100644
--- a/nomad/cli/client/update_database.py
+++ b/nomad/cli/client/update_database.py
@@ -356,7 +356,7 @@ class DbUpdater:
         upload_id = None
 
         for entry in entries:
-            if entry['name'] == uploadname:
+            if entry['upload_name'] == uploadname:
                 status = 'uploaded'
                 if entry['published']:
                     status = 'published'
@@ -370,7 +370,6 @@ class DbUpdater:
             return dict(
                 operation='publish',
                 metadata=dict(
-                    with_embargo=False,
                     comment='',
                     references=[
                         'http://www.sciencedirect.com/science/article/pii/S0927025614003322',
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index cfa629e1d8..22ab1df89a 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -478,7 +478,7 @@ class EntryMetadata(metainfo.MSection):
     published = metainfo.Quantity(
         type=bool, default=False,
         description='Indicates if the entry is published',
-        categories=[MongoMetadata, OasisMetadata],
+        categories=[OasisMetadata],
         a_search=Search(), a_elasticsearch=Elasticsearch(material_entry_type))
 
     processed = metainfo.Quantity(
@@ -602,7 +602,7 @@ class EntryMetadata(metainfo.MSection):
         categories=[MongoMetadata, EditableUserMetadata])
 
     with_embargo = metainfo.Quantity(
-        type=bool, default=False, categories=[MongoMetadata],
+        type=bool, default=False,
         description='Indicates if this entry is under an embargo',
         a_search=Search(), a_elasticsearch=Elasticsearch(material_entry_type))
 
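
For context: the MongoMetadata category selects which quantities are persisted
to mongo, and the bulk update removed from nomad/processing/data.py (below)
serialized entries with exactly this filter. A minimal sketch of the mechanism,
assuming an EntryMetadata instance like those constructed in the tests below:

    # Only quantities categorized as MongoMetadata end up in the mongo document;
    # dropping the category from published/with_embargo removes them from mongo,
    # while their Elasticsearch annotations keep them searchable.
    mongo_dict = entry_metadata.m_to_dict(
        include_defaults=True, categories=[datamodel.MongoMetadata])
    assert 'published' not in mongo_dict
    assert 'with_embargo' not in mongo_dict
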
diff --git a/nomad/files.py b/nomad/files.py
index c62638ad06..8161c4a411 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -28,11 +28,11 @@ almost readonly (beside metadata) storage.
 .. code-block:: sh
 
     fs/staging/<upload>/raw/**
-                       /archive/<calc>.json
-    fs/public/<upload>/raw-public.plain.zip
-                      /raw-restricted.plain.zip
-                      /archive-public.json.zip
-                      /archive-restricted.json.zip
+                       /archive/<calc>.msg
+    fs/public/<upload>/raw-{access}.plain.zip
+                      /archive-{access}.msg.msg
+
+Here, `access` is either "public" (non-embargoed) or "restricted" (embargoed).
 
 There is an implicit relationship between files, based on them being in the same
 directory. Each directory with at least one *mainfile* is a *calculation directory*
@@ -41,15 +41,6 @@ respective files actually contributing data or not. A *calculation directory* mi
 contain multiple *mainfiles*, e.g., when a user simulated multiple states of the same
 system or based one calculation on another. In this case, the other *mainfiles* are
 *aux* files to the original *mainfile* and vice versa.
-
-Published files are kept in pairs of public and restricted files. Here the multiple *mainfiles*
-per directory provides a dilemma. If on *mainfile* is restricted, all its *aux* files
-should be restricted too. But if one of the *aux* files is actually a *mainfile* it
-might be published!
-
-There are multiple ways to solve this. Due to the rarity of the case, we take the
-most simple solution: if one file is public, all files are made public, execpt those
-being other mainfiles. Therefore, the aux files of a restricted calc might become public!
 '''
 
 from abc import ABCMeta
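
To make the naming scheme above concrete, a hypothetical helper (illustrative
only; the access label follows the embargo state, as the re-pack tests below
also assume):

    # "restricted" while an embargo is active, "public" otherwise.
    def packed_file_names(embargo_length: int) -> list:
        access = 'restricted' if embargo_length > 0 else 'public'
        return [f'raw-{access}.plain.zip', f'archive-{access}.msg.msg']
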
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index 09d0c0e3c2..b9c3f5ce5b 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -171,7 +171,6 @@ class Calc(Proc):
             ('upload_id', 'metadata.nomad_version'),
             'metadata.processed',
             'metadata.last_processing',
-            'metadata.published',
             'metadata.datasets',
             'metadata.pid'
         ]
@@ -297,6 +296,7 @@ class Calc(Proc):
         entry_metadata.uploader = upload.user_id
         entry_metadata.upload_time = upload.upload_time
         entry_metadata.upload_name = upload.name
+        entry_metadata.published = upload.published
         entry_metadata.with_embargo = (upload.embargo_length > 0)
         # Entry metadata
         if self.parser is not None:
@@ -961,28 +961,13 @@ class Upload(Proc):
             self.embargo_length = embargo_length
 
         with utils.lnr(logger, 'publish failed'):
-            with self.entries_metadata() as calcs:
-
-                with utils.timer(logger, 'upload metadata updated'):
-                    def create_update(calc):
-                        calc.published = True
-                        if embargo_length is not None:
-                            calc.with_embargo = (embargo_length > 0)
-                        else:
-                            assert calc.with_embargo is not None, 'with_embargo flag is None'
-                        return UpdateOne(
-                            {'_id': calc.calc_id},
-                            {'$set': {'metadata': calc.m_to_dict(
-                                include_defaults=True, categories=[datamodel.MongoMetadata])}})
-
-                    Calc._get_collection().bulk_write([create_update(calc) for calc in calcs])
-
+            with self.entries_metadata() as entries:
                 if isinstance(self.upload_files, StagingUploadFiles):
                     with utils.timer(logger, 'staged upload files packed'):
-                        self.staging_upload_files.pack(calcs, with_embargo=(self.embargo_length > 0))
+                        self.staging_upload_files.pack(entries, with_embargo=(self.embargo_length > 0))
 
                 with utils.timer(logger, 'index updated'):
-                    search.publish(calcs)
+                    search.publish(entries)
 
                 if isinstance(self.upload_files, StagingUploadFiles):
                     with utils.timer(logger, 'upload staging files deleted'):
@@ -1660,7 +1645,6 @@ class Upload(Proc):
             if self.published and (self.embargo_length > 0) != (upload_metadata.embargo_length > 0):
                 need_to_repack = True
                 need_to_reindex = True
-                new_entry_metadata['with_embargo'] = (upload_metadata.embargo_length > 0)
             self.embargo_length = upload_metadata.embargo_length
         if upload_metadata.uploader is not None:
             self.user_id = upload_metadata.uploader.user_id
@@ -1828,7 +1812,7 @@ class Upload(Proc):
             required_keys_entry_level = (
                 '_id', 'upload_id', 'mainfile', 'parser', 'process_status', 'create_time', 'metadata')
             required_keys_entry_metadata = (
-                'upload_time', 'published', 'with_embargo', 'calc_hash')
+                'upload_time', 'calc_hash')
             required_keys_datasets = (
                 'dataset_id', 'name', 'user_id')
 
@@ -1927,15 +1911,9 @@ class Upload(Proc):
                     'Mismatching upload_id in entry definition')
                 assert entry_dict['_id'] == generate_entry_id(self.upload_id, entry_dict['mainfile']), (
                     'Provided entry id does not match generated value')
-                for k, v in (
-                        ('published', self.published),
-                        ('with_embargo', self.embargo_length > 0)):
-                    assert entry_metadata_dict.get(k) == v, f'Inconsistent entry metadata: {k}'
                 check_user_ids(entry_dict.get('coauthors', []), 'Invalid coauthor reference: {id}')
                 check_user_ids(entry_dict.get('shared_with', []), 'Invalid shared_with reference: {id}')
-                if embargo_length is not None:
-                    # Update the embargo flag on the entry level
-                    entry_metadata_dict['with_embargo'] = (embargo_length > 0)
+
                 # Instantiate an entry object from the json, and validate it
                 entry_keys_to_copy = (
                     'upload_id', 'mainfile', 'parser', 'metadata', 'errors', 'warnings',
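
The resulting publish flow, condensed (names as in the publish hunk above;
timers, logging, and the final staging cleanup omitted):

    # No per-entry mongo bulk update remains; packing and indexing derive
    # access state purely from the upload's embargo_length.
    with self.entries_metadata() as entries:
        if isinstance(self.upload_files, StagingUploadFiles):
            self.staging_upload_files.pack(
                entries, with_embargo=(self.embargo_length > 0))
        search.publish(entries)
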
diff --git a/tests/app/flask/test_api.py b/tests/app/flask/test_api.py
index 5af9358dcb..ef7d098a6b 100644
--- a/tests/app/flask/test_api.py
+++ b/tests/app/flask/test_api.py
@@ -2172,19 +2172,6 @@ class TestDataset:
         assert files.UploadFiles.get('1') is not None
         assert example_entry.metadata.datasets[0].dataset_id == '1'
 
-        # entry_archive = EntryArchive()
-        # entry_metadata = entry_archive.m_create(
-        #     EntryMetadata,
-        #     domain='dft', calc_id='1', upload_id='1', published=True, with_embargo=False,
-        #     datasets=['1'])
-        # Calc(
-        #     calc_id='1', upload_id='1', create_time=datetime.datetime.now(),
-        #     metadata=entry_metadata.m_to_dict()).save()
-        # upload_files = files.StagingUploadFiles(upload_id='1', create=True)
-        # upload_files.write_archive('1', dict(metadata=entry_metadata.m_to_dict()))
-        # upload_files.close()
-        # index(entry_archive)
-
     def test_delete_dataset(self, api, test_user_auth, example_dataset_with_entry):
         rv = api.delete('/datasets/ds1', headers=test_user_auth)
         assert rv.status_code == 200
@@ -2217,9 +2204,9 @@ class TestDataset:
         entry_metadata = EntryMetadata(
             domain='dft', calc_id='1', upload_id='1', published=False, with_embargo=False,
             datasets=['1'])
-        Calc(
-            calc_id='1', upload_id='1', create_time=datetime.datetime.now(),
-            metadata=entry_metadata.m_to_dict()).save()
+        entry = Calc(calc_id='1', upload_id='1', create_time=datetime.datetime.now())
+        entry._apply_metadata_to_mongo_entry(entry_metadata)
+        entry.save()
         rv = api.post('/datasets/ds1', headers=test_user_auth)
         assert rv.status_code == 400
 
diff --git a/tests/app/v1/routers/test_uploads.py b/tests/app/v1/routers/test_uploads.py
index 3d3c3e1e13..8a62bdff3f 100644
--- a/tests/app/v1/routers/test_uploads.py
+++ b/tests/app/v1/routers/test_uploads.py
@@ -311,7 +311,7 @@ def get_upload_entries_metadata(entries: List[Dict[str, Any]]) -> Iterable[Entry
     return [
         EntryMetadata(
             domain='dft', calc_id=entry['entry_id'], mainfile=entry['mainfile'],
-            with_embargo=Calc.get(entry['entry_id']).metadata.get('with_embargo'))
+            with_embargo=Upload.get(entry['upload_id']).embargo_length > 0)
         for entry in entries]
 
 
@@ -985,7 +985,7 @@ def test_put_upload_metadata(
         pass
 
     if upload_id == 'id_published_w':
-        assert Calc.get('id_published_w_entry').metadata['with_embargo']
+        assert Upload.get(upload_id).embargo_length > 0
         es_data = search(owner=None, query=dict(entry_id='id_published_w_entry')).data[0]
         assert es_data['with_embargo']
 
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 9a77ee612b..8b1cb97afc 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -95,7 +95,6 @@ def assert_processing(upload: Upload, published: bool = False, process='process_
         assert calc.parser is not None
         assert calc.mainfile is not None
         assert calc.process_status == ProcessStatus.SUCCESS
-        assert calc.metadata['published'] == published
 
         with upload_files.read_archive(calc.calc_id) as archive:
             calc_archive = archive[calc.calc_id]
@@ -283,7 +282,6 @@ def test_oasis_upload_processing(proc_infra, oasis_example_uploaded: Tuple[str,
     assert_processing(upload, published=True)
     calc = Calc.objects(upload_id='oasis_upload_id').first()
     assert calc.calc_id == 'test_calc_id'
-    assert calc.metadata['published']
     assert calc.metadata['datasets'] == ['oasis_dataset_1', 'cn_dataset_2']
 
 
@@ -527,20 +525,18 @@ def test_re_process_match(non_empty_processed, published, monkeypatch, no_warn):
 
     assert upload.total_calcs == 2
     if not published:
-        for calc in upload.calcs:
-            assert calc.metadata['published'] == published
-            assert not calc.metadata['with_embargo']
+        assert upload.published == published
+        assert upload.embargo_length == 0
 
 
 def test_re_pack(published: Upload):
     upload_id = published.upload_id
     upload_files: PublicUploadFiles = published.upload_files  # type: ignore
     assert upload_files.access == 'restricted'
-    # Lift embargo
+    assert published.embargo_length > 0
     calc = Calc.objects(upload_id=upload_id).first()
-    assert calc.metadata['with_embargo']
-    calc.metadata['with_embargo'] = False
-    calc.save()
+
+    # Lift embargo
     published.embargo_length = 0
     published.save()
     upload_files.re_pack(with_embargo=False)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d0926848a3..2a7a9e2c63 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -105,7 +105,7 @@ class TestAdmin:
         calc = Calc.objects(upload_id=upload_id).first()
 
         assert published.upload_files.exists()
-        assert calc.metadata['with_embargo']
+        assert published.embargo_length > 0
         assert search.SearchRequest().owner('public').search_parameter('upload_id', upload_id).execute()['total'] == 0
 
         result = click.testing.CliRunner().invoke(
@@ -114,7 +114,7 @@ class TestAdmin:
 
         assert result.exit_code == 0
         published.block_until_complete()
-        assert not Calc.objects(upload_id=upload_id).first().metadata['with_embargo'] == lifted
+        assert (published.embargo_length > 0) != lifted
         assert (search.SearchRequest().owner('public').search_parameter('upload_id', upload_id).execute()['total'] > 0) == lifted
         if lifted:
             with files.UploadFiles.get(upload_id=upload_id).read_archive(calc_id=calc.calc_id) as archive:
@@ -242,9 +242,9 @@ class TestAdminUploads:
     def test_re_pack(self, published, monkeypatch):
         upload_id = published.upload_id
         calc = Calc.objects(upload_id=upload_id).first()
-        assert calc.metadata['with_embargo']
-        calc.metadata['with_embargo'] = False
-        calc.save()
+        assert published.embargo_length > 0
+        published.embargo_length = 0
+        published.save()
 
         result = click.testing.CliRunner().invoke(
             cli, ['admin', 'uploads', 're-pack', '--parallel', '2', upload_id], catch_exceptions=False)
diff --git a/tests/utils.py b/tests/utils.py
index ae638000ad..8bb1c4cc4e 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -166,6 +166,7 @@ class ExampleData:
                 if upload_id in self.uploads:
                     assert embargo_length is not None, 'No embargo provided on upload'
                     assert (embargo_length > 0) == with_embargo, 'Inconsistent embargo'
+                    assert published == self.uploads[upload_id]['published'], 'Inconsistent published state'
                 else:
                     # No uploads created. Just generate it
                     embargo_length = 36 if with_embargo else 0
@@ -300,9 +301,9 @@ class ExampleData:
             calc_hash='dummy_hash_' + entry_id,
             domain='dft',
             upload_time=upload_time,
-            published=True,
             processed=True,
-            with_embargo=False,
+            published=self.uploads.get(upload_id, {}).get('published', True),
+            with_embargo=self.uploads.get(upload_id, {}).get('embargo_length', 0) > 0,
             parser_name='parsers/vasp')
         entry_metadata.m_update(**self.entry_defaults)
         entry_metadata.m_update(**kwargs)
-- 
GitLab