diff --git a/nomad/app/v1/routers/datasets.py b/nomad/app/v1/routers/datasets.py
index d65e6483cb93c112f70bcc5febd3685d70f4ddd5..7cc031d955a07153eb3b8b9d7392326f7c4d74e0 100644
--- a/nomad/app/v1/routers/datasets.py
+++ b/nomad/app/v1/routers/datasets.py
@@ -250,7 +250,7 @@ async def post_datasets(
 
     if not empty:
         processing.Calc._get_collection().update_many(
-            mongo_query, {'$push': {'metadata.datasets': dataset.dataset_id}})
+            mongo_query, {'$push': {'datasets': dataset.dataset_id}})
         update_by_query(
             '''
                 if (ctx._source.datasets == null) {
@@ -309,7 +309,7 @@ async def delete_dataset(
 
     if len(entry_ids) > 0:
         processing.Calc._get_collection().update_many(
-            mongo_query, {'$pull': {'metadata.datasets': dataset.dataset_id}})
+            mongo_query, {'$pull': {'datasets': dataset.dataset_id}})
         update_by_query(
             '''
                 int index = -1;
diff --git a/nomad/app/v1/routers/entries.py b/nomad/app/v1/routers/entries.py
index d846503f23b5e07536f79fefaba14a22d740a7f8..69834042b44b59e44a43c0b0428c1baf4dd71273 100644
--- a/nomad/app/v1/routers/entries.py
+++ b/nomad/app/v1/routers/entries.py
@@ -1327,7 +1327,7 @@ async def post_entry_metadata_edit(
     # remove potentially empty old datasets
     if removed_datasets is not None:
         for dataset in removed_datasets:
-            if proc.Calc.objects(metadata__datasets=dataset).first() is None:
+            if proc.Calc.objects(datasets=dataset).first() is None:
                 datamodel.Dataset.m_def.a_mongo.objects(dataset_id=dataset).delete()
 
     return data
diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index da80868ffb593f5210abb3f31c2b2e67e53d5bd2..52858ded2d88340bfd9f84de4e64ff60d71bffc5 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -574,7 +574,7 @@ class EntryMetadata(metainfo.MSection):
 
     datasets = metainfo.Quantity(
         type=dataset_reference, shape=['0..*'], default=[],
-        categories=[MongoMetadata, EditableUserMetadata],
+        categories=[MongoEntryMetadata, EditableUserMetadata],
         description='A list of user curated datasets this entry belongs to.',
         a_elasticsearch=Elasticsearch(material_entry_type))
 
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index 89a2e6b5775fde7823aa8868a3c97fa215c37efe..4fccacbc35f99ac519765b7f59faf64f22d35cff 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -173,6 +173,7 @@ class Calc(Proc):
         comment: a user provided comment for this entry
         references: user provided references (URLs) for this entry
         coauthors: a user provided list of co-authors
+        datasets: a list of user curated datasets this entry belongs to
         metadata: the metadata record wit calc and user metadata, see :class:`EntryMetadata`
     '''
 
@@ -193,6 +194,7 @@ class Calc(Proc):
     references = ListField(StringField(), default=None)
     coauthors = ListField(StringField(), default=None)
     shared_with = ListField(StringField(), default=None)
+    datasets = ListField(StringField(), default=None)
 
     metadata = DictField()  # Stores user provided metadata and system metadata (not archive metadata)
 
@@ -207,7 +209,7 @@ class Calc(Proc):
             ('upload_id', 'nomad_version'),
             'process_status',
             'last_processing_time',
-            'metadata.datasets',
+            'datasets',
             'pid'
         ]
     }
@@ -475,11 +477,9 @@ class Calc(Proc):
         if self.upload is None:
             logger.error('calculation upload does not exist')
 
-        has_previous_metadata = bool(self.metadata)
-
        # 1. Determine if we should parse or not
         self.set_process_step('Determining action')
-        if not self.upload.published or not has_previous_metadata:
+        if not self.upload.published or not self.nomad_version:
             should_parse = True
         else:
             # This entry has already been published and has metadata.
@@ -1693,13 +1693,12 @@ class Upload(Proc):
         # Handle datasets
         dataset_ids: Set[str] = set()
         for entry_dict in bundle_info['entries']:
-            entry_metadata = entry_dict['metadata']
-            entry_metadata_datasets = entry_metadata.get('datasets')
-            if entry_metadata_datasets:
+            entry_datasets = entry_dict.get('datasets')
+            if entry_datasets:
                 if not include_datasets:
-                    entry_metadata['datasets'] = []
+                    entry_dict['datasets'] = None
                 else:
-                    dataset_ids.update(entry_metadata_datasets)
+                    dataset_ids.update(entry_datasets)
         if include_datasets:
             bundle_info['datasets'] = [
                 datamodel.Dataset.m_def.a_mongo.get(dataset_id=dataset_id).m_to_dict()
@@ -1854,7 +1853,6 @@
             keys_exist(entry_dict, required_keys_entry_level, 'Missing key for entry: {key}')
             assert entry_dict['process_status'] in ProcessStatus.STATUSES_NOT_PROCESSING, (
                 f'Invalid entry `process_status`')
-            entry_metadata_dict = entry_dict['metadata']
             # Check referential consistency
             assert entry_dict['upload_id'] == self.upload_id, (
                 'Mismatching upload_id in entry definition')
@@ -1882,10 +1880,12 @@
             # Instantiate an EntryMetadata object to validate the format
             try:
                 if settings.include_datasets:
-                    entry.metadata['datasets'] = [
-                        dataset_id_mapping[id] for id in entry_metadata_dict.get('datasets', [])]
+                    entry_datasets = entry_dict.get('datasets')
+                    if entry_datasets:
+                        entry.datasets = [
+                            dataset_id_mapping[id] for id in entry_datasets] or None
                 else:
-                    entry.metadata['datasets'] = []
+                    entry.datasets = None
                 entry.mongo_metadata(self)
                 # TODO: if we don't import archive files, should we still index something in ES?
             except Exception as e:
diff --git a/tests/app/v1/routers/test_datasets.py b/tests/app/v1/routers/test_datasets.py
index 4630805eb9ddf1865d44ee3da216ef023236a7c3..efcf89f8ed9ee08ac3f301cc976e50964b91e152 100644
--- a/tests/app/v1/routers/test_datasets.py
+++ b/tests/app/v1/routers/test_datasets.py
@@ -152,7 +152,7 @@ def assert_dataset(dataset, query: Query = None, entries: List[str] = None, n_en
 
     expected_n_entries = n_entries if dataset['dataset_type'] == 'owned' else 0
     assert search_results.pagination.total == expected_n_entries
-    assert processing.Calc.objects(metadata__datasets=dataset_id).count() == expected_n_entries
+    assert processing.Calc.objects(datasets=dataset_id).count() == expected_n_entries
 
 
 def assert_dataset_deleted(dataset_id):
@@ -162,7 +162,7 @@ def assert_dataset_deleted(dataset_id):
     search_results = search(
         owner='admin', query={'datasets.dataset_id': dataset_id}, user_id=admin_user_id)
     assert search_results.pagination.total == 0
-    assert processing.Calc.objects(metadata__datasets=dataset_id).count() == 0
+    assert processing.Calc.objects(datasets=dataset_id).count() == 0
 
 
 @pytest.mark.parametrize('query, size, status_code', [
diff --git a/tests/app/v1/routers/test_uploads.py b/tests/app/v1/routers/test_uploads.py
index 84c5900bd02fb94258f2f2469cdd53568518dcfb..85f41a7751b27cba5547a6012081f236fac82fb7 100644
--- a/tests/app/v1/routers/test_uploads.py
+++ b/tests/app/v1/routers/test_uploads.py
@@ -1262,7 +1262,7 @@ def test_post_upload_action_publish_to_central_nomad(
     datamodel.Dataset(
         dataset_id='dataset_id', dataset_name='dataset_name',
         user_id=test_users_dict[user].user_id).a_mongo.save()
-    calc.metadata['datasets'] = ['dataset_id']
+    calc.datasets = ['dataset_id']
     calc.save()
 
     # Finally, invoke the method to publish to central nomad
@@ -1288,7 +1288,7 @@
                 'upload_id', 'calc_id', 'upload_create_time', 'entry_create_time',
                 'last_processing_time', 'publish_time'):
             assert new_calc_metadata_dict[k] == v, f'Metadata not matching: {k}'
-    assert new_calc.metadata.get('datasets') == ['dataset_id']
+    assert new_calc.datasets == ['dataset_id']
 
     assert old_upload.published_to[0] == config.oasis.central_nomad_deployment_id
     assert new_upload.from_oasis and new_upload.oasis_deployment_id
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 56f0fa1569979cf548001253eab57da47f358237..877de1e2aee1f85ad41c2fdb0f77668dba4099f5 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -278,7 +278,7 @@ def test_oasis_upload_processing(proc_infra, oasis_example_uploaded: Tuple[str,
     assert_processing(upload, published=True)
     calc = Calc.objects(upload_id='oasis_upload_id').first()
     assert calc.calc_id == 'test_calc_id'
-    assert calc.metadata['datasets'] == ['oasis_dataset_1', 'cn_dataset_2']
+    assert calc.datasets == ['oasis_dataset_1', 'cn_dataset_2']
 
 
 @pytest.mark.timeout(config.tests.default_timeout)
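
For context, a minimal sketch of the mongoengine query and raw-collection update pattern this patch migrates to, once dataset ids become a dedicated `ListField` on the entry document instead of a key inside the `metadata` DictField. It is not part of the patch: it uses a simplified stand-in for `processing.Calc`, and the database name, entry id, and dataset id below are made up for illustration.

# Sketch only: simplified stand-in for processing.Calc; names/ids are assumptions.
from mongoengine import Document, ListField, StringField, connect

connect('nomad_sketch')  # assumes a local MongoDB is running


class Calc(Document):
    calc_id = StringField(primary_key=True)
    # after this change, dataset ids live in a dedicated top-level list field
    datasets = ListField(StringField(), default=None)


dataset_id = 'dataset_id'

# old query against the embedded dict: Calc.objects(metadata__datasets=dataset_id)
# new query against the dedicated field (equality on a ListField means membership):
n_entries = Calc.objects(datasets=dataset_id).count()

# raw collection updates change their key accordingly ('metadata.datasets' -> 'datasets')
mongo_query = {'_id': {'$in': ['test_calc_id']}}
Calc._get_collection().update_many(mongo_query, {'$push': {'datasets': dataset_id}})
Calc._get_collection().update_many(mongo_query, {'$pull': {'datasets': dataset_id}})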