diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index 1ceaad37716e93d61623b30fa06764623acc59f8..27bf7da822ac5f29bd00c32d5005749c4b263393 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -558,6 +558,8 @@ class Calc(Proc):
             metadata_part = self.upload.metadata_file_cached(
                 os.path.join(metadata_dir, metadata_file))
             for key, val in metadata_part.items():
+                if key in ['entries', 'oasis_datasets']:
+                    continue
                 metadata.setdefault(key, val)

             if metadata_dir == self.upload_files.os_path:
@@ -912,6 +914,7 @@ class Upload(Proc):
         # compile oasis metadata for the upload
         upload_metadata = dict(upload_time=str(self.upload_time))
         upload_metadata_entries = {}
+        upload_metadata_datasets = {}
         for calc in self.calcs:
             entry_metadata = dict(**{
                 key: str(value) if isinstance(value, datetime) else value
@@ -921,7 +924,17 @@ class Upload(Proc):
             if entry_metadata.get('with_embargo'):
                 continue
             upload_metadata_entries[calc.mainfile] = entry_metadata
+            if 'datasets' in entry_metadata:
+                for dataset_id in entry_metadata['datasets']:
+                    if dataset_id in upload_metadata_datasets:
+                        continue
+
+                    dataset = datamodel.Dataset.m_def.a_mongo.get(dataset_id=dataset_id)
+                    upload_metadata_datasets[dataset_id] = dataset.m_to_dict()
+
         upload_metadata['entries'] = upload_metadata_entries
+        upload_metadata['oasis_datasets'] = {
+            dataset['name']: dataset for dataset in upload_metadata_datasets.values()}

         oasis_upload_id, upload_metadata = _normalize_oasis_upload_metadata(
             self.upload_id, upload_metadata)
@@ -1043,6 +1056,39 @@ class Upload(Proc):
     def process_upload(self):
         ''' A *process* that performs the initial upload processing. '''
         self.extracting()
+
+        if self.from_oasis:
+            # we might need to add datasets from the oasis before processing and
+            # adding the entries
+            oasis_metadata_file = os.path.join(
+                self.upload_files.os_path, 'raw', config.metadata_file_name + '.json')
+            with open(oasis_metadata_file, 'rt') as f:
+                oasis_metadata = json.load(f)
+            oasis_datasets = oasis_metadata.get('oasis_datasets', {})
+            metadata_was_changed = False
+            for oasis_dataset in oasis_datasets.values():
+                try:
+                    existing_dataset = datamodel.Dataset.m_def.a_mongo.get(
+                        user_id=self.user_id, name=oasis_dataset['name'])
+                except KeyError:
+                    datamodel.Dataset(**oasis_dataset).a_mongo.save()
+                else:
+                    oasis_dataset_id = oasis_dataset['dataset_id']
+                    if existing_dataset.dataset_id != oasis_dataset_id:
+                        # A dataset for the same user with the same name was created
+                        # in both deployments. We consider this to be the "same" dataset.
+                        # These datasets have different ids and we need to migrate the provided
+                        # dataset ids:
+                        for entry in oasis_metadata['entries'].values():
+                            entry_datasets = entry.get('datasets', [])
+                            for index, dataset_id in enumerate(entry_datasets):
+                                if dataset_id == oasis_dataset_id:
+                                    entry_datasets[index] = existing_dataset.dataset_id
+                                    metadata_was_changed = True
+
+            if metadata_was_changed:
+                with open(oasis_metadata_file, 'wt') as f:
+                    json.dump(oasis_metadata, f)
+
         self.parse_all()

     @task
diff --git a/tests/conftest.py b/tests/conftest.py
index 74729ecebc9c371a9c2a502f5bdbea10383a7b3c..16fecf41197891435dc6240532d48bc665663efc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -587,7 +587,7 @@ def non_empty_uploaded(non_empty_example_upload: str, raw_files) -> Tuple[str, s


 @pytest.fixture(scope='function')
-def oasis_example_upload(non_empty_example_upload: str, raw_files) -> str:
+def oasis_example_upload(non_empty_example_upload: str, test_user, raw_files) -> str:
     processing.Upload.metadata_file_cached.cache_clear()

     uploaded_path = non_empty_example_upload
@@ -601,7 +601,20 @@ def oasis_example_upload(non_empty_example_upload: str, raw_files) -> str:
         'published': True,
         'entries': {
             'examples_template/template.json': {
-                'calc_id': 'test_calc_id'
+                'calc_id': 'test_calc_id',
+                'datasets': ['oasis_dataset_1', 'oasis_dataset_2']
+            }
+        },
+        'oasis_datasets': {
+            'dataset_1_name': {
+                'dataset_id': 'oasis_dataset_1',
+                'user_id': test_user.user_id,
+                'name': 'dataset_1_name'
+            },
+            'dataset_2_name': {
+                'dataset_id': 'oasis_dataset_2',
+                'user_id': test_user.user_id,
+                'name': 'dataset_2_name'
             }
         }
     }
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 2a6aaedf3f2bb02f9d2d1d732f9c214edc2cc89f..9553554d9e5b40044ddbde33d3910dbe9c7b0115 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -23,7 +23,7 @@ import os.path
 import re
 import shutil

-from nomad import utils, infrastructure, config
+from nomad import utils, infrastructure, config, datamodel
 from nomad.archive import read_partial_archive_from_mongo
 from nomad.files import UploadFiles, StagingUploadFiles, PublicUploadFiles
 from nomad.processing import Upload, Calc
@@ -208,6 +208,12 @@ def test_publish_failed(
 def test_oasis_upload_processing(proc_infra, oasis_example_uploaded: Tuple[str, str], test_user, no_warn):
     uploaded_id, uploaded_path = oasis_example_uploaded

+    # create a dataset to force dataset joining of one of the datasets in the example
+    # upload
+    datamodel.Dataset(
+        dataset_id='cn_dataset_2', name='dataset_2_name',
+        user_id=test_user.user_id).a_mongo.save()
+
     upload = Upload.create(
         upload_id=uploaded_id, user=test_user, upload_path=uploaded_path)
     upload.from_oasis = True
@@ -227,6 +233,7 @@ def test_oasis_upload_processing(proc_infra, oasis_example_uploaded: Tuple[str,
     calc = Calc.objects(upload_id='oasis_upload_id').first()
     assert calc.calc_id == 'test_calc_id'
     assert calc.metadata['published']
+    assert calc.metadata['datasets'] == ['oasis_dataset_1', 'cn_dataset_2']


 @pytest.mark.timeout(config.tests.default_timeout)
@@ -238,9 +245,17 @@ def test_publish_from_oasis(
     upload.publish_upload()
     upload.block_until_complete(interval=.01)

+    # create a dataset to also test this aspect of oasis uploads
+    calc = Calc.objects(upload_id=upload.upload_id).first()
+    datamodel.Dataset(
+        dataset_id='dataset_id', name='dataset_name',
+        user_id=other_test_user.user_id).a_mongo.save()
+    calc.metadata['datasets'] = ['dataset_id']
+    calc.save()
+
     cn_upload_id = 'cn_' + upload.upload_id

-    # We need to alter the ids, because we this by uploading to the same NOMAD
+    # We need to alter the ids, because we do this test by uploading to the same NOMAD
     def normalize_oasis_upload_metadata(upload_id, metadata):
         for entry in metadata['entries'].values():
             entry['calc_id'] = utils.create_uuid()
@@ -271,6 +286,10 @@ def test_publish_from_oasis(
     assert cn_upload.from_oasis
     assert cn_upload.oasis_deployment_id == config.meta.deployment_id
     assert upload.published_to[0] == config.oasis.central_nomad_deployment_id
+    cn_calc = Calc.objects(upload_id=cn_upload_id).first()
+    assert cn_calc.calc_id != calc.calc_id
+    assert cn_calc.metadata['datasets'] == ['dataset_id']
+    assert datamodel.Dataset.m_def.a_mongo.objects().count() == 1


 @pytest.mark.timeout(config.tests.default_timeout)
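
For review context, the dataset-joining rule that the new `process_upload` block implements can be read in isolation from the Mongo and processing machinery. The sketch below is not code from this patch: plain dicts keyed by (user_id, name) stand in for the `datamodel.Dataset` Mongo documents, and the helper name `join_oasis_datasets` is hypothetical.

def join_oasis_datasets(oasis_metadata: dict, existing_datasets: dict) -> bool:
    '''
    Joins incoming OASIS datasets with existing ones, matching on (user_id, name).
    Unknown datasets are created; if a dataset with the same user and name already
    exists under a different id, all entry references are migrated to the existing
    id. Returns True if the entry metadata was changed and must be re-written.
    '''
    changed = False
    for dataset in oasis_metadata.get('oasis_datasets', {}).values():
        key = (dataset['user_id'], dataset['name'])
        if key not in existing_datasets:
            # stands in for datamodel.Dataset(**dataset).a_mongo.save()
            existing_datasets[key] = dataset
            continue

        existing_id = existing_datasets[key]['dataset_id']
        if existing_id == dataset['dataset_id']:
            continue

        # same user and name in both deployments: treat both as the "same" dataset
        # and migrate entry references from the oasis id to the existing id
        for entry in oasis_metadata.get('entries', {}).values():
            entry_datasets = entry.get('datasets', [])
            for index, dataset_id in enumerate(entry_datasets):
                if dataset_id == dataset['dataset_id']:
                    entry_datasets[index] = existing_id
                    changed = True

    return changed

Mirroring the joining case exercised by test_oasis_upload_processing, with a hypothetical user id 'u1':

existing = {('u1', 'dataset_2_name'): {'dataset_id': 'cn_dataset_2'}}
metadata = {
    'oasis_datasets': {
        'dataset_2_name': {
            'dataset_id': 'oasis_dataset_2', 'user_id': 'u1', 'name': 'dataset_2_name'}},
    'entries': {'examples_template/template.json': {'datasets': ['oasis_dataset_2']}}}
assert join_oasis_datasets(metadata, existing)
assert metadata['entries']['examples_template/template.json']['datasets'] == ['cn_dataset_2']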