diff --git a/nomad/config/models.py b/nomad/config/models.py
index 81a5f3410bda84a244ce7c5f241ea1b36a3bf1a6..f89e3459112e2b518851a41ebedbd976f1d312de 100644
--- a/nomad/config/models.py
+++ b/nomad/config/models.py
@@ -547,6 +547,7 @@ class Process(NomadSettings):
         True will redirect lines to stdout (e.g. print output) that occur during
         processing (e.g. created by parsers or normalizers) as log entries.
     ''')
+    rfc3161_skip_published = False  # skip published entries, regardless of timestamp
 
 
 class Reprocess(NomadSettings):
diff --git a/nomad/files.py b/nomad/files.py
index 0f6d57a7749f7254803efa7223e7297b2267f850..67499ba7a2bed4f7b232c0fd02656d3fd2bf68b3 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -1356,10 +1356,13 @@ class PublicUploadFiles(UploadFiles):
             staging_upload_files.add_rawfiles(raw_zip_file.os_path)
 
         if include_archive:
-            with self._open_msg_file() as archive:
-                for entry_id, data in archive.items():
-                    entry_id = entry_id.strip()
-                    staging_upload_files.write_archive(entry_id, data.to_dict())
+            try:
+                with self._open_msg_file() as archive:
+                    for entry_id, data in archive.items():
+                        entry_id = entry_id.strip()
+                        staging_upload_files.write_archive(entry_id, data.to_dict())
+            except FileNotFoundError:
+                pass
 
         return staging_upload_files
 
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index e7e198dfebfe5f596035c8434250c1360e356002..cf918441386bc4ce8bfc5d0e17d66f1cf96abd32 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -145,7 +145,7 @@ def get_rfc3161_token(
                 params['certificate'] = f.read()
             else:
                 # a network location
-                params['certificate'] = requests.get(cert).content
+                params['certificate'] = requests.get(cert, timeout=10).content
         stamper = rfc3161ng.RemoteTimestamper(server, **params)
         return stamper(data=hash_string.encode('utf-8'))
     except Exception:
@@ -786,27 +786,35 @@ class Entry(Proc):
         entry_metadata.nomad_commit = ''
         entry_metadata.entry_hash = self.upload_files.entry_hash(self.mainfile, self.mainfile_key)
 
+        get_timestamp: bool = True  # do we need to get a new timestamp?
+        if config.process.rfc3161_skip_published and self.upload.published:
+            get_timestamp = False
+
         try:
             with self.upload_files.read_archive(self.entry_id) as archive:
                 entry_timestamp = archive[self.entry_id]['metadata']['entry_timestamp']
                 stored_seed = entry_timestamp['token_seed']
                 stored_token = base64.b64decode(entry_timestamp['token'])
                 stored_server = entry_timestamp['tsa_server']
+                has_existing_timestamp: bool = True
         except KeyError:
             stored_seed = None
             stored_token = None
             stored_server = None
-        if stored_seed != entry_metadata.entry_hash:
+            has_existing_timestamp = False
+
+        if stored_seed == entry_metadata.entry_hash:
+            get_timestamp = False
+
+        if get_timestamp:
             # entry is new or has changed
-            token = get_rfc3161_token(entry_metadata.entry_hash)
-            if token:
-                # 1. save to entry metadata
+            if token := get_rfc3161_token(entry_metadata.entry_hash):
                 entry_metadata.entry_timestamp = RFC3161Timestamp(
                     token_seed=entry_metadata.entry_hash,
                     token=token,
                     tsa_server=config.rfc3161_timestamp.server,
                     timestamp=rfc3161ng.get_timestamp(token))
-        else:
+        elif has_existing_timestamp:
             # entry is unchanged
             entry_metadata.entry_timestamp = RFC3161Timestamp(
                 token_seed=stored_seed,
@@ -1715,7 +1723,7 @@ class Upload(Proc):
                 self.set_last_status_message('Refreshing staging files')
                 self._cleanup_staging_files()
                 with utils.timer(logger, 'upload extracted'):
-                    self.upload_files.to_staging_upload_files(create=True)
+                    self.upload_files.to_staging_upload_files(create=True, include_archive=True)
             elif not StagingUploadFiles.exists_for(self.upload_id):
                 # Create staging files
                 self.set_last_status_message('Creating staging files')
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 7bd62adbbc2a55e920f047e74179ce8405779be7..ea1281f02801df5b9693a1e9dad8cff8e406626a 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -419,8 +419,8 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey
 
     if with_failure != 'not-matched':
         for archive_file in old_archive_files:
-            with open(published.upload_files.join_file(archive_file).os_path, 'wt') as f:
-                f.write('')
+            # delete all archive files
+            os.remove(published.upload_files.join_file(archive_file).os_path)
 
     if with_failure == 'after':
         raw_files = create_template_upload_file(tmp, 'tests/data/proc/templates/unparsable/template.json')
diff --git a/tests/processing/test_rfc3161.py b/tests/processing/test_rfc3161.py
index 82dff9a6642078ed70099063945358a6af736fac..90f939ce140a63fe7b9401361b2c6e426329b8b8 100644
--- a/tests/processing/test_rfc3161.py
+++ b/tests/processing/test_rfc3161.py
@@ -17,13 +17,15 @@
 #
 
 import datetime
+import os
 
 import httpx
 import pytest
 import rfc3161ng
 
+from nomad.archive import write_archive, read_archive
 from nomad.datamodel.datamodel import RFC3161Timestamp
-from nomad.processing.data import get_rfc3161_token
+from nomad.processing.data import get_rfc3161_token, Entry
 
 
 @pytest.mark.parametrize('server,cert,result', [
@@ -57,3 +59,41 @@ def test_rfc3161ng_timestamp(server, cert, result, monkeysession):
     new_metadata = RFC3161Timestamp.m_from_dict(metadata.m_to_dict())
     assert new_metadata.token == token
     assert rfc3161ng.get_timestamp(new_metadata.token) == rfc3161ng_time
+
+
+def test_rfc3161ng_processing(published, monkeypatch):
+    entry_id = Entry.objects(upload_id=published.upload_id).first().entry_id
+    file_path = published.upload_files._create_msg_file_object(
+        published.upload_files, published.upload_files.access, fallback=True).os_path
+
+    archive = read_archive(file_path)[entry_id].to_dict()
+    assert 'entry_timestamp' in archive['metadata']
+
+    original_timestamp = archive['metadata']['entry_timestamp']
+
+    def _re_process():
+        published.process_upload()
+        published.publish_upload(embargo_length=12)
+        try:
+            published.block_until_complete(interval=.01)
+        except Exception:
+            pass
+        return read_archive(file_path)[entry_id].to_dict()
+
+    # 0. assert reprocessing does not change timestamp
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
+    assert archive['metadata']['entry_timestamp'] == original_timestamp
+
+    # 1. old timestamp deleted, published, skip published, expect no timestamp
+    os.remove(file_path)
+    del archive['metadata']['entry_timestamp']
+    write_archive(file_path, 1, data=[(entry_id, archive)])
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', True)
+    archive = _re_process()
+    assert 'entry_timestamp' not in archive['metadata']
+
+    # 2. published, NOT skip published, expect timestamp
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', False)
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
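
Reviewer note (appended commentary, not part of the patch): the new timestamping
logic in nomad/processing/data.py reduces to two guards layered on the old
seed-comparison behavior. The Python sketch below distills that decision for
readability; the function name needs_new_timestamp is hypothetical and merely
mirrors the get_timestamp / has_existing_timestamp flags used in the hunk above.

    # Hypothetical distillation of the guards in Entry processing
    # (see the @@ -786,27 +786,35 @@ hunk above). stored_seed is the
    # token_seed read back from the archive, or None if no timestamp
    # was stored for this entry.
    def needs_new_timestamp(entry_hash, stored_seed, published, skip_published):
        if skip_published and published:
            # config.process.rfc3161_skip_published: published entries are
            # skipped regardless of any stored timestamp
            return False
        if stored_seed == entry_hash:
            # entry unchanged since the stored token was issued: reuse it
            return False
        return True

When this yields False, the patched code re-saves the stored token if one exists
(the elif has_existing_timestamp branch) and otherwise leaves the entry without
a timestamp, which is what test case 1 in test_rfc3161.py asserts.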