From 3b97832c704d6e80e09bb0741eedef5f741ad8b1 Mon Sep 17 00:00:00 2001
From: Theodore Chang <theodore.chang@physik.hu-berlin.de>
Date: Wed, 3 May 2023 05:51:24 +0000
Subject: [PATCH] Added config switch to skip generating new timestamps for
 published entries.

Changelog: Added
---
 nomad/config/models.py           |  1 +
 nomad/files.py                   | 11 ++++++---
 nomad/processing/data.py         | 22 +++++++++++------
 tests/processing/test_data.py    |  4 +--
 tests/processing/test_rfc3161.py | 42 +++++++++++++++++++++++++++++++-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/nomad/config/models.py b/nomad/config/models.py
index 81a5f3410b..f89e345911 100644
--- a/nomad/config/models.py
+++ b/nomad/config/models.py
@@ -547,6 +547,7 @@ class Process(NomadSettings):
         True will redirect lines to stdout (e.g. print output) that occur during
         processing (e.g. created by parsers or normalizers) as log entries.
     ''')
+    rfc3161_skip_published = False  # if True, never request new timestamps for entries in published uploads
 
 
 class Reprocess(NomadSettings):
diff --git a/nomad/files.py b/nomad/files.py
index 0f6d57a774..67499ba7a2 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -1356,10 +1356,13 @@ class PublicUploadFiles(UploadFiles):
             staging_upload_files.add_rawfiles(raw_zip_file.os_path)
 
         if include_archive:
-            with self._open_msg_file() as archive:
-                for entry_id, data in archive.items():
-                    entry_id = entry_id.strip()
-                    staging_upload_files.write_archive(entry_id, data.to_dict())
+            try:
+                with self._open_msg_file() as archive:
+                    for entry_id, data in archive.items():
+                        entry_id = entry_id.strip()
+                        staging_upload_files.write_archive(entry_id, data.to_dict())
+            except FileNotFoundError:
+                pass  # no archive msg file exists for this upload; raw files are still converted
 
         return staging_upload_files
 
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index e7e198dfeb..cf91844138 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -145,7 +145,7 @@ def get_rfc3161_token(
             params['certificate'] = f.read()
         else:
             # a network location
-            params['certificate'] = requests.get(cert).content
+            params['certificate'] = requests.get(cert, timeout=10).content
         stamper = rfc3161ng.RemoteTimestamper(server, **params)
         return stamper(data=hash_string.encode('utf-8'))
     except Exception:
@@ -786,27 +786,35 @@ class Entry(Proc):
         entry_metadata.nomad_commit = ''
         entry_metadata.entry_hash = self.upload_files.entry_hash(self.mainfile, self.mainfile_key)
 
+        get_timestamp: bool = True  # do we need to get a new timestamp?
+        if config.process.rfc3161_skip_published and self.upload.published:
+            get_timestamp = False
+
         try:
             with self.upload_files.read_archive(self.entry_id) as archive:
                 entry_timestamp = archive[self.entry_id]['metadata']['entry_timestamp']
                 stored_seed = entry_timestamp['token_seed']
                 stored_token = base64.b64decode(entry_timestamp['token'])
                 stored_server = entry_timestamp['tsa_server']
+                has_existing_timestamp: bool = True
         except KeyError:
             stored_seed = None
             stored_token = None
             stored_server = None
-        if stored_seed != entry_metadata.entry_hash:
+            has_existing_timestamp = False
+
+        if stored_seed == entry_metadata.entry_hash:
+            get_timestamp = False  # hash unchanged, the stored timestamp can be reused
+
+        if get_timestamp:
             # entry is new or has changed
-            token = get_rfc3161_token(entry_metadata.entry_hash)
-            if token:
-                # 1. save to entry metadata
+            if token := get_rfc3161_token(entry_metadata.entry_hash):
                 entry_metadata.entry_timestamp = RFC3161Timestamp(
                     token_seed=entry_metadata.entry_hash,
                     token=token,
                     tsa_server=config.rfc3161_timestamp.server,
                     timestamp=rfc3161ng.get_timestamp(token))
-        else:
+        elif has_existing_timestamp:
             # entry is unchanged
             entry_metadata.entry_timestamp = RFC3161Timestamp(
                 token_seed=stored_seed,
@@ -1715,7 +1723,7 @@ class Upload(Proc):
                 self.set_last_status_message('Refreshing staging files')
                 self._cleanup_staging_files()
                 with utils.timer(logger, 'upload extracted'):
-                    self.upload_files.to_staging_upload_files(create=True)
+                    self.upload_files.to_staging_upload_files(create=True, include_archive=True)
         elif not StagingUploadFiles.exists_for(self.upload_id):
             # Create staging files
             self.set_last_status_message('Creating staging files')
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 7bd62adbbc..ea1281f028 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -419,8 +419,8 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey
 
     if with_failure != 'not-matched':
         for archive_file in old_archive_files:
-            with open(published.upload_files.join_file(archive_file).os_path, 'wt') as f:
-                f.write('')
+            # delete the archive files entirely instead of truncating them
+            os.remove(published.upload_files.join_file(archive_file).os_path)
 
     if with_failure == 'after':
         raw_files = create_template_upload_file(tmp, 'tests/data/proc/templates/unparsable/template.json')
diff --git a/tests/processing/test_rfc3161.py b/tests/processing/test_rfc3161.py
index 82dff9a664..90f939ce14 100644
--- a/tests/processing/test_rfc3161.py
+++ b/tests/processing/test_rfc3161.py
@@ -17,13 +17,15 @@
 #
 
 import datetime
+import os
 
 import httpx
 import pytest
 import rfc3161ng
 
+from nomad.archive import write_archive, read_archive
 from nomad.datamodel.datamodel import RFC3161Timestamp
-from nomad.processing.data import get_rfc3161_token
+from nomad.processing.data import get_rfc3161_token, Entry
 
 
 @pytest.mark.parametrize('server,cert,result', [
@@ -57,3 +59,41 @@ def test_rfc3161ng_timestamp(server, cert, result, monkeysession):
     new_metadata = RFC3161Timestamp.m_from_dict(metadata.m_to_dict())
     assert new_metadata.token == token
     assert rfc3161ng.get_timestamp(new_metadata.token) == rfc3161ng_time
+
+
+def test_rfc3161ng_processing(published, monkeypatch):
+    entry_id = Entry.objects(upload_id=published.upload_id).first().entry_id
+    file_path = published.upload_files._create_msg_file_object(
+        published.upload_files, published.upload_files.access, fallback=True).os_path
+
+    archive = read_archive(file_path)[entry_id].to_dict()
+    assert 'entry_timestamp' in archive['metadata']
+
+    original_timestamp = archive['metadata']['entry_timestamp']
+
+    def _re_process():
+        published.process_upload()
+        published.publish_upload(embargo_length=12)
+        try:
+            published.block_until_complete(interval=.01)
+        except Exception:
+            pass  # a failed run still leaves an archive on disk, which is checked below
+        return read_archive(file_path)[entry_id].to_dict()
+
+    # 0. reprocessing an unchanged entry must not change its timestamp
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
+    assert archive['metadata']['entry_timestamp'] == original_timestamp
+
+    # 1. stored timestamp deleted, upload published, skip_published on: expect no new timestamp
+    os.remove(file_path)
+    del archive['metadata']['entry_timestamp']
+    write_archive(file_path, 1, data=[(entry_id, archive)])
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', True)
+    archive = _re_process()
+    assert 'entry_timestamp' not in archive['metadata']
+
+    # 2. upload published, skip_published off: expect a fresh timestamp
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', False)
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
-- 
GitLab
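
Usage sketch (editor's addition, not part of the patch): the new switch lives on
`config.process`, so tests toggle it with pytest's standard `monkeypatch` fixture,
exactly as `test_rfc3161ng_processing` above does. The test function name below is
illustrative, and the nomad.yaml fragment assumes NOMAD's usual section-to-model
mapping (`process:` onto the `Process` settings model); neither is confirmed by
this patch.

    # toggling the switch for a single pytest test
    def test_skip_published_example(published, monkeypatch):
        monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', True)
        # reprocess `published` here; no new RFC 3161 token should be requested

    # presumed nomad.yaml equivalent for a deployment:
    #
    #   process:
    #     rfc3161_skip_published: true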