From 3b97832c704d6e80e09bb0741eedef5f741ad8b1 Mon Sep 17 00:00:00 2001
From: Theodore Chang <theodore.chang@physik.hu-berlin.de>
Date: Wed, 3 May 2023 05:51:24 +0000
Subject: [PATCH] Added config switch for generating new RFC 3161 timestamps.

Changelog: Added
---
 nomad/config/models.py           |  1 +
 nomad/files.py                   | 11 ++++++---
 nomad/processing/data.py         | 22 +++++++++++------
 tests/processing/test_data.py    |  4 +--
 tests/processing/test_rfc3161.py | 42 +++++++++++++++++++++++++++++++-
 5 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/nomad/config/models.py b/nomad/config/models.py
index 81a5f3410b..f89e345911 100644
--- a/nomad/config/models.py
+++ b/nomad/config/models.py
@@ -547,6 +547,7 @@ class Process(NomadSettings):
         True will redirect lines to stdout (e.g. print output) that occur during
         processing (e.g. created by parsers or normalizers) as log entries.
     ''')
+    rfc3161_skip_published = False  # skip published entries, regardless of timestamp
 
 
 class Reprocess(NomadSettings):
diff --git a/nomad/files.py b/nomad/files.py
index 0f6d57a774..67499ba7a2 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -1356,10 +1356,13 @@ class PublicUploadFiles(UploadFiles):
             staging_upload_files.add_rawfiles(raw_zip_file.os_path)
 
         if include_archive:
-            with self._open_msg_file() as archive:
-                for entry_id, data in archive.items():
-                    entry_id = entry_id.strip()
-                    staging_upload_files.write_archive(entry_id, data.to_dict())
+            try:
+                with self._open_msg_file() as archive:
+                    for entry_id, data in archive.items():
+                        entry_id = entry_id.strip()
+                        staging_upload_files.write_archive(entry_id, data.to_dict())
+            except FileNotFoundError:
+                pass
 
         return staging_upload_files
 
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index e7e198dfeb..cf91844138 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -145,7 +145,7 @@ def get_rfc3161_token(
                     params['certificate'] = f.read()
             else:
                 # a network location
-                params['certificate'] = requests.get(cert).content
+                params['certificate'] = requests.get(cert, timeout=10).content
         stamper = rfc3161ng.RemoteTimestamper(server, **params)
         return stamper(data=hash_string.encode('utf-8'))
     except Exception:
@@ -786,27 +786,35 @@ class Entry(Proc):
         entry_metadata.nomad_commit = ''
         entry_metadata.entry_hash = self.upload_files.entry_hash(self.mainfile, self.mainfile_key)
 
+        get_timestamp: bool = True  # do we need to get a new timestamp?
+        if config.process.rfc3161_skip_published and self.upload.published:
+            get_timestamp = False
+
         try:
             with self.upload_files.read_archive(self.entry_id) as archive:
                 entry_timestamp = archive[self.entry_id]['metadata']['entry_timestamp']
                 stored_seed = entry_timestamp['token_seed']
                 stored_token = base64.b64decode(entry_timestamp['token'])
                 stored_server = entry_timestamp['tsa_server']
+            has_existing_timestamp: bool = True
         except KeyError:
             stored_seed = None
             stored_token = None
             stored_server = None
-        if stored_seed != entry_metadata.entry_hash:
+            has_existing_timestamp = False
+
+        if stored_seed == entry_metadata.entry_hash:
+            get_timestamp = False
+
+        if get_timestamp:
             # entry is new or has changed
-            token = get_rfc3161_token(entry_metadata.entry_hash)
-            if token:
-                # 1. save to entry metadata
+            if token := get_rfc3161_token(entry_metadata.entry_hash):
                 entry_metadata.entry_timestamp = RFC3161Timestamp(
                     token_seed=entry_metadata.entry_hash,
                     token=token,
                     tsa_server=config.rfc3161_timestamp.server,
                     timestamp=rfc3161ng.get_timestamp(token))
-        else:
+        elif has_existing_timestamp:
             # entry is unchanged
             entry_metadata.entry_timestamp = RFC3161Timestamp(
                 token_seed=stored_seed,
@@ -1715,7 +1723,7 @@ class Upload(Proc):
             self.set_last_status_message('Refreshing staging files')
             self._cleanup_staging_files()
             with utils.timer(logger, 'upload extracted'):
-                self.upload_files.to_staging_upload_files(create=True)
+                self.upload_files.to_staging_upload_files(create=True, include_archive=True)
         elif not StagingUploadFiles.exists_for(self.upload_id):
             # Create staging files
             self.set_last_status_message('Creating staging files')
diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index 7bd62adbbc..ea1281f028 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -419,8 +419,8 @@ def test_re_processing(published: Upload, internal_example_user_metadata, monkey
 
     if with_failure != 'not-matched':
         for archive_file in old_archive_files:
-            with open(published.upload_files.join_file(archive_file).os_path, 'wt') as f:
-                f.write('')
+            # delete all archive files
+            os.remove(published.upload_files.join_file(archive_file).os_path)
 
     if with_failure == 'after':
         raw_files = create_template_upload_file(tmp, 'tests/data/proc/templates/unparsable/template.json')
diff --git a/tests/processing/test_rfc3161.py b/tests/processing/test_rfc3161.py
index 82dff9a664..90f939ce14 100644
--- a/tests/processing/test_rfc3161.py
+++ b/tests/processing/test_rfc3161.py
@@ -17,13 +17,15 @@
 #
 
 import datetime
+import os
 
 import httpx
 import pytest
 import rfc3161ng
 
+from nomad.archive import write_archive, read_archive
 from nomad.datamodel.datamodel import RFC3161Timestamp
-from nomad.processing.data import get_rfc3161_token
+from nomad.processing.data import get_rfc3161_token, Entry
 
 
 @pytest.mark.parametrize('server,cert,result', [
@@ -57,3 +59,41 @@ def test_rfc3161ng_timestamp(server, cert, result, monkeysession):
         new_metadata = RFC3161Timestamp.m_from_dict(metadata.m_to_dict())
         assert new_metadata.token == token
         assert rfc3161ng.get_timestamp(new_metadata.token) == rfc3161ng_time
+
+
+def test_rfc3161ng_processing(published, monkeypatch):
+    entry_id = Entry.objects(upload_id=published.upload_id).first().entry_id
+    file_path = published.upload_files._create_msg_file_object(
+        published.upload_files, published.upload_files.access, fallback=True).os_path
+
+    archive = read_archive(file_path)[entry_id].to_dict()
+    assert 'entry_timestamp' in archive['metadata']
+
+    original_timestamp = archive['metadata']['entry_timestamp']
+
+    def _re_process():
+        published.process_upload()
+        published.publish_upload(embargo_length=12)
+        try:
+            published.block_until_complete(interval=.01)
+        except Exception:
+            pass
+        return read_archive(file_path)[entry_id].to_dict()
+
+    # 0. assert reprocessing does not change timestamp
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
+    assert archive['metadata']['entry_timestamp'] == original_timestamp
+
+    # 1. old timestamp deleted, published, skip published, expect no timestamp
+    os.remove(file_path)
+    del archive['metadata']['entry_timestamp']
+    write_archive(file_path, 1, data=[(entry_id, archive)])
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', True)
+    archive = _re_process()
+    assert 'entry_timestamp' not in archive['metadata']
+
+    # 2. published, NOT skip published, expect timestamp
+    monkeypatch.setattr('nomad.config.process.rfc3161_skip_published', False)
+    archive = _re_process()
+    assert 'entry_timestamp' in archive['metadata']
-- 
GitLab