From c99669a24284eebb81f1d711a5454e8d90222322 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Wed, 18 Dec 2019 13:31:07 +0100
Subject: [PATCH] Fixed broken zipfile cache: keep one open cache per upload across entries instead of a per-entry context manager.

---
 nomad/app/api/archive.py | 37 +++++++++++---------
 nomad/app/api/raw.py     | 75 +++++++++++++++++++++-------------------
 nomad/files.py           | 22 ++++++------
 3 files changed, 72 insertions(+), 62 deletions(-)

diff --git a/nomad/app/api/archive.py b/nomad/app/api/archive.py
index 57484680af..19713b7061 100644
--- a/nomad/app/api/archive.py
+++ b/nomad/app/api/archive.py
@@ -149,25 +149,27 @@ class ArchiveQueryResource(Resource):
 
         def generator():
             manifest = {}
+            upload_files = None
             for entry in calcs:
                 upload_id = entry['upload_id']
                 calc_id = entry['calc_id']
-                upload_files = UploadFiles.get(
-                    upload_id, create_authorization_predicate(upload_id))
-                if upload_files is None:
-                    utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
-                    continue
-
-                if hasattr(upload_files, 'zipfile_cache'):
-                    zipfile_cache = upload_files.zipfile_cache()
-                else:
-                    zipfile_cache = contextlib.suppress()
-
-                with zipfile_cache:
-                    yield (
-                        '%s.%s' % (calc_id, upload_files._archive_ext), calc_id,
-                        lambda calc_id: upload_files.archive_file(calc_id, 'rb'),
-                        lambda calc_id: upload_files.archive_file_size(calc_id))
+                if upload_files is None or upload_files.upload_id != upload_id:
+                    if upload_files is not None:
+                        upload_files.close_zipfile_cache()
+
+                    upload_files = UploadFiles.get(
+                        upload_id, create_authorization_predicate(upload_id))
+
+                    if upload_files is None:
+                        utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
+                        continue
+
+                    upload_files.open_zipfile_cache()
+
+                yield (
+                    '%s.%s' % (calc_id, upload_files._archive_ext), calc_id,
+                    lambda calc_id: upload_files.archive_file(calc_id, 'rb'),
+                    lambda calc_id: upload_files.archive_file_size(calc_id))
 
                 manifest[calc_id] = {
                     key: entry[key]
@@ -175,6 +177,9 @@ class ArchiveQueryResource(Resource):
                     if entry.get(key) is not None
                 }
 
+            if upload_files is not None:
+                upload_files.close_zipfile_cache()
+
             try:
                 manifest_contents = json.dumps(manifest).encode('utf-8')
             except Exception as e:
diff --git a/nomad/app/api/raw.py b/nomad/app/api/raw.py
index a0a761c4fe..3ae09689d2 100644
--- a/nomad/app/api/raw.py
+++ b/nomad/app/api/raw.py
@@ -430,40 +430,44 @@ class RawFileQueryResource(Resource):
 
         def generator():
             manifest = {}
+            upload_files = None
             for entry in calcs:
                 upload_id = entry['upload_id']
                 mainfile = entry['mainfile']
-                upload_files = UploadFiles.get(
-                    upload_id, create_authorization_predicate(upload_id))
-                if upload_files is None:
-                    utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
-                    continue
-
-                if hasattr(upload_files, 'zipfile_cache'):
-                    zipfile_cache = upload_files.zipfile_cache()
-                else:
-                    zipfile_cache = contextlib.suppress()
-
-                with zipfile_cache:
-                    filenames = upload_files.raw_file_manifest(
-                        path_prefix=os.path.dirname(mainfile))
-                    for filename in filenames:
-                        filename_w_upload = os.path.join(upload_files.upload_id, filename)
-                        filename_wo_prefix = filename_w_upload[common_prefix_len:]
-                        if len(patterns) == 0 or any(
-                                fnmatch.fnmatchcase(os.path.basename(filename_wo_prefix), pattern)
-                                for pattern in patterns):
-
-                            yield (
-                                filename_wo_prefix, filename,
-                                lambda upload_filename: upload_files.raw_file(upload_filename, 'rb'),
-                                lambda upload_filename: upload_files.raw_file_size(upload_filename))
-
-                            manifest[path(entry)] = {
-                                key: entry[key]
-                                for key in RawFileQueryResource.manifest_quantities
-                                if entry.get(key) is not None
-                            }
+                if upload_files is None or upload_files.upload_id != upload_id:
+                    if upload_files is not None:
+                        upload_files.close_zipfile_cache()
+
+                    upload_files = UploadFiles.get(
+                        upload_id, create_authorization_predicate(upload_id))
+
+                    if upload_files is None:
+                        utils.get_logger(__name__).error('upload files do not exist', upload_id=upload_id)
+                        continue
+
+                    upload_files.open_zipfile_cache()
+
+                filenames = upload_files.raw_file_manifest(path_prefix=os.path.dirname(mainfile))
+                for filename in filenames:
+                    filename_w_upload = os.path.join(upload_files.upload_id, filename)
+                    filename_wo_prefix = filename_w_upload[common_prefix_len:]
+                    if len(patterns) == 0 or any(
+                            fnmatch.fnmatchcase(os.path.basename(filename_wo_prefix), pattern)
+                            for pattern in patterns):
+
+                        yield (
+                            filename_wo_prefix, filename,
+                            lambda upload_filename: upload_files.raw_file(upload_filename, 'rb'),
+                            lambda upload_filename: upload_files.raw_file_size(upload_filename))
+
+                        manifest[path(entry)] = {
+                            key: entry[key]
+                            for key in RawFileQueryResource.manifest_quantities
+                            if entry.get(key) is not None
+                        }
+
+            if upload_files is not None:
+                upload_files.close_zipfile_cache()
 
             try:
                 manifest_contents = json.dumps(manifest).encode('utf-8')
@@ -490,17 +494,14 @@ def respond_to_get_raw_files(upload_id, files, compress=False, strip=False):
 
     # the zipfile cache allows to access many raw-files from public upload files without
     # having to re-open the underlying zip files all the time
-    if hasattr(upload_files, 'zipfile_cache'):
-        zipfile_cache = upload_files.zipfile_cache()
-    else:
-        zipfile_cache = contextlib.suppress()
+    upload_files.open_zipfile_cache()
 
     if strip:
         common_prefix_len = len(utils.common_prefix(files))
     else:
         common_prefix_len = 0
 
-    with zipfile_cache:
+    try:
         return streamed_zipfile(
             [(
                 filename[common_prefix_len:], filename,
@@ -508,3 +509,5 @@ def respond_to_get_raw_files(upload_id, files, compress=False, strip=False):
                 lambda upload_filename: upload_files.raw_file_size(upload_filename)
             ) for filename in files],
             zipfile_name='%s.zip' % upload_id, compress=compress)
+    finally:
+        upload_files.close_zipfile_cache()
diff --git a/nomad/files.py b/nomad/files.py
index aa94329961..9c26a21901 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -58,7 +58,6 @@ import tarfile
 import hashlib
 import io
 import pickle
-from contextlib import contextmanager
 
 from nomad import config, utils
 from nomad.datamodel import UploadWithMetadata
@@ -313,6 +312,13 @@ class UploadFiles(DirectoryObject, metaclass=ABCMeta):
         """
         raise NotImplementedError()
 
+    def open_zipfile_cache(self):
+        """ Allows to reuse the same zipfile for multiple file operations. Must be closed. """
+        pass
+
+    def close_zipfile_cache(self):
+        pass
+
 
 class StagingUploadFiles(UploadFiles):
     def __init__(
@@ -916,16 +922,12 @@ class PublicUploadFiles(UploadFiles):
                 repacked_file.os_path,
                 public_file.os_path)
 
-    @contextmanager
-    def zipfile_cache(self):
-        """
-        Context that allows to read files while caching zipfiles without reopening them
-        all the time.
-        """
+    def open_zipfile_cache(self):
         if self._zipfile_cache is None:
             self._zipfile_cache = {}
-        try:
-            yield
-        finally:
+
+    def close_zipfile_cache(self):
+        if self._zipfile_cache is not None:
             for zip_file in self._zipfile_cache.values():
                 zip_file.close()
+        self._zipfile_cache = None
-- 
GitLab