From d028b9ed1d953f22e203dbf6b131ff8d96db994b Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Mon, 8 Oct 2018 16:21:32 +0200
Subject: [PATCH] Added all calc raw file zipball download.

---
 nomad/api.py      | 103 ++++++++++++++++++++++++++++++++++------------
 nomad/client.py   |  57 +++++++++++++++++++++++--
 requirements.txt  |   3 +-
 tests/test_api.py |   8 ++++
 4 files changed, 140 insertions(+), 31 deletions(-)

diff --git a/nomad/api.py b/nomad/api.py
index 3652010f16..f50d5f23e3 100644
--- a/nomad/api.py
+++ b/nomad/api.py
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from flask import Flask, request, g, jsonify, send_file
+from werkzeug.exceptions import HTTPException
+from flask import Flask, request, g, jsonify, send_file, Response
 from flask_restful import Resource, Api, abort
 from flask_cors import CORS
 from flask_httpauth import HTTPBasicAuth
 from elasticsearch.exceptions import NotFoundError
 from datetime import datetime
 import os.path
+import zipstream
+from zipfile import ZIP_DEFLATED
+from contextlib import contextmanager
+import types
 
 from nomad import config, infrastructure
 from nomad.files import UploadFile, ArchiveFile, ArchiveLogFile
@@ -742,7 +747,7 @@ def get_raw(upload_hash, calc_hash):
     """
     Get calculation mainfile raw data. Calcs are references via *upload_hash*, *calc_hash*
     pairs. Returns the mainfile, unless an aux_file is specified. Aux files are stored
-    in repository entries.
+    in repository entries. See ``/repo`` endpoint.
 
     .. :quickref: repo; Get calculation raw data.
 
@@ -756,6 +761,7 @@ def get_raw(upload_hash, calc_hash):
     :param string upload_hash: the hash of the upload (from uploaded file contents)
     :param string calc_hash: the hash of the calculation (from mainfile)
     :qparam str auxfile: an optional aux_file to download the respective aux file, default is mainfile
+    :qparam all: set any value to get a .zip with main and aux files instead of an individual file
     :resheader Content-Type: application/json
     :status 200: calc raw data successfully retrieved
     :status 404: calc with given hashes does not exist or the given aux file does not exist
@@ -769,33 +775,76 @@ def get_raw(upload_hash, calc_hash):
     except Exception as e:
         abort(500, message=str(e))
 
-    auxfile = request.args.get('auxfile', None)
-    if auxfile:
-        filename = os.path.join(os.path.dirname(repo.mainfile), auxfile)
+    @contextmanager
+    def raw_file(filename):
+        try:
+            upload = Upload.get(repo.upload_id)
+            upload_file = UploadFile(upload.upload_id, local_path=upload.local_path)
+            the_file = upload_file.get_file(filename)
+            with the_file.open() as f:
+                yield f
+        except KeyError:
+            abort(404, message='The file %s does not exist.' % filename)
+        except FileNotFoundError:
+            abort(404, message='The file %s does not exist.' % filename)
+
+    get_all = request.args.get('all', None) is not None
+    if get_all:
+        # retrieve the 'whole' calculation, meaning the mainfile and all aux files as
+        # a .zip archive
+        def generator():
+            """ Stream a zip file with all files using zipstream. """
+            def iterator():
+                """ Replace the directory based iter of zipstream with an iter over all raw files. """
+                def write(filename):
+                    """ Write a raw file to the zipstream. """
+                    def iter_content():
+                        """ Iterate the raw file contents. """
+                        with raw_file(filename) as file_object:
+                            while True:
+                                data = file_object.read(1024)
+                                if not data:
+                                    break
+                                yield data
+                    return dict(arcname=filename, iterable=iter_content())
+
+                yield write(repo.mainfile)
+                for auxfile in repo.aux_files:
+                    yield write(os.path.join(os.path.dirname(repo.mainfile), auxfile))
+
+            zip_stream = zipstream.ZipFile(mode='w', compression=ZIP_DEFLATED)
+            zip_stream.paths_to_write = iterator()
+
+            for chunk in zip_stream:
+                yield chunk
+
+        response = Response(generator(), mimetype='application/zip')
+        response.headers['Content-Disposition'] = 'attachment; filename={}'.format('%s.zip' % archive_id)
+        return response
     else:
-        filename = repo.mainfile
+        # retrieve an individual raw file
+        auxfile = request.args.get('auxfile', None)
+        if auxfile:
+            filename = os.path.join(os.path.dirname(repo.mainfile), auxfile)
+        else:
+            filename = repo.mainfile
 
-    try:
-        upload = Upload.get(repo.upload_id)
-        upload_file = UploadFile(upload.upload_id, local_path=upload.local_path)
-        the_file = upload_file.get_file(filename)
-        with the_file.open() as f:
-            rv = send_file(
-                f,
-                mimetype='application/octet-stream',
-                as_attachment=True,
-                attachment_filename=os.path.basename(filename))
-            return rv
-    except KeyError:
-        abort(404, message='The file %s does not exist.' % filename)
-    except FileNotFoundError:
-        abort(404, message='The file %s does not exist.' % filename)
-    except Exception as e:
-        logger = get_logger(
-            __name__, endpoint='archive', action='get',
-            upload_hash=upload_hash, calc_hash=calc_hash)
-        logger.error('Exception on accessing archive', exc_info=e)
-        abort(500, message='Could not accessing the archive.')
+        try:
+            with raw_file(filename) as f:
+                rv = send_file(
+                    f,
+                    mimetype='application/octet-stream',
+                    as_attachment=True,
+                    attachment_filename=os.path.basename(filename))
+                return rv
+        except HTTPException as e:
+            raise e
+        except Exception as e:
+            logger = get_logger(
+                __name__, endpoint='archive', action='get',
+                upload_hash=upload_hash, calc_hash=calc_hash)
+            logger.error('Exception on accessing archive', exc_info=e)
+            abort(500, message='Could not accessing the archive.')
 
 
 @app.route('%s/admin/<string:operation>' % base_path, methods=['POST'])
diff --git a/nomad/client.py b/nomad/client.py
index 89ab72f43a..9b1bdafb4f 100644
--- a/nomad/client.py
+++ b/nomad/client.py
@@ -26,6 +26,10 @@ import requests
 from requests.auth import HTTPBasicAuth
 import click
 
+from nomad import config
+from nomad.files import UploadFile
+
+
 api_base = 'http://localhost/nomad/api'
 user = 'other@gmail.com'
 pw = 'nomad'
@@ -44,13 +48,15 @@ def handle_common_errors(func):
 
 
 @handle_common_errors
-def upload_file(file_path, name=None, offline=False):
+def upload_file(file_path: str, name: str = None, offline: bool = False):
     """
     Upload a file to nomad.
 
     Arguments:
-        file_path: Path to the file, absolute or relative to call directory.
-        name: Optional name, default is the file_path's basename
+        file_path: path to the file, absolute or relative to call directory
+        name: optional name, default is the file_path's basename
+        offline: allows processing data without an upload; requires the client to be run on the server
+
     """
     auth = HTTPBasicAuth(user, pw)
 
@@ -115,6 +121,51 @@ def walk_through_files(path, extension='.zip'):
                 yield os.path.abspath(os.path.join(dirpath, filename))
 
 
+class CalcProcReproduction(UploadFile):
+    """
+    Instances represent a local reproduction of the processing for a single calculation.
+    It allows to download raw data from a nomad server and reproduce its processing
+    (parsing, normalizing) with the locally installed parsers and normalizers.
+
+    The use-case is error/warning reproduction. Use ELK to identify errors, use
+    the upload, archive ids/hashes given by ELK, and reproduce and fix the error
+    in your development environment.
+
+    This is a subclass of :class:`UploadFile`; the downloaded raw data is treated as
+    a fake 'upload' that only contains the respective calculation data. This allows us
+    to locally run processing code that is very similar to the one used on the server.
+
+    Arguments:
+        archive_id: id of the calculation, used for the ``/raw`` request and the local file name
+    """
+    def __init__(self, archive_id: str) -> None:
+        local_path = os.path.join(config.fs.tmp, '%s.zip' % archive_id)
+        if not os.path.exists(local_path):
+            # download raw if not already downloaded; raise on HTTP errors rather than cache them
+            req = requests.get('%s/raw/%s?all=1' % (api_base, archive_id), stream=True)
+            req.raise_for_status()
+            with open(local_path, 'wb') as f:
+                for chunk in req.iter_content(chunk_size=64 * 1024):
+                    f.write(chunk)
+
+        super().__init__(upload_id='tmp_%s' % archive_id, local_path=local_path)
+
+    def parse(self, parser_name: str = None):
+        """
+        Run the given parser on the downloaded calculation. If no parser is given,
+        do parser matching and use the respective parser.
+        """
+        pass
+
+    def normalize(self, normalizer_name: str):
+        """ Parse the downloaded calculation and run the given normalizer. """
+        pass
+
+    def normalize_all(self):
+        """ Parse the downloaded calculation and run the whole normalizer chain. """
+        pass
+
+
 @click.group()
 @click.option('--host', default='localhost', help='The host nomad runs on, default is "localhost".')
 @click.option('--port', default=80, help='the port nomad runs with, default is 80.')
diff --git a/requirements.txt b/requirements.txt
index 83c95fc81c..11cd901a6b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,5 @@ requests
 click
 sphinx
 sphinxcontrib.httpdomain
-sphinx_rtd_theme
\ No newline at end of file
+sphinx_rtd_theme
+zipstream
\ No newline at end of file
diff --git a/tests/test_api.py b/tests/test_api.py
index 07fba40ba1..c0ec23758d 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -352,6 +352,14 @@ def test_raw_missing_auxfile(client, example_repo_with_files, no_warn):
     assert rv.status_code == 404
 
 
+def test_raw_all_files(client, example_repo_with_files, no_warn):
+    rv = client.get('/raw/%s?all=1' % example_repo_with_files.archive_id)
+    assert rv.status_code == 200
+    assert len(rv.data) > 0
+    # check the zip signature in memory; do not litter the working directory with test.zip
+    assert rv.data[:2] == b'PK'
+
+
 def test_raw_missing_mainfile(client, no_warn):
     rv = client.get('/raw/doesnot/exist')
     assert rv.status_code == 404
-- 
GitLab