From d028b9ed1d953f22e203dbf6b131ff8d96db994b Mon Sep 17 00:00:00 2001 From: Markus Scheidgen <markus.scheidgen@gmail.com> Date: Mon, 8 Oct 2018 16:21:32 +0200 Subject: [PATCH] Added all calc raw file zipball download. --- nomad/api.py | 103 ++++++++++++++++++++++++++++++++++------------ nomad/client.py | 57 +++++++++++++++++++++++-- requirements.txt | 3 +- tests/test_api.py | 8 ++++ 4 files changed, 140 insertions(+), 31 deletions(-) diff --git a/nomad/api.py b/nomad/api.py index 3652010f16..f50d5f23e3 100644 --- a/nomad/api.py +++ b/nomad/api.py @@ -12,13 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from flask import Flask, request, g, jsonify, send_file +from werkzeug.exceptions import HTTPException +from flask import Flask, request, g, jsonify, send_file, Response from flask_restful import Resource, Api, abort from flask_cors import CORS from flask_httpauth import HTTPBasicAuth from elasticsearch.exceptions import NotFoundError from datetime import datetime import os.path +import zipstream +from zipfile import ZIP_DEFLATED +from contextlib import contextmanager +import types from nomad import config, infrastructure from nomad.files import UploadFile, ArchiveFile, ArchiveLogFile @@ -742,7 +747,7 @@ def get_raw(upload_hash, calc_hash): """ Get calculation mainfile raw data. Calcs are references via *upload_hash*, *calc_hash* pairs. Returns the mainfile, unless an aux_file is specified. Aux files are stored - in repository entries. + in repository entries. See ``/repo`` endpoint. .. :quickref: repo; Get calculation raw data. 
@@ -756,6 +761,7 @@ def get_raw(upload_hash, calc_hash): :param string upload_hash: the hash of the upload (from uploaded file contents) :param string calc_hash: the hash of the calculation (from mainfile) :qparam str auxfile: an optional aux_file to download the respective aux file, default is mainfile + :qparam all: set any value to get a .zip with main and aux files instead of an individual file :resheader Content-Type: application/json :status 200: calc raw data successfully retrieved :status 404: calc with given hashes does not exist or the given aux file does not exist @@ -769,33 +775,76 @@ def get_raw(upload_hash, calc_hash): except Exception as e: abort(500, message=str(e)) - auxfile = request.args.get('auxfile', None) - if auxfile: - filename = os.path.join(os.path.dirname(repo.mainfile), auxfile) + @contextmanager + def raw_file(filename): + try: + upload = Upload.get(repo.upload_id) + upload_file = UploadFile(upload.upload_id, local_path=upload.local_path) + the_file = upload_file.get_file(filename) + with the_file.open() as f: + yield f + except KeyError: + abort(404, message='The file %s does not exist.' % filename) + except FileNotFoundError: + abort(404, message='The file %s does not exist.' % filename) + + get_all = request.args.get('all', None) is not None + if get_all: + # retrieve the 'whole' calculation, meaning the mainfile and all aux files as + # a .zip archive + def generator(): + """ Stream a zip file with all files using zipstream. """ + def iterator(): + """ Replace the directory based iter of zipstream with an iter over all raw files. """ + def write(filename): + """ Write a raw file to the zipstream. """ + def iter_content(): + """ Iterate the raw file contents. 
""" + with raw_file(filename) as file_object: + while True: + data = file_object.read(1024) + if not data: + break + yield data + return dict(arcname=filename, iterable=iter_content()) + + yield write(repo.mainfile) + for auxfile in repo.aux_files: + yield write(os.path.join(os.path.dirname(repo.mainfile), auxfile)) + + zip_stream = zipstream.ZipFile(mode='w', compression=ZIP_DEFLATED) + zip_stream.paths_to_write = iterator() + + for chunk in zip_stream: + yield chunk + + response = Response(generator(), mimetype='application/zip') + response.headers['Content-Disposition'] = 'attachment; filename={}'.format('%s.zip' % archive_id) + return response else: - filename = repo.mainfile + # retrieve an individual raw file + auxfile = request.args.get('auxfile', None) + if auxfile: + filename = os.path.join(os.path.dirname(repo.mainfile), auxfile) + else: + filename = repo.mainfile - try: - upload = Upload.get(repo.upload_id) - upload_file = UploadFile(upload.upload_id, local_path=upload.local_path) - the_file = upload_file.get_file(filename) - with the_file.open() as f: - rv = send_file( - f, - mimetype='application/octet-stream', - as_attachment=True, - attachment_filename=os.path.basename(filename)) - return rv - except KeyError: - abort(404, message='The file %s does not exist.' % filename) - except FileNotFoundError: - abort(404, message='The file %s does not exist.' 
% filename) - except Exception as e: - logger = get_logger( - __name__, endpoint='archive', action='get', - upload_hash=upload_hash, calc_hash=calc_hash) - logger.error('Exception on accessing archive', exc_info=e) - abort(500, message='Could not accessing the archive.') + try: + with raw_file(filename) as f: + rv = send_file( + f, + mimetype='application/octet-stream', + as_attachment=True, + attachment_filename=os.path.basename(filename)) + return rv + except HTTPException as e: + raise e + except Exception as e: + logger = get_logger( + __name__, endpoint='archive', action='get', + upload_hash=upload_hash, calc_hash=calc_hash) + logger.error('Exception on accessing archive', exc_info=e) + abort(500, message='Could not accessing the archive.') @app.route('%s/admin/<string:operation>' % base_path, methods=['POST']) diff --git a/nomad/client.py b/nomad/client.py index 89ab72f43a..9b1bdafb4f 100644 --- a/nomad/client.py +++ b/nomad/client.py @@ -26,6 +26,10 @@ import requests from requests.auth import HTTPBasicAuth import click +from nomad import config +from nomad.files import UploadFile + + api_base = 'http://localhost/nomad/api' user = 'other@gmail.com' pw = 'nomad' @@ -44,13 +48,15 @@ def handle_common_errors(func): @handle_common_errors -def upload_file(file_path, name=None, offline=False): +def upload_file(file_path: str, name: str = None, offline: bool = False): """ Upload a file to nomad. Arguments: - file_path: Path to the file, absolute or relative to call directory. 
- name: Optional name, default is the file_path's basename + file_path: path to the file, absolute or relative to call directory + name: optional name, default is the file_path's basename + offline: allows processing data without an upload; requires the client to be run on the server + """ auth = HTTPBasicAuth(user, pw) @@ -115,6 +121,51 @@ def walk_through_files(path, extension='.zip'): yield os.path.abspath(os.path.join(dirpath, filename)) +class CalcProcReproduction(UploadFile): + """ + Instances represent a local reproduction of the processing for a single calculation. + It allows downloading raw data from a nomad server and reproducing its processing + (parsing, normalizing) with the locally installed parsers and normalizers. + + The use-case is error/warning reproduction. Use ELK to identify errors, use + the upload and archive ids/hashes given by ELK, and reproduce and fix the error + in your development environment. + + This is a subclass of :class:`UploadFile`; the downloaded raw data will be treated as + a fake 'upload' that only contains the respective calculation data. This allows us + to locally run processing code that is very similar to the one used on the server. + """ + def __init__(self, archive_id: str) -> CalcProcReproduction: + local_path = os.path.join(config.fs.tmp, '%s.zip' % archive_id) + if not os.path.exists(local_path): + # download raw if not already downloaded + req = requests.get('%s/raw/%s?all=1' % (api_base, archive_id), stream=True) + with open(local_path, 'wb') as f: + for chunk in req.iter_content(): + f.write(chunk) + + super().__init__(upload_id='tmp_%s' % archive_id, local_path=local_path) + + def parse(parser_name: str = None): + """ + Run the given parser on the downloaded calculation. If no parser is given, + do parser matching and use the respective parser. + """ + pass + + def normalize(normalizer_name: str): + """ + Parse the downloaded calculation and run the given normalizer. 
+ """ + pass + + def normalize_all(): + """ + Parse the downloaded calculation and run the whole normalizer chain. + """ + pass + + @click.group() @click.option('--host', default='localhost', help='The host nomad runs on, default is "localhost".') @click.option('--port', default=80, help='the port nomad runs with, default is 80.') diff --git a/requirements.txt b/requirements.txt index 83c95fc81c..11cd901a6b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,4 +21,5 @@ requests click sphinx sphinxcontrib.httpdomain -sphinx_rtd_theme \ No newline at end of file +sphinx_rtd_theme +zipstream \ No newline at end of file diff --git a/tests/test_api.py b/tests/test_api.py index 07fba40ba1..c0ec23758d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -352,6 +352,14 @@ def test_raw_missing_auxfile(client, example_repo_with_files, no_warn): assert rv.status_code == 404 +def test_raw_all_files(client, example_repo_with_files, no_warn): + rv = client.get('/raw/%s?all=1' % example_repo_with_files.archive_id) + assert rv.status_code == 200 + assert len(rv.data) > 0 + with open('test.zip', 'wb') as f: + f.write(rv.data) + + def test_raw_missing_mainfile(client, no_warn): rv = client.get('/raw/doesnot/exist') assert rv.status_code == 404 -- GitLab