Commit 7c666fef authored by Markus Scheidgen
Browse files

Added a manifest to raw query results. #225.

parent eddb1c5c
Pipeline #62779 failed with stages
in 32 minutes and 34 seconds
......@@ -25,6 +25,7 @@ import magic
import sys
import contextlib
import fnmatch
import json
from nomad import search, utils
from nomad.files import UploadFiles, Restricted
......@@ -180,8 +181,7 @@ class RawFileFromUploadPathResource(Resource):
@login_if_available
@with_signature_token
def get(self, upload_id: str, path: str):
"""
Get a single raw calculation file, directory contents, or whole directory sub-tree
""" Get a single raw calculation file, directory contents, or whole directory sub-tree
from a given upload.
The 'upload_id' parameter needs to identify an existing upload.
......@@ -239,8 +239,7 @@ class RawFileFromCalcPathResource(Resource):
@login_if_available
@with_signature_token
def get(self, upload_id: str, calc_id: str, path: str):
"""
Get a single raw calculation file, calculation contents, or all files for a
""" Get a single raw calculation file, calculation contents, or all files for a
given calculation.
The 'upload_id' parameter needs to identify an existing upload.
......@@ -277,8 +276,7 @@ class RawFileFromCalcEmptyPathResource(RawFileFromCalcPathResource):
@login_if_available
@with_signature_token
def get(self, upload_id: str, calc_id: str):
"""
Get calculation contents.
""" Get calculation contents.
This is basically /raw/calc/<upload_id>/<calc_id>/<path> with an empty path, since
having an empty path parameter is not possible.
......@@ -312,8 +310,8 @@ class RawFilesResource(Resource):
@api.expect(raw_files_request_model, validate=True)
@login_if_available
def post(self, upload_id):
"""
Download multiple raw calculation files in a .zip file.
""" Download multiple raw calculation files in a .zip file.
Zip files are streamed; instead of 401 errors, the zip file will just not contain
any files that the user is not authorized to access.
"""
......@@ -363,18 +361,24 @@ raw_file_from_query_parser.add_argument(
@ns.route('/query')
class RawFileQueryResource(Resource):
manifest_quantities = ['upload_id', 'calc_id', 'external_id', 'raw_id', 'pid', 'calc_hash']
@api.doc('raw_files_from_query')
@api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
@api.expect(raw_file_from_query_parser, validate=True)
@api.response(200, 'File(s) send', headers={'Content-Type': 'application/gz'})
@login_if_available
def get(self):
"""
Download a .zip file with all raw-files for all entries that match the given
search parameters. See ``/repo`` endpoint for documentation on the search
""" Download a .zip file with all raw-files for all entries that match the given
search parameters.
See ``/repo`` endpoint for documentation on the search
parameters.
Zip files are streamed; instead of 401 errors, the zip file will just not contain
any files that the user is not authorized to access.
The zip file will contain a ``manifest.json`` with the repository meta data.
"""
patterns: List[str] = None
try:
......@@ -394,18 +398,23 @@ class RawFileQueryResource(Resource):
search_request = search.SearchRequest()
add_query(search_request, search_request_parser)
calcs = sorted([
(entry['upload_id'], entry['mainfile'])
for entry in search_request.execute_scan()], key=lambda x: x[0])
def path(entry):
return '%s/%s' % (entry['upload_id'], entry['mainfile'])
calcs = sorted(
[entry for entry in search_request.execute_scan()],
key=lambda x: x['upload_id'])
paths = ['%s/%s' % (upload_id, mainfile) for upload_id, mainfile in calcs]
paths = [path(entry) for entry in calcs]
if strip:
common_prefix_len = len(utils.common_prefix(paths))
else:
common_prefix_len = 0
def generator():
for upload_id, mainfile in calcs:
for entry in calcs:
upload_id = entry['upload_id']
mainfile = entry['mainfile']
upload_files = UploadFiles.get(
upload_id, create_authorization_predicate(upload_id))
if upload_files is None:
......@@ -429,7 +438,24 @@ class RawFileQueryResource(Resource):
yield filename_wo_prefix, filename, upload_files
return _streamed_zipfile(generator(), zipfile_name='nomad_raw_files.zip', compress=compress)
try:
manifest = {
path(entry): {
key: entry[key]
for key in RawFileQueryResource.manifest_quantities
if entry.get(key) is not None
}
for entry in calcs
}
manifest_contents = json.dumps(manifest)
except Exception as e:
manifest_contents = dict(error='Could not create the manifest: %s' % (e))
utils.get_logger(__name__).error(
'could not create raw query manifest', exc_info=e)
return _streamed_zipfile(
generator(), zipfile_name='nomad_raw_files.zip', compress=compress,
manifest=manifest_contents)
def respond_to_get_raw_files(upload_id, files, compress=False):
......@@ -453,7 +479,7 @@ def respond_to_get_raw_files(upload_id, files, compress=False):
def _streamed_zipfile(
files: Iterable[Tuple[str, str, UploadFiles]], zipfile_name: str,
compress: bool = False):
compress: bool = False, manifest: str = None):
"""
Creates a response that streams the given files as a streamed zip file. Ensures that
each given file is only streamed once, based on its filename in the resulting zipfile.
......@@ -465,6 +491,7 @@ def _streamed_zipfile(
zipfile_name: A name that will be used in the content disposition attachment
used as an HTTP response.
compress: Uses compression. Default is stored only.
manifest: Add a ``manifest.json`` with the given content.
"""
streamed_files: Set[str] = set()
......@@ -476,6 +503,11 @@ def _streamed_zipfile(
Replace the directory based iter of zipstream with an iter over all given
files.
"""
# first the manifest
if manifest is not None:
yield dict(arcname='manifest.json', iterable=(manifest.encode('utf-8'),))
# now the actual contents
for zipped_filename, upload_filename, upload_files in files:
if zipped_filename in streamed_files:
continue
......
......@@ -524,7 +524,7 @@ class UploadFilesBasedTests:
wrapper.__signature__ = wrapper_sig
@staticmethod
def check_authorizaton(func):
def check_authorization(func):
@pytest.mark.parametrize('test_data', [
[True, None, True], # in staging for upload
[True, None, False], # in staging for different user
......@@ -543,7 +543,7 @@ class UploadFilesBasedTests:
except AssertionError as assertion:
assertion_str = str(assertion)
if not authorized:
if '0 == 5' in assertion_str and 'ZipFile' in assertion_str:
if '0 == 5' in assertion_str:
# the user is not authorized and gets an empty zip as expected
return
if '401' in assertion_str:
......@@ -609,7 +609,7 @@ class UploadFilesBasedTests:
class TestArchive(UploadFilesBasedTests):
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_get(self, api, upload, auth_headers):
rv = api.get('/archive/%s/0' % upload, headers=auth_headers)
assert rv.status_code == 200
......@@ -621,7 +621,7 @@ class TestArchive(UploadFilesBasedTests):
assert rv.status_code == 200
assert json.loads(rv.data) is not None
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_get_calc_proc_log(self, api, upload, auth_headers):
rv = api.get('/archive/logs/%s/0' % upload, headers=auth_headers)
assert rv.status_code == 200
......@@ -1006,6 +1006,24 @@ class TestRepo():
class TestRaw(UploadFilesBasedTests):
def assert_zip_file(self, rv, files: int = -1, basename: bool = None):
    """
    Asserts that the given response carries a valid, non-empty zip file.

    Arguments:
        rv: The response to check; its ``status_code`` must be 200 and its
            ``data`` must hold the bytes of the zip file.
        files: The expected number of entries in the zip file. A negative
            value (the default) skips this check.
        basename: If ``True``, all entries except ``manifest.json`` must be plain
            file names without a directory part. If ``False``, all of them must
            have a directory part. ``None`` (the default) skips this check.
    """
    assert rv.status_code == 200
    assert len(rv.data) > 0
    with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
        assert zip_file.testzip() is None
        zip_files = zip_file.namelist()

        if files >= 0:
            assert len(zip_files) == files

        if basename is not None:
            # the manifest always sits at the zip root, regardless of stripping
            content_files = [name for name in zip_files if name != 'manifest.json']
            if basename:
                assert all(
                    os.path.basename(name) == name for name in content_files)
            else:
                assert all(
                    os.path.basename(name) != name for name in content_files)
def test_raw_file_from_calc(self, api, non_empty_processed, test_user_auth):
calc = list(non_empty_processed.calcs)[0]
url = '/raw/calc/%s/%s/%s' % (
......@@ -1020,14 +1038,14 @@ class TestRaw(UploadFilesBasedTests):
result = json.loads(rv.data)
assert len(result['contents']) > 0
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_raw_file(self, api, upload, auth_headers):
    """ Requests the example mainfile of the upload and expects a non-empty 200 response. """
    response = api.get(
        '/raw/%s/%s' % (upload, example_file_mainfile), headers=auth_headers)
    status, payload = response.status_code, response.data
    assert status == 200
    assert len(payload) > 0
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_raw_file_partial(self, api, upload, auth_headers):
url = '/raw/%s/%s?offset=0&length=20' % (upload, example_file_mainfile)
rv = api.get(url, headers=auth_headers)
......@@ -1066,10 +1084,7 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url, headers=auth_headers)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents)
self.assert_zip_file(rv, files=len(example_file_contents))
@UploadFilesBasedTests.ignore_authorization
def test_raw_file_wildcard_missing(self, api, upload, auth_headers):
......@@ -1084,7 +1099,7 @@ class TestRaw(UploadFilesBasedTests):
assert rv.status_code == 404
@pytest.mark.parametrize('compress', [True, False])
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_raw_files(self, api, upload, auth_headers, compress):
url = '/raw/%s?files=%s' % (
upload, ','.join(example_file_contents))
......@@ -1093,10 +1108,7 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url, headers=auth_headers)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents)
self.assert_zip_file(rv, files=len(example_file_contents))
@pytest.mark.parametrize('compress', [False, True])
def test_raw_files_from_query_upload_id(self, api, non_empty_processed, test_user_auth, compress):
......@@ -1104,10 +1116,7 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url, headers=test_user_auth)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents)
self.assert_zip_file(rv, files=len(example_file_contents) + 1)
@pytest.mark.parametrize('query_params', [
{'atoms': 'Si'},
......@@ -1119,20 +1128,18 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url, headers=test_user_auth)
assert rv.status_code == 200
assert len(rv.data) > 0
self.assert_zip_file(rv, files=len(example_file_contents) * len(processeds) + 1)
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents) * len(processeds)
with zip_file.open('manifest.json', 'rt') as f:
manifest = json.load(f)
assert len(manifest) == len(processeds)
def test_raw_files_from_empty_query(self, api, elastic):
url = '/raw/query?upload_id=doesNotExist'
rv = api.get(url)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == 0
self.assert_zip_file(rv, files=1)
@pytest.mark.parametrize('files, pattern, strip', [
(1, '*.json', False),
......@@ -1145,15 +1152,7 @@ class TestRaw(UploadFilesBasedTests):
url = '/raw/query?%s' % urlencode(params, doseq=True)
rv = api.get(url, headers=test_user_auth)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
zip_files = zip_file.namelist()
assert len(zip_files) == files
if strip:
assert all(os.path.basename(name) == name for name in zip_files)
else:
assert all(os.path.basename(name) != name for name in zip_files)
self.assert_zip_file(rv, files=(files + 1), basename=strip)
@UploadFilesBasedTests.ignore_authorization
def test_raw_files_signed(self, api, upload, _, test_user_signature_token):
......@@ -1162,13 +1161,10 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents)
self.assert_zip_file(rv, files=len(example_file_contents))
@pytest.mark.parametrize('compress', [True, False, None])
@UploadFilesBasedTests.check_authorizaton
@UploadFilesBasedTests.check_authorization
def test_raw_files_post(self, api, upload, auth_headers, compress):
url = '/raw/%s' % upload
data = dict(files=example_file_contents)
......@@ -1177,10 +1173,7 @@ class TestRaw(UploadFilesBasedTests):
rv = api.post(url, data=json.dumps(data), content_type='application/json', headers=auth_headers)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == len(example_file_contents)
self.assert_zip_file(rv, files=len(example_file_contents))
@pytest.mark.parametrize('compress', [True, False])
@UploadFilesBasedTests.ignore_authorization
......@@ -1191,10 +1184,7 @@ class TestRaw(UploadFilesBasedTests):
rv = api.get(url, headers=auth_headers)
assert rv.status_code == 200
assert len(rv.data) > 0
with zipfile.ZipFile(io.BytesIO(rv.data)) as zip_file:
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == 1
self.assert_zip_file(rv, files=1)
@UploadFilesBasedTests.ignore_authorization
def test_raw_files_missing_upload(self, api, upload, auth_headers):
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment