Commit b48ba67e authored by Markus Scheidgen
Browse files

Added file_pattern and strip parameters to /raw/query.

parent 82002d6b
Pipeline #62502 passed with stages
in 24 minutes and 24 seconds
......@@ -78,6 +78,11 @@ your browser.
## Change log
Omitted versions are plain bugfix releases with only minor changes and fixes.
### v0.6.2
- The API /raw/query endpoint accepts file patterns to further filter the download contents and
can strip a shared path prefix from all file paths for a cleaner download .zip
- minor bugfixes
### v0.6.0
- GUI URL, and API endpoint that resolves NOMAD CoE legacy PIDs
- Support for datasets in the GUI
......
......@@ -16,7 +16,7 @@
The raw API of the nomad@FAIRDI APIs. Can be used to retrieve raw calculation files.
"""
from typing import IO, Any, Union, Iterable, Tuple, Set
from typing import IO, Any, Union, Iterable, Tuple, Set, List
import os.path
import zipstream
from flask import Response, request, send_file, stream_with_context
......@@ -24,6 +24,7 @@ from flask_restplus import abort, Resource, fields
import magic
import sys
import contextlib
import fnmatch
from nomad import search, utils
from nomad.files import UploadFiles, Restricted
......@@ -346,16 +347,25 @@ class RawFilesResource(Resource):
raw_file_from_query_parser = search_request_parser.copy()
raw_file_from_query_parser = dict(
raw_file_from_query_parser.add_argument(
name='compress', type=bool, help='Use compression on .zip files, default is not.',
location='args')
raw_file_from_query_parser.add_argument(
name='strip', type=bool, help='Removes a potential common path prefix from all file paths.',
location='args')
raw_file_from_query_parser.add_argument(
name='file_pattern', type=str,
help=(
'A wildcard pattern. Only filenames that match this pattern will be in the '
'download. Multiple patterns will be combined with logical or'),
location='args', action='append')
@ns.route('/query')
class RawFileQueryResource(Resource):
@api.doc('raw_files_from_query')
@api.response(400, 'Invalid requests, e.g. wrong owner type or bad search parameters')
@api.expect(search_request_parser, validate=True)
@api.expect(raw_file_from_query_parser, validate=True)
@api.response(200, 'File(s) send', headers={'Content-Type': 'application/gz'})
@login_if_available
def get(self):
......@@ -366,8 +376,17 @@ class RawFileQueryResource(Resource):
Zip files are streamed; instead of 401 errors, the zip file will just not contain
any files that the user is not authorized to access.
"""
patterns: List[str] = None
try:
compress = bool(request.args.get('compress', False))
strip = bool(request.args.get('strip', False))
pattern = request.args.get('file_pattern', None)
if isinstance(pattern, str):
patterns = [pattern]
elif pattern is None:
patterns = []
else:
patterns = pattern
except Exception:
abort(400, message='bad parameter types')
......@@ -378,6 +397,12 @@ class RawFileQueryResource(Resource):
(entry['upload_id'], entry['mainfile'])
for entry in search_request.execute_scan()], key=lambda x: x[0])
paths = ['%s/%s' % (upload_id, mainfile) for upload_id, mainfile in calcs]
if strip:
common_prefix_len = len(utils.common_prefix(paths))
else:
common_prefix_len = 0
def generator():
for upload_id, mainfile in calcs:
upload_files = UploadFiles.get(
......@@ -392,8 +417,16 @@ class RawFileQueryResource(Resource):
zipfile_cache = contextlib.suppress()
with zipfile_cache:
for filename in list(upload_files.raw_file_manifest(path_prefix=os.path.dirname(mainfile))):
yield os.path.join(upload_id, filename), filename, upload_files
filenames = upload_files.raw_file_manifest(
path_prefix=os.path.dirname(mainfile))
for filename in filenames:
filename_w_upload = os.path.join(upload_files.upload_id, filename)
filename_wo_prefix = filename_w_upload[common_prefix_len:]
if len(patterns) == 0 or any(
fnmatch.fnmatchcase(os.path.basename(filename_wo_prefix), pattern)
for pattern in patterns):
yield filename_wo_prefix, filename, upload_files
return _streamed_zipfile(generator(), zipfile_name='nomad_raw_files.zip', compress=compress)
......
......@@ -432,3 +432,35 @@ class ETA:
def __exit__(self, *args, **kwargs):
print('')
def common_prefix(paths):
    """
    Determines the longest prefix shared by all given paths, cut at a '/' boundary.

    The result always ends with '/' (it covers whole path segments only). For a
    single path this is everything up to and including its last '/'.
    Returns an empty string if there is no common prefix or if paths is empty.
    """
    prefix = None
    for path in paths:
        # The first path seeds the candidate prefix and is then compared with itself,
        # which trims it down to its own directory part.
        if prefix is None:
            prefix = path

        # Walk both strings in lockstep and remember the last '/' seen before
        # the first differing character.
        last_slash = -1
        for position, (left, right) in enumerate(zip(path, prefix)):
            if left != right:
                break
            if left == '/':
                last_slash = position

        # No shared '/' means no shared segment; nothing later can change that.
        if last_slash == -1:
            return ''

        prefix = prefix[:last_slash + 1]

    return '' if prefix is None else prefix
......@@ -1134,6 +1134,25 @@ class TestRaw(UploadFilesBasedTests):
assert zip_file.testzip() is None
assert len(zip_file.namelist()) == 0
@pytest.mark.parametrize('strip', [False, True])
def test_raw_query_pattern(self, api, non_empty_processed, test_user_auth, strip):
    """
    Requests /raw/query with the file pattern '*.json', with and without strip, and
    checks that the returned zip holds exactly one .json entry. With strip the entry
    name must have no directory part; without strip it must have one.
    """
    query = {'file_pattern': '*.json'}
    if strip:
        query['strip'] = True

    response = api.get('/raw/query?%s' % urlencode(query), headers=test_user_auth)
    assert response.status_code == 200
    assert len(response.data) > 0

    with zipfile.ZipFile(io.BytesIO(response.data)) as archive:
        assert archive.testzip() is None
        names = archive.namelist()

    assert len(names) == 1
    assert all(name.endswith('.json') for name in names)

    # A name equal to its own basename has no directory component left.
    is_bare = [os.path.basename(name) == name for name in names]
    if strip:
        assert all(is_bare)
    else:
        assert not any(is_bare)
@UploadFilesBasedTests.ignore_authorization
def test_raw_files_signed(self, api, upload, _, test_user_signature_token):
url = '/raw/%s?files=%s&token=%s' % (
......
......@@ -54,3 +54,12 @@ def test_logging(no_warn):
assert data['event'] == 'test msg'
received_test_event = True
assert received_test_event
def test_common_prefix():
    # Each case pairs the input paths with the prefix utils.common_prefix must return.
    cases = [
        (['aa/bb/cc', 'aa/bb/dd'], 'aa/bb/'),
        (['aa/bb/dc', 'aa/bb/d'], 'aa/bb/'),
        (['aa/b/dc', 'aa/bb/d'], 'aa/'),
        (['a', 'a'], ''),
        (['a', 'ab'], ''),
        (['/a', '/a'], '/'),
    ]
    for paths, expected in cases:
        assert utils.common_prefix(paths) == expected
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment