Commit 604a854f authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Overhaul of the raw file API. Allow access to directory contents. Adapted the...

Overhaul of the raw file API. Allow access to directory contents. Adapted the GUI raw file component accordingly.
parent 06746182
Pipeline #52135 failed with stages
in 3 minutes and 46 seconds
......@@ -44,7 +44,7 @@
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.pyenv/bin/pytest",
"args": [
"-sv", "tests/test_api.py::TestRepo::test_search[2-user-other_test_user]"
"-sv", "tests/test_api.py::TestRaw::test_raw_file_from_calc"
]
},
{
......
......@@ -242,6 +242,27 @@ class Api {
.finally(this.onFinishLoading)
}
async getRawFileListFromCalc(uploadId, calcId) {
this.onStartLoading()
return this.swaggerPromise
.then(client => {
try {
return client.apis.raw.get_file_list_from_calc({
upload_id: uploadId,
calc_id: calcId,
path: null
})
} catch (e) {
console.log(e)
}
})
.catch(this.handleApiError)
.then(response => {
return response.body
})
.finally(this.onFinishLoading)
}
async repo(uploadId, calcId) {
this.onStartLoading()
return this.swaggerPromise
......
......@@ -9,11 +9,11 @@ import Download from './Download'
class RawFiles extends React.Component {
static propTypes = {
classes: PropTypes.object.isRequired,
uploadId: PropTypes.str.isRequired,
mainfile: PropTypes.str.isRequired,
data: PropTypes.object.isRequired,
api: PropTypes.object.isRequired,
user: PropTypes.object,
loading: PropTypes.number.isRequired
loading: PropTypes.number.isRequired,
raiseError: PropTypes.func.isRequired
}
static styles = theme => ({
......@@ -25,6 +25,7 @@ class RawFiles extends React.Component {
state = {
selectedFiles: [],
uploadDirectory: null,
files: null
}
......@@ -39,9 +40,9 @@ class RawFiles extends React.Component {
}
update() {
const {uploadId, mainfile} = this.props
this.props.api.raw_file_list(uploadId, mainfile).then(data => {
this.setState({files: data.files})
const {data: {uploadId, calcId}} = this.props
this.props.api.getRawFileListFromCalc(uploadId, calcId).then(data => {
this.setState({files: data.contents, uploadDirectory: data.directory})
}).catch(error => {
this.setState({files: null})
this.props.raiseError(error)
......@@ -49,7 +50,7 @@ class RawFiles extends React.Component {
}
label(file) {
return file.substring(file.lastIndexOf('/') + 1)
return file
}
onSelectFile(file) {
......@@ -64,21 +65,10 @@ class RawFiles extends React.Component {
}
render() {
const {classes, uploadId, mainfile, loading} = this.props
const {selectedFiles, files} = this.state
const {classes, data: {upload_id, calc_id}, loading} = this.props
const {selectedFiles, files, uploadDirectory} = this.state
const mainfileLocal = mainfile.split('/')[-1]
let availableFiles = [
{
file: mainfileLocal,
size: -1
}
]
if (files) {
const mainfileIndex = files.findIndex(file => file.file === mainfileLocal)
}
const availableFiles = files ? files.map(file => file.name) : []
const someSelected = selectedFiles.length > 0
const allSelected = availableFiles.length === selectedFiles.length && someSelected
......@@ -100,7 +90,7 @@ class RawFiles extends React.Component {
</FormLabel>
<Download component={IconButton} disabled={selectedFiles.length === 0}
tooltip="download selected files"
url={(selectedFiles.length === 1) ? `raw/${uploadId}/${selectedFiles[0]}` : `raw/${calc_id}?files=${encodeURIComponent(selectedFiles.join(','))}`}
url={(selectedFiles.length === 1) ? `raw/${upload_id}/${uploadDirectory}/${selectedFiles[0]}` : `raw/${upload_id}?files=${encodeURIComponent(selectedFiles.map(file => `${uploadDirectory}/${file}`).join(','))}`}
fileName={selectedFiles.length === 1 ? this.label(selectedFiles[0]) : `${calc_id}.zip`}
>
<DownloadIcon />
......
......@@ -16,19 +16,27 @@
The raw API of the nomad@FAIRDI APIs. Can be used to retrieve raw calculation files.
"""
from typing import IO, Any, Union
import os.path
from zipfile import ZIP_DEFLATED, ZIP_STORED
import zipstream
from flask import Response, request, send_file, stream_with_context
from flask_restplus import abort, Resource, fields
import magic
import sys
from nomad.files import UploadFiles, Restricted
from nomad.processing import Calc
from .app import api
from .auth import login_if_available, create_authorization_predicate, \
signature_token_argument, with_signature_token
if sys.version_info >= (3, 7):
import zipfile
else:
import zipfile37 as zipfile
ns = api.namespace('raw', description='Downloading raw data files.')
raw_file_list_model = api.model('RawFileList', {
......@@ -46,65 +54,147 @@ raw_file_compress_argument = dict(
raw_file_from_path_parser = api.parser()
raw_file_from_path_parser.add_argument(**raw_file_compress_argument)
raw_file_from_path_parser.add_argument(**signature_token_argument)
raw_file_from_path_parser.add_argument(
name='length', type=int, help='Download only x bytes from the given file.',
location='args')
raw_file_from_path_parser.add_argument(
name='offset', type=int, help='Start downloading a file\' content from the given offset.',
location='args')
@ns.route('/list/<string:upload_id>/<path:directory>')
@api.doc(params={
'upload_id': 'The unique id for the requested upload.',
'directory': 'The directory in the upload with the desired contents.'
})
@api.header('Content-Type', 'application/json')
class RawFileList(Resource):
@api.doc('get')
@api.response(404, 'The upload or path does not exist')
@api.response(401, 'Not authorized to access the data.')
@api.response(200, 'File(s) send', headers={'Content-Type': 'application/json'})
@api.marshal_with(raw_file_list_model, skip_none=True, code=200, description='File list send')
@login_if_available
@with_signature_token
def get(self, upload_id: str, directory: str):
"""
Get the contents of the given directory for the given upload.
If the path points to a file a single entry is returned. If the path
points to a directory, information on all files in the directory are returned.
"""
class FileView:
    """
    File-like wrapper that restricts the contents to a portion of the file.

    All positions exposed by this class (:func:`seek`, :func:`tell`) are relative
    to the start of the view, not to the start of the underlying file.

    Arguments:
        f: the file-like
        offset: the offset of the view within ``f``
        length: the amount of bytes the view covers
    """

    def __init__(self, f, offset, length):
        self.f = f
        self.f_offset = offset
        self.offset = 0
        self.length = length

    def seek(self, offset, whence=0):
        """
        Move the position within the view and return the new view-relative position.

        Positions that would end up before the start of the view are clamped to 0,
        so that subsequent reads can never reach bytes in front of the view.
        """
        if whence == os.SEEK_SET:
            position = offset
        elif whence == os.SEEK_CUR:
            position = self.offset + offset
        elif whence == os.SEEK_END:
            position = self.length + offset
        else:
            # Other values of whence should raise an IOError; delegate to the
            # underlying file so that it produces the error.
            return self.f.seek(offset, whence)

        self.offset = max(0, position)
        self.f.seek(self.f_offset + self.offset, os.SEEK_SET)
        # Return the view position (consistent with tell) instead of the position
        # within the underlying file.
        return self.offset

    def tell(self):
        """ Return the current position relative to the start of the view. """
        return self.offset

    def read(self, size=-1):
        """
        Read up to ``size`` bytes, but never beyond the end of the view.

        A negative ``size`` or ``None`` reads everything up to the end of the view.
        """
        remaining = max(0, self.length - self.offset)
        if size is None or size < 0:
            size = remaining
        else:
            size = min(size, remaining)

        # Always reposition the underlying file; it might be shared or have been
        # moved since the last call.
        self.f.seek(self.f_offset + self.offset, os.SEEK_SET)
        data = self.f.read(size)
        # Advance by what was actually read; the underlying file may be shorter
        # than the view claims.
        self.offset += len(data)
        return data
def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate):
"""
Helper method used by func:`RawFileFromUploadPathResource.get` and
func:`RawFileFromCalcPathResource.get`.
"""
if upload_filepath[-1:] == '*':
upload_filepath = upload_filepath[0:-1]
wildcarded_files = list(upload_files.raw_file_manifest(path_prefix=upload_filepath))
if len(wildcarded_files) == 0:
abort(404, message='There are no files for %s.' % upload_filepath)
else:
compress = request.args.get('compress', None) is not None
return respond_to_get_raw_files(upload_files.upload_id, wildcarded_files, compress)
upload_files = UploadFiles.get(upload_id, create_authorization_predicate(upload_id))
if upload_files is None:
abort(404, message='The upload with id %s does not exist.' % upload_id)
try:
with upload_files.raw_file(upload_filepath, 'br') as raw_file:
buffer = raw_file.read(2048)
mime_type = magic.from_buffer(buffer, mime=True)
files = upload_files.raw_file_list(directory=directory)
if len(files) == 0:
abort(404, message='There are no files for %s.' % directory)
try:
offset = int(request.args.get('offset', 0))
length = int(request.args.get('length', -1))
except Exception:
abort(400, message='bad parameter types')
if offset < 0:
abort(400, message='bad offset, length values')
if offset > 0 and length <= 0:
abort(400, message='bad offset, length values')
raw_file = upload_files.raw_file(upload_filepath, 'br')
raw_file_view: Union[FileView, IO[Any]] = None
if length > 0:
raw_file_view = FileView(raw_file, offset, length)
else:
return {
'upload_id': upload_id,
'directory': directory,
'contents': [dict(file=file, size=size) for file, size in files]
}
raw_file_view = raw_file
return send_file(
raw_file_view,
mimetype=mime_type,
as_attachment=True,
attachment_filename=os.path.basename(upload_filepath))
except Restricted:
abort(401, message='Not authorized to access all files in %s.' % upload_files.upload_id)
except KeyError:
directory_files = upload_files.raw_file_list(upload_filepath)
if len(directory_files) == 0:
abort(404, message='There is nothing to be found at %s.' % upload_filepath)
return {
'upload_id': upload_files.upload_id,
'directory': upload_filepath,
'contents': [
dict(name=name, size=size) for name, size in directory_files]
}, 200
@ns.route('/<string:upload_id>/<path:path>')
@api.doc(params={
'upload_id': 'The unique id for the requested upload.',
'path': 'The path to a file or directory.'
'path': 'The path to a file or directory with optional wildcard.'
})
@api.header('Content-Type', 'application/gz')
class RawFileFromPathResource(Resource):
class RawFileFromUploadPathResource(Resource):
@api.doc('get')
@api.response(404, 'The upload or path does not exist')
@api.response(401, 'Not authorized to access the data.')
@api.response(200, 'File(s) send', headers={'Content-Type': 'application/gz'})
@api.response(401, 'Not authorized to access the requested files.')
@api.response(200, 'File(s) send')
@api.expect(raw_file_from_path_parser, validate=True)
@login_if_available
@with_signature_token
def get(self, upload_id: str, path: str):
"""
Get a single raw calculation file or whole directory from a given upload.
If the given path points to a file, the file is provided. If the given path
points to an directory, the directory and all contents is provided as .zip file.
Get a single raw calculation file, directory contents, or whole directory sub-tree
from a given upload.
The 'upload_id' parameter needs to identify an existing upload.
If the upload
is not yet published or contains requested data with embargo, proper authentication
is required. This can be done via HTTP headers as usual. But, if you need to
access files via plain URLs (e.g. for curl, download link, etc.), URLs for
this endpoint can be token signed (see also /auth/token). For unpublished
uploads, authentication is required regardless. For (partially) embargoed data,
multi file downloads work, but will not contain any embargoed data.
If the given path points to a file, the file is provided with the appropriate
Content-Type header. A 401 is returned for staging, embargo files with unsigned
or wrongly signed URLs. When accessing a file, the additional query parameters 'length'
and 'offset' can be used to partially download a file's content.
If the given path points to a directory, the content (names, sizes, type) is returned
as a json body. Only visible items (depending on authenticated user, token) are
returned.
If the given path ends with the '*' wildcard character, all upload contents that
match the given path at the start, will be returned as a .zip file body.
Zip files are streamed; instead of 401 errors, the zip file will just not contain
any files that the user is not authorized to access.
"""
......@@ -121,29 +211,67 @@ class RawFileFromPathResource(Resource):
if upload_files is None:
abort(404, message='The upload with id %s does not exist.' % upload_id)
if upload_filepath[-1:] == '*':
upload_filepath = upload_filepath[0:-1]
files = list(upload_files.raw_file_manifest(path_prefix=upload_filepath))
if len(files) == 0:
abort(404, message='There are no files for %s.' % upload_filepath)
else:
compress = request.args.get('compress', None) is not None
return respond_to_get_raw_files(upload_id, files, compress)
return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate)
try:
return send_file(
upload_files.raw_file(upload_filepath, 'br'),
mimetype='application/octet-stream',
as_attachment=True,
attachment_filename=os.path.basename(upload_filepath))
except Restricted:
abort(401, message='Not authorized to access upload %s.' % upload_id)
except KeyError:
files = list(file for file in upload_files.raw_file_manifest(upload_filepath))
if len(files) == 0:
abort(404, message='The file %s does not exist.' % upload_filepath)
else:
abort(404, message='The file %s does not exist, but there are files with matching paths' % upload_filepath, files=files)
@ns.route('/calc/<string:upload_id>/<string:calc_id>/<path:path>')
@api.doc(params={
    'upload_id': 'The unique id for the requested calc\'s upload.',
    'calc_id': 'The unique calc id for the requested calc',
    'path': 'The path to a file or directory with optional wildcard.'
})
class RawFileFromCalcPathResource(Resource):
    @api.doc('get_file_from_calc')
    @api.response(404, 'The upload or path does not exist')
    @api.response(401, 'Not authorized to access the requested files.')
    @api.response(200, 'File(s) send')
    @api.expect(raw_file_from_path_parser, validate=True)
    @login_if_available
    @with_signature_token
    def get(self, upload_id: str, calc_id: str, path: str):
        """
        Get a single raw calculation file, calculation contents, or all files for a
        given calculation.

        The 'upload_id' parameter needs to identify an existing upload.
        The 'calc_id' parameter needs to identify a calculation within the upload.

        This endpoint behaves exactly like /raw/<upload_id>/<path>, but the path is
        now relative to the calculation and not the upload.
        """
        predicate = create_authorization_predicate(upload_id)
        files_of_upload = UploadFiles.get(upload_id, predicate)
        if files_of_upload is None:
            abort(404, message='The upload with id %s does not exist.' % upload_id)

        calc = Calc.get(calc_id)
        if calc is None:
            abort(404, message='The calc with id %s does not exist.' % calc_id)
        if calc.upload_id != upload_id:
            abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id))

        # The requested path is relative to the directory that holds the calc's mainfile;
        # a missing path addresses that directory itself.
        relative_path = '' if path is None else path
        mainfile_directory = os.path.dirname(calc.mainfile)
        return get_raw_file_from_upload_path(
            files_of_upload, os.path.join(mainfile_directory, relative_path), predicate)
@ns.route('/calc/<string:upload_id>/<string:calc_id>/')
class RawFileFromCalcEmptyPathResource(RawFileFromCalcPathResource):
    @api.doc('get_file_list_from_calc')
    @api.response(404, 'The upload or path does not exist')
    @api.response(401, 'Not authorized to access the requested files.')
    @api.response(200, 'File(s) send')
    @api.expect(raw_file_from_path_parser, validate=True)
    @login_if_available
    @with_signature_token
    def get(self, upload_id: str, calc_id: str):
        """
        Get calculation contents.

        This is /raw/calc/<upload_id>/<calc_id>/<path> with an empty path. A separate
        route is needed, because a path parameter cannot be empty.
        """
        # No path means: address the calculation's own directory.
        no_path = None
        return super().get(upload_id, calc_id, no_path)
raw_files_request_model = api.model('RawFilesRequest', {
......@@ -236,7 +364,7 @@ def respond_to_get_raw_files(upload_id, files, compress=False):
# we just leave it out in the download
pass
compression = ZIP_DEFLATED if compress else ZIP_STORED
compression = zipfile.ZIP_DEFLATED if compress else zipfile.ZIP_STORED
zip_stream = zipstream.ZipFile(mode='w', compression=compression, allowZip64=True)
zip_stream.paths_to_write = iterator()
......
......@@ -49,11 +49,11 @@ being other mainfiles. Therefore, the aux files of a restricted calc might becom
"""
from abc import ABCMeta
import sys
from typing import IO, Generator, Dict, Iterable, Callable, List, Tuple
import os.path
import os
import shutil
from zipfile import ZipFile, BadZipFile
import tarfile
import hashlib
import io
......@@ -64,6 +64,14 @@ from nomad import config, utils
from nomad.datamodel import UploadWithMetadata
# TODO this should become obsolete once we move beyond python 3.6. For now,
# python 3.6's zipfile does not allow seek/tell within a file-like opened from a
# file in a zipfile.
if sys.version_info >= (3, 7):
import zipfile
else:
import zipfile37 as zipfile
user_metadata_filename = 'user_metadata.pickle'
......@@ -320,6 +328,8 @@ class StagingUploadFiles(UploadFiles):
return open(path_object.os_path, *args, **kwargs)
except FileNotFoundError:
raise KeyError()
except IsADirectoryError:
raise KeyError()
def raw_file(self, file_path: str, *args, **kwargs) -> IO:
if not self._is_authorized():
......@@ -370,12 +380,12 @@ class StagingUploadFiles(UploadFiles):
ext = os.path.splitext(path)[1]
if force_archive or ext == '.zip':
try:
with ZipFile(path) as zf:
with zipfile.ZipFile(path) as zf:
zf.extractall(target_dir.os_path)
if move:
os.remove(path)
return
except BadZipFile:
except zipfile.BadZipFile:
pass
if force_archive or ext in ['.tgz', '.tar.gz', '.tar.bz2']:
......@@ -440,9 +450,9 @@ class StagingUploadFiles(UploadFiles):
self._user_metadata_file.os_path,
target_metadata_file.os_path)
def create_zipfile(kind: str, prefix: str, ext: str) -> ZipFile:
def create_zipfile(kind: str, prefix: str, ext: str) -> zipfile.ZipFile:
file = target_dir.join_file('%s-%s.%s.zip' % (kind, prefix, ext))
return ZipFile(file.os_path, mode='w')
return zipfile.ZipFile(file.os_path, mode='w')
# In prior versions we used bagit on raw files. There was not much purpose for
# it, so it was removed. Check 0.3.x for the implementation
......@@ -513,6 +523,9 @@ class StagingUploadFiles(UploadFiles):
yield path
def raw_file_list(self, directory: str) -> List[Tuple[str, int]]:
if not self._is_authorized():
raise Restricted
if directory is None or directory == '':
prefix = self._raw_dir.os_path
else:
......@@ -647,9 +660,9 @@ class PublicUploadFiles(UploadFiles):
super().__init__(config.fs.public, *args, **kwargs)
@cachetools.cached(cache=__zip_file_cache)
def get_zip_file(self, prefix: str, access: str, ext: str) -> ZipFile:
def get_zip_file(self, prefix: str, access: str, ext: str) -> zipfile.ZipFile:
zip_file = self.join_file('%s-%s.%s.zip' % (prefix, access, ext))
return ZipFile(zip_file.os_path)
return zipfile.ZipFile(zip_file.os_path)
def _file(self, prefix: str, ext: str, path: str, *args, **kwargs) -> IO:
mode = kwargs.get('mode') if len(args) == 0 else args[0]
......@@ -670,6 +683,8 @@ class PublicUploadFiles(UploadFiles):
return f
except FileNotFoundError:
pass
except IsADirectoryError:
pass
except KeyError:
pass
......@@ -711,6 +726,9 @@ class PublicUploadFiles(UploadFiles):
results = []
for access in ['public', 'restricted']:
if access == 'restricted' and not self._is_authorized():
continue
try:
zf = self.get_zip_file('raw', access, 'plain')
for path in zf.namelist():
......
......@@ -64,7 +64,7 @@ class MProperty(MObject):
class MElementDef(MSection):
def __init__(self, m_definition: 'MElementDef', name: str): # more **kwargs
def __init__(self, m_definition: 'MElementDef', name: str): # more **kwargs
super().__init__(m_definition=m_definition)
self.name = name
......
......@@ -26,7 +26,7 @@ import multiprocessing.pool
import time
import os
import os.path
import zipfile
import sys
import tarfile
import math
from mongoengine import Document, IntField, StringField, DictField, BooleanField
......@@ -47,6 +47,11 @@ from nomad.datamodel import CalcWithMetadata
from nomad.processing import FAILURE
if sys.version_info >= (3, 7):
import zipfile
else:
import zipfile37 as zipfile
default_pid_prefix = 7000000
""" The default pid prefix for new non migrated calculations """
......
......@@ -22,6 +22,7 @@ import io
import inspect
from passlib.hash import bcrypt
import datetime
import os.path
from nomad.api.app import rfc3339DateTime
from nomad import coe_repo, search, parsing, files, config
......@@ -840,6 +841,20 @@ class TestRepo():
class TestRaw(UploadFilesBasedTests):
def test_raw_file_from_calc(self, client, non_empty_processed, test_user_auth):
    """
    The calc-relative raw API serves the mainfile of a calc and lists the calc's
    directory when no path is given.
    """
    first_calc = list(non_empty_processed.calcs)[0]

    # download the mainfile through its calc-relative path
    mainfile_url = '/raw/calc/%s/%s/%s' % (
        non_empty_processed.upload_id, first_calc.calc_id,
        os.path.basename(first_calc.mainfile))
    file_response = client.get(mainfile_url, headers=test_user_auth)
    assert file_response.status_code == 200
    assert len(file_response.data) > 0

    # an empty path yields the directory listing as json
    listing_url = '/raw/calc/%s/%s/' % (non_empty_processed.upload_id, first_calc.calc_id)
    listing_response = client.get(listing_url, headers=test_user_auth)
    assert listing_response.status_code == 200
    listing = json.loads(listing_response.data)
    assert len(listing['contents']) > 0
@UploadFilesBasedTests.check_authorizaton
def test_raw_file(self, client, upload, auth_headers):
url = '/raw/%s/%s' % (upload, example_file_mainfile)
......@@ -847,6 +862,21 @@ class TestRaw(UploadFilesBasedTests):
assert rv.status_code == 200
assert len(rv.data) > 0
@UploadFilesBasedTests.check_authorizaton
def test_raw_file_partial(self, client, upload, auth_headers):
url = '/raw/%s/%s?offset=0&length=20' % (upload, example_file_mainfile)
rv = client.get(