Commit 3fd1558c authored by David Sikter

Splitting upload raw api call into raw and rawdir

parent 71cff371
Pipeline #120471 passed with stages in 46 minutes and 25 seconds
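In short: the old `GET /uploads/{upload_id}/raw/{path}` endpoint served both file downloads and JSON/HTML directory listings; after this commit, directory listings move to the new `GET /uploads/{upload_id}/rawdir/{path}` endpoint and `raw` only streams file or zip content. A minimal client-side sketch of the split, assuming the `requests` package; the base URL and upload id are placeholders:

    import requests

    base_url = 'http://localhost/api/v1'  # placeholder NOMAD API base URL
    upload_id = 'some_upload_id'          # placeholder upload id

    # Directory listings (JSON metadata, paginated) now come from `rawdir`
    response = requests.get(
        f'{base_url}/uploads/{upload_id}/rawdir/some/dir',
        params={'page_size': 500})
    for element in response.json()['directory_metadata']['content']:
        print(element['name'], element['is_file'], element['size'])

    # `raw` now only streams file content (or zipped content with compress=true)
    response = requests.get(f'{base_url}/uploads/{upload_id}/raw/some/dir/a_file.json')
    print(response.content)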
......@@ -146,13 +146,13 @@ export default function FilesBrower({uploadId, disabled}) {
const fetchData = useMemo(() => (path, open) => {
async function fetchData() {
const results = await api.get(`/uploads/${uploadId}/raw/${path}`)
const results = await api.get(`/uploads/${uploadId}/rawdir/${path}?page_size=500`)
allData.current[path] = {
open: open,
...results
}
const resultsByPath = {}
results.content
results.directory_metadata.content
.filter(item => item.is_file)
.forEach(item => {
resultsByPath[`${path}/${item.name}`] = item
......@@ -212,7 +212,7 @@ export default function FilesBrower({uploadId, disabled}) {
key: path,
hasChildren: !is_file,
open: data?.open,
children: data?.content?.map(mapContent),
children: data?.directory_metadata?.content?.map(mapContent),
onToggle: is_file ? null : () => handleToggle(path),
// TODO
// info: !is_file && data?.content?.length === 0 && <Typography variant="caption">
......
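For reference, the change above swaps `results.content` for `results.directory_metadata.content` because the new `rawdir` payload nests directory contents one level deeper. A sketch of the response shape the browser now consumes, with illustrative values taken from the model examples further down in this diff:

    example_rawdir_response = {
        'path': 'test_content/subdir/test_entry_01',
        'access': 'public',
        'directory_metadata': {
            'name': 'test_entry_01',
            'size': 579,  # sum of the element sizes
            'content': [
                {'name': 'a_directory', 'is_file': False, 'size': 456},
                {'name': 'a_file.json', 'is_file': True, 'size': 123,
                 'entry_id': 'XYZ', 'parser_name': 'parsers/vasp'}]},
        'pagination': {'page_size': 500, 'total': 2}}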
......@@ -569,7 +569,7 @@ class Pagination(BaseModel):
@validator('page_offset')
def validate_page_offset(cls, page_offset, values): # pylint: disable=no-self-argument
if page_offset is not None:
assert page_offset >= 0, 'page must be >= 1'
assert page_offset >= 0, 'page_offset must be >= 0'
return page_offset
@root_validator(skip_on_failure=True)
......
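For context, the validated pagination parameters are later collapsed to a plain start index via `pagination.get_simple_index()` (see the new `rawdir` handler below). A hedged sketch of that mapping, inferred from the tests at the bottom of this diff, where `page_after_value` holds the index of the last item on the previous page:

    def simple_index(page_size, page=None, page_offset=None, page_after_value=None):
        # At most one of page / page_offset / page_after_value should be set.
        if page_offset is not None:
            return page_offset                  # 0-based item offset
        if page is not None:
            return (page - 1) * page_size       # 1-based page number
        if page_after_value is not None:
            return int(page_after_value) + 1    # resume after the given index
        return 0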
......@@ -29,7 +29,7 @@ from fastapi.responses import StreamingResponse
from fastapi.exceptions import RequestValidationError
from nomad import utils, config, files
from nomad.files import UploadFiles, StagingUploadFiles, UploadBundle, is_safe_relative_path
from nomad.files import StagingUploadFiles, UploadBundle, is_safe_relative_path
from nomad.processing import Upload, Entry, ProcessAlreadyRunning, ProcessStatus, MetadataEditRequestHandler
from nomad.utils import strip
from nomad.search import search
......@@ -229,19 +229,53 @@ class EntryProcDataQueryResponse(BaseModel):
'''))
class DirectoryListLine(BaseModel):
class RawDirPagination(Pagination):
@validator('order_by')
def validate_order_by(cls, order_by): # pylint: disable=no-self-argument
assert not order_by, 'Cannot specify `order_by` for rawdir calls'
return None
@validator('page_after_value')
def validate_page_after_value(cls, page_after_value, values): # pylint: disable=no-self-argument
# Validation handled elsewhere
return page_after_value
rawdir_pagination_parameters = parameter_dependency_from_model(
'rawdir_pagination_parameters', RawDirPagination, exclude=['order', 'order_by'])
class RawDirFileMetadata(BaseModel):
''' Metadata about a file '''
name: str = Field()
size: Optional[int] = Field()
entry_id: Optional[str] = Field(description=strip('''
If this is a mainfile: the ID of the corresponding entry.'''))
parser_name: Optional[str] = Field(description=strip('''
If this is a mainfile: the name of the matched parser.'''))
class RawDirElementMetadata(RawDirFileMetadata):
''' Metadata about a directory *element*, i.e. a file or a directory '''
is_file: bool = Field()
class RawDirDirectoryMetadata(BaseModel):
''' Metadata about a directory '''
name: str = Field()
size: Optional[int] = Field()
access: str = Field()
content: List[RawDirElementMetadata] = Field(
example=[
{'name': 'a_directory', 'is_file': False, 'size': 456},
{'name': 'a_file.json', 'is_file': True, 'size': 123, 'entry_id': 'XYZ', 'parser_name': 'parsers/vasp'}])
class DirectoryListResponse(BaseModel):
class RawDirResponse(BaseModel):
path: str = Field(example='The/requested/path')
content: List[DirectoryListLine] = Field(
example=[
{'name': 'a_directory', 'is_file': False, 'size': 456, 'access': 'public'},
{'name': 'a_file.json', 'is_file': True, 'size': 123, 'access': 'restricted'}])
access: str = Field()
file_metadata: Optional[RawDirFileMetadata] = Field()
directory_metadata: Optional[RawDirDirectoryMetadata] = Field()
pagination: Optional[PaginationResponse] = Field()
class UploadCommandExamplesResponse(BaseModel):
......@@ -307,18 +341,12 @@ _upload_response = 200, {
`Accept = application/json`, otherwise a plain text information string.''')}
_raw_path_response = 200, {
'model': DirectoryListResponse,
'content': {
'application/json': {},
'text/html': {'example': '<html defining a list of directory content>'},
'application/octet-stream': {'example': 'file data'},
'application/zip': {'example': '<zipped file or directory content>'}},
'description': strip('''
If `path` denotes a file: a stream with the file content, zipped if `compress = true`.
If `path` denotes a directory, and `compress = true`, the directory content, zipped.
If `path` denotes a directory, and `compress = false`, a list of the directory
content, either encoded as json or html, depending on the request headers (json if
`Accept = application/json`, html otherwise).''')}
If `path` denotes a directory, and `compress = true`, the directory content, zipped.''')}
_upload_bundle_response = 200, {
'content': {
......@@ -545,6 +573,100 @@ async def get_upload_entry(
return EntryProcDataResponse(entry_id=entry_id, data=data)
@router.get(
'/{upload_id}/rawdir/{path:path}', tags=[raw_tag],
summary='Get metadata for the raw files and folders of a given upload and path.',
response_model=RawDirResponse,
responses=create_responses(_upload_or_path_not_found, _not_authorized_to_upload, _bad_request),
response_model_exclude_unset=True,
response_model_exclude_none=True)
async def get_upload_rawdir_path(
request: Request,
upload_id: str = Path(
...,
description='The unique id of the upload.'),
path: str = Path(
...,
description='The path within the upload raw files.'),
pagination: RawDirPagination = Depends(rawdir_pagination_parameters),
include_entry_info: bool = FastApiQuery(
False,
description=strip('''
If the fields `entry_id` and `parser_name` should be populated for all
encountered mainfiles.''')),
user: User = Depends(create_user_dependency(required=False, signature_token_auth_allowed=True))):
'''
For the upload specified by `upload_id`, gets metadata about the raw file or directory
located at the given `path`. The response contains either a `file_metadata` or a
`directory_metadata` key. For files, basic data such as the name and size is returned.
For directories, the response includes a paginated list of the elements
(files and folders) in the directory.
'''
# Get upload
upload = _get_upload_with_read_access(upload_id, user, include_others=True)
try:
# Get upload files
upload_files = upload.upload_files
if not upload_files.raw_path_exists(path):
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=strip('''
Not found. Invalid path?'''))
response = RawDirResponse(
path=path.rstrip('/'),
access='unpublished' if not upload.published else (
'embargoed' if upload.embargo_length else 'public'))
if upload_files.raw_path_is_file(path):
response.file_metadata = RawDirFileMetadata(
name=os.path.basename(path),
size=upload_files.raw_file_size(path))
if include_entry_info:
entry: Entry = Entry.objects(upload_id=upload_id, mainfile=path).first()
if entry:
response.file_metadata.entry_id = entry.entry_id
response.file_metadata.parser_name = entry.parser_name
else:
start = pagination.get_simple_index()
end = start + pagination.page_size
directory_list = upload_files.raw_directory_list(path)
upload_files.close()
content = []
path_to_element: Dict[str, RawDirElementMetadata] = {}
total = 0
total_size = 0
for i, path_info in enumerate(directory_list):
total += 1
total_size += path_info.size
if start <= i < end:
element = RawDirElementMetadata(
name=os.path.basename(path_info.path),
is_file=path_info.is_file,
size=path_info.size)
content.append(element)
if include_entry_info:
path_to_element[path_info.path] = element
if include_entry_info and content:
for entry in Entry.objects(upload_id=upload_id, mainfile__in=path_to_element.keys()):
element = path_to_element[entry.mainfile]
element.entry_id = entry.entry_id
element.parser_name = entry.parser_name
response.directory_metadata = RawDirDirectoryMetadata(
name=os.path.basename(path),
size=total_size,
content=content)
pagination_response = PaginationResponse(total=total, **pagination.dict())
pagination_response.populate_simple_index_and_urls(request)
response.pagination = pagination_response
return response
except Exception:
upload_files.close()
raise
@router.get(
'/{upload_id}/raw/{path:path}', tags=[raw_tag],
summary='Get the raw files and folders for a given upload and path.',
......@@ -554,7 +676,6 @@ async def get_upload_entry(
response_model_exclude_unset=True,
response_model_exclude_none=True)
async def get_upload_raw_path(
request: Request,
upload_id: str = Path(
...,
description='The unique id of the upload.'),
......@@ -586,19 +707,22 @@ async def get_upload_raw_path(
at the given `path`. The data is zipped if `compress = true`.
It is possible to download both individual files and directories, but directories can
only be downloaded if `compress = true`. If the path points to a directory, but
`compress = false`, a list of the directory contents is returned instead. The list is
encoded as a json structure (if the request headers has `Accept = application/json`),
otherwise as html.
When downloading a directory (i.e. with `compress = true`), it is also possible to
specify `re_pattern` or `glob_pattern` to filter the files based on the file names.
only be downloaded if `compress = true`. When downloading a directory, it is also
possible to specify `re_pattern`, `glob_pattern` or `include_files` to filter the files
based on the file names.
When downloading a file, you can specify `decompress` to attempt to decompress the data
before streaming it, if the file is compressed. You can also specify `offset` and `length`
to download only a segment of the file (*Note:* `offset` and `length` do not work if
`compress` is set to true).
'''
if files_params.compress and (offset != 0 or length != -1):
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=strip('''
Cannot specify `offset` or `length` when `compress` is true'''))
# Get upload
upload = _get_upload_with_read_access(upload_id, user, include_others=True)
_check_upload_not_processing(upload)
# Get upload files
upload_files = UploadFiles.get(upload_id)
upload_files = upload.upload_files
try:
if not upload_files.raw_path_exists(path):
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=strip('''
......@@ -627,50 +751,17 @@ async def get_upload_raw_path(
return StreamingResponse(content, media_type=media_type)
else:
# Directory
if files_params.compress:
# Stream directory content, compressed.
download_item = DownloadItem(
upload_id=upload_id, raw_path=path, zip_path='')
content = create_download_stream_zipped(
download_item, upload_files,
re_pattern=files_params.re_pattern, recursive=True,
create_manifest_file=False, compress=True)
return StreamingResponse(content, media_type='application/zip')
else:
# compress = False -> return list of directory contents
directory_list = upload_files.raw_directory_list(path)
upload_files.close()
if request.headers.get('Accept') == 'application/json':
# json response
response = DirectoryListResponse(path=path.rstrip('/'), content=[])
for path_info in directory_list:
response.content.append(DirectoryListLine(
name=os.path.basename(path_info.path),
is_file=path_info.is_file,
size=path_info.size,
access=path_info.access))
response_text = response.json()
media_type = 'application/json'
else:
# html response
response_text = ''
scheme, netloc, url_path, _query, _fragment = request.url.components
base_url = f'{scheme}://{netloc}{url_path}'
if not base_url.endswith('/'):
base_url += '/'
for path_info in directory_list:
# TODO: How should the html look? Need html escaping?
name = os.path.basename(path_info.path)
if not path_info.is_file:
name += '/'
info = f'{path_info.size} bytes'
if not path_info.is_file:
info += ' (Directory)'
info += f' [{path_info.access}]'
response_text += f'<p><a href="{base_url + name}">{name}</a> {info}</p>\n'
media_type = 'text/html'
return StreamingResponse(create_stream_from_string(response_text), media_type=media_type)
if not files_params.compress:
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=strip('''
Path is a directory, `compress` must be set to true'''))
# Stream directory content, compressed.
download_item = DownloadItem(
upload_id=upload_id, raw_path=path, zip_path='')
content = create_download_stream_zipped(
download_item, upload_files,
re_pattern=files_params.re_pattern, recursive=True,
create_manifest_file=False, compress=True)
return StreamingResponse(content, media_type='application/zip')
except Exception as e:
logger.error('exception while streaming download', exc_info=e)
upload_files.close()
......
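With the listing fallback gone, the `raw` endpoint rejects directory requests unless `compress = true`; directories are only available as a zip stream, optionally filtered by `re_pattern`, `glob_pattern` or `include_files`. A minimal sketch of downloading and inspecting such a zip, again with a placeholder base URL and upload id:

    import io
    import zipfile

    import requests

    base_url = 'http://localhost/api/v1'  # placeholder NOMAD API base URL
    upload_id = 'some_upload_id'          # placeholder upload id

    # A directory can only be fetched zipped; compress=false now yields a 400.
    response = requests.get(
        f'{base_url}/uploads/{upload_id}/raw/some/dir',
        params={'compress': 'true', 're_pattern': r'.*\.json$'})
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        print(zip_file.namelist())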
......@@ -16,7 +16,7 @@
# limitations under the License.
#
from typing import Dict, Set, Iterator, Any, Optional, Union
from typing import List, Dict, Set, Iterator, Any, Optional, Union
from types import FunctionType
import urllib
import io
......@@ -30,7 +30,7 @@ import lzma
from nomad.files import UploadFiles, StreamedFile, create_zipstream
def parameter_dependency_from_model(name: str, model_cls):
def parameter_dependency_from_model(name: str, model_cls, exclude: List[str] = []):
'''
Takes a pydantic model class as input and creates a dependency with corresponding
Query parameter definitions that can be used for GET
......@@ -47,11 +47,11 @@ def parameter_dependency_from_model(name: str, model_cls):
annotations: Dict[str, type] = {}
defaults = []
for field_model in model_cls.__fields__.values():
field_info = field_model.field_info
names.append(field_model.name)
annotations[field_model.name] = field_model.outer_type_
defaults.append(Query(field_model.default, description=field_info.description))
if field_model.name not in exclude:
field_info = field_model.field_info
names.append(field_model.name)
annotations[field_model.name] = field_model.outer_type_
defaults.append(Query(field_model.default, description=field_info.description))
code = inspect.cleandoc('''
def %s(%s):
......
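The new `exclude` argument lets callers drop model fields from the generated query-parameter dependency; the `rawdir` endpoint uses it to hide `order` and `order_by`, which make no sense for directory listings. A rough sketch of what the factory effectively generates for `RawDirPagination` with `exclude=['order', 'order_by']`, assuming the `RawDirPagination` model from this diff is importable (parameter names from the `Pagination` model; the defaults are illustrative):

    from typing import Optional
    from fastapi import Query

    def rawdir_pagination_parameters(
            page_size: Optional[int] = Query(10),
            page: Optional[int] = Query(None),
            page_offset: Optional[int] = Query(None),
            page_after_value: Optional[str] = Query(None)) -> 'RawDirPagination':
        # `order` and `order_by` were excluded, so they never become query parameters.
        return RawDirPagination(
            page_size=page_size, page=page,
            page_offset=page_offset, page_after_value=page_after_value)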
......@@ -665,18 +665,6 @@ def test_get_upload_entry(
pytest.param(dict(
user='test_user', upload_id='id_unpublished', path='test_content/id_unpublished_1/1.aux'),
200, 'text/plain; charset=utf-8', 'content', id='unpublished-file'),
pytest.param(dict(
user='test_user', upload_id='id_unpublished', path='test_content/id_unpublished_1/',
accept='application/json'),
200, 'application/json', ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'],
id='unpublished-dir-json'),
pytest.param(dict(
user='test_user', upload_id='id_unpublished', path='test_content/id_unpublished_1/'),
200, 'text/html; charset=utf-8', ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'],
id='unpublished-dir-html'),
pytest.param(dict(
user='test_user', upload_id='id_unpublished', path='', accept='application/json'),
200, 'application/json', ['test_content'], id='unpublished-dir-json-root'),
pytest.param(dict(
user='other_test_user', upload_id='id_unpublished', path='test_content/id_unpublished_1/1.aux'),
401, None, None, id='unpublished-file-unauthorized'),
......@@ -686,16 +674,6 @@ def test_get_upload_entry(
pytest.param(dict(
user='test_user', upload_id='id_published', path='test_content/subdir/test_entry_01/mainfile.json'),
200, 'text/plain; charset=utf-8', 'content', id='published-file'),
pytest.param(dict(
user='test_user', upload_id='id_published', path='test_content/subdir/test_entry_01',
accept='application/json'),
200, 'application/json', ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], id='published-dir-json'),
pytest.param(dict(
user='test_user', upload_id='id_published', path='test_content/subdir/test_entry_01'),
200, 'text/html; charset=utf-8', ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], id='published-dir-html'),
pytest.param(dict(
user='test_user', upload_id='id_published', path='', accept='application/json'),
200, 'application/json', ['test_content'], id='published-dir-json-root'),
pytest.param(dict(
user='admin_user', upload_id='id_published', path='test_content/subdir/test_entry_01/1.aux'),
200, 'text/plain; charset=utf-8', 'content', id='published-file-admin-auth'),
......@@ -802,17 +780,6 @@ def test_get_upload_raw_path(
assert response.text == expected_content, 'Wrong content (offset and length)'
else:
assert expected_content in response.text, 'Expected content not found'
elif mime_type == 'application/json':
data = response.json()
assert data['path'] == (path.rstrip('/') or '')
if expected_content:
file_list_returned = [o['name'] for o in data['content']]
assert file_list_returned == expected_content, 'Incorrect list of files returned'
elif mime_type == 'text/html':
assert response.text, 'No response text'
if expected_content:
for name in expected_content:
assert name in response.text
elif mime_type == 'application/zip':
if expected_content:
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
......@@ -839,6 +806,107 @@ def test_get_upload_raw_path(
assert found, f'Missing expected path in zip file: {expected_path}'
@pytest.mark.parametrize('user, upload_id, path, query_args, expected_status_code, expected_content, expected_file_metadata, expected_pagination', [
pytest.param(
'test_user', 'id_published', 'test_content/subdir/silly_value', {},
404, None, None, None,
id='bad-path'),
pytest.param(
'test_user', 'id_published', 'test_content/subdir/test_entry_01', {},
200, ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], None, {'total': 5},
id='published-dir'),
pytest.param(
'test_user', 'id_published', 'test_content/subdir/test_entry_01', {'include_entry_info': True},
200, ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], None, {'total': 5},
id='published-dir-include_entry_info'),
pytest.param(
'test_user', 'id_published', 'test_content/subdir/test_entry_01', {'include_entry_info': True, 'page_size': 2, 'page': 3},
200, ['mainfile.json'], None, {'total': 5},
id='published-dir-include_entry_info-page3'),
pytest.param(
'test_user', 'id_published', '', {},
200, ['test_content'], None, {'total': 1},
id='published-dir-root'),
pytest.param(
'test_user', 'id_unpublished', 'test_content/id_unpublished_1/', {},
200, ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], None, {'total': 5},
id='unpublished-dir'),
pytest.param(
'test_user', 'id_unpublished', 'test_content/id_unpublished_1/', {'page_size': 3, 'page': 1},
200, ['1.aux', '2.aux', '3.aux'], None, {'total': 5, 'next_page_after_value': '2'},
id='unpublished-dir-page1'),
pytest.param(
'test_user', 'id_unpublished', 'test_content/id_unpublished_1/', {'page_size': 2, 'page': 4},
400, None, None, None,
id='unpublished-dir-page-out-of-range'),
pytest.param(
'test_user', 'id_unpublished', 'test_content/id_unpublished_1/', {'include_entry_info': True},
200, ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], None, {'total': 5},
id='unpublished-dir-include_entry_info'),
pytest.param(
'test_user', 'id_unpublished', '', {},
200, ['test_content'], None, {'total': 1},
id='unpublished-dir-root'),
pytest.param(
'test_user', 'id_unpublished', 'test_content/id_unpublished_1/2.aux', {'include_entry_info': True},
200, None, {'name': '2.aux', 'size': 8, 'entry_id': None, 'parser_name': None}, None,
id='unpublished-aux-file'),
pytest.param(
'test_user', 'id_published', 'test_content/subdir/test_entry_01/mainfile.json', {'include_entry_info': True},
200, None, {'name': 'mainfile.json', 'size': 3237, 'entry_id': 'id_01', 'parser_name': 'parsers/vasp'}, None,
id='published-main-file'),
pytest.param(
'other_test_user', 'id_unpublished', 'test_content/id_unpublished_1', {},
401, None, None, None,
id='unpublished-no-access'),
pytest.param(
'other_test_user', 'id_embargo', 'test_content/id_embargo_1', {},
401, None, None, None,
id='embargoed-no-access'),
pytest.param(
'other_test_user', 'id_embargo_w_coauthor', 'test_content/id_embargo_w_coauthor_1', {},
200, ['1.aux', '2.aux', '3.aux', '4.aux', 'mainfile.json'], None, {'total': 5},
id='embargoed-coauthor-access')])
def test_get_upload_rawdir_path(
client, example_data, test_auth_dict,
user, upload_id, path, query_args,
expected_status_code, expected_content, expected_file_metadata, expected_pagination):
user_auth, __token = test_auth_dict[user]
response = perform_get(
client, f'uploads/{upload_id}/rawdir/{path}', user_auth=user_auth, **query_args)
assert_response(response, expected_status_code)
if expected_status_code == 200:
data = response.json()
assert data['path'] == (path.rstrip('/') or '')
if expected_content is not None:
dir_content_returned = data['directory_metadata']['content']
assert [d['name'] for d in dir_content_returned] == expected_content, 'Incorrect list of files returned'
for d in dir_content_returned:
if query_args.get('include_entry_info'):
assert (d.get('entry_id') is not None) == ('mainfile' in d['name'])
assert (d.get('parser_name') is not None) == ('mainfile' in d['name'])
else:
assert 'entry_id' not in d and 'parser_name' not in d
elif expected_file_metadata is not None:
file_metadata_returned = data['file_metadata']
for k, v in expected_file_metadata.items():
if v is None:
assert k not in file_metadata_returned
else:
assert file_metadata_returned.get(k) == v
if expected_pagination is None:
assert 'pagination' not in data
else:
pagination_returned = data['pagination']
for k, v in expected_pagination.items():
if v is None:
assert k not in pagination_returned
else:
assert pagination_returned.get(k) == v
@pytest.mark.parametrize('upload_id, mainfile, user, status_code', [
pytest.param('id_published', 'test_content/subdir/test_entry_01/mainfile.json', None, 200, id='published'),
pytest.param('id_published', 'test_content/doesnotexist.json', None, 404, id='bad-mainfile'),
......