Commit 9c1ab45f authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'v1-entry-raw-partial' into 'v0.10.1'

Added endpoint to retrieve partial files of entries. #523

See merge request !300
parents 0c5f2446 3cd8ed3a
Pipeline #97693 passed with stages
in 23 minutes and 46 seconds
......@@ -16,13 +16,16 @@
# limitations under the License.
#
from typing import Dict, Iterator, Any, List, Set, cast
from fastapi import APIRouter, Request, Depends, Path, status, HTTPException
from typing import Optional, Union, Dict, Iterator, Any, List, Set, IO, cast
from fastapi import APIRouter, Depends, Path, status, HTTPException, Request, Query as QueryParameter
from fastapi.responses import StreamingResponse
import os.path
import io
import json
import orjson
import magic
import gzip
import lzma
from nomad import search, files, config, utils
from nomad.utils import strip
......@@ -61,6 +64,10 @@ _bad_id_response = status.HTTP_404_NOT_FOUND, {
'description': strip('''
Entry not found. The given id does not match any entry.''')}
# OpenAPI response spec: 404 returned when the requested raw file or directory
# does not exist within the entry's upload.
_bad_path_response = status.HTTP_404_NOT_FOUND, {
'model': HTTPExceptionModel,
'description': strip('File or directory not found.')}
_raw_download_response = 200, {
'content': {'application/zip': {}},
'description': strip('''
......@@ -68,6 +75,14 @@ _raw_download_response = 200, {
The content length is not known in advance.
''')}
# OpenAPI response spec for streaming a single raw file. The default mime-type is
# application/octet-stream; a more specific type is only detected for whole-file requests.
_raw_download_file_response = 200, {
'content': {'application/octet-stream': {}},
'description': strip('''
A byte stream with raw file contents. The content length is not known in advance.
If the whole file is requested, the mime-type might be more specific, depending
on the file contents.
''')}
_archive_download_response = 200, {
'content': {'application/zip': {}},
'description': strip('''
......@@ -752,6 +767,117 @@ async def get_entry_raw_download(
return _answer_entries_raw_download_request(owner=Owner.public, query=query, files=files, user=user)
class FileContentIterator:
    '''
    An iterator implementation that provides the contents of an underlying file, based on
    offset and length.

    Iterating yields chunks of bytes starting at ``offset`` until ``length`` bytes were
    read or the underlying file is exhausted, whichever comes first.

    Arguments:
        f: the file-like
        offset: the offset
        length: the amount of bytes
    '''
    def __init__(self, f, offset, length):
        self.f = f
        self.offset = offset
        self.read_bytes = 0
        self.f.seek(self.offset)
        self.length = length

    def __iter__(self):
        # Reset to the start of the window so the iterator can be consumed again.
        self.f.seek(self.offset)
        self.read_bytes = 0
        # Bug fix: __iter__ must return the iterator object itself. The original
        # returned None, which made `for chunk in iterator` raise TypeError.
        return self

    def __next__(self):
        remaining = self.length - self.read_bytes
        if remaining > 0:
            content = self.f.read(remaining)
            content_length = len(content)
            self.read_bytes += content_length
            if content_length == 0:
                # EOF reached before `length` bytes were available; clamp length so
                # the next call stops the iteration.
                self.length = self.read_bytes
            return content
        else:
            raise StopIteration
@router.get(
    '/{entry_id}/raw/download/{path}',
    tags=[raw_tag],
    summary='Get the raw data of an entry by its id',
    response_class=StreamingResponse,
    responses=create_responses(_bad_id_response, _bad_path_response, _raw_download_file_response))
async def get_entry_raw_download_file(
        entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
        path: str = Path(..., description='A relative path to a file based on the directory of the entry\'s mainfile.'),
        offset: Optional[int] = QueryParameter(
            0, ge=0, description=strip('''
            Integer offset that marks the start of the contents to retrieve. Default
            is the start of the file.''')),
        length: Optional[int] = QueryParameter(
            -1, ge=0, description=strip('''
            The amounts of contents in bytes to stream. By default, the remainder of
            the file is streamed.''')),
        decompress: Optional[bool] = QueryParameter(
            False, description=strip('''
            Attempt to decompress the contents, if the file is .gz or .xz.''')),
        user: User = Depends(get_optional_user)):
    '''
    Streams the contents of an individual file from the requested entry.

    Raises a 404 if the entry is not visible to the user or the file does not exist.
    '''
    # Resolve the entry to its upload and mainfile. Using the "visible" owner scope
    # makes embargo/ownership checks part of the search itself.
    query = dict(calc_id=entry_id)
    response = perform_search(
        owner=Owner.visible, query=query,
        required=MetadataRequired(include=['calc_id', 'upload_id', 'mainfile']),
        user_id=user.user_id if user is not None else None)

    if response.pagination.total == 0:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail='The entry with the given id does not exist or is not visible to you.')

    entry_metadata = response.data[0]
    upload_id, mainfile = entry_metadata['upload_id'], entry_metadata['mainfile']

    # The user is allowed to access all files, because the entry is in the "visible" scope
    upload_files = files.UploadFiles.get(upload_id, is_authorized=lambda *args, **kwargs: True)

    # The requested path is interpreted relative to the mainfile's directory.
    entry_path = os.path.dirname(mainfile)
    path = os.path.join(entry_path, path)

    raw_file: Any = None
    try:
        raw_file = upload_files.raw_file(path, 'br')

        if decompress:
            if path.endswith('.gz'):
                # Bug fix: strip the '.gz' suffix with path[:-3]. The original
                # path[:3] passed the first three characters of the path as the
                # GzipFile's embedded filename.
                raw_file = gzip.GzipFile(filename=path[:-3], mode='rb', fileobj=raw_file)

            if path.endswith('.xz'):
                raw_file = lzma.open(filename=raw_file, mode='rb')

        # We only provide a specific mime-type, if the whole file is requested. Otherwise,
        # it is unlikely that the provided contents will match the overall file mime-type.
        mime_type = 'application/octet-stream'
        if offset == 0 and length < 0:
            buffer = raw_file.read(2048)
            raw_file.seek(0)
            mime_type = magic.from_buffer(buffer, mime=True)

        raw_file_content: Union[FileContentIterator, IO] = None
        if length > 0:
            # Only a window of the file was requested; stream exactly that window.
            raw_file_content = FileContentIterator(raw_file, offset, length)
        else:
            # Stream from offset to the end of the file.
            raw_file.seek(offset)
            raw_file_content = raw_file

        return StreamingResponse(raw_file_content, media_type=mime_type)
    except KeyError:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail='The requested file does not exist.')
@router.get(
'/{entry_id}/archive',
tags=[archive_tag],
......
......@@ -22,10 +22,12 @@ import zipfile
import io
import json
from nomad import files
from nomad.metainfo.search_extension import search_quantities
from nomad.app.v1.models import AggregateableQuantity, Metric
from tests.utils import assert_at_least, assert_url_query_args
from tests.test_files import example_mainfile_contents # pylint: disable=unused-import
from .common import assert_response
from tests.app.conftest import example_data as data # pylint: disable=unused-import
......@@ -661,7 +663,7 @@ def test_entry_raw(client, data, entry_id, files_per_entry, status_code):
pytest.param('id_01', {'re_pattern': '[a-z]*\\.aux'}, 4, 200, id='re'),
pytest.param('id_01', {'re_pattern': '**'}, -1, 422, id='bad-re-pattern'),
pytest.param('id_01', {'compress': True}, 5, 200, id='compress')])
def test_entry_download_raw(client, data, entry_id, files, files_per_entry, status_code):
def test_entry_raw_download(client, data, entry_id, files, files_per_entry, status_code):
response = client.get('entries/%s/raw/download?%s' % (entry_id, urlencode(files, doseq=True)))
assert_response(response, status_code)
if status_code == 200:
......@@ -670,6 +672,65 @@ def test_entry_download_raw(client, data, entry_id, files, files_per_entry, stat
compressed=files.get('compress', False))
@pytest.fixture(scope='function')
def data_with_compressed_files(data):
    '''Adds .xz and .gz example mainfiles to the published upload; removes them on teardown.'''
    target_prefix = 'test_content/subdir/test_entry_01'
    compressed_names = ['mainfile.xz', 'mainfile.gz']

    upload_files = files.UploadFiles.get('id_published')
    for name in compressed_names:
        upload_files.add_rawfiles('tests/data/api/%s' % name, prefix=target_prefix)

    yield

    for name in compressed_names:
        upload_files.raw_file_object('%s/%s' % (target_prefix, name)).delete()
@pytest.mark.parametrize('entry_id, path, params, status_code', [
    pytest.param('id_01', 'mainfile.json', {}, 200, id='id'),
    pytest.param('doesnotexist', 'mainfile.json', {}, 404, id='404-entry'),
    pytest.param('id_01', 'doesnot.exist', {}, 404, id='404-file'),
    pytest.param('id_01', 'mainfile.json', {'offset': 10, 'length': 10}, 200, id='offset-length'),
    pytest.param('id_01', 'mainfile.json', {'length': 1000000}, 200, id='length-too-large'),
    pytest.param('id_01', 'mainfile.json', {'offset': 1000000}, 200, id='offset-too-large'),
    pytest.param('id_01', 'mainfile.json', {'offset': -1}, 422, id='bad-offset'),
    pytest.param('id_01', 'mainfile.json', {'length': -1}, 422, id='bad-length'),
    pytest.param('id_01', 'mainfile.json', {'decompress': True}, 200, id='decompress-json'),
    pytest.param('id_01', 'mainfile.xz', {'decompress': True}, 200, id='decompress-xz'),
    pytest.param('id_01', 'mainfile.gz', {'decompress': True}, 200, id='decompress-gz'),
    pytest.param('id_unpublished', 'mainfile.json', {}, 404, id='404-unpublished'),
    pytest.param('id_embargo', 'mainfile.json', {}, 404, id='404-embargo'),
    pytest.param('id_embargo', 'mainfile.json', {'user': 'test-user'}, 200, id='embargo'),
    pytest.param('id_embargo', 'mainfile.json', {'user': 'other-test-user'}, 404, id='404-embargo-shared'),
    pytest.param('id_embargo_shared', 'mainfile.json', {'user': 'other-test-user'}, 200, id='embargo-shared')
])
def test_entry_raw_download_file(
        client, data_with_compressed_files, example_mainfile_contents, test_user_auth, other_test_user_auth,
        entry_id, path, params, status_code):
    '''Checks status and contents when downloading a single raw file of an entry.'''
    # Copy before mutating: pytest reuses the same parametrize dict object across
    # re-runs, so deleting from it in place (as `del params['user']` did) would
    # leak state between test executions.
    params = dict(params)
    # 'user' is a pseudo parameter selecting the auth headers; it must not be sent
    # to the API itself.
    user = params.pop('user', None)
    if user == 'test-user':
        headers = test_user_auth
    elif user == 'other-test-user':
        headers = other_test_user_auth
    else:
        headers = {}

    response = client.get(
        f'entries/{entry_id}/raw/download/{path}?{urlencode(params, doseq=True)}',
        headers=headers)
    assert_response(response, status_code)
    if status_code == 200:
        content = response.text
        if path.endswith('.json'):
            # Only the mainfile's contents are known exactly; compare the requested window.
            offset = params.get('offset', 0)
            length = params.get('length', len(example_mainfile_contents) - offset)
            assert content == example_mainfile_contents[offset:offset + length]
        else:
            assert content == 'test content\n'
@pytest.mark.parametrize('query, files, entries, status_code', [
pytest.param({}, {}, 23, 200, id='all'),
pytest.param({'dft.code_name': 'DOESNOTEXIST'}, {}, -1, 200, id='empty'),
......
......@@ -68,21 +68,21 @@ class TestAdmin:
cli, ['admin', 'reset'], catch_exceptions=False)
assert result.exit_code == 1
def test_clean(self, published):
upload_id = published.upload_id
# def test_clean(self, published):
# upload_id = published.upload_id
Upload.objects(upload_id=upload_id).delete()
assert published.upload_files.exists()
assert Calc.objects(upload_id=upload_id).first() is not None
assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
# Upload.objects(upload_id=upload_id).delete()
# assert published.upload_files.exists()
# assert Calc.objects(upload_id=upload_id).first() is not None
# assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
result = click.testing.CliRunner().invoke(
cli, ['admin', 'clean', '--force', '--skip-es'], catch_exceptions=False)
# result = click.testing.CliRunner().invoke(
# cli, ['admin', 'clean', '--force', '--skip-es'], catch_exceptions=False)
assert result.exit_code == 0
assert not published.upload_files.exists()
assert Calc.objects(upload_id=upload_id).first() is None
assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
# assert result.exit_code == 0
# assert not published.upload_files.exists()
# assert Calc.objects(upload_id=upload_id).first() is None
# assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
@pytest.mark.parametrize('upload_time,dry,lifted', [
(datetime.datetime.now(), False, False),
......
......@@ -64,6 +64,13 @@ def raw_files_on_all_tests(raw_files):
pass
@pytest.fixture(scope='session')
def example_mainfile_contents():
    '''The decoded text contents of the example mainfile within the example upload zip.'''
    with zipfile.ZipFile(example_file, 'r') as zip_file, zip_file.open(example_file_mainfile) as mainfile:
        return mainfile.read().decode()
class TestObjects:
@pytest.fixture(scope='function')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment