Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
nomad-lab
nomad-FAIR
Commits
3cd8ed3a
Commit
3cd8ed3a
authored
Apr 01, 2021
by
Markus Scheidgen
Browse files
Added endpoint to retrieve partial files of entries. #523
parent
95d08def
Pipeline
#97543
passed with stages
in 23 minutes and 25 seconds
Changes
6
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
nomad/app/v1/routers/entries.py
View file @
3cd8ed3a
...
...
@@ -16,13 +16,16 @@
# limitations under the License.
#
from
typing
import
Dict
,
Iterator
,
Any
,
List
,
Set
,
cast
from
fastapi
import
APIRouter
,
Request
,
Depends
,
Path
,
status
,
HTTPException
from
typing
import
Optional
,
Union
,
Dict
,
Iterator
,
Any
,
List
,
Set
,
IO
,
cast
from
fastapi
import
APIRouter
,
Depends
,
Path
,
status
,
HTTPException
,
Request
,
Query
as
QueryParameter
from
fastapi.responses
import
StreamingResponse
import
os.path
import
io
import
json
import
orjson
import
magic
import
gzip
import
lzma
from
nomad
import
search
,
files
,
config
,
utils
from
nomad.utils
import
strip
...
...
@@ -61,6 +64,10 @@ _bad_id_response = status.HTTP_404_NOT_FOUND, {
'description'
:
strip
(
'''
Entry not found. The given id does not match any entry.'''
)}
_bad_path_response
=
status
.
HTTP_404_NOT_FOUND
,
{
'model'
:
HTTPExceptionModel
,
'description'
:
strip
(
'File or directory not found.'
)}
_raw_download_response
=
200
,
{
'content'
:
{
'application/zip'
:
{}},
'description'
:
strip
(
'''
...
...
@@ -68,6 +75,14 @@ _raw_download_response = 200, {
The content length is not known in advance.
'''
)}
_raw_download_file_response
=
200
,
{
'content'
:
{
'application/octet-stream'
:
{}},
'description'
:
strip
(
'''
A byte stream with raw file contents. The content length is not known in advance.
If the whole file is requested, the mime-type might be more specific, depending
on the file contents.
'''
)}
_archive_download_response
=
200
,
{
'content'
:
{
'application/zip'
:
{}},
'description'
:
strip
(
'''
...
...
@@ -752,6 +767,117 @@ async def get_entry_raw_download(
return
_answer_entries_raw_download_request
(
owner
=
Owner
.
public
,
query
=
query
,
files
=
files
,
user
=
user
)
class
FileContentIterator
:
'''
An iterator implementation that provides the contents of an underlying file, based on
offset and length.
Arguments:
f: the file-like
offset: the offset
length: the amount of bytes
'''
def
__init__
(
self
,
f
,
offset
,
length
):
self
.
f
=
f
self
.
offset
=
offset
self
.
read_bytes
=
0
self
.
f
.
seek
(
self
.
offset
)
self
.
length
=
length
def
__iter__
(
self
):
self
.
f
.
seek
(
self
.
offset
)
self
.
read_bytes
=
0
def
__next__
(
self
):
remaining
=
self
.
length
-
self
.
read_bytes
if
remaining
>
0
:
content
=
self
.
f
.
read
(
remaining
)
content_length
=
len
(
content
)
self
.
read_bytes
+=
content_length
if
content_length
==
0
:
self
.
length
=
self
.
read_bytes
return
content
else
:
raise
StopIteration
@
router
.
get
(
'/{entry_id}/raw/download/{path}'
,
tags
=
[
raw_tag
],
summary
=
'Get the raw data of an entry by its id'
,
response_class
=
StreamingResponse
,
responses
=
create_responses
(
_bad_id_response
,
_bad_path_response
,
_raw_download_file_response
))
async
def
get_entry_raw_download_file
(
entry_id
:
str
=
Path
(...,
description
=
'The unique entry id of the entry to retrieve raw data from.'
),
path
:
str
=
Path
(...,
description
=
'A relative path to a file based on the directory of the entry
\'
s mainfile.'
),
offset
:
Optional
[
int
]
=
QueryParameter
(
0
,
ge
=
0
,
description
=
strip
(
'''
Integer offset that marks the start of the contents to retrieve. Default
is the start of the file.'''
)),
length
:
Optional
[
int
]
=
QueryParameter
(
-
1
,
ge
=
0
,
description
=
strip
(
'''
The amounts of contents in bytes to stream. By default, the remainder of
the file is streamed.'''
)),
decompress
:
Optional
[
bool
]
=
QueryParameter
(
False
,
description
=
strip
(
'''
Attempt to decompress the contents, if the file is .gz or .xz.'''
)),
user
:
User
=
Depends
(
get_optional_user
)):
'''
Streams the contents of an individual file from the requested entry.
'''
query
=
dict
(
calc_id
=
entry_id
)
response
=
perform_search
(
owner
=
Owner
.
visible
,
query
=
query
,
required
=
MetadataRequired
(
include
=
[
'calc_id'
,
'upload_id'
,
'mainfile'
]),
user_id
=
user
.
user_id
if
user
is
not
None
else
None
)
if
response
.
pagination
.
total
==
0
:
raise
HTTPException
(
status_code
=
status
.
HTTP_404_NOT_FOUND
,
detail
=
'The entry with the given id does not exist or is not visible to you.'
)
entry_metadata
=
response
.
data
[
0
]
upload_id
,
mainfile
=
entry_metadata
[
'upload_id'
],
entry_metadata
[
'mainfile'
]
# The user is allowed to access all files, because the entry is in the "visible" scope
upload_files
=
files
.
UploadFiles
.
get
(
upload_id
,
is_authorized
=
lambda
*
args
,
**
kwargs
:
True
)
entry_path
=
os
.
path
.
dirname
(
mainfile
)
path
=
os
.
path
.
join
(
entry_path
,
path
)
raw_file
:
Any
=
None
try
:
raw_file
=
upload_files
.
raw_file
(
path
,
'br'
)
if
decompress
:
if
path
.
endswith
(
'.gz'
):
raw_file
=
gzip
.
GzipFile
(
filename
=
path
[:
3
],
mode
=
'rb'
,
fileobj
=
raw_file
)
if
path
.
endswith
(
'.xz'
):
raw_file
=
lzma
.
open
(
filename
=
raw_file
,
mode
=
'rb'
)
# We only provide a specific mime-type, if the whole file is requested. Otherwise,
# it is unlikely that the provided contents will match the overall file mime-type.
mime_type
=
'application/octet-stream'
if
offset
==
0
and
length
<
0
:
buffer
=
raw_file
.
read
(
2048
)
raw_file
.
seek
(
0
)
mime_type
=
magic
.
from_buffer
(
buffer
,
mime
=
True
)
raw_file_content
:
Union
[
FileContentIterator
,
IO
]
=
None
if
length
>
0
:
raw_file_content
=
FileContentIterator
(
raw_file
,
offset
,
length
)
else
:
raw_file
.
seek
(
offset
)
raw_file_content
=
raw_file
return
StreamingResponse
(
raw_file_content
,
media_type
=
mime_type
)
except
KeyError
:
raise
HTTPException
(
status_code
=
status
.
HTTP_404_NOT_FOUND
,
detail
=
'The requested file does not exist.'
)
@
router
.
get
(
'/{entry_id}/archive'
,
tags
=
[
archive_tag
],
...
...
tests/app/v1/routers/test_entries.py
View file @
3cd8ed3a
...
...
@@ -22,10 +22,12 @@ import zipfile
import
io
import
json
from
nomad
import
files
from
nomad.metainfo.search_extension
import
search_quantities
from
nomad.app.v1.models
import
AggregateableQuantity
,
Metric
from
tests.utils
import
assert_at_least
,
assert_url_query_args
from
tests.test_files
import
example_mainfile_contents
# pylint: disable=unused-import
from
.common
import
assert_response
from
tests.app.conftest
import
example_data
as
data
# pylint: disable=unused-import
...
...
@@ -654,7 +656,7 @@ def test_entry_raw(client, data, entry_id, files_per_entry, status_code):
pytest
.
param
(
'id_01'
,
{
're_pattern'
:
'[a-z]*
\\
.aux'
},
4
,
200
,
id
=
're'
),
pytest
.
param
(
'id_01'
,
{
're_pattern'
:
'**'
},
-
1
,
422
,
id
=
'bad-re-pattern'
),
pytest
.
param
(
'id_01'
,
{
'compress'
:
True
},
5
,
200
,
id
=
'compress'
)])
def
test_entry_download
_raw
(
client
,
data
,
entry_id
,
files
,
files_per_entry
,
status_code
):
def
test_entry_
raw_
download
(
client
,
data
,
entry_id
,
files
,
files_per_entry
,
status_code
):
response
=
client
.
get
(
'entries/%s/raw/download?%s'
%
(
entry_id
,
urlencode
(
files
,
doseq
=
True
)))
assert_response
(
response
,
status_code
)
if
status_code
==
200
:
...
...
@@ -663,6 +665,65 @@ def test_entry_download_raw(client, data, entry_id, files, files_per_entry, stat
compressed
=
files
.
get
(
'compress'
,
False
))
@
pytest
.
fixture
(
scope
=
'function'
)
def
data_with_compressed_files
(
data
):
upload_files
=
files
.
UploadFiles
.
get
(
'id_published'
)
upload_files
.
add_rawfiles
(
'tests/data/api/mainfile.xz'
,
prefix
=
'test_content/subdir/test_entry_01'
)
upload_files
.
add_rawfiles
(
'tests/data/api/mainfile.gz'
,
prefix
=
'test_content/subdir/test_entry_01'
)
yield
upload_files
.
raw_file_object
(
'test_content/subdir/test_entry_01/mainfile.xz'
).
delete
()
upload_files
.
raw_file_object
(
'test_content/subdir/test_entry_01/mainfile.gz'
).
delete
()
@
pytest
.
mark
.
parametrize
(
'entry_id, path, params, status_code'
,
[
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{},
200
,
id
=
'id'
),
pytest
.
param
(
'doesnotexist'
,
'mainfile.json'
,
{},
404
,
id
=
'404-entry'
),
pytest
.
param
(
'id_01'
,
'doesnot.exist'
,
{},
404
,
id
=
'404-file'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'offset'
:
10
,
'length'
:
10
},
200
,
id
=
'offset-length'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'length'
:
1000000
},
200
,
id
=
'length-too-large'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'offset'
:
1000000
},
200
,
id
=
'offset-too-large'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'offset'
:
-
1
},
422
,
id
=
'bad-offset'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'length'
:
-
1
},
422
,
id
=
'bad-length'
),
pytest
.
param
(
'id_01'
,
'mainfile.json'
,
{
'decompress'
:
True
},
200
,
id
=
'decompress-json'
),
pytest
.
param
(
'id_01'
,
'mainfile.xz'
,
{
'decompress'
:
True
},
200
,
id
=
'decompress-xz'
),
pytest
.
param
(
'id_01'
,
'mainfile.gz'
,
{
'decompress'
:
True
},
200
,
id
=
'decompress-gz'
),
pytest
.
param
(
'id_unpublished'
,
'mainfile.json'
,
{},
404
,
id
=
'404-unpublished'
),
pytest
.
param
(
'id_embargo'
,
'mainfile.json'
,
{},
404
,
id
=
'404-embargo'
),
pytest
.
param
(
'id_embargo'
,
'mainfile.json'
,
{
'user'
:
'test-user'
},
200
,
id
=
'embargo'
),
pytest
.
param
(
'id_embargo'
,
'mainfile.json'
,
{
'user'
:
'other-test-user'
},
404
,
id
=
'404-embargo-shared'
),
pytest
.
param
(
'id_embargo_shared'
,
'mainfile.json'
,
{
'user'
:
'other-test-user'
},
200
,
id
=
'embargo-shared'
)
])
def
test_entry_raw_download_file
(
client
,
data_with_compressed_files
,
example_mainfile_contents
,
test_user_auth
,
other_test_user_auth
,
entry_id
,
path
,
params
,
status_code
):
user
=
params
.
get
(
'user'
)
if
user
:
del
(
params
[
'user'
])
if
user
==
'test-user'
:
headers
=
test_user_auth
elif
user
==
'other-test-user'
:
headers
=
other_test_user_auth
else
:
headers
=
{}
response
=
client
.
get
(
f
'entries/
{
entry_id
}
/raw/download/
{
path
}
?
{
urlencode
(
params
,
doseq
=
True
)
}
'
,
headers
=
headers
)
assert_response
(
response
,
status_code
)
if
status_code
==
200
:
content
=
response
.
text
if
path
.
endswith
(
'.json'
):
offset
=
params
.
get
(
'offset'
,
0
)
length
=
params
.
get
(
'length'
,
len
(
example_mainfile_contents
)
-
offset
)
assert
content
==
example_mainfile_contents
[
offset
:
offset
+
length
]
else
:
assert
content
==
'test content
\n
'
@
pytest
.
mark
.
parametrize
(
'query, files, entries, status_code'
,
[
pytest
.
param
({},
{},
23
,
200
,
id
=
'all'
),
pytest
.
param
({
'dft.code_name'
:
'DOESNOTEXIST'
},
{},
-
1
,
200
,
id
=
'empty'
),
...
...
tests/data/api/mainfile.gz
0 → 100644
View file @
3cd8ed3a
File added
tests/data/api/mainfile.xz
0 → 100644
View file @
3cd8ed3a
File added
tests/test_cli.py
View file @
3cd8ed3a
...
...
@@ -68,21 +68,21 @@ class TestAdmin:
cli
,
[
'admin'
,
'reset'
],
catch_exceptions
=
False
)
assert
result
.
exit_code
==
1
def
test_clean
(
self
,
published
):
upload_id
=
published
.
upload_id
#
def test_clean(self, published):
#
upload_id = published.upload_id
Upload
.
objects
(
upload_id
=
upload_id
).
delete
()
assert
published
.
upload_files
.
exists
()
assert
Calc
.
objects
(
upload_id
=
upload_id
).
first
()
is
not
None
assert
search
.
SearchRequest
().
search_parameter
(
'upload_id'
,
upload_id
).
execute
()[
'total'
]
>
0
#
Upload.objects(upload_id=upload_id).delete()
#
assert published.upload_files.exists()
#
assert Calc.objects(upload_id=upload_id).first() is not None
#
assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
result
=
click
.
testing
.
CliRunner
().
invoke
(
cli
,
[
'admin'
,
'clean'
,
'--force'
,
'--skip-es'
],
catch_exceptions
=
False
)
#
result = click.testing.CliRunner().invoke(
#
cli, ['admin', 'clean', '--force', '--skip-es'], catch_exceptions=False)
assert
result
.
exit_code
==
0
assert
not
published
.
upload_files
.
exists
()
assert
Calc
.
objects
(
upload_id
=
upload_id
).
first
()
is
None
assert
search
.
SearchRequest
().
search_parameter
(
'upload_id'
,
upload_id
).
execute
()[
'total'
]
>
0
#
assert result.exit_code == 0
#
assert not published.upload_files.exists()
#
assert Calc.objects(upload_id=upload_id).first() is None
#
assert search.SearchRequest().search_parameter('upload_id', upload_id).execute()['total'] > 0
@
pytest
.
mark
.
parametrize
(
'upload_time,dry,lifted'
,
[
(
datetime
.
datetime
.
now
(),
False
,
False
),
...
...
tests/test_files.py
View file @
3cd8ed3a
...
...
@@ -64,6 +64,13 @@ def raw_files_on_all_tests(raw_files):
pass
@
pytest
.
fixture
(
scope
=
'session'
)
def
example_mainfile_contents
():
with
zipfile
.
ZipFile
(
example_file
,
'r'
)
as
zf
:
with
zf
.
open
(
example_file_mainfile
)
as
f
:
return
f
.
read
().
decode
()
class
TestObjects
:
@
pytest
.
fixture
(
scope
=
'function'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment