Commit cfacafb6 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merge branch 'refactor-raw-api' into 'v1.0.0'

Refactor raw api

See merge request !539
parents 7d0e80e4 3fd1558c
Pipeline #120579 passed with stages
in 32 minutes and 15 seconds
......@@ -256,7 +256,7 @@ are:
- `entries/query` - Query entries for metadata
- `entries/archive/query` - Query entries for archive data
- `entries/{entry-id}/raw/download` - Download raw data for a specific entry
- `entries/{entry-id}/raw` - Download raw data for a specific entry
- `uploads/{upload-id}/raw/path/to/file` - Download a specific file of an upload
## Common concepts
......@@ -392,7 +392,7 @@ files in one big zip-file. Here, you might want to use a program like *curl* to
directly from the shell:
```
curl "{{ nomad_url() }}/v1/entries/raw/download?results.material.elements=Ti&results.material.elements=O" -o download.zip
curl "{{ nomad_url() }}/v1/entries/raw?results.material.elements=Ti&results.material.elements=O" -o download.zip
```
## Access archives
......
......@@ -206,7 +206,7 @@ class Api {
this.onStartLoading()
const auth = await this.authHeaders()
try {
const entry = await this.axios.get(`/entries/${entryId}/raw`, auth)
const entry = await this.axios.get(`/entries/${entryId}/rawdir`, auth)
return entry.data
} catch (errors) {
handleApiError(errors)
......
......@@ -153,7 +153,7 @@ export default function RawFiles({data, entryId}) {
setShownFile(file)
setFileContents(null)
api.get(
`/entries/${entryId}/raw/download/${file.split('/').reverse()[0]}`,
`/entries/${entryId}/raw/${file.split('/').reverse()[0]}`,
{length: 16 * 1024, decompress: true},
{transformResponse: []})
.then(contents => setFileContents({
......@@ -174,7 +174,7 @@ export default function RawFiles({data, entryId}) {
if (fileContents.contents.length < (page + 1) * 16 * 1024) {
api.get(
`/entries/${entryId}/raw/download/${shownFile.split('/').reverse()[0]}`,
`/entries/${entryId}/raw/${shownFile.split('/').reverse()[0]}`,
{offset: page * 16 * 1024, length: 16 * 1024, decompress: true},
{transformResponse: []})
.then(contents => {
......@@ -220,14 +220,14 @@ export default function RawFiles({data, entryId}) {
let downloadUrl
if (selectedFiles.length === 1) {
// download the individual file
downloadUrl = `entries/${entryId}/raw/download/${file(selectedFiles[0])}`
downloadUrl = `entries/${entryId}/raw/${file(selectedFiles[0])}`
} else if (selectedFiles.length === availableFiles.length) {
// use an endpoint that downloads all files of the entry
downloadUrl = `entries/${entryId}/raw/download`
downloadUrl = `entries/${entryId}/raw`
} else if (selectedFiles.length > 0) {
// download specific files
const query = selectedFiles.map(file).map(f => `include_files=${encodeURIComponent(f)}`).join('&')
downloadUrl = `entries/${entryId}/raw/download?${query}`
downloadUrl = `entries/${entryId}/raw?${query}`
}
return (
......
......@@ -146,13 +146,13 @@ export default function FilesBrower({uploadId, disabled}) {
const fetchData = useMemo(() => (path, open) => {
async function fetchData() {
const results = await api.get(`/uploads/${uploadId}/raw/${path}`)
const results = await api.get(`/uploads/${uploadId}/rawdir/${path}?page_size=500`)
allData.current[path] = {
open: open,
...results
}
const resultsByPath = {}
results.content
results.directory_metadata.content
.filter(item => item.is_file)
.forEach(item => {
resultsByPath[`${path}/${item.name}`] = item
......@@ -212,7 +212,7 @@ export default function FilesBrower({uploadId, disabled}) {
key: path,
hasChildren: !is_file,
open: data?.open,
children: data?.content?.map(mapContent),
children: data?.directory_metadata?.content?.map(mapContent),
onToggle: is_file ? null : () => handleToggle(path),
// TODO
// info: !is_file && data?.content?.length === 0 && <Typography variant="caption">
......
......@@ -190,7 +190,7 @@ class Mapping():
dist = BNode()
self.g.add((dist, RDF.type, DCAT.Distribution))
self.g.add((dist, DCT.title, Literal(get_optional_entry_prop(entry, 'formula') + '_raw')))
self.g.add((dist, DCAT.accessURL, URIRef(f'https://nomad-lab.eu/prod/rae/api/v1/entries/{entry["entry_id"]}/raw/download')))
self.g.add((dist, DCAT.accessURL, URIRef(f'https://nomad-lab.eu/prod/rae/api/v1/entries/{entry["entry_id"]}/raw')))
self.g.add((dist, DCAT.packageFormat, URIRef('https://www.iana.org/assignments/media-types/application/zip')))
return dist
......@@ -569,7 +569,7 @@ class Pagination(BaseModel):
@validator('page_offset')
def validate_page_offset(cls, page_offset, values):  # pylint: disable=no-self-argument
    ''' Validates that a provided `page_offset` is non-negative. '''
    if page_offset is not None:
        # The message must match the actual condition: offsets are zero-based,
        # so 0 is a valid value and the lower bound is 0, not 1.
        assert page_offset >= 0, 'page_offset must be >= 0'
    return page_offset
@root_validator(skip_on_failure=True)
......
......@@ -180,11 +180,11 @@ class EntriesArchiveDownload(WithQuery):
files: Optional[Files] = Body(None)
class EntriesRaw(WithQuery):
class EntriesRawDir(WithQuery):
pagination: Optional[MetadataPagination] = Body(None)
class EntriesRawDownload(WithQuery):
class EntriesRaw(WithQuery):
files: Optional[Files] = Body(
None,
example={
......@@ -192,26 +192,26 @@ class EntriesRawDownload(WithQuery):
})
class EntryRawFile(BaseModel):
class EntryRawDirFile(BaseModel):
path: str = Field(None)
size: int = Field(None)
class EntryRaw(BaseModel):
class EntryRawDir(BaseModel):
entry_id: str = Field(None)
upload_id: str = Field(None)
mainfile: str = Field(None)
files: List[EntryRawFile] = Field(None)
files: List[EntryRawDirFile] = Field(None)
class EntriesRawResponse(EntriesRaw):
class EntriesRawDirResponse(EntriesRawDir):
pagination: PaginationResponse = Field(None) # type: ignore
data: List[EntryRaw] = Field(None)
data: List[EntryRawDir] = Field(None)
class EntryRawResponse(BaseModel):
class EntryRawDirResponse(BaseModel):
entry_id: str = Field(...)
data: EntryRaw = Field(...)
data: EntryRawDir = Field(...)
class EntryArchive(BaseModel):
......@@ -296,14 +296,14 @@ _bad_edit_request_empty_query = status.HTTP_404_NOT_FOUND, {
'model': HTTPExceptionModel,
'description': strip('No matching entries found.')}
_raw_download_response = 200, {
_raw_response = 200, {
'content': {'application/zip': {}},
'description': strip('''
A zip file with the requested raw files. The file is streamed.
The content length is not known in advance.
''')}
_raw_download_file_response = 200, {
_raw_file_response = 200, {
'content': {'application/octet-stream': {}},
'description': strip('''
A byte stream with raw file contents. The content length is not known in advance.
......@@ -462,7 +462,7 @@ class _Uploads():
self._upload_files.close()
def _create_entry_raw(entry_metadata: Dict[str, Any], uploads: _Uploads):
def _create_entry_rawdir(entry_metadata: Dict[str, Any], uploads: _Uploads):
entry_id = entry_metadata['entry_id']
upload_id = entry_metadata['upload_id']
mainfile = entry_metadata['mainfile']
......@@ -472,12 +472,12 @@ def _create_entry_raw(entry_metadata: Dict[str, Any], uploads: _Uploads):
files = []
for path_info in upload_files.raw_directory_list(mainfile_dir, files_only=True):
files.append(EntryRawFile(path=path_info.path, size=path_info.size))
files.append(EntryRawDirFile(path=path_info.path, size=path_info.size))
return EntryRaw(entry_id=entry_id, upload_id=upload_id, mainfile=mainfile, files=files)
return EntryRawDir(entry_id=entry_id, upload_id=upload_id, mainfile=mainfile, files=files)
def _answer_entries_raw_request(
def _answer_entries_rawdir_request(
owner: Owner, query: Query, pagination: MetadataPagination, user: User):
if owner == Owner.all_:
......@@ -495,19 +495,19 @@ def _answer_entries_raw_request(
uploads = _Uploads()
try:
response_data = [
_create_entry_raw(entry_metadata, uploads)
_create_entry_rawdir(entry_metadata, uploads)
for entry_metadata in search_response.data]
finally:
uploads.close()
return EntriesRawResponse(
return EntriesRawDirResponse(
owner=search_response.owner,
query=search_response.query,
pagination=search_response.pagination,
data=response_data)
def _answer_entries_raw_download_request(owner: Owner, query: Query, files: Files, user: User):
def _answer_entries_raw_request(owner: Owner, query: Query, files: Files, user: User):
if owner == Owner.all_:
raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail=strip('''
The owner=all is not allowed for this operation as it will search for entries
......@@ -558,16 +558,15 @@ def _answer_entries_raw_download_request(owner: Owner, query: Query, files: File
raise
_entries_raw_query_docstring = strip('''
_entries_rawdir_query_docstring = strip('''
Will perform a search and return a *page* of raw file metadata for entries fulfilling
the query. This allows you to get a complete list of all raw files with their full
path in their respective upload and their sizes. The first returned file for each
entry is its respective *mainfile*.
Each entry on NOMAD represents a set of raw files. These are the input and output
files (as well as additional auxiliary files) in their original form, i.e. as
provided by the uploader. More specifically, an entry represents a code-run identified
by a certain *mainfile*. This is usually the main output file of the code. All other
Each entry on NOMAD has a set of raw files. These are the files in their original form,
i.e. as provided by the uploader. More specifically, an entry has a *mainfile*, a file
identified as parseable. For CMS entries, the mainfile is usually the main output file of
the code. All other files in the same directory are considered the entry's *auxiliary*
files, no matter their role or whether they were actually parsed by NOMAD.
......@@ -576,50 +575,49 @@ _entries_raw_query_docstring = strip('''
@router.post(
'/raw/query',
'/rawdir/query',
tags=[raw_tag],
summary='Search entries and get their raw files metadata',
description=_entries_raw_query_docstring,
response_model=EntriesRawResponse,
description=_entries_rawdir_query_docstring,
response_model=EntriesRawDirResponse,
responses=create_responses(_bad_owner_response),
response_model_exclude_unset=True,
response_model_exclude_none=True)
async def post_entries_raw_query(
request: Request, data: EntriesRaw, user: User = Depends(create_user_dependency())):
async def post_entries_rawdir_query(
request: Request, data: EntriesRawDir, user: User = Depends(create_user_dependency())):
return _answer_entries_raw_request(
return _answer_entries_rawdir_request(
owner=data.owner, query=data.query, pagination=data.pagination, user=user)
@router.get(
'/raw',
'/rawdir',
tags=[raw_tag],
summary='Search entries and get raw their raw files metadata',
description=_entries_raw_query_docstring,
response_model=EntriesRawResponse,
summary='Search entries and get their raw files metadata',
description=_entries_rawdir_query_docstring,
response_model=EntriesRawDirResponse,
response_model_exclude_unset=True,
response_model_exclude_none=True,
responses=create_responses(_bad_owner_response))
async def get_entries_raw(
async def get_entries_rawdir(
request: Request,
with_query: WithQuery = Depends(query_parameters),
pagination: MetadataPagination = Depends(metadata_pagination_parameters),
user: User = Depends(create_user_dependency())):
res = _answer_entries_raw_request(
res = _answer_entries_rawdir_request(
owner=with_query.owner, query=with_query.query, pagination=pagination, user=user)
res.pagination.populate_urls(request)
return res
_entries_raw_download_query_docstring = strip('''
This operation will perform a search and stream a .zip file with raw input and output
files of the found entries.
_entries_raw_query_docstring = strip('''
This operation will perform a search and stream a .zip file with the raw files of the
found entries.
Each entry on NOMAD represents a set of raw files. These are the input and output
files (as well as additional auxiliary files) in their original form, i.e. as
provided by the uploader. More specifically, an entry represents a code-run identified
by a certain *mainfile*. This is usually the main output file of the code. All other
Each entry on NOMAD has a set of raw files. These are the files in their original form,
i.e. as provided by the uploader. More specifically, an entry has a *mainfile*, a file
identified as parseable. For CMS entries, the mainfile is usually the main output file of
the code. All other files in the same directory are considered the entry's *auxiliary*
files, no matter their role or whether they were actually parsed by NOMAD.
......@@ -633,32 +631,32 @@ _entries_raw_download_query_docstring = strip('''
@router.post(
'/raw/download/query',
'/raw/query',
tags=[raw_tag],
summary='Search entries and download their raw files',
description=_entries_raw_download_query_docstring,
description=_entries_raw_query_docstring,
response_class=StreamingResponse,
responses=create_responses(_raw_download_response, _bad_owner_response))
async def post_entries_raw_download_query(
data: EntriesRawDownload, user: User = Depends(create_user_dependency())):
responses=create_responses(_raw_response, _bad_owner_response))
async def post_entries_raw_query(
data: EntriesRaw, user: User = Depends(create_user_dependency())):
return _answer_entries_raw_download_request(
return _answer_entries_raw_request(
owner=data.owner, query=data.query, files=data.files, user=user)
@router.get(
'/raw/download',
'/raw',
tags=[raw_tag],
summary='Search entries and download their raw files',
description=_entries_raw_download_query_docstring,
description=_entries_raw_query_docstring,
response_class=StreamingResponse,
responses=create_responses(_raw_download_response, _bad_owner_response))
async def get_entries_raw_download(
responses=create_responses(_raw_response, _bad_owner_response))
async def get_entries_raw(
with_query: WithQuery = Depends(query_parameters),
files: Files = Depends(files_parameters),
user: User = Depends(create_user_dependency(signature_token_auth_allowed=True))):
return _answer_entries_raw_download_request(
return _answer_entries_raw_request(
owner=with_query.owner, query=with_query.query, files=files, user=user)
......@@ -913,14 +911,14 @@ async def get_entry_metadata(
@router.get(
'/{entry_id}/raw',
'/{entry_id}/rawdir',
tags=[raw_tag],
summary='Get the raw files metadata for an entry by its id',
response_model=EntryRawResponse,
response_model=EntryRawDirResponse,
responses=create_responses(_bad_id_response),
response_model_exclude_unset=True,
response_model_exclude_none=True)
async def get_entry_raw(
async def get_entry_rawdir(
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
user: User = Depends(create_user_dependency())):
'''
......@@ -940,18 +938,18 @@ async def get_entry_raw(
uploads = _Uploads()
try:
return EntryRawResponse(entry_id=entry_id, data=_create_entry_raw(response.data[0], uploads))
return EntryRawDirResponse(entry_id=entry_id, data=_create_entry_rawdir(response.data[0], uploads))
finally:
uploads.close()
@router.get(
'/{entry_id}/raw/download',
'/{entry_id}/raw',
tags=[raw_tag],
summary='Get the raw data of an entry by its id',
response_class=StreamingResponse,
responses=create_responses(_bad_id_response, _raw_download_response))
async def get_entry_raw_download(
responses=create_responses(_bad_id_response, _raw_response))
async def get_entry_raw(
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
files: Files = Depends(files_parameters),
user: User = Depends(create_user_dependency(signature_token_auth_allowed=True))):
......@@ -969,16 +967,16 @@ async def get_entry_raw_download(
status_code=status.HTTP_404_NOT_FOUND,
detail='The entry with the given id does not exist or is not visible to you.')
return _answer_entries_raw_download_request(owner=Owner.visible, query=query, files=files, user=user)
return _answer_entries_raw_request(owner=Owner.visible, query=query, files=files, user=user)
@router.get(
'/{entry_id}/raw/download/{path}',
'/{entry_id}/raw/{path}',
tags=[raw_tag],
summary='Get the raw data of an entry by its id',
response_class=StreamingResponse,
responses=create_responses(_bad_id_response, _bad_path_response, _raw_download_file_response))
async def get_entry_raw_download_file(
responses=create_responses(_bad_id_response, _bad_path_response, _raw_file_response))
async def get_entry_raw_file(
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
path: str = Path(..., description='A relative path to a file based on the directory of the entry\'s mainfile.'),
offset: Optional[int] = QueryParameter(
......@@ -1077,7 +1075,7 @@ def answer_entry_archive_request(query: Dict[str, Any], required: ArchiveRequire
response_model_exclude_none=True,
responses=create_responses(_bad_id_response))
async def get_entry_archive(
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve archive data from.'),
user: User = Depends(create_user_dependency())):
'''
Returns the full archive for the given `entry_id`.
......@@ -1091,7 +1089,7 @@ async def get_entry_archive(
summary='Get the archive for an entry by its id as plain archive json',
responses=create_responses(_bad_id_response, _archive_download_response))
async def get_entry_archive_download(
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.'),
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve archive data from.'),
user: User = Depends(create_user_dependency(signature_token_auth_allowed=True))):
'''
Returns the full archive for the given `entry_id`.
......@@ -1110,7 +1108,7 @@ async def get_entry_archive_download(
responses=create_responses(_bad_id_response, _bad_archive_required_response))
async def post_entry_archive_query(
data: EntryArchiveRequest, user: User = Depends(create_user_dependency()),
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve raw data from.')):
entry_id: str = Path(..., description='The unique entry id of the entry to retrieve archive data from.')):
'''
Returns a partial archive for the given `entry_id` based on the `required` specified
......
......@@ -29,7 +29,7 @@ from fastapi.responses import StreamingResponse
from fastapi.exceptions import RequestValidationError
from nomad import utils, config, files
from nomad.files import UploadFiles, StagingUploadFiles, UploadBundle, is_safe_relative_path
from nomad.files import StagingUploadFiles, UploadBundle, is_safe_relative_path
from nomad.processing import Upload, Entry, ProcessAlreadyRunning, ProcessStatus, MetadataEditRequestHandler
from nomad.utils import strip
from nomad.search import search
......@@ -229,19 +229,53 @@ class EntryProcDataQueryResponse(BaseModel):
'''))
class DirectoryListLine(BaseModel):
class RawDirPagination(Pagination):
    ''' Pagination settings for rawdir directory listings.

    Directory content is returned in its natural order, so a custom sort key
    is not supported. '''

    @validator('order_by')
    def validate_order_by(cls, order_by):  # pylint: disable=no-self-argument
        # Reject any user-supplied sort key; listings have a fixed order.
        assert not order_by, 'Cannot specify `order_by` for rawdir calls'
        return None

    @validator('page_after_value')
    def validate_page_after_value(cls, page_after_value, values):  # pylint: disable=no-self-argument
        # Validation handled elsewhere
        return page_after_value
# FastAPI dependency that maps query parameters onto a RawDirPagination
# instance; `order` and `order_by` are excluded since rawdir listings do not
# support custom ordering (see RawDirPagination.validate_order_by).
rawdir_pagination_parameters = parameter_dependency_from_model(
    'rawdir_pagination_parameters', RawDirPagination, exclude=['order', 'order_by'])
class RawDirFileMetadata(BaseModel):
    ''' Metadata about a file '''
    # The file name (without its directory path).
    name: str = Field()
    # The file size in bytes, if known.
    size: Optional[int] = Field()
    entry_id: Optional[str] = Field(description=strip('''
        If this is a mainfile: the ID of the corresponding entry.'''))
    parser_name: Optional[str] = Field(description=strip('''
        If this is a mainfile: the name of the matched parser.'''))
class RawDirElementMetadata(RawDirFileMetadata):
    ''' Metadata about a directory *element*, i.e. a file or a directory '''
    # True if the element is a file, False if it is a directory.
    is_file: bool = Field()
class RawDirDirectoryMetadata(BaseModel):
    ''' Metadata about a directory '''
    # The directory name (without its parent path).
    name: str = Field()
    # Aggregate size in bytes of the directory content, if known.
    size: Optional[int] = Field()
    # Access level of the directory, e.g. 'public' — TODO confirm the value set
    # against the endpoint that populates this model.
    access: str = Field()
    # The files and subdirectories directly contained in this directory.
    content: List[RawDirElementMetadata] = Field(
        example=[
            {'name': 'a_directory', 'is_file': False, 'size': 456},
            {'name': 'a_file.json', 'is_file': True, 'size': 123, 'entry_id': 'XYZ', 'parser_name': 'parsers/vasp'}])
class DirectoryListResponse(BaseModel):
class RawDirResponse(BaseModel):
path: str = Field(example='The/requested/path')
content: List[DirectoryListLine] = Field(
example=[
{'name': 'a_directory', 'is_file': False, 'size': 456, 'access': 'public'},
{'name': 'a_file.json', 'is_file': True, 'size': 123, 'access': 'restricted'}])
access: str = Field()
file_metadata: Optional[RawDirFileMetadata] = Field()
directory_metadata: Optional[RawDirDirectoryMetadata] = Field()
pagination: Optional[PaginationResponse] = Field()
class UploadCommandExamplesResponse(BaseModel):
......@@ -307,18 +341,12 @@ _upload_response = 200, {
`Accept = application/json`, otherwise a plain text information string.''')}
_raw_path_response = 200, {
'model': DirectoryListResponse,
'content': {
'application/json': {},
'text/html': {'example': '<html defining a list of directory content>'},
'application/octet-stream': {'example': 'file data'},
'application/zip': {'example': '<zipped file or directory content>'}},
'description': strip('''
If `path` denotes a file: a stream with the file content, zipped if `compress = true`.
If `path` denotes a directory, and `compress = true`, the directory content, zipped.
If `path` denotes a directory, and `compress = false`, a list of the directory
content, either encoded as json or html, depending on the request headers (json if
`Accept = application/json`, html otherwise).''')}
If `path` denotes a directory, and `compress = true`, the directory content, zipped.''')}
_upload_bundle_response = 200, {
'content': {
......@@ -545,6 +573,100 @@ async def get_upload_entry(
return EntryProcDataResponse(entry_id=entry_id, data=data)
@router.get(
'/{upload_id}/rawdir/{path:path}', tags=[raw_tag],
summary='Get the raw files and folders metadata for a given upload and path.',
response_model=RawDirResponse,
responses=create_responses(_upload_or_path_not_found, _not_authorized_to_upload, _bad_request),
response_model_exclude_unset=True,
response_model_exclude_none=True)
async def get_upload_rawdir_path(
request: Request,
upload_id: str = Path(
...,
description='The unique id of the upload.'),
path: str = Path(
...,
description='The path within the upload raw files.'),
pagination: RawDirPagination = Depends(rawdir_pagination_parameters),
include_entry_info: bool = FastApiQuery(
False,
description=strip('''
If the fields `entry_id` and `parser_name` should be populated for all
encountered mainfiles.''')),
user: User = Depends(create_user_dependency(required=False, signature_token_auth_allowed=True))):
'''
For the upload specified by `upload_id`, gets the raw file or directory metadata
located at the given `path`. The response will either contain a `file_metadata` or
`directory_metadata` key. For files, basic data about the file is returned, such as its
name and size. For directories, the response includes a list of elements
(files and folders) in the directory. For directories, the result is paginated.
'''
# Get upload
upload = _get_upload_with_read_access(upload_id, user, include_others=True)
try:
# Get upload files
upload_files = upload.upload_files
if not upload_files.raw_path_exists(path):
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=strip('''
Not found. Invalid path?'''))
response = RawDirResponse(
path=path.rstrip('/'),
access='unpublished' if not upload.published else (
'embargoed' if upload.embargo_length else 'public'))
if upload_files.raw_path_is_file(path):
response.file_metadata = RawDirFileMetadata(
name=os.path.basename(path),
size=upload_files.raw_file_size(path))
if include_entry_info:
entry: Entry = Entry.objects(upload_id=upload_id, mainfile=path).first()
if entry:
response.file_metadata.entry_id = entry.entry_id
response.file_metadata.parser_name = entry.parser_name
else:
start = pagination.get_simple_index()
end = start + pagination.page_size
directory_list = upload_files.raw_directory_list(path)
upload_files.close()
content = []
path_to_element: Dict[str, RawDirElementMetadata] = {}
total = 0
total_size = 0
for i, path_info in enumerate(directory_list):
total += 1
total_size += path_info.size
if start <= i < end: