Commit 09ba5074 authored by Alvin Noe Ladines's avatar Alvin Noe Ladines
Browse files

Include documentation for archive library and removed zip db option from archive query

parent bae1e3d4
Pipeline #68228 passed with stages
in 14 minutes and 8 seconds
......@@ -30,7 +30,7 @@ import nomad_meta_info
from nomad.files import UploadFiles, Restricted
from nomad import utils, search, config
from nomad.archive_library.filedb import ArchiveFileDB
from nomad.archive_library.query import ArchiveFileDBs
from .auth import authenticate, create_authorization_predicate
from .api import api
......@@ -244,8 +244,7 @@ class ArchiveQueryResource(Resource):
def post(self):
Post an query schema and return it filled with archive data in json format from
all query results.
Post a query schema and return it filled with archive data.
See ``/repo`` endpoint for documentation on the search
......@@ -271,7 +270,6 @@ class ArchiveQueryResource(Resource):
per_page = pagination.get('per_page', per_page)
order = pagination.get('order', order)
order_by = pagination.get('order_by', order_by)
db = data_in.get('db')
qschema = data_in.get('results', None)
if qschema is not None:
qschema = qschema[-1]
......@@ -309,36 +307,16 @@ class ArchiveQueryResource(Resource):
data = []
calcs = results['results']
upload_files = None
msgdbs = None
for entry in calcs:
upload_id = entry['upload_id']
calc_id = entry['calc_id']
if upload_files is None or upload_files.upload_id != upload_id:
if upload_files is not None:
if msgdbs is None or msgdbs.upload_id != upload_id:
upload_files = UploadFiles.get(
upload_id, create_authorization_predicate(upload_id))
msgdbs = ArchiveFileDBs(upload_id).get_dbs()
if upload_files is None:
raise KeyError
if db == 'msg':
fos = upload_files.archive_file_msg(calc_id)
msgdbs = [ArchiveFileDB(fo) for fo in fos if fo is not None]
if db == 'zip':
fo = upload_files.archive_file(calc_id, 'rb')
data.append({calc_id: json.loads(})
elif db == 'msg':
for msgdb in msgdbs:
data.append(msgdb.query({calc_id: qschema}))
if upload_files is not None:
for msgdb in msgdbs:
data.append(msgdb.query({calc_id: qschema}))
except Restricted:
abort(401, message='Not authorized to access %s/%s.' % (upload_id, calc_id))
# New archive implementation
This contains the tutorials to use the new archive query functionality.
It uses the new metainfo definition for the archive data. In addition, the archive data
can now be filtered through the new api. The archives are now also stored in a new binary
format, msgpack, which in principle makes querying faster.
## Archive API
First, we look at how to use the new archive query api. Here we use the python requests library.
import requests
data = {
'atoms': 'Fe', 'scroll': True, 'per_page': 10,
'results': [{"section_run": {"section_single_configuration_calculation[-1]": {"energy_total": None}}}]}
response = requests.post('<nomad-api-url>/archive/query', data=data)
data = response.json()
results = data.get('results')
To query the archive, we use the post method where we provide the usual query parameters
in a dictionary. In addition, we provide a schema for the archive data à la GraphQL, i.e.
a hierarchical dictionary with null values for each of the properties we would like to query.
In the example, we would like to return only the total energy for the last image. It is
important to point out that this schema uses the key 'results' and is a list since
this will be filled with a list of archive data with this schema.
## Archive and the new metainfo
A wrapper for the archive query api is implemented in ArchiveQuery.
from nomad.archive_library.query import ArchiveQuery
q = ArchiveQuery(
atoms='Fe', scroll=True, per_page=10, archive_data={
"section_run": {"section_single_configuration_calculation[-1]": {"energy_total": None}}})
metainfo = q.query()
for calc in metainfo:
Similarly, we provide query parameters and also the schema which in this case is 'archive_data'.
When we invoke query, a recursive api request is made until all the data matching our
parameters are downloaded. The results are then expressed in the new metainfo scheme,
which offers auto-completion, among other features.
## Msgpack container
The archive data are now stored in a binary format called msgpack. The archive data are
fragmented, and a query accesses only the relevant fragments without loading the whole
archive collection. This is beneficial when one only queries small chunks, but it approaches the
efficiency of zip files when one accesses the whole archive. To create a msgpack database
from the archive data and query it, one uses ArchiveFileDB.
from nomad.archive_library.filedb import ArchiveFileDB
db = ArchiveFileDB('archive.msg', mode='w', max_lfragment=2)
db.add_data({'calc1':{'secA': {'subsecA': {'propA': 1.0}}, 'secB': {'propB': 'X'}}})
db.add_data({'calc2':{'secA': {'subsecA': {'propA': 2.0}}, 'secB': {'propB': 'Y'}}})
db = ArchiveFileDB('archive.msg')
db.query({'calc1':{'secA': None}})
In the example, we first create a database in 'archive.msg', and data which are added
will be fragmented down to subsections. We reload it for reading and query all entries
under 'secA' of 'calc1'.
\ No newline at end of file
......@@ -28,6 +28,9 @@ import json
from import query_api_url
from nomad.archive_library.metainfo import ArchiveMetainfo
from nomad.archive_library.filedb import ArchiveFileDB
from nomad.files import UploadFiles
from import create_authorization_predicate
class ArchiveQuery:
......@@ -114,3 +117,19 @@ class ArchiveQuery:
if self._archive_data:
metainfo = ArchiveMetainfo(archive_data=self._archive_data, archive_schema=self._archive_schema)
return metainfo
class ArchiveFileDBs:
    """Gives access to the msgpack archive databases of one upload."""

    def __init__(self, upload_id):
        # Id of the upload whose archive files are opened by get_dbs().
        self.upload_id = upload_id

    def get_dbs(self):
        """Open the msgpack archive files of the upload.

        Returns:
            A list with one ArchiveFileDB per archive file object that is not
            None, or an empty list when UploadFiles.get yields None.
        """
        predicate = create_authorization_predicate(self.upload_id)
        upload_files = UploadFiles.get(self.upload_id, predicate)
        if upload_files is None:
            return []

        # NOTE(review): 'X' is passed through unchanged; presumably a placeholder
        # calc id that archive_file_msg ignores for msgpack files -- confirm.
        dbs = []
        for file_object in upload_files.archive_file_msg('X'):
            if file_object is not None:
                dbs.append(ArchiveFileDB(file_object))
        return dbs
......@@ -667,8 +667,7 @@ class TestArchive(UploadFilesBasedTests):
assert rv.status_code == 200
assert_zip_file(rv, files=1)
@pytest.mark.parametrize('db', ['zip', 'msg'])
def test_post_archive_query(self, api, published_wo_user_metadata, db):
def test_post_archive_query(self, api, published_wo_user_metadata):
schema = {"section_run": {"section_single_configuration_calculation": {"energy_total": None}}}
data = {'results': [schema], 'per_page': 5}
uri = '/archive/query'
......@@ -116,10 +116,9 @@ class TestArchiveQuery:
monkeypatch.setattr('nomad.config.api_url', lambda *args, **kwargs: '')
return BlueprintClient(client, '/api')
@pytest.mark.parametrize('db', ['zip', 'msg'])
def test_query_from_json(self, api, published_wo_user_metadata, other_test_user_auth, db, monkeypatch):
def test_query_from_json(self, api, published_wo_user_metadata, other_test_user_auth, monkeypatch):
monkeypatch.setattr('nomad.archive_library.query.requests', api)
q_params = {'pagination': {'order': 1, 'per_page': 5}, 'db': db}
q_params = {'pagination': {'order': 1, 'per_page': 5}}
q_schema = {'section_entry_info': None}
q = ArchiveQuery(q_params, archive_data=q_schema, authentication=other_test_user_auth)
metainfo = q.query()
......@@ -129,7 +128,7 @@ class TestArchiveQuery:
def test_query_from_kwargs(self, api, published_wo_user_metadata, other_test_user_auth, monkeypatch):
monkeypatch.setattr('nomad.archive_library.query.requests', api)
q_schema = {'section_entry_info': None}
q = ArchiveQuery(order=1, per_page=5, scroll=True, db='msg', archive_data=q_schema, authentication=other_test_user_auth)
q = ArchiveQuery(order=1, per_page=5, scroll=True, archive_data=q_schema, authentication=other_test_user_auth)
metainfo = q.query()
for calc in metainfo:
assert calc.section_entry_info.calc_id is not None
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment