diff --git a/docs/archive.rst b/docs/archive.rst new file mode 100644 index 0000000000000000000000000000000000000000..5bf528cacb5193a49013afa0fcdb358594d36112 --- /dev/null +++ b/docs/archive.rst @@ -0,0 +1,8 @@ +Accessing the Archive +===================== + +Of course, you can access the NOMAD Archive directly via the NOMAD API (see the `API tutorial <api_tutorial.html>`_ +and `API reference <api.html>`_). But, it is more effective and convenient to use NOMAD's Python client +library. + +.. automodule:: nomad.client \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index a0225b5c69ac15638d4ba1815b2a116e79b2d39a..06364d9bb3911edc9e721fc11ad5632afc46bf34 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,13 +8,14 @@ and infrastructure with a simplyfied architecture and consolidated code base. :maxdepth: 2 introduction - setup - dev_guidelines + upload api_tutorial api + archive metainfo + ops + setup + dev_guidelines parser_tutorial - archive_tutorial reference gui - ops diff --git a/docs/reference.rst b/docs/reference.rst index 882f848628b7a792369912f8ef6126482f26cb69..3927347a7049f3749445c32ab8939177e6a9ccff 100644 --- a/docs/reference.rst +++ b/docs/reference.rst @@ -50,6 +50,10 @@ nomad.cli ------------ .. automodule:: nomad.cli +nomad.client +------------ +.. automodule:: nomad.client + nomad.utils ----------- .. automodule:: nomad.utils diff --git a/docs/upload.rst b/docs/upload.rst new file mode 100644 index 0000000000000000000000000000000000000000..ad79fa10bec31b191dbaf66ec692ce12e67d7c3d --- /dev/null +++ b/docs/upload.rst @@ -0,0 +1,51 @@ +============== +Uploading Data +============== + +To contribute your data to the repository, please, login to our `upload page <../uploads>`_ (you need to register first, if you do not have a NOMAD account yet). + +*A note for returning NOMAD users!* We revised the upload process with browser based upload +alongside new shell commands. 
The new Upload page allows you to monitor upload processing +and verify processing results before publishing your data to the Repository. + +The `upload page <../uploads>`_ acts as a staging area for your data. It allows you to +upload data, to supervise the processing of your data, and to examine all metadata that +NOMAD extracts from your uploads. The data on the upload page will be private and can be +deleted again. If you are satisfied with our processing, you can publish the data. +Only then, data will become publicly available and cannot be deleted anymore. +You will always be able to access, share, and download your data. You may curate your data +and create datasets to give them a hierarchical structure. These functions are available +from the Your data page by selecting and editing data. + +You should upload many files at the same time by creating .zip or .tar files of your folder structures. +Ideally, input and output files are accompanied by relevant auxiliary files. NOMAD will +consider everything within a single directory as related. + +**A note for VASP users** on the handling of **POTCAR** files: NOMAD takes care of it; you don't +need to worry about it. We understand that according to your VASP license, POTCAR files are +not supposed to be visible to the public. Thus, in agreement with Georg Kresse, NOMAD will +extract the most important information of POTCAR files and store it in the files named +``POTCAR.stripped``. These files can be accessed and downloaded by anyone, while the original +POTCAR files are only available to the uploader and assigned co-authors. +This is done automatically; you don't need to do anything. + +Once published, data cannot be erased. Linking a corrected version to a corresponding older one ("erratum") will be possible soon. +Files from an improved calculation, even for the same material, will be handled as a new entry. + +You can publish data as being open access or restricted for up to three years (with embargo). 
+For the latter you may choose with whom you want to share your data. We strongly support the +idea of open access and thus suggest to impose as few restrictions as possible from the very +beginning. In case of open access data, all uploaded files are downloadable by any user. +Additional information, e.g. pointing to publications or how your data should be cited, +can be provided after the upload. Also DOIs can be requested. The restriction on data +can be lifted at any time. You cannot restrict data that was published as open access. + +Unless published without an embargo, all your information will be private and only visible +to you (or NOMAD users you explicitly shared your data with). Viewing private data will +always require a login. + +By uploading you confirm authorship of the uploaded calculations. Co-authors must be specified +after the upload process. This procedure is very much analogous to the submission of a +publication to a scientific journal. + +Upload of data is free of charge. 
\ No newline at end of file diff --git a/examples/client.py b/examples/client.py index 46f56f512a588a2e0300b329d038c4745c877fcb..27ba7bf5e2d661653287b7360c9ad9da1ee5d13a 100644 --- a/examples/client.py +++ b/examples/client.py @@ -3,12 +3,14 @@ from nomad.client import query_archive from nomad.metainfo import units # this will not be necessary, once this is the official NOMAD version -config.client.url = 'https://labdev-nomad.esc.rzg.mpg.de/dev/nomad/v0-8-0/api' +config.client.url = 'https://labdev-nomad.esc.rzg.mpg.de/fairdi/nomad/testing-major/api' - -aq = query_archive( +query = ArchiveQuery( query={ - 'upload_id': ['6LUBCju3T3KK3D_fRCJ4qw'] + 'dft.compound_type': 'binary', + 'dft.crystal_system': 'cubic', + 'dft.code_name': 'FHI-aims', + 'atoms': ['O'] }, required={ 'section_run': { @@ -17,12 +19,11 @@ aq = query_archive( } } }, - per_page=100, max=1000) - -print('total', aq.total) + per_page=10, + max=1000) -for i, e in enumerate(aq): - if i % 200 == 0: - print(e.section_run[0].section_single_configuration_calculation[0].energy_total.to(units.hartree)) +print(query) -print(aq) +for result in query[0:10]: + energy = result.section_run[0].section_single_configuration_calculation[0].energy_total + print('Energy %s' % energy.to(units.hartree)) diff --git a/nomad/app/api/archive.py b/nomad/app/api/archive.py index d4cbff594513f0aacab6ee8302880f4b0674d1d8..c068130f228730643f8f51b7662280ba154865d0 100644 --- a/nomad/app/api/archive.py +++ b/nomad/app/api/archive.py @@ -215,7 +215,9 @@ class ArchiveDownloadResource(Resource): _archive_query_model = api.inherit('ArchiveSearch', search_model, { 'query': fields.Nested(query_model, description='The query used to find the requested entries.', skip_none=True), - 'query_schema': fields.Raw(description='The query schema that defines what archive data to retrive.') + 'required': fields.Raw(description='A dictionary that defines what archive data to retrive.'), + 'query_schema': fields.Raw(description='Deprecated, use required 
instead.'), + 'raise_errors': fields.Boolean(description='Return 401 on missing archives or 500 on other errors instead of skipping the entry.') }) @@ -250,7 +252,14 @@ class ArchiveQueryResource(Resource): per_page = pagination.get('per_page', 10 if not scroll else 1000) query = data_in.get('query', {}) - query_schema = data_in.get('query_schema', '*') + + required: Dict[str, Any] = None + if 'required' in data_in: + required = data_in.get('required') + else: + required = data_in.get('query_schema', '*') + + raise_error = data_in.get('raise_error', True) except Exception: abort(400, message='bad parameter types') @@ -280,8 +289,6 @@ class ArchiveQueryResource(Resource): except search.ScrollIdNotFound: abort(400, 'The given scroll_id does not exist.') except KeyError as e: - import traceback - traceback.print_exc() abort(400, str(e)) data = [] @@ -316,14 +323,22 @@ class ArchiveQueryResource(Resource): 'calc_id': calc_id, 'parser_name': entry['parser_name'], 'archive': query_archive( - archive, {calc_id: query_schema})[calc_id] + archive, {calc_id: required})[calc_id] }) except ArchiveQueryError as e: abort(400, str(e)) - + except KeyError: + if raise_error: + abort(401, 'Archive for entry %s does not exist' % calc_id) + # We simply skip this entry + pass except Restricted: # TODO in reality this should not happen pass + except Exception as e: + if raise_error: + raise e + common.logger(str(e), exc_info=e) if upload_files is not None: upload_files.close() diff --git a/nomad/client.py b/nomad/client.py index e0b4eab7240c5965d58cedcc913e13cd8f17ccd9..cbeed1f19e45782dbe5ba86a0bf7c72ce3f372ef 100644 --- a/nomad/client.py +++ b/nomad/client.py @@ -13,26 +13,125 @@ # limitations under the License. ''' -Contains the Python client side library to access the NOMAD archive. - -# TODO -In module ``ArchiveMetainfo``, the data is provided either from raw -json data or as a filename of an existing msgpack database. The metainfo -can then queried by providing a schema. - -.. 
code-block: python - am = ArchiveMetainfo(archive_data) - for calc in am.calcs: - c.section_run.section_single_configuration_calculation[0]({'energy_total':None}) - -The ArchiveQuery enables a query interface to the archive data. A dict of query parameters -and a query schema similar to the archive json format can be provided to filter archive. - -.. code-block: python - q = ArchiveQuery({'atoms':'Si'}) - metainfo = q.query() - for c in metainfo.calcs: - print(c.section_run.section_single_configuration_calculation[0]({'energy_total':'*'})) +Install the NOMAD client library +________________________________ + +The NOMAD client library is a Python module (part of the nomad Python package) that +allows to access the NOMAD archive to retrieve and analyse (large amounts) of NOMAD's +archive data. It allows to use queries to filter for desired entries, bulk download +the required parts of the respective archives, and navigate the results using NOMAD's +metainfo Python API. + +To install the NOMAD Python package, you can use ``pip install`` to install our +source distribution + +.. code:: sh + + pip install https://repository.nomad-coe.eu/app/dist/nomad-v0.8.0.tar.gz + + +First example +_____________ + +.. literalinclude:: ../examples/client.py + :language: python + +This script should yield a result like this: + +.. code:: + + Number queries entries: 7667 + Number of entries loaded in the last api call: 10 + Bytes loaded in the last api call: 3579 + Bytes loaded from this query: 3579 + Number of downloaded entries: 10 + Number of made api calls: 1 + + Energy -178.6990610734937 hartree + Energy -6551.45699684026 hartree + Energy -6551.461104765451 hartree + Energy -548.9736595672932 hartree + Energy -548.9724185656775 hartree + Energy -1510.3938165430286 hartree + Energy -1510.3937761449583 hartree + Energy -11467.827149010665 hartree + Energy -16684.667362890417 hartree + Energy -1510.3908614326358 hartree + +Let's discuss the different elements here. 
First, we have a set of imports. The NOMAD source +code comes with various sub-modules. The `client` module contains everything related +to what is described here; the `metainfo` is the Python interface to NOMAD's common +archive data format and its data type definitions; the `config` module simply contains +configuration values (like the URL to the NOMAD API). + +Next, we create an :class:`ArchiveQuery` instance. This object will be responsible for talking +to NOMAD's API for us in a transparent and lazy manner. This means, it will not download +all data right away, but do so when we are actually iterating through the results. + +The archive query takes several parameters: + +- The ``query`` is a dictionary of search criteria. The query is used to filter all of NOMAD's + entries down to a set of desired entries. You can use NOMAD's GUI to create queries and + copy their Python equivalent with the ``<>``-code button on the result list. +- The ``required`` part allows to specify what parts of the archive should be downloaded. + Leave it out to download the whole archives. Based on NOMAD's Metainfo (the 'schema' of + all archives), you can determine what sections to include and which to leave out. Here, + we are interested in the first run (usually entries only have one run) and the first + calculation result. +- With the optional ``per_page`` you can determine how many results are downloaded at + a time. For bulk downloading many results, we recommend ~100. If you are just interested + in the first results a lower number might increase performance. +- With the optional ``max``, we limit the maximum amount of entries that are downloaded, + just to avoid accidentally iterating through a result set of unknown and potentially large + size. + +When you print the archive query object, you will get some basic statistics about the +query and downloaded data. + +The archive query object can be treated as a Python list-like. You use indices and ranges +to select results. 
Here we iterate through a slice and print the calculated energies +from the first calculation of the entries. Each result is a Python object with attributes +governed by the NOMAD Metainfo. Quantities yield numbers, strings, or numpy arrays, while +sub-sections return lists of further objects. Here we navigate the sections ``section_run`` and +sub-section ``section_single_configuration_calculation`` to access the quantity ``energy_total``. This quantity is a +number with an attached unit (Joule), which can be converted to something else (e.g. Hartree). + +The NOMAD Metainfo +__________________ + +You can imagine the NOMAD Metainfo as a complex schema for hierarchically organized scientific +data. In this sense, the NOMAD Metainfo is a set of data type definitions. These definitions +then govern how the archive for a data entry in NOMAD might look. You can browse the +hierarchy of definitions in our `Metainfo browser <../metainfo>`_. + +Be aware that the definitions entail everything that an entry could possibly contain, but +not all entries contain all sections and all quantities. What an entry contains depends +on the information that the respective uploaded data contained, what could be extracted, +and of course what was calculated in the first place. To see what the archive of a concrete +entry looks like, you can use the `search interface <../search>`_, select an entry from the +list of search results, and click on the *Archive* tab. + +To *see inside* an archive object in Python, you can use :func:`nomad.metainfo.MSection.m_to_dict` +which is provided by all archive objects. This will convert a (part of an) archive into a +regular, JSON-serializable Python dictionary. + +For more details on the metainfo Python interface, consult the `metainfo documentation <metainfo.html>`_. + +The ArchiveQuery class +______________________ + +.. 
autoclass:: ArchiveQuery + +Working with private data +_________________________ + +Public NOMAD data can be accessed without any authentication; everyone can use our API +without the need for an account or login. However, if you want to work with your own +data that is not yet published, or embargoed data was shared with you, you need to +authenticate before accessing this data. Otherwise, you will simply not find it with +your queries. To authenticate simply provide your NOMAD username and password to the +:class:`ArchiveQuery` constructor. + ''' from typing import Dict, Union, Any, List @@ -118,11 +217,39 @@ class ApiStatistics(mi.MSection): class ArchiveQuery(collections.abc.Sequence): + ''' + Object of this class represent a query on the NOMAD Archive. It is solely configured + through its constructor. After creation, it implements the + Python ``Sequence`` interface and therefore acts as a sequence of query results. + + Not all results are downloaded at once, expect that this class will continuesly pull + results from the API, while you access or iterate to the far side of the result list. + + Attributes: + query: A dictionary of search parameters. Consult the search API to get a + comprehensive list of parameters. + required: A potentially nested dictionary of sections to retrieve. + url: Optional, override the default NOMAD API url. + username: Optional, allows authenticated access. + password: Optional, allows authenticated access. + scroll: Use the scroll API to iterate through results. This is required when you + are accessing many 1000 results. By default, the pagination API is used. + per_page: Determine how many results are downloaded per page (or scroll window). + Default is 10. + max: Optionally determine the maximum amount of downloaded archives. The iteration + will stop even if more results are available. Default is unlimited. + raise_errors: There situations where archives for certain entries are unavailable. 
+ If set to True, this cases will raise an Exception. Otherwise, the entries + with missing archives are simply skipped (default). + authentication: Optionally provide detailed authentication information. Usually, + providing ``username`` and ``password``should suffice. + ''' def __init__( self, query: dict = None, required: dict = None, url: str = None, username: str = None, password: str = None, scroll: bool = False, per_page: int = 10, max: int = None, + raise_errors: bool = False, authentication: Union[Dict[str, str], KeycloakAuthenticator] = None): self.scroll = scroll @@ -132,7 +259,8 @@ class ArchiveQuery(collections.abc.Sequence): self.max = max self.query: Dict[str, Any] = { - 'query': {} + 'query': {}, + 'raise_errors': raise_errors } if query is not None: self.query['query'].update(query) @@ -151,6 +279,10 @@ class ArchiveQuery(collections.abc.Sequence): @property def authentication(self): + ''' + The authentication information that is used, if username or password were + provided. + ''' if self._authentication is None and self.username is not None and self.password is not None: host = urlparse(self.url).netloc.split(':')[0] self._authentication = KeycloakAuthenticator( @@ -168,6 +300,10 @@ class ArchiveQuery(collections.abc.Sequence): return self._authentication def call_api(self): + ''' + Calls the API to retrieve the next set of results. Is automatically called, if + not yet downloaded entries are accessed. + ''' url = '%s/%s/%s' % (self.url, 'archive', 'query') if self.scroll: @@ -255,6 +391,7 @@ class ArchiveQuery(collections.abc.Sequence): @property def total(self): + ''' The total ammount of search results. ''' if self._total == -1: self.call_api() @@ -262,6 +399,7 @@ class ArchiveQuery(collections.abc.Sequence): @property def statistics(self): + ''' A metainfo object with a basic set of query statistics. 
''' if self._total == -1: self.call_api() diff --git a/nomad/metainfo/metainfo.py b/nomad/metainfo/metainfo.py index 50c20da5fb659b2267d567e0910c6d5ddf01a9aa..b5f2686ebab300d8173827c59b23e0cd19c7fd82 100644 --- a/nomad/metainfo/metainfo.py +++ b/nomad/metainfo/metainfo.py @@ -1561,10 +1561,20 @@ class MSection(metaclass=MObjectMeta): # TODO find a way to make this a subclas # name = self.m_get(name_quantity_def) try: name = self.__dict__['name'] + main = '%s:%s' % (name, m_section_name) except KeyError: - name = '<noname>' + main = m_section_name - return '%s:%s' % (name, m_section_name) + more = '' + props = [ + prop + for prop in self.m_def.all_properties + if prop in self.__dict__] + + if len(props) > 10: + more = ', +%d more properties' % (len(props) - 10) + + return '%s(%s%s)' % (main, ', '.join(props[0:10]), more) def __getitem__(self, key): try: diff --git a/tests/app/test_api.py b/tests/app/test_api.py index 83c7e9ae4f9eb99af9522fa33c5b2d5c8fb96696..7340aed00c3319fe0d5641987727e7702de1f0f9 100644 --- a/tests/app/test_api.py +++ b/tests/app/test_api.py @@ -715,6 +715,18 @@ class TestArchive(UploadFilesBasedTests): # TODO assert archive contents + # test not exists + entry_metadata = EntryMetadata( + domain='dft', upload_id=published_wo_user_metadata.upload_id, + calc_id='test_id', published=True, with_embargo=False) + entry_metadata.a_elastic.index(refresh=True) + + rv = api.post(uri, content_type='application/json', data=json.dumps(dict(per_page=5, raise_error=True))) + assert rv.status_code == 401 + + rv = api.post(uri, content_type='application/json', data=json.dumps(dict(per_page=5, raise_error=False))) + assert rv.status_code == 200 + class TestRepo(): @pytest.fixture(scope='class')