From 1dffd0ee88c1f966ba235ea71a1e88a772604a85 Mon Sep 17 00:00:00 2001
From: Alvin Noe Ladines <ladinesalvinnoe@gmail.com>
Date: Wed, 17 Apr 2024 13:04:58 +0000
Subject: [PATCH] Fix HDF5 ref docs

Changelog: Fixed
---
 docs/howto/customization/hdf5.md | 68 ++++++++++++++++----------------
 docs/howto/overview.md           |  2 +-
 mkdocs.yml                       |  2 +-
 nomad/datamodel/hdf5.py          |  3 +-
 4 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/docs/howto/customization/hdf5.md b/docs/howto/customization/hdf5.md
index 40a49f6326..71783e7dac 100644
--- a/docs/howto/customization/hdf5.md
+++ b/docs/howto/customization/hdf5.md
@@ -1,6 +1,6 @@
-# How to handle large quantities with HDF5
+# How to use HDF5 to handle large quantities
 
-The NOMAD schemas and processed data system is designed to describe and manage
+The NOMAD schemas and processed data system are designed to describe and manage
 intricate hierarchies of connected data. This is ideal for metadata and lots of small
 data quantities, but does not work for large quantities. Quantities are atomic and
 are always managed as a whole; there is currently no functionality to stream or
@@ -8,27 +8,26 @@ splice large quantities. Consequently, tools that produce or work with such data
 cannot scale.
 
 To address the issue, the option to use auxiliary storage systems optimized for large
-data is implemented. In the following we discuss two ways to write large datasets to HDF5.
-The first is the use of the quantity type `HDF5Reference` and second is the addition of
-quantity annotation.
+data is implemented. In the following, we discuss two quantity types that enable writing
+large datasets to HDF5: `HDF5Reference` and `HDF5Dataset`. Both are defined in
+`nomad.datamodel.hdf5`.
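+
+Both types can be imported from that module:
+
+```python
+from nomad.datamodel.hdf5 import HDF5Reference, HDF5Dataset
+```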
 
 ## HDF5Reference
 
 HDF5Reference is a metainfo quantity type intended to reference datasets in external raw
-HDF5 files. This can also be used to write large data into an HDF5 file following the
-structure of the nomad archive. In following example schema, we define two HDF5Reference
-quantities to illustrate these functionalities.
+HDF5 files. It is assumed that the dataset already exists in an HDF5 file and that a
+reference to it is assigned to this quantity. Static methods to read from and write to
+an HDF5 file are provided. The following example illustrates their use.
 
 ```python
 from nomad.datamodel import ArchiveSection
-from nomad.metainfo import HDF5Reference
+from nomad.datamodel.hdf5 import HDF5Reference
+from nomad.metainfo import Quantity
 
 class LargeData(ArchiveSection):
-    value_external = Quantity(type=HDF5Reference)
     value = Quantity(type=HDF5Reference)
 ```
 
-The writing and reading of quantity values to and from an HDF5 file occurs during
+The writing and reading of quantity values to and from an HDF5 file occur during
 processing. For illustration purposes, we mock this by creating `ServerContext`. Furthermore,
 we use this section definition for the `data` sub-section of EntryArchive.
 
@@ -51,43 +50,42 @@ archive = EntryArchive(
     data=LargeData(),
 )
 
-archive.data.value_external = 'external.h5#/path/to/data'
-archive.data.value = np.eye(5)
-archive.data.value
-# '/uploads/test_upload/archive/test_entry#/data/value'
+data = np.eye(3)
+path = 'external.h5#path/to/data'
+HDF5Reference.write_dataset(archive, data, path)
+archive.data.value = path
+HDF5Reference.read_dataset(archive, path)
+# array([[1., 0., 0.],
+#        [0., 1., 0.],
+#        [0., 0., 1.]])
 ```
 
-For `value_external`, we assign a reference to a dataset `/path/to/data` in a raw HDF5
-file `external.h5` in the same upload. This will simply store this reference and will not
-write it to another HDF5 file. To reference a file in another upload, follow the
-same form for [reference values](basics.md#different-forms-of-references) e.g.
-`/uploads/<upload_id>/raw/large_data.hdf5#group/large_field`
-In contrast, when assigning an array to `value`, this is written to an HDF5 extension of the
-entry archive and serialized as `/uploads/test_upload/archive/test_entry#/data/value`.
-The structure of the HDF5 file will be the same as that of the archive.
+We use `write_dataset` to write our data into a raw HDF5 file in `test_upload`, with the
+target file name and dataset location given by `path`. The `archive` argument is required
+to resolve the upload metadata. We then assign this reference to `value`. To reference a
+file in another upload, follow the same form as for
+[reference values](basics.md#different-forms-of-references), e.g.
+`/uploads/<upload_id>/raw/large_data.hdf5#group/large_field`.
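+
+For example, such a cross-upload reference is assigned like any other value (the upload
+id and file name here are placeholders):
+
+```python
+archive.data.value = '/uploads/<upload_id>/raw/large_data.hdf5#group/large_field'
+```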
+
 !!! important
     When reassigning a different value for an HDF5 archive quantity, it is necessary that the data
     attributes (shape and type) are preserved.
 
-## Existing quantities for large arrays
+To read a dataset, use `read_dataset` and provide a reference. This returns the value
+cast to the type of the dataset.
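+
+For example, the reference previously stored on the section can be passed back directly:
+
+```python
+# Returns the dataset value, cast to its stored type.
+HDF5Reference.read_dataset(archive, archive.data.value)
+```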
 
-For existing quantity definitions which one uses for large arrays, it is also possible
-to write the data to the HDF5 representation of the archive. This can be done by adding
-a `serialization` annotation to the quantity definition.
+## HDF5Dataset
+
+To use HDF5 storage for archive quantities, use the quantity type `HDF5Dataset`.
 
 ```python
-from nomad.datamodel.metainfo.annotations import HDF5SerializationAnnotation
+from nomad.datamodel import ArchiveSection
+from nomad.datamodel.hdf5 import HDF5Dataset
+from nomad.metainfo import Quantity
 
 class LargeData(ArchiveSection):
-    value = Quantity(type=np.float64)
-    value.m_annotations = dict(serialization=HDF5SerializationAnnotation())
+    value = Quantity(type=HDF5Dataset)
 ```
 
-Upon serialization, the assigned value will also be written to the archive HDF5 file.
-However, the value will remain in memory. This is the difference compared to HDF5Rerence
-where the value is immediately written to an HDF5 file and serialized as reference.
-During serialization, one also needs to provide the archive context in order to resolve
-the reference.
+Unlike `HDF5Reference`, the value remains accessible in memory. Upon serialization, it is
+also written to the archive HDF5 file and serialized as the reference
+`/uploads/test_upload/archive/test_entry#/data/value`.
 
 ```python
 archive.data.value = np.ones(3)
diff --git a/docs/howto/overview.md b/docs/howto/overview.md
index c6a59ac9b1..c52c902d3e 100644
--- a/docs/howto/overview.md
+++ b/docs/howto/overview.md
@@ -65,12 +65,12 @@ Customize NOMAD, write plugins, and tailor NOMAD Oasis.
 - [Use base sections](customization/base_sections.md)
 - [Parse tabular data](customization/tabular.md)
 - [Define workflows](customization/workflows.md)
-- [Reference hdf5 files](customization/hdf5.md)
 - [Write plugins](customization/plugins.md)
 - [Write a python schema](customization/schemas.md)
 - [Write a parser](customization/parsers.md)
 - [Write a normalizer](customization/normalizers.md)
 - [Work with units](customization/units.md)
+- [Use HDF5 to handle large quantities](customization/hdf5.md)
 
 </div>
 <div markdown="block">
diff --git a/mkdocs.yml b/mkdocs.yml
index ce2a80c408..f98185c367 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -39,12 +39,12 @@ nav:
       - Use base sections: howto/customization/base_sections.md
       - Parse tabular data: howto/customization/tabular.md
       - Define workflows: howto/customization/workflows.md
-      - Handle large quantities: howto/customization/hdf5.md
       - Write plugins: howto/customization/plugins.md
       - Write a schema plugin: howto/customization/schemas.md
       - Write a parser: howto/customization/parsers.md
       - Write a normalizer: howto/customization/normalizers.md
       - Work with units: howto/customization/units.md
+      - Use HDF5 to handle large quantities: howto/customization/hdf5.md
     - Development:
       - Get started: howto/develop/setup.md
       - Navigate the code: howto/develop/code.md
diff --git a/nomad/datamodel/hdf5.py b/nomad/datamodel/hdf5.py
index bf2a5e4e3e..34be73a0ca 100644
--- a/nomad/datamodel/hdf5.py
+++ b/nomad/datamodel/hdf5.py
@@ -47,7 +47,7 @@ def read_hdf5_dataset(hdf5_file: h5py.File, path: str) -> h5py.Dataset:
     )[match['path']]
 
 
-def write_hdf5_dataset(value: Any, hdf5_file: h5py.File, path: str) -> str:
+def write_hdf5_dataset(value: Any, hdf5_file: h5py.File, path: str) -> None:
     """
     Write data to HDF5 file.
     """
@@ -59,7 +59,6 @@ def write_hdf5_dataset(value: Any, hdf5_file: h5py.File, path: str) -> str:
         dtype=value.dtype if hasattr(value, 'dtype') else None,
     )
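+    # Strip pint units if present: only the magnitude is written to the file.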
     dataset[...] = value.magnitude if hasattr(value, 'magnitude') else value
-    return dataset
 
 
 class _HDF5Reference(DataType):
-- 
GitLab