Commit 105e181f authored by Lauri Himanen

Added separate dictionaries for accessing the calculations and repositories inside archive files. Disabled the writing to cache by default.
parent 8792f71e
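
As a quick orientation before the diff, a minimal sketch of how the write-cache change behaves, inferred from the hunks below. The import path, file name, and section names used here are placeholders, not part of this commit:

    # from <archive module> import Archive   # placeholder import path

    # use_write_cache now defaults to False in Archive, its subclasses and the factory.
    archive = Archive.factory("data.h5")     # "data.h5" is a made-up file name

    # With the cache disabled, ArchiveSection.__setitem__ raises ValueError
    # instead of storing the value in archive.overrides.
    try:
        archive.calculations["calc_1"]["some/path"] = 1.0   # made-up section names
    except ValueError:
        pass

    # Opting in keeps writes in the temporary overrides dictionary only;
    # nothing is written back to the source file.
    archive = Archive.factory("data.h5", use_write_cache=True)
    archive.calculations["calc_1"]["some/path"] = 1.0
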
@@ -5,7 +5,6 @@ import string
 import h5py
 import numpy as np
 from abc import ABCMeta, abstractmethod
-from Nomad.exceptions import NomadArchiveError


 class ArchiveSection(metaclass=ABCMeta):
@@ -95,9 +94,15 @@ class ArchiveSection(metaclass=ABCMeta):
         This method will store these values into a temporary location
         dictionary that is separate from the original data.
         """
-        path = "{}/{}".format(self._path, path)
-        self._archive.overrides[path] = value
+        if self._archive.use_write_cache:
+            path = "{}/{}".format(self._path, path)
+            self._archive.overrides[path] = value
+        else:
+            raise ValueError(
+                "Writing to the source file is currently disabled. If you want "
+                "to write to a local cache, set the 'use_write_cache' attribute"
+                " of this Archive to True."
+            )

     def __getitem__(self, key):
         """Used to get a direct child of this section by name.
@@ -234,7 +239,7 @@ class ArchiveSection(metaclass=ABCMeta):
         return parts, names, indices


-class Archive(ArchiveSection):
+class Archive(object):
     __metaclass__ = ABCMeta
     """Defines a storage independent interface to an archive file. To make a
     storage specific implementation just subclass this and define the required
@@ -251,7 +256,7 @@ class Archive(ArchiveSection):
         lifetime of this object. These values will not persists on the
         original file.
     """
-    def __init__(self, filepath):
+    def __init__(self, filepath, use_write_cache=False):
         """
         Args:
             filepath (string): Filepath to an archive file.
@@ -261,9 +266,12 @@ class Archive(ArchiveSection):
         self.overrides = {}
         self._path = ""
         self._archive = self
+        self.repositories = {}
+        self.calculations = {}
+        self.use_write_cache = use_write_cache

     @staticmethod
-    def factory(archive_path):
+    def factory(archive_path, use_write_cache=False):
         """A factory method for creating Archive objects based on the file
         type.
@@ -276,32 +284,35 @@ class Archive(ArchiveSection):
         """
         extension = archive_path.rsplit(".", 1)[-1]
         if extension == "json":
-            return ArchiveJSON(archive_path)
+            return ArchiveJSON(archive_path, use_write_cache)
         elif extension == "h5":
-            return ArchiveHDF5(archive_path)
+            return ArchiveHDF5(archive_path, use_write_cache)
         else:
             raise ValueError(
                 "Unknown archive filetype with extension '{}'."
                 .format(extension)
             )

-    def get_child(self, name):
-        return self.root_section.get_child(name)
-
-    def __len__(self):
-        return len(self.root_section)
-
-    def __contains__(self, key):
-        return self.root_section.contains(key)
-
-    def items(self):
-        return self.root_section.items()
-
-    def keys(self):
-        return self.root_section.keys()
-
-    def values(self):
-        return self.root_section.values()
+    def setup(self, root):
+        """Used to setup the dictionaries that contain the repositories and
+        calculations.
+        """
+        for repo_name, repo in root.items():
+            self.repositories[repo_name] = ArchiveSectionHDF5(
+                repo,
+                "{}".format(repo_name),
+                self._archive,
+                [[0]],
+                0
+            )
+            for calc_name, calc in repo.items():
+                self.calculations[calc_name] = ArchiveSectionHDF5(
+                    calc,
+                    "{}/{}".format(repo_name, calc_name),
+                    self._archive,
+                    [[0]],
+                    0
+                )


 class ArchiveHDF5(Archive):
@@ -316,15 +327,11 @@ class ArchiveHDF5(Archive):
         index_cache (dict): A cache containing the index data for groups and
             datasets.
     """
-    def __init__(self, filepath):
-        super().__init__(filepath)
-        try:
-            h5_root = h5py.File(filepath, "r")
-        except Exception as os_error:
-            raise NomadArchiveError(str(os_error))
-        self.root_section = ArchiveSectionHDF5(h5_root, "", self, [], local_index=0)
+    def __init__(self, filepath, use_write_cache=False):
+        super().__init__(filepath, use_write_cache)
+        h5_root = h5py.File(filepath, "r")
         self.index_cache = {}
+        self.setup(h5_root)


 class ArchiveSectionHDF5(ArchiveSection):
@@ -344,7 +351,7 @@ class ArchiveSectionHDF5(ArchiveSection):
         self._local_index = local_index

     def __len__(self):
-        pass
+        return len(self.keys())

     def __contains__(self, key):
         try:
@@ -467,7 +474,11 @@ class ArchiveSectionHDF5(ArchiveSection):
             global_index_path = "/".join(self._names) + "/" + index_path
             index_data = self._archive.index_cache.get(global_index_path)
             if index_data is None:
-                index_data = self._data[index_path].value
+                index_data = self._data.get(index_path)
+                if index_data is None:
+                    index_data = np.array([[0]])
+                else:
+                    index_data = index_data.value
                 self._archive.index_cache[global_index_path] = index_data
             index_datas.append(index_data)
@@ -533,7 +544,7 @@ class ArchiveSectionHDF5(ArchiveSection):
                     .format(child_path)
                 )
             if test_index.size > 1:
-                raise NomadArchiveError(
+                raise ValueError(
                     "The HDF file contains more than one dataset for the "
                     "path '{}'. "
                     .format(child_path)
@@ -564,19 +575,6 @@ class ArchiveSectionHDF5(ArchiveSection):
             if data.dtype == np.object:
                 data = np.array(data, dtype=np.str)

-            # Handle scalar values. The JSON files can distinguish
-            # between a scalar value and an array, but in HDF5 datasets
-            # are always arrays. This transformation will make lists
-            # with one value also scalar, which ensures compatibility
-            # with JSON. To properly handle scalar values, we could
-            # load the metainfo definitions and see the correct size
-            # from there, or then always return values as lists.
-            # Currently in the code when expecting a list, you should
-            # first ensure that the value returned from HDF5 ARchive is
-            # actually a list.
-            if data.shape == (1,):
-                data = data[0]
-
             return data

     def base64convert(self, x):
@@ -601,8 +599,8 @@ class ArchiveJSON(Archive):
     This implementation will load the entire JSON file into memory, which might
     become a problem with big files and parallel execution on the same machine.
     """
-    def __init__(self, filepath):
-        super().__init__(filepath)
+    def __init__(self, filepath, use_write_cache=False):
+        super().__init__(filepath, use_write_cache)
         with open(filepath, "r") as fin:
             json_root = json.load(fin)
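
For reference, a rough sketch of the file layout that the new setup() method appears to assume and the lookups it enables. The group names and file name below are invented for illustration:

    # Assumed HDF5 layout: one group per repository at the root, with one
    # sub-group per calculation, e.g. "repository_1/calc_a".
    archive = Archive.factory("data.h5")          # "data.h5" is a made-up file name

    # Top-level groups are wrapped as ArchiveSectionHDF5 objects in
    # archive.repositories, keyed by repository name ...
    repo = archive.repositories["repository_1"]

    # ... and their children are collected in archive.calculations,
    # keyed by calculation name and addressed by "<repo>/<calc>" paths.
    calc = archive.calculations["calc_a"]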