Skip to content
Snippets Groups Projects
Commit ab8abff7 authored by Mohamed, Fawzi Roberto (fawzi)'s avatar Mohamed, Fawzi Roberto (fawzi)
Browse files

nomad_utils files

parent d26811c0
No related branches found
No related tags found
No related merge requests found
import ase.calculators.calculator
# Register the NOMAD energy names as accepted calculator properties.
# Fixed: the module was imported as 'ase.calculators.calculator', so the
# bare name 'calculator' used here previously would raise NameError.
# '+=' extends the list in place, so the change is visible to every
# module that imported ase.calculators.calculator.
if 'potential_energy' not in ase.calculators.calculator.all_properties:
    ase.calculators.calculator.all_properties += ['potential_energy',
                                                  'kinetic_energy']
This diff is collapsed.
"""File formats.
This module implements the read(), iread() and write() functions in ase.io.
For each file format there is a namedtuple (IOFormat) that has the following
elements:
* a read(filename, index, **kwargs) generator that will yield Atoms objects
* a write(filename, images) function
* a 'single' boolean (False if multiple configurations are supported)
* a 'acceptsfd' boolean (True if file-descriptors are accepted)
There is a dict 'ioformats' that is filled with IOFormat objects as they are
needed. The 'initialize()' function will create the IOFormat object by
looking at the all_formats dict and by importing the correct read/write
functions from the correct module. The 'single' and 'acceptsfd' bools are
parsed from a two-character string in the all_formats dict below.
Example
=======
The xyz format is implemented in the ase/io/xyz.py file which has a
read_xyz() generator and a write_xyz() function.
"""
import collections
import functools
import inspect
import os
import sys
from ase.atoms import Atoms
from ase.utils import import_module, basestring, PurePath
from ase.parallel import parallel_function, parallel_generator
class UnknownFileTypeError(Exception):
    """Raised by filetype() when the format of a file cannot be determined."""
    pass
# Registry record for one file format; 'read'/'write' may be None when a
# format supports only one direction.
IOFormat = collections.namedtuple('IOFormat',
                                  'read, write, single, acceptsfd, isbinary')
ioformats = {}  # will be filled at run-time

# 1=single, +=multiple, F=accepts a file-descriptor, S=needs a file-name str,
# B=like F, but opens in binary mode
all_formats = {
    'nomad-json': ('JSON from Nomad archive', '+F'),
    'nomad-ziptxt': ('ZIPPED TXT from Nomad archive', '+F'),
}

# Special cases:
# Formats whose module name differs from the format name (none registered
# in this trimmed module).
format2modulename = {
}

# Extra extension -> format-name mappings consulted by filetype() (empty
# here).
extension2format = {
}

# netCDF 'Conventions' attribute -> format name.
# NOTE(review): not referenced by any code visible in this file; presumably
# used by the full ase.io.formats -- confirm before removing.
netcdfconventions2format = {
    'http://www.etsf.eu/fileformats': 'etsf',
    'AMBER': 'netcdftrajectory'
}
def initialize(format):
    """Import the read/write functions for *format* and register them.

    Fills the module-level ``ioformats`` dict; returns immediately if the
    format was already registered.  Raises ValueError when no module or
    no read/write function can be found for the format.
    """
    if format in ioformats:
        return  # already registered
    _format = format.replace('-', '_')
    modname = format2modulename.get(format, _format)
    try:
        mod = import_module('ase.io.' + modname)
    except ImportError as err:
        raise ValueError('File format not recognized: %s. Error: %s'
                         % (format, err))
    read = getattr(mod, 'read_' + _format, None)
    write = getattr(mod, 'write_' + _format, None)
    # Plain (non-generator) read functions are wrapped so that every
    # registered reader behaves as a generator.
    if read and not inspect.isgeneratorfunction(read):
        read = functools.partial(wrap_read_function, read)
    if not read and not write:
        raise ValueError('File format not recognized: ' + format)
    flags = all_formats[format][1]
    assert flags[1] in 'BFS'
    ioformats[format] = IOFormat(read, write,
                                 single=(flags[0] == '1'),
                                 acceptsfd=(flags[1] != 'S'),
                                 isbinary=(flags[1] == 'B'))
def get_ioformat(format):
    """Initialize and return IOFormat tuple.

    Ensures the read/write functions for *format* are imported (see
    initialize()) before looking the format up in ``ioformats``.
    """
    initialize(format)
    return ioformats[format]
def get_compression(filename):
    """Split a recognised compression suffix off *filename*.

    Recognises ``.gz``, ``.bz2``, ``.xz``.  Returns the filename without
    the suffix together with the suffix itself, or the unchanged filename
    and ``None`` when no known compression extension is present.

    >>> get_compression('H2O.pdb.gz')
    ('H2O.pdb', 'gz')
    >>> get_compression('crystal.cif')
    ('crystal.cif', None)

    Parameters
    ==========
    filename: str
        Full filename including extension.

    Returns
    =======
    (root, extension): (str, str or None)
        Filename split into root without extension, and the extension
        indicating compression format.  Will not split if compression
        is not recognised.
    """
    # Use the stdlib splitter; it handles dotfiles and missing extensions.
    root, ext = os.path.splitext(filename)
    ext = ext.strip('.')  # splitext keeps the leading '.'
    if ext in ('gz', 'bz2', 'xz'):  # update if anything is added
        return root, ext
    return filename, None
def open_with_compression(filename, mode='r'):
    """Open *filename* transparently, decompressing if necessary.

    Wrapper around the builtin ``open`` that guesses the compression from
    the filename (via get_compression) and returns a file-like object
    behaving like a plain file.  Handles ``gz`` (gzip), ``bz2`` (bzip2)
    and ``xz`` (lzma); for ``xz`` either Python 3 or the
    ``backports.lzma`` module is required.

    Supported modes are:

    * 'r', 'rt', 'w', 'wt' (and 'a', 'at') for text mode read and write.
    * 'rb', 'wb' for binary read and write.

    Parameters
    ==========
    filename: str
        Path to the file to open, including any extensions that indicate
        the compression used.
    mode: str
        Mode to open the file, same as for builtin ``open``, e.g 'r', 'w'.

    Returns
    =======
    fd: file
        File-like object open with the specified mode.
    """
    if sys.version_info[0] > 2:
        # Compressed streams often default to binary; force text mode on
        # Python 3 unless the caller explicitly asked for bytes.
        mode = {'r': 'rt', 'w': 'wt', 'a': 'at'}.get(mode, mode)
    else:
        # The gzip shipped with Anaconda Python 2 on Windows forcibly
        # adds a 'b'; strip any 't' and rely on implicit conversions.
        mode = mode.strip('t')

    root, compression = get_compression(filename)

    if compression == 'gz':
        import gzip
        return gzip.open(filename, mode=mode)
    if compression == 'bz2':
        import bz2
        if hasattr(bz2, 'open'):
            # Python 3 only
            return bz2.open(filename, mode=mode)
        # Python 2
        return bz2.BZ2File(filename, mode=mode)
    if compression == 'xz':
        try:
            from lzma import open as lzma_open
        except ImportError:
            from backports.lzma import open as lzma_open
        return lzma_open(filename, mode)
    # No (recognised) compression: plain open.
    return open(filename, mode)
def wrap_read_function(read, filename, index=None, **kwargs):
    """Adapt a plain read-function so that it behaves as a generator."""
    if index is None:
        # Single-configuration reader: yield its one result.
        yield read(filename, **kwargs)
    else:
        # Multi-configuration reader: it returns an iterable of images.
        for image in read(filename, index, **kwargs):
            yield image
def write(filename, images, format=None, parallel=True, append=False,
          **kwargs):
    """Write Atoms object(s) to file.

    filename: str or file
        Name of the file to write to or a file descriptor.  The name '-'
        means standard output.
    images: Atoms object or list of Atoms objects
        A single Atoms object or a list of Atoms objects.
    format: str
        Used to specify the file-format.  If not given, the file-format
        will be taken from the suffix of the filename.
    parallel: bool
        Default is to write on master only.  Use parallel=False to write
        from all slaves.
    append: bool
        Default is to open files in 'w' or 'wb' mode, overwriting
        existing files.  In some cases opening the file in 'a' or 'ab'
        mode (appending) is useful, e.g. when writing trajectories or
        saving multiple Atoms objects in one file.
        WARNING: If the file format does not support multiple entries
        without additional keywords/headers, files created using
        'append=True' might not be readable by any program!  They will
        nevertheless be written without error message.

    The use of additional keywords is format specific.
    """
    fd = None
    if isinstance(filename, basestring):
        filename = os.path.expanduser(filename)
        if filename == '-':
            fd = sys.stdout
            filename = None
        elif format is None:
            format = filetype(filename, read=False)
    else:
        # The caller passed an already-open file descriptor.
        fd = filename
        filename = None
    format = format or 'json'  # default is json

    io = get_ioformat(format)
    _write(filename, fd, format, io, images,
           parallel=parallel, append=append, **kwargs)
@parallel_function
def _write(filename, fd, format, io, images, parallel=None, append=False,
           **kwargs):
    """Backend of write(): dispatch *images* to the format's writer.

    Exactly one of *filename*/*fd* is set by the caller; *io* is the
    IOFormat tuple for *format*.  Runs on the master process only (see
    the parallel_function decorator).
    """
    if isinstance(images, Atoms):
        images = [images]

    if io.single:
        if len(images) > 1:
            raise ValueError('{}-format can only store 1 Atoms object.'
                             .format(format))
        images = images[0]

    if io.write is None:
        raise ValueError("Can't write to {}-format".format(format))

    # Special case for json-format:
    if format == 'json' and len(images) > 1:
        if filename is not None:
            io.write(filename, images, **kwargs)
            return
        # Fixed: the adjacent string literals previously concatenated to
        # "...file-descriptorusing json-format." (missing space).
        raise ValueError("Can't write more than one image to file-descriptor "
                         'using json-format.')

    if io.acceptsfd:
        open_new = (fd is None)
        if open_new:
            mode = 'wb' if io.isbinary else 'w'
            if append:
                mode = mode.replace('w', 'a')
            fd = open_with_compression(filename, mode)
        io.write(fd, images, **kwargs)
        if open_new:
            fd.close()
    else:
        if fd is not None:
            raise ValueError("Can't write {}-format to file-descriptor"
                             .format(format))
        # Only pass 'append' through when the writer actually accepts it.
        if 'append' in io.write.__code__.co_varnames:
            io.write(filename, images, append=append, **kwargs)
        elif append:
            raise ValueError("Cannot append to {}-format, write-function "
                             "does not support the append keyword."
                             .format(format))
        else:
            io.write(filename, images, **kwargs)
def read(filename, index=None, format=None, parallel=True, **kwargs):
    """Read Atoms object(s) from file.

    filename: str or file
        Name of the file to read from or a file descriptor.
    index: int, slice or str
        The last configuration will be returned by default.  Examples:

        * ``index=0``: first configuration
        * ``index=-2``: second to last
        * ``index=':'`` or ``index=slice(None)``: all
        * ``index='-3:'`` or ``index=slice(-3, None)``: three last
        * ``index='::2'`` or ``index=slice(0, None, 2)``: even
        * ``index='1::2'`` or ``index=slice(1, None, 2)``: odd
    format: str
        Used to specify the file-format.  If not given, the file-format
        will be guessed by the *filetype* function.
    parallel: bool
        Default is to read on master and broadcast to slaves.  Use
        parallel=False to read on all slaves.

    Many formats allow an open file-like object to be passed instead of
    ``filename``.  In this case the format cannot be auto-detected, so
    the ``format`` argument should be explicitly given.
    """
    if isinstance(filename, PurePath):
        filename = str(filename)
    if isinstance(index, basestring):
        # Strings that fail to parse are left alone; parse_filename may
        # still replace them via an '@' suffix.
        try:
            index = string2index(index)
        except ValueError:
            pass
    filename, index = parse_filename(filename, index)
    if index is None:
        index = -1
    format = format or filetype(filename)
    io = get_ioformat(format)

    if not isinstance(index, (slice, basestring)):
        # Single integer: read from that point and return the first image.
        return next(_iread(filename, slice(index, None), format, io,
                           parallel=parallel, **kwargs))
    return list(_iread(filename, index, format, io,
                       parallel=parallel, **kwargs))
def iread(filename, index=None, format=None, parallel=True, **kwargs):
    """Iterator for reading Atoms objects from file.

    Works as the `read` function, but yields one Atoms object at a time
    instead of all at once.
    """
    if isinstance(index, basestring):
        index = string2index(index)
    filename, index = parse_filename(filename, index)
    if index is None or index == ':':
        index = slice(None, None, None)
    if not isinstance(index, (slice, basestring)):
        # Single integer: make a one-element slice.  The ``or None``
        # handles index == -1 (slice(-1, None) means "the last image").
        index = slice(index, (index + 1) or None)
    format = format or filetype(filename)
    io = get_ioformat(format)
    for image in _iread(filename, index, format, io, parallel=parallel,
                        **kwargs):
        yield image
@parallel_generator
def _iread(filename, index, format, io, parallel=None, full_output=False,
           **kwargs):
    """Backend generator shared by read() and iread().

    Opens *filename* as required by the IOFormat *io* and yields either
    Atoms objects or, with full_output=True, the raw dicts produced by
    the reader.
    """
    if isinstance(filename, basestring):
        filename = os.path.expanduser(filename)

    if not io.read:
        raise ValueError("Can't read from {}-format".format(format))

    if io.single:
        # Single-configuration formats ignore the index entirely; only
        # "first"/"last" style requests make sense.
        start = index.start
        assert start is None or start == 0 or start == -1
        args = ()
    else:
        args = (index,)

    must_close_fd = False
    if isinstance(filename, basestring):
        if io.acceptsfd:
            mode = 'rb' if io.isbinary else 'r'
            fd = open_with_compression(filename, mode)
            must_close_fd = True
        else:
            fd = filename
    else:
        # An open file-like object was passed in directly.
        assert io.acceptsfd
        fd = filename

    # Make sure fd is closed even if the consumer abandons the loop.
    try:
        for item in io.read(fd, *args, **kwargs):
            if not isinstance(item, dict):
                item = {'atoms': item}
            yield item if full_output else item['atoms']
    finally:
        if must_close_fd:
            fd.close()
def parse_filename(filename, index=None):
    """Split a ``name@index`` filename into ``(name, index)``.

    Non-string filenames and names without '@' pass through unchanged.
    An explicitly supplied slice *index* always wins over the '@' suffix.
    """
    if not isinstance(filename, basestring):
        return filename, index
    if '@' not in os.path.basename(filename):
        return filename, index
    newfilename, tail = filename.rsplit('@', 1)
    if isinstance(index, slice):
        return newfilename, index
    try:
        tail = string2index(tail)
    except ValueError:
        # Leave the unparsable suffix as a plain string.
        pass
    return newfilename, tail
def string2index(string):
    """Convert an index string like '7' or '1:5:2' to an int or slice."""
    if ':' not in string:
        return int(string)
    # Empty fields become None, e.g. '::2' -> [None, None, 2].
    parts = [int(field) if field else None for field in string.split(':')]
    while len(parts) < 3:
        parts.append(None)
    return slice(*parts)
def filetype(filename, read=True, guess=True):
    """Try to guess the type of the file.

    In this trimmed module only the NOMAD filename signatures
    ('*.nomad.json', '*.nomad.zip', optionally compressed) are
    recognised; anything else raises UnknownFileTypeError.

    Can be used from the command-line also::

        $ ase info filename ...
    """
    ext = None
    if isinstance(filename, basestring):
        # Strip any compression extension that we can read transparently.
        root, compression = get_compression(filename)
        basename = os.path.basename(root)
        if basename.endswith('.nomad.json'):
            return 'nomad-json'
        if basename.endswith('.nomad.zip'):
            return 'nomad-ziptxt'
    # NOTE(review): 'ext' is never assigned beyond its initial None, so
    # the fall-through below can only raise; kept for structural parity
    # with the full ase.io.formats module.
    format = extension2format.get(ext)
    if format is None and guess:
        format = ext
    if format is None:
        raise UnknownFileTypeError('Could not guess file type')
    return format
from __future__ import print_function
import json
class CLICommand:
    """CLI plugin: fetch NOMAD calculations and store them as JSON files."""

    short_description = 'Get calculations from NOMAD and write to JSON files.'

    @staticmethod
    def add_arguments(p):
        # Fixed: a stray leading '+' (a leftover diff marker) before the
        # help= keyword made this call a syntax error.
        p.add_argument('uri', nargs='+', metavar='nmd://<hash>',
                       help='URIs to get')

    @staticmethod
    def run(args):
        from ase.nomad import download
        for uri in args.uri:
            calculation = download(uri)
            # Hashes contain '/', which cannot appear in a filename.
            identifier = calculation.hash.replace('/', '.')
            fname = 'nmd.{}.nomad.json'.format(identifier)
            with open(fname, 'w') as fd:
                json.dump(calculation, fd)
            print(uri)
import json
from nomad_utils.nomad_fetch import dict2images
from ase.utils import basestring
def read_nomad_json(fd, index=':', only_atoms=False):
    """Read images from a NOMAD JSON archive open at *fd*.

    *index* may be an int or a slice-style string; the selected image(s)
    are returned from the list produced by dict2images().
    """
    # NOTE: threading the index through like this is awkward, but it is
    # what ase.io.formats expects of a reader.
    from ase.io.formats import string2index
    if isinstance(index, basestring):
        index = string2index(index)
    archive = json.load(fd)
    return list(dict2images(archive, only_atoms=only_atoms))[index]
import ase
from ase.utils import basestring
def read_nomad_ziptxt(fd, index=':', only_atoms=False, skip_errors=False):
    """Read Atoms from a text file listing NOMAD archive URIs.

    Each non-comment line of *fd* (opened in binary mode) holds a NOMAD
    URI; anything from '/section_run' onwards is stripped before the
    archive is downloaded.  Returns the collected images selected by
    *index* (an int or slice-style string).
    """
    from ase.io.formats import string2index
    # Fixed: 'nomad_utils' was referenced below without ever being
    # imported, raising NameError on first use.
    from nomad_utils import nomad_fetch
    if isinstance(index, basestring):
        index = string2index(index)
    images = []
    for bline in fd:
        line = bline.decode("utf-8")
        if line.startswith('#'):
            continue
        # NOTE(review): the line is not stripped, so a trailing newline
        # survives when '/section_run' is absent -- confirm upstream URIs.
        nmduri = line.split('/section_run')
        print('Requesting NOMAD archive at ' + nmduri[0])
        entry = nomad_fetch.download(nmduri[0], only_atoms=only_atoms,
                                     skip_errors=skip_errors)
        nmd_images = list(entry.toatoms())
        if len(nmd_images) > 0:
            print('Adding ' + str(len(nmd_images)) + ' structure(s) with ' +
                  ','.join(list(set([str(ni.get_chemical_formula('reduce'))
                                     for ni in nmd_images]))))
        else:
            print('No structures retrieved from this NOMAD archive!')
        images.extend(nmd_images)
    return list(images)[index]
import numpy as np
from ase.calculators.calculator import Calculator, all_properties
# Make the NOMAD energy names accepted ASE calculator properties.
# NOTE: '+=' extends the imported list in place, so the change is visible
# through ase.calculators.calculator as well; 'kinetic_energy' is assumed
# to be missing exactly when 'potential_energy' is.
if 'potential_energy' not in all_properties:
    all_properties += ['potential_energy', 'kinetic_energy']
from ase.calculators.calculator import PropertyNotImplementedError
class SinglePointCalculator(Calculator):
    """Special calculator for a single configuration.

    Used to remember the energy, force and stress for a given
    configuration.  If the positions, atomic numbers, unit cell, or
    boundary conditions are changed, then asking for
    energy/forces/stress will raise an exception.
    """

    name = 'unknown'

    def __init__(self, atoms=None, **results):
        """Save energy, forces, stress, ... for the current configuration."""
        Calculator.__init__(self)
        self.results = {}
        for key, value in results.items():
            # 'nomad_*' entries carry NOMAD metadata and bypass the ASE
            # property whitelist check.
            if not key.startswith('nomad_'):
                assert key in all_properties
            if value is None:
                continue
            if key.startswith('nomad_') or key in ('energy', 'magmom',
                                                   'free_energy'):
                # Stored verbatim: scalars and arbitrary metadata.
                self.results[key] = value
            else:
                # Everything else is an array-valued property.
                self.results[key] = np.array(value, float)
        if atoms:
            self.atoms = atoms.copy()

    def __str__(self):
        entries = []
        for key, value in sorted(self.results.items()):
            if np.isscalar(value):
                entries.append('{}={}'.format(key, value))
            else:
                entries.append('{}=...'.format(key))
        return '{}({})'.format(self.__class__.__name__, ', '.join(entries))

    def get_property(self, name, atoms=None, allow_calculation=True):
        # A changed configuration or unknown property cannot be
        # recomputed here -- there is nothing to calculate.
        if name not in self.results or self.check_state(atoms):
            if allow_calculation:
                raise PropertyNotImplementedError(
                    'The property "{0}" is not available.'.format(name))
            return None
        value = self.results[name]
        if isinstance(value, np.ndarray):
            # Hand out a copy so callers cannot mutate the cached array.
            value = value.copy()
        return value
class SinglePointKPoint:
    """Container for the eigenvalues/occupations of one (spin, k) channel."""

    def __init__(self, weight, s, k, eps_n=None, f_n=None):
        """Store k-point *weight*, spin index *s*, k-point index *k*,
        eigenvalues *eps_n* and occupation numbers *f_n*.

        Fixed: eps_n/f_n used mutable default arguments ([]), so all
        instances created without explicit lists shared one list object.
        """
        self.weight = weight
        self.s = s  # spin index
        self.k = k  # k-point index
        self.eps_n = [] if eps_n is None else eps_n
        self.f_n = [] if f_n is None else f_n
class SinglePointDFTCalculator(SinglePointCalculator):
    """SinglePointCalculator with DFT extras.

    Additionally remembers the Fermi level, the Brillouin-zone k-point
    sets and, via the ``kpts`` attribute (a list of SinglePointKPoint
    objects that the caller must assign after construction), per-k-point
    eigenvalues and occupation numbers.
    """

    def __init__(self, atoms,
                 efermi=None, bzkpts=None, ibzkpts=None, bz2ibz=None,
                 **results):
        self.bz_kpts = bzkpts
        self.ibz_kpts = ibzkpts
        self.bz2ibz = bz2ibz
        self.eFermi = efermi
        SinglePointCalculator.__init__(self, atoms, **results)
        # Remains None until the caller fills it in; the k-point-based
        # getters below return None (or raise) while it is unset.
        self.kpts = None

    def get_fermi_level(self):
        """Return the Fermi-level(s)."""
        return self.eFermi

    def get_bz_to_ibz_map(self):
        # Mapping from BZ k-point index to IBZ index, as given to __init__.
        return self.bz2ibz

    def get_bz_k_points(self):
        """Return the k-points."""
        return self.bz_kpts

    def get_number_of_spins(self):
        """Return the number of spins in the calculation.

        Spin-paired calculations: 1, spin-polarized calculation: 2.
        Returns None when no k-points have been stored.
        """
        if self.kpts is not None:
            # Count the distinct spin indices among the stored k-points.
            nspin = set()
            for kpt in self.kpts:
                nspin.add(kpt.s)
            return len(nspin)
        return None

    def get_spin_polarized(self):
        """Is it a spin-polarized calculation?"""
        nos = self.get_number_of_spins()
        if nos is not None:
            return nos == 2
        return None

    def get_ibz_k_points(self):
        """Return k-points in the irreducible part of the Brillouin zone."""
        return self.ibz_kpts

    def get_kpt(self, kpt=0, spin=0):
        # Return the kpt'th stored k-point of the given spin channel
        # (counting only k-points with matching spin); None if missing.
        if self.kpts is not None:
            counter = 0
            for kpoint in self.kpts:
                if kpoint.s == spin:
                    if kpt == counter:
                        return kpoint
                    counter += 1
        return None

    def get_occupation_numbers(self, kpt=0, spin=0):
        """Return occupation number array."""
        kpoint = self.get_kpt(kpt, spin)
        if kpoint is not None:
            return kpoint.f_n
        return None

    def get_eigenvalues(self, kpt=0, spin=0):
        """Return eigenvalue array."""
        kpoint = self.get_kpt(kpt, spin)
        if kpoint is not None:
            return kpoint.eps_n
        return None

    def get_homo_lumo(self):
        """Return HOMO and LUMO energies."""
        if self.kpts is None:
            raise RuntimeError('No kpts')
        eHs = []
        eLs = []
        for kpt in self.kpts:
            eH, eL = self.get_homo_lumo_by_spin(kpt.s)
            eHs.append(eH)
            eLs.append(eL)
        # Overall HOMO is the highest per-spin HOMO; LUMO the lowest LUMO.
        return np.array(eHs).max(), np.array(eLs).min()

    def get_homo_lumo_by_spin(self, spin=0):
        """Return HOMO and LUMO energies for a given spin."""
        if self.kpts is None:
            raise RuntimeError('No kpts')
        for kpt in self.kpts:
            if kpt.s == spin:
                break
        else:
            raise RuntimeError('No k-point with spin {0}'.format(spin))
        if self.eFermi is None:
            raise RuntimeError('Fermi level is not available')
        # Scan every eigenvalue of this spin channel: HOMO is the largest
        # value at or below the Fermi level, LUMO the smallest above it.
        eH = -1.e32
        eL = 1.e32
        for kpt in self.kpts:
            if kpt.s == spin:
                for e in kpt.eps_n:
                    if e <= self.eFermi:
                        eH = max(eH, e)
                    else:
                        eL = min(eL, e)
        return eH, eL
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment