diff --git a/README.md b/README.md index f1377f62c55dc704412e828f5a2f0a3b90e4326a..d3ef99921569ba772156d3971410c208a607f815 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,9 @@ contributing, and API reference. Omitted versions are plain bugfix releases with only minor changes and fixes. +### v0.9.8 +- A new library for parsing text-based raw files. + ### v0.9.3 - Encyclopedia with dedicated materials search index. diff --git a/dependencies/parsers/elastic b/dependencies/parsers/elastic index c17dbecac0f395b48fc65b0c7dcc2ba4de1be271..9d4518fefb1f1146b7fa7ed95547822683020533 160000 --- a/dependencies/parsers/elastic +++ b/dependencies/parsers/elastic @@ -1 +1 @@ -Subproject commit c17dbecac0f395b48fc65b0c7dcc2ba4de1be271 +Subproject commit 9d4518fefb1f1146b7fa7ed95547822683020533 diff --git a/dependencies/parsers/gromacs b/dependencies/parsers/gromacs index 3f78e35b56714fe8c79b010ed52d97def19cb2dc..3b5866bc8dff77d398eddc6baceb19e1c30180c1 160000 --- a/dependencies/parsers/gromacs +++ b/dependencies/parsers/gromacs @@ -1 +1 @@ -Subproject commit 3f78e35b56714fe8c79b010ed52d97def19cb2dc +Subproject commit 3b5866bc8dff77d398eddc6baceb19e1c30180c1 diff --git a/dependencies/parsers/lammps b/dependencies/parsers/lammps index 7ed44cab4410099208f5e399bfa4c1dc2e9c29fe..8944d5ffd4cac01e43aaac5c5687b0a8831f2bc7 160000 --- a/dependencies/parsers/lammps +++ b/dependencies/parsers/lammps @@ -1 +1 @@ -Subproject commit 7ed44cab4410099208f5e399bfa4c1dc2e9c29fe +Subproject commit 8944d5ffd4cac01e43aaac5c5687b0a8831f2bc7 diff --git a/dependencies/python_common b/dependencies/python_common index afdd0937aab2681ca3912cfd9d95c7633fdcd7b9..c1aca04237d69097bbeb17d3a397be66e9c6797f 160000 --- a/dependencies/python_common +++ b/dependencies/python_common @@ -1 +1 @@ -Subproject commit afdd0937aab2681ca3912cfd9d95c7633fdcd7b9 +Subproject commit c1aca04237d69097bbeb17d3a397be66e9c6797f diff --git a/nomad/datamodel/metainfo/common.py b/nomad/datamodel/metainfo/common.py index 
5c08eb1dde24cc680b646a071b215b0ca83556fa..61acd2d8a7d8f14008c86e855ceaf06417a675fb 100644 --- a/nomad/datamodel/metainfo/common.py +++ b/nomad/datamodel/metainfo/common.py @@ -963,7 +963,7 @@ class section_method(public.section_method): gw_frequency_number = Quantity( type=np.dtype(np.int32), - shape=[], + shape=['gw_number_of_frequencies'], description=''' Number referring to the frequency used in the calculation of the self energy. ''', @@ -971,7 +971,7 @@ class section_method(public.section_method): gw_frequency_values = Quantity( type=np.dtype(np.float64), - shape=[], + shape=['gw_number_of_frequencies'], unit='joule', description=''' Values of the frequency used in the calculation of the self energy. @@ -980,7 +980,7 @@ class section_method(public.section_method): gw_frequency_weights = Quantity( type=np.dtype(np.float64), - shape=[], + shape=['gw_number_of_frequencies'], description=''' Weights of the frequency used in the calculation of the self energy. ''', diff --git a/nomad/datamodel/metainfo/public.py b/nomad/datamodel/metainfo/public.py index fc30912cc763d2a4b3a88db438fe36e2f2bc05ca..eaf5c0be7c4a9c3406c6f13ae48f23bac8cbc9f8 100644 --- a/nomad/datamodel/metainfo/public.py +++ b/nomad/datamodel/metainfo/public.py @@ -3614,6 +3614,16 @@ class section_scf_iteration(MSection): m_def = Section(validate=False, a_legacy=LegacyDefinition(name='section_scf_iteration')) + charge_total_scf_iteration = Quantity( + type=np.dtype(np.float64), + shape=[], + unit='coulomb', + description=''' + Value of the total charge, calculated with the method described in XC_method + during each self-consistent field (SCF) iteration. 
+ ''', + a_legacy=LegacyDefinition(name='charge_total_scf_iteration')) + electronic_kinetic_energy_scf_iteration = Quantity( type=np.dtype(np.float64), shape=[], @@ -3867,6 +3877,16 @@ class section_scf_iteration(MSection): categories=[time_info, accessory_info], a_legacy=LegacyDefinition(name='time_scf_iteration_wall_start')) + time_scf_iteration = Quantity( + type=np.dtype(np.float64), + shape=[], + unit='second', + description=''' + Total time of the self-consistent field (SCF) iteration. + ''', + categories=[time_info, accessory_info], + a_legacy=LegacyDefinition(name='time_scf_iteration')) + class section_single_configuration_calculation(MSection): ''' @@ -3995,6 +4015,15 @@ class section_single_configuration_calculation(MSection): categories=[atom_forces_type], a_legacy=LegacyDefinition(name='atom_forces')) + charge_total = Quantity( + type=np.dtype(np.float64), + shape=[], + unit='coulomb', + description=''' + Value of the total charge, calculated with the method described in XC_method. + ''', + a_legacy=LegacyDefinition(name='charge_total')) + electronic_kinetic_energy = Quantity( type=np.dtype(np.float64), shape=[], @@ -5211,6 +5240,18 @@ class section_system(MSection): categories=[configuration_core], a_legacy=LegacyDefinition(name='lattice_vectors')) + lattice_vectors_reciprocal = Quantity( + type=np.dtype(np.float64), + shape=[3, 3], + unit='1/meter', + description=''' + Reciprocal lattice vectors (in Cartesian coordinates) of the simulation cell. The + first index runs over the $x,y,z$ Cartesian coordinates, and the second index runs + over the 3 lattice vectors. 
+ ''', + categories=[configuration_core], + a_legacy=LegacyDefinition(name='lattice_vectors_reciprocal')) + local_rotations = Quantity( type=np.dtype(np.float64), shape=['number_of_atoms', 3, 3], diff --git a/nomad/parsing/file_parser/README.md b/nomad/parsing/file_parser/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fb91896971e25d76d4521051695e4fc7c75940b5 --- /dev/null +++ b/nomad/parsing/file_parser/README.md @@ -0,0 +1,146 @@ +# NOMAD file parsing module + +The parsing module consists of the `UnstructuredTextFileParser`, `DataTextFileParser` +and `XMLParser` classes to enable the parsing of unstructured text, structured data text, +and xml files, respectively. These classes are based on the FileParser class which +provides the common methods for file handling, and querying the parsed results. + +## UnstructuredTextFileParser + +The most common type of file that are parsed in NOMAD are unstructured text files which +can be handled using the UnstructuredTextFileParser. The parser uses the `re` module to +match a given pattern for a quantity in the text file. To illustrate the use of this parser, +let us consider a file `super_code.out` with the following contents: + +``` +2020/05/15 + *** super_code v2 *** + +system 1 +-------- +sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0) +latice: (0, 0, 0), (1, 0, 0), (1, 1, 0) +energy: 1.29372 + +*** This was done with magic source *** +*** x°42 *** + + +system 2 +-------- +sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0) +cell: (0, 0, 0), (1, 0, 0), (1, 1, 0) +energy: 1.29372 +``` + +In order to create a nomad archive from this file, we first have to parse the necessary +quantities which includes the date, system, energy, etc. The following python code +illustrates how can this be achieved. Note that we will be using *parser* to refer to the +file parser and to the code parser that writes the archive. 
+ +```python +import datetime +import numpy as np + +from nomad.parsing.file_parser import UnstructuredTextFileParser, Quantity +from nomad.datamodel import EntryArchive +from nomad.datamodel.metainfo.public import section_run, section_system, section_single_configuration_calculation + +p = UnstructuredTextFileParser() + +def str_to_sites(string): + sym, pos = string.split('(') + pos = np.array(pos.split(')')[0].split(',')[:3], dtype=float) + return sym, pos + +q_system = Quantity( + 'system', r'\s*system \d+([\s\S]+?energy: [\d\.]+)([\s\S]+\*\*\*)*', + sub_parser=UnstructuredTextFileParser(quantities=[ + Quantity( + 'sites', r'([A-Z]\([\d\.\, \-]+\))', + str_operation=str_to_sites), + Quantity( + section_system.lattice_vectors, + r'(?:latice|cell): \((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*', + repeats=False), + Quantity( + 'energy', r'energy: (\d\.\d+)'), + Quantity( + 'magic_source', r'done with magic source\s*\*{3}\s*\*{3}\s*([\S]+)', + repeats=False)]), + repeats=True) + +quantities = [ + Quantity('date', r'(\d\d\d\d\/\d\d\/\d\d)', repeats=False), + Quantity('program_version', r'super\_code\s*v(\d+)\s*', repeats=False), + q_system] + +p.quantities = quantities +# this returns the energy for system 2 +p.system[1].get('energy', unit='hartree') +``` + +The quantities to be parsed can be specified as a list of `Quantity` objects with a name +and a re pattern. The matched value should be enclosed in a group(s). By default, +the parser uses the findall method of `re`, hence overlap +between matches is not tolerated. If overlap cannot be avoided, one should switch to the +finditer method by passing *findall=False* to the parser. Multiple +matches for the quantity are returned if *repeats=True* (default). The name, data type, +shape and unit for the quantity can also intialized by passing a metainfo Quantity. 
+An external function *str_operation* can be also be passed to perform more specific +string operations on the matched value. A local parsing on a matched block can be carried +out by nesting a *sub_parser*. This is also an instance of the `UnstructuredTextFileParser` +with a list of quantities to parse. To access a parsed quantity, one can use the *get* +method. + +The creation of the archive is implemented in the parse method of the code parser which takes +the mainfile, archive and logger as arguments. The file parser, *out_parser* is +created only in the constructor and subsequent parsing on a different *mainfile* can be +performed by assigning it to the file parser. + +```python +class SupercodeParser: + def __init__(self): + self.out_parser = UnstructuredTextFileParser() + self.out_parser.quantities = quantities + + def parse(self, mainfile, archive, logger): + self.out_parser.mainfile = mainfile + sec_run = archive.m_create(section_run) + sec_run.program_name = 'super_code' + sec_run.program_version = str(self.out_parser.get('program_version')) + date = datetime.datetime.strptime( + self.out_parser.get('date'), '%Y/%m/%d') - datetime.datetime(1970, 1, 1) + sec_run.program_compilation_datetime = date.total_seconds() + for system in self.out_parser.get('system'): + sec_system = sec_run.m_create(section_system) + sec_system.lattice_vectors = system.get('lattice_vectors') + sites = system.get('sites') + sec_system.atom_labels = [site[0] for site in sites] + sec_system.atom_positions = [site[1] for site in sites] + + sec_scc = sec_run.m_create(section_single_configuration_calculation) + sec_scc.energy_total = system.get('energy') + sec_scc.single_configuration_calculation_to_system_ref = sec_system + magic_source = system.get('magic_source') + if magic_source is not None: + sec_scc.message_info_evaluation = magic_source + +archive = EntryArchive() + +parser = SupercodeParser() +parser.parse('temp.dat', archive, None) + +print(archive.m_to_json()) +``` + +## 
DataTextFileParser +The `DataTextFileParser` uses the numpy.loadtxt function to load an structured data file. +The loaded data can be accessed from property *data*. + +## XMLParser +The `XMLParser` uses the ElementTree module to parse an xml file. The parse method of the +parser takes in an xpath style key to access individual quantities. By default, automatic +data type conversion is performed, which can be switched off by setting *convert=False*. + + diff --git a/nomad/parsing/file_parser/__init__.py b/nomad/parsing/file_parser/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2e4a255a8c849ed9719deadb58bb6dd89996539b --- /dev/null +++ b/nomad/parsing/file_parser/__init__.py @@ -0,0 +1,3 @@ +from .file_parser import FileParser +from .text_parser import UnstructuredTextFileParser, DataTextFileParser, Quantity, ParsePattern +from .xml_parser import XMLParser diff --git a/nomad/parsing/file_parser/file_parser.py b/nomad/parsing/file_parser/file_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..2da0d69c540ad185e16660ef0ced0fda289c831e --- /dev/null +++ b/nomad/parsing/file_parser/file_parser.py @@ -0,0 +1,112 @@ +# Copyright 2018 Markus Scheidgen +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an"AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import logging +import pint +from typing import Any, Dict + + +class FileParser: + ''' + Base class for parsers. 
# Reconstructed from a newline-mangled diff hunk: new file
# nomad/parsing/file_parser/file_parser.py. Formatting restored; behavioral
# fixes are marked with NOTE(review).

class FileParser:
    '''
    Base class for file parsers.

    The generic ``parse`` implemented here simply sets the parsed quantities as
    attributes of the instance; parsing specific to a file type is implemented
    in the corresponding subclass. Parsed quantities are cached in ``results``
    and accessed through ``get``.

    Arguments:
        mainfile: the file to be parsed
        logger: optional logger; defaults to the ``logging`` module
    '''
    def __init__(self, mainfile: str, logger=None):
        self._mainfile: str = os.path.abspath(mainfile) if mainfile else mainfile
        self.logger = logger if logger else logging
        self._results: Dict[str, Any] = None
        # a key is necessary for xml parsers, where parsing is done dynamically
        self._key: str = None
        # NOTE(review): originally initialized to None, which made
        # ``self.parse(self._key, **self._kwargs)`` in ``results`` raise a
        # TypeError before ``get`` was ever called; use an empty dict.
        self._kwargs: Dict[str, Any] = {}
        self._file_handler: Any = None

    @property
    def results(self):
        # Lazily trigger parsing for the currently requested key.
        if self._results is None:
            self._results = dict()
        if self._key not in self._results:
            self.parse(self._key, **self._kwargs)
        return self._results

    @property
    def maindir(self):
        # Directory containing the mainfile.
        return os.path.dirname(self._mainfile)

    @property
    def mainfile(self):
        # Returns None when no file was set or the file does not exist on disk.
        if self._mainfile is None:
            return
        if not os.path.isfile(self._mainfile):
            return
        return self._mainfile

    @mainfile.setter
    def mainfile(self, val):
        # Changing the file invalidates all cached state.
        self._results = None
        self._file_handler = None
        self._mainfile = os.path.abspath(val) if val is not None else val

    def get(self, key: str, default: Any = None, unit: str = None, **kwargs):
        '''
        Returns the parsed result for the quantity named ``key``; ``default`` is
        returned when the quantity is not in the results. A pint unit (name or
        ``pint.Quantity``) can be provided and is attached to the returned
        value.
        '''
        self._key = key
        self._kwargs = kwargs
        val = self.results.get(key, None)
        if val is None:
            val = default

        if val is None:
            return

        if unit is not None:
            # NOTE(review): pint is imported lazily so the class stays usable
            # (without units) where pint is not installed.
            import pint
            if isinstance(unit, pint.Quantity):
                val = val * unit
            elif isinstance(val, pint.Quantity):
                val = val.to(unit)
            else:
                val = pint.Quantity(val, unit)

        return val

    def __getitem__(self, key):
        if isinstance(key, str):
            return self.get(key)
        elif isinstance(key, int):
            # NOTE(review): the original read ``return self[int]`` — indexing
            # with the ``int`` *type*, which fell through to None on the second
            # __getitem__ call. Integer indexing is not defined for the base
            # class, so return None explicitly.
            return None

    def parse(self, quantity_key: str = None):
        '''
        Sets the quantities in ``results`` as attributes of the instance.
        Subclasses override this with the actual file parsing.
        '''
        for key, val in self._results.items():
            try:
                setattr(self, key, val)
            except Exception:
                pass
        return self
# Reconstructed from a newline-mangled diff hunk: new file
# nomad/parsing/file_parser/text_parser.py (module imports, ParsePattern,
# Quantity). Formatting restored; fixes are marked with NOTE(review).

import logging
import mmap
import re
import numpy as np
import pint
from typing import List, Union, Callable, Type

from nomad.parsing.file_parser import FileParser
from nomad.metainfo import Quantity as mQuantity


class ParsePattern:
    '''
    Builds an re pattern of the form ``head ... key := (value) tail``.

    ``value`` may be given literally or as one of the ``re_*`` shorthands
    (e.g. ``re_float_array``) which expand to a character class covering
    floats/ints/strings, optionally space-separated arrays.
    '''
    def __init__(self, **kwargs):
        self._head = kwargs.get('head', '')
        self._key = kwargs.get('key', '')
        value = kwargs.get('value', 're_float_array')
        if value.startswith('re_'):
            token = ''
            if 'float' in value:
                token += r'Ee\+\d\.\-'
            if 'int' in value:
                token += r'\d'
            if 'string' in value:
                token += r'\w'
            if 'array' in value:
                token += r' '
            value = r'[%s]+' % token
        self._value = value
        self._tail = kwargs.get('tail', '\n')
        self._re_pattern = None

    @property
    def re_pattern(self):
        # Assembled lazily and kept as a *string*: Quantity consumes it as such
        # and compiles it (to a bytes pattern) itself.
        if self._re_pattern is None:
            head = r'%s[\s\S]*?' % self._head if self._head else ''
            key = r'%s\s*\:*\=*\s*' % self._key if self._key else ''
            self._re_pattern = r'%s%s\s*\:*\=*\s*(%s)%s' % (
                head, key, self._value, self._tail)
        return self._re_pattern

    def __call__(self, text, repeats=True):
        '''
        Matches the pattern against ``text`` and returns ``(values, units)``.

        NOTE(review): the original body called ``.finditer`` on the *string*
        pattern (AttributeError) and never returned the collected lists; fixed
        here by compiling the pattern (as bytes when ``text`` is bytes) and
        returning the results.
        '''
        def _decode(group):
            return group.decode() if isinstance(group, bytes) else group

        pattern = re.compile(
            self.re_pattern if isinstance(text, str) else self.re_pattern.encode())

        if repeats:
            matches = list(pattern.finditer(text))
        else:
            res = pattern.search(text)
            matches = [res] if res is not None else []

        values = []
        units = []
        for res in matches:
            unit = res.groupdict().get('__unit', None)
            values.append(''.join(
                [_decode(group) for group in res.groups() if group and group != unit]))
            units.append(_decode(unit) if unit is not None else None)
        return values, units


class Quantity:
    '''
    Defines a quantity to be parsed by UnstructuredTextFileParser.

    Arguments:
        quantity: string to identify the name or a metainfo quantity to
            initialize the quantity object.
        re_pattern: pattern to be used by re for matching. Ideally, overlaps
            among quantities for a given parser should be avoided.
        sub_parser: instance of UnstructuredTextFileParser to perform local
            parsing within a matched block
        str_operation: external function to be performed on a matched block
        dtype: data type of the quantity
        unit: unit of the quantity
        shape: shape of the quantity
        repeats: denotes if multiple matches are expected
        convert: switch automatic data type conversion
        comment: character to denote a line to be ignored
    '''
    def __init__(self, quantity: Union[str, mQuantity],
                 re_pattern: Union[str, ParsePattern], **kwargs):
        self.name: str
        self.dtype: str
        self.unit: str
        self.shape: List[int]
        if isinstance(quantity, str):
            self.name = quantity
            self.dtype = None
            self.unit = None
            self.shape = None
        elif isinstance(quantity, mQuantity):
            self.name = quantity.name
            self.dtype = quantity.type
            self.unit = quantity.unit
            # check if metainfo shape has (symbolic) dependencies; if so the
            # shape cannot be used for reshaping and is dropped
            self.shape = quantity.shape
            if False in [str(i).isdigit() for i in self.shape]:
                self.shape = None
        # explicit keyword arguments override the metainfo defaults
        self.dtype = kwargs.get('dtype', self.dtype)
        self.unit = kwargs.get('unit', self.unit)
        self.shape = kwargs.get('shape', self.shape)

        self._re_pattern: str = re_pattern.re_pattern if isinstance(
            re_pattern, ParsePattern) else re_pattern
        self._str_operation: Callable = kwargs.get('str_operation', None)
        self._sub_parser: 'UnstructuredTextFileParser' = kwargs.get('sub_parser', None)
        self.repeats: bool = kwargs.get('repeats', True)
        self.convert: bool = kwargs.get('convert', True)
        self.comment: str = kwargs.get('comment', None)

    @property
    def re_pattern(self):
        '''
        Returns the pattern compiled (lazily) to a bytes regex; the ``__unit``
        placeholder is renamed per quantity so alternations can be merged.
        '''
        if isinstance(self._re_pattern, str):
            re_pattern = self._re_pattern.replace('__unit', '__unit_%s' % self.name)
            self._re_pattern = re.compile(re_pattern.encode())
        return self._re_pattern

    @re_pattern.setter
    def re_pattern(self, val: str):
        self._re_pattern = val

    @property
    def str_operation(self):
        return self._str_operation

    @str_operation.setter
    def str_operation(self, val: Callable):
        self._str_operation = val

    def to_data(self, val_in: List[str]):
        '''
        Converts the matched strings in ``val_in`` into (typed) data, applying
        ``str_operation``, automatic type conversion and reshaping.
        '''
        def process(val):
            # lines starting with the comment character are skipped entirely
            if self.comment is not None:
                if val.strip()[0] == self.comment:
                    return

            if self.str_operation is not None:
                val = self.str_operation(val)
            else:
                val = val.strip().split() if isinstance(val, str) else val
                val = val[0] if len(val) == 1 else val

            def _convert(val):
                if isinstance(val, str):
                    if self.dtype is None:
                        if val.isdecimal():
                            val = int(val)
                        else:
                            try:
                                val = float(val)
                            except Exception:
                                pass
                    self.shape = [] if self.shape is None else self.shape
                    return val

                elif type(val) in [np.ndarray, list]:
                    try:
                        dtype = float if self.dtype is None else self.dtype
                        val_test = np.array(val, dtype=dtype)
                        if self.dtype is None:
                            # all-integral float arrays are narrowed to int
                            if np.all(np.mod(val_test, 1) == 0):
                                val_test = np.array(val_test, dtype=int)
                        self.shape = list(np.shape(val)) if self.shape is None else self.shape
                        val = val_test
                    except Exception:
                        val = [_convert(v) for v in val]
                    return val

                elif isinstance(val, dict):
                    for k, v in val.items():
                        self.dtype = None
                        val[k] = _convert(v)
                    return val

                else:
                    self.dtype = type(val)
                    self.shape = [] if self.shape is None else self.shape
                    return val

            if self.convert:
                val = _convert(val)

            if isinstance(val, np.ndarray) and self.shape:
                val = np.reshape(val, self.shape)

            return val

        val_out = [process(val) for val in val_in]

        if isinstance(val_out[0], np.ndarray):
            self.dtype = val_out[0].dtype

        return val_out
# Continuation of nomad/parsing/file_parser/text_parser.py, reconstructed from
# the newline-mangled diff: DataTextFileParser and UnstructuredTextFileParser.
# Fixes are marked with NOTE(review).

class DataTextFileParser(FileParser):
    '''
    Parser for structured data text files using numpy.loadtxt.

    Arguments:
        mainfile: the file to be parsed
        dtype: data type passed to numpy.loadtxt (default: float)
        logger: optional logger
    '''
    def __init__(self, **kwargs):
        self._dtype: Type = kwargs.get('dtype', float)
        mainfile: str = kwargs.get('mainfile', None)
        logger = kwargs.get('logger', None)
        logger = logger if logger is not None else logging
        super().__init__(mainfile, logger=logger)
        self.init_parameters()

    def init_parameters(self):
        '''
        Hook called after the data is (re)loaded; default does nothing.
        '''
        pass

    @property
    def data(self):
        '''
        Returns the loaded data, loading and caching it on first access.
        '''
        if self._file_handler is None:
            if self.mainfile is None:
                return
            try:
                self._file_handler = np.loadtxt(self.mainfile, dtype=self._dtype)
            except Exception:
                return
            self.init_parameters()
        return self._file_handler


class UnstructuredTextFileParser(FileParser):
    '''
    Parser for unstructured text files using the re module. The quantities to be
    parsed are given as a list of Quantity objects which specify the re
    patterns. The mmap module is used to handle the file. By default re.findall
    is used to get matches for performance reasons; in this mode overlap between
    patterns is not tolerated. To avoid this, set findall=False to switch to
    re.finditer.

    Arguments:
        mainfile: the file to be parsed
        quantities: list of Quantity objects to be parsed
        logger: optional logger
        findall: switches between using re.findall and re.finditer
        file_offset: offset in reading the file
        file_length: length of the chunk to be read from the file
    '''
    def __init__(self, mainfile=None, quantities=None, logger=None, findall=True, **kwargs):
        super().__init__(mainfile, logger)
        self._quantities: List[Quantity] = quantities
        self.findall: bool = findall
        self._kwargs = kwargs
        self._file_length: int = kwargs.get('file_length', 0)
        self._file_offset: int = kwargs.get('file_offset', 0)
        self._file_pad: int = 0
        if quantities is None:
            self.init_quantities()

    def copy(self):
        '''
        Returns a copy of the object excluding the parsed results.
        '''
        return UnstructuredTextFileParser(
            self.mainfile, self.quantities, self.logger, **self._kwargs)

    def init_quantities(self):
        '''
        Initializes the quantities list; subclasses override this.
        '''
        self._quantities = []

    @property
    def quantities(self):
        return self._quantities

    @quantities.setter
    def quantities(self, val):
        self._quantities = val

    @property
    def file_offset(self):
        '''
        Integer offset in loading the file, taking mmap pagination into account.
        '''
        return self._file_offset

    @file_offset.setter
    def file_offset(self, val):
        # mmap offsets must be page-aligned; the misalignment is remembered as
        # padding that gets blanked out after mapping.
        self._file_pad = val % mmap.PAGESIZE
        self._file_offset = (val // mmap.PAGESIZE) * mmap.PAGESIZE

    @property
    def file_length(self):
        '''
        Length of the file chunk to be loaded.
        '''
        return self._file_length

    @file_length.setter
    def file_length(self, val):
        self._file_length = val

    @property
    def file_mmap(self):
        '''
        Memory mapped representation of the file (ACCESS_COPY, so writes never
        touch the file on disk).
        '''
        if self._file_handler is None:
            # NOTE(review): guard added — the original called open(None) when no
            # mainfile was set.
            if self.mainfile is None:
                return
            # NOTE(review): opened in binary mode; mmap-ing a text-mode handle
            # is platform dependent. Downstream regexes are bytes patterns.
            with open(self.mainfile, 'rb') as f:
                self._file_handler = mmap.mmap(
                    f.fileno(), self._file_length, access=mmap.ACCESS_COPY,
                    offset=self._file_offset)
            # set the extra chunk loaded before the intended offset to empty
            self._file_handler[:self._file_pad] = b' ' * self._file_pad
            self._file_pad = 0
        return self._file_handler

    def keys(self):
        '''
        Returns all the quantity names.
        '''
        return [quantity.name for quantity in self.quantities]

    def items(self):
        '''
        Returns an iterable of (name, value) of the parsed quantities.
        '''
        for key in self.keys():
            yield key, self.get(key)

    def _parse_quantities(self, quantities):
        # Combine all patterns into one alternation so a single re.findall pass
        # covers every quantity; matches are mapped back by group index.
        re_findall = '|'.join([q.re_pattern.pattern.decode() for q in quantities])
        if len(quantities) == 1:
            # necessary to add a dummy variable to make multiple matches
            re_findall = '%s|(__dummy__)' % re_findall
        re_findall = re_findall.encode()

        # map matches to quantities
        matches = re.findall(re_findall, self.file_mmap)
        current_index = 0
        for i in range(len(quantities)):
            values = []
            units = []
            n_groups = quantities[i].re_pattern.groups

            non_empty_matches = []
            for match in matches:
                non_empty_match = [
                    m for m in match[current_index: current_index + n_groups] if m]
                if not non_empty_match:
                    continue
                non_empty_matches.append(non_empty_match)
            index_unit = quantities[i].re_pattern.groupindex.get(
                '__unit_%s' % quantities[i].name, None)
            for non_empty_match in non_empty_matches:
                if index_unit is not None:
                    unit = non_empty_match.pop(index_unit - 1)
                    units.append(unit.decode())
                else:
                    units.append(None)
                values.append(' '.join([m.decode() for m in non_empty_match]))

            current_index += n_groups

            if not values:
                continue

            try:
                value_processed = quantities[i].to_data(values)
                for j in range(len(value_processed)):
                    unit = units[j] if units[j] else quantities[i].unit
                    if not unit:
                        continue
                    value_processed[j] = pint.Quantity(value_processed[j], unit)

                if not quantities[i].repeats and value_processed:
                    value_processed = value_processed[0]

                self._results[quantities[i].name] = value_processed
            except Exception:
                self.logger.warning('Error setting value for %s ' % quantities[i].name)

    def _parse_with_sub_parser(self, quantity, res):
        # Runs the quantity's sub parser on the span matched by res and returns
        # the resulting parser object. Blocks smaller than a page cannot be
        # mmapped at an offset, so the matched string is parsed directly.
        span = np.array(res.span()) + self.file_offset
        sub_parser = quantity._sub_parser.copy()
        sub_parser.mainfile = self.mainfile
        if (span[1] - span[0]) < mmap.PAGESIZE:
            sub_parser._file_handler = b' '.join([g for g in res.groups() if g])
        else:
            sub_parser.file_offset = span[0]
            sub_parser.file_length = span[1] - sub_parser.file_offset
        return sub_parser.parse()

    def _parse_quantity(self, quantity):
        # finditer/search-based parsing for a single quantity; used for
        # quantities with sub parsers and as fallback when findall is disabled.
        value = []
        units = []
        if not quantity.repeats:
            res = quantity.re_pattern.search(self.file_mmap)
            if res is not None:
                if quantity._sub_parser is not None:
                    value.append(self._parse_with_sub_parser(quantity, res))
                else:
                    unit = res.groupdict().get('__unit_%s' % quantity.name, None)
                    units.append(unit.decode() if unit is not None else None)
                    value.append(''.join(
                        [group.decode() for group in res.groups()
                         if group and group != unit]))
        else:
            for res in quantity.re_pattern.finditer(self.file_mmap):
                if quantity._sub_parser is not None:
                    value.append(self._parse_with_sub_parser(quantity, res))
                else:
                    unit = res.groupdict().get('__unit_%s' % quantity.name, None)
                    value.append(''.join(
                        [group.decode() for group in res.groups()
                         if group and group != unit]))
                    units.append(unit.decode() if unit is not None else None)

        if not value:
            return

        if quantity._sub_parser is not None:
            self._results[quantity.name] = value if quantity.repeats else value[0]
        else:
            try:
                value_processed = quantity.to_data(value)
                for i in range(len(value_processed)):
                    unit = units[i] if units[i] else quantity.unit
                    if not unit:
                        continue
                    value_processed[i] = pint.Quantity(value_processed[i], unit)

                if not quantity.repeats and value_processed:
                    value_processed = value_processed[0]

                self._results[quantity.name] = value_processed
            except Exception:
                self.logger.warning('Error setting value for %s ' % quantity.name)

    def parse(self, key=None):
        '''
        Triggers parsing of all quantities if key is not provided, otherwise
        only the quantity named ``key``.
        '''
        if self._results is None:
            self._results = dict()

        if self.file_mmap is None:
            return self

        if self.findall:
            if len(self._results) > 1:
                return self

            n_results = 0
            while True:
                quantities_findall = [
                    q for q in self.quantities
                    if q.name not in self._results and q._sub_parser is None]
                if not quantities_findall:
                    break

                # iterate until no new quantities can be resolved
                self._parse_quantities(quantities_findall)

                if n_results == len(self._results):
                    break
                n_results = len(self._results)

            # quantities with sub parsers need individual finditer passes
            for quantity in self._quantities:
                if quantity._sub_parser is not None:
                    self._parse_quantity(quantity)
        else:
            for quantity in self._quantities:
                if quantity.name == key or key is None:
                    if quantity.name not in self._results:
                        self._parse_quantity(quantity)

        super().parse()
        return self
# Reconstructed from a newline-mangled diff hunk: new file
# nomad/parsing/file_parser/xml_parser.py. The same mangled span also carried
# the tail of UnstructuredTextFileParser.parse (text_parser.py) and the start
# of a diff to nomad/parsing/parsers.py that registers GromacsParser() in
# place of the legacy Gromacs LegacyParser; those are not reproducible as
# stand-alone code here. Fixes are marked with NOTE(review).

import os
import re
import numpy as np
from xml.etree import ElementTree

from nomad.parsing.file_parser import FileParser


class XMLParser(FileParser):
    '''
    Parser for XML files using ElementTree.

    Arguments:
        mainfile: the file to be parsed
        logger: logger
        convert: specifies if quantities are converted automatically
    '''
    def __init__(self, mainfile: str = None, logger=None, convert: bool = True):
        super().__init__(mainfile, logger=logger)
        self.convert = convert
        self.init_parameters()

    def init_parameters(self):
        '''
        Method to call after loading the xml file.
        '''
        self._elements = None

    @property
    def root(self):
        '''
        Returns the root of the XML tree, loading the file on first access.
        '''
        if self._file_handler is None:
            if self.mainfile is None:
                return
            self._file_handler = ElementTree.parse(self.mainfile).getroot()
            self.init_parameters()
        return self._file_handler

    @property
    def elements(self):
        '''
        Returns a list of all elements in the XML tree.
        '''
        if self._elements is None:
            self._elements = self.root.findall('.//')
        return self._elements

    def parse(self, key, convert=None):
        '''
        Parse a quantity identified by key or an xpath-style path. Automatic
        conversion can be switched off by setting convert to False.
        '''
        _convert = convert if convert is not None else self.convert
        if self._results is None:
            self._results = dict()

        # NOTE(review): the original tested ``if not self.root`` — an Element
        # with no children is falsy, so a childless root aborted parsing;
        # compare against None instead.
        if self.root is None:
            return

        key_in = key
        key = key.lstrip('/')
        if key.find('/') > 0:
            parent = os.path.dirname(key)
            child = os.path.basename(key)
            elements = self.root.findall(os.path.join('./', parent))
        else:
            elements = self.elements
            child = key

        val = []
        for element in elements:
            if child:
                # try the attribute first, then matching child elements' text
                v = element.attrib.get(child, None)
                if v is None:
                    v = element.findall(child)
                    v = [e.text for e in v]
                if v:
                    val.append(v)
            else:
                val.append(element.attrib)

        if not val:
            return

        def convert_value(val_in):
            if isinstance(val_in, dict):
                val = dict()
                for k, v in val_in.items():
                    val[k] = convert_value(v)
                # NOTE(review): the original ended this branch with
                # ``val = val_in``, discarding the converted dict; that line is
                # dropped so the converted values are actually returned.

            elif isinstance(val_in, str):
                val = val_in.strip().split()
                if len(val) == 1:
                    val = val[0]
                    # Fortran-style exponential formatting: 1.0d-3 -> 1.0e-3
                    re_float = r'(\d+\.\d+)d(\-\d+)'
                    val = re.sub(re_float, r'\1e\2', val)
                    if val.isdecimal():
                        val = int(val)
                    elif val == 'true' or val == 'false':
                        val = val == 'true'
                    else:
                        try:
                            val = float(val)
                        except Exception:
                            pass
                else:
                    val = [convert_value(v) for v in val]

            elif isinstance(val_in, list):
                try:
                    val = [v.split() if isinstance(v, str) else v for v in val_in]
                    val = [v[0] if (isinstance(v, list) and len(v) == 1) else v for v in val]
                    val = np.array(val, dtype=float)
                    # all-integral float arrays are narrowed to int
                    if np.all(np.mod(val, 1) == 0):
                        val = np.array(val, dtype=int)
                except Exception:
                    val = [convert_value(v) for v in val_in]

            return val

        if _convert:
            val = convert_value(val)

        val = val[0] if len(val) == 1 else val

        self._results[key_in] = val
from elasticparser import ElasticParser from lammpsparser import LammpsParser +from gromacsparser import GromacsParser try: # these packages are not available without parsing extra, which is ok, if the @@ -388,11 +389,7 @@ parsers = [ parser_class_name='amberparser.AMBERParser', mainfile_contents_re=r'\s*Amber\s[0-9]+\s[A-Z]+\s*[0-9]+' ), - LegacyParser( - name='parsers/gromacs', code_name='Gromacs', domain='dft', - parser_class_name='gromacsparser.GROMACSParser', - mainfile_contents_re=r'GROMACS - gmx mdrun' - ), + GromacsParser(), LegacyParser( name='parsers/gromos', code_name='Gromos', domain='dft', parser_class_name='gromosparser.GromosParser', diff --git a/nomad/parsing/text_parser.py b/nomad/parsing/text_parser.py deleted file mode 100644 index 0eecd0ab433c0174eafc01b6ad8b92572190125b..0000000000000000000000000000000000000000 --- a/nomad/parsing/text_parser.py +++ /dev/null @@ -1,176 +0,0 @@ -# -# Copyright The NOMAD Authors. -# -# This file is part of NOMAD. See https://nomad-lab.eu for further info. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import numpy as np
import re
import mmap
import logging


class Quantity:
    '''
    A quantity to be parsed from a text file, defined by a name and a regular
    expression pattern. Assigning `value` converts the raw matched strings to
    python/numpy types, recording the resulting dtype and shape.

    Arguments:
        name: name of the quantity
        re_pattern: regular expression used to match the quantity; compiled on
            bytes because matching happens on a memory-mapped file
        str_operation: optional callable applied to each raw string match
        unit: optional unit of the quantity
        dtype: optional target dtype; inferred from the data when None
        comment: optional comment character; matches starting with it are skipped
    '''
    def __init__(self, name, re_pattern, str_operation=None, unit=None, dtype=None, comment=None):
        self.name = name
        self.re_pattern = re.compile(re_pattern.encode())
        self.unit = unit
        self.shape = None
        self.dtype = dtype
        self._value = None
        self._str_operation = str_operation
        self._comment = comment

    @property
    def value(self):
        # converted value(s); None until assigned
        return self._value

    @value.setter
    def value(self, val_in):

        def convert_value(val):
            if self._comment is not None:
                stripped = val.strip()
                # BUGFIX: guard against empty matches before indexing;
                # previously an empty string raised IndexError here
                if stripped and stripped[0] == self._comment:
                    return

            if self._str_operation is not None:
                val = self._str_operation(val)

            else:
                val = val.strip().split() if isinstance(val, str) else val
                val = val[0] if len(val) == 1 else val

            def _convert(val):
                # recursively convert strings/lists to int, float or numpy
                # arrays, recording dtype and shape on the quantity
                if isinstance(val, str):
                    if self.dtype is None:
                        if val.isdecimal():
                            val = int(val)
                        else:
                            try:
                                val = float(val)
                            except Exception:
                                pass

                    self.shape = []
                    return val

                elif type(val) in [np.ndarray, list]:
                    try:
                        dtype = float if self.dtype is None else self.dtype
                        val = np.array(val, dtype=dtype)
                        self.dtype = dtype
                        # demote float arrays of whole numbers to int arrays
                        if np.all(np.mod(val, 1) == 0):
                            val = np.array(val, dtype=int)
                            self.dtype = int
                        self.shape = list(np.shape(val))

                    except Exception:
                        # inhomogeneous data: convert element-wise
                        val = [_convert(v) for v in val]
                        self.dtype = None

                    return val

                else:
                    self.dtype = type(val)
                    self.shape = []
                    return val

            val = _convert(val)

            return val

        self._value = None if val_in is None else [convert_value(val) for val in val_in]

    def to_SI(self):
        # unit conversion hook, not yet implemented
        pass


class UnstructuredTextFileParser:
    '''
    Parser for unstructured text files. Quantities are parsed by matching their
    regular expression patterns against a memory-map of the main file.

    Arguments:
        mainfile: path to the file to be parsed
        quantities: list of Quantity objects to be parsed
        logger: optional logger; defaults to the logging module
    '''
    def __init__(self, mainfile, quantities, logger=None):
        self._mainfile = mainfile
        self.quantities = quantities
        self._file_mmap = None
        self.logger = logger if logger else logging

    @property
    def quantities(self):
        return self._quantities

    @quantities.setter
    def quantities(self, val):
        self._quantities = val
        # name -> index mapping for O(1) lookup by quantity name
        self.quantities_mapping = {val[idx].name: idx for idx in range(len(val))}

    @property
    def file_mmap(self):
        # lazily memory-map the main file; opened in binary mode to match the
        # byte-compiled quantity patterns (previously text mode)
        if self._file_mmap is None:
            with open(self.mainfile, 'rb') as f:
                self._file_mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        return self._file_mmap

    @property
    def mainfile(self):
        return self._mainfile

    @mainfile.setter
    def mainfile(self, val):
        # switching files invalidates all previously parsed values
        for quantity in self.quantities:
            quantity.value = None

        self._file_mmap = None
        self._mainfile = val

    def __getitem__(self, key):
        idx = self.quantities_mapping.get(key, None)
        if idx is None:
            return

        value = self.quantities[idx].value
        if value is None:
            # parse on demand
            self.parse(key)

        return self.quantities[idx].value

    def __setitem__(self, key, val):
        idx = self.quantities_mapping.get(key, None)
        if idx is None:
            return

        self.quantities[idx].value = val

    def keys(self):
        return self.quantities_mapping.keys()

    def items(self):
        for key in self.keys():
            yield key, self[key]

    def parse(self, key=None):
        '''
        Parse the quantity named key, a list of names, or all quantities when
        key is None. Parsed values are stored on the Quantity objects.
        '''
        if isinstance(key, str):
            key = [key]

        for quantity in self.quantities:
            if key is None or quantity.name in key:
                value = []
                for res in quantity.re_pattern.finditer(self.file_mmap):
                    value.append(''.join([group.decode() for group in res.groups() if group]))
                if not value:
                    continue

                try:
                    quantity.value = value
                except Exception:
                    # BUGFIX: Logger.warn is a deprecated alias of warning
                    self.logger.warning('Error setting value for %s ' % quantity.name)
def test_system_classification(atom, molecule, one_d, two_d, surface, bulk):
    """Tests that the system classification is correct for different kind of systems
    """
    # each fixture archive must be classified as the expected system type;
    # the last (representative) system of the run is checked
    expected_types = [
        (atom, "atom"),
        (molecule, "molecule / cluster"),
        (one_d, "1D"),
        (two_d, "2D"),
        (surface, "surface"),
        (bulk, "bulk"),
    ]
    for archive, expected in expected_types:
        assert archive.section_run[0].section_system[-1].system_type == expected
import pytest
import numpy as np
import pint

from nomad.parsing.file_parser import (
    UnstructuredTextFileParser, Quantity, ParsePattern, XMLParser)
from nomad.datamodel.metainfo.public import section_system


class TestUnstructuredTextFileParser:
    '''Tests for the regex-based unstructured text file parser.'''

    @pytest.fixture(scope='class')
    def mainfile(self):
        return 'tests/data/parsers/exciting/Ag/INFO.OUT'

    @pytest.fixture(scope='class')
    def mainfile2(self):
        return 'tests/data/parsers/exciting/GW/INFO.OUT'

    @pytest.fixture(scope='class')
    def quantity_string(self):
        return {
            'quantity': Quantity('spin', r'Spin treatment\s*:\s*([\w\-]+)', repeats=False),
            'value': 'spin-unpolarised'}

    @pytest.fixture(scope='class')
    def quantity_float(self):
        return {
            'quantity': Quantity('cell_volume', r'Unit cell volume\s*:\s*([\d\.]+)', repeats=False),
            'value': 115.0293819379}

    @pytest.fixture(scope='class')
    def quantity_array(self):
        return {
            'quantity': Quantity(
                'lattice_vector',
                r'Lattice vectors \(cartesian\) :\s*([\d\s\.]+)', repeats=False, shape=(3, 3)),
            'value': np.array([
                [3.86005, 3.86005, 0], [3.86005, 0, 3.86005], [0, 3.86005, 3.86005]])}

    @pytest.fixture(scope='class')
    def quantity_repeats(self):
        return {
            'quantity': Quantity('total_energy', r'Total energy\s*:\s*([\d\.\-]+)'),
            'value': np.array([
                -5307.34855605, -5313.90710687, -5315.97055490, -5316.38701749,
                -5317.59994092, -5317.26163104, -5317.26791647, -5317.26750374,
                -5317.26724651, -5317.26725951, -5317.26726114, -5317.26726119,
                -5317.26726118])}

    @pytest.fixture(scope='class')
    def quantity_with_unit(self):
        seconds = [
            3.55, 5.32, 7.09, 8.84, 10.58, 12.33, 14.09, 15.84, 17.58,
            19.33, 21.09, 22.91]
        return {
            'quantity': Quantity('wall_time', r'Wall time \((?P<__unit>\w+)\)\s*:\s*([\d\.]+)'),
            'value': [pint.Quantity(v, 'seconds') for v in seconds]}

    @pytest.fixture(scope='function')
    def parser(self, mainfile):
        return UnstructuredTextFileParser(mainfile=mainfile)

    def test_mainfile_setter(self, parser, mainfile2):
        '''Assigning a new mainfile resets previously parsed results.'''
        parser.quantities = [Quantity(
            'time', r'Time \(hh:mm:ss\)\s*:\s*([\d:]+)', repeats=False)]
        assert parser.get('time') == '19:10:23'
        parser.mainfile = mainfile2
        assert parser.get('time') == '08:24:03'

    def test_constructor(self, mainfile, quantity_string):
        '''Subclasses can define their quantities via init_quantities.'''
        class TestParser(UnstructuredTextFileParser):
            def __init__(self, **kwargs):
                super().__init__(**kwargs)

            def init_quantities(self):
                self.quantities = [quantity_string['quantity']]

        parser = TestParser(mainfile=mainfile)
        assert parser.get(quantity_string['quantity'].name) == quantity_string['value']

    def test_copy(self, parser, quantity_string):
        '''A copied parser shares the setup but not the parsed results.'''
        parser.quantities = [quantity_string['quantity']]
        parser.parse()
        clone = parser.copy()
        assert clone.mainfile == parser.mainfile
        assert clone.quantities == parser.quantities
        assert clone._results != parser._results

    def test_findall(self, parser, quantity_string, quantity_float, quantity_repeats):
        '''findall and finditer modes must agree on the parsed values.'''
        fixtures = [quantity_string, quantity_float, quantity_repeats]
        parser.quantities = [f['quantity'] for f in fixtures]
        assert parser.findall
        spin = parser.get(quantity_string['quantity'].name)
        volume = parser.get(quantity_float['quantity'].name)
        energies = parser.get(quantity_repeats['quantity'].name)

        parser_finditer = parser.copy()
        parser_finditer.findall = False
        assert parser_finditer._results is None
        assert parser_finditer.get(quantity_string['quantity'].name) == spin
        assert parser_finditer.get(quantity_float['quantity'].name) == volume
        assert parser_finditer.get(quantity_repeats['quantity'].name) == energies

    def test_finditer(self, parser, quantity_string, quantity_float, quantity_with_unit):
        '''In finditer mode, quantities are parsed lazily one by one.'''
        fixtures = [quantity_string, quantity_float, quantity_with_unit]
        parser.quantities = [f['quantity'] for f in fixtures]
        parser.findall = False
        for count, q in enumerate(fixtures, start=1):
            assert parser.get(q['quantity'].name) == q['value']
            assert len(parser._results) == count

    def test_quantity_sub_parser(self, parser):
        '''Quantities can delegate their matched block to a sub parser.'''
        quantity_species = Quantity(
            'species', r'cies :\s*([\s\S]+?)(?:Spe|Total)', repeats=True,
            sub_parser=UnstructuredTextFileParser(quantities=[
                Quantity('name', r'name\s*:\s*(\w+)', repeats=False),
                Quantity('mass', r'atomic mass\s*:\s*([\d\.]+)', repeats=False)]))

        quantity_initialization = Quantity(
            'initialization',
            r'Starting initialization([\s\S]+?)Ending initialization', repeats=False,
            sub_parser=UnstructuredTextFileParser(quantities=[
                Quantity('k_point_grid', r'k\-point grid\s*:\s*([\d ]+)', repeats=False),
                quantity_species]))

        quantity_scf = Quantity(
            'scf', r'Self\-consistent loop started([\s\S]+?)Self\-consistent loop stopped',
            repeats=True, sub_parser=UnstructuredTextFileParser(quantities=[
                Quantity('iteration', r'SCF iteration number\s*:\s*(\d+)')]))

        parser.quantities = [
            quantity_initialization, quantity_scf, Quantity(
                'total_time', r'Total time spent \(seconds\)\s*:\s*([\d.]+)', repeats=False)]

        initialization = parser.get('initialization')
        assert (initialization.get('k_point_grid') == np.array([4, 4, 4])).all()
        species = initialization.get('species')
        assert len(species) == 1
        assert species[0].get('name') == 'silver'
        assert species[0].get('mass') == 196631.6997

        scf = parser.get('scf')
        assert len(scf) == 1
        assert len(scf[0].get('iteration')) == 12

        assert parser.get('total_time') == 22.4

    def test_block_short(self, parser, quantity_repeats):
        '''Each scf block yields exactly one energy through the sub parser.'''
        energy_quantity = quantity_repeats.get('quantity')
        parser.quantities = [Quantity(
            'scf', r'SCF iteration number\s*:\s*\d+([\s\S]+?)Wall time',
            repeats=True, sub_parser=UnstructuredTextFileParser(
                quantities=[energy_quantity]))]

        scf = parser.get('scf')
        assert len(scf) == 12
        energies = quantity_repeats.get('value')
        # total_energy repeats is deliberately set to True to confirm that
        # only one energy per scf block is read
        for block, energy in zip(scf, energies):
            assert block.get(energy_quantity.name) == [energy]

    def test_get_default(self, parser, quantity_float):
        '''get falls back to the provided default for unknown quantities.'''
        parser.quantities = [quantity_float['quantity']]
        assert parser.get('volume', 10.00) == 10.00

    def test_get_unit(self, parser, quantity_float):
        '''get can attach a unit to the parsed value.'''
        parser.quantities = [quantity_float['quantity']]
        volume = parser.get('cell_volume', unit='angstrom')
        assert isinstance(volume, pint.Quantity)
        assert volume.units == 'angstrom'
        assert volume.magnitude == quantity_float['value']

    def test_quantity_unit(self, parser, quantity_with_unit, quantity_repeats, quantity_string):
        '''Units captured via the __unit group are attached to the values.'''
        parser.quantities = [q['quantity'] for q in [
            quantity_with_unit, quantity_repeats, quantity_string]]
        for q in [quantity_with_unit, quantity_string, quantity_repeats]:
            equal = parser.get(q['quantity'].name) == q['value']
            assert equal.all() if isinstance(equal, np.ndarray) else equal

    def test_quantity_conversion(self, parser, quantity_float):
        '''convert=False leaves the raw matched string untouched.'''
        quantity = quantity_float.get('quantity')
        quantity.convert = False
        parser.quantities = [quantity]
        assert parser.get(quantity.name) == str(quantity_float.get('value'))

        quantity.convert = True
        parser = parser.copy()
        parser.quantities = [quantity]
        assert parser.get(quantity.name) == quantity_float.get('value')

    def test_quantity_parse_pattern(self, parser):
        '''Quantities can be specified with ParsePattern key/value templates.'''
        parser.quantities = [
            Quantity(
                'BZ_volume', ParsePattern(key='Brillouin zone volume', value='re_float'),
                repeats=False),
            Quantity(
                'n_crystal_symmetry', ParsePattern(
                    key='Number of crystal symmetries', value='re_int'), repeats=False),
            Quantity(
                'g_vector_size', ParsePattern(
                    key='G-vector grid sizes', value='re_int_array'), repeats=False),
            Quantity(
                'smearing', ParsePattern(
                    key='Smearing scheme', value='re_string'), repeats=False)]

        assert parser.get('BZ_volume') == 2.1564074262
        assert parser.get('n_crystal_symmetry') == 48
        assert (parser.get('g_vector_size') == np.array([24, 24, 24])).all()
        assert parser.get('smearing') == 'Gaussian'

    def test_quantity_str_operation(self, parser):
        '''A custom str_operation post-processes the matched block.'''
        def str_to_max_lm(string):
            entries = [line.split(':') for line in string.split('\n') if line]
            return {entry[0].strip(): entry[1] for entry in entries}

        parser.quantities = [Quantity(
            'max_lm',
            r'Maximum angular momentum used for([\s\S]+?inner part of muffin\-tin\s*:\s*\d+)',
            repeats=False, str_operation=str_to_max_lm)]

        max_lm = parser.get('max_lm')
        assert max_lm.get('APW functions') == 8
        assert max_lm.get('computing H and O matrix elements') == 8
        assert max_lm.get('potential and density') == 8
        assert max_lm.get('inner part of muffin-tin') == 2

    def test_quantity_repeats(self, parser, quantity_repeats, mainfile):
        '''repeats controls whether one or all matches are returned.'''
        quantity = quantity_repeats.get('quantity')
        parser.quantities = [quantity]
        assert isinstance(parser.get(quantity.name), list)
        quantity.repeats = False
        parser.mainfile = mainfile
        parser.quantities = [quantity]
        assert isinstance(parser.get(quantity.name), float)

    def test_quantity_unit_array(self, parser, quantity_array):
        '''A unit set on an array quantity wraps the whole array.'''
        quantity = quantity_array.get('quantity')
        quantity.unit = 'angstrom'
        parser.quantities = [quantity]
        lattice_vector = parser.get(quantity.name)
        assert isinstance(lattice_vector, pint.Quantity)
        assert lattice_vector.units == 'angstrom'
        assert (lattice_vector.magnitude == quantity_array.get('value')).all()

    def test_quantity_metainfo(self, parser):
        '''A metainfo definition can stand in for the quantity name.'''
        quantity = Quantity(
            section_system.lattice_vectors,
            r'Lattice vectors \(cartesian\) :\s*([\d\s\.]+)', repeats=False)

        parser.quantities = [quantity]
        lattice_vectors = parser.get(quantity.name)
        assert list(lattice_vectors.shape) == section_system.lattice_vectors.shape
        assert lattice_vectors.dtype == section_system.lattice_vectors.type


class TestXMLParser:
    '''Tests for the xpath-style XML file parser.'''

    @pytest.fixture(scope='class')
    def mainfile(self):
        return 'tests/data/parsers/vasp/vasp.xml'

    @pytest.fixture(scope='function')
    def parser(self, mainfile):
        return XMLParser(mainfile)

    def test_constructor(self, mainfile):
        '''The parser can be subclassed without changing its behavior.'''
        class TestParser(XMLParser):
            def __init__(self, **kwargs):
                super().__init__(**kwargs)

        test_parser = TestParser(mainfile=mainfile)

        incar = dict(zip(test_parser.get('incar/i/name'), test_parser.get('incar/i')))
        assert incar['SYSTEM'] == 'SrTiO3'
        assert incar['ISMEAR'] == 0
        assert incar['SIGMA'] == 0.1

    def test_parse(self, parser):
        '''Path queries return converted numpy arrays and lists.'''
        k_points = parser.get('kpoints/varray[1]/v')
        assert isinstance(k_points, np.ndarray)
        assert k_points.shape == (35, 3)

        weights = parser.get('kpoints/varray[2]/v')
        assert isinstance(weights, np.ndarray)
        assert weights.shape == (35,)

        sc_energies = parser.get('calculation/scstep/energy/i')
        assert len(sc_energies) == 13

    def test_conversion(self, parser, mainfile):
        '''Automatic type conversion can be disabled globally or per call.'''
        parser.convert = False

        assert parser.get('atominfo/atoms') == [' 5 ']
        assert parser.get('structure[1]/crystal[1]/varray[1]/v[1]') == [
            ' 4.00419668 0.00000000 0.00000000 ']

        parser.mainfile = mainfile
        assert parser.get('atominfo/atoms', convert=True) == 5
        parser.convert = True
        assert (parser.get('structure[1]/crystal[1]/varray[1]/v[1]') == np.array([
            4.00419668, 0.0, 0.0])).all()