Commit b1d6a2b3 authored by Markus Scheidgen's avatar Markus Scheidgen
Browse files

Merged file-parser into v0.9.8.

parent 5e225fbe
Pipeline #88359 passed with stages
in 23 minutes and 38 seconds
......@@ -46,6 +46,9 @@ contributing, and API reference.
Omitted versions are plain bugfix releases with only minor changes and fixes.
### v0.9.8
- A new library for parsing text-based raw files.
### v0.9.3
- Encyclopedia with dedicated materials search index.
......
Subproject commit c17dbecac0f395b48fc65b0c7dcc2ba4de1be271
Subproject commit 9d4518fefb1f1146b7fa7ed95547822683020533
Subproject commit 3f78e35b56714fe8c79b010ed52d97def19cb2dc
Subproject commit 3b5866bc8dff77d398eddc6baceb19e1c30180c1
Subproject commit 7ed44cab4410099208f5e399bfa4c1dc2e9c29fe
Subproject commit 8944d5ffd4cac01e43aaac5c5687b0a8831f2bc7
Subproject commit afdd0937aab2681ca3912cfd9d95c7633fdcd7b9
Subproject commit c1aca04237d69097bbeb17d3a397be66e9c6797f
......@@ -963,7 +963,7 @@ class section_method(public.section_method):
gw_frequency_number = Quantity(
type=np.dtype(np.int32),
shape=[],
shape=['gw_number_of_frequencies'],
description='''
Number referring to the frequency used in the calculation of the self energy.
''',
......@@ -971,7 +971,7 @@ class section_method(public.section_method):
gw_frequency_values = Quantity(
type=np.dtype(np.float64),
shape=[],
shape=['gw_number_of_frequencies'],
unit='joule',
description='''
Values of the frequency used in the calculation of the self energy.
......@@ -980,7 +980,7 @@ class section_method(public.section_method):
gw_frequency_weights = Quantity(
type=np.dtype(np.float64),
shape=[],
shape=['gw_number_of_frequencies'],
description='''
Weights of the frequency used in the calculation of the self energy.
''',
......
......@@ -3614,6 +3614,16 @@ class section_scf_iteration(MSection):
m_def = Section(validate=False, a_legacy=LegacyDefinition(name='section_scf_iteration'))
charge_total_scf_iteration = Quantity(
type=np.dtype(np.float64),
shape=[],
unit='coulomb',
description='''
Value of the total charge, calculated with the method described in XC_method
during each self-consistent field (SCF) iteration.
''',
a_legacy=LegacyDefinition(name='charge_total_scf_iteration'))
electronic_kinetic_energy_scf_iteration = Quantity(
type=np.dtype(np.float64),
shape=[],
......@@ -3867,6 +3877,16 @@ class section_scf_iteration(MSection):
categories=[time_info, accessory_info],
a_legacy=LegacyDefinition(name='time_scf_iteration_wall_start'))
time_scf_iteration = Quantity(
type=np.dtype(np.float64),
shape=[],
unit='second',
description='''
Total time of the self-consistent field (SCF) iteration.
''',
categories=[time_info, accessory_info],
a_legacy=LegacyDefinition(name='time_scf_iteration'))
class section_single_configuration_calculation(MSection):
'''
......@@ -3995,6 +4015,15 @@ class section_single_configuration_calculation(MSection):
categories=[atom_forces_type],
a_legacy=LegacyDefinition(name='atom_forces'))
charge_total = Quantity(
type=np.dtype(np.float64),
shape=[],
unit='coulomb',
description='''
Value of the total charge, calculated with the method described in XC_method.
''',
a_legacy=LegacyDefinition(name='charge_total'))
electronic_kinetic_energy = Quantity(
type=np.dtype(np.float64),
shape=[],
......@@ -5211,6 +5240,18 @@ class section_system(MSection):
categories=[configuration_core],
a_legacy=LegacyDefinition(name='lattice_vectors'))
lattice_vectors_reciprocal = Quantity(
type=np.dtype(np.float64),
shape=[3, 3],
unit='1/meter',
description='''
Reciprocal lattice vectors (in Cartesian coordinates) of the simulation cell. The
first index runs over the $x,y,z$ Cartesian coordinates, and the second index runs
over the 3 lattice vectors.
''',
categories=[configuration_core],
a_legacy=LegacyDefinition(name='lattice_vectors_reciprocal'))
local_rotations = Quantity(
type=np.dtype(np.float64),
shape=['number_of_atoms', 3, 3],
......
# NOMAD file parsing module
The parsing module consists of the `UnstructuredTextFileParser`, `DataTextFileParser`
and `XMLParser` classes to enable the parsing of unstructured text, structured data text,
and xml files, respectively. These classes are based on the FileParser class which
provides the common methods for file handling, and querying the parsed results.
## UnstructuredTextFileParser
The most common files parsed in NOMAD are unstructured text files, which
can be handled using the `UnstructuredTextFileParser`. The parser uses the `re` module to
match a given pattern for a quantity in the text file. To illustrate the use of this parser,
let us consider a file `super_code.out` with the following contents:
```
2020/05/15
*** super_code v2 ***
system 1
--------
sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0)
latice: (0, 0, 0), (1, 0, 0), (1, 1, 0)
energy: 1.29372
*** This was done with magic source ***
*** x°42 ***
system 2
--------
sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0)
cell: (0, 0, 0), (1, 0, 0), (1, 1, 0)
energy: 1.29372
```
In order to create a nomad archive from this file, we first have to parse the necessary
quantities, which include the date, system, energy, etc. The following python code
illustrates how this can be achieved. Note that we will be using *parser* to refer both to
the file parser and to the code parser that writes the archive.
```python
import datetime
import numpy as np
from nomad.parsing.file_parser import UnstructuredTextFileParser, Quantity
from nomad.datamodel import EntryArchive
from nomad.datamodel.metainfo.public import section_run, section_system, section_single_configuration_calculation
p = UnstructuredTextFileParser()
def str_to_sites(string):
sym, pos = string.split('(')
pos = np.array(pos.split(')')[0].split(',')[:3], dtype=float)
return sym, pos
q_system = Quantity(
'system', r'\s*system \d+([\s\S]+?energy: [\d\.]+)([\s\S]+\*\*\*)*',
sub_parser=UnstructuredTextFileParser(quantities=[
Quantity(
'sites', r'([A-Z]\([\d\.\, \-]+\))',
str_operation=str_to_sites),
Quantity(
section_system.lattice_vectors,
r'(?:latice|cell): \((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*',
repeats=False),
Quantity(
'energy', r'energy: (\d\.\d+)'),
Quantity(
'magic_source', r'done with magic source\s*\*{3}\s*\*{3}\s*([\S]+)',
repeats=False)]),
repeats=True)
quantities = [
Quantity('date', r'(\d\d\d\d\/\d\d\/\d\d)', repeats=False),
Quantity('program_version', r'super\_code\s*v(\d+)\s*', repeats=False),
q_system]
p.quantities = quantities
# this returns the energy for system 2
p.system[1].get('energy', unit='hartree')
```
The quantities to be parsed can be specified as a list of `Quantity` objects with a name
and a re pattern. The matched value should be enclosed in a group(s). By default,
the parser uses the findall method of `re`, hence overlap
between matches is not tolerated. If overlap cannot be avoided, one should switch to the
finditer method by passing *findall=False* to the parser. Multiple
matches for the quantity are returned if *repeats=True* (default). The name, data type,
shape and unit for the quantity can also be initialized by passing a metainfo Quantity.
An external function *str_operation* can also be passed to perform more specific
string operations on the matched value. A local parsing on a matched block can be carried
out by nesting a *sub_parser*. This is also an instance of the `UnstructuredTextFileParser`
with a list of quantities to parse. To access a parsed quantity, one can use the *get*
method.
The creation of the archive is implemented in the parse method of the code parser which takes
the mainfile, archive and logger as arguments. The file parser, *out_parser*, is
created only once in the constructor, and subsequent parsing of a different *mainfile*
can be performed by assigning it to the file parser.
```python
class SupercodeParser:
def __init__(self):
self.out_parser = UnstructuredTextFileParser()
self.out_parser.quantities = quantities
def parse(self, mainfile, archive, logger):
self.out_parser.mainfile = mainfile
sec_run = archive.m_create(section_run)
sec_run.program_name = 'super_code'
sec_run.program_version = str(self.out_parser.get('program_version'))
date = datetime.datetime.strptime(
self.out_parser.get('date'), '%Y/%m/%d') - datetime.datetime(1970, 1, 1)
sec_run.program_compilation_datetime = date.total_seconds()
for system in self.out_parser.get('system'):
sec_system = sec_run.m_create(section_system)
sec_system.lattice_vectors = system.get('lattice_vectors')
sites = system.get('sites')
sec_system.atom_labels = [site[0] for site in sites]
sec_system.atom_positions = [site[1] for site in sites]
sec_scc = sec_run.m_create(section_single_configuration_calculation)
sec_scc.energy_total = system.get('energy')
sec_scc.single_configuration_calculation_to_system_ref = sec_system
magic_source = system.get('magic_source')
if magic_source is not None:
sec_scc.message_info_evaluation = magic_source
archive = EntryArchive()
parser = SupercodeParser()
parser.parse('temp.dat', archive, None)
print(archive.m_to_json())
```
## DataTextFileParser
The `DataTextFileParser` uses the numpy.loadtxt function to load a structured data file.
The loaded data can be accessed from the property *data*.
## XMLParser
The `XMLParser` uses the ElementTree module to parse an xml file. The parse method of the
parser takes in an xpath style key to access individual quantities. By default, automatic
data type conversion is performed, which can be switched off by setting *convert=False*.
from .file_parser import FileParser
from .text_parser import UnstructuredTextFileParser, DataTextFileParser, Quantity, ParsePattern
from .xml_parser import XMLParser
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import pint
from typing import Any, Dict
class FileParser:
    '''
    Base class for parsers. The parse method implemented here simply sets the parsed
    quantities as attributes of the class. The parse method specific to a file type
    should be implemented in the corresponding child class. The parsed quantities are
    stored in results. One can access a quantity by using the get method.

    Arguments:
        mainfile: the file to be parsed
        logger: optional logger
    '''
    def __init__(self, mainfile: str, logger=None):
        self._mainfile: str = os.path.abspath(mainfile) if mainfile else mainfile
        # fall back to the logging module itself when no logger is given
        self.logger = logger if logger else logging
        self._results: Dict[str, Any] = None
        # a key is necessary for xml parsers, where parsing is done dynamically
        self._key: str = None
        self._kwargs: Dict[str, Any] = None
        self._file_handler: Any = None

    @property
    def results(self):
        '''
        Returns the dictionary of parsed quantities. The parse method is called if
        the last requested key is not yet in the results.
        '''
        if self._results is None:
            self._results = dict()
        if self._key not in self._results:
            # _kwargs is None until get() was called once; unpacking None would raise
            self.parse(self._key, **(self._kwargs or {}))
        return self._results

    @property
    def maindir(self):
        '''
        Returns the directory that contains the mainfile.
        '''
        return os.path.dirname(self._mainfile)

    @property
    def mainfile(self):
        '''
        Returns the absolute path of the mainfile or None if no mainfile is set or
        the path does not point to an existing file.
        '''
        if self._mainfile is None:
            return
        if not os.path.isfile(self._mainfile):
            return
        return self._mainfile

    @mainfile.setter
    def mainfile(self, val):
        # a new file invalidates everything that was parsed or opened before
        self._results = None
        self._file_handler = None
        self._mainfile = os.path.abspath(val) if val is not None else val

    def get(self, key: str, default: Any = None, unit: str = None, **kwargs):
        '''
        Returns the parsed result for quantity with name key. If quantity is not in
        results default will be returned. A pint unit can be provided which is attached
        to the returned value.

        Additional keyword arguments are forwarded to the parse method. None is
        returned if neither a parsed value nor a default is available.
        '''
        # remember key and kwargs, the results property passes them on to parse
        self._key = key
        self._kwargs = kwargs
        val = self.results.get(key, None)
        if val is None:
            val = default

        if val is None:
            return

        if unit is not None:
            if isinstance(unit, pint.Quantity):
                val = val * unit
            elif isinstance(val, pint.Quantity):
                val = val.to(unit)
            else:
                val = pint.Quantity(val, unit)

        return val

    def __getitem__(self, key):
        '''
        Returns the parsed quantity for a string key, None for any other key.
        '''
        if isinstance(key, str):
            return self.get(key)
        # NOTE(review): integer keys used to be delegated to ``self[int]`` (the type
        # object, not the key), which always evaluated to None. The None result is
        # kept so callers see no change; the intended integer indexing is unclear.
        # Because no IndexError is raised, iterating over a parser never terminates.
        return None

    def parse(self, quantity_key: str = None, **kwargs):
        '''
        Sets quantities in result as class attributes.

        The arguments are not used here; they are accepted so that the results
        property can call every parser with the key and keyword arguments given to get.
        '''
        if not self._results:
            # nothing parsed yet (None or empty), nothing to set
            return self

        for key, val in self._results.items():
            try:
                setattr(self, key, val)
            except Exception:
                # best effort: a result name may clash with a read-only property
                pass
        return self
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import mmap
import re
import numpy as np
import pint
from typing import List, Union, Callable, Type
from nomad.parsing.file_parser import FileParser
from nomad.metainfo import Quantity as mQuantity
class ParsePattern:
    '''
    Helper to assemble a re pattern string from an optional head, an optional key, a
    value and a tail.

    Keyword arguments:
        head: text that precedes the key, anything may follow between head and key
        key: name of the quantity in the file, may be followed by ':' and/or '='
        value: re pattern for the value or a shortcut starting with 're_' containing
            any of 'float', 'int', 'string' and 'array' (default 're_float_array')
        tail: text that ends the match (default is a newline)
    '''
    def __init__(self, **kwargs):
        self._head = kwargs.get('head', '')
        self._key = kwargs.get('key', '')
        value = kwargs.get('value', 're_float_array')
        if value.startswith('re_'):
            # build a character class from the data types named in the shortcut
            token = ''
            if 'float' in value:
                token += r'Ee\+\d\.\-'
            if 'int' in value:
                token += r'\d'
            if 'string' in value:
                token += r'\w'
            if 'array' in value:
                # array elements are separated by blanks
                token += r' '
            value = r'[%s]+' % token
        self._value = value
        self._tail = kwargs.get('tail', '\n')
        self._re_pattern = None

    @property
    def re_pattern(self):
        '''
        Returns the assembled pattern as a string. The value is enclosed in a group.
        '''
        if self._re_pattern is None:
            head = r'%s[\s\S]*?' % self._head if self._head else ''
            key = r'%s\s*\:*\=*\s*' % self._key if self._key else ''
            self._re_pattern = r'%s%s\s*\:*\=*\s*(%s)%s' % (
                head, key, self._value, self._tail)
        return self._re_pattern

    def __call__(self, text, repeats=True):
        '''
        Matches the pattern in text and returns the tuple (values, units) of two
        lists with one entry per match. A value is the concatenation of all matched
        groups except the one equal to the unit, the unit is the content of the
        named group __unit or None.

        Arguments:
            text: the bytes (or str) to search in
            repeats: if False only the first match is evaluated
        '''
        # re_pattern has to stay a string as it is consumed as such by Quantity,
        # therefore the pattern is compiled here (re caches compiled patterns)
        pattern = re.compile(self.re_pattern.encode())
        if isinstance(text, str):
            text = text.encode()

        if repeats:
            matches = pattern.finditer(text)
        else:
            first = pattern.search(text)
            matches = [] if first is None else [first]

        values = []
        units = []
        for res in matches:
            unit = res.groupdict().get('__unit', None)
            values.append(
                ''.join([group.decode() for group in res.groups() if group and group != unit]))
            units.append(unit.decode() if unit is not None else None)

        return values, units
class Quantity:
'''
Class to define a quantity to be parsed in the UnstructuredTextFileParser.
Arguments:
quantity: string to identify the name or a metainfo quantity to initialize the
quantity object.
re_pattern: pattern to be used by re for matching. Ideally, overlaps among
quantities for a given parser should be avoided.
sub_parser: instance of UnstructuredTextFileParser to perform local parsing
within a matched block
str_operation: external function to be performed on a matched block
dtype: data type of the quantity
unit: unit of the quantity
shape: shape of the quantity
repeats: denotes if multiple matches are expected
convert: switch automatic data type conversion
comment: character to denote a line to be ignored
'''
def __init__(self, quantity: Union[str, mQuantity], re_pattern: Union[str, ParsePattern], **kwargs):
    '''
    Takes name, data type, unit and shape either from the given string (name only)
    or from the given metainfo quantity. Keyword arguments override these values
    and set the remaining options.
    '''
    self.name: str
    self.dtype: str
    self.unit: str
    self.shape: List[int]
    if isinstance(quantity, str):
        # only the name is known, everything else is left open
        self.name, self.dtype, self.unit, self.shape = quantity, None, None, None
    elif isinstance(quantity, mQuantity):
        self.name = quantity.name
        self.dtype = quantity.type
        self.unit = quantity.unit
        # a metainfo shape is only usable if every dimension is a plain number,
        # i.e. it has no dependencies
        self.shape = quantity.shape
        if not all(str(dim).isdigit() for dim in self.shape):
            self.shape = None

    # override metainfo
    for attr in ('dtype', 'unit', 'shape'):
        setattr(self, attr, kwargs.get(attr, getattr(self, attr)))

    if isinstance(re_pattern, ParsePattern):
        re_pattern = re_pattern.re_pattern
    self._re_pattern: str = re_pattern

    self._str_operation: Callable = kwargs.get('str_operation', None)
    self._sub_parser: UnstructuredTextFileParser = kwargs.get('sub_parser', None)
    self.repeats: bool = kwargs.get('repeats', True)
    self.convert: bool = kwargs.get('convert', True)
    self.comment: str = kwargs.get('comment', None)
@property
def re_pattern(self):
    '''
    Returns a compiled re pattern.

    A pattern given as string is compiled as a bytes pattern on first access, after
    the group name __unit was made specific to this quantity by appending its name.
    '''
    pattern = self._re_pattern
    if not isinstance(pattern, str):
        # already compiled
        return pattern

    unit_group = '__unit_%s' % self.name
    self._re_pattern = re.compile(pattern.replace('__unit', unit_group).encode())
    return self._re_pattern

@re_pattern.setter
def re_pattern(self, val: str):
    '''
    Sets the pattern, a string is compiled on the next access.
    '''
    self._re_pattern = val
@property
def str_operation(self):
    '''
    Returns the external function that is applied on a matched block or None.
    '''
    return self._str_operation

@str_operation.setter
def str_operation(self, val: Callable):
    '''
    Sets the external function that is applied on a matched block.
    '''
    self._str_operation = val
def to_data(self, val_in: List[str]):
'''
Converts the parsed block into data.
'''
def process(val):
if self.comment is not None:
if val.strip()[0] == self.comment:
return
if self.str_operation is not None:
val = self.str_operation(val)
else:
val = val.strip().split() if isinstance(val, str) else val
val = val[0] if len(val) == 1 else val
def _convert(val):
if isinstance(val, str):
if self.dtype is None:
if val.isdecimal():
val = int(val)
else:
try:
val = float(val)
except Exception:
pass
self.shape = [] if self.shape is None else self.shape
return val
elif type(val) in [np.ndarray, list]:
try:
dtype = float if self.dtype is None else self.dtype
val_test = np.array(val, dtype=dtype)
if self.dtype is None:
if np.all(np.mod(val_test, 1) == 0):
val_test = np.array(val_test, dtype=int)
self.shape = list(np.shape(val)) if self.shape is None else self.shape
val = val_test
except Exception:
val = [_convert(v) for v in val]
return val
elif isinstance(val, dict):
for k, v in val.items():
self.dtype = None
val[k] = _convert(v)
return val
else:
self.dtype = type(val)
self.shape = [] if self.shape is None else self.shape
return val
if self.convert:
val = _convert(val)
if isinstance(val, np.ndarray) and self.shape:
val = np.reshape(val, self.shape)
return val
val_out = [process(val) for val in val_in]
if isinstance(val_out[0], np.ndarray):