diff --git a/README.md b/README.md
index f1377f62c55dc704412e828f5a2f0a3b90e4326a..d3ef99921569ba772156d3971410c208a607f815 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,9 @@ contributing, and API reference.
 
 Omitted versions are plain bugfix releases with only minor changes and fixes.
 
+### v0.9.8
+- A new library for parsing text-based raw files.
+
 ### v0.9.3
 - Encyclopedia with dedicated materials search index.
 
diff --git a/dependencies/parsers/elastic b/dependencies/parsers/elastic
index c17dbecac0f395b48fc65b0c7dcc2ba4de1be271..9d4518fefb1f1146b7fa7ed95547822683020533 160000
--- a/dependencies/parsers/elastic
+++ b/dependencies/parsers/elastic
@@ -1 +1 @@
-Subproject commit c17dbecac0f395b48fc65b0c7dcc2ba4de1be271
+Subproject commit 9d4518fefb1f1146b7fa7ed95547822683020533
diff --git a/dependencies/parsers/gromacs b/dependencies/parsers/gromacs
index 3f78e35b56714fe8c79b010ed52d97def19cb2dc..3b5866bc8dff77d398eddc6baceb19e1c30180c1 160000
--- a/dependencies/parsers/gromacs
+++ b/dependencies/parsers/gromacs
@@ -1 +1 @@
-Subproject commit 3f78e35b56714fe8c79b010ed52d97def19cb2dc
+Subproject commit 3b5866bc8dff77d398eddc6baceb19e1c30180c1
diff --git a/dependencies/parsers/lammps b/dependencies/parsers/lammps
index 7ed44cab4410099208f5e399bfa4c1dc2e9c29fe..8944d5ffd4cac01e43aaac5c5687b0a8831f2bc7 160000
--- a/dependencies/parsers/lammps
+++ b/dependencies/parsers/lammps
@@ -1 +1 @@
-Subproject commit 7ed44cab4410099208f5e399bfa4c1dc2e9c29fe
+Subproject commit 8944d5ffd4cac01e43aaac5c5687b0a8831f2bc7
diff --git a/dependencies/python_common b/dependencies/python_common
index afdd0937aab2681ca3912cfd9d95c7633fdcd7b9..c1aca04237d69097bbeb17d3a397be66e9c6797f 160000
--- a/dependencies/python_common
+++ b/dependencies/python_common
@@ -1 +1 @@
-Subproject commit afdd0937aab2681ca3912cfd9d95c7633fdcd7b9
+Subproject commit c1aca04237d69097bbeb17d3a397be66e9c6797f
diff --git a/nomad/datamodel/metainfo/common.py b/nomad/datamodel/metainfo/common.py
index 5c08eb1dde24cc680b646a071b215b0ca83556fa..61acd2d8a7d8f14008c86e855ceaf06417a675fb 100644
--- a/nomad/datamodel/metainfo/common.py
+++ b/nomad/datamodel/metainfo/common.py
@@ -963,7 +963,7 @@ class section_method(public.section_method):
 
     gw_frequency_number = Quantity(
         type=np.dtype(np.int32),
-        shape=[],
+        shape=['gw_number_of_frequencies'],
         description='''
         Number referring to the frequency used in the calculation of the self energy.
         ''',
@@ -971,7 +971,7 @@ class section_method(public.section_method):
 
     gw_frequency_values = Quantity(
         type=np.dtype(np.float64),
-        shape=[],
+        shape=['gw_number_of_frequencies'],
         unit='joule',
         description='''
         Values of the frequency used in the calculation of the self energy.
@@ -980,7 +980,7 @@ class section_method(public.section_method):
 
     gw_frequency_weights = Quantity(
         type=np.dtype(np.float64),
-        shape=[],
+        shape=['gw_number_of_frequencies'],
         description='''
         Weights of the frequency used in the calculation of the self energy.
         ''',
diff --git a/nomad/datamodel/metainfo/public.py b/nomad/datamodel/metainfo/public.py
index fc30912cc763d2a4b3a88db438fe36e2f2bc05ca..eaf5c0be7c4a9c3406c6f13ae48f23bac8cbc9f8 100644
--- a/nomad/datamodel/metainfo/public.py
+++ b/nomad/datamodel/metainfo/public.py
@@ -3614,6 +3614,16 @@ class section_scf_iteration(MSection):
 
     m_def = Section(validate=False, a_legacy=LegacyDefinition(name='section_scf_iteration'))
 
+    charge_total_scf_iteration = Quantity(
+        type=np.dtype(np.float64),
+        shape=[],
+        unit='coulomb',
+        description='''
+        Value of the total charge, calculated with the method described in XC_method
+        during each self-consistent field (SCF) iteration.
+        ''',
+        a_legacy=LegacyDefinition(name='charge_total_scf_iteration'))
+
     electronic_kinetic_energy_scf_iteration = Quantity(
         type=np.dtype(np.float64),
         shape=[],
@@ -3867,6 +3877,16 @@ class section_scf_iteration(MSection):
         categories=[time_info, accessory_info],
         a_legacy=LegacyDefinition(name='time_scf_iteration_wall_start'))
 
+    time_scf_iteration = Quantity(
+        type=np.dtype(np.float64),
+        shape=[],
+        unit='second',
+        description='''
+        Total time of the self-consistent field (SCF) iteration.
+        ''',
+        categories=[time_info, accessory_info],
+        a_legacy=LegacyDefinition(name='time_scf_iteration'))
+
 
 class section_single_configuration_calculation(MSection):
     '''
@@ -3995,6 +4015,15 @@ class section_single_configuration_calculation(MSection):
         categories=[atom_forces_type],
         a_legacy=LegacyDefinition(name='atom_forces'))
 
+    charge_total = Quantity(
+        type=np.dtype(np.float64),
+        shape=[],
+        unit='coulomb',
+        description='''
+        Value of the total charge, calculated with the method described in XC_method.
+        ''',
+        a_legacy=LegacyDefinition(name='charge_total'))
+
     electronic_kinetic_energy = Quantity(
         type=np.dtype(np.float64),
         shape=[],
@@ -5211,6 +5240,18 @@ class section_system(MSection):
         categories=[configuration_core],
         a_legacy=LegacyDefinition(name='lattice_vectors'))
 
+    lattice_vectors_reciprocal = Quantity(
+        type=np.dtype(np.float64),
+        shape=[3, 3],
+        unit='1/meter',
+        description='''
+        Reciprocal lattice vectors (in Cartesian coordinates) of the simulation cell. The
+        first index runs over the $x,y,z$ Cartesian coordinates, and the second index runs
+        over the 3 lattice vectors.
+        ''',
+        categories=[configuration_core],
+        a_legacy=LegacyDefinition(name='lattice_vectors_reciprocal'))
+
     local_rotations = Quantity(
         type=np.dtype(np.float64),
         shape=['number_of_atoms', 3, 3],
diff --git a/nomad/parsing/file_parser/README.md b/nomad/parsing/file_parser/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fb91896971e25d76d4521051695e4fc7c75940b5
--- /dev/null
+++ b/nomad/parsing/file_parser/README.md
@@ -0,0 +1,146 @@
+# NOMAD file parsing module
+
+The parsing module consists of the `UnstructuredTextFileParser`, `DataTextFileParser`
+and `XMLParser` classes, which enable the parsing of unstructured text, structured data
+text, and xml files, respectively. These classes are based on the `FileParser` class,
+which provides the common methods for file handling and for querying the parsed results.
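+
+All three parsers share the same basic workflow. The sketch below assumes a hypothetical
+output file `output.out` and quantity name `some_quantity`:
+
+```python
+from nomad.parsing.file_parser import UnstructuredTextFileParser
+
+parser = UnstructuredTextFileParser(mainfile='output.out')  # attach a file
+parser.quantities = []                  # assign the Quantity definitions (see below)
+value = parser.get('some_quantity')     # query a parsed result by name
+parser.mainfile = 'another_output.out'  # reset the results and reuse the parser
+```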
+
+## UnstructuredTextFileParser
+
+The most common type of file parsed in NOMAD is the unstructured text file, which can be
+handled using the `UnstructuredTextFileParser`. The parser uses the `re` module to
+match a given pattern for a quantity in the text file. To illustrate the use of this parser,
+let us consider a file `super_code.out` with the following contents:
+
+```
+2020/05/15
+               *** super_code v2 ***
+
+system 1
+--------
+sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0)
+latice: (0, 0, 0), (1, 0, 0), (1, 1, 0)
+energy: 1.29372
+
+*** This was done with magic source                                ***
+***                                x°42                            ***
+
+
+system 2
+--------
+sites: H(1.23, 0, 0), H(-1.23, 0, 0), O(0, 0.33, 0)
+cell: (0, 0, 0), (1, 0, 0), (1, 1, 0)
+energy: 1.29372
+```
+
+In order to create a nomad archive from this file, we first have to parse the necessary
+quantities, which include the date, system, energy, etc. The following python code
+illustrates how this can be achieved. Note that we use *parser* to refer both to the
+file parser and to the code parser that writes the archive.
+
+```python
+import datetime
+import numpy as np
+
+from nomad.parsing.file_parser import UnstructuredTextFileParser, Quantity
+from nomad.datamodel import EntryArchive
+from nomad.datamodel.metainfo.public import section_run, section_system, section_single_configuration_calculation
+
+p = UnstructuredTextFileParser(mainfile='super_code.out')
+
+def str_to_sites(string):
+    sym, pos = string.split('(')
+    pos = np.array(pos.split(')')[0].split(',')[:3], dtype=float)
+    return sym, pos
+
+q_system = Quantity(
+    'system', r'\s*system \d+([\s\S]+?energy: [\d\.]+)([\s\S]+\*\*\*)*',
+    sub_parser=UnstructuredTextFileParser(quantities=[
+        Quantity(
+            'sites', r'([A-Z]\([\d\.\, \-]+\))',
+            str_operation=str_to_sites),
+        Quantity(
+            section_system.lattice_vectors,
+            r'(?:latice|cell): \((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*\((\d)\, (\d), (\d)\)\,?\s*',
+            repeats=False),
+        Quantity(
+            'energy', r'energy: (\d\.\d+)'),
+        Quantity(
+            'magic_source', r'done with magic source\s*\*{3}\s*\*{3}\s*([\S]+)',
+            repeats=False)]),
+    repeats=True)
+
+quantities = [
+        Quantity('date', r'(\d\d\d\d\/\d\d\/\d\d)', repeats=False),
+        Quantity('program_version', r'super\_code\s*v(\d+)\s*', repeats=False),
+        q_system]
+
+p.quantities = quantities
+# this returns the energy for system 2
+p.get('system')[1].get('energy', unit='hartree')
+```
+
+The quantities to be parsed can be specified as a list of `Quantity` objects, each with a
+name and an re pattern. The matched value should be enclosed in a group (or groups). By
+default, the parser uses the findall method of `re`, hence overlap between matches is not
+tolerated. If overlap cannot be avoided, one should switch to the finditer method by
+passing *findall=False* to the parser (see the sketch below). Multiple matches for the
+quantity are returned if *repeats=True* (the default). The name, data type, shape and
+unit of the quantity can also be initialized by passing a metainfo Quantity. An external
+function *str_operation* can also be passed to perform more specific string operations on
+the matched value. Local parsing of a matched block can be carried out by nesting a
+*sub_parser*. This is also an instance of `UnstructuredTextFileParser` with its own list
+of quantities to parse. To access a parsed quantity, one can use the *get* method.
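+
+For instance, if the re patterns of two quantities would overlap, a minimal sketch
+(reusing the `quantities` list defined above) is:
+
+```python
+# overlapping patterns: fall back to re.finditer by disabling findall
+p_finditer = UnstructuredTextFileParser(
+    mainfile='super_code.out', quantities=quantities, findall=False)
+date = p_finditer.get('date')
+```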
+
+The creation of the archive is implemented in the parse method of the code parser, which
+takes the mainfile, archive and logger as arguments. The file parser, *out_parser*, is
+created only once in the constructor; subsequent parsing of a different *mainfile* can be
+performed by assigning it to the file parser.
+
+```python
+class SupercodeParser:
+    def __init__(self):
+        self.out_parser = UnstructuredTextFileParser()
+        self.out_parser.quantities = quantities
+
+    def parse(self, mainfile, archive, logger):
+        self.out_parser.mainfile = mainfile
+        sec_run = archive.m_create(section_run)
+        sec_run.program_name = 'super_code'
+        sec_run.program_version = str(self.out_parser.get('program_version'))
+        date = datetime.datetime.strptime(
+            self.out_parser.get('date'), '%Y/%m/%d') - datetime.datetime(1970, 1, 1)
+        sec_run.program_compilation_datetime = date.total_seconds()
+        for system in self.out_parser.get('system'):
+            sec_system = sec_run.m_create(section_system)
+            sec_system.lattice_vectors = system.get('lattice_vectors')
+            sites = system.get('sites')
+            sec_system.atom_labels = [site[0] for site in sites]
+            sec_system.atom_positions = [site[1] for site in sites]
+
+            sec_scc = sec_run.m_create(section_single_configuration_calculation)
+            sec_scc.energy_total = system.get('energy')
+            sec_scc.single_configuration_calculation_to_system_ref = sec_system
+            magic_source = system.get('magic_source')
+            if magic_source is not None:
+                sec_scc.message_info_evaluation = magic_source
+
+archive = EntryArchive()
+
+parser = SupercodeParser()
+parser.parse('super_code.out', archive, None)
+
+print(archive.m_to_json())
+```
+
+## DataTextFileParser
+The `DataTextFileParser` uses the numpy.loadtxt function to load a structured data file.
+The loaded data can be accessed through the *data* property.
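+
+A minimal sketch, assuming a hypothetical whitespace-separated numeric file `data.dat`:
+
+```python
+from nomad.parsing.file_parser import DataTextFileParser
+
+parser = DataTextFileParser(mainfile='data.dat', dtype=float)
+data = parser.data  # numpy array from numpy.loadtxt, or None if the file cannot be loaded
+```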
+
+## XMLParser
+The `XMLParser` uses the ElementTree module to parse an xml file. The parse method of the
+parser takes an xpath-style key to access individual quantities. By default, automatic
+data type conversion is performed, which can be switched off by setting *convert=False*.
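+
+A minimal sketch, assuming a hypothetical file `calculation.xml` with an `<energy>`
+element nested under `<results>`:
+
+```python
+from nomad.parsing.file_parser import XMLParser
+
+parser = XMLParser(mainfile='calculation.xml')
+# xpath-style access; strings are converted to int/float/bool values where possible
+energy = parser.get('results/energy')
+# to keep raw strings instead, construct with XMLParser(mainfile='calculation.xml', convert=False)
+```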
+
+
diff --git a/nomad/parsing/file_parser/__init__.py b/nomad/parsing/file_parser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e4a255a8c849ed9719deadb58bb6dd89996539b
--- /dev/null
+++ b/nomad/parsing/file_parser/__init__.py
@@ -0,0 +1,3 @@
+from .file_parser import FileParser
+from .text_parser import UnstructuredTextFileParser, DataTextFileParser, Quantity, ParsePattern
+from .xml_parser import XMLParser
diff --git a/nomad/parsing/file_parser/file_parser.py b/nomad/parsing/file_parser/file_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..2da0d69c540ad185e16660ef0ced0fda289c831e
--- /dev/null
+++ b/nomad/parsing/file_parser/file_parser.py
@@ -0,0 +1,112 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import logging
+import pint
+from typing import Any, Dict
+
+
+class FileParser:
+    '''
+    Base class for parsers. The parse method implemented here simply sets the parsed
+    quantities as attributes of the class. The parse method specific to a file type
+    should be implemented in the corresponding child class. The parsed quantities are
+    stored in results. One can access a quantity by using the get method.
+
+    Arguments:
+        mainfile: the file to be parsed
+        logger: optional logger
+    '''
+    def __init__(self, mainfile: str, logger=None):
+        self._mainfile: str = os.path.abspath(mainfile) if mainfile else mainfile
+        self.logger = logger if logger else logging
+        self._results: Dict[str, Any] = None
+        # a key is necessary for xml parsers, where parsing is done dynamically
+        self._key: str = None
+        self._kwargs: Dict[str, Any] = None
+        self._file_handler: Any = None
+
+    @property
+    def results(self):
+        if self._results is None:
+            self._results = dict()
+        if self._key not in self._results:
+            self.parse(self._key, **(self._kwargs or {}))
+
+        return self._results
+
+    @property
+    def maindir(self):
+        return os.path.dirname(self._mainfile)
+
+    @property
+    def mainfile(self):
+        if self._mainfile is None:
+            return
+
+        if not os.path.isfile(self._mainfile):
+            return
+        return self._mainfile
+
+    @mainfile.setter
+    def mainfile(self, val):
+        self._results = None
+        self._file_handler = None
+        self._mainfile = os.path.abspath(val) if val is not None else val
+
+    def get(self, key: str, default: Any = None, unit: str = None, **kwargs):
+        '''
+        Returns the parsed result for the quantity with name key. If the quantity is not
+        in the results, default is returned. A pint unit can be provided, which is
+        attached to the returned value.
+        '''
+        self._key = key
+        self._kwargs = kwargs
+        val = self.results.get(key, None)
+        if val is None:
+            val = default
+
+        if val is None:
+            return
+
+        if unit is not None:
+            if isinstance(unit, pint.Quantity):
+                val = val * unit
+
+            elif isinstance(val, pint.Quantity):
+                val = val.to(unit)
+
+            else:
+                val = pint.Quantity(val, unit)
+
+        return val
+
+    def __getitem__(self, key):
+        if isinstance(key, str):
+            return self.get(key)
+        elif isinstance(key, int):
+            return self[int]
+
+    def parse(self, quantity_key: str = None):
+        '''
+        Sets quantities in result as class attributes.
+        '''
+        for key, val in self._results.items():
+            try:
+                setattr(self, key, val)
+            except Exception:
+                pass
+        return self
diff --git a/nomad/parsing/file_parser/text_parser.py b/nomad/parsing/file_parser/text_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..71bad6788602eb70b53926c308c61b2de30a40bf
--- /dev/null
+++ b/nomad/parsing/file_parser/text_parser.py
@@ -0,0 +1,524 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import mmap
+import re
+import numpy as np
+import pint
+from typing import List, Union, Callable, Type
+
+from nomad.parsing.file_parser import FileParser
+from nomad.metainfo import Quantity as mQuantity
+
+
+class ParsePattern:
+    def __init__(self, **kwargs):
+        self._head = kwargs.get('head', '')
+        self._key = kwargs.get('key', '')
+        value = kwargs.get('value', 're_float_array')
+        if value.startswith('re_'):
+            token = ''
+            if 'float' in value:
+                token += r'Ee\+\d\.\-'
+            if 'int' in value:
+                token += r'\d'
+            if 'string' in value:
+                token += r'\w'
+            if 'array' in value:
+                token += r' '
+            value = r'[%s]+' % token
+        self._value = value
+        self._tail = kwargs.get('tail', '\n')
+        self._re_pattern = None
+
+    @property
+    def re_pattern(self):
+        if self._re_pattern is None:
+            head = r'%s[\s\S]*?' % self._head if self._head else ''
+            key = r'%s\s*\:*\=*\s*' % self._key if self._key else ''
+            self._re_pattern = r'%s%s\s*\:*\=*\s*(%s)%s' % (
+                head, key, self._value, self._tail)
+        return self._re_pattern
+
+    def __call__(self, text, repeats=True):
+        # the parsed text is a bytes buffer, so the pattern is compiled in bytes form
+        pattern = re.compile(self.re_pattern.encode())
+        values = []
+        units = []
+        if repeats:
+            for res in pattern.finditer(text):
+                unit = res.groupdict().get('__unit', None)
+                values.append(
+                    ''.join([group.decode() for group in res.groups() if group and group != unit]))
+                units.append(unit.decode() if unit is not None else None)
+        else:
+            res = pattern.search(text)
+            if res is not None:
+                unit = res.groupdict().get('__unit', None)
+                units.append(unit.decode() if unit is not None else None)
+                values.append(''.join(
+                    [group.decode() for group in res.groups() if group and group != unit]))
+        return values, units
+
+
+class Quantity:
+    '''
+    Class to define a quantity to be parsed in the UnstructuredTextFileParser.
+
+    Arguments:
+        quantity: string to identify the name or a metainfo quantity to initialize the
+            quantity object.
+        re_pattern: pattern to be used by re for matching. Ideally, overlaps among
+            quantities for a given parser should be avoided.
+        sub_parser: instance of UnstructuredTextFileParser to perform local parsing
+            within a matched block
+        str_operation: external function to be performed on a matched block
+        dtype: data type of the quantity
+        unit: unit of the quantity
+        shape: shape of the quantity
+        repeats: denotes if multiple matches are expected
+        convert: switch automatic data type conversion
+        comment: character to denote a line to be ignored
+
+    '''
+    def __init__(self, quantity: Union[str, mQuantity], re_pattern: Union[str, ParsePattern], **kwargs):
+        self.name: str
+        self.dtype: str
+        self.unit: str
+        self.shape: List[int]
+        if isinstance(quantity, str):
+            self.name = quantity
+            self.dtype = None
+            self.unit = None
+            self.shape = None
+        elif isinstance(quantity, mQuantity):
+            self.name = quantity.name
+            self.dtype = quantity.type
+            self.unit = quantity.unit
+            # check if metainfo shape has dependencies
+            self.shape = quantity.shape
+            if False in [str(i).isdigit() for i in self.shape]:
+                self.shape = None
+        # override metainfo
+        self.dtype = kwargs.get('dtype', self.dtype)
+        self.unit = kwargs.get('unit', self.unit)
+        self.shape = kwargs.get('shape', self.shape)
+
+        self._re_pattern: str = re_pattern.re_pattern if isinstance(
+            re_pattern, ParsePattern) else re_pattern
+        self._str_operation: Callable = kwargs.get('str_operation', None)
+        self._sub_parser: UnstructuredTextFileParser = kwargs.get('sub_parser', None)
+        self.repeats: bool = kwargs.get('repeats', True)
+        self.convert: bool = kwargs.get('convert', True)
+        self.comment: str = kwargs.get('comment', None)
+
+    @property
+    def re_pattern(self):
+        '''
+        Returns a compiled re pattern.
+        '''
+        if isinstance(self._re_pattern, str):
+            re_pattern = self._re_pattern.replace('__unit', '__unit_%s' % self.name)
+            self._re_pattern = re.compile(re_pattern.encode())
+        return self._re_pattern
+
+    @re_pattern.setter
+    def re_pattern(self, val: str):
+        self._re_pattern = val
+
+    @property
+    def str_operation(self):
+        return self._str_operation
+
+    @str_operation.setter
+    def str_operation(self, val: Callable):
+        self._str_operation = val
+
+    def to_data(self, val_in: List[str]):
+        '''
+        Converts the parsed block into data.
+        '''
+        def process(val):
+            if self.comment is not None:
+                if val.strip()[0] == self.comment:
+                    return
+
+            if self.str_operation is not None:
+                val = self.str_operation(val)
+
+            else:
+                val = val.strip().split() if isinstance(val, str) else val
+                val = val[0] if len(val) == 1 else val
+
+            def _convert(val):
+                if isinstance(val, str):
+                    if self.dtype is None:
+                        if val.isdecimal():
+                            val = int(val)
+                        else:
+                            try:
+                                val = float(val)
+                            except Exception:
+                                pass
+
+                    self.shape = [] if self.shape is None else self.shape
+                    return val
+
+                elif type(val) in [np.ndarray, list]:
+                    try:
+                        dtype = float if self.dtype is None else self.dtype
+                        val_test = np.array(val, dtype=dtype)
+                        if self.dtype is None:
+                            if np.all(np.mod(val_test, 1) == 0):
+                                val_test = np.array(val_test, dtype=int)
+                        self.shape = list(np.shape(val)) if self.shape is None else self.shape
+                        val = val_test
+
+                    except Exception:
+                        val = [_convert(v) for v in val]
+
+                    return val
+
+                elif isinstance(val, dict):
+                    for k, v in val.items():
+                        self.dtype = None
+                        val[k] = _convert(v)
+                    return val
+
+                else:
+                    self.dtype = type(val)
+                    self.shape = [] if self.shape is None else self.shape
+                    return val
+
+            if self.convert:
+                val = _convert(val)
+
+            if isinstance(val, np.ndarray) and self.shape:
+                val = np.reshape(val, self.shape)
+
+            return val
+
+        val_out = [process(val) for val in val_in]
+
+        if isinstance(val_out[0], np.ndarray):
+            self.dtype = val_out[0].dtype
+
+        return val_out
+
+
+class DataTextFileParser(FileParser):
+    '''
+    Parser for structured data text files using numpy.loadtxt
+
+    Arguments:
+        mainfile: the file to be parsed
+        dtype: data type
+    '''
+    def __init__(self, **kwargs):
+        self._dtype: Type = kwargs.get('dtype', float)
+        mainfile: str = kwargs.get('mainfile', None)
+        logger = kwargs.get('logger', None)
+        logger = logger if logger is not None else logging
+        super().__init__(mainfile, logger=logger)
+        self.init_parameters()
+
+    def init_parameters(self):
+        '''
+        Method to call after loading data.
+        '''
+        pass
+
+    @property
+    def data(self):
+        '''
+        Returns the loaded data
+        '''
+        if self._file_handler is None:
+            if self.mainfile is None:
+                return
+
+            try:
+                self._file_handler = np.loadtxt(self.mainfile, dtype=self._dtype)
+            except Exception:
+                return
+
+            self.init_parameters()
+        return self._file_handler
+
+
+class UnstructuredTextFileParser(FileParser):
+    '''
+    Parser for unstructured text files using the re module. The quantities to be parsed
+    are given as a list of Quantity objects, each of which specifies an re pattern. The
+    mmap module is used to handle the file. By default, re.findall is used to get matches
+    for performance reasons. In this case, overlap is not tolerated in the re patterns.
+    To avoid this, set findall to False to switch to re.finditer.
+
+    Arguments:
+        mainfile: the file to be parsed
+        quantities: list of Quantity objects to be parsed.
+        logger: optional logger
+        findall: switches between using re.findall and re.finditer
+        file_offset: offset in reading the file
+        file_length: length of the chunk to be read from the file
+    '''
+    def __init__(self, mainfile=None, quantities=None, logger=None, findall=True, **kwargs):
+        super().__init__(mainfile, logger)
+        self._quantities: List[Quantity] = quantities
+        self.findall: bool = findall
+        self._kwargs = kwargs
+        self._file_length: int = kwargs.get('file_length', 0)
+        self._file_offset: int = kwargs.get('file_offset', 0)
+        self._file_pad: int = 0
+        if quantities is None:
+            self.init_quantities()
+
+    def copy(self):
+        '''
+        Returns a copy of the object excluding the parsed results.
+        '''
+        return UnstructuredTextFileParser(
+            self.mainfile, self.quantities, self.logger, **self._kwargs)
+
+    def init_quantities(self):
+        '''
+        Initializes the quantities list.
+        '''
+        self._quantities = []
+
+    @property
+    def quantities(self):
+        return self._quantities
+
+    @quantities.setter
+    def quantities(self, val):
+        self._quantities = val
+
+    @property
+    def file_offset(self):
+        '''
+        Integer offset in loading the file taking into account mmap pagination.
+        '''
+        return self._file_offset
+
+    @file_offset.setter
+    def file_offset(self, val):
+        self._file_pad = val % mmap.PAGESIZE
+        self._file_offset = (val // mmap.PAGESIZE) * mmap.PAGESIZE
+
+    @property
+    def file_length(self):
+        '''
+        Length of the file chunk to be loaded.
+        '''
+        return self._file_length
+
+    @file_length.setter
+    def file_length(self, val):
+        self._file_length = val
+
+    @property
+    def file_mmap(self):
+        '''
+        Memory mapped representation of the file.
+        '''
+        if self._file_handler is None:
+            with open(self.mainfile) as f:
+                self._file_handler = mmap.mmap(
+                    f.fileno(), self._file_length, access=mmap.ACCESS_COPY,
+                    offset=self._file_offset)
+                # set the extra chunk loaded before the intended offset to empty
+                self._file_handler[:self._file_pad] = b' ' * self._file_pad
+            self._file_pad = 0
+        return self._file_handler
+
+    def keys(self):
+        '''
+        Returns all the quantity names.
+        '''
+        return [quantity.name for quantity in self.quantities]
+
+    def items(self):
+        '''
+        Returns an iterable over the (name, value) pairs of the parsed quantities.
+        '''
+        for key in self.keys():
+            yield key, self.get(key)
+
+    def _parse_quantities(self, quantities):
+        re_findall = '|'.join([q.re_pattern.pattern.decode() for q in quantities])
+        if len(quantities) == 1:
+            # add a dummy group so that re.findall always returns tuples of groups
+            re_findall = '%s|(__dummy__)' % re_findall
+        re_findall = re_findall.encode()
+
+        # map matches to quantities
+        matches = re.findall(re_findall, self.file_mmap)
+        current_index = 0
+        for i in range(len(quantities)):
+            values = []
+            units = []
+            n_groups = quantities[i].re_pattern.groups
+
+            non_empty_matches = []
+            for match in matches:
+                non_empty_match = [m for m in match[current_index: current_index + n_groups] if m]
+                if not non_empty_match:
+                    continue
+                non_empty_matches.append(non_empty_match)
+            index_unit = quantities[i].re_pattern.groupindex.get(
+                '__unit_%s' % quantities[i].name, None)
+            for non_empty_match in non_empty_matches:
+                if index_unit is not None:
+                    unit = non_empty_match.pop(index_unit - 1)
+                    units.append(unit.decode())
+
+                else:
+                    units.append(None)
+
+                values.append(' '.join([m.decode() for m in non_empty_match]))
+
+            current_index += n_groups
+
+            if not values:
+                continue
+
+            try:
+                value_processed = quantities[i].to_data(values)
+                for j in range(len(value_processed)):
+                    unit = units[j] if units[j] else quantities[i].unit
+                    if not unit:
+                        continue
+                    value_processed[j] = pint.Quantity(value_processed[j], unit)
+
+                if not quantities[i].repeats and value_processed:
+                    value_processed = value_processed[0]
+
+                self._results[quantities[i].name] = value_processed
+
+            except Exception:
+                self.logger.warn('Error setting value for %s ' % quantities[i].name)
+                pass
+
+    def _parse_quantity(self, quantity):
+
+        value = []
+        units = []
+        if not quantity.repeats:
+            res = quantity.re_pattern.search(self.file_mmap)
+            if res is not None:
+                if quantity._sub_parser is not None:
+                    span = np.array(res.span()) + self.file_offset
+                    sub_parser = quantity._sub_parser.copy()
+                    sub_parser.mainfile = self.mainfile
+                    if (span[1] - span[0]) < mmap.PAGESIZE:
+                        # self.logger.warn(
+                        #     'Cannot use sub parser on quantity %s with blocks with size <'
+                        #     '%d. Will try to parse string' % (quantity.name, mmap.PAGESIZE))
+                        sub_parser._file_handler = b' '.join([g for g in res.groups() if g])
+                    else:
+                        sub_parser.file_offset = span[0]
+                        sub_parser.file_length = span[1] - sub_parser.file_offset
+                    value.append(sub_parser.parse())
+
+                else:
+                    unit = res.groupdict().get('__unit_%s' % quantity.name, None)
+                    units.append(unit.decode() if unit is not None else None)
+                    value.append(''.join(
+                        [group.decode() for group in res.groups() if group and group != unit]))
+
+        else:
+            for res in quantity.re_pattern.finditer(self.file_mmap):
+                if quantity._sub_parser is not None:
+                    span = np.array(res.span()) + self.file_offset
+                    sub_parser = quantity._sub_parser.copy()
+                    sub_parser.mainfile = self.mainfile
+                    if (span[1] - span[0]) < mmap.PAGESIZE:
+                        # self.logger.warn(
+                        #     'Cannot use sub parser on quantity %s with blocks with size <'
+                        #     '%d. Will try to parse string' % (quantity.name, mmap.PAGESIZE))
+                        sub_parser._file_handler = b' '.join([g for g in res.groups() if g])
+                    else:
+                        sub_parser.file_offset = span[0]
+                        sub_parser.file_length = span[1] - sub_parser.file_offset
+                    value.append(sub_parser.parse())
+
+                else:
+                    unit = res.groupdict().get('__unit_%s' % quantity.name, None)
+                    value.append(
+                        ''.join([group.decode() for group in res.groups() if group and group != unit]))
+                    units.append(unit.decode() if unit is not None else None)
+
+        if not value:
+            return
+
+        if quantity._sub_parser is not None:
+            self._results[quantity.name] = value if quantity.repeats else value[0]
+
+        else:
+            try:
+                value_processed = quantity.to_data(value)
+                for i in range(len(value_processed)):
+                    unit = units[i] if units[i] else quantity.unit
+                    if not unit:
+                        continue
+                    value_processed[i] = pint.Quantity(value_processed[i], unit)
+
+                if not quantity.repeats and value_processed:
+                    value_processed = value_processed[0]
+
+                self._results[quantity.name] = value_processed
+            except Exception:
+                self.logger.warn('Error setting value for %s ' % quantity.name)
+                pass
+
+    def parse(self, key=None):
+        '''
+        Triggers parsing of all quantities if key is not provided.
+        '''
+        if self._results is None:
+            self._results = dict()
+
+        if self.file_mmap is None:
+            return self
+
+        if self.findall:
+            if len(self._results) > 1:
+                return self
+
+            n_results = 0
+            while True:
+                quantities_findall = [
+                    q for q in self.quantities if q.name not in self._results and q._sub_parser is None]
+                if not quantities_findall:
+                    break
+
+                # recursively parse quantities
+                self._parse_quantities(quantities_findall)
+
+                if n_results == len(self._results):
+                    break
+                n_results = len(self._results)
+
+            for quantity in self._quantities:
+                if quantity._sub_parser is not None:
+                    self._parse_quantity(quantity)
+
+        else:
+            for quantity in self._quantities:
+                if quantity.name == key or key is None:
+                    if quantity.name not in self._results:
+                        self._parse_quantity(quantity)
+
+        super().parse()
+        return self
diff --git a/nomad/parsing/file_parser/xml_parser.py b/nomad/parsing/file_parser/xml_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d9437ffed64bb5ee51b25b038b040f6587b4994
--- /dev/null
+++ b/nomad/parsing/file_parser/xml_parser.py
@@ -0,0 +1,148 @@
+# Copyright 2018 Markus Scheidgen
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import re
+import numpy as np
+from xml.etree import ElementTree
+
+from nomad.parsing.file_parser import FileParser
+
+
+class XMLParser(FileParser):
+    '''
+    Parser for XML files using ElementTree.
+
+    Arguments:
+        mainfile: the file to be parsed
+        logger: logger
+        convert: specifies if quantities are converted automatically
+    '''
+    def __init__(self, mainfile: str = None, logger=None, convert: bool = True):
+        super().__init__(mainfile, logger=logger)
+        self.convert = convert
+        self.init_parameters()
+
+    def init_parameters(self):
+        '''
+        Method to call after loading the xml file.
+        '''
+        self._elements = None
+
+    @property
+    def root(self):
+        '''
+        Returns the root of the XML tree.
+        '''
+        if self._file_handler is None:
+            if self.mainfile is None:
+                return
+            self._file_handler = ElementTree.parse(self.mainfile).getroot()
+            self.init_parameters()
+
+        return self._file_handler
+
+    @property
+    def elements(self):
+        '''
+        Returns a list of all elements in the XML tree.
+        '''
+        if self._elements is None:
+            self._elements = self.root.findall('.//')
+
+        return self._elements
+
+    def parse(self, key, convert=None):
+        '''
+        Parse a quantity identified by key or an xpath-style path. Automatic conversion
+        can be switched off by setting convert to False.
+        '''
+        _convert = convert if convert is not None else self.convert
+        if self._results is None:
+            self._results = dict()
+
+        if self.root is None:
+            return
+
+        key_in = key
+        key = key.lstrip('/')
+        if key.find('/') > 0:
+            parent = os.path.dirname(key)
+            child = os.path.basename(key)
+            elements = self.root.findall(os.path.join('./', parent))
+        else:
+            elements = self.elements
+            child = key
+
+        val = []
+        for element in elements:
+            if child:
+                v = element.attrib.get(child, None)
+                if v is None:
+                    v = element.findall(child)
+                    v = [e.text for e in v]
+                if v:
+                    val.append(v)
+
+            else:
+                val.append(element.attrib)
+
+        if not val:
+            return
+
+        def convert_value(val_in):
+            if isinstance(val_in, dict):
+                val = dict()
+                for k, v in val_in.items():
+                    val[k] = convert_value(v)
+
+            elif isinstance(val_in, str):
+                # exponential formatting
+                val = val_in.strip().split()
+                if len(val) == 1:
+                    val = val[0]
+                    re_float = r'(\d+\.\d+)d(\-\d+)'
+                    val = re.sub(re_float, r'\1e\2', val)
+                    if val.isdecimal():
+                        val = int(val)
+                    elif val == 'true' or val == 'false':
+                        val = val == 'true'
+                    else:
+                        try:
+                            val = float(val)
+                        except Exception:
+                            pass
+                else:
+                    val = [convert_value(v) for v in val]
+
+            elif isinstance(val_in, list):
+                try:
+                    val = [v.split() if isinstance(v, str) else v for v in val_in]
+                    val = [v[0] if (isinstance(v, list) and len(v) == 1) else v for v in val]
+                    val = np.array(val, dtype=float)
+                    if np.all(np.mod(val, 1) == 0):
+                        val = np.array(val, dtype=int)
+                except Exception:
+                    val = [convert_value(v) for v in val_in]
+
+            return val
+
+        if _convert:
+            val = convert_value(val)
+
+        val = val[0] if len(val) == 1 else val
+
+        self._results[key_in] = val
diff --git a/nomad/parsing/parsers.py b/nomad/parsing/parsers.py
index 854b3392ce12a5ef08c290fe7002dad0975e3717..b23d9f185aee507f0673ad174eeeb442421c841c 100644
--- a/nomad/parsing/parsers.py
+++ b/nomad/parsing/parsers.py
@@ -31,6 +31,7 @@ from vaspparser import VASPParser
 from phonopyparser import PhonopyParser
 from elasticparser import ElasticParser
 from lammpsparser import LammpsParser
+from gromacsparser import GromacsParser
 
 try:
     # these packages are not available without parsing extra, which is ok, if the
@@ -388,11 +389,7 @@ parsers = [
         parser_class_name='amberparser.AMBERParser',
         mainfile_contents_re=r'\s*Amber\s[0-9]+\s[A-Z]+\s*[0-9]+'
     ),
-    LegacyParser(
-        name='parsers/gromacs', code_name='Gromacs', domain='dft',
-        parser_class_name='gromacsparser.GROMACSParser',
-        mainfile_contents_re=r'GROMACS - gmx mdrun'
-    ),
+    GromacsParser(),
     LegacyParser(
         name='parsers/gromos', code_name='Gromos', domain='dft',
         parser_class_name='gromosparser.GromosParser',
diff --git a/nomad/parsing/text_parser.py b/nomad/parsing/text_parser.py
deleted file mode 100644
index 0eecd0ab433c0174eafc01b6ad8b92572190125b..0000000000000000000000000000000000000000
--- a/nomad/parsing/text_parser.py
+++ /dev/null
@@ -1,176 +0,0 @@
-#
-# Copyright The NOMAD Authors.
-#
-# This file is part of NOMAD. See https://nomad-lab.eu for further info.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-import re
-import mmap
-import logging
-
-
-class Quantity:
-    def __init__(self, name, re_pattern, str_operation=None, unit=None, dtype=None, comment=None):
-        self.name = name
-        self.re_pattern = re.compile(re_pattern.encode())
-        self.unit = unit
-        self.shape = None
-        self.dtype = dtype
-        self._value = None
-        self._str_operation = str_operation
-        self._comment = comment
-
-    @property
-    def value(self):
-        return self._value
-
-    @value.setter
-    def value(self, val_in):
-
-        def convert_value(val):
-            if self._comment is not None:
-                if val.strip()[0] == self._comment:
-                    return
-
-            if self._str_operation is not None:
-                val = self._str_operation(val)
-
-            else:
-                val = val.strip().split() if isinstance(val, str) else val
-                val = val[0] if len(val) == 1 else val
-
-            def _convert(val):
-                if isinstance(val, str):
-                    if self.dtype is None:
-                        if val.isdecimal():
-                            val = int(val)
-                        else:
-                            try:
-                                val = float(val)
-                            except Exception:
-                                pass
-
-                    self.shape = []
-                    return val
-
-                elif type(val) in [np.ndarray, list]:
-                    try:
-                        dtype = float if self.dtype is None else self.dtype
-                        val = np.array(val, dtype=dtype)
-                        self.dtype = dtype
-                        if np.all(np.mod(val, 1) == 0):
-                            val = np.array(val, dtype=int)
-                            self.dtype = int
-                        self.shape = list(np.shape(val))
-
-                    except Exception:
-                        val = [_convert(v) for v in val]
-                        self.dtype = None
-
-                    return val
-
-                else:
-                    self.dtype = type(val)
-                    self.shape = []
-                    return val
-
-            val = _convert(val)
-
-            return val
-
-        self._value = None if val_in is None else [convert_value(val) for val in val_in]
-
-    def to_SI(self):
-        pass
-
-
-class UnstructuredTextFileParser:
-    def __init__(self, mainfile, quantities, logger=None):
-        self._mainfile = mainfile
-        self.quantities = quantities
-        self._file_mmap = None
-        self.logger = logger if logger else logging
-
-    @property
-    def quantities(self):
-        return self._quantities
-
-    @quantities.setter
-    def quantities(self, val):
-        self._quantities = val
-        self.quantities_mapping = {val[idx].name: idx for idx in range(len(val))}
-
-    @property
-    def file_mmap(self):
-        if self._file_mmap is None:
-            with open(self.mainfile) as f:
-                self._file_mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
-        return self._file_mmap
-
-    @property
-    def mainfile(self):
-        return self._mainfile
-
-    @mainfile.setter
-    def mainfile(self, val):
-        for quantity in self.quantities:
-            quantity.value = None
-
-        self._file_mmap = None
-        self._mainfile = val
-
-    def __getitem__(self, key):
-        idx = self.quantities_mapping.get(key, None)
-        if idx is None:
-            return
-
-        value = self.quantities[idx].value
-        if value is None:
-            self.parse(key)
-
-        return self.quantities[idx].value
-
-    def __setitem__(self, key, val):
-        idx = self.quantities_mapping.get(key, None)
-        if idx is None:
-            return
-
-        self.quantities[idx].value = val
-
-    def keys(self):
-        return self.quantities_mapping.keys()
-
-    def items(self):
-        for key in self.keys():
-            yield key, self[key]
-
-    def parse(self, key=None):
-        if isinstance(key, str):
-            key = [key]
-
-        for quantity in self.quantities:
-            if key is None or quantity.name in key:
-                value = []
-                for res in quantity.re_pattern.finditer(self.file_mmap):
-                    value.append(''.join([group.decode() for group in res.groups() if group]))
-                if not value:
-                    continue
-
-                try:
-                    quantity.value = value
-                except Exception:
-                    self.logger.warn('Error setting value for %s ' % quantity.name)
-                    pass
diff --git a/tests/conftest.py b/tests/conftest.py
index d371c2762c5212f0c7e6a61bad8d59defdff8989..d4ea0b531ef4ab8949177829ab2b2f1043fca433 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -40,7 +40,7 @@ from nomad.datamodel import EntryArchive
 from nomad.utils import structlogging
 from nomad.datamodel import User
 
-from tests import test_parsing
+from tests.parsing import test_parsing
 from tests.normalizing.conftest import run_normalize
 from tests.processing import test_data as test_processing
 from tests.test_files import example_file, empty_file
diff --git a/tests/normalizing/conftest.py b/tests/normalizing/conftest.py
index 7cae67046e15e8bdf873c469dfce8ff1f5910126..a0c0e5e58e074e258bea605cf048def875076d52 100644
--- a/tests/normalizing/conftest.py
+++ b/tests/normalizing/conftest.py
@@ -24,11 +24,11 @@ from nomad.normalizing import normalizers
 from nomad.datamodel import EntryArchive
 from nomad.datamodel.metainfo.public import section_system as System
 
-from tests.test_parsing import parsed_vasp_example  # pylint: disable=unused-import
-from tests.test_parsing import parsed_template_example  # pylint: disable=unused-import
-from tests.test_parsing import parsed_example  # pylint: disable=unused-import
-from tests.test_parsing import parse_file
-from tests.test_parsing import parsed_template_no_system  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_vasp_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_template_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parse_file
+from tests.parsing.test_parsing import parsed_template_no_system  # pylint: disable=unused-import
 
 
 def run_normalize(entry_archive: EntryArchive) -> EntryArchive:
diff --git a/tests/normalizing/test_system.py b/tests/normalizing/test_system.py
index 9f82e7de9c45ee43f2848cdc93bf5061691d481f..abefa996e65e707d43d61842924f4ee9cbd5a51a 100644
--- a/tests/normalizing/test_system.py
+++ b/tests/normalizing/test_system.py
@@ -23,11 +23,11 @@ from nomad.datamodel import EntryArchive
 from nomad.app import dump_json
 from nomad.datamodel.metainfo.public import section_springer_material as SpringerMaterial
 
-from tests.test_parsing import parsed_vasp_example  # pylint: disable=unused-import
-from tests.test_parsing import parsed_template_example  # pylint: disable=unused-import
-from tests.test_parsing import parsed_example  # pylint: disable=unused-import
-from tests.test_parsing import parsed_template_no_system  # pylint: disable=unused-import
-from tests.test_parsing import parse_file
+from tests.parsing.test_parsing import parsed_vasp_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_template_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_example  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parsed_template_no_system  # pylint: disable=unused-import
+from tests.parsing.test_parsing import parse_file
 from tests.normalizing.conftest import run_normalize, run_normalize_for_structure   # pylint: disable=unused-import
 from tests.utils import assert_log
 
@@ -168,17 +168,17 @@ def test_system_classification(atom, molecule, one_d, two_d, surface, bulk):
     """Tests that the system classification is correct for different kind of systems
     """
     # Atom
-    assert atom.section_run[0].section_system[0].system_type == "atom"
+    assert atom.section_run[0].section_system[-1].system_type == "atom"
     # Molecule
-    assert molecule.section_run[0].section_system[0].system_type == "molecule / cluster"
+    assert molecule.section_run[0].section_system[-1].system_type == "molecule / cluster"
     # 1D system
-    assert one_d.section_run[0].section_system[0].system_type == "1D"
+    assert one_d.section_run[0].section_system[-1].system_type == "1D"
     # 2D system
-    assert two_d.section_run[0].section_system[0].system_type == "2D"
+    assert two_d.section_run[0].section_system[-1].system_type == "2D"
     # Surface
-    assert surface.section_run[0].section_system[0].system_type == "surface"
+    assert surface.section_run[0].section_system[-1].system_type == "surface"
     # Bulk system
-    assert bulk.section_run[0].section_system[0].system_type == "bulk"
+    assert bulk.section_run[0].section_system[-1].system_type == "bulk"
 
 
 def test_representative_systems(single_point, molecular_dynamics, geometry_optimization, phonon):
diff --git a/tests/normalizing/test_workflow.py b/tests/normalizing/test_workflow.py
index 537566689911b843430b3e94e1319229186f8dd9..89e333d9da1d9b9e8e3096dfdc62fc80a9131061 100644
--- a/tests/normalizing/test_workflow.py
+++ b/tests/normalizing/test_workflow.py
@@ -18,7 +18,7 @@
 
 import pytest
 
-from tests.test_parsing import parse_file
+from tests.parsing.test_parsing import parse_file
 from .conftest import run_normalize
 
 
diff --git a/tests/parsing/test_file_parser.py b/tests/parsing/test_file_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..b71b77960eabb7f0c2f694626a5c9be0e13e1f59
--- /dev/null
+++ b/tests/parsing/test_file_parser.py
@@ -0,0 +1,307 @@
+import pytest
+import numpy as np
+import pint
+
+from nomad.parsing.file_parser import UnstructuredTextFileParser, Quantity, ParsePattern,\
+    XMLParser
+from nomad.datamodel.metainfo.public import section_system
+
+
+class TestUnstructuredTextFileParser:
+    @pytest.fixture(scope='class')
+    def mainfile(self):
+        return 'tests/data/parsers/exciting/Ag/INFO.OUT'
+
+    @pytest.fixture(scope='class')
+    def mainfile2(self):
+        return 'tests/data/parsers/exciting/GW/INFO.OUT'
+
+    @pytest.fixture(scope='class')
+    def quantity_string(self):
+        return dict(
+            quantity=Quantity('spin', r'Spin treatment\s*:\s*([\w\-]+)', repeats=False),
+            value='spin-unpolarised')
+
+    @pytest.fixture(scope='class')
+    def quantity_float(self):
+        return dict(
+            quantity=Quantity('cell_volume', r'Unit cell volume\s*:\s*([\d\.]+)', repeats=False),
+            value=115.0293819379)
+
+    @pytest.fixture(scope='class')
+    def quantity_array(self):
+        return dict(
+            quantity=Quantity(
+                'lattice_vector',
+                r'Lattice vectors \(cartesian\) :\s*([\d\s\.]+)', repeats=False, shape=(3, 3)),
+            value=np.array([
+                [3.86005, 3.86005, 0], [3.86005, 0, 3.86005], [0, 3.86005, 3.86005]]))
+
+    @pytest.fixture(scope='class')
+    def quantity_repeats(self):
+        return dict(
+            quantity=Quantity('total_energy', r'Total energy\s*:\s*([\d\.\-]+)'),
+            value=np.array([
+                -5307.34855605, -5313.90710687, -5315.97055490, -5316.38701749,
+                -5317.59994092, -5317.26163104, -5317.26791647, -5317.26750374,
+                -5317.26724651, -5317.26725951, -5317.26726114, -5317.26726119,
+                -5317.26726118]))
+
+    @pytest.fixture(scope='class')
+    def quantity_with_unit(self):
+        return dict(
+            quantity=Quantity('wall_time', r'Wall time \((?P<__unit>\w+)\)\s*:\s*([\d\.]+)'),
+            value=[pint.Quantity(v, 'seconds') for v in [
+                3.55, 5.32, 7.09, 8.84, 10.58, 12.33, 14.09, 15.84, 17.58,
+                19.33, 21.09, 22.91]])
+
+    @pytest.fixture(scope='function')
+    def parser(self, mainfile):
+        return UnstructuredTextFileParser(mainfile=mainfile)
+
+    def test_mainfile_setter(self, parser, mainfile2):
+        parser.quantities = [Quantity(
+            'time', r'Time \(hh:mm:ss\)\s*:\s*([\d:]+)', repeats=False)]
+        assert parser.get('time') == '19:10:23'
+        parser.mainfile = mainfile2
+        assert parser.get('time') == '08:24:03'
+
+    def test_constructor(self, mainfile, quantity_string):
+        class TestParser(UnstructuredTextFileParser):
+            def __init__(self, **kwargs):
+                super().__init__(**kwargs)
+
+            def init_quantities(self):
+                self.quantities = [quantity_string['quantity']]
+
+        parser = TestParser(mainfile=mainfile)
+        assert parser.get(quantity_string['quantity'].name) == quantity_string['value']
+
+    def test_copy(self, parser, quantity_string):
+        parser.quantities = [quantity_string['quantity']]
+        parser.parse()
+        parser2 = parser.copy()
+        assert parser2.mainfile == parser.mainfile
+        assert parser2.quantities == parser.quantities
+        assert parser2._results != parser._results
+
+    def test_findall(self, parser, quantity_string, quantity_float, quantity_repeats):
+        parser.quantities = [
+            q['quantity'] for q in [quantity_string, quantity_float, quantity_repeats]]
+        assert parser.findall
+        spin = parser.get(quantity_string['quantity'].name)
+        volume = parser.get(quantity_float['quantity'].name)
+        energies = parser.get(quantity_repeats['quantity'].name)
+
+        parser_finditer = parser.copy()
+        parser_finditer.findall = False
+        assert parser_finditer._results is None
+        assert parser_finditer.get(quantity_string['quantity'].name) == spin
+        assert parser_finditer.get(quantity_float['quantity'].name) == volume
+        assert parser_finditer.get(quantity_repeats['quantity'].name) == energies
+
+    def test_finditer(self, parser, quantity_string, quantity_float, quantity_with_unit):
+        # with findall=False, quantities are parsed lazily, one at a time on access
+        parser.quantities = [q['quantity'] for q in [
+            quantity_string, quantity_float, quantity_with_unit]]
+        parser.findall = False
+        count = 0
+        for q in [quantity_string, quantity_float, quantity_with_unit]:
+            count += 1
+            assert parser.get(q['quantity'].name) == q['value']
+            assert len(parser._results) == count
+
+    def test_quantity_sub_parser(self, parser):
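+        # sub_parser runs a nested UnstructuredTextFileParser on each matched
+        # block, here for the initialization, species and scf sections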
+        quantity_species = Quantity(
+            'species', r'cies :\s*([\s\S]+?)(?:Spe|Total)', repeats=True,
+            sub_parser=UnstructuredTextFileParser(quantities=[
+                Quantity('name', r'name\s*:\s*(\w+)', repeats=False),
+                Quantity('mass', r'atomic mass\s*:\s*([\d\.]+)', repeats=False)]))
+
+        quantity_initialization = Quantity(
+            'initialization',
+            r'Starting initialization([\s\S]+?)Ending initialization', repeats=False,
+            sub_parser=UnstructuredTextFileParser(quantities=[
+                Quantity('k_point_grid', r'k\-point grid\s*:\s*([\d ]+)', repeats=False),
+                quantity_species]))
+
+        quantity_scf = Quantity(
+            'scf', r'Self\-consistent loop started([\s\S]+?)Self\-consistent loop stopped',
+            repeats=True, sub_parser=UnstructuredTextFileParser(quantities=[
+                Quantity('iteration', r'SCF iteration number\s*:\s*(\d+)')]))
+
+        parser.quantities = [
+            quantity_initialization, quantity_scf, Quantity(
+                'total_time', r'Total time spent \(seconds\)\s*:\s*([\d.]+)', repeats=False)]
+
+        initialization = parser.get('initialization')
+
+        assert (initialization.get('k_point_grid') == np.array([4, 4, 4])).all()
+        species = initialization.get('species')
+        assert len(species) == 1
+        assert species[0].get('name') == 'silver'
+        assert species[0].get('mass') == 196631.6997
+
+        scf = parser.get('scf')
+        assert len(scf) == 1
+        assert len(scf[0].get('iteration')) == 12
+
+        assert parser.get('total_time') == 22.4
+
+    def test_block_short(self, parser, quantity_repeats):
+        parser.quantities = [Quantity(
+            'scf', r'SCF iteration number\s*:\s*\d+([\s\S]+?)Wall time',
+            repeats=True, sub_parser=UnstructuredTextFileParser(quantities=[
+                quantity_repeats.get('quantity')]))]
+
+        scf = parser.get('scf')
+        assert len(scf) == 12
+        energies = quantity_repeats.get('value')
+        # total_energy keeps its default repeats=True to confirm that only one
+        # energy per scf block is read
+        for i in range(len(scf)):
+            assert scf[i].get(quantity_repeats.get('quantity').name) == [energies[i]]
+
+    def test_get_default(self, parser, quantity_float):
+        parser.quantities = [quantity_float['quantity']]
+        volume = parser.get('volume', 10.00)
+        assert volume == 10.00
+
+    def test_get_unit(self, parser, quantity_float):
+        parser.quantities = [quantity_float['quantity']]
+        volume = parser.get('cell_volume', unit='angstrom')
+        assert isinstance(volume, pint.Quantity)
+        assert volume.units == 'angstrom'
+        assert volume.magnitude == quantity_float['value']
+
+    def test_quantity_unit(self, parser, quantity_with_unit, quantity_repeats, quantity_string):
+        parser.quantities = [q['quantity'] for q in [
+            quantity_with_unit, quantity_repeats, quantity_string]]
+        for q in [quantity_with_unit, quantity_string, quantity_repeats]:
+            equal = parser.get(q['quantity'].name) == q['value']
+            if isinstance(equal, np.ndarray):
+                equal = equal.all()
+            assert equal
+
+    def test_quantity_conversion(self, parser, quantity_float):
+        quantity = quantity_float.get('quantity')
+        quantity.convert = False
+        parser.quantities = [quantity]
+        assert parser.get(quantity.name) == str(quantity_float.get('value'))
+
+        quantity.convert = True
+        parser = parser.copy()
+        parser.quantities = [quantity]
+        assert parser.get(quantity.name) == quantity_float.get('value')
+
+    def test_quantity_parse_pattern(self, parser):
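+        # ParsePattern assembles the regular expression from a key phrase and a
+        # named value pattern (re_float, re_int, re_int_array, re_string)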
+        parser.quantities = [
+            Quantity(
+                'BZ_volume', ParsePattern(key='Brillouin zone volume', value='re_float'),
+                repeats=False),
+            Quantity(
+                'n_crystal_symmetry', ParsePattern(
+                    key='Number of crystal symmetries', value='re_int'), repeats=False),
+            Quantity(
+                'g_vector_size', ParsePattern(
+                    key='G-vector grid sizes', value='re_int_array'), repeats=False),
+            Quantity(
+                'smearing', ParsePattern(
+                    key='Smearing scheme', value='re_string'), repeats=False)]
+
+        assert parser.get('BZ_volume') == 2.1564074262
+        assert parser.get('n_crystal_symmetry') == 48
+        assert (parser.get('g_vector_size') == np.array([24, 24, 24])).all()
+        assert parser.get('smearing') == 'Gaussian'
+
+    def test_quantity_str_operation(self, parser):
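+        # a custom str_operation maps the matched block to a dict of angular
+        # momentum cutoffs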
+        def str_to_max_lm(string):
+            val = [v.split(':') for v in string.split('\n') if v]
+            res = {v[0].strip(): v[1] for v in val}
+            return res
+
+        parser.quantities = [Quantity(
+            'max_lm',
+            r'Maximum angular momentum used for([\s\S]+?inner part of muffin\-tin\s*:\s*\d+)',
+            repeats=False, str_operation=str_to_max_lm)]
+
+        max_lm = parser.get('max_lm')
+        assert max_lm.get('APW functions') == 8
+        assert max_lm.get('computing H and O matrix elements') == 8
+        assert max_lm.get('potential and density') == 8
+        assert max_lm.get('inner part of muffin-tin') == 2
+
+    def test_quantity_repeats(self, parser, quantity_repeats, mainfile):
+        quantity = quantity_repeats.get('quantity')
+        parser.quantities = [quantity]
+        assert isinstance(parser.get(quantity.name), list)
+        quantity.repeats = False
+        parser.mainfile = mainfile
+        parser.quantities = [quantity]
+        assert isinstance(parser.get(quantity.name), float)
+
+    def test_quantity_unit_array(self, parser, quantity_array):
+        quantity = quantity_array.get('quantity')
+        quantity.unit = 'angstrom'
+        parser.quantities = [quantity]
+        lattice_vector = parser.get(quantity.name)
+        assert isinstance(lattice_vector, pint.Quantity)
+        assert lattice_vector.units == 'angstrom'
+        assert (lattice_vector.magnitude == quantity_array.get('value')).all()
+
+    def test_quantity_metainfo(self, parser):
+        quantity = Quantity(
+            section_system.lattice_vectors,
+            r'Lattice vectors \(cartesian\) :\s*([\d\s\.]+)', repeats=False)
+
+        parser.quantities = [quantity]
+        lattice_vectors = parser.get(quantity.name)
+        assert list(lattice_vectors.shape) == section_system.lattice_vectors.shape
+        assert lattice_vectors.dtype == section_system.lattice_vectors.type
+
+
+class TestXMLParser:
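+    # Exercises path-style key access (e.g. 'kpoints/varray[1]/v') on the VASP
+    # example file vasp.xml.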
+    @pytest.fixture(scope='class')
+    def mainfile(self):
+        return 'tests/data/parsers/vasp/vasp.xml'
+
+    @pytest.fixture(scope='function')
+    def parser(self, mainfile):
+        return XMLParser(mainfile)
+
+    def test_constructor(self, mainfile):
+        class TestParser(XMLParser):
+            def __init__(self, **kwargs):
+                super().__init__(**kwargs)
+
+        test_parser = TestParser(mainfile=mainfile)
+
+        incar = dict(zip(test_parser.get('incar/i/name'), test_parser.get('incar/i')))
+        assert incar['SYSTEM'] == 'SrTiO3'
+        assert incar['ISMEAR'] == 0
+        assert incar['SIGMA'] == 0.1
+
+    def test_parse(self, parser):
+        k_points = parser.get('kpoints/varray[1]/v')
+        assert isinstance(k_points, np.ndarray)
+        assert k_points.shape == (35, 3)
+
+        weights = parser.get('kpoints/varray[2]/v')
+        assert isinstance(weights, np.ndarray)
+        assert weights.shape == (35,)
+
+        sc_energies = parser.get('calculation/scstep/energy/i')
+        assert len(sc_energies) == 13
+
+    def test_conversion(self, parser, mainfile):
+        parser.convert = False
+
+        assert parser.get('atominfo/atoms') == ['       5 ']
+        assert parser.get('structure[1]/crystal[1]/varray[1]/v[1]') == [
+            '       4.00419668       0.00000000       0.00000000 ']
+
+        parser.mainfile = mainfile
+        assert parser.get('atominfo/atoms', convert=True) == 5
+        parser.convert = True
+        assert (parser.get('structure[1]/crystal[1]/varray[1]/v[1]') == np.array([
+            4.00419668, 0.0, 0.0])).all()
diff --git a/tests/test_parsing.py b/tests/parsing/test_parsing.py
similarity index 100%
rename from tests/test_parsing.py
rename to tests/parsing/test_parsing.py