__init__.py 13 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
16
The *parsing* module is an interface for the existing NOMAD-coe parsers.
17
18
This module redefines some of the old NOMAD-coe python-common functionality to create a
more coherent interface to the parsers.
19
20
21
22
23
24
25
26

Assumption about parsers
------------------------
For now, we make a few assumptions about parsers
- they always work on the same *meta-info* version
- they have no conflicting python requirements
- they can be loaded at the same time and can be used within the same python process
- they are uniquely identified by a GIT URL and publicly accessible
27
- their version is uniquely identified by a GIT commit SHA
28
29
30
31
32
33

Each parser is defined via an instance of :class:`Parser`.

.. autoclass:: nomad.parsing.Parser
    :members:

Markus Scheidgen's avatar
Markus Scheidgen committed
34
35
36
37
The implementation :class:`LegacyParser` is used for most NOMAD-coe parsers.

.. autoclass:: nomad.parsing.LegacyParser

38
39
40
41
42
The parser definitions are available via the following two variables.

.. autodata:: nomad.parsing.parsers
.. autodata:: nomad.parsing.parser_dict

Markus Scheidgen's avatar
Markus Scheidgen committed
43
44
Parsers are reused for multiple calculations.

45
46
47
48
Parsers and calculation files are matched via regular expressions.

.. autofunction:: nomad.parsing.match_parser

Markus Scheidgen's avatar
Markus Scheidgen committed
49
Parsers in NOMAD-coe use a *backend* to create output. There are different NOMAD-coe
Markus Scheidgen's avatar
Markus Scheidgen committed
50
backends. In nomad@FAIRDI, we currently only use a single backend: a version of
Markus Scheidgen's avatar
Markus Scheidgen committed
51
52
53
NOMAD-coe's *LocalBackend*. It stores all parser results in memory. The following
classes provide an interface definition for *backends* as an ABC and a concrete implementation
based on NOMAD-coe's *python-common* module.
54
55
56
57
58

.. autoclass:: nomad.parsing.AbstractParserBackend
    :members:
.. autoclass:: nomad.parsing.LocalBackend
    :members:
Markus Scheidgen's avatar
Markus Scheidgen committed
59

60
"""
61
62
from typing import Callable, IO
import magic
63
64
65
66
import gzip
import bz2

from nomad import files
67
68

from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
69
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser
70
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
71

72

73
74
75
76
77
78
79
# Lookup table keyed by the first three bytes of a file. Each value is a pair of
# (short compression name, function used to open a file with that compression).
_compressions = {
    # 0x1f 0x8b 0x08 are the leading bytes checked for gzip files
    b'\x1f\x8b\x08': ('gz', gzip.open),
    # 0x42 0x5a 0x68 is ASCII 'BZh', the leading bytes checked for bzip2 files
    b'\x42\x5a\x68': ('bz2', bz2.open),
}


def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser':
    """
    Performs parser matching. This means it takes the given mainfile, reads its
    beginning from the given upload files, and tries to identify a parser that can
    parse the file.

    This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml),
    and beginning file contents.

    Arguments:
        mainfile: The upload relative path to the mainfile
        upload_files: The upload files object that the mainfile is read from

    Returns: The parser, or None if no parser could be matched.
    """
    # Look at the first three bytes to decide whether the file has to be opened
    # with gzip/bz2; anything else is opened as a regular file.
    with upload_files.raw_file(mainfile, 'rb') as f:
        compression, open_compressed = _compressions.get(f.read(3), (None, open))

    # Only the (decompressed) head of the file is used for matching.
    mainfile_path = upload_files.raw_file_object(mainfile).os_path
    with open_compressed(mainfile_path, 'rb') as f:
        buffer = f.read(2048)

    # Non-XML 'application/...' content is never handed to any parser.
    mime_type = magic.from_buffer(buffer, mime=True)
    if mime_type.startswith('application') and not mime_type.endswith('xml'):
        return None

    # Decode once for all parsers. The head may end in the middle of a multi-byte
    # sequence or may not be UTF-8 at all (e.g. ISO-8859-1 files); undecodable bytes
    # are replaced instead of raising, so that matching can still take place.
    contents = buffer.decode('utf-8', errors='replace')

    for parser in parsers:
        if parser.is_mainfile(mainfile_path, mime_type, contents, compression):
            # TODO: deal with multiple possible parser specs
            return parser

    return None


113
# The order of this list matters: match_parser returns the first entry whose
# is_mainfile accepts the file.
parsers = [
    GenerateRandomParser(),
    TemplateParser(),
    ChaosParser(),
    LegacyParser(
        name='parsers/vasp',
        parser_class_name='vaspparser.VASPRunParserInterface',
        mainfile_mime_re=r'(application/xml)|(text/.*)',
        mainfile_contents_re=(
            r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*'
            r'?\s*<modeling>'
            r'?\s*<generator>'
            r'?\s*<i name="program" type="string">\s*vasp\s*</i>'
            r'?'),
        supported_compressions=['gz', 'bz2']
    ),
    # NOTE(review): uses the same name as the VASP XML parser above; a lookup keyed
    # by name can presumably only hold one of the two -- confirm this is intended.
    VaspOutcarParser(
        name='parsers/vasp',
        parser_class_name='vaspparser.VaspOutcarParser',
        mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?',
        mainfile_contents_re=(r'^\svasp\.')
    ),
    LegacyParser(
        name='parsers/exciting',
        parser_class_name='excitingparser.ExcitingParser',
        # NOTE(review): the trailing '?' makes the final 'T' optional, i.e. the
        # pattern also accepts '.../INFO.OU' -- confirm this is intended.
        mainfile_name_re=r'^.*/INFO\.OUT?',
        mainfile_contents_re=(
            r'^\s*=================================================+\s*'
            r'\s*\|\s*EXCITING\s+\S+\s+started\s*='
            r'\s*\|\s*version hash id:\s*\S*\s*=')
    ),
    LegacyParser(
        name='parsers/fhi-aims',
        parser_class_name='fhiaimsparser.FHIaimsParser',
        mainfile_contents_re=(
            r'^(.*\n)*'
            r'?\s*Invoking FHI-aims \.\.\.'
            r'?\s*Version')
    ),
    LegacyParser(
        name='parsers/cp2k',
        parser_class_name='cp2kparser.CP2KParser',
        mainfile_contents_re=(
            r'\*\*\*\* \*\*\*\* \*\*\*\*\*\*  \*\*  PROGRAM STARTED AT\s.*\n'
            r' \*\*\*\*\* \*\* \*\*\*  \*\*\* \*\*   PROGRAM STARTED ON\s*.*\n'
            r' \*\*    \*\*\*\*   \*\*\*\*\*\*    PROGRAM STARTED BY .*\n'
            r' \*\*\*\*\* \*\*    \*\* \*\* \*\*   PROGRAM PROCESS ID .*\n'
            r'  \*\*\*\* \*\*  \*\*\*\*\*\*\*  \*\*  PROGRAM STARTED IN .*\n'
        )
    ),
    LegacyParser(
        name='parsers/crystal',
        parser_class_name='crystalparser.CrystalParser',
        mainfile_contents_re=(
            r'\s*[\*]{22,}'  # Looks for '*' 22 times or more in a row.
            r'\s*\*\s{20,}\*'  # Looks for a '*' sandwiched by whitespace.
            r'\s*\*\s{10,}CRYSTAL(?P<majorVersion>[\d]+)\s{10,}\*'
            r'\s*\*\s{10,}public \: (?P<minorVersion>[\d\.]+) \- .*\*'
        )
    ),
    # The main contents regex of CPMD was causing a catastrophic backtracking issue
    # when searching through the first bytes of main files. We decided
    # to use only a portion of the regex to avoid that issue.
    LegacyParser(
        name='parsers/cpmd',
        parser_class_name='cpmdparser.CPMDParser',
        mainfile_contents_re=(
            # r'\s+\*\*\*\*\*\*  \*\*\*\*\*\*    \*\*\*\*  \*\*\*\*  \*\*\*\*\*\*\s*'
            # r'\s+\*\*\*\*\*\*\*  \*\*\*\*\*\*\*   \*\*\*\*\*\*\*\*\*\*  \*\*\*\*\*\*\*\s+'
            r'\*\*\*       \*\*   \*\*\*  \*\* \*\*\*\* \*\*  \*\*   \*\*\*'
            # r'\s+\*\*        \*\*   \*\*\*  \*\*  \*\*  \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*        \*\*\*\*\*\*\*   \*\*      \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*\*       \*\*\*\*\*\*    \*\*      \*\*  \*\*   \*\*\*\s+'
            # r'\s+\*\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\*\s+'
            # r'\s+\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\s+'
        )
    ),
    LegacyParser(
        name='parsers/nwchem',
        parser_class_name='nwchemparser.NWChemParser',
        mainfile_contents_re=(
            r'\s+Northwest Computational Chemistry Package \(NWChem\) \d+\.\d+'
            r'\s+------------------------------------------------------'
            r'\s+Environmental Molecular Sciences Laboratory'
            r'\s+Pacific Northwest National Laboratory'
            r'\s+Richland, WA 99352'
        )
    ),
    LegacyParser(
        name='parsers/bigdft',
        parser_class_name='bigdftparser.BigDFTParser',
        mainfile_contents_re=(
            # Only one line of the banner is matched; the rest is kept for reference.
            # r'__________________________________ A fast and precise DFT wavelet code\s*'
            # r'\|     \|     \|     \|     \|     \|\s*'
            # r'\|     \|     \|     \|     \|     \|      BBBB         i       gggggg\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|     B    B               g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B     B        i     g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B    B         i     g        g\s*'
            r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB          i     g         g\s*'
            # r'\|  :  \|     \|     \|  :  \|     \|    B    B         i     g         g\s*'
            # r'\|--\+0-\|     \|     \|-0\+--\|     \|    B     B     iiii     g         g\s*'
            # r'\|__:__\|_____\|_____\|__:__\|_____\|    B     B        i      g        g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B BBBB        i        g      g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B        iiiii          gggggg\s*'
            # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*'
            # r'\|     \|     \|     \|  :  \|     \|                           TTTTTTTTT\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|  DDDDDD          FFFFF        T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\| D      D        F        TTTT T\s*'
            # r'\|     \|     \|     \|  :  \|     \|D        D      F        T     T\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|D         D     FFFF     T     T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\|D___      D     F         T    T\s*'
            # r'\|     \|     \|  :  \|     \|     \|D         D     F          TTTTT\s*'
            # r'\|     \|     \|--\+0-\|     \|     \| D        D     F         T    T\s*'
            # r'\|_____\|_____\|__:__\|_____\|_____\|          D     F        T     T\s*'
            # r'\|     \|     \|     \|     \|     \|         D               T    T\s*'
            # r'\|     \|     \|     \|     \|     \|   DDDDDD       F         TTTT\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|______                    www\.bigdft\.org'
        )
    ),
    LegacyParser(
        name='parsers/wien2k',
        parser_class_name='wien2kparser.Wien2kParser',
        mainfile_contents_re=r':LABEL\d+: using WIEN2k_\d+\.\d+'
    ),
    LegacyParser(
        name='parsers/band',
        parser_class_name='bandparser.BANDParser',
        mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'),
    LegacyParser(
        name='parsers/gaussian',
        parser_class_name='gaussianparser.GaussianParser',
        mainfile_contents_re=(
            r'\s*Cite this work as:'
            r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9.]*,'
            r'\s\*\*\*\*\*\*\*\*\*\*\*\**'
            r'\s*Gaussian\s*([0-9]+):\s*([A-Za-z0-9-.]+)\s*([0-9][0-9]?\-[A-Z][a-z][a-z]\-[0-9]+)'
            r'\s*([0-9][0-9]?\-[A-Z][a-z][a-z]\-[0-9]+)')
    ),
    LegacyParser(
        name='parsers/quantumespresso',
        parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF',
        mainfile_contents_re=(
            r'^\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+'
            r'(\d+)\s*\))?\s+starts[^\n]+'
            r'(?:\s*\n?)*This program is part of the open-source Quantum')
    ),
    LegacyParser(
        name='parsers/abinit',
        parser_class_name='abinitparser.AbinitParser',
        mainfile_contents_re=(r'^\n\.Version\s*[0-9.]*\s*of ABINIT\s*')
    ),
    LegacyParser(
        name='parsers/orca',
        parser_class_name='orcaparser.OrcaParser',
        mainfile_contents_re=(
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s+\* O   R   C   A \*\s*'
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*'
            r'\s*--- An Ab Initio, DFT and Semiempirical electronic structure package ---\s*')
    ),
    LegacyParser(
        name='parsers/castep',
        parser_class_name='castepparser.CastepParser',
        mainfile_contents_re=(r'\s\|\s*CCC\s*AA\s*SSS\s*TTTTT\s*EEEEE\s*PPPP\s*\|\s*')
    ),
    LegacyParser(
        name='parsers/dl-poly',
        parser_class_name='dlpolyparser.DlPolyParserWrapper',
        mainfile_contents_re=(r'\*\* DL_POLY \*\*')
    ),
    LegacyParser(
        name='parsers/lib-atoms',
        parser_class_name='libatomsparser.LibAtomsParserWrapper',
        mainfile_contents_re=(r'\s*<GAP_params\s')
    ),
    LegacyParser(
        name='parsers/octopus',
        parser_class_name='octopusparser.OctopusParserWrapper',
        mainfile_contents_re=(r'\|0\) ~ \(0\) \|')
        # We decided to use the octopus eyes instead of
        # r'\*{32} Grid \*{32}Simulation Box:' since it was so far down in the file.
    ),
    LegacyParser(
        name='parsers/phonopy',
        parser_class_name='phonopyparser.PhonopyParserWrapper',
        mainfile_contents_re=r'',  # Empty regex since this code calls other DFT codes.
        # In a raw string a single backslash is enough to escape the dot; the
        # previous '\\.' required a literal backslash in the file name and could
        # therefore never match 'control.in'.
        mainfile_name_re=(r'.*/phonopy-FHI-aims-displacement-0*1/control\.in$')
    )
]

""" Instantiation and constructor based config of all parsers. """

Markus Scheidgen's avatar
Markus Scheidgen committed
307
parser_dict = {entry.name: entry for entry in parsers}  # type: ignore
"""
A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'.
If several parsers have the same name, the one that comes last in ``parsers`` is kept.
"""