__init__.py 15.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
16
The *parsing* module is an interface for the existing NOMAD-coe parsers.
17
18
This module redefines some of the old NOMAD-coe python-common functionality to create a
more coherent interface to the parsers.
19
20
21
22
23
24
25
26

Assumption about parsers
------------------------
For now, we make a few assumptions about parsers
- they always work on the same *meta-info* version
- they have no conflicting python requirements
- they can be loaded at the same time and can be used within the same python process
- they are uniquely identified by a GIT URL and publicly accessible
27
- their version is uniquely identified by a GIT commit SHA
28
29
30
31
32
33

Each parser is defined via an instance of :class:`Parser`.

.. autoclass:: nomad.parsing.Parser
    :members:

Markus Scheidgen's avatar
Markus Scheidgen committed
34
35
36
37
The implementation :class:`LegacyParser` is used for most NOMAD-coe parsers.

.. autoclass:: nomad.parsing.LegacyParser

38
39
40
41
42
The parser definitions are available via the following two variables.

.. autodata:: nomad.parsing.parsers
.. autodata:: nomad.parsing.parser_dict

Markus Scheidgen's avatar
Markus Scheidgen committed
43
44
Parsers are reused for multiple calculations.

45
46
47
48
Parsers and calculation files are matched via regular expressions.

.. autofunction:: nomad.parsing.match_parser

Markus Scheidgen's avatar
Markus Scheidgen committed
49
Parsers in NOMAD-coe use a *backend* to create output. There are different NOMAD-coe
Markus Scheidgen's avatar
Markus Scheidgen committed
50
backends. In nomad@FAIRDI, we currently only use a single backend, a version of
Markus Scheidgen's avatar
Markus Scheidgen committed
51
52
53
NOMAD-coe's *LocalBackend*. It stores all parser results in memory. The following
classes provide an interface definition for *backends* as an ABC and a concrete implementation
based on NOMAD-coe's *python-common* module.
54
55
56
57
58

.. autoclass:: nomad.parsing.AbstractParserBackend
    :members:
.. autoclass:: nomad.parsing.LocalBackend
    :members:
Markus Scheidgen's avatar
Markus Scheidgen committed
59

60
"""
61
62
from typing import Callable, IO
import magic
63
64
65
66
import gzip
import bz2

from nomad import files
67
68

from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
69
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser
70
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
71

72

73
74
75
76
77
78
79
# Maps the first three bytes of a raw file to a tuple of
# (compression name, function used to open the file with that compression).
# match_parser falls back to (None, open) when the leading bytes are not a key here.
_compressions = {
    b'\x1f\x8b\x08': ('gz', gzip.open),
    b'\x42\x5a\x68': ('bz2', bz2.open)
}


def match_parser(mainfile: str, upload_files: files.StagingUploadFiles) -> 'Parser':
    """
    Performs parser matching. This means it takes the given mainfile, reads its
    beginning from the given upload files, and tries to identify a parser that can
    parse the file.

    This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml),
    and beginning file contents.

    Arguments:
        mainfile: The upload relative path to the mainfile
        upload_files: The upload files object that provides access to the raw mainfile

    Returns: The parser, or None if no parser could be matched.
    """
    # The first three bytes are looked up in the known compression signatures;
    # anything else is treated as uncompressed and read with the builtin open.
    with upload_files.raw_file(mainfile, 'rb') as f:
        compression, open_compressed = _compressions.get(f.read(3), (None, open))

    # Re-open the file via its OS path with the selected opener, so that the
    # buffer handed to the parsers is the head of the (decompressed) contents.
    mainfile_path = upload_files.raw_file_object(mainfile).os_path
    with open_compressed(mainfile_path, 'rb') as f:
        buffer = f.read(2048)

    mime_type = magic.from_buffer(buffer, mime=True)
    # The parsers are tried in list order, the first one that accepts the file wins.
    for parser in parsers:
        if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
            # TODO: deal with multiple possible parser specs
            return parser

    return None


109
# All parser instances with their constructor based configuration. The order is
# significant: match_parser walks this list from the top and returns the first
# parser whose is_mainfile accepts the file.
#
# NOTE(review): two entries below share the name 'parsers/vasp'; any mapping that is
# keyed by parser name can only hold one of them. Confirm whether this is intended.
parsers = [
    GenerateRandomParser(),
    TemplateParser(),
    ChaosParser(),
    LegacyParser(
        name='parsers/phonopy',
        parser_class_name='phonopyparser.PhonopyParserWrapper',
        # mainfile_contents_re=r'',  # Empty regex since this code calls other DFT codes.
        mainfile_name_re=(r'.*/phonopy-FHI-aims-displacement-0*1/control.in$')
    ),
    LegacyParser(
        name='parsers/vasp',
        parser_class_name='vaspparser.VASPRunParserInterface',
        mainfile_mime_re=r'(application/xml)|(text/.*)',
        mainfile_contents_re=(
            r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*'
            r'?\s*<modeling>'
            r'?\s*<generator>'
            r'?\s*<i name="program" type="string">\s*vasp\s*</i>'
            r'?'),
        supported_compressions=['gz', 'bz2']
    ),
    VaspOutcarParser(
        name='parsers/vasp',
        parser_class_name='vaspparser.VaspOutcarParser',
        mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?',
        mainfile_contents_re=(r'^\svasp\.')
    ),
    LegacyParser(
        name='parsers/exciting',
        parser_class_name='excitingparser.ExcitingParser',
        mainfile_name_re=r'^.*/INFO\.OUT?',
        mainfile_contents_re=(
            r'^\s*=================================================+\s*'
            r'\s*\|\s*EXCITING\s+\S+\s+started\s*='
            r'\s*\|\s*version hash id:\s*\S*\s*=')
    ),
    LegacyParser(
        name='parsers/fhi-aims',
        parser_class_name='fhiaimsparser.FHIaimsParser',
        mainfile_contents_re=(
            r'^(.*\n)*'
            r'?\s*Invoking FHI-aims \.\.\.'
            r'?\s*Version'),
        # Excludes the phonopy displacement directories handled by 'parsers/phonopy'.
        mainfile_name_re=r'^.(?!.*phonopy-FHI-aims-displacement)'
    ),
    LegacyParser(
        name='parsers/cp2k',
        parser_class_name='cp2kparser.CP2KParser',
        mainfile_contents_re=(
            r'\*\*\*\* \*\*\*\* \*\*\*\*\*\*  \*\*  PROGRAM STARTED AT\s.*\n'
            r' \*\*\*\*\* \*\* \*\*\*  \*\*\* \*\*   PROGRAM STARTED ON\s*.*\n'
            r' \*\*    \*\*\*\*   \*\*\*\*\*\*    PROGRAM STARTED BY .*\n'
            r' \*\*\*\*\* \*\*    \*\* \*\* \*\*   PROGRAM PROCESS ID .*\n'
            r'  \*\*\*\* \*\*  \*\*\*\*\*\*\*  \*\*  PROGRAM STARTED IN .*\n'
        )
    ),
    LegacyParser(
        name='parsers/crystal',
        parser_class_name='crystalparser.CrystalParser',
        mainfile_contents_re=(
            r'\s*[\*]{22,}'  # Looks for '*' 22 times or more in a row.
            r'\s*\*\s{20,}\*'  # Looks for a '*' sandwiched by whitespace.
            r'\s*\*\s{10,}CRYSTAL(?P<majorVersion>[\d]+)\s{10,}\*'
            r'\s*\*\s{10,}public \: (?P<minorVersion>[\d\.]+) \- .*\*'
        )
    ),
    # The main contents regex of CPMD was causing a catastrophic backtracking issue
    # when searching through the first 500 bytes of main files. We decided
    # to use only a portion of the regex to avoid that issue.
    LegacyParser(
        name='parsers/cpmd',
        parser_class_name='cpmdparser.CPMDParser',
        mainfile_contents_re=(
            # r'\s+\*\*\*\*\*\*  \*\*\*\*\*\*    \*\*\*\*  \*\*\*\*  \*\*\*\*\*\*\s*'
            # r'\s+\*\*\*\*\*\*\*  \*\*\*\*\*\*\*   \*\*\*\*\*\*\*\*\*\*  \*\*\*\*\*\*\*\s+'
            r'\*\*\*       \*\*   \*\*\*  \*\* \*\*\*\* \*\*  \*\*   \*\*\*'
            # r'\s+\*\*        \*\*   \*\*\*  \*\*  \*\*  \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*        \*\*\*\*\*\*\*   \*\*      \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*\*       \*\*\*\*\*\*    \*\*      \*\*  \*\*   \*\*\*\s+'
            # r'\s+\*\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\*\s+'
            # r'\s+\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\s+'
        )
    ),
    LegacyParser(
        name='parsers/nwchem',
        parser_class_name='nwchemparser.NWChemParser',
        mainfile_contents_re=(
            r'\s+Northwest Computational Chemistry Package \(NWChem\) \d+\.\d+'
            r'\s+------------------------------------------------------'
            r'\s+Environmental Molecular Sciences Laboratory'
            r'\s+Pacific Northwest National Laboratory'
            r'\s+Richland, WA 99352'
        )
    ),
    LegacyParser(
        name='parsers/bigdft',
        parser_class_name='bigdftparser.BigDFTParser',
        mainfile_contents_re=(
            # Only a single line of the banner is matched; the remaining banner lines
            # are kept as comments for reference.
            # r'__________________________________ A fast and precise DFT wavelet code\s*'
            # r'\|     \|     \|     \|     \|     \|\s*'
            # r'\|     \|     \|     \|     \|     \|      BBBB         i       gggggg\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|     B    B               g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B     B        i     g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B    B         i     g        g\s*'
            r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB          i     g         g\s*'
            # r'\|  :  \|     \|     \|  :  \|     \|    B    B         i     g         g\s*'
            # r'\|--\+0-\|     \|     \|-0\+--\|     \|    B     B     iiii     g         g\s*'
            # r'\|__:__\|_____\|_____\|__:__\|_____\|    B     B        i      g        g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B BBBB        i        g      g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B        iiiii          gggggg\s*'
            # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*'
            # r'\|     \|     \|     \|  :  \|     \|                           TTTTTTTTT\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|  DDDDDD          FFFFF        T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\| D      D        F        TTTT T\s*'
            # r'\|     \|     \|     \|  :  \|     \|D        D      F        T     T\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|D         D     FFFF     T     T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\|D___      D     F         T    T\s*'
            # r'\|     \|     \|  :  \|     \|     \|D         D     F          TTTTT\s*'
            # r'\|     \|     \|--\+0-\|     \|     \| D        D     F         T    T\s*'
            # r'\|_____\|_____\|__:__\|_____\|_____\|          D     F        T     T\s*'
            # r'\|     \|     \|     \|     \|     \|         D               T    T\s*'
            # r'\|     \|     \|     \|     \|     \|   DDDDDD       F         TTTT\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|______                    www\.bigdft\.org'
        )
    ),
    LegacyParser(
        name='parsers/wien2k',
        parser_class_name='wien2kparser.Wien2kParser',
        mainfile_contents_re=r':LABEL\d+: using WIEN2k_\d+\.\d+'
    ),
    LegacyParser(
        name='parsers/band',
        parser_class_name='bandparser.BANDParser',
        mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'),
    LegacyParser(
        name='parsers/gaussian',
        parser_class_name='gaussianparser.GaussianParser',
        # This previous file matching string was too far down the line.
        # r'\s*Cite this work as:'
        # r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9.]*,'
        # r'\s\*\*\*\*\*\*\*\*\*\*\*\**'
        # r'\s*Gaussian\s*([0-9]+):\s*([A-Za-z0-9-.]+)\s*([0-9][0-9]?\-[A-Z][a-z][a-z]\-[0-9]+)'
        # r'\s*([0-9][0-9]?\-[A-Z][a-z][a-z]\-[0-9]+)')
        mainfile_contents_re=r'Gaussian, Inc'),
    LegacyParser(
        name='parsers/quantumespresso',
        parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF',
        mainfile_contents_re=(
            r'^\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+'
            r'(\d+)\s*\))?\s+starts[^\n]+'
            r'(?:\s*\n?)*This program is part of the open-source Quantum')
    ),
    LegacyParser(
        name='parsers/abinit',
        parser_class_name='abinitparser.AbinitParser',
        mainfile_contents_re=(r'^\n\.Version\s*[0-9.]*\s*of ABINIT\s*')
    ),
    LegacyParser(
        name='parsers/orca',
        parser_class_name='orcaparser.OrcaParser',
        mainfile_contents_re=(
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s+\* O   R   C   A \*\s*'
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*'
            r'\s*--- An Ab Initio, DFT and Semiempirical electronic structure package ---\s*')
    ),
    LegacyParser(
        name='parsers/castep',
        parser_class_name='castepparser.CastepParser',
        mainfile_contents_re=(r'\s\|\s*CCC\s*AA\s*SSS\s*TTTTT\s*EEEEE\s*PPPP\s*\|\s*')
    ),
    LegacyParser(
        name='parsers/dl-poly',
        parser_class_name='dlpolyparser.DlPolyParserWrapper',
        mainfile_contents_re=(r'\*\* DL_POLY \*\*')
    ),
    LegacyParser(
        name='parsers/lib-atoms',
        parser_class_name='libatomsparser.LibAtomsParserWrapper',
        mainfile_contents_re=(r'\s*<GAP_params\s')
    ),
    LegacyParser(
        name='parsers/octopus',
        parser_class_name='octopusparser.OctopusParserWrapper',
        mainfile_contents_re=(r'\|0\) ~ \(0\) \|')
        # We decided to use the octopus eyes instead of
        # r'\*{32} Grid \*{32}Simulation Box:' since it was so far down in the file.
    ),
    LegacyParser(
        name='parsers/gpaw',
        parser_class_name='gpawparser.GPAWParserWrapper',
        mainfile_name_re=(r'^.*\.gpw$'),
        mainfile_mime_re=r'application/x-tar'
    ),
    LegacyParser(
        name='parsers/gpaw2',
        parser_class_name='gpawparser.GPAWParser2Wrapper',
        # mainfile_contents_re=r'',  # We can't read .gpw2 to match AFFormatGPAW'
        mainfile_name_re=(r'^.*\.gpw2$'),
        mainfile_mime_re=r'application/x-tar'
    ),
    LegacyParser(
        name='parsers/atk',
        parser_class_name='atkparser.ATKParserWrapper',
        # mainfile_contents_re=r'',  # We can't read .gpw as txt - of UlmGPAW|AFFormatGPAW'
        mainfile_name_re=r'^.*\.nc',
        # The previously used mime type r'application/x-netcdf' wasn't found by magic library.
        mainfile_mime_re=r'application/octet-stream'
    ),
    LegacyParser(
        name='parsers/gulp',
        parser_class_name='gulpparser.GULPParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*'
            r'\*\*\*\*\*\*\*\*\*\*\*\*\*\s*'
            r'\s*\*\s*GENERAL UTILITY LATTICE PROGRAM\s*\*\s*')
    ),
    LegacyParser(
        name='parsers/siesta',
        parser_class_name='siestaparser.SiestaParser',
        mainfile_contents_re=(
            r'(Siesta Version: siesta-|SIESTA [0-9]\.[0-9]\.[0-9])')
    ),
    LegacyParser(
        name='parsers/elk',
        parser_class_name='elkparser.ElkParser',
        mainfile_contents_re=(
            r'\s*\+-----------+\+\s*'
            # Named group syntax is (?P<name>...); the former (P?<version>...) only
            # matched the literal text '<version>' and therefore no real Elk output.
            r'\s*\| Elk version (?P<version>[0-9.a-zA-Z]+) started \|\s*'
            r'\s*\+----------+\+\s*')
    ),
    LegacyParser(
        name='parsers/elastic',
        parser_class_name='elasticparser.ElasticParser',
        mainfile_contents_re=r'\s*Order of elastic constants\s*=\s*[0-9]+\s*'
    ),
    LegacyParser(
        name='parsers/gamess',
        parser_class_name='gamessparser.GamessParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*\*\s*GAMESS VERSION =\s*(.*)\*\s*'
            r'\s*\*\s*FROM IOWA STATE UNIVERSITY\s*\*\s*')
    ),
    LegacyParser(
        name='parsers/turbomole',
        parser_class_name='turbomoleparser.TurbomoleParser',
        mainfile_contents_re=(
            # Uses proper (?P<name>...) groups and the character range A-Z; the former
            # (P?<progr>...) / (P?<version>...) required literal '<progr>' / '<version>'
            # text, and the range A-z also covered punctuation characters.
            r'\s*(?P<progr>[a-zA-Z0-9_]+)\s*(?:\([^()]+\))\s*:\s*TURBOMOLE\s*(?P<version>.*)'
            r'\s*Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe')
    )
]
363

364
365
""" Instantiation and constructor based config of all parsers. """

Markus Scheidgen's avatar
Markus Scheidgen committed
366
parser_dict = {entry.name: entry for entry in parsers}  # type: ignore
"""
A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'.
If several parsers share a name, the one that comes last in ``parsers`` is kept.
"""