__init__.py 23.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Copyright 2018 Markus Scheidgen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15
'''
16
The *parsing* module is an interface for the existing NOMAD-coe parsers.
17
18
This module redefines some of the old NOMAD-coe python-common functionality to create a
more coherent interface to the parsers.
19
20
21
22
23

Assumption about parsers
------------------------
For now, we make a few assumptions about parsers:
- they always work on the same *meta-info* version
Markus Scheidgen's avatar
Markus Scheidgen committed
24
- they have no conflicting python requirements
25
26
- they can be loaded at the same time and can be used within the same python process
- they are uniquely identified by a GIT URL and publicly accessible
27
- their version is uniquely identified by a GIT commit SHA
28

Markus Scheidgen's avatar
Markus Scheidgen committed
29
Each parser is defined via an instance of :class:`Parser`. The implementation :class:`LegacyParser` is used for most NOMAD-coe parsers.
30
31
32
33

.. autoclass:: nomad.parsing.Parser
    :members:

Markus Scheidgen's avatar
Markus Scheidgen committed
34
35
36
37
38
39
40
41
42
43
44
45
There are sub-classes for parsers with special purposes.

.. autoclass:: nomad.parsing.Parser
.. autoclass:: nomad.parsing.MatchingParser
.. autoclass:: nomad.parsing.MissingParser
.. autoclass:: nomad.parsing.BrokenParser
.. autoclass:: nomad.parsing.TemplateParser
.. autoclass:: nomad.parsing.GenerateRandomParser
.. autoclass:: nomad.parsing.ChaosParser
.. autoclass:: nomad.parsing.EmptyParser


Markus Scheidgen's avatar
Markus Scheidgen committed
46
47
48
49
The implementation :class:`LegacyParser` is used for most NOMAD-coe parsers.

.. autoclass:: nomad.parsing.LegacyParser

Markus Scheidgen's avatar
Markus Scheidgen committed
50

51
52
53
54
55
The parser definitions are available via the following two variables.

.. autodata:: nomad.parsing.parsers
.. autodata:: nomad.parsing.parser_dict

Markus Scheidgen's avatar
Markus Scheidgen committed
56
Parsers are reused for multiple calculations.
Markus Scheidgen's avatar
Markus Scheidgen committed
57

58
59
Parsers and calculation files are matched via regular expressions.

Markus Scheidgen's avatar
Markus Scheidgen committed
60
.. autofunction:: nomad.parsing.match_parser
61

Markus Scheidgen's avatar
Markus Scheidgen committed
62
Parsers in NOMAD-coe use a *backend* to create output. There are different NOMAD-coe
63
backends. In nomad@FAIRDI, we currently only use a single backend. The following
Markus Scheidgen's avatar
Markus Scheidgen committed
64
classes provide an interface definition for *backends* as an ABC and a concrete implementation
65
based on nomad@fairdi's metainfo:
66
67
68

.. autoclass:: nomad.parsing.AbstractParserBackend
    :members:
69
.. autoclass:: nomad.parsing.Backend
70
    :members:
71
'''
Markus Scheidgen's avatar
Markus Scheidgen committed
72

Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
73
from typing import Callable, IO, Union, Dict
74
import os.path
75

76
from nomad import config, datamodel
77

78
79
80
81
82
from nomad.parsing.legacy import (
    AbstractParserBackend, Backend, BackendError, BadContextUri, LegacyParser, VaspOutcarParser)
from nomad.parsing.parser import Parser, BrokenParser, MissingParser, MatchingParser
from nomad.parsing.artificial import (
    TemplateParser, GenerateRandomParser, ChaosParser, EmptyParser)
83

84
85
86
87
88
89
90
try:
    # These packages are not available without the parsing extra. That is ok if the
    # parsers are only initialized to load their metainfo definitions; in that case
    # the names defined below simply stay undefined.
    import magic
    import gzip
    import bz2
    import lzma

    # Maps the first three bytes of a file to a compression name and the matching
    # ``open`` function used to read the decompressed content.
    _compressions = {
        b'\x1f\x8b\x08': ('gz', gzip.open),
        b'\x42\x5a\x68': ('bz2', bz2.open),
        b'\xfd\x37\x7a': ('xz', lzma.open),
    }

    # libmagic handle configured to report the character encoding of a buffer.
    encoding_magic = magic.Magic(mime_encoding=True)

except ImportError:
    pass
102

103

104
def match_parser(mainfile_path: str, strict=True) -> 'Parser':
    '''
    Performs parser matching. This means it takes the given mainfile, reads the
    beginning of it (gz, bz2, and xz compressed files are decompressed on the fly), and
    tries to identify a parser that can parse the file.

    This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml),
    and beginning file contents. The parsers in :data:`parsers` are tried in order and
    the first one that accepts the file is returned.

    If the beginning of the file is not valid utf-8 but is treated as iso-8859-1, an
    uncompressed mainfile is rewritten in place as utf-8 once a parser matched.

    This function uses ``magic`` and the compression table that are only defined at
    module level when the optional parsing dependencies could be imported.

    Arguments:
        mainfile_path: Path to the mainfile
        strict: Only match strict parsers, e.g. no artificial parsers for missing or empty entries.

    Returns: The parser, or None if no parser could be matched.
    '''
    mainfile = os.path.basename(mainfile_path)
    # files whose name starts with '.' or '~' are never considered as mainfiles
    if mainfile.startswith(('.', '~')):
        return None

    # the first three bytes tell us if (and how) the file is compressed
    with open(mainfile_path, 'rb') as f:
        compression, open_compressed = _compressions.get(f.read(3), (None, open))

    # only the (decompressed) beginning of the file is used for matching
    with open_compressed(mainfile_path, 'rb') as cf:  # type: ignore
        buffer = cf.read(config.parser_matching_size)

    mime_type = magic.from_buffer(buffer, mime=True)

    decoded_buffer = None
    encoding = None
    try:  # Try to open the file as a string for regex matching.
        decoded_buffer = buffer.decode('utf-8')
    except UnicodeDecodeError:
        # This file is either binary or has wrong encoding
        encoding = encoding_magic.from_buffer(buffer)

        if config.services.force_raw_file_decoding:
            encoding = 'iso-8859-1'

        if encoding == 'iso-8859-1':
            try:
                decoded_buffer = buffer.decode(encoding)
            except UnicodeDecodeError:
                # keep going without decoded contents; parsers can still match on
                # name, mime type, and the raw buffer
                pass

    for parser in parsers:
        if strict and isinstance(parser, (MissingParser, EmptyParser)):
            continue

        if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression):
            # Potentially convert the file to utf-8. This is only done for uncompressed
            # files: the raw bytes of a compressed file are the compressed stream and
            # re-encoding them as text would corrupt the archive.
            if encoding == 'iso-8859-1' and compression is None:
                try:
                    with open(mainfile_path, 'rb') as binary_file:
                        content = binary_file.read().decode(encoding)
                except Exception:
                    # best effort: if the file cannot be read and decoded as a whole,
                    # it is left untouched and the parser is returned anyway
                    pass
                else:
                    # explicit encoding, so the result does not depend on the locale
                    with open(mainfile_path, 'wt', encoding='utf-8') as text_file:
                        text_file.write(content)

            # TODO: deal with multiple possible parser specs
            return parser

    return None


170
# The order of this list matters: match_parser goes through it from top to bottom and
# returns the first parser that accepts a given mainfile.
parsers = [
    GenerateRandomParser(),
    TemplateParser(),
    ChaosParser(),
    LegacyParser(
        name='parsers/phonopy', code_name='Phonopy', code_homepage='https://phonopy.github.io/phonopy/',
        parser_class_name='phonopyparser.PhonopyParserWrapper',
        # mainfile_contents_re=r'',  # Empty regex since this code calls other DFT codes.
        mainfile_name_re=(r'.*/phonopy-FHI-aims-displacement-0*1/control.in$')
    ),
    LegacyParser(
        name='parsers/vasp', code_name='VASP', code_homepage='https://www.vasp.at/',
        parser_class_name='vaspparser.VASPRunParser',
        mainfile_mime_re=r'(application/.*)|(text/.*)',
        mainfile_contents_re=(
            r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*'
            r'?\s*<modeling>'
            r'?\s*<generator>'
            r'?\s*<i name="program" type="string">\s*vasp\s*</i>'
            r'?'),
        supported_compressions=['gz', 'bz2', 'xz']
    ),
    VaspOutcarParser(
        name='parsers/vasp-outcar', code_name='VASP', code_homepage='https://www.vasp.at/',
        parser_class_name='vaspparser.VaspOutcarParser',
        mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?',
        mainfile_contents_re=(r'^\svasp\.')
    ),
    LegacyParser(
        name='parsers/exciting', code_name='exciting', code_homepage='http://exciting-code.org/',
        parser_class_name='excitingparser.ExcitingParser',
        mainfile_name_re=r'^.*.OUT(\.[^/]*)?$',
        mainfile_contents_re=(r'EXCITING.*started')
    ),
    LegacyParser(
        name='parsers/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/',
        parser_class_name='fhiaimsparser.FHIaimsParser',
        mainfile_contents_re=(
            r'^(.*\n)*'
            r'?\s*Invoking FHI-aims \.\.\.'
            # r'?\s*Version'
        )
    ),
    LegacyParser(
        name='parsers/cp2k', code_name='CP2K', code_homepage='https://www.cp2k.org/',
        parser_class_name='cp2kparser.CP2KParser',
        mainfile_contents_re=(
            r'\*\*\*\* \*\*\*\* \*\*\*\*\*\*  \*\*  PROGRAM STARTED AT\s.*\n'
            r' \*\*\*\*\* \*\* \*\*\*  \*\*\* \*\*   PROGRAM STARTED ON\s*.*\n'
            r' \*\*    \*\*\*\*   \*\*\*\*\*\*    PROGRAM STARTED BY .*\n'
            r' \*\*\*\*\* \*\*    \*\* \*\* \*\*   PROGRAM PROCESS ID .*\n'
            r'  \*\*\*\* \*\*  \*\*\*\*\*\*\*  \*\*  PROGRAM STARTED IN .*\n'
        )
    ),
    LegacyParser(
        name='parsers/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/',
        parser_class_name='crystalparser.CrystalParser',
        mainfile_contents_re=(
            r'(CRYSTAL\s*\n\d+ \d+ \d+)|(CRYSTAL will run on \d+ processors)|'
            r'(\s*\*\s*CRYSTAL[\d]+\s*\*\s*\*\s*(public|Release) \: [\d\.]+.*\*)|'
            r'(Executable:\s*[/_\-a-zA-Z0-9]*MPPcrystal)'
        )
    ),
    # The main contents regex of CPMD was causing a catastrophic backtracking issue
    # when searching through the first 500 bytes of main files. We decided
    # to use only a portion of the regex to avoid that issue.
    LegacyParser(
        name='parsers/cpmd', code_name='CPMD', code_homepage='https://www.lcrc.anl.gov/for-users/software/available-software/cpmd/',
        parser_class_name='cpmdparser.CPMDParser',
        mainfile_contents_re=(
            # r'\s+\*\*\*\*\*\*  \*\*\*\*\*\*    \*\*\*\*  \*\*\*\*  \*\*\*\*\*\*\s*'
            # r'\s+\*\*\*\*\*\*\*  \*\*\*\*\*\*\*   \*\*\*\*\*\*\*\*\*\*  \*\*\*\*\*\*\*\s+'
            r'\*\*\*       \*\*   \*\*\*  \*\* \*\*\*\* \*\*  \*\*   \*\*\*'
            # r'\s+\*\*        \*\*   \*\*\*  \*\*  \*\*  \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*        \*\*\*\*\*\*\*   \*\*      \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*\*       \*\*\*\*\*\*    \*\*      \*\*  \*\*   \*\*\*\s+'
            # r'\s+\*\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\*\s+'
            # r'\s+\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\s+'
        )
    ),
    LegacyParser(
        name='parsers/nwchem', code_name='NWChem', code_homepage='http://www.nwchem-sw.org/',
        parser_class_name='nwchemparser.NWChemParser',
        mainfile_contents_re=(
            r'Northwest Computational Chemistry Package \(NWChem\) (\d+\.)+\d+'
        )
    ),
    LegacyParser(
        name='parsers/bigdft', code_name='BigDFT', code_homepage='http://bigdft.org/',
        parser_class_name='bigdftparser.BigDFTParser',
        mainfile_contents_re=(
            # r'__________________________________ A fast and precise DFT wavelet code\s*'
            # r'\|     \|     \|     \|     \|     \|\s*'
            # r'\|     \|     \|     \|     \|     \|      BBBB         i       gggggg\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|     B    B               g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B     B        i     g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B    B         i     g        g\s*'
            r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB          i     g         g\s*'
            # r'\|  :  \|     \|     \|  :  \|     \|    B    B         i     g         g\s*'
            # r'\|--\+0-\|     \|     \|-0\+--\|     \|    B     B     iiii     g         g\s*'
            # r'\|__:__\|_____\|_____\|__:__\|_____\|    B     B        i      g        g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B BBBB        i        g      g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B        iiiii          gggggg\s*'
            # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*'
            # r'\|     \|     \|     \|  :  \|     \|                           TTTTTTTTT\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|  DDDDDD          FFFFF        T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\| D      D        F        TTTT T\s*'
            # r'\|     \|     \|     \|  :  \|     \|D        D      F        T     T\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|D         D     FFFF     T     T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\|D___      D     F         T    T\s*'
            # r'\|     \|     \|  :  \|     \|     \|D         D     F          TTTTT\s*'
            # r'\|     \|     \|--\+0-\|     \|     \| D        D     F         T    T\s*'
            # r'\|_____\|_____\|__:__\|_____\|_____\|          D     F        T     T\s*'
            # r'\|     \|     \|     \|     \|     \|         D               T    T\s*'
            # r'\|     \|     \|     \|     \|     \|   DDDDDD       F         TTTT\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|______                    www\.bigdft\.org'
        )
    ),
    LegacyParser(
        name='parsers/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/',
        parser_class_name='wien2kparser.Wien2kParser',
        mainfile_contents_re=r'\s*---------\s*:ITE[0-9]+:\s*[0-9]+\.\s*ITERATION\s*---------'
    ),
    LegacyParser(
        name='parsers/band', code_name='BAND', code_homepage='https://www.scm.com/product/band_periodicdft/',
        parser_class_name='bandparser.BANDParser',
        mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'
    ),
    LegacyParser(
        name='parsers/gaussian', code_name='Gaussian', code_homepage='http://gaussian.com/',
        parser_class_name='gaussianparser.GaussianParser',
        mainfile_mime_re=r'.*',
        mainfile_contents_re=(
            r'\s*Cite this work as:'
            r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9\.]*,')
    ),
    LegacyParser(
        name='parsers/quantumespresso', code_name='Quantum Espresso', code_homepage='https://www.quantum-espresso.org/',
        parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF',
        mainfile_contents_re=(
            r'(Program PWSCF.*starts)|'
            r'(Current dimensions of program PWSCF are)')
        #    r'^(.*\n)*'
        #    r'\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+'
        #    r'(\d+)\s*\))?\s+starts[^\n]+'
        #    r'(?:\s*\n?)*This program is part of the open-source Quantum')
    ),
    LegacyParser(
        name='parsers/abinit', code_name='ABINIT', code_homepage='https://www.abinit.org/',
        parser_class_name='abinitparser.AbinitParser',
        mainfile_contents_re=(r'^\n*\.Version\s*[0-9.]*\s*of ABINIT\s*')
    ),
    LegacyParser(
        name='parsers/orca', code_name='ORCA', code_homepage='https://orcaforum.kofo.mpg.de/',
        parser_class_name='orcaparser.OrcaParser',
        mainfile_contents_re=(
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s+\* O   R   C   A \*\s*'
            r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*'
            r'\s*--- An Ab Initio, DFT and Semiempirical electronic structure package ---\s*')
    ),
    LegacyParser(
        name='parsers/castep', code_name='CASTEP', code_homepage='http://www.castep.org/',
        parser_class_name='castepparser.CastepParser',
        mainfile_contents_re=(r'\s\|\s*CCC\s*AA\s*SSS\s*TTTTT\s*EEEEE\s*PPPP\s*\|\s*')
    ),
    LegacyParser(
        name='parsers/dl-poly', code_name='DL_POLY', code_homepage='https://www.scd.stfc.ac.uk/Pages/DL_POLY.aspx',
        parser_class_name='dlpolyparser.DlPolyParserWrapper',
        mainfile_contents_re=(r'\*\* DL_POLY \*\*')
    ),
    LegacyParser(
        name='parsers/lib-atoms', code_name='libAtoms', code_homepage='https://libatoms.github.io/',
        parser_class_name='libatomsparser.LibAtomsParserWrapper',
        mainfile_contents_re=(r'\s*<GAP_params\s')
    ),
    LegacyParser(
        name='parsers/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/',
        parser_class_name='octopusparser.OctopusParserWrapper',
        mainfile_contents_re=(r'\|0\) ~ \(0\) \|')
        # We decided to use the octopus eyes instead of
        # r'\*{32} Grid \*{32}Simulation Box:' since it was so far down in the file.
    ),
    # match gpaw2 first, other .gpw files are then considered to be "gpaw1"
    LegacyParser(
        name='parsers/gpaw2', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/',
        parser_class_name='gpawparser.GPAWParser2Wrapper',
        mainfile_binary_header=b'GPAW',
        mainfile_name_re=(r'^.*\.(gpw2|gpw)$'),
        mainfile_mime_re=r'application/(x-tar|octet-stream)'
    ),
    LegacyParser(
        name='parsers/gpaw', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/',
        parser_class_name='gpawparser.GPAWParserWrapper',
        mainfile_name_re=(r'^.*\.gpw$'),
        mainfile_mime_re=r'application/(x-tar|octet-stream)'
    ),
    LegacyParser(
        name='parsers/atk', code_name='ATK', code_homepage='https://www.synopsys.com/silicon/quantumatk.html',
        parser_class_name='atkparser.ATKParserWrapper',
        # mainfile_contents_re=r'',  # We can't read .gpw as txt - of UlmGPAW|AFFormatGPAW'
        mainfile_name_re=r'^.*\.nc',
        # The previously used mime type r'application/x-netcdf' wasn't found by magic library.
        mainfile_mime_re=r'application/octet-stream'
    ),
    LegacyParser(
        name='parsers/gulp', code_name='gulp', code_homepage='http://gulp.curtin.edu.au/gulp/',
        parser_class_name='gulpparser.GULPParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*'
            r'\*\*\*\*\*\*\*\*\*\*\*\*\*\s*'
            r'\s*\*\s*GENERAL UTILITY LATTICE PROGRAM\s*\*\s*')
    ),
    LegacyParser(
        name='parsers/siesta', code_name='Siesta', code_homepage='https://departments.icmab.es/leem/siesta/',
        parser_class_name='siestaparser.SiestaParser',
        mainfile_contents_re=(
            r'(Siesta Version: siesta-|SIESTA [0-9]\.[0-9]\.[0-9])|'
            r'(\*\s*WELCOME TO SIESTA\s*\*)')
    ),
    LegacyParser(
        name='parsers/elk', code_name='elk', code_homepage='http://elk.sourceforge.net/',
        parser_class_name='elkparser.ElkParser',
        mainfile_contents_re=r'\| Elk version [0-9.a-zA-Z]+ started \|'
    ),
    LegacyParser(
        name='parsers/elastic', code_name='elastic', code_homepage='http://exciting-code.org/elastic',
        parser_class_name='elasticparser.ElasticParser',
        mainfile_contents_re=r'\s*Order of elastic constants\s*=\s*[0-9]+\s*'
    ),
    LegacyParser(
        name='parsers/gamess', code_name='GAMESS', code_homepage='https://www.msg.chem.iastate.edu/gamess/versions.html',
        parser_class_name='gamessparser.GamessParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*\*\s*GAMESS VERSION =\s*(.*)\*\s*'
            r'\s*\*\s*FROM IOWA STATE UNIVERSITY\s*\*\s*')
    ),
    LegacyParser(
        name='parsers/turbomole', code_name='turbomole', code_homepage='https://www.turbomole.org/',
        parser_class_name='turbomoleparser.TurbomoleParser',
        mainfile_contents_re=(
            r'Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe')
    ),
    LegacyParser(
        name='parsers/skeleton', code_name='skeleton', code_homepage=None,
        domain='ems',
        parser_class_name='skeletonparser.SkeletonParserInterface',
        mainfile_mime_re=r'(application/json)|(text/.*)',
        mainfile_contents_re=(r'skeleton experimental metadata format')
    ),
    LegacyParser(
        name='parsers/mpes', code_name='mpes', code_homepage='https://github.com/mpes-kit/mpes', domain='ems',
        parser_class_name='mpesparser.MPESParserInterface',
        mainfile_mime_re=r'(application/json)|(text/.*)',
        mainfile_name_re=(r'.*.meta'),
        mainfile_contents_re=(r'"data_repository_name": "zenodo.org"')
    ),
    LegacyParser(
        name='parsers/aptfim', code_name='mpes', code_homepage='https://github.com/mpes-kit/mpes', domain='ems',
        parser_class_name='aptfimparser.APTFIMParserInterface',
        mainfile_mime_re=r'(application/json)|(text/.*)',
        mainfile_name_re=(r'.*.aptfim')
    ),
    LegacyParser(
        name='parsers/eels', code_name='eels', code_homepage='https://eelsdb.eu/', domain='ems',
        parser_class_name='eelsparser.EelsParserInterface',
        mainfile_mime_re=r'text/.*',
        mainfile_name_re=(r'.*.txt'),
        mainfile_contents_re=(r'api_permalink = https://api\.eelsdb\.eu')
    ),
    LegacyParser(
        name='parsers/qbox', code_name='qbox', code_homepage='http://qboxcode.org/', domain='dft',
        parser_class_name='qboxparser.QboxParser',
        mainfile_mime_re=r'(application/xml)|(text/.*)',
        mainfile_contents_re=(r'http://qboxcode.org')
    ),
    LegacyParser(
        name='parsers/dmol', code_name='DMol3', code_homepage='http://dmol3.web.psi.ch/dmol3.html', domain='dft',
        parser_class_name='dmol3parser.Dmol3Parser',
        mainfile_name_re=r'.*\.outmol',
        mainfile_contents_re=r'Materials Studio DMol\^3'
    ),
    LegacyParser(
        name='parsers/fleur', code_name='fleur', code_homepage='https://www.flapw.de/', domain='dft',
        parser_class_name='fleurparser.FleurParser',
        mainfile_contents_re=r'This output is generated by fleur.'
    ),
    LegacyParser(
        name='parsers/molcas', code_name='MOLCAS', code_homepage='http://www.molcas.org/', domain='dft',
        parser_class_name='molcasparser.MolcasParser',
        mainfile_contents_re=r'M O L C A S'
    ),
    LegacyParser(
        name='parsers/onetep', code_name='ONETEP', code_homepage='https://www.onetep.org/', domain='dft',
        parser_class_name='onetepparser.OnetepParser',
        mainfile_contents_re=r'####### #     # ####### ####### ####### ######'
    )
]

470
471
# Placeholder parsers that match on the mainfile name only. Each row is
# (name, code_name, code_homepage, mainfile_name_re); all of them use the 'dft' domain.
empty_parsers = [
    EmptyParser(
        name=parser_name, code_name=code, code_homepage=homepage,
        domain='dft',
        mainfile_name_re=name_re)
    for parser_name, code, homepage, name_re in (
        ('missing/octopus', 'Octopus', 'https://octopus-code.org/', r'(inp)|(.*/inp)'),
        ('missing/crystal', 'Crystal', 'https://www.crystal.unito.it/index.php', r'.*\.cryst\.out'),
        ('missing/wien2k', 'WIEN2k', 'http://www.wien2k.at/', r'.*\.scf'),
        ('missing/fhi-aims', 'FHI-aims', 'https://aimsclub.fhi-berlin.mpg.de/', r'.*\.fhiaims'),
    )
]

493
if config.use_empty_parsers:
    # There are some entries with PIDs that have mainfiles which do not match what
    # the actual parsers expect. We use the EmptyParser to produce placeholder entries
    # to keep the PIDs. These parsers will not match for new, non migrated data.
    parsers.extend(empty_parsers)

# the broken parser goes last, after all regular (and empty) parsers
parsers.append(BrokenParser())

''' Instantiation and constructor based config of all parsers. '''

# The empty parsers are always registered by name, even if they are not used for matching.
parser_dict = {p.name: p for p in parsers + empty_parsers}  # type: ignore
''' A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. '''

# renamed parsers: keep the old 'parser/<...>' names as aliases for 'parsers/<...>'
parser_dict.update({
    'parser/%s' % renamed: parser_dict['parsers/%s' % renamed]
    for renamed in ('broken', 'fleur', 'molcas', 'octopus', 'onetep')})
512
513

# Register code names as possible statistic values to the dft datamodel: the distinct
# code names of all 'dft' domain parsers, sorted case-insensitively. Parsers without a
# code name and the 'currupted mainfile' placeholder code name are left out.
code_names = sorted(
    {
        parser.code_name
        for parser in parsers
        if parser.domain == 'dft'
        and getattr(parser, 'code_name', None) not in (None, 'currupted mainfile')
    },
    key=lambda code_name: code_name.lower())

# the two service placeholder values are always offered in addition to the real code names
datamodel.DFTMetadata.code_name.a_search.statistic_values = code_names + [
    config.services.unavailable_value, config.services.not_processed_value]