parsers.py 17.1 KB
Newer Older
Markus Scheidgen's avatar
Markus Scheidgen committed
1
2
3
4
#
# Copyright The NOMAD Authors.
#
# This file is part of NOMAD. See https://nomad-lab.eu for further info.
Markus Scheidgen's avatar
Markus Scheidgen committed
5
6
7
8
9
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
Markus Scheidgen's avatar
Markus Scheidgen committed
10
#     http://www.apache.org/licenses/LICENSE-2.0
Markus Scheidgen's avatar
Markus Scheidgen committed
11
12
#
# Unless required by applicable law or agreed to in writing, software
Markus Scheidgen's avatar
Markus Scheidgen committed
13
# distributed under the License is distributed on an "AS IS" BASIS,
Markus Scheidgen's avatar
Markus Scheidgen committed
14
15
16
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Markus Scheidgen's avatar
Markus Scheidgen committed
17
#
Markus Scheidgen's avatar
Markus Scheidgen committed
18
19
20
21
22

import os.path

from nomad import config, datamodel

23
from .parser import MissingParser, BrokenParser, Parser, ArchiveParser
24
from .legacy import LegacyParser
Markus Scheidgen's avatar
Markus Scheidgen committed
25
26
from .artificial import EmptyParser, GenerateRandomParser, TemplateParser, ChaosParser

27
from eelsdbconverter import EELSApiJsonConverter
Markus Scheidgen's avatar
Markus Scheidgen committed
28
29
from mpesparser import MPESParser
from aptfimparser import APTFIMParser
30
from vaspparser import VASPParser
31
from phonopyparser import PhonopyParser
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
32
from elasticparser import ElasticParser
33
from lammpsparser import LammpsParser
34
from gromacsparser import GromacsParser
35
from crystalparser import CrystalParser
36
37
from fhiaimsparser import FHIAimsParser
from excitingparser import ExcitingParser
38
from abinitparser import AbinitParser
39
from quantumespressoparser import QuantumEspressoParser
40
from gaussianparser import GaussianParser
41
from gpawparser import GPAWParser
42
from octopusparser import OctopusParser
43
from orcaparser import OrcaParser
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
44
from cp2kparser import CP2KParser
45
from fhivibesparser import FHIVibesParser
46
from turbomoleparser import TurbomoleParser
47
from castepparser import CastepParser
48
49
from wien2kparser import Wien2kParser
from nwchemparser import NWChemParser
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
50
from lobsterparser import LobsterParser
Markus Scheidgen's avatar
Markus Scheidgen committed
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

try:
    # these packages are not available without parsing extra, which is ok, if the
    # parsers are only initialized to load their metainfo definitions
    import magic
    import gzip
    import bz2
    import lzma

    _compressions = {
        b'\x1f\x8b\x08': ('gz', gzip.open),
        b'\x42\x5a\x68': ('bz2', bz2.open),
        b'\xfd\x37\x7a': ('xz', lzma.open)
    }

    encoding_magic = magic.Magic(mime_encoding=True)

except ImportError:
    pass


def match_parser(mainfile_path: str, strict=True) -> Parser:
    '''
    Performs parser matching. This means it take the given mainfile and potentially
    opens it with the given callback and tries to identify a parser that can parse
    the file.

    This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml),
    and beginning file contents.

    Arguments:
        mainfile_path: Path to the mainfile
        strict: Only match strict parsers, e.g. no artificial parsers for missing or empty entries.

    Returns: The parser, or None if no parser could be matched.
    '''
    mainfile = os.path.basename(mainfile_path)
    if mainfile.startswith('.') or mainfile.startswith('~'):
        return None

    with open(mainfile_path, 'rb') as f:
        compression, open_compressed = _compressions.get(f.read(3), (None, open))

    with open_compressed(mainfile_path, 'rb') as cf:  # type: ignore
        buffer = cf.read(config.parser_matching_size)

    mime_type = magic.from_buffer(buffer, mime=True)

    decoded_buffer = None
    encoding = None
    try:  # Try to open the file as a string for regex matching.
        decoded_buffer = buffer.decode('utf-8')
    except UnicodeDecodeError:
        # This file is either binary or has wrong encoding
        encoding = encoding_magic.from_buffer(buffer)

        if config.services.force_raw_file_decoding:
            encoding = 'iso-8859-1'

        if encoding in ['iso-8859-1']:
            try:
                decoded_buffer = buffer.decode(encoding)
            except Exception:
                pass

    for parser in parsers:
        if strict and isinstance(parser, (MissingParser, EmptyParser)):
            continue

        if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression):
            # potentially convert the file
            if encoding in ['iso-8859-1']:
                try:
                    with open(mainfile_path, 'rb') as binary_file:
                        content = binary_file.read().decode(encoding)
                except Exception:
                    pass
                else:
                    with open(mainfile_path, 'wt') as text_file:
                        text_file.write(content)

            # TODO: deal with multiple possible parser specs
            return parser

    return None


parsers = [
    GenerateRandomParser(),
    TemplateParser(),
    ChaosParser(),
142
    PhonopyParser(),
143
    VASPParser(),
144
145
    ExcitingParser(),
    FHIAimsParser(),
146
    FHIVibesParser(),
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
147
    CP2KParser(),
148
    CrystalParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
    # The main contents regex of CPMD was causing a catostrophic backtracking issue
    # when searching through the first 500 bytes of main files. We decided
    # to use only a portion of the regex to avoid that issue.
    LegacyParser(
        name='parsers/cpmd', code_name='CPMD', code_homepage='https://www.lcrc.anl.gov/for-users/software/available-software/cpmd/',
        parser_class_name='cpmdparser.CPMDParser',
        mainfile_contents_re=(
            # r'\s+\*\*\*\*\*\*  \*\*\*\*\*\*    \*\*\*\*  \*\*\*\*  \*\*\*\*\*\*\s*'
            # r'\s+\*\*\*\*\*\*\*  \*\*\*\*\*\*\*   \*\*\*\*\*\*\*\*\*\*  \*\*\*\*\*\*\*\s+'
            r'\*\*\*       \*\*   \*\*\*  \*\* \*\*\*\* \*\*  \*\*   \*\*\*'
            # r'\s+\*\*        \*\*   \*\*\*  \*\*  \*\*  \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*        \*\*\*\*\*\*\*   \*\*      \*\*  \*\*    \*\*\s+'
            # r'\s+\*\*\*       \*\*\*\*\*\*    \*\*      \*\*  \*\*   \*\*\*\s+'
            # r'\s+\*\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\*\s+'
            # r'\s+\*\*\*\*\*\*  \*\*        \*\*      \*\*  \*\*\*\*\*\*\s+'
        )
    ),
166
    NWChemParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
    LegacyParser(
        name='parsers/bigdft', code_name='BigDFT', code_homepage='http://bigdft.org/',
        parser_class_name='bigdftparser.BigDFTParser',
        mainfile_contents_re=(
            # r'__________________________________ A fast and precise DFT wavelet code\s*'
            # r'\|     \|     \|     \|     \|     \|\s*'
            # r'\|     \|     \|     \|     \|     \|      BBBB         i       gggggg\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|     B    B               g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B     B        i     g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B    B         i     g        g\s*'
            r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB          i     g         g\s*'
            # r'\|  :  \|     \|     \|  :  \|     \|    B    B         i     g         g\s*'
            # r'\|--\+0-\|     \|     \|-0\+--\|     \|    B     B     iiii     g         g\s*'
            # r'\|__:__\|_____\|_____\|__:__\|_____\|    B     B        i      g        g\s*'
            # r'\|     \|  :  \|  :  \|     \|     \|    B BBBB        i        g      g\s*'
            # r'\|     \|-0\+--\|-0\+--\|     \|     \|    B        iiiii          gggggg\s*'
            # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*'
            # r'\|     \|     \|     \|  :  \|     \|                           TTTTTTTTT\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|  DDDDDD          FFFFF        T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\| D      D        F        TTTT T\s*'
            # r'\|     \|     \|     \|  :  \|     \|D        D      F        T     T\s*'
            # r'\|     \|     \|     \|--\+0-\|     \|D         D     FFFF     T     T\s*'
            # r'\|_____\|_____\|_____\|__:__\|_____\|D___      D     F         T    T\s*'
            # r'\|     \|     \|  :  \|     \|     \|D         D     F          TTTTT\s*'
            # r'\|     \|     \|--\+0-\|     \|     \| D        D     F         T    T\s*'
            # r'\|_____\|_____\|__:__\|_____\|_____\|          D     F        T     T\s*'
            # r'\|     \|     \|     \|     \|     \|         D               T    T\s*'
            # r'\|     \|     \|     \|     \|     \|   DDDDDD       F         TTTT\s*'
            # r'\|_____\|_____\|_____\|_____\|_____\|______                    www\.bigdft\.org'
        )
    ),
198
    Wien2kParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
199
200
201
202
    LegacyParser(
        name='parsers/band', code_name='BAND', code_homepage='https://www.scm.com/product/band_periodicdft/',
        parser_class_name='bandparser.BANDParser',
        mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'),
203
    QuantumEspressoParser(),
204
    GaussianParser(),
205
    AbinitParser(),
206
    OrcaParser(),
207
    CastepParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
208
209
210
211
212
213
214
215
216
217
    LegacyParser(
        name='parsers/dl-poly', code_name='DL_POLY', code_homepage='https://www.scd.stfc.ac.uk/Pages/DL_POLY.aspx',
        parser_class_name='dlpolyparser.DlPolyParserWrapper',
        mainfile_contents_re=(r'\*\* DL_POLY \*\*')
    ),
    LegacyParser(
        name='parsers/lib-atoms', code_name='libAtoms', code_homepage='https://libatoms.github.io/',
        parser_class_name='libatomsparser.LibAtomsParserWrapper',
        mainfile_contents_re=(r'\s*<GAP_params\s')
    ),
218
    OctopusParser(),
219
    GPAWParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
220
    LegacyParser(
221
        name='parsers/atk', code_name='AtomistixToolKit', code_homepage='https://www.synopsys.com/silicon/quantumatk.html',
Markus Scheidgen's avatar
Markus Scheidgen committed
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
        parser_class_name='atkparser.ATKParserWrapper',
        # mainfile_contents_re=r'',  # We can't read .gpw as txt - of UlmGPAW|AFFormatGPAW'
        mainfile_name_re=r'^.*\.nc',
        # The previously used mime type r'application/x-netcdf' wasn't found by magic library.
        mainfile_mime_re=r'application/octet-stream'
    ),
    LegacyParser(
        name='parsers/gulp', code_name='gulp', code_homepage='http://gulp.curtin.edu.au/gulp/',
        parser_class_name='gulpparser.GULPParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*'
            r'\*\*\*\*\*\*\*\*\*\*\*\*\*\s*'
            r'\s*\*\s*GENERAL UTILITY LATTICE PROGRAM\s*\*\s*')
    ),
    LegacyParser(
        name='parsers/siesta', code_name='Siesta', code_homepage='https://departments.icmab.es/leem/siesta/',
        parser_class_name='siestaparser.SiestaParser',
        mainfile_contents_re=(
            r'(Siesta Version: siesta-|SIESTA [0-9]\.[0-9]\.[0-9])|'
            r'(\*\s*WELCOME TO SIESTA\s*\*)')
    ),
    LegacyParser(
        name='parsers/elk', code_name='elk', code_homepage='http://elk.sourceforge.net/',
        parser_class_name='elkparser.ElkParser',
        mainfile_contents_re=r'\| Elk version [0-9.a-zA-Z]+ started \|'
    ),
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
248
    ElasticParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
249
250
251
252
253
254
255
256
    LegacyParser(
        name='parsers/gamess', code_name='GAMESS', code_homepage='https://www.msg.chem.iastate.edu/gamess/versions.html',
        parser_class_name='gamessparser.GamessParser',
        mainfile_contents_re=(
            r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*'
            r'\s*\*\s*GAMESS VERSION =\s*(.*)\*\s*'
            r'\s*\*\s*FROM IOWA STATE UNIVERSITY\s*\*\s*')
    ),
257
    TurbomoleParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
258
259
    MPESParser(),
    APTFIMParser(),
260
    EELSApiJsonConverter(),
Markus Scheidgen's avatar
Markus Scheidgen committed
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
    LegacyParser(
        name='parsers/qbox', code_name='qbox', code_homepage='http://qboxcode.org/', domain='dft',
        parser_class_name='qboxparser.QboxParser',
        mainfile_mime_re=r'(application/xml)|(text/.*)',
        mainfile_contents_re=(r'http://qboxcode.org')
    ),
    LegacyParser(
        name='parsers/dmol', code_name='DMol3', code_homepage='http://dmol3.web.psi.ch/dmol3.html', domain='dft',
        parser_class_name='dmol3parser.Dmol3Parser',
        mainfile_name_re=r'.*\.outmol',
        mainfile_contents_re=r'Materials Studio DMol\^3'
    ),
    LegacyParser(
        name='parsers/fleur', code_name='fleur', code_homepage='https://www.flapw.de/', domain='dft',
        parser_class_name='fleurparser.FleurParser',
        mainfile_contents_re=r'This output is generated by fleur.'
    ),
    LegacyParser(
        name='parsers/molcas', code_name='MOLCAS', code_homepage='http://www.molcas.org/', domain='dft',
        parser_class_name='molcasparser.MolcasParser',
        mainfile_contents_re=r'M O L C A S'
    ),
    LegacyParser(
        name='parsers/onetep', code_name='ONETEP', code_homepage='https://www.onetep.org/', domain='dft',
        parser_class_name='onetepparser.OnetepParser',
        mainfile_contents_re=r'####### #     # ####### ####### ####### ######'
    ),
    LegacyParser(
        name='parsers/openkim', code_name='OpenKIM', domain='dft',
        parser_class_name='openkimparser.OpenKIMParser',
        mainfile_contents_re=r'OPENKIM'
    ),
    LegacyParser(
        name='parsers/tinker', code_name='TINKER', domain='dft',
        parser_class_name='tinkerparser.TinkerParser',
        mainfile_contents_re=r'TINKER  ---  Software Tools for Molecular Design'
    ),
298
    LammpsParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
299
300
301
302
303
    LegacyParser(
        name='parsers/amber', code_name='Amber', domain='dft',
        parser_class_name='amberparser.AMBERParser',
        mainfile_contents_re=r'\s*Amber\s[0-9]+\s[A-Z]+\s*[0-9]+'
    ),
304
    GromacsParser(),
Markus Scheidgen's avatar
Markus Scheidgen committed
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
    LegacyParser(
        name='parsers/gromos', code_name='Gromos', domain='dft',
        parser_class_name='gromosparser.GromosParser',
        mainfile_contents_re=r'Bugreports to http://www.gromos.net'
    ),
    LegacyParser(
        name='parsers/namd', code_name='Namd', domain='dft',
        parser_class_name='namdparser.NamdParser',
        mainfile_contents_re=r'\s*Info:\s*NAMD\s*[0-9.]+\s*for\s*',
        mainfile_mime_re=r'text/.*',
    ),
    LegacyParser(
        name='parsers/charmm', code_name='Charmm', domain='dft',
        parser_class_name='charmmparser.CharmmParser',
        mainfile_contents_re=r'\s*Chemistry\s*at\s*HARvard\s*Macromolecular\s*Mechanics\s*',
        mainfile_mime_re=r'text/.*',
    ),
    LegacyParser(
Markus Scheidgen's avatar
Markus Scheidgen committed
323
        name='parsers/dftbplus', code_name='DFTB+', domain='dft',
Markus Scheidgen's avatar
Markus Scheidgen committed
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
        parser_class_name='dftbplusparser.DFTBPlusParser',
        mainfile_contents_re=r'^ Fermi distribution function\s*',
        mainfile_mime_re=r'text/.*',
    ),
    LegacyParser(
        name='parsers/asap', code_name='ASAP', domain='dft',
        parser_class_name='asapparser.AsapParser',
        mainfile_name_re=r'.*.traj$',
        mainfile_mime_re=r'application/octet-stream',
    ),
    LegacyParser(
        name='parsers/fplo', code_name='fplo', domain='dft',
        parser_class_name='fploparser.FploParser',
        mainfile_contents_re=r'\s*\|\s*FULL-POTENTIAL LOCAL-ORBITAL MINIMUM BASIS BANDSTRUCTURE CODE\s*\|\s*',
        mainfile_mime_re=r'text/.*',
    ),
    LegacyParser(
        name='parsers/mopac', code_name='MOPAC', domain='dft',
        parser_class_name='mopacparser.MopacParser',
        mainfile_contents_re=r'\s*\*\*\s*MOPAC\s*([0-9a-zA-Z]*)\s*\*\*\s*',
        mainfile_mime_re=r'text/.*',
345
    ),
Alvin Noe Ladines's avatar
Alvin Noe Ladines committed
346
    LobsterParser(),
347
    ArchiveParser()
Markus Scheidgen's avatar
Markus Scheidgen committed
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
]

empty_parsers = [
    EmptyParser(
        name='missing/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/',
        domain='dft',
        mainfile_name_re=r'(inp)|(.*/inp)'
    ),
    EmptyParser(
        name='missing/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/index.php',
        domain='dft',
        mainfile_name_re=r'.*\.cryst\.out'
    ),
    EmptyParser(
        name='missing/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/',
        domain='dft',
        mainfile_name_re=r'.*\.scf'
    ),
    EmptyParser(
        name='missing/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/',
        domain='dft',
        mainfile_name_re=r'.*\.fhiaims'
    )
]

if config.use_empty_parsers:
    # There are some entries with PIDs that have mainfiles which do not match what
    # the actual parsers expect. We use the EmptyParser to produce placeholder entries
    # to keep the PIDs. These parsers will not match for new, non migrated data.
    parsers.extend(empty_parsers)

parsers.append(BrokenParser())

''' Instantiation and constructor based config of all parsers. '''

parser_dict = {parser.name: parser for parser in parsers + empty_parsers}  # type: ignore
''' A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. '''

# renamed parsers
parser_dict['parser/broken'] = parser_dict['parsers/broken']
parser_dict['parser/fleur'] = parser_dict['parsers/fleur']
parser_dict['parser/molcas'] = parser_dict['parsers/molcas']
parser_dict['parser/octopus'] = parser_dict['parsers/octopus']
parser_dict['parser/onetep'] = parser_dict['parsers/onetep']

# register code names as possible statistic value to the dft datamodel
Markus Scheidgen's avatar
Markus Scheidgen committed
394
395
396
397
398
399
400
401
402
403
code_names = []
for parser in parsers:
    if parser.domain == 'dft' and \
            getattr(parser, 'code_name', None) is not None and \
            getattr(parser, 'code_name') != 'currupted mainfile' and \
            getattr(parser, 'code_name') != 'Template':
        code_names.append(getattr(parser, 'code_name'))
code_names = sorted(set(code_names), key=lambda code_name: code_name.lower())
datamodel.DFTMetadata.code_name.a_search.statistic_values = code_names + [
    config.services.unavailable_value, config.services.not_processed_value]