FortranNamelistParser.py 19.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import setup_paths
import re
import sys
import os
import logging
from nomadcore.match_highlighter import ANSI

LOGGER = logging.getLogger(__name__)


# Regex for _valid_ Fortran integer/float output, what a mess ...
#
# Capture groups (numbered relative to the start of RE_f):
#   1     sign of the mantissa (may be empty)
#   2     mantissa written without a decimal point
#   3, 4  whole / fractional part of a mantissa written with a decimal point
#   5     exponent letter (e, E, d or D)
#   6, 7  exponent sign / exponent digits
#   8     run of '*' (number too wide for its output field)
RE_f = (r"(?:" + # outer alternative, between numbers and number-too-wide-for-field markers
    r"([+-]?)(?:" + # MANTISSA, SIGN (group 1, optional), followed by alternatives
        '|'.join([ # MANTISSA
            # MANTISSA without a decimal point, group 2.
            # The look-ahead must reject a following digit as well as a dot:
            # otherwise the engine backtracks and matches only the leading
            # '1' of '12.5', splitting one float into two values.
            r"(\d+(?![\.\d]))",
            r"(\d*)" + ( # MANTISSA, WHOLE part (group 3)
                # we need negative look-ahead/look-behind assertions around the
                # decimal point as there is too much optional stuff around
                r"(?<![^\d\s+-])" + # char preceding the dot must be nothing but number, whitespace, or sign
                r"\." +
                r"(?![^eEdD\d\s,])" + # char succeeding the dot must be nothing but number, exponential/precision char, comma or whitespace
                r"(\d*)" # MANTISSA, FRACTIONAL part (group 4), separated by dot
            )
        ]) +
    r")(?:" + ( # EXPONENT part (optional)
        r"([eEdD])" + # PRECISION (group 5)
        r"([+-]?)(\d*)" # EXPONENT SIGN (group 6), VALUE (group 7)
    ) + ")?" + # make precision/exponent part optional
    r"|(\*+))" # outer alternative, between numbers and number-too-wide markers (group 8)
)
cRE_f = re.compile(RE_f)


def match_to_float(m, group_offset=0):
    """Convert a match of the Fortran number regex into a Python number.

    Args:
        m: match object whose groups ``group_offset + 1`` up to
            ``group_offset + 8`` are the eight capture groups of RE_f.
        group_offset: number of capture groups that precede the RE_f
            groups in the pattern which produced ``m``.

    Returns:
        Tuple ``(value, dtype)``. ``dtype`` is ``'i'`` (``value`` is an
        int) for a mantissa without decimal point and without exponent,
        otherwise ``'f'`` (``value`` is a float). A field consisting of
        ``*`` markers yields ``nan``.
    """
    def or_zero(digits):
        # an omitted digit run (as in '.5', '3.' or '2e') stands for zero
        return digits if len(digits) > 0 else '0'

    fields = [m.group(0)]
    fields.extend(m.group(group_offset + idx) for idx in range(1, 9))
    LOGGER.debug("g: %s", str(fields))
    (_, sign, int_mantissa, whole, fraction,
     exp_char, exp_sign, exp_digits, too_wide) = fields

    if too_wide is not None:
        # number did not fit into its output field
        number_text, dtype = 'nan', 'f'
    else:
        if int_mantissa is not None:
            number_text, dtype = sign + int_mantissa, 'i'
        else:
            number_text = sign + or_zero(whole) + '.' + or_zero(fraction)
            dtype = 'f'
        if exp_char is not None:
            # any exponent letter (e/E/d/D) becomes python's 'e'
            number_text += 'e' + exp_sign + or_zero(exp_digits)
            dtype = 'f'
    LOGGER.debug("pyfloat_str: %s", number_text)

    if dtype == 'f':
        return (float(number_text), dtype)
    return (int(number_text), dtype)
59
60
61
62
63
64
65
66
67

# For each quote character: compiled pattern matching its escaped (doubled)
# form as it appears inside a string delimited by that character.
RE_unescape = {
    quote: re.compile(quote + quote) for quote in ('"', "'")
}


def unquote_string(value):
    """Return the content of a quoted string literal.

    The first and last character of ``value`` (the enclosing quotes) are
    dropped, and every doubled occurrence of the opening quote character
    inside is collapsed into a single one.
    """
    quote_char = value[0]
    content = value[1:-1]
    return RE_unescape[quote_char].sub(quote_char, content)
70
71
72
73


# --- compiled patterns used by the namelist parser states -----------------

# complete quoted string that contains no quote character of its own kind
cRE_string_quoted = re.compile(r"(?:'[^']*'|\"[^\"]*\")")
# comment: '!' up to the end of the line, optionally preceded by whitespace
cRE_comment = re.compile(r"\s*!(?P<comment>.*)")

# fortran identifier
RE_identifier = r"[a-zA-Z]\w*"
# beginning of namelist group: '&' followed by the group name
cRE_start_group = re.compile(r'\s*&(' + RE_identifier + r')')
# end of namelist group
cRE_end_group = re.compile(r'\s*/')

# Alternatives for one token on the right-hand side of an assignment; the
# named group that matched identifies the kind of literal.
# NOTE: order and number of capture groups matter, the parser hands fixed
# group offsets (1, 10, 19) for 'num', 'cnum_r', 'cnum_i' to match_to_float.
_RE_VALUE_ALTERNATIVES = (
    # integers and floats
    r'(?P<num>' + RE_f + r')',
    # complex numbers
    r'\(\s*(?P<cnum_r>' + RE_f + r')\s*,\s*(?P<cnum_i>' + RE_f + r')\s*\)',
    # true-value bool
    r'(?P<bool_t>\.t(?:rue)?\.)',
    # false-value bool
    r'(?P<bool_f>\.f(?:alse)?\.)',
    # single-quoted string, closed, allowing for escaped quotes ('')
    r"(?P<str_s>'[^']*(?:[^']|'')*'(?!'))",
    # double-quoted string, closed, allowing for escaped quotes ("")
    r'(?P<str_d>"[^"]*(?:[^"]|"")*"(?!"))',
    # single-quoted string, not closed
    r"(?P<str_s_nc>'[^']*(?:[^']|'')*)",
    # double-quoted string, not closed
    r'(?P<str_d_nc>"[^"]*(?:[^"]|"")*)',
    # comment
    r'!(?P<comment>.*)',
)
cRE_assigned_value = re.compile(
    r'\s*(?:' + '|'.join(_RE_VALUE_ALTERNATIVES) + ')', re.I)

# remainder of a single-quoted string up to and including its closing quote
cRE_str_s_close = re.compile(r"([^']*(?:[^']|'')*'(?!'))")
# remainder of a double-quoted string up to and including its closing quote
cRE_str_d_close = re.compile(r'([^"]*(?:[^"]|"")*"(?!"))')
# comma separating values
cRE_comma = re.compile(r'\s*,')
# whitespace reaching up to the end of the line
cRE_trailing_whitespace = re.compile(r'\s+$')

# left-hand side of an assignment: target name, subscript pieces, '=' sign
cRE_identifier = re.compile(r'\s*(?P<target>' + RE_identifier + r')')
cRE_assignment_subscript_open = re.compile(r'\s*\((?P<subscript>[^\)!]*)')
cRE_assignment_subscript_continue = re.compile(r'(?P<subscript>[^\)!]+)')
cRE_assignment_subscript_close = re.compile(r'(?P<subscript>[^\)!]*)\)')
cRE_assignment_equals = re.compile(r'\s*=')

# one entry of a subscript list: either 'start:end' (groups 1, 2) or a
# single index (group 3), optionally preceded by a comma
cRE_subscript = re.compile(r'\s*,?\s*(?:(\d*)\s*:\s*(\d*)|(\d+))')

# splits a string into its body (group 1) and its trailing newlines (group 2)
cRE_end_newline = re.compile(r'(.*?)(\n*)$')
104
105
106
107

class FortranNamelistParser(object):
    """Parser for Fortran 90 Namelists

    The input file is processed line by line by a small state machine:
    ``self.state`` holds the handler method for the current context (root
    level, inside a group, subscript, values, multi-line string). Each
    handler receives the line and a start position and returns the position
    up to which it consumed the line, or None if nothing could be parsed.

    Parsed assignments are collected in ``self.cache`` as
    ``cache[groupname][target] = [[subscript, values, dtypes], ...]``.
    Group and target names are lower-cased. Derived classes customize
    behavior by overriding the ``on*`` hook methods.
    """

    def __init__(self, file_path, annotateFile = None):
        """Set up the parser.

        Args:
            file_path: path of the namelist file read by parse().
            annotateFile: optional writable file-like object; if given, the
                input is echoed to it with ANSI highlight sequences.
        """
        self.input_tree = {}
        self.file_path = file_path
        # handler method for the current parser state
        self.state = self.state_root
        self.__annotateFile = annotateFile
        # bookkeeping for the namelist group / assignment currently open
        self.__nl_group = None
        self.__target = None
        self.__subscript = None
        self.__values = None
        self.__types = None
        self.__nvalues_after_comma = 0
        # pattern matching the closing quote of an open multi-line string
        self.__cre_closing = None
        # set to True as soon as any input could not be parsed
        self.bad_input = False
        self.cache = {}

    def parse(self):
        """open file and parse line-by-line"""
        with open(self.file_path, "r") as fIn:
            # process line-by-line
            for line in fIn:
                self.parse_line(line)
        # check if there was input flagged as 'bad'/'syntactically incorrect'
        if self.bad_input:
            # call bad-input hook
            self.onBad_input()
        # call end-of-file hook
        self.onEnd_of_file()

    def parse_line(self, line):
        """parse one line, delegating to the parser state handlers

        Whatever remains of the line after a handler returned None is
        flagged as bad input and annotated as such.
        """
        pos_in_line = 0
        while pos_in_line < len(line):
            new_pos_in_line = self.state(line, pos_in_line)
            # check if anything was parsed, otherwise cancel that line
            if new_pos_in_line is None:
                break
            pos_in_line = new_pos_in_line
        if pos_in_line < len(line):
            self.bad_input = True
            self.annotate(line[pos_in_line:], ANSI.BEGIN_INVERT + ANSI.FG_BRIGHT_RED)

    def annotate(self, what, highlight):
        """write string to annotateFile with ANSI highlight/reset sequences

        Trailing newlines of ``what`` are written after the reset sequence,
        i.e. they are not part of the highlighted text. Does nothing if no
        annotateFile was given.
        """
        if self.__annotateFile:
            m = cRE_end_newline.match(what)
            self.__annotateFile.write(highlight + m.group(1) + ANSI.RESET + m.group(2))

    def parse_subscript_string(self, subscript):
        """parse fully captured subscript string into python array

        Args:
            subscript: text between the parentheses of a subscript, or None.

        Returns:
            None if ``subscript`` is None; otherwise a list with one list of
            explicit indices per dimension, in reversed dimension order
            (Fortran has the fastest-running index first, python/c last).
            Text that cannot be parsed, including ranges with a missing
            bound such as ``:3``, is logged and flags the input as bad.
        """
        if subscript is None:
            return None
        result = []
        last_end = 0
        while last_end < len(subscript):
            m = cRE_subscript.match(subscript, last_end)
            if m is None:
                break
            if m.group(3) is not None:
                # single index
                indices = [int(m.group(3))]
            elif m.group(1) and m.group(2):
                # range with both bounds given, made explicit
                indices = list(range(int(m.group(1)), int(m.group(2)) + 1))
            else:
                # range with a missing bound cannot be made explicit without
                # knowing the array bounds; report it as leftover text below
                # instead of failing in int('')
                break
            # prepend: fortran has fastest-running index first,
            # python/c has fastest-running index last
            result.insert(0, indices)
            last_end = m.end()
        if subscript[last_end:].strip():
            LOGGER.error("ERROR: leftover chars in subscript: '%s'", subscript[last_end:])
            self.bad_input = True
        return result

    def state_root(self, line, pos_in_line):
        """state: no open namelist groups, i.e. at the root of the namelist"""
        m = cRE_start_group.match(line, pos_in_line)
        if m is not None:
            self.__nl_group = m.group(1).lower()
            self.annotate(m.group(), ANSI.FG_BRIGHT_GREEN)
            self.state = self.state_inside_group
            self.onOpen_namelist_group(self.__nl_group)
            return m.end()
        # but comments may appear here
        m = cRE_comment.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.FG_BLUE)
            self.onComment(m.group('comment'))
            return m.end()
        # as well as whitespace-only lines
        m = cRE_trailing_whitespace.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.BG_WHITE)
            return m.end()
        # nothing matched, call hook
        return self.onRoot_data(line, pos_in_line)

    def state_inside_group(self, line, pos_in_line):
        """state: inside opened group, but no open assignment"""
        # check for group-closing /
        m = cRE_end_group.match(line, pos_in_line)
        if m is not None:
            # we just closed a NL group
            self.annotate(m.group(), ANSI.BEGIN_INVERT + ANSI.FG_BRIGHT_GREEN)
            if self.__target is not None:
                # the group end also ends the assignment still pending
                self.onClose_value_assignment(
                    self.__nl_group,
                    self.__target, self.__subscript,
                    self.__values, self.__types)
            self.__target = None
            self.__subscript = None
            self.__values = None
            self.__types = None
            self.__nvalues_after_comma = 0
            self.onClose_namelist_group(self.__nl_group)
            self.__nl_group = None
            self.state = self.state_root
            return m.end()
        # check for new identifier (part of left-hand side of assignment)
        m = cRE_identifier.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.FG_GREEN)
            if self.__target is not None:
                # a new identifier ends the previous assignment
                self.onClose_value_assignment(
                    self.__nl_group,
                    self.__target, self.__subscript,
                    self.__values, self.__types)
            self.__target = m.group('target').lower()
            self.__subscript = None
            self.__values = []
            self.__types = []
            self.__nvalues_after_comma = 0
            return m.end()
        # check for new subscript (part of left-hand side of assignment)
        m = cRE_assignment_subscript_open.match(line, pos_in_line)
        if m is not None:
            self.annotate(line[pos_in_line:m.start('subscript')], ANSI.FG_GREEN)
            self.annotate(m.group('subscript'), ANSI.FG_CYAN)
            self.__subscript = m.group('subscript')
            self.state = self.state_assignment_subscript
            return m.end()
        # check for '=' sign in assignment, separating target from values
        m = cRE_assignment_equals.match(line, pos_in_line)
        if m is not None:
            if self.__target is None:
                # '=' without a preceding identifier: there is no value list
                # to fill, so reject it as bad input instead of failing
                # later when the first value is appended
                LOGGER.error("ERROR: '=' without assignment target in group &%s", self.__nl_group)
                return None
            self.annotate(line[pos_in_line:m.end()], ANSI.FG_GREEN)
            self.onOpen_value_assignment(
                self.__nl_group,
                self.__target, self.__subscript)
            self.state = self.state_assignment_values
            return m.end()
        # check for comments ('!' character up until end of line)
        m = cRE_comment.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.FG_BLUE)
            self.onComment(m.group('comment'))
            return m.end()
        # check for trailing whitespace
        m = cRE_trailing_whitespace.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.BG_WHITE)
            return m.end()
        return None

    def state_assignment_values(self, line, pos_in_line):
        """state: parse values, i.e. right-hand side of assignment"""
        # match value literals, groups decide on data type
        m = cRE_assigned_value.match(line, pos_in_line)
        if m is not None:
            if m.group('comment') is not None:
                # found a comment
                self.annotate(m.group(), ANSI.FG_BLUE)
                self.onComment(m.group('comment'))
            else:
                self.annotate(m.group(), ANSI.FG_YELLOW)
                if m.group('num') is not None:
                    # literal is single integer or float
                    (value, dtype) = match_to_float(m, group_offset=1)
                    self.__values.append(value)
                    self.__types.append(dtype)
                elif m.group('cnum_r') is not None:
                    # literal is complex: (float, float)
                    (cnum_r, _) = match_to_float(m, group_offset=10)
                    (cnum_i, _) = match_to_float(m, group_offset=19)
                    self.__values.append(complex(cnum_r, cnum_i))
                    self.__types.append('complex')
                elif m.group('bool_t') is not None:
                    # literal is a true-value bool
                    self.__values.append(True)
                    self.__types.append('b')
                elif m.group('bool_f') is not None:
                    # literal is a false-value bool
                    self.__values.append(False)
                    self.__types.append('b')
                elif m.group('str_s') is not None:
                    # literal is a closed, single-quoted string
                    self.__values.append(unquote_string(m.group('str_s')))
                    self.__types.append('C')
                elif m.group('str_d') is not None:
                    # literal is a closed, double-quoted string
                    self.__values.append(unquote_string(m.group('str_d')))
                    self.__types.append('C')
                elif m.group('str_s_nc') is not None:
                    # literal is a non-closed, single-quoted string; keep the
                    # raw text (including the opening quote) until it closes
                    self.state = self.state_assignment_values_multiline_string
                    self.__values.append(m.group('str_s_nc'))
                    self.__types.append('C')
                    self.__cre_closing = cRE_str_s_close
                elif m.group('str_d_nc') is not None:
                    # literal is a non-closed, double-quoted string; keep the
                    # raw text (including the opening quote) until it closes
                    self.state = self.state_assignment_values_multiline_string
                    self.__values.append(m.group('str_d_nc'))
                    self.__types.append('C')
                    self.__cre_closing = cRE_str_d_close
                # keep track if there were values following the previous comma
                self.__nvalues_after_comma += 1
            return m.end()
        # special meaning of comma: may indicate Null values in array assignments
        m = cRE_comma.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.FG_MAGENTA)
            if self.__nvalues_after_comma == 0:
                # there were no value literals preceding the comma, which
                # means a null value
                self.__values.append(None)
                self.__types.append(None)
            self.__nvalues_after_comma = 0
            return m.end()
        # check for trailing whitespace
        m = cRE_trailing_whitespace.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.BG_WHITE)
            return m.end()
        # if none of the above matched, switch back to checking for new
        # assignment; the position is unchanged, so the next handler sees
        # the same text
        self.state = self.state_inside_group
        return pos_in_line

    def state_assignment_values_multiline_string(self, line, pos_in_line):
        """state: parse multiline string in right-hand side of assignment"""
        # check for closing quotes
        m = self.__cre_closing.match(line, pos_in_line)
        if m is None:
            # no closing quotes, append the not-yet-consumed data to value
            self.annotate(line[pos_in_line:], ANSI.FG_YELLOW)
            self.__values[-1] += line[pos_in_line:]
            return len(line)
        # closing quotes, postprocess string
        self.annotate(m.group(), ANSI.FG_YELLOW)
        self.__values[-1] += m.group(1)
        # remove enclosing quotes and resolve escaped quotes in string
        self.__values[-1] = unquote_string(self.__values[-1])
        self.__cre_closing = None
        self.state = self.state_assignment_values
        return m.end()

    def state_assignment_subscript(self, line, pos_in_line):
        """state: capture subscript, possibly spanning multiple lines"""
        # check for closing bracket
        m = cRE_assignment_subscript_close.match(line, pos_in_line)
        if m is not None:
            # subscript closed, convert string form to python array
            self.annotate(m.group('subscript'), ANSI.FG_CYAN)
            self.annotate(line[m.end('subscript'):m.end()], ANSI.FG_GREEN)
            self.__subscript = self.parse_subscript_string(self.__subscript + m.group('subscript'))
            self.state = self.state_inside_group
            return m.end()
        # check for new indices in subscript
        m = cRE_assignment_subscript_continue.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group('subscript'), ANSI.FG_CYAN)
            self.__subscript += m.group('subscript')
            return m.end()
        # comments may appear within subscripts spanning multiple lines
        m = cRE_comment.match(line, pos_in_line)
        if m is not None:
            self.annotate(m.group(), ANSI.FG_BLUE)
            self.onComment(m.group('comment'))
            return m.end()
        self.annotate(line[pos_in_line:], ANSI.BEGIN_INVERT + ANSI.FG_BRIGHT_RED)
        LOGGER.error("ERROR: leftover chars in line while inside subscript: '%s'", line[pos_in_line:])
        self.bad_input = True
        return None

    # Hooks to be overloaded in derived classes in order to do stuff beyond caching
    def onComment(self, comment):
        """hook: called when a comment was found"""
        pass

    def onOpen_namelist_group(self, groupname):
        """hook: called when a namelist group opens

        Creates the cache entry of the group; a group name seen before is
        logged and flags the input as bad.
        """
        if groupname in self.cache:
            LOGGER.error("ERROR: multiple definitions of group &%s", groupname)
            self.bad_input = True
        else:
            self.cache[groupname] = {}

    def onClose_namelist_group(self, groupname):
        """hook: called when a namelist group closes

        Logs the cached content of the group.
        """
        # NOTE(review): this dump is emitted at ERROR level although it does
        # not report a failure; confirm whether a lower level is intended
        LOGGER.error("group: %s", groupname)
        for identifier in sorted(self.cache[groupname]):
            LOGGER.error("  %s: %s", identifier, str(self.cache[groupname][identifier]))

    def onOpen_value_assignment(self, groupname, target, subscript):
        """hook: called when a value assignment within a namelist group starts"""
        pass

    def onClose_value_assignment(self, groupname, target, subscript, values, dtypes):
        """hook: called when a value assignment within a namelist group closes
        Arguments are: NL group name, identifier/subscript, values and assumed
        data types

        Appends ``[subscript, values, dtypes]`` to the cache entry of the
        target within the group.
        """
        if subscript is None:
            LOGGER.debug("SET %s/%s = %s (types: %s)", groupname, target, str(values), str(dtypes))
        else:
            LOGGER.debug("SET %s/%s(%s) = %s (types: %s)", groupname, target, subscript, str(values), str(dtypes))
        if target not in self.cache[groupname]:
            self.cache[groupname][target] = []
        self.cache[groupname][target].append([subscript, values, dtypes])

    def onRoot_data(self, line, pos_in_line):
        """hook: called if data appears outside namelists groups, directly
        at root level within the file;
        data means: line is not empty or a comment
        useful for code-specific extensions beyond the F90 namelist standard

        Must return the position up to which the line was consumed, or None
        to have the rest of the line flagged as bad input.
        """
        return None

    def onBad_input(self):
        """hook: called at the end of parsing if there was any bad input"""
        pass

    def onEnd_of_file(self):
        """hook: called at the end of parsing"""
        pass

453
if __name__ == "__main__":
    # command-line use: parse the file named by the first argument and echo
    # the input to stdout with ANSI highlighting
    input_path = sys.argv[1]
    parser = FortranNamelistParser(input_path, annotateFile=sys.stdout)
    parser.parse()