diff --git a/parser/parser-quantum-espresso/FortranNamelistParser.py b/parser/parser-quantum-espresso/FortranNamelistParser.py new file mode 100644 index 0000000000000000000000000000000000000000..f9dfdf7a684ed9b5a7bf462a8b8479ae0f559695 --- /dev/null +++ b/parser/parser-quantum-espresso/FortranNamelistParser.py @@ -0,0 +1,289 @@ +import setup_paths +import re +import sys +import os +import logging +from nomadcore.match_highlighter import ANSI + +LOGGER = logging.getLogger(__name__) + + +# regex for _valid_ fortran float output, what a mess ... +RE_f = (r"(?:" + # outer alternative, between numbers and number-too-wide-for-field markers + r"([+-]?)(?:" + # MANTISSA, SIGN (group 1, optional), followed by alternatives + '|'.join([ # MANTISSA + r"(\d+(?!\.))", # MANTISSA without a decimal point, group 2 + r"(\d*)" + ( # MANTISSA, WHOLE part (group 3) + # we need negative look-ahead/look-behind assertions around the + # decimal point as there is too much optional stuff around + r"(?<![^\d\s+-])" + # char preceding the dot must be nothing but number, whitespace, or sign + r"\." + + r"(?![^eEdD\d\s,])" + # char succeeding the dot must be nothing but number, exponential/precision char, comma or whitespace + r"(\d*)" # MANTISSA, FRACTIONAL part (group 4), separated by dot + ) + ]) + + r")(?:" + ( # EXPONENT part (optional) + r"([eEdD])" + # PRECISION (group5) + r"([+-]?)(\d*)" # EXPONENT SIGN (group 6), VALUE (group 7) + ) + ")?" + # make precision/exponet part optinal + r"|(\*+))" # outer alternative, between numbers and number-too-wide markers (group 8) +) +cRE_f = re.compile(RE_f) + + +def match_to_float(m, group_offset=0): + group = [ m.group(0) ] + [ m.group(group_offset + i) for i in range(1,9)] + LOGGER.debug("g: %s", str(group)) + if group[8] is not None: + pyfloat_str = 'nan' + dtype = 'f' + else: + pyfloat_str = group[1] # sign, maybe zero-length + if group[2] is not None: + pyfloat_str += group[2] + dtype = 'i' + else: + pyfloat_str += group[3] if len(group[3])>0 else '0' + pyfloat_str += '.' + pyfloat_str += group[4] if len(group[4])>0 else '0' + dtype = 'f' + if group[5] is not None: + pyfloat_str += 'e' + group[6] + pyfloat_str += group[7] if len(group[7])>0 else '0' + dtype = 'f' + LOGGER.debug("pyfloat_str: %s", pyfloat_str) + return (float(pyfloat_str), dtype) + + +RE_unescape = { + '"': re.compile(r'""'), + "'": re.compile(r"''"), +} + + +def unquote_string(value): + result = value[1:-1] + return RE_unescape[value[0]].sub(value[0], result) + + +# quoted strings +cRE_string_quoted = re.compile(r"(?:'[^']*'|\"[^\"]*\")") +cRE_comment = re.compile(r"\s*!.*") +RE_identifier = r"[a-zA-Z]\w*" # fortran identifier +cRE_start_group = re.compile(r'\s*&(' + RE_identifier + r')') # beginning of namelist group +cRE_end_group = re.compile(r'\s*/') +cRE_start_assignment = re.compile(r'\s*(?P<target>' + RE_identifier + r')(?:\(\s*(?P<subscript>[^\)]*?)\s*\))?\s*=\s*') +cRE_assigned_value = re.compile( + r'\s*(?:' + '|'.join([ + r'(?P<num>' + RE_f + r')', # integers and floats + r'\(\s*(?P<cnum_r>' + RE_f + r')\s*,\s*(?P<cnum_i>' + RE_f + r')\s*\)', # complex numbers + r'(?P<bool_t>\.t(?:rue)?\.)', # true-value bool + r'(?P<bool_f>\.f(?:alse)?\.)', # false-value bool + r"(?P<str_s>'[^']*(?:[^']|'')*'(?!'))", # single-quoted string, closed, allowing for escaped quotes ('') + r'(?P<str_d>"[^"]*(?:[^"]|"")*"(?!"))', # double-quoted string, closed, allowing for escaped quotes ("") + r"(?P<str_s_nc>'[^']*(?:[^']|'')*)", # single-quoted string, not closed + r'(?P<str_d_nc>"[^"]*(?:[^"]|"")*)', # double-quoted string, not closed + r'(?P<comment>!.*)', # comment + ]) + ')', re.I) +cRE_str_s_close = re.compile(r"([^']*(?:[^']|'')*'(?!'))") # single-quoted string, closing +cRE_str_d_close = re.compile(r'([^"]*(?:[^"]|"")*"(?!"))') # double-quoted string, closing +cRE_comma = re.compile(r'\s*,') + + +class FortranNamelistParser(object): + """Parser for Fortran 90 Namelists + """ + def __init__(self, file_path): + self.input_tree = {} + self.file_path = file_path + self.state = 0 + self.nl_group = None + self.target = None + self.target_subscript = None + self.values = None + self.types = None + self.nvalues_after_comma = 0 + + def parse(self): + with open(self.file_path, "r") as fIn: + # split lines into 'line' and 'comment' parts + for line in fIn: + # strip final newline if it exists + if line[-1] == '\n': + line = line[:-1] + self.parse_line(line) + + def parse_line(self, line): + last_end = 0 + while last_end<len(line): + if self.state == 0: + # we have no open group + m = cRE_start_group.match(line, last_end) + if m is not None: + self.nl_group = m.group(1) + sys.stdout.write(ANSI.FG_BRIGHT_YELLOW + m.group() + ANSI.RESET) + last_end = m.end() + self.state = 1 + self.onOpen_namelist_group(m.group(1)) + continue + # but comments may appear here + m = cRE_comment.match(line, last_end) + if m is not None: + sys.stdout.write(ANSI.FG_BLUE + m.group() + ANSI.RESET) + last_end = m.end() + self.onComment(m.group()) + continue + elif self.state==3: + # we are inside single-quoted multiline string + m = cRE_str_s_close.match(line, last_end) + if m is None: + sys.stdout.write(ANSI.FG_YELLOW + line[last_end:] + ANSI.RESET) + self.values[-1] += "\n" + line + last_end=len(line) + else: + sys.stdout.write(ANSI.FG_YELLOW + m.group() + ANSI.RESET) + self.values[-1] += "\n" + m.group(1) + self.values[-1] = unquote_string(self.values[-1]) + self.types[-1] = 'C' + last_end=m.end() + self.state = 2 + continue + elif self.state==4: + # we are inside double-quoted multiline string + m = cRE_str_d_close.match(line, last_end) + if m is None: + sys.stdout.write(ANSI.FG_YELLOW + line[last_end:] + ANSI.RESET) + self.values[-1] += "\n" + line + last_end=len(line) + else: + sys.stdout.write(ANSI.FG_YELLOW + m.group() + ANSI.RESET) + self.values[-1] += "\n" + m.group(1) + self.values[-1] = unquote_string(self.values[-1]) + self.types[-1] = 'C' + last_end=m.end() + self.state = 2 + continue + else: + # we are inside opened group + # check for group-closing / + m = cRE_end_group.match(line, last_end) + if m is not None: + if self.target is not None: + self.onClose_value_assignment( + self.target, self.target_subscript, + self.values, self.types) + self.target = None + self.target_subscript = None + self.values = None + self.types = None + self.nvalues_after_comma = 0 + self.onClose_namelist_group(self.nl_group) + self.nl_group = None + sys.stdout.write(ANSI.BEGIN_INVERT + ANSI.FG_BRIGHT_YELLOW + m.group() + ANSI.RESET) + self.state = 0 + last_end = m.end() + continue + # check for new assignment + m = cRE_start_assignment.match(line, last_end) + if m is not None: + if self.target is not None: + self.onClose_value_assignment( + self.target, self.target_subscript, + self.values, self.types) + self.state = 2 + last_end=m.end() + sys.stdout.write(ANSI.FG_GREEN + m.group() + ANSI.RESET) + self.target = m.group('target') + self.target_subscript = m.group('subscript') + self.values = [] + self.types = [] + self.values_after_comma = 0 + self.onOpen_value_assignment( + self.target, self.target_subscript) + continue + if self.state >= 2: + # we are inside the values-part of an assignment + m = cRE_assigned_value.match(line, last_end) + if m is not None: + if m.group('num') is not None: + (value, dtype) = match_to_float(m, group_offset=1) + self.values.append(value) + self.types.append(dtype) + elif m.group('cnum_r') is not None: + (cnum_r, dtype) = match_to_float(m, group_offset=10) + (cnum_i, dtype) = match_to_float(m, group_offset=19) + self.values.append(complex(cnum_r, cnum_i)) + self.types.append('complex') + elif m.group('bool_t') is not None: + self.values.append(True) + self.types.append('b') + elif m.group('bool_f') is not None: + self.values.append(False) + self.types.append('b') + elif m.group('str_s') is not None: + self.values.append(unquote_string(m.group('str_s'))) + self.types.append('C') + elif m.group('str_d') is not None: + self.values.append(unquote_string(m.group('str_d'))) + self.types.append('C') + elif m.group('str_s_nc') is not None: + # non-closed single-quoted string + self.state=3 + self.values.append(m.group('str_s_nc')) + self.types.append('string_singlequoted') + elif m.group('str_d_nc') is not None: + # non-closed double-quoted string + self.state=4 + self.values.append(m.group('str_d_nc')) + self.types.append('string_doublequoted') + elif m.group('comment') is not None: + sys.stdout.write(ANSI.FG_BLUE + m.group() + ANSI.RESET) + last_end=m.end() + self.onComment(m.group()) + continue + self.values_after_comma +=1 + sys.stdout.write(ANSI.FG_YELLOW + m.group() + ANSI.RESET) + last_end=m.end() + continue + # special meaning of comma: may indicate Null values in array assignments + m = cRE_comma.match(line, last_end) + if m is not None: + if self.values_after_comma is 0: + self.values.append(None) + self.types.append(None) + self.values_after_comma = 0 + sys.stdout.write(ANSI.FG_MAGENTA + m.group() + ANSI.RESET) + last_end = m.end() + continue + break + if last_end < len(line): + line_leftover = line[last_end:] + if self.state > 0 and line_leftover.strip(): + LOGGER.error("ERROR: leftover chars in line while inside namelist group") + sys.stdout.write(ANSI.BEGIN_INVERT + ANSI.FG_BRIGHT_RED + line_leftover + ANSI.RESET) + else: + sys.stdout.write(ANSI.BEGIN_INVERT + ANSI.FG_BLUE + line_leftover + ANSI.RESET) + sys.stdout.write('\n') + + # Hooks to be overloaded in derived classes in order to do stuff + def onComment(self, comment): + pass + + def onOpen_namelist_group(self, groupname): + pass + + def onClose_namelist_group(self, groupname): + pass + + def onOpen_value_assignment(self, target, subscript): + pass + + def onClose_value_assignment(self, target, subscript, values, dtypes): + if subscript is None: + LOGGER.error("SET %s = %s (types: %s)", target, str(values), str(dtypes)) + else: + LOGGER.error("SET %s(%s) = %s (types: %s)", target, subscript, str(values), str(dtypes)) + +if __name__ == "__main__": + parser = FortranNamelistParser(sys.argv[1]) + parser.parse()