Commit cb222142 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Better logic for parsing input files, added support for aliases and default/lone keyword values.

parent 88166032
......@@ -33,9 +33,9 @@ the common parser structure when it is available.
# Structure
Currently the python package is divided into three subpackages:
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
- Engines: Classes for parsing different types of files
- Generics: Generic utility classes and base classes
- Implementation: The classes that actually define the parser functionality.
# Reusable components and ideas for other parsers
......@@ -56,13 +56,13 @@ the performance of an engine but if the function calls remain the same no other
code has to be changed.
Currently implemented engines that could be reused (not tested properly yet):
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XYZEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
- XMLEngine: For parsing XML files using XPath syntax.
- RegexEngine: For parsing text files with regular expressions. Uses the re2
library if available (falls back to default python regex implementation if
re2 not found).
- XYZEngine: For parsing XYZ files and files with similar structure. Has a very
flexible nature as you can specify comments, column delimiters, column
indices and the patterns used to separate different configurations.
- XMLEngine: For parsing XML files using XPath syntax.
## NomadParser base class
In the generics folder there is a module called nomadparser.py that defines a
......@@ -74,11 +74,11 @@ the scala code (will be modified later to conform to the common interface).
This class is also responsible for some common tasks that are present in all
parsers:
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
- Unit conversion
- JSON encoding
- Caching
- Time measurement for performance analysis
- Providing file contents, sizes and handles
## Logging
Python has a great [logging package](https://docs.python.org/2/library/logging.html) which helps in
......
......@@ -3,6 +3,7 @@
import os
from collections import defaultdict
import logging
import cPickle as pickle
logger = logging.getLogger(__name__)
......@@ -11,9 +12,7 @@ class CP2KInputEngine(object):
"""Used to parse out a CP2K input file.
When given a file handle to a CP2K input file, this class attemts to parse
out it's structure into an accessible object tree. Because the input file
has such a clearly defined structure (unlike the output file of CP2K), it
is better to use a dedicated parser instead of regular expressions.
out it's structure into an accessible object tree.
"""
def __init__(self, parser):
"""
......@@ -23,16 +22,15 @@ class CP2KInputEngine(object):
"""
self.parser = parser
self.root_section = None
self.xml_file = None
def parse_input(self):
"""Parses the given CP2K input string"""
self.input_tree = None
def parse(self):
"""Parses the CP2K input file into an object tree.
"""
# The input file should be quite small, so just get the entire contents
inp = self.parser.get_file_contents("input")
root_section = InputSection('CP2K_INPUT')
section_stack = [root_section]
section_stack = []
for line in inp.split('\n'):
line = line.split('!', 1)[0].strip()
......@@ -40,168 +38,406 @@ class CP2KInputEngine(object):
continue
if line.upper().startswith('&END'):
s = section_stack.pop()
section_stack.pop()
elif line[0] == '&':
parts = line.split(' ', 1)
name = parts[0][1:]
section_stack.append(name)
# Form the path
path = ""
for index, item in enumerate(section_stack):
if index != 0:
path += '/'
path += item
# print path
# Save the section parameters
if len(parts) > 1:
s = InputSection(name=name, params=parts[1].strip())
else:
s = InputSection(name=name)
section_stack[-1].subsections[name.upper()].append(s)
section_stack.append(s)
self.input_tree.set_parameter(path, parts[1].strip())
else:
split = line.split(' ', 1)
keyword_name = split[0]
keyword_value = split[1]
section_stack[-1].keywords[keyword_name].append(keyword_value)
self.root_section = root_section
def get_subsection(self, path, index=0):
return self.root_section.get_subsection(path, index)
self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value)
def get_keyword(self, path, index=0):
split = path.rsplit('/', 1)
section_path = split[0]
keyword = split[1]
section = self.root_section.get_subsection(section_path, index)
if section is not None:
return section.get_keyword(keyword, section_path, self)
def get_parameter(self, path, index=0):
section = self.root_section.get_subsection(path, index)
if section is not None:
return section.get_parameter(self, path)
def get_input_tree(self):
if self.input_tree is not None:
return self.input_tree
else:
logger.error("Input tree not yet created.")
def setup_version_number(self, version_number):
xml_file_path = os.path.dirname(__file__) + "/cp2kinputenginedata/xml/cp2k_{}/cp2k_input.xml".format(version_number)
self.xml_file = open(xml_file_path, 'r')
def get_xml_file(self):
"""Return the file handle that has been reset to the beginning.
"""
self.xml_file.seek(os.SEEK_SET)
return self.xml_file
pickle_path = os.path.dirname(__file__) + "/cp2kinputenginedata/cp2k_{}/cp2k_input_tree.pickle".format(version_number)
input_tree_pickle_file = open(pickle_path, 'rb')
self.input_tree = pickle.load(input_tree_pickle_file)
#===============================================================================
class InputSection(object):
    """Represents a section in a CP2K input file.

    A section has an upper-cased name, an optional section parameter (the
    text following ``&NAME`` on the opening line), keywords and nested
    subsections. Keywords and subsections are stored in ``defaultdict(list)``
    containers because the same name may occur several times.
    """

    def __init__(self, name, params=None):
        """
        Args:
            name: Name of the section. Stored upper-cased.
            params: The section parameter string, or None if not given.
        """
        self.name = name.upper()
        self.params = params
        # keyword name -> list of values, one entry per occurrence
        self.keywords = defaultdict(list)
        # upper-cased subsection name -> list of InputSection objects
        self.subsections = defaultdict(list)

    def write(self):
        """Outputs input section as a list of lines.

        Keywords of this section come first (as ``NAME VALUE``), followed by
        every subsection wrapped in ``&NAME [params]`` / ``&END NAME`` with
        its own lines prefixed by one space per nesting level.

        Returns:
            A list of strings, one per output line.
        """
        output = []
        # .items()/.values() work on both Python 2 and 3, unlike iteritems().
        for name, values in self.keywords.items():
            for value in values:
                # The container only holds the value part of the line, so the
                # keyword name has to be written back explicitly.
                output.append('%s %s' % (name, value))
        for sections in self.subsections.values():
            for section in sections:
                if section.params:
                    output.append('&%s %s' % (section.name, section.params))
                else:
                    output.append('&%s' % section.name)
                output.extend(' %s' % line for line in section.write())
                output.append('&END %s' % section.name)
        return output

    def get_subsection(self, path, index=0):
        """Finds a subsection specified by a string where subsections are
        separated by a slash. If multiple subsections are found with the same
        name, the one specified by the given index (default 0) is returned.

        Example: get_subsection("FORCE_EVAL/PRINT/FORCES")

        Args:
            path: String indicating the path to the subsection. The lookup
                is case-insensitive because the path is upper-cased.
            index: In case of repeating subsections, return the one specified
                by this index. It applies to the first path component only;
                the remaining components are resolved with the default
                index.

        Returns:
            The InputSection object if found, otherwise None.
        """
        parts = path.upper().split('/', 1)
        # .get() avoids creating an empty entry in the defaultdict.
        candidates = self.subsections.get(parts[0])
        if not candidates:
            logger.debug("Subsection '{}' not found.".format(parts[0]))
            return None
        elif len(candidates) > 1:
            logger.warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0]))
        try:
            subsection = candidates[index]
        except IndexError:
            logger.error("Invalid subsection index given.")
            # Without this return the name 'subsection' would be unbound below.
            return None
        if len(parts) == 1:
            return subsection
        return subsection.get_subsection(parts[1])

    def get_keyword(self, keyword, section_path, engine, index=0):
        """Finds a keyword specified by a string. If multiple keywords are
        found with the same name, the one specified by the given index (default
        0) is returned. If the keyword is not explicitly set, returns the
        default specified by the cp2k version specific XML file.

        Args:
            keyword: String indicating the name of the keyword. The name is the
                first word in the line.
            section_path: Slash separated path of this section, used to
                locate the keyword default in the XML file.
            engine: Object providing get_xml_file() and parser.xmlengine.
                Only used when the keyword is not explicitly set.
            index: In case of repeating keywords, return the one specified
                by this index.

        Returns:
            The keyword value (everything else than the first word on the
            line), or None if the index is invalid or no default is found.
        """
        candidates = self.keywords.get(keyword)
        if not candidates:
            logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name))
            xpath = self._section_xpath(section_path)
            xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword)
            return self._xml_default(engine, xpath)
        elif len(candidates) > 1:
            logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword))
        try:
            return candidates[index]
        except IndexError:
            logger.error("Invalid keyword index given.")
            return None

    def get_parameter(self, engine, path):
        """Return the SECTION_PARAMETER for this InputSection. If none is
        explicitly set, return the default specified by the cp2k version
        specific XML file.

        Args:
            engine: Object providing get_xml_file() and parser.xmlengine.
                Only used when no parameter is explicitly set.
            path: Slash separated path of this section.

        Returns:
            The section parameter, or None if no default is found.
        """
        if self.params is None:
            xpath = self._section_xpath(path)
            xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE"
            return self._xml_default(engine, xpath)
        return self.params

    @staticmethod
    def _section_xpath(path):
        """Form the XPath of a section from a slash separated section path."""
        return "." + "".join(
            "/SECTION[NAME='{}']".format(section) for section in path.split("/")
        )

    @staticmethod
    def _xml_default(engine, xpath):
        """Evaluate xpath against the engine's XML file and return the text
        of the first match, or None if nothing matched."""
        xml_file = engine.get_xml_file()
        xmlengine = engine.parser.xmlengine
        result = xmlengine.parse(xml_file, xpath)
        try:
            return result[0].text
        except (IndexError, TypeError):
            logger.error("No default value found in the XML file for '{}'.".format(xpath))
            return None
#===============================================================================
# class CP2KInputEngine(object):
# """Used to parse out a CP2K input file.
# When given a file handle to a CP2K input file, this class attemts to parse
# out it's structure into an accessible object tree. Because the input file
# has such a clearly defined structure (unlike the output file of CP2K), it
# is better to use a dedicated parser instead of regular expressions.
# """
# def __init__(self, parser):
# """
# Args:
# parser: Instance of a NomadParser or it's subclass. Allows
# access to e.g. unified file reading methods.
# """
# self.parser = parser
# self.root_section = None
# self.xml_file = None
# def parse_input(self):
# """Parses the given CP2K input string. Default any aliases used for
# keywords to the default names.
# """
# # The input file should be quite small, so just get the entire contents
# inp = self.parser.get_file_contents("input")
# root_section = InputSection('CP2K_INPUT')
# section_stack = [root_section]
# for line in inp.split('\n'):
# line = line.split('!', 1)[0].strip()
# if len(line) == 0:
# continue
# if line.upper().startswith('&END'):
# s = section_stack.pop()
# elif line[0] == '&':
# parts = line.split(' ', 1)
# name = parts[0][1:]
# if len(parts) > 1:
# s = InputSection(name=name, params=parts[1].strip())
# else:
# s = InputSection(name=name)
# section_stack[-1].subsections[name.upper()].append(s)
# section_stack.append(s)
# else:
# split = line.split(' ', 1)
# keyword_name = split[0]
# normalized_keyword = self.normalize_keyword(keyword_name)
# keyword_value = split[1]
# section_stack[-1].keywords[normalized_keyword].append(keyword_value)
# self.root_section = root_section
# def get_subsection(self, path, index=0):
# return self.root_section.get_subsection(path, index)
# def get_keyword(self, path, index=0):
# split = path.rsplit('/', 1)
# section_path = split[0]
# normalized_keyword = self.normalize_keyword(path)
# section = self.root_section.get_subsection(section_path, index)
# if section is not None:
# return section.get_keyword(normalized_keyword, section_path, self)
# def get_parameter(self, path, index=0):
# section = self.root_section.get_subsection(path, index)
# if section is not None:
# return section.get_parameter(self, path)
# def setup_version_number(self, version_number):
# xml_file_path = os.path.dirname(__file__) + "/cp2kinputenginedata/xml/cp2k_{}/cp2k_input.xml".format(version_number)
# self.xml_file = open(xml_file_path, 'r')
# def get_xml_file(self):
# """Return the file handle that has been reset to the beginning.
# """
# self.xml_file.seek(os.SEEK_SET)
# return self.xml_file
# def create_section_xpath(self, path):
# """Strip the last part of the path and get the XPath for the remaining
# part.
# """
# # Form a XPath from the given path
# xpath = "."
# splitted_path = path.split("/")
# sections = splitted_path[:-1]
# keyword = splitted_path[-1]
# for section in sections:
# xpath += "/SECTION[NAME='{}']".format(section)
# return xpath, keyword
# def normalize_keyword(self, path):
# """Translate every section and keyword in the input file to the default
# name (=remove aliases).
# """
# xml_file = self.get_xml_file()
# # See if already normalized
# section_xpath, keyword = self.create_section_xpath(path)
# xml_engine = self.parser.xmlengine
# section = xml_engine.parse(xml_file, section_xpath)[0]
# # Find if default
# default_xpath = section_xpath + "/KEYWORD/[NAME='{}'][@type='default']".format(keyword)
# default_name = xml_engine.parse(section, default_xpath)
# if default_name:
# return keyword
# # If alias, find default
# # default_xpath = section_xpath + "/KEYWORD/[NAME='{}'][@type='alias']../KEYWORD/[@type='default']".format(keyword)
# # default_name = xml_engine.parse(section, default_xpath)
# return None #default_name[0].text
# #===============================================================================
# class InputSection(object):
# """Represents a section in a CP2K input file"""
# def __init__(self, name, params=None):
# self.name = name.upper()
# self.params = params
# self.keywords = defaultdict(list)
# self.subsections = defaultdict(list)
# def write(self):
# """Outputs input section as string"""
# output = []
# for name, k_list in self.keywords.iteritems():
# for value in k_list:
# output.append(value)
# for name, s_list in self.subsections.iteritems():
# for s in s_list:
# if s.params:
# output.append('&%s %s' % (s.name, s.params))
# else:
# output.append('&%s' % s.name)
# for l in s.write():
# output.append(' %s' % l)
# output.append('&END %s' % s.name)
# return output
# def get_subsection(self, path, index=0):
# """Finds a subsection specified by a string where subsections are
# separated by a slash. If multiple subsections are found with the same
# path, the one specified by the given index (default 0) is returned.
# Example: get_subsection("FORCE_EVAL/PRINT/FORCES")
# Args:
# path: String indicating the path to the subsection
# index: In case of repeating subsections, return the one specified
# by this index.
# Returns:
# The InputSection object if found.
# """
# parts = path.upper().split('/', 1)
# candidates = self.subsections.get(parts[0]) # [s for s in self.subsections if s.name == parts[0]]
# if not candidates:
# logger.debug("Subsection '{}' not found.".format(parts[0]))
# return None
# elif len(candidates) > 1:
# logger.warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0]))
# try:
# subsection = candidates[index]
# except IndexError:
# logger.error("Invalid subsection index given.")
# if len(parts) == 1:
# return subsection
# return subsection.get_subsection(parts[1])
# def get_keyword(self, keyword, section_path, engine, index=0):
# """Finds a keyword specified by a string. If multiple keywords are
# found with the same name, the one specified by the given index (default
# 0) is returned. If the keyword is not explicitly set, returns the
# default specified by the cp2k version specific XML file.
# Args:
# keyword: String indicating the name of the keyword. The name is the
# first word in the line.
# index: In case of repeating keywords, return the one specified
# by this index.
# Returns:
# The keyword value (everything else than the first word on the line).
# """
# candidates = self.keywords.get(keyword)
# if not candidates:
# logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name))
# # Form a XPath from the given path
# xpath = "."
# sections = section_path.split("/")
# for section in sections:
# xpath += "/SECTION[NAME='{}']".format(section)
# xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword)
# xml_file = engine.get_xml_file()
# xmlengine = engine.parser.xmlengine
# result = xmlengine.parse(xml_file, xpath)
# return result[0].text
# elif len(candidates) > 1:
# logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword))
# try:
# result = candidates[index]
# except IndexError:
# logger.error("Invalid keyword index given.")
# return result