diff --git a/README.md b/README.md index a15680983b599d4c6eb54ad9487cb873e233d289..a9a06fcd4cb6ab9be7ec1a3ee7dc2d72d7bb6907 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,9 @@ the common parser structure when it is available. # Structure Currently the python package is divided into three subpackages: - - Engines: Classes for parsing different type of files - - Generics: Generic utility classes and base classes - - Implementation: The classes that actually define the parser functionality. + - Engines: Classes for parsing different types of files + - Generics: Generic utility classes and base classes + - Implementation: The classes that actually define the parser functionality. # Reusable components and ideas for other parsers @@ -56,13 +56,13 @@ the performance of an engine but if the function calls remain the same no other code has to be changed. Currently implemented engines that could be reused (not tested properly yet): -- RegexEngine: For parsing text files with regular expressions. Uses the re2 - library if available (falls back to default python regex implementation if - re2 not found). -- XYZEngine: For parsing XYZ files and files with similar structure. Has a very - flexible nature as you can specify comments, column delimiters, column - indices and the patterns used to separate different configurations. -- XMLEngine: For parsing XML files using XPath syntax. + - RegexEngine: For parsing text files with regular expressions. Uses the re2 + library if available (falls back to default python regex implementation if + re2 not found). + - XYZEngine: For parsing XYZ files and files with similar structure. Has a very + flexible nature as you can specify comments, column delimiters, column + indices and the patterns used to separate different configurations. + - XMLEngine: For parsing XML files using XPath syntax. 
## NomadParser base class In the generics folder there is a module called nomadparser.py that defines a @@ -74,11 +74,11 @@ the scala code (will be modified later to conform to the common interface). This class is also responsible for some common tasks that are present in all parsers: -- Unit conversion -- JSON encoding -- Caching -- Time measurement for performance analysis -- Providing file contents, sizes and handles + - Unit conversion + - JSON encoding + - Caching + - Time measurement for performance analysis + - Providing file contents, sizes and handles ## Logging Python has a great [logging package](https://www.google.com) which helps in diff --git a/cp2kparser/engines/cp2kinputengine.py b/cp2kparser/engines/cp2kinputengine.py index 8ef6a7406f25da10f4a3f28eb1fd3cff06719416..4b614156bbdf913a46968c2bc7676c2493ba1a6b 100644 --- a/cp2kparser/engines/cp2kinputengine.py +++ b/cp2kparser/engines/cp2kinputengine.py @@ -3,6 +3,7 @@ import os from collections import defaultdict import logging +import cPickle as pickle logger = logging.getLogger(__name__) @@ -11,9 +12,7 @@ class CP2KInputEngine(object): """Used to parse out a CP2K input file. When given a file handle to a CP2K input file, this class attemts to parse - out it's structure into an accessible object tree. Because the input file - has such a clearly defined structure (unlike the output file of CP2K), it - is better to use a dedicated parser instead of regular expressions. + out it's structure into an accessible object tree. """ def __init__(self, parser): """ @@ -23,16 +22,15 @@ class CP2KInputEngine(object): """ self.parser = parser self.root_section = None - self.xml_file = None - - def parse_input(self): - """Parses the given CP2K input string""" + self.input_tree = None + def parse(self): + """Parses the CP2K input file into an object tree. 
+ """ # The input file should be quite small, so just get the entire contents inp = self.parser.get_file_contents("input") - root_section = InputSection('CP2K_INPUT') - section_stack = [root_section] + section_stack = [] for line in inp.split('\n'): line = line.split('!', 1)[0].strip() @@ -40,168 +38,406 @@ class CP2KInputEngine(object): continue if line.upper().startswith('&END'): - s = section_stack.pop() + section_stack.pop() elif line[0] == '&': parts = line.split(' ', 1) name = parts[0][1:] + section_stack.append(name) + + # Form the path + path = "" + for index, item in enumerate(section_stack): + if index != 0: + path += '/' + path += item + # print path + + # Save the section parameters if len(parts) > 1: - s = InputSection(name=name, params=parts[1].strip()) - else: - s = InputSection(name=name) - section_stack[-1].subsections[name.upper()].append(s) - section_stack.append(s) + self.input_tree.set_parameter(path, parts[1].strip()) else: split = line.split(' ', 1) keyword_name = split[0] keyword_value = split[1] - section_stack[-1].keywords[keyword_name].append(keyword_value) - - self.root_section = root_section - - def get_subsection(self, path, index=0): - return self.root_section.get_subsection(path, index) + self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value) - def get_keyword(self, path, index=0): - split = path.rsplit('/', 1) - section_path = split[0] - keyword = split[1] - section = self.root_section.get_subsection(section_path, index) - if section is not None: - return section.get_keyword(keyword, section_path, self) - - def get_parameter(self, path, index=0): - section = self.root_section.get_subsection(path, index) - if section is not None: - return section.get_parameter(self, path) + def get_input_tree(self): + if self.input_tree is not None: + return self.input_tree + else: + logger.error("Input tree not yet created.") def setup_version_number(self, version_number): - xml_file_path = os.path.dirname(__file__) + 
"/cp2kinputenginedata/xml/cp2k_{}/cp2k_input.xml".format(version_number) - self.xml_file = open(xml_file_path, 'r') - - def get_xml_file(self): - """Return the file handle that has been reset to the beginning. - """ - self.xml_file.seek(os.SEEK_SET) - return self.xml_file - + pickle_path = os.path.dirname(__file__) + "/cp2kinputenginedata/cp2k_{}/cp2k_input_tree.pickle".format(version_number) + input_tree_pickle_file = open(pickle_path, 'rb') + self.input_tree = pickle.load(input_tree_pickle_file) #=============================================================================== -class InputSection(object): - """Represents a section in a CP2K input file""" - def __init__(self, name, params=None): - self.name = name.upper() - self.params = params - self.keywords = defaultdict(list) - self.subsections = defaultdict(list) - - def write(self): - """Outputs input section as string""" - output = [] - for name, k_list in self.keywords.iteritems(): - for value in k_list: - output.append(value) - for name, s_list in self.subsections.iteritems(): - for s in s_list: - if s.params: - output.append('&%s %s' % (s.name, s.params)) - else: - output.append('&%s' % s.name) - for l in s.write(): - output.append(' %s' % l) - output.append('&END %s' % s.name) - return output - - def get_subsection(self, path, index=0): - """Finds a subsection specified by a string where subsections are - separated by a slash. If multiple subsections are found with the same - path, the one specified by the given index (default 0) is returned. - - Example: get_subsection("FORCE_EVAL/PRINT/FORCES") - - Args: - path: String indicating the path to the subsection - index: In case of repeating subsections, return the one specified - by this index. 
+# Run main function by default +# if __name__ == "__main__": + # input_file = open("../tests/cp2k_2.6.2/functionals/lda/lda.inp", 'r').read() + # engine = CP2KInputEngine() + # engine.setup_version_number(262) + # engine.parse(input_file) - Returns: - The InputSection object if found. - - """ - parts = path.upper().split('/', 1) - candidates = self.subsections.get(parts[0]) # [s for s in self.subsections if s.name == parts[0]] - if not candidates: - logger.debug("Subsection '{}' not found.".format(parts[0])) - return None - elif len(candidates) > 1: - logger.warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0])) - try: - subsection = candidates[index] - except IndexError: - logger.error("Invalid subsection index given.") - - if len(parts) == 1: - return subsection - return subsection.get_subsection(parts[1]) - - def get_keyword(self, keyword, section_path, engine, index=0): - """Finds a keyword specified by a string. If multiple keywords are - found with the same name, the one specified by the given index (default - 0) is returned. If the keyword is not explicitly set, returns the - default specified by the cp2k version specific XML file. - - Args: - keyword: String indicating the name of the keyword. The name is the - first word in the line. - index: In case of repeating keywords, return the one specified - by this index. - - Returns: - The keyword value (everything else than the first word on the line). - """ - candidates = self.keywords.get(keyword) - if not candidates: - logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name)) - - # Form a XPath from the given path - xpath = "." 
- sections = section_path.split("/") - - for section in sections: - xpath += "/SECTION[NAME='{}']".format(section) - xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword) - - xml_file = engine.get_xml_file() - xmlengine = engine.parser.xmlengine - result = xmlengine.parse(xml_file, xpath) - - return result[0].text - - elif len(candidates) > 1: - logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword)) - try: - result = candidates[index] - except IndexError: - logger.error("Invalid keyword index given.") - return result - - def get_parameter(self, engine, path): - """Return the SECTION_PARAMETER for this InputSection. If none is - explicitly set, return the default specified by the cp2k version - specific XML file. - """ - if self.params is None: - - # Form a XPath from the given path - xpath = "." - sections = path.split("/") - - for section in sections: - xpath += "/SECTION[NAME='{}']".format(section) - xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE" +#=============================================================================== +# class InputSection(object): + # """Represents a section in a CP2K input file""" + # def __init__(self, name, params=None): + # self.name = name.upper() + # self.params = params + # self.keywords = defaultdict(list) + # self.subsections = defaultdict(list) + + # def write(self): + # """Outputs input section as string""" + # output = [] + # for name, k_list in self.keywords.iteritems(): + # for value in k_list: + # output.append(value) + # for name, s_list in self.subsections.iteritems(): + # for s in s_list: + # if s.params: + # output.append('&%s %s' % (s.name, s.params)) + # else: + # output.append('&%s' % s.name) + # for l in s.write(): + # output.append(' %s' % l) + # output.append('&END %s' % s.name) + # return output + + # def get_subsection(self, path, index=0): + # """Finds a subsection specified by a string where subsections are 
+ # separated by a slash. If multiple subsections are found with the same + # path, the one specified by the given index (default 0) is returned. + + # Example: get_subsection("FORCE_EVAL/PRINT/FORCES") + + # Args: + # path: String indicating the path to the subsection + # index: In case of repeating subsections, return the one specified + # by this index. + + # Returns: + # The InputSection object if found. + + # """ + # parts = path.upper().split('/', 1) + # candidates = self.subsections.get(parts[0]) # [s for s in self.subsections if s.name == parts[0]] + # if not candidates: + # logger.debug("Subsection '{}' not found.".format(parts[0])) + # return None + # elif len(candidates) > 1: + # logger.warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0])) + # try: + # subsection = candidates[index] + # except IndexError: + # logger.error("Invalid subsection index given.") + + # if len(parts) == 1: + # return subsection + # return subsection.get_subsection(parts[1]) + + # def get_keyword(self, keyword, section_path, engine, index=0): + # """Finds a keyword specified by a string. If multiple keywords are + # found with the same name, the one specified by the given index (default + # 0) is returned. If the keyword is not explicitly set, returns the + # default specified by the cp2k version specific XML file. + + # Args: + # keyword: String indicating the name of the keyword. The name is the + # first word in the line. + # index: In case of repeating keywords, return the one specified + # by this index. + + # Returns: + # The keyword value (everything else than the first word on the line). + # """ + # candidates = self.keywords.get(keyword) + # if not candidates: + # logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name)) + + # # Form a XPath from the given path + # xpath = "." 
+ # sections = section_path.split("/") + + # for section in sections: + # xpath += "/SECTION[NAME='{}']".format(section) + # xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword) + + # xml_file = engine.get_xml_file() + # xmlengine = engine.parser.xmlengine + # result = xmlengine.parse(xml_file, xpath) + + # return result[0].text + + # elif len(candidates) > 1: + # logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword)) + # try: + # result = candidates[index] + # except IndexError: + # logger.error("Invalid keyword index given.") + # return result + + # def get_parameter(self, engine, path): + # """Return the SECTION_PARAMETER for this InputSection. If none is + # explicitly set, return the default specified by the cp2k version + # specific XML file. + # """ + # if self.params is None: + + # # Form a XPath from the given path + # xpath = "." + # sections = path.split("/") + + # for section in sections: + # xpath += "/SECTION[NAME='{}']".format(section) + # xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE" + + # xml_file = engine.get_xml_file() + # xmlengine = engine.parser.xmlengine + # result = xmlengine.parse(xml_file, xpath) + # return result[0].text + + # return self.params - xml_file = engine.get_xml_file() - xmlengine = engine.parser.xmlengine - result = xmlengine.parse(xml_file, xpath) - return result[0].text - return self.params +#=============================================================================== +# class CP2KInputEngine(object): + # """Used to parse out a CP2K input file. + + # When given a file handle to a CP2K input file, this class attemts to parse + # out it's structure into an accessible object tree. Because the input file + # has such a clearly defined structure (unlike the output file of CP2K), it + # is better to use a dedicated parser instead of regular expressions. 
+ # """ + # def __init__(self, parser): + # """ + # Args: + # parser: Instance of a NomadParser or it's subclass. Allows + # access to e.g. unified file reading methods. + # """ + # self.parser = parser + # self.root_section = None + # self.xml_file = None + + # def parse_input(self): + # """Parses the given CP2K input string. Default any aliases used for + # keywords to the default names. + # """ + + # # The input file should be quite small, so just get the entire contents + # inp = self.parser.get_file_contents("input") + + # root_section = InputSection('CP2K_INPUT') + # section_stack = [root_section] + + # for line in inp.split('\n'): + # line = line.split('!', 1)[0].strip() + # if len(line) == 0: + # continue + + # if line.upper().startswith('&END'): + # s = section_stack.pop() + # elif line[0] == '&': + # parts = line.split(' ', 1) + # name = parts[0][1:] + # if len(parts) > 1: + # s = InputSection(name=name, params=parts[1].strip()) + # else: + # s = InputSection(name=name) + # section_stack[-1].subsections[name.upper()].append(s) + # section_stack.append(s) + # else: + # split = line.split(' ', 1) + # keyword_name = split[0] + # normalized_keyword = self.normalize_keyword(keyword_name) + # keyword_value = split[1] + # section_stack[-1].keywords[normalized_keyword].append(keyword_value) + + # self.root_section = root_section + + # def get_subsection(self, path, index=0): + # return self.root_section.get_subsection(path, index) + + # def get_keyword(self, path, index=0): + # split = path.rsplit('/', 1) + # section_path = split[0] + # normalized_keyword = self.normalize_keyword(path) + # section = self.root_section.get_subsection(section_path, index) + # if section is not None: + # return section.get_keyword(normalized_keyword, section_path, self) + + # def get_parameter(self, path, index=0): + # section = self.root_section.get_subsection(path, index) + # if section is not None: + # return section.get_parameter(self, path) + + # def setup_version_number(self, 
version_number): + # xml_file_path = os.path.dirname(__file__) + "/cp2kinputenginedata/xml/cp2k_{}/cp2k_input.xml".format(version_number) + # self.xml_file = open(xml_file_path, 'r') + + # def get_xml_file(self): + # """Return the file handle that has been reset to the beginning. + # """ + # self.xml_file.seek(os.SEEK_SET) + # return self.xml_file + + # def create_section_xpath(self, path): + # """Strip the last part of the path and get the xpart for the remaining + # part. + # """ + # # Form a XPath from the given path + # xpath = "." + # splitted_path = path.split("/") + # sections = splitted_path[:-1] + # keyword = splitted_path[-1] + + # for section in sections: + # xpath += "/SECTION[NAME='{}']".format(section) + + # return xpath, keyword + + # def normalize_keyword(self, path): + # """Translate every section and keyword in the input file to the default + # name (=remove aliases). + # """ + # xml_file = self.get_xml_file() + + # # See if already normalized + # section_xpath, keyword = self.create_section_xpath(path) + # xml_engine = self.parser.xmlengine + # section = xml_engine.parse(xml_file, section_xpath)[0] + + # # Find if default + # default_xpath = section_xpath + "/KEYWORD/[NAME='{}'][@type='default']".format(keyword) + # default_name = xml_engine.parse(section, default_xpath) + # if default_name: + # return keyword + + # # If alias, find default + # # default_xpath = section_xpath + "/KEYWORD/[NAME='{}'][@type='alias']../KEYWORD/[@type='default']".format(keyword) + # # default_name = xml_engine.parse(section, default_xpath) + # return None #default_name[0].text + + +# #=============================================================================== +# class InputSection(object): + # """Represents a section in a CP2K input file""" + # def __init__(self, name, params=None): + # self.name = name.upper() + # self.params = params + # self.keywords = defaultdict(list) + # self.subsections = defaultdict(list) + + # def write(self): + # """Outputs input 
section as string""" + # output = [] + # for name, k_list in self.keywords.iteritems(): + # for value in k_list: + # output.append(value) + # for name, s_list in self.subsections.iteritems(): + # for s in s_list: + # if s.params: + # output.append('&%s %s' % (s.name, s.params)) + # else: + # output.append('&%s' % s.name) + # for l in s.write(): + # output.append(' %s' % l) + # output.append('&END %s' % s.name) + # return output + + # def get_subsection(self, path, index=0): + # """Finds a subsection specified by a string where subsections are + # separated by a slash. If multiple subsections are found with the same + # path, the one specified by the given index (default 0) is returned. + + # Example: get_subsection("FORCE_EVAL/PRINT/FORCES") + + # Args: + # path: String indicating the path to the subsection + # index: In case of repeating subsections, return the one specified + # by this index. + + # Returns: + # The InputSection object if found. + + # """ + # parts = path.upper().split('/', 1) + # candidates = self.subsections.get(parts[0]) # [s for s in self.subsections if s.name == parts[0]] + # if not candidates: + # logger.debug("Subsection '{}' not found.".format(parts[0])) + # return None + # elif len(candidates) > 1: + # logger.warning("Multiple subsections with the same name found with name '{}' If no index is given, the first occurence in the input file is returned.".format(parts[0])) + # try: + # subsection = candidates[index] + # except IndexError: + # logger.error("Invalid subsection index given.") + + # if len(parts) == 1: + # return subsection + # return subsection.get_subsection(parts[1]) + + # def get_keyword(self, keyword, section_path, engine, index=0): + # """Finds a keyword specified by a string. If multiple keywords are + # found with the same name, the one specified by the given index (default + # 0) is returned. If the keyword is not explicitly set, returns the + # default specified by the cp2k version specific XML file. 
+ + # Args: + # keyword: String indicating the name of the keyword. The name is the + # first word in the line. + # index: In case of repeating keywords, return the one specified + # by this index. + + # Returns: + # The keyword value (everything else than the first word on the line). + # """ + # candidates = self.keywords.get(keyword) + # if not candidates: + # logger.debug("No keywords with name '{}' found in subsection '{}'. Using the default XML value.".format(keyword, self.name)) + + # # Form a XPath from the given path + # xpath = "." + # sections = section_path.split("/") + + # for section in sections: + # xpath += "/SECTION[NAME='{}']".format(section) + # xpath += "/KEYWORD[NAME='{}']/DEFAULT_VALUE".format(keyword) + + # xml_file = engine.get_xml_file() + # xmlengine = engine.parser.xmlengine + # result = xmlengine.parse(xml_file, xpath) + + # return result[0].text + + # elif len(candidates) > 1: + # logger.warning("Multiple keywords found with name '{}'. If no index is given, the first occurence in the input file is returned.".format(keyword)) + # try: + # result = candidates[index] + # except IndexError: + # logger.error("Invalid keyword index given.") + # return result + + # def get_parameter(self, engine, path): + # """Return the SECTION_PARAMETER for this InputSection. If none is + # explicitly set, return the default specified by the cp2k version + # specific XML file. + # """ + # if self.params is None: + + # # Form a XPath from the given path + # xpath = "." 
+ # sections = path.split("/") + + # for section in sections: + # xpath += "/SECTION[NAME='{}']".format(section) + # xpath += "/SECTION_PARAMETERS/LONE_KEYWORD_VALUE" + + # xml_file = engine.get_xml_file() + # xmlengine = engine.parser.xmlengine + # result = xmlengine.parse(xml_file, xpath) + # return result[0].text + + # return self.params diff --git a/cp2kparser/engines/cp2kinputenginedata/__init__.py b/cp2kparser/engines/cp2kinputenginedata/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3ecd22c807354b4cea36b827e76c0285c9e401 --- /dev/null +++ b/cp2kparser/engines/cp2kinputenginedata/__init__.py @@ -0,0 +1 @@ +#! /usr/bin/env python diff --git a/cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/cp2k_input.xml b/cp2kparser/engines/cp2kinputenginedata/cp2k_262/cp2k_input.xml similarity index 100% rename from cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/cp2k_input.xml rename to cp2kparser/engines/cp2kinputenginedata/cp2k_262/cp2k_input.xml diff --git a/cp2kparser/engines/cp2kinputenginedata/cp2k_262/cp2k_input_tree.pickle b/cp2kparser/engines/cp2kinputenginedata/cp2k_262/cp2k_input_tree.pickle new file mode 100644 index 0000000000000000000000000000000000000000..2cb7ab60b42319498e6a188011de9d78d3d1699d Binary files /dev/null and b/cp2kparser/engines/cp2kinputenginedata/cp2k_262/cp2k_input_tree.pickle differ diff --git a/cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/references.html b/cp2kparser/engines/cp2kinputenginedata/cp2k_262/references.html similarity index 100% rename from cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/references.html rename to cp2kparser/engines/cp2kinputenginedata/cp2k_262/references.html diff --git a/cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/units.html b/cp2kparser/engines/cp2kinputenginedata/cp2k_262/units.html similarity index 100% rename from cp2kparser/engines/cp2kinputenginedata/xml/cp2k_262/units.html rename to cp2kparser/engines/cp2kinputenginedata/cp2k_262/units.html 
diff --git a/cp2kparser/engines/cp2kinputenginedata/input_tree.py b/cp2kparser/engines/cp2kinputenginedata/input_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..9094df8a1dff1e0477be8a1bdb69e99350d39e5c --- /dev/null +++ b/cp2kparser/engines/cp2kinputenginedata/input_tree.py @@ -0,0 +1,122 @@ +"""The classes which make up the CP2K input tree. + +These are defined in their own module, instead of the xmlpreparser module, +because the pickling of these classes is wrong if they are defined in the same +file which is run in console (module will be then __main__). +""" +from collections import defaultdict + + +#=============================================================================== +class Keyword(object): + """Information about a keyword in a CP2K calculation. + """ + + def __init__(self, default_name, default_value): + self.value = None + self.default_name = default_name + self.default_value = default_value + + +#=============================================================================== +class Section(object): + """An input section in a CP2K calculation. + """ + + def __init__(self, name): + self.name = name + self.keywords = defaultdict(list) + self.default_keyword = "" + self.parameter = None + self.sections = defaultdict(list) + + def get_section(self, path): + split_path = path.split("/") + section = self + for part in split_path: + section = section.sections.get(part) + if section: + if len(section) == 1: + section = section[0] + else: + # print "The subsection '{}' is repeated. 
Not yet supported.".format(path) + return None + else: + # print "Subsection '{}' does not exist in section '{}'".format(path, self.name) + return None + return section + + def get_keyword_object(self, path): + split_path = path.rsplit("/", 1) + keyword = split_path[1] + section_path = split_path[0] + section = self.get_section(section_path) + keyword = section.keywords.get(keyword) + if keyword: + if len(keyword) == 1: + return keyword[0] + # print "The keyword in '{}' does not exist or has too many entries.".format(path) + return None + + def get_keyword(self, path): + keyword = self.get_keyword_object(path) + if keyword: + return keyword.value + + def get_default_keyword(self, path): + return self.get_section(path) + + def set_keyword(self, path, value): + keyword = self.get_keyword_object(path) + if keyword: + keyword.value = value + else: + # print "Saving default keyword at path '{}'".format(path) + split_path = path.rsplit("/", 1) + keyword = split_path[1] + section_path = split_path[0] + section = self.get_section(section_path) + section.default_keyword += '\n' + keyword + + def get_keyword_default(self, path): + keyword = self.get_keyword_object(path) + if keyword: + return keyword.default_value + + def get_parameter_object(self, path): + section = self.get_section(path) + parameter = section.parameter + if parameter: + return parameter + else: + print "The section parameters object '{}' could not be found.".format(path) + + def get_parameter(self, path): + parameter = self.get_parameter_object(path) + return parameter.value + + def set_parameter(self, path, value): + parameter = self.get_parameter_object(path) + parameter.value = value + + def get_parameter_lone(self, path): + parameter = self.get_parameter_object(path) + return parameter.lone_value + + def get_parameter_default(self, path): + parameter = self.get_parameter_object(path) + return parameter.default_value + + +#=============================================================================== 
+class SectionParameters(object): + """Section parameters in a CP2K calculation. + + Section parameters are the short values that can be added right after a + section name, e.g. &PRINT ON, where ON is the section parameter. + """ + + def __init__(self, default_value, lone_value): + self.value = None + self.default_value = default_value + self.lone_value = lone_value diff --git a/cp2kparser/engines/cp2kinputenginedata/xmlpreparser.py b/cp2kparser/engines/cp2kinputenginedata/xmlpreparser.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9dadfddcd77b870874e4036e4deb9973cda0d0 --- /dev/null +++ b/cp2kparser/engines/cp2kinputenginedata/xmlpreparser.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +"""Provides functions for creating a python object representing a CP2K input +structure. + +Creates preparsed versions of the cp2k_input.xmls and pickles them (python +version of serialization). The pickle files can then be easily reused without +doing the xml parsing again. + +The actual calculation input contents can later be added to this object. Then +the object can be queried for the results, or the default values defined by the +cp2k_input.xml. 
+""" + +import xml.etree.cElementTree as ET +import logging +import cPickle as pickle +from cp2kparser.engines.cp2kinputenginedata.input_tree import * +logger = logging + + +#=============================================================================== +def generate_object_tree(xml_file): + + xml_element = ET.parse(xml_file) + object_tree = recursive_tree_generation(xml_element) + return object_tree + + +#=============================================================================== +def recursive_tree_generation(xml_element): + + # Make new section object for the root + section_name_element = xml_element.find("NAME") + if section_name_element is not None: + section_name = section_name_element.text + else: + section_name = "CP2K_INPUT" + section = Section(section_name) + + # Section parameters + parameter = xml_element.find("SECTION_PARAMETERS") + if parameter: + sp_default_element = parameter.find("DEFAULT_VALUE") + sp_default_value = None + if sp_default_element is not None: + sp_default_value = sp_default_element.text + sp_lone_element = parameter.find("LONE_KEYWORD_VALUE") + sp_lone_value = None + if sp_lone_element is not None: + sp_lone_value = sp_lone_element.text + parameter_object = SectionParameters(sp_default_value, sp_lone_value) + section.parameter = parameter_object + + # Keywords + for keyword in xml_element.findall("KEYWORD"): + keyword_names = keyword.findall("NAME") + default_name = None + aliases = [] + for name in keyword_names: + keytype = name.get("type") + if keytype == "default": + default_name = name.text + else: + aliases.append(name.text) + default_keyword_element = keyword.find("DEFAULT_VALUE") + default_keyword_value = None + if default_keyword_element is not None: + default_keyword_value = default_keyword_element.text + keyword_object = Keyword(default_name, default_keyword_value) + section.keywords[default_name].append(keyword_object) + for alias in aliases: + section.keywords[alias].append(keyword_object) + + # Sections + for 
sub_section_element in xml_element.findall("SECTION"): + sub_section = recursive_tree_generation(sub_section_element) + section.sections[sub_section.name].append(sub_section) + + # Return section + return section + +#=============================================================================== +# Run main function by default +if __name__ == "__main__": + xml_file = open("./cp2k_262/cp2k_input.xml", 'r') + object_tree = generate_object_tree(xml_file) + file_name = "./cp2k_262/cp2k_input_tree.pickle" + fh = open(file_name, "wb") + pickle.dump(object_tree, fh, protocol=2) diff --git a/cp2kparser/engines/xmlengine.py b/cp2kparser/engines/xmlengine.py index 5cdde172a8c58b7ed07aefe5ef6db8eeae02dcc5..f07f16a8ce53e12c8f9194667d89a138b6131891 100644 --- a/cp2kparser/engines/xmlengine.py +++ b/cp2kparser/engines/xmlengine.py @@ -6,7 +6,6 @@ ElemenTree API such as lxml. """ import xml.etree.cElementTree as ET -import sys #=============================================================================== @@ -27,8 +26,11 @@ class XMLEngine(object): # handle if isinstance(contents, (str, unicode)): tree = ET.fromstring(contents) - else: + elif isinstance(contents, file): tree = ET.parse(contents) + else: + tree = contents + return tree.findall(XPath) # Get the path return tree.getroot().findall(XPath) diff --git a/cp2kparser/implementation/parser.py b/cp2kparser/implementation/parser.py index cf8b68b0188761db88415ccd2ee015739b1bf1d4..4c723d3891b4929e8fdf0bc428b4b255918dd85a 100644 --- a/cp2kparser/implementation/parser.py +++ b/cp2kparser/implementation/parser.py @@ -29,11 +29,12 @@ class CP2KParser(NomadParser): self.version_number = None # Engines are created here - self.inputengine = CP2KInputEngine(self) self.xyzengine = XYZEngine(self) self.regexengine = RegexEngine(self) self.xmlengine = XMLEngine(self) + self.inputengine = CP2KInputEngine(self) + self.input_tree = None self.regexs = None self.analyse_input_json() self.check_resolved_file_ids() @@ -50,6 +51,8 @@ class 
CP2KParser(NomadParser): version_regex = re.compile(r"CP2K\|\ version\ string:\s+CP2K\ version\ (\d+\.\d+\.\d+)\n") self.version_number = version_regex.search(beginning).groups()[0].replace('.', '') self.inputengine.setup_version_number(self.version_number) + self.inputengine.parse() + self.input_tree = self.inputengine.get_input_tree() version_name = '_' + self.version_number + '_' # Search for a version specific regex class @@ -112,9 +115,8 @@ class CP2KParser(NomadParser): """Inherited from NomadParser. """ # Check from input what the other files are called - self.inputengine.parse_input() - force_path = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") - project_name = self.inputengine.get_keyword("GLOBAL/PROJECT_NAME") + force_path = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") + project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME") if force_path is not None and force_path != "__STD_OUT__": # The force path is not typically exactly as written in input @@ -181,8 +183,8 @@ class CP2KImplementation(object): self.parser = parser self.regexs = parser.regexs self.regexengine = parser.regexengine - self.inputengine = parser.inputengine self.xyzengine = parser.xyzengine + self.input_tree = parser.input_tree def _Q_energy_total(self): """Return the total energy from the bottom of the input file""" @@ -200,13 +202,13 @@ class CP2KImplementation(object): """ # First try to look at the shortcut - xc_shortcut = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL") + xc_shortcut = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL") if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT": logger.debug("Shortcut defined for XC_FUNCTIONAL") # If PBE, check version if xc_shortcut == "PBE": - pbe_version = self.inputengine.get_subsection("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE").get_keyword("PARAMETRIZATION") + pbe_version = 
self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE/PARAMETRIZATION") return { 'ORIG': "GGA_X_PBE", 'PBESOL': "GGA_X_PBE_SOL", @@ -232,14 +234,14 @@ class CP2KImplementation(object): # Becke88 xc_components = [] - becke_88 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88") + becke_88 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88") if becke_88 == "TRUE": xc_components.append("GGA_X_B88") # Becke 97 - becke_97 = self.inputengine.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97") + becke_97 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97") if becke_97 == "TRUE": - becke_97_param = self.inputengine.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION") + becke_97_param = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION") becke_97_result = { 'B97GRIMME': None, 'B97_GRIMME': None, @@ -261,7 +263,7 @@ class CP2KImplementation(object): # Determine if a separate force file is used or are the forces printed # in the output file. 
separate_file = True - filename = self.inputengine.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") + filename = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME") if not filename or filename == "__STD_OUT__": separate_file = False diff --git a/cp2kparser/tests/cp2k_2.6.2/profile_file b/cp2kparser/tests/cp2k_2.6.2/profile_file index 84a3c0fcfa2d10523b1095e643f7bde50753d62f..18c8ee61bcea906c0a89d25672e7263f68489449 100644 Binary files a/cp2kparser/tests/cp2k_2.6.2/profile_file and b/cp2kparser/tests/cp2k_2.6.2/profile_file differ diff --git a/cp2kparser/tests/cp2k_2.6.2/run_tests.py b/cp2kparser/tests/cp2k_2.6.2/run_tests.py index 7b37471814ae40812cd4f5892f089de05b75b74e..edb0c43a74919269082d1c04d08a47113f7002f5 100644 --- a/cp2kparser/tests/cp2k_2.6.2/run_tests.py +++ b/cp2kparser/tests/cp2k_2.6.2/run_tests.py @@ -2,6 +2,7 @@ import unittest import os import logging from cp2kparser.implementation.autoparser import get_parser +from cp2kparser.engines.cp2kinputenginedata.xmlpreparser import * import cProfile import pstats @@ -92,11 +93,11 @@ class TestForces(unittest.TestCase): if __name__ == '__main__': logger = logging.getLogger("cp2kparser") logger.setLevel(logging.ERROR) - # unittest.main() - suite = unittest.TestLoader().loadTestsFromTestCase(TestForces) - - def runtests(): - unittest.TextTestRunner().run(suite) - s = cProfile.run("runtests()", sort="cumtime", filename="profile_file") - - # unittest.TextTestRunner(verbosity=0).run(suite) + unittest.main() + # suite = unittest.TestLoader().loadTestsFromTestCase(TestForces) + + # def runtests(): + # unittest.main() + # unittest.TextTestRunner().run(suite) + # unittest.TextTestRunner(verbosity=0).run(suite) + # s = cProfile.run("runtests()", sort="cumtime", filename="profile_file")