diff --git a/README.md b/README.md index 4e2a0ce5ad3c3ee226a20e7896221e4f31546a9d..a6981dc0e11249a86a514d360f94021990ed22a5 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ # Structure Currently the python package is divided the following subpackages: - utils: Generic utility classes and base classes -- implementation: The classes that actually define the parser functionality. +- parsing: The classes that actually define the parser functionality. # Tools and Methods diff --git a/parser/parser-cp2k/cp2kparser/parsing/parser.py b/parser/parser-cp2k/cp2kparser/parsing/parser.py index 36fca1009cd3f080f6356ecc9a664f8b4a972849..78aa0a1c475f0c0ce4cb8ed59a9f7e6a3280eb64 100644 --- a/parser/parser-cp2k/cp2kparser/parsing/parser.py +++ b/parser/parser-cp2k/cp2kparser/parsing/parser.py @@ -1,16 +1,13 @@ import re import logging from cp2kparser.utils.baseclasses import Parser -from cp2kparser.parsing.implementations import * +from cp2kparser.parsing.versions.versionsetup import get_implementation_class logger = logging.getLogger(__name__) #=============================================================================== class CP2KParser(Parser): - """Builds the correct parser by looking at the given files and the given - input. - - This class handles the initial setup before any parsing can happen. It + """This class handles the initial setup before any parsing can happen. It determines which version of CP2K was used to generate the output and then sets up a correct implementation. @@ -18,25 +15,30 @@ class CP2KParser(Parser): parse(). """ - def __init__(self, contents=None, metainfo_to_keep=None, backend=None): - Parser.__init__(self, contents, metainfo_to_keep, backend) + def __init__(self, contents=None, metainfo_to_keep=None, backend=None, main_file=None): + Parser.__init__(self, contents, metainfo_to_keep, backend, main_file) def setup(self): """Setups the version by looking at the output file and the version specified in it. 
""" - # Search for the output file - count = 0 - for filepath in self.parser_context.files: - if filepath.endswith(".out"): - count += 1 - outputpath = filepath - if count > 1: - logger("Could not determine the correct outputfile because multiple files with extension '.out' were found.") - return - elif count == 0: - logger.error("No output file could be found. The outputfile should have a '.out' extension.") - return + + # If a main file is provided, search it for a version number. + if self.parser_context.main_file is not None: + outputpath = self.parser_context.main_file + else: + # Search for the output file + count = 0 + for filepath in self.parser_context.files: + if filepath.endswith(".out"): + count += 1 + outputpath = filepath + if count > 1: + logger("Could not determine the correct outputfile because multiple files with extension '.out' were found.") + return + elif count == 0: + logger.error("No output file could be found. The outputfile should have a '.out' extension.") + return # Search for the version specification outputfile = open(outputpath, 'r') @@ -48,15 +50,7 @@ class CP2KParser(Parser): break # Search and initialize a version specific implementation - class_name = "CP2KImplementation{}".format(self.parser_context.version_id) - class_object = globals().get(class_name) - if class_object: - logger.debug("Using version specific implementation '{}'.".format(class_name)) - self.implementation = class_object(self.parser_context) - else: - logger.debug("No version specific implementation found. 
Using the default implementation: {}".format(class_name)) - self.parser_context.version_id = "262" - self.implementation = globals()["CP2KImplementation262"](self.parser_context) + self.implementation = get_implementation_class(self.parser_context.version_id)(self.parser_context) def search_parseable_files(self, files): """Searches the given path for files that are of interest to this diff --git a/parser/parser-cp2k/cp2kparser/parsing/versions/__init__.py b/parser/parser-cp2k/cp2kparser/parsing/versions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/__init__.py b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/parser/parser-cp2k/cp2kparser/parsing/implementations.py b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/implementation.py similarity index 96% rename from parser/parser-cp2k/cp2kparser/parsing/implementations.py rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/implementation.py index 2cae122d9bc7e497014f47bd52f8459e02019b89..a82a6a105371d43c551090a1515709482afa06db 100644 --- a/parser/parser-cp2k/cp2kparser/parsing/implementations.py +++ b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/implementation.py @@ -2,16 +2,16 @@ import re import os import logging from cp2kparser.parsing.csvparsing import CSVParser -from cp2kparser.parsing.inputparsing import CP2KInputParser +from .inputparsing import CP2KInputParser +from .outputparser import CP2KOutputParser from cp2kparser.parsing.cp2kinputenginedata.input_tree import CP2KInput -from cp2kparser.parsing.outputparsing import * from cp2kparser.utils.baseclasses import ParserImplementation from nomadcore.coordinate_reader import CoordinateReader logger = logging.getLogger(__name__) 
#=============================================================================== -class CP2KImplementation262(ParserImplementation): +class CP2KImplementation(ParserImplementation): """The default implementation for a CP2K parser based on version 2.6.2. """ def __init__(self, parser_context): @@ -25,7 +25,6 @@ class CP2KImplementation262(ParserImplementation): self.atomsengine = CoordinateReader() self.inputparser = CP2KInputParser() self.inputparser.setup_version(self.version_id) - self.outputparser = None #globals()["CP2KOutputParser{}".format(self.version_id)](file_path, self.parser_context) self.input_tree = None self.extended_input = None @@ -37,14 +36,15 @@ class CP2KImplementation262(ParserImplementation): """Resolve the input and output files based on extension and the include files by looking for @INCLUDE commands in the input file. """ + # Input and output files for file_path in self.files: if file_path.endswith(".inp"): self.setup_file_id(file_path, "input") if file_path.endswith(".out"): self.setup_file_id(file_path, "output") - self.outputparser = globals()["CP2KOutputParser{}".format(self.version_id)](file_path, self.parser_context) - self.file_parsers.append(self.outputparser) + outputparser = CP2KOutputParser(file_path, self.parser_context) + self.file_parsers.append(outputparser) # Include files input_file = self.get_file_contents("input") @@ -58,6 +58,22 @@ class CP2KImplementation262(ParserImplementation): filepath = self.search_file(filename) self.setup_file_id(filepath, "include") + # def determine_output_file(self): + # """Determine which of the given files is the output file. + # """ + # # If a main file has been specified it is the output file. 
+ # if self.parser_context.main_file is not None: + # self.setup_file_id(file_path, "output") + # # Otherwise try to determine by the file extension + # else: + # n_outfiles = 0 + # for file_path in self.files: + # if file_path.endswith(".out"): + # n_outfiles += 1 + # self.setup_file_id(file_path, "output") + # self.outputparser = globals()["CP2KOutputParser{}".format(self.version_id)](file_path, self.parser_context) + # self.file_parsers.append(self.outputparser) + def input_preprocessor(self): """Preprocess the input file. Concatenate .inc files into the main input file and explicitly state all variables. diff --git a/parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/cp2k_input.xml b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/cp2k_input.xml similarity index 100% rename from parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/cp2k_input.xml rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/cp2k_input.xml diff --git a/parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/cp2k_input_tree.pickle b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/cp2k_input_tree.pickle similarity index 100% rename from parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/cp2k_input_tree.pickle rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/cp2k_input_tree.pickle diff --git a/parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/references.html b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/references.html similarity index 100% rename from parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/references.html rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/references.html diff --git a/parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/units.html b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/units.html similarity index 100% rename 
from parser/parser-cp2k/cp2kparser/parsing/cp2kinputenginedata/cp2k_262/units.html rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/input_xml/units.html diff --git a/parser/parser-cp2k/cp2kparser/parsing/inputparsing.py b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/inputparsing.py similarity index 96% rename from parser/parser-cp2k/cp2kparser/parsing/inputparsing.py rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/inputparsing.py index d1a6d884d89db073203358de3a6cf5d9ddd0e96c..fb3fc12e0c3a44fbf7a325190990f2251b520eb9 100644 --- a/parser/parser-cp2k/cp2kparser/parsing/inputparsing.py +++ b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/inputparsing.py @@ -84,6 +84,6 @@ class CP2KInputParser(object): cp2k_input.xml is version specific. By calling this function before parsing the correct file can be found. """ - pickle_path = os.path.dirname(__file__) + "/cp2kinputenginedata/cp2k_{}/cp2k_input_tree.pickle".format(version_number) + pickle_path = os.path.dirname(__file__) + "/input_xml/cp2k_input_tree.pickle".format(version_number) input_tree_pickle_file = open(pickle_path, 'rb') self.input_tree = pickle.load(input_tree_pickle_file) diff --git a/parser/parser-cp2k/cp2kparser/parsing/outputparsing.py b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/outputparser.py similarity index 99% rename from parser/parser-cp2k/cp2kparser/parsing/outputparsing.py rename to parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/outputparser.py index 3ae46372023a643a34aee72b7e17e819ffb5f279..ac95060850c1b6281ce29794e6cb5bcc7ffe76a3 100644 --- a/parser/parser-cp2k/cp2kparser/parsing/outputparsing.py +++ b/parser/parser-cp2k/cp2kparser/parsing/versions/cp2k262/outputparser.py @@ -6,7 +6,7 @@ import numpy as np #=============================================================================== -class CP2KOutputParser262(FileParser): +class CP2KOutputParser(FileParser): """The object that goes through the CP2K output file and parses 
everything it can using the SimpleParser architecture. """ diff --git a/parser/parser-cp2k/cp2kparser/parsing/versions/versionsetup.py b/parser/parser-cp2k/cp2kparser/parsing/versions/versionsetup.py new file mode 100644 index 0000000000000000000000000000000000000000..6e4697992eda1f9683c936f0cb3d10494ca9969d --- /dev/null +++ b/parser/parser-cp2k/cp2kparser/parsing/versions/versionsetup.py @@ -0,0 +1,14 @@ + +"""Returns the implementation classes based on the given version identifier. +The different versions are grouped into subpackages. +""" +import importlib + + +def get_implementation_class(version_id): + + # Currently the version id is a pure integer, so it can directly be mapped + # into a package name. + base = "cp2kparser.parsing.versions.cp2k{}.".format(version_id) + implementation = importlib.import_module(base + "implementation").CP2KImplementation + return implementation diff --git a/parser/parser-cp2k/cp2kparser/scalainterface.py b/parser/parser-cp2k/cp2kparser/scalainterface.py index 333ebaf5f10e2a87ac9c5303a3b0ba5fe1398a66..a713ec1c3cc9617ca0f74bb46121f98cd5f5c7f1 100644 --- a/parser/parser-cp2k/cp2kparser/scalainterface.py +++ b/parser/parser-cp2k/cp2kparser/scalainterface.py @@ -3,7 +3,7 @@ This is the access point to the parser for the scala layer in the nomad project. 
""" import os from cp2kparser import CP2KParser -from cp2kparser.parsing.outputparsing import CP2KOutputParser262 +from cp2kparser.parsing.versions.cp2k262.outputparser import CP2KOutputParser from nomadcore.local_meta_info import loadJsonFile, InfoKindEl from nomadcore.simple_parser import mainFunction @@ -16,7 +16,7 @@ if __name__ == "__main__": cp2kparser = CP2KParser() # Get the outputparser class - outputparser = globals()["CP2KOutputParser262"](None, None) + outputparser = CP2KOutputParser(None, None) # Setup the metainfos metaInfoPath = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../../nomad-meta-info/meta_info/nomad_meta_info/{}".format(cp2kparser.get_metainfo_filename()))) diff --git a/parser/parser-cp2k/cp2kparser/utils/baseclasses.py b/parser/parser-cp2k/cp2kparser/utils/baseclasses.py index cfc9a7317ee2a6928c1d87203830293eaef99bcd..baf69bac1ad76e7b38e794058cb34c9c1ae69907 100644 --- a/parser/parser-cp2k/cp2kparser/utils/baseclasses.py +++ b/parser/parser-cp2k/cp2kparser/utils/baseclasses.py @@ -1,9 +1,8 @@ import os import sys import logging -import StringIO from abc import ABCMeta, abstractmethod -from nomadcore.simple_parser import SimpleParserBuilder, defaultParseFile, extractOnCloseTriggers, PushbackLineFile +from nomadcore.simple_parser import SimpleParserBuilder, extractOnCloseTriggers, PushbackLineFile from nomadcore.caching_backend import CachingLevel, ActiveBackend logger = logging.getLogger(__name__) @@ -20,42 +19,54 @@ class Parser(object): setup by this class based on the given contents. parser_context: A wrapper class for all the parser related information. This is contructed here and then passed onto the different - implementations. - backend: An object to which the parser will give all the parsed data. - The backend will then determine where and when to output that data. + implementations and FileParsers. 
""" __metaclass__ = ABCMeta - def __init__(self, contents, metainfo_to_keep=None, backend=None): + def __init__(self, contents, metainfo_to_keep=None, backend=None, main_file=None): """ - Args: - contents: list of absolute filepaths as strings - metainfo_to_keep: list of metainfo names to parse as strings. - backend: the backend where the parsing results are outputted + Args: + contents: The contents to parse as a list of file and directory paths. + The given directory paths will be searched recursively for interesting + files. + metainfo_to_keep: A list of metainfo names. This list is used to + optimize the parsing process as ideally only the information relevant + to these metainfos will be parsed. + backend: An object to which the parser will give all the parsed data. + The backend will then determine where and when to output that data. + main_file: A special file that can be considered the main file. + Currently used when interfacing with the scala environment in the + nomad project. """ - self.initialize(contents, metainfo_to_keep, backend) + self.initialize(contents, metainfo_to_keep, backend, main_file) - def initialize(self, contents, metainfo_to_keep, backend): + def initialize(self, contents, metainfo_to_keep, backend, main_file): """Initialize the parser with the given environment. 
""" self.parser_context = ParserContext() self.parser_context.backend = backend self.parser_context.metainfo_to_keep = metainfo_to_keep + self.parser_context.main_file = main_file self.implementation = None # If single path provided, make it into a list if isinstance(contents, basestring): contents = [contents] - # Figure out all the files from the contents if contents: + # Use a set as it will automatically ignore duplicates (nested + # folders may have been included) files = set() + for content in contents: + # Add all files recursively from a directory + found_files = [] if os.path.isdir(content): - dir_files = set() - for filename in os.listdir(content): - dir_files.add(os.path.join(content, filename)) - files |= dir_files + for root, dirnames, filenames in os.walk(content): + for filename in filenames: + filename = os.path.join(root, filename) + found_files.append(filename) + files |= set(found_files) elif os.path.isfile(content): files.add(content) else: @@ -80,7 +91,7 @@ class Parser(object): def search_parseable_files(self, files): """From a list of filenames tries to guess which files are relevant to the parsing process. Essentially filters the files before they are sent - to the parser implementation. + to the parser implementation. By default does not do any filtering. """ return files @@ -282,6 +293,12 @@ class FileParser(object): __metaclass__ = ABCMeta def __init__(self, files, parser_context): + """ + Args: + files: A list of filenames that are parsed and analyzed by this + object. + parser_context: The parsing context that contains e.g. the backend. + """ if not isinstance(files, list): files = [files] self.files = files @@ -360,8 +377,9 @@ class FileParser(object): class ParserContext(object): """Contains everything needed to instantiate a parser implementation. 
""" - def __init__(self, files=None, metainfo_to_keep=None, backend=None, version_id=None): + def __init__(self, files=None, metainfo_to_keep=None, backend=None, version_id=None, main_file=None): self.files = files self.version_id = version_id self.metainfo_to_keep = metainfo_to_keep self.backend = backend + self.main_file = main_file