inputparser.py 16.8 KB
Newer Older
1
import os
2
import re
3
4
import logging
import cPickle as pickle
5
import numpy as np
6
from nomadcore.baseclasses import BasicParser
7
from cp2kparser.generic.inputparsing import *
8
logger = logging.getLogger("nomad")
9
10
11
12
13
14


#===============================================================================
class CP2KInputParser(BasicParser):
    """Used to parse out a CP2K input file.

    CP2K offers a complete structure for the input in an XML file, which can be
    printed with the command cp2k --xml. This XML file has been preparsed into
    a native python object ('CP2KInput' class found in generic.inputparsing)
    and stored in a python pickle file. It e.g. contains all the default values
    that are often needed as they are used if the user hasn't specified a
    setting in the input. This XML file is used to get the default values
    because it is rather cumbersome to hard code them in the parser itself,
    especially if there will be lots of them. Hard coded values will also be
    more error prone, and would have to be checked for each parser version.

    CP2K input supports including other input files and also supports
    variables. Both are resolved by preprocess_input() before the input tree
    is filled.
    """
    def __init__(self, file_path, parser_context):
        """Initialise the base parser and reset the parsing state.

        Attributes:
            input_tree: The input structure for this version of CP2K. It is
                loaded by setup_version() and then filled with the data found
                from the input file.
            input_lines: List of preprocessed lines in the input. Here all the
                variables have been stated explicitly and the additional input
                files have been merged.
            force_file_name: The FILENAME keyword found under
                FORCE_EVAL/PRINT/FORCES in the input, or None if it has not
                been encountered.
        """
        super(CP2KInputParser, self).__init__(file_path, parser_context)

        # Nothing has been parsed yet: every piece of state starts out empty
        # and is populated while parse() runs.
        self.force_file_name = None
        self.input_lines = None
        self.input_tree = None
43
44
45

    def parse(self):
        """Parse the CP2K input file and push the results to the backend.

        The input is first preprocessed (include files merged, variables
        replaced), then read into the input tree, after which the accessed
        part of the tree is streamed to the backend as CP2K specific
        metadata. Finally a few quantities are derived from the tree: the XC
        functional, the cell periodicity, the single point force file and the
        stress tensor calculation method.
        """
        #=======================================================================
        # Preprocess to spell out variables and to include stuff from other
        # files
        self.preprocess_input()

        #=======================================================================
        # Gather the information from the input file
        self.fill_input_tree(self.file_path)

        #=======================================================================
        # Parse everything in the input to cp2k specific metadata
        self.fill_metadata()

        #=======================================================================
        # Parse the used XC_functionals and their parameters
        xc = self.input_tree.get_section("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
        if xc is not None:
            xc_list = []

            class XCFunctional(object):
                def __init__(self, name, weight=1, parameters=None):
                    self.name = name
                    self.weight = weight
                    self.parameters = parameters

            # Functional names that each supported section parameter shortcut
            # expands to
            shortcuts = {
                "BLYP": ("GGA_X_B88", "GGA_C_LYP"),
                "LDA": ("LDA_XC_TETER93",),
                "PADE": ("LDA_XC_TETER93",),
                "PBE": ("GGA_X_PBE", "GGA_C_PBE"),
                "OLYP": ("GGA_X_OPTX", "GGA_C_LYP"),
                "HCTH120": ("GGA_XC_HCTH_120",),
                "PBE0": ("HYB_GGA_XC_PBEH",),
                "B3LYP": ("HYB_GGA_XC_B3LYP",),
            }

            # First see if a functional has been specified in the section
            # parameter. Otherwise one would have to look at the individual
            # functional settings, which is not implemented yet.
            section_parameter = xc.section_parameter.value
            if section_parameter is not None:
                functional_names = shortcuts.get(section_parameter)
                if functional_names is None:
                    logger.warning("Unknown XC functional given in XC_FUNCTIONAL section parameter.")
                else:
                    xc_list.extend(XCFunctional(name) for name in functional_names)

            # Sort the functionals alphabetically by name
            xc_list.sort(key=lambda x: x.name)

            # For every defined functional, stream the information to the
            # backend and construct the summary string
            summary_parts = []
            for functional in xc_list:

                gId = self.backend.openSection("section_XC_functionals")
                self.backend.addValue("XC_functional_name", functional.name)
                self.backend.addValue("XC_functional_weight", functional.weight)
                self.backend.closeSection("section_XC_functionals", gId)

                part = "{}*{}".format(functional.weight, functional.name)
                if functional.parameters is not None:
                    part += ":{}".format(functional.parameters)
                summary_parts.append(part)

            # Stream summary, but only if at least one functional was found
            xc_summary = "+".join(summary_parts)
            if xc_summary:
                self.backend.addValue("XC_functional", xc_summary)

        #=======================================================================
        # Cell periodicity
        periodicity = self.input_tree.get_keyword("FORCE_EVAL/SUBSYS/CELL/PERIODIC")
        if periodicity is not None:
            periodicity = periodicity.upper()
            periodicity_list = ("X" in periodicity, "Y" in periodicity, "Z" in periodicity)
            self.backend.addArrayValues("configuration_periodic_dimensions", np.asarray(periodicity_list))
        else:
            logger.warning("Could not determine cell periodicity from FORCE_EVAL/SUBSYS/CELL/PERIODIC")

        #=======================================================================
        # Single point force file name
        # force_file = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
        force_file = self.force_file_name
        if force_file is not None and force_file != "__STD_OUT__":
            force_file_path = self.normalize_cp2k_path(force_file, "xyz")
            self.file_service.set_file_id(force_file_path, "force_file_single_point")

        #=======================================================================
        # Stress tensor calculation method
        stress_tensor_method = self.input_tree.get_keyword("FORCE_EVAL/STRESS_TENSOR")
        if stress_tensor_method is not None:
            mapping = {
                "NUMERICAL": "Numerical",
                "ANALYTICAL": "Analytical",
                "DIAGONAL_ANALYTICAL": "Diagonal analytical",
                "DIAGONAL_NUMERICAL": "Diagonal numerical",
            }
            # The keyword value is stored as it was written in the input, so
            # normalize it before the lookup. "NONE" is not in the mapping
            # and is therefore skipped.
            # NOTE(review): assumes get_keyword returns a string here, as the
            # periodicity handling above does - confirm.
            stress_tensor_method = mapping.get(stress_tensor_method.strip().upper())
            if stress_tensor_method is not None:
                self.backend.addValue("stress_tensor_method", stress_tensor_method)

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
    def normalize_cp2k_path(self, path, extension, name=""):
        """The paths in CP2K input can be given in many ways. This function
        tries to normalize these forms into a valid path.

        Args:
            path: The file name as it is written in the CP2K input.
            extension: The file extension, given without the leading dot.
            name: Optional extra name that is inserted, prefixed with a dash,
                before the "-1_0" suffix.

        Returns:
            The normalized path as a string:
              - "=filename" gives exactly "filename"
              - "./filename" and "../filename" keep their prefix and do not
                get the project name
              - anything else is prefixed with the project name read from
                GLOBAL/PROJECT_NAME
        """
        # An explicitly given file name is used as it is
        if path.startswith("="):
            return path[1:]

        if name:
            name = "-" + name

        # Literal prefix test: a regular expression with an unescaped dot
        # would also accept any other character in front of the slash.
        if path.startswith(("./", "../")):
            return "{}{}-1_0.{}".format(path, name, extension)

        project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME")
        return "{}-{}{}-1_0.{}".format(project_name, path, name, extension)

178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
    def fill_input_tree(self, file_path):
        """Parses the preprocessed CP2K input into an object tree.

        The version specific input tree is first loaded with setup_version()
        and then filled with the sections, section parameters and keywords
        found in self.input_lines. The tree comes with the default values and
        lone keyword values from the cp2k_input.xml file which is version
        specific. Keyword aliases are also mapped to the same data.

        The cp2k input is largely case-insensitive. In the input tree, we want
        only one standard way to name things, so all section names and section
        parameters will be transformed into upper case.

        To query the tree use the following functions:
            get_keyword("GLOBAL/PROJECT_NAME")
            get_parameter("GLOBAL/PRINT")
            get_default_keyword("FORCE_EVAL/SUBSYS/COORD")

        Args:
            file_path: Not used. The lines are read from self.input_lines,
                which preprocess_input() fills.

        Returns:
            None. The filled tree is stored in self.input_tree and the
            FILENAME keyword of FORCE_EVAL/PRINT/FORCES, if present, in
            self.force_file_name.
        """

        self.setup_version(self.parser_context.version_id)
        section_stack = []
        path = ""
        self.input_tree.root_section.accessed = True

        for line in self.input_lines:
            # Drop the comment part of the line
            line = line.split('!', 1)[0].strip()

            # Skip empty lines
            if not line:
                continue

            # Section ends
            if line.upper().startswith('&END'):
                if section_stack:
                    section_stack.pop()
                else:
                    logger.warning("Found a section end without a matching section start in the CP2K input file.")
                # The keywords that follow belong to the enclosing section
                path = "/".join(section_stack)

            # Section starts
            elif line[0] == '&':
                # Split on any whitespace so that tabs and repeated blanks
                # are handled as well
                parts = line.split(None, 1)
                section_stack.append(parts[0][1:].upper())

                # Form the path
                path = "/".join(section_stack)

                # Mark the section as accessed.
                self.input_tree.set_section_accessed(path)

                # Save the section parameters
                if len(parts) > 1:
                    self.input_tree.set_parameter(path, parts[1].strip().upper())

            # Ignore variables and includes that might still be here for some
            # reason
            elif line.startswith('@'):
                continue

            # Contents (keywords, default keywords)
            else:
                parts = line.split(None, 1)
                keyword_name = parts[0].upper()
                # A keyword may be given without a value.
                # NOTE(review): assumes set_keyword accepts an empty string
                # for such lone keywords - confirm.
                keyword_value = parts[1] if len(parts) > 1 else ""
                self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value)

                # Here we store some exceptional print settings that are
                # important to the parsing. These don't exist in the input
                # tree because they take much space and are not really
                # important otherwise.
                if path == "FORCE_EVAL/PRINT/FORCES" and keyword_name == "FILENAME":
                    self.force_file_name = keyword_value

    def fill_metadata(self):
        """Pushes the input tree to the backend, starting the recursion from
        the root section with an empty name stack.
        """
        root = self.input_tree.root_section
        self.fill_metadata_recursively(root, [])

    def fill_metadata_recursively(self, section, name_stack):
        """Pushes the given section and, recursively, its subsections to the
        backend. Sections that have not been marked as accessed are skipped
        together with everything below them.

        Args:
            section: The input tree section to process.
            name_stack: List of the names of the enclosing sections. The name
                of this section is pushed to it for the duration of the call
                and it is used to build the "x_cp2k_" prefixed metadata names.
        """
        if not section.accessed:
            return

        push_value = self.add_formatted_value_to_backend

        name_stack.append(section.name)
        section_path = "x_cp2k_{}".format(".".join(name_stack))
        section_id = self.backend.openSection(section_path)

        # Keywords: only the ones that have a value are pushed
        for keyword_name in section.default_keyword_names:
            for keyword in section.keywords.get(keyword_name):
                if keyword.value is None:
                    continue
                metaname = "{}.{}".format(section_path, keyword.default_name)
                push_value(metaname, keyword.get_formatted_value())

        # Section parameter
        parameter = section.section_parameter
        if parameter is not None:
            metaname = "{}.SECTION_PARAMETERS".format(section_path)
            push_value(metaname, parameter.get_formatted_value())

        # Default keyword
        default = section.default_keyword
        if default is not None:
            metaname = "{}.DEFAULT_KEYWORD".format(section_path)
            push_value(metaname, default.get_formatted_value())

        # Subsections are handled while this section is still open
        for _, children in section.sections.iteritems():
            for child in children:
                self.fill_metadata_recursively(child, name_stack)

        self.backend.closeSection(section_path, section_id)
        name_stack.pop()
308

309
310
311
312
313
314
315
    def add_formatted_value_to_backend(self, name, formatted_value):
        """Pushes a formatted value to the backend under the given name.

        Numpy arrays are pushed with addArrayValues, everything else with
        addValue. Nothing is pushed if the value is None.
        """
        if formatted_value is None:
            return

        if isinstance(formatted_value, np.ndarray):
            push = self.backend.addArrayValues
        else:
            push = self.backend.addValue
        push(name, formatted_value)

316
317
318
319
320
321
322
323
    def setup_version(self, version_number):
        """Loads the preparsed CP2K input tree into self.input_tree.

        The pickle file which contains preparsed data from the
        cp2k_input.xml is meant to be version specific. By calling this
        function before parsing the correct file can be found.

        Args:
            version_number: The CP2K version identifier. NOTE: it currently
                has no effect on which file is loaded, because the file path
                does not contain a version dependent part.
        """
        module_dir = os.path.dirname(os.path.abspath(__file__))
        pickle_path = os.path.join(module_dir, "input_data", "cp2k_input_tree.pickle")

        # SECURITY: unpickling executes arbitrary code. The file is read from
        # a fixed location next to this module; never point this at a file
        # from an untrusted source.
        with open(pickle_path, 'rb') as input_tree_pickle_file:
            self.input_tree = pickle.load(input_tree_pickle_file)
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

    def preprocess_input(self):
        """Preprocess the input file. Concatenate .inc files into the main
        input file and explicitly state all variables.

        The work is done in three passes:
            1. Every @INCLUDE directive is replaced by the lines of the file
               it points to. The path is resolved relative to the directory
               of the main input file. Directives whose file cannot be found
               are dropped with a warning. Include files are not searched for
               further @INCLUDE directives.
            2. The @SET directives are collected into a variable table and
               removed from the input.
            3. Variable references of the form @{NAME}, @NAME, ${NAME} and
               $NAME are replaced with the value of the variable. References
               to unknown variables are left as they are and an error is
               logged.

        The directive names are matched case-insensitively. The resulting
        list of lines is stored in self.input_lines.
        """
        # Read the input file into memory. It shouldn't be that big so we can
        # do this easily
        with open(self.file_path, "r") as f:
            input_lines = [line.strip() for line in f]

        # Merge include files to input. The result is built into a new list
        # so that no index into the original lines has to be kept in sync.
        basedir = os.path.dirname(self.file_path)
        extended_input = []
        for line in input_lines:
            tokens = line.split(None, 1)
            if not tokens or tokens[0].upper() != "@INCLUDE":
                extended_input.append(line)
                continue
            if len(tokens) < 2:
                logger.warning("Found an @INCLUDE directive without a file path in the CP2K input file. Continuing without it.")
                continue
            filepath = os.path.abspath(os.path.join(basedir, tokens[1]))
            if not os.path.isfile(filepath):
                logger.warning("Could not find the include file '{}' stated in the CP2K input file. Continuing without it.".format(filepath))
                continue

            # Get the content from include file
            with open(filepath, "r") as includef:
                extended_input.extend(included.strip() for included in includef)

        # Gather the variable definitions
        variables = {}
        input_set_removed = []
        for line in extended_input:
            components = line.split(None, 2)
            if not components or components[0].upper() != "@SET":
                input_set_removed.append(line)
                continue
            if len(components) < 3:
                logger.warning("Found a malformed @SET directive '{}' in the CP2K input file. Continuing without it.".format(line))
                continue
            name, value = components[1], components[2]
            variables[name] = value
            logger.debug("Variable '{}' found with value '{}'".format(name, value))

        # Place the variables
        variable_pattern = r"\@\{(\w+)\}|@(\w+)|\$\{(\w+)\}|\$(\w+)"
        compiled = re.compile(variable_pattern)
        reserved = ("include", "set", "if", "endif")

        def replace_variable(match):
            """Return the replacement text for one variable reference."""
            # Exactly one of the alternatives, and thus one group, matched
            name = match.group(match.lastindex)

            # Preprocessor directives look like the @NAME form of a variable
            # reference, but they are not variables
            if match.group(0).startswith("@") and name.lower() in reserved:
                return match.group(0)

            value = variables.get(name)
            if value is None:
                logger.error("Value for variable '{}' not set.".format(name))
                return match.group(0)
            return value

        # re.sub takes care of the changing line length when several
        # references are replaced on the same line
        self.input_lines = [
            compiled.sub(replace_variable, line) for line in input_set_removed
        ]