inputparser.py 20.9 KB
Newer Older
1
import os
2
import re
3
4
import logging
import cPickle as pickle
5
import numpy as np
6
from nomadcore.baseclasses import BasicParser
7
from cp2kparser.generic.inputparsing import *
8
logger = logging.getLogger("nomad")
9
10
11
12
13
14


#===============================================================================
class CP2KInputParser(BasicParser):
    """Used to parse out a CP2K input file.

15
16
17
18
19
20
21
22
23
24
25
26
27
    CP2K offers a complete structure for the input in an XML file, which can be
    printed with the command cp2k --xml. This XML file has been preparsed into
    a native python object ('CP2KInput' class found in generic.inputparsing)
    and stored in a python pickle file. It e.g. contains all the default values
    that are often needed as they are used if the user hasn't specified a
    settings in the input. This XML file is used to get the default values
    because it is rather cumbersome to hard code them in the parser itself,
    especially if there will be lot's of them. Hard coded values will also be
    more error prone, and would have to be checked for each parser version.

    CP2K input supports including other input files and also
    supports variables. This is currently not supported, but may be added at
    some point.
28
29
    """
    def __init__(self, file_path, parser_context):
30
31
32
33
34
35
36
37
38
        """
        Attributes:
            input_tree: The input structure for this version of CP2K. The
                structure is already present, in this module it will be filled with
                data found from the input file.
            input_lines: List of preprocessed lines in the input. Here all the
                variables have been stated explicitly and the additional input files have
                been merged.
        """
39
40
        super(CP2KInputParser, self).__init__(file_path, parser_context)
        self.input_tree = None
41
        self.input_lines = None
42

43
44
45
        #=======================================================================
        # Cached values
        self.cache_service.add_cache_object("configuration_periodic_dimensions", single=False, update=False)
46
        self.cache_service.add_cache_object("trajectory_format")
47
48
        self.cache_service.add_cache_object("each_geo_opt")
        self.cache_service.add_cache_object("traj_add_last")
49

50
51
    def parse(self):

52
53
54
55
56
57
        #=======================================================================
        # Preprocess to spell out variables and to include stuff from other
        # files
        self.preprocess_input()

        #=======================================================================
58
59
60
        # Gather the information from the input file
        self.fill_input_tree(self.file_path)

61
62
63
64
        #=======================================================================
        # Parse everything in the input to cp2k specific metadata
        self.fill_metadata()

65
        #=======================================================================
66
67
68
69
70
71
72
73
74
75
76
77
78
        # Parse the used XC_functionals and their parameters
        xc = self.input_tree.get_section("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
        if xc is not None:
            xc_list = []

            class XCFunctional(object):
                def __init__(self, name, weight=1, parameters=None):
                    self.name = name
                    self.weight = weight
                    self.parameters = parameters

            # First see if a functional has been specified in the section parameter
            section_parameter = xc.section_parameter.value
79
            if section_parameter is not None and section_parameter != "NO_SHORTCUT":
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

                if section_parameter == "BLYP":
                    xc_list.append(XCFunctional("GGA_X_B88"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "LDA" or section_parameter == "PADE":
                    xc_list.append(XCFunctional("LDA_XC_TETER93"))

                elif section_parameter == "PBE":
                    xc_list.append(XCFunctional("GGA_X_PBE"))
                    xc_list.append(XCFunctional("GGA_C_PBE"))

                elif section_parameter == "OLYP":
                    xc_list.append(XCFunctional("GGA_X_OPTX"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "HCTH120":
                    xc_list.append(XCFunctional("GGA_XC_HCTH_120"))

                elif section_parameter == "PBE0":
                    xc_list.append(XCFunctional("HYB_GGA_XC_PBEH"))

                elif section_parameter == "B3LYP":
                    xc_list.append(XCFunctional("HYB_GGA_XC_B3LYP"))

Lauri Himanen's avatar
Lauri Himanen committed
105
106
107
108
                elif section_parameter == "TPSS":
                    xc_list.append(XCFunctional("MGGA_X_TPSS"))
                    xc_list.append(XCFunctional("MGGA_C_TPSS"))

109
110
111
112
113
                else:
                    logger.warning("Unknown XC functional given in XC_FUNCTIONAL section parameter.")

            # Otherwise one has to look at the individual functional settings
            else:
Lauri Himanen's avatar
Lauri Himanen committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
                pbe = xc.get_subsection("PBE")
                if pbe is not None:
                    if pbe.accessed:
                        sp = pbe.get_section_parameter()
                        if sp == "T":
                            parametrization = pbe.get_keyword_value_formatted("PARAMETRIZATION")
                            scale_x = pbe.get_keyword_value_formatted("SCALE_X")
                            scale_c = pbe.get_keyword_value_formatted("SCALE_C")
                            if parametrization == "ORIG":
                                xc_list.append(XCFunctional("GGA_X_PBE", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                            elif parametrization == "PBESOL":
                                xc_list.append(XCFunctional("GGA_X_PBE_SOL", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE_SOL", scale_c))
                            elif parametrization == "REVPBE":
                                xc_list.append(XCFunctional("GGA_X_PBE_R", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                tpss = xc.get_subsection("TPSS")
                if tpss is not None:
                    if tpss.accessed:
                        sp = tpss.get_section_parameter()
                        if sp == "T":
                            scale_x = tpss.get_keyword_value_formatted("SCALE_X")
                            scale_c = tpss.get_keyword_value_formatted("SCALE_C")
                            xc_list.append(XCFunctional("MGGA_X_TPSS", scale_x))
                            xc_list.append(XCFunctional("MGGA_C_TPSS", scale_c))

141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166

            # Sort the functionals alphabetically by name
            xc_list.sort(key=lambda x: x.name)
            xc_summary = ""

            # For every defined functional, stream the information to the
            # backend and construct the summary string
            for i, functional in enumerate(xc_list):

                gId = self.backend.openSection("section_XC_functionals")
                self.backend.addValue("XC_functional_name", functional.name)
                self.backend.addValue("XC_functional_weight", functional.weight)
                if functional.parameters is not None:
                    pass
                self.backend.closeSection("section_XC_functionals", gId)

                if i != 0:
                    xc_summary += "+"
                xc_summary += "{}*{}".format(functional.weight, functional.name)
                if functional.parameters is not None:
                    xc_summary += ":{}".format()

            # Stream summary
            if xc_summary is not "":
                self.backend.addValue("XC_functional", xc_summary)

167
168
        #=======================================================================
        # Cell periodicity
Lauri Himanen's avatar
Lauri Himanen committed
169
        periodicity = self.input_tree.get_keyword_value_formatted("FORCE_EVAL/SUBSYS/CELL/PERIODIC")
170
171
172
        if periodicity is not None:
            periodicity = periodicity.upper()
            periodicity_list = ("X" in periodicity, "Y" in periodicity, "Z" in periodicity)
173
            self.cache_service["configuration_periodic_dimensions"] = np.asarray(periodicity_list)
174
175
176
        else:
            logger.warning("Could not determine cell periodicity from FORCE_EVAL/SUBSYS/CELL/PERIODIC")

177
178
        #=======================================================================
        # Single point force file name
179
180
181
182
183
        self.setup_force_file_name()

        #=======================================================================
        # Trajectory file name
        self.setup_trajectory_file_name()
184

185
186
        #=======================================================================
        # Stress tensor calculation method
Lauri Himanen's avatar
Lauri Himanen committed
187
        stress_tensor_method = self.input_tree.get_keyword_value_formatted("FORCE_EVAL/STRESS_TENSOR")
188
189
190
191
192
193
194
195
196
197
198
        if stress_tensor_method != "NONE":
            mapping = {
                "NUMERICAL": "Numerical",
                "ANALYTICAL": "Analytical",
                "DIAGONAL_ANALYTICAL": "Diagonal analytical",
                "DIAGONAL_NUMERICAL": "Diagonal numerical",
            }
            stress_tensor_method = mapping.get(stress_tensor_method)
            if stress_tensor_method is not None:
                self.backend.addValue("stress_tensor_method", stress_tensor_method)

199
    def normalize_x_cp2k_path(self, path):
200
201
202
        """The paths in CP2K input can be given in many ways. This function
        tries to normalize these forms into a valid path.
        """
203
        # Path is exactly as given
204
205
        if path.startswith("="):
            normalized_path = path[1:]
206
        # Path is relative, no project name added
207
        elif re.match(r"./", path):
208
209
            normalized_path = path
        # Path is relative, project name added
210
        else:
Lauri Himanen's avatar
Lauri Himanen committed
211
            project_name = self.input_tree.get_keyword_value_formatted("GLOBAL/PROJECT_NAME")
212
213
214
215
            if path:
                normalized_path = "{}-{}".format(project_name, path)
            else:
                normalized_path = project_name
216
217
        return normalized_path

218
219
220
    def setup_force_file_name(self):
        """Setup the force file path.
        """
221
        force_file = self.input_tree.get_keyword_value_formatted("FORCE_EVAL/PRINT/FORCES/FILENAME")
222
223
        extension = "xyz"
        if force_file is not None and force_file != "__STD_OUT__":
224
            normalized_path = self.normalize_x_cp2k_path(force_file)
225
226
227
228
229
230
            final_path = "{}-1_0.{}".format(normalized_path, extension)
            self.file_service.set_file_id(final_path, "force_file_single_point")

    def setup_trajectory_file_name(self):
        """Setup the trajectory file path.
        """
231
232
233
234
235
236
237
238
        traj_format = self.input_tree.get_keyword_value_formatted("MOTION/PRINT/TRAJECTORY/FORMAT")
        traj_filename = self.input_tree.get_keyword_value_formatted("MOTION/PRINT/TRAJECTORY/FILENAME")
        geo_opt_each = self.input_tree.get_keyword_value_formatted("MOTION/PRINT/TRAJECTORY/EACH/GEO_OPT")
        traj_add_last = self.input_tree.get_keyword_value_formatted("MOTION/PRINT/TRAJECTORY/ADD_LAST")
        self.cache_service["each_geo_opt"] = geo_opt_each
        self.cache_service["traj_add_last"] = traj_add_last
        if traj_filename is None:
            traj_filename = ""
239
240
241
242
243
244
245
246
247
248
249
250
        self.cache_service["trajectory_format"] = traj_format
        extension_map = {
            "XYZ": "xyz",
            "XMOL": "xyz",
            "ATOMIC": "xyz",
            "PDB": "pdb",
            "DCD": "dcd",
        }
        extension = extension_map.get(traj_format)
        if extension is None:
            logger.error("Unknown file format '{}' for CP2K trajectory file ".format(traj_format))
            return
251
        normalized_path = self.normalize_x_cp2k_path(traj_filename)
252
        final_path = "{}-pos-1.{}".format(normalized_path, extension)
253
254
        self.file_service.set_file_id(final_path, "trajectory")

255
256
257
258
    def fill_input_tree(self, file_path):
        """Parses a CP2K input file into an object tree.

        Return an object tree represenation of the input augmented with the
259
        default values and lone keyword values from the x_cp2k_input.xml file
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
        which is version specific. Keyword aliases are also mapped to the same
        data.

        The cp2k input is largely case-insensitive. In the input tree, we wan't
        only one standard way to name things, so all section names and section
        parameters will be transformed into upper case.

        To query the returned tree use the following functions:
            get_keyword("GLOBAL/PROJECT_NAME")
            get_parameter("GLOBAL/PRINT")
            get_default_keyword("FORCE_EVAL/SUBSYS/COORD")

        Args:
            : A string containing the contents of a CP2K input file. The
            input file can be stored as string as it isn't that big.

        Returns:
            The input as an object tree.
        """

        self.setup_version(self.parser_context.version_id)
        section_stack = []
282
283
284
        self.input_tree.root_section.accessed = True

        for line in self.input_lines:
Lauri Himanen's avatar
Lauri Himanen committed
285
286

            # Remove comments and whitespaces
Lauri Himanen's avatar
Lauri Himanen committed
287
            line = line.split('!', 1)[0].split('#', 1)[0].strip()
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322

            # Skip empty lines
            if len(line) == 0:
                continue

            # Section ends
            if line.upper().startswith('&END'):
                section_stack.pop()
            # Section starts
            elif line[0] == '&':
                parts = line.split(' ', 1)
                name = parts[0][1:].upper()
                section_stack.append(name)

                # Form the path
                path = ""
                for index, item in enumerate(section_stack):
                    if index != 0:
                        path += '/'
                    path += item

                # Mark the section as accessed.
                self.input_tree.set_section_accessed(path)

                # Save the section parameters
                if len(parts) > 1:
                    self.input_tree.set_parameter(path, parts[1].strip().upper())

            # Ignore variables and includes that might still be here for some
            # reason
            elif line.upper().startswith('@'):
                continue

            # Contents (keywords, default keywords)
            else:
323
                split = line.split(None, 1)
Lauri Himanen's avatar
Lauri Himanen committed
324
                if len(split) <= 1:
Lauri Himanen's avatar
Lauri Himanen committed
325
326
327
                    keyword_value = ""
                else:
                    keyword_value = split[1]
328
                keyword_name = split[0].upper()
Lauri Himanen's avatar
Lauri Himanen committed
329
330
331
332
                try:
                    self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value)
                except UnboundLocalError:
                    print line
333
334
335
336
337

                # Here we store some exceptional print settings that are
                # inportant to the parsing. These dont exist in the input tree
                # because they take much space and are not really important
                # otherwise.
338
339
340
341
342
343
344
345
346
                # if path == "FORCE_EVAL/PRINT/FORCES":
                    # if keyword_name == "FILENAME":
                        # self.force_file_name = keyword_value
                # if path == "MOTION/PRINT/TRAJECTORY":
                    # if keyword_name == "FILENAME":
                        # self.trajectory_file_name = keyword_value
                # if path == "MOTION/PRINT/TRAJECTORY":
                    # if keyword_name == "FORMAT":
                        # self.trajectory_format = keyword_value
347
348
349
350
351
352
353
354
355
356
357
358
359
360

    def fill_metadata(self):
        """Goes through the input data and pushes everything to the
        backend.
        """
        name_stack = []
        self.fill_metadata_recursively(self.input_tree.root_section, name_stack)

    def fill_metadata_recursively(self, section, name_stack):
        """Recursively goes through the input sections and pushes everything to the
        backend.
        """
        if not section.accessed:
            return
361

362
        name_stack.append(section.name)
363
364
        path = "x_cp2k_section_{}".format(".".join(name_stack))
        not_section_path = "x_cp2k_{}".format(".".join(name_stack))
365

366
        gid = self.backend.openSection(path)
367

368
369
370
371
372
        # Keywords
        for default_name in section.default_keyword_names:
            keywords = section.keywords.get(default_name)
            for keyword in keywords:
                if keyword.value is not None:
373
                    name = "{}.{}".format(not_section_path, keyword.default_name)
374
                    self.backend.addValue(name, keyword.value)
375
376

        # Section parameter
377
378
        section_parameter = section.section_parameter
        if section_parameter is not None:
379
            name = "{}.SECTION_PARAMETERS".format(not_section_path)
Lauri Himanen's avatar
Lauri Himanen committed
380
381
            if section_parameter.value is not None:
                self.backend.addValue(name, section_parameter.value)
382
383

        # Default keyword
384
385
386
        default_keyword = section.default_keyword
        if default_keyword is not None:

387
            name = "{}.DEFAULT_KEYWORD".format(not_section_path)
388
            self.backend.addValue(name, default_keyword.value)
389
390
391
392
393
394
395
396
397

        # Subsections
        for name, subsections in section.sections.iteritems():
            for subsection in subsections:
                self.fill_metadata_recursively(subsection, name_stack)

        self.backend.closeSection(path, gid)

        name_stack.pop()
398
399
400

    def setup_version(self, version_number):
        """ The pickle file which contains preparsed data from the
401
        x_cp2k_input.xml is version specific. By calling this function before
402
403
404
405
406
        parsing the correct file can be found.
        """
        pickle_path = os.path.dirname(__file__) + "/input_data/cp2k_input_tree.pickle".format(version_number)
        input_tree_pickle_file = open(pickle_path, 'rb')
        self.input_tree = pickle.load(input_tree_pickle_file)
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489

    def preprocess_input(self):
        """Preprocess the input file. Concatenate .inc files into the main
        input file and explicitly state all variables.
        """
        # Read the input file into memory. It shouldn't be that big so we can
        # do this easily
        input_lines = []
        with open(self.file_path, "r") as f:
            for line in f:
                input_lines.append(line.strip())

        # Merge include files to input
        extended_input = input_lines[:]  # Make a copy
        i_line = 0
        for line in input_lines:
            if line.startswith("@INCLUDE") or line.startswith("@include"):
                split = line.split(None, 1)
                includepath = split[1]
                basedir = os.path.dirname(self.file_path)
                filepath = os.path.join(basedir, includepath)
                filepath = os.path.abspath(filepath)
                if not os.path.isfile(filepath):
                    logger.warning("Could not find the include file '{}' stated in the CP2K input file. Continuing without it.".format(filepath))
                    continue

                # Get the content from include file
                included_lines = []
                with open(filepath, "r") as includef:
                    for line in includef:
                        included_lines.append(line.strip())
                    del extended_input[i_line]
                    extended_input[i_line:i_line] = included_lines
                    i_line += len(included_lines)
            i_line += 1

        # Gather the variable definitions
        variables = {}
        input_set_removed = []
        for i_line, line in enumerate(extended_input):
            if line.startswith("@SET") or line.startswith("@set"):
                components = line.split(None, 2)
                name = components[1]
                value = components[2]
                variables[name] = value
                logger.debug("Variable '{}' found with value '{}'".format(name, value))
            else:
                input_set_removed.append(line)

        # Place the variables
        variable_pattern = r"\@\{(\w+)\}|@(\w+)"
        compiled = re.compile(variable_pattern)
        reserved = ("include", "set", "if", "endif")
        input_variables_replaced = []
        for line in input_set_removed:
            results = compiled.finditer(line)
            new_line = line
            offset = 0
            for result in results:
                options = result.groups()
                first = options[0]
                second = options[1]
                if first:
                    name = first
                elif second:
                    name = second
                if name in reserved:
                    continue
                value = variables.get(name)
                if not value:
                    logger.error("Value for variable '{}' not set.".format(name))
                    continue
                len_value = len(value)
                len_name = len(name)
                start = result.start()
                end = result.end()
                beginning = new_line[:offset+start]
                rest = new_line[offset+end:]
                new_line = beginning + value + rest
                offset += len_value - len_name - 1
            input_variables_replaced.append(new_line)

        self.input_lines = input_variables_replaced