import os
import re
import logging
import cPickle as pickle
import numpy as np
from nomadcore.baseclasses import BasicParser
from cp2kparser.generic.inputparsing import *
logger = logging.getLogger("nomad")


#===============================================================================
class CP2KInputParser(BasicParser):
    """Used to parse out a CP2K input file.

    CP2K offers a complete description of its input structure as an XML file,
    which can be printed with the command cp2k --xml. This XML file has been
    preparsed into a native python object (the 'CP2KInput' class found in
    generic.inputparsing) and stored in a python pickle file. Among other
    things it contains all the default values, which are often needed because
    they apply whenever the user has not specified a setting in the input. The
    XML file is used for the default values because hard coding them in the
    parser itself would be rather cumbersome, especially as there are lots of
    them. Hard coded values would also be more error prone and would have to
    be checked for each parser version.

    CP2K input files may include other input files and may contain variables.
    Both are handled in a preprocessing step (see preprocess_input) that
    merges the included files into the main input and spells out the variable
    values before the tree is filled.
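
    A minimal usage sketch (hypothetical driver code; in practice the
    surrounding CP2K parser constructs and calls this object):

        inp_parser = CP2KInputParser("/path/to/project.inp", parser_context)
        inp_parser.parse()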
    """
    def __init__(self, file_path, parser_context):
        """
        Attributes:
            input_tree: The input structure for this version of CP2K. The
                structure itself is already present; in this module it is
                filled with the data found in the input file.
            input_lines: List of preprocessed lines in the input. Here all the
                variables have been stated explicitly and the additional input files have
                been merged.
        """
        super(CP2KInputParser, self).__init__(file_path, parser_context)
        self.input_tree = None
        self.input_lines = None
        self.unit_mapping = {
            # Distance
            "BOHR": "bohr",
            "M": "m",
            "PM": "pm",
            "NM": "nm",
            "ANGSTROM": "angstrom",
            # Time
            "S": "s",
            "FS": "fs",
            "PS": "ps",
            "AU_T": "(planckConstant/hartree)",
            "WAVENUMBER_T": None,
        }

        #=======================================================================
        # Cached values
        self.cache_service.add("configuration_periodic_dimensions", single=False, update=False)
        self.cache_service.add("trajectory_format")
        self.cache_service.add("trajectory_unit")
        self.cache_service.add("velocity_format")
        self.cache_service.add("velocity_unit")
        self.cache_service.add("vel_add_last")
        self.cache_service.add("each_geo_opt")
        self.cache_service.add("traj_add_last")

    def parse(self):

        #=======================================================================
        # Preprocess to spell out variables and to include stuff from other
        # files
        self.preprocess_input()

        #=======================================================================
        # Gather the information from the input file
        self.fill_input_tree(self.file_path)

        #=======================================================================
        # Parse everything in the input to cp2k specific metadata
        self.fill_metadata()

        #=======================================================================
        # Parse the used XC_functionals and their parameters
        xc = self.input_tree.get_section("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
        if xc is not None:
            xc_list = []

            class XCFunctional(object):
                def __init__(self, name, weight=1, parameters=None):
                    self.name = name
                    self.weight = weight
                    self.parameters = parameters

            # First see if a functional has been specified in the section parameter
            section_parameter = xc.section_parameter.value
            if section_parameter is not None and section_parameter != "NO_SHORTCUT":

                if section_parameter == "BLYP":
                    xc_list.append(XCFunctional("GGA_X_B88"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "LDA" or section_parameter == "PADE":
                    xc_list.append(XCFunctional("LDA_XC_TETER93"))

                elif section_parameter == "PBE":
                    xc_list.append(XCFunctional("GGA_X_PBE"))
                    xc_list.append(XCFunctional("GGA_C_PBE"))

                elif section_parameter == "OLYP":
                    xc_list.append(XCFunctional("GGA_X_OPTX"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "HCTH120":
                    xc_list.append(XCFunctional("GGA_XC_HCTH_120"))

                elif section_parameter == "PBE0":
                    xc_list.append(XCFunctional("HYB_GGA_XC_PBEH"))

                elif section_parameter == "B3LYP":
                    xc_list.append(XCFunctional("HYB_GGA_XC_B3LYP"))

                elif section_parameter == "TPSS":
                    xc_list.append(XCFunctional("MGGA_X_TPSS"))
                    xc_list.append(XCFunctional("MGGA_C_TPSS"))

                else:
                    logger.warning("Unknown XC functional given in XC_FUNCTIONAL section parameter.")

            # Otherwise one has to look at the individual functional settings
            else:
                pbe = xc.get_subsection("PBE")
                if pbe is not None:
                    if pbe.accessed:
                        sp = pbe.get_section_parameter()
                        if sp == "T":
                            parametrization = pbe.get_keyword("PARAMETRIZATION")
                            scale_x = pbe.get_keyword("SCALE_X")
                            scale_c = pbe.get_keyword("SCALE_C")
                            if parametrization == "ORIG":
                                xc_list.append(XCFunctional("GGA_X_PBE", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                            elif parametrization == "PBESOL":
                                xc_list.append(XCFunctional("GGA_X_PBE_SOL", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE_SOL", scale_c))
                            elif parametrization == "REVPBE":
                                xc_list.append(XCFunctional("GGA_X_PBE_R", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                tpss = xc.get_subsection("TPSS")
                if tpss is not None:
                    if tpss.accessed:
                        sp = tpss.get_section_parameter()
                        if sp == "T":
                            scale_x = tpss.get_keyword("SCALE_X")
                            scale_c = tpss.get_keyword("SCALE_C")
                            xc_list.append(XCFunctional("MGGA_X_TPSS", scale_x))
                            xc_list.append(XCFunctional("MGGA_C_TPSS", scale_c))

            # Sort the functionals alphabetically by name
            xc_list.sort(key=lambda x: x.name)
            xc_summary = ""

            # For every defined functional, stream the information to the
            # backend and construct the summary string
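            # The summary has the form "<weight>*<name>+<weight>*<name>+...",
            # e.g. "1*GGA_C_PBE+1*GGA_X_PBE" for plain PBE (functionals are
            # sorted alphabetically above).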
            for i, functional in enumerate(xc_list):

                gId = self.backend.openSection("section_XC_functionals")
                self.backend.addValue("XC_functional_name", functional.name)
                self.backend.addValue("XC_functional_weight", functional.weight)
                if functional.parameters is not None:
                    pass
                self.backend.closeSection("section_XC_functionals", gId)

                if i != 0:
                    xc_summary += "+"
                xc_summary += "{}*{}".format(functional.weight, functional.name)
                if functional.parameters is not None:
                    xc_summary += ":{}".format()

            # Stream summary
            if xc_summary != "":
                self.backend.addValue("XC_functional", xc_summary)

        #=======================================================================
        # Cell periodicity
        periodicity = self.input_tree.get_keyword("FORCE_EVAL/SUBSYS/CELL/PERIODIC")
        if periodicity is not None:
            periodicity = periodicity.upper()
            periodicity_list = ("X" in periodicity, "Y" in periodicity, "Z" in periodicity)
            self.cache_service["configuration_periodic_dimensions"] = np.asarray(periodicity_list)
        else:
            logger.warning("Could not determine cell periodicity from FORCE_EVAL/SUBSYS/CELL/PERIODIC")

        #=======================================================================
        # Single point force file name
        self.setup_force_file_name()

        #=======================================================================
        # Trajectory file name
        self.setup_trajectory_file_name()

        #=======================================================================
        # Trajectory file format
        self.cache_service["trajectory_format"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
        self.cache_service["traj_add_last"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/ADD_LAST")
        traj_unit = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/UNIT")
        pint_traj_unit = self.get_pint_unit_string(traj_unit)
        self.cache_service["trajectory_unit"] = pint_traj_unit

        #=======================================================================
        # Velocity file format
        self.cache_service["velocity_format"] = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/FORMAT")
        self.cache_service["vel_add_last"] = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/ADD_LAST")
        vel_unit = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/UNIT")
        pint_vel_unit = self.get_pint_unit_string(vel_unit)
        self.cache_service["velocity_unit"] = pint_vel_unit

        #=======================================================================
        # Stress tensor calculation method
        stress_tensor_method = self.input_tree.get_keyword("FORCE_EVAL/STRESS_TENSOR")
        if stress_tensor_method != "NONE":
            mapping = {
                "NUMERICAL": "Numerical",
                "ANALYTICAL": "Analytical",
                "DIAGONAL_ANALYTICAL": "Diagonal analytical",
                "DIAGONAL_NUMERICAL": "Diagonal numerical",
            }
            stress_tensor_method = mapping.get(stress_tensor_method)
            if stress_tensor_method is not None:
                self.backend.addValue("stress_tensor_method", stress_tensor_method)

    def normalize_x_cp2k_path(self, path):
        """The paths in CP2K input can be given in many ways. This function
        tries to normalize these forms into a valid path.
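
        A sketch of the three cases handled below (the project name being
        whatever GLOBAL/PROJECT_NAME resolves to):

            "=forces.xyz"  -> "forces.xyz"        (used exactly as given)
            "./forces.xyz" -> "./forces.xyz"      (relative, used as is)
            "forces"       -> "<PROJECT>-forces"  (project name prepended)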
        """
        # Path is exactly as given
        if path.startswith("="):
            normalized_path = path[1:]
        # Path is relative, no project name added
        elif re.match(r"./", path):
            normalized_path = path
        # Path is relative, project name added
        else:
            project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME")
            if path:
                normalized_path = "{}-{}".format(project_name, path)
            else:
                normalized_path = project_name
        return normalized_path

    def get_pint_unit_string(self, cp2k_unit_string):
        """Translate the CP2K unit definition into a valid pint unit.
        """
        units = re.split(r'[\^\-\+\*\d]+', cp2k_unit_string)
        for unit in units:
            if unit == "":
                continue
            pint_unit = self.unit_mapping.get(unit.upper())
            if pint_unit is None:
                return None
            cp2k_unit_string = cp2k_unit_string.replace(unit, pint_unit)
        return cp2k_unit_string

    def setup_force_file_name(self):
        """Setup the force file path.
        """
        force_file = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
        extension = "xyz"
        if force_file is not None and force_file != "__STD_OUT__":
            normalized_path = self.normalize_x_cp2k_path(force_file)
            final_path = "{}-1_0.{}".format(normalized_path, extension)
            self.file_service.set_file_id(final_path, "force_file_single_point")

    def setup_trajectory_file_name(self):
        """Setup the trajectory file path.
        """
        traj_format = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
        traj_filename = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FILENAME")
        self.cache_service["each_geo_opt"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/EACH/GEO_OPT")
        if traj_filename is None:
            traj_filename = ""
        extension_map = {
            "XYZ": "xyz",
            "XMOL": "xyz",
            "ATOMIC": "xyz",
            "PDB": "pdb",
            "DCD": "dcd",
        }
        extension = extension_map.get(traj_format)
        if extension is None:
            logger.error("Unknown file format '{}' for CP2K trajectory file ".format(traj_format))
            return
        normalized_path = self.normalize_x_cp2k_path(traj_filename)
        final_path = "{}-pos-1.{}".format(normalized_path, extension)
        self.file_service.set_file_id(final_path, "trajectory")

    def fill_input_tree(self, file_path):
        """Parses a CP2K input file into an object tree.

        Builds an object tree representation of the input, augmented with the
        default values and lone keyword values from the version-specific
        x_cp2k_input.xml file. Keyword aliases are also mapped to the same
        data.

        The cp2k input is largely case-insensitive. In the input tree we want
        only one standard way to name things, so all section names and section
        parameters will be transformed into upper case.

        To query the returned tree use the following functions:
            get_keyword("GLOBAL/PROJECT_NAME")
            get_parameter("GLOBAL/PRINT")
            get_default_keyword("FORCE_EVAL/SUBSYS/COORD")

        Args:
            file_path: Path to the CP2K input file. The lines that are
                actually parsed are taken from self.input_lines, i.e. the
                preprocessed contents of this file.

        Returns:
            None. The resulting tree is stored in self.input_tree.
        """

        self.setup_version(self.parser_context.version_id)
        section_stack = []
        self.input_tree.root_section.accessed = True

        for line in self.input_lines:

            # Remove comments and surrounding whitespace
            line = line.split('!', 1)[0].split('#', 1)[0].strip()

            # Skip empty lines
            if len(line) == 0:
                continue

            # Section ends
            if line.upper().startswith('&END'):
                section_stack.pop()
            # Section starts
            elif line[0] == '&':
                parts = line.split(' ', 1)
                name = parts[0][1:].upper()
                section_stack.append(name)

                # Form the path
                path = ""
                for index, item in enumerate(section_stack):
                    if index != 0:
                        path += '/'
                    path += item

                # Mark the section as accessed.
                self.input_tree.set_section_accessed(path)

                # Save the section parameters
                if len(parts) > 1:
                    self.input_tree.set_parameter(path, parts[1].strip().upper())

            # Ignore variables and includes that might still be here for some
            # reason
            elif line.upper().startswith('@'):
                continue

            # Contents (keywords, default keywords)
            else:
                split = line.split(None, 1)
                if len(split) <= 1:
                    keyword_value = ""
                else:
                    keyword_value = split[1]
                keyword_name = split[0].upper()
                self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value)

    def fill_metadata(self):
        """Goes through the input data and pushes everything to the
        backend.
        """
        name_stack = []
        self.fill_metadata_recursively(self.input_tree.root_section, name_stack)

    def fill_metadata_recursively(self, section, name_stack):
        """Recursively goes through the input sections and pushes everything to the
        backend.
        """
        if not section.accessed:
            return

        name_stack.append(section.name)
        path = "x_cp2k_section_{}".format(".".join(name_stack))
        not_section_path = "x_cp2k_{}".format(".".join(name_stack))

        gid = self.backend.openSection(path)

        # Keywords
        for default_name in section.default_keyword_names:
            keywords = section.keywords.get(default_name)
            for keyword in keywords:
                if keyword.value is not None:
                    name = "{}.{}".format(not_section_path, keyword.default_name)
                    self.backend.addValue(name, keyword.value)

        # Section parameter
        section_parameter = section.section_parameter
        if section_parameter is not None:
            name = "{}.SECTION_PARAMETERS".format(not_section_path)
            if section_parameter.value is not None:
                self.backend.addValue(name, section_parameter.value)

        # Default keyword
        default_keyword = section.default_keyword
        if default_keyword is not None:

            name = "{}.DEFAULT_KEYWORD".format(not_section_path)
            self.backend.addValue(name, default_keyword.value)

        # Subsections
        for name, subsections in section.sections.iteritems():
            for subsection in subsections:
                self.fill_metadata_recursively(subsection, name_stack)

        self.backend.closeSection(path, gid)

        name_stack.pop()

    def setup_version(self, version_number):
        """ The pickle file which contains preparsed data from the
426
        x_cp2k_input.xml is version specific. By calling this function before
427
428
429
430
431
        parsing the correct file can be found.
        """
        # NOTE: version_number is currently not used to select the file.
        pickle_path = os.path.join(os.path.dirname(__file__),
                                   "input_data/cp2k_input_tree.pickle")
        with open(pickle_path, 'rb') as input_tree_pickle_file:
            self.input_tree = pickle.load(input_tree_pickle_file)

    def preprocess_input(self):
        """Preprocess the input file. Concatenate .inc files into the main
        input file and explicitly state all variables.
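
        For example, the lines

            @SET CUTOFF 300
            CUTOFF @CUTOFF

        would be reduced to the single line "CUTOFF 300": the @SET line is
        dropped and the variable reference is replaced by its value.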
        """
        # Read the input file into memory. It shouldn't be that big so we can
        # do this easily
        input_lines = []
        with open(self.file_path, "r") as f:
            for line in f:
                input_lines.append(line.strip())

        # Merge include files to input
        extended_input = input_lines[:]  # Make a copy
        i_line = 0
        for line in input_lines:
            if line.startswith("@INCLUDE") or line.startswith("@include"):
                split = line.split(None, 1)
                includepath = split[1]
                basedir = os.path.dirname(self.file_path)
                filepath = os.path.join(basedir, includepath)
                filepath = os.path.abspath(filepath)
                if not os.path.isfile(filepath):
                    logger.warning("Could not find the include file '{}' stated in the CP2K input file. Continuing without it.".format(filepath))
                    i_line += 1
                    continue

                # Get the content from include file
                included_lines = []
                with open(filepath, "r") as includef:
                    for line in includef:
                        included_lines.append(line.strip())
                    del extended_input[i_line]
                    extended_input[i_line:i_line] = included_lines
                    # One @INCLUDE line became len(included_lines) lines, so
                    # the following lines shift by len(included_lines) - 1.
                    i_line += len(included_lines) - 1
            i_line += 1

        # Gather the variable definitions
        variables = {}
        input_set_removed = []
        for i_line, line in enumerate(extended_input):
            if line.startswith("@SET") or line.startswith("@set"):
                components = line.split(None, 2)
                name = components[1]
                value = components[2]
                variables[name] = value
                logger.debug("Variable '{}' found with value '{}'".format(name, value))
            else:
                input_set_removed.append(line)

        # Place the variables
        variable_pattern = r"\@\{(\w+)\}|@(\w+)"
        compiled = re.compile(variable_pattern)
        reserved = ("include", "set", "if", "endif")
        input_variables_replaced = []
        for line in input_set_removed:
            results = compiled.finditer(line)
            new_line = line
            offset = 0
            for result in results:
                options = result.groups()
                first = options[0]
                second = options[1]
                if first:
                    name = first
                elif second:
                    name = second
                if name.lower() in reserved:
                    continue
                value = variables.get(name)
                if not value:
                    logger.error("Value for variable '{}' not set.".format(name))
                    continue
                start = result.start()
                end = result.end()
                beginning = new_line[:offset+start]
                rest = new_line[offset+end:]
                new_line = beginning + value + rest
                # Track how much the line has grown or shrunk so far: the
                # whole match ("@NAME" or "@{NAME}") was replaced by value.
                offset += len(value) - (end - start)
            input_variables_replaced.append(new_line)

        self.input_lines = input_variables_replaced