inputparser.py
from future import standard_library
standard_library.install_aliases()
from builtins import object
import os
import re
import logging
import pickle
import numpy as np
from nomadcore.baseclasses import BasicParser
from cp2kparser.generic.inputparsing import *
logger = logging.getLogger("nomad")


#===============================================================================
class CP2KInputParser(BasicParser):
    """Used to parse out a CP2K input file.

    CP2K offers a complete description of its input structure as an XML file,
    which can be printed with the command 'cp2k --xml'. This XML file has been
    preparsed into a native Python object (the 'CP2KInput' class found in
    generic.inputparsing) and stored in a Python pickle file. Among other
    things it contains all the default values, which are needed whenever the
    user has not specified a setting explicitly in the input. The XML file is
    used as the source of the defaults because hard coding them in the parser
    itself would be cumbersome (there are lots of them), more error prone, and
    would have to be rechecked for every parser version.

    CP2K input files may include other input files (@INCLUDE) and define
    variables (@SET). Both are expanded during preprocessing, before the input
    tree is filled.
    """
    def __init__(self, file_path, parser_context):
        """
        Attributes:
            input_tree: The input structure for this version of CP2K. The
                structure is already present, in this module it will be filled with
                data found from the input file.
            input_lines: List of preprocessed lines in the input. Here all the
                variables have been stated explicitly and the additional input files have
                been merged.
        """
        super(CP2KInputParser, self).__init__(file_path, parser_context)
        self.input_tree = None
        self.input_lines = None
        self.unit_mapping = {
            # Distance
            "BOHR": "bohr",
            "M": "m",
            "PM": "pm",
            "NM": "nm",
            "ANGSTROM": "angstrom",
            # Time
            "S": "s",
            "FS": "fs",
            "PS": "ps",
            "AU_T": "(planckConstant/hartree)",
            "WAVENUMBER_T": None,
        }

        #=======================================================================
        # Cached values
        self.cache_service.add("configuration_periodic_dimensions", single=False, update=False)
        self.cache_service.add("trajectory_format")
        self.cache_service.add("trajectory_unit")
        self.cache_service.add("velocity_format")
        self.cache_service.add("velocity_unit")
        self.cache_service.add("vel_add_last")
        self.cache_service.add("each_geo_opt")
        self.cache_service.add("traj_add_last")
        # self.cache_service.add("electronic_structure_method")

    def parse(self):

        #=======================================================================
        # Preprocess to spell out variables and to include stuff from other
        # files
        self.preprocess_input()

        #=======================================================================
        # Gather the information from the input file
        self.fill_input_tree(self.file_path)

        #=======================================================================
        # Parse everything in the input to cp2k specific metadata
        self.fill_metadata()

        #=======================================================================
        # Parse the used XC_functionals and their parameters
        xc = self.input_tree.get_section("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
        if xc is not None:
            xc_list = []

            class XCFunctional(object):
                def __init__(self, name, weight=1, parameters=None):
                    self.name = name
                    self.weight = weight
                    self.parameters = parameters

            # First see if a functional has been specified in the section parameter
            section_parameter = xc.section_parameter.value
            if section_parameter is not None and section_parameter != "NO_SHORTCUT":

                if section_parameter == "BLYP":
                    xc_list.append(XCFunctional("GGA_X_B88"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "LDA" or section_parameter == "PADE":
                    xc_list.append(XCFunctional("LDA_XC_TETER93"))

                elif section_parameter == "PBE":
                    xc_list.append(XCFunctional("GGA_X_PBE"))
                    xc_list.append(XCFunctional("GGA_C_PBE"))

                elif section_parameter == "OLYP":
                    xc_list.append(XCFunctional("GGA_X_OPTX"))
                    xc_list.append(XCFunctional("GGA_C_LYP"))

                elif section_parameter == "HCTH120":
                    xc_list.append(XCFunctional("GGA_XC_HCTH_120"))

                elif section_parameter == "PBE0":
                    xc_list.append(XCFunctional("HYB_GGA_XC_PBEH"))

                elif section_parameter == "B3LYP":
                    xc_list.append(XCFunctional("HYB_GGA_XC_B3LYP"))

                elif section_parameter == "TPSS":
                    xc_list.append(XCFunctional("MGGA_X_TPSS"))
                    xc_list.append(XCFunctional("MGGA_C_TPSS"))

                else:
                    logger.warning("Unknown XC functional given in XC_FUNCTIONAL section parameter.")

            # Otherwise one has to look at the individual functional settings
            else:
                pbe = xc.get_subsection("PBE")
                if pbe is not None:
                    if pbe.accessed:
                        sp = pbe.get_section_parameter()
                        if sp == "T":
                            parametrization = pbe.get_keyword("PARAMETRIZATION", allow_default=True)
                            scale_x = pbe.get_keyword("SCALE_X", allow_default=True)
                            scale_c = pbe.get_keyword("SCALE_C", allow_default=True)
                            if parametrization == "ORIG":
                                xc_list.append(XCFunctional("GGA_X_PBE", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                            elif parametrization == "PBESOL":
                                xc_list.append(XCFunctional("GGA_X_PBE_SOL", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE_SOL", scale_c))
                            elif parametrization == "REVPBE":
                                xc_list.append(XCFunctional("GGA_X_PBE_R", scale_x))
                                xc_list.append(XCFunctional("GGA_C_PBE", scale_c))
                tpss = xc.get_subsection("TPSS")
                if tpss is not None:
                    if tpss.accessed:
                        sp = tpss.get_section_parameter()
                        if sp == "T":
                            scale_x = tpss.get_keyword("SCALE_X", allow_default=True)
                            scale_c = tpss.get_keyword("SCALE_C", allow_default=True)
                            xc_list.append(XCFunctional("MGGA_X_TPSS", scale_x))
                            xc_list.append(XCFunctional("MGGA_C_TPSS", scale_c))

            # Sort the functionals alphabetically by name
            xc_list.sort(key=lambda x: x.name)
            xc_summary = ""

            # For every defined functional, stream the information to the
            # backend and construct the summary string
            for i, functional in enumerate(xc_list):

                gId = self.backend.openSection("section_XC_functionals")
                self.backend.addValue("XC_functional_name", functional.name)
                self.backend.addValue("XC_functional_weight", functional.weight)
                if functional.parameters is not None:
                    pass
                self.backend.closeSection("section_XC_functionals", gId)

                if i != 0:
                    xc_summary += "+"
                xc_summary += "{}*{}".format(functional.weight, functional.name)
                if functional.parameters is not None:
                    xc_summary += ":{}".format(functional.parameters)

            # Stream summary
            if xc_summary != "":
                self.backend.addValue("XC_functional", xc_summary)

        #=======================================================================
        # Cell periodicity
        periodicity = self.input_tree.get_keyword("FORCE_EVAL/SUBSYS/CELL/PERIODIC")
        if periodicity is not None:
            periodicity = periodicity.upper()
            periodicity_list = ("X" in periodicity, "Y" in periodicity, "Z" in periodicity)
            self.cache_service["configuration_periodic_dimensions"] = np.asarray(periodicity_list)
        else:
            logger.warning("Could not determine cell periodicity from FORCE_EVAL/SUBSYS/CELL/PERIODIC")

        #=======================================================================
        # Single point force file name
        self.setup_force_file_name()

        #=======================================================================
        # Trajectory file name
        self.setup_trajectory_file_name()

        #=======================================================================
        # Trajectory file format
        self.cache_service["trajectory_format"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
        self.cache_service["traj_add_last"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/ADD_LAST")
        traj_unit = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/UNIT")
        pint_traj_unit = self.get_pint_unit_string(traj_unit)
        self.cache_service["trajectory_unit"] = pint_traj_unit

        #=======================================================================
        # Velocity file format
        self.cache_service["velocity_format"] = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/FORMAT")
        self.cache_service["vel_add_last"] = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/ADD_LAST")
        vel_unit = self.input_tree.get_keyword("MOTION/PRINT/VELOCITIES/UNIT")
        pint_vel_unit = self.get_pint_unit_string(vel_unit)
        self.cache_service["velocity_unit"] = pint_vel_unit

        #=======================================================================
        # See if some more exotic calculation is requested (e.g. MP2, DFT+U, GW, RPA)

        # Search for a WF_CORRELATION section
        # correlation = self.input_tree.get_section("FORCE_EVAL/DFT/XC/WF_CORRELATION")
        # method = "DFT"
        # if correlation.accessed:
            # method = correlation.get_keyword_value_raw("METHOD")
            # if method != "NONE":
                # # Can't really decide which method used (MP2, RPA, GW)
                # method = None

        # # Search for DFT+U settings
        # kinds = self.input_tree.get_section_list("FORCE_EVAL/SUBSYS/KIND")
        # for kind in kinds:
            # dft_u = kind.get_subsection("DFT_PLUS_U")
            # if dft_u.accessed:
                # method = "DFT+U"

        # self.cache_service["electronic_structure_method"] = method

        #=======================================================================
        # Stress tensor calculation method
        stress_tensor_method = self.input_tree.get_keyword("FORCE_EVAL/STRESS_TENSOR")
        if stress_tensor_method != "NONE":
            mapping = {
                "NUMERICAL": "Numerical",
                "ANALYTICAL": "Analytical",
                "DIAGONAL_ANALYTICAL": "Diagonal analytical",
                "DIAGONAL_NUMERICAL": "Diagonal numerical",
            }
            stress_tensor_method = mapping.get(stress_tensor_method)
            if stress_tensor_method is not None:
                self.backend.addValue("stress_tensor_method", stress_tensor_method)

    def normalize_x_cp2k_path(self, path):
        """The paths in CP2K input can be given in many ways. This function
        tries to normalize these forms into a valid path.
        """
        # Path is exactly as given
        if path.startswith("="):
            normalized_path = path[1:]
        # Path is relative, no project name added
        elif re.match(r"./", path):
            normalized_path = path
        # Path is relative, project name added
        else:
            project_name = self.input_tree.get_keyword("GLOBAL/PROJECT_NAME")
            if path:
                normalized_path = "{}-{}".format(project_name, path)
            else:
                normalized_path = project_name
        return normalized_path

    def get_pint_unit_string(self, cp2k_unit_string):
        """Translate the CP2K unit definition into a valid pint unit.
        """
        units = re.split(r'[\^\-\+\*\d]+', cp2k_unit_string)
        for unit in units:
            if unit == "":
                continue
            pint_unit = self.unit_mapping.get(unit.upper())
            if pint_unit is None:
                return None
            cp2k_unit_string = cp2k_unit_string.replace(unit, pint_unit)
        return cp2k_unit_string

    def setup_force_file_name(self):
        """Setup the force file path.
        """
        force_file = self.input_tree.get_keyword("FORCE_EVAL/PRINT/FORCES/FILENAME")
        extension = "xyz"
        if force_file is not None and force_file != "__STD_OUT__":
            normalized_path = self.normalize_x_cp2k_path(force_file)
            final_path = "{}-1_0.{}".format(normalized_path, extension)
            self.file_service.set_file_id(final_path, "force_file_single_point")

    def setup_trajectory_file_name(self):
        """Setup the trajectory file path.
        """
        traj_format = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
        traj_filename = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FILENAME")
        self.cache_service["each_geo_opt"] = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/EACH/GEO_OPT")
        if traj_filename is None:
            traj_filename = ""
        extension_map = {
            "XYZ": "xyz",
            "XMOL": "xyz",
            "ATOMIC": "xyz",
            "PDB": "pdb",
            "DCD": "dcd",
        }
        extension = extension_map.get(traj_format)
        if extension is None:
            logger.error("Unknown file format '{}' for CP2K trajectory file ".format(traj_format))
            return
        normalized_path = self.normalize_x_cp2k_path(traj_filename)
        final_path = "{}-pos-1.{}".format(normalized_path, extension)
        self.file_service.set_file_id(final_path, "trajectory")

    def fill_input_tree(self, file_path):
        """Parses a CP2K input file into an object tree.

        Builds an object tree representation of the input, augmented with the
        default values and lone keyword values from the x_cp2k_input.xml file,
        which is version specific. Keyword aliases are also mapped to the same
        data.

        The CP2K input is largely case-insensitive. In the input tree we want
        only one standard way to name things, so all section names and section
        parameters are transformed into upper case.

        To query the returned tree use the following functions:
            get_keyword("GLOBAL/PROJECT_NAME")
            get_parameter("GLOBAL/PRINT")
            get_default_keyword("FORCE_EVAL/SUBSYS/COORD")

        Args:
            file_path: Path to the CP2K input file. The preprocessed lines in
                self.input_lines are what is actually parsed.

        Returns:
            None. The resulting tree is stored in self.input_tree.
        """

        self.setup_version(self.parser_context.version_id)
        section_stack = []
        self.input_tree.root_section.accessed = True

        for line in self.input_lines:

            # Remove comments and whitespaces
            line = line.split('!', 1)[0].split('#', 1)[0].strip()

            # Skip empty lines
            if len(line) == 0:
                continue

            # Section ends
            if line.upper().startswith('&END'):
                section_stack.pop()
            # Section starts
            elif line[0] == '&':
                parts = line.split(' ', 1)
                name = parts[0][1:].upper()
                section_stack.append(name)

                # Form the path
                path = ""
                for index, item in enumerate(section_stack):
                    if index != 0:
                        path += '/'
                    path += item

                # Mark the section as accessed.
                self.input_tree.set_section_accessed(path)

                # Save the section parameters
                if len(parts) > 1:
                    self.input_tree.set_parameter(path, parts[1].strip().upper())

            # Ignore variables and includes that might still be here for some
            # reason
            elif line.upper().startswith('@'):
                continue

            # Contents (keywords, default keywords)
            else:
                split = line.split(None, 1)
                if len(split) <= 1:
                    keyword_value = ""
                else:
                    keyword_value = split[1]
                keyword_name = split[0].upper()
                self.input_tree.set_keyword(path + "/" + keyword_name, keyword_value)
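
    # Illustration (comments only), assuming the input tree API behaves as the
    # docstring above describes: an input fragment such as
    #
    #   &GLOBAL
    #     PROJECT_NAME water
    #   &END GLOBAL
    #
    # marks the GLOBAL section as accessed and stores the keyword so that
    # self.input_tree.get_keyword("GLOBAL/PROJECT_NAME") returns "water".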

    def fill_metadata(self):
        """Goes through the input data and pushes everything to the
        backend.
        """
        name_stack = []
        self.fill_metadata_recursively(self.input_tree.root_section, name_stack)

    def fill_metadata_recursively(self, section, name_stack):
        """Recursively goes through the input sections and pushes everything to the
        backend.
        """
        if not section.accessed:
            return

        name_stack.append(section.name)
        path = "x_cp2k_section_{}".format(".".join(name_stack))
        not_section_path = "x_cp2k_{}".format(".".join(name_stack))

        gid = self.backend.openSection(path)

        # Keywords
        for default_name in section.default_keyword_names:
            keywords = section.keywords.get(default_name)
            for keyword in keywords:
                if keyword.value is not None:
                    name = "{}.{}".format(not_section_path, keyword.default_name)
                    self.backend.addValue(name, keyword.value)

        # Section parameter
        section_parameter = section.section_parameter
        if section_parameter is not None:
            name = "{}.SECTION_PARAMETERS".format(not_section_path)
            if section_parameter.value is not None:
                self.backend.addValue(name, section_parameter.value)

        # Default keyword
        default_keyword = section.default_keyword
        if default_keyword is not None:

            name = "{}.DEFAULT_KEYWORD".format(not_section_path)
            self.backend.addValue(name, default_keyword.value)

        # Subsections
        for name, subsections in section.sections.items():
            for subsection in subsections:
                self.fill_metadata_recursively(subsection, name_stack)

        self.backend.closeSection(path, gid)

        name_stack.pop()

    def setup_version(self, version_number):
        """ The pickle file which contains preparsed data from the
        x_cp2k_input.xml is version specific. By calling this function before
        parsing, the correct file can be found.
        """
        pickle_path = os.path.dirname(__file__) + "/input_data/cp2k_input_tree.pickle".format(version_number)
        with open(pickle_path, 'rb') as input_tree_pickle_file:
            self.input_tree = pickle.load(input_tree_pickle_file)

    def preprocess_input(self):
        """Preprocess the input file. Concatenate .inc files into the main
        input file and explicitly state all variables.
        """
        # Read the input file into memory. It shouldn't be that big so we can
        # do this easily
        input_lines = []
        with open(self.file_path, "r") as f:
            for line in f:
                input_lines.append(line.strip())

        # Merge include files to input
        extended_input = input_lines[:]  # Make a copy
        i_line = 0
        for line in input_lines:
            if line.startswith("@INCLUDE") or line.startswith("@include"):
                split = line.split(None, 1)
                includepath = split[1]
                basedir = os.path.dirname(self.file_path)
                filepath = os.path.join(basedir, includepath)
                filepath = os.path.abspath(filepath)
                if not os.path.isfile(filepath):
                    logger.warning("Could not find the include file '{}' stated in the CP2K input file. Continuing without it.".format(filepath))
                    i_line += 1
                    continue

                # Get the content from the include file
                included_lines = []
                with open(filepath, "r") as includef:
                    for included_line in includef:
                        included_lines.append(included_line.strip())

                # Replace the @INCLUDE line with the included content and keep
                # the running index pointing at the next original line.
                del extended_input[i_line]
                extended_input[i_line:i_line] = included_lines
                i_line += len(included_lines) - 1
            i_line += 1

        # Gather the variable definitions
        variables = {}
        input_set_removed = []
        for i_line, line in enumerate(extended_input):
            if line.startswith("@SET") or line.startswith("@set"):
                components = line.split(None, 2)
                name = components[1]
                value = components[2]
                variables[name] = value
                logger.debug("Variable '{}' found with value '{}'".format(name, value))
            else:
                input_set_removed.append(line)

        # Place the variables
        variable_pattern = r"\@\{(\w+)\}|@(\w+)"
        compiled = re.compile(variable_pattern)
        reserved = ("include", "set", "if", "endif")
        input_variables_replaced = []
        for line in input_set_removed:
            results = compiled.finditer(line)
            new_line = line
            offset = 0
            for result in results:
                options = result.groups()
                first = options[0]
                second = options[1]
                if first:
                    name = first
                elif second:
                    name = second
                if name.lower() in reserved:
                    continue
                value = variables.get(name)
                if not value:
                    logger.error("Value for variable '{}' not set.".format(name))
                    continue
                start = result.start()
                end = result.end()
                beginning = new_line[:offset+start]
                rest = new_line[offset+end:]
                new_line = beginning + value + rest
                # Keep the offset in sync with the length difference between
                # the replacement value and the matched "@VAR" or "@{VAR}" token.
                offset += len(value) - (end - start)
            input_variables_replaced.append(new_line)

        self.input_lines = input_variables_replaced
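
    # Illustration (comments only) of the preprocessing above, with a made-up
    # include file 'subsys.inc' and variable 'BS'. A main input containing
    #
    #   @SET BS DZVP-GTH
    #   @INCLUDE subsys.inc
    #   BASIS_SET @{BS}
    #
    # is expanded so that the @INCLUDE line is replaced by the contents of
    # subsys.inc, the @SET line is dropped, and the last line becomes
    # "BASIS_SET DZVP-GTH".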