Commit d07576e8 authored by Lauri Himanen's avatar Lauri Himanen
Browse files

Lots of new parsing stuff

parent cf26eeb2
import os
import json
import argparse

from cp2kparser.implementation.autoparser import parse_path


def _decode_json_list(raw, arg_name):
    """Decode a command line argument value as a JSON list.

    Args:
        raw: The raw string value of the argument (may be None or empty).
        arg_name: The argument name, used in the error message.

    Returns:
        The decoded list, or an empty list when the argument was not given.

    Raises:
        Exception: If the string cannot be decoded as JSON.
    """
    if not raw:
        return []
    try:
        return json.loads(raw)
    except ValueError:
        # json.loads raises ValueError (JSONDecodeError) on malformed input;
        # catching it specifically avoids hiding unrelated errors.
        raise Exception(
            "Could not decode the '{}' argument as a json list. You might "
            "need to surround the string with single quotes if it contains "
            "double quotes.".format(arg_name)
        )


parser = argparse.ArgumentParser(description='Parse a CP2K calculation from folder.')
parser.add_argument('-metaInfoToKeep', type=str, help='A json list containing the names of the metainfos to keep during parsing.')
parser.add_argument('-metaInfoToSkip', type=str, help='A json list containing the names of the metainfos to skip during parsing.')
args = parser.parse_args()

# Try to decode the optional JSON list arguments. Note: the error message for
# metaInfoToSkip previously referred to 'metaInfoToKeep'; fixed here.
metaInfoToKeep = _decode_json_list(args.metaInfoToKeep, 'metaInfoToKeep')
metaInfoToSkip = _decode_json_list(args.metaInfoToSkip, 'metaInfoToSkip')

# Parse the calculation found in the current working directory.
path = os.getcwd()
parse_path(path, metaInfoToKeep, metaInfoToSkip)
......@@ -29,8 +29,8 @@ class Parser(object):
Attributes:
See the ParserContext class for more details about the attributes.
_file_handles: A "private" dictionary containing the cached file handles
_file_handles: A "private" dictionary containing the cached file contents
_file_handles: A "private" dictionary containing the cached file sizes
_file_contents: A "private" dictionary containing the cached file contents
_file_sizes: A "private" dictionary containing the cached file sizes
file_ids: A dictionary containing the mapping between file ids and filepaths
"""
......@@ -67,6 +67,8 @@ class Parser(object):
fileToParse,
mainFileDescription,
metaInfoEnv,
metaInfoToKeep,
metaInfoToSkip,
backend,
parserInfo,
cachingLevelForMetaName={},
......@@ -80,7 +82,7 @@ class Parser(object):
Returns:
"""
# Initialize the parser builder
parserBuilder = SimpleParserBuilder(mainFileDescription, metaInfoEnv)
parserBuilder = SimpleParserBuilder(mainFileDescription, metaInfoEnv, metaInfoToKeep)
if logger.isEnabledFor(logging.DEBUG):
s = StringIO.StringIO()
s.write("matchers:")
......
......@@ -3,12 +3,13 @@ Tools for testing a nomad parser.
"""
import os
import sys
import json
import numpy as np
#===============================================================================
def get_parser(path, metainfopath, parserbuilderclass, stream):
def get_parser(path, metainfopath, parserbuilderclass, metainfo_to_keep=[], metainfo_to_skip=[], stream=sys.stdout):
"""Initialize a parser that is able to parse the contents in the given path.
Args:
......@@ -26,8 +27,8 @@ def get_parser(path, metainfopath, parserbuilderclass, stream):
json_input = {
"version": "nomadparsein.json 1.0",
"metaInfoFile": metainfopath,
"metainfoToKeep": [],
"metainfoToSkip": [],
"metainfoToKeep": metainfo_to_keep,
"metainfoToSkip": metainfo_to_skip,
"files": files
}
parser = parserbuilderclass(json.dumps(json_input), stream=stream).build_parser()
......@@ -54,7 +55,7 @@ def get_metainfo(metaname, json_list):
# Return value if present
values = event.get("value")
if values:
return values
yield values
# Return reshaped flatvalues if present
flat_values = event.get("flatValues")
......@@ -62,4 +63,4 @@ def get_metainfo(metaname, json_list):
if flat_values and shape:
shaped_values = np.reshape(flat_values, shape)
return shaped_values
yield shaped_values
......@@ -5,11 +5,11 @@ from cp2kparser.generics.testing import get_parser
#===============================================================================
def parse_path(path, metainfo_to_keep=None, metainfo_to_skip=None):
    """Generate a CP2K parser using the tools defined in testing.py and parse
    the contents in the given path.

    Args:
        path: Path to the folder containing the calculation to parse.
        metainfo_to_keep: Optional list of metainfo names to keep during
            parsing. Defaults to an empty list (keep everything).
        metainfo_to_skip: Optional list of metainfo names to skip during
            parsing. Defaults to an empty list.
    """
    # Avoid mutable default arguments: normalize None to fresh lists.
    metainfo_to_keep = [] if metainfo_to_keep is None else metainfo_to_keep
    metainfo_to_skip = [] if metainfo_to_skip is None else metainfo_to_skip
    parserbuilder = CP2KParserVersioner
    # The metainfo definitions are expected to live in a sibling checkout of
    # the nomad-meta-info repository relative to this file.
    metainfopath = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../../nomad-meta-info/meta_info/nomad_meta_info/cp2k.nomadmetainfo.json"))
    parser = get_parser(path, metainfopath, parserbuilder, metainfo_to_keep, metainfo_to_skip, sys.stdout)
    parser.parse()
......@@ -172,7 +172,7 @@ class CP2KImplementation262(Parser):
# Determine the presence of a trajectory file
traj_file = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FILENAME")
if traj_file is not None:
if traj_file is not None and traj_file != "__STD_OUT__":
file_format = self.input_tree.get_keyword("MOTION/PRINT/TRAJECTORY/FORMAT")
extension = {
"PDB": "pdb",
......@@ -275,9 +275,11 @@ class CP2KImplementation262(Parser):
outputfilename = self.get_file_handle("output").name
metainfoenv = self.metainfoenv
backend = self.backend
metainfo_to_keep = self.metainfo_to_keep
metainfo_to_skip = self.metainfo_to_skip
outputstructure = self.outputparser.outputstructure
cachingLevelForMetaName = self.outputparser.cachingLevelForMetaName
self.parse_file(outputfilename, outputstructure, metainfoenv, backend, parserInfo, cachingLevelForMetaName, superContext=self.outputparser)
self.parse_file(outputfilename, outputstructure, metainfoenv, metainfo_to_keep, metainfo_to_skip, backend, parserInfo, cachingLevelForMetaName, superContext=self.outputparser)
# Then extract the things that cannot be extracted by the SimpleMatcher
......@@ -438,6 +440,70 @@ class CP2KImplementation262(Parser):
# Return the iterator and unit
return (traj_iter, unit)
def get_functionals(self):
"""Search the CP2K input tree for an XC functional definition and map it
to libxc-style functional name(s).

NOTE(review): the name `result` is referenced below but never defined in
this excerpt — presumably a value-container object (it has a `.value`
attribute) created elsewhere or removed in this commit; confirm before
relying on this method.
"""
# First try to look at the shortcut
xc_shortcut = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT":
logger.debug("Shortcut defined for XC_FUNCTIONAL")
# If PBE, check version
if xc_shortcut == "PBE":
pbe_version = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE/PARAMETRIZATION")
# Map the CP2K PBE parametrization keyword to a libxc exchange name;
# unknown parametrizations fall back to plain GGA_X_PBE.
result.value = {
'ORIG': "GGA_X_PBE",
'PBESOL': "GGA_X_PBE_SOL",
'REVPBE': "GGA_X_PBE_R",
}.get(pbe_version, "GGA_X_PBE")
return result
# Map the remaining shortcut values; None marks shortcuts with no
# known libxc mapping.
result.value = {
'B3LYP': "HYB_GGA_XC_B3LYP",
'BEEFVDW': None,
'BLYP': "GGA_C_LYP_GGA_X_B88",
'BP': None,
'HCTH120': None,
'OLYP': None,
'LDA': "LDA_XC_TETER93",
'PADE': "LDA_XC_TETER93",
'PBE0': None,
'TPSS': None,
}.get(xc_shortcut, None)
return result
else:
logger.debug("No shortcut defined for XC_FUNCTIONAL. Looking into subsections.")
# Look at the subsections and determine what part have been activated
# Becke88
xc_components = []
becke_88 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
if becke_88 == "TRUE":
xc_components.append("GGA_X_B88")
# Becke 97
becke_97 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97")
if becke_97 == "TRUE":
becke_97_param = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION")
becke_97_result = {
'B97GRIMME': None,
'B97_GRIMME': None,
'ORIG': "GGA_XC_B97",
'WB97X-V': None,
}.get(becke_97_param, None)
if becke_97_result is not None:
xc_components.append(becke_97_result)
# Return an alphabetically sorted and joined list of the xc components
result.value = "_".join(sorted(xc_components))
return result
# #===============================================================================
# class CP2K_262_Implementation(CP2KImplementation):
# def __init__(self, parser):
# CP2KImplementation.__init__(self, parser)
# def get_cell(self):
# """The cell size can be static or dynamic if e.g. doing NPT. If the
# cell size changes, outputs an Nx3x3 array where N is typically the
......@@ -552,67 +618,3 @@ class CP2KImplementation262(Parser):
# # No cell found
# else:
# logger.error("Could not find cell declaration.")
def get_functionals(self):
"""Search the CP2K input tree for an XC functional definition and map it
to libxc-style functional name(s).

NOTE(review): an identical `get_functionals` definition appears earlier
in this file — likely diff residue from the scraped commit page; one of
the two copies should be removed. Also, the name `result` is referenced
below but never defined in this excerpt — presumably a value-container
object (it has a `.value` attribute); confirm before relying on this
method.
"""
# First try to look at the shortcut
xc_shortcut = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL")
if xc_shortcut is not None and xc_shortcut != "NONE" and xc_shortcut != "NO_SHORTCUT":
logger.debug("Shortcut defined for XC_FUNCTIONAL")
# If PBE, check version
if xc_shortcut == "PBE":
pbe_version = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/PBE/PARAMETRIZATION")
# Map the CP2K PBE parametrization keyword to a libxc exchange name;
# unknown parametrizations fall back to plain GGA_X_PBE.
result.value = {
'ORIG': "GGA_X_PBE",
'PBESOL': "GGA_X_PBE_SOL",
'REVPBE': "GGA_X_PBE_R",
}.get(pbe_version, "GGA_X_PBE")
return result
# Map the remaining shortcut values; None marks shortcuts with no
# known libxc mapping.
result.value = {
'B3LYP': "HYB_GGA_XC_B3LYP",
'BEEFVDW': None,
'BLYP': "GGA_C_LYP_GGA_X_B88",
'BP': None,
'HCTH120': None,
'OLYP': None,
'LDA': "LDA_XC_TETER93",
'PADE': "LDA_XC_TETER93",
'PBE0': None,
'TPSS': None,
}.get(xc_shortcut, None)
return result
else:
logger.debug("No shortcut defined for XC_FUNCTIONAL. Looking into subsections.")
# Look at the subsections and determine what part have been activated
# Becke88
xc_components = []
becke_88 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE88")
if becke_88 == "TRUE":
xc_components.append("GGA_X_B88")
# Becke 97
becke_97 = self.input_tree.get_parameter("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97")
if becke_97 == "TRUE":
becke_97_param = self.input_tree.get_keyword("FORCE_EVAL/DFT/XC/XC_FUNCTIONAL/BECKE97/PARAMETRIZATION")
becke_97_result = {
'B97GRIMME': None,
'B97_GRIMME': None,
'ORIG': "GGA_XC_B97",
'WB97X-V': None,
}.get(becke_97_param, None)
if becke_97_result is not None:
xc_components.append(becke_97_result)
# Return an alphabetically sorted and joined list of the xc components
result.value = "_".join(sorted(xc_components))
return result
# #===============================================================================
# class CP2K_262_Implementation(CP2KImplementation):
# def __init__(self, parser):
# CP2KImplementation.__init__(self, parser)
......@@ -3,6 +3,18 @@ from nomadcore.caching_backend import CachingLevel
import numpy as np
#===============================================================================
class MetaInfo(object):
    """Lightweight container describing a single metainfo definition.

    Attributes:
        name: Name of the metainfo.
        description: Human readable description string.
        dtypeStr: String code for the data type (defaults to "c").
        shape: Shape of the value as a list; an empty list means scalar.
        dependencies: List of metainfo names this definition depends on. A
            single non-list value is wrapped into a one-element list.
    """
    def __init__(self, name, description="", dtypeStr="c", shape=None, dependencies=None):
        # Use None sentinels instead of mutable [] defaults: a shared list
        # default would be silently reused across all instances.
        self.name = name
        self.description = description
        self.dtypeStr = dtypeStr
        self.shape = [] if shape is None else shape
        if dependencies is None:
            dependencies = []
        elif not isinstance(dependencies, list):
            # Allow callers to pass a single dependency without wrapping it.
            dependencies = [dependencies]
        self.dependencies = dependencies
#===============================================================================
class CP2KOutputParser262(object):
"""The object that goes through the CP2K output file and parses everything
......@@ -17,90 +29,125 @@ class CP2KOutputParser262(object):
self.cp2kparser = cp2kparser
self.metaInfoToKeep = metaInfoToKeep
self.metaInfoToSkip = metaInfoToSkip
self.f_regex = "-?\d+\.\d+(E+|-\d+)?"
# Define the output parsing tree for this version
self.outputstructure = SM(
name='root',
startReStr="",
subMatchers=[
SM(
name='new_run',
startReStr=r" DBCSR\| Multiplication driver",
endReStr="[.\*]+PROGRAM STOPPED IN",
required=True,
sections=['section_run'],
subMatchers=[
SM(
name="run_datetime",
startReStr=r"[\*\s]+PROGRAM STARTED AT\s+(?P<cp2k_run_start_date>\d{4}-\d{2}-\d{2}) (?P<cp2k_run_start_time>\d{2}:\d{2}:\d{2}.\d{3})",
),
SM(
name="version",
startReStr=r" CP2K\| version string:\s+(?P<program_version>[\w\d\W\s]+)",
),
SM(
name="svn_revision",
startReStr=r" CP2K\| source code revision number:\s+svn:(?P<cp2k_svn_revision>\d+)",
),
# System Description
SM(
name="system_description",
startReStr="",
sections=["cp2k_system_description"],
otherMetaInfo=["atom_number"],
dependencies={"atom_number": ["cp2k_atom_number"]},
subMatchers=[
SM(
name="cell",
startReStr=" CELL\|",
forwardMatch=True,
sections=["cp2k_section_cell"],
subMatchers=[
SM(
name="cell_a",
startReStr=" CELL\| Vector a \[angstrom\]:\s+(?P<cp2k_cell_vector_a>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
SM(
name="cell_b",
startReStr=" CELL\| Vector b \[angstrom\]:\s+(?P<cp2k_cell_vector_b>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
SM(
name="cell_c",
startReStr=" CELL\| Vector c \[angstrom\]:\s+(?P<cp2k_cell_vector_c>[\d\.]+\s+[\d\.]+\s+[\d\.]+)+"
),
]
),
# SM(
# name="positions",
# sections=["cp2k_section_atom_position"],
# startReStr="",
# ),
SM(
name="functionals",
startReStr=" FUNCTIONAL\|",
forwardMatch=True,
sections=["section_method", "cp2k_section_functionals"],
otherMetaInfo=["XC_functional"],
dependencies={"XC_functional": ["cp2k_functional_name"]},
subMatchers=[
SM(
name="functional",
repeats=True,
startReStr=" FUNCTIONAL\| (?P<cp2k_functional_name>[\w\d\W]+):"
)
]
),
SM(
name="numbers",
startReStr=" TOTAL NUMBERS AND MAXIMUM NUMBERS",
sections=["cp2k_section_numbers"],
subMatchers=[
SM(
name="number_of_atoms",
startReStr="\s+- Atoms:\s+(?P<cp2k_atom_number>\d+)"
),
SM(
name="number_of_shell_sets",
startReStr="\s+- Shell sets:\s+(?P<cp2k_shell_sets>\d+)"
)
]
)
]
),
# Molecular Dynamics
SM(
startReStr=" MD| Molecular Dynamics Protocol",
forwardMatch=True,
sections=["cp2k_section_md"],
subMatchers=[
SM(
repeats=True,
startReStr=" ENERGY| Total FORCE_EVAL",
sections=["cp2k_section_md_step"],
subMatchers=[
SM(
startReStr=" ATOMIC FORCES in \[a\.u\.\]",
sections=["cp2k_section_md_forces"],
subMatchers=[
SM(
startReStr="\s+\d+\s+\d+\s+[\w\W\d]+\s+(?P<cp2k_md_force_atom_string>{0}\s+{0}\s+{0})".format(self.f_regex),
sections=["cp2k_section_md_force_atom"],
repeats=True,
)
]
),
SM(
startReStr=" STEP NUMBER\s+=\s+(?P<cp2k_md_step_number>\d+)"
),
SM(
startReStr=" TIME \[fs\]\s+=\s+(?P<cp2k_md_step_time>\d+\.\d+)"
),
SM(
startReStr=" TEMPERATURE \[K\]\s+=\s+(?P<cp2k_md_temperature_instantaneous>{0})\s+(?P<cp2k_md_temperature_average>{0})".format(self.f_regex)
),
SM(
startReStr=" i =",
sections=["cp2k_section_md_coordinates"],
otherMetaInfo=["cp2k_md_coordinates"],
dependencies={"cp2k_md_coordinates": ["cp2k_md_coordinate_atom_string"]},
subMatchers=[
SM(
startReStr=" \w+\s+(?P<cp2k_md_coordinate_atom_string>{0}\s+{0}\s+{0})".format(self.f_regex),
endReStr="\n",
sections=["cp2k_section_md_coordinate_atom"],
repeats=True,
)
]
)
]
)
]
)
]
)
......@@ -119,6 +166,16 @@ class CP2KOutputParser262(object):
'cp2k_section_numbers': CachingLevel.Cache,
'cp2k_atom_number': CachingLevel.Cache,
'cp2k_shell_sets': CachingLevel.Cache,
'cp2k_section_md_coordinates': CachingLevel.Cache,
'cp2k_section_md_coordinate_atom': CachingLevel.Cache,
'cp2k_md_coordinate_atom_string': CachingLevel.Cache,
'cp2k_md_coordinate_atom_float': CachingLevel.Cache,
'cp2k_section_md_forces': CachingLevel.Cache,
'cp2k_section_md_force_atom': CachingLevel.Cache,
'cp2k_md_force_atom_string': CachingLevel.Cache,
'cp2k_md_force_atom_float': CachingLevel.Cache,
}
# The trigger functions
......@@ -129,6 +186,7 @@ class CP2KOutputParser262(object):
# Open the common system description section
backend.openSection("section_system_description")
# Get the cell information
cell = section["cp2k_section_cell"]
if cell:
cell = cell[0]
......@@ -148,7 +206,16 @@ class CP2KOutputParser262(object):
cell[1, :] = b_comp
cell[2, :] = c_comp
backend.addArrayValues("cell", cell)
backend.addArrayValues("cell", cell, unit="angstrom")
# Get the number of atoms
numbers = section["cp2k_section_numbers"]
if numbers:
numbers = numbers[0]
n_atoms = numbers["cp2k_atom_number"]
if n_atoms:
n_atoms = n_atoms[0]
backend.addValue("atom_number", n_atoms)
# Close the common system description section
backend.closeSection("section_system_description", 0)
......@@ -187,7 +254,36 @@ class CP2KOutputParser262(object):
positions, unit = self.cp2kparser.get_initial_atom_positions_and_unit()
backend.addArrayValues("atom_position", positions)
# def onClose_section_run(self, backend, gIndex, section):
# """At the end the parser is able to place the final cached values into
# the backend.
# """
def onClose_cp2k_section_md_coordinate_atom(self, backend, gIndex, section):
    """Given the string with the coordinate components for one atom, make it
    into a numpy array of coordinate components and store for later
    concatenation.
    """
    # Renamed local from `force_string` — this handler deals with
    # coordinates, not forces (copy-paste residue from the force handler).
    # The cached value is a list holding a single whitespace-separated
    # string of float components.
    coordinate_string = section["cp2k_md_coordinate_atom_string"][0]
    components = np.array([float(x) for x in coordinate_string.split()])
    backend.addArrayValues("cp2k_md_coordinate_atom_float", components)
def onClose_cp2k_section_md_coordinates(self, backend, gIndex, section):
    """When all the coordinates for individual atoms have been gathered,
    concatenate them into one big array and forward to the backend.
    """
    # Renamed local from `forces` — this handler aggregates coordinate
    # arrays, not forces (copy-paste residue from the force handler).
    coordinates = section["cp2k_md_coordinate_atom_float"]
    coordinates = np.array(coordinates)
    backend.addArrayValues("cp2k_md_coordinates", coordinates)
def onClose_cp2k_section_md_force_atom(self, backend, gIndex, section):
    """Convert the cached force-component string for a single atom into a
    numpy float array and push it to the backend for later concatenation.
    """
    # The cache holds a list with one whitespace-separated string of floats.
    raw = section["cp2k_md_force_atom_string"][0]
    values = np.array(list(map(float, raw.split())))
    backend.addArrayValues("cp2k_md_force_atom_float", values)
def onClose_cp2k_section_md_forces(self, backend, gIndex, section):
    """Concatenate the per-atom force arrays gathered so far into one numpy
    array and forward it to the backend in atomic units.
    """
    per_atom_forces = section["cp2k_md_force_atom_float"]
    stacked = np.array(per_atom_forces)
    backend.addArrayValues("cp2k_md_forces", stacked, unit="force_au")
# Version information for this restart file
# current date 2015-11-11 08:42:47.381
# current working dir /home/lauri/Dropbox/NoMaD Personal/gitlab/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# current date 2015-12-28 15:25:27.570
# current working dir /home/lauri/Dropbox/nomad-dev/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# Program compiled at ke 4.11.2015 08.48.42 +0200
# Program compiled on lauri-Lenovo-Z50-70
# Program compiled for Linux-x86-64-gfortran_basic
# Source code revision number svn:15893
&GLOBAL
PRINT_LEVEL LOW
PROJECT_NAME NaCl
RUN_TYPE MD
&END GLOBAL
......@@ -42,7 +41,7 @@
&AVERAGES T
&RESTART_AVERAGES
ITIMES_START 1
AVECPU 1.2543010792499901E-01
AVECPU 1.0667945061999486E-01
AVEHUGONIOT 0.0000000000000000E+00
AVETEMP_BARO 0.0000000000000000E+00
AVEPOT -1.5079368351817857E+01
......
# Version information for this restart file
# current date 2015-11-11 08:42:45.396
# current working dir /home/lauri/Dropbox/NoMaD Personal/gitlab/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# current date 2015-12-28 15:25:25.872
# current working dir /home/lauri/Dropbox/nomad-dev/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# Program compiled at ke 4.11.2015 08.48.42 +0200
# Program compiled on lauri-Lenovo-Z50-70
# Program compiled for Linux-x86-64-gfortran_basic
# Source code revision number svn:15893
&GLOBAL
PRINT_LEVEL LOW
PROJECT_NAME NaCl
RUN_TYPE MD
&END GLOBAL
......@@ -42,7 +41,7 @@
&AVERAGES T
&RESTART_AVERAGES
ITIMES_START 1
AVECPU 1.2833907072222245E-01
AVECPU 1.0910088984443667E-01
AVEHUGONIOT 0.0000000000000000E+00
AVETEMP_BARO 0.0000000000000000E+00
AVEPOT -1.5059100386767753E+01
......
# Version information for this restart file
# current date 2015-11-11 08:42:42.942
# current working dir /home/lauri/Dropbox/NoMaD Personal/gitlab/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# current date 2015-12-28 15:25:23.990
# current working dir /home/lauri/Dropbox/nomad-dev/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# Program compiled at ke 4.11.2015 08.48.42 +0200
# Program compiled on lauri-Lenovo-Z50-70
# Program compiled for Linux-x86-64-gfortran_basic
# Source code revision number svn:15893
&GLOBAL
PRINT_LEVEL LOW
PROJECT_NAME NaCl
RUN_TYPE MD
&END GLOBAL
......@@ -42,7 +41,7 @@
&AVERAGES T
&RESTART_AVERAGES
ITIMES_START 1
AVECPU 1.2904027982499996E-01
AVECPU 1.1097601980000033E-01
AVEHUGONIOT 0.0000000000000000E+00
AVETEMP_BARO 0.0000000000000000E+00
AVEPOT -1.5033491790569380E+01
......
# Version information for this restart file
# current date 2015-11-11 08:42:40.984
# current working dir /home/lauri/Dropbox/NoMaD Personal/gitlab/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# current date 2015-12-28 15:25:22.257
# current working dir /home/lauri/Dropbox/nomad-dev/parser-cp2k/cp2kparser/tests/cp2k_2.6.2/forces/outputfile/n
# Program compiled at ke 4.11.2015 08.48.42 +0200
# Program compiled on lauri-Lenovo-Z50-70
# Program compiled for Linux-x86-64-gfortran_basic
# Source code revision number svn:15893
&GLOBAL