Something went wrong on our end
-
Ask Hjorth Larsen authoredAsk Hjorth Larsen authored
main.py 26.05 KiB
# Copyright 2016-2018 The NOMAD Developers Group
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Main author and maintainer: Ask Hjorth Larsen <asklarsen@gmail.com>
from __future__ import print_function
import os
import sys
import setup_paths
import re
import numpy as np
from ase import Atoms
from ase.spacegroup import crystal
#from ase.data import chemical_symbols
from nomadcore.simple_parser import mainFunction, SimpleMatcher as SM
from nomadcore.local_meta_info import loadJsonFile, InfoKindEl
from nomadcore.unit_conversion.unit_conversion \
import register_userdefined_quantity, convert_unit
from functionals import functionals
metaInfoPath = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)),"../../../../nomad-meta-info/meta_info/nomad_meta_info/molcas.nomadmetainfo.json"))
metaInfoEnv, warnings = loadJsonFile(filePath=metaInfoPath,
dependencyLoader=None,
extraArgsHandling=InfoKindEl.ADD_EXTRA_ARGS,
uri=None)
parser_info = {'name': 'molcas-parser', 'version': '1.0'}
# OK
# "program_basis_set_type",
# "program_name",
# "program_version",
# "configuration_periodic_dimensions"
# "simulation_cell", # molcas never has a cell
# "atom_labels",
# "atom_positions",
# "energy_total",
# 'section_system',
# 'section_method',
# 'section_frame_sequence',
# 'section_sampling_method',
# 'single_configuration_to_calculation_method_ref',
# 'single_configuration_calculation_to_system_ref',
# 'atom_forces_raw',
# 'frame_sequence_local_frames_ref',
# 'frame_sequence_to_sampling_ref',
# 'XC_functional_name',
# 'smearing_kind',
# 'smearing_width'
# 'eigenvalues_kpoints',
# 'eigenvalues_values',
# 'eigenvalues_occupation',
# 'band_k_points',
# 'band_energies',
# 'band_segm_start_end',
# 'band_segm_labels',
# 'dos_energies',
# 'dos_values',
# 'section_XC_functionals',
class MolcasContext(object):
def __init__(self):
self.data = {}
self.last_line = None
self.current_module_name = None
self.section_refs = {}
self.basis_info = []
self.buffered_lines = {}
def onOpen_section_method(self, backend, gindex, section):
self.section_refs['method'] = gindex
def onOpen_section_system(self, backend, gindex, section):
self.section_refs['system'] = gindex
def startedParsing(self, fname, parser):
pass
def adhoc_store_line(self, parser):
self.last_line = parser.fIn.readline()
def adhoc_pushback_last(self, parser):
assert self.last_line is not None
parser.fIn.pushbackLine(self.last_line)
self.last_line = None
def onClose_section_single_configuration_calculation(self, backend, gindex, section):
backend.addValue('single_configuration_to_calculation_method_ref',
self.section_refs['method'])
backend.addValue('single_configuration_calculation_to_system_ref',
self.section_refs['system'])
#print(self.buffered_lines.keys())
def lines2arrays(name):
elines = self.buffered_lines.pop('energy_%s' % name)
olines = self.buffered_lines.pop('occupations_%s' % name)
energies = []
occupations = []
assert len(elines) == len(olines)
for l1, l2 in zip(elines, olines):
energies += l1.split()[1:] # Energy <numbers>
occupations += l2.split()[2:] # Occ. No. <numbers>
assert len(energies) == len(occupations)
energies = np.array(energies).astype(float)
energies = convert_unit(energies, 'hartree')
occupations = np.array(occupations).astype(float)
# Sort by energy:
args = np.argsort(energies)
energies = energies[args]
occupations = occupations[args]
return energies.reshape(1, 1, -1), occupations.reshape(1, 1, -1)
energies = None
try:
energies, occupations = lines2arrays('nospin')
nspins = 1
nstates = energies.shape[-1]
except KeyError:
pass
try:
ealpha, oalpha = lines2arrays('alpha')
ebeta, obeta = lines2arrays('beta')
except KeyError:
pass
else:
# The spinup and spindown (alpha, beta)
# arrays can have different lengths
# so we must allocate arrays that are big enough
# and fill out with zeros or something else
assert energies is None
nspins = 2
nstates = max(ealpha.shape[-1], ebeta.shape[-1])
energies = np.empty((nspins, 1, nstates))
# We will only fill out the values that we have.
# The rest will be nan:
energies.fill(np.nan)
occupations = np.zeros((nspins, 1, nstates))
energies[0, :, :ealpha.shape[2]] = ealpha[0]
occupations[0, :, :ealpha.shape[2]] = oalpha[0]
energies[1, 0, :ebeta.shape[2]] = ebeta[0]
occupations[1, 0, :ebeta.shape[2]] = obeta[0]
if energies is not None:
for arr in [energies, occupations]:
assert arr.shape == (nspins, 1, nstates)
g = backend.openSection('section_eigenvalues')
backend.addArrayValues('eigenvalues_values', energies)
backend.addArrayValues('eigenvalues_occupation', occupations)
backend.closeSection('section_eigenvalues', g)
def onClose_section_method(self, backend, gindex, section):
for line in self.basis_info:
m = re.match('\s*Basis set label:(.*?)\.(.*?)\s*$', line)
atomlabel, name = m.group(1, 2)
# The terrible syntax in molcas has a lot of dots everywhere.
# Replace sequences of dots with single dots to make things
# reasonable.
name = re.sub(r'\.+', '.', name).strip('.')
g = backend.openSection('x_molcas_section_basis')
backend.addValue('x_molcas_basis_atom_label', atomlabel)
backend.addValue('x_molcas_basis_name', name)
backend.closeSection('x_molcas_section_basis', g)
del self.basis_info[:]
#print('SECTION', section)
#methods = section['x_molcas_method']
xc = []
methods = section['x_molcas_method_name']
assert len(methods) == 1, methods
method = methods[0]
uhf = method.startswith('UHF ')
if uhf:
method = method[4:].strip()
backend.addValue('x_molcas_uhf', uhf)
if method == 'SCF':
# XXX distinguish somehow between these two?
esm = 'DFT'
xc = ['HF_X']
elif method in ['RASSCF', 'RAS-CI', 'CASPT2', 'CCSDT']:
esm = method
elif method in ['CASVB', 'MRCI']:
esm = 'MRCISD'
elif method == 'MCPF':
esm = method ## Not defined by nomad
elif method == 'MBPT2':
esm = 'MP2'
elif method in functionals:
esm = 'DFT'
xc = functionals[method]
assert xc is not None, 'no xc: %s' % method
else:
raise ValueError('method: %s' % method)
backend.addValue('electronic_structure_method', esm)
for xcfunc in xc:
g = backend.openSection('section_XC_functionals')
backend.addValue('XC_functional_name', xcfunc)
backend.closeSection('section_XC_functionals', g)
if 0:
if methods is not None:
assert len(methods) == 1
method = methods[0]
scfnames = section['x_molcas_method_name']
assert len(scfnames) == 1, scfnames
scfname = scfnames[0]
if scfname == 'SCF':
backend.addValue('electronic_structure_method', 'DFT')
g = backend.openSection('section_XC_functionals')
backend.addValue('XC_functional_name', 'HF_X')
backend.closeSection('section_XC_functionals', g)
#print(self.current_module_name)
#scftype = section['x_molcas_method_name']
#if scftype is not None:
# print('SCF', scftype)
def onClose_section_system(self, backend, gindex, section):
matrix = self.data.pop('coordinates')
assert matrix.shape[1] == 4
atom_labels = matrix[:, 0]
coords = matrix[:, 1:4].astype(float)
assert coords.shape[1] == 3
coords = convert_unit(coords, 'bohr')
backend.addArrayValues('atom_labels', atom_labels)
backend.addArrayValues('atom_positions', coords)
backend.addArrayValues('configuration_periodic_dimensions', np.zeros(3, bool))
def adhoc_add_basis_set(self, parser):
line = parser.fIn.readline()
self.basis_info.append(line)
def adhoc_add_energies(self, parser):
line = parser.fIn.readline()
self.energies.append(line)
def adhoc_add_occupations(self, parser):
line = parser.fIn.readline()
self.occupations.append(line)
# multi_sm copied from Siesta/GULP
def multi_sm(self, name, startpattern, linepattern, endmatcher=None,
conflict='fail', # 'fail', 'keep', 'overwrite'
*args, **kwargs):
pat = re.compile(linepattern) # XXX how to get compiled pattern?
ngroups = pat.groups
allgroups = []
def addline(parser):
line = parser.fIn.readline()
match = pat.match(line)
assert match is not None
thislinegroups = match.groups()
assert len(thislinegroups) == ngroups
allgroups.append(thislinegroups)
def savearray(parser):
arr = np.array(allgroups, dtype=object)
del allgroups[:]
if name in self.data:
if conflict == 'fail':
raise ValueError('grrr %s %s' % (name, self.data[name]))
elif conflict == 'keep':
return # Do not save array
elif conflict == 'overwrite':
pass
else:
raise ValueError('Unknown keyword %s' % conflict)
#if arr.size > 0:
if arr.size == 0:
arr.shape = (0, ngroups)
self.data[name] = arr
if endmatcher is None:
endmatcher = r'.*'
if hasattr(endmatcher, 'swapcase'):
endmatcher = SM(endmatcher,
endReStr='',
forwardMatch=True,
name='%s-end' % name,
adHoc=savearray)
sm = SM(startpattern,
name=name,
subMatchers=[
SM(linepattern,
name='%s-line' % name,
repeats=True,
forwardMatch=True,
required=True,
adHoc=addline),
endmatcher,
], **kwargs)
return sm
context = MolcasContext()
#def get_inputfile_echo_sm():
# m = SM(r'\+\+\s*----+\s*Input file\s*---+',
# endReStr='--\s*-------+',
# sections=['section_method'],
# name='inputfile')
# return m
def get_system_sm():
m = SM(r'\s*Molecular structure info:',
name='structure',
sections=['section_system'],
subMatchers=[
SM(r'\s*--------+', name='bar'),
SM(r'.*?Cartesian Coordinates / Bohr, Angstrom\s*\*+',
name='coords',
subMatchers=[
context.multi_sm('coordinates',
r'\s*Center\s*Label\s*x\s*y\s*z\s*x\s*y\s*z',
r'\s*\d+\s*(\S+)\s*(\S+)\s*(\S+)\s*(\S+)')
])
])
return m
def get_finalresults_sm(): # Other modes than SCF/KS-DFT?
m = SM(r'\s*\*\s*SCF/KS-DFT Program, Final results',
endReStr='\s*Molecular Properties', # No clue when to really stop matching this
sections=['section_single_configuration_calculation'],
name='results',
subMatchers=[
SM(r'\s*Total SCF energy\s*(?P<energy_total__hartree>\S+)', name='etot')
])
return m
def get_header_sm():
m = SM(r'[\s^]*M O L C A S',
endReStr=r'\s*For the author list and the recommended.*',
name='header',
subMatchers=[
SM(r'[\s^]*version\s*(?P<program_version>.*?)\s*$',
name='version')
])
return m
# Does not include 'auto'
molcas_modules = ('alaska|caspt2|casvb|ccsdt|cpf|expbas|ffpt|gateway|'
'genano|grid_it|guessorb|guga|last_energy|loprop|mbpt2|'
'mckinley|mclr|motra|mrci|numerical_gradient|rasscf|'
'rassi|scf|seward|slapaf|vibrot').split('|')
ignore_modules = {'alaska', # OK
'numerical_gradient', # OK
'rassi', # OK
'motra',
'guga',
'ffpt',
'genano',
'grid_it',
'loprop',
'guessorb',
'expbas',
'vibrot',
'mckinley'
#'last_energy',
}
def get_anymodule_sms():
sms = []
for modulename in molcas_modules:
adhoc = None
if modulename not in ignore_modules:
class AdHoc:
def __init__(self, name):
self.name = name
def __call__(self, parser):
raise ValueError('module: %s' % self.name)
adhoc = AdHoc(modulename)
sms.append(SM(r'--- Start Module:\s*%s' % modulename,
adHoc=adhoc,
name=modulename))
return sms
#def module_sm(name, *args, **kwargs):
# m = SM(r'--- Start Module:\s*(%s)' % name, *args, name=name, **kwargs)
# return m
# mw = "module-wrapper" for SM
# The idea is to guarantee that matchers modules that do not always write a proper header (e.g. SLAPAF)
# are invoked when the module starts, but that the matcher's sections are only opened if recognizable
# output is written (sometimes they write nothing).
#
# Typically, pass a matcher which opens some sections but will only match actual output of the module
# This wrapper will call the wrapped matcher when the module is started, but the wrapped matcher
# will only match if the output is actually there, and also will only open its sections in that case.
def mw(modname, *args, **kwargs):
m = SM(r'--- Start Module:\s*%s' % modname,
name=modname,
subMatchers=[
SM(*args, **kwargs)
])
return m
def mod_pattern(lower, upper=None):
if upper is None:
upper = lower.upper()
return (r'---\s*Start Module:\s*{0}|'
r'\s*MOLCAS executing module\s*{1}'.format(lower, upper))
# Define these separately or not?
def startmod_pattern(name):
return r'---\s*Start Module:\s*{}'.format(name)
def execmod_pattern(name):
return r'\s*MOLCAS executing module\s*{}'.format(name)
def gateway_seward():
m = SM(mod_pattern(r'(:?gateway|seward)'),
#weak=True,
name='gate/seward',
#r'\s*MOLCAS executing module (GATEWAY|SEWARD)', name='seward',
subMatchers=[
SM(r'\s*Basis set label:(.*?)\s*$',
forwardMatch=True,
repeats=True,
adHoc=context.adhoc_add_basis_set,
name='basis'),
get_system_sm()
])
return m
def eigenvalue_line_matcher(pattern, spinname):
class AdHocLineAccumulator:
def __init__(self, var):
self.var = var
def __call__(self, parser):
line = parser.fIn.readline()
context.buffered_lines.setdefault(self.var, []).append(line)
#print(context.buffered_lines)
#sdfkjsdfkj
m = SM(pattern, name=spinname,
subMatchers=[
SM(r'\s*Orbital',
repeats=True,
subMatchers=[
SM(r'\s*Energy\s*(.*)', name='energies', adHoc=AdHocLineAccumulator('energy_%s' % spinname),
forwardMatch=True),
SM(r'\s*Occ\. No\.\s*(.*)', name='occ', adHoc=AdHocLineAccumulator('occupations_%s' % spinname),
forwardMatch=True)
])
])
return m
def scf_rasscf():
m = SM(mod_pattern(r'(:?ras)?scf'),
name='(ras)scf', #r'(?:ras)?scf', r'\s*MOLCAS executing module (?:SCF|RASSCF)', name='scf',
#sections=['section_method', 'section_single_configuration_calculation'],
subMatchers=[ # XXX opens method section even if there is no outpout
SM(r'\s*(?P<x_molcas_method_name>.*?)\s*iterations: Energy and convergence statistics',
sections=['section_method'],
name='scfiter'),
SM(r'\s*CI only, no orbital optimization',
sections=['section_method'],
fixedStartValues={'x_molcas_method_name': 'RAS-CI'}),
SM(r'\s*\*\s*Final Results\s*\*',
name='finalresults',
sections=['section_single_configuration_calculation'],
subMatchers=[
SM(r'\s*Total \S+ energy\s*(?P<energy_total__hartree>\S+)', name='E-tot'),
SM(r'\s*RASSCF root number\s*\d+\s*Total energy\s*=\s*(?P<energy_total__hartree>\S+)', name='E-ras'),
SM(r'\s*Molecular orbitals:',
name='mol-orb',
endReStr=r'\s*Molecular Charges:',
subMatchers=[
eigenvalue_line_matcher(r'\s*Title:\s*.*?\s*orbitals\s*$', 'nospin'),
eigenvalue_line_matcher(r'\s*Title:\s*.*?\s*orbitals\s*\(alpha\)', 'alpha'),
eigenvalue_line_matcher(r'\s*Title:\s*.*?\s*orbitals\s*\(beta\)', 'beta')
])
])
])
return m
def caspt2():
m = SM(mod_pattern(r'caspt2'),
#caspt2', r'\s*MOLCAS executing module CASPT2',
name='caspt2', # XXX opens section even if there is no output
sections=['section_method', 'section_single_configuration_calculation'],
fixedStartValues={'x_molcas_method_name': 'CASPT2'},
subMatchers=[
SM(r'\s*Total energy:\s*(?P<energy_total__hartree>\S+)', name='E-caspt2')
])
return m
def molcas_main_loop_sm():
m = SM(r'--- Start Module:\s*(?:%s)' % '|'.join(molcas_modules),
endReStr='--- Stop Module:',
repeats=True,
forwardMatch=True,
name='module',
subMatchers=[
gateway_seward(),
scf_rasscf(),
caspt2(),
SM(r'%s|%s' % (mod_pattern(r'slapaf'),
# Sometimes the damn thing starts without a proper header!! ARGH!
r's*Specification of the internal coordinates according to the user-defined'),
name='slapaf',
# The slapaf module writes forces, but only for a symmetry-reduced set of atoms.
# Therefore they are not useful to parse. We could parse the set of symmetries probably
# and somehow make it possible to get the full set of Cartesian forces,
# but the operation would be very complicated.
#
# Let us consider parsing the reduced forces and
# symmetries the day that someone wants to implement
# this symmetry-unreduction (this will not happen).
subMatchers=[
SM(r'\*\s*Energy Statistics for Geometry Optimization',
name='opt-loop',
endReStr=r'\s*$',
sections=['section_single_configuration_calculation'],
subMatchers=[
SM(r'\s*\d+\s*\S+\s*', name='opt-iter', repeats=True, forwardMatch=True, adHoc=context.adhoc_store_line),
SM(r'\s*$', forwardMatch=True, adHoc=context.adhoc_pushback_last, name='pushback'),
# Molcas is supposed to have spaces between the numbers, but with grdNorm/Max,
# it sometimes writes 0.051834-0.037589 in one. Thus we use -?[^\s-]+ to match such a number.
SM(r'\s*\d+\s*(?P<energy_total__hartree>\S+)\s*'
r'\S+\s*(?P<x_molcas_slapaf_grad_norm>-?[^\s-]+)\s*'
r'(?P<x_molcas_slapaf_grad_max>\S+)',
name='opt-iter', repeats=True),
]),
SM(r'\s*\*\s*Nuclear coordinates for the next iteration / Bohr',
endReStr=r'\s*$',
name='newcoords',
sections=['section_system'],
subMatchers=[
context.multi_sm('coordinates',
r'\s*ATOM\s*X\s*Y\s*Z',
r'\s*(\S+)\s*(\S+)\s*(\S+)\s*(\S+)')
])
]),
# last_energy just means it calls some other modules with annoyingly irregular output format
SM(mod_pattern(r'last_energy'),
name='last',
subMatchers=[
gateway_seward(),
scf_rasscf(),
caspt2(),
]),
SM(mod_pattern('casvb'),
name='casvb',
sections=['section_method'],
fixedStartValues={'x_molcas_method_name': 'CASVB'},
subMatchers=[
SM('\s*CASSCF energy\s*:\s*(?P<energy_total__hartree>\S+)',
sections=['section_single_configuration_calculation'],
name='e-tot')
]),
SM(mod_pattern('mrci'),
name='mrci',
sections=['section_method'],
fixedStartValues={'x_molcas_method_name': 'MRCI'},
subMatchers=[
SM(r'\s*CI ENERGY:\s*(?P<energy_total__hartree>\S+)',
sections=['section_single_configuration_calculation'])
]),
SM(mod_pattern('cpf', 'CPFMCPF'),
name='cpf',
sections=['section_method'],
fixedStartValues={'x_molcas_method_name': 'MCPF'},
subMatchers=[
SM(r'\s*FINAL MCPF ENERGY\s*(?P<energy_total__hartree>\S+)',
sections=['section_single_configuration_calculation'])
]),
SM(mod_pattern('ccsdt', r'CCSD\(T\)'),
name='ccsdt',
sections=['section_method'],
fixedStartValues={'x_molcas_method_name': 'CCSDT'},
subMatchers=[
SM(r'\s*Total energy \(diff\)\s*:\s*(?P<energy_total__hartree>\S+)',
sections=['section_single_configuration_calculation'])
]),
SM(mod_pattern('mbpt2'),
name='mbpt2',
fixedStartValues={'x_molcas_method_name': 'MBPT2'},
sections=['section_method', 'section_single_configuration_calculation'],
subMatchers=[
SM(r'\s*Total energy\s*=\s*(?P<energy_total__hartree>\S+)\s*a\.u\.')
]),
SM(mod_pattern('mclr'),
name='mclr',
subMatchers=[
SM(r'\s*\*\s*Harmonic frequencies in cm-1\s*\*',
endReStr=r'\s*\*\s*THERMOCHEMISTRY',
sections=['section_single_configuration_calculation'],
subMatchers=[
SM(r'\s*Symmetry\s*(?P<x_molcas_frequency_symmetry>\S+)',
name='symm',
repeats=True,
sections=['x_molcas_section_frequency'],
subMatchers=[
SM(r'\s*Freq\.\s*i(?P<x_molcas_imaginary_frequency_value__cm_1>\S+)', name='ifreq'),
SM(r'\s*Freq\.\s*(?P<x_molcas_frequency_value>\S+)', name='freq'),
SM(r'\s*Intensity:\s*(?P<x_molcas_frequency_intensity__km_mol_1>\S+)', name='intensity')
])
]),
]),
] + get_anymodule_sms())
return m
mainFileDescription = SM(
name='root',
weak=True,
startReStr='',
fixedStartValues={'program_name': 'Molcas',
'program_basis_set_type': 'gaussians'},
sections=['section_run'],
subMatchers=[
get_header_sm(),
#get_inputfile_echo_sm(),
molcas_main_loop_sm(),
SM(r'x^', # force parser to parse the whole file
name='impossible')
])
def main(**kwargs):
mainFunction(mainFileDescription=mainFileDescription,
metaInfoEnv=metaInfoEnv,
parserInfo=parser_info,
cachingLevelForMetaName={},
superContext=context,
**kwargs)
if __name__ == '__main__':
main()