diff --git a/nomad/app/api/info.py b/nomad/app/api/info.py index ce67893f0063c05f5a88fcc1cbdaf7510027699a..c5d07d6cfc343a4904239e2fe93d371fff566210 100644 --- a/nomad/app/api/info.py +++ b/nomad/app/api/info.py @@ -20,7 +20,8 @@ from typing import Dict, Any from flask_restplus import Resource, fields from datetime import datetime -from nomad import config, parsing, normalizing, datamodel, gitinfo, search +from nomad import config, normalizing, datamodel, gitinfo, search +from nomad.parsing import parsers, MatchingParser from .api import api @@ -94,8 +95,8 @@ class InfoResource(Resource): def get(self): ''' Return information about the nomad backend and its configuration. ''' codes_dict = {} - for parser in parsing.parser_dict.values(): - if isinstance(parser, parsing.MatchingParser) and parser.domain == 'dft': + for parser in parsers.parser_dict.values(): + if isinstance(parser, MatchingParser) and parser.domain == 'dft': code_name = parser.code_name if code_name in codes_dict: continue @@ -105,10 +106,10 @@ class InfoResource(Resource): return { 'parsers': [ key[key.index('/') + 1:] - for key in parsing.parser_dict.keys()], + for key in parsers.parser_dict.keys()], 'metainfo_packages': ['general', 'general.experimental', 'common', 'public'] + sorted([ key[key.index('/') + 1:] - for key in parsing.parser_dict.keys()]), + for key in parsers.parser_dict.keys()]), 'codes': codes, 'normalizers': [normalizer.__name__ for normalizer in normalizing.normalizers], 'statistics': statistics(), diff --git a/nomad/app/api/metainfo.py b/nomad/app/api/metainfo.py index 1b0979f16514eadb1115a4e67357eabb60f0b6eb..be6d05cd38f1b1300287251a2c5ae4b50987e104 100644 --- a/nomad/app/api/metainfo.py +++ b/nomad/app/api/metainfo.py @@ -22,7 +22,7 @@ import importlib from nomad.metainfo.legacy import python_package_mapping, LegacyMetainfoEnvironment from nomad.metainfo import Package -from nomad.parsing import parsers +from nomad.parsing.parsers import parsers from .api import api diff --git 
a/nomad/cli/parse.py b/nomad/cli/parse.py index 445afa554b5845b24c260fd6ad9d61a6f1f8719a..2ed3c9b0f3b0e12f38a03532d8f2c66622259726 100644 --- a/nomad/cli/parse.py +++ b/nomad/cli/parse.py @@ -4,10 +4,9 @@ import json import click import sys -from nomad import utils -from nomad import parsing -from nomad import normalizing -from nomad import datamodel +from nomad import utils, parsing, normalizing, datamodel +from nomad.parsing.parsers import parser_dict, match_parser + import nomadcore from .cli import cli @@ -27,10 +26,10 @@ def parse( if logger is None: logger = utils.get_logger(__name__) if parser_name is not None: - parser = parsing.parser_dict.get(parser_name) + parser = parser_dict.get(parser_name) assert parser is not None, 'the given parser must exist' else: - parser = parsing.match_parser(mainfile_path, strict=strict) + parser = match_parser(mainfile_path, strict=strict) if isinstance(parser, parsing.MatchingParser): parser_name = parser.name else: diff --git a/nomad/datamodel/dft.py b/nomad/datamodel/dft.py index 5e5ae809bff908e830b135a594e4b15b196af261..6b117a39539ac9ae464d8e3713695bf0f3a31fb4 100644 --- a/nomad/datamodel/dft.py +++ b/nomad/datamodel/dft.py @@ -266,7 +266,7 @@ class DFTMetadata(MSection): def code_name_from_parser(self): entry = self.m_parent if entry.parser_name is not None: - from nomad.parsing import parser_dict + from nomad.parsing.parsers import parser_dict parser = parser_dict.get(entry.parser_name) if hasattr(parser, 'code_name'): return parser.code_name diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py index 942e0f5a79bd979d73631ab163199813361a0b95..4cfa35be822e101485dcf5f5016b0151ad5ea18f 100644 --- a/nomad/parsing/__init__.py +++ b/nomad/parsing/__init__.py @@ -50,14 +50,14 @@ The implementation :class:`LegacyParser` is used for most NOMAD-coe parsers. The parser definitions are available via the following two variables. -.. autodata:: nomad.parsing.parsers -.. autodata:: nomad.parsing.parser_dict +.. 
autodata:: nomad.parsing.parsers.parsers +.. autodata:: nomad.parsing.parsers.parser_dict Parsers are reused for multiple calculations. Parsers and calculation files are matched via regular expressions. -.. autofunction:: nomad.parsing.match_parser +.. autofunction:: nomad.parsing.parsers.match_parser Parsers in NOMAD-coe use a *backend* to create output. There are different NOMAD-coe basends. In nomad@FAIRDI, we only currently only use a single backed. The following @@ -70,503 +70,6 @@ based on nomad@fairdi's metainfo: :members: ''' -from typing import Callable, IO, Union, Dict -import os.path - -from nomad import config, datamodel - -from nomad.parsing.legacy import ( - AbstractParserBackend, Backend, BackendError, BadContextUri, LegacyParser, VaspOutcarParser) +from nomad.parsing.legacy import AbstractParserBackend, Backend, BackendError, BadContextUri, LegacyParser from nomad.parsing.parser import Parser, BrokenParser, MissingParser, MatchingParser -from nomad.parsing.artificial import ( - TemplateParser, GenerateRandomParser, ChaosParser, EmptyParser) -from eelsparser import EelsParser -from mpesparser import MPESParser -from aptfimparser import APTFIMParser - -try: - # these packages are not available without parsing extra, which is ok, if the - # parsers are only initialized to load their metainfo definitions - import magic - import gzip - import bz2 - import lzma - - _compressions = { - b'\x1f\x8b\x08': ('gz', gzip.open), - b'\x42\x5a\x68': ('bz2', bz2.open), - b'\xfd\x37\x7a': ('xz', lzma.open) - } - - encoding_magic = magic.Magic(mime_encoding=True) - -except ImportError: - pass - - -def match_parser(mainfile_path: str, strict=True) -> 'Parser': - ''' - Performs parser matching. This means it take the given mainfile and potentially - opens it with the given callback and tries to identify a parser that can parse - the file. - - This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml), - and beginning file contents. 
- - Arguments: - mainfile_path: Path to the mainfile - strict: Only match strict parsers, e.g. no artificial parsers for missing or empty entries. - - Returns: The parser, or None if no parser could be matched. - ''' - mainfile = os.path.basename(mainfile_path) - if mainfile.startswith('.') or mainfile.startswith('~'): - return None - - with open(mainfile_path, 'rb') as f: - compression, open_compressed = _compressions.get(f.read(3), (None, open)) - - with open_compressed(mainfile_path, 'rb') as cf: # type: ignore - buffer = cf.read(config.parser_matching_size) - - mime_type = magic.from_buffer(buffer, mime=True) - - decoded_buffer = None - encoding = None - try: # Try to open the file as a string for regex matching. - decoded_buffer = buffer.decode('utf-8') - except UnicodeDecodeError: - # This file is either binary or has wrong encoding - encoding = encoding_magic.from_buffer(buffer) - - if config.services.force_raw_file_decoding: - encoding = 'iso-8859-1' - - if encoding in ['iso-8859-1']: - try: - decoded_buffer = buffer.decode(encoding) - except Exception: - pass - - for parser in parsers: - if strict and isinstance(parser, (MissingParser, EmptyParser)): - continue - - if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression): - # potentially convert the file - if encoding in ['iso-8859-1']: - try: - with open(mainfile_path, 'rb') as binary_file: - content = binary_file.read().decode(encoding) - except Exception: - pass - else: - with open(mainfile_path, 'wt') as text_file: - text_file.write(content) - - # TODO: deal with multiple possible parser specs - return parser - - return None - - -parsers = [ - GenerateRandomParser(), - TemplateParser(), - ChaosParser(), - LegacyParser( - name='parsers/phonopy', code_name='Phonopy', code_homepage='https://phonopy.github.io/phonopy/', - parser_class_name='phonopyparser.PhonopyParserWrapper', - # mainfile_contents_re=r'', # Empty regex since this code calls other DFT codes. 
- mainfile_name_re=(r'.*/phonopy-FHI-aims-displacement-0*1/control.in$') - ), - LegacyParser( - name='parsers/vasp', code_name='VASP', code_homepage='https://www.vasp.at/', - parser_class_name='vaspparser.VASPRunParser', - mainfile_mime_re=r'(application/.*)|(text/.*)', - mainfile_contents_re=( - r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*' - r'?\s*<modeling>' - r'?\s*<generator>' - r'?\s*<i name="program" type="string">\s*vasp\s*</i>' - r'?'), - supported_compressions=['gz', 'bz2', 'xz'] - ), - VaspOutcarParser( - name='parsers/vasp-outcar', code_name='VASP', code_homepage='https://www.vasp.at/', - parser_class_name='vaspparser.VaspOutcarParser', - mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?', - mainfile_contents_re=(r'^\svasp\.') - ), - LegacyParser( - name='parsers/exciting', code_name='exciting', code_homepage='http://exciting-code.org/', - parser_class_name='excitingparser.ExcitingParser', - mainfile_name_re=r'^.*.OUT(\.[^/]*)?$', - mainfile_contents_re=(r'EXCITING.*started') - ), - LegacyParser( - name='parsers/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/', - parser_class_name='fhiaimsparser.FHIaimsParser', - mainfile_contents_re=( - r'^(.*\n)*' - r'?\s*Invoking FHI-aims \.\.\.' 
- # r'?\s*Version' - ) - ), - LegacyParser( - name='parsers/cp2k', code_name='CP2K', code_homepage='https://www.cp2k.org/', - parser_class_name='cp2kparser.CP2KParser', - mainfile_contents_re=( - r'\*\*\*\* \*\*\*\* \*\*\*\*\*\* \*\* PROGRAM STARTED AT\s.*\n' - r' \*\*\*\*\* \*\* \*\*\* \*\*\* \*\* PROGRAM STARTED ON\s*.*\n' - r' \*\* \*\*\*\* \*\*\*\*\*\* PROGRAM STARTED BY .*\n' - r' \*\*\*\*\* \*\* \*\* \*\* \*\* PROGRAM PROCESS ID .*\n' - r' \*\*\*\* \*\* \*\*\*\*\*\*\* \*\* PROGRAM STARTED IN .*\n' - ) - ), - LegacyParser( - name='parsers/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/', - parser_class_name='crystalparser.CrystalParser', - mainfile_contents_re=( - r'(CRYSTAL\s*\n\d+ \d+ \d+)|(CRYSTAL will run on \d+ processors)|' - r'(\s*\*\s*CRYSTAL[\d]+\s*\*\s*\*\s*(public|Release) \: [\d\.]+.*\*)|' - r'(Executable:\s*[/_\-a-zA-Z0-9]*MPPcrystal)' - ) - ), - # The main contents regex of CPMD was causing a catostrophic backtracking issue - # when searching through the first 500 bytes of main files. We decided - # to use only a portion of the regex to avoid that issue. 
- LegacyParser( - name='parsers/cpmd', code_name='CPMD', code_homepage='https://www.lcrc.anl.gov/for-users/software/available-software/cpmd/', - parser_class_name='cpmdparser.CPMDParser', - mainfile_contents_re=( - # r'\s+\*\*\*\*\*\* \*\*\*\*\*\* \*\*\*\* \*\*\*\* \*\*\*\*\*\*\s*' - # r'\s+\*\*\*\*\*\*\* \*\*\*\*\*\*\* \*\*\*\*\*\*\*\*\*\* \*\*\*\*\*\*\*\s+' - r'\*\*\* \*\* \*\*\* \*\* \*\*\*\* \*\* \*\* \*\*\*' - # r'\s+\*\* \*\* \*\*\* \*\* \*\* \*\* \*\* \*\*\s+' - # r'\s+\*\* \*\*\*\*\*\*\* \*\* \*\* \*\* \*\*\s+' - # r'\s+\*\*\* \*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\s+' - # r'\s+\*\*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\*\*\*\*\s+' - # r'\s+\*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\*\*\*\s+' - ) - ), - LegacyParser( - name='parsers/nwchem', code_name='NWChem', code_homepage='http://www.nwchem-sw.org/', - parser_class_name='nwchemparser.NWChemParser', - mainfile_contents_re=( - r'Northwest Computational Chemistry Package \(NWChem\) (\d+\.)+\d+' - ) - ), - LegacyParser( - name='parsers/bigdft', code_name='BigDFT', code_homepage='http://bigdft.org/', - parser_class_name='bigdftparser.BigDFTParser', - mainfile_contents_re=( - # r'__________________________________ A fast and precise DFT wavelet code\s*' - # r'\| \| \| \| \| \|\s*' - # r'\| \| \| \| \| \| BBBB i gggggg\s*' - # r'\|_____\|_____\|_____\|_____\|_____\| B B g\s*' - # r'\| \| : \| : \| \| \| B B i g\s*' - # r'\| \|-0\+--\|-0\+--\| \| \| B B i g g\s*' - r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB i g g\s*' - # r'\| : \| \| \| : \| \| B B i g g\s*' - # r'\|--\+0-\| \| \|-0\+--\| \| B B iiii g g\s*' - # r'\|__:__\|_____\|_____\|__:__\|_____\| B B i g g\s*' - # r'\| \| : \| : \| \| \| B BBBB i g g\s*' - # r'\| \|-0\+--\|-0\+--\| \| \| B iiiii gggggg\s*' - # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*' - # r'\| \| \| \| : \| \| TTTTTTTTT\s*' - # r'\| \| \| \|--\+0-\| \| DDDDDD FFFFF T\s*' - # r'\|_____\|_____\|_____\|__:__\|_____\| D D F TTTT T\s*' - # r'\| \| \| \| : \| \|D D F T T\s*' - # r'\| \| \| 
\|--\+0-\| \|D D FFFF T T\s*' - # r'\|_____\|_____\|_____\|__:__\|_____\|D___ D F T T\s*' - # r'\| \| \| : \| \| \|D D F TTTTT\s*' - # r'\| \| \|--\+0-\| \| \| D D F T T\s*' - # r'\|_____\|_____\|__:__\|_____\|_____\| D F T T\s*' - # r'\| \| \| \| \| \| D T T\s*' - # r'\| \| \| \| \| \| DDDDDD F TTTT\s*' - # r'\|_____\|_____\|_____\|_____\|_____\|______ www\.bigdft\.org' - ) - ), - LegacyParser( - name='parsers/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/', - parser_class_name='wien2kparser.Wien2kParser', - mainfile_contents_re=r'\s*---------\s*:ITE[0-9]+:\s*[0-9]+\.\s*ITERATION\s*---------' - ), - LegacyParser( - name='parsers/band', code_name='BAND', code_homepage='https://www.scm.com/product/band_periodicdft/', - parser_class_name='bandparser.BANDParser', - mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'), - LegacyParser( - name='parsers/gaussian', code_name='Gaussian', code_homepage='http://gaussian.com/', - parser_class_name='gaussianparser.GaussianParser', - mainfile_mime_re=r'.*', - mainfile_contents_re=( - r'\s*Cite this work as:' - r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9\.]*,') - ), - LegacyParser( - name='parsers/quantumespresso', code_name='Quantum Espresso', code_homepage='https://www.quantum-espresso.org/', - parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF', - mainfile_contents_re=( - r'(Program PWSCF.*starts)|' - r'(Current dimensions of program PWSCF are)') - # r'^(.*\n)*' - # r'\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+' - # r'(\d+)\s*\))?\s+starts[^\n]+' - # r'(?:\s*\n?)*This program is part of the open-source Quantum') - ), - LegacyParser( - name='parsers/abinit', code_name='ABINIT', code_homepage='https://www.abinit.org/', - parser_class_name='abinitparser.AbinitParser', - mainfile_contents_re=(r'^\n*\.Version\s*[0-9.]*\s*of ABINIT\s*') - ), - LegacyParser( - name='parsers/orca', code_name='ORCA', code_homepage='https://orcaforum.kofo.mpg.de/', - 
parser_class_name='orcaparser.OrcaParser', - mainfile_contents_re=( - r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' - r'\s+\* O R C A \*\s*' - r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' - r'\s*' - r'\s*--- An Ab Initio, DFT and Semiempirical electronic structure package ---\s*') - ), - LegacyParser( - name='parsers/castep', code_name='CASTEP', code_homepage='http://www.castep.org/', - parser_class_name='castepparser.CastepParser', - mainfile_contents_re=(r'\s\|\s*CCC\s*AA\s*SSS\s*TTTTT\s*EEEEE\s*PPPP\s*\|\s*') - ), - LegacyParser( - name='parsers/dl-poly', code_name='DL_POLY', code_homepage='https://www.scd.stfc.ac.uk/Pages/DL_POLY.aspx', - parser_class_name='dlpolyparser.DlPolyParserWrapper', - mainfile_contents_re=(r'\*\* DL_POLY \*\*') - ), - LegacyParser( - name='parsers/lib-atoms', code_name='libAtoms', code_homepage='https://libatoms.github.io/', - parser_class_name='libatomsparser.LibAtomsParserWrapper', - mainfile_contents_re=(r'\s*<GAP_params\s') - ), - LegacyParser( - name='parsers/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/', - parser_class_name='octopusparser.OctopusParserWrapper', - mainfile_contents_re=(r'\|0\) ~ \(0\) \|') - # We decided to use the octopus eyes instead of - # r'\*{32} Grid \*{32}Simulation Box:' since it was so far down in the file. 
- ), - # match gpaw2 first, other .gpw files are then considered to be "gpaw1" - LegacyParser( - name='parsers/gpaw2', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/', - parser_class_name='gpawparser.GPAWParser2Wrapper', - mainfile_binary_header=b'GPAW', - mainfile_name_re=(r'^.*\.(gpw2|gpw)$'), - mainfile_mime_re=r'application/(x-tar|octet-stream)' - ), - LegacyParser( - name='parsers/gpaw', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/', - parser_class_name='gpawparser.GPAWParserWrapper', - mainfile_name_re=(r'^.*\.gpw$'), - mainfile_mime_re=r'application/(x-tar|octet-stream)' - ), - LegacyParser( - name='parsers/atk', code_name='ATK', code_homepage='https://www.synopsys.com/silicon/quantumatk.html', - parser_class_name='atkparser.ATKParserWrapper', - # mainfile_contents_re=r'', # We can't read .gpw as txt - of UlmGPAW|AFFormatGPAW' - mainfile_name_re=r'^.*\.nc', - # The previously used mime type r'application/x-netcdf' wasn't found by magic library. 
- mainfile_mime_re=r'application/octet-stream' - ), - LegacyParser( - name='parsers/gulp', code_name='gulp', code_homepage='http://gulp.curtin.edu.au/gulp/', - parser_class_name='gulpparser.GULPParser', - mainfile_contents_re=( - r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*' - r'\*\*\*\*\*\*\*\*\*\*\*\*\*\s*' - r'\s*\*\s*GENERAL UTILITY LATTICE PROGRAM\s*\*\s*') - ), - LegacyParser( - name='parsers/siesta', code_name='Siesta', code_homepage='https://departments.icmab.es/leem/siesta/', - parser_class_name='siestaparser.SiestaParser', - mainfile_contents_re=( - r'(Siesta Version: siesta-|SIESTA [0-9]\.[0-9]\.[0-9])|' - r'(\*\s*WELCOME TO SIESTA\s*\*)') - ), - LegacyParser( - name='parsers/elk', code_name='elk', code_homepage='http://elk.sourceforge.net/', - parser_class_name='elkparser.ElkParser', - mainfile_contents_re=r'\| Elk version [0-9.a-zA-Z]+ started \|' - ), - LegacyParser( - name='parsers/elastic', code_name='elastic', code_homepage='http://exciting-code.org/elastic', - parser_class_name='elasticparser.ElasticParser', - mainfile_contents_re=r'\s*Order of elastic constants\s*=\s*[0-9]+\s*' - ), - LegacyParser( - name='parsers/gamess', code_name='GAMESS', code_homepage='https://www.msg.chem.iastate.edu/gamess/versions.html', - parser_class_name='gamessparser.GamessParser', - mainfile_contents_re=( - r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' - r'\s*\*\s*GAMESS VERSION =\s*(.*)\*\s*' - r'\s*\*\s*FROM IOWA STATE UNIVERSITY\s*\*\s*') - ), - LegacyParser( - name='parsers/turbomole', code_name='turbomole', code_homepage='https://www.turbomole.org/', - parser_class_name='turbomoleparser.TurbomoleParser', - mainfile_contents_re=( - r'Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe') - ), - LegacyParser( - name='parsers/skeleton', code_name='skeleton', code_homepage=None, - domain='ems', - parser_class_name='skeletonparser.SkeletonParserInterface', - mainfile_mime_re=r'(application/json)|(text/.*)', - 
mainfile_contents_re=(r'skeleton experimental metadata format') - ), - MPESParser(), - APTFIMParser(), - EelsParser(), - LegacyParser( - name='parsers/qbox', code_name='qbox', code_homepage='http://qboxcode.org/', domain='dft', - parser_class_name='qboxparser.QboxParser', - mainfile_mime_re=r'(application/xml)|(text/.*)', - mainfile_contents_re=(r'http://qboxcode.org') - ), - LegacyParser( - name='parsers/dmol', code_name='DMol3', code_homepage='http://dmol3.web.psi.ch/dmol3.html', domain='dft', - parser_class_name='dmol3parser.Dmol3Parser', - mainfile_name_re=r'.*\.outmol', - mainfile_contents_re=r'Materials Studio DMol\^3' - ), - LegacyParser( - name='parsers/fleur', code_name='fleur', code_homepage='https://www.flapw.de/', domain='dft', - parser_class_name='fleurparser.FleurParser', - mainfile_contents_re=r'This output is generated by fleur.' - ), - LegacyParser( - name='parsers/molcas', code_name='MOLCAS', code_homepage='http://www.molcas.org/', domain='dft', - parser_class_name='molcasparser.MolcasParser', - mainfile_contents_re=r'M O L C A S' - ), - LegacyParser( - name='parsers/onetep', code_name='ONETEP', code_homepage='https://www.onetep.org/', domain='dft', - parser_class_name='onetepparser.OnetepParser', - mainfile_contents_re=r'####### # # ####### ####### ####### ######' - ), - LegacyParser( - name='parsers/openkim', code_name='OpenKIM', domain='dft', - parser_class_name='openkimparser.OpenKIMParser', - mainfile_contents_re=r'OPENKIM' - ), - LegacyParser( - name='parsers/tinker', code_name='TINKER', domain='dft', - parser_class_name='tinkerparser.TinkerParser', - mainfile_contents_re=r'TINKER --- Software Tools for Molecular Design' - ), - LegacyParser( - name='parsers/lammps', code_name='lammps', domain='dft', - parser_class_name='lammpsparser.LammpsParser', - mainfile_contents_re=r'^LAMMPS' - ), - LegacyParser( - name='parsers/amber', code_name='Amber', domain='dft', - parser_class_name='amberparser.AMBERParser', - 
mainfile_contents_re=r'\s*Amber\s[0-9]+\s[A-Z]+\s*[0-9]+' - ), - LegacyParser( - name='parsers/gromacs', code_name='Gromacs', domain='dft', - parser_class_name='gromacsparser.GROMACSParser', - mainfile_contents_re=r'GROMACS - gmx mdrun' - ), - LegacyParser( - name='parsers/gromos', code_name='Gromos', domain='dft', - parser_class_name='gromosparser.GromosParser', - mainfile_contents_re=r'Bugreports to http://www.gromos.net' - ), - LegacyParser( - name='parsers/namd', code_name='Namd', domain='dft', - parser_class_name='namdparser.NamdParser', - mainfile_contents_re=r'\s*Info:\s*NAMD\s*[0-9.]+\s*for\s*', - mainfile_mime_re=r'text/.*', - ), - LegacyParser( - name='parsers/charmm', code_name='Charmm', domain='dft', - parser_class_name='charmmparser.CharmmParser', - mainfile_contents_re=r'\s*Chemistry\s*at\s*HARvard\s*Macromolecular\s*Mechanics\s*', - mainfile_mime_re=r'text/.*', - ), - LegacyParser( - name='parsers/dftbplus', code_name='DFTb plus', domain='dft', - parser_class_name='dftbplusparser.DFTBPlusParser', - mainfile_contents_re=r'^ Fermi distribution function\s*', - mainfile_mime_re=r'text/.*', - ), - LegacyParser( - name='parsers/asap', code_name='ASAP', domain='dft', - parser_class_name='asapparser.AsapParser', - mainfile_name_re=r'.*.traj$', - mainfile_mime_re=r'application/octet-stream', - ), - LegacyParser( - name='parsers/fplo', code_name='fplo', domain='dft', - parser_class_name='fploparser.FploParser', - mainfile_contents_re=r'\s*\|\s*FULL-POTENTIAL LOCAL-ORBITAL MINIMUM BASIS BANDSTRUCTURE CODE\s*\|\s*', - mainfile_mime_re=r'text/.*', - ), - LegacyParser( - name='parsers/mopac', code_name='MOPAC', domain='dft', - parser_class_name='mopacparser.MopacParser', - mainfile_contents_re=r'\s*\*\*\s*MOPAC\s*([0-9a-zA-Z]*)\s*\*\*\s*', - mainfile_mime_re=r'text/.*', - ) -] - -empty_parsers = [ - EmptyParser( - name='missing/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/', - domain='dft', - mainfile_name_re=r'(inp)|(.*/inp)' - ), - 
EmptyParser( - name='missing/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/index.php', - domain='dft', - mainfile_name_re=r'.*\.cryst\.out' - ), - EmptyParser( - name='missing/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/', - domain='dft', - mainfile_name_re=r'.*\.scf' - ), - EmptyParser( - name='missing/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/', - domain='dft', - mainfile_name_re=r'.*\.fhiaims' - ) -] - -if config.use_empty_parsers: - # There are some entries with PIDs that have mainfiles which do not match what - # the actual parsers expect. We use the EmptyParser to produce placeholder entries - # to keep the PIDs. These parsers will not match for new, non migrated data. - parsers.extend(empty_parsers) - -parsers.append(BrokenParser()) - -''' Instantiation and constructor based config of all parsers. ''' - -parser_dict = {parser.name: parser for parser in parsers + empty_parsers} # type: ignore -''' A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. 
''' - -# renamed parsers -parser_dict['parser/broken'] = parser_dict['parsers/broken'] -parser_dict['parser/fleur'] = parser_dict['parsers/fleur'] -parser_dict['parser/molcas'] = parser_dict['parsers/molcas'] -parser_dict['parser/octopus'] = parser_dict['parsers/octopus'] -parser_dict['parser/onetep'] = parser_dict['parsers/onetep'] - -# register code names as possible statistic value to the dft datamodel -code_names = sorted( - set([ - getattr(parser, 'code_name') - for parser in parsers - if parser.domain == 'dft' and getattr(parser, 'code_name', None) is not None and getattr(parser, 'code_name') != 'currupted mainfile']), - key=lambda code_name: code_name.lower()) -datamodel.DFTMetadata.code_name.a_search.statistic_values = code_names + [config.services.unavailable_value, config.services.not_processed_value] +from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser, EmptyParser diff --git a/nomad/parsing/parsers.py b/nomad/parsing/parsers.py new file mode 100644 index 0000000000000000000000000000000000000000..c477e0dcbe9d706f6212a4c418f8b25aa91aefce --- /dev/null +++ b/nomad/parsing/parsers.py @@ -0,0 +1,513 @@ +# Copyright 2018 Markus Scheidgen +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an"AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os.path + +from nomad import config, datamodel + +from .parser import MissingParser, BrokenParser, Parser +from .legacy import LegacyParser, VaspOutcarParser +from .artificial import EmptyParser, GenerateRandomParser, TemplateParser, ChaosParser + +from eelsparser import EelsParser +from mpesparser import MPESParser +from aptfimparser import APTFIMParser + +try: + # these packages are not available without the parsing extra, which is ok if the parsers + # are only initialized to load their metainfo definitions; match_parser requires them + import magic + import gzip + import bz2 + import lzma + + _compressions = { + b'\x1f\x8b\x08': ('gz', gzip.open), + b'\x42\x5a\x68': ('bz2', bz2.open), + b'\xfd\x37\x7a': ('xz', lzma.open) + } + + encoding_magic = magic.Magic(mime_encoding=True) + +except ImportError: + pass + + +def match_parser(mainfile_path: str, strict=True) -> Parser: + ''' + Performs parser matching. This means it takes the given mainfile, reads the beginning + of it (decompressing gz, bz2, or xz files on the fly), and tries to identify a parser + that can parse the file. On a match, an iso-8859-1 mainfile is rewritten in place as text. + + This is determined by filename (e.g. *.out), mime type (e.g. text/*, application/xml), + and beginning file contents. + + Arguments: + mainfile_path: Path to the mainfile + strict: Only match strict parsers, i.e. skip MissingParser and EmptyParser instances. + + Returns: The parser, or None if no parser could be matched. + ''' + mainfile = os.path.basename(mainfile_path) + if mainfile.startswith('.') or mainfile.startswith('~'): + return None + + with open(mainfile_path, 'rb') as f: + compression, open_compressed = _compressions.get(f.read(3), (None, open)) + + with open_compressed(mainfile_path, 'rb') as cf: # type: ignore + buffer = cf.read(config.parser_matching_size) + + mime_type = magic.from_buffer(buffer, mime=True) + + decoded_buffer = None + encoding = None + try: # Try to open the file as a string for regex matching.
+ decoded_buffer = buffer.decode('utf-8') + except UnicodeDecodeError: + # This file is either binary or has wrong encoding + encoding = encoding_magic.from_buffer(buffer) + + if config.services.force_raw_file_decoding: + encoding = 'iso-8859-1' + + if encoding in ['iso-8859-1']: + try: + decoded_buffer = buffer.decode(encoding) + except Exception: + pass + + for parser in parsers: + if strict and isinstance(parser, (MissingParser, EmptyParser)): + continue + + if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression): + # if the file is handled as iso-8859-1, rewrite the mainfile in place as text + if encoding in ['iso-8859-1']: + try: + with open(mainfile_path, 'rb') as binary_file: + content = binary_file.read().decode(encoding) + except Exception: + pass + else: + with open(mainfile_path, 'wt') as text_file: + text_file.write(content) + + # TODO: deal with multiple possible parser specs + return parser + + return None + + +parsers = [ + GenerateRandomParser(), + TemplateParser(), + ChaosParser(), + LegacyParser( + name='parsers/phonopy', code_name='Phonopy', code_homepage='https://phonopy.github.io/phonopy/', + parser_class_name='phonopyparser.PhonopyParserWrapper', + # mainfile_contents_re=r'', # Empty regex since this code calls other DFT codes.
+ mainfile_name_re=(r'.*/phonopy-FHI-aims-displacement-0*1/control.in$') + ), + LegacyParser( + name='parsers/vasp', code_name='VASP', code_homepage='https://www.vasp.at/', + parser_class_name='vaspparser.VASPRunParser', + mainfile_mime_re=r'(application/.*)|(text/.*)', + mainfile_contents_re=( + r'^\s*<\?xml version="1\.0" encoding="ISO-8859-1"\?>\s*' + r'?\s*<modeling>' + r'?\s*<generator>' + r'?\s*<i name="program" type="string">\s*vasp\s*</i>' + r'?'), + supported_compressions=['gz', 'bz2', 'xz'] + ), + VaspOutcarParser( + name='parsers/vasp-outcar', code_name='VASP', code_homepage='https://www.vasp.at/', + parser_class_name='vaspparser.VaspOutcarParser', + mainfile_name_re=r'(.*/)?OUTCAR(\.[^\.]*)?', + mainfile_contents_re=(r'^\svasp\.') + ), + LegacyParser( + name='parsers/exciting', code_name='exciting', code_homepage='http://exciting-code.org/', + parser_class_name='excitingparser.ExcitingParser', + mainfile_name_re=r'^.*.OUT(\.[^/]*)?$', + mainfile_contents_re=(r'EXCITING.*started') + ), + LegacyParser( + name='parsers/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/', + parser_class_name='fhiaimsparser.FHIaimsParser', + mainfile_contents_re=( + r'^(.*\n)*' + r'?\s*Invoking FHI-aims \.\.\.' 
+ # r'?\s*Version' + ) + ), + LegacyParser( + name='parsers/cp2k', code_name='CP2K', code_homepage='https://www.cp2k.org/', + parser_class_name='cp2kparser.CP2KParser', + mainfile_contents_re=( + r'\*\*\*\* \*\*\*\* \*\*\*\*\*\* \*\* PROGRAM STARTED AT\s.*\n' + r' \*\*\*\*\* \*\* \*\*\* \*\*\* \*\* PROGRAM STARTED ON\s*.*\n' + r' \*\* \*\*\*\* \*\*\*\*\*\* PROGRAM STARTED BY .*\n' + r' \*\*\*\*\* \*\* \*\* \*\* \*\* PROGRAM PROCESS ID .*\n' + r' \*\*\*\* \*\* \*\*\*\*\*\*\* \*\* PROGRAM STARTED IN .*\n' + ) + ), + LegacyParser( + name='parsers/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/', + parser_class_name='crystalparser.CrystalParser', + mainfile_contents_re=( + r'(CRYSTAL\s*\n\d+ \d+ \d+)|(CRYSTAL will run on \d+ processors)|' + r'(\s*\*\s*CRYSTAL[\d]+\s*\*\s*\*\s*(public|Release) \: [\d\.]+.*\*)|' + r'(Executable:\s*[/_\-a-zA-Z0-9]*MPPcrystal)' + ) + ), + # The main contents regex of CPMD was causing a catostrophic backtracking issue + # when searching through the first 500 bytes of main files. We decided + # to use only a portion of the regex to avoid that issue. 
+ LegacyParser( + name='parsers/cpmd', code_name='CPMD', code_homepage='https://www.lcrc.anl.gov/for-users/software/available-software/cpmd/', + parser_class_name='cpmdparser.CPMDParser', + mainfile_contents_re=( + # r'\s+\*\*\*\*\*\* \*\*\*\*\*\* \*\*\*\* \*\*\*\* \*\*\*\*\*\*\s*' + # r'\s+\*\*\*\*\*\*\* \*\*\*\*\*\*\* \*\*\*\*\*\*\*\*\*\* \*\*\*\*\*\*\*\s+' + r'\*\*\* \*\* \*\*\* \*\* \*\*\*\* \*\* \*\* \*\*\*' + # r'\s+\*\* \*\* \*\*\* \*\* \*\* \*\* \*\* \*\*\s+' + # r'\s+\*\* \*\*\*\*\*\*\* \*\* \*\* \*\* \*\*\s+' + # r'\s+\*\*\* \*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\s+' + # r'\s+\*\*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\*\*\*\*\s+' + # r'\s+\*\*\*\*\*\* \*\* \*\* \*\* \*\*\*\*\*\*\s+' + ) + ), + LegacyParser( + name='parsers/nwchem', code_name='NWChem', code_homepage='http://www.nwchem-sw.org/', + parser_class_name='nwchemparser.NWChemParser', + mainfile_contents_re=( + r'Northwest Computational Chemistry Package \(NWChem\) (\d+\.)+\d+' + ) + ), + LegacyParser( + name='parsers/bigdft', code_name='BigDFT', code_homepage='http://bigdft.org/', + parser_class_name='bigdftparser.BigDFTParser', + mainfile_contents_re=( + # r'__________________________________ A fast and precise DFT wavelet code\s*' + # r'\| \| \| \| \| \|\s*' + # r'\| \| \| \| \| \| BBBB i gggggg\s*' + # r'\|_____\|_____\|_____\|_____\|_____\| B B g\s*' + # r'\| \| : \| : \| \| \| B B i g\s*' + # r'\| \|-0\+--\|-0\+--\| \| \| B B i g g\s*' + r'\|_____\|__:__\|__:__\|_____\|_____\|___ BBBBB i g g\s*' + # r'\| : \| \| \| : \| \| B B i g g\s*' + # r'\|--\+0-\| \| \|-0\+--\| \| B B iiii g g\s*' + # r'\|__:__\|_____\|_____\|__:__\|_____\| B B i g g\s*' + # r'\| \| : \| : \| \| \| B BBBB i g g\s*' + # r'\| \|-0\+--\|-0\+--\| \| \| B iiiii gggggg\s*' + # r'\|_____\|__:__\|__:__\|_____\|_____\|__BBBBB\s*' + # r'\| \| \| \| : \| \| TTTTTTTTT\s*' + # r'\| \| \| \|--\+0-\| \| DDDDDD FFFFF T\s*' + # r'\|_____\|_____\|_____\|__:__\|_____\| D D F TTTT T\s*' + # r'\| \| \| \| : \| \|D D F T T\s*' + # r'\| \| \| 
\|--\+0-\| \|D D FFFF T T\s*' + # r'\|_____\|_____\|_____\|__:__\|_____\|D___ D F T T\s*' + # r'\| \| \| : \| \| \|D D F TTTTT\s*' + # r'\| \| \|--\+0-\| \| \| D D F T T\s*' + # r'\|_____\|_____\|__:__\|_____\|_____\| D F T T\s*' + # r'\| \| \| \| \| \| D T T\s*' + # r'\| \| \| \| \| \| DDDDDD F TTTT\s*' + # r'\|_____\|_____\|_____\|_____\|_____\|______ www\.bigdft\.org' + ) + ), + LegacyParser( + name='parsers/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/', + parser_class_name='wien2kparser.Wien2kParser', + mainfile_contents_re=r'\s*---------\s*:ITE[0-9]+:\s*[0-9]+\.\s*ITERATION\s*---------' + ), + LegacyParser( + name='parsers/band', code_name='BAND', code_homepage='https://www.scm.com/product/band_periodicdft/', + parser_class_name='bandparser.BANDParser', + mainfile_contents_re=r' +\* +Amsterdam Density Functional +\(ADF\)'), + LegacyParser( + name='parsers/gaussian', code_name='Gaussian', code_homepage='http://gaussian.com/', + parser_class_name='gaussianparser.GaussianParser', + mainfile_mime_re=r'.*', + mainfile_contents_re=( + r'\s*Cite this work as:' + r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9\.]*,') + ), + LegacyParser( + name='parsers/quantumespresso', code_name='Quantum Espresso', code_homepage='https://www.quantum-espresso.org/', + parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF', + mainfile_contents_re=( + r'(Program PWSCF.*starts)|' + r'(Current dimensions of program PWSCF are)') + # r'^(.*\n)*' + # r'\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+' + # r'(\d+)\s*\))?\s+starts[^\n]+' + # r'(?:\s*\n?)*This program is part of the open-source Quantum') + ), + LegacyParser( + name='parsers/abinit', code_name='ABINIT', code_homepage='https://www.abinit.org/', + parser_class_name='abinitparser.AbinitParser', + mainfile_contents_re=(r'^\n*\.Version\s*[0-9.]*\s*of ABINIT\s*') + ), + LegacyParser( + name='parsers/orca', code_name='ORCA', code_homepage='https://orcaforum.kofo.mpg.de/', + 
parser_class_name='orcaparser.OrcaParser', + mainfile_contents_re=( + r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' + r'\s+\* O R C A \*\s*' + r'\s+\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' + r'\s*' + r'\s*--- An Ab Initio, DFT and Semiempirical electronic structure package ---\s*') + ), + LegacyParser( + name='parsers/castep', code_name='CASTEP', code_homepage='http://www.castep.org/', + parser_class_name='castepparser.CastepParser', + mainfile_contents_re=(r'\s\|\s*CCC\s*AA\s*SSS\s*TTTTT\s*EEEEE\s*PPPP\s*\|\s*') + ), + LegacyParser( + name='parsers/dl-poly', code_name='DL_POLY', code_homepage='https://www.scd.stfc.ac.uk/Pages/DL_POLY.aspx', + parser_class_name='dlpolyparser.DlPolyParserWrapper', + mainfile_contents_re=(r'\*\* DL_POLY \*\*') + ), + LegacyParser( + name='parsers/lib-atoms', code_name='libAtoms', code_homepage='https://libatoms.github.io/', + parser_class_name='libatomsparser.LibAtomsParserWrapper', + mainfile_contents_re=(r'\s*<GAP_params\s') + ), + LegacyParser( + name='parsers/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/', + parser_class_name='octopusparser.OctopusParserWrapper', + mainfile_contents_re=(r'\|0\) ~ \(0\) \|') + # We decided to use the octopus eyes instead of + # r'\*{32} Grid \*{32}Simulation Box:' since it was so far down in the file. 
+ ), + # match gpaw2 first, other .gpw files are then considered to be "gpaw1" + LegacyParser( + name='parsers/gpaw2', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/', + parser_class_name='gpawparser.GPAWParser2Wrapper', + mainfile_binary_header=b'GPAW', + mainfile_name_re=(r'^.*\.(gpw2|gpw)$'), + mainfile_mime_re=r'application/(x-tar|octet-stream)' + ), + LegacyParser( + name='parsers/gpaw', code_name='GPAW', code_homepage='https://wiki.fysik.dtu.dk/gpaw/', + parser_class_name='gpawparser.GPAWParserWrapper', + mainfile_name_re=(r'^.*\.gpw$'), + mainfile_mime_re=r'application/(x-tar|octet-stream)' + ), + LegacyParser( + name='parsers/atk', code_name='ATK', code_homepage='https://www.synopsys.com/silicon/quantumatk.html', + parser_class_name='atkparser.ATKParserWrapper', + # mainfile_contents_re=r'', # We can't read .gpw as txt - of UlmGPAW|AFFormatGPAW' + mainfile_name_re=r'^.*\.nc', + # The previously used mime type r'application/x-netcdf' wasn't found by magic library. 
+ mainfile_mime_re=r'application/octet-stream' + ), + LegacyParser( + name='parsers/gulp', code_name='gulp', code_homepage='http://gulp.curtin.edu.au/gulp/', + parser_class_name='gulpparser.GULPParser', + mainfile_contents_re=( + r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*' + r'\*\*\*\*\*\*\*\*\*\*\*\*\*\s*' + r'\s*\*\s*GENERAL UTILITY LATTICE PROGRAM\s*\*\s*') + ), + LegacyParser( + name='parsers/siesta', code_name='Siesta', code_homepage='https://departments.icmab.es/leem/siesta/', + parser_class_name='siestaparser.SiestaParser', + mainfile_contents_re=( + r'(Siesta Version: siesta-|SIESTA [0-9]\.[0-9]\.[0-9])|' + r'(\*\s*WELCOME TO SIESTA\s*\*)') + ), + LegacyParser( + name='parsers/elk', code_name='elk', code_homepage='http://elk.sourceforge.net/', + parser_class_name='elkparser.ElkParser', + mainfile_contents_re=r'\| Elk version [0-9.a-zA-Z]+ started \|' + ), + LegacyParser( + name='parsers/elastic', code_name='elastic', code_homepage='http://exciting-code.org/elastic', + parser_class_name='elasticparser.ElasticParser', + mainfile_contents_re=r'\s*Order of elastic constants\s*=\s*[0-9]+\s*' + ), + LegacyParser( + name='parsers/gamess', code_name='GAMESS', code_homepage='https://www.msg.chem.iastate.edu/gamess/versions.html', + parser_class_name='gamessparser.GamessParser', + mainfile_contents_re=( + r'\s*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\**\s*' + r'\s*\*\s*GAMESS VERSION =\s*(.*)\*\s*' + r'\s*\*\s*FROM IOWA STATE UNIVERSITY\s*\*\s*') + ), + LegacyParser( + name='parsers/turbomole', code_name='turbomole', code_homepage='https://www.turbomole.org/', + parser_class_name='turbomoleparser.TurbomoleParser', + mainfile_contents_re=( + r'Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe') + ), + LegacyParser( + name='parsers/skeleton', code_name='skeleton', code_homepage=None, + domain='ems', + parser_class_name='skeletonparser.SkeletonParserInterface', + mainfile_mime_re=r'(application/json)|(text/.*)', + 
mainfile_contents_re=(r'skeleton experimental metadata format') + ), + MPESParser(), + APTFIMParser(), + EelsParser(), + LegacyParser( + name='parsers/qbox', code_name='qbox', code_homepage='http://qboxcode.org/', domain='dft', + parser_class_name='qboxparser.QboxParser', + mainfile_mime_re=r'(application/xml)|(text/.*)', + mainfile_contents_re=(r'http://qboxcode.org') + ), + LegacyParser( + name='parsers/dmol', code_name='DMol3', code_homepage='http://dmol3.web.psi.ch/dmol3.html', domain='dft', + parser_class_name='dmol3parser.Dmol3Parser', + mainfile_name_re=r'.*\.outmol', + mainfile_contents_re=r'Materials Studio DMol\^3' + ), + LegacyParser( + name='parsers/fleur', code_name='fleur', code_homepage='https://www.flapw.de/', domain='dft', + parser_class_name='fleurparser.FleurParser', + mainfile_contents_re=r'This output is generated by fleur.' + ), + LegacyParser( + name='parsers/molcas', code_name='MOLCAS', code_homepage='http://www.molcas.org/', domain='dft', + parser_class_name='molcasparser.MolcasParser', + mainfile_contents_re=r'M O L C A S' + ), + LegacyParser( + name='parsers/onetep', code_name='ONETEP', code_homepage='https://www.onetep.org/', domain='dft', + parser_class_name='onetepparser.OnetepParser', + mainfile_contents_re=r'####### # # ####### ####### ####### ######' + ), + LegacyParser( + name='parsers/openkim', code_name='OpenKIM', domain='dft', + parser_class_name='openkimparser.OpenKIMParser', + mainfile_contents_re=r'OPENKIM' + ), + LegacyParser( + name='parsers/tinker', code_name='TINKER', domain='dft', + parser_class_name='tinkerparser.TinkerParser', + mainfile_contents_re=r'TINKER --- Software Tools for Molecular Design' + ), + LegacyParser( + name='parsers/lammps', code_name='lammps', domain='dft', + parser_class_name='lammpsparser.LammpsParser', + mainfile_contents_re=r'^LAMMPS' + ), + LegacyParser( + name='parsers/amber', code_name='Amber', domain='dft', + parser_class_name='amberparser.AMBERParser', + 
mainfile_contents_re=r'\s*Amber\s[0-9]+\s[A-Z]+\s*[0-9]+' + ), + LegacyParser( + name='parsers/gromacs', code_name='Gromacs', domain='dft', + parser_class_name='gromacsparser.GROMACSParser', + mainfile_contents_re=r'GROMACS - gmx mdrun' + ), + LegacyParser( + name='parsers/gromos', code_name='Gromos', domain='dft', + parser_class_name='gromosparser.GromosParser', + mainfile_contents_re=r'Bugreports to http://www.gromos.net' + ), + LegacyParser( + name='parsers/namd', code_name='Namd', domain='dft', + parser_class_name='namdparser.NamdParser', + mainfile_contents_re=r'\s*Info:\s*NAMD\s*[0-9.]+\s*for\s*', + mainfile_mime_re=r'text/.*', + ), + LegacyParser( + name='parsers/charmm', code_name='Charmm', domain='dft', + parser_class_name='charmmparser.CharmmParser', + mainfile_contents_re=r'\s*Chemistry\s*at\s*HARvard\s*Macromolecular\s*Mechanics\s*', + mainfile_mime_re=r'text/.*', + ), + LegacyParser( + name='parsers/dftbplus', code_name='DFTb plus', domain='dft', + parser_class_name='dftbplusparser.DFTBPlusParser', + mainfile_contents_re=r'^ Fermi distribution function\s*', + mainfile_mime_re=r'text/.*', + ), + LegacyParser( + name='parsers/asap', code_name='ASAP', domain='dft', + parser_class_name='asapparser.AsapParser', + mainfile_name_re=r'.*.traj$', + mainfile_mime_re=r'application/octet-stream', + ), + LegacyParser( + name='parsers/fplo', code_name='fplo', domain='dft', + parser_class_name='fploparser.FploParser', + mainfile_contents_re=r'\s*\|\s*FULL-POTENTIAL LOCAL-ORBITAL MINIMUM BASIS BANDSTRUCTURE CODE\s*\|\s*', + mainfile_mime_re=r'text/.*', + ), + LegacyParser( + name='parsers/mopac', code_name='MOPAC', domain='dft', + parser_class_name='mopacparser.MopacParser', + mainfile_contents_re=r'\s*\*\*\s*MOPAC\s*([0-9a-zA-Z]*)\s*\*\*\s*', + mainfile_mime_re=r'text/.*', + ) +] + +empty_parsers = [ + EmptyParser( + name='missing/octopus', code_name='Octopus', code_homepage='https://octopus-code.org/', + domain='dft', + mainfile_name_re=r'(inp)|(.*/inp)' + ), + 
EmptyParser( + name='missing/crystal', code_name='Crystal', code_homepage='https://www.crystal.unito.it/index.php', + domain='dft', + mainfile_name_re=r'.*\.cryst\.out' + ), + EmptyParser( + name='missing/wien2k', code_name='WIEN2k', code_homepage='http://www.wien2k.at/', + domain='dft', + mainfile_name_re=r'.*\.scf' + ), + EmptyParser( + name='missing/fhi-aims', code_name='FHI-aims', code_homepage='https://aimsclub.fhi-berlin.mpg.de/', + domain='dft', + mainfile_name_re=r'.*\.fhiaims' + ) +] + +if config.use_empty_parsers: + # There are some entries with PIDs that have mainfiles which do not match what + # the actual parsers expect. We use the EmptyParser to produce placeholder entries + # to keep the PIDs. These parsers will not match for new, non migrated data. + parsers.extend(empty_parsers) + +parsers.append(BrokenParser()) + +''' Instantiation and constructor based config of all parsers. ''' + +parser_dict = {parser.name: parser for parser in parsers + empty_parsers} # type: ignore +''' A dict to access parsers by name. Usually 'parsers/<...>', e.g. 'parsers/vasp'. 
''' + +# renamed parsers +parser_dict['parser/broken'] = parser_dict['parsers/broken'] +parser_dict['parser/fleur'] = parser_dict['parsers/fleur'] +parser_dict['parser/molcas'] = parser_dict['parsers/molcas'] +parser_dict['parser/octopus'] = parser_dict['parsers/octopus'] +parser_dict['parser/onetep'] = parser_dict['parsers/onetep'] + +# register code names as possible statistic value to the dft datamodel +code_names = sorted( + set([ + getattr(parser, 'code_name') + for parser in parsers + if parser.domain == 'dft' and getattr(parser, 'code_name', None) is not None and getattr(parser, 'code_name') != 'currupted mainfile']), + key=lambda code_name: code_name.lower()) +datamodel.DFTMetadata.code_name.a_search.statistic_values = code_names + [config.services.unavailable_value, config.services.not_processed_value] diff --git a/nomad/processing/data.py b/nomad/processing/data.py index 8e52799d21cd4d63eebe724b68acd1252eeed9e4..fee1f526bcf5cd759d8619b3592ac43ce26ed019 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -38,7 +38,8 @@ from structlog.processors import StackInfoRenderer, format_exc_info, TimeStamper from nomad import utils, config, infrastructure, search, datamodel from nomad.files import PathObject, UploadFiles, ExtractError, ArchiveBasedStagingUploadFiles, PublicUploadFiles, StagingUploadFiles from nomad.processing.base import Proc, process, task, PENDING, SUCCESS, FAILURE -from nomad.parsing import parser_dict, match_parser, Backend +from nomad.parsing import Backend +from nomad.parsing.parsers import parser_dict, match_parser from nomad.normalizing import normalizers from nomad.datamodel import EntryArchive from nomad.archive import query_archive @@ -1036,7 +1037,7 @@ class Upload(Proc): modified_upload = self._get_collection().find_one_and_update( {'_id': self.upload_id, 'joined': {'$ne': True}}, {'$set': {'joined': True}}) - if modified_upload['joined'] is False: + if modified_upload is None or modified_upload['joined'] is False: 
self.get_logger().info('join') # Before cleaning up, run an additional normalizer on phonon diff --git a/tests/parser_measurement.py b/tests/parser_measurement.py index e0cec2adac49c8dcca22cb1125ec0d590effa8c8..bc5905bb1cb2a39035889f2a24c21df7abd1c8a1 100644 --- a/tests/parser_measurement.py +++ b/tests/parser_measurement.py @@ -3,7 +3,7 @@ if __name__ == '__main__': import logging import time from nomad import config, utils - from nomad.parsing import parser_dict + from nomad.parsing.parsers import parser_dict from nomad.cli.parse import normalize_all from nomad.metainfo.legacy import LegacyMetainfoEnvironment from nomad.parsing.legacy import Backend diff --git a/tests/test_datamodel.py b/tests/test_datamodel.py index 54d46db39997c2a4ead0b5a2b9bb81e94dde7562..d60de974fe57b47cadd7bbc8fa8b5c35fda86e25 100644 --- a/tests/test_datamodel.py +++ b/tests/test_datamodel.py @@ -22,7 +22,8 @@ import datetime from ase.data import chemical_symbols from ase.spacegroup import Spacegroup -from nomad import datamodel, parsing, utils, files +from nomad import datamodel, utils, files +from nomad.parsing.parsers import parser_dict number_of = 20 @@ -37,7 +38,7 @@ systems = ['atom', 'molecule/cluster', '2D/surface', 'bulk'] comments = [gen.sentence() for _ in range(0, number_of)] references = [(i + 1, gen.url()) for i in range(0, number_of)] datasets = [(i + 1, gen.slug()) for i in range(0, number_of)] -codes = list(set([parser.code_name for parser in parsing.parser_dict.values() if hasattr(parser, 'code_name')])) # type: ignore +codes = list(set([parser.code_name for parser in parser_dict.values() if hasattr(parser, 'code_name')])) # type: ignore filepaths = ['/'.join(gen.url().split('/')[3:]) for _ in range(0, number_of)] low_numbers_for_atoms = [1, 1, 2, 2, 2, 2, 2, 3, 3, 4] diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 6f1271a647f72055729db7ae4a208e54da11d23d..d0b88a96b0df3cafd15b01f596c382c5b3c1244d 100644 --- a/tests/test_parsing.py +++ 
b/tests/test_parsing.py @@ -20,7 +20,8 @@ import os from shutil import copyfile from nomad import utils, files, datamodel -from nomad.parsing import parser_dict, match_parser, BrokenParser, BadContextUri, Backend +from nomad.parsing import BrokenParser, BadContextUri, Backend +from nomad.parsing.parsers import parser_dict, match_parser from nomad.app import dump_json from nomad.metainfo import MSection