Commit 61f4c44e authored by Markus Scheidgen

Some migration script adds and fixes. Fixed some parser matching regexps. Added bogus parsers for missing and broken files.
parent 48fe3d6e
Pipeline #48176 canceled with stages in 5 seconds
@@ -128,24 +128,107 @@ def missing_calcs_data():
results = utils.POPO(
no_package=[],
no_calcs=[],
not_migrated=[],
failed_packages=[],
missing_mainfile=[],
others=[])
# do not check these uploads
not_check_uploads = [
'ftp_upload_for_uid_125',
'ftp_upload_for_uid_290',
'ftp_upload_for_uid_502_2011-09-06-15-33-33-333221',
'ftp_upload_for_uid_502_2011-09-27-20-49-58-937390',
'ftp_upload_for_uid_502_2011-10-01-20-48-22-561661',
'ftp_upload_for_uid_502_2011-10-07-08-52-06-841358',
'ftp_upload_for_uid_502_2011-10-07-08-57-17-804213',
'ftp_upload_for_uid_502_2011-10-07-08-59-32-464608',
'ftp_upload_for_uid_502_2011-10-07-19-04-54-725186',
'ftp_upload_for_uid_502_2011-11-15-20-50-34-020718',
'ftp_upload_for_uid_502_2011-11-15-20-56-28-015287',
'ftp_upload_for_uid_502_2011-11-15-21-30-01-561680',
'ftp_upload_for_uid_502_2011-11-15-21-33-26-574967',
'ftp_upload_for_uid_502_2011-11-15-21-40-33-307359',
'ftp_upload_for_uid_502_2011-11-26-23-17-19-882290',
'ftp_upload_for_uid_502_2011-11-26-23-50-30-089143',
'ftp_upload_for_uid_502_2011-12-01-00-14-18-140240',
'ftp_upload_for_uid_502_2011-12-01-14-04-45-404271',
'ftp_upload_for_uid_502_2011-12-01-23-09-09-854328',
'ftp_upload_for_uid_502_2011-12-05-08-46-20-831174',
'ftp_upload_for_uid_502_2011-12-05-10-46-30-923923',
'ftp_upload_for_uid_502_2011-12-23-09-26-49-935721',
'ftp_upload_for_uid_502_2011-12-23-10-39-22-459271',
'ftp_upload_for_uid_502_2012-03-15-09-16-22-390174',
'ftp_upload_for_uid_502_2012-03-23-19-18-02-789330',
'ftp_upload_for_uid_502_2012-03-24-06-09-06-576223',
'ftp_upload_for_uid_502_2012-03-26-08-53-28-847937',
'ftp_upload_for_uid_502_2012-03-28-09-53-35-930264',
'ftp_upload_for_uid_502_2012-04-25-17-12-51-662156',
'ftp_upload_for_uid_502_2012-04-26-00-04-07-260381',
'ftp_upload_for_uid_502_2012-04-26-09-31-29-421336',
'ftp_upload_for_uid_502_2012-04-27-07-15-28-871403',
'ftp_upload_for_uid_502_2012-04-27-22-53-49-117894',
'ftp_upload_for_uid_502_2012-05-16-13-36-29-938929',
'ftp_upload_for_uid_502_2012-05-18-17-18-20-527193',
'ftp_upload_for_uid_502_2012-05-19-19-51-50-814160',
'ftp_upload_for_uid_502_2012-05-21-15-14-17-579123',
'ftp_upload_for_uid_502_2012-05-25-13-52-49-651647',
'ftp_upload_for_uid_502_2012-06-14-17-47-19-089204',
'ftp_upload_for_uid_502_2012-06-21-21-34-07-966108',
'ftp_upload_for_uid_502_2012-06-26-22-25-28-412879',
'ftp_upload_for_uid_502_2012-07-02-10-35-45-887222',
'ftp_upload_for_uid_502_2012-07-02-10-36-33-740348',
'ftp_upload_for_uid_502_2012-07-09-10-03-15-368689',
'ftp_upload_for_uid_502_2012-07-26-07-27-00-284225',
'ftp_upload_for_uid_502_2012-07-26-07-29-11-627501',
'ftp_upload_for_uid_502_2012-08-14-13-16-25-535995',
'ftp_upload_for_uid_502_2012-08-16-15-04-45-599710',
'ftp_upload_for_uid_502_2012-08-23-06-23-02-115869',
'ftp_upload_for_uid_502_2012-08-23-16-36-49-087908',
'ftp_upload_for_uid_502_2012-08-24-17-10-15-161628',
'ftp_upload_for_uid_502_2012-08-26-05-04-25-027012',
'ftp_upload_for_uid_502_2012-08-29-18-31-26-494251',
'ftp_upload_for_uid_502_2012-08-30-07-01-07-502171',
'ftp_upload_for_uid_502_2012-09-01-08-01-03-573873',
'ftp_upload_for_uid_502_2012-09-06-13-54-56-201039',
'ftp_upload_for_uid_502_2012-09-07-21-38-22-787875',
'ftp_upload_for_uid_502_2012-09-09-07-32-31-653109',
'ftp_upload_for_uid_502_2012-09-10-09-48-57-289279',
'ftp_upload_for_uid_502_2012-09-11-07-04-32-036763',
'ftp_upload_for_uid_502_2012-09-15-20-31-02-157060',
'ftp_upload_for_uid_502_2012-09-20-06-29-02-132434',
'ftp_upload_for_uid_502_2012-09-21-11-27-20-615773',
'ftp_upload_for_uid_502_2012-09-21-17-31-17-335523',
'ftp_upload_for_uid_502_2012-09-24-20-27-36-041292',
'ftp_upload_for_uid_502_2012-09-25-16-21-09-043610',
'ftp_upload_for_uid_502_2012-10-01-17-27-20-733800',
'ftp_upload_for_uid_502_2012-10-02-17-02-03-194493',
'ftp_upload_for_uid_502_2012-10-08-14-10-54-373136',
'ftp_upload_for_uid_502_2012-10-12-12-40-36-780644',
'ftp_upload_for_uid_502_2012-10-24-14-51-09-134377',
'ftp_upload_for_uid_502_2012-10-29-11-01-45-431034',
'ftp_upload_for_uid_502_2012-11-16-17-02-37-016199',
'ftp_upload_for_uid_502_2012-11-19-09-16-47-377264',
'ftp_upload_for_uid_502_2012-11-23-13-23-45-623620',
'ftp_upload_for_uid_502_2012-11-26-14-56-17-339064',
'ftp_upload_for_uid_502_2012-12-03-09-52-02-714224',
'ftp_upload_for_uid_502_2012-12-10-20-09-30-463926',
'ftp_upload_for_uid_502_2011-08-17-14-29-25-505869']
# aggregate missing calcs based on uploads
source_uploads = SourceCalc._get_collection().aggregate([
{'$match': {'migration_version': -1, 'upload': {'$ne': 'ftp_upload_for_uid_125'}}},
{'$group': {'_id': '$upload', 'mainfiles': {'$push': '$mainfile'}}}])
{'$match': {'migration_version': -1, 'upload': {'$nin':not_check_uploads}}},
{'$group': {'_id': '$upload', 'calcs': {'$push': { 'mainfile': '$mainfile', 'pid': '$metadata.pid'}}}}])
source_uploads = list(source_uploads)
for source_upload in source_uploads:
source_upload['mainfiles'] = sorted(source_upload['mainfiles'])
source_upload['calcs'] = sorted(source_upload['calcs'], key=lambda a: a['mainfile'])
source_uploads = [
utils.POPO(source_upload_id=d['_id'], mainfiles=d['mainfiles'])
utils.POPO(source_upload_id=d['_id'], calcs=d['calcs'])
for d in source_uploads]
source_uploads = sorted(source_uploads, key=lambda u: len(u.mainfiles))
source_uploads = sorted(source_uploads, key=lambda u: len(u.calcs))
# go through all problematic uploads
for source_upload in source_uploads:
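For orientation, a minimal standalone sketch of the new aggregation and the shape of the documents it yields (connection and collection name are assumptions made for illustration; in the script above the collection comes from SourceCalc._get_collection()):

import pymongo

# Assumed connection and collection name, for illustration only.
collection = pymongo.MongoClient()['coe_migration']['source_calc']
# Excerpt of the exclusion list above.
not_check_uploads = ['ftp_upload_for_uid_125', 'ftp_upload_for_uid_290']

pipeline = [
    # only calcs that have not been migrated and whose upload is not excluded
    {'$match': {'migration_version': -1, 'upload': {'$nin': not_check_uploads}}},
    # one result document per upload, carrying all its (mainfile, pid) pairs
    {'$group': {'_id': '$upload', 'calcs': {'$push': {'mainfile': '$mainfile', 'pid': '$metadata.pid'}}}}]

for doc in collection.aggregate(pipeline):
    # doc has the shape {'_id': '<upload id>', 'calcs': [{'mainfile': ..., 'pid': ...}, ...]}
    print(doc['_id'], len(doc['calcs']))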
@@ -156,9 +239,11 @@ def missing_calcs_data():
def cause(upload, **kwargs):
cause = dict(
source_upload_id=upload.source_upload_id, mainfiles=len(upload.mainfiles),
example_mainfile=upload.mainfiles[0],
**kwargs)
source_upload_id=upload.source_upload_id, calcs=len(upload.calcs),
example_mainfile=upload.calcs[0]['mainfile'],
example_pid=upload.calcs[0]['pid'])
cause.update(**kwargs)
return cause
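For reference, a call such as cause(source_upload, missing_mainfile=mainfile) produces a plain dict along these lines (all values are made up for illustration):

# Hypothetical example of the dict built by cause(); values are invented.
{
    'source_upload_id': 'ftp_upload_for_uid_502_2011-08-17-14-29-25-505869',
    'calcs': 12,
    'example_mainfile': 'some/upload/path/vasprun.xml',
    'example_pid': 123456,
    'missing_mainfile': 'some/upload/path/vasprun.xml'
}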
try:
@@ -198,26 +283,58 @@ def missing_calcs_data():
logger.debug('packages are processed')
# check if a mainfile does not exist in the package
try:
for mainfile in source_upload.mainfiles:
contained = False
for package in package_query():
with zipfile.ZipFile(package.package_path, 'r', allowZip64=True) as zf:
checkall = True
if checkall:
all_files = {}
for package in package_query():
with zipfile.ZipFile(package.package_path, 'r') as zf:
for path in zf.namelist():
all_files[path] = path
exist, not_exist = 0, 0
example_mainfile, example_exists_mainfile = '', ''
for calc in source_upload.calcs:
mainfile = calc['mainfile']
if mainfile in all_files:
exist += 1
example_exists_mainfile = mainfile
else:
not_exist += 1
example_mainfile = mainfile
example_pid = calc['pid']
if not_exist > 0:
results.missing_mainfile.append(cause(
source_upload,
missing=not_exist,
example_mainfile=example_mainfile,
example_pid=example_pid,
missing_but_exist=exist,
missing_but_exists_example=example_exists_mainfile))
continue
else:
try:
for calc in source_upload.calcs:
mainfile = calc['mainfile']
contained = False
for package in package_query():
try:
if zf.getinfo(mainfile) is not None:
contained = True
break
except KeyError:
pass
if not contained:
results.missing_mainfile.append(cause(source_upload, missing_mainfile=mainfile))
raise KeyError
# only check the first
break
except KeyError:
continue
with zipfile.ZipFile(package.package_path, 'r', allowZip64=True) as zf:
try:
if zf.getinfo(mainfile) is not None:
contained = True
break
except KeyError:
pass
except FileNotFoundError:
logger.info('cannot verify mainfile existence due to missing package data.')
if not contained:
results.missing_mainfile.append(cause(source_upload, missing_mainfile=mainfile))
raise KeyError
# only check the first
break
except KeyError:
continue
logger.debug('mainfiles do exist')
results.others.append(cause(source_upload))
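The checkall branch above boils down to collecting all archive members once and testing each mainfile against that set. A minimal sketch with hypothetical package paths and mainfiles (in the code above they come from the package and calc documents):

import zipfile

# Hypothetical inputs for illustration only.
package_paths = ['packages/upload-1.zip', 'packages/upload-2.zip']
mainfiles = ['upload/calc1/vasprun.xml', 'upload/calc2/OUTCAR']

all_files = set()
for package_path in package_paths:
    with zipfile.ZipFile(package_path, 'r') as zf:
        all_files.update(zf.namelist())

missing = [mainfile for mainfile in mainfiles if mainfile not in all_files]
print('%d of %d mainfiles are missing from the packages' % (len(missing), len(mainfiles)))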
@@ -225,6 +342,15 @@ def missing_calcs_data():
except Exception as e:
logger.error('exception while checking upload', exc_info=e)
summary = utils.POPO(overall_missing=0)
for key, values in results.items():
summary[key] = 0
for value in values:
summary[key] += value['calcs']
summary.overall_missing += value['calcs']
results.summary = summary
return results
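The summary loop simply tallies the per-cause calcs counts. A minimal sketch on plain dicts, assuming each cause entry carries the calcs count added in this commit:

# Hypothetical results data mirroring the structure built above.
results = {
    'no_calcs': [{'calcs': 3}, {'calcs': 1}],
    'missing_mainfile': [{'calcs': 7}]}

summary = {'overall_missing': 0}
for key, values in results.items():
    summary[key] = sum(value['calcs'] for value in values)
    summary['overall_missing'] += summary[key]

# summary == {'overall_missing': 11, 'no_calcs': 4, 'missing_mainfile': 7}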
@@ -66,7 +66,7 @@ import bz2
from nomad import files, config
from nomad.parsing.backend import AbstractParserBackend, LocalBackend, LegacyLocalBackend, JSONStreamWriter, BadContextURI, WrongContextState
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser
from nomad.parsing.parser import Parser, LegacyParser, VaspOutcarParser, BrokenParser, MissingParser
from nomad.parsing.artificial import TemplateParser, GenerateRandomParser, ChaosParser
@@ -138,11 +138,8 @@ parsers = [
LegacyParser(
name='parsers/exciting', code_name='exciting',
parser_class_name='excitingparser.ExcitingParser',
mainfile_name_re=r'^.*/INFO\.OUT?',
mainfile_contents_re=(
r'^\s*=================================================+\s*'
r'\s*\|\s*EXCITING\s+\S+\s+started\s*='
r'\s*\|\s*version hash id:\s*\S*\s*=')
mainfile_name_re=r'^.*.OUT?',
mainfile_contents_re=(r'EXCITING.*started')
),
LegacyParser(
name='parsers/fhi-aims', code_name='FHI-aims',
@@ -168,10 +165,9 @@ parsers = [
name='parsers/crystal', code_name='Crystal',
parser_class_name='crystalparser.CrystalParser',
mainfile_contents_re=(
r'\s*[\*]{22,}' # Looks for '*' 22 times or more in a row.
r'\s*\*\s{20,}\*' # Looks for a '*' sandwiched by whitespace.
r'(CRYSTAL\s*\n0 0 0)|('
r'\s*\*\s{10,}CRYSTAL(?P<majorVersion>[\d]+)\s{10,}\*'
r'\s*\*\s{10,}public \: (?P<minorVersion>[\d\.]+) \- .*\*'
r'\s*\*\s{10,}public \: (?P<minorVersion>[\d\.]+) \- .*\*)'
)
),
# The main contents regex of CPMD was causing a catastrophic backtracking issue
@@ -195,11 +191,7 @@ parsers = [
name='parsers/nwchem', code_name='NWChem',
parser_class_name='nwchemparser.NWChemParser',
mainfile_contents_re=(
r'\s+Northwest Computational Chemistry Package \(NWChem\) \d+\.\d+'
r'\s+------------------------------------------------------'
r'\s+Environmental Molecular Sciences Laboratory'
r'\s+Pacific Northwest National Laboratory'
r'\s+Richland, WA 99352'
r'Northwest Computational Chemistry Package \(NWChem\) (\d+\.)+\d+'
)
),
LegacyParser(
@@ -255,15 +247,16 @@ parsers = [
LegacyParser(
name='parsers/quantumespresso', code_name='Quantum Espresso',
parser_class_name='quantumespressoparser.QuantumEspressoParserPWSCF',
mainfile_contents_re=(
r'^\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+'
r'(\d+)\s*\))?\s+starts[^\n]+'
r'(?:\s*\n?)*This program is part of the open-source Quantum')
mainfile_contents_re=r'(Program PWSCF)|(This program is part of the open-source Quantum)'
# r'^(.*\n)*'
# r'\s*Program (\S+)\s+v\.(\S+)(?:\s+\(svn\s+rev\.\s+'
# r'(\d+)\s*\))?\s+starts[^\n]+'
# r'(?:\s*\n?)*This program is part of the open-source Quantum')
),
LegacyParser(
name='parsers/abinit', code_name='ABINIT',
parser_class_name='abinitparser.AbinitParser',
mainfile_contents_re=(r'^\n\.Version\s*[0-9.]*\s*of ABINIT\s*')
mainfile_contents_re=(r'^\n*\.Version\s*[0-9.]*\s*of ABINIT\s*')
),
LegacyParser(
name='parsers/orca', code_name='ORCA',
@@ -335,10 +328,7 @@ parsers = [
LegacyParser(
name='parsers/elk', code_name='elk',
parser_class_name='elkparser.ElkParser',
mainfile_contents_re=(
r'\s*\+-----------+\+\s*'
r'\s*\| Elk version (P?<version>[0-9.a-zA-Z]+) started \|\s*'
r'\s*\+----------+\+\s*')
mainfile_contents_re=r'\| Elk version [0-9.a-zA-Z]+ started \|'
),
LegacyParser(
name='parsers/elastic', code_name='elastic',
@@ -357,8 +347,7 @@ parsers = [
name='parsers/turbomole', code_name='turbomole',
parser_class_name='turbomoleparser.TurbomoleParser',
mainfile_contents_re=(
r'\s*(P?<progr>[a-zA-z0-9_]+)\s*(?:\([^()]+\))\s*:\s*TURBOMOLE\s*(P?<version>.*)'
r'\s*Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe')
r'Copyright \(C\) [0-9]+ TURBOMOLE GmbH, Karlsruhe')
),
LegacyParser(
name='parsers/skeleton', code_name='skeleton', domain='EMS',
@@ -378,7 +367,51 @@ parsers = [
parser_class_name='aptfimparser.APTFIMParserInterface',
mainfile_mime_re=r'(application/json)|(text/.*)',
mainfile_name_re=(r'.*.aptfim')
)
),
MissingParser(
name='parsers/qbox', code_name='qbox', domain='DFT',
mainfile_contents_re=(r'http://qboxcode.org')
),
MissingParser(
name='parsers/dmol', code_name='DMol3', domain='DFT',
mainfile_name_re=r'.*\.outmol'
),
MissingParser(
name='parser/fleur', code_name='fleur', domain='DFT',
mainfile_contents_re=r'This output is generated by fleur.'
),
MissingParser(
name='parser/molcas', code_name='MOLCAS', domain='DFT',
mainfile_contents_re=r'M O L C A S'
),
MissingParser(
name='parser/molcas', code_name='MOLCAS', domain='DFT',
mainfile_contents_re=r'####### # # ####### ####### ####### ######'
),
# These are supposedly octopus files, but they do not look like octopus files at all
MissingParser(
name='parser/octopus', code_name='Octopus', domain='DFT',
mainfile_name_re=r'(inp)|(.*/inp)'
),
# We already have crystal with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/crystal', code_name='Crystal',
parser_class_name='crystalparser.CrystalParser',
mainfile_name_re=r'.*\.cryst\.out'
),
# We already have wien2k with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/wien2k', code_name='WIEN2k',
parser_class_name='wien2kparser.Wien2kParser',
mainfile_name_re=r'.*\.scf'
),
# We already have fhi-aims with mainfile_contents_re, but this one does not always properly match
LegacyParser(
name='parsers/fhi-aims', code_name='FHI-aims',
parser_class_name='fhiaimsparser.FHIaimsParser',
mainfile_name_re=r'.*\.fhiaims'
),
BrokenParser()
]
""" Instantiation and constructor based config of all parsers. """
@@ -65,16 +65,43 @@ class Parser(metaclass=ABCMeta):
"""
class LegacyParser(Parser):
class BrokenParser(Parser):
"""
A parser implementation for legacy NOMAD-coe parsers. It assumes that parsers
are installed to the python environment. It
uses regular expressions to match parsers to mainfiles.
A parser implementation that just fails and is used to match mainfiles with known
patterns of corruption.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.name = 'parser/broken'
self.code_name = 'corrupted mainfile'
self._patterns = [
re.compile(r'^pid=[0-9]+'), # some 'mainfiles' contain a list of log-like information with pids
re.compile(r'^Can\'t open .* library:.*') # probably bad code runs
]
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
try: # Try to open the file as a string for regex matching.
decoded_buffer = buffer.decode('utf-8')
except UnicodeDecodeError:
# The file is binary even though a text mainfile is expected; treat it as corrupted
return True
else:
for pattern in self._patterns:
if pattern.search(decoded_buffer) is not None:
return True
return False
def run(self, mainfile: str, logger=None) -> LocalBackend:
raise Exception('Failed on purpose.')
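A minimal usage sketch of the matching logic above; BrokenParser is importable from nomad.parsing.parser as per the import change in this commit, the file names and buffers are made up:

from nomad.parsing.parser import BrokenParser

parser = BrokenParser()
print(parser.is_mainfile('log.txt', 'text/plain', b'pid=1234 some log output'))  # True, matches a known corruption pattern
print(parser.is_mainfile('data.raw', 'text/plain', b'\xff\xfe\x00\x01'))         # True, not utf-8 decodable although text is expected
print(parser.is_mainfile('out.txt', 'text/plain', b'a regular text output'))     # False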
class MatchingParser(Parser):
"""
A parser implementation that uses regular expressions to match mainfiles.
Arguments:
python_git: the git repository and commit that contains the legacy parser
parser_class_name: the main parser class that implements NOMAD-coe's
python-common *ParserInterface*. Instances of this class are currently not reused.
mainfile_mime_re: A regexp that is used to match against a file's mime type
mainfile_contents_re: A regexp that is used to match the first 1024 bytes of a
potential mainfile.
@@ -83,7 +110,7 @@ class LegacyParser(Parser):
supported_compressions: A list of [gz, bz2], if the parser supports compressed files
"""
def __init__(
self, name: str, code_name: str, parser_class_name: str,
self, name: str, code_name: str,
mainfile_contents_re: str = None,
mainfile_mime_re: str = r'text/.*',
mainfile_name_re: str = r'.*',
@@ -93,7 +120,6 @@ class LegacyParser(Parser):
super().__init__()
self.name = name
self.parser_class_name = parser_class_name
self.domain = domain
self._mainfile_mime_re = re.compile(mainfile_mime_re)
self._mainfile_name_re = re.compile(mainfile_name_re)
@@ -105,7 +131,6 @@ class LegacyParser(Parser):
self._supported_compressions = supported_compressions
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
if self._mainfile_contents_re is not None:
try: # Try to open the file as a string for regex matching.
decoded_buffer = buffer.decode('utf-8')
@@ -113,11 +138,39 @@ class LegacyParser(Parser):
return False # We're looking for a string match in a file that can't be converted to string.
if self._mainfile_contents_re.search(decoded_buffer) is None:
return False
return self._mainfile_mime_re.match(mime) is not None and \
self._mainfile_name_re.match(filename) is not None and \
(compression is None or compression in self._supported_compressions)
def __repr__(self):
return self.name
class MissingParser(MatchingParser):
"""
A parser implementation that just fails and is used to match mainfiles of codes
that are not yet supported by a parser.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def run(self, mainfile: str, logger=None) -> LocalBackend:
raise Exception('The code %s is not yet supported.' % self.code_name)
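A minimal sketch of how the DMol3 entry added to the parsers list would behave, assuming the constructor arguments shown above (file name and buffer are made up):

from nomad.parsing.parser import MissingParser

parser = MissingParser(
    name='parsers/dmol', code_name='DMol3', domain='DFT',
    mainfile_name_re=r'.*\.outmol')

print(parser.is_mainfile('calc/h2o.outmol', 'text/plain', b'some output text'))  # True, name and mime match
try:
    parser.run('calc/h2o.outmol')
except Exception as error:
    print(error)  # The code DMol3 is not yet supported.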
class LegacyParser(MatchingParser):
"""
A parser implementation for legacy NOMAD-coe parsers. It assumes that parsers
are installed to the python environment.
Arguments:
parser_class_name: the main parser class that implements NOMAD-coe's
python-common *ParserInterface*. Instances of this class are currently not reused.
"""
def __init__(self, parser_class_name: str, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.parser_class_name = parser_class_name
def run(self, mainfile: str, logger=None) -> LocalBackend:
# TODO we need a homogeneous interface to parsers, but we don't have it right now.
# There are some hacks to distinguish between ParserInterface parsers and simple_parser
@@ -148,9 +201,6 @@ class LegacyParser(Parser):
return backend
def __repr__(self):
return self.name
class VaspOutcarParser(LegacyParser):
"""
import pymongo
import zipfile
import sys
from nomad import parsing
client = pymongo.MongoClient()
packages = client['coe_migration']['package']
def check(upload_id, mainfile):
content = None
for package in packages.find(dict(upload_id=upload_id)):
package_path = package['package_path']
with zipfile.ZipFile(package_path, 'r') as zf:
try:
with zf.open(mainfile, 'r') as f:
content = f.read(5000)
except KeyError:
pass
if content is None:
print('mainfile does not exist')
sys.exit(1)
match = None
for parser in parsing.parsers:
if parser.is_mainfile(mainfile, 'text/plain', content, None):
match = parser
if match is None:
try:
print(content.decode('utf-8'))
except Exception:
print('not unicode decodable, probably a binary file')
return match is not None
import json
with open('local/missing_calcs_data.json') as f:
data = json.load(f)
for cause in data['others'] + data['no_calcs']:
if 'investigated_cause' not in cause and 'phonopy' not in cause['example_mainfile'] and not check(cause['source_upload_id'], cause['example_mainfile']):
input(cause)
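The check() helper can also be called directly for a single suspicious calculation, e.g. (upload id taken from the exclusion list above, mainfile path made up):

if check('ftp_upload_for_uid_502_2011-08-17-14-29-25-505869', 'some/upload/path/vasprun.xml'):
    print('at least one parser matches this mainfile')
else:
    print('no parser matches this mainfile')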