From 57bea61f687d2f323e05c4f5a0d79616f6476893 Mon Sep 17 00:00:00 2001 From: Markus Scheidgen <markus.scheidgen@gmail.com> Date: Fri, 13 Dec 2019 11:22:32 +0100 Subject: [PATCH] Added possible binary raw file decoding. --- nomad/config.py | 3 ++- nomad/parsing/__init__.py | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/nomad/config.py b/nomad/config.py index 02189a865e..46f49fdd16 100644 --- a/nomad/config.py +++ b/nomad/config.py @@ -148,7 +148,8 @@ services = NomadConfig( not_processed_value='not processed', unavailable_value='unavailable', https=False, - upload_limit=10 + upload_limit=10, + force_raw_file_decoding=False ) tests = NomadConfig( diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py index 39d0619e15..9b8383c752 100644 --- a/nomad/parsing/__init__.py +++ b/nomad/parsing/__init__.py @@ -126,21 +126,21 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile mime_type = magic.from_buffer(buffer, mime=True) decoded_buffer = None + encoding = None try: # Try to open the file as a string for regex matching. decoded_buffer = buffer.decode('utf-8') except UnicodeDecodeError: # This file is either binary or has wrong encoding encoding = encoding_magic.from_buffer(buffer) + + if config.services.force_raw_file_decoding: + encoding = 'iso-8859-1' + if encoding in ['iso-8859-1']: try: - with open(mainfile_path, 'rb') as binary_file: - content = binary_file.read().decode(encoding) decoded_buffer = buffer.decode(encoding) except Exception: pass - else: - with open(mainfile_path, 'wt') as text_file: - text_file.write(content) for parser in parsers: if strict and (isinstance(parser, MissingParser) or isinstance(parser, EmptyParser)): @@ -150,6 +150,17 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile continue if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression): + # potentially convert the file + if encoding in ['iso-8859-1']: + try: + with open(mainfile_path, 'rb') as binary_file: + content = binary_file.read().decode(encoding) + except Exception: + pass + else: + with open(mainfile_path, 'wt') as text_file: + text_file.write(content) + # TODO: deal with multiple possible parser specs return parser @@ -286,6 +297,7 @@ parsers = [ LegacyParser( name='parsers/gaussian', code_name='Gaussian', parser_class_name='gaussianparser.GaussianParser', + mainfile_mime_re=r'.*', mainfile_contents_re=( r'\s*Cite this work as:' r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9\.]*,') -- GitLab