From 57bea61f687d2f323e05c4f5a0d79616f6476893 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Fri, 13 Dec 2019 11:22:32 +0100
Subject: [PATCH] Added optional forced iso-8859-1 decoding of non-utf-8 raw files.

---
 nomad/config.py           |  3 ++-
 nomad/parsing/__init__.py | 22 +++++++++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/nomad/config.py b/nomad/config.py
index 02189a865e..46f49fdd16 100644
--- a/nomad/config.py
+++ b/nomad/config.py
@@ -148,7 +148,8 @@ services = NomadConfig(
     not_processed_value='not processed',
     unavailable_value='unavailable',
     https=False,
-    upload_limit=10
+    upload_limit=10,
+    force_raw_file_decoding=False
 )
 
 tests = NomadConfig(
diff --git a/nomad/parsing/__init__.py b/nomad/parsing/__init__.py
index 39d0619e15..9b8383c752 100644
--- a/nomad/parsing/__init__.py
+++ b/nomad/parsing/__init__.py
@@ -126,21 +126,21 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
     mime_type = magic.from_buffer(buffer, mime=True)
 
     decoded_buffer = None
+    encoding = None
     try:  # Try to open the file as a string for regex matching.
         decoded_buffer = buffer.decode('utf-8')
     except UnicodeDecodeError:
         # This file is either binary or has wrong encoding
         encoding = encoding_magic.from_buffer(buffer)
+
+        if config.services.force_raw_file_decoding:
+            encoding = 'iso-8859-1'
+
         if encoding in ['iso-8859-1']:
             try:
-                with open(mainfile_path, 'rb') as binary_file:
-                    content = binary_file.read().decode(encoding)
                 decoded_buffer = buffer.decode(encoding)
             except Exception:
                 pass
-            else:
-                with open(mainfile_path, 'wt') as text_file:
-                    text_file.write(content)
 
     for parser in parsers:
         if strict and (isinstance(parser, MissingParser) or isinstance(parser, EmptyParser)):
@@ -150,6 +150,17 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
             continue
 
         if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression):
+            # matched: re-write an iso-8859-1 (detected or forced) file in place as text
+            if encoding in ['iso-8859-1']:
+                try:
+                    with open(mainfile_path, 'rb') as binary_file:
+                        content = binary_file.read().decode(encoding)
+                except Exception:
+                    pass
+                else:
+                    with open(mainfile_path, 'wt') as text_file:
+                        text_file.write(content)
+
             # TODO: deal with multiple possible parser specs
             return parser
 
@@ -286,6 +297,7 @@ parsers = [
     LegacyParser(
         name='parsers/gaussian', code_name='Gaussian',
         parser_class_name='gaussianparser.GaussianParser',
+        mainfile_mime_re=r'.*',
         mainfile_contents_re=(
             r'\s*Cite this work as:'
             r'\s*Gaussian [0-9]+, Revision [A-Za-z0-9\.]*,')
-- 
GitLab