Commit 3cf4d41a authored by Markus Scheidgen
Browse files

Improved matching with non-UTF-8 encoded files.

parent 0a85588d
......@@ -92,6 +92,8 @@ _compressions = {
b'\xfd\x37\x7a': ('xz', lzma.open)
}
encoding_magic = magic.Magic(mime_encoding=True)
def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFiles], strict=True) -> 'Parser':
"""
......@@ -122,6 +124,24 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
buffer = cf.read(config.parser_matching_size)
mime_type = magic.from_buffer(buffer, mime=True)
decoded_buffer = None
if 'text' in mime_type:
try: # Try to open the file as a string for regex matching.
decoded_buffer = buffer.decode('utf-8')
except UnicodeDecodeError:
# This file is either binary or has wrong encoding
encoding = encoding_magic.from_buffer(buffer)
if encoding in ['iso-8859-1']:
try:
with open(mainfile_path, 'rb') as binary_file:
content = binary_file.read().decode(encoding)
decoded_buffer = buffer.decode(encoding)
except Exception:
pass
else:
with open(mainfile_path, 'wt') as text_file:
text_file.write(content)
for parser in parsers:
if strict and (isinstance(parser, MissingParser) or isinstance(parser, EmptyParser)):
continue
......@@ -129,7 +149,7 @@ def match_parser(mainfile: str, upload_files: Union[str, files.StagingUploadFile
if parser.domain != config.domain:
continue
if parser.is_mainfile(mainfile_path, mime_type, buffer, compression):
if parser.is_mainfile(mainfile_path, mime_type, buffer, decoded_buffer, compression):
# TODO: deal with multiple possible parser specs
return parser
......
......@@ -72,7 +72,9 @@ class TemplateParser(ArtificalParser):
"""
name = 'parsers/template'
def is_mainfile(
        self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
        compression: str = None) -> bool:
    """
    Checks if the given file is a mainfile for the template parser.

    Only the file name is inspected: a file matches when its name ends with
    ``template.json``. The remaining arguments (``mime``, ``buffer``,
    ``decoded_buffer``, ``compression``) are part of the common
    ``is_mainfile`` signature and are not used here.

    Returns:
        True if ``filename`` ends with ``template.json``, False otherwise.
    """
    return filename.endswith('template.json')
def transform_value(self, name, value):
......@@ -140,7 +142,9 @@ class ChaosParser(ArtificalParser):
"""
name = 'parsers/chaos'
def is_mainfile(
        self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
        compression: str = None) -> bool:
    """
    Checks if the given file is a mainfile for the chaos parser.

    Only the file name is inspected: a file matches when its name ends with
    ``chaos.json``. The remaining arguments (``mime``, ``buffer``,
    ``decoded_buffer``, ``compression``) are part of the common
    ``is_mainfile`` signature and are not used here.

    Returns:
        True if ``filename`` ends with ``chaos.json``, False otherwise.
    """
    return filename.endswith('chaos.json')
def run(self, mainfile: str, logger=None) -> LocalBackend:
......@@ -202,7 +206,9 @@ class GenerateRandomParser(TemplateParser):
self.template = json.load(open(template_file, 'r'))
self.random = None
def is_mainfile(
        self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
        compression: str = None) -> bool:
    """
    Checks if the given file is a mainfile for the random-data parser.

    Only the file name is inspected: a file matches when the last component
    of its path starts with ``random_``. The remaining arguments (``mime``,
    ``buffer``, ``decoded_buffer``, ``compression``) are part of the common
    ``is_mainfile`` signature and are not used here.

    Returns:
        True if the base name of ``filename`` starts with ``random_``,
        False otherwise.
    """
    # Match on the base name so that directory names do not influence the result.
    return os.path.basename(filename).startswith('random_')
def transform_section(self, name, section):
......
......@@ -38,7 +38,9 @@ class Parser(metaclass=ABCMeta):
self.domain = 'DFT'
@abstractmethod
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
def is_mainfile(
self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
compression: str = None) -> bool:
"""
Checks if a file is a mainfile for the parsers.
......@@ -79,14 +81,11 @@ class BrokenParser(Parser):
re.compile(r'^Can\'t open .* library:.*') # probably bad code runs
]
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
def is_mainfile(
self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
compression: str = None) -> bool:
try: # Try to open the file as a string for regex matching.
decoded_buffer = buffer.decode('utf-8')
except UnicodeDecodeError:
# This file is binary, and should not be binary
pass
else:
if decoded_buffer is not None:
for pattern in self._patterns:
if pattern.search(decoded_buffer) is not None:
return True
......@@ -132,16 +131,18 @@ class MatchingParser(Parser):
self._mainfile_contents_re = None
self._supported_compressions = supported_compressions
def is_mainfile(self, filename: str, mime: str, buffer: bytes, compression: str = None) -> bool:
def is_mainfile(
self, filename: str, mime: str, buffer: bytes, decoded_buffer: str,
compression: str = None) -> bool:
if self._mainfile_binary_header is not None:
if self._mainfile_binary_header not in buffer:
return False
if self._mainfile_contents_re is not None:
try: # Try to open the file as a string for regex matching.
decoded_buffer = buffer.decode('utf-8')
except UnicodeDecodeError:
return False # We're looking for a string match in a file that can't be converted to string.
if self._mainfile_contents_re.search(decoded_buffer) is None:
if decoded_buffer is not None:
if self._mainfile_contents_re.search(decoded_buffer) is None:
return False
else:
return False
return self._mainfile_mime_re.match(mime) is not None and \
self._mainfile_name_re.fullmatch(filename) is not None and \
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -17,6 +17,7 @@ import json
import numpy as np
import pytest
import os
from shutil import copyfile
from nomad import utils, files
from nomad.parsing import JSONStreamWriter, parser_dict, match_parser, BrokenParser
......@@ -74,7 +75,7 @@ for parser, mainfile in parser_examples:
parser_examples = fixed_parser_examples
correct_num_output_files = 44
correct_num_output_files = 45
class TestLocalBackend(object):
......@@ -355,7 +356,14 @@ def test_broken_xml_vasp():
assert_parser_dir_unchanged(previous_wd, current_wd=os.getcwd())
def test_match(raw_files, no_warn):
@pytest.fixture(scope='function')
def with_latin_1_file(raw_files):
    """
    Provides the ``latin-1.out`` example inside the parser test data directory.

    The file is copied from ``tests/data`` to ``tests/data/parsers`` before the
    test body runs and the copy is removed again once the test has finished.
    """
    copyfile('tests/data/latin-1.out', 'tests/data/parsers/latin-1.out')
    # Hand control to the test; the line after the yield is the teardown.
    yield
    os.remove('tests/data/parsers/latin-1.out')
def test_match(raw_files, with_latin_1_file, no_warn):
example_upload_id = 'example_upload_id'
upload_files = files.StagingUploadFiles(example_upload_id, create=True, is_authorized=lambda: True)
upload_files.add_rawfiles('tests/data/parsers')
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment