From 26a9be3790c0a2de6dbd7a8ee1f04f431020a46e Mon Sep 17 00:00:00 2001
From: Amir Golparvar <amir.golparvar@physik.hu-berlin.de>
Date: Fri, 30 Aug 2024 14:53:09 +0000
Subject: [PATCH] Resolve "reading xlsx data in MatchingParser fails"

Changelog: Fixed
---
 nomad/config/models/plugins.py |  6 ++-
 nomad/parsing/parser.py        | 30 ++++---------
 tests/parsing/test_parsing.py  | 78 +++++++++++++++++++++++++++++++++-
 tests/parsing/test_tabular.py  | 28 ++++++++++++
 4 files changed, 119 insertions(+), 23 deletions(-)

diff --git a/nomad/config/models/plugins.py b/nomad/config/models/plugins.py
index 5bcb11b7b7..bd44ee0ddb 100644
--- a/nomad/config/models/plugins.py
+++ b/nomad/config/models/plugins.py
@@ -471,7 +471,11 @@ class Parser(PythonPluginBase):
     )
     mainfile_contents_dict: Optional[dict] = Field(
         description="""
-        Is used to match structured data files like JSON or HDF5.
+        Is used to match structured data files like JSON, HDF5, or CSV/Excel files. For a CSV/Excel file,
+        in order to check whether certain columns exist in a given sheet, set this attribute to
+        `{'<sheet name>': {'__has_all_keys': [<column names>]}}`. If the CSV/Excel file contains comments
+        that should be ignored, add the reserved key-value pair
+        `'__comment_symbol': '<symbol>'` at the top level of the dict, alongside the <sheet name> entries.
         """
     )
     supported_compressions: List[str] = Field(
diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py
index ba6cbce529..092d4ef508 100644
--- a/nomad/parsing/parser.py
+++ b/nomad/parsing/parser.py
@@ -340,7 +340,11 @@ class MatchingParser(Parser):
 
         if self._mainfile_contents_dict is not None:
             is_match = False
-            if mime.startswith('application/json') or mime.startswith('text/plain'):
+            if (
+                mime.startswith('application/json')
+                or mime.startswith('text/plain')
+                and not re.match(r'.+\.(?:csv|xlsx?)$', filename)
+            ):
                 try:
                     is_match = match(
                         self._mainfile_contents_dict, json.load(open(filename))
@@ -357,26 +361,10 @@
                 from nomad.parsing.tabular import read_table_data
 
                 try:
-                    data: Dict[str, Any] = {}
-                    table_data = read_table_data(filename)
-                    for sheet_name in table_data:
-                        data.setdefault(sheet_name, {})
-                        nrow = 0
-                        for column_name in table_data[sheet_name][0].keys():
-                            column_values = list(
-                                table_data[sheet_name][0][column_name].values()
-                            )
-                            if not nrow:
-                                for n, row in enumerate(column_values):
-                                    if not str(row).strip().startswith('#'):
-                                        nrow = n
-                                        break
-                            if nrow:
-                                data[sheet_name][column_values[nrow]] = column_values[
-                                    nrow + 1 :
-                                ]
-                            else:
-                                data[column_name] = column_values
+                    comment = self._mainfile_contents_dict.get('__comment_symbol', None)
+                    self._mainfile_contents_dict.pop('__comment_symbol', None)
+                    table_data = read_table_data(filename, comment=comment)[0]
+                    data = table_data.to_dict()
 
                     is_match = match(self._mainfile_contents_dict, data)
                 except Exception:
diff --git a/tests/parsing/test_parsing.py b/tests/parsing/test_parsing.py
index f784e4a246..4a99fc64d9 100644
--- a/tests/parsing/test_parsing.py
+++ b/tests/parsing/test_parsing.py
@@ -19,12 +19,13 @@
 import json
 import os
 from shutil import copyfile
+from unittest.mock import patch, MagicMock
 
 import pytest
 
 from nomad import files, utils
 from nomad.datamodel import EntryArchive
-from nomad.parsing import BrokenParser, MatchingParserInterface
+from nomad.parsing import BrokenParser, MatchingParserInterface, MatchingParser
 from nomad.parsing.parsers import match_parser, parser_dict, parsers, run_parser
 from nomad.utils import dump_json
 
@@ -321,6 +322,81 @@ def parser_in_dir(dir):
             print(file_path, parser, 'SUCCESS')
 
 
+@pytest.fixture
+def matching_parser():
+    return MatchingParser(
+        mainfile_mime_re=r'application/json|application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        mainfile_name_re=r'.*\.(json|xlsx)',
+    )
+
+
+@pytest.mark.parametrize(
+    'mime, mocked_dict, file_content, expected_result',
+    [
+        pytest.param(
+            'application/json',
+            {'__has_all_keys': ['key1', 'key2']},
+            {'key1': 'value1', 'key2': 'value2'},
+            True,
+            id='json_content_match',
+        ),
+        pytest.param(
+            'application/json',
+            {'__has_all_keys': ['key1', 'key2']},
+            {'key1': 'value1', 'key3': 'value3'},
+            False,
+            id='json_content_no_match',
+        ),
+        pytest.param(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            {'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
+            {'Sheet1': {'col1': 'value1', 'col2': 'value2'}},
+            True,
+            id='excel_content_match',
+        ),
+        pytest.param(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            {'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
+            {'Sheet1': {'col1': 'value1', 'col3': 'value3'}},
+            False,
+            id='excel_content_no_match',
+        ),
+    ],
+)
+def test_is_mainfile_with_mocked_content(
+    matching_parser, mime, mocked_dict, file_content, expected_result
+):
+    """Test is_mainfile with mocked content (JSON, Excel)."""
+
+    matching_parser._mainfile_contents_dict = mocked_dict
+
+    with patch('builtins.open', new_callable=MagicMock) as mock_open:
+        if mime.startswith('application/json'):
+            with patch('json.load', return_value=file_content):
+                result = matching_parser.is_mainfile(
+                    filename='mocked_file.json',
+                    mime=mime,
+                    buffer=b'',
+                    decoded_buffer='',
+                )
+                assert result == expected_result
+
+        elif mime.startswith(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+        ):
+            with patch(
+                'nomad.parsing.tabular.read_table_data',
+                return_value=(MagicMock(to_dict=lambda: file_content),),
+            ):
+                result = matching_parser.is_mainfile(
+                    filename='mocked_file.xlsx',
+                    mime=mime,
+                    buffer=b'',
+                    decoded_buffer='',
+                )
+                assert result == expected_result
+
+
 if __name__ == '__main__':
     import os
     import sys
diff --git a/tests/parsing/test_tabular.py b/tests/parsing/test_tabular.py
index 293c7aadb6..04af3aa492 100644
--- a/tests/parsing/test_tabular.py
+++ b/tests/parsing/test_tabular.py
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from io import StringIO
+from unittest.mock import MagicMock, patch
+import pandas as pd
 
 import pytest
 import os
 import os.path
@@ -26,6 +29,7 @@ import yaml
 from nomad.config import config
 from nomad.datamodel.datamodel import EntryArchive, EntryMetadata
 from nomad.datamodel.context import ClientContext
+from nomad.parsing.tabular import read_table_data
 from nomad.utils import generate_entry_id, strip
 from nomad.parsing.parser import ArchiveParser
 from tests.normalizing.conftest import run_normalize
@@ -959,3 +963,27 @@ def get_archives(context, mainfile, upload_id, keys=None):
         child_archives = None
 
     return main_archive, child_archives
+
+
+def test_read_table_data_excel():
+    expected_data = {
+        0: {
+            'sheet_4': {
+                'Header 1': {0: 1, 1: 2, 2: 3},
+                'Header 2': {0: 'a', 1: 'b', 2: 'c'},
+                'Header 3': {0: 'test1', 1: 'test2', 2: 'test3'},
+                'Header 4': {0: 'my_str1', 1: 'my_str2', 2: 'my_str3'},
+                'Header 5': {0: 11, 1: 12, 2: 13},
+                'Header 6': {0: 'test11', 1: 'test12', 2: 'test13'},
+            }
+        }
+    }
+    excel_file = os.path.join(
+        os.path.dirname(__file__), '../../tests/data/parsers/tabular/Test.xlsx'
+    )
+
+    result_df = read_table_data(excel_file, comment='#')
+
+    result_dict = result_df.to_dict()
+
+    assert result_dict[0]['sheet_4'] == expected_data[0]['sheet_4']
-- 
GitLab
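
Below is a minimal usage sketch (not part of the patch) showing how the matching dict described in the updated `mainfile_contents_dict` docstring is exercised against an Excel mainfile; it mirrors the new test fixture above. The file name 'data.xlsx', the sheet name 'Sheet1' and the column names are illustrative assumptions, not values taken from the change.

# Illustrative only: assumes an Excel file data.xlsx with a sheet 'Sheet1' exists locally.
from nomad.parsing.parser import MatchingParser

parser = MatchingParser(
    mainfile_mime_re=r'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    mainfile_name_re=r'.*\.xlsx',
)
# Reserved keys: '__has_all_keys' lists the columns a sheet must provide,
# '__comment_symbol' marks comment rows that read_table_data skips before matching.
parser._mainfile_contents_dict = {
    'Sheet1': {'__has_all_keys': ['Header 1', 'Header 2']},
    '__comment_symbol': '#',
}

is_match = parser.is_mainfile(
    filename='data.xlsx',
    mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    buffer=b'',
    decoded_buffer='',
)

Note that the patched is_mainfile pops '__comment_symbol' from the dict before matching, so the reserved key is never mistaken for a sheet name.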