From 26a9be3790c0a2de6dbd7a8ee1f04f431020a46e Mon Sep 17 00:00:00 2001
From: Amir Golparvar <amir.golparvar@physik.hu-berlin.de>
Date: Fri, 30 Aug 2024 14:53:09 +0000
Subject: [PATCH] Resolve "reading xlsx data in MatchingParser fails"

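For example, a parser plugin that should match a csv/Excel mainfile
on the columns of a given sheet can now use a matching spec along
these lines (a minimal sketch; the sheet and column names are only
placeholders mirroring the new test cases):

    # placeholder sheet/column names, see tests/parsing/test_parsing.py
    mainfile_contents_dict = {
        # match only if 'Sheet1' contains both columns
        'Sheet1': {'__has_all_keys': ['col1', 'col2']},
        # reserved key: rows starting with '#' are ignored when reading
        '__comment_symbol': '#',
    }
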
Changelog: Fixed
---
 nomad/config/models/plugins.py |  6 ++-
 nomad/parsing/parser.py        | 30 ++++---------
 tests/parsing/test_parsing.py  | 78 +++++++++++++++++++++++++++++++++-
 tests/parsing/test_tabular.py  | 28 ++++++++++++
 4 files changed, 119 insertions(+), 23 deletions(-)

diff --git a/nomad/config/models/plugins.py b/nomad/config/models/plugins.py
index 5bcb11b7b7..bd44ee0ddb 100644
--- a/nomad/config/models/plugins.py
+++ b/nomad/config/models/plugins.py
@@ -471,7 +471,11 @@ class Parser(PythonPluginBase):
     )
     mainfile_contents_dict: Optional[dict] = Field(
         description="""
-        Is used to match structured data files like JSON or HDF5.
+        Is used to match structured data files like JSON, HDF5, or csv/Excel files. For a csv/Excel file,
+        for example, to check whether certain columns exist in a given sheet, set this attribute to
+        `{'<sheet name>': {'__has_all_keys': [<column names>]}}`. If the csv/Excel file contains comments
+        that should be ignored, add the reserved key-value pair `'__comment_symbol': '<symbol>'` at the
+        top level of the dict, next to the <sheet name> entries.
     """
     )
     supported_compressions: List[str] = Field(
diff --git a/nomad/parsing/parser.py b/nomad/parsing/parser.py
index ba6cbce529..092d4ef508 100644
--- a/nomad/parsing/parser.py
+++ b/nomad/parsing/parser.py
@@ -340,7 +340,11 @@ class MatchingParser(Parser):
 
         if self._mainfile_contents_dict is not None:
             is_match = False
-            if mime.startswith('application/json') or mime.startswith('text/plain'):
+            if mime.startswith('application/json') or (
+                mime.startswith('text/plain')
+                # skip json.load for csv/xlsx files; they are read via read_table_data below
+                and not re.match(r'.+\.(?:csv|xlsx?)$', filename)
+            ):
                 try:
                     is_match = match(
                         self._mainfile_contents_dict, json.load(open(filename))
@@ -357,26 +361,10 @@ class MatchingParser(Parser):
                 from nomad.parsing.tabular import read_table_data
 
                 try:
-                    data: Dict[str, Any] = {}
-                    table_data = read_table_data(filename)
-                    for sheet_name in table_data:
-                        data.setdefault(sheet_name, {})
-                        nrow = 0
-                        for column_name in table_data[sheet_name][0].keys():
-                            column_values = list(
-                                table_data[sheet_name][0][column_name].values()
-                            )
-                            if not nrow:
-                                for n, row in enumerate(column_values):
-                                    if not str(row).strip().startswith('#'):
-                                        nrow = n
-                                        break
-                            if nrow:
-                                data[sheet_name][column_values[nrow]] = column_values[
-                                    nrow + 1 :
-                                ]
-                            else:
-                                data[column_name] = column_values
+                    # '__comment_symbol' is a reserved key, not part of the match spec
+                    comment = self._mainfile_contents_dict.pop('__comment_symbol', None)
+                    table_data = read_table_data(filename, comment=comment)[0]
+                    data = table_data.to_dict()
 
                     is_match = match(self._mainfile_contents_dict, data)
                 except Exception:
diff --git a/tests/parsing/test_parsing.py b/tests/parsing/test_parsing.py
index f784e4a246..4a99fc64d9 100644
--- a/tests/parsing/test_parsing.py
+++ b/tests/parsing/test_parsing.py
@@ -19,12 +19,13 @@
 import json
 import os
 from shutil import copyfile
+from unittest.mock import patch, MagicMock
 
 import pytest
 
 from nomad import files, utils
 from nomad.datamodel import EntryArchive
-from nomad.parsing import BrokenParser, MatchingParserInterface
+from nomad.parsing import BrokenParser, MatchingParserInterface, MatchingParser
 from nomad.parsing.parsers import match_parser, parser_dict, parsers, run_parser
 from nomad.utils import dump_json
 
@@ -321,6 +322,81 @@ def parser_in_dir(dir):
                     print(file_path, parser, 'SUCCESS')
 
 
+@pytest.fixture
+def matching_parser():
+    return MatchingParser(
+        mainfile_mime_re=r'application/json|application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+        mainfile_name_re=r'.*\.(json|xlsx)',
+    )
+
+
+@pytest.mark.parametrize(
+    'mime, mocked_dict, file_content, expected_result',
+    [
+        pytest.param(
+            'application/json',
+            {'__has_all_keys': ['key1', 'key2']},
+            {'key1': 'value1', 'key2': 'value2'},
+            True,
+            id='json_content_match',
+        ),
+        pytest.param(
+            'application/json',
+            {'__has_all_keys': ['key1', 'key2']},
+            {'key1': 'value1', 'key3': 'value3'},
+            False,
+            id='json_content_no_match',
+        ),
+        pytest.param(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            {'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
+            {'Sheet1': {'col1': 'value1', 'col2': 'value2'}},
+            True,
+            id='excel_content_match',
+        ),
+        pytest.param(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+            {'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
+            {'Sheet1': {'col1': 'value1', 'col3': 'value3'}},
+            False,
+            id='excel_content_no_match',
+        ),
+    ],
+)
+def test_is_mainfile_with_mocked_content(
+    matching_parser, mime, mocked_dict, file_content, expected_result
+):
+    """Test is_mainfile with mocked content (JSON, Excel)."""
+
+    matching_parser._mainfile_contents_dict = mocked_dict
+
+    with patch('builtins.open', new_callable=MagicMock) as mock_open:
+        if mime.startswith('application/json'):
+            with patch('json.load', return_value=file_content):
+                result = matching_parser.is_mainfile(
+                    filename='mocked_file.json',
+                    mime=mime,
+                    buffer=b'',
+                    decoded_buffer='',
+                )
+                assert result == expected_result
+
+        elif mime.startswith(
+            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+        ):
+            with patch(
+                'nomad.parsing.tabular.read_table_data',
+                return_value=(MagicMock(to_dict=lambda: file_content),),
+            ):
+                result = matching_parser.is_mainfile(
+                    filename='mocked_file.xlsx',
+                    mime=mime,
+                    buffer=b'',
+                    decoded_buffer='',
+                )
+                assert result == expected_result
+
+
 if __name__ == '__main__':
     import os
     import sys
diff --git a/tests/parsing/test_tabular.py b/tests/parsing/test_tabular.py
index 293c7aadb6..04af3aa492 100644
--- a/tests/parsing/test_tabular.py
+++ b/tests/parsing/test_tabular.py
@@ -15,7 +15,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from io import StringIO
+from unittest.mock import MagicMock, patch
 
+import pandas as pd
 import pytest
 import os
 import os.path
@@ -26,6 +29,7 @@ import yaml
 from nomad.config import config
 from nomad.datamodel.datamodel import EntryArchive, EntryMetadata
 from nomad.datamodel.context import ClientContext
+from nomad.parsing.tabular import read_table_data
 from nomad.utils import generate_entry_id, strip
 from nomad.parsing.parser import ArchiveParser
 from tests.normalizing.conftest import run_normalize
@@ -959,3 +963,27 @@ def get_archives(context, mainfile, upload_id, keys=None):
         child_archives = None
 
     return main_archive, child_archives
+
+
+def test_read_table_data_excel():
+    expected_data = {
+        0: {
+            'sheet_4': {
+                'Header 1': {0: 1, 1: 2, 2: 3},
+                'Header 2': {0: 'a', 1: 'b', 2: 'c'},
+                'Header 3': {0: 'test1', 1: 'test2', 2: 'test3'},
+                'Header 4': {0: 'my_str1', 1: 'my_str2', 2: 'my_str3'},
+                'Header 5': {0: 11, 1: 12, 2: 13},
+                'Header 6': {0: 'test11', 1: 'test12', 2: 'test13'},
+            }
+        }
+    }
+    excel_file = os.path.join(
+        os.path.dirname(__file__), '../../tests/data/parsers/tabular/Test.xlsx'
+    )
+
+    result_df = read_table_data(excel_file, comment='#')
+
+    result_dict = result_df.to_dict()
+
+    assert result_dict[0]['sheet_4'] == expected_data[0]['sheet_4']
-- 
GitLab