Commit d20c4a19 authored by Amir Golparvar

Merge branch '2114-reading-xlsx-data-in-matchingparser-fails' into 'develop'

Resolve "reading xlsx data in MatchingParser fails"

Closes #2114

See merge request !2065
parents 4253f870 26a9be37
@@ -471,7 +471,11 @@ class Parser(PythonPluginBase):
)
mainfile_contents_dict: Optional[dict] = Field(
description="""
Is used to match structured data files like JSON or HDF5.
Is used to match structured data files like JSON, HDF5, or CSV/Excel files. For a CSV/Excel file,
for example, to check whether certain columns exist in a given sheet, set this attribute to
`{'<sheet name>': {'__has_all_keys': [<column names>]}}`. If the CSV/Excel file contains comment
lines that should be ignored, add the reserved key-value pair `'__comment_symbol': '<symbol>'`
at the top level of the dict, alongside the `<sheet name>` entries.
"""
)
supported_compressions: List[str] = Field(
......
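As a hedged illustration of the docstring change above (the sheet and column names are hypothetical; only `__has_all_keys` and `__comment_symbol` are reserved keys), a matching spec for an Excel file could look like this:

# Hypothetical matching spec following the docstring above; 'Sheet1',
# 'temperature' and 'pressure' are placeholder names.
mainfile_contents_dict = {
    'Sheet1': {'__has_all_keys': ['temperature', 'pressure']},
    # rows starting with '#' in the csv/excel file are skipped while matching
    '__comment_symbol': '#',
}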
@@ -340,7 +340,11 @@ class MatchingParser(Parser):
if self._mainfile_contents_dict is not None:
is_match = False
if mime.startswith('application/json') or mime.startswith('text/plain'):
if (
mime.startswith('application/json')
or mime.startswith('text/plain')
and not re.match(r'.+\.(?:csv|xlsx?)$', filename)
):
try:
is_match = match(
self._mainfile_contents_dict, json.load(open(filename))
@@ -357,26 +361,10 @@
from nomad.parsing.tabular import read_table_data
try:
data: Dict[str, Any] = {}
table_data = read_table_data(filename)
for sheet_name in table_data:
data.setdefault(sheet_name, {})
nrow = 0
for column_name in table_data[sheet_name][0].keys():
column_values = list(
table_data[sheet_name][0][column_name].values()
)
if not nrow:
for n, row in enumerate(column_values):
if not str(row).strip().startswith('#'):
nrow = n
break
if nrow:
data[sheet_name][column_values[nrow]] = column_values[
nrow + 1 :
]
else:
data[column_name] = column_values
comment = self._mainfile_contents_dict.get('__comment_symbol', None)
self._mainfile_contents_dict.pop('__comment_symbol', None)
table_data = read_table_data(filename, comment=comment)[0]
data = table_data.to_dict()
is_match = match(self._mainfile_contents_dict, data)
except Exception:
......
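For readers following the change above: judging from the tests further down, `read_table_data(filename, comment=...)[0].to_dict()` yields a `{sheet: {column: {row: value}}}` mapping, which the generic `match` helper then checks against the spec. A rough standalone sketch of the `__has_all_keys` semantics (an illustration only, not nomad's actual `match` implementation):

from typing import Any, Dict

def has_all_keys_match(spec: Dict[str, Any], data: Dict[str, Any]) -> bool:
    """Simplified illustration of the '__has_all_keys' check.

    `spec` is the mainfile_contents_dict (with '__comment_symbol' already
    popped), `data` is the per-sheet column mapping read from the file.
    """
    for sheet_name, sheet_spec in spec.items():
        required = sheet_spec.get('__has_all_keys', [])
        columns = data.get(sheet_name, {})
        if not all(key in columns for key in required):
            return False
    return True

# has_all_keys_match({'Sheet1': {'__has_all_keys': ['col1']}},
#                    {'Sheet1': {'col1': {0: 1.0}}})  -> True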
@@ -19,12 +19,13 @@
import json
import os
from shutil import copyfile
from unittest.mock import patch, MagicMock
import pytest
from nomad import files, utils
from nomad.datamodel import EntryArchive
from nomad.parsing import BrokenParser, MatchingParserInterface
from nomad.parsing import BrokenParser, MatchingParserInterface, MatchingParser
from nomad.parsing.parsers import match_parser, parser_dict, parsers, run_parser
from nomad.utils import dump_json
@@ -321,6 +322,81 @@ def parser_in_dir(dir):
print(file_path, parser, 'SUCCESS')
@pytest.fixture
def matching_parser():
return MatchingParser(
mainfile_mime_re=r'application/json|application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
mainfile_name_re=r'.*\.(json|xlsx)',
)
@pytest.mark.parametrize(
'mime, mocked_dict, file_content, expected_result',
[
pytest.param(
'application/json',
{'__has_all_keys': ['key1', 'key2']},
{'key1': 'value1', 'key2': 'value2'},
True,
id='json_content_match',
),
pytest.param(
'application/json',
{'__has_all_keys': ['key1', 'key2']},
{'key1': 'value1', 'key3': 'value3'},
False,
id='json_content_no_match',
),
pytest.param(
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
{'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
{'Sheet1': {'col1': 'value1', 'col2': 'value2'}},
True,
id='excel_content_match',
),
pytest.param(
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
{'Sheet1': {'__has_all_keys': ['col1', 'col2']}, '__comment_symbol': '#'},
{'Sheet1': {'col1': 'value1', 'col3': 'value3'}},
False,
id='excel_content_no_match',
),
],
)
def test_is_mainfile_with_mocked_content(
matching_parser, mime, mocked_dict, file_content, expected_result
):
"""Test is_mainfile with mocked content (JSON, Excel)."""
matching_parser._mainfile_contents_dict = mocked_dict
with patch('builtins.open', new_callable=MagicMock) as mock_open:
if mime.startswith('application/json'):
with patch('json.load', return_value=file_content):
result = matching_parser.is_mainfile(
filename='mocked_file.json',
mime=mime,
buffer=b'',
decoded_buffer='',
)
assert result == expected_result
elif mime.startswith(
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
):
with patch(
'nomad.parsing.tabular.read_table_data',
return_value=(MagicMock(to_dict=lambda: file_content),),
):
result = matching_parser.is_mainfile(
filename='mocked_file.xlsx',
mime=mime,
buffer=b'',
decoded_buffer='',
)
assert result == expected_result
if __name__ == '__main__':
import os
import sys
......
@@ -15,7 +15,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from io import StringIO
from unittest.mock import MagicMock, patch
import pandas as pd
import pytest
import os
import os.path
@@ -26,6 +29,7 @@ import yaml
from nomad.config import config
from nomad.datamodel.datamodel import EntryArchive, EntryMetadata
from nomad.datamodel.context import ClientContext
from nomad.parsing.tabular import read_table_data
from nomad.utils import generate_entry_id, strip
from nomad.parsing.parser import ArchiveParser
from tests.normalizing.conftest import run_normalize
@@ -959,3 +963,27 @@ def get_archives(context, mainfile, upload_id, keys=None):
child_archives = None
return main_archive, child_archives
def test_read_table_data_excel():
expected_data = {
0: {
'sheet_4': {
'Header 1': {0: 1, 1: 2, 2: 3},
'Header 2': {0: 'a', 1: 'b', 2: 'c'},
'Header 3': {0: 'test1', 1: 'test2', 2: 'test3'},
'Header 4': {0: 'my_str1', 1: 'my_str2', 2: 'my_str3'},
'Header 5': {0: 11, 1: 12, 2: 13},
'Header 6': {0: 'test11', 1: 'test12', 2: 'test13'},
}
}
}
excel_file = os.path.join(
os.path.dirname(__file__), '../../tests/data/parsers/tabular/Test.xlsx'
)
result_df = read_table_data(excel_file, comment='#')
result_dict = result_df.to_dict()
assert result_dict[0]['sheet_4'] == expected_data[0]['sheet_4']
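As a side note on the `comment` argument exercised above (presumably forwarded by `read_table_data` to pandas' readers): pandas drops everything after the comment character, and lines that start with it are skipped entirely. A minimal standalone example, independent of the nomad Test.xlsx fixture:

from io import StringIO
import pandas as pd

csv_text = '# generated by instrument X\nHeader 1,Header 2\n1,a\n2,b\n'
df = pd.read_csv(StringIO(csv_text), comment='#')
print(df.to_dict())  # {'Header 1': {0: 1, 1: 2}, 'Header 2': {0: 'a', 1: 'b'}}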