From af7cd8d147b670bf41b3c875f3854fd74ed8f28a Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 8 Oct 2024 13:40:50 +0200
Subject: [PATCH] fix whitespace removal rules to preserve elipses

---
Reviewer notes (this section is ignored by git am):
- fix_serialization_issues() now returns the processed text. As submitted it
  had no return statement, so the new call in compare_input_strings() would
  have assigned None to tei_input_data.
- This copy was re-wrapped from a whitespace-collapsed paste. Leading
  indentation and runs of spaces inside lines (e.g. the regex literals in the
  "reduce double spaces" loop, and the spaces following "\\n" in the notebook
  output) are reconstructed or shown as received and are unverified; the
  index hashes predate the edit. Check against the repository before applying.

 convert-anystyle-data/lib/string.py     | 25 +++++++--
 convert-anystyle-data/tei-to-gold.ipynb | 75 +++++++++++++++++++++----
 2 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/convert-anystyle-data/lib/string.py b/convert-anystyle-data/lib/string.py
index 413c493..7936a38 100644
--- a/convert-anystyle-data/lib/string.py
+++ b/convert-anystyle-data/lib/string.py
@@ -13,15 +13,19 @@ def prettify(xml_string, indentation=" "):
 
 
 def remove_whitespace(text):
-    # we need to remove the whitespace that comes with the pretty-printed xml
+    # we need to remove the whitespace that comes with the indentation of pretty-printed xml
     text = re.sub(r'\n *', ' ', text)
+
+    # reduce double spaces to one
     while re.search(r' ', text):
         text = re.sub(r' ', ' ', text)
+
+    # escape character sequences which would be corrupted by whitespace removal rules
+    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
+    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
+
     # fix issues with whitespace before and after punctuation
     text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
-    # remove duplicated punctuation, this comes from incorrect serializing
-    text = re.sub(r'\.\.', r'.', text)
-    text = re.sub(r'\. \.', r'.', text)
     # opening and closing punctutation, such as brackets
     text = re.sub(r'(\p{Ps}) ', r'\1', text)
     text = re.sub(r' (\p{Pe})', r'\1', text)
@@ -30,8 +34,18 @@ def remove_whitespace(text):
     text = re.sub(r' (\p{Pf})', r'\1', text)
     # slash
     text = re.sub(r' ?/ ?', r'/', text)
+
+    # restore sequences
+    text = text.replace('[!spaced_elipsis!]', r'. . .')
+    text = text.replace( '[!elipsis!]', '...')
     return text.strip()
 
+def fix_serialization_issues(text):
+    """Collapse doubled periods ('..' and '. .') into one and return the resulting string."""
+    # remove duplicated punctuation, todo this incorrectly shortens ellipses ("...")
+    text = re.sub(r'\.\.', r'.', text)
+    text = re.sub(r'\. \.', r'.', text) # to do, only applies when
+    return text
 
 def tei_to_input(tei_xml_doc):
     """
@@ -81,6 +95,9 @@ def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_u
         # convert it to raw reference text
         tei_input_data = tei_to_input(tei_xml_doc)
 
+        # postprocess to fix issues that should be fixed in the source or in the converter code
+        tei_input_data = fix_serialization_issues(tei_input_data)
+
         # if an output path has been given, store the reconstructed raw references
         if refs_output_path:
             with open(f'{refs_output_path}/{id}.txt', 'w', encoding='utf-8') as w:
diff --git a/convert-anystyle-data/tei-to-gold.ipynb b/convert-anystyle-data/tei-to-gold.ipynb
index adeed2e..3aa2427 100644
--- a/convert-anystyle-data/tei-to-gold.ipynb
+++ b/convert-anystyle-data/tei-to-gold.ipynb
@@ -12,13 +12,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an\\n amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can .\\n . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a\\n hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 14,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -27,43 +27,96 @@
     "from lxml import etree\n",
     "tree = etree.parse('./schema/gold_standard.xml')\n",
     "input_formatted = tree.find(\".//input[@type='formatted']\")\n",
-    "etree.tounicode(input_formatted, method='text').strip()"
+    "content_formatted = etree.tounicode(input_formatted, method='text').strip()\n",
+    "content_formatted"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:18:49.880708500Z",
-     "start_time": "2024-10-08T11:18:49.873127500Z"
+     "end_time": "2024-10-08T11:39:49.038045100Z",
+     "start_time": "2024-10-08T11:39:49.016104600Z"
     }
    },
    "id": "78f4e23b884790d"
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 2,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 13,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "input_raw = tree.find(\".//input[@type='raw']\")\n",
-    "etree.tounicode(input_raw, method='text').strip()"
+    "content_raw = etree.tounicode(input_raw, method='text').strip()\n",
+    "content_raw"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:18:42.693696400Z",
-     "start_time": "2024-10-08T11:18:42.668273100Z"
+     "end_time": "2024-10-08T11:39:51.237897800Z",
+     "start_time": "2024-10-08T11:39:51.223601700Z"
     }
    },
    "id": "cfb51a0fd0603503"
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from lib.string import remove_whitespace\n",
+    "remove_whitespace(content_formatted)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-10-08T11:39:52.586602Z",
+     "start_time": "2024-10-08T11:39:52.564482200Z"
+    }
+   },
+   "id": "7c1c8d3783e36526"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "True"
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "content_raw == remove_whitespace(content_formatted) "
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-10-08T11:40:13.942921600Z",
+     "start_time": "2024-10-08T11:40:13.941151Z"
+    }
+   },
+   "id": "897eaa6d13d1a498"
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -72,7 +125,7 @@
    "metadata": {
     "collapsed": false
    },
-   "id": "897eaa6d13d1a498"
+   "id": "338af7ddf4cc739d"
   }
  ],
  "metadata": {
-- 
GitLab