From af7cd8d147b670bf41b3c875f3854fd74ed8f28a Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 8 Oct 2024 13:40:50 +0200
Subject: [PATCH] fix whitespace removal rules to preserve ellipses

---
 convert-anystyle-data/lib/string.py     | 23 ++++++--
 convert-anystyle-data/tei-to-gold.ipynb | 75 +++++++++++++++++++++----
 2 files changed, 83 insertions(+), 15 deletions(-)

diff --git a/convert-anystyle-data/lib/string.py b/convert-anystyle-data/lib/string.py
index 413c493..7936a38 100644
--- a/convert-anystyle-data/lib/string.py
+++ b/convert-anystyle-data/lib/string.py
@@ -13,15 +13,19 @@ def prettify(xml_string, indentation="  "):
 
 
 def remove_whitespace(text):
-    # we need to remove the whitespace that comes with the pretty-printed xml
+    # we need to remove the whitespace that comes with the indentation of pretty-printed xml
     text = re.sub(r'\n *', ' ', text)
+
+    # reduce double spaces to one
     while re.search(r'  ', text):
         text = re.sub(r'  ', ' ', text)
+
+    # escape character sequences which would be corrupted by whitespace removal rules
+    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
+    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
+
     # fix issues with whitespace before and after punctuation
     text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
-    # remove duplicated punctuation, this comes from incorrect serializing
-    text = re.sub(r'\.\.', r'.', text)
-    text = re.sub(r'\. \.', r'.', text)
     # opening and closing punctutation, such as brackets
     text = re.sub(r'(\p{Ps}) ', r'\1', text)
     text = re.sub(r' (\p{Pe})', r'\1', text)
@@ -30,8 +34,16 @@ def remove_whitespace(text):
     text = re.sub(r' (\p{Pf})', r'\1', text)
     # slash
     text = re.sub(r' ?/ ?', r'/', text)
+
+    # restore sequences
+    text = text.replace('[!spaced_elipsis!]', r'. . .')
+    text = text.replace( '[!elipsis!]', '...')
     return text.strip()
 
+def fix_serialization_issues(text):
+    """Return text with doubled periods ("..", ". .") collapsed to one; ellipses ("...", ". . .") are left intact."""
+    # the lookarounds reject any pair of dots that belongs to a longer run, so only exactly doubled periods collapse
+    return re.sub(r'(?<!\.)(?<!\. )\. ?\.(?! ?\.)', '.', text)
 
 def tei_to_input(tei_xml_doc):
     """
@@ -81,6 +93,9 @@ def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_u
         # convert it to raw reference text
         tei_input_data = tei_to_input(tei_xml_doc)
 
+        # postprocess to fix issues that should be fixed in the source or in the converter code
+        tei_input_data = fix_serialization_issues(tei_input_data)
+
         # if an output path has been given, store the reconstructed raw references
         if refs_output_path:
             with open(f'{refs_output_path}/{id}.txt', 'w', encoding='utf-8') as w:
diff --git a/convert-anystyle-data/tei-to-gold.ipynb b/convert-anystyle-data/tei-to-gold.ipynb
index adeed2e..3aa2427 100644
--- a/convert-anystyle-data/tei-to-gold.ipynb
+++ b/convert-anystyle-data/tei-to-gold.ipynb
@@ -12,13 +12,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 1,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an\\n            amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can .\\n            . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a\\n            hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 14,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -27,43 +27,96 @@
     "from lxml import etree\n",
     "tree = etree.parse('./schema/gold_standard.xml')\n",
     "input_formatted = tree.find(\".//input[@type='formatted']\")\n",
-    "etree.tounicode(input_formatted, method='text').strip()"
+    "content_formatted = etree.tounicode(input_formatted, method='text').strip()\n",
+    "content_formatted"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:18:49.880708500Z",
-     "start_time": "2024-10-08T11:18:49.873127500Z"
+     "end_time": "2024-10-08T11:39:49.038045100Z",
+     "start_time": "2024-10-08T11:39:49.016104600Z"
     }
    },
    "id": "78f4e23b884790d"
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 2,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 13,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "input_raw = tree.find(\".//input[@type='raw']\")\n",
-    "etree.tounicode(input_raw, method='text').strip()"
+    "content_raw = etree.tounicode(input_raw, method='text').strip()\n",
+    "content_raw"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:18:42.693696400Z",
-     "start_time": "2024-10-08T11:18:42.668273100Z"
+     "end_time": "2024-10-08T11:39:51.237897800Z",
+     "start_time": "2024-10-08T11:39:51.223601700Z"
     }
    },
    "id": "cfb51a0fd0603503"
   },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from lib.string import remove_whitespace\n",
+    "remove_whitespace(content_formatted)"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-10-08T11:39:52.586602Z",
+     "start_time": "2024-10-08T11:39:52.564482200Z"
+    }
+   },
+   "id": "7c1c8d3783e36526"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "True"
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "content_raw == remove_whitespace(content_formatted) "
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2024-10-08T11:40:13.942921600Z",
+     "start_time": "2024-10-08T11:40:13.941151Z"
+    }
+   },
+   "id": "897eaa6d13d1a498"
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -72,7 +125,7 @@
    "metadata": {
     "collapsed": false
    },
-   "id": "897eaa6d13d1a498"
+   "id": "338af7ddf4cc739d"
   }
  ],
  "metadata": {
-- 
GitLab