From 5cc3a9dbb2e37abdb01609eb590b1469d7889fcb Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 8 Oct 2024 13:54:45 +0200
Subject: [PATCH] fix postprocessing rules to preserve ellipses

---
 convert-anystyle-data/lib/string.py     | 16 ++++++++----
 convert-anystyle-data/tei-to-gold.ipynb | 34 ++++++++++++-------------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/convert-anystyle-data/lib/string.py b/convert-anystyle-data/lib/string.py
index 7936a38..9076058 100644
--- a/convert-anystyle-data/lib/string.py
+++ b/convert-anystyle-data/lib/string.py
@@ -22,7 +22,6 @@ def remove_whitespace(text):
 
     # escape character sequences which would be corrupted by whitespace removal rules
     text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
-    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
 
     # fix issues with whitespace before and after punctuation
     text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
@@ -37,13 +36,20 @@ def remove_whitespace(text):
 
     # restore sequences
     text = text.replace('[!spaced_elipsis!]', r'. . .')
-    text = text.replace( '[!elipsis!]', '...')
     return text.strip()
 
 def fix_serialization_issues(text):
-    # remove duplicated punctuation, todo this incorrectly shortens elipses ("...")
-    text = re.sub(r'\.\.', r'.', text)
-    text = re.sub(r'\. \.', r'.', text) # to do, only applies when
+    # escape character sequences
+    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
+    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
+
+    # collapse two consecutive periods (each optionally followed by a space) into one
+    text = re.sub(r'(\. ?){2}', r'.', text)
+
+    # restore character sequences
+    text = text.replace( '[!elipsis!]', '...')
+    text = text.replace('[!spaced_elipsis!]', r'. . .')
+    return text.strip()
 
 def tei_to_input(tei_xml_doc):
     """
diff --git a/convert-anystyle-data/tei-to-gold.ipynb b/convert-anystyle-data/tei-to-gold.ipynb
index 3aa2427..7ab6a20 100644
--- a/convert-anystyle-data/tei-to-gold.ipynb
+++ b/convert-anystyle-data/tei-to-gold.ipynb
@@ -12,13 +12,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an\\n            amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can .\\n            . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a\\n            hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 1,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -33,21 +33,21 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:39:49.038045100Z",
-     "start_time": "2024-10-08T11:39:49.016104600Z"
+     "end_time": "2024-10-08T11:54:12.425330400Z",
+     "start_time": "2024-10-08T11:54:12.419195400Z"
     }
    },
    "id": "78f4e23b884790d"
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 2,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -60,21 +60,21 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:39:51.237897800Z",
-     "start_time": "2024-10-08T11:39:51.223601700Z"
+     "end_time": "2024-10-08T11:54:13.020336800Z",
+     "start_time": "2024-10-08T11:54:12.972616Z"
     }
    },
    "id": "cfb51a0fd0603503"
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
    "outputs": [
     {
      "data": {
       "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'"
      },
-     "execution_count": 3,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -86,33 +86,33 @@
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:39:52.586602Z",
-     "start_time": "2024-10-08T11:39:52.564482200Z"
+     "end_time": "2024-10-08T11:54:13.526005800Z",
+     "start_time": "2024-10-08T11:54:13.498511400Z"
     }
    },
    "id": "7c1c8d3783e36526"
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "outputs": [
     {
      "data": {
       "text/plain": "True"
      },
-     "execution_count": 4,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "content_raw == remove_whitespace(content_formatted) "
+    "content_raw == remove_whitespace(content_formatted)"
    ],
    "metadata": {
     "collapsed": false,
     "ExecuteTime": {
-     "end_time": "2024-10-08T11:40:13.942921600Z",
-     "start_time": "2024-10-08T11:40:13.941151Z"
+     "end_time": "2024-10-08T11:54:14.128130900Z",
+     "start_time": "2024-10-08T11:54:14.120356500Z"
     }
    },
    "id": "897eaa6d13d1a498"
-- 
GitLab