diff --git a/convert-anystyle-data/lib/string.py b/convert-anystyle-data/lib/string.py index 7936a385dbe8fdea9800e9fd9e01de1d43255c15..9076058be9471eb8a115693f3ebbba883ba9cc90 100644 --- a/convert-anystyle-data/lib/string.py +++ b/convert-anystyle-data/lib/string.py @@ -22,7 +22,6 @@ def remove_whitespace(text): # escape character sequences which would be corrupted by whitespace removal rules text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text) - text = re.sub(r'\.\.\.', '[!elipsis!]', text) # fix issues with whitespace before and after punctuation text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text) @@ -37,13 +36,20 @@ def remove_whitespace(text): # restore sequences text = text.replace('[!spaced_elipsis!]', r'. . .') - text = text.replace( '[!elipsis!]', '...') return text.strip() def fix_serialization_issues(text): - # remove duplicated punctuation, todo this incorrectly shortens elipses ("...") - text = re.sub(r'\.\.', r'.', text) - text = re.sub(r'\. \.', r'.', text) # to do, only applies when + # escape character sequences + text = re.sub(r'\.\.\.', '[!elipsis!]', text) + text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text) + + # remove duplicated punctuation + text = re.sub(r'(\. ?){2}', r'.', text) + + # restore character sequences + text = text.replace( '[!elipsis!]', '...') + text = text.replace('[!spaced_elipsis!]', r'. . .') + return text.strip() def tei_to_input(tei_xml_doc): """ diff --git a/convert-anystyle-data/tei-to-gold.ipynb b/convert-anystyle-data/tei-to-gold.ipynb index 3aa2427a18e9620225f2ac79131f239167e052ce..7ab6a208dcf31f60a024a1004ba931e17b368e23 100644 --- a/convert-anystyle-data/tei-to-gold.ipynb +++ b/convert-anystyle-data/tei-to-gold.ipynb @@ -12,13 +12,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "outputs": [ { "data": { "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an\\n amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can .\\n . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a\\n hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'" }, - "execution_count": 1, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -33,21 +33,21 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-08T11:39:49.038045100Z", - "start_time": "2024-10-08T11:39:49.016104600Z" + "end_time": "2024-10-08T11:54:12.425330400Z", + "start_time": "2024-10-08T11:54:12.419195400Z" } }, "id": "78f4e23b884790d" }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "outputs": [ { "data": { "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'" }, - "execution_count": 2, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -60,21 +60,21 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-08T11:39:51.237897800Z", - "start_time": "2024-10-08T11:39:51.223601700Z" + "end_time": "2024-10-08T11:54:13.020336800Z", + "start_time": "2024-10-08T11:54:12.972616Z" } }, "id": "cfb51a0fd0603503" }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "outputs": [ { "data": { "text/plain": "'3 See R. Goff, ‘The Search for Principle’ (1983) Proceeedings of the British Academy 169, at 171. This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful thought whole departments of law can . . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.'" }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -86,33 +86,33 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-08T11:39:52.586602Z", - "start_time": "2024-10-08T11:39:52.564482200Z" + "end_time": "2024-10-08T11:54:13.526005800Z", + "start_time": "2024-10-08T11:54:13.498511400Z" } }, "id": "7c1c8d3783e36526" }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "outputs": [ { "data": { "text/plain": "True" }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "content_raw == remove_whitespace(content_formatted) " + "content_raw == remove_whitespace(content_formatted)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-08T11:40:13.942921600Z", - "start_time": "2024-10-08T11:40:13.941151Z" + "end_time": "2024-10-08T11:54:14.128130900Z", + "start_time": "2024-10-08T11:54:14.120356500Z" } }, "id": "897eaa6d13d1a498"