Conversion to gold works

c0bb65d0 · Christian Boulanger · 1408e5df · c0bb65d0 · c0bb65d0 · c0bb65d0
Commit c0bb65d0 authored Oct 15, 2024 by Christian Boulanger
--- a/convert-anystyle-data/gold/10.1111_1467-6478.00057.xml
+++ b/convert-anystyle-data/gold/10.1111_1467-6478.00057.xml
--- a/convert-anystyle-data/gold/10.1111_1467-6478.00080.xml
+++ b/convert-anystyle-data/gold/10.1111_1467-6478.00080.xml
--- a/convert-anystyle-data/gold/10.1515_zfrs-1980-0103.xml
+++ b/convert-anystyle-data/gold/10.1515_zfrs-1980-0103.xml
--- a/convert-anystyle-data/gold/10.1515_zfrs-1980-0104.xml
+++ b/convert-anystyle-data/gold/10.1515_zfrs-1980-0104.xml
--- a/convert-anystyle-data/lib/convert.py
+++ b/convert-anystyle-data/lib/convert.py
@@ -2,6 +2,7 @@ import xml.etree.ElementTree as ET
 from lxml import etree
 import regex as re
 from nameparser import HumanName
+from lib.xml import remove_whitespace
 def even_num_brackets(string: str):
@@ -366,3 +367,33 @@ def anystyle_to_tei(input_xml_path, id, preserve=False):
    final_root = ET.fromstring(lxml_str)
    return ET.tostring(final_root, 'unicode')
+def tei_to_input(tei_xml_doc):
+    """
+    Extract the original reference strings from the <note> and <listBibl> elements in a given TEI document and return them as a list of strings
+    """
+    root = etree.fromstring(tei_xml_doc)
+    ref_list = []
+    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
+    # iterate over the <note type="footnote"> and <listBibl> elements
+    for xpath in ['*//tei:listBibl[@type="inText"]',
+                  '*//tei:note[@type="footnote"]',
+                  '*//tei:listBibl[@type="bibliography"]']:
+        for element in root.findall(xpath, ns):
+            localname = etree.QName(element).localname
+            type = element.attrib['type'] if 'type' in element.attrib else None
+            ref_parts = []
+            if 'n' in element.attrib:
+                ref_parts.append(element.attrib['n'])
+            # iterate over the <bibl> elements
+            for bibl in element.findall('tei:bibl', ns):
+                # extract the text without xml tags, still contains all (collapsed) whitespace
+                text = etree.tostring(bibl, method="text", encoding='utf-8').decode()
+                text = remove_whitespace(text)
+                ref_parts.append(text)
+            if localname == "listBibl" and type != "inText":
+                ref_list += ref_parts
+            elif localname == "note" or (localname == "listBibl" and type == "inText"):
+                ref_list.append(remove_whitespace(" ".join(ref_parts)))
+    return ref_list
\ No newline at end of file
--- a/convert-anystyle-data/lib/gold_standard.py
+++ b/convert-anystyle-data/lib/gold_standard.py
@@ -2,10 +2,12 @@ from lxml import etree
 import os
 from glob import glob
 from copy import deepcopy
-from .xml import remove_whitespace, prettify
 from IPython.display import display, Markdown
 from pathlib import Path
 from lxml import etree
+import regex as re
+import html
+import textwrap
 # namespaces
 tei_namespace = "http://www.tei-c.org/ns/1.0"
@@ -16,6 +18,110 @@ ns = {
 }
 id_attrib = f'{{{xml_namespace}}}id'
+def remove_whitespace(text):
+    # we need to remove the whitespace that comes with the indentation of pretty-printed xml
+    text = re.sub(r'\n\s*', ' ', text)
+    # reduce double spaces to one
+    while re.search(r'\s\s', text):
+        text = re.sub(r'\s\s', ' ', text)
+    # escape character sequences which would be corrupted by whitespace removal rules
+    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
+    # fix issues with whitespace before punctuation
+    text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
+    # opening and closing punctuation, such as brackets
+    text = re.sub(r'(\p{Ps}) ', r'\1', text)
+    text = re.sub(r' (\p{Pe})', r'\1', text)
+    # opening and closing quotes
+    text = re.sub(r'(\p{Pi}) ', r'\1', text)
+    text = re.sub(r' (\p{Pf})', r'\1', text)
+    # slash
+    text = re.sub(r' ?/ ?', r'/', text)
+    # restore sequences
+    text = text.replace('[!spaced_elipsis!]', r'. . .')
+    return text.strip()
+def indentation_level(element):
+    level = 0
+    while element is not None:
+        element = element.getparent()
+        level += 1
+    return level - 1 
+def fix_tail(elem: etree._Element, indentation="    "):
+    tail = elem.tail
+    # normalize line endings
+    tail = re.sub(r'\r\n', '\n', tail)
+    # fix issues with whitespace before punctuation
+    tail = re.sub(r'(\s*\n\s+)([.;,!?%:/])', r'\2', tail)
+    # opening and closing punctuation, such as brackets
+    tail = re.sub(r'(\p{Ps})(\s*\n\s+)', r'\1', tail)
+    tail = re.sub(r'(\s*\n\s+)(\p{Pe})', r'\2', tail)
+    # opening and closing quotes
+    tail = re.sub(r'(\p{Pi})(\s*\n\s+)', r'\1', tail)
+    tail = re.sub(r'(\s*\n\s+)(\p{Pf})', r'\2', tail)
+    # in tails without a line break but containing spaces, replace each space with a line break plus indentation
+    if '\n' not in tail and ' ' in tail:
+        p = tail.split(" ")
+        s = '\n' + indentation_level(elem) * indentation
+        tail = s.join(p) 
+    elem.tail = tail
+def prettify_content(elem: etree._Element, indentation="    ", width=120):
+    text = re.sub(r'\s+', ' ', elem.text, flags=re.MULTILINE)
+    lines = textwrap.wrap(text, width=width)
+    indent = indentation_level(elem) * indentation
+    lines = [ indent +  indentation + line for line in lines ]
+    lines.insert(0, "")
+    lines.append(indent)
+    elem.text = '\n'.join(lines)
+def fix_indentation(elem, level=0, indentation="    "):
+    indent = "\n" + level*indentation
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = indent + "  "
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = indent
+        for elem in elem:
+            fix_indentation(elem, level+1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = indent
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = indent
+def remove_encoding_declaration(xml_string):
+    return xml_string.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
+def prettify_gold(xml_doc: etree._Element, indentation="    "):
+    """The built-in prettification doesn't work well for XML containing TEI annotations. We need to prettify manually"""
+    # manually reindent the nodes
+    fix_indentation(xml_doc, indentation=indentation)
+    # wrap and indent formatted input
+    for elem in xml_doc.xpath(r'//input[@type="formatted"]'):
+        if type(elem.text) is str and elem.text != "": 
+            prettify_content(elem)
+    # fix the tail with incorrect whitespace    
+    for elem in xml_doc.xpath(r'//*'):
+        if type(elem.tail) is str and elem.tail != "": 
+            fix_tail(elem)
+    # stringify
+    xml_string = etree.tostring(xml_doc, pretty_print=True).decode()
+    # replace xml entities
+    xml_string = html.unescape(xml_string)
+    return xml_string
 def add_id_to_bibl(dir_path):
    for file_path in glob(f'{dir_path}/*.xml'):
        print(f' - Processing {file_path}')
@@ -30,9 +136,6 @@ def add_id_to_bibl(dir_path):
        with open(file_path, 'wb') as file:
            file.write(pretty_tree)
-def remove_encoding_declaration(xml_string):
-    return xml_string.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
 def find_biblstruct(bibl_struct_tree, id):
     # find correspondent biblStruct data
    xpath_expr= f"//tei:biblStruct[substring(@source, string-length(@source) - {len(id) - 1})='{id}']"
@@ -40,7 +143,7 @@ def find_biblstruct(bibl_struct_tree, id):
    if len(bibl_struct_matches) > 0:
        # found it - make a copy and remove unneeded attributes
        bibl_struct_copy = deepcopy(bibl_struct_matches[0])
-        if bibl_struct_copy.attrib['source']:
+        if 'source' in  bibl_struct_copy.attrib:
            del bibl_struct_copy.attrib['source']    
        # add it to target listBibl
        return bibl_struct_copy
@@ -48,7 +151,7 @@ def find_biblstruct(bibl_struct_tree, id):
        raise RuntimeError(f"Could not find matching biblStruct element with id '{id}'.")
-def create_gold_standard(bibl_content, biblstruct_content, verbose=False):
+def create_gold_standard(bibl_content, biblstruct_content, verbose=False, pretty=False):
    # get source trees and elements
    bibl_tree = etree.fromstring(remove_encoding_declaration(bibl_content))
    bibl_struct_tree = etree.fromstring(remove_encoding_declaration(biblstruct_content))
@@ -108,8 +211,10 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False):
                        raise RuntimeError(f"Missing 'xml:id' attribute for bibl element: {elem}")
                    bibl_id = elem.attrib[id_attrib]
                    biblstruct = find_biblstruct(bibl_struct_tree, bibl_id)           
-                    # add the biblStruct
+                    # add the biblStruct without xmlns
                    output_listBibl.append(biblstruct)
+                    if 'xmlns' in biblstruct.attrib:
+                        del biblstruct.attrib['xmlns']                        
                # add the element
                output_bibl_p.append(deepcopy(elem))
@@ -207,8 +312,9 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False):
    #         raise RuntimeError(f"Could not find matching biblStruct element with id '{bibl_n}'.")
    # display(Markdown("\n".join(process_log)))
-    return remove_encoding_declaration(prettify(etree.tostring(root)))
+    # serialize to a pretty xml string with the linebreak issues addressed
+    return prettify_gold(root)
 def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, verbose=False):
    for file_path in glob(f'{bibl_dir}/*.xml'):
@@ -228,7 +334,7 @@ def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, ver
            bibl_content = bibl_file.read()
            biblStruct_content = biblStruct_file.read()
-        output_data = create_gold_standard(bibl_content, biblStruct_content, verbose=verbose)
+        output_data = create_gold_standard(bibl_content, biblStruct_content, verbose=verbose, pretty=True)
        with open(biblstruct_gs_path, 'w', encoding='utf-8') as output_file:
            output_file.write(output_data)
\ No newline at end of file
--- a/convert-anystyle-data/lib/string.py
+++ b/convert-anystyle-data/lib/string.py
@@ -2,6 +2,7 @@ import glob
 import os
 from difflib import HtmlDiff
 from IPython.display import display, Markdown
+import regex as re
 def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_url, refs_output_path=None):
    os.makedirs(html_output_path, exist_ok=True)
@@ -40,4 +41,15 @@ def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_u
            f.write(html_diff)
            display(Markdown(f'Extracted and compared input data for {id}  ([See diff]({target_url}/{id}.diff.html))'))
-# %%
+def fix_serialization_issues(text):
+    # escape character sequences
+    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
+    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
+    # remove duplicated punctuation
+    text = re.sub(r'(\. ?){2}', r'.', text)
+    # restore character sequences
+    text = text.replace( '[!elipsis!]', '...')
+    text = text.replace('[!spaced_elipsis!]', r'. . .')
+    return text.strip()
\ No newline at end of file
--- a/convert-anystyle-data/lib/xml.py
+++ b/convert-anystyle-data/lib/xml.py
-import regex as re
-from lxml import etree
-from IPython.display import display, Markdown
-import xml.dom.minidom
-def prettify(xml_string, indentation="  "):
-    """Return a pretty-printed XML string"""
-    return xml.dom.minidom.parseString(xml_string).toprettyxml(indent=indentation)
-def remove_whitespace(text):
-    # we need to remove the whitespace that comes with the indentation of pretty-printed xml
-    text = re.sub(r'\n *', ' ', text)
-    # reduce double spaces to one
-    while re.search(r'  ', text):
-        text = re.sub(r'  ', ' ', text)
-    # escape character sequences which would be corrupted by whitespace removal rules
-    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
-    # fix issues with whitespace before and after punctuation
-    text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
-    # opening and closing punctutation, such as brackets
-    text = re.sub(r'(\p{Ps}) ', r'\1', text)
-    text = re.sub(r' (\p{Pe})', r'\1', text)
-    # opening and closing quotes
-    text = re.sub(r'(\p{Pi}) ', r'\1', text)
-    text = re.sub(r' (\p{Pf})', r'\1', text)
-    # slash
-    text = re.sub(r' ?/ ?', r'/', text)
-    # restore sequences
-    text = text.replace('[!spaced_elipsis!]', r'. . .')
-    return text.strip()
-def fix_serialization_issues(text):
-    # escape character sequences
-    text = re.sub(r'\.\.\.', '[!elipsis!]', text)
-    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
-    # remove duplicated punctuation
-    text = re.sub(r'(\. ?){2}', r'.', text)
-    # restore character sequences
-    text = text.replace( '[!elipsis!]', '...')
-    text = text.replace('[!spaced_elipsis!]', r'. . .')
-    return text.strip()
-def tei_to_input(tei_xml_doc):
-    """
-    Extract the original footnote strings from the <note> elements in a given TEI document and return a list of strings
-    """
-    root = etree.fromstring(tei_xml_doc)
-    ref_list = []
-    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
-    # iterate over the <note type="footnote"> and <listBibl> elements
-    for xpath in ['*//tei:listBibl[@type="inText"]',
-                  '*//tei:note[@type="footnote"]',
-                  '*//tei:listBibl[@type="bibliography"]']:
-        for element in root.findall(xpath, ns):
-            localname = etree.QName(element).localname
-            type = element.attrib['type'] if 'type' in element.attrib else None
-            ref_parts = []
-            if 'n' in element.attrib:
-                ref_parts.append(element.attrib['n'])
-            # iterate over the <bibl> elements
-            for bibl in element.findall('tei:bibl', ns):
-                # extract the text without xml tags, still contains all (collapsed) whitespace
-                text = etree.tostring(bibl, method="text", encoding='utf-8').decode()
-                text = remove_whitespace(text)
-                ref_parts.append(text)
-            if localname == "listBibl" and type != "inText":
-                ref_list += ref_parts
-            elif localname == "note" or (localname == "listBibl" and type == "inText"):
-                ref_list.append(remove_whitespace(" ".join(ref_parts)))
-    return ref_list
--- a/convert-anystyle-data/tei-bibl-corrected/10.1111_1467-6478.00057.xml
+++ b/convert-anystyle-data/tei-bibl-corrected/10.1111_1467-6478.00057.xml
@@ -56,8 +56,8 @@
                            <surname>Pateman</surname>
                        </persName>
                    </author>
-                    , ‘“
+                    , ‘
-                    <title level="a">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
+                    <title level="a">“Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
                    ’ (
                    <date>1979</date>
                    )


--- a/convert-anystyle-data/tei-bibl/10.1111_1467-6478.00057.xml
+++ b/convert-anystyle-data/tei-bibl/10.1111_1467-6478.00057.xml
@@ -56,8 +56,8 @@
              <surname>Pateman</surname>
            </persName>
          </surname>
-          , ‘“
+          , ‘
-          <title level="m">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
+          <title level="m">“Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
          ’ (
          <date>1979</date>
          )


--- a/convert-anystyle-data/tei-to-gold.ipynb
+++ b/convert-anystyle-data/tei-to-gold.ipynb
@@ -12,19 +12,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 1,
   "id": "2a90251a",
   "metadata": {},
-   "outputs": [
+   "outputs": [],
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
@@ -105,7 +96,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 4,
   "id": "2cc1a0d6",
   "metadata": {},
   "outputs": [
@@ -174,7 +165,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "base",
+   "display_name": "experiments",
   "language": "python",
   "name": "python3"
  },
@@ -188,7 +179,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.7"
  }
 },
 "nbformat": 4,


 %% Cell type:markdown id:a7894c78ec06bd10 tags:
 # Translate TEI/bibl to final gold standard schema
 %% Cell type:code id:2a90251a tags:
 ``` python
 %load_ext autoreload
 %autoreload 2
 ```
-%% Output
-    The autoreload extension is already loaded. To reload it, use:
-      %reload_ext autoreload
 %% Cell type:markdown id:bac6fffc tags:
 Add an `xml:id` attribute to all `bibl` elements so that they can be matched later.
 %% Cell type:code id:338af7ddf4cc739d tags:
 ``` python
 from lib.gold_standard import add_id_to_bibl
 add_id_to_bibl('./tei-bibl-corrected')
 ```
 %% Output
     - Processing ./tei-bibl-corrected\10.1111_1467-6478.00057.xml
     - Processing ./tei-bibl-corrected\10.1111_1467-6478.00080.xml
     - Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0103.xml
     - Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0104.xml
 %% Cell type:markdown id:f18f6515 tags:
 Create `biblStruct` from `bibl`:
 %% Cell type:code id:d39d9f75 tags:
 ``` python
 from lib.xslt import transform
 transform(xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',
          input_path='tei-bibl-corrected',
          output_path='tei-biblStruct',
          rename_extension=('-bibl_biblStruct.TEIP5.xml','.biblStruct.xml')).stderr
 ```
 %% Output
    Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei-bibl-corrected and saved result in tei-biblStruct.
    ''
 %% Cell type:code id:2cc1a0d6 tags:
 ``` python
 from lib.gold_standard import create_all_gold_standards
 create_all_gold_standards('tei-bibl-corrected',
                          'tei-biblStruct',
                          'gold',
                          verbose=False)
 ```
 %% Output
    ### Processing 10.1111_1467-6478.00057
    Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00057.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00057.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00057.xml)
    ### Processing 10.1111_1467-6478.00080
    Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00080.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00080.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00080.xml)
    ### Processing 10.1515_zfrs-1980-0103
    Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0103.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0103.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0103.xml)
    ### Processing 10.1515_zfrs-1980-0104
    Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0104.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0104.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0104.xml)