Skip to content
Snippets Groups Projects
Commit c0bb65d0 authored by Christian Boulanger's avatar Christian Boulanger
Browse files

Conversion to gold works

parent 1408e5df
No related branches found
No related tags found
No related merge requests found
Showing with 13050 additions and 31403 deletions
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -2,6 +2,7 @@ import xml.etree.ElementTree as ET ...@@ -2,6 +2,7 @@ import xml.etree.ElementTree as ET
from lxml import etree from lxml import etree
import regex as re import regex as re
from nameparser import HumanName from nameparser import HumanName
from lib.xml import remove_whitespace
def even_num_brackets(string: str): def even_num_brackets(string: str):
...@@ -366,3 +367,33 @@ def anystyle_to_tei(input_xml_path, id, preserve=False): ...@@ -366,3 +367,33 @@ def anystyle_to_tei(input_xml_path, id, preserve=False):
final_root = ET.fromstring(lxml_str) final_root = ET.fromstring(lxml_str)
return ET.tostring(final_root, 'unicode') return ET.tostring(final_root, 'unicode')
def tei_to_input(tei_xml_doc):
    """
    Return the reference strings contained in a TEI document as a list.

    The document is searched for in-text ``<listBibl>`` elements, footnote
    ``<note>`` elements and bibliography ``<listBibl>`` elements (in that
    order). For each match, the optional ``n`` attribute and the
    whitespace-normalized text of every direct ``<bibl>`` child are collected.
    Bibliography entries are returned one string per ``<bibl>``; footnotes and
    in-text lists are joined into a single string per container element.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    container_xpaths = ['*//tei:listBibl[@type="inText"]',
                        '*//tei:note[@type="footnote"]',
                        '*//tei:listBibl[@type="bibliography"]']
    root = etree.fromstring(tei_xml_doc)
    ref_list = []
    for xpath in container_xpaths:
        for container in root.findall(xpath, ns):
            tag_name = etree.QName(container).localname
            kind = container.attrib.get('type')
            # the footnote / list number, if present, leads the reference string
            pieces = [container.attrib['n']] if 'n' in container.attrib else []
            for bibl in container.findall('tei:bibl', ns):
                # serialize as plain text (tags dropped), then normalize whitespace
                raw = etree.tostring(bibl, method="text", encoding='utf-8').decode()
                pieces.append(remove_whitespace(raw))
            if tag_name == "listBibl" and kind != "inText":
                # bibliography: every <bibl> is a reference of its own
                ref_list.extend(pieces)
            elif tag_name == "note" or (tag_name == "listBibl" and kind == "inText"):
                # footnote / in-text list: one combined string per container
                ref_list.append(remove_whitespace(" ".join(pieces)))
    return ref_list
\ No newline at end of file
...@@ -2,10 +2,12 @@ from lxml import etree ...@@ -2,10 +2,12 @@ from lxml import etree
import os import os
from glob import glob from glob import glob
from copy import deepcopy from copy import deepcopy
from .xml import remove_whitespace, prettify
from IPython.display import display, Markdown from IPython.display import display, Markdown
from pathlib import Path from pathlib import Path
from lxml import etree from lxml import etree
import regex as re
import html
import textwrap
# namespaces # namespaces
tei_namespace = "http://www.tei-c.org/ns/1.0" tei_namespace = "http://www.tei-c.org/ns/1.0"
...@@ -16,6 +18,110 @@ ns = { ...@@ -16,6 +18,110 @@ ns = {
} }
id_attrib = f'{{{xml_namespace}}}id' id_attrib = f'{{{xml_namespace}}}id'
def remove_whitespace(text):
    """
    Normalize the whitespace of text extracted from pretty-printed XML.

    Line breaks with their indentation and runs of whitespace are collapsed to
    a single space; spaces before punctuation, inside brackets and quotes, and
    around slashes are removed. A spaced ellipsis (". . .") is protected from
    these rules. The result is stripped of leading and trailing whitespace.
    """
    ellipsis_token = '[!spaced_elipsis!]'
    # drop the line breaks and indentation that come with pretty-printed xml
    text = re.sub(r'\n\s*', ' ', text)
    # collapse every run of two or more whitespace characters to one space
    text = re.sub(r'\s{2,}', ' ', text)
    # protect sequences which the punctuation rules below would corrupt
    text = re.sub(r'\. \. \.', ellipsis_token, text)
    cleanup_rules = (
        # whitespace before punctuation
        (r' ([.;,!?%:])( |$)', r'\1 '),
        # opening and closing punctuation, such as brackets
        (r'(\p{Ps}) ', r'\1'),
        (r' (\p{Pe})', r'\1'),
        # opening and closing quotes
        (r'(\p{Pi}) ', r'\1'),
        (r' (\p{Pf})', r'\1'),
        # slash
        (r' ?/ ?', r'/'),
    )
    for pattern, replacement in cleanup_rules:
        text = re.sub(pattern, replacement, text)
    # restore the protected sequences
    return text.replace(ellipsis_token, r'. . .').strip()
def indentation_level(element):
    """
    Return the nesting depth of ``element``: 0 for a node without a parent,
    1 for its children, and so on. Returns -1 when ``element`` is None.
    """
    depth = -1
    node = element
    # walk up the parent chain, counting the node itself and each ancestor
    while node is not None:
        depth += 1
        node = node.getparent()
    return depth
def fix_tail(elem: etree._Element, indentation=" "):
    """
    Clean up the whitespace in the tail text of ``elem`` in place.

    Line breaks (with surrounding whitespace) that separate punctuation,
    brackets or quotes from the adjacent content are removed. A tail that
    contains spaces but no line break gets its spaces turned into a line
    break followed by the indentation matching the element's depth.
    ``elem.tail`` must be a string.
    """
    # normalize line endings
    tail = re.sub(r'\r\n', '\n', elem.tail)
    line_break_rules = (
        # line break before punctuation
        (r'(\s*\n\s+)([.;,!?%:/])', r'\2'),
        # opening and closing punctuation, such as brackets
        (r'(\p{Ps})(\s*\n\s+)', r'\1'),
        (r'(\s*\n\s+)(\p{Pe})', r'\2'),
        # opening and closing quotes
        (r'(\p{Pi})(\s*\n\s+)', r'\1'),
        (r'(\s*\n\s+)(\p{Pf})', r'\2'),
    )
    for pattern, replacement in line_break_rules:
        tail = re.sub(pattern, replacement, tail)
    if '\n' not in tail and ' ' in tail:
        # NOTE(review): every space in the tail is replaced here, although the
        # original comment spoke of "the last whitespace" only - confirm intent
        line_break = '\n' + indentation_level(elem) * indentation
        tail = tail.replace(" ", line_break)
    elem.tail = tail
def prettify_content(elem: etree._Element, indentation=" ", width=120):
    """
    Re-wrap the text of ``elem`` in place.

    All whitespace runs are collapsed to single spaces and the text is wrapped
    to ``width`` columns. Each line is indented one step deeper than the
    element itself, with a leading line break and a final line holding the
    element's own indentation. ``elem.text`` must be a string.
    """
    collapsed = re.sub(r'\s+', ' ', elem.text, flags=re.MULTILINE)
    wrapped = textwrap.wrap(collapsed, width=width)
    own_indent = indentation_level(elem) * indentation
    content_indent = own_indent + indentation
    # the empty first entry yields the leading line break, the last entry
    # positions the closing tag at the element's own indentation
    pieces = [""] + [content_indent + line for line in wrapped] + [own_indent]
    elem.text = '\n'.join(pieces)
def fix_indentation(elem, level=0, indentation=" "):
    """
    Recursively re-indent an element tree in place.

    Only whitespace-only (or missing) ``text`` and ``tail`` values are
    rewritten, so textual content is never altered.

    :param elem: the element to indent (lxml or ElementTree element)
    :param level: nesting depth of ``elem``; 0 for the root of the subtree
    :param indentation: the string used for one indentation step; it is
        applied consistently at every depth
    """
    indent = "\n" + level * indentation
    if len(elem):
        if not elem.text or not elem.text.strip():
            # the first child starts one step deeper than the element itself
            elem.text = indent + indentation
        if not elem.tail or not elem.tail.strip():
            elem.tail = indent
        for child in elem:
            # hand the indentation string down, so that nested levels do not
            # silently fall back to the default
            fix_indentation(child, level + 1, indentation)
        # the closing tag of elem follows the tail of its last child
        last_child = elem[-1]
        if not last_child.tail or not last_child.tail.strip():
            last_child.tail = indent
    elif level and (not elem.tail or not elem.tail.strip()):
        elem.tail = indent
def remove_encoding_declaration(xml_string):
    """
    Return ``xml_string`` with every occurrence of the literal declaration
    ``<?xml version="1.0" encoding="UTF-8"?>`` removed. Declarations written
    differently (other quotes, encoding or spacing) are left untouched.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return "".join(xml_string.split(declaration))
def prettify_gold(xml_doc: etree._Element, indentation=" "):
    """
    Serialize ``xml_doc`` to a pretty-printed string.

    The built-in prettification doesn't work well for XML containing TEI
    annotations, so the tree is re-indented manually (in place) first.

    :param xml_doc: root element of the document; it is modified in place
    :param indentation: the string used for one indentation step; it is
        applied to all three formatting passes
    :return: the serialized document with character references resolved
    """
    # manually reindent the nodes
    fix_indentation(xml_doc, indentation=indentation)
    # wrap and indent formatted input
    for elem in xml_doc.xpath(r'//input[@type="formatted"]'):
        if elem.text:
            prettify_content(elem, indentation=indentation)
    # fix the tails with incorrect whitespace
    for elem in xml_doc.xpath(r'//*'):
        if elem.tail:
            fix_tail(elem, indentation=indentation)
    # stringify
    xml_string = etree.tostring(xml_doc, pretty_print=True).decode()
    # replace xml entities
    # NOTE(review): html.unescape also resolves &amp;, &lt; and &gt;, so the
    # result is no longer well-formed XML if the text contains those
    # characters - confirm that this cannot occur in the gold data
    return html.unescape(xml_string)
def add_id_to_bibl(dir_path): def add_id_to_bibl(dir_path):
for file_path in glob(f'{dir_path}/*.xml'): for file_path in glob(f'{dir_path}/*.xml'):
print(f' - Processing {file_path}') print(f' - Processing {file_path}')
...@@ -30,9 +136,6 @@ def add_id_to_bibl(dir_path): ...@@ -30,9 +136,6 @@ def add_id_to_bibl(dir_path):
with open(file_path, 'wb') as file: with open(file_path, 'wb') as file:
file.write(pretty_tree) file.write(pretty_tree)
def remove_encoding_declaration(xml_string):
    """
    Return ``xml_string`` with every occurrence of the literal declaration
    ``<?xml version="1.0" encoding="UTF-8"?>`` removed. Declarations written
    differently (other quotes, encoding or spacing) are left untouched.
    """
    declaration = '<?xml version="1.0" encoding="UTF-8"?>'
    return "".join(xml_string.split(declaration))
def find_biblstruct(bibl_struct_tree, id): def find_biblstruct(bibl_struct_tree, id):
# find correspondent biblStruct data # find correspondent biblStruct data
xpath_expr= f"//tei:biblStruct[substring(@source, string-length(@source) - {len(id) - 1})='{id}']" xpath_expr= f"//tei:biblStruct[substring(@source, string-length(@source) - {len(id) - 1})='{id}']"
...@@ -40,7 +143,7 @@ def find_biblstruct(bibl_struct_tree, id): ...@@ -40,7 +143,7 @@ def find_biblstruct(bibl_struct_tree, id):
if len(bibl_struct_matches) > 0: if len(bibl_struct_matches) > 0:
# found it - make a copy and remove unneeded attributes # found it - make a copy and remove unneeded attributes
bibl_struct_copy = deepcopy(bibl_struct_matches[0]) bibl_struct_copy = deepcopy(bibl_struct_matches[0])
if bibl_struct_copy.attrib['source']: if 'source' in bibl_struct_copy.attrib:
del bibl_struct_copy.attrib['source'] del bibl_struct_copy.attrib['source']
# add it to target listBibl # add it to target listBibl
return bibl_struct_copy return bibl_struct_copy
...@@ -48,7 +151,7 @@ def find_biblstruct(bibl_struct_tree, id): ...@@ -48,7 +151,7 @@ def find_biblstruct(bibl_struct_tree, id):
raise RuntimeError(f"Could not find matching biblStruct element with id '{id}'.") raise RuntimeError(f"Could not find matching biblStruct element with id '{id}'.")
def create_gold_standard(bibl_content, biblstruct_content, verbose=False): def create_gold_standard(bibl_content, biblstruct_content, verbose=False, pretty=False):
# get source trees and elements # get source trees and elements
bibl_tree = etree.fromstring(remove_encoding_declaration(bibl_content)) bibl_tree = etree.fromstring(remove_encoding_declaration(bibl_content))
bibl_struct_tree = etree.fromstring(remove_encoding_declaration(biblstruct_content)) bibl_struct_tree = etree.fromstring(remove_encoding_declaration(biblstruct_content))
...@@ -108,8 +211,10 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False): ...@@ -108,8 +211,10 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False):
raise RuntimeError(f"Missing 'xml:id' attribute for bibl element: {elem}") raise RuntimeError(f"Missing 'xml:id' attribute for bibl element: {elem}")
bibl_id = elem.attrib[id_attrib] bibl_id = elem.attrib[id_attrib]
biblstruct = find_biblstruct(bibl_struct_tree, bibl_id) biblstruct = find_biblstruct(bibl_struct_tree, bibl_id)
# add the biblStruct # add the biblStruct without xmlns
output_listBibl.append(biblstruct) output_listBibl.append(biblstruct)
if 'xmlns' in biblstruct.attrib:
del biblstruct.attrib['xmlns']
# add the element # add the element
output_bibl_p.append(deepcopy(elem)) output_bibl_p.append(deepcopy(elem))
...@@ -207,8 +312,9 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False): ...@@ -207,8 +312,9 @@ def create_gold_standard(bibl_content, biblstruct_content, verbose=False):
# raise RuntimeError(f"Could not find matching biblStruct element with id '{bibl_n}'.") # raise RuntimeError(f"Could not find matching biblStruct element with id '{bibl_n}'.")
# display(Markdown("\n".join(process_log))) # display(Markdown("\n".join(process_log)))
return remove_encoding_declaration(prettify(etree.tostring(root)))
# serialize to a pretty xml string with the linebreak issues addressed
return prettify_gold(root)
def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, verbose=False): def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, verbose=False):
for file_path in glob(f'{bibl_dir}/*.xml'): for file_path in glob(f'{bibl_dir}/*.xml'):
...@@ -228,7 +334,7 @@ def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, ver ...@@ -228,7 +334,7 @@ def create_all_gold_standards(bibl_dir, biblstruct_dir, biblstruct_gold_dir, ver
bibl_content = bibl_file.read() bibl_content = bibl_file.read()
biblStruct_content = biblStruct_file.read() biblStruct_content = biblStruct_file.read()
output_data = create_gold_standard(bibl_content, biblStruct_content, verbose=verbose) output_data = create_gold_standard(bibl_content, biblStruct_content, verbose=verbose, pretty=True)
with open(biblstruct_gs_path, 'w', encoding='utf-8') as output_file: with open(biblstruct_gs_path, 'w', encoding='utf-8') as output_file:
output_file.write(output_data) output_file.write(output_data)
\ No newline at end of file
...@@ -2,6 +2,7 @@ import glob ...@@ -2,6 +2,7 @@ import glob
import os import os
from difflib import HtmlDiff from difflib import HtmlDiff
from IPython.display import display, Markdown from IPython.display import display, Markdown
import regex as re
def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_url, refs_output_path=None): def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_url, refs_output_path=None):
os.makedirs(html_output_path, exist_ok=True) os.makedirs(html_output_path, exist_ok=True)
...@@ -40,4 +41,15 @@ def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_u ...@@ -40,4 +41,15 @@ def compare_input_strings(tei_dir_path, ref_dir_path, html_output_path, target_u
f.write(html_diff) f.write(html_diff)
display(Markdown(f'Extracted and compared input data for {id} ([See diff]({target_url}/{id}.diff.html))')) display(Markdown(f'Extracted and compared input data for {id} ([See diff]({target_url}/{id}.diff.html))'))
# %% def fix_serialization_issues(text):
# escape character sequences
text = re.sub(r'\.\.\.', '[!elipsis!]', text)
text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
# remove duplicated punctuation
text = re.sub(r'(\. ?){2}', r'.', text)
# restore character sequences
text = text.replace( '[!elipsis!]', '...')
text = text.replace('[!spaced_elipsis!]', r'. . .')
return text.strip()
\ No newline at end of file
import regex as re
from lxml import etree
from IPython.display import display, Markdown
import xml.dom.minidom
def prettify(xml_string, indentation=" "):
    """
    Parse ``xml_string`` with minidom and return it pretty-printed, using
    ``indentation`` as the string for one indentation step.
    """
    document = xml.dom.minidom.parseString(xml_string)
    pretty_xml = document.toprettyxml(indent=indentation)
    return pretty_xml
def remove_whitespace(text):
    """
    Normalize the whitespace of text extracted from pretty-printed XML.

    Line breaks with their indentation and runs of spaces are collapsed to a
    single space; spaces before punctuation, inside brackets and quotes, and
    around slashes are removed. A spaced ellipsis (". . .") is protected from
    these rules.

    :param text: the text to normalize
    :return: the normalized text, stripped of leading/trailing whitespace
    """
    # we need to remove the whitespace that comes with the indentation of pretty-printed xml
    text = re.sub(r'\n *', ' ', text)
    # reduce runs of spaces to one; a single substitution with an explicit
    # quantifier always terminates, unlike looping while a space is present
    text = re.sub(r' {2,}', ' ', text)
    # escape character sequences which would be corrupted by whitespace removal rules
    text = re.sub(r'\. \. \.', '[!spaced_elipsis!]', text)
    # fix issues with whitespace before and after punctuation
    text = re.sub(r' ([.;,!?%:])( |$)', r'\1 ', text)
    # opening and closing punctuation, such as brackets
    text = re.sub(r'(\p{Ps}) ', r'\1', text)
    text = re.sub(r' (\p{Pe})', r'\1', text)
    # opening and closing quotes
    text = re.sub(r'(\p{Pi}) ', r'\1', text)
    text = re.sub(r' (\p{Pf})', r'\1', text)
    # slash
    text = re.sub(r' ?/ ?', r'/', text)
    # restore sequences
    text = text.replace('[!spaced_elipsis!]', r'. . .')
    return text.strip()
def fix_serialization_issues(text):
    """
    Reduce doubled periods (".." or ". .") to a single period, while leaving
    ellipses ("..." and ". . .") intact. The result is stripped of leading
    and trailing whitespace.
    """
    # (pattern, placeholder, restored text) for sequences that must survive
    protected = (
        (r'\.\.\.', '[!elipsis!]', '...'),
        (r'\. \. \.', '[!spaced_elipsis!]', r'. . .'),
    )
    # escape character sequences
    for pattern, placeholder, _ in protected:
        text = re.sub(pattern, placeholder, text)
    # remove duplicated punctuation
    # NOTE(review): the pattern also consumes a space following the second
    # period ("a.. b" becomes "a.b") - confirm that this is intended
    text = re.sub(r'(\. ?){2}', r'.', text)
    # restore character sequences
    for _, placeholder, restored in protected:
        text = text.replace(placeholder, restored)
    return text.strip()
def tei_to_input(tei_xml_doc):
    """
    Return the reference strings contained in a TEI document as a list.

    The document is searched for in-text ``<listBibl>`` elements, footnote
    ``<note>`` elements and bibliography ``<listBibl>`` elements (in that
    order). For each match, the optional ``n`` attribute and the
    whitespace-normalized text of every direct ``<bibl>`` child are collected.
    Bibliography entries are returned one string per ``<bibl>``; footnotes and
    in-text lists are joined into a single string per container element.
    """
    ns = {"tei": "http://www.tei-c.org/ns/1.0"}
    container_xpaths = ['*//tei:listBibl[@type="inText"]',
                        '*//tei:note[@type="footnote"]',
                        '*//tei:listBibl[@type="bibliography"]']
    root = etree.fromstring(tei_xml_doc)
    ref_list = []
    for xpath in container_xpaths:
        for container in root.findall(xpath, ns):
            tag_name = etree.QName(container).localname
            kind = container.attrib.get('type')
            # the footnote / list number, if present, leads the reference string
            pieces = [container.attrib['n']] if 'n' in container.attrib else []
            for bibl in container.findall('tei:bibl', ns):
                # serialize as plain text (tags dropped), then normalize whitespace
                raw = etree.tostring(bibl, method="text", encoding='utf-8').decode()
                pieces.append(remove_whitespace(raw))
            if tag_name == "listBibl" and kind != "inText":
                # bibliography: every <bibl> is a reference of its own
                ref_list.extend(pieces)
            elif tag_name == "note" or (tag_name == "listBibl" and kind == "inText"):
                # footnote / in-text list: one combined string per container
                ref_list.append(remove_whitespace(" ".join(pieces)))
    return ref_list
...@@ -56,8 +56,8 @@ ...@@ -56,8 +56,8 @@
<surname>Pateman</surname> <surname>Pateman</surname>
</persName> </persName>
</author> </author>
, ‘ , ‘
<title level="a">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title> <title level="a">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
’ ( ’ (
<date>1979</date> <date>1979</date>
) )
... ...
......
...@@ -56,8 +56,8 @@ ...@@ -56,8 +56,8 @@
<surname>Pateman</surname> <surname>Pateman</surname>
</persName> </persName>
</surname> </surname>
, ‘ , ‘
<title level="m">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title> <title level="m">Mere Auxiliaries to the Commonwealth”: Women and the Origins of Liberalism</title>
’ ( ’ (
<date>1979</date> <date>1979</date>
) )
... ...
......
%% Cell type:markdown id:a7894c78ec06bd10 tags: %% Cell type:markdown id:a7894c78ec06bd10 tags:
# Translate TEI/bibl to final gold standard schema # Translate TEI/bibl to final gold standard schema
%% Cell type:code id:2a90251a tags: %% Cell type:code id:2a90251a tags:
``` python ``` python
%load_ext autoreload %load_ext autoreload
%autoreload 2 %autoreload 2
``` ```
%% Output
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
%% Cell type:markdown id:bac6fffc tags: %% Cell type:markdown id:bac6fffc tags:
Add an `xml:id` attribute to all `bibl` elements so that they can be matched later. Add an `xml:id` attribute to all `bibl` elements so that they can be matched later.
%% Cell type:code id:338af7ddf4cc739d tags: %% Cell type:code id:338af7ddf4cc739d tags:
``` python ``` python
from lib.gold_standard import add_id_to_bibl from lib.gold_standard import add_id_to_bibl
add_id_to_bibl('./tei-bibl-corrected') add_id_to_bibl('./tei-bibl-corrected')
``` ```
%% Output %% Output
- Processing ./tei-bibl-corrected\10.1111_1467-6478.00057.xml - Processing ./tei-bibl-corrected\10.1111_1467-6478.00057.xml
- Processing ./tei-bibl-corrected\10.1111_1467-6478.00080.xml - Processing ./tei-bibl-corrected\10.1111_1467-6478.00080.xml
- Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0103.xml - Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0103.xml
- Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0104.xml - Processing ./tei-bibl-corrected\10.1515_zfrs-1980-0104.xml
%% Cell type:markdown id:f18f6515 tags: %% Cell type:markdown id:f18f6515 tags:
Create `biblStruct` from `bibl`: Create `biblStruct` from `bibl`:
%% Cell type:code id:d39d9f75 tags: %% Cell type:code id:d39d9f75 tags:
``` python ``` python
from lib.xslt import transform from lib.xslt import transform
transform(xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl', transform(xslt_path='lib/xslt/convert_tei-to-biblstruct_bibl.xsl',
input_path='tei-bibl-corrected', input_path='tei-bibl-corrected',
output_path='tei-biblStruct', output_path='tei-biblStruct',
rename_extension=('-bibl_biblStruct.TEIP5.xml','.biblStruct.xml')).stderr rename_extension=('-bibl_biblStruct.TEIP5.xml','.biblStruct.xml')).stderr
``` ```
%% Output %% Output
Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei-bibl-corrected and saved result in tei-biblStruct. Applied lib\xslt\convert_tei-to-biblstruct_bibl.xsl to files in tei-bibl-corrected and saved result in tei-biblStruct.
'' ''
%% Cell type:code id:2cc1a0d6 tags: %% Cell type:code id:2cc1a0d6 tags:
``` python ``` python
from lib.gold_standard import create_all_gold_standards from lib.gold_standard import create_all_gold_standards
create_all_gold_standards('tei-bibl-corrected', create_all_gold_standards('tei-bibl-corrected',
'tei-biblStruct', 'tei-biblStruct',
'gold', 'gold',
verbose=False) verbose=False)
``` ```
%% Output %% Output
### Processing 10.1111_1467-6478.00057 ### Processing 10.1111_1467-6478.00057
Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00057.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00057.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00057.xml) Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00057.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00057.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00057.xml)
### Processing 10.1111_1467-6478.00080 ### Processing 10.1111_1467-6478.00080
Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00080.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00080.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00080.xml) Files: [TEI/bibl](tei-bibl-corrected/10.1111_1467-6478.00080.xml) | [TEI/biblStruct](tei-biblStruct/10.1111_1467-6478.00080.biblstruct.xml) | [Gold Standard](gold/10.1111_1467-6478.00080.xml)
### Processing 10.1515_zfrs-1980-0103 ### Processing 10.1515_zfrs-1980-0103
Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0103.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0103.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0103.xml) Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0103.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0103.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0103.xml)
### Processing 10.1515_zfrs-1980-0104 ### Processing 10.1515_zfrs-1980-0104
Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0104.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0104.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0104.xml) Files: [TEI/bibl](tei-bibl-corrected/10.1515_zfrs-1980-0104.xml) | [TEI/biblStruct](tei-biblStruct/10.1515_zfrs-1980-0104.biblstruct.xml) | [Gold Standard](gold/10.1515_zfrs-1980-0104.xml)
... ...
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment