From 619136d5e88aa35ddfc058acdc0036bae0579b38 Mon Sep 17 00:00:00 2001 From: cboulanger <info@bibliograph.org> Date: Thu, 17 Oct 2024 14:57:28 +0200 Subject: [PATCH] improve fix_tail func --- convert-anystyle-data/lib/gold_standard.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/convert-anystyle-data/lib/gold_standard.py b/convert-anystyle-data/lib/gold_standard.py index 7f3695a..6da1175 100644 --- a/convert-anystyle-data/lib/gold_standard.py +++ b/convert-anystyle-data/lib/gold_standard.py @@ -105,8 +105,9 @@ def fix_tail(elem: etree._Element, indentation=" "): # opening and closing quotes tail = re.sub(r'(\p{Pi})(\s*\n\s+)', r'\1', tail) tail = re.sub(r'(\s*\n\s+)(\p{Pf})', r'\2', tail) - # in tails without line break but with whitespace, replace the last whitespace with a linebreak - if '\n' not in tail and ' ' in tail: + # in tails without line break but with whitespace, replace normalized whitespace with linebreak + if '\n' not in tail and re.match(r'\s', tail) is not None: + tail = re.sub(r'\s+', r' ', tail).strip() p = tail.split(" ") s = '\n' + indentation_level(elem) * indentation tail = s.join(p) -- GitLab