diff --git a/convert-anystyle-data/lib/gold_standard.py b/convert-anystyle-data/lib/gold_standard.py index 7f3695a9a6b8911da2dace8d45aaf01927b071d3..6da1175f829d6799a50b8c173ed110b75f8c39b7 100644 --- a/convert-anystyle-data/lib/gold_standard.py +++ b/convert-anystyle-data/lib/gold_standard.py @@ -105,8 +105,9 @@ def fix_tail(elem: etree._Element, indentation=" "): # opening and closing quotes tail = re.sub(r'(\p{Pi})(\s*\n\s+)', r'\1', tail) tail = re.sub(r'(\s*\n\s+)(\p{Pf})', r'\2', tail) - # in tails without line break but with whitespace, replace the last whitespace with a linebreak - if '\n' not in tail and ' ' in tail: + # in tails without line break but with whitespace, replace normalized whitespace with linebreak + if '\n' not in tail and re.match(r'\s', tail) is not None: + tail = re.sub(r'\s+', r' ', tail).strip() p = tail.split(" ") s = '\n' + indentation_level(elem) * indentation tail = s.join(p)