Update gold schema proposal

17f654bc · Christian Boulanger · dd050e2d · 17f654bc
Commit 17f654bc authored 9 months ago by Christian Boulanger
--- a/convert-anystyle-data/schema/gold_standard.xml
+++ b/convert-anystyle-data/schema/gold_standard.xml
@@ -8,13 +8,13 @@
            . . be reduced to order and exhibited under the form of a few principles which sum up the effect of a
            hundred cases . . .’. A. Dicey, Can English Law be taught at the Universities? (1883) 20.
        </input>
-        <output type="binary" xmlns="http://www.tei-c.org/ns/1.0">
+        <output type="references" xmlns="http://www.tei-c.org/ns/1.0">
            <!-- alternative tag names: <target> <label> -->
            <!-- This output type simply segments the input into parts which contain bibliographic data and those which do not.
                 The parts that contain bibliographic data can individually be fed into processors which cannot predict
                 this distinction themselves, but expect an isolated instance of a bibliographic item to be
-                  segmented into its individual parts (such as Grobid)-->
-            <!-- is type="binary" the right name? -->
+                  segmented into its individual parts (such as Grobid). No other semantic information needs to be predicted
+                  by the model producing this output -->
            <!-- It's important to keep in mind that some non-bibliographic data is _outside_ of the bibliographic parts,
                 whereas others are _inside_ (example required), i.e. need to be nested inside the <bibl> so that not to
                 split a coherent bibliographic item into two incomplete ones -->
@@ -29,50 +29,62 @@
            <bibl>A. Dicey, Can English Law be taught at the Universities? (1883) 20.</bibl>
        </output>
        <output type="bibl">
-            <bibl xmlns="http://www.tei-c.org/ns/1.0">
-                <seg type="footnote-number">3</seg>
-                <seg type="signal">See</seg>
-                <author>
-                    <persName>
-                        <forename>R.</forename>
-                        <surname>Goff</surname>
-                    </persName>
-                </author>
-                , ‘
-                <title level="a">The Search for Principle</title>
-                ’ (
-                <date>1983</date>
-                )
-                <title level="j">Proceeedings of the British Academy</title>
-                <biblScope unit="volume" from="169" to="169">169</biblScope>
-                ,
-                <biblScope unit="page" from="at">at 171</biblScope>
-                .
-            </bibl>
-            <bibl xmlns="http://www.tei-c.org/ns/1.0">
-                <!-- From the context, it is clear that the following comment is connected to this reference, however,
-                    this could only be inferred by a LLM. Normal ML-models could be trained to predict that is is a
-                    non-bibliographic section and discard it. -->
-                <seg type="comment">This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful
-                    thought whole departments of law can . . . be reduced to order and exhibited under the form of a
-                    few principles which sum up the effect of a hundred cases . . .’.
-                </seg>
-                <author>
-                    <persName>
-                        <forename>A.</forename>
-                        <surname>Dicey</surname>
-                    </persName>
-                </author>
-                ,
-                <title level="m">Can English Law be taught at the Universities?</title>
-                (
-                <date>1883</date>
-                )
-                <biblScope unit="page" from="20">20</biblScope>
-                .
-            </bibl>
+            <!-- This output contains the input text annotated according to the TEI rules on bibliographic references,
+                using <bibl> elements (grouped per footnote in a <p type="footnote">) and the elements allowed within them.
+                The data usually comes from human annotators and might contain errors and inconsistencies, but should be
+                valid TEI. The annotation should be lossless, i.e. it must be possible to reconstruct the input text by
+                simply removing the XML tags and applying some rules to remove the whitespace introduced by XML pretty-printing
+                (such as after an opening bracket - in fact, I am looking for a way to encode "no-following-whitespace" but haven't found one yet). -->
+            <p type="footnote">
+                <seg type="footnote-marker">3</seg>
+                <bibl xmlns="http://www.tei-c.org/ns/1.0">
+                    <seg type="signal">See</seg>
+                    <author>
+                        <persName>
+                            <forename>R.</forename>
+                            <surname>Goff</surname>
+                        </persName>
+                    </author>
+                    , ‘
+                    <title level="a">The Search for Principle</title>
+                    ’ (
+                    <date>1983</date>
+                    )
+                    <title level="j">Proceeedings of the British Academy</title>
+                    <biblScope unit="volume" from="169" to="169">169</biblScope>
+                    ,
+                    <biblScope unit="page" from="at">at 171</biblScope>
+                    .
+                </bibl>
+                <bibl xmlns="http://www.tei-c.org/ns/1.0">
+                    <!-- From the context, it is clear that the following comment is connected to this reference; however,
+                        this could only be inferred by an LLM. Normal ML models could be trained to predict that it is a
+                        non-bibliographic section and discard it. -->
+                    <seg type="comment">This is an amplification of Dicey’s remark that ‘[b]y adequate study and careful
+                        thought whole departments of law can . . . be reduced to order and exhibited under the form of a
+                        few principles which sum up the effect of a hundred cases . . .’.
+                    </seg>
+                    <author>
+                        <persName>
+                            <forename>A.</forename>
+                            <surname>Dicey</surname>
+                        </persName>
+                    </author>
+                    ,
+                    <title level="m">Can English Law be taught at the Universities?</title>
+                    (
+                    <date>1883</date>
+                    )
+                    <biblScope unit="page" from="20">20</biblScope>
+                    .
+                </bibl>
+            </p>
        </output>
        <output type="biblStruct">
+            <!-- This output contains pure bibliographic metadata with all non-relevant information removed, using
+                 top-level TEI <biblStruct> elements. This data can then be further translated into other bibliographic
+                 data exchange formats (such as MODS, RIS, BibTeX, etc.). The information is generated from the <bibl>
+                 data above. -->
            <biblStruct xmlns="http://www.tei-c.org/ns/1.0" n="2">
                <analytic>
                    <title level="a">The Search for Principle</title>