From 0806886f75dc2a550529cef26b9250b5fea4bbc6 Mon Sep 17 00:00:00 2001
From: cboulanger <info@bibliograph.org>
Date: Fri, 3 Jan 2025 09:40:10 +0100
Subject: [PATCH] Add grobid experiment

---
 convert-anystyle-data/lib/gold_standard.py |  4 +-
 grobid/.gitignore                          |  1 +
 grobid/config.json.dist                    |  7 +++
 grobid/grobid-web-services.ipynb           | 63 ++++++++++++++++++++++
 grobid/in/.gitignore                       |  2 +
 grobid/out/.gitignore                      |  2 +
 6 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 grobid/.gitignore
 create mode 100644 grobid/config.json.dist
 create mode 100644 grobid/grobid-web-services.ipynb
 create mode 100644 grobid/in/.gitignore
 create mode 100644 grobid/out/.gitignore

diff --git a/convert-anystyle-data/lib/gold_standard.py b/convert-anystyle-data/lib/gold_standard.py
index 6da1175..77eb038 100644
--- a/convert-anystyle-data/lib/gold_standard.py
+++ b/convert-anystyle-data/lib/gold_standard.py
@@ -93,7 +93,7 @@ def indentation_level(element):
         level += 1
     return level - 1 
 
-def fix_tail(elem: etree._Element, indentation="    "):
+def fix_tail(elem: etree._Element, indentation="  "):
     tail = elem.tail
     # normalize line endings
     tail = re.sub(r'\r\n', '\n', tail)
@@ -128,7 +128,7 @@ def fix_indentation(elem, level=0, indentation="    "):
     indent = "\n" + level*indentation
     if len(elem):
         if not elem.text or not elem.text.strip():
-            elem.text = indent + "  "
+            elem.text = indent + indentation
         if not elem.tail or not elem.tail.strip():
             elem.tail = indent
         for elem in elem:
diff --git a/grobid/.gitignore b/grobid/.gitignore
new file mode 100644
index 0000000..d344ba6
--- /dev/null
+++ b/grobid/.gitignore
@@ -0,0 +1 @@
+config.json
diff --git a/grobid/config.json.dist b/grobid/config.json.dist
new file mode 100644
index 0000000..df1adb2
--- /dev/null
+++ b/grobid/config.json.dist
@@ -0,0 +1,7 @@
+{
+    "grobid_server": "http://localhost:8070",
+    "batch_size": 1000,
+    "sleep_time": 5,
+    "timeout": 60,
+    "coordinates": [ "persName", "figure", "ref", "biblStruct", "formula", "s" ]
+}
\ No newline at end of file
diff --git a/grobid/grobid-web-services.ipynb b/grobid/grobid-web-services.ipynb
new file mode 100644
index 0000000..07a331e
--- /dev/null
+++ b/grobid/grobid-web-services.ipynb
@@ -0,0 +1,63 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -q grobid-client-python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GROBID server is up and running\n"
+     ]
+    }
+   ],
+   "source": [
+    "from grobid_client.grobid_client import GrobidClient\n",
+    "from pathlib import Path\n",
+    "\n",
+    "client = GrobidClient(config_path=\"./config.json\")\n",
+    "client.accept_type = \"application/x-bibtex\"\n",
+    "client.process(\"processReferences\", \n",
+    "                input_path=\"./in\",\n",
+    "                output=\"./out\",\n",
+    "                consolidate_citations=True,\n",
+    "                n=20, force=True)\n",
+    "for file in Path('out').glob('*.grobid.tei.xml'):\n",
+    "    new_file = file.with_name(file.name.replace('.grobid.tei.xml', '.bib'))\n",
+    "    file.rename(new_file)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/grobid/in/.gitignore b/grobid/in/.gitignore
new file mode 100644
index 0000000..c96a04f
--- /dev/null
+++ b/grobid/in/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/grobid/out/.gitignore b/grobid/out/.gitignore
new file mode 100644
index 0000000..c96a04f
--- /dev/null
+++ b/grobid/out/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
-- 
GitLab