diff --git a/convert-anystyle-data/lib/gold_standard.py b/convert-anystyle-data/lib/gold_standard.py index 6da1175f829d6799a50b8c173ed110b75f8c39b7..77eb038aaf6b9d2655626e99a3253a0df207d973 100644 --- a/convert-anystyle-data/lib/gold_standard.py +++ b/convert-anystyle-data/lib/gold_standard.py @@ -93,7 +93,7 @@ def indentation_level(element): level += 1 return level - 1 -def fix_tail(elem: etree._Element, indentation=" "): +def fix_tail(elem: etree._Element, indentation=" "): tail = elem.tail # normalize line endings tail = re.sub(r'\r\n', '\n', tail) @@ -128,7 +128,7 @@ def fix_indentation(elem, level=0, indentation=" "): indent = "\n" + level*indentation if len(elem): if not elem.text or not elem.text.strip(): - elem.text = indent + " " + elem.text = indent + indentation if not elem.tail or not elem.tail.strip(): elem.tail = indent for elem in elem: diff --git a/grobid/.gitignore b/grobid/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d344ba6b06cb4611d3a27589d8f48b22c832b048 --- /dev/null +++ b/grobid/.gitignore @@ -0,0 +1 @@ +config.json diff --git a/grobid/config.json.dist b/grobid/config.json.dist new file mode 100644 index 0000000000000000000000000000000000000000..df1adb29f848f269e4dc7b42972c72ba4d4aede7 --- /dev/null +++ b/grobid/config.json.dist @@ -0,0 +1,7 @@ +{ + "grobid_server": "http://localhost:8070", + "batch_size": 1000, + "sleep_time": 5, + "timeout": 60, + "coordinates": [ "persName", "figure", "ref", "biblStruct", "formula", "s" ] +} \ No newline at end of file diff --git a/grobid/grobid-web-services.ipynb b/grobid/grobid-web-services.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..07a331e9549f97faf720930aaa348e5668be2375 --- /dev/null +++ b/grobid/grobid-web-services.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q grobid-client-python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GROBID server is up and running\n" + ] + } + ], + "source": [ + "from grobid_client.grobid_client import GrobidClient\n", + "from pathlib import Path\n", + "\n", + "client = GrobidClient(config_path=\"./config.json\")\n", + "client.accept_type = \"application/x-bibtex\"\n", + "client.process(\"processReferences\", \n", + " input_path=\"./in\",\n", + " output=\"./out\",\n", + " consolidate_citations=True,\n", + " n=20, force=True)\n", + "for file in Path('out').glob('*.grobid.tei.xml'):\n", + " new_file = file.with_name(file.name.replace('.grobid.tei.xml', '.bib'))\n", + " file.rename(new_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/grobid/in/.gitignore b/grobid/in/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c96a04f008ee21e260b28f7701595ed59e2839e3 --- /dev/null +++ b/grobid/in/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/grobid/out/.gitignore b/grobid/out/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c96a04f008ee21e260b28f7701595ed59e2839e3 --- /dev/null +++ b/grobid/out/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file