From abdc73b3f28df02b55a69c084df1685fa1340309 Mon Sep 17 00:00:00 2001
From: Christian Boulanger <boulanger@lhlt.mpg.de>
Date: Tue, 21 Jan 2025 21:04:03 +0100
Subject: [PATCH] Add pandoc experiment

---
 pandoc/__init__.js      |   0
 pandoc/md-to-docx.ipynb |  48 +++++++++++++++++++
 pandoc/md_to_docx.py    | 102 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 pandoc/__init__.js
 create mode 100644 pandoc/md-to-docx.ipynb
 create mode 100644 pandoc/md_to_docx.py

diff --git a/pandoc/__init__.js b/pandoc/__init__.js
new file mode 100644
index 0000000..e69de29
diff --git a/pandoc/md-to-docx.ipynb b/pandoc/md-to-docx.ipynb
new file mode 100644
index 0000000..b596c6b
--- /dev/null
+++ b/pandoc/md-to-docx.ipynb
@@ -0,0 +1,48 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking if Pandoc is installed...\n",
+      "Pandoc found at: C:\\Users\\boulanger\\AppData\\Local\\Pandoc\\pandoc.exe\n",
+      "Converting H:\\Downloads\\test.md to Word document H:\\Downloads\\test.docx...\n",
+      "Conversion completed successfully! Output saved as H:\\Downloads\\test.docx.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from md_to_docx import MarkdownToWordConverter\n",
+    "\n",
+    "converter = MarkdownToWordConverter(verbose=True)\n",
+    "converter.run(r'H:\\Downloads\\test.md', r'H:\\Downloads\\test.docx')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pandoc/md_to_docx.py b/pandoc/md_to_docx.py
new file mode 100644
index 0000000..2ed6c69
--- /dev/null
+++ b/pandoc/md_to_docx.py
@@ -0,0 +1,102 @@
+import os
+import subprocess
+import requests
+import platform
+import argparse
+import sys
+
+
+class MarkdownToWordConverter:
+    """Convert a local or remote Markdown file to a Word document via the Pandoc CLI."""
+
+    def __init__(self, verbose=False):
+        self.verbose = verbose  # when True, log() prints progress messages
+
+    def log(self, message):
+        """Print *message* only when verbose mode is on."""
+        if self.verbose:
+            print(message)
+
+    def download_markdown_file(self, url, output_path):
+        """Save the text at *url* to *output_path* (UTF-8); raise Exception on request errors."""
+        self.log(f"Downloading Markdown file from {url}...")
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+            with open(output_path, 'w', encoding='utf-8') as file:
+                file.write(response.text)
+            self.log(f"Markdown file saved as {output_path}.")
+        except requests.exceptions.RequestException as e:
+            raise Exception(f"Failed to download the Markdown file: {e}") from e
+
+    def check_pandoc_installed(self):
+        """Raise Exception unless a ``pandoc`` executable is found on the PATH."""
+        import shutil  # function-scope import: not among the module's imports
+        self.log("Checking if Pandoc is installed...")
+        # shutil.which needs no external ``where``/``which`` binary, which may be absent.
+        pandoc_path = shutil.which("pandoc")
+        if pandoc_path is None:
+            raise Exception("Pandoc is not installed or not in the system PATH.")
+        self.log(f"Pandoc found at: {pandoc_path}")
+
+    def convert_markdown_to_word(self, input_path, output_path):
+        """Run ``pandoc input_path -o output_path``; raise Exception if it fails."""
+        self.log(f"Converting {input_path} to Word document {output_path}...")
+        try:
+            # No shell=True: cmd.exe would interpret metacharacters such as ``&`` in paths.
+            subprocess.run(["pandoc", input_path, "-o", output_path], check=True)
+        except (subprocess.CalledProcessError, OSError) as e:
+            raise Exception(f"Conversion failed: {e}") from e
+        self.log(f"Conversion completed successfully! Output saved as {output_path}.")
+
+    def run(self, source, output_docx):
+        """Convert *source* (URL or local path) to *output_docx*; raise Exception on failure."""
+        import tempfile  # function-scope import: not among the module's imports
+        input_path, temp_md = source, None
+        try:
+            if source.startswith(("http://", "https://")):
+                # Unique temp file: a fixed CWD name could clobber or delete a user's file.
+                fd, temp_md = tempfile.mkstemp(suffix=".md")
+                os.close(fd)  # only the path is needed; the download reopens it
+                input_path = temp_md
+                self.download_markdown_file(source, input_path)
+            elif not os.path.isfile(input_path):
+                raise Exception(f"Local file '{source}' does not exist.")
+            self.check_pandoc_installed()
+            self.convert_markdown_to_word(input_path, output_docx)
+        finally:
+            # Remove the downloaded copy only; never a file the caller supplied.
+            if temp_md is not None and os.path.isfile(temp_md):
+                os.remove(temp_md)
+                self.log(f"Temporary file {temp_md} removed.")
+
+
+def main():
+    """Command-line entry point: parse the arguments and run the conversion.
+
+    Prints ``Error: <message>`` to stderr and exits with status 1 on any Exception.
+    """
+    cli = argparse.ArgumentParser(
+        description="Convert a Markdown file (local or remote) to a Word document using Pandoc."
+    )
+    # Two required positionals, then the optional verbosity switch.
+    cli.add_argument("source", help="URL or path to the local Markdown file.")
+    cli.add_argument(
+        "output_docx",
+        help="Path to save the Word document (no default; must be specified).",
+    )
+    cli.add_argument(
+        "--verbose", action="store_true", help="Enable verbose output (default: False)."
+    )
+    options = cli.parse_args()
+
+    converter = MarkdownToWordConverter(verbose=options.verbose)
+    try:
+        converter.run(options.source, options.output_docx)
+    except Exception as err:
+        print(f"Error: {err}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()
-- 
GitLab