From abdc73b3f28df02b55a69c084df1685fa1340309 Mon Sep 17 00:00:00 2001 From: Christian Boulanger <boulanger@lhlt.mpg.de> Date: Tue, 21 Jan 2025 21:04:03 +0100 Subject: [PATCH] Add pandoc experiment --- pandoc/__init__.js | 0 pandoc/md-to-docx.ipynb | 48 +++++++++++++++++++ pandoc/md_to_docx.py | 102 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 pandoc/__init__.js create mode 100644 pandoc/md-to-docx.ipynb create mode 100644 pandoc/md_to_docx.py diff --git a/pandoc/__init__.js b/pandoc/__init__.js new file mode 100644 index 0000000..e69de29 diff --git a/pandoc/md-to-docx.ipynb b/pandoc/md-to-docx.ipynb new file mode 100644 index 0000000..b596c6b --- /dev/null +++ b/pandoc/md-to-docx.ipynb @@ -0,0 +1,48 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking if Pandoc is installed...\n", + "Pandoc found at: C:\\Users\\boulanger\\AppData\\Local\\Pandoc\\pandoc.exe\n", + "Converting H:\\Downloads\\test.md to Word document H:\\Downloads\\test.docx...\n", + "Conversion completed successfully! 
"""Convert a Markdown file (local path or remote URL) to a Word document.

The conversion itself is delegated to the external ``pandoc`` executable,
which must be available on the system ``PATH``.
"""

import argparse
import os
import platform
import shutil
import subprocess
import sys
import tempfile

try:
    import requests
except ImportError:  # Only needed for remote sources; local files work without it.
    requests = None


class MarkdownToWordConverter:
    """Convert Markdown documents to ``.docx`` files by invoking Pandoc.

    Args:
        verbose: When true, progress messages are printed to standard output.
    """

    def __init__(self, verbose=False):
        self.verbose = verbose

    def log(self, message):
        """Print *message* only if verbose output is enabled."""
        if self.verbose:
            print(message)

    def download_markdown_file(self, url, output_path):
        """Download the Markdown document at *url* and save it to *output_path*.

        The file is always written as UTF-8 because that is the encoding
        Pandoc expects for its input.

        Raises:
            Exception: If ``requests`` is not installed or the download fails.
        """
        if requests is None:
            raise Exception(
                "The 'requests' package is required to download remote Markdown files."
            )

        self.log(f"Downloading Markdown file from {url}...")
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            raise Exception(f"Failed to download the Markdown file: {e}") from e

        # Without an explicit charset, requests decodes text/* bodies as
        # ISO-8859-1, which garbles UTF-8 Markdown. Default to UTF-8 instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = "utf-8"

        # newline="" keeps the downloaded line endings untouched; otherwise
        # text mode on Windows would turn "\r\n" into "\r\r\n".
        with open(output_path, "w", encoding="utf-8", newline="") as file:
            file.write(response.text)
        self.log(f"Markdown file saved as {output_path}.")

    def _find_pandoc(self):
        """Return the full path of the Pandoc executable or raise if missing."""
        pandoc_path = shutil.which("pandoc")
        if pandoc_path is None:
            raise Exception("Pandoc is not installed or not in the system PATH.")
        return pandoc_path

    def check_pandoc_installed(self):
        """Verify that Pandoc can be found on the system ``PATH``.

        Returns:
            The resolved path of the Pandoc executable.

        Raises:
            Exception: If Pandoc cannot be found.
        """
        self.log("Checking if Pandoc is installed...")
        pandoc_path = self._find_pandoc()
        self.log(f"Pandoc found at: {pandoc_path}")
        return pandoc_path

    def convert_markdown_to_word(self, input_path, output_path):
        """Convert the Markdown file *input_path* to the Word file *output_path*.

        Raises:
            Exception: If Pandoc is missing or exits with a non-zero status.
        """
        self.log(f"Converting {input_path} to Word document {output_path}...")
        # Resolving the executable up front lets us run without a shell on
        # every platform, so special characters in user-supplied paths are
        # never interpreted by cmd.exe.
        pandoc_path = self._find_pandoc()
        try:
            subprocess.run([pandoc_path, input_path, "-o", output_path], check=True)
        except subprocess.CalledProcessError as e:
            raise Exception(f"Conversion failed: {e}") from e
        self.log(f"Conversion completed successfully! Output saved as {output_path}.")

    def run(self, source, output_docx):
        """Convert *source* (URL or local path) to the Word file *output_docx*.

        Remote sources are downloaded to a uniquely named temporary file that
        is removed afterwards. Local source files are never deleted.

        Raises:
            Exception: If the source is missing, the download fails, Pandoc is
                not installed, or the conversion fails.
        """
        temp_path = None
        input_path = source

        try:
            if source.startswith(("http://", "https://")):
                # A unique temporary file cannot collide with a user file.
                descriptor, temp_path = tempfile.mkstemp(suffix=".md")
                os.close(descriptor)
                input_path = temp_path
                self.download_markdown_file(source, input_path)
            elif not os.path.isfile(input_path):
                raise Exception(f"Local file '{source}' does not exist.")

            self.check_pandoc_installed()
            self.convert_markdown_to_word(input_path, output_docx)
        finally:
            # Only remove the file this method created itself.
            if temp_path is not None and os.path.isfile(temp_path):
                os.remove(temp_path)
                self.log(f"Temporary file {temp_path} removed.")


def main():
    """Parse the command line and run the conversion; exit with 1 on error."""
    parser = argparse.ArgumentParser(
        description="Convert a Markdown file (local or remote) to a Word document using Pandoc."
    )
    parser.add_argument(
        "source",
        help="URL or path to the local Markdown file."
    )
    parser.add_argument(
        "output_docx",
        help="Path to save the Word document (no default; must be specified)."
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output (default: False)."
    )
    args = parser.parse_args()

    converter = MarkdownToWordConverter(verbose=args.verbose)
    try:
        converter.run(args.source, args.output_docx)
    except Exception as e:  # Top-level boundary: report and signal failure.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()