From 1a03bfa35ecc17b5040b2f1e5ee20b24d544d17e Mon Sep 17 00:00:00 2001
From: Ahmed Ilyas <ahmed.ilyas@hu-berlin.de>
Date: Thu, 27 Feb 2025 10:28:54 +0000
Subject: [PATCH] Add nomad distro commit info

Changelog: Added
---
 nomad/datamodel/datamodel.py |  6 ++++
 nomad/processing/data.py     |  5 ++++
 nomad/utils/__init__.py      | 44 +++++++++++++++++++++++++++++
 tests/test_utils.py          | 54 ++++++++++++++++++++++++++++++++++++
 4 files changed, 109 insertions(+)

diff --git a/nomad/datamodel/datamodel.py b/nomad/datamodel/datamodel.py
index 4f58304b22..f32012bc14 100644
--- a/nomad/datamodel/datamodel.py
+++ b/nomad/datamodel/datamodel.py
@@ -723,6 +723,12 @@ class EntryMetadata(MSection):
         a_elasticsearch=Elasticsearch(),
     )
 
+    nomad_distro_commit_url = Quantity(
+        type=str,
+        description='The NOMAD distro commit url used for the last processing',
+        categories=[MongoEntryMetadata],
+        a_elasticsearch=Elasticsearch(),
+    )
     comment = Quantity(
         type=str,
         categories=[MongoEntryMetadata, EditableUserMetadata],
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index a4d40a56a2..b75902e2c3 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -909,6 +909,7 @@ class Entry(Proc):
             external database where the data was imported from
         nomad_version: the NOMAD version used for the last processing
         nomad_commit: the NOMAD commit used for the last processing
+        nomad_distro_commit_url: the NOMAD distro commit url used for the last processing
         comment: a user provided comment for this entry
         references: user provided references (URLs) for this entry
         entry_coauthors: a user provided list of co-authors specific for this entry. Note
@@ -929,6 +930,7 @@ class Entry(Proc):
     external_id = StringField()
     nomad_version = StringField()
     nomad_commit = StringField()
+    nomad_distro_commit_url = StringField()
     comment = StringField()
     references = ListField(StringField())
     entry_coauthors = ListField()
@@ -1012,8 +1014,11 @@ class Entry(Proc):
         In this case, the timestamp stored in the archive is used.
         If no previous timestamp is available, a new timestamp is generated.
         """
+        # None (stored as '' below) if the distro package info is unavailable.
+        distro_commit_url = utils.nomad_distro_metadata()
         entry_metadata.nomad_version = config.meta.version
         entry_metadata.nomad_commit = ''
+        entry_metadata.nomad_distro_commit_url = distro_commit_url or ''
         entry_metadata.entry_hash = self.upload_files.entry_hash(
             self.mainfile, self.mainfile_key
         )
diff --git a/nomad/utils/__init__.py b/nomad/utils/__init__.py
index 2e0bf62e05..f124328c29 100644
--- a/nomad/utils/__init__.py
+++ b/nomad/utils/__init__.py
@@ -54,6 +54,7 @@ from datetime import timedelta
 import collections
 import logging
 import inspect
+from importlib.metadata import PackageNotFoundError, metadata, version
 
 import orjson
 import os
@@ -1147,3 +1148,46 @@ def dict_to_dataframe(
         filtered_df = filter_df_columns_by_prefix(df, keys_to_filter)
         filtered_dict = dataframe_to_dict(filtered_df)
         return pd.json_normalize(filtered_dict, errors='ignore')
+
+
+def nomad_distro_metadata() -> str | None:
+    """
+    Build the URL of the 'nomad-distribution' commit (or release tag) that the
+    installed package was built from.
+
+    The repository URL comes from the 'Project-URL' metadata entry labelled
+    'repository'. The revision comes from the package version: a version with
+    a '+g<hash>' local part (setuptools_scm) yields the commit hash, any other
+    version is treated as a release tag 'v<version>'.
+
+    Returns:
+        '<repo_url>/tree/<revision>', or None if the package is not installed
+        or no repository URL can be found.
+    """
+    try:
+        distro_metadata = metadata('nomad-distribution')
+        # Each Project-URL entry has the form '<label>, <url>'.
+        project_urls: list[str] = distro_metadata.get_all('Project-URL', [])
+        repo_url = next(
+            (
+                url.split(', ', 1)[1].rstrip('/')
+                for url in project_urls
+                if url.startswith('repository, ')
+            ),
+            None,
+        )
+
+        distro_version = version('nomad-distribution')
+        if '+g' in distro_version:
+            # Keep only the hash: a dirty build appends '.d<date>' after it,
+            # which must not end up in the URL.
+            commit = distro_version.split('+g', 1)[1].split('.', 1)[0]
+        else:
+            # No git local part: assume a release tag, which carries a 'v'.
+            commit = f'v{distro_version}'
+
+        if not repo_url or not commit:
+            return None
+        return f'{repo_url}/tree/{commit}'
+    except (PackageNotFoundError, IndexError, StopIteration, KeyError):
+        return None
diff --git a/tests/test_utils.py b/tests/test_utils.py
index b5da826ebd..5d7339d68e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -16,6 +16,7 @@
 # limitations under the License.
 #
 
+from importlib.metadata import PackageNotFoundError
 import time
 import pytest
 import pandas as pd
@@ -25,6 +26,7 @@ from nomad.metainfo.metainfo import MSection, Quantity, SubSection
 from nomad import files
 from nomad.processing import Upload
 from nomad.utils import (
+    nomad_distro_metadata,
     structlogging,
     flatten_dict,
     rebuild_dict,
@@ -313,3 +315,55 @@ class TestDictDataFrameConverter:
     def test_invalid_input_type(self, invalid_input):
         with pytest.raises(ValueError, match='Input must be a dictionary'):
             dict_to_dataframe(invalid_input)
+
+
+@pytest.mark.parametrize(
+    'project_urls, version_str, expected_url',
+    [
+        (
+            ['repository, https://github.com/example/repo'],
+            '1.2.3+gabcdef',
+            'https://github.com/example/repo/tree/abcdef',
+        ),
+        (['notrepository, https://github.com/example/repo'], '1.2.3+gabcdef', None),
+        (
+            ['repository, https://github.com/example/repo'],
+            '1.2.3',
+            'https://github.com/example/repo/tree/v1.2.3',
+        ),
+        ([], '1.2.3+gabcdef', None),
+        (['repository, '], '1.2.3+gabcdef', None),
+    ],
+)
+def test_nomad_distro_metadata(monkeypatch, project_urls, version_str, expected_url):
+    """The commit URL is built from the mocked Project-URL entries and version."""
+
+    def mock_metadata(package_name):
+        class MockMetadata:
+            def get_all(self, key, default=None):
+                if key == 'Project-URL':
+                    return project_urls
+                return default
+
+        return MockMetadata()
+
+    def mock_version(package_name):
+        return version_str
+
+    monkeypatch.setattr('nomad.utils.metadata', mock_metadata)
+    monkeypatch.setattr('nomad.utils.version', mock_version)
+    assert nomad_distro_metadata() == expected_url
+
+
+def test_nomad_distro_package_not_found(monkeypatch):
+    """Without an installed distro package no commit URL is reported."""
+
+    def raise_not_found(package_name):
+        raise PackageNotFoundError
+
+    def fixed_version(package_name):
+        return '1.2.3'
+
+    monkeypatch.setattr('nomad.utils.version', fixed_version)
+    monkeypatch.setattr('nomad.utils.metadata', raise_not_found)
+    assert nomad_distro_metadata() is None
-- 
GitLab