From d4d691300d5c3ce375ee42c28d9d1e9dcf5565c6 Mon Sep 17 00:00:00 2001
From: Amir Golparvar <amir.golparvar@physik.hu-berlin.de>
Date: Thu, 7 Mar 2024 12:40:18 +0000
Subject: [PATCH] Resolve "conversion of ArchiveQuery to dataframe works only
 for the last call of the downloaded entries"

---
 nomad/client/archive.py | 45 +++++++++++++++++++++++++++++++++++------
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/nomad/client/archive.py b/nomad/client/archive.py
index e9840bfc82..75dc689c42 100644
--- a/nomad/client/archive.py
+++ b/nomad/client/archive.py
@@ -20,7 +20,7 @@ from __future__ import annotations
 import asyncio
 from asyncio import Semaphore
 from itertools import islice
-from typing import Any
+from typing import Any, Union
 import threading
 
 from click import progressbar
@@ -246,6 +246,7 @@ class ArchiveQuery:
         """
 
         self._entries = []
+        self._entries_dict = []
         self._current_after = self._after
         self._current_results = 0
         self._results_actual = 0
@@ -486,9 +487,7 @@ class ArchiveQuery:
             self.fetch(number - pending_size)
 
         async_query = run_async(self._download_async, number)
-        self._entries_dict = [
-            aq.m_to_dict(resolve_references=True) for aq in async_query
-        ]
+        self._entries_dict.append(async_query)
         return async_query
 
     async def async_fetch(self, number: int = 0) -> int:
@@ -523,7 +522,41 @@ class ArchiveQuery:
     def entry_list(self) -> list[tuple[str, str]]:
         return self._entries
 
-    def entries_to_dataframe(self, keys_to_filter=None):
+    def entries_to_dataframe(
+        self,
+        keys_to_filter: list[str] = None,
+        resolve_references: bool = False,
+        query_selection: Union[str, list[str]] = 'last',
+    ):
+        """
+        Convert the archives stored in `self._entries_dict` to dicts and hand them to `dict_to_dataframe`.
+        Params:
+            keys_to_filter (list[str]): forwarded to `dict_to_dataframe` as `keys_to_filter`; None or an empty list is passed on as []
+            resolve_references (bool): forwarded to `m_to_dict(resolve_references=...)` for every selected archive
+            query_selection (str or list[int]): which stored download results to convert: 'last' (most recently stored), 'all' (everything stored, oldest first) or a list of int indices of download calls (e.g. [0,2,1]). Note: the signature annotation says list[str], but the code raises TypeError unless every element is an int
+        Returns:
+            whatever `dict_to_dataframe` returns for the selected archives (presumably a pandas dataframe), or None when `query_selection` is neither 'last', 'all' nor a list
+        """
+        t_list: Union[list[Any], dict] = []
+        if query_selection == 'all':
+            t_list = [item for sublist in self._entries_dict for item in sublist]  # flatten every stored batch
+        elif query_selection == 'last':
+            t_list = self._entries_dict[-1]  # raises IndexError while self._entries_dict is still empty
+        elif isinstance(query_selection, list):
+            if not all(isinstance(i, int) for i in query_selection):
+                raise TypeError("All elements in 'query_selection' must be integers.")
+            t_list = [
+                item
+                for i, sublist in enumerate(self._entries_dict)  # batches are visited in stored order
+                if i in query_selection  # membership test only: order, repeats and out-of-range indices in the list have no effect
+                for item in sublist
+            ]
+        else:
+            return  # unrecognised selection: returns None without raising
+
+        list_of_entries_dict = [
+            aq.m_to_dict(resolve_references=resolve_references) for aq in t_list
+        ]
         if not keys_to_filter:
             keys_to_filter = []
-        return dict_to_dataframe(self._entries_dict, keys_to_filter=keys_to_filter)
+        return dict_to_dataframe(list_of_entries_dict, keys_to_filter=keys_to_filter)
-- 
GitLab