Reinsert expected computation time

3f61b445 · Luigi Sbailo · c1fe007f · 3f61b445
Commit 3f61b445 authored 3 years ago by Luigi Sbailo
--- a/descriptor_role.ipynb
+++ b/descriptor_role.ipynb
@@ -66,45 +66,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:04.851511Z",
-     "start_time": "2021-09-14T14:30:04.832503Z"
+     "end_time": "2021-10-18T18:40:26.344758Z",
+     "start_time": "2021-10-18T18:40:26.341192Z"
    },
    "init_cell": true
   },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<script>\n",
-       "    code_show=true; \n",
-       "    function code_toggle() {\n",
-       "        if (code_show)\n",
-       "        {\n",
-       "            $('div.input').hide();\n",
-       "        } \n",
-       "        else \n",
-       "        {\n",
-       "            $('div.input').show();\n",
-       "        }\n",
-       "        code_show = !code_show\n",
-       "    } \n",
-       "    $( document ).ready(code_toggle);\n",
-       "    window.runCells(\"startup\");\n",
-       "</script>\n",
-       "The raw code for this notebook is by default hidden for easier reading.\n",
-       "To toggle on/off the raw code, click <a href=\"javascript:code_toggle()\">here</a>.\n"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "%%HTML\n",
    "<script>\n",
@@ -192,33 +162,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.180032Z",
-     "start_time": "2021-09-14T14:30:04.852666Z"
+     "end_time": "2021-10-18T18:40:26.354836Z",
+     "start_time": "2021-10-18T18:40:26.345968Z"
    },
    "init_cell": true,
    "tags": [
     "startup"
    ]
   },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "4ee6df9ee3f741d69dedf298cfccedf9",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_ColormakerRegistry()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "import os\n",
    "import math\n",
@@ -247,6 +202,7 @@
    "from sissopp import Inputs, FeatureSpace, SISSORegressor, FeatureNode, Unit\n",
    "from sissopp.py_interface import read_csv\n",
    "from sissopp.py_interface.import_dataframe import get_unit\n",
+    "from sissopp.py_interface.estimate_feat_space_size import get_max_number_feats\n",
    "\n",
    "import timeit\n",
    "\n",
@@ -257,11 +213,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.283447Z",
-     "start_time": "2021-09-14T14:30:07.182435Z"
+     "end_time": "2021-10-18T18:40:26.441763Z",
+     "start_time": "2021-10-18T18:40:26.356265Z"
    },
    "init_cell": true,
    "tags": [
@@ -320,11 +276,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.287864Z",
-     "start_time": "2021-09-14T14:30:07.284712Z"
+     "end_time": "2021-10-18T18:40:26.446270Z",
+     "start_time": "2021-10-18T18:40:26.443066Z"
    },
    "init_cell": true
   },
@@ -350,11 +306,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.298609Z",
-     "start_time": "2021-09-14T14:30:07.289053Z"
+     "end_time": "2021-10-18T18:40:26.455992Z",
+     "start_time": "2021-10-18T18:40:26.447340Z"
    },
    "init_cell": true,
    "tags": [
@@ -439,11 +395,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.315349Z",
-     "start_time": "2021-09-14T14:30:07.299720Z"
+     "end_time": "2021-10-18T18:40:26.474971Z",
+     "start_time": "2021-10-18T18:40:26.457100Z"
    },
    "init_cell": true,
    "tags": [
@@ -501,9 +457,13 @@
    "                    selected_features.extend([feat+nuc for nuc in ['_A', '_B']])       \n",
    "\n",
    "        if rung_selection.value == 'PRL2015':\n",
+    "            n_features = 5\n",
    "            reproduce_prl = True\n",
+    "            tier = 2\n",
    "            max_rung = 2\n",
    "        else:\n",
+    "            tier = rung_selection.value\n",
+    "            n_features = len(selected_features)\n",
    "            reproduce_prl = False        \n",
    "            max_rung = rung_selection.value\n",
    "\n",
@@ -511,6 +471,17 @@
    "        global feat_space\n",
    "        global sisso\n",
    "\n",
+    "        size_estimate = get_max_number_feats(allowed_operations, n_features, tier)\n",
+    "        time_estimate = fs_time_estimate(fs_size_estimate(size_estimate))\n",
+    "        minutes = int(time_estimate/60)\n",
+    "        seconds = time_estimate-60*minutes\n",
+    "        with out3:\n",
+    "            clear_output()\n",
+    "            print('Estimated calculation time: {}min and {:.1f}s'.format(minutes, seconds), flush=True)\n",
+    "        if time_estimate > 1000:\n",
+    "            print('The calculation is estimated to take too long. Please change the settings to reduce the calculation time (e.g. reduce the SISSO rung, the number of operations or the number of features)')\n",
+    "            return\n",
+    "        \n",
    "        try:\n",
    "            start = timeit.default_timer()\n",
    "\n",
@@ -579,11 +550,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
-     "end_time": "2021-09-14T14:30:07.581620Z",
-     "start_time": "2021-09-14T14:30:07.316390Z"
+     "end_time": "2021-10-18T18:40:26.745789Z",
+     "start_time": "2021-10-18T18:40:26.476667Z"
    },
    "init_cell": true,
    "scrolled": false,
@@ -591,22 +562,7 @@
     "startup"
    ]
   },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e8d2c04ad65049e8bcb440fe6e638ab3",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(HBox(children=(VBox(children=(Label(value=''), Checkbox(value=True, indent=False, layout=Layout…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
   "source": [
    "cb_layout = widgets.Layout(width = '15px')\n",
    "thin_layout = widgets.Layout(width = '100px')\n",
@@ -659,7 +615,7 @@
    "feat_label_box = widgets.VBox([widgets.Label(value='Features:', layout=thin_layout)]+feat_labels)\n",
    "\n",
    "rung_selection = widgets.Dropdown(options=['PRL2015', 1,2,3], layout=thin_layout)\n",
-    "sis_selection = widgets.BoundedIntText(value=26, min=1, max=100, step=1, layout=thin_layout)\n",
+    "sis_selection = widgets.BoundedIntText(value=26, min=1, max=5000, step=1, layout=thin_layout)\n",
    "dimension_selection = widgets.BoundedIntText(value = 3, min=1, max=4, step=1, layout = thin_layout)\n",
    "\n",
    "settings_box = widgets.VBox([\n",

 %% Cell type:code id: tags:

 ``` python
 %%HTML
 <script>

 window.findCellIndicesByTag = function findCellIndicesByTag(tagName) {
  return (Jupyter.notebook.get_cells()
    .filter(
      ({metadata: {tags}}) => tags && tags.includes(tagName)
    )
    .map((cell) => Jupyter.notebook.find_cell_index(cell))
  );
 };

 window.runCells = function runCells(tagName) {
    var c = window.findCellIndicesByTag(tagName);
    Jupyter.notebook.execute_cells(c);
 };

 </script>
 ```

 %% Cell type:markdown id: tags:

 <div id="teaser" style=' background-position:  right center; background-size: 00px; background-repeat: no-repeat;
    padding-top: 20px;
    padding-right: 10px;
    padding-bottom: 170px;
    padding-left: 10px;
    border-bottom: 14px double #333;
    border-top: 14px double #333;' >


   <div style="text-align:center">
    <b><font size="6.4">Predicting energy differences between crystal structures: (Meta-)stability of octet-binary compounds</font></b>
  </div>

 <p>
 created by: Mohammad-Yasin Arif, Luigi Sbailò, Thomas A. R. Purcell, Luca M. Ghiringhelli, and Matthias Scheffler

 based on the original version designed by: Angelo Ziletti, Emre Ahmetcik, Runhai Ouyang, Luca M. Ghiringhelli, and Matthias Scheffler<br><br>

 Fritz Haber Institute of the Max Planck Society, Faradayweg 4-6, D-14195 Berlin, Germany <br>
 <span class="nomad--last-updated" data-version="v1.0.0">[Last updated: Sep 5, 2020]</span>


 <div>
 <img  style="float: left;" src="assets/descriptor_role/Logo_MPG.png" width="200">
 <img  style="float: right;" src="assets/descriptor_role/Logo_NOMAD.png" width="250">
 </div>
 </div>

 %% Cell type:code id: tags:

 ``` python
 %%HTML
 <script>
    code_show=true;
    function code_toggle() {
        if (code_show)
        {
            $('div.input').hide();
        }
        else
        {
            $('div.input').show();
        }
        code_show = !code_show
    }
    $( document ).ready(code_toggle);
    window.runCells("startup");
 </script>
 The raw code for this notebook is by default hidden for easier reading.
 To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.
 ```

-%% Output
-
-
 %% Cell type:markdown id: tags:

 ### Introduction
 This tutorial shows how to find descriptive parameters (short formulas) that predict the crystal structure (here, rocksalt (RS) or zincblende (ZB)), using the example of octet binary compounds. It is based on the algorithm sure independence screening and sparsifying operator (SISSO), that enables to search for optimal descriptor by scanning huge feature spaces.

 <div style="padding: 1ex; margin-top: 1ex; margin-bottom: 1ex; border-style: dotted; border-width: 1pt; border-color: blue; border-radius: 3px;">R. Ouyang, S. Curtarolo, E. Ahmetcik, M. Scheffler, L. M. Ghiringhelli: <span style="font-style: italic;">SISSO: a compressed-sensing method for identifying the best low-dimensional descriptor in an immensity of offered candidates</span>, Phys. Rev. Materials  2, 083802 (2018) <a href="https://journals.aps.org/prmaterials/abstract/10.1103/PhysRevMaterials.2.083802" target="_blank">[PDF]</a>.</div>

 With the default settings, the method reproduces the same results from:

 <div style="padding: 1ex; margin-top: 1ex; margin-bottom: 1ex; border-style: dotted; border-width: 1pt; border-color: blue; border-radius: 3px;">L. M. Ghiringhelli, J. Vybiral, S. V. Levchenko, C. Draxl, M. Scheffler: <span style="font-style: italic;">Big Data of Materials Science: Critical Role of the Descriptor</span>,  Phys. Rev. Lett. 114, 105503 (2015) <a href="http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.105503">[PDF]</a>,</div>

 %% Cell type:markdown id: tags:

 <details>
    <summary>
        <div style="padding: 1ex; margin-top: 1ex; margin-bottom: 1ex; border-style: dotted; border-width: 1pt; border-color: blue; border-radius: 3px;"><b>Explanation of the method (click to expand)</b></div></summary>

 We present a tool for predicting the crystal structure of octet binary compounds, by using a set of descriptive parameters (a descriptor) based on free-atom data of the atomic species constituting the binary material. We apply a newly developed method: sure independence screening and sparsifying operator (SISSO), that allows to find an optimal descriptor in a huge feature space containing billions of features. In this tutorial an $\ell_0$-optimization is used as the sparsifying operator.
 The method is described in:

 <div style="padding: 1ex; margin-top: 1ex; margin-bottom: 1ex; border-style: dotted; border-width: 1pt; border-color: blue; border-radius: 3px;">
 R. Ouyang, S. Curtarolo, E. Ahmetcik, M. Scheffler, L. M. Ghiringhelli: <span style="font-style: italic;">SISSO: a compressed-sensing method for identifying the best low-dimensional descriptor in an immensity of offered candidates</span>, Phys. Rev. Materials  2, 083802 (2018) <a href="https://journals.aps.org/prmaterials/abstract/10.1103/PhysRevMaterials.2.083802" target="_blank">[PDF]</a>. <br> </div>

 SISSO($\ell_0$) works iteratively. In the first iteration, a number k of features is collected that have the largest correlation (scalar product) with the property vector. The feature with the largest correlation is simply the 1D descriptor. Next, a residual is constructed as the error made at the first iteration. A new set of k features is now selected as those having the largest  correlation with the residual. The 2D descriptor is the pair of features that yield the smallest fitting error upon least square regression, among all possible pairs contained in the union of the sets selected in this and the first iteration. In each next iteration a new residual is constructed as the error made in the previous iteration, then a new set of k features is extracted as those that have largest correlation with each new residual. The nD descriptor is the n-tuple of features that yield the smallest fitting error upon least square regression, among all possible n-tuples contained in the union of the sets obtained in each new iteration and all the previous iterations. If k=1 the method collapses to the so-called orthogonal matching pursuit.

 The prediction of the ground-state structure for binary compounds from a simple descriptor has a notable history in materials science [1-7], where descriptors were designed by chemically/physically-inspired intuition. The tool presented here allows for the machine-learning-aided automatic discovery of a descriptor and a model for the prediction of the difference in energy between a selected pair of structures for 82 octet binary materials.


   References:
            <ol>
                <li>J. A. van Vechten, Phys. Rev. 182, 891 (1969).</li>
                <li>J. C. Phillips, Rev. Mod. Phys. 42, 317 (1970).</li>
                <li>J. John and A. N. Bloch, Phys. Rev. Lett. 33, 1095 (1974).</li>
                <li>J. R. Chelikowsky and J. C. Phillips, Phys. Rev. B 17, 2453 (1978).</li>
                <li>A. Zunger, Phys. Rev. B 22, 5839 (1980).</li>
                <li>D. G. Pettifor, Solid State Commun. 51, 31 (1984).</li>
                <li>Y. Saad, D. Gao, T. Ngo, S. Bobbitt, J. R. Chelikowsky, and W. Andreoni, Phys. Rev. B 85, 104104 (2012).</li>
                </ol>


 In this example, you can run a compressed-sensing based algorithm for finding the optimal descriptor and model that predicts the difference in energy between crystal structures (here, zincblende vs. rocksalt).
 The descriptor is selected out of a large number of candidates constructed as functions of basic input features, the primary features.

 </details>

 %% Cell type:markdown id: tags:

 The idea demonstrated in this tutorial is to start from simple physical quantities ("primary features", here properties of the constituent free atoms such as orbital radii), to generate millions (or billions) of candidate formulas by applying arithmetic operations combining primary features. These candidate formulas constitute the so-called "feature space". Then, SISSO is used to select only a few of these formulas that explain the data.

 By clicking directly on "Run" below, you can reproduce the 2D map as published in <a href="http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.10550" target="_blank">PRL 2015</a>. You can also select primary features and allowed operations (by clicking the check-boxes), as well as the SISSO rung (i.e., the number of iterations in the construction of the feature space), the number of features that are selected at each iteration of the SIS step, and the max number of dimensions of the model. Then press "Run". \
 After the results are shown for all models from one dimensional to the max chosen dimension, you can press "Plot interactive map" to reveal a map of the RS vs ZB relative stability, for the highest dimensional model. If the highest dimension model is 2D, the separation line between the two phases (i.e., the locus where the predicted $\Delta$E is zero) is shown. For higher dimensional models, the 3rd and 4th dimensions can be visualized via the size or the color of the data-point markers. Intuitive drop-down menus allow to assign axes, markers, and colors, to the descriptor components of choice.

 With the selection of "PRL2015" as SISSO rung, a special feature space is uploaded, which allows for the reproduction of also the 1D and 3D models as published in <a href="http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.10550" target="_blank">PRL 2015</a> . This is because in <a href="http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.10550" target="_blank">PRL-2015</a> a slightly different criterion for the construction of the feature set was adopted, compared to <a href="https://journals.aps.org/prmaterials/abstract/10.1103/PhysRevMaterials.2.083802" target="_blank">PRM-2018</a>.

 %% Cell type:code id: tags:startup

 ``` python
 import os
 import math
 import pandas as pd
 import numpy as np
 from itertools import combinations
 from time import time
 import matplotlib.pyplot as plt
 import scipy.stats as ss
 import warnings
 from collections import Counter
 from sklearn.kernel_ridge import KernelRidge
 from sklearn.model_selection import GridSearchCV, LeaveOneOut
 from IPython.display import HTML, clear_output
 import ipywidgets as widgets

 import plotly.graph_objects as go
 from ipywidgets import widgets, interactive
 from jupyter_jsmol import JsmolView

 import nglview
 from ase.io import read

 from descriptor_role.utils import generate_structures
 from descriptor_role.visualizer import Visualizer
 from sissopp import Inputs, FeatureSpace, SISSORegressor, FeatureNode, Unit
 from sissopp.py_interface import read_csv
 from sissopp.py_interface.import_dataframe import get_unit
+from sissopp.py_interface.estimate_feat_space_size import get_max_number_feats

 import timeit

 # set display options for the notebook
 %matplotlib inline
 warnings.filterwarnings('ignore')
 ```

-%% Output
-
-
 %% Cell type:code id: tags:startup

 ``` python
 # load data
 RS_structures = read("data/descriptor_role/RS_structures.xyz", index=':')
 ZB_structures = read("data/descriptor_role/ZB_structures.xyz", index=':')

 def generate_table(RS_structures, ZB_structures):

    for RS, ZB in zip(RS_structures, ZB_structures):
        energy_diff = RS.info['energy'] - ZB.info['energy']
        min_struc_type = 'RS' if energy_diff < 0 else 'ZB'
        struc_obj_min = RS if energy_diff < 0 else ZB

        yield [RS.info['energy'], ZB.info['energy'],
               energy_diff, min_struc_type,
               RS.info['Z'], ZB.info['Z'],
               RS.info['period'], ZB.info['period'],
               RS.info['IP'], ZB.info['IP'],
               RS.info['EA'], ZB.info['EA'],
               RS.info['E_HOMO'], ZB.info['E_HOMO'],
               RS.info['E_LUMO'], ZB.info['E_LUMO'],
               RS.info['r_s'], ZB.info['r_s'],
               RS.info['r_p'], ZB.info['r_p'],
               RS.info['r_d'], ZB.info['r_d'],
               abs(RS.info['r_p']+RS.info['r_s']-ZB.info['r_p']-ZB.info['r_s']),
               abs(RS.info['r_p']-RS.info['r_s'])+abs(ZB.info['r_p']-ZB.info['r_s']),
               RS, ZB, struc_obj_min]

 df = pd.DataFrame(
    generate_table(RS_structures, ZB_structures),
    columns=['energy_RS', 'energy_ZB',
             'energy_diff', 'min_struc_type',
             'Z_A (nuc_charge)', 'Z_B (nuc_charge)',
             'period_A (unitless)', 'period_B (unitless)',
             'IP_A (eV_IP)', 'IP_B (eV_IP)',
             'EA_A (eV_IP)', 'EA_B (eV_IP)',
             'E_HOMO_A (eV)', 'E_HOMO_B (eV)',
             'E_LUMO_A (eV)', 'E_LUMO_B (eV)',
             'r_s_A', 'r_s_B',
             'r_p_A', 'r_p_B',
             'r_d_A', 'r_d_B',
             'r_sigma', 'r_pi',
             'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'],
    index=list(RS.get_chemical_formula() for RS in RS_structures)
 )

 # print data without structure objects
 df_reduced = df.drop(['energy_RS', 'energy_ZB', 'min_struc_type', 'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'], axis=1)
 ```

 %% Cell type:code id: tags:

 ``` python
 def make_phi0_fromdf (df_reduced):

    phi_0 = []
    for cc, col in enumerate(df_reduced.columns[1:]):

        phi_0.append(
            FeatureNode(
                cc,
                col.split("(")[0],
                df[col].to_numpy(),
                np.zeros(0),
                get_unit(col),
            )
        )

    return phi_0
 ```

 %% Cell type:code id: tags:startup

 ``` python
 def get_feat_space_and_sisso_regressor(
    df_selected,
    selected_ops=["add", "abs_diff", "div", "sq", "exp"],
    selected_features = 'all',
    max_rung=2,
    n_sis_select=50,
    n_dim=2,
    n_residual=1,
    prl_2015=False,
 ):

    if prl_2015:

        phi_0 = make_phi0_fromdf (df_selected)

        feat_space = FeatureSpace(
            "./data/descriptor_role/phi.txt",
            phi_0,
            df["energy_diff"].to_numpy(),
            [len(df.index)],
            project_type='regression',
            cross_corr_max=1.0,
            n_sis_select=n_sis_select,
        )
        inputs = Inputs()
        inputs.allowed_ops = selected_ops
        inputs.n_dim = n_dim
        inputs.max_rung = max_rung
        inputs.n_residual = n_residual
        inputs.n_model_store = 1
        inputs.calc_type = "regression"
        inputs.leave_out_inds = []
        inputs.task_sizes_train = [82]
        inputs.task_sizes_test = [0]
        inputs.sample_ids_train = df_selected.index.tolist()
        inputs.prop_train = df_selected["energy_diff"].to_numpy()
        inputs.prop_test = np.array([])
        inputs.prop_label = "energy_diff"
        inputs.prop_unit = Unit("eV")
        inputs.task_names = ["all_mats"]

    else:

        inputs = read_csv(
            df_selected,
            prop_key="energy_diff",
            cols=selected_features,
            max_rung=max_rung,
            leave_out_frac=0.0
            )
        inputs.allowed_ops = selected_ops
        inputs.n_sis_select = n_sis_select
        inputs.n_dim = n_dim
        inputs.max_rung = max_rung
        inputs.n_residual = n_residual
        inputs.n_model_store = 1
        inputs.calc_type = "regression"
        inputs.leave_out_inds = []
        inputs.task_sizes_train = [82]
        inputs.task_sizes_test = [0]
        inputs.sample_ids_train = df_selected.index.tolist()
        inputs.prop_train = df_selected["energy_diff"].to_numpy()
        inputs.prop_test = np.array([])
        inputs.prop_label = "energy_diff"
        inputs.prop_unit = Unit("eV")
        inputs.task_names = ["all_mats"]

        feat_space = FeatureSpace(inputs)

    sisso = SISSORegressor(inputs, feat_space)

    return feat_space, sisso
 ```

 %% Cell type:code id: tags:startup

 ``` python
 def default_op_and_feat():
    #this is used both for 'default_selection' and 'prl_selection'
    default_operations = ['add','abs_diff','exp','sq','div']
    default_features = ['IP','EA','r_s','r_p','r_d']
    for op, widget in zip(possible_operations, op_list):
        widget.value = op in default_operations
    for feat, widget in zip(possible_features, feat_list):
        widget.value = feat in default_features

 def prl_selection(change):
    if change['new'] == 'PRL2015':
        default_op_and_feat()
        dimension_selection.value = 2
        for widget in op_list+feat_list:
            widget.disabled = True
    else:
        for widget in op_list+feat_list:
            widget.disabled = False

 def default_selection(b):
    default_op_and_feat()
    rung_selection.value = 2
    sis_selection.value = 50
    dimension_selection.value = 2

 def run(b):
    with out2:
        clear_output()
    with out1:
        clear_output()
        print('Calculating...', flush=True)

        selected_features = []
        allowed_operations = []

        print('to select features', flush=True)

        for op, widget in zip(possible_operations, op_list):
            if widget.value:
                allowed_operations.append(op)

        for feat, widget in zip(possible_features, feat_list):
            if widget.value:
                if feat == 'r_sigma' or feat == 'r_pi':
                    selected_features.append(feat)
                else:
                    selected_features.extend([feat+nuc for nuc in ['_A', '_B']])

        if rung_selection.value == 'PRL2015':
+            n_features = 5
            reproduce_prl = True
+            tier = 2
            max_rung = 2
        else:
+            tier = rung_selection.value
+            n_features = len(selected_features)
            reproduce_prl = False
            max_rung = rung_selection.value

        # set as global because these variables are then used in the visualizer
        global feat_space
        global sisso

+        size_estimate = get_max_number_feats(allowed_operations, n_features, tier)
+        time_estimate = fs_time_estimate(fs_size_estimate(size_estimate))
+        minutes = int(time_estimate/60)
+        seconds = time_estimate-60*minutes
+        with out3:
+            clear_output()
+            print('Estimated calculation time: {}min and {:.1f}s'.format(minutes, seconds), flush=True)
+        if time_estimate > 1000:
+            print('The calculation is estimated to take too long. Please change the settings to reduce the calculation time (e.g. reduce the SISSO rung, the number of operations or the number of features)')
+            return
+
        try:
            start = timeit.default_timer()

            feat_space, sisso = get_feat_space_and_sisso_regressor(
                df_selected = df_reduced,
                selected_ops = allowed_operations,
                selected_features = selected_features,
                max_rung = max_rung,
                n_sis_select = sis_selection.value,
                n_dim = dimension_selection.value,
                n_residual = 1,
                prl_2015 = reproduce_prl
            )

            clear_output()
            if (dimension_selection.value>1):
                plot_button.disabled=False
            else:
                plot_button.disabled=True

            clear_output()
            print("Number of features generated: " + str(feat_space.n_feat), flush=True)

            try:
                sisso.fit()

                for i in range(dimension_selection.value):
                    print(str(i+1)+'D model')
                    print("RMSE: {:.4} | Descriptor: {}".format(sisso.models[i][0].rmse, sisso.models[i][0]))
                    string = "c0:{:.4}".format(sisso.models[i][0].coefs[0][0])
                    for j in range(i+1):
                        string = string + str("  |  a"+str(j)+":{:.4}".format(sisso.models[i][0].coefs[0][j]))
                    print(string + '\n')


            except RuntimeError:
                print("\nThe number of selected features per SIS iteration is bigger than the number of features available. Please reduce the number of selected features per SIS iteration (number of features generated / max number of dimensions) or increase the number of selected features and operations.")
        except RuntimeError:
            print('The present selection does not lead to the creation of any derived features in the highest selected rung, please select at least one binary or power operator, or reduce the maximum rung')

 def plot_2d_solution(b):
    with out2:
        clear_output()
        generate_structures (RS_structures,ZB_structures)
        visualizer=Visualizer(df, sisso, feat_space)
        visualizer.show()

 def fs_size_estimate(max_size):
    c0 = 0.6192417
    c1 = 0.4310250
    c2 = 0.1675253
    c3 = -0.0240753
    c4 = 0.0010133
    x = math.log10(max_size)
    fs_size = 10.0**(c0 + c1*x + c2*x**2 + c3*x**3 +  c4*x**4)
    return fs_size

 def fs_time_estimate(fs_size):
    c0 = -2.986271
    c1 = 0.337106
    c2 = 0.063651
    x = math.log10(fs_size)
    time = 10.0**(c0 + c1*x + c2*x**2)
    return time
 ```

 %% Cell type:code id: tags:startup

 ``` python
 cb_layout = widgets.Layout(width = '15px')
 thin_layout = widgets.Layout(width = '100px')
 mid_layout = widgets.Layout(width = '200px')
 wide_layout = widgets.Layout(width = '300px')

 #possible_operations = ['add', 'sub', 'abs_diff', 'mult', 'div', 'exp', 'neg_exp', 'inv', 'sq', 'cb',
 #                       'six_pow', 'sqrt', 'cbrt', 'log', 'abs', 'sin', 'cos']
 possible_operations = ['add', 'sub', 'abs_diff', 'mult', 'div', 'exp', 'neg_exp', 'inv', 'sq', 'cb',
                        'sqrt', 'cbrt', 'log', 'abs']

 possible_features = ['Z','IP','EA','E_HOMO','E_LUMO','r_s','r_p','r_d', 'r_sigma', 'r_pi']

 tooltips = {
    "Z" : "Atomic number",
    "IP" : "Atomic ionization potential",
    "EA" : "Atomic electron affinity",
    "E_HOMO" : "Energy of highest occupied atomic orbital",
    "E_LUMO" : "Energy of lowest unoccupied molecular orbital",
    "r_s" : "Radius at which the radial probability density of the valence s orbital is maximum",
    "r_p" : "Radius at which the radial probability density of the valence p orbital is maximum",
    "r_d" : "Radius at which the radial probability density of the valence d orbital is maximum",
    "r_sigma" : "John-Bloch's indicator 1: |rp(A) + rs(A) - rp(B)  - rs(B)|  [Phys. Rev. Lett. 33. 1095 (1974)",
    "r_pi": "John-Bloch's indicator2: |rp(A) - rs(A)| +| rp(B) - rs(B)| [Phys. Rev. Lett. 33. 1095 (1974)]"
 }

 labels = {
    'add' : '$x + y$', 'sub' : '$x - y$', 'abs_diff' : '$|x - y|$', 'mult' : '$x \cdot y$', 'div' : '$x / y$',
    'exp' : '$\exp(x)$', 'neg_exp' : '$\exp(-x)$', 'inv' : '$1/x$', 'sq' : '$x^2$', 'cb' : '$x^3$',
    'six_pow' : '$x^6$', 'sqrt' : '$\sqrt{x}$', 'cbrt' : '$\sqrt[3]{x}$', 'log' : '$\log(x)$',
    'abs' :  '$|x|$', 'sin' : '$\sin(x)$', 'cos' : '$\cos(x)$', 'Z' : '$Z$', 'IP' : '$IP$', 'EA' : '$EA$',
    'E_HOMO' : '$E_{HOMO}$', 'E_LUMO' : '$E_{LUMO}$', 'r_s' : '$r_s$', 'r_p' : '$r_p$', 'r_d' :  '$r_d$',
    'r_sigma' : '$r_\sigma$', 'r_pi' : '$r_\pi$'
 }

 op_list = []
 op_labels  = []
 feat_list = []
 feat_labels = []
 for operation in possible_operations:
    op_list.append(widgets.Checkbox(description='', value=True, indent=False, layout=cb_layout))
    op_labels.append(widgets.Label(value=labels[operation]))
 for feature in possible_features:
    feat_list.append(widgets.Checkbox(description=tooltips[feature], value=True, indent=False, layout=cb_layout))
    feat_labels.append(widgets.Label(value=labels[feature]))

 op_box = widgets.VBox([widgets.Label()]+op_list)
 op_label_box = widgets.VBox([widgets.Label(value='Operations:', layout=thin_layout)]+op_labels)
 feat_box = widgets.VBox([widgets.Label()]+feat_list)
 feat_label_box = widgets.VBox([widgets.Label(value='Features:', layout=thin_layout)]+feat_labels)

 rung_selection = widgets.Dropdown(options=['PRL2015', 1,2,3], layout=thin_layout)
-sis_selection = widgets.BoundedIntText(value=26, min=1, max=100, step=1, layout=thin_layout)
+sis_selection = widgets.BoundedIntText(value=26, min=1, max=5000, step=1, layout=thin_layout)
 dimension_selection = widgets.BoundedIntText(value = 3, min=1, max=4, step=1, layout = thin_layout)

 settings_box = widgets.VBox([
    widgets.Label(value='Settings:', layout=wide_layout),
    widgets.Label(value='SISSO rung:', layout=wide_layout),
    rung_selection,
    widgets.Label(value='Number of selected features per SIS iteration:', layout=wide_layout),
    sis_selection,
    widgets.Label(value='Maximum number of dimensions:', layout=wide_layout),
    dimension_selection])

 default_button = widgets.Button(description = 'Default selection', layout=mid_layout)
 run_button = widgets.Button(description = 'Run', layout=mid_layout)
 plot_button = widgets.Button(description = 'Plot interactive map', disabled=True, layout=mid_layout)
 default_button.on_click(default_selection)
 run_button.on_click(run)
 plot_button.on_click(plot_2d_solution)
 button_box = widgets.VBox([default_button, run_button, plot_button])

 out1 = widgets.Output()
 out2 = widgets.Output()
 out3 = widgets.Output()

 gui_box = widgets.HBox([op_box, op_label_box, feat_box, feat_label_box, settings_box, button_box])
 out_box = widgets.VBox([gui_box, out3, out1, out2])

 rung_selection.observe(prl_selection, names='value')

 default_selection('')
 display(out_box)
 ```
-
-%% Output
-