From a30357b0ac624d9eb12cbaa2bc2b132d06b0f2fa Mon Sep 17 00:00:00 2001
From: Luigi Sbailo <sbailo@fhi-berlin.mpg.de>
Date: Wed, 9 Dec 2020 22:35:07 +0100
Subject: [PATCH] Restore previous SISSO implementation for fair comparisons

---
 compressed_sensing.ipynb | 187 ++++++++++++++++++---------------------
 1 file changed, 87 insertions(+), 100 deletions(-)

diff --git a/compressed_sensing.ipynb b/compressed_sensing.ipynb
index 84242fb..c5fc700 100644
--- a/compressed_sensing.ipynb
+++ b/compressed_sensing.ipynb
@@ -83,8 +83,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:39.953385Z",
-     "start_time": "2020-12-08T09:03:39.026752Z"
+     "end_time": "2020-12-09T21:31:49.230962Z",
+     "start_time": "2020-12-09T21:31:47.034743Z"
     }
    },
    "outputs": [],
@@ -145,8 +145,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:42.922862Z",
-     "start_time": "2020-12-08T09:03:42.801656Z"
+     "end_time": "2020-12-09T21:31:49.389466Z",
+     "start_time": "2020-12-09T21:31:49.232579Z"
     },
     "scrolled": true
    },
@@ -211,8 +211,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:43.573065Z",
-     "start_time": "2020-12-08T09:03:43.170821Z"
+     "end_time": "2020-12-09T21:31:49.786278Z",
+     "start_time": "2020-12-09T21:31:49.391244Z"
     }
    },
    "outputs": [],
@@ -238,8 +238,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:43.577912Z",
-     "start_time": "2020-12-08T09:03:43.574728Z"
+     "end_time": "2020-12-09T21:31:49.791034Z",
+     "start_time": "2020-12-09T21:31:49.788111Z"
     }
    },
    "outputs": [],
@@ -263,8 +263,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:43.720363Z",
-     "start_time": "2020-12-08T09:03:43.673644Z"
+     "end_time": "2020-12-09T21:31:49.842588Z",
+     "start_time": "2020-12-09T21:31:49.792447Z"
     },
     "scrolled": true
    },
@@ -306,8 +306,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:44.079388Z",
-     "start_time": "2020-12-08T09:03:44.069671Z"
+     "end_time": "2020-12-09T21:31:49.848867Z",
+     "start_time": "2020-12-09T21:31:49.844112Z"
     }
    },
    "outputs": [],
@@ -342,8 +342,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:45.786743Z",
-     "start_time": "2020-12-08T09:03:45.759462Z"
+     "end_time": "2020-12-09T21:31:49.875013Z",
+     "start_time": "2020-12-09T21:31:49.850538Z"
     },
     "scrolled": true
    },
@@ -357,8 +357,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:03:46.659432Z",
-     "start_time": "2020-12-08T09:03:46.560523Z"
+     "end_time": "2020-12-09T21:31:49.985837Z",
+     "start_time": "2020-12-09T21:31:49.876873Z"
     },
     "scrolled": true
    },
@@ -391,8 +391,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:06.908636Z",
-     "start_time": "2020-12-08T09:03:47.956918Z"
+     "end_time": "2020-12-09T21:32:09.553916Z",
+     "start_time": "2020-12-09T21:31:49.987916Z"
     },
     "scrolled": false
    },
@@ -427,8 +427,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:07.160724Z",
-     "start_time": "2020-12-08T09:04:06.910229Z"
+     "end_time": "2020-12-09T21:32:09.785212Z",
+     "start_time": "2020-12-09T21:32:09.555346Z"
     }
    },
    "outputs": [],
@@ -481,8 +481,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T08:40:40.153539Z",
-     "start_time": "2020-12-08T08:40:40.149276Z"
+     "end_time": "2020-12-09T21:32:09.791576Z",
+     "start_time": "2020-12-09T21:32:09.787208Z"
     }
    },
    "outputs": [],
@@ -522,8 +522,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T08:40:40.501898Z",
-     "start_time": "2020-12-08T08:40:40.155648Z"
+     "end_time": "2020-12-09T21:32:10.154805Z",
+     "start_time": "2020-12-09T21:32:09.793271Z"
     },
     "scrolled": true
    },
@@ -542,8 +542,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T08:40:40.773979Z",
-     "start_time": "2020-12-08T08:40:40.503329Z"
+     "end_time": "2020-12-09T21:32:10.443908Z",
+     "start_time": "2020-12-09T21:32:10.156133Z"
     }
    },
    "outputs": [],
@@ -581,6 +581,25 @@
     "### The SISSO method"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-12-09T21:32:10.857853Z",
+     "start_time": "2020-12-09T21:32:10.445342Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# import data\n",
+    "selected_feature_list = ['r_s', 'r_p', 'r_d', 'EA', 'IP']\n",
+    "allowed_operations = ['+','|-|','exp', '^2']\n",
+    "P, df_D = get_data(selected_feature_list, allowed_operations)\n",
+    "D = df_D.values\n",
+    "features_list = df_D.columns.tolist()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -593,14 +612,45 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:07.243017Z",
-     "start_time": "2020-12-08T09:04:07.162549Z"
+     "end_time": "2020-12-09T21:32:15.329849Z",
+     "start_time": "2020-12-09T21:32:15.101788Z"
     },
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    "# here we define a different dataframe to make it compatible with the SISSO regressor object\n",
+    "sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n",
+    "\n",
+    "sisso.fit(D, P)\n",
+    "sisso.print_models(features_list)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Run the SISSO method with a (relatively) big feature space\n",
+    "<div style=\"list-style:disc; margin: 2px;padding: 10px;border: 0px;border:8px double   green; font-size:16px;padding-left: 32px;padding-right: 22px; width:89%\">\n",
+    "<li>Reproduce the results from the <a href=\"http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.105503\" target=\"_blank\">reference publication</a>  by including further features.</li>\n",
+    "<li>Visualize the 2D descriptors in a structure map.</li>\n",
+    "<li>Experiment with different settings and investigate the influence of the input parameters on the results. (OPTIONAL)</li>\n",
+    "</div>\n",
+    "Note the size of the feature space, the time needed to run the code, and the accuracy (using the default settings)!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2020-12-09T21:33:34.682503Z",
+     "start_time": "2020-12-09T21:33:34.590337Z"
+    },
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# here we define a different dataframe to make it compatible with the C++ implementation of SISSO\n",
     "# load data\n",
     "RS_structures = read(\"data/compressed_sensing/RS_structures.xyz\", index=':')\n",
     "ZB_structures = read(\"data/compressed_sensing/ZB_structures.xyz\", index=':')\n",
@@ -654,71 +704,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:07.346777Z",
-     "start_time": "2020-12-08T09:04:07.244250Z"
-    },
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "n_nonzero_coefs=3\n",
-    "n_features_per_sis_iter=10\n",
-    "phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(\n",
-    "    df_plus, \"energy_diff\", \n",
-    "    cols=['r_s_A', 'r_p_A', 'r_d_A', 'EA_A', 'IP_A', 'r_s_B', 'r_p_B', 'r_d_B', 'EA_B', 'IP_B'], \n",
-    "    task_key=None, leave_out_frac=0.0, leave_out_inds=None\n",
-    ")\n",
-    "feat_space = generate_fs(\n",
-    "    phi_0, \n",
-    "    prop, \n",
-    "    task_sizes_train, \n",
-    "    ['add','abs_diff','exp', 'sq'],\n",
-    "    'regression',  \n",
-    "    2, \n",
-    "    n_features_per_sis_iter\n",
-    ")\n",
-    "sisso = SISSORegressor(\n",
-    "    feat_space,\n",
-    "    prop_unit,\n",
-    "    prop,\n",
-    "    prop_test,\n",
-    "    task_sizes_train,\n",
-    "    task_sizes_test,\n",
-    "    leave_out_inds,\n",
-    "    n_nonzero_coefs,\n",
-    "    1,\n",
-    "    1\n",
-    ")\n",
-    "sisso.fit()\n",
-    "for i in range(n_nonzero_coefs):\n",
-    "    print(str(i+1)+'D model')\n",
-    "    print(\"RMSE: {:.4} | Descriptor: {}\".format(sisso.models[i][0].rmse, sisso.models[i][0]))\n",
-    "    string = \"c0:{:.4}\".format(sisso.models[i][0].coefs[0][-1])\n",
-    "    for j in range(i+1):\n",
-    "        string = string + str(\"  |  a\"+str(j)+\":{:.4}\".format(sisso.models[i][0].coefs[0][j]))\n",
-    "    print(string + '\\n')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Run the SISSO method with a (relatively) big feature space\n",
-    "<div style=\"list-style:disc; margin: 2px;padding: 10px;border: 0px;border:8px double   green; font-size:16px;padding-left: 32px;padding-right: 22px; width:89%\">\n",
-    "<li>Reproduce the results from the <a href=\"http://journals.aps.org/prl/abstract/10.1103/PhysRevLett.114.105503\" target=\"_blank\">reference publication</a>  by including further features.</li>\n",
-    "<li>Visualize the 2D descriptors in a structure map.</li>\n",
-    "<li>Experiment with different settings and investigate the influence of the input parameters on the results. (OPTIONAL)</li>\n",
-    "</div>\n",
-    "Note the size of the feature space, the needed time to run the code and the accuracy (using the default settings)!"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:15.868023Z",
-     "start_time": "2020-12-08T09:04:10.727266Z"
+     "end_time": "2020-12-09T21:33:39.961597Z",
+     "start_time": "2020-12-09T21:33:34.758589Z"
     },
     "scrolled": false
    },
@@ -774,8 +761,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:16.257735Z",
-     "start_time": "2020-12-08T09:04:15.869867Z"
+     "end_time": "2020-12-09T21:33:42.060185Z",
+     "start_time": "2020-12-09T21:33:41.655421Z"
     },
     "scrolled": false
    },
@@ -831,8 +818,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:16.439244Z",
-     "start_time": "2020-12-08T09:04:16.259950Z"
+     "end_time": "2020-12-09T21:33:45.327451Z",
+     "start_time": "2020-12-09T21:33:45.136210Z"
     }
    },
    "outputs": [],
@@ -854,8 +841,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:17.222753Z",
-     "start_time": "2020-12-08T09:04:16.440847Z"
+     "end_time": "2020-12-09T21:33:46.787348Z",
+     "start_time": "2020-12-09T21:33:45.846486Z"
     },
     "scrolled": false
    },
-- 
GitLab