diff --git a/compressed_sensing.ipynb b/compressed_sensing.ipynb
index bfee0c66297bdeb19c1c2b58d3a79cbab978b9d8..b667aedc88543713979fab02b897590842b39920 100644
--- a/compressed_sensing.ipynb
+++ b/compressed_sensing.ipynb
@@ -80,353 +80,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-02-05T11:34:54.071622Z",
-     "start_time": "2021-02-05T11:34:52.319198Z"
+     "end_time": "2021-02-08T13:51:13.168903Z",
+     "start_time": "2021-02-08T13:51:11.526672Z"
     }
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Bad key \"text.kerning_factor\" on line 4 in\n",
-      "/home/sbailo/anaconda3/envs/analytics/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.\n",
-      "You probably need to get an updated matplotlibrc file from\n",
-      "http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template\n",
-      "or from the matplotlib source distribution\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5e06b0fdf10d4dd7933e31fcac5c96fc",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "_ColormakerRegistry()"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "    <div class=\"bk-root\">\n",
-       "        <a href=\"https://bokeh.pydata.org\" target=\"_blank\" class=\"bk-logo bk-logo-small bk-logo-notebook\"></a>\n",
-       "        <span id=\"1001\">Loading BokehJS ...</span>\n",
-       "    </div>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "application/javascript": [
-       "\n",
-       "(function(root) {\n",
-       "  function now() {\n",
-       "    return new Date();\n",
-       "  }\n",
-       "\n",
-       "  var force = true;\n",
-       "\n",
-       "  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n",
-       "    root._bokeh_onload_callbacks = [];\n",
-       "    root._bokeh_is_loading = undefined;\n",
-       "  }\n",
-       "\n",
-       "  var JS_MIME_TYPE = 'application/javascript';\n",
-       "  var HTML_MIME_TYPE = 'text/html';\n",
-       "  var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n",
-       "  var CLASS_NAME = 'output_bokeh rendered_html';\n",
-       "\n",
-       "  /**\n",
-       "   * Render data to the DOM node\n",
-       "   */\n",
-       "  function render(props, node) {\n",
-       "    var script = document.createElement(\"script\");\n",
-       "    node.appendChild(script);\n",
-       "  }\n",
-       "\n",
-       "  /**\n",
-       "   * Handle when an output is cleared or removed\n",
-       "   */\n",
-       "  function handleClearOutput(event, handle) {\n",
-       "    var cell = handle.cell;\n",
-       "\n",
-       "    var id = cell.output_area._bokeh_element_id;\n",
-       "    var server_id = cell.output_area._bokeh_server_id;\n",
-       "    // Clean up Bokeh references\n",
-       "    if (id != null && id in Bokeh.index) {\n",
-       "      Bokeh.index[id].model.document.clear();\n",
-       "      delete Bokeh.index[id];\n",
-       "    }\n",
-       "\n",
-       "    if (server_id !== undefined) {\n",
-       "      // Clean up Bokeh references\n",
-       "      var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n",
-       "      cell.notebook.kernel.execute(cmd, {\n",
-       "        iopub: {\n",
-       "          output: function(msg) {\n",
-       "            var id = msg.content.text.trim();\n",
-       "            if (id in Bokeh.index) {\n",
-       "              Bokeh.index[id].model.document.clear();\n",
-       "              delete Bokeh.index[id];\n",
-       "            }\n",
-       "          }\n",
-       "        }\n",
-       "      });\n",
-       "      // Destroy server and session\n",
-       "      var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n",
-       "      cell.notebook.kernel.execute(cmd);\n",
-       "    }\n",
-       "  }\n",
-       "\n",
-       "  /**\n",
-       "   * Handle when a new output is added\n",
-       "   */\n",
-       "  function handleAddOutput(event, handle) {\n",
-       "    var output_area = handle.output_area;\n",
-       "    var output = handle.output;\n",
-       "\n",
-       "    // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n",
-       "    if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n",
-       "      return\n",
-       "    }\n",
-       "\n",
-       "    var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n",
-       "\n",
-       "    if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n",
-       "      toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n",
-       "      // store reference to embed id on output_area\n",
-       "      output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n",
-       "    }\n",
-       "    if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n",
-       "      var bk_div = document.createElement(\"div\");\n",
-       "      bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n",
-       "      var script_attrs = bk_div.children[0].attributes;\n",
-       "      for (var i = 0; i < script_attrs.length; i++) {\n",
-       "        toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n",
-       "      }\n",
-       "      // store reference to server id on output_area\n",
-       "      output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n",
-       "    }\n",
-       "  }\n",
-       "\n",
-       "  function register_renderer(events, OutputArea) {\n",
-       "\n",
-       "    function append_mime(data, metadata, element) {\n",
-       "      // create a DOM node to render to\n",
-       "      var toinsert = this.create_output_subarea(\n",
-       "        metadata,\n",
-       "        CLASS_NAME,\n",
-       "        EXEC_MIME_TYPE\n",
-       "      );\n",
-       "      this.keyboard_manager.register_events(toinsert);\n",
-       "      // Render to node\n",
-       "      var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n",
-       "      render(props, toinsert[toinsert.length - 1]);\n",
-       "      element.append(toinsert);\n",
-       "      return toinsert\n",
-       "    }\n",
-       "\n",
-       "    /* Handle when an output is cleared or removed */\n",
-       "    events.on('clear_output.CodeCell', handleClearOutput);\n",
-       "    events.on('delete.Cell', handleClearOutput);\n",
-       "\n",
-       "    /* Handle when a new output is added */\n",
-       "    events.on('output_added.OutputArea', handleAddOutput);\n",
-       "\n",
-       "    /**\n",
-       "     * Register the mime type and append_mime function with output_area\n",
-       "     */\n",
-       "    OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n",
-       "      /* Is output safe? */\n",
-       "      safe: true,\n",
-       "      /* Index of renderer in `output_area.display_order` */\n",
-       "      index: 0\n",
-       "    });\n",
-       "  }\n",
-       "\n",
-       "  // register the mime type if in Jupyter Notebook environment and previously unregistered\n",
-       "  if (root.Jupyter !== undefined) {\n",
-       "    var events = require('base/js/events');\n",
-       "    var OutputArea = require('notebook/js/outputarea').OutputArea;\n",
-       "\n",
-       "    if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n",
-       "      register_renderer(events, OutputArea);\n",
-       "    }\n",
-       "  }\n",
-       "\n",
-       "  \n",
-       "  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n",
-       "    root._bokeh_timeout = Date.now() + 5000;\n",
-       "    root._bokeh_failed_load = false;\n",
-       "  }\n",
-       "\n",
-       "  var NB_LOAD_WARNING = {'data': {'text/html':\n",
-       "     \"<div style='background-color: #fdd'>\\n\"+\n",
-       "     \"<p>\\n\"+\n",
-       "     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n",
-       "     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n",
-       "     \"</p>\\n\"+\n",
-       "     \"<ul>\\n\"+\n",
-       "     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n",
-       "     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n",
-       "     \"</ul>\\n\"+\n",
-       "     \"<code>\\n\"+\n",
-       "     \"from bokeh.resources import INLINE\\n\"+\n",
-       "     \"output_notebook(resources=INLINE)\\n\"+\n",
-       "     \"</code>\\n\"+\n",
-       "     \"</div>\"}};\n",
-       "\n",
-       "  function display_loaded() {\n",
-       "    var el = document.getElementById(\"1001\");\n",
-       "    if (el != null) {\n",
-       "      el.textContent = \"BokehJS is loading...\";\n",
-       "    }\n",
-       "    if (root.Bokeh !== undefined) {\n",
-       "      if (el != null) {\n",
-       "        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n",
-       "      }\n",
-       "    } else if (Date.now() < root._bokeh_timeout) {\n",
-       "      setTimeout(display_loaded, 100)\n",
-       "    }\n",
-       "  }\n",
-       "\n",
-       "\n",
-       "  function run_callbacks() {\n",
-       "    try {\n",
-       "      root._bokeh_onload_callbacks.forEach(function(callback) {\n",
-       "        if (callback != null)\n",
-       "          callback();\n",
-       "      });\n",
-       "    } finally {\n",
-       "      delete root._bokeh_onload_callbacks\n",
-       "    }\n",
-       "    console.debug(\"Bokeh: all callbacks have finished\");\n",
-       "  }\n",
-       "\n",
-       "  function load_libs(css_urls, js_urls, callback) {\n",
-       "    if (css_urls == null) css_urls = [];\n",
-       "    if (js_urls == null) js_urls = [];\n",
-       "\n",
-       "    root._bokeh_onload_callbacks.push(callback);\n",
-       "    if (root._bokeh_is_loading > 0) {\n",
-       "      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n",
-       "      return null;\n",
-       "    }\n",
-       "    if (js_urls == null || js_urls.length === 0) {\n",
-       "      run_callbacks();\n",
-       "      return null;\n",
-       "    }\n",
-       "    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n",
-       "    root._bokeh_is_loading = css_urls.length + js_urls.length;\n",
-       "\n",
-       "    function on_load() {\n",
-       "      root._bokeh_is_loading--;\n",
-       "      if (root._bokeh_is_loading === 0) {\n",
-       "        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n",
-       "        run_callbacks()\n",
-       "      }\n",
-       "    }\n",
-       "\n",
-       "    function on_error() {\n",
-       "      console.error(\"failed to load \" + url);\n",
-       "    }\n",
-       "\n",
-       "    for (var i = 0; i < css_urls.length; i++) {\n",
-       "      var url = css_urls[i];\n",
-       "      const element = document.createElement(\"link\");\n",
-       "      element.onload = on_load;\n",
-       "      element.onerror = on_error;\n",
-       "      element.rel = \"stylesheet\";\n",
-       "      element.type = \"text/css\";\n",
-       "      element.href = url;\n",
-       "      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n",
-       "      document.body.appendChild(element);\n",
-       "    }\n",
-       "\n",
-       "    for (var i = 0; i < js_urls.length; i++) {\n",
-       "      var url = js_urls[i];\n",
-       "      var element = document.createElement('script');\n",
-       "      element.onload = on_load;\n",
-       "      element.onerror = on_error;\n",
-       "      element.async = false;\n",
-       "      element.src = url;\n",
-       "      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n",
-       "      document.head.appendChild(element);\n",
-       "    }\n",
-       "  };var element = document.getElementById(\"1001\");\n",
-       "  if (element == null) {\n",
-       "    console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n",
-       "    return false;\n",
-       "  }\n",
-       "\n",
-       "  function inject_raw_css(css) {\n",
-       "    const element = document.createElement(\"style\");\n",
-       "    element.appendChild(document.createTextNode(css));\n",
-       "    document.body.appendChild(element);\n",
-       "  }\n",
-       "\n",
-       "  var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.3.4.min.js\"];\n",
-       "  var css_urls = [];\n",
-       "\n",
-       "  var inline_js = [\n",
-       "    function(Bokeh) {\n",
-       "      Bokeh.set_log_level(\"info\");\n",
-       "    },\n",
-       "    \n",
-       "    function(Bokeh) {\n",
-       "      \n",
-       "    },\n",
-       "    function(Bokeh) {} // ensure no trailing comma for IE\n",
-       "  ];\n",
-       "\n",
-       "  function run_inline_js() {\n",
-       "    \n",
-       "    if ((root.Bokeh !== undefined) || (force === true)) {\n",
-       "      for (var i = 0; i < inline_js.length; i++) {\n",
-       "        inline_js[i].call(root, root.Bokeh);\n",
-       "      }if (force === true) {\n",
-       "        display_loaded();\n",
-       "      }} else if (Date.now() < root._bokeh_timeout) {\n",
-       "      setTimeout(run_inline_js, 100);\n",
-       "    } else if (!root._bokeh_failed_load) {\n",
-       "      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n",
-       "      root._bokeh_failed_load = true;\n",
-       "    } else if (force !== true) {\n",
-       "      var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n",
-       "      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n",
-       "    }\n",
-       "\n",
-       "  }\n",
-       "\n",
-       "  if (root._bokeh_is_loading === 0) {\n",
-       "    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n",
-       "    run_inline_js();\n",
-       "  } else {\n",
-       "    load_libs(css_urls, js_urls, function() {\n",
-       "      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n",
-       "      run_inline_js();\n",
-       "    });\n",
-       "  }\n",
-       "}(window));"
-      ],
-      "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n  function now() {\n    return new Date();\n  }\n\n  var force = true;\n\n  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n    root._bokeh_onload_callbacks = [];\n    root._bokeh_is_loading = undefined;\n  }\n\n  \n\n  \n  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n    root._bokeh_timeout = Date.now() + 5000;\n    root._bokeh_failed_load = false;\n  }\n\n  var NB_LOAD_WARNING = {'data': {'text/html':\n     \"<div style='background-color: #fdd'>\\n\"+\n     \"<p>\\n\"+\n     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n     \"</p>\\n\"+\n     \"<ul>\\n\"+\n     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n     \"</ul>\\n\"+\n     \"<code>\\n\"+\n     \"from bokeh.resources import INLINE\\n\"+\n     \"output_notebook(resources=INLINE)\\n\"+\n     \"</code>\\n\"+\n     \"</div>\"}};\n\n  function display_loaded() {\n    var el = document.getElementById(\"1001\");\n    if (el != null) {\n      el.textContent = \"BokehJS is loading...\";\n    }\n    if (root.Bokeh !== undefined) {\n      if (el != null) {\n        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n      }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(display_loaded, 100)\n    }\n  }\n\n\n  function run_callbacks() {\n    try {\n      root._bokeh_onload_callbacks.forEach(function(callback) {\n        if (callback != null)\n          callback();\n      });\n    } finally {\n      delete root._bokeh_onload_callbacks\n    }\n    console.debug(\"Bokeh: all callbacks have finished\");\n  }\n\n  function load_libs(css_urls, js_urls, callback) {\n    if (css_urls == null) css_urls = [];\n    if (js_urls 
== null) js_urls = [];\n\n    root._bokeh_onload_callbacks.push(callback);\n    if (root._bokeh_is_loading > 0) {\n      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n      return null;\n    }\n    if (js_urls == null || js_urls.length === 0) {\n      run_callbacks();\n      return null;\n    }\n    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n    root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n    function on_load() {\n      root._bokeh_is_loading--;\n      if (root._bokeh_is_loading === 0) {\n        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n        run_callbacks()\n      }\n    }\n\n    function on_error() {\n      console.error(\"failed to load \" + url);\n    }\n\n    for (var i = 0; i < css_urls.length; i++) {\n      var url = css_urls[i];\n      const element = document.createElement(\"link\");\n      element.onload = on_load;\n      element.onerror = on_error;\n      element.rel = \"stylesheet\";\n      element.type = \"text/css\";\n      element.href = url;\n      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n      document.body.appendChild(element);\n    }\n\n    for (var i = 0; i < js_urls.length; i++) {\n      var url = js_urls[i];\n      var element = document.createElement('script');\n      element.onload = on_load;\n      element.onerror = on_error;\n      element.async = false;\n      element.src = url;\n      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n      document.head.appendChild(element);\n    }\n  };var element = document.getElementById(\"1001\");\n  if (element == null) {\n    console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. 
\")\n    return false;\n  }\n\n  function inject_raw_css(css) {\n    const element = document.createElement(\"style\");\n    element.appendChild(document.createTextNode(css));\n    document.body.appendChild(element);\n  }\n\n  var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.3.4.min.js\"];\n  var css_urls = [];\n\n  var inline_js = [\n    function(Bokeh) {\n      Bokeh.set_log_level(\"info\");\n    },\n    \n    function(Bokeh) {\n      \n    },\n    function(Bokeh) {} // ensure no trailing comma for IE\n  ];\n\n  function run_inline_js() {\n    \n    if ((root.Bokeh !== undefined) || (force === true)) {\n      for (var i = 0; i < inline_js.length; i++) {\n        inline_js[i].call(root, root.Bokeh);\n      }if (force === true) {\n        display_loaded();\n      }} else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(run_inline_js, 100);\n    } else if (!root._bokeh_failed_load) {\n      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n      root._bokeh_failed_load = true;\n    } else if (force !== true) {\n      var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n    }\n\n  }\n\n  if (root._bokeh_is_loading === 0) {\n    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n    run_inline_js();\n  } else {\n    load_libs(css_urls, js_urls, function() {\n      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n      run_inline_js();\n    });\n  }\n}(window));"
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
     "import pandas as pd\n",
@@ -442,13 +103,14 @@
     "from sklearn.model_selection import GridSearchCV, LeaveOneOut\n",
     "from IPython.display import HTML\n",
     "from jupyter_jsmol import JsmolView\n",
+    "import pathlib\n",
+    "\n",
     "\n",
     "import nglview\n",
     "from ase.units import J\n",
     "\n",
     "from compressed_sensing.sisso import SissoRegressor\n",
     "from compressed_sensing.combine_features import combine_features\n",
-    "from compressed_sensing.utils import generate_structures\n",
     "from compressed_sensing.scatter_plot import  show_scatter_plot\n",
     "from compressed_sensing.visualizer import Visualizer\n",
     "from cpp_sisso import generate_fs, SISSORegressor, generate_phi_0_from_csv, FeatureSpace, get_max_number_feats\n",
@@ -468,7 +130,7 @@
    "metadata": {},
    "source": [
     "# Get the data\n",
-    "Let us load the data from the file data/data.pkl into a data frame. The data was downloaded from the NOMAD archive and the NOMAD atomic data collection. It consists of RS-ZB energy differences (in eV/atom) of the 82 octet binary compounds, structure objects containing the atomic positions of the materials and properties of the atomic constituents. The following atomic features are considered:\n",
+    "Let us load the data from the NOMAD Archive and the `atomicfeaturespackage` package. The data set consists of RS-ZB energy differences (in eV/atom) of the 82 octet binary compounds, structure objects containing the atomic positions of the materials, and properties of the atomic constituents. The following atomic features are considered:\n",
     "\n",
     "<div >\n",
     "   <ul>\n",
@@ -488,7 +150,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "start_time": "2021-02-05T11:35:25.112Z"
+     "end_time": "2021-02-08T13:51:16.037772Z",
+     "start_time": "2021-02-08T13:51:13.170622Z"
     }
    },
    "outputs": [],
@@ -506,8 +169,10 @@
     "                    \"energy_total\": '*',\n",
     "                },\n",
     "                'section_system':{\n",
+    "                    \"chemical_composition_reduced\": '*',\n",
     "                    'atom_labels':'*',\n",
     "                    'atom_positions':'*',\n",
+    "                    'lattice_vectors':'*',\n",
     "                    'section_symmetry':{\n",
     "                        'space_group_number': '*',                    \n",
     "                    }                \n",
@@ -521,6 +186,9 @@
     "    return query\n",
     "\n",
     "def get_target(query):\n",
+    "    \n",
+    "    path_structure = './data/compressed_sensing/structures/'\n",
+    "    pathlib.Path(path_structure).mkdir(parents=True, exist_ok=True)\n",
     "    df_target = pd.DataFrame()\n",
     "    for entry in query:\n",
     "        calculation = entry.section_run[0]\n",
@@ -530,22 +198,40 @@
     "            \"B\": atom_labels[1],\n",
     "            \"space_group\": calculation.section_system[0].section_symmetry[0].space_group_number,\n",
     "            \"energy\": calculation.section_single_configuration_calculation[0].energy_total.magnitude,\n",
-    "            \"positions\": calculation.section_system[0].atom_positions.magnitude,\n",
+    "            'compound': calculation.section_system[0].chemical_composition_reduced,\n",
     "            },\n",
     "            ignore_index=True\n",
     "        )\n",
-    "\n",
-    "    df_target['compound'] = df_target['A'] + df_target['B']   \n",
+    "        atoms = [atom_labels[0], atom_labels[1]]\n",
+    "        # convert positions and lattice vectors from m to AA (scale factor 10**10)\n",
+    "        scale_factor = 10**10\n",
+    "        positions = calculation.section_system[0].atom_positions\n",
+    "        lat_x, lat_y, lat_z = calculation.section_system[0].lattice_vectors.magnitude * scale_factor\n",
+    "        file = open(\"data/compressed_sensing/structures/\"+df_target.iloc[-1]['compound']+\".xyz\", \"w\")\n",
+    "        file.write (\"%d\\n\\n\"%32)\n",
+    "        for i in [0,1,2]:\n",
+    "            for j in [0,1,2]:\n",
+    "                for k in [0,1,2]:\n",
+    "                    for n in range(2):\n",
+    "                        xyz = calculation.section_system[0].atom_positions[n].magnitude * scale_factor\n",
+    "                        xyz += i*lat_x\n",
+    "                        xyz += j*lat_y\n",
+    "                        xyz += k*lat_z\n",
+    "                        file.write(atoms[n])\n",
+    "                        file.write(\"\\t%f\\t%f\\t%f\\n\" % (xyz[0],\n",
+    "                                                       xyz[1],\n",
+    "                                                       xyz[2]))\n",
+    "        file.close()\n",
+    "        \n",
     "    df_RS = df_target.query('space_group==225 or space_group==221').set_index('compound').sort_index()\n",
     "    df_ZB = df_target.query('space_group==216 or space_group==227').set_index('compound').sort_index()\n",
-    "    df_target = df_RS[['A','B', 'positions']]\n",
+    "    df_target = df_RS[['A','B']]\n",
     "    df_target['energy_diff']=(df_RS['energy']-df_ZB['energy'])/2\n",
     "    df_target['min_struc_type']=np.where(df_RS['energy']<df_ZB['energy'],'RS','ZB')\n",
     "\n",
-    "    # convert J in eV and m in AA\n",
+    "    # convert the energy difference from J to eV\n",
     "    df_target['energy_diff'] *= J\n",
-    "    df_target['positions'] *= 10**10\n",
-    "    return df_target[['A', 'B', 'energy_diff', 'min_struc_type', 'positions']]\n",
+    "    return df_target[['A', 'B', 'energy_diff', 'min_struc_type']]\n",
     "\n",
     "# get data (chemical formulas and RS-ZB energy difference) from query\n",
     "query = get_query()\n",
@@ -556,7 +242,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-02-08T13:51:16.060958Z",
+     "start_time": "2021-02-08T13:51:16.040921Z"
+    },
+    "scrolled": true
+   },
    "outputs": [],
    "source": [
     "def get_features(elements, features, rename_dict={}):    \n",
@@ -578,7 +270,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-02-08T13:51:16.080154Z",
+     "start_time": "2021-02-08T13:51:16.062309Z"
+    }
+   },
    "outputs": [],
    "source": [
     "def sort_AB_wrt_electronegativity(df_target, df_features):\n",
@@ -596,6 +293,27 @@
     "df_target = sort_AB_wrt_electronegativity(df_target, df_features)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2021-02-08T13:51:16.141626Z",
+     "start_time": "2021-02-08T13:51:16.081628Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def merge_target_feature(df_target, df_features, suffixes=('(A)', '(B)')):\n",
+    "    df = df_target.merge(df_features, left_on='A', right_index=True)\n",
+    "    df = df.merge(df_features, left_on='B', right_index=True, suffixes=suffixes)\n",
+    "    return df\n",
+    "\n",
+    "# merge target and feature data frame\n",
+    "df = merge_target_feature(df_target, df_features)\n",
+    "df"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -605,34 +323,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2021-01-15T11:11:40.966163Z",
-     "start_time": "2021-01-15T11:11:40.578468Z"
+     "end_time": "2021-02-08T13:51:16.546500Z",
+     "start_time": "2021-02-08T13:51:16.143452Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEICAYAAABF82P+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUa0lEQVR4nO3da7BlZX3n8e8vDWQmSkawD8itbUwoJ2gEmZNWw+jgtaC1RC2idKWUyVBpdWRKJqnU9MQqdd6RmYmTilAyHUGwiqAkipLQ3HSIhCpFGqq5BQgdpg2d7qEbreEyWEO1858Xe3XYHJ5zevfh7L3O5fup2rXXep5n7f1fZwE/1mWvlapCkqSZfq7vAiRJi5MBIUlqMiAkSU0GhCSpyYCQJDUd0ncBC2n16tW1du3avsuQpCXjrrvueqKqplp9yyog1q5dy9atW/suQ5KWjCQ/mq3PQ0ySpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTWMLiCQnJLk1yYNJHkjy6a79yCS3JHmkez9iluXPTPJwku1JNo2rTklS2zj3IPYBv1tVvwK8GfhUkpOBTcB3q+ok4Lvd/AskWQVcApwFnAxs6JaVJE3I2AKiqnZX1d3d9NPAg8BxwNnAld2wK4EPNBZfB2yvqker6jnga91ykqQJmcgvqZOsBd4I3AEcXVW7YRAiSY5qLHIc8NjQ/E7gTbN89kZgI8CaNWsWruiXaO2m60cat+Oi9465Ekman7GfpE7ycuAbwIVV9dSoizXamo++q6rNVTVdVdNTU83biUiS5mGsAZHkUAbhcFVVfbNrfjzJMV3/McCexqI7gROG5o8Hdo2zVknSC43zKqYAlwEPVtUXhrquA87rps8Dvt1Y/E7gpCQnJjkMOLdbTpI0IePcgzgd+CjwjiTbutd64CLg3UkeAd7dzZPk2CRbAKpqH3ABcBODk9vXVNUDY6xVkjTD2E5SV9XttM8lALyzMX4XsH5ofguwZTzVSZIOxF9SS5KaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUNLYHBiW5HHgfsKeqXt+1fR14bTfkFcD/rqpTG8vuAJ4Gfgbsq6rpcdUpSWobW0AAVwAXA1/d31BVH9k/neQPgSfnWP7tVfXE2KqTJM1pnI8cvS3J2lZfkgAfBt4xru+XJL00fZ2DeCvweFU9Mkt/ATcnuSvJxgnWJUnqjPMQ01w2AFfP0X96Ve1KchRwS5KHquq21sAuQDYCrFmzZuErlaQVauJ7EEkOAT4EfH22MVW1q3vfA1wLrJtj7Oaqmq6q6ampqYUuV5JWrD4OMb0LeKiqdrY6k7wsyeH7p4H3APdPsD5JEmMMiCRXA98HXptkZ5Lzu65zmXF4KcmxSbZ0s0cDtye5B/ghcH1V3TiuOiVJbeO8imnDLO3/utG2C1jfTT8KnDKuuiRJo/GX1JKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqSmcT5y9PIke5LcP9T2+ST/kGRb91o/y7JnJnk4yfYkm8ZVoyRpduPcg7gCOLPR/t+q6tTutWVmZ5JVwCXAWcDJwIYkJ4+xTklSw9gCoqpuA34yj0XXAdur6tGqeg74GnD2ghYnSTqgPs5BXJDk3u4Q1BGN/uOAx4bmd3ZtTUk2JtmaZOvevXsXulZJWrEmHRBfAn4JOBXYDfxhY0wabTXbB1bV5qqarqrpqamphalSkjTZgKiqx6vqZ1X1/4A/YXA4aaadwAlD88cDuyZRnyTpeRMNiCTHDM1+ELi/MexO4KQkJyY5DDgXuG4S9UmSnnfIuD44ydXAGcDqJDuBzwFnJDmVwSGjHcDHu7H
HAl+uqvVVtS/JBcBNwCrg8qp6YFx1SpLaxhYQVbWh0XzZLGN3AeuH5rcAL7oEVpI0Of6SWpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktQ0toBIcnmSPUnuH2r7L0keSnJvkmuTvGKWZXckuS/JtiRbx1WjJGl249yDuAI4c0bbLcDrq+oNwN8C/3GO5d9eVadW1fSY6pMkzWFsAVFVtwE/mdF2c1Xt62Z/ABw/ru+XJL00fZ6D+DfADbP0FXBzkruSbJzrQ5JsTLI1yda9e/cueJGStFL1EhBJPgPsA66aZcjpVXUacBbwqSRvm+2zqmpzVU1X1fTU1NQYqpWklWniAZHkPOB9wG9WVbXGVNWu7n0PcC2wbnIVSpJgwgGR5EzgPwDvr6pnZxnzsiSH758G3gPc3xorSRqfcV7mejXwfeC1SXYmOR+4GDgcuKW7hPXSbuyxSbZ0ix4N3J7kHuCHwPVVdeO46pQktR0yrg+uqg2N5stmGbsLWN9NPwqcMq66JEmjOeiASHIEcEJV3TuGerQA1m66fqRxOy5675grkbSUjXSIKclfJfnFJEcC9wBfSfKF8ZYmSerTqOcg/llVPQV8CPhKVf0L4F3jK0uS1LdRA+KQJMcAHwb+coz1SJIWiVED4j8BNwHbq+rOJK8BHhlfWZKkvo16knp3d4M9YHClkecgJGl5G3UP4osjtkmSlok59yCSvAX4dWAqye8Mdf0isGqchUmS+nWgQ0yHAS/vxh0+1P4UcM64ipIk9W/OgKiq7wHfS3JFVf1oQjVJkhaBUU9S/3ySzcDa4WWq6h3jKEqS1L9RA+LPgEuBLwM/G185kqTFYtSA2FdVXxprJZKkRWXUy1z/Ism/TXJMkiP3v8ZamSSpV6PuQZzXvf/eUFsBr1nYciRJi8VIAVFVJ467EEnS4jJSQCT5WKu9qr66sOVIkhaLUc9B/NrQ663A54H3z7VAksuT7Ely/1DbkUluSfJI937ELMuemeThJNuTbBqxRknSAhopIKrq3w29fht4I4NfWc/lCuDMGW2bgO9W1UnAd7v5F0iyCrgEOAs4GdiQ5ORR6pQkLZxR9yBmehY4aa4BVXUb8JMZzWcDV3bTVwIfaCy6jsFtxR+tqueAr3XLSZImaNRzEH/B4KolGNyk71eAa+bxfUdX1W6Aqtqd5KjGmOOAx4bmdwJvmqO2jcBGgDVr1syjpH75/GhJi9Wol7n+16HpfcCPqmrnGOoBSKOtGm2DjqrNwGaA6enpWcdJkg7OqOcgvgc8xOCOrkcAz83z+x7vHl1K976nMWYncMLQ/PHArnl+nyRpnkYKiCQfBn4I/AaD51LfkWQ+t/u+jud/dHce8O3GmDuBk5KcmOQw4NxuOUnSBI16iOkzwK9V1R6AJFPAd4A/n22BJFcDZwCrk+wEPgdcBFyT5Hzg7xkEDkmOBb5cVeural+SCxg8A3sVcHlVPTCflZMkzd+oAfFz+8Oh82MOsPdRVRtm6XpnY+wuYP3Q/BZgy4i1SZLGYNSAuDHJTcDV3fxH8D/gkrSsHeiZ1L/M4NLU30vyIeBfMrjK6PvAVROoT5LUkwOdpP4j4GmAqvpmVf1OVf17BnsPfzTu4iRJ/TlQQKytqntnNlbVVgaPH5UkLVMHCoh/MkffP13IQiRJi8uBTlLfmeS3q+pPhhu7y1TvGl9Zk+ctLyTphQ4UEBcC1yb5TZ4PhGkGd3L94DgLkyT1a86AqKrHgV9P8nbg9V3z9VX1P8ZemSSpV6M+cvRW4NYx1yJJWkTm+zwISdIyZ0BIkpoMCElSkwEhSWoyICRJTQaEJKlp1Nt9q2ej/tJbkhaKexCSpKaJB0SS1ybZNvR6KsmFM8ackeTJoTGfnXSdkrTSTfwQU1U9DJwKkGQV8A/AtY2hf11V75tkbZKk5/V9iOmdwN9V1Y9
6rkOSNEPfAXEuzz/neqa3JLknyQ1JXjfbByTZmGRrkq179+4dT5WStAL1FhBJDgPeD/xZo/tu4NVVdQrwReBbs31OVW2uqumqmp6amhpPsZK0AvW5B3EWcHd3S/EXqKqnquqZbnoLcGiS1ZMuUJJWsj4DYgOzHF5K8qok6abXMajzxxOsTZJWvF5+KJfkF4B3Ax8favsEQFVdCpwDfDLJPuCnwLlVVX3UKkkrVS8BUVXPAq+c0Xbp0PTFwMWTrkuS9Ly+r2KSJC1SBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKmpl1ttaHFYu+n6kcbtuOi9Y65E0mLkHoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSUy8BkWRHkvuSbEuytdGfJH+cZHuSe5Oc1kedkrSS9fk7iLdX1ROz9J0FnNS93gR8qXuXJE3IYj3EdDbw1Rr4AfCKJMf0XZQkrSR97UEUcHOSAv57VW2e0X8c8NjQ/M6ubffMD0qyEdgIsGbNmvFUq5H4y2xpeelrD+L0qjqNwaGkTyV524z+NJap1gdV1eaqmq6q6ampqYWuU5JWrF4Coqp2de97gGuBdTOG7AROGJo/Htg1meokSdBDQCR5WZLD908D7wHunzHsOuBj3dVMbwaerKoXHV6SJI1PH+cgjgauTbL/+/+0qm5M8gmAqroU2AKsB7YDzwK/1UOdkrSiTTwgqupR4JRG+6VD0wV8apJ1SZJeaLFe5ipJ6pkBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLU1McDg5a0tZuu77sESZoI9yAkSU19PJP6hCS3JnkwyQNJPt0Yc0aSJ5Ns616fnXSdkrTS9XGIaR/wu1V1d5LDgbuS3FJVfzNj3F9X1ft6qE+SRA97EFW1u6ru7qafBh4Ejpt0HZKkufV6DiLJWuCNwB2N7rckuSfJDUleN8dnbEyyNcnWvXv3jqlSSVp5eguIJC8HvgFcWFVPzei+G3h1VZ0CfBH41myfU1Wbq2q6qqanpqbGV7AkrTC9BESSQxmEw1VV9c2Z/VX1VFU9001vAQ5NsnrCZUrSitbHVUwBLgMerKovzDLmVd04kqxjUOePJ1elJKmPq5hOBz4K3JdkW9f2+8AagKq6FDgH+GSSfcBPgXOrqnqoVZJWrIkHRFXdDuQAYy4GLp5MRVqsRv3V+o6L3jvmSqSVyV9SS5KaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmrKc7mAxPT1dW7dundeyPmta87ESf8XtL9wnZxJ/6yR3VdV0q889CElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1NRLQCQ5M8nDSbYn2dToT5I/7vrvTXJaH3VK0ko28YBIsgq4BDgLOBnYkOTkGcPOAk7qXhuBL020SElSL3sQ64DtVfVoVT0HfA04e8aYs4Gv1sAPgFckOWbShUrSSnZID995HPDY0PxO4E0jjDkO2D3zw5JsZLCXAfBMkoe76dXAEwtRcM+Wy3rA8lmXf1yP/EHPlbx0Y9smPfxtlt0/X6N6iX/rV8/W0UdApNE284ZQo4wZNFZtBja/6EuSrbPdX2QpWS7rActnXZbLeoDrshgtpvXo4xDTTuCEofnjgV3zGCNJGqM+AuJO4KQkJyY5DDgXuG7GmOuAj3VXM70ZeLKqXnR4SZI0PhM/xFRV+5JcANwErAIur6oHknyi678U2AKsB7YDzwK/NY+vetFhpyVquawHLJ91WS7rAa7LYrRo1mNZPQ9CkrRw/CW1JKnJgJAkNS2bgEhyZJJbkjzSvR8xy7gdSe5Lsi3J/J5POgbL5fYjI6zHGUme7P7+25J8to86DyTJ5Un2JLl/lv4lsT1gpHVZKtvkhCS3JnkwyQNJPt0YsyS2y4jr0v92qapl8QL+M7C
pm94E/MEs43YAq/uud0ZNq4C/A14DHAbcA5w8Y8x64AYGvxF5M3BH33XPcz3OAP6y71pHWJe3AacB98/Sv+i3x0Gsy1LZJscAp3XThwN/uxT/PTmIdel9uyybPQgGt+e4spu+EvhAj7UcrOVy+5FR1mNJqKrbgJ/MMWQpbA9gpHVZEqpqd1Xd3U0/DTzI4A4Lw5bEdhlxXXq3nALi6Op+K9G9HzXLuAJuTnJXd5uOxWC2W4sc7Ji+jVrjW5Lck+SGJK+bTGkLbilsj4OxpLZJkrXAG4E7ZnQtue0yx7pAz9ulj1ttzFuS7wCvanR95iA+5vSq2pXkKOCWJA91/4fVpwW9/UiPRqnxbuDVVfVMkvXAtxjctXepWQrbY1RLapskeTnwDeDCqnpqZndjkUW7XQ6wLr1vlyW1B1FV76qq1zde3wYe378r2b3vmeUzdnXve4BrGRwW6dtyuf3IAWusqqeq6pluegtwaJLVkytxwSyF7TGSpbRNkhzK4D+oV1XVNxtDlsx2OdC6LIbtsqQC4gCuA87rps8Dvj1zQJKXJTl8/zTwHqB5ZceELZfbjxxwPZK8Kkm66XUM/hn88cQrfemWwvYYyVLZJl2NlwEPVtUXZhm2JLbLKOuyGLbLkjrEdAAXAdckOR/4e+A3AJIcC3y5qtYDRwPXdn/zQ4A/raobe6r3H9Xkbj8yViOuxznAJ5PsA34KnFvdJRuLSZKrGVxFsjrJTuBzwKGwdLbHfiOsy5LYJsDpwEeB+5Js69p+H1gDS267jLIuvW8Xb7UhSWpaToeYJEkLyICQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgpAWW5INJKsk/H2r7eJLdQ/f235bkV/usUzoQfygnLbAk1wAnAtdX1ee7tkuAu6vqsj5rkw6GexDSAuruzvmvgPOBDUNdvwpsay4kLVIGhLSwPgB8p6ruBf7P0CMvXwd8Zejw0mJ5Fok0q+V0sz5pMdgAbO6mrwE2JNkL7KmqN/RXlnTw3IOQFkiSVzJ4vsj+OwR/HfgI8Abgob7qkubLgJAWzjnAlqr6vwBV9T+B/wWchgGhJcirmKQFkuSvGOwtDD868pXA7QxOUj/RtRXw1v1PC5MWKwNCktTkISZJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktT0/wF/zGq1lBw1cgAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<Figure size 432x288 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Standard deviation: 0.448 eV/atom\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "plt.hist(df['energy_diff'].tolist(), bins=30)\n",
     "plt.xlabel('$\\Delta E$')\n",
@@ -655,8 +353,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:31:49.791034Z",
-     "start_time": "2020-12-09T21:31:49.788111Z"
+     "end_time": "2021-02-08T13:51:16.551139Z",
+     "start_time": "2021-02-08T13:51:16.548048Z"
     }
    },
    "outputs": [],
@@ -680,8 +378,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:31:49.842588Z",
-     "start_time": "2020-12-09T21:31:49.792447Z"
+     "end_time": "2021-02-08T13:51:16.620249Z",
+     "start_time": "2021-02-08T13:51:16.552600Z"
     },
     "scrolled": true
    },
@@ -723,8 +421,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:31:49.848867Z",
-     "start_time": "2020-12-09T21:31:49.844112Z"
+     "end_time": "2021-02-08T13:51:16.626622Z",
+     "start_time": "2021-02-08T13:51:16.621742Z"
     }
    },
    "outputs": [],
@@ -759,8 +457,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:31:49.875013Z",
-     "start_time": "2020-12-09T21:31:49.850538Z"
+     "end_time": "2021-02-08T13:51:16.656484Z",
+     "start_time": "2021-02-08T13:51:16.628554Z"
     },
     "scrolled": true
    },
@@ -774,8 +472,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:31:49.985837Z",
-     "start_time": "2020-12-09T21:31:49.876873Z"
+     "end_time": "2021-02-08T13:51:16.763864Z",
+     "start_time": "2021-02-08T13:51:16.657819Z"
     },
     "scrolled": true
    },
@@ -808,8 +506,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:09.553916Z",
-     "start_time": "2020-12-09T21:31:49.987916Z"
+     "end_time": "2021-02-08T13:51:37.834144Z",
+     "start_time": "2021-02-08T13:51:16.765421Z"
     },
     "scrolled": false
    },
@@ -844,8 +542,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:09.785212Z",
-     "start_time": "2020-12-09T21:32:09.555346Z"
+     "end_time": "2021-02-08T13:51:38.104687Z",
+     "start_time": "2021-02-08T13:51:37.835766Z"
     }
    },
    "outputs": [],
@@ -898,8 +596,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:09.791576Z",
-     "start_time": "2020-12-09T21:32:09.787208Z"
+     "end_time": "2021-02-08T13:51:38.110486Z",
+     "start_time": "2021-02-08T13:51:38.106011Z"
     }
    },
    "outputs": [],
@@ -939,8 +637,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:10.154805Z",
-     "start_time": "2020-12-09T21:32:09.793271Z"
+     "end_time": "2021-02-08T13:51:38.479613Z",
+     "start_time": "2021-02-08T13:51:38.111880Z"
     },
     "scrolled": true
    },
@@ -959,8 +657,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:10.443908Z",
-     "start_time": "2020-12-09T21:32:10.156133Z"
+     "end_time": "2021-02-08T13:51:38.766814Z",
+     "start_time": "2021-02-08T13:51:38.481195Z"
     }
    },
    "outputs": [],
@@ -1003,8 +701,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:10.857853Z",
-     "start_time": "2020-12-09T21:32:10.445342Z"
+     "end_time": "2021-02-08T13:51:39.113981Z",
+     "start_time": "2021-02-08T13:51:38.769152Z"
     }
    },
    "outputs": [],
@@ -1029,14 +727,14 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:32:15.329849Z",
-     "start_time": "2020-12-09T21:32:15.101788Z"
+     "end_time": "2021-02-08T13:51:39.375223Z",
+     "start_time": "2021-02-08T13:51:39.115648Z"
     },
     "scrolled": true
    },
    "outputs": [],
    "source": [
-    " sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n",
+    "sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n",
     "\n",
     "sisso.fit(D, P)\n",
     "sisso.print_models(features_list)"
@@ -1061,60 +759,35 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:33:34.682503Z",
-     "start_time": "2020-12-09T21:33:34.590337Z"
-    },
-    "scrolled": true
+     "end_time": "2021-02-08T13:51:44.105352Z",
+     "start_time": "2021-02-08T13:51:44.087496Z"
+    }
    },
    "outputs": [],
    "source": [
     "# here we define a different dataframe to make it compatible with the c++ implementation of SISSO\n",
-    "# load data\n",
-    "RS_structures = read(\"data/compressed_sensing/RS_structures.xyz\", index=':')\n",
-    "ZB_structures = read(\"data/compressed_sensing/ZB_structures.xyz\", index=':')\n",
-    "\n",
-    "def generate_table(RS_structures, ZB_structures):\n",
-    "\n",
-    "    for RS, ZB in zip(RS_structures, ZB_structures):\n",
-    "        energy_diff = RS.info['energy'] - ZB.info['energy']\n",
-    "        min_struc_type = 'RS' if energy_diff < 0 else 'ZB'\n",
-    "        struc_obj_min = RS if energy_diff < 0 else ZB\n",
-    "\n",
-    "        yield [RS.info['energy'], ZB.info['energy'],\n",
-    "               energy_diff, min_struc_type,\n",
-    "               RS.info['Z'], ZB.info['Z'],\n",
-    "               RS.info['period'], ZB.info['period'],\n",
-    "               RS.info['IP'], ZB.info['IP'],\n",
-    "               RS.info['EA'], ZB.info['EA'],\n",
-    "               RS.info['E_HOMO'], ZB.info['E_HOMO'],\n",
-    "               RS.info['E_LUMO'], ZB.info['E_LUMO'],\n",
-    "               RS.info['r_s'], ZB.info['r_s'],\n",
-    "               RS.info['r_p'], ZB.info['r_p'],\n",
-    "               RS.info['r_d'], ZB.info['r_d'],\n",
-    "               abs(RS.info['r_p']+RS.info['r_s']-ZB.info['r_p']-ZB.info['r_s']),\n",
-    "               abs(RS.info['r_p']-RS.info['r_s'])+abs(ZB.info['r_p']-ZB.info['r_s']),\n",
-    "               RS, ZB, struc_obj_min]\n",
-    "    \n",
-    "df_plus = pd.DataFrame(\n",
-    "    generate_table(RS_structures, ZB_structures),\n",
-    "    columns=['energy_RS', 'energy_ZB', \n",
-    "             'energy_diff', 'min_struc_type', \n",
-    "             'Z_A (nuc_charge)', 'Z_B (nuc_charge)', \n",
-    "             'period_A (unitless)', 'period_B (unitless)', \n",
-    "             'IP_A (eV_IP)', 'IP_B (eV_IP)', \n",
-    "             'EA_A (eV_IP)', 'EA_B (eV_IP)', \n",
-    "             'E_HOMO_A (eV)', 'E_HOMO_B (eV)', \n",
-    "             'E_LUMO_A (eV)', 'E_LUMO_B (eV)', \n",
-    "             'r_s_A', 'r_s_B', \n",
-    "             'r_p_A', 'r_p_B', \n",
-    "             'r_d_A', 'r_d_B',\n",
-    "             'r_sigma', 'r_pi',\n",
-    "             'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'],\n",
-    "    index=list(RS.get_chemical_formula() for RS in RS_structures)\n",
-    ")\n",
     "\n",
-    "# print data without structure objects\n",
-    "df_plus = df_plus.drop(['energy_RS', 'energy_ZB', 'min_struc_type', 'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'], axis=1)"
+    "# merge target and feature data frame\n",
+    "df_plus = merge_target_feature(df_target, df_features, suffixes=('_A', '_B'))\n",
+    "\n",
+    "# add Zunger's radii: r_pi = |r_p_A - r_s_A| + |r_p_B - r_s_B|, r_sigma = |(r_p_A + r_s_A) - (r_p_B + r_s_B)|\n",
+    "df_plus['r_pi']    = abs(df_plus['r_p_A'] - df_plus['r_s_A']) + abs(df_plus['r_p_B'] - df_plus['r_s_B'])\n",
+    "df_plus['r_sigma'] = abs((df_plus['r_p_A'] + df_plus['r_s_A']) - (df_plus['r_p_B'] + df_plus['r_s_B']))\n",
+    "\n",
+    "df_plus = df_plus.rename(columns={'Z_A': 'Z_A (nuc_charge)',\n",
+    "                                  'Z_B': 'Z_B (nuc_charge)',\n",
+    "                                  'period_A': 'period_A (unitless)',\n",
+    "                                  'period_B': 'period_B (unitless)',\n",
+    "                                  'IP_A': 'IP_A (eV_IP)',\n",
+    "                                  'IP_B': 'IP_B (eV_IP)',\n",
+    "                                  'EA_A': 'EA_A (eV_IP)',\n",
+    "                                  'EA_B': 'EA_B (eV_IP)',\n",
+    "                                  'E_HOMO_A': 'E_HOMO_A (eV)',\n",
+    "                                  'E_HOMO_B': 'E_HOMO_B (eV)',\n",
+    "                                  'E_LUMO_A': 'E_LUMO_A (eV)',\n",
+    "                                  'E_LUMO_B': 'E_LUMO_B (eV)',\n",
+    "                                 })\n",
+    "df_plus_reduced = df_plus.drop(['A', 'B', 'min_struc_type'], axis=1)     "
    ]
   },
   {
@@ -1122,8 +795,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:33:39.961597Z",
-     "start_time": "2020-12-09T21:33:34.758589Z"
+     "end_time": "2021-02-08T13:51:50.268397Z",
+     "start_time": "2021-02-08T13:51:44.988511Z"
     },
     "scrolled": false
    },
@@ -1132,7 +805,7 @@
     "n_nonzero_coefs=3\n",
     "n_features_per_sis_iter=50\n",
     "phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(\n",
-    "    df_plus, \"energy_diff\", \n",
+    "    df_plus_reduced, \"energy_diff\", \n",
     "    cols=['r_s_A', 'r_p_A', 'r_d_A', 'EA_A', 'IP_A', 'r_s_B', 'r_p_B', 'r_d_B', 'EA_B', 'IP_B'], \n",
     "    task_key=None, leave_out_frac=0.0, leave_out_inds=None\n",
     ")\n",
@@ -1179,8 +852,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:33:42.060185Z",
-     "start_time": "2020-12-09T21:33:41.655421Z"
+     "end_time": "2021-02-08T13:51:50.737037Z",
+     "start_time": "2021-02-08T13:51:50.270139Z"
     },
     "scrolled": false
    },
@@ -1189,7 +862,7 @@
     "n_nonzero_coefs=2\n",
     "n_features_per_sis_iter=50\n",
     "phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(\n",
-    "    df_plus, \"energy_diff\", \n",
+    "    df_plus_reduced, \"energy_diff\", \n",
     "    cols=['r_s_A', 'r_p_A', 'r_d_A', 'EA_A', 'IP_A', 'r_s_B', 'r_p_B', 'r_d_B', 'EA_B', 'IP_B'], \n",
     "    task_key=None, leave_out_frac=0.0, leave_out_inds=None\n",
     ")\n",
@@ -1231,20 +904,6 @@
     "Firstly the atomic coordinates of all compounds are stored in a .xyz file for the successive visualization."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2020-12-09T21:33:45.327451Z",
-     "start_time": "2020-12-09T21:33:45.136210Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "generate_structures (RS_structures,ZB_structures)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1259,8 +918,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-09T21:33:46.787348Z",
-     "start_time": "2020-12-09T21:33:45.846486Z"
+     "end_time": "2021-02-08T13:51:51.399756Z",
+     "start_time": "2021-02-08T13:51:50.738969Z"
     },
     "scrolled": false
    },
@@ -1290,8 +949,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:18.777429Z",
-     "start_time": "2020-12-08T09:04:17.224066Z"
+     "end_time": "2021-02-08T13:05:52.104347Z",
+     "start_time": "2021-02-08T13:05:52.050Z"
     }
    },
    "outputs": [],
@@ -1311,8 +970,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:36.911358Z",
-     "start_time": "2020-12-08T09:04:18.779050Z"
+     "end_time": "2021-02-08T13:05:52.105156Z",
+     "start_time": "2021-02-08T13:05:52.054Z"
     },
     "scrolled": true
    },
@@ -1351,8 +1010,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:40.440217Z",
-     "start_time": "2020-12-08T09:04:40.366912Z"
+     "end_time": "2021-02-08T13:05:52.106474Z",
+     "start_time": "2021-02-08T13:05:52.057Z"
     }
    },
    "outputs": [],
@@ -1373,8 +1032,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:41.931592Z",
-     "start_time": "2020-12-08T09:04:41.874643Z"
+     "end_time": "2021-02-08T13:05:52.107446Z",
+     "start_time": "2021-02-08T13:05:52.060Z"
     },
     "scrolled": true
    },
@@ -1453,8 +1112,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:43.054913Z",
-     "start_time": "2020-12-08T09:04:43.045711Z"
+     "end_time": "2021-02-08T13:05:52.108447Z",
+     "start_time": "2021-02-08T13:05:52.063Z"
     }
    },
    "outputs": [],
@@ -1472,8 +1131,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:59.751495Z",
-     "start_time": "2020-12-08T09:04:43.306969Z"
+     "end_time": "2021-02-08T13:05:52.109346Z",
+     "start_time": "2021-02-08T13:05:52.066Z"
     },
     "scrolled": true
    },
@@ -1497,8 +1156,8 @@
    "execution_count": null,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2020-12-08T09:04:59.801410Z",
-     "start_time": "2020-12-08T09:04:59.753070Z"
+     "end_time": "2021-02-08T13:05:52.110132Z",
+     "start_time": "2021-02-08T13:05:52.068Z"
     }
    },
    "outputs": [],
@@ -1514,6 +1173,13 @@
     "show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n",
     "                  x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {