diff --git a/compressed_sensing.ipynb b/compressed_sensing.ipynb index bfee0c66297bdeb19c1c2b58d3a79cbab978b9d8..b667aedc88543713979fab02b897590842b39920 100644 --- a/compressed_sensing.ipynb +++ b/compressed_sensing.ipynb @@ -80,353 +80,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2021-02-05T11:34:54.071622Z", - "start_time": "2021-02-05T11:34:52.319198Z" + "end_time": "2021-02-08T13:51:13.168903Z", + "start_time": "2021-02-08T13:51:11.526672Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "Bad key \"text.kerning_factor\" on line 4 in\n", - "/home/sbailo/anaconda3/envs/analytics/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.\n", - "You probably need to get an updated matplotlibrc file from\n", - "http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template\n", - "or from the matplotlib source distribution\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5e06b0fdf10d4dd7933e31fcac5c96fc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "_ColormakerRegistry()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " <div class=\"bk-root\">\n", - " <a href=\"https://bokeh.pydata.org\" target=\"_blank\" class=\"bk-logo bk-logo-small bk-logo-notebook\"></a>\n", - " <span id=\"1001\">Loading BokehJS ...</span>\n", - " </div>" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "\n", - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " var force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - " var JS_MIME_TYPE 
= 'application/javascript';\n", - " var HTML_MIME_TYPE = 'text/html';\n", - " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " var CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " var script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " var cell = handle.cell;\n", - "\n", - " var id = cell.output_area._bokeh_element_id;\n", - " var server_id = cell.output_area._bokeh_server_id;\n", - " // Clean up Bokeh references\n", - " if (id != null && id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " var id = msg.content.text.trim();\n", - " if (id in Bokeh.index) {\n", - " Bokeh.index[id].model.document.clear();\n", - " delete Bokeh.index[id];\n", - " }\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " var output_area = handle.output_area;\n", - " var output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) 
{\n", - " return\n", - " }\n", - "\n", - " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " var bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " var script_attrs = bk_div.children[0].attributes;\n", - " for (var i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " var toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " 
OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " var events = require('base/js/events');\n", - " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - "\n", - " \n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " var NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"<div style='background-color: #fdd'>\\n\"+\n", - " \"<p>\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. 
Possible fixes:\\n\"+\n", - " \"</p>\\n\"+\n", - " \"<ul>\\n\"+\n", - " \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n", - " \"<li>use INLINE resources instead, as so:</li>\\n\"+\n", - " \"</ul>\\n\"+\n", - " \"<code>\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"</code>\\n\"+\n", - " \"</div>\"}};\n", - "\n", - " function display_loaded() {\n", - " var el = document.getElementById(\"1001\");\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " 
console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error() {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (var i = 0; i < css_urls.length; i++) {\n", - " var url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " for (var i = 0; i < js_urls.length; i++) {\n", - " var url = js_urls[i];\n", - " var element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error;\n", - " element.async = false;\n", - " element.src = url;\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };var element = document.getElementById(\"1001\");\n", - " if (element == null) {\n", - " console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. 
\")\n", - " return false;\n", - " }\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.3.4.min.js\"];\n", - " var css_urls = [];\n", - "\n", - " var inline_js = [\n", - " function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - " \n", - " function(Bokeh) {\n", - " \n", - " },\n", - " function(Bokeh) {} // ensure no trailing comma for IE\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " \n", - " if ((root.Bokeh !== undefined) || (force === true)) {\n", - " for (var i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }if (force === true) {\n", - " display_loaded();\n", - " }} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - "\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n 
function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"<div style='background-color: #fdd'>\\n\"+\n \"<p>\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"</p>\\n\"+\n \"<ul>\\n\"+\n \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n \"<li>use INLINE resources instead, as so:</li>\\n\"+\n \"</ul>\\n\"+\n \"<code>\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"</code>\\n\"+\n \"</div>\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || 
js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };var element = document.getElementById(\"1001\");\n if (element == null) {\n console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. 
\")\n return false;\n }\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.3.4.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.3.4.min.js\"];\n var css_urls = [];\n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n \n function(Bokeh) {\n \n },\n function(Bokeh) {} // ensure no trailing comma for IE\n ];\n\n function run_inline_js() {\n \n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import os\n", "import pandas as pd\n", @@ -442,13 +103,14 @@ "from sklearn.model_selection import GridSearchCV, LeaveOneOut\n", "from IPython.display import HTML\n", "from jupyter_jsmol import JsmolView\n", + "import pathlib\n", + "\n", "\n", "import nglview\n", "from ase.units import 
J\n", "\n", "from compressed_sensing.sisso import SissoRegressor\n", "from compressed_sensing.combine_features import combine_features\n", - "from compressed_sensing.utils import generate_structures\n", "from compressed_sensing.scatter_plot import show_scatter_plot\n", "from compressed_sensing.visualizer import Visualizer\n", "from cpp_sisso import generate_fs, SISSORegressor, generate_phi_0_from_csv, FeatureSpace, get_max_number_feats\n", @@ -468,7 +130,7 @@ "metadata": {}, "source": [ "# Get the data\n", - "Let us load the data from the file data/data.pkl into a data frame. The data was downloaded from the NOMAD archive and the NOMAD atomic data collection. It consists of RS-ZB energy differences (in eV/atom) of the 82 octet binary compounds, structure objects containing the atomic positions of the materials and properties of the atomic constituents. The following atomic features are considered:\n", + "Let us load the data from the NOMAD Archive and the atomicfeaturespackage. It consists of RS-ZB energy differences (in eV/atom) of the 82 octet binary compounds, structure objects containing the atomic positions of the materials and properties of the atomic constituents. 
The following atomic features are considered:\n", "\n", "<div >\n", " <ul>\n", @@ -488,7 +150,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "start_time": "2021-02-05T11:35:25.112Z" + "end_time": "2021-02-08T13:51:16.037772Z", + "start_time": "2021-02-08T13:51:13.170622Z" } }, "outputs": [], @@ -506,8 +169,10 @@ " \"energy_total\": '*',\n", " },\n", " 'section_system':{\n", + " \"chemical_composition_reduced\": '*',\n", " 'atom_labels':'*',\n", " 'atom_positions':'*',\n", + " 'lattice_vectors':'*',\n", " 'section_symmetry':{\n", " 'space_group_number': '*', \n", " } \n", @@ -521,6 +186,9 @@ " return query\n", "\n", "def get_target(query):\n", + " \n", + " path_structure = './data/compressed_sensing/structures/'\n", + " pathlib.Path(path_structure).mkdir(parents=True, exist_ok=True)\n", " df_target = pd.DataFrame()\n", " for entry in query:\n", " calculation = entry.section_run[0]\n", @@ -530,22 +198,40 @@ " \"B\": atom_labels[1],\n", " \"space_group\": calculation.section_system[0].section_symmetry[0].space_group_number,\n", " \"energy\": calculation.section_single_configuration_calculation[0].energy_total.magnitude,\n", - " \"positions\": calculation.section_system[0].atom_positions.magnitude,\n", + " 'compound': calculation.section_system[0].chemical_composition_reduced,\n", " },\n", " ignore_index=True\n", " )\n", - "\n", - " df_target['compound'] = df_target['A'] + df_target['B'] \n", + " atoms = [atom_labels[0], atom_labels[1]]\n", + " # positions are converted into AA using a scale factor\n", + " scale_factor = 10**10\n", + " positions = calculation.section_system[0].atom_positions\n", + " lat_x, lat_y, lat_z = calculation.section_system[0].lattice_vectors.magnitude * scale_factor\n", + " file = open(\"data/compressed_sensing/structures/\"+df_target.iloc[-1]['compound']+\".xyz\", \"w\")\n", + " file.write (\"%d\\n\\n\"%32)\n", + " for i in [0,1,2]:\n", + " for j in [0,1,2]:\n", + " for k in [0,1,2]:\n", + " for n in range(2):\n", + " xyz = 
calculation.section_system[0].atom_positions[n].magnitude * scale_factor\n", + " xyz += i*lat_x\n", + " xyz += j*lat_y\n", + " xyz += k*lat_z\n", + " file.write(atoms[n])\n", + " file.write(\"\\t%f\\t%f\\t%f\\n\" % (xyz[0],\n", + " xyz[1],\n", + " xyz[2]))\n", + " file.close()\n", + " \n", " df_RS = df_target.query('space_group==225 or space_group==221').set_index('compound').sort_index()\n", " df_ZB = df_target.query('space_group==216 or space_group==227').set_index('compound').sort_index()\n", - " df_target = df_RS[['A','B', 'positions']]\n", + " df_target = df_RS[['A','B']]\n", " df_target['energy_diff']=(df_RS['energy']-df_ZB['energy'])/2\n", " df_target['min_struc_type']=np.where(df_RS['energy']<df_ZB['energy'],'RS','ZB')\n", "\n", - " # convert J in eV and m in AA\n", + " # convert J in eV \n", " df_target['energy_diff'] *= J\n", - " df_target['positions'] *= 10**10\n", - " return df_target[['A', 'B', 'energy_diff', 'min_struc_type', 'positions']]\n", + " return df_target[['A', 'B', 'energy_diff', 'min_struc_type']]\n", "\n", "# get data (chemical formulas and RS-ZB energy difference) from query\n", "query = get_query()\n", @@ -556,7 +242,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-08T13:51:16.060958Z", + "start_time": "2021-02-08T13:51:16.040921Z" + }, + "scrolled": true + }, "outputs": [], "source": [ "def get_features(elements, features, rename_dict={}): \n", @@ -578,7 +270,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2021-02-08T13:51:16.080154Z", + "start_time": "2021-02-08T13:51:16.062309Z" + } + }, "outputs": [], "source": [ "def sort_AB_wrt_electronegativity(df_target, df_features):\n", @@ -596,6 +293,27 @@ "df_target = sort_AB_wrt_electronegativity(df_target, df_features)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": 
"2021-02-08T13:51:16.141626Z", + "start_time": "2021-02-08T13:51:16.081628Z" + } + }, + "outputs": [], + "source": [ + "def merge_target_feature(df_target, df_features, suffixes=('(A)', '(B)')):\n", + " df = df_target.merge(df_features, left_on='A', right_index=True)\n", + " df = df.merge(df_features, left_on='B', right_index=True, suffixes=suffixes)\n", + " return df\n", + "\n", + "# merge target and feature data frame\n", + "df = merge_target_feature(df_target, df_features)\n", + "df" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -605,34 +323,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2021-01-15T11:11:40.966163Z", - "start_time": "2021-01-15T11:11:40.578468Z" + "end_time": "2021-02-08T13:51:16.546500Z", + "start_time": "2021-02-08T13:51:16.143452Z" } }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEICAYAAABF82P+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAUa0lEQVR4nO3da7BlZX3n8e8vDWQmSkawD8itbUwoJ2gEmZNWw+jgtaC1RC2idKWUyVBpdWRKJqnU9MQqdd6RmYmTilAyHUGwiqAkipLQ3HSIhCpFGqq5BQgdpg2d7qEbreEyWEO1858Xe3XYHJ5zevfh7L3O5fup2rXXep5n7f1fZwE/1mWvlapCkqSZfq7vAiRJi5MBIUlqMiAkSU0GhCSpyYCQJDUd0ncBC2n16tW1du3avsuQpCXjrrvueqKqplp9yyog1q5dy9atW/suQ5KWjCQ/mq3PQ0ySpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTWMLiCQnJLk1yYNJHkjy6a79yCS3JHmkez9iluXPTPJwku1JNo2rTklS2zj3IPYBv1tVvwK8GfhUkpOBTcB3q+ok4Lvd/AskWQVcApwFnAxs6JaVJE3I2AKiqnZX1d3d9NPAg8BxwNnAld2wK4EPNBZfB2yvqker6jnga91ykqQJmcgvqZOsBd4I3AEcXVW7YRAiSY5qLHIc8NjQ/E7gTbN89kZgI8CaNWsWruiXaO2m60cat+Oi9465Ekman7GfpE7ycuAbwIVV9dSoizXamo++q6rNVTVdVdNTU83biUiS5mGsAZHkUAbhcFVVfbNrfjzJMV3/McCexqI7gROG5o8Hdo2zVknSC43zKqYAlwEPVtUXhrquA87rps8Dvt1Y/E7gpCQnJjkMOLdbTpI0IePcgzgd+CjwjiTbutd64CLg3UkeAd7dzZPk2CRbAKpqH3ABcBODk9vXVNUDY6xVkjTD2E5SV9XttM8lALyzMX4XsH5ofguwZTzVSZIOxF9SS5KaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajI
gJElNBoQkqcmAkCQ1GRCSpCYDQpLUNLYHBiW5HHgfsKeqXt+1fR14bTfkFcD/rqpTG8vuAJ4Gfgbsq6rpcdUpSWobW0AAVwAXA1/d31BVH9k/neQPgSfnWP7tVfXE2KqTJM1pnI8cvS3J2lZfkgAfBt4xru+XJL00fZ2DeCvweFU9Mkt/ATcnuSvJxgnWJUnqjPMQ01w2AFfP0X96Ve1KchRwS5KHquq21sAuQDYCrFmzZuErlaQVauJ7EEkOAT4EfH22MVW1q3vfA1wLrJtj7Oaqmq6q6ampqYUuV5JWrD4OMb0LeKiqdrY6k7wsyeH7p4H3APdPsD5JEmMMiCRXA98HXptkZ5Lzu65zmXF4KcmxSbZ0s0cDtye5B/ghcH1V3TiuOiVJbeO8imnDLO3/utG2C1jfTT8KnDKuuiRJo/GX1JKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqSmcT5y9PIke5LcP9T2+ST/kGRb91o/y7JnJnk4yfYkm8ZVoyRpduPcg7gCOLPR/t+q6tTutWVmZ5JVwCXAWcDJwIYkJ4+xTklSw9gCoqpuA34yj0XXAdur6tGqeg74GnD2ghYnSTqgPs5BXJDk3u4Q1BGN/uOAx4bmd3ZtTUk2JtmaZOvevXsXulZJWrEmHRBfAn4JOBXYDfxhY0wabTXbB1bV5qqarqrpqamphalSkjTZgKiqx6vqZ1X1/4A/YXA4aaadwAlD88cDuyZRnyTpeRMNiCTHDM1+ELi/MexO4KQkJyY5DDgXuG4S9UmSnnfIuD44ydXAGcDqJDuBzwFnJDmVwSGjHcDHu7HHAl+uqvVVtS/JBcBNwCrg8qp6YFx1SpLaxhYQVbWh0XzZLGN3AeuH5rcAL7oEVpI0Of6SWpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktQ0toBIcnmSPUnuH2r7L0keSnJvkmuTvGKWZXckuS/JtiRbx1WjJGl249yDuAI4c0bbLcDrq+oNwN8C/3GO5d9eVadW1fSY6pMkzWFsAVFVtwE/mdF2c1Xt62Z/ABw/ru+XJL00fZ6D+DfADbP0FXBzkruSbJzrQ5JsTLI1yda9e/cueJGStFL1EhBJPgPsA66aZcjpVXUacBbwqSRvm+2zqmpzVU1X1fTU1NQYqpWklWniAZHkPOB9wG9WVbXGVNWu7n0PcC2wbnIVSpJgwgGR5EzgPwDvr6pnZxnzsiSH758G3gPc3xorSRqfcV7mejXwfeC1SXYmOR+4GDgcuKW7hPXSbuyxSbZ0ix4N3J7kHuCHwPVVdeO46pQktR0yrg+uqg2N5stmGbsLWN9NPwqcMq66JEmjOeiASHIEcEJV3TuGerQA1m66fqRxOy5675grkbSUjXSIKclfJfnFJEcC9wBfSfKF8ZYmSerTqOcg/llVPQV8CPhKVf0L4F3jK0uS1LdRA+KQJMcAHwb+coz1SJIWiVED4j8BNwHbq+rOJK8BHhlfWZKkvo16knp3d4M9YHClkecgJGl5G3UP4osjtkmSlok59yCSvAX4dWAqye8Mdf0isGqchUmS+nWgQ0yHAS/vxh0+1P4UcM64ipIk9W/OgKiq7wHfS3JFVf1oQjVJkhaBUU9S/3ySzcDa4WWq6h3jKEqS1L9RA+LPgEuBLwM/G185kqTFYtSA2FdVXxprJZKkRWXUy1z/Ism/TXJMkiP3v8ZamSSpV6PuQZzXvf/eUFsBr1nYciRJi8VIAVFVJ467EEnS4jJSQCT5WKu9qr66sOVIkhaLUc9B/NrQ663A54H3z7VAksuT7Ely/1DbkUluSfJI937ELMuemeThJNuTbBqxRknSAhopIKrq3w29fht4I4NfWc/lCuDMGW2
bgO9W1UnAd7v5F0iyCrgEOAs4GdiQ5ORR6pQkLZxR9yBmehY4aa4BVXUb8JMZzWcDV3bTVwIfaCy6jsFtxR+tqueAr3XLSZImaNRzEH/B4KolGNyk71eAa+bxfUdX1W6Aqtqd5KjGmOOAx4bmdwJvmqO2jcBGgDVr1syjpH75/GhJi9Wol7n+16HpfcCPqmrnGOoBSKOtGm2DjqrNwGaA6enpWcdJkg7OqOcgvgc8xOCOrkcAz83z+x7vHl1K976nMWYncMLQ/PHArnl+nyRpnkYKiCQfBn4I/AaD51LfkWQ+t/u+jud/dHce8O3GmDuBk5KcmOQw4NxuOUnSBI16iOkzwK9V1R6AJFPAd4A/n22BJFcDZwCrk+wEPgdcBFyT5Hzg7xkEDkmOBb5cVeural+SCxg8A3sVcHlVPTCflZMkzd+oAfFz+8Oh82MOsPdRVRtm6XpnY+wuYP3Q/BZgy4i1SZLGYNSAuDHJTcDV3fxH8D/gkrSsHeiZ1L/M4NLU30vyIeBfMrjK6PvAVROoT5LUkwOdpP4j4GmAqvpmVf1OVf17BnsPfzTu4iRJ/TlQQKytqntnNlbVVgaPH5UkLVMHCoh/MkffP13IQiRJi8uBTlLfmeS3q+pPhhu7y1TvGl9Zk+ctLyTphQ4UEBcC1yb5TZ4PhGkGd3L94DgLkyT1a86AqKrHgV9P8nbg9V3z9VX1P8ZemSSpV6M+cvRW4NYx1yJJWkTm+zwISdIyZ0BIkpoMCElSkwEhSWoyICRJTQaEJKlp1Nt9q2ej/tJbkhaKexCSpKaJB0SS1ybZNvR6KsmFM8ackeTJoTGfnXSdkrTSTfwQU1U9DJwKkGQV8A/AtY2hf11V75tkbZKk5/V9iOmdwN9V1Y96rkOSNEPfAXEuzz/neqa3JLknyQ1JXjfbByTZmGRrkq179+4dT5WStAL1FhBJDgPeD/xZo/tu4NVVdQrwReBbs31OVW2uqumqmp6amhpPsZK0AvW5B3EWcHd3S/EXqKqnquqZbnoLcGiS1ZMuUJJWsj4DYgOzHF5K8qok6abXMajzxxOsTZJWvF5+KJfkF4B3Ax8favsEQFVdCpwDfDLJPuCnwLlVVX3UKkkrVS8BUVXPAq+c0Xbp0PTFwMWTrkuS9Ly+r2KSJC1SBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKmpl1ttaHFYu+n6kcbtuOi9Y65E0mLkHoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSUy8BkWRHkvuSbEuytdGfJH+cZHuSe5Oc1kedkrSS9fk7iLdX1ROz9J0FnNS93gR8qXuXJE3IYj3EdDbw1Rr4AfCKJMf0XZQkrSR97UEUcHOSAv57VW2e0X8c8NjQ/M6ubffMD0qyEdgIsGbNmvFUq5H4y2xpeelrD+L0qjqNwaGkTyV524z+NJap1gdV1eaqmq6q6ampqYWuU5JWrF4Coqp2de97gGuBdTOG7AROGJo/Htg1meokSdBDQCR5WZLD908D7wHunzHsOuBj3dVMbwaerKoXHV6SJI1PH+cgjgauTbL/+/+0qm5M8gmAqroU2AKsB7YDzwK/1UOdkrSiTTwgqupR4JRG+6VD0wV8apJ1SZJeaLFe5ipJ6pkBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLU1McDg5a0tZuu77sESZoI9yAkSU19PJP6hCS3JnkwyQNJPt0Yc0aSJ5Ns616fnXSdkrTS9XGIaR/wu1V1d5LDgbuS3FJVfzNj3F9X1ft6qE+SRA97EFW1u6ru7qafBh4Ejpt0HZKkufV6DiLJWuCNwB2N7rckuSfJDUleN8dnbEyyNcnWvXv3jqlSSVp5eguIJC8HvgFcWFVPzei+G3h1VZ0CfBH41myfU1Wbq2q6qqanpqbGV7AkrTC9BESSQxmEw1V
V9c2Z/VX1VFU9001vAQ5NsnrCZUrSitbHVUwBLgMerKovzDLmVd04kqxjUOePJ1elJKmPq5hOBz4K3JdkW9f2+8AagKq6FDgH+GSSfcBPgXOrqnqoVZJWrIkHRFXdDuQAYy4GLp5MRVqsRv3V+o6L3jvmSqSVyV9SS5KaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmrKc7mAxPT1dW7dundeyPmta87ESf8XtL9wnZxJ/6yR3VdV0q889CElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1NRLQCQ5M8nDSbYn2dToT5I/7vrvTXJaH3VK0ko28YBIsgq4BDgLOBnYkOTkGcPOAk7qXhuBL020SElSL3sQ64DtVfVoVT0HfA04e8aYs4Gv1sAPgFckOWbShUrSSnZID995HPDY0PxO4E0jjDkO2D3zw5JsZLCXAfBMkoe76dXAEwtRcM+Wy3rA8lmXf1yP/EHPlbx0Y9smPfxtlt0/X6N6iX/rV8/W0UdApNE284ZQo4wZNFZtBja/6EuSrbPdX2QpWS7rActnXZbLeoDrshgtpvXo4xDTTuCEofnjgV3zGCNJGqM+AuJO4KQkJyY5DDgXuG7GmOuAj3VXM70ZeLKqXnR4SZI0PhM/xFRV+5JcANwErAIur6oHknyi678U2AKsB7YDzwK/NY+vetFhpyVquawHLJ91WS7rAa7LYrRo1mNZPQ9CkrRw/CW1JKnJgJAkNS2bgEhyZJJbkjzSvR8xy7gdSe5Lsi3J/J5POgbL5fYjI6zHGUme7P7+25J8to86DyTJ5Un2JLl/lv4lsT1gpHVZKtvkhCS3JnkwyQNJPt0YsyS2y4jr0v92qapl8QL+M7Cpm94E/MEs43YAq/uud0ZNq4C/A14DHAbcA5w8Y8x64AYGvxF5M3BH33XPcz3OAP6y71pHWJe3AacB98/Sv+i3x0Gsy1LZJscAp3XThwN/uxT/PTmIdel9uyybPQgGt+e4spu+EvhAj7UcrOVy+5FR1mNJqKrbgJ/MMWQpbA9gpHVZEqpqd1Xd3U0/DTzI4A4Lw5bEdhlxXXq3nALi6Op+K9G9HzXLuAJuTnJXd5uOxWC2W4sc7Ji+jVrjW5Lck+SGJK+bTGkLbilsj4OxpLZJkrXAG4E7ZnQtue0yx7pAz9ulj1ttzFuS7wCvanR95iA+5vSq2pXkKOCWJA91/4fVpwW9/UiPRqnxbuDVVfVMkvXAtxjctXepWQrbY1RLapskeTnwDeDCqnpqZndjkUW7XQ6wLr1vlyW1B1FV76qq1zde3wYe378r2b3vmeUzdnXve4BrGRwW6dtyuf3IAWusqqeq6pluegtwaJLVkytxwSyF7TGSpbRNkhzK4D+oV1XVNxtDlsx2OdC6LIbtsqQC4gCuA87rps8Dvj1zQJKXJTl8/zTwHqB5ZceELZfbjxxwPZK8Kkm66XUM/hn88cQrfemWwvYYyVLZJl2NlwEPVtUXZhm2JLbLKOuyGLbLkjrEdAAXAdckOR/4e+A3AJIcC3y5qtYDRwPXdn/zQ4A/raobe6r3H9Xkbj8yViOuxznAJ5PsA34KnFvdJRuLSZKrGVxFsjrJTuBzwKGwdLbHfiOsy5LYJsDpwEeB+5Js69p+H1gDS267jLIuvW8Xb7UhSWpaToeYJEkLyICQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgpAWW5INJKsk/H2r7eJLdQ/f235bkV/usUzoQfygnLbAk1wAnAtdX1ee7tkuAu6vqsj5rkw6GexDSAuruzvmvgPOBDUNdvwpsay4kLVIGhLSwPgB8p6ruBf7P0CMvXwd8Zejw0mJ5Fok0q+V0sz5pMdgAbO6mrwE2JNkL7KmqN/RXlnTw3IOQFkiSVzJ4vsj+OwR/HfgI8Abgob7qkubLgJAWzjnAlqr6vwBV9T+B/wWchgGhJcirmKQFkuSvGOwtDD868pXA7QxOUj/
RtRXw1v1PC5MWKwNCktTkISZJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktT0/wF/zGq1lBw1cgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "<Figure size 432x288 with 1 Axes>" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Standard deviation: 0.448 eV/atom\n" - ] - } - ], + "outputs": [], "source": [ "plt.hist(df['energy_diff'].tolist(), bins=30)\n", "plt.xlabel('$\\Delta E$')\n", @@ -655,8 +353,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:31:49.791034Z", - "start_time": "2020-12-09T21:31:49.788111Z" + "end_time": "2021-02-08T13:51:16.551139Z", + "start_time": "2021-02-08T13:51:16.548048Z" } }, "outputs": [], @@ -680,8 +378,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:31:49.842588Z", - "start_time": "2020-12-09T21:31:49.792447Z" + "end_time": "2021-02-08T13:51:16.620249Z", + "start_time": "2021-02-08T13:51:16.552600Z" }, "scrolled": true }, @@ -723,8 +421,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:31:49.848867Z", - "start_time": "2020-12-09T21:31:49.844112Z" + "end_time": "2021-02-08T13:51:16.626622Z", + "start_time": "2021-02-08T13:51:16.621742Z" } }, "outputs": [], @@ -759,8 +457,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:31:49.875013Z", - "start_time": "2020-12-09T21:31:49.850538Z" + "end_time": "2021-02-08T13:51:16.656484Z", + "start_time": "2021-02-08T13:51:16.628554Z" }, "scrolled": true }, @@ -774,8 +472,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:31:49.985837Z", - "start_time": "2020-12-09T21:31:49.876873Z" + "end_time": "2021-02-08T13:51:16.763864Z", + "start_time": "2021-02-08T13:51:16.657819Z" }, "scrolled": true }, @@ -808,8 +506,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": 
"2020-12-09T21:32:09.553916Z", - "start_time": "2020-12-09T21:31:49.987916Z" + "end_time": "2021-02-08T13:51:37.834144Z", + "start_time": "2021-02-08T13:51:16.765421Z" }, "scrolled": false }, @@ -844,8 +542,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:09.785212Z", - "start_time": "2020-12-09T21:32:09.555346Z" + "end_time": "2021-02-08T13:51:38.104687Z", + "start_time": "2021-02-08T13:51:37.835766Z" } }, "outputs": [], @@ -898,8 +596,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:09.791576Z", - "start_time": "2020-12-09T21:32:09.787208Z" + "end_time": "2021-02-08T13:51:38.110486Z", + "start_time": "2021-02-08T13:51:38.106011Z" } }, "outputs": [], @@ -939,8 +637,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:10.154805Z", - "start_time": "2020-12-09T21:32:09.793271Z" + "end_time": "2021-02-08T13:51:38.479613Z", + "start_time": "2021-02-08T13:51:38.111880Z" }, "scrolled": true }, @@ -959,8 +657,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:10.443908Z", - "start_time": "2020-12-09T21:32:10.156133Z" + "end_time": "2021-02-08T13:51:38.766814Z", + "start_time": "2021-02-08T13:51:38.481195Z" } }, "outputs": [], @@ -1003,8 +701,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:10.857853Z", - "start_time": "2020-12-09T21:32:10.445342Z" + "end_time": "2021-02-08T13:51:39.113981Z", + "start_time": "2021-02-08T13:51:38.769152Z" } }, "outputs": [], @@ -1029,14 +727,14 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:32:15.329849Z", - "start_time": "2020-12-09T21:32:15.101788Z" + "end_time": "2021-02-08T13:51:39.375223Z", + "start_time": "2021-02-08T13:51:39.115648Z" }, "scrolled": true }, "outputs": [], "source": [ - " sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n", + "sisso = 
SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n", "\n", "sisso.fit(D, P)\n", "sisso.print_models(features_list)" @@ -1061,60 +759,35 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:33:34.682503Z", - "start_time": "2020-12-09T21:33:34.590337Z" - }, - "scrolled": true + "end_time": "2021-02-08T13:51:44.105352Z", + "start_time": "2021-02-08T13:51:44.087496Z" + } }, "outputs": [], "source": [ "# here we define a different dataframe to make it compatible with the c++ implementation of SISSO\n", - "# load data\n", - "RS_structures = read(\"data/compressed_sensing/RS_structures.xyz\", index=':')\n", - "ZB_structures = read(\"data/compressed_sensing/ZB_structures.xyz\", index=':')\n", - "\n", - "def generate_table(RS_structures, ZB_structures):\n", - "\n", - " for RS, ZB in zip(RS_structures, ZB_structures):\n", - " energy_diff = RS.info['energy'] - ZB.info['energy']\n", - " min_struc_type = 'RS' if energy_diff < 0 else 'ZB'\n", - " struc_obj_min = RS if energy_diff < 0 else ZB\n", - "\n", - " yield [RS.info['energy'], ZB.info['energy'],\n", - " energy_diff, min_struc_type,\n", - " RS.info['Z'], ZB.info['Z'],\n", - " RS.info['period'], ZB.info['period'],\n", - " RS.info['IP'], ZB.info['IP'],\n", - " RS.info['EA'], ZB.info['EA'],\n", - " RS.info['E_HOMO'], ZB.info['E_HOMO'],\n", - " RS.info['E_LUMO'], ZB.info['E_LUMO'],\n", - " RS.info['r_s'], ZB.info['r_s'],\n", - " RS.info['r_p'], ZB.info['r_p'],\n", - " RS.info['r_d'], ZB.info['r_d'],\n", - " abs(RS.info['r_p']+RS.info['r_s']-ZB.info['r_p']-ZB.info['r_s']),\n", - " abs(RS.info['r_p']-RS.info['r_s'])+abs(ZB.info['r_p']-ZB.info['r_s']),\n", - " RS, ZB, struc_obj_min]\n", - " \n", - "df_plus = pd.DataFrame(\n", - " generate_table(RS_structures, ZB_structures),\n", - " columns=['energy_RS', 'energy_ZB', \n", - " 'energy_diff', 'min_struc_type', \n", - " 'Z_A (nuc_charge)', 'Z_B (nuc_charge)', \n", - " 'period_A (unitless)', 'period_B (unitless)', \n", - " 'IP_A 
(eV_IP)', 'IP_B (eV_IP)', \n", - " 'EA_A (eV_IP)', 'EA_B (eV_IP)', \n", - " 'E_HOMO_A (eV)', 'E_HOMO_B (eV)', \n", - " 'E_LUMO_A (eV)', 'E_LUMO_B (eV)', \n", - " 'r_s_A', 'r_s_B', \n", - " 'r_p_A', 'r_p_B', \n", - " 'r_d_A', 'r_d_B',\n", - " 'r_sigma', 'r_pi',\n", - " 'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'],\n", - " index=list(RS.get_chemical_formula() for RS in RS_structures)\n", - ")\n", "\n", - "# print data without structure objects\n", - "df_plus = df_plus.drop(['energy_RS', 'energy_ZB', 'min_struc_type', 'struc_obj_RS', 'struc_obj_ZB', 'struc_obj_min'], axis=1)" + "# merge target and feature data frame\n", + "df_plus = merge_target_feature(df_target, df_features, suffixes=('_A', '_B'))\n", + "\n", + "# add Zunger's r_pi and r_sigma\n", + "df_plus['r_pi'] = abs(df_plus['r_p_A'] - df_plus['r_s_A']) + abs(df_plus['r_p_B'] - df_plus['r_s_B'])\n", + "df_plus['r_sigma'] = abs(df_plus['r_p_A'] + df_plus['r_s_A'] - (df_plus['r_p_B'] + df_plus['r_s_B']))\n", + "\n", + "df_plus = df_plus.rename(columns={'Z_A': 'Z_A (nuc_charge)',\n", + " 'Z_B': 'Z_B (nuc_charge)',\n", + " 'period_A': 'period_A (unitless)',\n", + " 'period_B': 'period_B (unitless)',\n", + " 'IP_A': 'IP_A (eV_IP)',\n", + " 'IP_B': 'IP_B (eV_IP)',\n", + " 'EA_A': 'EA_A (eV_IP)',\n", + " 'EA_B': 'EA_B (eV_IP)',\n", + " 'E_HOMO_A': 'E_HOMO_A (eV)',\n", + " 'E_HOMO_B': 'E_HOMO_B (eV)',\n", + " 'E_LUMO_A': 'E_LUMO_A (eV)',\n", + " 'E_LUMO_B': 'E_LUMO_B (eV)',\n", + " })\n", + "df_plus_reduced = df_plus.drop(['A', 'B', 'min_struc_type'], axis=1) " ] }, { @@ -1122,8 +795,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:33:39.961597Z", - "start_time": "2020-12-09T21:33:34.758589Z" + "end_time": "2021-02-08T13:51:50.268397Z", + "start_time": "2021-02-08T13:51:44.988511Z" }, "scrolled": false }, @@ -1132,7 +805,7 @@ "n_nonzero_coefs=3\n", "n_features_per_sis_iter=50\n", "phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = 
generate_phi_0_from_csv(\n", - " df_plus, \"energy_diff\", \n", + " df_plus_reduced, \"energy_diff\", \n", " cols=['r_s_A', 'r_p_A', 'r_d_A', 'EA_A', 'IP_A', 'r_s_B', 'r_p_B', 'r_d_B', 'EA_B', 'IP_B'], \n", " task_key=None, leave_out_frac=0.0, leave_out_inds=None\n", ")\n", @@ -1179,8 +852,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:33:42.060185Z", - "start_time": "2020-12-09T21:33:41.655421Z" + "end_time": "2021-02-08T13:51:50.737037Z", + "start_time": "2021-02-08T13:51:50.270139Z" }, "scrolled": false }, @@ -1189,7 +862,7 @@ "n_nonzero_coefs=2\n", "n_features_per_sis_iter=50\n", "phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(\n", - " df_plus, \"energy_diff\", \n", + " df_plus_reduced, \"energy_diff\", \n", " cols=['r_s_A', 'r_p_A', 'r_d_A', 'EA_A', 'IP_A', 'r_s_B', 'r_p_B', 'r_d_B', 'EA_B', 'IP_B'], \n", " task_key=None, leave_out_frac=0.0, leave_out_inds=None\n", ")\n", @@ -1231,20 +904,6 @@ "Firstly the atomic coordinates of all compounds are stored in a .xyz file for the successive visualization." 
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-12-09T21:33:45.327451Z", - "start_time": "2020-12-09T21:33:45.136210Z" - } - }, - "outputs": [], - "source": [ - "generate_structures (RS_structures,ZB_structures)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1259,8 +918,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-09T21:33:46.787348Z", - "start_time": "2020-12-09T21:33:45.846486Z" + "end_time": "2021-02-08T13:51:51.399756Z", + "start_time": "2021-02-08T13:51:50.738969Z" }, "scrolled": false }, @@ -1290,8 +949,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:18.777429Z", - "start_time": "2020-12-08T09:04:17.224066Z" + "end_time": "2021-02-08T13:05:52.104347Z", + "start_time": "2021-02-08T13:05:52.050Z" } }, "outputs": [], @@ -1311,8 +970,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:36.911358Z", - "start_time": "2020-12-08T09:04:18.779050Z" + "end_time": "2021-02-08T13:05:52.105156Z", + "start_time": "2021-02-08T13:05:52.054Z" }, "scrolled": true }, @@ -1351,8 +1010,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:40.440217Z", - "start_time": "2020-12-08T09:04:40.366912Z" + "end_time": "2021-02-08T13:05:52.106474Z", + "start_time": "2021-02-08T13:05:52.057Z" } }, "outputs": [], @@ -1373,8 +1032,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:41.931592Z", - "start_time": "2020-12-08T09:04:41.874643Z" + "end_time": "2021-02-08T13:05:52.107446Z", + "start_time": "2021-02-08T13:05:52.060Z" }, "scrolled": true }, @@ -1453,8 +1112,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:43.054913Z", - "start_time": "2020-12-08T09:04:43.045711Z" + "end_time": "2021-02-08T13:05:52.108447Z", + "start_time": "2021-02-08T13:05:52.063Z" } }, "outputs": 
[], @@ -1472,8 +1131,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:59.751495Z", - "start_time": "2020-12-08T09:04:43.306969Z" + "end_time": "2021-02-08T13:05:52.109346Z", + "start_time": "2021-02-08T13:05:52.066Z" }, "scrolled": true }, @@ -1497,8 +1156,8 @@ "execution_count": null, "metadata": { "ExecuteTime": { - "end_time": "2020-12-08T09:04:59.801410Z", - "start_time": "2020-12-08T09:04:59.753070Z" + "end_time": "2021-02-08T13:05:52.110132Z", + "start_time": "2021-02-08T13:05:52.068Z" } }, "outputs": [], @@ -1514,6 +1173,13 @@ "show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {