diff --git a/notebook/compressed_sensing.ipynb b/notebook/compressed_sensing.ipynb index 57437ba55ccf622a54fc1902d957727e7c2afb0b..5352950aefe16290e62c3d3f5c8014758b43d352 100644 --- a/notebook/compressed_sensing.ipynb +++ b/notebook/compressed_sensing.ipynb @@ -95,28 +95,18 @@ "from jupyter_jsmol import JsmolView\n", "import pathlib\n", "\n", - "\n", - "# import nglview\n", - "# from ase.units import J\n", - "\n", "from compressed_sensing.sisso import SissoRegressor\n", "from compressed_sensing.combine_features import combine_features\n", "from compressed_sensing.scatter_plot import show_scatter_plot\n", "from compressed_sensing.visualizer import Visualizer\n", + "\n", "from sissopp import Inputs, FeatureSpace, SISSORegressor, FeatureNode, Unit\n", "from sissopp.py_interface import read_csv\n", "from sissopp.py_interface.import_dataframe import get_unit\n", + "from sissopp.sklearn import SISSORegressor as SISSORegressor_skl\n", "\n", - "# from atomicfeaturespackage.atomicproperties import atomic_properties_lda2015\n", - "# from nomad import client, config\n", - "# from nomad.client.archive import ArchiveQuery\n", - "# from nomad.metainfo import units\n", - "# import nest_asyncio\n", - "# nest_asyncio.apply()\n", - "\n", - "# set display options for the notebook \n", - "# %matplotlib inline\n", - "warnings.filterwarnings('ignore')" + "warnings.filterwarnings('ignore')\n", + "from bokeh.io import export_png\n" ] }, { @@ -150,38 +140,6 @@ }, "outputs": [], "source": [ - "# def get_query():\n", - "# query_param={\n", - "# \"datasets.dataset_name:any\": [\n", - "# \"Octet-Binaries-RS-vs-ZB\"\n", - "# ]\n", - "# }\n", - "# required={\n", - "# 'workflow':{\n", - "# 'calculation_result_ref':{\n", - "# 'energy':{\n", - "# 'total':'*',\n", - "# },\n", - "# 'system_ref':{\n", - "# 'chemical_composition_reduced': '*',\n", - "# 'atoms': {\n", - "# 'labels':'*', \n", - "# 'positions':'*',\n", - "# 'lattice_vectors':'*',\n", - "# },\n", - "# 'symmetry':{\n", - "# 'space_group_number': '*', \n", - "# }, \n", - "# },\n", - "# }, \n", - "# } \n", - "# }\n", - "# max_entries=164\n", - "\n", - "# query = ArchiveQuery(query=query_param, required=required, page_size=20, results_max=max_entries)\n", - "\n", - "# return query\n", - "\n", "def write_xyz(df):\n", " for compound, (A, B, pos, lat) in df[['A', 'B', 'atomic_positions', 'lattice_vectors']].iterrows():\n", " lat_x, lat_y, lat_z = lat\n", @@ -199,63 +157,6 @@ " xyz[2])) \n", " file.close()\n", "\n", - "# def get_target(query):\n", - " \n", - "# path_structure = './data/compressed_sensing/structures/'\n", - "# pathlib.Path(path_structure).mkdir(parents=True, exist_ok=True)\n", - "# df_target = pd.DataFrame()\n", - "# results=query.download()\n", - "\n", - "# for entry in results:\n", - "# calc = entry.workflow[0].calculation_result_ref\n", - "# atom_labels = calc.system_ref.atoms.labels\n", - "# df_target=df_target.append({\n", - "# \"A\": atom_labels[0],\n", - "# \"B\": atom_labels[1],\n", - "# \"space_group\": calc.system_ref.symmetry[0].space_group_number,\n", - "# \"energy\": calc.energy.total.value.m,\n", - "# 'compound': calc.system_ref.chemical_composition_reduced,\n", - "# \"atomic_positions\": calc.system_ref.atoms.positions.m,\n", - "# \"lattice_vectors\": calc.system_ref.atoms.lattice_vectors.m,\n", - "# },\n", - "# ignore_index=True\n", - "# )\n", - "\n", - "# df_RS = df_target.query('space_group==225 or space_group==221').set_index('compound').sort_index()\n", - "# df_ZB = df_target.query('space_group==216 or space_group==227').set_index('compound').sort_index()\n", - "# df_RS['struc_type'] = 'RS'\n", - "# df_ZB['struc_type'] = 'ZB'\n", - "# df_target = df_RS[['A','B']]\n", - "# df_target['energy_diff']=(df_RS['energy']-df_ZB['energy'])/2\n", - "# #df_target['min_struc_type']=np.where(df_RS['energy']<df_ZB['energy'],'RS','ZB')\n", - " \n", - "# mins = np.where(df_RS['energy'] < df_ZB['energy'], \n", - "# # [df_RS['atomic_positions'], df_RS['lattice_vectors']], [df_ZB['atomic_positions'], df_ZB['lattice_vectors']]\n", - "# df_RS[['struc_type', 'atomic_positions', 'lattice_vectors']].T, \n", - "# df_ZB[['struc_type', 'atomic_positions', 'lattice_vectors']].T\n", - "# )\n", - "# df_mins = pd.DataFrame(mins.T, columns=['min_struc_type', 'atomic_positions', 'lattice_vectors',],\n", - "# index=df_target.index\n", - "# )\n", - "\n", - "# df_target = pd.concat([df_target, df_mins], axis=1)\n", - " \n", - "# # convert J in eV and m in AA\n", - "# df_target['energy_diff'] *= J\n", - "# df_target[['atomic_positions', 'lattice_vectors']] *= 10**10\n", - " \n", - " \n", - "# # write structures (atomic_positions, lattice_vectors) into xyz files\n", - "# # to be used by the visualizer later\n", - "# write_xyz(df_target)\n", - " \n", - "# return df_target[['A', 'B', 'energy_diff', 'min_struc_type']]\n", - "\n", - "# # get data (chemical formulas and RS-ZB energy difference) from query\n", - "# query = get_query()\n", - "# query.fetch()\n", - "# df_target = get_target(query)\n", - "# df_target\n", "\n", "df_target = pd.read_hdf('compressed_sensing.hdf5', 'target')\n", "path_structure = './data/compressed_sensing/structures/'\n", @@ -276,20 +177,6 @@ }, "outputs": [], "source": [ - "# def get_features(elements, features, rename_dict={}): \n", - "# features_data = [[atomic_properties_lda2015.symbol(el).get('atomic_'+f) for f in features] for el in elements]\n", - "# df_features = pd.DataFrame(features_data, index=elements, columns=features).sort_values('number')\n", - "# df_features = df_features.rename(columns=rename_dict)\n", - "# feautures = df_features.columns.tolist()\n", - "# return df_features, features\n", - "\n", - "# # get features from atomicfeaturespackage\n", - "# features = ['number', 'r_s', 'r_p', 'r_d', 'period', 'ea', 'ip', 'homo', 'lumo']\n", - "# rename_dict = {'number': 'Z', 'ea': 'EA', 'ip': 'IP', 'homo': 'E_HOMO', 'lumo':'E_LUMO'}\n", - "# elements = np.unique(df_target[['A', 'B']])\n", - "\n", - "# df_features, features = get_features(elements, features, rename_dict=rename_dict)\n", - "# df_features\n", "df_features = pd.read_hdf('compressed_sensing.hdf5', 'features')\n", "df_features" ] @@ -715,57 +602,6 @@ "features: 115; 3D RMSE: 0.170545592998 best features: ['(r_s(B)+r_p(A))', '(r_s(B)+r_p(A))^2', 'exp(r_s(B)+r_p(A))']\"" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### The SISSO method" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2022-04-01T13:17:50.479489Z", - "start_time": "2022-04-01T13:17:50.357718Z" - } - }, - "outputs": [], - "source": [ - "#import Data\n", - "selected_feature_list = ['r_s', 'r_p', 'r_d', 'EA', 'IP']\n", - "allowed_operations = ['+','|-|','exp', '^2']\n", - "P, df_D = get_data(selected_feature_list, allowed_operations)\n", - "D = df_D.values\n", - "features_list = df_D.columns.tolist()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now apply the SISSO algorithm. How does the SISSO method compare to the LASSO and to the $\\ell_0$-regularization in terms of accuracy (again when using the same feature space)? How fast is SISSO compared to the $\\ell_0$-regularization? How does n_features_per_sis_iter (the number of features collected per sis iteration) affect the performance? Note, that for n_features_per_sis_iter=1 SISSO becomes the so-called orthogonal matching pursuit, another well-known compressed sensing method." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2022-04-01T13:17:50.590980Z", - "start_time": "2022-04-01T13:17:50.480402Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n", - "\n", - "sisso.fit(D, P)\n", - "sisso.print_models(features_list)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -928,7 +764,8 @@ " string = \"c0:{:.4}\".format(sisso.models[i][0].coefs[0][-1])\n", " for j in range(i+1):\n", " string = string + str(\" | a\"+str(j)+\":{:.4}\".format(sisso.models[i][0].coefs[0][j]))\n", - " print(string + '\\n')" + " print(string + '\\n')\n", + " print(sisso.models[i][0].latex_str)" ] }, { @@ -970,27 +807,6 @@ "How is the prediction error compared to the fitting error? How often is the same descriptor selected? Are there materials that yield an outlying high/low error? " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2022-04-01T13:17:52.559446Z", - "start_time": "2022-04-01T13:17:51.744639Z" - } - }, - "outputs": [], - "source": [ - "# get the data\n", - "selected_feature_list = ['IP', 'EA', 'r_s', 'r_p','r_d']\n", - "allowed_operations = ['+','|-|','exp', '^2', '/']\n", - "\n", - "P, df_D = get_data(selected_feature_list, allowed_operations)\n", - "features_list = df_D.columns.tolist()\n", - "chemical_formulas = df_D.index.tolist()\n", - "D = df_D.values" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1008,26 +824,32 @@ "dimensions = range(1, 4)\n", "features_count = [[] for i in range(3)]\n", "P_predict = np.empty([len(dimensions), n_compounds])\n", - "\n", - "sisso = SissoRegressor(n_nonzero_coefs=3, n_features_per_sis_iter=10)\n", + "chemical_formulas = df_target.index.tolist()\n", + "\n", + "sisso = SISSORegressor_skl(\n", + " allowed_ops = selected_ops,\n", + " n_sis_select = 100,\n", + " n_dim = 3,\n", + " max_rung = 2,\n", + " n_residual = 1,\n", + " n_models_store = 1,\n", + " verbose = False,\n", + ")\n", "loo = LeaveOneOut()\n", "\n", + "df_sisso_cv = df_plus_reduced.drop(columns=[\"energy_diff\"])\n", + "\n", "for indices_train, index_test in loo.split(P):\n", " i_cv = index_test[0]\n", " print('%2s) Leave out %s: Ediff_ref = %.3f eV/atom' \n", " % (index_test[0]+1, chemical_formulas[i_cv], P[i_cv]))\n", " \n", - " sisso.fit(D[indices_train], P[indices_train])\n", - " sisso.print_models(features_list) \n", - " \n", - " for dim in dimensions: \n", - " features = [features_list[i] for i in sisso.l0_selected_indices[dim - 1]]\n", - " predicted_value = sisso.predict(D[index_test], dim=dim)[0]\n", - " \n", - " features_count[dim-1].append( tuple(features) ) \n", - " P_predict[dim-1, i_cv] = predicted_value\n", - " \n", - " print('Ediff_predicted(%sD) = %.3f eV/atom' %(dim, predicted_value))\n", + " sisso.fit(df_sisso_cv.iloc[indices_train, :], P[indices_train])\n", + " data_dct = {col.split(\"(\")[0].strip(): df_sisso_cv.iloc[i_cv, cc] for cc, col in enumerate(df_sisso_cv.columns)}\n", + " predicted_value = [models[0].eval(data_dct) for models in sisso.solver.models[:]]\n", + " P_predict[:, i_cv] = predicted_value\n", + " for dim in dimensions: \n", + " print('Ediff_predicted(%sD) = %.3f eV/atom' %(dim, predicted_value[dim - 1]))\n", " print('-----')" ] }, @@ -1049,44 +871,8 @@ "legend = ['%sD, RMSE = %.3f eV/atom' %(dim, prediction_errors[dim-1]) for dim in dimensions]\n", "data_point_labels = [df.index.tolist()]*3\n", "\n", - "show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", - " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2022-04-01T13:18:02.762134Z", - "start_time": "2022-04-01T13:18:02.730962Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "# Print descriptor selection frequency\n", - "print(\"Descriptor selection frequency\")\n", - "for dim in dimensions: \n", - " df_frequency = pd.DataFrame( Counter(features_count[dim-1]).most_common(10), columns=['Features', 'Frequency'] )\n", - " print('-----------------\\n%sD:\\n%s' % (dim, df_frequency))\n", - "\n", - "# create table to display errors and models\n", - "feat = np.array(features_count).flatten('F')\n", - "Pred = np.array(P_predict).flatten('F')\n", - "Pred_errors = np.abs(P-P_predict).flatten('F')\n", - "Ref_values = [r for p in P for r in [p,p,p] ]\n", - "Mats = [m for mat in chemical_formulas for m in [mat, mat, mat] ]\n", - "Dims = ['1D','2D','3D'] * n_compounds\n", - "\n", - "df_loo = pd.DataFrame(list(zip(Ref_values,Pred,Pred_errors,feat)), index = [Mats,Dims],\n", - " columns=['P_ref[eV]', 'P_pred[eV]', 'abs. error [eV]', 'Selected features'])\n", - "\n", - "# if you do not want to sort the data frame by the prediction error comment out the nex line \n", - "df_loo = df_loo.sort_values('abs. error [eV]', ascending=False)\n", - "pd.set_option('display.expand_frame_repr', False)\n", - "\n", - "display(df_loo)" + "fig = show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", + " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')\n" ] }, { @@ -1196,7 +982,102 @@ " 'KR, RMSE = %.3f eV/atom' % prediction_rmse_kr]\n", "data_point_labels = [df.index.tolist()]*2\n", "\n", - "show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", + "fig = show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", + " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PySR" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "features = ['Z', 'r_s', 'r_p', 'r_d', 'period', 'EA', 'IP', 'E_HOMO', 'E_LUMO']\n", + "pysr_df = df_plus_reduced.copy()\n", + "col_rename = {col: col.split(\"(\")[0].strip() for col in pysr_df.columns}\n", + "pysr_df.rename(columns=col_rename, inplace=True)\n", + "pysr_df.drop(columns=[\"energy_diff\"], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import contextlib\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "from pysr import PySRRegressor\n", + "from sympy import Abs, root, sign\n", + "\n", + "@contextlib.contextmanager\n", + "def cwd(path, mkdir=False, debug=False):\n", + " CWD = os.getcwd()\n", + "\n", + " if os.path.exists(path) is False and mkdir:\n", + " os.makedirs(path)\n", + "\n", + " if debug:\n", + " os.chdir(path)\n", + " yield\n", + " os.chdir(CWD)\n", + " return\n", + "\n", + " os.chdir(path)\n", + " yield\n", + " os.chdir(CWD)\n", + "\n", + "pysr = PySRRegressor(\n", + " niterations=100,\n", + " binary_operators=[\"+\", \"-\", \"/\"],\n", + " unary_operators=[\n", + " \"square\",\n", + " \"exp\",\n", + " \"abs\",\n", + " ],\n", + " loss=\"loss(prediction, target) = (prediction - target)^2\",\n", + " verbosity = 0,\n", + " procs = 0,\n", + " multithreading = False,\n", + " random_state = 1,\n", + " deterministic = True,\n", + ")\n", + "\n", + "P_predict_pysr = []\n", + "loo = LeaveOneOut()\n", + "for indices_train, index_test in loo.split(P):\n", + " pysr_path = f\"pySR/{index_test[0]}\"\n", + " with cwd(pysr_path, mkdir=True):\n", + " pysr.fit(pysr_df.iloc[indices_train, :], P[indices_train])\n", + " P_predict_pysr.append(pysr.predict(pysr_df.iloc[index_test, :])[0])\n", + " print(\"%2i Ediff_ref: %.3f, Ediff_pred: %.3f, equation: {%s}\" \n", + " % (index_test[0], P[index_test], P_predict_pysr[-1], \n", + " pysr.get_best().sympy_format))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xs = [P, P]\n", + "ys = [P_predict[-1], P_predict_pysr,]\n", + "prediction_rmse_pysr = np.sqrt(np.mean((P_predict_pysr - P)**2.0))\n", + "legend = ['SISSO 3D, RMSE = %.3f eV/atom' % prediction_errors[dim-1], \n", + " 'pySR, RMSE = %.3f eV/atom' % prediction_rmse_pysr]\n", + "data_point_labels = [df.index.tolist()]*2\n", + "\n", + "fig = show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')" ] }, @@ -1205,7 +1086,85 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "pysr_path = f\"pySR/all_pts\"\n", + "with cwd(pysr_path, mkdir=True):\n", + " pysr.fit(pysr_df, P)\n", + " print(pysr.get_best().sympy_format)\n", + "\n", + "error_df = pysr.equations.copy()\n", + "error_df[\"RMSE\"] = np.sqrt(error_df[\"loss\"])\n", + "\n", + "plt.plot(error_df[\"complexity\"], error_df[\"RMSE\"])\n", + "plt.plot(pysr.get_best()[\"complexity\"], np.sqrt(pysr.get_best()[\"loss\"]), \"*\", ms=10)\n", + "plt.xlabel(\"Number of Nodes\")\n", + "plt.ylabel(\"RMSE (eV / atom)\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FFX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ffx\n", + "\n", + "P_predict_ffx = []\n", + "n_basis = []\n", + "loo = LeaveOneOut()\n", + "FFX = ffx.FFXRegressor()\n", + "for indices_train, index_test in loo.split(P):\n", + " FFX.fit(\n", + " pysr_df.iloc[indices_train, :], \n", + " P[indices_train],\n", + " )\n", + " P_predict_ffx.append(\n", + " FFX.predict(pysr_df.iloc[index_test, :].values)[0]\n", + " )\n", + " n_basis.append(FFX.complexity())\n", + " print(\"%2i Ediff_ref: %.3f, Ediff_pred: %.3f, complexity: %.d\" \n", + " % (index_test[0], P[index_test], P_predict_ffx[-1], n_basis[-1]))\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "xs = [P, P]\n", + "ys = [P_predict[-1], P_predict_ffx]\n", + "\n", + "prediction_rmse_ffx = np.sqrt(np.mean((P_predict_ffx - P)**2.0))\n", + "\n", + "legend = ['SISSO 3D, RMSE = %.3f eV/atom' % prediction_errors[dim-1], \n", + " 'FFX, RMSE = %.3f eV/atom' % prediction_rmse_ffx]\n", + "\n", + "data_point_labels = [df.index.tolist()]*2\n", + "fig = show_scatter_plot(xs, ys, data_point_labels=data_point_labels, \n", + " x_label='E_diff_DFT', y_label='E_diff_predicted', legend=legend, unit='eV/atom')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.hist(n_basis, bins=15)\n", + "plt.xlabel(\"Number of Basis Functions\")\n", + "plt.ylabel(\"Count\")\n", + "plt.show()" + ] } ], "metadata": { @@ -1224,7 +1183,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.12" } }, "nbformat": 4,