Commit 3b4bac78 authored by Luigi Sbailo

Initial work
# Cluster-Based SISSO
This repository combines cluster-extraction methods (k-means, DeepAA) with SISSO. The clustering step extracts representatives and their corresponding clusters. Then either multi-task SISSO is trained on all materials outside the test set, with tasks defined by the extracted clusters, or single-task SISSO is trained on the representatives only. Since both k-means and DeepAA are stochastic algorithms, they should be applied multiple times to the same dataset; the median test RMSE over the subsequent SISSO runs then indicates the usefulness of the clustering algorithm with the given parameters (see the sketch below the prerequisites).

Installation prerequisites are:
- python>3.6
- sissopp (https://sissopp_developers.gitlab.io/sissopp/)
- tensorflow
- numpy
- pandas
- scipy
- scikit-learn
- seaborn
- toml
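
The intended protocol, as a minimal sketch (the class names are those of the tutorial notebook below; the repeat count, the use of the same parameter object for clustering and SISSO, and the `test_rmse` attribute are illustrative assumptions):

``` python
import numpy as np
from cluster_based_sisso import PrimarySpaceParams, MyKmeans, MySisso

params = PrimarySpaceParams("./data/cluster_based_sisso/cubic_perovskites.csv", "lat")

# k-means is stochastic: repeat clustering + SISSO several times and judge the
# parameter choice (here, 30 clusters) by the median test RMSE.
rmses = []
for _ in range(10):
    kmeans = MyKmeans(params, 30)
    sisso = MySisso(params, kmeans, "multitask_on_all")
    rmses.append(sisso.test_rmse)  # hypothetical attribute holding the run's test RMSE
print("median test RMSE:", np.median(rmses))
```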
{
"authors": [
"Oehlers, Milena",
"Sbailo,Luigi"
],
"email": "milenaoehlers@gmail.com",
"title": "Proto- and Archetype Clustering-based SISSO",
"description": "In this tutorial two clustering methods, namely unsupervised k-means and supervised deep-aa, will be used to extract proto- and archetypes, respectively, along with corresponding clusters. The set of proto- or archetypes can be used as a substantially reduced training set for Single-Task SISSO, which outperforms random selection, while the corresponding clusters allow for an educated material2task-assignment of all training and test materials for Multi-Task SISSO, whose training on the whole training set outperforms corresponding training of Single-Task SISSO.",
"url": "https://gitlab.mpcdf.mpg.de/nomad-lab/proto_archetype_clustering_sisso",
"link": "https://analytics-toolkit.nomad-coe.eu/hub/user-redirect/notebooks/tutorials/proto_archetype_clustering_sisso.ipynb",
"link_public": "https://analytics-toolkit.nomad-coe.eu/public/user-redirect/notebooks/tutorials/proto_archetype_sisso.ipynb",
"updated": "2021-12-20",
"flags":{
"featured": false,
"top_of_list": false
},
"labels": {
"application_keyword": [
"k-means",
"deep-aa",
"SISSO","sisso",
"archetypes",
"prototypes",
"clustering",
"training set reduction",
"multi-task",
"single-task",
"unsupervised",
"supervised"
],
"application_section": [
"Materials property prediction"
],
"application_system": [
"System"
],
"category": [
"Tutorial"
],
"data_analytics_method": [
"Clustering",
"SISSO"
],
"platform": [
"jupyter"
]
}
}
%% Cell type:code id: tags:
``` python
from cluster_based_sisso import PrimarySpaceParams,DerivedSpaceParams,MyKmeans,MyDeepAA,MySisso,Repr2Members,ExecutionType
from copy import copy

data_csvpath = "./data/cluster_based_sisso/cubic_perovskites.csv"
interm_results_path = None

# Parameter objects: the primary space holds the raw input features, the
# derived space the SISSO-generated feature combinations.
basic_params = PrimarySpaceParams(data_csvpath, "lat").deterministic()
primary_space_params = copy(basic_params)
derived_space_params = DerivedSpaceParams(**basic_params._asdict())
sisso_exe_params = copy(derived_space_params)

# Cluster extraction: k-means with 30 clusters on the primary and on the
# derived space, and deep archetypal analysis on the primary space.
kmeans_prim_30 = MyKmeans(primary_space_params, 30)
kmeans_der_30 = MyKmeans(derived_space_params, 30)
deepaa_5 = MyDeepAA(primary_space_params, 1, 1, 1, 1, 4, 500)

# SISSO runs: single-task on the representatives only, or multi-task on all
# training materials with tasks defined by the clusters.
ssisso_km_prim_30 = MySisso(sisso_exe_params, kmeans_prim_30, "singletask_on_representatives")
msisso_km_der_30 = MySisso(sisso_exe_params, kmeans_der_30, "multitask_on_all")
ssisso_deepaa_5 = MySisso(sisso_exe_params, deepaa_5, "singletask_on_representatives")
```
%%%% Output: error
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-1-3fa56a5b2bd1> in <module>
----> 1 from cluster_based_sisso import PrimarySpaceParams,DerivedSpaceParams,MyKmeans,MyDeepAA,MySisso,Repr2Members,ExecutionType
2 from copy import copy
3
4 data_csvpath = "./data/cluster_based_sisso/cubic_perovskites.csv"
5 interm_results_path = None
~/Desktop/cluster_based_sisso/cluster_based_sisso/__init__.py in <module>
2 from typing_extensions import Literal
3 import json,re
----> 4 import daa_luigi
5 from common_functions import ExecutionFolder,raise_exception,as_inputs
6 from copy import copy
ModuleNotFoundError: No module named 'daa_luigi'
%% Cell type:code id: tags:
``` python
```
import json
import shutil

import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from typing import Union
class ExecutionFolder:
    """Folder in which SISSO is executed; by default a temporary folder next to
    the data file, removed again via delete_if_not_permanent()."""

    def __init__(self, permanent_location: Union[Path, str, None], refers_to_data_file: Union[Path, str]):
        execution_folder_path = Path(refers_to_data_file).parent.joinpath("tmp_sisso_exe_folder") \
            if permanent_location is None \
            else Path(permanent_location)
        execution_folder_path.mkdir(parents=True, exist_ok=True)
        self.is_permanent = permanent_location is not None
        self.path = execution_folder_path

    def delete_if_not_permanent(self):
        if not self.is_permanent:
            shutil.rmtree(self.path)
            self.path = None
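
# Usage sketch (hypothetical paths; a temporary execution folder is created
# next to the data file and removed again after the run):
#     folder = ExecutionFolder(None, "./data/cluster_based_sisso/cubic_perovskites.csv")
#     ...  # run SISSO inside folder.path
#     folder.delete_if_not_permanent()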
def raise_exception(txt: str):
    """Raises inside a conditional expression (expressions cannot contain `raise`)."""
    raise Exception(txt)
def as_inputs(inputs_jsonfilepath, data_file, property_key,
task_key=None, opset=['add', 'sub', 'mult', 'div', 'sq', 'cb', 'cbrt', 'sqrt'],
param_opset=[], calc_type='regression', desc_dim=3, n_sis_select=100, max_rung=2,
n_residual=1, n_models_store=1, n_rung_store=1, n_rung_generate=0, min_abs_feat_val=1e-05,
max_abs_feat_val=100000000.0, leave_out_inds=[], leave_out_frac=0.25, fix_intercept=False,
max_feat_cross_correlation=1.0, nlopt_seed=13, global_param_opt=False, reparam_residual=True
):
"""writes jsonfile with sisso_execution- or derived_space_construction_parameters to inputs_jsonfilepath
and returns inputs_jsonfilepath"""
jsondict = {'data_file': str(data_file),
'property_key': property_key,
'leave_out_inds': leave_out_inds,
'leave_out_frac': leave_out_frac,
'task_key': task_key,
'opset': opset,
'param_opset': param_opset,
'calc_type': calc_type,
'desc_dim': desc_dim,
'n_sis_select': n_sis_select,
'max_rung': max_rung,
'n_residual': n_residual,
'n_models_store': n_models_store,
'n_rung_store': n_rung_store,
'n_rung_generate': n_rung_generate,
'min_abs_feat_val': min_abs_feat_val,
'max_abs_feat_val': max_abs_feat_val,
'fix_intercept': fix_intercept,
'max_feat_cross_correlation': max_feat_cross_correlation,
'nlopt_seed': nlopt_seed,
'global_param_opt': global_param_opt,
'reparam_residual': reparam_residual}
    if Path(inputs_jsonfilepath).suffix != '.json':
        inputs_jsonfilepath = Path(inputs_jsonfilepath).with_suffix('.json')
        print("'.json' was appended to inputs_jsonfilepath")
    # leave_out_inds may contain material names instead of integer row indices;
    # in that case, map the names to their positions in the csv index.
    try:
        jsondict['leave_out_inds'] = [int(ind) for ind in jsondict['leave_out_inds']]
    except (ValueError, TypeError):
        data = pd.read_csv(jsondict['data_file'], sep=',', index_col='material')
        jsondict['leave_out_inds'] = [list(data.index).index(mat) for mat in jsondict['leave_out_inds']]
with open(inputs_jsonfilepath, 'w') as jsonfile:
json.dump(jsondict, jsonfile)
return str(inputs_jsonfilepath)
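
# Usage sketch (paths and property key as in the tutorial notebook; desc_dim
# and leave_out_frac are illustrative values, the rest keep their defaults):
#     as_inputs("./tmp_sisso_exe_folder/sisso.json",
#               "./data/cluster_based_sisso/cubic_perovskites.csv", "lat",
#               desc_dim=2, leave_out_frac=0.2)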
def force_leave_out_inds_representation(**space_params):
    """Converts the test-set specification (explicit leave_out_inds or a random
    leave_out_frac) into an explicit list of integer leave-out indices."""
    assert set(["data_file", "property_key", "leave_out_inds", "leave_out_frac"]).issubset(set(space_params.keys())), \
        'space_params must contain keys "data_file", "property_key", "leave_out_inds", "leave_out_frac"'
    testset_chosenby_index = (isinstance(space_params['leave_out_inds'], list)
                              and len(space_params['leave_out_inds']) > 0)
    testset_chosen_randomly = (not testset_chosenby_index
                               and isinstance(space_params['leave_out_frac'], (int, float))
                               and 0 <= space_params['leave_out_frac'] < 1)
    space_params['data_file'] = str(space_params['data_file'])
    whole_set = pd.read_csv(space_params['data_file'], index_col='material')
    # Working (training) set: drop the explicitly named materials, keep everything
    # if leave_out_frac == 0, or split off a random test fraction otherwise.
    if testset_chosenby_index:
        work_set = whole_set.loc[list(set(whole_set.index) - set(space_params['leave_out_inds'])), :]
    elif space_params['leave_out_frac'] == 0.0:
        work_set = whole_set
    elif testset_chosen_randomly:
        work_set = train_test_split(whole_set, test_size=space_params['leave_out_frac'])[0]
    else:
        raise_exception("leave_out_inds must be list of length > 0 and/or leave_out_frac must be float between 0 and 1")
    space_params['leave_out_inds'] = [list(whole_set.index).index(mat)
                                      for mat in list(set(whole_set.index) - set(work_set.index))]
    space_params['leave_out_frac'] = None
    return space_params

def primary_space_construction_parameters(basic_params_path: Path, data_file, property_key,
                                          leave_out_inds=[], leave_out_frac=0.25):
    """Returns a dict of all parameters that determine the primary space.
    data_file: str or Path pointing to a csv file.
    For the meaning of the other args, see
    https://sissopp_developers.gitlab.io/sissopp/quick_start/code_ref.html#input-files
    """
    primary_space_dict = {'data_file': data_file,
                          'property_key': property_key,
                          'leave_out_inds': leave_out_inds,
                          'leave_out_frac': leave_out_frac}
    with open(basic_params_path, 'w') as jsonfile:
        json.dump(primary_space_dict, jsonfile)
    return force_leave_out_inds_representation(**primary_space_dict)
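
# Usage sketch (hypothetical json file name; writes the primary-space
# parameters to disk and returns them with leave_out_inds made explicit):
#     params = primary_space_construction_parameters(
#         Path("basic_params.json"),
#         "./data/cluster_based_sisso/cubic_perovskites.csv", "lat")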
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 2 20:31:47 2020
matplotlib==3.2.1
tensorflow==2.3.1
tensorflow-estimator==2.3.0
tensorflow-probability==0.9.0
@author: oehlers
"""
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import tensorflow_probability.python.distributions as tfd
import pandas as pd
from func_collection import get_zfixed
def build_network(intermediate_dim=4, batch_size=1024, latent_dim=2, epochs=100):
    """Builds the deep-AA variational autoencoder; returns an `execute` function
    that trains it on a data dict and returns real- and latent-space results."""
    def execute(data, at_loss_factor=8.0, target_loss_factor=8.0, recon_loss_factor=4.0, kl_loss_factor=4.0):
        # We use a variational autoencoder to map the data set into a latent space.
        # The network forces the latent representation to lie within a fixed convex
        # hull; here a triangle (2-simplex).
        zfixed = get_zfixed(2)
        # The encoder generates a mean mu and a standard deviation sigma for each
        # data point. The point is then mapped into latent space by a stochastic
        # draw from a Gaussian N(mu, sigma^2), where the mu's are by construction
        # within the hull z_fixed.
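        # Note: get_zfixed comes from func_collection, which is not part of this
        # commit; presumably it returns the (latent_dim + 1) vertex coordinates of
        # a regular simplex in latent_dim dimensions (a triangle for latent_dim=2).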
x_train = data['train_feat']
y_train = data['train_targets']
x_test = data['test_feat']
y_test = data['test_targets']
        original_dim = x_train.shape[1]
        try:
            sideinfo_dim = y_train.shape[1]
        except (IndexError, AttributeError):
            sideinfo_dim = 1  # targets are 1-D
x_train = np.array(np.reshape(x_train, [-1, original_dim]), dtype='float32')
y_train = np.array(np.reshape(y_train, [-1, sideinfo_dim]), dtype='float32')
x_test = np.array(np.reshape(x_test, [-1, original_dim]), dtype='float32')
y_test = np.array(np.reshape(y_test, [-1, sideinfo_dim]), dtype='float32')
# network parameters
simplex_vrtxs = latent_dim + 1
# encoder
input_x = tfk.Input(shape=(original_dim,), name='encoder_input_x', dtype='float32')
x = tfkl.Dense(intermediate_dim, activation='relu')(input_x)
x = tfkl.Dense(intermediate_dim, activation='relu')(x)
        A = tfkl.Dense(simplex_vrtxs, activation='linear')(x)
        A = tfkl.Dense(simplex_vrtxs, activation=tf.nn.softmax)(A)  # rows of A sum to 1
        B_t = tfkl.Dense(simplex_vrtxs, activation='linear')(x)
        B = tf.nn.softmax(tf.transpose(B_t), axis=1)
        z_fixed = get_zfixed(latent_dim)
        z_fixed = tf.constant(z_fixed, dtype='float32')
        mu = tf.matmul(A, z_fixed)  # convex combination of simplex vertices: mu lies inside the hull
        z_pred = tf.matmul(B, mu)   # predicted positions of the archetypes themselves
        sigma = tfkl.Dense(latent_dim)(x)
        t = tfd.Normal(mu, sigma)   # sampling distribution for the latent representation
input_y = tfk.Input(shape=(sideinfo_dim,), name='encoder_input_y', dtype='float32')
y = tf.identity(input_y)
encoder = tfk.Model([input_x,input_y], [t.sample(),A,mu,sigma, tf.transpose(B) ,y], name='encoder')
encoder.summary()
# decoder
latent_inputs = tfk.Input(shape=(latent_dim,), name='z_sampling')
input_y_lat = tfk.Input(shape=(sideinfo_dim,), name='encoder_input_y_lat')
x = tfkl.Dense(intermediate_dim, activation='relu')(latent_inputs)
x = tfkl.Dense(original_dim, activation='linear')(x)
x_hat = tfkl.Dense(original_dim, activation='linear')(x)
y = tfkl.Dense(intermediate_dim, activation='relu')(latent_inputs)
y = tfkl.Dense(intermediate_dim, activation='relu')(y)
y_hat = tfkl.Dense(sideinfo_dim, activation='linear')(y)
decoder = tfk.Model([latent_inputs,input_y_lat], [x_hat,y_hat], name='decoder')
decoder.summary()
# VAE
encoded = encoder([input_x,input_y])
outputs = decoder([encoded[0],encoded[-1]])
vae = tfk.Model([input_x,input_y], outputs, name='vae')
        reconstruction_loss = tfk.losses.mse(input_x, outputs[0])
        class_loss = tfk.losses.mse(input_y, outputs[1])
        archetype_loss = tf.reduce_sum(tfk.losses.mse(z_fixed, z_pred))
        kl_loss = 1 + sigma - tf.square(mu) - tf.exp(sigma)
        kl_loss = tf.reduce_sum(kl_loss, axis=-1)
        kl_loss *= -0.5
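        # This is the standard VAE KL divergence to a unit Gaussian,
        #   KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2),
        # with the Dense output `sigma` playing the role of log(sigma^2).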
        # annealing kl_loss parameter (milena): when anneal == 1, kl_loss_factor
        # is ramped up over the first 100 epochs instead of being applied at
        # full strength from the start.
        anneal = 0
        kl_loss_max = kl_loss_factor
        if anneal == 1:
            kl_loss_factor = tfk.backend.variable(0.)

        class NewCallback(tfk.callbacks.Callback):
            def __init__(self, kl_loss_factor):
                super().__init__()
                self.kl_loss_factor = kl_loss_factor

            def on_epoch_end(self, epoch, logs=None):
                if epoch <= 100:
                    tfk.backend.set_value(self.kl_loss_factor,
                                          tfk.backend.get_value(self.kl_loss_factor) + epoch / 100 * kl_loss_max)

        callbacks = [NewCallback(kl_loss_factor)] if anneal == 1 else None
vae_loss = tf.reduce_mean(recon_loss_factor*reconstruction_loss
+ target_loss_factor*class_loss
+ kl_loss_factor*kl_loss
+ at_loss_factor*archetype_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.summary()
vae.fit([x_train,y_train],
epochs=epochs,
batch_size=batch_size,
validation_split = 0.25,
callbacks = callbacks)
        # archetypes: decode the predicted archetype positions back to feature space
        archetypes, _ = decoder([z_pred, tf.zeros([3, 3])])  # dummy y-input for the decoder
        get_archetypes = tfk.Model(input_x, [archetypes, z_pred], name='get_zpred')
        t, A, mu, sigma, B_t, y = encoder.predict([x_train, np.zeros(np.shape(y_train))])
        archetypes_pred, z_pred = get_archetypes(x_train)
x_train_pred, y_train_pred = vae.predict([x_train,np.zeros(np.shape(y_train))])
t_train,A_train,mu_train,sigma_train, B_t_train, y_trainzeros = encoder.predict([x_train,np.zeros(np.shape(y_train))])
x_test_pred, y_test_pred = vae.predict([x_test,np.zeros(np.shape(y_test))])
t_test,A_test,mu_test,sigma_test, B_t_test, y_testzeros = encoder.predict([x_test,np.zeros(np.shape(y_test))])
result = {('train','real space','features'): x_train,
('train','real space', 'targets'): y_train,
('train', 'latent space', 'As'): A_train,
('train','latent space','mus'): mu_train,
('train','latent space','sigmas'): sigma_train,
('train','reconstructed real space','features'): x_train_pred,
('train','reconstructed real space','targets'): y_train_pred,
('test', 'real space', 'features'): x_test,
('test', 'real space', 'targets'): y_test,
('test', 'latent space', 'As'): A_test,
('test', 'latent space', 'mus'): mu_test,
('test', 'latent space', 'sigmas'): sigma_test,
('test', 'reconstructed real space', 'features'): x_test_pred,
('test', 'reconstructed real space', 'targets'): y_test_pred }
return result
return execute
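
# Usage sketch (`data` is the dict of train/test features and targets that
# `execute` expects, cf. the keys read at the top of `execute`; the argument
# values are illustrative):
#     execute = build_network(intermediate_dim=4, batch_size=256, latent_dim=2, epochs=100)
#     result = execute(data)
#     mu_train = result[('train', 'latent space', 'mus')]  # latent coords inside the simplex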
import json
from setuptools import setup, find_packages
with open('metainfo.json') as file:
metainfo = json.load(file)
setup(
name='cluster_based_sisso',
version='1.0',
author=', '.join(metainfo['authors']),
author_email=metainfo['email'],
url=metainfo['url'],
description=metainfo['title'],
long_description=metainfo['description'],
packages=find_packages(),
    install_requires=['sissopp', 'tensorflow', 'numpy', 'pandas', 'scipy', 'scikit-learn', 'seaborn', 'toml'],
)