import numpy as np
from glob import glob


def _model_dim(path):
    """Return the integer in the third "_"-separated field of the file name.

    For a file name such as ``train_dim_2_model_0.dat`` this is ``2``.
    """
    return int(path.split("/")[-1].split("_")[2])


def _model_sort_key(path):
    """Return ``(dimension, directory)`` so train/test lists sort identically."""
    return _model_dim(path), path.rsplit("/", 1)[0]


def _sorted_model_files(dir_expr, prefix):
    """Return the ``prefix`` model files under ``dir_expr`` in a stable order.

    ``glob`` returns matches in arbitrary order, so sorting on the dimension
    alone would leave the order inside one dimension undefined; the directory
    is used as a tie-breaker to make the order deterministic.
    """
    return sorted(
        glob(dir_expr + "/models/" + prefix + "_*_model_0.dat"),
        key=_model_sort_key,
    )


def jackknife_cv_conv_est(dir_expr):
    """Get the jackknife variance of the CV test error

    Args:
        dir_expr (str): Glob pattern (not a regular expression) matching the
            directories to analyse. Each matched directory is searched for
            ``models/train_*_model_0.dat`` and ``models/test_*_model_0.dat``.

    Returns:
        tuple: ``(avg_error, variance)``, two arrays with one entry per
        dimension.

        avg_error: Mean of the leave-one-out means of the test RMSE.
        variance: The jackknife estimate of the variance of the test RMSE.

        With a single file per dimension the leave-one-out mean is undefined
        and both arrays contain ``nan``.

    Raises:
        IndexError: If no training model files match ``dir_expr``.
        ValueError: If the train and test files cannot be paired one-to-one,
            or if the dimensions found are not exactly ``1..n_dim`` with the
            same number of files for each dimension.
    """
    train_model_list = _sorted_model_files(dir_expr, "train")
    test_model_list = _sorted_model_files(dir_expr, "test")

    if not train_model_list:
        # IndexError is kept because that is what indexing the empty list
        # raised before; only the message is new.
        raise IndexError(
            "No model files found matching "
            + dir_expr
            + "/models/train_*_model_0.dat"
        )

    train_keys = [_model_sort_key(path) for path in train_model_list]
    test_keys = [_model_sort_key(path) for path in test_model_list]
    if train_keys != test_keys:
        # Pairing by position is only meaningful when every train file has a
        # test file of the same dimension in the same directory.
        raise ValueError(
            "Train and test model files do not pair up one-to-one by "
            "dimension and directory"
        )

    n_dim = train_keys[-1][0]
    dims, counts = np.unique([key[0] for key in train_keys], return_counts=True)
    if dims.tolist() != list(range(1, n_dim + 1)) or len(set(counts.tolist())) != 1:
        # Without this check the reshape below could succeed while mixing
        # values from different dimensions into the same row.
        raise ValueError(
            "Expected the same number of model files for every dimension "
            "from 1 to " + str(n_dim)
        )

    # NOTE(review): Model is not imported in the visible source; it has to be
    # provided by the surrounding module/package - confirm where it lives.
    models = [
        Model(train_file, test_file)
        for train_file, test_file in zip(train_model_list, test_model_list)
    ]

    # One row per dimension, one column per matched directory.
    test_rmse = np.array([model.test_rmse for model in models]).reshape(n_dim, -1)
    n_samples = test_rmse.shape[1]

    # Leave-one-out means: drop each sample in turn and average the rest.
    x_bar_i = (test_rmse.sum(axis=1, keepdims=True) - test_rmse) / (n_samples - 1.0)

    avg_error = x_bar_i.mean(axis=1)
    variance = (
        (n_samples - 1.0)
        / n_samples
        * np.sum((x_bar_i - avg_error.reshape(n_dim, 1)) ** 2.0, axis=1)
    )
    return avg_error, variance