Commit 45d2e492 authored by Ievgen Vovk

Initial commit

data_files:
  mc:
    train_sample:
      magic1:
        input_mask: "../../../MCs/MAGIC/ST.03.07/za05to35/Train_sample/1.Calibrated/GA_M1*root"
        hillas_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Train_sample/2.Hillas/iv_hillas_m1.h5"
      magic2:
        input_mask: "../../../MCs/MAGIC/ST.03.07/za05to35/Train_sample/1.Calibrated/GA_M2*root"
        hillas_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Train_sample/2.Hillas/iv_hillas_m2.h5"
    test_sample:
      magic1:
        input_mask: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/1.Calibrated/GA_M1*root"
        hillas_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/2.Hillas/iv_hillas_m1.h5"
        reco_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/3.Reco/iv_reco_m1.h5"
      magic2:
        input_mask: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/1.Calibrated/GA_M2*root"
        hillas_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/2.Hillas/iv_hillas_m2.h5"
        reco_output: "../../../MCs/MAGIC/ST.03.07/za05to35/Test_sample/3.Reco/iv_reco_m2.h5"
  data:
    train_sample:
      magic1:
        input_mask: "../../../Data/MAGIC/Off/Train_sample/1.Calibrated/20*_M1_*root"
        hillas_output: "../../../Data/MAGIC/Off/Train_sample/2.Hillas/iv_hillas_m1.h5"
      magic2:
        input_mask: "../../../Data/MAGIC/Off/Train_sample/1.Calibrated/20*_M2_*root"
        hillas_output: "../../../Data/MAGIC/Off/Train_sample/2.Hillas/iv_hillas_m2.h5"
    test_sample:
      magic1:
        input_mask: "../../../Data/MAGIC/CrabNebula/1.Calibrated/20*_M1_*root"
        hillas_output: "../../../Data/MAGIC/CrabNebula/ctapipe/2.Hillas/iv_hillas_m1.h5"
        reco_output: "../../../Data/MAGIC/CrabNebula/ctapipe/3.Reco/reco_m1.h5"
      magic2:
        input_mask: "../../../Data/MAGIC/CrabNebula/1.Calibrated/20*_M2_*root"
        hillas_output: "../../../Data/MAGIC/CrabNebula/ctapipe/2.Hillas/iv_hillas_m2.h5"
        reco_output: "../../../Data/MAGIC/CrabNebula/ctapipe/3.Reco/reco_m2.h5"

image_cleanining:
  magic:
    charge_thresholds:
      picture_thresh: 6
      boundary_thresh: 3.5
      min_number_picture_neighbors: 1
    time_thresholds:
      # 1.5 slices x 1.64 GHz
      time_limit: 2.46
      # 4.5 slices x 1.64 GHz
      max_time_off: 7.38
      min_number_neighbors: 1

energy_rf:
  save_name: "RFs/energy_rf.joblib"
  cuts: "(multiplicity > 1) & (intensity > 30) & (length > 0.0) & (leakage1_intensity < 0.15)"
  settings:
    n_estimators: 100
    min_samples_leaf: 10
    n_jobs: 3
  features:
    ['slope', 'length', 'width', 'intensity', 'skewness', 'kurtosis', 'x', 'y', 'psi', 'leakage1_intensity', 'leakage2_intensity']

direction_rf:
  save_name: "RFs/direction_rf.joblib"
  cuts: "(multiplicity > 1) & (intensity > 30) & (length > 0.0) & (leakage1_intensity < 0.15)"
  settings:
    n_estimators: 100
    min_samples_leaf: 10
    n_jobs: 3
  features:
    'disp': ['slope', 'length', 'width', 'intensity', 'skewness', 'kurtosis', 'x', 'y', 'psi']
    'pos_angle_shift': ['slope', 'length', 'width', 'intensity', 'skewness', 'kurtosis', 'x', 'y', 'psi']

classifier_rf:
  save_name: "RFs/classifier_rf.joblib"
  cuts: "(multiplicity > 1) & (intensity > 30) & (length > 0.0) & (leakage1_intensity < 0.15)"
  settings:
    n_estimators: 100
    min_samples_leaf: 10
    n_jobs: 3
  features:
    ['slope', 'length', 'width', 'intensity', 'skewness', 'kurtosis', 'x', 'y', 'psi', 'n_islands']

event_list:
  max_time_diff: 6.9e-4
  cuts:
    l3rate: "80 < value < 1000"
    dc: "2.5e3 < value < 3.5e3"
# coding: utf-8

import glob
import yaml
import argparse
import datetime

import numpy as np
import uproot
import pandas
def info_message(text, prefix='info'):
    """
    Prints the specified text, prefixed with a tag and the current date.

    Parameters
    ----------
    text: str
        Text to print.
    prefix: str, optional
        Prefix marking the message origin.

    Returns
    -------
    None
    """

    date_str = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    print(f"({prefix:s}) {date_str:s}: {text:s}")
def identify_time_edges(times, criterion, max_time_diff=6.9e-4):
    """
    Identifies the time interval edges corresponding to the True
    state of the specified criterion. Neighbouring time intervals,
    separated by no more than max_time_diff, are joined together.

    Parameters
    ----------
    times: array_like
        Array of the time data points.
    criterion: array_like
        Array of True/False values, indicating the goodness of the
        corresponding data points.
    max_time_diff: float, optional
        Maximal time difference between the time intervals, below which
        they are joined into one.

    Returns
    -------
    parts_edges: list
        List of [start, stop] pairs, describing the identified time intervals.
    """

    times = np.array(times)
    wh = np.where(np.asarray(criterion))

    if len(wh[0]) == 0:
        print("No valid time intervals found!")
        return [[0, 0]]

    # The above call results in the indices of the non-zero bins.
    # But we want the indices of their _edges_.
    non_zero_bin_edges = []
    for i in wh[0]:
        if i - 1 >= 0:
            if abs(times[i] - times[i-1]) < max_time_diff:
                non_zero_bin_edges.append(i - 1)
        non_zero_bin_edges.append(i)
        if i + 1 < len(times):
            if abs(times[i] - times[i+1]) < max_time_diff:
                non_zero_bin_edges.append(i + 1)
    non_zero_bin_edges = np.unique(non_zero_bin_edges)

    if len(non_zero_bin_edges) > 2:
        # Finding the elements that separate the observational time intervals.
        # Within one time interval diff should return 1 (the current bin has
        # index i, the next time bin has index i+1), so here we are looking
        # for diffs that are not equal to 1, or that are too far apart in time.
        cond_id = np.diff(non_zero_bin_edges[1:-1]) > 1
        cond_time = np.diff(times[non_zero_bin_edges[1:-1]]) > max_time_diff
        division_indices = np.where(cond_id | cond_time)
        division_indices = division_indices[0]

        # Concatenating the beginning and the end of the observational time
        # to the found elements. Also adding the i+1 elements, to correctly
        # switch to the next observational time interval.
        parts_edges_idx = np.concatenate(
            (
                [non_zero_bin_edges[0]],
                non_zero_bin_edges[1:-1][division_indices],
                non_zero_bin_edges[1:-1][division_indices + 1],
                [non_zero_bin_edges[-1]]
            )
        )
    else:
        parts_edges_idx = np.array(non_zero_bin_edges)

    parts_edges_idx.sort()

    # Transforming the edge indices to the real time values and reshaping
    # them into a [start, stop] list.
    parts_edges = times[parts_edges_idx]
    parts_edges = parts_edges.reshape(-1, 2)

    return parts_edges
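
# Usage sketch (editor's illustration; the values are hypothetical and
# traced against the logic above): with a 0.0097 gap in the middle of an
# otherwise densely sampled series,
#
#     >>> times = [0.0, 1e-4, 2e-4, 3e-4, 1e-2, 1.01e-2, 1.02e-2]
#     >>> identify_time_edges(times, [True] * 7, max_time_diff=6.9e-4)
#
# two intervals are returned: [[0.0, 3e-4], [1e-2, 1.02e-2]].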
def intersect_time_intervals(intervals1, intervals2):
    """
    Intersects two lists of (TStart, TStop) pairs. The returned list
    contains the start/stop intervals common to both input lists.

    Parameters
    ----------
    intervals1: list
        First list of (TStart, TStop) lists (or tuples).
    intervals2: list
        Second list of (TStart, TStop) lists (or tuples).

    Returns
    -------
    joined_intervals: list
        A list of [TStart, TStop] lists, representing the start/stop
        intervals common to both input lists.
    """

    joined_intervals = []

    # Comparing the 1st list to the 2nd one
    for interv1 in intervals1:
        tstart = None
        tstop = None

        for interv2 in intervals2:
            if (interv2[0] >= interv1[0]) and (interv2[0] <= interv1[1]):
                # Interval 2 starts inside interval 1
                tstart = interv2[0]
                tstop = min(interv2[1], interv1[1])
            elif (interv2[1] >= interv1[0]) and (interv2[1] <= interv1[1]):
                # Interval 2 stops inside interval 1
                tstart = max(interv2[0], interv1[0])
                tstop = interv2[1]
            elif (interv2[0] < interv1[0]) and (interv2[1] > interv1[1]):
                # Interval 2 fully covers interval 1
                tstart = interv1[0]
                tstop = interv1[1]

        # The explicit "is not None" check avoids wrongly discarding
        # intervals that start at time 0
        if tstart is not None and tstop is not None:
            joined_intervals.append([tstart, tstop])

    return joined_intervals
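
# Usage sketch (editor's illustration with hypothetical values):
#
#     >>> intersect_time_intervals([[0.0, 10.0]], [[5.0, 15.0]])
#     [[5.0, 10.0]]
#
# Note that for each interval of the first list at most one overlap is
# recorded (the last matching interval found in the second list).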
def gti_generator(config):
    """
    GTI list generator.

    Parameters
    ----------
    config: dict
        Configuration dictionary (e.g. read from a YAML file).

    Returns
    -------
    joint_intervals: list
        A list of (TStart, TStop) lists, representing the identified GTIs.
    """

    if 'event_list' not in config:
        raise ValueError('GTI generator error: the configuration file is missing the "event_list" section. Exiting.')

    # Identifying the files to read
    info_message("looking for the files", "GTI generator")

    file_mask = config["data_files"]["data"]["test_sample"]["magic2"]["input_mask"]
    file_list = glob.glob(file_mask)
    if not file_list:
        raise ValueError("No files to process")

    # Containers for the data points
    dfs = {
        'dc': pandas.DataFrame(),
        'l3rate': pandas.DataFrame()
    }

    # Removing the containers not specified in the configuration card
    if "l3rate" not in config['event_list']['cuts']:
        del dfs['l3rate']
    if "dc" not in config['event_list']['cuts']:
        del dfs['dc']

    # Looping over the data files
    for fnum, file_name in enumerate(file_list):
        info_message(f"processing file {fnum+1} / {len(file_list)}", "GTI generator")

        with uproot.open(file_name) as input_stream:
            # --- DC ---
            if "dc" in dfs:
                mjd = input_stream["Camera"]["MTimeCamera.fMjd"].array()
                millisec = input_stream["Camera"]["MTimeCamera.fTime.fMilliSec"].array()
                nanosec = input_stream["Camera"]["MTimeCamera.fNanoSec"].array()

                df_ = pandas.DataFrame()
                df_['mjd'] = mjd + (millisec / 1e3 + nanosec / 1e9) / 86400
                df_['value'] = input_stream["Camera"]["MReportCamera.fMedianDC"].array()

                dfs['dc'] = pandas.concat([dfs['dc'], df_])

            # --- L3 rate ---
            if "l3rate" in dfs:
                mjd = input_stream["Trigger"]["MTimeTrigger.fMjd"].array()
                millisec = input_stream["Trigger"]["MTimeTrigger.fTime.fMilliSec"].array()
                nanosec = input_stream["Trigger"]["MTimeTrigger.fNanoSec"].array()

                df_ = pandas.DataFrame()
                df_['mjd'] = mjd + (millisec / 1e3 + nanosec / 1e9) / 86400
                df_['value'] = input_stream["Trigger"]["MReportTrigger.fL3Rate"].array()

                dfs['l3rate'] = pandas.concat([dfs['l3rate'], df_])

    # Sorting the data points by date is needed for the time interval identification
    for key in dfs:
        dfs[key] = dfs[key].sort_values(by=['mjd'])

    info_message("identifying GTIs", "GTI generator")

    time_intervals_list = []

    # Identifying DC-related GTIs
    if "dc" in dfs:
        cut = config['event_list']['cuts']['dc']
        selection = dfs['dc'].query(cut)

        # Marking the data points that survive the cut
        _, idx, _ = np.intersect1d(dfs['dc']["mjd"], selection["mjd"], return_indices=True)
        criterion = np.repeat(False, len(dfs['dc']["mjd"]))
        criterion[idx] = True

        time_intervals = identify_time_edges(dfs['dc']["mjd"], criterion,
                                             max_time_diff=config['event_list']['max_time_diff'])
        time_intervals_list.append(time_intervals)

    # Identifying L3-related GTIs
    if "l3rate" in dfs:
        cut = config['event_list']['cuts']['l3rate']
        selection = dfs['l3rate'].query(cut)

        _, idx, _ = np.intersect1d(dfs['l3rate']["mjd"], selection["mjd"], return_indices=True)
        criterion = np.repeat(False, len(dfs['l3rate']["mjd"]))
        criterion[idx] = True

        time_intervals = identify_time_edges(dfs['l3rate']["mjd"], criterion,
                                             max_time_diff=config['event_list']['max_time_diff'])
        time_intervals_list.append(time_intervals)

    if not len(time_intervals_list):
        raise ValueError("No valid selection cuts provided in the configuration file.")

    # Intersecting all the found GTI lists
    joint_intervals = time_intervals_list[0]
    for i in range(1, len(time_intervals_list)):
        joint_intervals = intersect_time_intervals(joint_intervals, time_intervals_list[i])

    return joint_intervals
# =================
# === Main code ===
# =================
if __name__ == "__main__":
    # --------------------------
    # Adding the argument parser
    arg_parser = argparse.ArgumentParser(description="""
    This tool generates a "good time interval" (GTI) list for the data files
    specified in the configuration file.
    """)

    arg_parser.add_argument("--config", default="config.yaml",
                            help='Configuration file to steer the code execution.')
    parsed_args = arg_parser.parse_args()
    # --------------------------

    # ------------------------------
    # Reading the configuration file
    file_not_found_message = """
    Error: can not load the configuration file {:s}.
    Please check that the file exists and is of YAML or JSON format.
    Exiting.
    """

    try:
        config = yaml.safe_load(open(parsed_args.config, "r"))
    except IOError:
        print(file_not_found_message.format(parsed_args.config))
        exit()

    if 'event_list' not in config:
        print('Error: the configuration file is missing the "event_list" section. Exiting.')
        exit()
    # ------------------------------

    joint_intervals = gti_generator(config)

    print("--- Identified GTI intervals [MJD] ---")
    for interval in joint_intervals:
        print(interval)
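
Invocation sketch (the script's file name is not shown in this diff; make_event_lists.py is a hypothetical stand-in):

    python make_event_lists.py --config config.yaml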
# coding: utf-8

import argparse
import yaml
import datetime

import numpy as np
import pandas as pd
import sklearn.metrics

from ctapipe.reco.event_processing import EventClassifierPandas
from astropy import units as u
from matplotlib import pyplot, colors

from MyFunctions import GetHist
def info_message(text, prefix='info'):
    """
    Prints the specified text, prefixed with a tag and the current date.

    Parameters
    ----------
    text: str
        Text to print.
    prefix: str, optional
        Prefix marking the message origin.

    Returns
    -------
    None
    """

    date_str = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    print(f"({prefix:s}) {date_str:s}: {text:s}")
def evaluate_performance(data, class0_name='event_class_0'):
    """
    Evaluates the classifier performance: per-class "gammaness"
    distributions plus accuracy and ROC AUC metrics.
    """

    data = data.dropna()

    report = {
        "gammaness": dict(),
        "metrics": dict()
    }

    # Normalized class-0 probability ("gammaness") distribution of each true class
    for event_class in data['true_event_class'].unique():
        events = data.query(f'true_event_class == {event_class}')

        hist = GetHist(events[class0_name], bins=100, range=(0, 1))
        hist['Hist'] = hist['Hist'] / hist['Hist'].sum()
        hist['Cumsum'] = 1 - np.cumsum(hist['Hist'])

        report['gammaness'][event_class] = hist

    # Selecting either the per-telescope or the averaged ("mean") class columns
    if 'mean' in class0_name:
        class_names = list(filter(lambda name: 'event_class_' in name and '_mean' in name, data.columns))
    else:
        class_names = list(filter(lambda name: 'event_class_' in name and '_mean' not in name, data.columns))

    proba = data[class_names].values
    predicted_class = proba.argmax(axis=1)

    report['metrics']['acc'] = sklearn.metrics.accuracy_score(data['true_event_class'], predicted_class)

    # Reducing the problem to the binary "class 0 vs the rest" case;
    # the labels are inverted so that class 0 (gammas) scores highest
    true_class = np.clip(data['true_event_class'], 0, 1)
    true_class = 1 - true_class

    report['metrics']['auc_roc'] = sklearn.metrics.roc_auc_score(true_class, proba[:, 0])

    return report
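
# Usage sketch (editor's illustration; the column names follow the
# conventions used above):
#
#     >>> report = evaluate_performance(shower_data, class0_name='event_class_0')
#     >>> report['metrics']['acc'], report['metrics']['auc_roc']
#
# report['gammaness'] maps each true event class to its normalized
# class-0 probability histogram, with the complementary cumulative
# distribution stored in the 'Cumsum' column.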
def load_data_sample(sample):
    """
    Loads the Hillas parameter tables of the specified telescopes
    into a single, (obs_id, event_id, tel_id)-indexed data frame.
    """

    shower_data = pd.DataFrame()

    for telescope in sample:
        info_message(f'Loading {telescope} data...', prefix='ClassifierRF')

        hillas_data = pd.read_hdf(sample[telescope]['hillas_output'], key='dl1/hillas_params')
        hillas_data.set_index(['obs_id', 'event_id', 'tel_id'], inplace=True)

        shower_data = pd.concat([shower_data, hillas_data])

    shower_data.sort_index(inplace=True)

    return shower_data
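
# Usage sketch (mirrors the calls in the main code below):
#
#     >>> mc_data = load_data_sample(config['data_files']['mc']['train_sample'])
#
# yields one row per (obs_id, event_id, tel_id) combination with the
# corresponding Hillas parameters.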
def get_weights(mc_data, bkg_data, alt_edges, intensity_edges):
    """
    Computes event weights that flatten the (tel_alt, intensity)
    distributions of the MC and background samples, so that the
    classifier does not learn from their difference.
    """

    mc_hist, _, _ = np.histogram2d(mc_data['tel_alt'],
                                   mc_data['intensity'],
                                   bins=[alt_edges, intensity_edges])

    bkg_hist, _, _ = np.histogram2d(bkg_data['tel_alt'],
                                    bkg_data['intensity'],
                                    bins=[alt_edges, intensity_edges])

    # Bins populated in both samples
    availability_hist = np.clip(mc_hist, 0, 1) * np.clip(bkg_hist, 0, 1)

    # --- MC weights ---
    mc_alt_bins = np.digitize(mc_data['tel_alt'], alt_edges) - 1
    mc_intensity_bins = np.digitize(mc_data['intensity'], intensity_edges) - 1

    # Treating the out-of-range events
    mc_alt_bins[mc_alt_bins == len(alt_edges) - 1] = len(alt_edges) - 2
    mc_intensity_bins[mc_intensity_bins == len(intensity_edges) - 1] = len(intensity_edges) - 2

    mc_weights = 1 / mc_hist[mc_alt_bins, mc_intensity_bins]
    mc_weights *= availability_hist[mc_alt_bins, mc_intensity_bins]

    # --- Bkg weights ---
    bkg_alt_bins = np.digitize(bkg_data['tel_alt'], alt_edges) - 1
    bkg_intensity_bins = np.digitize(bkg_data['intensity'], intensity_edges) - 1

    # Treating the out-of-range events
    bkg_alt_bins[bkg_alt_bins == len(alt_edges) - 1] = len(alt_edges) - 2
    bkg_intensity_bins[bkg_intensity_bins == len(intensity_edges) - 1] = len(intensity_edges) - 2

    bkg_weights = 1 / bkg_hist[bkg_alt_bins, bkg_intensity_bins]
    bkg_weights *= availability_hist[bkg_alt_bins, bkg_intensity_bins]

    # --- Storing to data frames ---
    mc_weight_df = pd.DataFrame(data={'event_weight': mc_weights},
                                index=mc_data.index)
    bkg_weight_df = pd.DataFrame(data={'event_weight': bkg_weights},
                                 index=bkg_data.index)

    return mc_weight_df, bkg_weight_df
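
# Weighting sketch (editor's note): every event gets the weight
# 1 / N(its own (tel_alt, intensity) bin), zeroed via availability_hist
# whenever the other sample is empty in that bin. E.g. a bin holding
# 4 MC and 2 background events yields MC weights of 0.25 and background
# weights of 0.5, so each populated bin contributes equally to both classes.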
# =================
# === Main code ===
# =================
# --------------------------
# Adding the argument parser
arg_parser = argparse.ArgumentParser(description="""
This tool fits the event classification random forest on the specified event files.
""")

arg_parser.add_argument("--config", default="config.yaml",
                        help='Configuration file to steer the code execution.')
parsed_args = arg_parser.parse_args()
# --------------------------
# ------------------------------
# Reading the configuration file
file_not_found_message = """
Error: can not load the configuration file {:s}.
Please check that the file exists and is of YAML or JSON format.
Exiting.
"""

try:
    config = yaml.safe_load(open(parsed_args.config, "r"))
except IOError:
    print(file_not_found_message.format(parsed_args.config))
    exit()

if 'classifier_rf' not in config:
    print('Error: the configuration file is missing the "classifier_rf" section. Exiting.')
    exit()
# ------------------------------
# --------------------
# --- Train sample ---
info_message('Loading MC train data...', prefix='ClassifierRF')
mc_data = load_data_sample(config['data_files']['mc']['train_sample'])
info_message('Loading "off" train data...', prefix='ClassifierRF')
bkg_data = load_data_sample(config['data_files']['data']['train_sample'])
# True event classes
mc_data['true_event_class'] = 0
bkg_data['true_event_class'] = 1

# Dropping events with a non-physical altitude (above pi/2 rad)
bkg_data = bkg_data.query('tel_alt < 1.5707963267948966')

# Dropping extra keys
bkg_data.drop('mjd', axis=1, inplace=True)
mc_data.drop(['true_energy', 'true_alt', 'true_az'], axis=1, inplace=True)

# Computing the event weights on a grid, uniform in sin(altitude)
# and log10(intensity)
sin_edges = np.linspace(0, 1, num=51)
alt_edges = np.arcsin(sin_edges)
intensity_edges = np.logspace(1, 5, num=51)

mc_weights, bkg_weights = get_weights(mc_data, bkg_data,
                                      alt_edges, intensity_edges)

mc_data = mc_data.join(mc_weights)
bkg_data = bkg_data.join(bkg_weights)

# Merging the train sample
shower_data_train = pd.concat([mc_data, bkg_data])
# --------------------
# -------------------
# --- Test sample ---
info_message('Loading MC test data...', prefix='ClassifierRF')
mc_data = load_data_sample(config['data_files']['mc']['test_sample'])
info_message('Loading "off" test data...', prefix='ClassifierRF')
bkg_data = load_data_sample(config['data_files']['data']['test_sample'])