Training of the High Level Feature classifier with TensorFlow/Keras and Petastorm¶
TensorFlow/Keras and Petastorm, HLF classifier. This notebook trains a dense neural network for the particle classifier using High Level Features. It uses TensorFlow/Keras on a single node. Data is read using the Petastorm library.
Credits: this notebook is taken with permission from the work:
- Machine Learning Pipelines with Modern Big Data Tools for High Energy Physics, Comput Softw Big Sci 4, 8 (2020)
- Code and data at: https://github.com/cerndb/SparkDLTrigger
- The model is a classifier implemented as a DNN
- Model input: 14 "high level features", described in Topology classification with deep learning to improve real-time event selection at the LHC
- Model output: 3 classes, "W + jet", "QCD", "$t\bar{t}$"
Create the Keras model¶
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
tf.version.VERSION
# Check that we have a GPU available
tf.config.list_physical_devices('GPU')
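Optionally, you can ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front. This is a minimal sketch using the standard tf.config API; it only has an effect if a GPU is listed above and must run before the GPU is first used.
# Optional: enable on-demand GPU memory allocation (must be set before the GPU is initialized)
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)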
def create_model(nh_1, nh_2, nh_3):
    ## Create model
    model = Sequential()
    model.add(Dense(nh_1, input_shape=(14,), activation='relu'))
    model.add(Dense(nh_2, activation='relu'))
    model.add(Dense(nh_3, activation='relu'))
    model.add(Dense(3, activation='softmax'))

    ## Compile model
    optimizer = 'Adam'
    loss = 'categorical_crossentropy'
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    return model
# define the Keras model
keras_model = create_model(50, 20, 10)
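To check the resulting architecture and parameter counts before training, you can optionally print a summary of the model:
# Inspect the layers and the number of trainable parameters
keras_model.summary()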
Load data and train the Keras model¶
# Download the datasets from
# https://github.com/cerndb/SparkDLTrigger/tree/master/Data
#
# For CERN users, data is already available on EOS
PATH = "file:///eos/project/s/sparkdltrigger/public/"
file_train_dataset = PATH + "trainUndersampled_HLF_features.parquet"
file_test_dataset = PATH + "testUndersampled_HLF_features.parquet"
# PATH needs to be prefixed by the filesystem type as in:
# "file://<full_path>_on_filesystem/Parquet_folder/"
# "hdfs://<full_path_on_hdfs>/Parquet_folder/"
# Install petastorm if not done yet
!pip install petastorm
# We use the Petastorm library to load and feed the training and test data in Parquet format
# It makes use of TensorFlow's tf.data.Dataset API
import petastorm
from petastorm import make_batch_reader
from petastorm.tf_utils import make_petastorm_dataset
petastorm.__version__
Basic use of Petastorm with tf.keras; see below for training with a custom batch_size
with make_batch_reader(file_test_dataset, num_epochs = 1) as test_data:
    with make_batch_reader(file_train_dataset, num_epochs = 1, shuffle_row_groups = True) as train_data:
        print("Number of training rows:", train_data.dataset.read().num_rows)
        #
        # Transform Parquet files into TensorFlow datasets (tf.data API)
        # Note: the batch size is taken from the Parquet row group size
        # Each Parquet row group becomes a batch
        # It has been tuned to 1 MB for this dataset (the default row group size for Parquet files is 128 MB)
        #
        test_dataset = make_petastorm_dataset(test_data).cache()
        train_dataset = make_petastorm_dataset(train_data).cache()
        #
        # Train the Keras model
        #
        history = keras_model.fit(train_dataset,
                                  validation_data = test_dataset,
                                  epochs = 5)
#
# Train with TensorFlow using Petastorm to read Parquet files
# This performs a rebatching operation on the training dataset to set the batch size explicitly,
# as otherwise Petastorm produces batches with the Parquet row group size, which is often too large.
#
batch_size = 128
n_epochs = 5

with make_batch_reader(file_test_dataset, num_epochs = 1, shuffle_row_groups = False) as test_data:
    with make_batch_reader(file_train_dataset, num_epochs = 1, shuffle_row_groups = True) as train_data:
        print("Number of training rows:", train_data.dataset.read().num_rows)
        #
        # Transform Parquet files into TensorFlow datasets (tf.data API)
        #
        test_dataset = make_petastorm_dataset(test_data).cache()
        train_dataset = ( make_petastorm_dataset(train_data)
                          .unbatch()   # with TensorFlow >= 2.11 this unbatch/batch pair can be replaced by rebatch(batch_size)
                          .batch(batch_size)
                          .cache()
                        )
        #
        # Train the Keras model
        #
        history = keras_model.fit(train_dataset,
                                  validation_data = test_dataset,
                                  epochs = n_epochs, verbose=1)
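If you want to reuse the trained model later without retraining, you can optionally save it with the standard Keras API. The output path below is just an example; it assumes a TF 2.x / legacy Keras setup where save() without a file extension writes the TensorFlow SavedModel format (with Keras 3 you would pass a ".keras" filename instead).
# Optional: save the trained model for later inference (example path, SavedModel format)
keras_model.save("hlf_classifier_model")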
Performance metrics¶
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-darkgrid')
# Graph with loss vs. epoch
plt.figure()
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper right')
plt.title("HLF classifier loss")
plt.show()
# Graph with accuracy vs. epoch
%matplotlib notebook
plt.figure()
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.ylabel('Accuracy')
plt.xlabel('epoch')
plt.legend(loc='lower right')
plt.title("HLF classifier accuracy")
plt.show()
Confusion Matrix¶
import numpy as np
# Use workers_count=1 to avoid reading the data in a potentially different order at each execution
with make_batch_reader(file_test_dataset, num_epochs = 1, workers_count=1, shuffle_row_groups = False, shuffle_rows=False) as test_data:
y_pred = history.model.predict(test_data)
# Use workers_count=1 to avoid reading the data in a potentially different order at each execution
with make_batch_reader(file_test_dataset, num_epochs = 1, workers_count=1, shuffle_row_groups = False, shuffle_rows=False) as test_data:
y_true = np.concatenate([labels for features,labels in test_data])
You can also use this approach if you can afford to read the entire test set into memory
import numpy as np
with make_batch_reader(file_test_dataset, num_epochs = 1, shuffle_row_groups = False) as test_data:
    testPA = test_data.dataset.read()
    print("Number of test rows:", testPA.num_rows)
    y_true = np.stack(testPA["encoded_label"].to_numpy())
    X_test = np.stack(testPA["HLF_input"].to_numpy())
    y_pred = history.model.predict(X_test)
from sklearn.metrics import accuracy_score
print('Accuracy of the HLF classifier: {:.4f}'.format(
accuracy_score(np.argmax(y_true, axis=1),np.argmax(y_pred, axis=1))))
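For per-class precision, recall and F1 in addition to the overall accuracy, you can optionally use scikit-learn's classification_report on the same predictions; the class-name order follows the labels used in the confusion matrix cell below.
# Optional: per-class precision/recall/F1
from sklearn.metrics import classification_report
print(classification_report(np.argmax(y_true, axis=1),
                            np.argmax(y_pred, axis=1),
                            target_names=['qcd', 'tt', 'wjets']))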
import seaborn as sns
from sklearn.metrics import confusion_matrix
labels_name = ['qcd', 'tt', 'wjets']
labels = [0,1,2]
cm = confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), labels=labels)
## Normalize the confusion matrix so that each row (true class) sums to 1
cm = cm / cm.astype(float).sum(axis=1, keepdims=True)
fig, ax = plt.subplots()
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.xaxis.set_ticklabels(labels_name)
ax.yaxis.set_ticklabels(labels_name)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.show()
ROC and AUC¶
from sklearn.metrics import roc_curve, auc
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Dictionary containing the ROC-AUC for the three classes
roc_auc
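As a cross-check of the per-class values, you can also compute a single macro-averaged AUC directly from the one-hot labels and the predicted probabilities with scikit-learn:
# Optional: macro-averaged AUC over the three one-hot encoded classes
from sklearn.metrics import roc_auc_score
print("Macro-average ROC AUC: {:.4f}".format(roc_auc_score(y_true, y_pred, average='macro')))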
%matplotlib notebook
# Plot roc curve
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-darkgrid')
plt.figure()
plt.plot(fpr[0], tpr[0], lw=2,
label='HLF classifier (AUC) = %0.4f' % roc_auc[0])
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$t\bar{t}$ selector')
plt.legend(loc="lower right")
plt.show()
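The same fpr/tpr dictionaries can also be used to overlay the ROC curves of all three classes in a single figure; this is an optional visualization that reuses the class-name order from the confusion matrix cell above.
# Optional: overlay the ROC curves for all three classes
plt.figure()
for i, name in enumerate(['qcd', 'tt', 'wjets']):
    plt.plot(fpr[i], tpr[i], lw=2, label='%s (AUC = %0.4f)' % (name, roc_auc[i]))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('ROC curves per class')
plt.legend(loc='lower right')
plt.show()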