Training the Inclusive classifier with tf.keras using data in TFRecord format

tf.keras Inclusive classifier, Transformer-based model. This notebook trains a neural network for the particle classifier using the Inclusive Classifier, using as input the full list of reconstructed particles and the High Level Features. Data is prepared in TFRecord format, converted from Parquet using Apache Spark. TensorFlow data processing uses tf.data and tf.io. This notebook uses a Transformer layer instead of a GRU in the neural network for the Inclusive classifier.

Credits: this notebook is part of the work:

The model is a classifier implemented as the concatenation of a Dense Neural Network and a Transformer

Create the Keras model for the inclusive classifier

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential, Input, Model
from tensorflow.keras.layers import Masking, Dense, Activation, GRU, Dropout, concatenate
In [2]:
In [3]:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, TimeDistributed

# Sequence branch of the model: self-attention over the particle list.
# Input: one sequence of 801 entries with 19 features each per example.
transformer_input = Input(name='transformer_input', shape=(801, 19))

# Multi-head self-attention: the same tensor is passed as query, value and key.
attention_layer = MultiHeadAttention(key_dim=19, num_heads=8)
attention_output = attention_layer(
    transformer_input,
    transformer_input,
    transformer_input,
)
# Normalise the attention result, then regularise it with dropout.
attention_normed = LayerNormalization(epsilon=1e-6)(attention_output)
attention_output = Dropout(rate=0.2)(attention_normed)

# The same Dense(20) projection is applied independently at every sequence position.
td_layer = TimeDistributed(Dense(units=20, activation='relu'))
td_output = td_layer(attention_output)

# Collapse (sequence, features) into a single vector per example.
flatten_layer = Flatten()
flatten_output = flatten_layer(td_output)

# Compress the flattened sequence into a 10-dimensional summary;
# `output` is what the later cells concatenate with the HLF branch.
dense_layer = Dense(units=10, activation='relu')
output = dense_layer(flatten_output)
In [5]:
# High Level Features branch: a 14-element input vector that only goes through dropout.
hlf_input = Input(name='hlf_input', shape=(14,))
b = hlf_input  # plain alias of the input tensor
hlfBranch = Dropout(rate=0.2)(b)
In [6]:
# Join the sequence-branch summary and the HLF branch along the last (feature) axis.
c = tf.keras.layers.Concatenate(axis=-1)([output, hlfBranch])
In [7]:
# Classification head on the merged features: one hidden layer, then a
# 3-way softmax producing the class probabilities.
c = Dense(units=25, activation='relu')(c)
output = Dense(units=3, activation='softmax')(c)
In [8]:
# Assemble the two-input model. The order of `inputs` is
# (sequence input, HLF input); the previous GRU-based variant used
# inputs=[gru_input, hlf_input] instead.
model = Model(
    inputs=[transformer_input, hlf_input],
    outputs=output,
)
In [9]:
## Compile model
# Optimizer and loss are given by name, so Keras builds them with its default settings.
loss = 'categorical_crossentropy'
optimizer = 'Adam'
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
)
In [10]:
Model: "model"
 Layer (type)                   Output Shape         Param #     Connected to                     
 transformer_input (InputLayer)  [(None, 801, 19)]   0           []                               

 multi_head_attention (MultiHea  (None, 801, 19)     12027       ['transformer_input[0][0]',      
 dAttention)                                                      'transformer_input[0][0]',      

 layer_normalization (LayerNorm  (None, 801, 19)     38          ['multi_head_attention[0][0]']   

 dropout (Dropout)              (None, 801, 19)      0           ['layer_normalization[0][0]']    

 time_distributed (TimeDistribu  (None, 801, 20)     400         ['dropout[0][0]']                

 flatten (Flatten)              (None, 16020)        0           ['time_distributed[0][0]']       

 hlf_input (InputLayer)         [(None, 14)]         0           []                               

 dense_1 (Dense)                (None, 10)           160210      ['flatten[0][0]']                

 dropout_1 (Dropout)            (None, 14)           0           ['hlf_input[0][0]']              

 concatenate (Concatenate)      (None, 24)           0           ['dense_1[0][0]',                

 dense_2 (Dense)                (None, 25)           625         ['concatenate[0][0]']            

 dense_3 (Dense)                (None, 3)            78          ['dense_2[0][0]']                

Total params: 173,378
Trainable params: 173,378
Non-trainable params: 0

Load test and training data in TFRecord format, using tf.data and tf.io

In [11]:
# The data can be downloaded from https://github.com/cerndb/SparkDLTrigger/tree/master/Data
# For CERN users, the data is already available on EOS
FOLDER = "/eos/project/s/sparkdltrigger/public/"

# Test dataset: list the part files without shuffling.
PATH = "".join([FOLDER, "testUndersampled_InclusiveClassifier.tfrecord"])
files_test_dataset = tf.data.Dataset.list_files(
    file_pattern=PATH + "/part-r*",
    shuffle=False,
)

# Training dataset: list the part files, passing a fixed seed to list_files.
PATH = "".join([FOLDER, "trainUndersampled.tfrecord"])
files_train_dataset = tf.data.Dataset.list_files(
    file_pattern=PATH + "/part-r*",
    seed=4242,
)
In [12]:
# tunable

test_dataset = files_test_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE).interleave(

train_dataset = files_train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE).interleave(
    tf.data.TFRecordDataset, cycle_length=num_parallel_reads,
In [13]:
# Function to decode TF records into the required features and labels
def decode(serialized_example):
    deser_features = tf.io.parse_single_example(
      # Defaults are not specified since both keys are required.
          'HLF_input': tf.io.FixedLenFeature((14), tf.float32),
          'GRU_input': tf.io.FixedLenFeature((801,19), tf.float32),
          'encoded_label': tf.io.FixedLenFeature((3), tf.float32),
    return((deser_features['GRU_input'], deser_features['HLF_input']), deser_features['encoded_label'])
In [14]:
# use for debug
# for record in test_dataset.take(1):
#     print(record)
In [15]:
# Decode every serialized record of both datasets with `decode`
# (test first, then train), letting tf.data choose the parallelism.
parsed_test_dataset, parsed_train_dataset = (
    raw_dataset.map(decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for raw_dataset in (test_dataset, train_dataset)
)
In [16]:
# Show an example of the parsed data
# for record in parsed_test_dataset.take(1):
#     print(record)
In [17]:
# tunable

<_PrefetchDataset element_spec=((TensorSpec(shape=(None, 801, 19), dtype=tf.float32, name=None), TensorSpec(shape=(None, 14), dtype=tf.float32, name=None)), TensorSpec(shape=(None, 3), dtype=tf.float32, name=None))>
In [18]:
# tunable
# test_batch_size = 10240
test_batch_size = 128


Train the tf.keras model

In [19]:
# train the Keras model

# tunable
num_epochs = 6

# callbacks = [ tf.keras.callbacks.TensorBoard(log_dir='./logs') ]
callbacks = []

%time history = model.fit(train, validation_data=test, epochs=num_epochs, callbacks=callbacks)
Epoch 1/6
  26767/Unknown - 9664s 360ms/step - loss: 0.2304 - accuracy: 0.9146
26767/26767 [==============================] - 10619s 396ms/step - loss: 0.2304 - accuracy: 0.9146 - val_loss: 0.1925 - val_accuracy: 0.9299
Epoch 2/6
26767/26767 [==============================] - 10437s 390ms/step - loss: 0.1927 - accuracy: 0.9294 - val_loss: 0.1808 - val_accuracy: 0.9346
Epoch 3/6
26767/26767 [==============================] - 10457s 391ms/step - loss: 0.1807 - accuracy: 0.9339 - val_loss: 0.1656 - val_accuracy: 0.9400
Epoch 4/6
26767/26767 [==============================] - 10426s 390ms/step - loss: 0.1687 - accuracy: 0.9385 - val_loss: 0.1520 - val_accuracy: 0.9450
Epoch 5/6
26767/26767 [==============================] - 10437s 390ms/step - loss: 0.1590 - accuracy: 0.9423 - val_loss: 0.1470 - val_accuracy: 0.9474
Epoch 6/6
26767/26767 [==============================] - 10446s 390ms/step - loss: 0.1529 - accuracy: 0.9447 - val_loss: 0.1500 - val_accuracy: 0.9462
CPU times: user 2h 47min 13s, sys: 12h 15min 39s, total: 15h 2min 52s
Wall time: 17h 27min 3s
In [20]:
# Save the model
# tf.keras.models.save_model(model, PATH+"mymodel" + ".tf", save_format='tf')

Performance metrics

In [21]:
%matplotlib notebook
import matplotlib.pyplot as plt 
# Graph with loss vs. epoch

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend(loc='upper right')
plt.title("HLF classifier loss")
In [22]:
# Graph with accuracy vs. epoch
%matplotlib notebook
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.legend(loc='lower right')
plt.title("HLF classifier accuracy")
Confusion Matrix

In [25]:
# model = tf.keras.models.load_model("./mymodel.tf")
In [ ]:
# Optionally run this
# %time model.evaluate(test)
In [25]:
%time y_pred = model.predict(test)
6689/6689 [==============================] - 967s 145ms/step
CPU times: user 4min 6s, sys: 17min 57s, total: 22min 3s
Wall time: 16min 12s
In [26]:
%time y_test_all = [labels.numpy() for features,labels in parsed_test_dataset.__iter__()]
CPU times: user 6min 51s, sys: 4min 34s, total: 11min 26s
Wall time: 6min 46s
In [27]:
# Stack the first `num_entries` per-record label arrays into a single array,
# one row per record.
y_true = np.stack(y_test_all[0:num_entries], axis=0)
In [28]:
from sklearn.metrics import accuracy_score

# Turn the one-hot labels and the predicted class scores into class indices,
# then report the fraction of records where the two agree.
true_classes = np.argmax(y_true, axis=1)
predicted_classes = np.argmax(y_pred, axis=1)
classifier_accuracy = accuracy_score(true_classes, predicted_classes)

print('Accuracy of the classifier: {:.4f}'.format(classifier_accuracy))
Accuracy of the classifier: 0.9462
In [29]:
import seaborn as sns
from sklearn.metrics import confusion_matrix
labels_name = ['qcd', 'tt', 'wjets']
labels = [0,1,2]

# scikit-learn convention: cm[i, j] counts records whose true class is i and
# whose predicted class is j (rows = true labels, columns = predicted labels).
cm = confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1), labels=labels)

## Normalize CM
# Divide every row by its own total so each row shows the fractions of one true
# class. keepdims=True keeps the row sums as a column vector; without it the
# division broadcasts across columns and divides column j by the total of row j.
# The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
cm = cm / cm.astype(float).sum(axis=1, keepdims=True)

fig, ax = plt.subplots()
# The heatmap draws matrix rows along the y-axis and columns along the x-axis,
# so the y-axis carries the true labels and the x-axis the predicted labels.
ax = sns.heatmap(cm, annot=True, fmt='g',
                 xticklabels=labels_name, yticklabels=labels_name, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels');
In [30]:
from sklearn.metrics import roc_curve, auc

# Per-class ROC curves: column k of the labels is scored against column k of
# the predictions. Results are stored in dictionaries keyed by the class index.
fpr, tpr, roc_auc = {}, {}, {}

for class_idx in range(3):
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true[:, class_idx], y_pred[:, class_idx])
    fpr[class_idx] = false_pos_rate
    tpr[class_idx] = true_pos_rate
    roc_auc[class_idx] = auc(false_pos_rate, true_pos_rate)
In [31]:
# Dictionary containing the ROC-AUC for the three classes
{0: 0.9936400941715727, 1: 0.994817438367295, 2: 0.9922890256542002}
In [32]:
%matplotlib notebook

# Plot roc curve 
import matplotlib.pyplot as plt

plt.plot(fpr[0], tpr[0], lw=2, 
         label='HLF classifier (AUC) = %0.4f' % roc_auc[0])
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Background Contamination (FPR)')
plt.ylabel('Signal Efficiency (TPR)')
plt.title('$tt$ selector')
plt.legend(loc="lower right")
In [ ]: