Tabular data modeling - with Tensorflow
In this notebook I use a competition dataset of server log parameters to classify whether there was an attempt to attack the server.
- Import required libraries
- Download data
- Target Distribution
- Data Preprocessing
- Model setup
- Train the model
- Predict with test data
In this notebook I give TensorFlow a shot at tabular data modeling. This particular competition dataset is quite imbalanced, as you will see a little later in the notebook. The example is more or less fully adapted from the example in the TensorFlow docs.
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
!unzip data.zip
import pandas as pd
from pathlib import Path
# Directory holding the competition CSV files (presumably where the archive
# was extracted - confirm for your environment).
DATA_PATH = Path('/content/Dataset')

# Load the training set, using the first CSV column as the index.
# `parse_dates` parses the DATE column in one vectorised pass; it replaces the
# per-cell `pd.to_datetime` converter and the `infer_datetime_format` flag,
# which pandas has deprecated since 2.0.
train_data = pd.read_csv(DATA_PATH / 'Train.csv', index_col=0, parse_dates=['DATE'])
train_data.head()
# Bar chart of how many rows fall into each target class.
target = train_data['MULTIPLE_OFFENSE']
target.value_counts().plot.bar()

# bincount orders its counts by label value, so the two-way unpacking below
# yields the count of label 0 first and label 1 second.
neg, pos = np.bincount(target)
total = neg + pos
summary = 'Examples:\n Total: {}\n Positive: {} ({:.2f}% of total)\n'
print(summary.format(total, pos, 100 * pos / total))
We do have an imbalanced dataset: as shown above, 95% of the targets are positive (Class 1).
# One seed shared by NumPy, TensorFlow and (further down) the data splits,
# intended to make runs repeatable.
seed = 98
np.random.seed(seed)
tf.random.set_seed(seed)
We shall drop the date column and try to model with only the logging parameters
# Work on a copy of the training data without the DATE column.
cleaned_df = train_data.drop(columns=['DATE'])
We split the training data into three sets: train, validation and test.
# Name of the label column, used for stratification and label extraction.
target_col = 'MULTIPLE_OFFENSE'

# First hold out 20% of the rows as a test set, then carve 20% of the
# remainder off as a validation set. Both splits are stratified on the label.
train_df, test_df = train_test_split(
    cleaned_df,
    test_size=0.2,
    random_state=seed,
    stratify=cleaned_df[target_col],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=seed,
    stratify=train_df[target_col],
)

# `pop` removes the label column from each frame and returns it, so the
# frames hold only feature columns afterwards.
train_labels = np.array(train_df.pop(target_col))
val_labels = np.array(val_df.pop(target_col))
test_labels = np.array(test_df.pop(target_col))

# Boolean mask selecting the training rows whose label is non-zero.
bool_train_labels = train_labels != 0

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)
Create a preprocessing pipeline with scikit-learn
from sklearn.pipeline import Pipeline
# Fill missing values with the per-column median, then standardise.
preproc_pipe = Pipeline(steps=[
    ('median_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Fit on the training split only; the other splits reuse the fitted pipeline.
train_features = preproc_pipe.fit_transform(train_features)
val_features, test_features = (
    preproc_pipe.transform(split) for split in (val_features, test_features)
)

# Clamp every transformed value into [-5, 5].
train_features, val_features, test_features = (
    np.clip(split, -5, 5)
    for split in (train_features, val_features, test_features)
)
Check the dimensions of our data
# Report the label and feature array shapes for each split.
for description, array in (
    ('Training labels shape:', train_labels),
    ('Validation labels shape:', val_labels),
    ('Test labels shape:', test_labels),
    ('Training features shape:', train_features),
    ('Validation features shape:', val_features),
    ('Test features shape:', test_features),
):
    print(description, array.shape)
Let's check if our preprocessing has shown a distinction between the classes of the dataset
# Split the preprocessed training rows by class and restore the column names
# so individual features can be selected by name.
pos_df = pd.DataFrame(train_features[bool_train_labels], columns=train_df.columns)
neg_df = pd.DataFrame(train_features[~bool_train_labels], columns=train_df.columns)

# The data vectors are passed as `x=` / `y=` keywords: seaborn >= 0.12 no
# longer accepts them positionally, while older releases accept both forms.
sns.jointplot(x=pos_df['X_10'], y=pos_df['X_15'],
              kind='hex', xlim=(-1, 1), ylim=(-1, 1))
plt.suptitle("Positive distribution")

sns.jointplot(x=neg_df['X_10'], y=neg_df['X_15'],
              kind='hex', xlim=(-1, 1), ylim=(-1, 1))
_ = plt.suptitle("Negative distribution")
There is a slight difference in the values: for the positive class the distribution sits slightly on the negative side of zero, while for the negative class it sits slightly on the positive side of zero.
Let's setup the model layers and the evaluation metrics
def plot_loss(history, label, n):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Args:
        history: Object exposing ``epoch`` and a ``history`` mapping with
            'loss' and 'val_loss' entries (presumably a Keras History).
        label: Text appended to the 'Train ' / 'Val ' legend entries.
        n: Index into the module-level ``colors`` list; both curves use
            the same colour, the validation curve being dashed.
    """
    line_color = colors[n]
    curves = (
        ('loss', 'Train ' + label, {}),
        ('val_loss', 'Val ' + label, {'linestyle': "--"}),
    )
    # A log scale keeps a wide range of loss values readable.
    for key, legend, extra in curves:
        plt.semilogy(history.epoch, history.history[key],
                     color=line_color, label=legend, **extra)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
# Metrics handed to `model.compile` by `make_model`. The `name` given to each
# metric is the key later used to look its values up, with a `val_` prefix
# for validation data (e.g. 'recall' is monitored as 'val_recall').
METRICS = [
    # Raw confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Summary scores.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]
def make_model(metrics=METRICS, output_bias=None):
    """Build and compile the binary classifier.

    The network is one 16-unit ReLU layer, 50% dropout, and a single
    sigmoid output unit, compiled with Adam and binary cross-entropy.

    Args:
        metrics: Metrics passed to ``compile``; defaults to the module-level
            ``METRICS`` list.
        output_bias: Optional value for the output layer's initial bias. When
            given it is wrapped in a ``Constant`` initializer; when ``None``
            it is passed to the layer as ``bias_initializer=None``.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    model = keras.Sequential([
        keras.layers.Dense(
            16, activation='relu',
            # Input width follows the module-level training feature matrix.
            input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=output_bias),
    ])

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated in tf.keras and rejected by Keras 3.
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
                  loss=keras.losses.BinaryCrossentropy(),
                  metrics=metrics)
    return model
Set constants and callbacks
# Upper bound on training epochs, and the batch size used for fit/evaluate.
EPOCHS = 500
BATCH_SIZE = 4096

# Stop when validation recall has not improved for 10 consecutive epochs and
# restore the weights from the best epoch seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_recall',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
Calculate initial bias for a smooth training
import os
import tempfile

# With a sigmoid output, a bias of log(pos / neg) makes the untrained model
# predict the positive-class frequency pos / (pos + neg).
initial_bias = np.log([pos / neg])
initial_bias

model = make_model(output_bias=initial_bias)
results = model.evaluate(train_features, train_labels, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

# Save the freshly initialised weights to a temporary directory so another
# model can be started from the same point.
# Keras 3 requires weight files to end in `.weights.h5`; older tf.keras
# versions accept the same name and write an HDF5 file.
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial_weights.weights.h5')
model.save_weights(initial_weights)
Calculate class weights to set to the model
# Weight each class by the inverse of its count, scaled by total / 2.
weight_for_0 = (1 / neg) * (total) / 2.0
weight_for_1 = (1 / pos) * (total) / 2.0
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

# Build a fresh model and start it from the weights saved earlier.
weighted_model = make_model()
weighted_model.load_weights(initial_weights)

fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    # The class weights go here
    class_weight=class_weight,
)
weighted_history = weighted_model.fit(train_features, train_labels, **fit_options)
We have got a good validation recall of 0.9026
# Enlarge the default figure and capture matplotlib's default colour cycle,
# which the plotting helpers read through the module-level `colors` list.
mpl.rcParams.update({'figure.figsize': (12, 10)})
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

plot_loss(weighted_history, "Weighted", 0)
Looks like there still could be some room for improvement
def plot_metrics(history):
    """Draw train and validation curves for four metrics in a 2x2 grid.

    Plots 'loss', 'auc', 'precision' and 'recall' against the epoch number,
    with the validation curve dashed and in the same colour as the training
    curve.

    Args:
        history: Object exposing ``epoch`` and a ``history`` mapping that
            holds each metric plus its ``val_``-prefixed counterpart
            (presumably a Keras History).
    """
    # Metrics with a fixed y-range other than the default [0, 1].
    fixed_limits = {'auc': [0.8, 1]}
    curve_color = colors[0]

    for position, metric in enumerate(('loss', 'auc', 'precision', 'recall'), start=1):
        plt.subplot(2, 2, position)
        plt.plot(history.epoch, history.history[metric],
                 color=curve_color, label='Train')
        plt.plot(history.epoch, history.history['val_' + metric],
                 color=curve_color, linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(metric.replace("_", " ").capitalize())

        if metric == 'loss':
            # Anchor the loss axis at zero but keep the automatic upper limit.
            upper = plt.ylim()[1]
            plt.ylim([0, upper])
        else:
            plt.ylim(fixed_limits.get(metric, [0, 1]))
        plt.legend()


plot_metrics(weighted_history)
# Load the competition test set and drop the date column, as was done for the
# training data.
test_data = pd.read_csv(DATA_PATH / 'Test.csv', index_col=0)
test_data.drop('DATE', axis='columns', inplace=True)

# Reuse the already-fitted preprocessing pipeline, then apply the same
# [-5, 5] clipping that the training features received.
proc_test = np.clip(preproc_pipe.transform(np.array(test_data)), -5, 5)

# The network ends in a single sigmoid unit, so `predict` yields one
# probability per row (shape (n, 1)). argmax over that length-1 axis is
# always 0, so the class is decided by thresholding at 0.5 instead.
probabilities = weighted_model.predict(proc_test)
preds = (probabilities.ravel() > 0.5).astype(int)

sub_df = pd.DataFrame(
    {'INCIDENT_ID': test_data.index,
     'MULTIPLE_OFFENSE': preds},
)
sub_df.head()
sub_df.to_csv('submission_df.csv', index=False)

# Sanity check: both classes should normally appear among the predictions.
sub_df['MULTIPLE_OFFENSE'].value_counts()
I tried various batch sizes and numbers of epochs to get the model to predict the classes differently. Maybe I should also try changing the model structure to achieve a different result, because this submission only scored a recall of 50 on the competition test set.
I will also post other notebooks very soon, which will use machine learning based decision tree and random forest methods which performed way better on this problem.