Monday, June 20, 2022

Titanic Dataset: Predict who survived and who died in the disaster

The Titanic dataset on Kaggle is a good way to exercise the Python program I posted earlier, which you can read here. The basic problem is to predict which of the passengers survived and which died. And surprisingly I got 100% accuracy.

The dataset link: Titanic Kaggle Dataset
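Before training anything, it helps to glance at what the file actually contains. Here is a minimal sketch of that, assuming you saved the Kaggle file as tested.csv (the name the code below expects):

import pandas as pd

df = pd.read_csv('tested.csv')
print(df.shape)                       # rows (passengers) x columns
print(df.columns.tolist())            # PassengerId, Survived, Pclass, Name, Sex, Age, ...
print(df['Survived'].value_counts())  # class balance: 0 = died, 1 = survived
print(df.isna().sum())                # missing values per column (e.g. Age and Cabin)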

The output:

[screenshot: the mean error printed every 1000 iterations, followed by the accuracy score and the confusion matrix]
The code:

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

df = pd.read_csv('tested.csv')

# Encode the categorical columns as numbers and fill missing values.
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'female' else 0)
df['Embarked'] = df['Embarked'].apply(lambda x: 1 if x == 'Q' else (0 if x == 'S' else 2))
df['Age'] = df['Age'].fillna(0)
df['Fare'] = df['Fare'].fillna(0)

# Features: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked (7 columns).
X = df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = df['Survived'].apply(lambda x: 1 if x == 1 else 0)

# Labels as a column vector of shape (n_samples, 1).
y = y.to_numpy().reshape(-1, 1)
Xxx = X.to_numpy()

alphas = [0.001]
hiddenSize = 80
# compute sigmoid nonlinearity; the input is clipped so np.exp cannot overflow
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-np.clip(x, -500.0, 500.0)))

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output * (1 - output)

# element-wise rectified linear function (defined but not used below)
def relu(x):
    return np.maximum(0.0, x)

for alpha in alphas:
    print("\nTraining With Alpha:" + str(alpha))
    np.random.seed(1)

    # randomly initialize our weights with mean 0
    synapse_0 = 2 * np.random.random((7, hiddenSize)) - 1           # input (7 features) -> hidden
    synapse_1 = 2 * np.random.random((hiddenSize, hiddenSize)) - 1  # hidden -> hidden
    synapse_2 = 2 * np.random.random((hiddenSize, 1)) - 1           # hidden -> output
    
    for j in range(10000):

        # Feed forward through layers 0, 1, 2, and 3
        layer_0 = Xxx
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
        layer_2 = sigmoid(np.dot(layer_1, synapse_1))
        layer_3 = sigmoid(np.dot(layer_2, synapse_2))

        # how much did we miss the target value?
        layer_3_error = layer_3 - y
        
        if (j% 1000) == 0:
            print( "Error after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_3_error))))

        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_3_delta = layer_3_error*sigmoid_output_to_derivative(layer_3)
       
        # how much did each layer_2 value contribute to the layer_3 error (according to the weights)?
        layer_2_error = layer_3_delta.dot(synapse_2.T)

        # in what direction is the target layer_2?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # backpropagate one more layer
        layer_1_error = layer_2_delta.dot(synapse_1.T)
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)
        # gradient-descent updates: each weight matrix gets its own layer's input.T dot its delta
        synapse_2 -= alpha * (layer_2.T.dot(layer_3_delta))
        synapse_1 -= alpha * (layer_1.T.dot(layer_2_delta))
        synapse_0 -= alpha * (layer_0.T.dot(layer_1_delta))

# threshold the network's outputs at 0.5 to get class predictions
y_hat = [0 if val < 0.5 else 1 for val in layer_3.ravel()]


print(accuracy_score(y, y_hat))

cm = confusion_matrix(y, y_hat)
print(cm)
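One caveat: the accuracy above is computed on the same rows the network was trained on, so it does not tell us how the model would do on passengers it has never seen. A quick sanity check is to hold out part of the data before training and score only on the held-out part. Below is a minimal sketch of that idea using scikit-learn's train_test_split with a LogisticRegression stand-in baseline (not the network above); it assumes the same tested.csv file and the same preprocessing:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv('tested.csv')

# same preprocessing as above
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'female' else 0)
df['Embarked'] = df['Embarked'].apply(lambda x: 1 if x == 'Q' else (0 if x == 'S' else 2))
df['Age'] = df['Age'].fillna(0)
df['Fare'] = df['Fare'].fillna(0)

X = df.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1)
y = df['Survived']

# hold out 20% of the passengers and never train on them
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))  # accuracy on unseen rows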
