OK, so here's the fully functioning MNIST code I have. It includes the pre-processing steps and fixes the test-set forward-pass bug (see the post above).

I was also originally getting terrible results; it turned out that was mostly because of missing normalization. Anyway, here's what my pre-processing steps look like. Note that I'm using Keras here, but only because it already bundles the MNIST data and some helpful utilities. The actual neural net is (mostly) straight from the book.

def onehot(y):
    """One-hot encode an integer label vector using Keras' helper."""
    return keras.utils.np_utils.to_categorical(y)


from keras.datasets import mnist

(orig_x_train, orig_y_train), (orig_x_test, orig_y_test) = mnist.load_data()

# Work with a small subsample so training is quick.
sample_size = 2000

# Flatten each 28x28 image into a 784-vector, then scale pixels into [0, 1].
# The normalization is the step that made the difference in my results.
orig_x_train_sample = orig_x_train[:sample_size].reshape((sample_size, 784)).astype(float) / 255
orig_x_test_sample = orig_x_test[:sample_size].reshape((sample_size, 784)).astype(float) / 255

# One-hot encode the labels so they can be compared against the softmax output.
orig_y_train_sample = onehot(orig_y_train[:sample_size])
orig_y_test_sample = onehot(orig_y_test[:sample_size])

And here's the actual network:

import numpy as np
np.random.seed(1)
def tanh(x):
    """Hidden-layer activation: elementwise hyperbolic tangent of x."""
    out = np.tanh(x)
    return out
def tanh2deriv(output):
    """Derivative of tanh expressed in terms of its output: 1 - tanh(x)^2."""
    return 1 - output * output
def softmax(x):
    """Row-wise softmax over a (batch, classes) array.

    Subtracts each row's max before exponentiating so large logits cannot
    overflow np.exp (the original produced inf/NaN for big inputs); the
    shift cancels in the ratio, so results are mathematically unchanged.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)
# Hyperparameters (straight from the book's batched-MNIST listing).
alpha = 2            # learning rate
iterations = 300     # training epochs over the sample
hidden_size = 100    # neurons in the single hidden layer

pixels_per_image = 784  # flattened 28x28 input
num_labels = 10         # digit classes 0-9
batch_size = 100

# Small zero-centred random weight initialisations:
# input->hidden in [-0.01, 0.01), hidden->output in [-0.1, 0.1).
weights_0_1 = 0.02 * np.random.random((pixels_per_image, hidden_size)) - 0.01
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

# Aliases for the pre-processed data from the earlier snippet.
images, labels = orig_x_train_sample, orig_y_train_sample
test_images, test_labels = orig_x_test_sample, orig_y_test_sample
import sys  # used for the progress output below; the original never imported it

for j in range(iterations):
    correct_cnt = 0
    # Mini-batch training pass. // keeps the bound an int on Python 3
    # (the original used Python 2's xrange and integer /, which break there).
    for i in range(len(images) // batch_size):
        batch_start, batch_end = i * batch_size, (i + 1) * batch_size
        layer_0 = images[batch_start:batch_end]
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        # Inverted dropout: zero ~half the hidden units and double the
        # survivors so the layer's expected magnitude is unchanged.
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        for k in range(batch_size):
            guess = np.argmax(layer_2[k:k + 1])
            truth = np.argmax(labels[batch_start + k:batch_start + k + 1])
            correct_cnt += int(guess == truth)
        # NOTE(review): batch_size == layer_2.shape[0], so this divides by
        # batch_size^2. It matches the book's listing, but it effectively
        # shrinks the learning rate — confirm it is intentional.
        layer_2_delta = (labels[batch_start:batch_end] - layer_2) / (batch_size * layer_2.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1)
        layer_1_delta *= dropout_mask  # backprop only through surviving units
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Evaluate on the test sample one image at a time (no dropout at test time —
    # this is the forward-pass fix mentioned in the post).
    test_correct_cnt = 0
    for i in range(len(test_images)):
        layer_0 = test_images[i:i + 1]
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i + 1]))

    if j % 10 == 0:
        sys.stdout.write(
            "\n" + "I:" + str(j)
            + " Test-Acc:" + str(test_correct_cnt / float(len(test_images)))
            + " Train-Acc:" + str(correct_cnt / float(len(images)))
        )

My last couple outputs:

I:280 Test-Acc:0.856 Train-Acc:0.922
I:290 Test-Acc:0.8585 Train-Acc:0.9205