# STA 414, Spring 2014, Assignment #3, Neural network functions
# The networks handled by these functions have one layer of hidden
# units, with tanh as the activation function. The networks are used
# for predicting a binary response, with a logistic link function.
#
# The network parameters (weights) are in four groups: w1 for weights
# on the input to hidden connections, used in computing the summed
# input into a hidden unit, w1_0 for the constants added to this sum,
# w2 for the weights on the hidden to output connections, used to
# compute the sum that is the output of the network, and w2_0 for the
# constant added to this sum.
#
# The network parameters are sometimes stored as a list with named
# elements w1, w1_0, w2, and w2_0, and sometimes as a single vector
# containing all these parameters, in that order. Conversion between
# these representations is done with R's unlist and relist functions.
# SKELETON FOR LIST OF NETWORK PARAMETERS. Returns a skeleton suitable for
# use with "relist" for converting a vector of parameters for a network with
# p inputs and m hidden units to a list.
mlp_skeleton <- function (p,m)
{
  # Zero-valued template for the four parameter groups: input-to-hidden
  # weights (p x m), hidden unit constants (m), hidden-to-output weights
  # (m), and the single output constant.
  list (w1   = matrix (0, nrow=p, ncol=m),
        w1_0 = numeric(m),
        w2   = numeric(m),
        w2_0 = 0)
}
# FIT MLP NETWORK USING SIMPLE GRADIENT DESCENT. The arguments are a vector
# of real responses for training cases, a matrix of inputs for training cases,
# the number of hidden units, the learning rate, the number of iterations
# of gradient descent to do, and two penalty magnitudes (for input-hidden and
# hidden-output weights), which default to zero. The penalty is the penalty
# magnitude times the sum of squares of weights in the group.
#
# Note that the input variables and response variables should be scaled to
# have close to mean zero and standard deviation one, so that a single learning
# rate is roughly appropriate for all weights.
#
# The initial network weights are randomly chosen independently from Gaussian
# distributions with mean zero and standard deviation 0.01, except that w2_0
# is set to the mean response in the training cases.
#
# The result returned is a list with vector element E giving the average
# minus log likelihood per training case for each iteration and matrix
# element W giving the network
# parameters for each iteration (rows) in vector (not list) form. Note that
# these results are for states after each iteration, with initial values not
# included.
mlp_train <- function (y, X, m, eta, iters, lambda1=0, lambda2=0)
{
n <- nrow(X)   # number of training cases
p <- ncol(X)   # number of input variables
skel <- mlp_skeleton(p,m)
# Per-iteration records: E[iter] is the average minus log likelihood and
# W[iter,] the parameter vector, both for the state AFTER iteration iter.
E <- rep(NA,iters)
W <- matrix(NA,iters,length(unlist(skel)))
# Random initial weights, N(0, 0.01^2), except the last element of the
# unlisted parameter vector (w2_0) is set to the mean response.
w <- c (rnorm(length(unlist(skel))-1,0,0.01), mean(y))
wl <- relist(w,skel)
fw <- mlp_forward(X,wl)
# Build a vector of penalty magnitudes aligned with the parameter vector:
# lambda1 for the w1 weights, lambda2 for the w2 weights, and zero for the
# constants w1_0 and w2_0, which are left unpenalized.
lambda <- skel
lambda$w1[,] <- lambda1
lambda$w2[] <- lambda2
lambda <- unlist(lambda)
for (iter in 1:iters)
{
bk <- mlp_backward(y,X,wl,fw)
gr <- mlp_grad(X,skel,fw,bk)
# Gradient descent step: gradient of minus log likelihood plus the
# derivative of the quadratic penalty (lambda * w^2 has derivative
# 2 * lambda * w).
w <- w - eta*unlist(gr) - eta*2*lambda*w
wl <- relist(w,skel)
W[iter,] <- w
# Forward pass with the updated weights; also reused at the top of the
# next iteration by mlp_backward.
fw <- mlp_forward(X,wl)
# Average minus log likelihood, with 0/1 responses recoded to -1/+1.
E[iter] <- mean(log(1+exp(-fw$o*(2*y-1))))
}
list (E=E, W=W)
}
# FORWARD NETWORK COMPUTATIONS. Given the matrix of inputs and the network
# parameters in list form, this function returns a list with a matrix of hidden
# unit inputs (s) for all training cases, a matrix of hidden unit values (h) for
# all training cases, and a matrix (with one column) of output unit values (o)
# for all training cases.
mlp_forward <- function (X, wl)
{
  # Number of training cases; each row of X is one case's inputs.
  # (The unused local p <- ncol(X) from the original has been removed.)
  n <- nrow(X)
  # Summed input into each hidden unit: the constants w1_0 replicated
  # across the n cases, plus the input-to-hidden weighted sums.
  s <- matrix(wl$w1_0,n,length(wl$w1_0),byrow=TRUE) + X %*% wl$w1
  # Hidden unit values, via the tanh activation function.
  h <- tanh(s)
  # Output unit values (n x 1 matrix): hidden-to-output weighted sum
  # plus the output constant.
  o <- h %*% wl$w2 + wl$w2_0
  list (s=s, h=h, o=o)
}
# BACKWARD NETWORK COMPUTATIONS. Given the training responses, training inputs,
# network parameters in list form, and results of the forward computation, this
# function returns a list of matrices of partial derivatives with respect to
# unit values, for each training case.
mlp_backward <- function (y, X, wl, fw)
{
  # Derivative of minus log likelihood with respect to each output unit
  # value: sigmoid(o) - y, for 0/1 responses under the logistic link.
  prob1 <- 1 / (1 + exp(-fw$o))
  dE_do <- prob1 - y
  # Propagate back to the hidden unit values through the hidden-to-output
  # weights (n x 1 output derivatives times the length-m weight vector,
  # giving an n x m matrix).
  dE_dh <- dE_do %*% wl$w2
  # Chain through tanh: d tanh(s)/ds = 1 - tanh(s)^2 = 1 - h^2.
  dE_ds <- dE_dh * (1 - fw$h^2)
  list (dE_do=dE_do, dE_dh=dE_dh, dE_ds=dE_ds)
}
# COMPUTE GRADIENT OF LOG LIKELIHOOD WITH RESPECT TO PARAMETERS. Takes
# as arguments the matrix of inputs, the skeleton for the parameter list, and
# the results of the forward and backward computations. Returns the gradient
# of minus the log likelihood in list form.
mlp_grad <- function (X, skel, fw, bk)
{
  # Start from the skeleton so the result has the right list structure.
  gr <- skel
  # Output constant: sum of output derivatives over training cases.
  gr$w2_0 <- sum (bk$dE_do)
  # Hidden-to-output weights: h^T dE/do (an m x 1 matrix).
  gr$w2 <- crossprod (fw$h, bk$dE_do)
  # Hidden unit constants: sum of dE/ds over training cases.
  gr$w1_0 <- colSums (bk$dE_ds)
  # Input-to-hidden weights: X^T dE/ds (p x m), computed as one matrix
  # product rather than the original explicit loop over input variables.
  gr$w1 <- crossprod (X, bk$dE_ds)
  gr
}
# CHECK THAT GRADIENTS ARE CORRECT.
# CHECK THAT GRADIENTS ARE CORRECT. Compares a central finite-difference
# approximation of the gradient of minus the log likelihood with the
# analytic gradient from mlp_grad, for parameter vector w. Returns a list
# of the finite-difference gradient, the analytic gradient, and their
# difference (all in vector form); the difference should be near zero.
mlp_check_grad <- function (y, X, skel, w, epsilon=0.001)
{
  # Finite differences, one parameter at a time: perturb parameter i by
  # -/+ epsilon and difference the resulting error values.
  grd <- unlist(skel)
  for (i in seq_along(w))          # seq_along is safe for length-zero w
  {
    w_lo <- w
    w_lo[i] <- w_lo[i] - epsilon
    w_hi <- w
    w_hi[i] <- w_hi[i] + epsilon
    # Minus log likelihood with 0/1 responses recoded to -1/+1.
    E1 <- sum ( log(1 + exp(-(2*y-1)*mlp_forward(X,relist(w_lo,skel))$o)) )
    E2 <- sum ( log(1 + exp(-(2*y-1)*mlp_forward(X,relist(w_hi,skel))$o)) )
    grd[i] <- (E2-E1) / (2*epsilon)
  }
  # Analytic gradient from the forward/backward computations.
  wl <- relist(w,skel)
  fw <- mlp_forward (X, wl)
  bk <- mlp_backward (y, X, wl, fw)
  gr <- mlp_grad (X, skel, fw, bk)
  # Named for readability; positional access ([[1]], [[2]], [[3]]) still
  # works as it did for the original unnamed result.
  list (fd=unlist(grd), analytic=unlist(gr), diff=unlist(gr)-unlist(grd))
}