\documentclass[fleqn]{article}

\usepackage{haldefs}
\usepackage{notes}
\usepackage{url}
\usepackage{graphicx}

\begin{document}
\lecture{CS5350: Machine Learning}{HW3: Neural Networks}{Due 23 Sep 2008}

\section{Reading Exercises}

From the \texttt{nnet.pdf} chapter, answer questions 4.1 and 4.2.
CS6350 students should also answer 4.9.

\section{Written Exercises}

Answer the following questions in 25--100 words each:

\bee

\i Consider the perceptron update rule (from Section~4.4.2 of the
\texttt{nnet.pdf} reading) for binary classification (so the loss is 0/1
loss).  What effect does the parameter $\eta$ have on the learning
process?

\i Suppose that we have $N$ data points in $D$ dimensions, \emph{not}
necessarily linearly separable.  We can map this data into $N$ points
in $D+N$ dimensions, forcing it to be linearly separable.  We may do
this by mapping the $n$th data point $\vec x = \langle x_1, x_2,
\dots, x_D \rangle$ to $\langle x_1, x_2, \dots, x_D, [n=1], [n=2],
[n=3], \dots, [n=N-1], [n=N] \rangle$, where by $[n=3]$ we mean a
value of $1$ if $n=3$ and a value of $0$ otherwise.  Show that the
result of this mapping \emph{is} linearly separable.

\i \emph{[6350 only]} We proved in class (and in the write-up) that
the perceptron algorithm will converge in at most $R^2/\ga^2$
iterations, for linearly separable data with margin $\ga$ and maximum
norm $R$.  Suppose we have
linearly inseparable data, but we perform the mapping from the
previous question.  How long will the perceptron take to converge?

\ene


\end{document}
