{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"# Weight Decay"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import d2l\n",
"from mxnet import autograd, gluon, init, nd\n",
"from mxnet.gluon import data as gdata, loss as gloss, nn"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## High-dimensional Linear Regression\n",
"\n",
"$$y = 0.05 + \\sum_{i = 1}^d 0.01 x_i + \\epsilon \\text{ where }\n",
"\\epsilon \\sim \\mathcal{N}(0, 0.01^2)$$"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "2"
}
},
"outputs": [],
"source": [
"# 20 training examples, 100 test examples, 200 input features.\n",
"n_train = 20\n",
"n_test = 100\n",
"num_inputs = 200\n",
"\n",
"# Ground-truth parameters: every weight is 0.01 and the bias is 0.05.\n",
"true_w = nd.ones((num_inputs, 1)) * 0.01\n",
"true_b = 0.05\n",
"\n",
"# Draw the features, compute the noiseless labels, then add Gaussian noise\n",
"# drawn with scale=0.01.\n",
"features = nd.random.normal(shape=(n_train + n_test, num_inputs))\n",
"labels = nd.dot(features, true_w) + true_b\n",
"labels = labels + nd.random.normal(scale=0.01, shape=labels.shape)\n",
"\n",
"# The first n_train rows form the training set; the rest form the test set.\n",
"train_features = features[:n_train]\n",
"test_features = features[n_train:]\n",
"train_labels = labels[:n_train]\n",
"test_labels = labels[n_train:]"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"## Weight Decay from Scratch\n",
"\n",
"### Initialize Model Parameters"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "5"
}
},
"outputs": [],
"source": [
"def init_params():\n",
"    \"\"\"Create fresh model parameters with gradient buffers attached.\n",
"\n",
"    Returns:\n",
"        A list `[w, b]`: `w` has shape `(num_inputs, 1)` and is drawn with\n",
"        `nd.random.normal(scale=1)`; `b` is a zero array of shape `(1,)`.\n",
"    \"\"\"\n",
"    params = [nd.random.normal(scale=1, shape=(num_inputs, 1)),\n",
"              nd.zeros(shape=(1,))]\n",
"    for param in params:\n",
"        param.attach_grad()\n",
"    return params"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Define Squared $\\ell_2$ Norm Penalty"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "6"
}
},
"outputs": [],
"source": [
"def l2_penalty(w):\n",
"    \"\"\"Return half the sum of the squared entries of `w`.\"\"\"\n",
"    squared = w ** 2\n",
"    return squared.sum() / 2"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Define Training and Testing"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "7"
}
},
"outputs": [],
"source": [
"# Optimization hyperparameters.\n",
"batch_size = 1\n",
"num_epochs = 100\n",
"lr = 0.003\n",
"\n",
"# Model and loss function come from the d2l helper package.\n",
"net = d2l.linreg\n",
"loss = d2l.squared_loss\n",
"\n",
"# Shuffled mini-batch iterator over the training set.\n",
"train_iter = gdata.DataLoader(\n",
"    gdata.ArrayDataset(train_features, train_labels),\n",
"    batch_size, shuffle=True)\n",
"\n",
"def fit_and_plot(lambd):\n",
"    \"\"\"Train the linear model with an L2 penalty and plot the loss curves.\n",
"\n",
"    Args:\n",
"        lambd: Coefficient multiplying `l2_penalty(w)` in the training loss;\n",
"            0 disables the penalty.\n",
"\n",
"    Plots the per-epoch train and test loss on a log scale and prints the\n",
"    L2 norm of the learned weights.\n",
"    \"\"\"\n",
"    w, b = init_params()\n",
"    train_ls, test_ls = [], []\n",
"\n",
"    def evaluate(X, y):\n",
"        # Mean loss on (X, y) without the penalty term, as a scalar.\n",
"        return loss(net(X, w, b), y).mean().asscalar()\n",
"\n",
"    for _ in range(num_epochs):\n",
"        for X, y in train_iter:\n",
"            with autograd.record():\n",
"                # The L2 norm penalty term has been added.\n",
"                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)\n",
"            # Backpropagate outside the recording scope, then take one\n",
"            # SGD step per mini-batch.\n",
"            l.backward()\n",
"            d2l.sgd([w, b], lr, batch_size)\n",
"        # Record one train/test loss value per epoch for the plot.\n",
"        train_ls.append(evaluate(train_features, train_labels))\n",
"        test_ls.append(evaluate(test_features, test_labels))\n",
"    d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',\n",
"                 range(1, num_epochs + 1), test_ls, ['train', 'test'], [8, 4])\n",
"    print('l2 norm of w:', w.norm().asscalar())"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### Training without Regularization"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"attributes": {
"classes": [],
"id": "",
"n": "8"
}
},
"outputs": [
{
"data": {
"image/svg+xml": [
"\n",
"\n",
"\n",
"\n"
],
"text/plain": [
"