Arquivos
randomfun/min-char-rnn-nb3.ipynb
Andrej Karpathy bd2b874ced steps
2016-02-10 15:52:32 -08:00

832 linhas
26 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RNN Language Model\n",
"\n",
"Below is a diagram of the RNN computation that we will implement below. We're plugging characters into the RNN with a 1-hot encoding and expecting it to predict the next character. In this example the training data is the string \"hello\", so there are 4 letters in the vocabulary: [h,e,l,o]."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"rnnlm.jpeg\">"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"np.random.seed(1337)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data has 4573338 characters, 67 unique.\n"
]
}
],
"source": [
"# data I/O\n",
"# get shakespeare from http://cs.stanford.edu/people/karpathy/shakespeare.txt\n",
"data = open('shakespeare.txt', 'r').read() # should be simple plain text file\n",
"chars = list(set(data))\n",
"data_size, vocab_size = len(data), len(chars)\n",
"print 'data has %d characters, %d unique.' % (data_size, vocab_size)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"char_to_ix = { ch:i for i,ch in enumerate(chars) }\n",
"ix_to_char = { i:ch for i,ch in enumerate(chars) }"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"41"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"char_to_ix['a']"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" thing when he was young,\n"
]
}
],
"source": [
"# lets sample a batch of data\n",
"seq_length = 25 # number of characters in the batch\n",
"p = 220000 # point in the book to sample from\n",
"print data[p:p+seq_length] # print a chunk of data"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2, 61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7]\n",
"[61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7, 0]\n"
]
}
],
"source": [
"inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
"targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
"print inputs\n",
"print targets"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
]
}
],
"source": [
"# lets plug the first character into the RNN\n",
"ix_input = inputs[0]\n",
"ix_target = targets[0]\n",
"# encode the input character with a 1-hot representation\n",
"x = np.zeros((vocab_size,1))\n",
"x[ix_input] = 1\n",
"print x.ravel()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# create random starting parameters\n",
"hidden_size = 10\n",
"Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden\n",
"Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden\n",
"Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output\n",
"bh = np.zeros((hidden_size, 1)) # hidden bias\n",
"by = np.zeros((vocab_size, 1)) # output bias"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-0.02566928 -0.00711926 -0.00851462 0.01228545 -0.00241891 0.00636176\n",
" -0.00171284 -0.01129739 -0.0069362 0.00932362]\n"
]
}
],
"source": [
"# compute the hidden state\n",
"h_prev = np.zeros((hidden_size, 1))\n",
"h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))\n",
"print h.ravel()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 6.29612155e-04 4.09079294e-04 -3.29899872e-04 7.98200509e-04\n",
" -2.62905161e-05 4.94626771e-04 1.97138889e-04 -1.01600591e-04\n",
" 7.72757316e-04 2.84376903e-04 -7.29973921e-04 1.56005304e-05\n",
" -1.11927240e-04 1.35442172e-04 -3.89815428e-05 -4.86357178e-05\n",
" 1.32336208e-04 3.15738595e-04 -3.87247490e-04 7.28991890e-04\n",
" -5.30632950e-05 4.20179198e-04 4.42242144e-04 2.83823246e-04\n",
" -3.58363287e-05 6.98975802e-05 4.84398003e-04 -2.81941909e-04\n",
" 5.07592676e-04 -2.68109997e-04 -6.98104505e-05 3.21717382e-04\n",
" 5.08520176e-05 6.37695233e-04 -1.02395859e-04 1.63546016e-04\n",
" -5.80853510e-04 1.19142485e-04 -3.79932371e-04 -3.94374025e-04\n",
" 7.46960859e-04 4.68737825e-04 -3.62337202e-04 -7.06302136e-06\n",
" 4.24622028e-04 9.21261371e-04 1.02755871e-04 2.95008636e-04\n",
" 1.41569258e-04 -7.44459004e-04 3.24625094e-04 -3.00690740e-05\n",
" 4.47332341e-04 2.14832415e-05 -4.31132112e-04 4.42862442e-04\n",
" 1.87427875e-06 7.64699298e-05 1.02364774e-04 2.62063645e-04\n",
" -3.09534282e-04 -5.05432663e-04 -1.66878227e-05 -7.71886342e-05\n",
" 2.44466581e-04 -2.06128445e-04 2.96956566e-04]\n"
]
}
],
"source": [
"# compute the scores for next character\n",
"y = np.dot(Why, h) + by\n",
"print y.ravel()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.01493319 0.0149299 0.01491887 0.01493571 0.0149234 0.01493117\n",
" 0.01492673 0.01492227 0.01493533 0.01492803 0.0149129 0.01492402\n",
" 0.01492212 0.01492581 0.01492321 0.01492306 0.01492576 0.0149285\n",
" 0.01491801 0.01493467 0.014923 0.01493006 0.01493039 0.01492803\n",
" 0.01492325 0.01492483 0.01493102 0.01491958 0.01493137 0.01491979\n",
" 0.01492275 0.01492859 0.01492455 0.01493331 0.01492226 0.01492623\n",
" 0.01491512 0.01492557 0.01491812 0.0149179 0.01493494 0.01493079\n",
" 0.01491838 0.01492368 0.01493013 0.01493754 0.01492532 0.01492819\n",
" 0.0149259 0.01491268 0.01492863 0.01492334 0.01493047 0.01492411\n",
" 0.01491736 0.0149304 0.01492382 0.01492493 0.01492532 0.0149277\n",
" 0.01491917 0.01491625 0.01492354 0.01492264 0.01492744 0.01492071\n",
" 0.01492822]\n",
"probabilities sum to 1.0\n"
]
}
],
"source": [
"# the scores are unnormalized log probabilities. compute the probabilities\n",
"p = np.exp(y) / np.sum(np.exp(y))\n",
"print p.ravel()\n",
"print 'probabilities sum to ', p.sum()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"probability assigned to the correct next character is right now: 0.0149162482677\n"
]
}
],
"source": [
"print 'probability assigned to the correct next character is right now: ', p[ix_target,0]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"the cross-entropy (softmax) loss is 4.20530417242\n"
]
}
],
"source": [
"loss = -np.log(p[ix_target,0])\n",
"print 'the cross-entropy (softmax) loss is ', loss"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 0.01493319 0.0149299 0.01491887 0.01493571 0.0149234 0.01493117\n",
" 0.01492673 0.01492227 0.01493533 0.01492803 0.0149129 0.01492402\n",
" 0.01492212 0.01492581 0.01492321 0.01492306 0.01492576 0.0149285\n",
" 0.01491801 0.01493467 0.014923 0.01493006 0.01493039 0.01492803\n",
" 0.01492325 0.01492483 0.01493102 0.01491958 0.01493137 0.01491979\n",
" 0.01492275 0.01492859 0.01492455 0.01493331 0.01492226 0.01492623\n",
" 0.01491512 0.01492557 0.01491812 0.0149179 0.01493494 0.01493079\n",
" 0.01491838 0.01492368 0.01493013 0.01493754 0.01492532 0.01492819\n",
" 0.0149259 0.01491268 0.01492863 0.01492334 0.01493047 0.01492411\n",
" 0.01491736 0.0149304 0.01492382 0.01492493 0.01492532 0.0149277\n",
" 0.01491917 -0.98508375 0.01492354 0.01492264 0.01492744 0.01492071\n",
" 0.01492822]\n",
"sum of dy is 2.77555756156e-17\n",
"the gradient for the correct character (t) is: -0.985083751732\n",
"the gradient for the character (a) is: 0.0149307863167\n"
]
}
],
"source": [
"# compute the gradient on y\n",
"dy = np.copy(p)\n",
"dy[ix_target] -= 1\n",
"print dy.ravel()\n",
"print 'sum of dy is ', dy.sum()\n",
"print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])\n",
"print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"the hidden vector activations were:\n",
"[-0.02566928 -0.00711926 -0.00851462 0.01228545 -0.00241891 0.00636176\n",
" -0.00171284 -0.01129739 -0.0069362 0.00932362]\n",
"the gradients are:\n",
"[-0.00824375 -0.00696831 -0.00844694 0.01640971 -0.0017776 0.00293419\n",
" 0.01496486 0.0062472 -0.00851701 0.009765 ]\n",
"the gradients dWhy have size: (67, 10)\n",
"a small sample is:\n",
"[[-0.00038332 -0.00010631 -0.00012715 0.00018346]\n",
" [-0.00038324 -0.00010629 -0.00012712 0.00018342]\n",
" [-0.00038296 -0.00010621 -0.00012703 0.00018328]\n",
" [-0.00038339 -0.00010633 -0.00012717 0.00018349]]\n"
]
}
],
"source": [
"# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by\n",
"dWhy = np.dot(dy, h.T)\n",
"dh = np.dot(Why.T, dy)\n",
"dby = np.copy(dy)\n",
"print 'the hidden vector activations were:'\n",
"print h.ravel()\n",
"print 'the gradients are:'\n",
"print dh.ravel()\n",
"print 'the gradients dWhy have size: ', dWhy.shape\n",
"print 'a small sample is:'\n",
"print dWhy[:4,:4]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"small sample of Whh:\n",
"[[-0.01152775 0.00881821 -0.00906459 0.00349652]\n",
" [ 0.00261533 0.0120227 -0.00259614 0.01621284]\n",
" [ 0.00182372 0.00073918 -0.00662722 0.02817786]\n",
" [-0.01495566 0.00292029 -0.00142797 0.00315272]]\n"
]
}
],
"source": [
"# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; \n",
"# Backprop into Wxh, x, Whh, h_prev, bh:\n",
"dh_before_tanh = (1-h**2)*dh\n",
"dbh = np.copy(dh_before_tanh)\n",
"dWxh = np.dot(dh_before_tanh, x.T)\n",
"dWhh = np.dot(dh_before_tanh, h.T)\n",
"dh_prev = np.dot(Whh.T, dh_before_tanh)\n",
"print 'small sample of Whh:'\n",
"print Whh[:4,:4]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# we now have the gradients for all parameters! (Wxh, Whh, Why, bh, by)\n",
"# lets do a parameter update\n",
"learning_rate = 0.1\n",
"Wxh2 = Wxh - learning_rate * dWxh\n",
"Whh2 = Whh - learning_rate * dWhh\n",
"Why2 = Why - learning_rate * dWhy\n",
"bh2 = bh - learning_rate * dbh\n",
"by2 = by - learning_rate * dby"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"probability assigned to the correct next character was: 0.0149162482677\n",
"probability assigned to the correct next character is now: 0.0164625966368\n",
"the cross-entropy (softmax) loss was 4.20530417242\n",
"the loss is now 4.10666434182\n"
]
}
],
"source": [
"# these parameters should be much better! lets try it out:\n",
"h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))\n",
"y2 = np.dot(Why2, h2) + by2\n",
"p2 = np.exp(y2) / np.sum(np.exp(y2))\n",
"print 'probability assigned to the correct next character was: ', p[ix_target,0]\n",
"print 'probability assigned to the correct next character is now: ', p2[ix_target,0]\n",
"loss2 = -np.log(p2[ix_target,0])\n",
"print 'the cross-entropy (softmax) loss was ', loss\n",
"print 'the loss is now ', loss2"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# note: the probability for the correct character went up! (and the loss went down)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# putting it together with loops\n",
"def lossFun(inputs, targets, hprev):\n",
" \"\"\"\n",
" inputs,targets are both list of integers.\n",
" hprev is Hx1 array of initial hidden state\n",
" returns the loss, gradients on model parameters, and last hidden state\n",
" \"\"\"\n",
" xs, hs, ys, ps = {}, {}, {}, {}\n",
" hs[-1] = np.copy(hprev)\n",
" loss = 0\n",
" \n",
" # forward pass\n",
" for t in xrange(len(inputs)):\n",
" xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation\n",
" xs[t][inputs[t]] = 1\n",
" hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state\n",
" ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars\n",
" ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars\n",
" loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)\n",
" \n",
" # backward pass: compute gradients going backwards\n",
" dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
" dbh, dby = np.zeros_like(bh), np.zeros_like(by)\n",
" dhnext = np.zeros_like(hs[0])\n",
" for t in reversed(xrange(len(inputs))):\n",
" dy = np.copy(ps[t])\n",
" dy[targets[t]] -= 1 # backprop into y\n",
" dWhy += np.dot(dy, hs[t].T)\n",
" dby += dy\n",
" dh = np.dot(Why.T, dy) + dhnext # backprop into h\n",
" dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity\n",
" dbh += dhraw\n",
" dWxh += np.dot(dhraw, xs[t].T)\n",
" dWhh += np.dot(dhraw, hs[t-1].T)\n",
" dhnext = np.dot(Whh.T, dhraw)\n",
" \n",
" # clip to mitigate exploding gradients\n",
" for dparam in [dWxh, dWhh, dWhy, dbh, dby]:\n",
" np.clip(dparam, -5, 5, out=dparam)\n",
" \n",
" return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"105.12080312\n"
]
}
],
"source": [
"loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)\n",
"print loss"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# TODO: write the sampling code\n",
"def sample(h, seed_ix, n):\n",
" \"\"\" \n",
" sample a sequence of integers from the model \n",
" h is initial memory state, seed_ix is seed letter for first time step\n",
" n is the number of time steps to sample for\n",
" \"\"\"\n",
" x = np.zeros((vocab_size, 1))\n",
" x[seed_ix] = 1\n",
" ixes = [] # sampled indices\n",
" for t in xrange(n):\n",
" pass # TODO: run the RNN for one time step, sample from distribution\n",
" return ixes\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"iter 0, loss: 105.117317\n",
"iter 100, loss: 104.983575\n",
"iter 200, loss: 104.623957\n",
"iter 300, loss: 104.095161\n",
"iter 400, loss: 103.411069\n",
"iter 500, loss: 102.748750\n",
"iter 600, loss: 101.900376\n",
"iter 700, loss: 101.043855\n",
"iter 800, loss: 100.269985\n",
"iter 900, loss: 99.357290\n",
"iter 1000, loss: 98.447726\n",
"iter 1100, loss: 97.485653\n",
"iter 1200, loss: 96.434239\n",
"iter 1300, loss: 95.316229\n",
"iter 1400, loss: 94.361564\n",
"iter 1500, loss: 93.489495\n",
"iter 1600, loss: 92.453112\n",
"iter 1700, loss: 91.702083\n",
"iter 1800, loss: 91.107364\n",
"iter 1900, loss: 90.257844\n",
"iter 2000, loss: 89.215814\n",
"iter 2100, loss: 88.522362\n",
"iter 2200, loss: 88.013419\n",
"iter 2300, loss: 87.378470\n",
"iter 2400, loss: 87.002066\n",
"iter 2500, loss: 86.754673\n",
"iter 2600, loss: 86.210309\n",
"iter 2700, loss: 85.948958\n",
"iter 2800, loss: 86.013457\n",
"iter 2900, loss: 85.739568\n",
"iter 3000, loss: 85.272399\n",
"iter 3100, loss: 85.384693\n",
"iter 3200, loss: 85.350449\n",
"iter 3300, loss: 85.119789\n",
"iter 3400, loss: 84.972730\n",
"iter 3500, loss: 84.656232\n",
"iter 3600, loss: 84.490833\n",
"iter 3700, loss: 84.449176\n",
"iter 3800, loss: 84.361207\n",
"iter 3900, loss: 84.169058\n",
"iter 4000, loss: 84.097223\n",
"iter 4100, loss: 84.034489\n",
"iter 4200, loss: 83.707687\n",
"iter 4300, loss: 83.863006\n",
"iter 4400, loss: 83.561466\n",
"iter 4500, loss: 83.265996\n",
"iter 4600, loss: 82.946393\n",
"iter 4700, loss: 82.996679\n",
"iter 4800, loss: 83.080147\n",
"iter 4900, loss: 83.110710\n",
"iter 5000, loss: 82.976515\n",
"iter 5100, loss: 82.801721\n",
"iter 5200, loss: 82.825836\n",
"iter 5300, loss: 82.666174\n",
"iter 5400, loss: 82.523678\n",
"iter 5500, loss: 82.482473\n",
"iter 5600, loss: 82.339404\n",
"iter 5700, loss: 82.110700\n",
"iter 5800, loss: 82.118627\n",
"iter 5900, loss: 82.097737\n",
"iter 6000, loss: 82.007989\n",
"iter 6100, loss: 82.116768\n",
"iter 6200, loss: 82.211658\n",
"iter 6300, loss: 82.147884\n",
"iter 6400, loss: 82.385061\n",
"iter 6500, loss: 82.285403\n",
"iter 6600, loss: 82.354143\n",
"iter 6700, loss: 82.720619\n",
"iter 6800, loss: 82.787212\n",
"iter 6900, loss: 82.999636\n",
"iter 7000, loss: 82.992352\n",
"iter 7100, loss: 83.115493\n",
"iter 7200, loss: 83.140399\n",
"iter 7300, loss: 83.273761\n",
"iter 7400, loss: 83.247985\n",
"iter 7500, loss: 83.356551\n",
"iter 7600, loss: 83.306669\n",
"iter 7700, loss: 83.182086\n",
"iter 7800, loss: 83.045905\n",
"iter 7900, loss: 82.727804\n",
"iter 8000, loss: 82.638186\n",
"iter 8100, loss: 82.478594\n",
"iter 8200, loss: 82.546788\n",
"iter 8300, loss: 82.556720\n",
"iter 8400, loss: 82.489606\n",
"iter 8500, loss: 82.482931\n",
"iter 8600, loss: 82.500909\n",
"iter 8700, loss: 82.506185\n",
"iter 8800, loss: 82.389225\n",
"iter 8900, loss: 82.780411\n",
"iter 9000, loss: 82.682004\n",
"iter 9100, loss: 82.839024\n",
"iter 9200, loss: 83.075629\n",
"iter 9300, loss: 83.029504\n",
"iter 9400, loss: 82.721697\n",
"iter 9500, loss: 82.556405\n",
"iter 9600, loss: 82.588480\n",
"iter 9700, loss: 82.519233\n",
"iter 9800, loss: 82.450922\n",
"iter 9900, loss: 82.098659\n"
]
}
],
"source": [
"# Stochastic Gradient Descent\n",
"n, p = 0, 0\n",
"smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0\n",
"learning_rate = 1e-3\n",
"while n < 10000:\n",
" # prepare inputs (we're sweeping from left to right in steps seq_length long)\n",
" if p+seq_length+1 >= len(data) or n == 0: \n",
" hprev = np.zeros((hidden_size,1)) # reset RNN memory\n",
" p = 0 # go from start of data\n",
" inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
" targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
"\n",
" # forward seq_length characters through the net and fetch gradient\n",
" loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)\n",
" smooth_loss = smooth_loss * 0.999 + loss * 0.001\n",
" if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress\n",
"\n",
" # perform parameter update with Adagrad\n",
" for param, dparam in zip([Wxh, Whh, Why, bh, by], \n",
" [dWxh, dWhh, dWhy, dbh, dby]):\n",
" param += -learning_rate * dparam\n",
"\n",
" p += seq_length # move data pointer\n",
" n += 1 # iteration counter "
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def sample(h, seed_ix, n):\n",
" \"\"\" \n",
" sample a sequence of integers from the model \n",
" h is memory state, seed_ix is seed letter for first time step\n",
" \"\"\"\n",
" x = np.zeros((vocab_size, 1))\n",
" x[seed_ix] = 1\n",
" ixes = []\n",
" for t in xrange(n):\n",
" h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)\n",
" y = np.dot(Why, h) + by\n",
" p = np.exp(y) / np.sum(np.exp(y))\n",
" ix = np.random.choice(range(vocab_size), p=p.ravel())\n",
" x = np.zeros((vocab_size, 1))\n",
" x[ix] = 1\n",
" ixes.append(ix)\n",
" return ixes"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"noe a L:odmlt hd ts\n",
"olu hed og hoe. i\n",
"nrarro uCws ' Zlmky ynwrss[ mmo\n",
"sS hCInAr\n",
"ib htneayHseiCf uueo\n",
"H'Il s,dou\n",
"usKi., t?fr\n",
"i whhfgdre AtitTe\n",
",mer\n",
"ll \n",
"G tetdjcqsrrtfs soin,rnseiIdnIshos mn iasgh\n",
"idghfMIaoeM;,oaat,wv\n",
"d , rnthoikfrlh rsqa at ;ilr wdeat tawl\n",
"atran iter yahnneesevd rls Soide TswD haPsiu no sphrcnhyGrsd mbRyos ,g sdt widotLn ohh \n",
" tcavny svag\n",
"bee rr nlh dnl h olCn . terTrfo o t d nhoeewisn riwgono maeAahe CrS\n",
"tystet i!BWehi-'Nh:syd cv mr\n",
"iceawtesbthnhul eaec ,eeddage\n",
"\n",
"mtdsu : mioeuT,ayin aharwoouh,uh alrhofnwW Tdeo CDnupee tntaehd\n",
"lee hGtc\n",
"oneE hs oh otil'rr ftO\n",
"t hoe nie yeidoqotnlt h s! ai.e,s c? eTwbr r ,etaa t y\n",
"Sar\n",
"ieeeio\n",
"oUoyehm UIGndoLeaeae h eaFfoaH\n",
"etnmris Paib\n",
"h r,iiEb\n",
"haemlkecySa,nlfTe\n",
"o hssRThokiwo R\n",
"r mso pmdsnbn weu ameUe l\n",
"tldal -enr otguO, ue \n",
"nY olKh oh Iboewornr \n",
"yhh o:a,e pntU edvhclsndH hs mEQoeoi lhnub !e n,&iirEr sdiSdWl saases tsstgitsg urhdaleeweareit,e n!hitra u\n",
"Bl pr .ih 't oit.es aoy atyt, enit fntae hEt\n",
"stuhaoIr uKis\n",
"tUikn otbareu oefudz'uay\n"
]
}
],
"source": [
"sample_ix = sample(hprev, char_to_ix['a'], 1000)\n",
"txt = ''.join(ix_to_char[ix] for ix in sample_ix)\n",
"print txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}