randomfun/min-char-rnn-nb.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RNN Language Model\n",
    "\n",
    "Below is a diagram of the RNN computation that we will implement below. We're plugging characters into the RNN with a 1-hot encoding and expecting it to predict the next character. In this example the training data is the string \"hello\", so there are 4 letters in the vocabulary: [h,e,l,o]."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"rnnlm.jpeg\">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "np.random.seed(1337)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data has 4573338 characters, 67 unique.\n"
     ]
    }
   ],
   "source": [
    "# data I/O\n",
    "# get shakespeare from http://cs.stanford.edu/people/karpathy/shakespeare.txt\n",
    "data = open('shakespeare.txt', 'r').read() # should be simple plain text file\n",
    "chars = list(set(data))\n",
    "data_size, vocab_size = len(data), len(chars)\n",
    "print 'data has %d characters, %d unique.' % (data_size, vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "char_to_ix = { ch:i for i,ch in enumerate(chars) }\n",
    "ix_to_char = { i:ch for i,ch in enumerate(chars) }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "41"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "char_to_ix['a']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " thing when he was young,\n"
     ]
    }
   ],
   "source": [
    "# lets sample a batch of data\n",
    "seq_length = 25 # number of characters in the batch\n",
    "p = 220000 # point in the book to sample from\n",
    "print data[p:p+seq_length] # print a chunk of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2, 61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7]\n",
      "[61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7, 0]\n"
     ]
    }
   ],
   "source": [
    "inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
    "targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
    "print inputs\n",
    "print targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n"
     ]
    }
   ],
   "source": [
    "# lets plug the first character into the RNN\n",
    "ix_input = inputs[0]\n",
    "ix_target = targets[0]\n",
    "# encode the input character with a 1-hot representation\n",
    "x = np.zeros((vocab_size,1))\n",
    "x[ix_input] = 1\n",
    "print x.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# create random starting parameters\n",
    "hidden_size = 10\n",
    "Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden\n",
    "Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden\n",
    "Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output\n",
    "bh = np.zeros((hidden_size, 1)) # hidden bias\n",
    "by = np.zeros((vocab_size, 1)) # output bias"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n"
     ]
    }
   ],
   "source": [
    "# compute the hidden state\n",
    "h_prev = np.zeros((hidden_size, 1))\n",
    "h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))\n",
    "print h.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  3.83078834e-05   3.73893809e-05  -1.07305639e-04   2.67774461e-04\n",
      "   1.19325035e-04  -3.37354220e-04  -3.34852820e-04   3.32538588e-04\n",
      "   3.18794726e-04  -1.03554566e-05   1.36845629e-04  -5.38710213e-04\n",
      "  -4.57711660e-04   2.99868693e-04   1.68690516e-04   1.78520042e-04\n",
      "   2.36693532e-04  -7.03401501e-05  -7.16902773e-05   1.27203335e-04\n",
      "   8.04252112e-05   1.59279072e-04   2.17314448e-04  -2.72573982e-05\n",
      "  -7.02211895e-04  -2.42104646e-05   5.30702842e-04   1.94948893e-04\n",
      "  -9.90711082e-05   4.03627462e-04  -2.93323333e-04   3.93383233e-04\n",
      "  -4.05743771e-04  -3.38345251e-04  -5.29097616e-05  -4.80911954e-05\n",
      "   1.33355225e-04   4.75679274e-04   6.58103198e-05   1.94801460e-04\n",
      "  -2.20233399e-04   3.08235108e-04  -2.53968781e-05   1.44684905e-04\n",
      "  -8.29930806e-05   5.83661343e-04   7.77516275e-05   2.55059906e-04\n",
      "   1.81287457e-04   2.64252428e-04   1.78958318e-04  -2.92440820e-04\n",
      "  -3.40954240e-04   8.32849829e-05  -6.05381172e-05   3.07491179e-04\n",
      "   3.90229048e-04  -1.39828111e-04   2.11848605e-04  -5.79932785e-04\n",
      "   6.02578409e-05  -2.12493371e-04   2.07956569e-04  -2.84122452e-04\n",
      "  -2.30967665e-04   3.80355416e-04   9.87418965e-05]\n"
     ]
    }
   ],
   "source": [
    "# compute the scores for next character\n",
    "y = np.dot(Why, h) + by\n",
    "print y.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
      "  0.01492572  0.01492165  0.01492793  0.01492059  0.01492138  0.0149305\n",
      "  0.0149263 ]\n",
      "probabilities sum to  1.0\n"
     ]
    }
   ],
   "source": [
    "# the scores are unnormalized log probabilities. compute the probabilities\n",
    "p = np.exp(y) / np.sum(np.exp(y))\n",
    "print p.ravel()\n",
    "print 'probabilities sum to ', p.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "probability assigned to the correct next character is right now:  0.0149216543899\n"
     ]
    }
   ],
   "source": [
    "print 'probability assigned to the correct next character is right now: ', p[ix_target,0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the cross-entropy (softmax) loss is  4.20494180632\n"
     ]
    }
   ],
   "source": [
    "loss = -np.log(p[ix_target,0])\n",
    "print 'the cross-entropy (softmax) loss is ', loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
      "  0.01492572 -0.98507835  0.01492793  0.01492059  0.01492138  0.0149305\n",
      "  0.0149263 ]\n",
      "sum of dy is  1.45716771982e-16\n",
      "the gradient for the correct character (t) is: -0.98507834561\n",
      "the gradient for the character (a) is:  0.0149294265437\n"
     ]
    }
   ],
   "source": [
    "# compute the gradient on y\n",
    "dy = np.copy(p)\n",
    "dy[ix_target] -= 1\n",
    "print dy.ravel()\n",
    "print 'sum of dy is ', dy.sum()\n",
    "print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])\n",
    "print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the hidden vector activations were:\n",
      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n",
      "the gradients are:\n",
      "[-0.00983172 -0.01049054  0.00911112 -0.01110877 -0.00134586  0.00686193\n",
      " -0.01532285  0.0099507  -0.00530998  0.01267197]\n",
      "the gradients dWhy have size:  (67, 10)\n",
      "a small sample is:\n",
      "[[ -4.80319012e-05   9.55970187e-05   6.33594305e-05   1.16955768e-04]\n",
      " [ -4.80318571e-05   9.55969309e-05   6.33593723e-05   1.16955661e-04]\n",
      " [ -4.80249076e-05   9.55830995e-05   6.33502052e-05   1.16938739e-04]\n",
      " [ -4.80429242e-05   9.56189576e-05   6.33739710e-05   1.16982609e-04]]\n"
     ]
    }
   ],
   "source": [
    "# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by\n",
    "dWhy = np.dot(dy, h.T)\n",
    "dh = np.dot(Why.T, dy)\n",
    "dby = np.copy(dy)\n",
    "print 'the hidden vector activations were:'\n",
    "print h.ravel()\n",
    "print 'the gradients are:'\n",
    "print dh.ravel()\n",
    "print 'the gradients dWhy have size: ', dWhy.shape\n",
    "print 'a small sample is:'\n",
    "print dWhy[:4,:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "small sample of Whh:\n",
      "[[-0.0028736  -0.00195895  0.00885911 -0.00349354]\n",
      " [-0.00773272 -0.00051873  0.00219991 -0.00234756]\n",
      " [ 0.01687054 -0.01221995  0.00125455 -0.00568523]\n",
      " [-0.00031645  0.00514377  0.01194564  0.0070584 ]]\n"
     ]
    }
   ],
   "source": [
    "# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; \n",
    "# Backprop into Wxh, x, Whh, h_prev, bh:\n",
    "dh_before_tanh = (1-h**2)*dh\n",
    "dbh = np.copy(dh_before_tanh)\n",
    "dWxh = np.dot(dh_before_tanh, x.T)\n",
    "dWhh = np.dot(dh_before_tanh, h.T)\n",
    "dh_prev = np.dot(Whh.T, dh_before_tanh)\n",
    "print 'small sample of Whh:'\n",
    "print Whh[:4,:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# we now have the gradients for all parameters! (Wxh, Whh, Why, bh, by)\n",
    "# lets do a parameter update\n",
    "learning_rate = 0.1\n",
    "Wxh2 = Wxh - learning_rate * dWxh\n",
    "Whh2 = Whh - learning_rate * dWhh\n",
    "Why2 = Why - learning_rate * dWhy\n",
    "bh2 = bh - learning_rate * dbh\n",
    "by2 = by - learning_rate * dby"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "probability assigned to the correct next character was:  0.0149216543899\n",
      "probability assigned to the correct next character is now:  0.0164678301753\n",
      "the cross-entropy (softmax) loss was  4.20494180632\n",
      "the loss is now  4.10634648754\n"
     ]
    }
   ],
   "source": [
    "# these parameters should be much better! lets try it out:\n",
    "h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))\n",
    "y2 = np.dot(Why2, h2) + by2\n",
    "p2 = np.exp(y2) / np.sum(np.exp(y2))\n",
    "print 'probability assigned to the correct next character was: ', p[ix_target,0]\n",
    "print 'probability assigned to the correct next character is now: ', p2[ix_target,0]\n",
    "loss2 = -np.log(p2[ix_target,0])\n",
    "print 'the cross-entropy (softmax) loss was ', loss\n",
    "print 'the loss is now ', loss2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# note: the probability for the correct character went up! (and the loss went down)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# putting it together with loops\n",
    "def lossFun(inputs, targets, hprev):\n",
    "    \"\"\"\n",
    "    inputs,targets are both list of integers.\n",
    "    hprev is Hx1 array of initial hidden state\n",
    "    returns the loss, gradients on model parameters, and last hidden state\n",
    "    \"\"\"\n",
    "    xs, hs, ys, ps = {}, {}, {}, {}\n",
    "    hs[-1] = np.copy(hprev)\n",
    "    loss = 0\n",
    "    \n",
    "    # forward pass\n",
    "    for t in xrange(len(inputs)):\n",
    "        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation\n",
    "        xs[t][inputs[t]] = 1\n",
    "        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state\n",
    "        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars\n",
    "        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars\n",
    "        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)\n",
    "    \n",
    "    # backward pass: compute gradients going backwards\n",
    "    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
    "    dbh, dby = np.zeros_like(bh), np.zeros_like(by)\n",
    "    dhnext = np.zeros_like(hs[0])\n",
    "    for t in reversed(xrange(len(inputs))):\n",
    "        dy = np.copy(ps[t])\n",
    "        dy[targets[t]] -= 1 # backprop into y\n",
    "        dWhy += np.dot(dy, hs[t].T)\n",
    "        dby += dy\n",
    "        dh = np.dot(Why.T, dy) + dhnext # backprop into h\n",
    "        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity\n",
    "        dbh += dhraw\n",
    "        dWxh += np.dot(dhraw, xs[t].T)\n",
    "        dWhh += np.dot(dhraw, hs[t-1].T)\n",
    "        dhnext = np.dot(Whh.T, dhraw)\n",
    "        \n",
    "    # clip to mitigate exploding gradients\n",
    "    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:\n",
    "        np.clip(dparam, -5, 5, out=dparam)\n",
    "    \n",
    "    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "105.118896505\n"
     ]
    }
   ],
   "source": [
    "loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)\n",
    "print loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# TODO: write the sampling code\n",
    "def sample(h, seed_ix, n):\n",
    "    \"\"\" \n",
    "    sample a sequence of integers from the model \n",
    "    h is initial memory state, seed_ix is seed letter for first time step\n",
    "    n is the number of time steps to sample for\n",
    "    \"\"\"\n",
    "    x = np.zeros((vocab_size, 1))\n",
    "    x[seed_ix] = 1\n",
    "    ixes = [] # sampled indices\n",
    "    for t in xrange(n):\n",
    "        pass # TODO: run the RNN for one time step, sample from distribution\n",
    "    return ixes\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# TODO: write the optimization loop\n",
    "# Loop over the dataset from beginning to end, sampling batches of characters seq_length long\n",
    "# Call the loss function and get the gradients\n",
    "# Perform a parameter update\n",
    "# Sample some examples from the model"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}