steps

2016-02-10 15:52:32 -08:00
commit bd2b874ced
@@ -39,13 +39,14 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "data has 3291648 characters, 93 unique.\n"
+      "data has 4573338 characters, 67 unique.\n"
     ]
    }
   ],
   "source": [
    "# data I/O\n",
-    "data = open('warpeace.txt', 'r').read() # should be simple plain text file\n",
+    "# get shakespeare from http://cs.stanford.edu/people/karpathy/shakespeare.txt\n",
+    "data = open('shakespeare.txt', 'r').read() # should be simple plain text file\n",
    "chars = list(set(data))\n",
    "data_size, vocab_size = len(data), len(chars)\n",
    "print 'data has %d characters, %d unique.' % (data_size, vocab_size)"
@@ -73,7 +74,7 @@
    {
     "data": {
      "text/plain": [
-       "86"
+       "41"
      ]
     },
     "execution_count": 4,
@@ -96,7 +97,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "at the twitching cheeks o\n"
+      " thing when he was young,\n"
     ]
    }
   ],
@@ -118,8 +119,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[86, 22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43]\n",
-      "[22, 0, 22, 18, 87, 0, 22, 45, 88, 22, 40, 18, 88, 64, 41, 0, 40, 18, 87, 87, 42, 44, 0, 43, 62]\n"
+      "[2, 61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7]\n",
+      "[61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7, 0]\n"
     ]
    }
   ],
@@ -141,12 +142,10 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
-      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
-      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.\n",
-      "  0.  0.  0.]\n"
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n"
     ]
    }
   ],
@@ -188,9 +187,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ -2.19597294e-03  -1.11958403e-02  -7.96573514e-03  -8.29025315e-03\n",
-      "  -1.69046852e-02   7.16480643e-03  -1.08628224e-05   1.38136549e-03\n",
-      "   8.91231078e-03   1.22455580e-02]\n"
+      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
+      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n"
     ]
    }
   ],
@@ -212,30 +210,23 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ -1.86941995e-04   5.75478992e-05   5.45725372e-05  -4.12423671e-04\n",
-      "   3.06201649e-04  -5.06931560e-04   3.19806236e-04  -3.51163814e-05\n",
-      "  -3.38972137e-04  -2.39857442e-04  -8.01432457e-05  -1.04391992e-04\n",
-      "  -3.15399258e-04   1.48454913e-05   4.55998798e-05   1.28931375e-04\n",
-      "   4.04072882e-04  -5.75504844e-04   1.57533734e-04  -4.08258042e-04\n",
-      "  -7.41755605e-05  -7.76690551e-05   1.87837950e-04  -2.79910204e-04\n",
-      "  -7.96190232e-04  -5.79876645e-05   1.26730230e-04   3.95894081e-04\n",
-      "   2.76955496e-04   2.59637379e-05  -4.61015050e-04  -5.14636280e-04\n",
-      "  -2.56714611e-04   4.94377196e-04  -3.64149466e-04  -4.26364575e-04\n",
-      "  -1.25515821e-04   9.29132184e-06   4.88944584e-05  -6.31681837e-04\n",
-      "  -1.75121523e-05   9.51597096e-06   3.08227238e-04   1.29694758e-04\n",
-      "   1.72591168e-04  -5.87989566e-05   2.74250548e-04   5.90320270e-05\n",
-      "   1.64825617e-04   3.11431559e-04   3.48582321e-04  -4.19733486e-05\n",
-      "  -2.77062543e-04  -1.10111861e-05  -1.33642742e-05  -5.99994639e-05\n",
-      "   6.07084027e-04   3.51784289e-04  -3.89265870e-05  -1.03640935e-05\n",
-      "   2.94412266e-04   7.75998698e-05  -2.85416619e-04   4.26053337e-04\n",
-      "  -3.47809927e-04  -4.67567746e-04  -6.81980155e-05  -1.09923565e-06\n",
-      "   6.26504669e-05  -2.76671607e-04   2.96818053e-04   1.87011604e-05\n",
-      "   2.98832018e-04  -2.65531746e-04  -5.58804832e-04  -3.07801824e-04\n",
-      "   3.75513314e-04   4.76734817e-04  -2.96633044e-04  -3.06249150e-04\n",
-      "   1.22093579e-05  -1.43195488e-04  -1.58954120e-04  -1.08600298e-04\n",
-      "   1.52122727e-04  -6.54969615e-04   1.60857638e-04  -9.11628405e-04\n",
-      "   3.67740402e-05   2.16278686e-04  -1.98860111e-04   4.08539302e-05\n",
-      "  -1.47124899e-04]\n"
+      "[  3.83078834e-05   3.73893809e-05  -1.07305639e-04   2.67774461e-04\n",
+      "   1.19325035e-04  -3.37354220e-04  -3.34852820e-04   3.32538588e-04\n",
+      "   3.18794726e-04  -1.03554566e-05   1.36845629e-04  -5.38710213e-04\n",
+      "  -4.57711660e-04   2.99868693e-04   1.68690516e-04   1.78520042e-04\n",
+      "   2.36693532e-04  -7.03401501e-05  -7.16902773e-05   1.27203335e-04\n",
+      "   8.04252112e-05   1.59279072e-04   2.17314448e-04  -2.72573982e-05\n",
+      "  -7.02211895e-04  -2.42104646e-05   5.30702842e-04   1.94948893e-04\n",
+      "  -9.90711082e-05   4.03627462e-04  -2.93323333e-04   3.93383233e-04\n",
+      "  -4.05743771e-04  -3.38345251e-04  -5.29097616e-05  -4.80911954e-05\n",
+      "   1.33355225e-04   4.75679274e-04   6.58103198e-05   1.94801460e-04\n",
+      "  -2.20233399e-04   3.08235108e-04  -2.53968781e-05   1.44684905e-04\n",
+      "  -8.29930806e-05   5.83661343e-04   7.77516275e-05   2.55059906e-04\n",
+      "   1.81287457e-04   2.64252428e-04   1.78958318e-04  -2.92440820e-04\n",
+      "  -3.40954240e-04   8.32849829e-05  -6.05381172e-05   3.07491179e-04\n",
+      "   3.90229048e-04  -1.39828111e-04   2.11848605e-04  -5.79932785e-04\n",
+      "   6.02578409e-05  -2.12493371e-04   2.07956569e-04  -2.84122452e-04\n",
+      "  -2.30967665e-04   3.80355416e-04   9.87418965e-05]\n"
     ]
    }
   ],
@@ -256,22 +247,18 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ 0.01075121  0.01075383  0.0107538   0.01074878  0.01075651  0.01074777\n",
-      "  0.01075666  0.01075284  0.01074957  0.01075064  0.01075235  0.01075209\n",
-      "  0.01074982  0.01075338  0.01075371  0.0107546   0.01075756  0.01074703\n",
-      "  0.01075491  0.01074883  0.01075242  0.01075238  0.01075524  0.01075021\n",
-      "  0.01074466  0.01075259  0.01075458  0.01075747  0.01075619  0.01075349\n",
-      "  0.01074826  0.01074768  0.01075046  0.01075853  0.0107493   0.01074863\n",
-      "  0.01075187  0.01075332  0.01075374  0.01074643  0.01075303  0.01075332\n",
-      "  0.01075653  0.01075461  0.01075507  0.01075258  0.01075617  0.01075385\n",
-      "  0.01075499  0.01075656  0.01075696  0.01075276  0.01075024  0.0107531\n",
-      "  0.01075307  0.01075257  0.01075975  0.010757    0.0107528   0.0107531\n",
-      "  0.01075638  0.01075405  0.01075015  0.0107578   0.01074948  0.01074819\n",
-      "  0.01075248  0.0107532   0.01075389  0.01075024  0.01075641  0.01075342\n",
-      "  0.01075643  0.01075036  0.01074721  0.01074991  0.01075725  0.01075834\n",
-      "  0.01075003  0.01074992  0.01075335  0.01075168  0.01075151  0.01075205\n",
-      "  0.01075485  0.01074617  0.01075495  0.01074342  0.01075361  0.01075554\n",
-      "  0.01075108  0.01075365  0.01075163]\n",
+      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
+      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
+      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
+      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
+      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
+      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
+      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
+      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
+      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
+      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
+      "  0.01492572  0.01492165  0.01492793  0.01492059  0.01492138  0.0149305\n",
+      "  0.0149263 ]\n",
      "probabilities sum to  1.0\n"
     ]
    }
@@ -294,7 +281,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "probability assigned to the correct next character is right now:  0.0107552356218\n"
+      "probability assigned to the correct next character is right now:  0.0149216543899\n"
     ]
    }
   ],
@@ -313,7 +300,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "the cross-entropy (softmax) loss is  4.53236260838\n"
+      "the cross-entropy (softmax) loss is  4.20494180632\n"
     ]
    }
   ],
@@ -333,25 +320,21 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "[ 0.01075121  0.01075383  0.0107538   0.01074878  0.01075651  0.01074777\n",
-      "  0.01075666  0.01075284  0.01074957  0.01075064  0.01075235  0.01075209\n",
-      "  0.01074982  0.01075338  0.01075371  0.0107546   0.01075756  0.01074703\n",
-      "  0.01075491  0.01074883  0.01075242  0.01075238 -0.98924476  0.01075021\n",
-      "  0.01074466  0.01075259  0.01075458  0.01075747  0.01075619  0.01075349\n",
-      "  0.01074826  0.01074768  0.01075046  0.01075853  0.0107493   0.01074863\n",
-      "  0.01075187  0.01075332  0.01075374  0.01074643  0.01075303  0.01075332\n",
-      "  0.01075653  0.01075461  0.01075507  0.01075258  0.01075617  0.01075385\n",
-      "  0.01075499  0.01075656  0.01075696  0.01075276  0.01075024  0.0107531\n",
-      "  0.01075307  0.01075257  0.01075975  0.010757    0.0107528   0.0107531\n",
-      "  0.01075638  0.01075405  0.01075015  0.0107578   0.01074948  0.01074819\n",
-      "  0.01075248  0.0107532   0.01075389  0.01075024  0.01075641  0.01075342\n",
-      "  0.01075643  0.01075036  0.01074721  0.01074991  0.01075725  0.01075834\n",
-      "  0.01075003  0.01074992  0.01075335  0.01075168  0.01075151  0.01075205\n",
-      "  0.01075485  0.01074617  0.01075495  0.01074342  0.01075361  0.01075554\n",
-      "  0.01075108  0.01075365  0.01075163]\n",
-      "sum of dy is  -4.23272528138e-16\n",
-      "the gradient for the correct character (t) is: -0.989244764378\n",
-      "the gradient for the character (a) is:  0.0107549454461\n"
+      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
+      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
+      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
+      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
+      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
+      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
+      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
+      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
+      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
+      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
+      "  0.01492572 -0.98507835  0.01492793  0.01492059  0.01492138  0.0149305\n",
+      "  0.0149263 ]\n",
+      "sum of dy is  1.45716771982e-16\n",
+      "the gradient for the correct character (t) is: -0.98507834561\n",
+      "the gradient for the character (a) is:  0.0149294265437\n"
     ]
    }
   ],
@@ -377,18 +360,17 @@
     "output_type": "stream",
     "text": [
      "the hidden vector activations were:\n",
-      "[ -2.19597294e-03  -1.11958403e-02  -7.96573514e-03  -8.29025315e-03\n",
-      "  -1.69046852e-02   7.16480643e-03  -1.08628224e-05   1.38136549e-03\n",
-      "   8.91231078e-03   1.22455580e-02]\n",
+      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
+      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n",
      "the gradients are:\n",
-      "[ 0.010636   -0.00556332  0.01572234 -0.01013355  0.01024101  0.00893933\n",
-      "  0.01156068 -0.0035879  -0.00304811 -0.00761244]\n",
-      "the gradients dWhy have size:  (93, 10)\n",
+      "[-0.00983172 -0.01049054  0.00911112 -0.01110877 -0.00134586  0.00686193\n",
+      " -0.01532285  0.0099507  -0.00530998  0.01267197]\n",
+      "the gradients dWhy have size:  (67, 10)\n",
      "a small sample is:\n",
-      "[[ -2.36093564e-05  -1.20368781e-04  -8.56412557e-05  -8.91302155e-05]\n",
-      " [ -2.36151293e-05  -1.20398213e-04  -8.56621967e-05  -8.91520096e-05]\n",
-      " [ -2.36150591e-05  -1.20397855e-04  -8.56619418e-05  -8.91517444e-05]\n",
-      " [ -2.36040335e-05  -1.20341643e-04  -8.56219473e-05  -8.91101206e-05]]\n"
+      "[[ -4.80319012e-05   9.55970187e-05   6.33594305e-05   1.16955768e-04]\n",
+      " [ -4.80318571e-05   9.55969309e-05   6.33593723e-05   1.16955661e-04]\n",
+      " [ -4.80249076e-05   9.55830995e-05   6.33502052e-05   1.16938739e-04]\n",
+      " [ -4.80429242e-05   9.56189576e-05   6.33739710e-05   1.16982609e-04]]\n"
     ]
    }
   ],
@@ -418,10 +400,10 @@
     "output_type": "stream",
     "text": [
      "small sample of Whh:\n",
-      "[[-0.0181644  -0.00730641  0.00068086  0.00043998]\n",
-      " [-0.00706002 -0.00657655  0.0100175   0.00198405]\n",
-      " [ 0.0175388   0.01643369 -0.00161146 -0.01413518]\n",
-      " [ 0.00812143 -0.00105367  0.00944295  0.01148334]]\n"
+      "[[-0.0028736  -0.00195895  0.00885911 -0.00349354]\n",
+      " [-0.00773272 -0.00051873  0.00219991 -0.00234756]\n",
+      " [ 0.01687054 -0.01221995  0.00125455 -0.00568523]\n",
+      " [-0.00031645  0.00514377  0.01194564  0.0070584 ]]\n"
     ]
    }
   ],
@@ -466,10 +448,10 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "probability assigned to the correct next character was:  0.0107552356218\n",
-      "probability assigned to the correct next character is now:  0.0118750046895\n",
-      "the cross-entropy (softmax) loss was  4.53236260838\n",
-      "the loss is now  4.43331953415\n"
+      "probability assigned to the correct next character was:  0.0149216543899\n",
+      "probability assigned to the correct next character is now:  0.0164678301753\n",
+      "the cross-entropy (softmax) loss was  4.20494180632\n",
+      "the loss is now  4.10634648754\n"
     ]
    }
   ],
@@ -558,7 +540,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "113.314839864\n"
+      "105.118896505\n"
     ]
    }
   ],
@@ -622,7 +604,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
-   "version": "2.7.3"
+   "version": "2.7.11"
  }
 },
 "nbformat": 4,
@@ -0,0 +1,758 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RNN Language Model\n",
+    "\n",
+    "Below is a diagram of the RNN computation that we will implement below. We're plugging characters into the RNN with a 1-hot encoding and expecting it to predict the next character. In this example the training data is the string \"hello\", so there are 4 letters in the vocabulary: [h,e,l,o]."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"rnnlm.jpeg\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "np.random.seed(1337)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data has 4573338 characters, 67 unique.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# data I/O\n",
+    "# get shakespeare from http://cs.stanford.edu/people/karpathy/shakespeare.txt\n",
+    "data = open('shakespeare.txt', 'r').read() # should be simple plain text file\n",
+    "chars = list(set(data))\n",
+    "data_size, vocab_size = len(data), len(chars)\n",
+    "print 'data has %d characters, %d unique.' % (data_size, vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "char_to_ix = { ch:i for i,ch in enumerate(chars) }\n",
+    "ix_to_char = { i:ch for i,ch in enumerate(chars) }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "41"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "char_to_ix['a']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " thing when he was young,\n"
+     ]
+    }
+   ],
+   "source": [
+    "# lets sample a batch of data\n",
+    "seq_length = 25 # number of characters in the batch\n",
+    "p = 220000 # point in the book to sample from\n",
+    "print data[p:p+seq_length] # print a chunk of data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2, 61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7]\n",
+      "[61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7, 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
+    "targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
+    "print inputs\n",
+    "print targets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# lets plug the first character into the RNN\n",
+    "ix_input = inputs[0]\n",
+    "ix_target = targets[0]\n",
+    "# encode the input character with a 1-hot representation\n",
+    "x = np.zeros((vocab_size,1))\n",
+    "x[ix_input] = 1\n",
+    "print x.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# create random starting parameters\n",
+    "hidden_size = 10\n",
+    "Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden\n",
+    "Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden\n",
+    "Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output\n",
+    "bh = np.zeros((hidden_size, 1)) # hidden bias\n",
+    "by = np.zeros((vocab_size, 1)) # output bias"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
+      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the hidden state\n",
+    "h_prev = np.zeros((hidden_size, 1))\n",
+    "h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))\n",
+    "print h.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[  3.83078834e-05   3.73893809e-05  -1.07305639e-04   2.67774461e-04\n",
+      "   1.19325035e-04  -3.37354220e-04  -3.34852820e-04   3.32538588e-04\n",
+      "   3.18794726e-04  -1.03554566e-05   1.36845629e-04  -5.38710213e-04\n",
+      "  -4.57711660e-04   2.99868693e-04   1.68690516e-04   1.78520042e-04\n",
+      "   2.36693532e-04  -7.03401501e-05  -7.16902773e-05   1.27203335e-04\n",
+      "   8.04252112e-05   1.59279072e-04   2.17314448e-04  -2.72573982e-05\n",
+      "  -7.02211895e-04  -2.42104646e-05   5.30702842e-04   1.94948893e-04\n",
+      "  -9.90711082e-05   4.03627462e-04  -2.93323333e-04   3.93383233e-04\n",
+      "  -4.05743771e-04  -3.38345251e-04  -5.29097616e-05  -4.80911954e-05\n",
+      "   1.33355225e-04   4.75679274e-04   6.58103198e-05   1.94801460e-04\n",
+      "  -2.20233399e-04   3.08235108e-04  -2.53968781e-05   1.44684905e-04\n",
+      "  -8.29930806e-05   5.83661343e-04   7.77516275e-05   2.55059906e-04\n",
+      "   1.81287457e-04   2.64252428e-04   1.78958318e-04  -2.92440820e-04\n",
+      "  -3.40954240e-04   8.32849829e-05  -6.05381172e-05   3.07491179e-04\n",
+      "   3.90229048e-04  -1.39828111e-04   2.11848605e-04  -5.79932785e-04\n",
+      "   6.02578409e-05  -2.12493371e-04   2.07956569e-04  -2.84122452e-04\n",
+      "  -2.30967665e-04   3.80355416e-04   9.87418965e-05]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the scores for next character\n",
+    "y = np.dot(Why, h) + by\n",
+    "print y.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
+      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
+      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
+      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
+      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
+      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
+      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
+      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
+      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
+      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
+      "  0.01492572  0.01492165  0.01492793  0.01492059  0.01492138  0.0149305\n",
+      "  0.0149263 ]\n",
+      "probabilities sum to  1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# the scores are unnormalized log probabilities. compute the probabilities\n",
+    "p = np.exp(y) / np.sum(np.exp(y))\n",
+    "print p.ravel()\n",
+    "print 'probabilities sum to ', p.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "probability assigned to the correct next character is right now:  0.0149216543899\n"
+     ]
+    }
+   ],
+   "source": [
+    "print 'probability assigned to the correct next character is right now: ', p[ix_target,0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "the cross-entropy (softmax) loss is  4.20494180632\n"
+     ]
+    }
+   ],
+   "source": [
+    "loss = -np.log(p[ix_target,0])\n",
+    "print 'the cross-entropy (softmax) loss is ', loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.0149254   0.01492538  0.01492322  0.01492882  0.01492661  0.01491979\n",
+      "  0.01491983  0.01492979  0.01492958  0.01492467  0.01492687  0.01491679\n",
+      "  0.014918    0.0149293   0.01492734  0.01492749  0.01492836  0.01492378\n",
+      "  0.01492376  0.01492672  0.01492603  0.0149272   0.01492807  0.01492442\n",
+      "  0.01491435  0.01492446  0.01493275  0.01492774  0.01492335  0.01493085\n",
+      "  0.01492045  0.0149307   0.01491877  0.01491978  0.01492404  0.01492411\n",
+      "  0.01492682  0.01493193  0.01492581  0.01492773  0.01492154  0.01492943\n",
+      "  0.01492445  0.01492699  0.01492359  0.01493354  0.01492599  0.01492863\n",
+      "  0.01492753  0.01492877  0.0149275   0.01492046  0.01491974  0.01492607\n",
+      "  0.01492392  0.01492942  0.01493065  0.01492274  0.01492799  0.01491617\n",
+      "  0.01492572 -0.98507835  0.01492793  0.01492059  0.01492138  0.0149305\n",
+      "  0.0149263 ]\n",
+      "sum of dy is  1.45716771982e-16\n",
+      "the gradient for the correct character (t) is: -0.98507834561\n",
+      "the gradient for the character (a) is:  0.0149294265437\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the gradient on y\n",
+    "dy = np.copy(p)\n",
+    "dy[ix_target] -= 1\n",
+    "print dy.ravel()\n",
+    "print 'sum of dy is ', dy.sum()\n",
+    "print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])\n",
+    "print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "the hidden vector activations were:\n",
+      "[-0.00321813  0.00640499  0.00424507  0.00783602 -0.00428041  0.00900686\n",
+      " -0.01618631  0.00071848 -0.01342806 -0.0048051 ]\n",
+      "the gradients are:\n",
+      "[-0.00983172 -0.01049054  0.00911112 -0.01110877 -0.00134586  0.00686193\n",
+      " -0.01532285  0.0099507  -0.00530998  0.01267197]\n",
+      "the gradients dWhy have size:  (67, 10)\n",
+      "a small sample is:\n",
+      "[[ -4.80319012e-05   9.55970187e-05   6.33594305e-05   1.16955768e-04]\n",
+      " [ -4.80318571e-05   9.55969309e-05   6.33593723e-05   1.16955661e-04]\n",
+      " [ -4.80249076e-05   9.55830995e-05   6.33502052e-05   1.16938739e-04]\n",
+      " [ -4.80429242e-05   9.56189576e-05   6.33739710e-05   1.16982609e-04]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by\n",
+    "dWhy = np.dot(dy, h.T)\n",
+    "dh = np.dot(Why.T, dy)\n",
+    "dby = np.copy(dy)\n",
+    "print 'the hidden vector activations were:'\n",
+    "print h.ravel()\n",
+    "print 'the gradients are:'\n",
+    "print dh.ravel()\n",
+    "print 'the gradients dWhy have size: ', dWhy.shape\n",
+    "print 'a small sample is:'\n",
+    "print dWhy[:4,:4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "small sample of Whh:\n",
+      "[[-0.0028736  -0.00195895  0.00885911 -0.00349354]\n",
+      " [-0.00773272 -0.00051873  0.00219991 -0.00234756]\n",
+      " [ 0.01687054 -0.01221995  0.00125455 -0.00568523]\n",
+      " [-0.00031645  0.00514377  0.01194564  0.0070584 ]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; \n",
+    "# Backprop into Wxh, x, Whh, h_prev, bh:\n",
+    "dh_before_tanh = (1-h**2)*dh\n",
+    "dbh = np.copy(dh_before_tanh)\n",
+    "dWxh = np.dot(dh_before_tanh, x.T)\n",
+    "dWhh = np.dot(dh_before_tanh, h.T)\n",
+    "dh_prev = np.dot(Whh.T, dh_before_tanh)\n",
+    "print 'small sample of Whh:'\n",
+    "print Whh[:4,:4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# we now have the gradients for all parameters! (Wxh, Whh, Why, bh, by)\n",
+    "# lets do a parameter update\n",
+    "learning_rate = 0.1\n",
+    "Wxh2 = Wxh - learning_rate * dWxh\n",
+    "Whh2 = Whh - learning_rate * dWhh\n",
+    "Why2 = Why - learning_rate * dWhy\n",
+    "bh2 = bh - learning_rate * dbh\n",
+    "by2 = by - learning_rate * dby"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "probability assigned to the correct next character was:  0.0149216543899\n",
+      "probability assigned to the correct next character is now:  0.0164678301753\n",
+      "the cross-entropy (softmax) loss was  4.20494180632\n",
+      "the loss is now  4.10634648754\n"
+     ]
+    }
+   ],
+   "source": [
+    "# these parameters should be much better! lets try it out:\n",
+    "h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))\n",
+    "y2 = np.dot(Why2, h2) + by2\n",
+    "p2 = np.exp(y2) / np.sum(np.exp(y2))\n",
+    "print 'probability assigned to the correct next character was: ', p[ix_target,0]\n",
+    "print 'probability assigned to the correct next character is now: ', p2[ix_target,0]\n",
+    "loss2 = -np.log(p2[ix_target,0])\n",
+    "print 'the cross-entropy (softmax) loss was ', loss\n",
+    "print 'the loss is now ', loss2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# note: the probability for the correct character went up! (and the loss went down)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# putting it together with loops\n",
+    "def lossFun(inputs, targets, hprev):\n",
+    "    \"\"\"\n",
+    "    inputs,targets are both list of integers.\n",
+    "    hprev is Hx1 array of initial hidden state\n",
+    "    returns the loss, gradients on model parameters, and last hidden state\n",
+    "    \"\"\"\n",
+    "    xs, hs, ys, ps = {}, {}, {}, {}\n",
+    "    hs[-1] = np.copy(hprev)\n",
+    "    loss = 0\n",
+    "    \n",
+    "    # forward pass\n",
+    "    for t in xrange(len(inputs)):\n",
+    "        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation\n",
+    "        xs[t][inputs[t]] = 1\n",
+    "        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state\n",
+    "        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars\n",
+    "        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars\n",
+    "        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)\n",
+    "    \n",
+    "    # backward pass: compute gradients going backwards\n",
+    "    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
+    "    dbh, dby = np.zeros_like(bh), np.zeros_like(by)\n",
+    "    dhnext = np.zeros_like(hs[0])\n",
+    "    for t in reversed(xrange(len(inputs))):\n",
+    "        dy = np.copy(ps[t])\n",
+    "        dy[targets[t]] -= 1 # backprop into y\n",
+    "        dWhy += np.dot(dy, hs[t].T)\n",
+    "        dby += dy\n",
+    "        dh = np.dot(Why.T, dy) + dhnext # backprop into h\n",
+    "        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity\n",
+    "        dbh += dhraw\n",
+    "        dWxh += np.dot(dhraw, xs[t].T)\n",
+    "        dWhh += np.dot(dhraw, hs[t-1].T)\n",
+    "        dhnext = np.dot(Whh.T, dhraw)\n",
+    "        \n",
+    "    # clip to mitigate exploding gradients\n",
+    "    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:\n",
+    "        np.clip(dparam, -5, 5, out=dparam)\n",
+    "    \n",
+    "    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "105.118896505\n"
+     ]
+    }
+   ],
+   "source": [
+    "loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)\n",
+    "print loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# TODO: write the sampling code\n",
+    "def sample(h, seed_ix, n):\n",
+    "    \"\"\" \n",
+    "    sample a sequence of integers from the model \n",
+    "    h is initial memory state, seed_ix is seed letter for first time step\n",
+    "    n is the number of time steps to sample for\n",
+    "    \"\"\"\n",
+    "    x = np.zeros((vocab_size, 1))\n",
+    "    x[seed_ix] = 1\n",
+    "    ixes = [] # sampled indices\n",
+    "    for t in xrange(n):\n",
+    "        pass # TODO: run the RNN for one time step, sample from distribution\n",
+    "    return ixes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iter 0, loss: 105.117314\n",
+      "iter 100, loss: 104.983384\n",
+      "iter 200, loss: 104.623511\n",
+      "iter 300, loss: 104.093896\n",
+      "iter 400, loss: 103.407954\n",
+      "iter 500, loss: 102.742497\n",
+      "iter 600, loss: 101.886946\n",
+      "iter 700, loss: 101.017334\n",
+      "iter 800, loss: 100.224049\n",
+      "iter 900, loss: 99.275074\n",
+      "iter 1000, loss: 98.310447\n",
+      "iter 1100, loss: 97.280136\n",
+      "iter 1200, loss: 96.170347\n",
+      "iter 1300, loss: 95.026980\n",
+      "iter 1400, loss: 94.074431\n",
+      "iter 1500, loss: 93.214796\n",
+      "iter 1600, loss: 92.194884\n",
+      "iter 1700, loss: 91.461468\n",
+      "iter 1800, loss: 90.884004\n",
+      "iter 1900, loss: 90.051538\n",
+      "iter 2000, loss: 89.025933\n",
+      "iter 2100, loss: 88.348015\n",
+      "iter 2200, loss: 87.853860\n",
+      "iter 2300, loss: 87.232155\n",
+      "iter 2400, loss: 86.868957\n",
+      "iter 2500, loss: 86.633172\n",
+      "iter 2600, loss: 86.099010\n",
+      "iter 2700, loss: 85.847026\n",
+      "iter 2800, loss: 85.920993\n",
+      "iter 2900, loss: 85.654873\n",
+      "iter 3000, loss: 85.194774\n",
+      "iter 3100, loss: 85.314185\n",
+      "iter 3200, loss: 85.286007\n",
+      "iter 3300, loss: 85.060589\n",
+      "iter 3400, loss: 84.918822\n",
+      "iter 3500, loss: 84.606756\n",
+      "iter 3600, loss: 84.445501\n",
+      "iter 3700, loss: 84.407657\n",
+      "iter 3800, loss: 84.323040\n",
+      "iter 3900, loss: 84.133951\n",
+      "iter 4000, loss: 84.064946\n",
+      "iter 4100, loss: 84.004744\n",
+      "iter 4200, loss: 83.680242\n",
+      "iter 4300, loss: 83.838111\n",
+      "iter 4400, loss: 83.538384\n",
+      "iter 4500, loss: 83.244682\n",
+      "iter 4600, loss: 82.926760\n",
+      "iter 4700, loss: 82.978359\n",
+      "iter 4800, loss: 83.063037\n",
+      "iter 4900, loss: 83.094751\n",
+      "iter 5000, loss: 82.961633\n",
+      "iter 5100, loss: 82.787609\n",
+      "iter 5200, loss: 82.812643\n",
+      "iter 5300, loss: 82.653842\n",
+      "iter 5400, loss: 82.512128\n",
+      "iter 5500, loss: 82.471460\n",
+      "iter 5600, loss: 82.328936\n",
+      "iter 5700, loss: 82.100785\n",
+      "iter 5800, loss: 82.109023\n",
+      "iter 5900, loss: 82.088454\n",
+      "iter 6000, loss: 81.998958\n",
+      "iter 6100, loss: 82.107872\n",
+      "iter 6200, loss: 82.203081\n",
+      "iter 6300, loss: 82.139286\n",
+      "iter 6400, loss: 82.375904\n",
+      "iter 6500, loss: 82.276112\n",
+      "iter 6600, loss: 82.344397\n",
+      "iter 6700, loss: 82.709979\n",
+      "iter 6800, loss: 82.775993\n",
+      "iter 6900, loss: 82.987775\n",
+      "iter 7000, loss: 82.979527\n",
+      "iter 7100, loss: 83.100577\n",
+      "iter 7200, loss: 83.124201\n",
+      "iter 7300, loss: 83.255949\n",
+      "iter 7400, loss: 83.229047\n",
+      "iter 7500, loss: 83.334554\n",
+      "iter 7600, loss: 83.281780\n",
+      "iter 7700, loss: 83.155654\n",
+      "iter 7800, loss: 83.017638\n",
+      "iter 7900, loss: 82.702089\n",
+      "iter 8000, loss: 82.610772\n",
+      "iter 8100, loss: 82.449181\n",
+      "iter 8200, loss: 82.512648\n",
+      "iter 8300, loss: 82.518510\n",
+      "iter 8400, loss: 82.448900\n",
+      "iter 8500, loss: 82.438597\n",
+      "iter 8600, loss: 82.449625\n",
+      "iter 8700, loss: 82.450940\n",
+      "iter 8800, loss: 82.330407\n",
+      "iter 8900, loss: 82.704262\n",
+      "iter 9000, loss: 82.595555\n",
+      "iter 9100, loss: 82.738328\n",
+      "iter 9200, loss: 82.954857\n",
+      "iter 9300, loss: 82.893699\n",
+      "iter 9400, loss: 82.574069\n",
+      "iter 9500, loss: 82.386784\n",
+      "iter 9600, loss: 82.385382\n",
+      "iter 9700, loss: 82.287088\n",
+      "iter 9800, loss: 82.185769\n",
+      "iter 9900, loss: 81.809823\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Stochastic Gradient Descent\n",
+    "n, p = 0, 0\n",
+    "smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0\n",
+    "learning_rate = 1e-3\n",
+    "while n < 10000:\n",
+    "    # prepare inputs (we're sweeping from left to right in steps seq_length long)\n",
+    "    if p+seq_length+1 >= len(data) or n == 0: \n",
+    "        hprev = np.zeros((hidden_size,1)) # reset RNN memory\n",
+    "        p = 0 # go from start of data\n",
+    "    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
+    "    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
+    "\n",
+    "    # forward seq_length characters through the net and fetch gradient\n",
+    "    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)\n",
+    "    smooth_loss = smooth_loss * 0.999 + loss * 0.001\n",
+    "    if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress\n",
+    "\n",
+    "    # perform parameter update with Adagrad\n",
+    "    for param, dparam in zip([Wxh, Whh, Why, bh, by], \n",
+    "                                [dWxh, dWhh, dWhy, dbh, dby]):\n",
+    "        param += -learning_rate * dparam\n",
+    "\n",
+    "    p += seq_length # move data pointer\n",
+    "    n += 1 # iteration counter "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# TODO: Implement a sampling function that lets us generate from the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
@@ -0,0 +1,831 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## RNN Language Model\n",
+    "\n",
+    "Below is a diagram of the RNN computation that we will implement below. We're plugging characters into the RNN with a 1-hot encoding and expecting it to predict the next character. In this example the training data is the string \"hello\", so there are 4 letters in the vocabulary: [h,e,l,o]."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"rnnlm.jpeg\">"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "np.random.seed(1337)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "data has 4573338 characters, 67 unique.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# data I/O\n",
+    "# get shakespeare from http://cs.stanford.edu/people/karpathy/shakespeare.txt\n",
+    "data = open('shakespeare.txt', 'r').read() # should be simple plain text file\n",
+    "chars = list(set(data))\n",
+    "data_size, vocab_size = len(data), len(chars)\n",
+    "print 'data has %d characters, %d unique.' % (data_size, vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "char_to_ix = { ch:i for i,ch in enumerate(chars) }\n",
+    "ix_to_char = { i:ch for i,ch in enumerate(chars) }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "41"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "char_to_ix['a']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      " thing when he was young,\n"
+     ]
+    }
+   ],
+   "source": [
+    "# lets sample a batch of data\n",
+    "seq_length = 25 # number of characters in the batch\n",
+    "p = 220000 # point in the book to sample from\n",
+    "print data[p:p+seq_length] # print a chunk of data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2, 61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7]\n",
+      "[61, 49, 48, 55, 46, 2, 62, 49, 44, 55, 2, 49, 44, 2, 62, 41, 58, 2, 64, 54, 60, 55, 46, 7, 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
+    "targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
+    "print inputs\n",
+    "print targets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.\n",
+      "  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# lets plug the first character into the RNN\n",
+    "ix_input = inputs[0]\n",
+    "ix_target = targets[0]\n",
+    "# encode the input character with a 1-hot representation\n",
+    "x = np.zeros((vocab_size,1))\n",
+    "x[ix_input] = 1\n",
+    "print x.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# create random starting parameters\n",
+    "hidden_size = 10\n",
+    "Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden\n",
+    "Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden\n",
+    "Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output\n",
+    "bh = np.zeros((hidden_size, 1)) # hidden bias\n",
+    "by = np.zeros((vocab_size, 1)) # output bias"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[-0.02566928 -0.00711926 -0.00851462  0.01228545 -0.00241891  0.00636176\n",
+      " -0.00171284 -0.01129739 -0.0069362   0.00932362]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the hidden state\n",
+    "h_prev = np.zeros((hidden_size, 1))\n",
+    "h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))\n",
+    "print h.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[  6.29612155e-04   4.09079294e-04  -3.29899872e-04   7.98200509e-04\n",
+      "  -2.62905161e-05   4.94626771e-04   1.97138889e-04  -1.01600591e-04\n",
+      "   7.72757316e-04   2.84376903e-04  -7.29973921e-04   1.56005304e-05\n",
+      "  -1.11927240e-04   1.35442172e-04  -3.89815428e-05  -4.86357178e-05\n",
+      "   1.32336208e-04   3.15738595e-04  -3.87247490e-04   7.28991890e-04\n",
+      "  -5.30632950e-05   4.20179198e-04   4.42242144e-04   2.83823246e-04\n",
+      "  -3.58363287e-05   6.98975802e-05   4.84398003e-04  -2.81941909e-04\n",
+      "   5.07592676e-04  -2.68109997e-04  -6.98104505e-05   3.21717382e-04\n",
+      "   5.08520176e-05   6.37695233e-04  -1.02395859e-04   1.63546016e-04\n",
+      "  -5.80853510e-04   1.19142485e-04  -3.79932371e-04  -3.94374025e-04\n",
+      "   7.46960859e-04   4.68737825e-04  -3.62337202e-04  -7.06302136e-06\n",
+      "   4.24622028e-04   9.21261371e-04   1.02755871e-04   2.95008636e-04\n",
+      "   1.41569258e-04  -7.44459004e-04   3.24625094e-04  -3.00690740e-05\n",
+      "   4.47332341e-04   2.14832415e-05  -4.31132112e-04   4.42862442e-04\n",
+      "   1.87427875e-06   7.64699298e-05   1.02364774e-04   2.62063645e-04\n",
+      "  -3.09534282e-04  -5.05432663e-04  -1.66878227e-05  -7.71886342e-05\n",
+      "   2.44466581e-04  -2.06128445e-04   2.96956566e-04]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the scores for next character\n",
+    "y = np.dot(Why, h) + by\n",
+    "print y.ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.01493319  0.0149299   0.01491887  0.01493571  0.0149234   0.01493117\n",
+      "  0.01492673  0.01492227  0.01493533  0.01492803  0.0149129   0.01492402\n",
+      "  0.01492212  0.01492581  0.01492321  0.01492306  0.01492576  0.0149285\n",
+      "  0.01491801  0.01493467  0.014923    0.01493006  0.01493039  0.01492803\n",
+      "  0.01492325  0.01492483  0.01493102  0.01491958  0.01493137  0.01491979\n",
+      "  0.01492275  0.01492859  0.01492455  0.01493331  0.01492226  0.01492623\n",
+      "  0.01491512  0.01492557  0.01491812  0.0149179   0.01493494  0.01493079\n",
+      "  0.01491838  0.01492368  0.01493013  0.01493754  0.01492532  0.01492819\n",
+      "  0.0149259   0.01491268  0.01492863  0.01492334  0.01493047  0.01492411\n",
+      "  0.01491736  0.0149304   0.01492382  0.01492493  0.01492532  0.0149277\n",
+      "  0.01491917  0.01491625  0.01492354  0.01492264  0.01492744  0.01492071\n",
+      "  0.01492822]\n",
+      "probabilities sum to  1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# the scores are unnormalized log probabilities. compute the probabilities\n",
+    "p = np.exp(y) / np.sum(np.exp(y))\n",
+    "print p.ravel()\n",
+    "print 'probabilities sum to ', p.sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "probability assigned to the correct next character is right now:  0.0149162482677\n"
+     ]
+    }
+   ],
+   "source": [
+    "print 'probability assigned to the correct next character is right now: ', p[ix_target,0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "the cross-entropy (softmax) loss is  4.20530417242\n"
+     ]
+    }
+   ],
+   "source": [
+    "loss = -np.log(p[ix_target,0])\n",
+    "print 'the cross-entropy (softmax) loss is ', loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ 0.01493319  0.0149299   0.01491887  0.01493571  0.0149234   0.01493117\n",
+      "  0.01492673  0.01492227  0.01493533  0.01492803  0.0149129   0.01492402\n",
+      "  0.01492212  0.01492581  0.01492321  0.01492306  0.01492576  0.0149285\n",
+      "  0.01491801  0.01493467  0.014923    0.01493006  0.01493039  0.01492803\n",
+      "  0.01492325  0.01492483  0.01493102  0.01491958  0.01493137  0.01491979\n",
+      "  0.01492275  0.01492859  0.01492455  0.01493331  0.01492226  0.01492623\n",
+      "  0.01491512  0.01492557  0.01491812  0.0149179   0.01493494  0.01493079\n",
+      "  0.01491838  0.01492368  0.01493013  0.01493754  0.01492532  0.01492819\n",
+      "  0.0149259   0.01491268  0.01492863  0.01492334  0.01493047  0.01492411\n",
+      "  0.01491736  0.0149304   0.01492382  0.01492493  0.01492532  0.0149277\n",
+      "  0.01491917 -0.98508375  0.01492354  0.01492264  0.01492744  0.01492071\n",
+      "  0.01492822]\n",
+      "sum of dy is  2.77555756156e-17\n",
+      "the gradient for the correct character (t) is: -0.985083751732\n",
+      "the gradient for the character (a) is:  0.0149307863167\n"
+     ]
+    }
+   ],
+   "source": [
+    "# compute the gradient on y\n",
+    "dy = np.copy(p)\n",
+    "dy[ix_target] -= 1\n",
+    "print dy.ravel()\n",
+    "print 'sum of dy is ', dy.sum()\n",
+    "print 'the gradient for the correct character (%s) is: %s' % (ix_to_char[ix_target], dy[ix_target,0])\n",
+    "print 'the gradient for the character (a) is: ', dy[char_to_ix['a'],0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "the hidden vector activations were:\n",
+      "[-0.02566928 -0.00711926 -0.00851462  0.01228545 -0.00241891  0.00636176\n",
+      " -0.00171284 -0.01129739 -0.0069362   0.00932362]\n",
+      "the gradients are:\n",
+      "[-0.00824375 -0.00696831 -0.00844694  0.01640971 -0.0017776   0.00293419\n",
+      "  0.01496486  0.0062472  -0.00851701  0.009765  ]\n",
+      "the gradients dWhy have size:  (67, 10)\n",
+      "a small sample is:\n",
+      "[[-0.00038332 -0.00010631 -0.00012715  0.00018346]\n",
+      " [-0.00038324 -0.00010629 -0.00012712  0.00018342]\n",
+      " [-0.00038296 -0.00010621 -0.00012703  0.00018328]\n",
+      " [-0.00038339 -0.00010633 -0.00012717  0.00018349]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# we computed [y = np.dot(Why, h) + by]; Backpropagate to Why, h, and by\n",
+    "dWhy = np.dot(dy, h.T)\n",
+    "dh = np.dot(Why.T, dy)\n",
+    "dby = np.copy(dy)\n",
+    "print 'the hidden vector activations were:'\n",
+    "print h.ravel()\n",
+    "print 'the gradients are:'\n",
+    "print dh.ravel()\n",
+    "print 'the gradients dWhy have size: ', dWhy.shape\n",
+    "print 'a small sample is:'\n",
+    "print dWhy[:4,:4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "small sample of Whh:\n",
+      "[[-0.01152775  0.00881821 -0.00906459  0.00349652]\n",
+      " [ 0.00261533  0.0120227  -0.00259614  0.01621284]\n",
+      " [ 0.00182372  0.00073918 -0.00662722  0.02817786]\n",
+      " [-0.01495566  0.00292029 -0.00142797  0.00315272]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# we computed [h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h_prev + bh))]; \n",
+    "# Backprop into Wxh, x, Whh, h_prev, bh:\n",
+    "dh_before_tanh = (1-h**2)*dh\n",
+    "dbh = np.copy(dh_before_tanh)\n",
+    "dWxh = np.dot(dh_before_tanh, x.T)\n",
+    "dWhh = np.dot(dh_before_tanh, h.T)\n",
+    "dh_prev = np.dot(Whh.T, dh_before_tanh)\n",
+    "print 'small sample of Whh:'\n",
+    "print Whh[:4,:4]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# we now have the gradients for all parameters! (Wxh, Whh, Why, bh, by)\n",
+    "# lets do a parameter update\n",
+    "learning_rate = 0.1\n",
+    "Wxh2 = Wxh - learning_rate * dWxh\n",
+    "Whh2 = Whh - learning_rate * dWhh\n",
+    "Why2 = Why - learning_rate * dWhy\n",
+    "bh2 = bh - learning_rate * dbh\n",
+    "by2 = by - learning_rate * dby"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "probability assigned to the correct next character was:  0.0149162482677\n",
+      "probability assigned to the correct next character is now:  0.0164625966368\n",
+      "the cross-entropy (softmax) loss was  4.20530417242\n",
+      "the loss is now  4.10666434182\n"
+     ]
+    }
+   ],
+   "source": [
+    "# these parameters should be much better! lets try it out:\n",
+    "h2 = np.tanh(np.dot(Wxh2, x) + np.dot(Whh2, h_prev + bh2))\n",
+    "y2 = np.dot(Why2, h2) + by2\n",
+    "p2 = np.exp(y2) / np.sum(np.exp(y2))\n",
+    "print 'probability assigned to the correct next character was: ', p[ix_target,0]\n",
+    "print 'probability assigned to the correct next character is now: ', p2[ix_target,0]\n",
+    "loss2 = -np.log(p2[ix_target,0])\n",
+    "print 'the cross-entropy (softmax) loss was ', loss\n",
+    "print 'the loss is now ', loss2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# note: the probability for the correct character went up! (and the loss went down)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# putting it together with loops\n",
+    "def lossFun(inputs, targets, hprev):\n",
+    "    \"\"\"\n",
+    "    inputs,targets are both list of integers.\n",
+    "    hprev is Hx1 array of initial hidden state\n",
+    "    returns the loss, gradients on model parameters, and last hidden state\n",
+    "    \"\"\"\n",
+    "    xs, hs, ys, ps = {}, {}, {}, {}\n",
+    "    hs[-1] = np.copy(hprev)\n",
+    "    loss = 0\n",
+    "    \n",
+    "    # forward pass\n",
+    "    for t in xrange(len(inputs)):\n",
+    "        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation\n",
+    "        xs[t][inputs[t]] = 1\n",
+    "        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state\n",
+    "        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars\n",
+    "        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars\n",
+    "        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)\n",
+    "    \n",
+    "    # backward pass: compute gradients going backwards\n",
+    "    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)\n",
+    "    dbh, dby = np.zeros_like(bh), np.zeros_like(by)\n",
+    "    dhnext = np.zeros_like(hs[0])\n",
+    "    for t in reversed(xrange(len(inputs))):\n",
+    "        dy = np.copy(ps[t])\n",
+    "        dy[targets[t]] -= 1 # backprop into y\n",
+    "        dWhy += np.dot(dy, hs[t].T)\n",
+    "        dby += dy\n",
+    "        dh = np.dot(Why.T, dy) + dhnext # backprop into h\n",
+    "        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity\n",
+    "        dbh += dhraw\n",
+    "        dWxh += np.dot(dhraw, xs[t].T)\n",
+    "        dWhh += np.dot(dhraw, hs[t-1].T)\n",
+    "        dhnext = np.dot(Whh.T, dhraw)\n",
+    "        \n",
+    "    # clip to mitigate exploding gradients\n",
+    "    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:\n",
+    "        np.clip(dparam, -5, 5, out=dparam)\n",
+    "    \n",
+    "    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "105.12080312\n"
+     ]
+    }
+   ],
+   "source": [
+    "loss, dWxh, dWhh, dWhy, dbh, dby, hnew = lossFun(inputs, targets, h_prev)\n",
+    "print loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# TODO: write the sampling code\n",
+    "def sample(h, seed_ix, n):\n",
+    "    \"\"\" \n",
+    "    sample a sequence of integers from the model \n",
+    "    h is initial memory state, seed_ix is seed letter for first time step\n",
+    "    n is the number of time steps to sample for\n",
+    "    \"\"\"\n",
+    "    x = np.zeros((vocab_size, 1))\n",
+    "    x[seed_ix] = 1\n",
+    "    ixes = [] # sampled indices\n",
+    "    for t in xrange(n):\n",
+    "        pass # TODO: run the RNN for one time step, sample from distribution\n",
+    "    return ixes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "iter 0, loss: 105.117317\n",
+      "iter 100, loss: 104.983575\n",
+      "iter 200, loss: 104.623957\n",
+      "iter 300, loss: 104.095161\n",
+      "iter 400, loss: 103.411069\n",
+      "iter 500, loss: 102.748750\n",
+      "iter 600, loss: 101.900376\n",
+      "iter 700, loss: 101.043855\n",
+      "iter 800, loss: 100.269985\n",
+      "iter 900, loss: 99.357290\n",
+      "iter 1000, loss: 98.447726\n",
+      "iter 1100, loss: 97.485653\n",
+      "iter 1200, loss: 96.434239\n",
+      "iter 1300, loss: 95.316229\n",
+      "iter 1400, loss: 94.361564\n",
+      "iter 1500, loss: 93.489495\n",
+      "iter 1600, loss: 92.453112\n",
+      "iter 1700, loss: 91.702083\n",
+      "iter 1800, loss: 91.107364\n",
+      "iter 1900, loss: 90.257844\n",
+      "iter 2000, loss: 89.215814\n",
+      "iter 2100, loss: 88.522362\n",
+      "iter 2200, loss: 88.013419\n",
+      "iter 2300, loss: 87.378470\n",
+      "iter 2400, loss: 87.002066\n",
+      "iter 2500, loss: 86.754673\n",
+      "iter 2600, loss: 86.210309\n",
+      "iter 2700, loss: 85.948958\n",
+      "iter 2800, loss: 86.013457\n",
+      "iter 2900, loss: 85.739568\n",
+      "iter 3000, loss: 85.272399\n",
+      "iter 3100, loss: 85.384693\n",
+      "iter 3200, loss: 85.350449\n",
+      "iter 3300, loss: 85.119789\n",
+      "iter 3400, loss: 84.972730\n",
+      "iter 3500, loss: 84.656232\n",
+      "iter 3600, loss: 84.490833\n",
+      "iter 3700, loss: 84.449176\n",
+      "iter 3800, loss: 84.361207\n",
+      "iter 3900, loss: 84.169058\n",
+      "iter 4000, loss: 84.097223\n",
+      "iter 4100, loss: 84.034489\n",
+      "iter 4200, loss: 83.707687\n",
+      "iter 4300, loss: 83.863006\n",
+      "iter 4400, loss: 83.561466\n",
+      "iter 4500, loss: 83.265996\n",
+      "iter 4600, loss: 82.946393\n",
+      "iter 4700, loss: 82.996679\n",
+      "iter 4800, loss: 83.080147\n",
+      "iter 4900, loss: 83.110710\n",
+      "iter 5000, loss: 82.976515\n",
+      "iter 5100, loss: 82.801721\n",
+      "iter 5200, loss: 82.825836\n",
+      "iter 5300, loss: 82.666174\n",
+      "iter 5400, loss: 82.523678\n",
+      "iter 5500, loss: 82.482473\n",
+      "iter 5600, loss: 82.339404\n",
+      "iter 5700, loss: 82.110700\n",
+      "iter 5800, loss: 82.118627\n",
+      "iter 5900, loss: 82.097737\n",
+      "iter 6000, loss: 82.007989\n",
+      "iter 6100, loss: 82.116768\n",
+      "iter 6200, loss: 82.211658\n",
+      "iter 6300, loss: 82.147884\n",
+      "iter 6400, loss: 82.385061\n",
+      "iter 6500, loss: 82.285403\n",
+      "iter 6600, loss: 82.354143\n",
+      "iter 6700, loss: 82.720619\n",
+      "iter 6800, loss: 82.787212\n",
+      "iter 6900, loss: 82.999636\n",
+      "iter 7000, loss: 82.992352\n",
+      "iter 7100, loss: 83.115493\n",
+      "iter 7200, loss: 83.140399\n",
+      "iter 7300, loss: 83.273761\n",
+      "iter 7400, loss: 83.247985\n",
+      "iter 7500, loss: 83.356551\n",
+      "iter 7600, loss: 83.306669\n",
+      "iter 7700, loss: 83.182086\n",
+      "iter 7800, loss: 83.045905\n",
+      "iter 7900, loss: 82.727804\n",
+      "iter 8000, loss: 82.638186\n",
+      "iter 8100, loss: 82.478594\n",
+      "iter 8200, loss: 82.546788\n",
+      "iter 8300, loss: 82.556720\n",
+      "iter 8400, loss: 82.489606\n",
+      "iter 8500, loss: 82.482931\n",
+      "iter 8600, loss: 82.500909\n",
+      "iter 8700, loss: 82.506185\n",
+      "iter 8800, loss: 82.389225\n",
+      "iter 8900, loss: 82.780411\n",
+      "iter 9000, loss: 82.682004\n",
+      "iter 9100, loss: 82.839024\n",
+      "iter 9200, loss: 83.075629\n",
+      "iter 9300, loss: 83.029504\n",
+      "iter 9400, loss: 82.721697\n",
+      "iter 9500, loss: 82.556405\n",
+      "iter 9600, loss: 82.588480\n",
+      "iter 9700, loss: 82.519233\n",
+      "iter 9800, loss: 82.450922\n",
+      "iter 9900, loss: 82.098659\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Stochastic Gradient Descent\n",
+    "n, p = 0, 0\n",
+    "smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0\n",
+    "learning_rate = 1e-3\n",
+    "while n < 10000:\n",
+    "    # prepare inputs (we're sweeping from left to right in steps seq_length long)\n",
+    "    if p+seq_length+1 >= len(data) or n == 0: \n",
+    "        hprev = np.zeros((hidden_size,1)) # reset RNN memory\n",
+    "        p = 0 # go from start of data\n",
+    "    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]\n",
+    "    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]\n",
+    "\n",
+    "    # forward seq_length characters through the net and fetch gradient\n",
+    "    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)\n",
+    "    smooth_loss = smooth_loss * 0.999 + loss * 0.001\n",
+    "    if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress\n",
+    "\n",
+    "    # perform parameter update with Adagrad\n",
+    "    for param, dparam in zip([Wxh, Whh, Why, bh, by], \n",
+    "                                [dWxh, dWhh, dWhy, dbh, dby]):\n",
+    "        param += -learning_rate * dparam\n",
+    "\n",
+    "    p += seq_length # move data pointer\n",
+    "    n += 1 # iteration counter "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def sample(h, seed_ix, n):\n",
+    "    \"\"\" \n",
+    "    sample a sequence of integers from the model \n",
+    "    h is memory state, seed_ix is seed letter for first time step\n",
+    "    \"\"\"\n",
+    "    x = np.zeros((vocab_size, 1))\n",
+    "    x[seed_ix] = 1\n",
+    "    ixes = []\n",
+    "    for t in xrange(n):\n",
+    "        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)\n",
+    "        y = np.dot(Why, h) + by\n",
+    "        p = np.exp(y) / np.sum(np.exp(y))\n",
+    "        ix = np.random.choice(range(vocab_size), p=p.ravel())\n",
+    "        x = np.zeros((vocab_size, 1))\n",
+    "        x[ix] = 1\n",
+    "        ixes.append(ix)\n",
+    "    return ixes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "noe a L:odmlt hd ts\n",
+      "olu hed og hoe. i\n",
+      "nrarro uCws ' Zlmky ynwrss[ mmo\n",
+      "sS hCInAr\n",
+      "ib htneayHseiCf uueo\n",
+      "H'Il s,dou\n",
+      "usKi., t?fr\n",
+      "i whhfgdre AtitTe\n",
+      ",mer\n",
+      "ll \n",
+      "G tetdjcqsrrtfs soin,rnseiIdnIshos mn iasgh\n",
+      "idghfMIaoeM;,oaat,wv\n",
+      "d  , rnthoikfrlh rsqa at ;ilr wdeat tawl\n",
+      "atran iter yahnneesevd rls Soide TswD haPsiu no sphrcnhyGrsd mbRyos ,g sdt widotLn ohh \n",
+      " tcavny svag\n",
+      "bee rr nlh dnl h olCn  . terTrfo o t d nhoeewisn riwgono maeAahe CrS\n",
+      "tystet i!BWehi-'Nh:syd cv mr\n",
+      "iceawtesbthnhul eaec ,eeddage\n",
+      "\n",
+      "mtdsu : mioeuT,ayin aharwoouh,uh alrhofnwW Tdeo CDnupee tntaehd\n",
+      "lee hGtc\n",
+      "oneE hs oh otil'rr ftO\n",
+      "t hoe nie yeidoqotnlt h s! ai.e,s c? eTwbr r ,etaa t y\n",
+      "Sar\n",
+      "ieeeio\n",
+      "oUoyehm UIGndoLeaeae h eaFfoaH\n",
+      "etnmris Paib\n",
+      "h r,iiEb\n",
+      "haemlkecySa,nlfTe\n",
+      "o hssRThokiwo R\n",
+      "r mso pmdsnbn weu ameUe l\n",
+      "tldal -enr otguO, ue \n",
+      "nY olKh oh  Iboewornr \n",
+      "yhh o:a,e pntU edvhclsndH hs mEQoeoi lhnub !e n,&iirEr sdiSdWl saases tsstgitsg urhdaleeweareit,e n!hitra u\n",
+      "Bl pr .ih 't  oit.es aoy atyt, enit fntae hEt\n",
+      "stuhaoIr  uKis\n",
+      "tUikn otbareu oefudz'uay\n"
+     ]
+    }
+   ],
+   "source": [
+    "sample_ix = sample(hprev, char_to_ix['a'], 1000)\n",
+    "txt = ''.join(ix_to_char[ix] for ix in sample_ix)\n",
+    "print txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}