Tensorflow RNN PTB Example Walkthrough


There is only one graph, but there are three PTBModel instances, which keep track of the elements related to the training, validation and test stages. Note the use of variable_scope together with reuse to share the weights and biases across the three instances.

# Build a single graph that holds three PTBModel instances (train, valid,
# test). Only the first instance creates variables; the other two reuse them.
with tf.Graph().as_default():
  # Uniform initializer over [-init_scale, init_scale], passed to every
  # "Model" variable scope below.
  initializer = tf.random_uniform_initializer(-config.init_scale,
                                              config.init_scale)

  with tf.name_scope("Train"):
    train_input = PTBInput(config=config, data=train_data, name="TrainInput")
    # reuse=None: this is the first "Model" scope, so variables are created.
    with tf.variable_scope("Model", reuse=None, initializer=initializer):
      m = PTBModel(is_training=True, config=config, input_=train_input)
    tf.scalar_summary("Training Loss", m.cost)
    tf.scalar_summary("Learning Rate", m.lr)

  with tf.name_scope("Valid"):
    valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
    # reuse=True: look up the variables created by the training model.
    with tf.variable_scope("Model", reuse=True, initializer=initializer):
      mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
    tf.scalar_summary("Validation Loss", mvalid.cost)

  with tf.name_scope("Test"):
    # The test model is built from eval_config rather than config.
    test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
    with tf.variable_scope("Model", reuse=True, initializer=initializer):
      mtest = PTBModel(is_training=False, config=eval_config,
                       input_=test_input)

A typical epoch loop follows. Each epoch iteration completes a full pass over the training and validation datasets; the test dataset is evaluated once, after the last epoch. Note that the learning rate is kept constant within each epoch and is only updated between epochs.

# Train for config.max_max_epoch epochs, reporting train/valid perplexity
# after each one, then evaluate once on the test set and optionally save.
sv = tf.train.Supervisor(logdir=FLAGS.save_path)
with sv.managed_session() as session:
  for i in range(config.max_max_epoch):
    # The decay factor stays at 1 for the first config.max_epoch epochs and
    # is multiplied by config.lr_decay once per epoch after that.
    lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
    m.assign_lr(session, config.learning_rate * lr_decay)

    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
    # Only the training pass runs the optimizer (eval_op=m.train_op).
    train_perplexity = run_epoch(session, m, eval_op=m.train_op,
                                 verbose=True)
    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
    valid_perplexity = run_epoch(session, mvalid)
    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

  # The test set is evaluated once, after the last epoch.
  test_perplexity = run_epoch(session, mtest)
  print("Test Perplexity: %.3f" % test_perplexity)

  if FLAGS.save_path:
    print("Saving model to %s." % FLAGS.save_path)
    sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)

Building the Graph

An RNN is a sequence model. Each training example is a pair (input_data, targets), where input_data and targets are sequences of the same length and targets is input_data shifted by one position. For example, given a sequence (w_1, w_2, ...), if the input is (w_1, w_2, ..., w_n), the target is (w_2, w_3, ..., w_{n+1}). Also note how the RNN is unrolled. I have commented the parts worth attention.

class PTBModel(object):
  """The PTB model.

  Builds a multi-layer LSTM language model unrolled for a fixed number of
  steps, its loss, and (when training) the gradient-descent update ops.

  NOTE(review): the read-only accessors that callers in this walkthrough use
  (e.g. `cost`, `lr`, `train_op`, `input`, `initial_state`, `final_state`)
  are not part of this excerpt.
  """

  def __init__(self, is_training, config, input_):
    """Constructs the model graph.

    Args:
      is_training: when true, dropout (if configured) and the training ops
        are added; otherwise construction stops after the loss.
      config: object providing hidden_size, vocab_size, keep_prob,
        num_layers and max_grad_norm.
      input_: object providing batch_size, num_steps, input_data and targets.
    """
    self._input = input_

    batch_size = input_.batch_size  #sgu: 20 in the author's setup
    num_steps = input_.num_steps  #sgu: 20, the length of the sequence in each learning example
    size = config.hidden_size  #sgu: 200-1500 depending on the config
    vocab_size = config.vocab_size

    # Slightly better results can be obtained with forget gate biases
    # initialized to 1 but the hyperparameters of the model would need to be
    # different than reported in the paper.
    #sgu: This line doesn't create the weights/bias inside the LSTM cell, not yet.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
        size, forget_bias=0.0, state_is_tuple=True)

    #sgu: Dropout is applied for medium/large configuration
    if is_training and config.keep_prob < 1:
      lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
          lstm_cell, output_keep_prob=config.keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell(
        [lstm_cell] * config.num_layers, state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, data_type())

    with tf.device("/cpu:0"):
      #sgu: embedding vector is shared across train, valid and test
      embedding = tf.get_variable(
          "embedding", [vocab_size, size], dtype=data_type())
      inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_training and config.keep_prob < 1:
      inputs = tf.nn.dropout(inputs, config.keep_prob)

    #sgu: Unroll the RNN by hand for num_steps steps. The cell's variables
    #sgu: are created on the first step and reused on every later step.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
      for time_step in range(num_steps):
        if time_step > 0:
          tf.get_variable_scope().reuse_variables()
        (cell_output, state) = cell(inputs[:, time_step, :], state)
        outputs.append(cell_output)

    #sgu: cell_output: (batch_size, embedding_size)
    #sgu: output shape: (num_steps*batch_size, embedding_size)
    output = tf.reshape(tf.concat(1, outputs), [-1, size])
    softmax_w = tf.get_variable(
        "softmax_w", [size, vocab_size], dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    #sgu: logit shape: (num_step*batch_size, vocab_size)
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.nn.seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=data_type())])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    #sgu: keep final state which is used as initial_state for next iteration (see run_epoch())
    self._final_state = state

    # Evaluation-only models need no optimizer or learning-rate ops.
    if not is_training:
      return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)

    #sgu: When the learning rate self._lr changes, the optimizer picks up automatically
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # The learning rate is changed by feeding a value into this placeholder
    # and running the assign op (see assign_lr).
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)

  def assign_lr(self, session, lr_value):
    """Sets the model's learning-rate variable to lr_value using session."""
    session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

Feeding the States of LSTM

run_epoch runs multiple iterations to make one full pass over a dataset. In each iteration, the state of the LSTM needs to be fed. Note that the final state of the current iteration is used as the initial state of the next iteration. Also, the learning rate is kept constant during one epoch.

def run_epoch(session, model, eval_op=None, verbose=False):
  """Runs the model on the given data.

  Args:
    session: object whose run(fetches, feed_dict) evaluates the model.
    model: provides initial_state, cost, final_state and input (which in turn
      provides epoch_size, num_steps and batch_size).
    eval_op: optional extra op fetched on every step (e.g. the training op).
    verbose: when true, prints progress, perplexity and words per second at
      intervals during the epoch.

  Returns:
    The perplexity over the epoch, np.exp(total cost / total steps).
  """
  start_time = time.time()
  costs = 0.0
  iters = 0
  state = session.run(model.initial_state)

  fetches = {
      "cost": model.cost,
      "final_state": model.final_state,
  }
  if eval_op is not None:
    fetches["eval_op"] = eval_op

  # Zero when the epoch has fewer than 10 steps; progress reporting is then
  # skipped instead of taking a modulo by zero.
  progress_interval = model.input.epoch_size // 10

  for step in range(model.input.epoch_size):
    feed_dict = {}
    #sgu: use the final state of the current mini-batch as the initial state of the subsequent minibatch
    #sgu: multiple LSTM cells can stack together. state[i] is the state of i-th cell.
    for i, (c, h) in enumerate(model.initial_state):
      feed_dict[c] = state[i].c
      feed_dict[h] = state[i].h

    vals = session.run(fetches, feed_dict)
    cost = vals["cost"]
    state = vals["final_state"]

    costs += cost
    iters += model.input.num_steps

    if verbose and progress_interval and step % progress_interval == 10:
      #sgu: the 1st: fraction of progress in current epoch;
      #     the 2nd: perplexity
      #     the 3rd: words per sec so far in the training
      print("%.3f perplexity: %.3f speed: %.0f wps" %
            (step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
             iters * model.input.batch_size / (time.time() - start_time)))

  return np.exp(costs / iters)

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s