Tensorflow RNN PTB Example Walkthrough



Structure

There is only one graph, but there are three PTBModel instances, one per stage: train, valid and test. Note the use of variable_scope with reuse to share the weights/biases among them.

with tf.Graph().as_default():
  initializer = tf.random_uniform_initializer(-config.init_scale,
											  config.init_scale)

  with tf.name_scope("Train"):
	train_input = PTBInput(config=config, data=train_data, name="TrainInput")
	with tf.variable_scope("Model", reuse=None, initializer=initializer):
	  m = PTBModel(is_training=True, config=config, input_=train_input)
	tf.scalar_summary("Training Loss", m.cost)
	tf.scalar_summary("Learning Rate", m.lr)

  with tf.name_scope("Valid"):
	valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
	with tf.variable_scope("Model", reuse=True, initializer=initializer):
	  mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
	tf.scalar_summary("Validation Loss", mvalid.cost)

  with tf.name_scope("Test"):
	test_input = PTBInput(config=eval_config, data=test_data, name="TestInput")
	with tf.variable_scope("Model", reuse=True, initializer=initializer):
	  mtest = PTBModel(is_training=False, config=eval_config,
					  input_=test_input)
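
As a minimal sketch of the reuse mechanism (toy variable names, not from the example): calling tf.get_variable under the same variable_scope with reuse=True returns the already-created variable instead of making a new one.

import tensorflow as tf

with tf.variable_scope("Model", reuse=None):
  w1 = tf.get_variable("w", [2])  # creates "Model/w"
with tf.variable_scope("Model", reuse=True):
  w2 = tf.get_variable("w", [2])  # returns the existing "Model/w"
assert w1 is w2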

A typical epoch loop follows. Each iteration of the loop completes a full pass over the train and valid datasets; the test dataset is evaluated once, after the last epoch. Note that the learning rate is kept constant within an epoch and only updated across epochs.

sv = tf.train.Supervisor(logdir=FLAGS.save_path)
with sv.managed_session() as session:
  for i in range(config.max_max_epoch):
	lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
	m.assign_lr(session, config.learning_rate * lr_decay)

	print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
	train_perplexity = run_epoch(session, m, eval_op=m.train_op,
								 verbose=True)
	print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
	valid_perplexity = run_epoch(session, mvalid)
	print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

  test_perplexity = run_epoch(session, mtest)
  print("Test Perplexity: %.3f" % test_perplexity)

  if FLAGS.save_path:
	print("Saving model to %s." % FLAGS.save_path)
	sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
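
To see the decay schedule concretely, here it is evaluated in plain Python with the small-config values (learning_rate=1.0, max_epoch=4, max_max_epoch=13, lr_decay=0.5):

# epochs 1-4 keep lr = 1.0; afterwards the rate halves every epoch
learning_rate, max_epoch, max_max_epoch, lr_decay = 1.0, 4, 13, 0.5
for i in range(max_max_epoch):
  decay = lr_decay ** max(i + 1 - max_epoch, 0.0)
  print("epoch %2d  lr = %.4f" % (i + 1, learning_rate * decay))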

Building the Graph

An RNN is a sequence model. Each training example is a pair (input_data, target), where input_data and target are sequences of the same length, offset by one position: given a sequence (w_1, w_2, ...), if the input is (w_1, w_2, ..., w_n), the target is (w_2, w_3, ..., w_{n+1}). Also, note how the RNN is unrolled. I have commented the parts worth attention.
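
A toy illustration of the one-position offset (plain Python slicing, not the actual PTBInput reader):

# cut an (input, target) pair of length num_steps from a sequence of word ids
words = [10, 11, 12, 13, 14, 15]
num_steps = 3
x = words[0:num_steps]      # input:  [10, 11, 12]
y = words[1:num_steps + 1]  # target: [11, 12, 13]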

class PTBModel(object):
  """The PTB model."""

  def __init__(self, is_training, config, input_):
	self._input = input_

	batch_size = input_.batch_size  #20
	num_steps = input_.num_steps #20, the length of the sequence in each learning example. 
	size = config.hidden_size #200-1500 depends on the config
	vocab_size = config.vocab_size

	# Slightly better results can be obtained with forget gate biases
	# initialized to 1 but the hyperparameters of the model would need to be
	# different than reported in the paper.
	lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=0.0, state_is_tuple=True) #sgu: this line doesn't create the weights/biases inside the LSTM cell, not yet

	#sgu: Dropout is applied for medium/large configuration
	if is_training and config.keep_prob < 1:
	  lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
		  lstm_cell, output_keep_prob=config.keep_prob)
	cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers, state_is_tuple=True)

	self._initial_state = cell.zero_state(batch_size, data_type())

	with tf.device("/cpu:0"):
	  #sgu: embedding vector is shared across train, valid and test 
	  embedding = tf.get_variable(
		  "embedding", [vocab_size, size], dtype=data_type())
	  inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

	if is_training and config.keep_prob < 1:
	  inputs = tf.nn.dropout(inputs, config.keep_prob)

	#sgu: unroll the RNN num_steps times; after the first step the variable
	#sgu: scope is set to reuse, so every step shares the same cell weights
	outputs = []
	state = self._initial_state
	with tf.variable_scope("RNN"):
	  for time_step in range(num_steps):
		if time_step > 0: tf.get_variable_scope().reuse_variables()
		(cell_output, state) = cell(inputs[:, time_step, :], state)
		outputs.append(cell_output)

	#sgu: cell_output shape: (batch_size, size)
	#sgu: output shape: (batch_size * num_steps, size)
	output = tf.reshape(tf.concat(1, outputs), [-1, size])
	softmax_w = tf.get_variable(
		"softmax_w", [size, vocab_size], dtype=data_type())
	softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
	#sgu: logit shape: (num_step*batch_size, vocab_size)
	logits = tf.matmul(output, softmax_w) + softmax_b
	loss = tf.nn.seq2seq.sequence_loss_by_example(
		[logits],
		[tf.reshape(input_.targets, [-1])],
		[tf.ones([batch_size * num_steps], dtype=data_type())])
	self._cost = cost = tf.reduce_sum(loss) / batch_size
	#sgu: keep the final state; it is used as the initial_state of the next iteration (see run_epoch())
	self._final_state = state  

	if not is_training:
	  return

	self._lr = tf.Variable(0.0, trainable=False)
	tvars = tf.trainable_variables() 
	grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
									  config.max_grad_norm)

	#sgu: when the learning rate self._lr changes, the optimizer picks up the new value automatically
	optimizer = tf.train.GradientDescentOptimizer(self._lr)
	self._train_op = optimizer.apply_gradients(
		zip(grads, tvars),
		global_step=tf.contrib.framework.get_or_create_global_step())
	self._new_lr = tf.placeholder(
		tf.float32, shape=[], name="new_learning_rate")
	self._lr_update = tf.assign(self._lr, self._new_lr)  

  def assign_lr(self, session, lr_value):
	session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
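
Here is the learning-rate plumbing in isolation, as a minimal runnable sketch (toy value 0.5, not from the example): a non-trainable Variable holds the rate, and an assign op driven by a placeholder updates it between epochs; the optimizer reads the rate from the graph, so it picks up the new value automatically.

import tensorflow as tf

lr = tf.Variable(0.0, trainable=False)
new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
lr_update = tf.assign(lr, new_lr)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  sess.run(lr_update, feed_dict={new_lr: 0.5})
  print(sess.run(lr))  # 0.5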

Feeding the State of the LSTM

run_epoch runs multiple iterations to make one full pass over the dataset. In each iteration, the state of the LSTM needs to be fed: the final state of the current iteration is used as the initial state of the next one. Also, the learning rate is kept constant during one epoch.

def run_epoch(session, model, eval_op=None, verbose=False):
  """Runs the model on the given data."""
  start_time = time.time()
  costs = 0.0
  iters = 0
  state = session.run(model.initial_state)

  fetches = {
	  "cost": model.cost,
	  "final_state": model.final_state,
  }
  if eval_op is not None:
	fetches["eval_op"] = eval_op

  for step in range(model.input.epoch_size):
	feed_dict = {}
	#sgu: use the final state of the current mini-batch as the initial state of the subsequent minibatch
	#sgu: multiple LSTM cells can stack together. state[i] is the state of i-th cell.  
	for i, (c, h) in enumerate(model.initial_state):
	  feed_dict[c] = state[i].c
	  feed_dict[h] = state[i].h

	vals = session.run(fetches, feed_dict)
	cost = vals["cost"]
	state = vals["final_state"] 

	costs += cost
	iters += model.input.num_steps
	if verbose and step % (model.input.epoch_size // 10) == 10:
	  #sgu: 1st number: fraction of progress in the current epoch;
	  #     2nd: perplexity so far;
	  #     3rd: words per second so far in this epoch
	  print("%.3f perplexity: %.3f speed: %.0f wps" %
			(step * 1.0 / model.input.epoch_size, np.exp(costs / iters),
			 iters * model.input.batch_size / (time.time() - start_time)))

  return np.exp(costs / iters)
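
The same state-carrying pattern, as a self-contained sketch with toy sizes (batch=2, hidden=4; a single cell instead of the stacked PTB model): zero_state yields an LSTMStateTuple of tensors, and feeding their c/h components with the previous run's final state carries the state across session.run calls.

import numpy as np
import tensorflow as tf

cell = tf.nn.rnn_cell.BasicLSTMCell(4, state_is_tuple=True)
x = tf.placeholder(tf.float32, [2, 4])
init_state = cell.zero_state(2, tf.float32)
output, final_state = cell(x, init_state)

with tf.Session() as sess:
  sess.run(tf.initialize_all_variables())
  state = sess.run(init_state)
  for _ in range(3):
    feed = {x: np.ones((2, 4), np.float32),
            init_state.c: state.c,  # feed back the last final state
            init_state.h: state.h}
    state = sess.run(final_state, feed)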

Study Note: Tensorflow



Tensor vs. Variable, Constant, Node

#code excerpt from https://github.com/tensorflow/tensorflow/issues/6322
# Hidden 1
with tf.name_scope('hidden1'):
  weights = tf.Variable(
      tf.truncated_normal([IMAGE_PIXELS, hidden1_units],
                        stddev=1.0 / math.sqrt(float(IMAGE_PIXELS))),
      name='weights')
  biases = tf.Variable(tf.zeros([hidden1_units]), name='biases')
  hidden1 = tf.nn.relu(tf.matmul(images, weights) + biases)
#....
hidden1_outputs = tf.get_default_graph().get_tensor_by_name('hidden1/add:0')
global_step = tf.Variable(0, name='global_step', trainable=False)
train_op = optimizer.minimize(loss, global_step=global_step)

Note, however, that a “Variable” in the TensorFlow docs is not the same thing as a variable in a programming language (a named piece of memory holding a value). In a TF neural net, many tensors are not Variables; for example, hidden1_outputs in the snippet above is not a Variable. To get all Variables in the graph:

#for the MNIST network in tutorial: 
In [2]: lv = tf.get_collection(tf.GraphKeys.VARIABLES)
In [4]: [v.name for v in lv]
Out[4]: 
[u'hidden1/weights:0',
 u'hidden1/biases:0',
 u'hidden2/weights:0',
 u'hidden2/biases:0',
 u'softmax_linear/weights:0',
 u'softmax_linear/biases:0',
 u'global_step:0']

In [13]: lvt = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
In [14]: [v.name for v in lvt]
Out[14]: 
[u'hidden1/weights:0',
 u'hidden1/biases:0',
 u'hidden2/weights:0',
 u'hidden2/biases:0',
 u'softmax_linear/weights:0',
 u'softmax_linear/biases:0']
  • The terms Tensor and Variable are used differently in the Python and C++ APIs.

https://stackoverflow.com/questions/40866675/implementation-difference-between-tensorflow-variable-and-tensorflow-tensor

  • Why use tf.constant()?

For efficiency and readability (of the graph).
http://stackoverflow.com/questions/39512276/tensorflow-simple-operations-tensors-vs-python-variables
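
A small illustration of the readability point (toy names): a Python literal such as 2.0 is converted to a fresh Const node at every use site, whereas a single named tf.constant is created once and referenced everywhere.

import tensorflow as tf

two = tf.constant(2.0, name="two")
x = tf.placeholder(tf.float32, name="x")
y1 = x * two  # references the single node named "two"
y2 = x + two  # references it again
z1 = x * 2.0  # adds a new Const node to the graph
z2 = x + 2.0  # ...and yet another one
print([n.name for n in tf.get_default_graph().as_graph_def().node])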

The shape of tensors

The first dimension of the input tensor and output tensor is the batch size.
Cf. https://stackoverflow.com/questions/39090222/tensorflow-single-value-vs-batch-tensors
In the MNIST example, the input tensor is of shape (100, 784), where 100 is the batch size. The target tensor is of shape (100,), since the target \(y\) is just one number indicating the class (0-9). Since input data is fed one batch per iteration, it is typical to apply a reduction when computing the loss (tf.reduce_mean, as shown below). The first dimension of the hidden-layer tensors also matches the batch size (see the example below).

#examples/tutorials/mnist/fully_connected_feed.py
def placeholder_inputs(batch_size):
  """Generate placeholder variables to represent the input tensors.

  These placeholders are used as inputs by the rest of the model building
  code and will be fed from the downloaded data in the .run() loop, below.

  Args:
    batch_size: The batch size will be baked into both placeholders.

  Returns:
    images_placeholder: Images placeholder.
    labels_placeholder: Labels placeholder.
  """
  # Note that the shapes of the placeholders match the shapes of the full
  # image and label tensors, except the first dimension is now batch_size
  # rather than the full size of the train or test data sets.
  images_placeholder = tf.placeholder(tf.float32, shape=(batch_size,
                                                         mnist.IMAGE_PIXELS))
  labels_placeholder = tf.placeholder(tf.int32, shape=(batch_size))
  return images_placeholder, labels_placeholder

feed_dict = fill_feed_dict(data_set,images_placeholder,labels_placeholder)

In [1]: feed_dict
Out[1]: 
{<tf.Tensor 'Placeholder:0' shape=(100, 784) dtype=float32>: array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32),
 <tf.Tensor 'Placeholder_1:0' shape=(100,) dtype=int32>: array([5, 6, 5, 1, 3, 1, 2, 0, 9, 3, 5, 1, 9, 2, 2, 3, 6, 5, 4, 1, 6, 4, 9,
        9, 0, 0, 2, 8, 9, 2, 9, 9, 5, 9, 9, 4, 3, 7, 8, 5, 5, 1, 8, 5, 0, 3,
        8, 8, 1, 9, 3, 5, 0, 3, 2, 5, 6, 3, 6, 5, 7, 8, 7, 0, 8, 1, 6, 3, 3,
        4, 0, 8, 7, 7, 7, 5, 7, 6, 0, 5, 7, 5, 1, 3, 6, 0, 1, 1, 7, 7, 5, 5,
        1, 0, 3, 0, 9, 5, 0, 4], dtype=uint8)}

def loss(logits, labels):
  """Calculates the loss from the logits and the labels.

  Args:
    logits: Logits tensor, float - [batch_size, NUM_CLASSES].
    labels: Labels tensor, int32 - [batch_size].

  Returns:
    loss: Loss tensor of type float.
  """
  labels = tf.to_int64(labels)
  # sgu: cross_entropy.get_shape(): TensorShape([Dimension(100)]) where 100 is the batch_size 
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits, name='xentropy') 
  return tf.reduce_mean(cross_entropy, name='xentropy_mean')

#sgu: due to tf.reduce_mean, the loss is just a number (i.e., a 0-D tensor)
loss = mnist.loss(logits, labels_placeholder) #loss.get_shape(): TensorShape([])

# Get the outputs before the ReLU.
# hidden1_outputs.get_shape(): TensorShape([Dimension(100), Dimension(128)])
# where 100 is the batch size and 128 is the dimension in the hidden1 layer,
# i.e., 128 features are extracted in hidden1
hidden1_outputs = tf.get_default_graph().get_tensor_by_name('hidden1/add:0')

In some cases, the input tensor (train_inputs below) is of rank 1 while the labels tensor (train_labels) is of rank 2. This is not a problem as long as the consuming operations expect those shapes; in the sample code below, tf.nn.embedding_lookup and tf.nn.nce_loss require arguments of exactly those shapes.

#example/tutorials/word2vec/word2vec_basic.py. 
    #sgu train_inputs.get_shape(): TensorShape([Dimension(32)]) where 32 is the batch_size 
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) #nce_loss requires train_lables to be rank 2 
    embeddings = tf.Variable(
               tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    #sgu: embeddings.get_shape(): TensorShape([Dimension(50000), Dimension(128)])
    #sgu: where 50000 is the vocab size and 128 is the embedding_size

    #sgu: embed.get_shape(): TensorShape([Dimension(32), Dimension(128)])
    #sgu: where 32 is the batch_size and 128 is the embedding_size
    embed = tf.nn.embedding_lookup(embeddings, train_inputs) 

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  #sgu: loss is a scalar, ie, a tensor of rank 0
  #loss.get_shape: TensorShape([])
  loss = tf.reduce_mean(
      tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

Add operation node to the graph

The way the TensorFlow API is designed, library routines that create new operation nodes always attach them to the default graph. Even expressions like value + 1 or tf.reduce_mean(value) add new nodes (as demonstrated below). When debugging interactively, be careful not to add unintended nodes to the graph.

In [1]: import tensorflow as tf
In [2]: graph=tf.get_default_graph()
In [3]: graph.as_graph_def()
Out[3]: 
versions {
  producer: 17
}
In [4]: value = tf.constant(1)
In [5]: graph.as_graph_def()
Out[5]: 
node {
  name: "Const"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 1
      }
    }
  }
}
versions {
  producer: 17
}
In [6]: value2=value+1
In [7]: graph.as_graph_def()
Out[7]: 
node {
  name: "Const"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 1
      }
    }
  }
}
node {
  name: "add/y"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 1
      }
    }
  }
}
node {
  name: "add"
  op: "Add"
  input: "Const"
  input: "add/y"
  attr {
    key: "T"
    value {
      type: DT_INT32
    }
  }
}
versions {
  producer: 17
}
In [8]: tf.reduce_mean(value)
Out[8]: <tf.Tensor 'Mean:0' shape=() dtype=int32>
In [9]: graph.as_graph_def()
Out[9]: 
node {
  name: "Const"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 1
      }
    }
  }
}
node {
  name: "add/y"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
        }
        int_val: 1
      }
    }
  }
}
node {
  name: "add"
  op: "Add"
  input: "Const"
  input: "add/y"
  attr {
    key: "T"
    value {
      type: DT_INT32
    }
  }
}
node {
  name: "Const_1"
  op: "Const"
  attr {
    key: "dtype"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "value"
    value {
      tensor {
        dtype: DT_INT32
        tensor_shape {
          dim {
          }
        }
      }
    }
  }
}
node {
  name: "Mean"
  op: "Mean"
  input: "Const"
  input: "Const_1"
  attr {
    key: "T"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "Tidx"
    value {
      type: DT_INT32
    }
  }
  attr {
    key: "keep_dims"
    value {
      b: false
    }
  }
}
versions {
  producer: 17
}
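
If an interactive session has accumulated unintended nodes, one way to start over is to reset the default graph (note: tensors created before the reset belong to the old graph and can no longer be used):

tf.reset_default_graph()
g = tf.get_default_graph()
assert len(g.as_graph_def().node) == 0  # empty graph again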

Inspect the Graph

# Q: what happens when name_scope is used together with variable_scope?
with tf.name_scope("ns"):
    with tf.variable_scope("vs"):
        v1 = tf.get_variable("v1", [1])  # get_variable ignores name_scope
        v2 = tf.Variable([2.], name="v2")
        v3 = v1 + v2
v1.name  #vs/v1:0
v2.name  #ns/vs/v2:0
v3.name  #ns/vs/add:0

#list all the nodes
l = [n for n in tf.get_default_graph().as_graph_def().node]
[(ll.name, ll.op) for ll in l]
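
An equivalent way to enumerate the ops without serializing the graph is Graph.get_operations():

ops = tf.get_default_graph().get_operations()
[(op.name, op.type) for op in ops]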

In [12]: g  = tf.get_default_graph()
In [13]: op = g.get_operation_by_name("ns/vs/add")
In [14]: op.node_def
Out[14]: 
name: "ns/vs/add"
op: "Add"
input: "vs/v1/read"
input: "ns/vs/v2/read"
attr {
  key: "T"
  value {
    type: DT_FLOAT
  }
}
#get the output tensor of an op 
t = g.get_tensor_by_name("ns/vs/add:0") 
assert t==v3

Name Scope vs. Variable Scope

As the snippet in the previous section shows, tf.get_variable ignores the enclosing name_scope (v1 lives under vs/ only), while tf.Variable and ordinary ops are prefixed by both scopes (ns/vs/v2, ns/vs/add). Use name_scope to organize ops for visualization; use variable_scope when variables need to be created and shared via tf.get_variable.

Sample code to visualize embedding vectors

Sample code to visualize the MNIST hidden vectors in TensorBoard:
https://github.com/tensorflow/tensorflow/issues/6322
Sample code to visualize word2vec in TensorBoard:
https://github.com/shiyuangu/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py