Source code for cadl.deprecated.seq2seq

"""Sequence to Sequence models w/ Attention and BiDirectional Dynamic RNNs.

Parag K. Mital
"""

import tensorflow as tf
import numpy as np
import nltk
import pickle
from cadl import cornell

# Special vocabulary symbols, listed in id order:
#   _PAD pads a sequence to a fixed size
#   _GO  is for the end of the encoding
#   _EOS is for the end of decoding
#   _UNK is for out of vocabulary words
_PAD = "_PAD"
_GO = "_GO"
_EOS = "_EOS"
_UNK = "_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

# Integer ids of the special symbols; each equals the symbol's position in
# _START_VOCAB.
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3


def _create_embedding(x, vocab_size, embed_size, embed_matrix=None):
    """Embed the ids in `x`, creating the embedding matrix when none is given.

    Parameters
    ----------
    x : tf.Tensor
        Ids to look up in the embedding matrix.
    vocab_size : int
        Number of rows of a newly created matrix.
    embed_size : int
        Number of columns of a newly created matrix.
    embed_matrix : tf.Variable or tf.Tensor, optional
        Existing matrix to reuse.  When given, `vocab_size` and `embed_size`
        are not used.

    Returns
    -------
    embed : tf.Tensor
        Result of the lookup, [batch_size, max_time, embed_size].
    embed_matrix : tf.Variable or tf.Tensor
        The matrix that was used (either the one passed in or the new one).
    """
    # Reuse the caller's matrix when there is one.
    if embed_matrix is not None:
        return tf.nn.embedding_lookup(embed_matrix, x), embed_matrix

    # Otherwise make a new one.  This is a big matrix.
    new_matrix = tf.get_variable(
        name="embedding_matrix",
        shape=[vocab_size, embed_size],
        dtype=tf.float32,
        initializer=tf.random_uniform_initializer(-1.0, 1.0))

    return tf.nn.embedding_lookup(new_matrix, x), new_matrix


def _create_encoder(embed, lengths, batch_size, n_enc_neurons, n_layers,
                    use_lstm):
    """Build a bidirectional dynamic RNN encoder and return its final state.

    Parameters
    ----------
    embed : tf.Tensor
        Embedded input sequence, [batch_size, max_time, embed_size].
    lengths : tf.Tensor
        Sequence lengths, passed as `sequence_length` to the RNN.
    batch_size : tf.Tensor or int
        Unused; kept so existing callers keep working.
    n_enc_neurons : int
        Number of units in every recurrent cell.
    n_layers : int
        Number of stacked layers per direction; a `MultiRNNCell` is only
        used when this is greater than 1.
    use_lstm : bool
        Use `BasicLSTMCell` when true, `GRUCell` otherwise.

    Returns
    -------
    final_state
        The second value returned by `tf.nn.bidirectional_dynamic_rnn`,
        i.e. the final states of both directions.  Selecting only the
        forward state (index 0) is left to the caller.
    """

    def _make_cell():
        # One fresh cell object per call: a cell instance must not be shared
        # between layers or directions.
        if use_lstm:
            return tf.contrib.rnn.BasicLSTMCell(n_enc_neurons)
        return tf.contrib.rnn.GRUCell(n_enc_neurons)

    def _make_stack():
        # Build deeper recurrent net if using more than 1 layer.  The
        # previous `[cell] * n_layers` put the *same* cell object in every
        # layer, which newer TensorFlow 1.x releases reject (or treat as
        # weight sharing between layers).
        if n_layers > 1:
            return tf.contrib.rnn.MultiRNNCell(
                [_make_cell() for _ in range(n_layers)])
        return _make_cell()

    # Independent cells for the forward and the backward RNN.
    cell_fw = _make_stack()
    cell_bw = _make_stack()

    # Now hookup the cells to the input
    # [batch_size, max_time, embed_size]
    (_, final_state) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=embed,
        sequence_length=lengths,
        time_major=False,
        dtype=tf.float32)
    return final_state


def _create_train_decoder(cells, encoder_state, encoding_lengths, decoding,
                          decoding_lengths, embed_matrix, batch_size,
                          target_vocab_size, use_attention, n_dec_neurons,
                          scope, output_fn, max_sequence_size):
    """Build the training-time decoder and return its logits.

    Parameters
    ----------
    cells
        Decoder RNN cell (possibly a multi-layer cell).
    encoder_state
        State handed to the decoder function as `encoder_state`.
    decoding : tf.Tensor
        Decoder inputs, passed as `inputs` to `dynamic_rnn_decoder`.
    decoding_lengths : tf.Tensor
        Passed as `sequence_length` to `dynamic_rnn_decoder`.
    batch_size
        First dimension of the attention states tensor.
    use_attention : bool
        Choose the attention decoder function over the simple one.
    n_dec_neurons : int
        `num_units` of the attention mechanism.
    scope
        Variable scope passed to `dynamic_rnn_decoder`.
    output_fn : callable
        Applied to the decoder outputs to convert them to vocab size.
    encoding_lengths, embed_matrix, target_vocab_size, max_sequence_size
        Not used by this function.

    Returns
    -------
    train_logits
        `output_fn` applied to the decoder outputs.
    """
    if not use_attention:
        # Plain training decoder driven only by the encoder state.
        train_fn = tf.contrib.seq2seq.simple_decoder_fn_train(
            encoder_state=encoder_state)
    else:
        # NOTE(review): the states attended over are an all-zeros tensor of
        # shape [batch_size, 1, cells.output_size], not encoder outputs.
        # Confirm this is intended.
        memory = tf.zeros([batch_size, 1, cells.output_size])
        keys, values, score_fn, construct_fn = \
            tf.contrib.seq2seq.prepare_attention(
                attention_states=memory,
                attention_option='bahdanau',
                num_units=n_dec_neurons)

        # Training decoder that starts from the encoder state and also takes
        # information from the attention module.
        train_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
            encoder_state=encoder_state,
            attention_keys=keys,
            attention_values=values,
            attention_score_fn=score_fn,
            attention_construct_fn=construct_fn)

    # Run the decoder over the inputs; only the outputs are needed.
    rnn_outputs, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(
        cell=cells,
        decoder_fn=train_fn,
        inputs=decoding,
        sequence_length=decoding_lengths,
        time_major=False,
        scope=scope)

    # Convert to vocab size
    return output_fn(rnn_outputs)


def _create_inference_decoder(cells, encoder_state, encoding_lengths, decoding,
                              decoding_lengths, embed_matrix, batch_size,
                              n_dec_neurons, target_vocab_size, use_attention,
                              scope, output_fn, max_sequence_size):
    """Build the inference-time (generation) decoder and return its logits.

    Parameters
    ----------
    cells
        Decoder RNN cell (possibly a multi-layer cell).
    encoder_state
        State handed to the decoder function as `encoder_state`.
    embed_matrix
        Passed as `embeddings` to the inference decoder function.
    batch_size
        First dimension of the attention states tensor.
    n_dec_neurons : int
        `num_units` of the attention mechanism.
    target_vocab_size : int
        Passed as `num_decoder_symbols`.
    use_attention : bool
        Choose the attention decoder function over the simple one.
    scope
        Variable scope passed to `dynamic_rnn_decoder`.
    output_fn : callable
        Handed to the decoder function, which applies it itself.
    max_sequence_size : int
        Passed as `maximum_length` of the generated sequence.
    encoding_lengths, decoding, decoding_lengths
        Not used by this function.

    Returns
    -------
    infer_logits
        First value returned by `dynamic_rnn_decoder`.
    """
    # Arguments common to both kinds of inference decoder function.  Decoding
    # starts from GO_ID and stops at EOS_ID.
    generation_args = dict(
        output_fn=output_fn,
        encoder_state=encoder_state,
        embeddings=embed_matrix,
        start_of_sequence_id=GO_ID,
        end_of_sequence_id=EOS_ID,
        maximum_length=max_sequence_size,
        num_decoder_symbols=target_vocab_size)

    if not use_attention:
        infer_fn = tf.contrib.seq2seq.simple_decoder_fn_inference(
            **generation_args)
    else:
        # NOTE(review): the states attended over are an all-zeros tensor of
        # shape [batch_size, 1, cells.output_size], not encoder outputs.
        # Confirm this is intended.
        memory = tf.zeros([batch_size, 1, cells.output_size])
        keys, values, score_fn, construct_fn = \
            tf.contrib.seq2seq.prepare_attention(
                attention_states=memory,
                attention_option='bahdanau',
                num_units=n_dec_neurons)

        # Separate inference network to use during generation.
        infer_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(
            attention_keys=keys,
            attention_values=values,
            attention_score_fn=score_fn,
            attention_construct_fn=construct_fn,
            **generation_args)

    # The inference decoder handles the output to vocab size, so the output
    # function is not applied again here.
    (infer_logits, _, _) = tf.contrib.seq2seq.dynamic_rnn_decoder(
        cell=cells,
        decoder_fn=infer_fn,
        time_major=False,
        scope=scope)

    return infer_logits


def create_model(source_vocab_size=20000,
                 target_vocab_size=20000,
                 input_embed_size=1024,
                 target_embed_size=1024,
                 share_input_and_target_embedding=True,
                 n_neurons=512,
                 n_layers=3,
                 use_lstm=True,
                 use_attention=True,
                 max_sequence_size=50):
    """Build the sequence to sequence graph (encoder, decoders and loss).

    Parameters
    ----------
    source_vocab_size : int, optional
        Number of ids in the source vocabulary.
    target_vocab_size : int, optional
        Number of ids in the target vocabulary.
    input_embed_size : int, optional
        Embedding size of the source embedding matrix.
    target_embed_size : int, optional
        Embedding size of the target embedding matrix.  Only used when a
        separate target embedding matrix is created.
    share_input_and_target_embedding : bool, optional
        Reuse the source embedding matrix for the target.  Requires
        `source_vocab_size == target_vocab_size`.
    n_neurons : int, optional
        Number of units of every encoder and decoder cell.
    n_layers : int, optional
        Number of stacked recurrent layers.
    use_lstm : bool, optional
        Use LSTM cells when true, GRU cells otherwise.
    use_attention : bool, optional
        Use the attention decoder functions.
    max_sequence_size : int, optional
        Maximum length generated by the inference decoder.

    Returns
    -------
    dict
        Tensors keyed by 'loss', 'source', 'source_lengths', 'target',
        'target_lengths', 'thought_vector' (the encoder's final state) and
        'decoder' (the inference decoder output).

    Raises
    ------
    ValueError
        If embedding sharing is requested but the vocabulary sizes differ.
    """
    n_enc_neurons = n_neurons
    n_dec_neurons = n_neurons

    # First sentence (i.e. input, original language sentence before
    # translation)
    # [batch_size, max_time]
    source = tf.placeholder(tf.int32, shape=(None, None), name='source')

    # User should also pass in the sequence lengths
    source_lengths = tf.placeholder(
        tf.int32, shape=(None), name='source_lengths')

    # Second sentence (i.e. reply, translation, etc...)
    # [batch_size, max_time]
    target = tf.placeholder(tf.int32, shape=(None, None), name='target')

    # User should also pass in the sequence lengths
    target_lengths = tf.placeholder(
        tf.int32, shape=(None), name='target_lengths')

    # Get symbolic shapes (only the batch size is needed below)
    batch_size, _ = tf.unstack(tf.shape(source))

    with tf.variable_scope('target/slicing'):
        # Prepend GO_ID to every target sequence to form the decoder input.
        # (Local renamed from `slice`, which shadowed the builtin.)
        target_slice = tf.slice(target, [0, 0], [batch_size, -1])
        decoder_input = tf.concat(
            [tf.fill([batch_size, 1], GO_ID), target_slice], 1)

    with tf.variable_scope('source/embedding'):
        source_embed, source_embed_matrix = _create_embedding(
            x=source,
            vocab_size=source_vocab_size,
            embed_size=input_embed_size)

    with tf.variable_scope('target/embedding'):
        # Check if we need a new embedding matrix or not.  If we are for
        # instance translating to another language, then we'd need different
        # vocabularies for the input and outputs, and so new embeddings.
        # However if we are for instance building a chatbot with the same
        # language, then it doesn't make sense to have different embeddings
        # and we should share them.
        if (share_input_and_target_embedding and
                source_vocab_size == target_vocab_size):
            target_embed, target_embed_matrix = _create_embedding(
                x=decoder_input,
                vocab_size=target_vocab_size,
                embed_size=target_embed_size,
                embed_matrix=source_embed_matrix)
        elif share_input_and_target_embedding:
            # Sharing was requested but the vocabularies differ.  The old
            # condition (`source_vocab_size != target_vocab_size` alone) also
            # raised when sharing was disabled, which made separate
            # vocabularies impossible to use.
            raise ValueError(
                'source_vocab_size must equal target_vocab_size if ' +
                'sharing input and target embeddings')
        else:
            # Embed the GO-prefixed decoder input, exactly as the shared
            # branch does.  Embedding `target` itself would feed the decoder
            # the token it is supposed to predict at each step.
            target_embed, target_embed_matrix = _create_embedding(
                x=decoder_input,
                vocab_size=target_vocab_size,
                embed_size=target_embed_size)

    # Build the encoder
    with tf.variable_scope('encoder'):
        encoder_state = _create_encoder(
            embed=source_embed,
            lengths=source_lengths,
            batch_size=batch_size,
            n_enc_neurons=n_enc_neurons,
            n_layers=n_layers,
            use_lstm=use_lstm)

    # Build the decoder
    with tf.variable_scope('decoder') as scope:

        def output_fn(x):
            return tf.contrib.layers.fully_connected(
                inputs=x,
                num_outputs=target_vocab_size,
                activation_fn=None,
                scope=scope)

        def _make_cell():
            # One fresh cell object per layer; `[cell] * n_layers` would put
            # the same object in every layer.
            if use_lstm:
                return tf.contrib.rnn.BasicLSTMCell(n_dec_neurons)
            return tf.contrib.rnn.GRUCell(n_dec_neurons)

        # Build deeper recurrent net if using more than 1 layer
        if n_layers > 1:
            cells = tf.contrib.rnn.MultiRNNCell(
                [_make_cell() for _ in range(n_layers)])
        else:
            cells = _make_cell()

        # Only the forward encoder state (index 0) initialises the decoder,
        # since the decoder is not a bidirectional rnn.
        decoding_train = _create_train_decoder(
            cells=cells,
            encoder_state=encoder_state[0],
            encoding_lengths=source_lengths,
            decoding=target_embed,
            decoding_lengths=target_lengths,
            embed_matrix=target_embed_matrix,
            batch_size=batch_size,
            target_vocab_size=target_vocab_size,
            use_attention=use_attention,
            scope=scope,
            max_sequence_size=max_sequence_size,
            n_dec_neurons=n_dec_neurons,
            output_fn=output_fn)

        # Inference model: reuse the variables of the training decoder.
        scope.reuse_variables()
        decoding_inference = _create_inference_decoder(
            cells=cells,
            encoder_state=encoder_state[0],
            encoding_lengths=source_lengths,
            decoding=target_embed,
            decoding_lengths=target_lengths,
            embed_matrix=target_embed_matrix,
            batch_size=batch_size,
            target_vocab_size=target_vocab_size,
            use_attention=use_attention,
            scope=scope,
            max_sequence_size=max_sequence_size,
            n_dec_neurons=n_dec_neurons,
            output_fn=output_fn)

    with tf.variable_scope('loss'):
        # NOTE(review): the weights are all ones, so every position up to the
        # longest target in the batch counts towards the loss, padding
        # included.  Confirm whether a length mask was intended.
        max_target_length = tf.reduce_max(target_lengths)
        weights = tf.ones(
            [batch_size, max_target_length],
            dtype=tf.float32,
            name="weights")
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=tf.reshape(
                decoding_train,
                [batch_size, tf.reduce_max(target_lengths),
                 target_vocab_size]),
            targets=target,
            weights=weights)

    return {
        'loss': loss,
        'source': source,
        'source_lengths': source_lengths,
        'target': target,
        'target_lengths': target_lengths,
        'thought_vector': encoder_state,
        'decoder': decoding_inference
    }
def batch_generator(Xs, Ys, source_lengths, target_lengths, batch_size=50):
    """Yield randomly ordered minibatches trimmed to their longest sequence.

    Parameters
    ----------
    Xs : np.ndarray
        Source sequences, one row per example.
    Ys : np.ndarray
        Target sequences, one row per example.
    source_lengths : np.ndarray
        Length of every row of `Xs`.
    target_lengths : np.ndarray
        Length of every row of `Ys`.
    batch_size : int, optional
        Number of rows per batch.  Rows left over after the last full batch
        are not yielded.

    Yields
    ------
    tuple
        `(batch_Xs, batch_Ys, batch_source_lengths, batch_target_lengths)`
        where the two sequence arrays are cut to the largest length found in
        that batch.
    """
    order = np.random.permutation(np.arange(len(Xs)))
    n_full_batches = len(order) // batch_size

    for start in range(0, n_full_batches * batch_size, batch_size):
        rows = order[start:start + batch_size]
        batch_Xs = Xs[rows, :]
        batch_Ys = Ys[rows, :]
        batch_source_lengths = source_lengths[rows]
        batch_target_lengths = target_lengths[rows]

        # Drop the columns beyond the longest sequence of this batch.
        longest_source = np.max(batch_source_lengths)
        longest_target = np.max(batch_target_lengths)
        yield (batch_Xs[:, :longest_source],
               batch_Ys[:, :longest_target],
               batch_source_lengths,
               batch_target_lengths)
def preprocess(text, min_count=10, max_length=50):
    """Tokenize `text`, build a vocabulary and convert the words to ids.

    The vocabulary is also pickled to 'vocab.pkl' in the current working
    directory.

    Parameters
    ----------
    text : iterable of str
        Strings to split into sentences and then into lowercased words.
    min_count : int, optional
        A word is kept in the vocabulary only if it occurs more than
        `min_count` times in the kept sentences.
    max_length : int, optional
        Sentences with more than `max_length` words are discarded.

    Returns
    -------
    unked : list of list of int
        The kept sentences as ids (see `word2id`).
    vocab : dict
        Mapping from word to id; the special symbols of `_START_VOCAB` come
        first, followed by the kept words in sorted order.
    """
    # Split every string into sentences, then every sentence into lowercased
    # words.
    sentences = []
    for passage in text:
        sentences.extend(nltk.sent_tokenize(passage))
    tokenized = [[token.lower() for token in nltk.word_tokenize(sentence)]
                 for sentence in sentences]

    # Keep only the sentences that are short enough.
    dataset = [tokens for tokens in tokenized if len(tokens) <= max_length]

    # Count the words of the kept sentences and drop the rare ones.
    fdist = nltk.FreqDist([token for tokens in dataset for token in tokens])
    frequent_words = sorted(
        word for word, count in fdist.most_common() if count > min_count)

    # Special symbols first, then the sorted words; the id of a word is its
    # position in that list.
    symbols = _START_VOCAB + frequent_words
    vocab = {symbol: idx for idx, symbol in enumerate(symbols)}

    with open('vocab.pkl', 'wb') as fp:
        pickle.dump(vocab, fp)

    unked = word2id(dataset, vocab)
    return unked, vocab
def word2id(words, vocab):
    """Convert sentences of words to sentences of ids.

    Parameters
    ----------
    words : iterable of iterable of str
        Sentences, each one a sequence of words.
    vocab : dict
        Mapping from word to id.

    Returns
    -------
    list of list of int
        One list of ids per sentence; words missing from `vocab` become
        `UNK_ID`.
    """
    return [[vocab.get(token, UNK_ID) for token in sentence]
            for sentence in words]
def id2word(ids, vocab):
    """Convert sentences of ids back to sentences of words.

    Parameters
    ----------
    ids : iterable of iterable of int
        Sentences, each one a sequence of ids.
    vocab : dict
        Mapping from word to id (it is inverted here).

    Returns
    -------
    list of list
        One list of words per sentence; an id that is not in `vocab` becomes
        None.
    """
    lookup = {idx: word for word, idx in vocab.items()}
    return [[lookup.get(idx) for idx in sentence] for sentence in ids]
def test_cornell():
    """Train the seq2seq model on the text returned by `cornell.get_scripts`.

    Builds (source, target) pairs from neighboring sentences, trains for 10
    epochs with Adam, prints a sample decoding every 100 iterations and saves
    a checkpoint to './dynamic-seq2seq.ckpt' at the end of every epoch.  As a
    side effect `preprocess` writes 'vocab.pkl' to the working directory.

    The session is now closed even when building the graph or training
    raises; previously it was only closed after a fully successful run.
    """
    # Get the cornell dataset text
    text = cornell.get_scripts()

    # Preprocess it to word IDs including UNKs for out of vocabulary words.
    # Sentences are limited to one word less than the maximum so that the
    # EOS_ID appended to every target still fits.
    max_sequence_size = 50
    unked, vocab = preprocess(
        text, min_count=10, max_length=max_sequence_size - 1)

    # Get the vocabulary size
    vocab_size = len(vocab)

    # Create input output pairs formed by neighboring sentences of dialog
    Xs_list, Ys_list = unked[:-1], unked[1:]

    # Store the final lengths
    source_lengths = np.zeros((len(Xs_list)), dtype=np.int32)
    target_lengths = np.zeros((len(Ys_list)), dtype=np.int32)
    Xs = np.ones((len(Xs_list), max_sequence_size), dtype=np.int32) * PAD_ID
    Ys = np.ones((len(Ys_list), max_sequence_size), dtype=np.int32) * PAD_ID

    for i, (source_i, target_i) in enumerate(zip(Xs_list, Ys_list)):
        el = source_i
        source_lengths[i] = len(el)
        Xs[i, :len(el)] = el

        # Targets end with EOS_ID
        el = target_i + [EOS_ID]
        target_lengths[i] = len(el)
        Ys[i, :len(el)] = el

    sess = tf.Session()
    try:
        net = create_model(
            use_attention=True,
            source_vocab_size=vocab_size,
            target_vocab_size=vocab_size)

        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        current_learning_rate = 0.01
        opt = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(net['loss'])

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        saver = tf.train.Saver()

        def decode(sentence):
            # Lowercase and tokenize the sentence, then reverse the word
            # order before appending the _GO symbol.
            preprocessed = [
                word for s in nltk.sent_tokenize(sentence.lower())
                for word in nltk.word_tokenize(s)
            ][::-1]
            # NOTE(review): `cornell.word2id` / `cornell.id2word` are used
            # here although this module defines functions with the same
            # names; confirm which ones are intended.
            tokens = cornell.word2id([preprocessed + [_GO]], vocab)
            outputs = sess.run(
                net['decoder'],
                feed_dict={
                    net['source']: tokens,
                    net['source_lengths']: [len(x_i) for x_i in tokens]
                })
            # Most likely id at every step of the generated sequence
            decoding = np.argmax(outputs, axis=2)
            print('input:', sentence, '\n', 'output:',
                  " ".join(cornell.id2word(decoding, vocab)[0]))

        n_epochs = 10
        batch_size = 50
        for epoch_i in range(n_epochs):
            batches = batch_generator(
                Xs, Ys, source_lengths, target_lengths,
                batch_size=batch_size)
            for it_i, (this_Xs, this_Ys, this_source_lengths,
                       this_target_lengths) in enumerate(batches):
                if it_i % 100 == 0:
                    # Decay the learning rate and show a sample decoding
                    current_learning_rate = current_learning_rate * 0.9
                    rand_idx = np.random.randint(0, high=len(text))
                    print(it_i)
                    decode(text[rand_idx])
                loss_value = sess.run(
                    [net['loss'], opt],
                    feed_dict={
                        learning_rate: current_learning_rate,
                        net['source']: this_Xs,
                        net['target']: this_Ys,
                        net['source_lengths']: this_source_lengths,
                        net['target_lengths']: this_target_lengths
                    })[0]
                print('{}: {}'.format(it_i, loss_value), end='\r')

            # End of epoch, save
            # NOTE(review): `it_i` restarts at 0 every epoch, so the
            # checkpoint step is the index of the epoch's last batch rather
            # than a global step.  Confirm this is intended.
            saver.save(sess, './dynamic-seq2seq.ckpt', global_step=it_i)
    finally:
        sess.close()