# Created by Reid Swanson, last modified 2020-10-11
# snippet.txt
# ##############################################################################
#  Copyright 2020 Google Developers, Reid Swanson
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
# ##############################################################################

# For details on the transformer implementation
# see: https://www.tensorflow.org/tutorials/text/transformer

# Python Modules
import argparse
import gzip

# 3rd Party Modules
from typing import Dict, Tuple, Union

import tensorflow as tf
import tensorflow.keras.layers as tfl

# Project Modules


# region Transformer Model
class PositionEncoding(tfl.Layer):
    def __init__(self, position: int, n_model_units: int, **kwargs):
        """
        A layer for adding the positional information to an input tensor.

        The sinusoidal table is computed once, at construction time, and the
        layer holds no trainable weights.

        :param position: The number of positions to precompute, i.e., the
               longest sequence the layer can be applied to.
        :param n_model_units: The size of the last (feature) dimension of
               the tensors the encoding is added to.
        :param kwargs: Standard keras layer arguments (e.g., ``name``).
               Accepting them allows ``from_config(get_config())`` to work.
        """
        super().__init__(**kwargs)

        self.position = position
        self.n_model_units = n_model_units

        # (position, n_model_units)
        angle_rads = self._get_angles(
            tf.expand_dims(tf.range(position, dtype=tf.float32), axis=1),
            tf.expand_dims(tf.range(n_model_units, dtype=tf.float32), axis=0),
            n_model_units
        )

        # Even feature indices use the sine and odd indices the cosine.
        # The (1, n_model_units) condition broadcasts over the positions.
        is_even = tf.expand_dims(tf.equal(tf.range(n_model_units) % 2, 0), axis=0)
        encoding = tf.where(is_even, tf.math.sin(angle_rads), tf.math.cos(angle_rads))

        # (1, position, n_model_units) so that it broadcasts over the batch.
        self.pos_encoding = tf.cast(tf.expand_dims(encoding, axis=0), dtype=tf.float32)

    def get_config(self):
        """
        :return: The layer configuration including the constructor arguments.
        """
        config = super().get_config()
        config.update({
            'position': self.position,
            'n_model_units': self.n_model_units
        })

        return config

    def call(self, x, *args, **kwargs):
        """
        Add the positional encoding to the input.

        :param x: The input tensor. Its second dimension is used as the
               sequence length and its last dimension must be broadcastable
               with ``n_model_units``.
        :return: ``x`` plus the encoding of the first ``seq_len`` positions.
        """
        seq_len = tf.shape(x)[1]

        # Slice (not index) the position axis so each time step gets
        # its own encoding.
        return x + self.pos_encoding[:, :seq_len, :]

    @classmethod
    def _get_angles(cls, pos, i, n_model_units):
        """
        Compute ``pos / 10000^(2 * (i // 2) / n_model_units)``.

        :param pos: The positions, shaped to broadcast against ``i``.
        :param i: The feature indices, shaped to broadcast against ``pos``.
        :param n_model_units: The number of feature dimensions.
        :return: The broadcast product of the positions and angle rates.
        """
        exponent = (2.0 * (i//2.0)) / tf.cast(n_model_units, tf.float32)
        angle_rates = 1.0 / tf.pow(10000.0, exponent)

        return pos * angle_rates


class MultiHeadAttention(tfl.Layer):
    def __init__(self, n_model_units: int, n_heads: int, **kwargs):
        """
        See: https://www.tensorflow.org/tutorials/text/transformer#multi-head_attention

        :param n_model_units: Number of dimensions for the core model.
               The value must be an exact multiple of the number of heads.
        :param n_heads: Number of heads
        :param kwargs: Standard keras layer arguments (e.g., ``name``).
               Accepting them allows ``from_config(get_config())`` to work.
        :raises ValueError: If ``n_model_units`` is not an exact multiple of
                ``n_heads``.
        """
        super().__init__(**kwargs)

        # Validate the inputs
        if n_model_units % n_heads != 0:
            raise ValueError(
                f"The model dimension ({n_model_units}) must be an exact "
                f"multiple of the number of heads ({n_heads})."
            )

        self.n_model_units = n_model_units
        self.n_heads = n_heads
        self.depth = n_model_units // n_heads

        # Layers to store the query, key, and value weights
        self.wq = tfl.Dense(n_model_units)
        self.wk = tfl.Dense(n_model_units)
        self.wv = tfl.Dense(n_model_units)

        # Output layer
        self.dense = tfl.Dense(n_model_units)

    def get_config(self):
        """
        :return: The layer configuration including the constructor arguments.
        """
        config = super().get_config()
        config.update({
            'n_model_units': self.n_model_units,
            'n_heads': self.n_heads
        })

        return config

    # noinspection PyMethodOverriding
    def call(self, v, k, q, mask, **kwargs):
        """
        Project the inputs, attend per head and merge the heads again.

        :param v: The values.
        :param k: The keys.
        :param q: The queries.
        :param mask: A mask passed through to :meth:`sdp_attention`, or
               ``None``.
        :return: A tuple of the attention output and the attention weights.
        """
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, n_model_units)
        k = self.wk(k)  # (batch_size, seq_len, n_model_units)
        v = self.wv(v)  # (batch_size, seq_len, n_model_units)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = self.sdp_attention(q, k, v, mask)

        # (batch_size, seq_len_q, num_heads, depth)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # (batch_size, seq_len_q, n_model_units)
        concat_attention = tf.reshape(
            scaled_attention,
            (batch_size, -1, self.n_model_units)
        )

        # (batch_size, seq_len_q, n_model_units)
        output = self.dense(concat_attention)

        return output, attention_weights

    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)

        :param x: The tensor to split.
        :param batch_size: The size of the leading dimension of ``x``.
        :return: The reshaped and transposed tensor.
        """
        x = tf.reshape(x, (batch_size, -1, self.n_heads, self.depth))

        # Reorder the dimensions
        return tf.transpose(x, perm=[0, 2, 1, 3])

    @classmethod
    def sdp_attention(cls, q, k, v, mask=None):
        """
        Calculate the (scaled dot product) attention weights.
        q, k, v must have matching leading dimensions.
        k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
        The mask has different shapes depending on its type(padding or look ahead)
        but it must be broadcastable for addition.

        See: https://www.tensorflow.org/tutorials/text/transformer#scaled_dot_product_attention

        :param q: query shape == (..., seq_len_q, depth)
        :param k: key shape == (..., seq_len_k, depth)
        :param v: value shape == (..., seq_len_v, depth_v)
        :param mask: Float tensor with shape broadcastable
               to (..., seq_len_q, seq_len_k). Defaults to None.
        :return: output, attention_weights
        """
        # (..., seq_len_q, seq_len_k)
        matmul_qk = tf.matmul(q, k, transpose_b=True)

        # scale matmul_qk
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Entries where the mask is 1 get a large negative logit so that
        # they receive (effectively) zero weight after the softmax.
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # softmax is normalized on the last axis (seq_len_k) so that the scores
        # add up to 1. # (..., seq_len_q, seq_len_k)
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

        # (..., seq_len_q, depth_v)
        output = tf.matmul(attention_weights, v)

        return output, attention_weights


class DecoderLayer(tfl.Layer):
    def __init__(
            self,
            n_model_units: int,
            n_heads: int,
            n_ff_units: int,
            dropout_rate: float = 0.1,
            **kwargs
    ):
        """
        See: https://www.tensorflow.org/tutorials/text/transformer#decoder_layer

        :param n_model_units: Number of dimensions for the core model
        :param n_heads: Number of heads
        :param n_ff_units: Number of units in the hidden feed forward layer
        :param dropout_rate: The dropout rate
        :param kwargs: Standard keras layer arguments (e.g., ``name``).
               Accepting them allows ``from_config(get_config())`` to work.
        """
        super().__init__(**kwargs)

        self.n_model_units = n_model_units
        self.n_heads = n_heads
        self.n_ff_units = n_ff_units
        self.dropout_rate = dropout_rate

        # Attention (kept as a single element list so that the attribute
        # structure stays the same as before)
        self.mha = [MultiHeadAttention(n_model_units, n_heads) for _ in range(1)]

        # Feed Forward
        self.ffn = self._make_feed_forward_layers(n_model_units, n_ff_units)

        # Normalization: one for the attention and one for the feed forward
        # sub-layer
        self.layer_norm = [tfl.LayerNormalization(epsilon=1e-6) for _ in range(2)]

        # Dropout: one for the attention and one for the feed forward
        # sub-layer
        self.dropout = [tfl.Dropout(dropout_rate) for _ in range(2)]

    def get_config(self):
        """
        :return: The layer configuration including the constructor arguments.
        """
        config = super().get_config()
        config.update({
            'n_model_units': self.n_model_units,
            'n_heads': self.n_heads,
            'n_ff_units': self.n_ff_units,
            'dropout_rate': self.dropout_rate,
        })

        return config

    # noinspection PyMethodOverriding
    def call(self, x, training, look_ahead_mask):
        """
        Apply self attention followed by the feed forward network, each
        with dropout, a residual connection and layer normalization.

        :param x: The input tensor.
        :param training: ``True`` if we are training (enables dropout).
        :param look_ahead_mask: The mask passed to the self attention.
        :return: A tuple of the layer output and the attention weights.
        """
        # (batch_size, target_seq_len, n_model_units)
        attn, attn_weights_block = self.mha[0](x, x, x, look_ahead_mask)
        attn = self.dropout[0](attn, training=training)
        out = self.layer_norm[0](attn + x)

        # There isn't an encoder so we don't need the other layers used
        # in the transformer tutorial.

        # (batch_size, target_seq_len, n_model_units)
        ffn_output = self.ffn(out)
        ffn_output = self.dropout[1](ffn_output, training=training)
        result = self.layer_norm[1](ffn_output + out)  # (batch_size, target_seq_len, n_model_units)

        return result, attn_weights_block

    @classmethod
    def _make_feed_forward_layers(cls, n_model_units, n_ff_units):
        """
        :param n_model_units: The number of output units.
        :param n_ff_units: The number of hidden (relu) units.
        :return: A two layer feed forward network.
        """
        return tf.keras.Sequential([
            # (batch_size, seq_len, dff)
            tfl.Dense(n_ff_units, activation='relu'),

            # (batch_size, seq_len, n_model_units)
            tfl.Dense(n_model_units)
        ])


class Decoder(tfl.Layer):
    def __init__(
            self,
            n_layers: int,
            n_model_units: int,
            n_heads: int,
            n_ff_units: int,
            vocab_size: int,
            max_position_encoding: int,
            dropout_rate: float = 0.1,
            **kwargs
    ):
        """
        See: https://www.tensorflow.org/tutorials/text/transformer#decoder

        :param n_layers: The number of layers.
        :param n_model_units: The number of units in the base "model" layers.
               This must be an exact multiple of ``n_heads``.
        :param n_heads: The number of heads.
        :param n_ff_units: The number of units in the feed forward layers.
        :param vocab_size: The number of words in the vocabulary including
               the start, end, and padding tokens (typically 0).
        :param max_position_encoding: The maximum length of a sequence.
        :param dropout_rate: The dropout rate.
        :param kwargs: Standard keras layer arguments (e.g., ``name``).
               Accepting them allows ``from_config(get_config())`` to work.
        """
        super().__init__(**kwargs)

        self.n_layers = n_layers
        self.n_model_units = n_model_units
        self.n_heads = n_heads
        self.n_ff_units = n_ff_units
        self.vocab_size = vocab_size
        self.max_position_encoding = max_position_encoding
        self.dropout_rate = dropout_rate

        self.embedding = tfl.Embedding(vocab_size, n_model_units)
        self.pos_encoding = PositionEncoding(max_position_encoding, n_model_units)

        self.decoder_layers = [
            DecoderLayer(n_model_units, n_heads, n_ff_units, dropout_rate)
            for _ in range(n_layers)
        ]
        self.dropout = tfl.Dropout(dropout_rate)

    def get_config(self):
        """
        :return: The layer configuration including the constructor arguments.
        """
        config = super().get_config()
        config.update({
            'n_layers': self.n_layers,
            'n_model_units': self.n_model_units,
            'n_heads': self.n_heads,
            'n_ff_units': self.n_ff_units,
            'vocab_size': self.vocab_size,
            'max_position_encoding': self.max_position_encoding,
            'dropout_rate': self.dropout_rate,
        })

        return config

    # noinspection PyMethodOverriding
    def call(
            self,
            x: tf.Tensor,
            training: bool,
            lookahead_mask: tf.Tensor,
            **kwargs
    ):
        """
        Embed the token ids, add the positional encoding and run the result
        through every decoder layer.

        :param x: The token ids.
        :param training: ``True`` if we are training (enables dropout).
        :param lookahead_mask: The mask handed to every decoder layer.
        :return: A tuple of the output of the last layer and a dictionary
                 with the attention weights of each layer keyed by
                 ``decoder_layer{i}_block_1`` (``i`` starts at 1).
        """
        attention_weights = {}

        # (batch_size, target_seq_len, n_model_units)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.n_model_units, tf.float32))
        x = self.pos_encoding(x)

        x = self.dropout(x, training=training)

        for layer_no, decoder_layer in enumerate(self.decoder_layers, 1):
            x, block_1 = decoder_layer(x, training, lookahead_mask)
            attention_weights[f'decoder_layer{layer_no}_block_1'] = block_1

        return x, attention_weights


class DecoderOnlyLanguageModel(tf.keras.models.Model):
    def __init__(
            self,
            n_layers: int,
            n_model_units: int,
            n_heads: int,
            n_ff_units: int,
            vocab_size: int,
            seq_len: int,
            dropout: float,
            **kwargs
    ):
        """
        A language model made of a transformer decoder followed by a dense
        projection onto the vocabulary.

        :param n_layers: The number of decoder layers.
        :param n_model_units: The number of units in the "model" layers.
        :param n_heads: The number of attention heads.
        :param n_ff_units: The number of units in the feed forward layers.
        :param vocab_size: The number of words in the vocabulary.
        :param seq_len: The sequence length (stored, not otherwise used by
               this class).
        :param dropout: The dropout rate.
        :param kwargs: Forwarded to :class:`tf.keras.models.Model`.
        """
        super().__init__(**kwargs)

        self.n_layers = n_layers
        self.n_model_units = n_model_units
        self.n_heads = n_heads
        self.n_ff_units = n_ff_units
        self.vocab_size = vocab_size
        # NOTE(review): the number of precomputed positions is tied to the
        # vocabulary size rather than to ``seq_len`` -- confirm this is
        # intended.
        self.pos_encoding = self.vocab_size
        self.seq_len = seq_len
        self.dropout = dropout

        self.decoder = Decoder(
            n_layers=n_layers,
            n_model_units=n_model_units,
            n_heads=n_heads,
            n_ff_units=n_ff_units,
            vocab_size=self.vocab_size,
            max_position_encoding=self.pos_encoding,
            dropout_rate=self.dropout
        )

        # Projects the decoder output onto (unnormalized) vocabulary scores.
        self.final_layer = tfl.Dense(self.vocab_size)

    # noinspection PyMethodOverriding
    def call(self, inputs: Tuple[tf.Tensor, tf.Tensor], training: bool):
        """
        Run the decoder and project its output onto the vocabulary.

        :param inputs: A ``(token ids, lookahead mask)`` tuple.
        :param training: ``True`` if we are training.
        :return: A tuple of the projected output and the attention weights
                 returned by the decoder.
        """
        tokens, lookahead_mask = inputs
        decoded, attention_weights = self.decoder(tokens, training, lookahead_mask)

        return self.final_layer(decoded), attention_weights

    def train_step(self, data):
        """
        Run a single optimization step.

        :param data: A ``(inputs, targets)`` tuple.
        :return: A dictionary holding only the loss of this step.
        """
        tokens, y_true = data
        model_inputs = (tokens, self.make_lookahead_mask(tf.shape(tokens)[1]))

        with tf.GradientTape() as tape:
            y_pred, _ = self(model_inputs, True)
            loss = self.compiled_loss(y_true, y_pred)

        variables = self.trainable_variables
        self.optimizer.apply_gradients(
            zip(tape.gradient(loss, variables), variables)
        )

        # The compiled metrics are deliberately skipped while training:
        # they are kind of interesting to track, but slow to compute.
        return {'loss': loss}

    def test_step(self, data):
        """
        Evaluate a single batch.

        :param data: A ``(inputs, targets)`` tuple.
        :return: A dictionary with every metric result and the loss.
        """
        tokens, y_true = data
        model_inputs = (tokens, self.make_lookahead_mask(tf.shape(tokens)[1]))

        y_pred, _ = self(model_inputs, False)
        loss = self.compiled_loss(y_true, y_pred)
        self.compiled_metrics.update_state(y_true, y_pred)

        result = {}
        for metric in self.metrics:
            result[metric.name] = metric.result()

        # Set last so that it takes precedence over a metric named 'loss'.
        result['loss'] = loss

        return result

    @classmethod
    def make_padding_mask(cls, seq: tf.Tensor):
        """
        Build a mask that flags the padding (value 0) entries of ``seq``
        with 1 and every other entry with 0.

        :param seq: The token ids, indexed as (batch_size, seq_len).
        :return: A float mask of shape (batch_size, 1, 1, seq_len).
        """
        is_padding = tf.math.equal(seq, 0)
        mask = tf.cast(is_padding, tf.float32)

        # The two extra axes let the mask broadcast against the
        # attention logits.
        return mask[:, tf.newaxis, tf.newaxis, :]

    @classmethod
    def make_lookahead_mask(cls, size: int):
        """
        Build a mask that hides the elements that have not been seen yet.
        Row ``i`` holds 0 in columns ``0..i`` and 1 in every later column,
        e.g., the first row is 0 in the first column and 1 elsewhere.

        :param size: The size of the mask (i.e., the sequence length).
        :return: The mask of shape (``size``, ``size``).
        """
        # https://www.tensorflow.org/api_docs/python/tf/linalg/band_part
        # band_part(..., -1, 0) keeps the lower triangle (with the diagonal),
        # so the complement has 1s strictly above the diagonal.
        visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)

        return 1 - visible  # (seq_len, seq_len)
# endregion Transformer Model


# region Losses and Metrics
class MaskedSparseCategoricalCrossentropy(tf.keras.losses.Loss):
    def __init__(
            self,
            mask_value: Union[int, float] = 0,
            name: str = 'MaskedSparseCategoricalCrossentropy',
            **kwargs
    ):
        """
        Sparse categorical crossentropy (computed from logits) that leaves
        the entries whose target equals ``mask_value`` out of the average.

        See: https://www.tensorflow.org/tutorials/text/transformer#loss_and_metrics

        :param mask_value: Targets equal to this value are ignored.
        :param name: The name of the operation.
        :param kwargs: Forwarded to :class:`tf.keras.losses.Loss`.
        """
        super().__init__(name=name, **kwargs)

        self.mask_value = mask_value

    def get_config(self):
        """
        :return: The base configuration extended with ``mask_value``.
        """
        return {**super().get_config(), 'mask_value': self.mask_value}

    @tf.function
    def call(self, y_true: tf.Tensor, y_pred: tf.Tensor) -> float:
        """
        Average the crossentropy over the entries that are not masked.

        :param y_true: The true class ids.
        :param y_pred: The predicted logits.
        :return: The summed crossentropy of the unmasked entries divided by
                 their count (0 when every entry is masked).
        """
        per_entry = tf.nn.sparse_softmax_cross_entropy_with_logits(y_true, y_pred)
        keep = tf.cast(
            tf.math.logical_not(tf.math.equal(y_true, self.mask_value)),
            per_entry.dtype
        )

        total = tf.reduce_sum(per_entry * keep)
        n_kept = tf.reduce_sum(keep)

        # divide_no_nan yields 0 instead of NaN when nothing is kept.
        return tf.math.divide_no_nan(total, n_kept)


class MaskedSparseCategoricalCrossentropyMetric(tf.keras.metrics.Metric):
    def __init__(
            self,
            mask_value: Union[int, float] = 0,
            name='masked_sparse_categorical_crossentropy',
            **kwargs
    ):
        """
        A running mean of the sparse categorical crossentropy (computed from
        logits) that ignores entries whose target equals ``mask_value``.

        :param mask_value: Targets equal to this value are ignored.
        :param name: The name of the metric.
        :param kwargs: Forwarded to :class:`tf.keras.metrics.Metric`.
        """
        super().__init__(name=name, **kwargs)

        self.mask_value = mask_value
        self.crossentropy = tf.keras.metrics.Mean()

    def get_config(self):
        """
        :return: The base configuration extended with ``mask_value`` so
                 that the value is not lost when the metric is serialized.
        """
        config = super().get_config()
        config.update({
            'mask_value': self.mask_value
        })

        return config

    def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, *args, **kwargs):
        """
        Accumulate the crossentropy of a batch.

        :param y_true: The true class ids.
        :param y_pred: The predicted logits.
        """
        loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(y_true, y_pred)
        mask = tf.cast(tf.math.logical_not(tf.math.equal(y_true, self.mask_value)), loss_.dtype)

        # Using the sample_weight parameter is easier than the
        # method in the loss version and will also handle empty
        # batches smoothly.
        self.crossentropy.update_state(loss_, sample_weight=mask)

    def result(self):
        """
        :return: The mean crossentropy accumulated so far.
        """
        return self.crossentropy.result()

    def reset_states(self):
        """
        Reset the accumulated mean.
        """
        self.crossentropy.reset_states()


class MaskedSparseCategoricalAccuracy(tf.keras.metrics.Metric):
    def __init__(
            self,
            mask_value: Union[int, float] = 0,
            name='masked_sparse_categorical_accuracy',
            **kwargs
    ):
        """
        A running accuracy that ignores entries whose target equals
        ``mask_value``.

        :param mask_value: Targets equal to this value are ignored.
        :param name: The name of the metric.
        :param kwargs: Forwarded to :class:`tf.keras.metrics.Metric`.
        """
        super().__init__(name=name, **kwargs)

        self.mask_value = mask_value
        self.accuracy = tf.keras.metrics.Mean()

    def get_config(self):
        """
        :return: The base configuration extended with ``mask_value`` so
                 that the value is not lost when the metric is serialized.
        """
        config = super().get_config()
        config.update({
            'mask_value': self.mask_value
        })

        return config

    def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, *args, **kwargs):
        """
        Accumulate the accuracy of a batch.

        :param y_true: The true class ids.
        :param y_pred: The predicted scores; the class is taken as the
               argmax over the last axis.
        """
        # Compare in the dtype of the targets so that labels of any
        # integer width (not just int32) are supported.
        predicted = tf.cast(tf.math.argmax(y_pred, axis=-1), y_true.dtype)
        correct = tf.cast(tf.math.equal(y_true, predicted), y_pred.dtype)

        mask = tf.cast(tf.math.logical_not(tf.math.equal(y_true, self.mask_value)), correct.dtype)

        # Masked entries get a weight of 0 and so do not count.
        self.accuracy.update_state(correct, sample_weight=mask)

    def result(self):
        """
        :return: The accuracy accumulated so far.
        """
        return self.accuracy.result()

    def reset_states(self):
        """
        Reset the accumulated accuracy.
        """
        self.accuracy.reset_states()
# endregion Losses and Metrics


# region Make Vocabulary
def py_vocab_from_file(
        filename: str,
        unk: Union[str, bytes] = b'<unk>',
        start: Union[str, bytes] = b'<s>',
        end: Union[str, bytes] = b'</s>'
):
    """
    Build a word to id mapping from a whitespace tokenized text file.

    Id 0 is reserved for the empty (padding) token. The words found in the
    file get the ids 1..N in sorted order, so that the mapping is the same
    on every run. The special tokens are appended when they do not
    already occur in the file.

    :param filename: Path to the text file. It is read with gzip when the
           name ends with ``.gz``.
    :param unk: The unknown word token.
    :param start: The start of sequence token.
    :param end: The end of sequence token.
    :return: A dictionary from (bytes) words to integer ids.
    """
    def _as_bytes(token):
        # Keys are bytes, so text tokens are encoded for a consistent lookup.
        return token.encode('utf-8') if isinstance(token, str) else token

    unique_words = set()
    in_open = gzip.open if filename.endswith('.gz') else open

    # Always read bytes. Mode 'r' is binary for gzip.open but text for the
    # builtin open, which would mix str words with the bytes special tokens.
    with in_open(filename, 'rb') as fh:
        for line in fh:
            unique_words.update(line.split())

    word2id = {b'': 0}
    word2id.update({word: idx for idx, word in enumerate(sorted(unique_words), 1)})

    for special in (unk, start, end):
        # The default is only used (and the id only consumed) when the
        # token is missing.
        word2id.setdefault(_as_bytes(special), len(word2id))

    return word2id


def tf_vocab_from_dict(word2id: Dict[str, int], unk: str = b'<unk>'):
    """
    Turn a python word to id mapping into a static tensorflow lookup table.

    :param word2id: The mapping from words to ids. It must contain ``unk``.
    :param unk: The word whose id is returned for words that are missing
           from the table.
    :return: The :class:`tf.lookup.StaticHashTable`.
    """
    initializer = tf.lookup.KeyValueTensorInitializer(
        tf.constant(list(word2id)),
        tf.constant(list(word2id.values()))
    )

    return tf.lookup.StaticHashTable(initializer, word2id[unk])
# endregion Make Vocabulary


# region Dataset Creation
def make_dataset(
        filename: str,
        word2id: tf.lookup.StaticHashTable,
        batch_size: int,
        seq_len: int,
        shuffle: bool = False,
        shingle: bool = False,
        buffer_size: int = 10000
) -> tf.data.Dataset:
    """
    Create batches of ``(source, target)`` id sequences from a text file,
    where the target is the source shifted by one token.

    :param filename: Path to the text file (read as GZIP when the name
           ends with ``.gz``).
    :param word2id: The table used to map the words to ids.
    :param batch_size: The number of sequences per batch.
    :param seq_len: The length the source and target sequences are padded
           to.
    :param shuffle: ``True`` to shuffle the sequences (each iteration).
    :param shingle: ``True`` to slide the window one token at a time
           instead of almost a full window.
    :param buffer_size: The size of the shuffle buffer.
    :return: The batched and prefetched dataset.
    """
    # One extra token is needed per window: the source drops the last
    # token and the target drops the first one.
    window_len = seq_len + 1

    # Without shingling consecutive windows share exactly one token: the
    # last token of a window is the first token of the next one.
    shift = 1 if shingle else window_len - 1

    # Assume the dataset is compressed if it ends with .gz
    compression = 'GZIP' if filename.endswith('.gz') else ''

    # A flat stream of word ids
    token_ids = (
        tf.data.TextLineDataset(filename, compression_type=compression)
        .map(tf.strings.strip)
        .filter(lambda line: tf.not_equal(tf.strings.length(line), 0))
        .map(tf.strings.split)
        .unbatch()
        .map(lambda t: word2id.lookup(t))
    )

    windows = (
        token_ids
        .window(window_len, shift)
        .flat_map(lambda w: w.batch(window_len))
    )

    pairs = windows.map(lambda t: (t[:-1], t[1:])).cache()

    if shuffle:
        pairs = pairs.shuffle(buffer_size, reshuffle_each_iteration=True)

        if shingle:
            # Keep only every ``window_len``-th sequence.
            pairs = pairs.shard(window_len, 0)

    batches = pairs.padded_batch(batch_size, padded_shapes=(seq_len, seq_len))

    return batches.prefetch(tf.data.experimental.AUTOTUNE)
# endregion Dataset Creation


# region Main Program
def main(args: argparse.Namespace):
    """
    Build the vocabulary and datasets, then train the language model.

    :param args: The parsed command line arguments.
    """
    # The vocabulary is built from the training file only.
    py_vocab = py_vocab_from_file(args.train_file)
    tf_vocab = tf_vocab_from_dict(py_vocab)

    train_data = make_dataset(
        filename=args.train_file,
        word2id=tf_vocab,
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        shuffle=args.shuffle,
        shingle=args.shingle
    )
    valid_data = make_dataset(
        filename=args.valid_file,
        word2id=tf_vocab,
        batch_size=args.batch_size,
        seq_len=args.seq_len
    )

    model = DecoderOnlyLanguageModel(
        n_layers=args.n_layers,
        n_model_units=args.n_model_units,
        n_heads=args.n_heads,
        n_ff_units=args.n_ff_units,
        vocab_size=len(py_vocab),
        seq_len=args.seq_len,
        dropout=args.dropout
    )

    # A standard Adam optimizer seems to work better than the custom
    # schedule.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(args.learning_rate),
        loss=MaskedSparseCategoricalCrossentropy(),
        metrics=[
            MaskedSparseCategoricalCrossentropyMetric(name='xent'),
            MaskedSparseCategoricalAccuracy(name='acc')
        ]
    )

    model.fit(train_data, epochs=args.n_epochs, validation_data=valid_data)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Data
    parser.add_argument('--train-file', required=True, help="Path to WikiText-2 training file.")
    parser.add_argument('--valid-file', required=True, help="Path to WikiText-2 valid file.")
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--seq-len', type=int, default=100)
    parser.add_argument('--shuffle', action='store_true')
    parser.add_argument('--shingle', action='store_true')

    # Model
    parser.add_argument('--n-layers', type=int, default=4)
    parser.add_argument('--n-model-units', type=int, default=64)
    parser.add_argument('--n-heads', type=int, default=8)
    parser.add_argument('--n-ff-units', type=int, default=512)
    parser.add_argument('--dropout', type=float, default=0.2)

    # Training
    parser.add_argument('--n-epochs', type=int, default=15)
    parser.add_argument('--learning-rate', type=float, default=0.0002)

    # The entry point is carried by the parsed namespace.
    parser.set_defaults(func=main)

    args = parser.parse_args()
    args.func(args)
# endregion Main Program
# Comments (0)

# HTTPS
# SSH
# You can clone a snippet to your computer for local editing. Learn more.
# Snippets

# Reid Swanson Transformer based language model

# Comments (0)