diff --git a/examples/machine_translation/README.md b/examples/machine_translation/README.md
new file mode 100644
index 0000000000..8ce1168604
--- /dev/null
+++ b/examples/machine_translation/README.md
@@ -0,0 +1,74 @@
+# An example of building a machine translation dataset and training a translation model
+
+This example trains a machine translation model with two different input representations: one uses a character-composed embedding, the other a regular word embedding. A character-composed embedding is built by running a CNN over the characters of each word in a sentence. The CNN still produces one embedding vector per word, but it constructs that vector from characters rather than from a word-level lookup table. For example:
+
+```
+sentence: I love torchtext
+character representation: [['I'], ['l', 'o', 'v', 'e'], ['t', 'o', 'r', 'c', 'h', 't', 'e', 'x', 't']]
+```
+
+The character representation is fed into the CNN layer, which produces an `m x n` matrix, where `m` is the length of the sentence and `n` is the embedding size.
+
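+The snippet below is a small, self-contained sketch of this idea (the vocabulary, sizes, and variable names are made up purely for illustration); the actual module used by `train_char.py` is `WordCharCNNEmbedding` in `embedding.py`:
+
+```python
+import torch
+import torch.nn as nn
+
+sentence = [["I"], list("love"), list("torchtext")]
+char_vocab = {c: i + 1 for i, c in enumerate(sorted(set("Ilovetorchtext")))}  # 0 is padding
+max_word_len = max(len(word) for word in sentence)
+
+char_emb_dim, out_channels = 8, 16
+char_embedding = nn.Embedding(len(char_vocab) + 1, char_emb_dim, padding_idx=0)
+conv = nn.Conv1d(char_emb_dim, out_channels, kernel_size=3, padding=2)
+pool = nn.AdaptiveMaxPool1d(1)
+
+# Pad every word to the same number of characters and look up the char indices.
+ids = torch.tensor([[char_vocab[c] for c in word] + [0] * (max_word_len - len(word)) for word in sentence])
+x = char_embedding(ids)             # [num_words, max_word_len, char_emb_dim]
+x = conv(x.transpose(1, 2))         # [num_words, out_channels, conv_len]
+word_vectors = pool(x).squeeze(-1)  # [num_words, out_channels] -- the `m x n` matrix
+print(word_vectors.shape)           # torch.Size([3, 16])
+```
+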
+Apart from the input representation, both approaches use the exact same model (seq2seq) and training loss.
+
+## Training
+
+To try the example, run the following for character-level training
+
+```bash
+python train_char.py
+```
+
+and the following for word-level training
+
+```bash
+python train_word.py
+```
+
+## Experiment Results
+
+The following is example output from running `train_char.py`
+
+```
+The model has 5,617,503 trainable parameters
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [01:54<00:00, 1.98it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00, 2.37it/s]
+Epoch: 01 | Time: 1m 57s
+	Train Loss: 5.277 | Train PPL: 195.798 | Train BLEU: 0.001
+	 Val. Loss: 4.088 | Val. PPL: 59.598 | Val. BLEU: 0.006
+...
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [02:25<00:00, 1.56it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00, 2.03it/s]
+Epoch: 10 | Time: 2m 29s
+	Train Loss: 1.373 | Train PPL: 3.948 | Train BLEU: 0.187
+	 Val. Loss: 0.972 | Val. PPL: 2.644 | Val. BLEU: 0.280
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.95it/s]
+| Test Loss: 1.011 | Test PPL: 2.748 | Test BLEU: 0.273
+Saving model to char_mt_seq2seq.pt
+Save vocab to torchtext_char_mt_vocab.pt
+```
+
+And the following is example output from `train_word.py`
+
+```
+The model has 14,601,140 trainable parameters
+ 0%|          | 0/227 [00:00<?, ?it/s]
+...
+```
+
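+Both scripts save the trained model and the vocabulary with `torch.save` (see the `--save` and `--save_vocab` flags). A minimal sketch for loading them back, assuming the default file names shown above and that it is run from this directory so that `model.py` and `embedding.py` are importable:
+
+```python
+import torch
+
+model = torch.load("char_mt_seq2seq.pt")
+vocab = torch.load("torchtext_char_mt_vocab.pt")
+model.eval()
+```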
diff --git a/examples/machine_translation/char_dataset.py b/examples/machine_translation/char_dataset.py
new file mode 100644
--- /dev/null
+++ b/examples/machine_translation/char_dataset.py
+def build_word_vocab(data, transforms, index, init_token="<bos>", eos_token="<eos>"):
+    tok_list = [[init_token], [eos_token]]
+    return build_vocab_from_iterator(tok_list + list(map(lambda x: transforms(x[index]), data)))
+
+
+def build_char_vocab(
+    data, transforms, index, init_word_token="<w>", eos_word_token="</w>", init_sent_token="<s>", eos_sent_token="</s>",
+):
+    tok_list = [
+        [init_word_token],
+        [eos_word_token],
+        [init_sent_token],
+        [eos_sent_token],
+    ]
+    for line in data:
+        tokens = list(itertools.chain.from_iterable(transforms(line[index])))
+        tok_list.append(tokens)
+    return build_vocab_from_iterator(tok_list)
+
+
+def char_vocab_func(vocab):
+    def func(tok_iter):
+        return [[vocab[char] for char in word] for word in tok_iter]
+
+    return func
+
+
+def special_char_tokens_func(
+    init_word_token="<w>", eos_word_token="</w>", init_sent_token="<s>", eos_sent_token="</s>",
+):
+    def func(tok_iter):
+        result = [[init_word_token, init_sent_token, eos_word_token]]
+        result += [[init_word_token] + word + [eos_word_token] for word in tok_iter]
+        result += [[init_word_token, eos_sent_token, eos_word_token]]
+        return result
+
+    return func
+
+
+def special_word_token_func(init_word_token="<bos>", eos_word_token="<eos>"):
+    def func(tok_iter):
+        return [init_word_token] + tok_iter + [eos_word_token]
+
+    return func
+
+
+def parallel_transforms(*transforms):
+    def func(txt_input):
+        result = []
+        for transform in transforms:
+            result.append(transform(txt_input))
+        return tuple(result)
+
+    return func
+
+
+def get_dataset(dataset_name: str):
+    # Get the raw dataset first. This will give us the text
+    # version of the dataset
+    train, test, val = DATASETS[dataset_name]()
+    # Cache training data for vocabulary construction
+    train_data = [line for line in train]
+    val_data = [line for line in val]
+    test_data = [line for line in test]
+    # Setup word tokenizer
+    src_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")
+    tgt_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
+    # Setup char tokenizer
+
+    def remove_extra_whitespace(line):
+        return re.sub(" {2,}", " ", line)
+
+    src_char_transform = sequential_transforms(remove_extra_whitespace, src_tokenizer, partial(map, list))
+    tgt_char_transform = sequential_transforms(remove_extra_whitespace, tgt_tokenizer, partial(map, list))
+    tgt_word_transform = sequential_transforms(remove_extra_whitespace, tgt_tokenizer)
+
+    # Setup vocabularies (both words and chars)
+    src_char_vocab = build_char_vocab(train_data, src_char_transform, index=0)
+    tgt_char_vocab = build_char_vocab(train_data, tgt_char_transform, index=1)
+    tgt_word_vocab = build_word_vocab(train_data, tgt_word_transform, 1)
+
+    # Building the dataset with character level tokenization
+    src_char_transform = sequential_transforms(
+        src_char_transform, special_char_tokens_func(), char_vocab_func(src_char_vocab)
+    )
+    tgt_char_transform = sequential_transforms(
+        tgt_char_transform, special_char_tokens_func(), char_vocab_func(tgt_char_vocab)
+    )
+    tgt_word_transform = sequential_transforms(
+        tgt_word_transform, special_word_token_func(), vocab_func(tgt_word_vocab)
+    )
+    tgt_transform = parallel_transforms(tgt_char_transform, tgt_word_transform)
+    train_dataset = TranslationDataset(
+        train_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+    )
+    val_dataset = TranslationDataset(
+        val_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+    )
+    test_dataset = TranslationDataset(
+        test_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+    )
+
+    return train_dataset, val_dataset, test_dataset
diff --git a/examples/machine_translation/embedding.py b/examples/machine_translation/embedding.py
new file mode 100644
index 0000000000..ab2ef1c6fb
--- /dev/null
+++ b/examples/machine_translation/embedding.py
@@ -0,0 +1,85 @@
+from typing import NamedTuple
+
+import torch.nn as nn
+
+
+class WordCharCNNEmbedding(nn.Module):
+    """A character-composed word embedding built from a CNN and a pooling layer,
+    with dropout applied before the convolution and after the pooling.
+    """
+
+    def __init__(
+        self,
+        ntokens: int,
+        char_embedding_dim: int = 30,
+        char_padding_idx: int = 1,
+        dropout: float = 0.5,
+        kernel_size: int = 3,
+        out_channels: int = 30,
+        target_emb: int = 300,
+        use_highway: bool = False,
+    ):
+        super(WordCharCNNEmbedding, self).__init__()
+        self._use_highway = use_highway
+
+        if self._use_highway and out_channels != target_emb:
+            raise ValueError("out_channels and target_emb must be equal in highway setting")
+
+        self.char_embedding = nn.Embedding(ntokens, char_embedding_dim, char_padding_idx)
+        self.conv_embedding = nn.Sequential(
+            nn.Dropout(p=dropout),
+            nn.Conv1d(
+                in_channels=char_embedding_dim,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                padding=kernel_size - 1,
+            ),
+            nn.AdaptiveMaxPool1d(1),
+        )
+        self.proj_layer = nn.Linear(out_channels, target_emb)
+        self.out_dropout = nn.Dropout(p=dropout)
+        self._char_padding_idx = char_padding_idx
+
+        self.init_weights()
+
+    def init_weights(self):
+        """Initialize the character embedding weights uniformly in [-0.1, 0.1]
+        and reinitialize the padding vector to zero.
+        """
+
+        self.char_embedding.weight.data.uniform_(-0.1, 0.1)
+        # Reinitialize vectors at padding_idx to have 0 value
+        self.char_embedding.weight.data[self._char_padding_idx].uniform_(0, 0)
+
+    def forward(self, chars):
+        """Run the forward calculation of the char-cnn embedding
+        model.
+        Args:
+            chars (torch.Tensor): An integer tensor with the size of
+                [seq_len x batch x char_size]
+        Returns:
+            proj_char_embedding_vec (torch.Tensor): An embedding tensor with
+                the size of [seq_len x batch x target_emb]
+        """
+        char_embedding_vec = self.char_embedding(chars)
+        # Reshape the character embedding to the size of
+        # [seq_len * batch, char_len, char_dim]
+        char_embedding_vec = char_embedding_vec.view(
+            -1, char_embedding_vec.size(2), char_embedding_vec.size(3)
+        ).contiguous()
+        # Transpose the embedding into [seq_len * batch, char_dim, char_len]
+        char_embedding_vec = char_embedding_vec.transpose(1, 2).contiguous()
+        # Apply char embedding with dropout and convolution
+        # layers so the dim now will be [seq_len * batch, out_channel, new_len]
+        char_embedding_vec = self.conv_embedding(char_embedding_vec)
+        char_embedding_vec = char_embedding_vec.squeeze(-1)
+        # Revert the size back to [seq_len, batch, out_channel]
+        char_embedding_vec = char_embedding_vec.view(chars.size(0), chars.size(1), -1).contiguous()
+        char_embedding_vec = self.out_dropout(char_embedding_vec)
+        proj_char_embedding_vec = self.proj_layer(char_embedding_vec)
+        # Apply highway connection between projection layer and
+        # pooling layer
+        if self._use_highway:
+            proj_char_embedding_vec += char_embedding_vec
+
+        return proj_char_embedding_vec
diff --git a/examples/machine_translation/model.py b/examples/machine_translation/model.py
new file mode 100644
index 0000000000..9818ce8bd3
--- /dev/null
+++ b/examples/machine_translation/model.py
@@ -0,0 +1,151 @@
+import random
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self, input_dim: int, emb_dim: int, enc_hid_dim: int, dec_hid_dim: int, dropout: float, embedding: nn.Module
+    ):
+        super().__init__()
+
+        self.input_dim = input_dim
+        self.emb_dim = emb_dim
+        self.enc_hid_dim = enc_hid_dim
+        self.dec_hid_dim = dec_hid_dim
+        self.dropout = dropout
+
+        self.embedding = embedding
+        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
+        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
+        self.dropout = 
nn.Dropout(dropout) + + self.init_weights() + + def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor]: + embedded = self.dropout(self.embedding(src)) + outputs, hidden = self.rnn(embedded) + hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))) + + return outputs, hidden + + def init_weights(self): + for name, param in self.named_parameters(): + if "weight" in name: + nn.init.normal_(param.data, mean=0, std=0.01) + else: + nn.init.constant_(param.data, 0) + + +class Attention(nn.Module): + def __init__(self, enc_hid_dim: int, dec_hid_dim: int, attn_dim: int): + super().__init__() + + self.enc_hid_dim = enc_hid_dim + self.dec_hid_dim = dec_hid_dim + self.attn_in = (enc_hid_dim * 2) + dec_hid_dim + self.attn = nn.Linear(self.attn_in, attn_dim) + + self.init_weights() + + def forward(self, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor) -> torch.Tensor: + src_len = encoder_outputs.shape[0] + repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1) + encoder_outputs = encoder_outputs.permute(1, 0, 2) + energy = torch.tanh(self.attn(torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2))) + attention = torch.sum(energy, dim=2) + + return F.softmax(attention, dim=1) + + def init_weights(self): + for name, param in self.named_parameters(): + if "weight" in name: + nn.init.normal_(param.data, mean=0, std=0.01) + else: + nn.init.constant_(param.data, 0) + + +class Decoder(nn.Module): + def __init__( + self, + output_dim: int, + emb_dim: int, + enc_hid_dim: int, + dec_hid_dim: int, + dropout: int, + attention: nn.Module, + embedding: nn.Module, + ): + super().__init__() + + self.emb_dim = emb_dim + self.enc_hid_dim = enc_hid_dim + self.dec_hid_dim = dec_hid_dim + self.output_dim = output_dim + self.dropout = dropout + self.attention = attention + + self.embedding = embedding + self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim) + self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim) + self.dropout = nn.Dropout(dropout) + + self.init_weights() + + def _weighted_encoder_rep(self, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor) -> torch.Tensor: + a = self.attention(decoder_hidden, encoder_outputs) + a = a.unsqueeze(1) + encoder_outputs = encoder_outputs.permute(1, 0, 2) + weighted_encoder_rep = torch.bmm(a, encoder_outputs) + weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2) + + return weighted_encoder_rep + + def forward( + self, input: torch.Tensor, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor + ) -> Tuple[torch.Tensor]: + + input = input.unsqueeze(0) + embedded = self.dropout(self.embedding(input)) + weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, encoder_outputs) + rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2) + output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0)) + embedded = embedded.squeeze(0) + output = output.squeeze(0) + weighted_encoder_rep = weighted_encoder_rep.squeeze(0) + output = self.out(torch.cat((output, weighted_encoder_rep, embedded), dim=1)) + + return output, decoder_hidden.squeeze(0) + + def init_weights(self): + for name, param in self.named_parameters(): + if "weight" in name: + nn.init.normal_(param.data, mean=0, std=0.01) + else: + nn.init.constant_(param.data, 0) + + +class Seq2Seq(nn.Module): + def __init__(self, encoder: nn.Module, decoder: nn.Module, device: torch.device): + super().__init__() + + self.encoder = encoder + self.decoder = decoder + self.device = device + + def 
forward(self, src: torch.Tensor, trg: torch.Tensor) -> torch.Tensor: + batch_size = src.shape[1] + max_len = trg.shape[0] + trg_vocab_size = self.decoder.output_dim + outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device) + encoder_outputs, hidden = self.encoder(src) + + # first input to the decoder is the token + for t in range(max_len): + output, hidden = self.decoder(trg[t], hidden, encoder_outputs) + outputs[t] = output + + return outputs diff --git a/examples/machine_translation/train_char.py b/examples/machine_translation/train_char.py new file mode 100644 index 0000000000..66f4e85d74 --- /dev/null +++ b/examples/machine_translation/train_char.py @@ -0,0 +1,187 @@ +import argparse +import math +import time + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data.dataloader import DataLoader +from tqdm import tqdm + +from char_dataset import get_dataset +from embedding import WordCharCNNEmbedding +from model import Attention, Decoder, Encoder, Seq2Seq +from torchtext.data.metrics import bleu_score +from torchtext.vocab import Vocab +from utils import count_parameters, epoch_time, pad_chars, pad_words, seed_everything + + +def train( + model: nn.Module, + iterator: DataLoader, + optimizer: optim.Optimizer, + criterion: nn.Module, + clip: float, + trg_vocab: Vocab, + device: torch.device, +): + model.train() + + epoch_loss = 0 + bleu = 0.0 + for _, batch in tqdm(enumerate(iterator), total=len(iterator)): + # Need to convert to [seq_len x batch x ...] size + src = batch[0].transpose(0, 1).to(device) + trg_char = batch[1][0].transpose(0, 1).to(device) + trg_word = batch[1][1].transpose(0, 1).to(device) + optimizer.zero_grad() + + output = model(src, trg_char) + # Convert prediction to words + true = trg_word.transpose(0, 1).unsqueeze(1) + true = [[[trg_vocab.itos[word_idx] for word_idx in comb] for comb in sent] for sent in true] + pred = output.transpose(0, 1).argmax(-1) + pred = [[trg_vocab.itos[word_idx] for word_idx in sent] for sent in pred] + + output = output[1:].reshape(-1, output.shape[-1]) + trg = trg_word[1:].reshape(-1) + loss = criterion(output, trg) + + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), clip) + optimizer.step() + + epoch_loss += loss.item() + bleu += bleu_score(pred, true) + + return (epoch_loss / len(iterator)), (bleu / len(iterator)) + + +def evaluate(model: nn.Module, iterator: DataLoader, criterion: nn.Module, trg_vocab: Vocab, device: torch.device): + model.eval() + + epoch_loss = 0 + bleu = 0.0 + with torch.no_grad(): + for _, batch in tqdm(enumerate(iterator), total=len(iterator)): + # Need to convert to [seq_len x batch x ...] 
size
+            src = batch[0].transpose(0, 1).to(device)
+            trg_char = batch[1][0].transpose(0, 1).to(device)
+            trg_word = batch[1][1].transpose(0, 1).to(device)
+
+            output = model(src, trg_char)
+            # Convert prediction to words
+            true = trg_word.transpose(0, 1).unsqueeze(1)
+            true = [[[trg_vocab.itos[word_idx] for word_idx in comb] for comb in sent] for sent in true]
+            pred = output.transpose(0, 1).argmax(-1)
+            pred = [[trg_vocab.itos[word_idx] for word_idx in sent] for sent in pred]
+
+            output = output[1:].reshape(-1, output.shape[-1])
+            trg = trg_word[1:].reshape(-1)
+
+            loss = criterion(output, trg)
+
+            epoch_loss += loss.item()
+            bleu += bleu_score(pred, true)
+    return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def collate_fn(batch):
+    src_batch, tgt_batch = zip(*batch)
+    char_tgt_batch, word_tgt_batch = zip(*tgt_batch)
+    padded_src_batch = pad_chars(src_batch)
+    padded_tgt_char_batch = pad_chars(char_tgt_batch)
+    padded_tgt_word_batch = pad_words(word_tgt_batch)
+    return (padded_src_batch, (padded_tgt_char_batch, padded_tgt_word_batch))
+
+
+def main(args):
+    # Ensure reproducibility
+    seed_everything(args.seed)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    batch_size = args.batch_size
+    train_dataset, val_dataset, test_dataset = get_dataset(args.dataset)
+    train_iterator = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    valid_iterator = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    test_iterator = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    input_dim = len(train_dataset.vocab[0])
+    char_output_dim = len(train_dataset.vocab[1])
+    word_output_dim = len(train_dataset.vocab[2])
+
+    enc_emb = WordCharCNNEmbedding(
+        input_dim, char_padding_idx=train_dataset.vocab[0].stoi["<pad>"], target_emb=args.enc_emb_dim
+    )
+    enc = Encoder(input_dim, args.enc_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.enc_dropout, enc_emb)
+    attn = Attention(args.enc_hid_dim, args.dec_hid_dim, args.attn_dim)
+    dec_emb = WordCharCNNEmbedding(
+        char_output_dim, char_padding_idx=train_dataset.vocab[1].stoi["<pad>"], target_emb=args.dec_emb_dim
+    )
+    dec = Decoder(
+        word_output_dim, args.dec_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.dec_dropout, attn, dec_emb
+    )
+    model = Seq2Seq(enc, dec, device).to(device)
+
+    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab[2].stoi["<pad>"])
+    optimizer = optim.Adam(model.parameters(), lr=args.lr)
+    print(f"The model has {count_parameters(model):,} trainable parameters")
+    n_epochs = args.epochs
+    clip = args.clip
+
+    best_valid_loss = float("inf")
+
+    for epoch in range(n_epochs):
+
+        start_time = time.time()
+
+        train_loss, train_bleu_score = train(
+            model, train_iterator, optimizer, criterion, clip, train_dataset.vocab[2], device
+        )
+        valid_loss, valid_bleu_score = evaluate(model, valid_iterator, criterion, train_dataset.vocab[2], device)
+
+        end_time = time.time()
+
+        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
+
+        print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
+        print(
+            f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train BLEU: {train_bleu_score:7.3f}"
+        )
+        print(
+            f"\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} | Val. 
BLEU: {valid_bleu_score:7.3f}" + ) + + test_loss, test_bleu_score = evaluate(model, test_iterator, criterion, train_dataset.vocab[2], device) + + print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test BLEU: {test_bleu_score:7.3f}") + + if args.save: + print("Saving model to {}".format(args.save)) + torch.save(model.to("cpu"), args.save) + + if args.save_vocab: + print("Save vocab to {}".format(args.save_vocab)) + torch.save(train_dataset.vocab, args.save_vocab) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch Experimental Seq2seq for Machine Translation") + parser.add_argument("--enc_emb_dim", type=int, default=300, help="size of encoder char-composed embeddings") + parser.add_argument("--dec_emb_dim", type=int, default=300, help="size of decoder char-composed embeddings") + parser.add_argument("--enc_hid_dim", type=int, default=64, help="size of encoder hidden units") + parser.add_argument("--dec_hid_dim", type=int, default=64, help="size of decoder hidden units") + parser.add_argument("--attn_dim", type=int, default=8, help="size of attention weights") + parser.add_argument("--enc_dropout", type=float, default=0.5, help="dropout applied to encoder") + parser.add_argument("--dec_dropout", type=float, default=0.5, help="dropout applied to decoder") + parser.add_argument("--lr", type=float, default=0.001, help="initial learning rate") + parser.add_argument("--clip", type=float, default=1, help="gradient clipping") + parser.add_argument("--epochs", type=int, default=10, help="upper epoch limit") + parser.add_argument("--batch_size", type=int, default=128, metavar="N", help="batch size") + parser.add_argument("--seed", type=int, default=42, help="random seed") + parser.add_argument("--checkpoint", type=str, default="None", help="path to load the checkpoint") + parser.add_argument("--save", type=str, default="char_mt_seq2seq.pt", help="path to save the final model") + parser.add_argument("--save_vocab", type=str, default="torchtext_char_mt_vocab.pt", help="path to save the vocab") + parser.add_argument("--dataset", type=str, default="Multi30k", help="dataset used for MLM task") + args = parser.parse_args() + + main(args) diff --git a/examples/machine_translation/train_word.py b/examples/machine_translation/train_word.py new file mode 100644 index 0000000000..3e60dbe283 --- /dev/null +++ b/examples/machine_translation/train_word.py @@ -0,0 +1,181 @@ +import argparse +import math +import time + +import torch +import torch.nn as nn +import torch.optim as optim +from torch.utils.data.dataloader import DataLoader +from tqdm import tqdm + +from model import Attention, Decoder, Encoder, Seq2Seq +from torchtext.data.metrics import bleu_score +from torchtext.experimental.datasets import Multi30k +from torchtext.vocab import Vocab +from utils import count_parameters, epoch_time, pad_words, seed_everything + + +def train( + model: nn.Module, + iterator: DataLoader, + optimizer: optim.Optimizer, + criterion: nn.Module, + clip: float, + trg_vocab: Vocab, + device: torch.device, +): + model.train() + + epoch_loss = 0 + bleu = 0.0 + for _, batch in tqdm(enumerate(iterator), total=len(iterator)): + # Need to convert to [seq_len x batch x ...] 
size
+        src = batch[0].transpose(0, 1).to(device)
+        trg = batch[1].transpose(0, 1).to(device)
+        optimizer.zero_grad()
+
+        output = model(src, trg)
+        # Convert prediction to words
+        true = trg.transpose(0, 1).unsqueeze(1)
+        true = [
+            [[trg_vocab.itos[word_idx] for word_idx in comb if trg_vocab.itos[word_idx] != " "] for comb in sent]
+            for sent in true
+        ]
+        pred = output.transpose(0, 1).argmax(-1)
+        pred = [[trg_vocab.itos[word_idx] for word_idx in sent if trg_vocab.itos[word_idx] != " "] for sent in pred]
+
+        output = output.reshape(-1, output.shape[-1])
+        trg = trg.reshape(-1)
+        loss = criterion(output, trg)
+
+        loss.backward()
+        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
+        optimizer.step()
+
+        epoch_loss += loss.item()
+        bleu += bleu_score(pred, true)
+
+    return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def evaluate(model: nn.Module, iterator: DataLoader, criterion: nn.Module, trg_vocab: Vocab, device: torch.device):
+    model.eval()
+
+    epoch_loss = 0
+    bleu = 0.0
+    with torch.no_grad():
+        for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
+            # Need to convert to [seq_len x batch x ...] size
+            src = batch[0].transpose(0, 1).to(device)
+            trg = batch[1].transpose(0, 1).to(device)
+
+            output = model(src, trg)
+            # Convert prediction to words
+            true = trg.transpose(0, 1).unsqueeze(1)
+            true = [
+                [[trg_vocab.itos[word_idx] for word_idx in comb if trg_vocab.itos[word_idx] != " "] for comb in sent]
+                for sent in true
+            ]
+            pred = output.transpose(0, 1).argmax(-1)
+            pred = [[trg_vocab.itos[word_idx] for word_idx in sent if trg_vocab.itos[word_idx] != " "] for sent in pred]
+
+            output = output.reshape(-1, output.shape[-1])
+            trg = trg.reshape(-1)
+
+            loss = criterion(output, trg)
+
+            epoch_loss += loss.item()
+            bleu += bleu_score(pred, true)
+    return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def collate_fn(batch):
+    src_batch, tgt_batch = zip(*batch)
+    padded_src_batch = pad_words(src_batch)
+    padded_tgt_batch = pad_words(tgt_batch)
+    return (padded_src_batch, padded_tgt_batch)
+
+
+def main(args):
+    # Ensure reproducibility
+    seed_everything(args.seed)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    batch_size = args.batch_size
+    train_dataset, val_dataset, test_dataset = Multi30k()
+    train_iterator = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    valid_iterator = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    test_iterator = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
+    input_dim = len(train_dataset.vocab[0])
+    output_dim = len(train_dataset.vocab[1])
+
+    enc_emb = nn.Embedding(input_dim, args.enc_emb_dim, padding_idx=train_dataset.vocab[0].stoi["<pad>"])
+    enc = Encoder(input_dim, args.enc_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.enc_dropout, enc_emb)
+    attn = Attention(args.enc_hid_dim, args.dec_hid_dim, args.attn_dim)
+    dec_emb = nn.Embedding(output_dim, args.dec_emb_dim, padding_idx=train_dataset.vocab[1].stoi["<pad>"])
+    dec = Decoder(output_dim, args.dec_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.dec_dropout, attn, dec_emb)
+    model = Seq2Seq(enc, dec, device).to(device)
+
+    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab[1].stoi["<pad>"])
+    optimizer = optim.Adam(model.parameters(), lr=args.lr)
+    print(f"The model has {count_parameters(model):,} trainable parameters")
+    n_epochs = args.epochs
+    clip = args.clip
+
+    best_valid_loss = 
float("inf") + + for epoch in range(n_epochs): + + start_time = time.time() + + train_loss, train_bleu_score = train( + model, train_iterator, optimizer, criterion, clip, train_dataset.vocab[1], device + ) + valid_loss, valid_bleu_score = evaluate(model, valid_iterator, criterion, train_dataset.vocab[1], device) + + end_time = time.time() + + epoch_mins, epoch_secs = epoch_time(start_time, end_time) + + print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s") + print( + f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train BLEU: {train_bleu_score:7.3f}" + ) + print( + f"\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} | Val. BLEU: {valid_bleu_score:7.3f}" + ) + + test_loss, test_bleu_score = evaluate(model, test_iterator, criterion, train_dataset.vocab[1], device) + + print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test BLEU: {test_bleu_score:7.3f}") + + if args.save: + print("Saving model to {}".format(args.save)) + torch.save(model.to("cpu"), args.save) + + if args.save_vocab: + print("Save vocab to {}".format(args.save_vocab)) + torch.save(train_dataset.vocab, args.save_vocab) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch Experimental Seq2seq for Machine Translation") + parser.add_argument("--enc_emb_dim", type=int, default=300, help="size of encoder char-composed embeddings") + parser.add_argument("--dec_emb_dim", type=int, default=300, help="size of decoder char-composed embeddings") + parser.add_argument("--enc_hid_dim", type=int, default=64, help="size of encoder hidden units") + parser.add_argument("--dec_hid_dim", type=int, default=64, help="size of decoder hidden units") + parser.add_argument("--attn_dim", type=int, default=8, help="size of attention weights") + parser.add_argument("--enc_dropout", type=float, default=0.5, help="dropout applied to encoder") + parser.add_argument("--dec_dropout", type=float, default=0.5, help="dropout applied to decoder") + parser.add_argument("--lr", type=float, default=0.001, help="initial learning rate") + parser.add_argument("--clip", type=float, default=1, help="gradient clipping") + parser.add_argument("--epochs", type=int, default=10, help="upper epoch limit") + parser.add_argument("--batch_size", type=int, default=128, metavar="N", help="batch size") + parser.add_argument("--seed", type=int, default=42, help="random seed") + parser.add_argument("--save", type=str, default="mt_seq2seq.pt", help="path to save the final model") + parser.add_argument("--save_vocab", type=str, default="torchtext_mt_vocab.pt", help="path to save the vocab") + parser.add_argument("--dataset", type=str, default="Multi30k", help="dataset used for MLM task") + args = parser.parse_args() + + main(args) diff --git a/examples/machine_translation/utils.py b/examples/machine_translation/utils.py new file mode 100644 index 0000000000..5fb36607f8 --- /dev/null +++ b/examples/machine_translation/utils.py @@ -0,0 +1,53 @@ +import itertools +import os +import random +from typing import Optional + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import pad_sequence + + +def epoch_time(start_time: int, end_time: int): + elapsed_time = end_time - start_time + elapsed_mins = int(elapsed_time / 60) + elapsed_secs = int(elapsed_time - (elapsed_mins * 60)) + return elapsed_mins, elapsed_secs + + +def count_parameters(model: nn.Module): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def pad_chars(input, 
+
+
+def pad_words(input, pad_idx=1):
+    txt = list(map(torch.tensor, input))
+    return pad_sequence(txt, True, pad_idx)
+
+
+def seed_everything(seed: Optional[int] = None) -> None:
+    """Set the seed for the pseudo-random number generators in
+    torch and python.random, and set the PYTHONHASHSEED environment variable.
+    Adapted from the pytorch-lightning module.
+    """
+
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False