diff --git a/examples/machine_translation/README.md b/examples/machine_translation/README.md
new file mode 100644
index 0000000000..8ce1168604
--- /dev/null
+++ b/examples/machine_translation/README.md
@@ -0,0 +1,74 @@
+# Machine translation: creating a dataset and training a translation model
+
+In this example, we train a machine translation model with two different approaches: one uses a character-composed embedding as the input, the other a regular word embedding. A character-composed embedding is constructed by running a CNN over the characters of each word in a sentence. The CNN layer produces a word embedding, but through a different mechanism than a standard word embedding lookup. For example:
+
+```
+sentence: I love torchtext
+character representation: [['I'], ['l', 'o', 'v', 'e'], ['t', 'o', 'r', 'c', 'h', 't', 'e', 'x', 't']]
+```
+
+The character representation is used as the input to the CNN layer, which produces an `m x n` matrix, where `m` is the length of the sentence and `n` is the embedding size.
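+
+The following is a minimal sketch of that shape flow (illustration only: the dimensions, tensor names, and random indices below are made up for this example and do not match `embedding.py` exactly):
+
+```python
+import torch
+import torch.nn as nn
+
+# Toy dimensions, chosen only for illustration
+char_vocab_size, char_dim, emb_size = 30, 8, 16
+sentence_len, max_word_len = 3, 9  # "I love torchtext", words padded to 9 chars
+
+char_emb = nn.Embedding(char_vocab_size, char_dim)
+conv = nn.Conv1d(char_dim, emb_size, kernel_size=3, padding=2)
+pool = nn.AdaptiveMaxPool1d(1)
+
+# One sentence of character indices, shape [sentence_len, max_word_len]
+chars = torch.randint(0, char_vocab_size, (sentence_len, max_word_len))
+
+x = char_emb(chars)            # [m, max_word_len, char_dim]
+x = x.transpose(1, 2)          # Conv1d expects [m, channels, length]
+x = pool(conv(x)).squeeze(-1)  # [m, n] -- one embedding vector per word
+print(x.shape)                 # torch.Size([3, 16])
+```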
+
+Apart from the input word representation, both approaches use the exact same model (a seq2seq model with attention) and training loss.
+
+## Training
+
+To try the example, run the following for character-level training:
+
+```bash
+python train_char.py
+```
+
+and the following for word-level training:
+
+```bash
+python train_word.py
+```
+
+## Experiment Result
+
+The following is an example of the output from running `train_char.py`:
+
+```
+The model has 5,617,503 trainable parameters
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [01:54<00:00, 1.98it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00, 2.37it/s]
+Epoch: 01 | Time: 1m 57s
+ Train Loss: 5.277 | Train PPL: 195.798 | Train BLEU: 0.001
+ Val. Loss: 4.088 | Val. PPL: 59.598 | Val. BLEU: 0.006
+...
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [02:25<00:00, 1.56it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00, 2.03it/s]
+Epoch: 10 | Time: 2m 29s
+ Train Loss: 1.373 | Train PPL: 3.948 | Train BLEU: 0.187
+ Val. Loss: 0.972 | Val. PPL: 2.644 | Val. BLEU: 0.280
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.95it/s]
+| Test Loss: 1.011 | Test PPL: 2.748 | Test BLEU: 0.273
+Saving model to char_mt_seq2seq.pt
+Save vocab to torchtext_char_mt_vocab.pt
+```
+
+And the following is an example of the output from `train_word.py`:
+
+```
+The model has 14,601,140 trainable parameters
+ 0%| | 0/227 [00:00, ?it/s]/home/akurniawan/text/examples/machine_translation/utils.py:38: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
+ txt = list(map(torch.tensor, input))
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [02:14<00:00, 1.69it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.61it/s]
+Epoch: 01 | Time: 2m 19s
+ Train Loss: 3.796 | Train PPL: 44.519 | Train BLEU: 0.139
+ Val. Loss: 1.480 | Val. PPL: 4.391 | Val. BLEU: 0.315
+...
+100%|█████████████████████████████████████████████████████████████████████████████████| 227/227 [02:52<00:00, 1.31it/s]
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00, 1.49it/s]
+Epoch: 10 | Time: 2m 58s
+ Train Loss: 0.062 | Train PPL: 1.064 | Train BLEU: 0.442
+ Val. Loss: 0.182 | Val. PPL: 1.200 | Val. BLEU: 0.435
+100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00, 1.44it/s]
+| Test Loss: 0.198 | Test PPL: 1.219 | Test BLEU: 0.420
+Saving model to mt_seq2seq.pt
+Save vocab to torchtext_mt_vocab.pt
+```
\ No newline at end of file
diff --git a/examples/machine_translation/char_dataset.py b/examples/machine_translation/char_dataset.py
new file mode 100644
index 0000000000..d794407976
--- /dev/null
+++ b/examples/machine_translation/char_dataset.py
@@ -0,0 +1,117 @@
+import itertools
+import re
+from functools import partial
+
+import torch
+from torch.utils.data import DataLoader
+
+from torchtext.data.utils import get_tokenizer
+from torchtext.experimental.datasets.raw.translation import DATASETS
+from torchtext.experimental.datasets.translation import TranslationDataset
+from torchtext.experimental.functional import sequential_transforms, vocab_func
+from torchtext.vocab import build_vocab_from_iterator
+
+
+def build_word_vocab(data, transforms, index, init_token="<bos>", eos_token="<eos>"):
+ tok_list = [[init_token], [eos_token]]
+ return build_vocab_from_iterator(tok_list + list(map(lambda x: transforms(x[index]), data)))
+
+
+def build_char_vocab(
+    data, transforms, index, init_word_token="<w>", eos_word_token="</w>", init_sent_token="<bos>", eos_sent_token="<eos>",
+):
+ tok_list = [
+ [init_word_token],
+ [eos_word_token],
+ [init_sent_token],
+ [eos_sent_token],
+ ]
+ for line in data:
+ tokens = list(itertools.chain.from_iterable(transforms(line[index])))
+ tok_list.append(tokens)
+ return build_vocab_from_iterator(tok_list)
+
+
+def char_vocab_func(vocab):
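+    # For example, with a vocab mapping characters to indices,
+    # char_vocab_func(vocab)([["h", "i"], ["y", "o", "u"]]) yields
+    # [[vocab["h"], vocab["i"]], [vocab["y"], vocab["o"], vocab["u"]]].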
+ def func(tok_iter):
+ return [[vocab[char] for char in word] for word in tok_iter]
+
+ return func
+
+
+def special_char_tokens_func(
+    init_word_token="<w>", eos_word_token="</w>", init_sent_token="<bos>", eos_sent_token="<eos>",
+):
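+    # E.g. [["h", "i"]] becomes
+    # [["<w>", "<bos>", "</w>"], ["<w>", "h", "i", "</w>"], ["<w>", "<eos>", "</w>"]]:
+    # each word is wrapped in word-boundary tokens, and the sentence markers are
+    # added as pseudo-words (assuming the default tokens above).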
+ def func(tok_iter):
+ result = [[init_word_token, init_sent_token, eos_word_token]]
+ result += [[init_word_token] + word + [eos_word_token] for word in tok_iter]
+ result += [[init_word_token, eos_sent_token, eos_word_token]]
+ return result
+
+ return func
+
+
+def special_word_token_func(init_word_token="<bos>", eos_word_token="<eos>"):
+ def func(tok_iter):
+ return [init_word_token] + tok_iter + [eos_word_token]
+
+ return func
+
+
+def parallel_transforms(*transforms):
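+    # Applies every transform to the same input and returns the tuple of results,
+    # e.g. parallel_transforms(f, g)(x) == (f(x), g(x)). Used below to build the
+    # character-level and word-level views of the same target sentence.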
+ def func(txt_input):
+ result = []
+ for transform in transforms:
+ result.append(transform(txt_input))
+ return tuple(result)
+
+ return func
+
+
+def get_dataset(dataset_name: str):
+ # Get the raw dataset first. This will give us the text
+ # version of the dataset
+    train, val, test = DATASETS[dataset_name]()
+ # Cache training data for vocabulary construction
+ train_data = [line for line in train]
+ val_data = [line for line in val]
+ test_data = [line for line in test]
+ # Setup word tokenizer
+ src_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")
+ tgt_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
+ # Setup char tokenizer
+
+ def remove_extra_whitespace(line):
+ return re.sub(" {2,}", " ", line)
+
+ src_char_transform = sequential_transforms(remove_extra_whitespace, src_tokenizer, partial(map, list))
+ tgt_char_transform = sequential_transforms(remove_extra_whitespace, tgt_tokenizer, partial(map, list))
+ tgt_word_transform = sequential_transforms(remove_extra_whitespace, tgt_tokenizer)
+
+ # Setup vocabularies (both words and chars)
+ src_char_vocab = build_char_vocab(train_data, src_char_transform, index=0)
+ tgt_char_vocab = build_char_vocab(train_data, tgt_char_transform, index=1)
+    tgt_word_vocab = build_word_vocab(train_data, tgt_word_transform, index=1)
+
+ # Building the dataset with character level tokenization
+ src_char_transform = sequential_transforms(
+ src_char_transform, special_char_tokens_func(), char_vocab_func(src_char_vocab)
+ )
+ tgt_char_transform = sequential_transforms(
+ tgt_char_transform, special_char_tokens_func(), char_vocab_func(tgt_char_vocab)
+ )
+ tgt_word_transform = sequential_transforms(
+ tgt_word_transform, special_word_token_func(), vocab_func(tgt_word_vocab)
+ )
+ tgt_transform = parallel_transforms(tgt_char_transform, tgt_word_transform)
+ train_dataset = TranslationDataset(
+ train_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+ )
+ val_dataset = TranslationDataset(
+ val_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+ )
+ test_dataset = TranslationDataset(
+ test_data, (src_char_vocab, tgt_char_vocab, tgt_word_vocab), (src_char_transform, tgt_transform)
+ )
+
+ return train_dataset, val_dataset, test_dataset
diff --git a/examples/machine_translation/embedding.py b/examples/machine_translation/embedding.py
new file mode 100644
index 0000000000..ab2ef1c6fb
--- /dev/null
+++ b/examples/machine_translation/embedding.py
@@ -0,0 +1,85 @@
+import torch.nn as nn
+
+
+class WordCharCNNEmbedding(nn.Module):
+ """The character embedding is built upon CNN and pooling layer
+ with dropout applied before the convolution and after the pooling.
+ """
+
+ def __init__(
+ self,
+ ntokens: int,
+ char_embedding_dim: int = 30,
+ char_padding_idx: int = 1,
+ dropout: float = 0.5,
+ kernel_size: int = 3,
+ out_channels: int = 30,
+ target_emb: int = 300,
+ use_highway: bool = False,
+ ):
+ super(WordCharCNNEmbedding, self).__init__()
+ self._use_highway = use_highway
+
+ if self._use_highway and out_channels != target_emb:
+            raise ValueError("out_channels and target_emb must be equal in highway setting")
+
+ self.char_embedding = nn.Embedding(ntokens, char_embedding_dim, char_padding_idx)
+ self.conv_embedding = nn.Sequential(
+ nn.Dropout(p=dropout),
+ nn.Conv1d(
+ in_channels=char_embedding_dim,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ padding=kernel_size - 1,
+ ),
+ nn.AdaptiveMaxPool1d(1),
+ )
+ self.proj_layer = nn.Linear(out_channels, target_emb)
+ self.out_dropout = nn.Dropout(p=dropout)
+ self._char_padding_idx = char_padding_idx
+
+ self.init_weights()
+
+ def init_weights(self):
+ """Initialize the weight of character embedding with xavier
+ and reinitalize the padding vectors to zero
+ """
+
+ self.char_embedding.weight.data.uniform_(-0.1, 0.1)
+ # Reinitialize vectors at padding_idx to have 0 value
+ self.char_embedding.weight.data[self._char_padding_idx].uniform_(0, 0)
+
+ def forward(self, chars):
+ """Run the forward calculation of the char-cnn embedding
+ model.
+ Args:
+ chars (torch.Tensor): An integer tensor with the size of
+ [seq_len x batch x char_size]
+ Returns:
+            proj_char_embedding_vec (torch.Tensor): An embedding tensor
+                with the size of [seq_len x batch x target_emb]
+ """
+ char_embedding_vec = self.char_embedding(chars)
+ # Reshape the character embedding to the size of
+ # [batch * seq_len, char_len, char_dim]
+ char_embedding_vec = char_embedding_vec.view(
+ -1, char_embedding_vec.size(2), char_embedding_vec.size(3)
+ ).contiguous()
+ # Transpose the embedding into [batch * seq_len, char_dim, char_len]
+ char_embedding_vec = char_embedding_vec.transpose(1, 2).contiguous()
+ # Apply char embedding with dropout and convolution
+ # layers so the dim now will be [batch * seq_len, out_channel, new_len]
+ char_embedding_vec = self.conv_embedding(char_embedding_vec)
+ char_embedding_vec = char_embedding_vec.squeeze(-1)
+ # Revert the size back to [seq_len, batch, out_channel]
+ char_embedding_vec = char_embedding_vec.view(chars.size(0), chars.size(1), -1).contiguous()
+ char_embedding_vec = self.out_dropout(char_embedding_vec)
+ proj_char_embedding_vec = self.proj_layer(char_embedding_vec)
+ # Apply highway connection between projection layer and
+ # pooling layer
+ if self._use_highway:
+ proj_char_embedding_vec += char_embedding_vec
+
+ return proj_char_embedding_vec
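+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustration only; all dimensions below are arbitrary).
+    import torch
+
+    emb = WordCharCNNEmbedding(ntokens=50, target_emb=300)
+    chars = torch.randint(0, 50, (7, 4, 10))  # [seq_len, batch, char_len]
+    out = emb(chars)
+    assert out.shape == (7, 4, 300), out.shape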
diff --git a/examples/machine_translation/model.py b/examples/machine_translation/model.py
new file mode 100644
index 0000000000..9818ce8bd3
--- /dev/null
+++ b/examples/machine_translation/model.py
@@ -0,0 +1,151 @@
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Encoder(nn.Module):
+ def __init__(
+ self, input_dim: int, emb_dim: int, enc_hid_dim: int, dec_hid_dim: int, dropout: float, embedding: nn.Module
+ ):
+ super().__init__()
+
+ self.input_dim = input_dim
+ self.emb_dim = emb_dim
+ self.enc_hid_dim = enc_hid_dim
+ self.dec_hid_dim = dec_hid_dim
+ self.dropout = dropout
+
+ self.embedding = embedding
+ self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
+ self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
+ self.dropout = nn.Dropout(dropout)
+
+ self.init_weights()
+
+    def forward(self, src: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ embedded = self.dropout(self.embedding(src))
+ outputs, hidden = self.rnn(embedded)
+ hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
+
+ return outputs, hidden
+
+ def init_weights(self):
+ for name, param in self.named_parameters():
+ if "weight" in name:
+ nn.init.normal_(param.data, mean=0, std=0.01)
+ else:
+ nn.init.constant_(param.data, 0)
+
+
+class Attention(nn.Module):
+ def __init__(self, enc_hid_dim: int, dec_hid_dim: int, attn_dim: int):
+ super().__init__()
+
+ self.enc_hid_dim = enc_hid_dim
+ self.dec_hid_dim = dec_hid_dim
+ self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
+ self.attn = nn.Linear(self.attn_in, attn_dim)
+
+ self.init_weights()
+
+ def forward(self, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor) -> torch.Tensor:
+ src_len = encoder_outputs.shape[0]
+ repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)
+ encoder_outputs = encoder_outputs.permute(1, 0, 2)
+ energy = torch.tanh(self.attn(torch.cat((repeated_decoder_hidden, encoder_outputs), dim=2)))
+ attention = torch.sum(energy, dim=2)
+
+ return F.softmax(attention, dim=1)
+
+ def init_weights(self):
+ for name, param in self.named_parameters():
+ if "weight" in name:
+ nn.init.normal_(param.data, mean=0, std=0.01)
+ else:
+ nn.init.constant_(param.data, 0)
+
+
+class Decoder(nn.Module):
+ def __init__(
+ self,
+ output_dim: int,
+ emb_dim: int,
+ enc_hid_dim: int,
+ dec_hid_dim: int,
+        dropout: float,
+ attention: nn.Module,
+ embedding: nn.Module,
+ ):
+ super().__init__()
+
+ self.emb_dim = emb_dim
+ self.enc_hid_dim = enc_hid_dim
+ self.dec_hid_dim = dec_hid_dim
+ self.output_dim = output_dim
+ self.dropout = dropout
+ self.attention = attention
+
+ self.embedding = embedding
+ self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
+ self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)
+ self.dropout = nn.Dropout(dropout)
+
+ self.init_weights()
+
+ def _weighted_encoder_rep(self, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor) -> torch.Tensor:
+ a = self.attention(decoder_hidden, encoder_outputs)
+ a = a.unsqueeze(1)
+ encoder_outputs = encoder_outputs.permute(1, 0, 2)
+ weighted_encoder_rep = torch.bmm(a, encoder_outputs)
+ weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)
+
+ return weighted_encoder_rep
+
+ def forward(
+ self, input: torch.Tensor, decoder_hidden: torch.Tensor, encoder_outputs: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+ input = input.unsqueeze(0)
+ embedded = self.dropout(self.embedding(input))
+ weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, encoder_outputs)
+ rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2)
+ output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))
+ embedded = embedded.squeeze(0)
+ output = output.squeeze(0)
+ weighted_encoder_rep = weighted_encoder_rep.squeeze(0)
+ output = self.out(torch.cat((output, weighted_encoder_rep, embedded), dim=1))
+
+ return output, decoder_hidden.squeeze(0)
+
+ def init_weights(self):
+ for name, param in self.named_parameters():
+ if "weight" in name:
+ nn.init.normal_(param.data, mean=0, std=0.01)
+ else:
+ nn.init.constant_(param.data, 0)
+
+
+class Seq2Seq(nn.Module):
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, device: torch.device):
+ super().__init__()
+
+ self.encoder = encoder
+ self.decoder = decoder
+ self.device = device
+
+ def forward(self, src: torch.Tensor, trg: torch.Tensor) -> torch.Tensor:
+ batch_size = src.shape[1]
+ max_len = trg.shape[0]
+ trg_vocab_size = self.decoder.output_dim
+ outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
+ encoder_outputs, hidden = self.encoder(src)
+
+        # The first input to the decoder is the <bos> token; the ground-truth
+        # target token is fed at each step (full teacher forcing).
+ for t in range(max_len):
+ output, hidden = self.decoder(trg[t], hidden, encoder_outputs)
+ outputs[t] = output
+
+ return outputs
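+
+
+if __name__ == "__main__":
+    # Minimal smoke test (illustration only; all dimensions below are arbitrary).
+    input_dim, output_dim, emb_dim = 100, 120, 32
+    enc_hid_dim, dec_hid_dim, attn_dim = 64, 64, 8
+    device = torch.device("cpu")
+
+    enc = Encoder(input_dim, emb_dim, enc_hid_dim, dec_hid_dim, 0.5, nn.Embedding(input_dim, emb_dim))
+    attn = Attention(enc_hid_dim, dec_hid_dim, attn_dim)
+    dec = Decoder(output_dim, emb_dim, enc_hid_dim, dec_hid_dim, 0.5, attn, nn.Embedding(output_dim, emb_dim))
+    model = Seq2Seq(enc, dec, device)
+
+    src = torch.randint(0, input_dim, (9, 4))   # [src_len, batch]
+    trg = torch.randint(0, output_dim, (7, 4))  # [trg_len, batch]
+    out = model(src, trg)
+    assert out.shape == (7, 4, output_dim), out.shape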
diff --git a/examples/machine_translation/train_char.py b/examples/machine_translation/train_char.py
new file mode 100644
index 0000000000..66f4e85d74
--- /dev/null
+++ b/examples/machine_translation/train_char.py
@@ -0,0 +1,187 @@
+import argparse
+import math
+import time
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+
+from char_dataset import get_dataset
+from embedding import WordCharCNNEmbedding
+from model import Attention, Decoder, Encoder, Seq2Seq
+from torchtext.data.metrics import bleu_score
+from torchtext.vocab import Vocab
+from utils import count_parameters, epoch_time, pad_chars, pad_words, seed_everything
+
+
+def train(
+ model: nn.Module,
+ iterator: DataLoader,
+ optimizer: optim.Optimizer,
+ criterion: nn.Module,
+ clip: float,
+ trg_vocab: Vocab,
+ device: torch.device,
+):
+ model.train()
+
+ epoch_loss = 0
+ bleu = 0.0
+ for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
+ # Need to convert to [seq_len x batch x ...] size
+ src = batch[0].transpose(0, 1).to(device)
+ trg_char = batch[1][0].transpose(0, 1).to(device)
+ trg_word = batch[1][1].transpose(0, 1).to(device)
+ optimizer.zero_grad()
+
+ output = model(src, trg_char)
+ # Convert prediction to words
+ true = trg_word.transpose(0, 1).unsqueeze(1)
+ true = [[[trg_vocab.itos[word_idx] for word_idx in comb] for comb in sent] for sent in true]
+ pred = output.transpose(0, 1).argmax(-1)
+ pred = [[trg_vocab.itos[word_idx] for word_idx in sent] for sent in pred]
+
+ output = output[1:].reshape(-1, output.shape[-1])
+ trg = trg_word[1:].reshape(-1)
+ loss = criterion(output, trg)
+
+ loss.backward()
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
+ optimizer.step()
+
+ epoch_loss += loss.item()
+ bleu += bleu_score(pred, true)
+
+ return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def evaluate(model: nn.Module, iterator: DataLoader, criterion: nn.Module, trg_vocab: Vocab, device: torch.device):
+ model.eval()
+
+ epoch_loss = 0
+ bleu = 0.0
+ with torch.no_grad():
+ for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
+ # Need to convert to [seq_len x batch x ...] size
+ src = batch[0].transpose(0, 1).to(device)
+ trg_char = batch[1][0].transpose(0, 1).to(device)
+ trg_word = batch[1][1].transpose(0, 1).to(device)
+
+ output = model(src, trg_char)
+ # Convert prediction to words
+ true = trg_word.transpose(0, 1).unsqueeze(1)
+ true = [[[trg_vocab.itos[word_idx] for word_idx in comb] for comb in sent] for sent in true]
+ pred = output.transpose(0, 1).argmax(-1)
+ pred = [[trg_vocab.itos[word_idx] for word_idx in sent] for sent in pred]
+
+ output = output[1:].reshape(-1, output.shape[-1])
+ trg = trg_word[1:].reshape(-1)
+
+ loss = criterion(output, trg)
+
+ epoch_loss += loss.item()
+ bleu += bleu_score(pred, true)
+ return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
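+# Each dataset item is (src_chars, (tgt_chars, tgt_words)); the three views are
+# padded independently so they can be stacked into batch tensors.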
+def collate_fn(batch):
+ src_batch, tgt_batch = zip(*batch)
+ char_tgt_batch, word_tgt_batch = zip(*tgt_batch)
+ padded_src_batch = pad_chars(src_batch)
+ padded_tgt_char_batch = pad_chars(char_tgt_batch)
+ padded_tgt_word_batch = pad_words(word_tgt_batch)
+ return (padded_src_batch, (padded_tgt_char_batch, padded_tgt_word_batch))
+
+
+def main(args):
+ # Ensure reproducibility
+ seed_everything(args.seed)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ batch_size = args.batch_size
+ train_dataset, val_dataset, test_dataset = get_dataset(args.dataset)
+ train_iterator = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ valid_iterator = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ test_iterator = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ input_dim = len(train_dataset.vocab[0])
+ char_output_dim = len(train_dataset.vocab[1])
+ word_output_dim = len(train_dataset.vocab[2])
+
+ enc_emb = WordCharCNNEmbedding(
+        input_dim, char_padding_idx=train_dataset.vocab[0].stoi["<pad>"], target_emb=args.enc_emb_dim
+ )
+ enc = Encoder(input_dim, args.enc_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.enc_dropout, enc_emb)
+ attn = Attention(args.enc_hid_dim, args.dec_hid_dim, args.attn_dim)
+ dec_emb = WordCharCNNEmbedding(
+        char_output_dim, char_padding_idx=train_dataset.vocab[1].stoi["<pad>"], target_emb=args.dec_emb_dim
+ )
+ dec = Decoder(
+ word_output_dim, args.dec_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.dec_dropout, attn, dec_emb
+ )
+ model = Seq2Seq(enc, dec, device).to(device)
+
+    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab[2].stoi["<pad>"])
+ optimizer = optim.Adam(model.parameters(), lr=args.lr)
+ print(f"The model has {count_parameters(model):,} trainable parameters")
+ n_epochs = args.epochs
+ clip = args.clip
+
+ for epoch in range(n_epochs):
+
+ start_time = time.time()
+
+ train_loss, train_bleu_score = train(
+ model, train_iterator, optimizer, criterion, clip, train_dataset.vocab[2], device
+ )
+ valid_loss, valid_bleu_score = evaluate(model, valid_iterator, criterion, train_dataset.vocab[2], device)
+
+ end_time = time.time()
+
+ epoch_mins, epoch_secs = epoch_time(start_time, end_time)
+
+ print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
+ print(
+ f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train BLEU: {train_bleu_score:7.3f}"
+ )
+ print(
+ f"\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} | Val. BLEU: {valid_bleu_score:7.3f}"
+ )
+
+ test_loss, test_bleu_score = evaluate(model, test_iterator, criterion, train_dataset.vocab[2], device)
+
+ print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test BLEU: {test_bleu_score:7.3f}")
+
+ if args.save:
+ print("Saving model to {}".format(args.save))
+ torch.save(model.to("cpu"), args.save)
+
+ if args.save_vocab:
+ print("Save vocab to {}".format(args.save_vocab))
+ torch.save(train_dataset.vocab, args.save_vocab)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="PyTorch Experimental Seq2seq for Machine Translation")
+ parser.add_argument("--enc_emb_dim", type=int, default=300, help="size of encoder char-composed embeddings")
+ parser.add_argument("--dec_emb_dim", type=int, default=300, help="size of decoder char-composed embeddings")
+ parser.add_argument("--enc_hid_dim", type=int, default=64, help="size of encoder hidden units")
+ parser.add_argument("--dec_hid_dim", type=int, default=64, help="size of decoder hidden units")
+ parser.add_argument("--attn_dim", type=int, default=8, help="size of attention weights")
+ parser.add_argument("--enc_dropout", type=float, default=0.5, help="dropout applied to encoder")
+ parser.add_argument("--dec_dropout", type=float, default=0.5, help="dropout applied to decoder")
+ parser.add_argument("--lr", type=float, default=0.001, help="initial learning rate")
+ parser.add_argument("--clip", type=float, default=1, help="gradient clipping")
+ parser.add_argument("--epochs", type=int, default=10, help="upper epoch limit")
+ parser.add_argument("--batch_size", type=int, default=128, metavar="N", help="batch size")
+ parser.add_argument("--seed", type=int, default=42, help="random seed")
+ parser.add_argument("--checkpoint", type=str, default="None", help="path to load the checkpoint")
+ parser.add_argument("--save", type=str, default="char_mt_seq2seq.pt", help="path to save the final model")
+ parser.add_argument("--save_vocab", type=str, default="torchtext_char_mt_vocab.pt", help="path to save the vocab")
+ parser.add_argument("--dataset", type=str, default="Multi30k", help="dataset used for MLM task")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/examples/machine_translation/train_word.py b/examples/machine_translation/train_word.py
new file mode 100644
index 0000000000..3e60dbe283
--- /dev/null
+++ b/examples/machine_translation/train_word.py
@@ -0,0 +1,181 @@
+import argparse
+import math
+import time
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+
+from model import Attention, Decoder, Encoder, Seq2Seq
+from torchtext.data.metrics import bleu_score
+from torchtext.experimental.datasets import Multi30k
+from torchtext.vocab import Vocab
+from utils import count_parameters, epoch_time, pad_words, seed_everything
+
+
+def train(
+ model: nn.Module,
+ iterator: DataLoader,
+ optimizer: optim.Optimizer,
+ criterion: nn.Module,
+ clip: float,
+ trg_vocab: Vocab,
+ device: torch.device,
+):
+ model.train()
+
+ epoch_loss = 0
+ bleu = 0.0
+ for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
+ # Need to convert to [seq_len x batch x ...] size
+ src = batch[0].transpose(0, 1).to(device)
+ trg = batch[1].transpose(0, 1).to(device)
+ optimizer.zero_grad()
+
+ output = model(src, trg)
+ # Convert prediction to words
+ true = trg.transpose(0, 1).unsqueeze(1)
+ true = [
+ [[trg_vocab.itos[word_idx] for word_idx in comb if trg_vocab.itos[word_idx] != " "] for comb in sent]
+ for sent in true
+ ]
+ pred = output.transpose(0, 1).argmax(-1)
+ pred = [[trg_vocab.itos[word_idx] for word_idx in sent if trg_vocab.itos[word_idx] != " "] for sent in pred]
+
+ output = output.reshape(-1, output.shape[-1])
+ trg = trg.reshape(-1)
+ loss = criterion(output, trg)
+
+ loss.backward()
+ torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
+ optimizer.step()
+
+ epoch_loss += loss.item()
+ bleu += bleu_score(pred, true)
+
+ return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def evaluate(model: nn.Module, iterator: DataLoader, criterion: nn.Module, trg_vocab: Vocab, device: torch.device):
+ model.eval()
+
+ epoch_loss = 0
+ bleu = 0.0
+ with torch.no_grad():
+ for _, batch in tqdm(enumerate(iterator), total=len(iterator)):
+ # Need to convert to [seq_len x batch x ...] size
+ src = batch[0].transpose(0, 1).to(device)
+ trg = batch[1].transpose(0, 1).to(device)
+
+ output = model(src, trg)
+ # Convert prediction to words
+ true = trg.transpose(0, 1).unsqueeze(1)
+ true = [
+ [[trg_vocab.itos[word_idx] for word_idx in comb if trg_vocab.itos[word_idx] != " "] for comb in sent]
+ for sent in true
+ ]
+ pred = output.transpose(0, 1).argmax(-1)
+ pred = [[trg_vocab.itos[word_idx] for word_idx in sent if trg_vocab.itos[word_idx] != " "] for sent in pred]
+
+ output = output.reshape(-1, output.shape[-1])
+ trg = trg.reshape(-1)
+
+ loss = criterion(output, trg)
+
+ epoch_loss += loss.item()
+ bleu += bleu_score(pred, true)
+ return (epoch_loss / len(iterator)), (bleu / len(iterator))
+
+
+def collate_fn(batch):
+ src_batch, tgt_batch = zip(*batch)
+ padded_src_batch = pad_words(src_batch)
+ padded_tgt_batch = pad_words(tgt_batch)
+ return (padded_src_batch, padded_tgt_batch)
+
+
+def main(args):
+ # Ensure reproducibility
+ seed_everything(args.seed)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ batch_size = args.batch_size
+ train_dataset, val_dataset, test_dataset = Multi30k()
+ train_iterator = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ valid_iterator = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ test_iterator = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
+ input_dim = len(train_dataset.vocab[0])
+ output_dim = len(train_dataset.vocab[1])
+
+    enc_emb = nn.Embedding(input_dim, args.enc_emb_dim, padding_idx=train_dataset.vocab[0].stoi["<pad>"])
+ enc = Encoder(input_dim, args.enc_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.enc_dropout, enc_emb)
+ attn = Attention(args.enc_hid_dim, args.dec_hid_dim, args.attn_dim)
+    dec_emb = nn.Embedding(output_dim, args.dec_emb_dim, padding_idx=train_dataset.vocab[1].stoi["<pad>"])
+ dec = Decoder(output_dim, args.dec_emb_dim, args.enc_hid_dim, args.dec_hid_dim, args.dec_dropout, attn, dec_emb)
+ model = Seq2Seq(enc, dec, device).to(device)
+
+    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab[1].stoi["<pad>"])
+ optimizer = optim.Adam(model.parameters(), lr=args.lr)
+ print(f"The model has {count_parameters(model):,} trainable parameters")
+ n_epochs = args.epochs
+ clip = args.clip
+
+ for epoch in range(n_epochs):
+
+ start_time = time.time()
+
+ train_loss, train_bleu_score = train(
+ model, train_iterator, optimizer, criterion, clip, train_dataset.vocab[1], device
+ )
+ valid_loss, valid_bleu_score = evaluate(model, valid_iterator, criterion, train_dataset.vocab[1], device)
+
+ end_time = time.time()
+
+ epoch_mins, epoch_secs = epoch_time(start_time, end_time)
+
+ print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
+ print(
+ f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train BLEU: {train_bleu_score:7.3f}"
+ )
+ print(
+ f"\t Val. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f} | Val. BLEU: {valid_bleu_score:7.3f}"
+ )
+
+ test_loss, test_bleu_score = evaluate(model, test_iterator, criterion, train_dataset.vocab[1], device)
+
+ print(f"| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test BLEU: {test_bleu_score:7.3f}")
+
+ if args.save:
+ print("Saving model to {}".format(args.save))
+ torch.save(model.to("cpu"), args.save)
+
+ if args.save_vocab:
+ print("Save vocab to {}".format(args.save_vocab))
+ torch.save(train_dataset.vocab, args.save_vocab)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="PyTorch Experimental Seq2seq for Machine Translation")
+ parser.add_argument("--enc_emb_dim", type=int, default=300, help="size of encoder char-composed embeddings")
+ parser.add_argument("--dec_emb_dim", type=int, default=300, help="size of decoder char-composed embeddings")
+ parser.add_argument("--enc_hid_dim", type=int, default=64, help="size of encoder hidden units")
+ parser.add_argument("--dec_hid_dim", type=int, default=64, help="size of decoder hidden units")
+ parser.add_argument("--attn_dim", type=int, default=8, help="size of attention weights")
+ parser.add_argument("--enc_dropout", type=float, default=0.5, help="dropout applied to encoder")
+ parser.add_argument("--dec_dropout", type=float, default=0.5, help="dropout applied to decoder")
+ parser.add_argument("--lr", type=float, default=0.001, help="initial learning rate")
+ parser.add_argument("--clip", type=float, default=1, help="gradient clipping")
+ parser.add_argument("--epochs", type=int, default=10, help="upper epoch limit")
+ parser.add_argument("--batch_size", type=int, default=128, metavar="N", help="batch size")
+ parser.add_argument("--seed", type=int, default=42, help="random seed")
+ parser.add_argument("--save", type=str, default="mt_seq2seq.pt", help="path to save the final model")
+ parser.add_argument("--save_vocab", type=str, default="torchtext_mt_vocab.pt", help="path to save the vocab")
+ parser.add_argument("--dataset", type=str, default="Multi30k", help="dataset used for MLM task")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/examples/machine_translation/utils.py b/examples/machine_translation/utils.py
new file mode 100644
index 0000000000..5fb36607f8
--- /dev/null
+++ b/examples/machine_translation/utils.py
@@ -0,0 +1,53 @@
+import itertools
+import os
+import random
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from torch.nn.utils.rnn import pad_sequence
+
+
+def epoch_time(start_time: int, end_time: int):
+ elapsed_time = end_time - start_time
+ elapsed_mins = int(elapsed_time / 60)
+ elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
+ return elapsed_mins, elapsed_secs
+
+
+def count_parameters(model: nn.Module):
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def pad_chars(input, pad_idx=1):
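+    # Example (values arbitrary): pad_chars([[[2, 3], [4]], [[5]]]) yields a
+    # tensor of shape [2, 2, 2]: 2 sentences, each padded to 2 words of 2 chars,
+    # with pad_idx filling the gaps.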
+    # record the number of words in each sentence
+    batch_sizes = [len(sent) for sent in input]
+    # flatten the nested lists and convert each word (a list of char ids) to a tensor
+    tx = list(map(torch.tensor, itertools.chain.from_iterable(input)))
+    # pad all words to the length of the longest word
+    ptx = pad_sequence(tx, True, pad_idx)
+    # split back according to the original sentence lengths
+    sptx = ptx.split(batch_sizes)
+    # finally, pad the sentences to the length of the longest sentence
+    final_padding = pad_sequence(sptx, True, pad_idx)
+
+ return final_padding
+
+
+def pad_words(input, pad_idx=1):
+    # torch.as_tensor avoids the copy-construct warning when items are already tensors
+    txt = list(map(torch.as_tensor, input))
+ return pad_sequence(txt, True, pad_idx)
+
+
+def seed_everything(seed: Optional[int] = None) -> int:
+    """Set the seed for the pseudo-random number generators in pytorch and
+    python.random, and set the PYTHONHASHSEED environment variable.
+    Adapted from the pytorch-lightning module.
+    """
+    if seed is None:
+        seed = random.randint(0, 2 ** 32 - 1)
+    os.environ["PYTHONHASHSEED"] = str(seed)
+    random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    return seed