Skip to content

Commit 9564983

Browse files
[feat] Add UNITER pretraining heads (#1126)
Summary: Pull Request resolved: #1126 Add MRC, MRFR, WRA heads for UNITER pretraining. MRC = Masked Region Classification. MRFR = Masked Region Feature Regression. WRA = Word Region Alignment. Heads forward return a dict with `losses`. These heads can be used as pretraining tasks for other VL models. Details at https://arxiv.org/abs/1909.11740 Test Plan: **Unit tests** Test direct instantiation and forward pass for each head. Instantiation through build() and configs is tested in unit tests in later diffs by the models that use these heads. Tested as part of UNITER pretraining on masked COCO Reviewed By: ebsmothers Differential Revision: D31768455 Pulled By: Ryan-Qiyu-Jiang fbshipit-source-id: 9b48f81c472cd1859f32bc813484296208e206f5
1 parent 6fba5f6 commit 9564983

File tree

6 files changed

+495
-0
lines changed

6 files changed

+495
-0
lines changed
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
3+
# Initial version was taken from https://github.com/ChenRocks/UNITER/
4+
# and adapted for MMF.
5+
6+
from typing import Dict
7+
8+
import torch
9+
import torch.nn.functional as F
10+
from mmf.common.registry import registry
11+
from mmf.models.transformers.heads.utils import compute_masked_hidden
12+
from torch import Tensor, nn
13+
14+
15+
@registry.register_transformer_head("mrc")
class MRC(nn.Module):
    """
    Masked Region Classification head from UNITER
    (https://arxiv.org/abs/1909.11740).

    Classifies the object class of masked image regions from the transformer
    sequence output. Supervision is either KL divergence against soft region
    labels (``use_kl=True``) or cross entropy against the argmax hard label.
    Forward returns ``{"losses": {loss_name: loss}}``.
    """

    def __init__(
        self,
        hidden_size: int = 768,
        loss_name: str = "mrc_loss",
        ignore_index: int = -1,
        mrc_label_key: str = "region_class",
        mrc_mask_key: str = "image_region_mask",
        label_dim: int = 1601,
        eps: float = 1e-12,
        use_kl: bool = True,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.loss_name = loss_name
        self.ignore_index = ignore_index
        self.mrc_label_key = mrc_label_key
        self.mrc_mask_key = mrc_mask_key
        self.use_kl = use_kl

        # Head modules: two-layer MLP with GELU + LayerNorm projecting the
        # hidden state to the region-class logit space.
        self.region_classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.LayerNorm(hidden_size, eps=eps),
            nn.Linear(hidden_size, label_dim),
        )

    def forward(
        self,
        sequence_output: Tensor,
        processed_sample_list: Dict[str, Dict[str, Tensor]],
    ) -> Dict[str, Dict[str, Tensor]]:
        # Both the region labels and the region mask must be supplied by the
        # sample-list preprocessing step.
        for key in (self.mrc_label_key, self.mrc_mask_key):
            assert (
                key in processed_sample_list
                and processed_sample_list[key] is not None
            ), (
                f"MRC pretraining requires {key} to be in sample "
                + "list with value not None."
            )

        # (bs*num_feat, label_dim) Look at unit test for example usage!
        region_labels = processed_sample_list[self.mrc_label_key]
        # (bs, num_feat)
        image_region_masks = processed_sample_list[self.mrc_mask_key]

        masked_hidden = compute_masked_hidden(sequence_output, image_region_masks)
        region_scores = self.region_classifier(masked_hidden)

        if self.use_kl:
            # Soft-label supervision: KL(log-softmax(pred) || soft labels).
            log_probs = F.log_softmax(region_scores, dim=-1)
            mrc_loss = F.kl_div(log_probs, region_labels, reduction="batchmean")
        else:
            # background class should not be the target
            hard_targets = torch.max(region_labels[:, 1:], dim=-1)[1] + 1
            mrc_loss = F.cross_entropy(
                region_scores,
                hard_targets,
                ignore_index=self.ignore_index,
                reduction="mean",
            )

        return {"losses": {self.loss_name: mrc_loss}}
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
3+
# Initial version was taken from https://github.com/ChenRocks/UNITER/
4+
# and adapted for MMF.
5+
6+
from typing import Dict
7+
8+
import torch
9+
import torch.nn.functional as F
10+
from mmf.common.registry import registry
11+
from mmf.models.transformers.heads.utils import compute_masked_hidden
12+
from torch import Tensor, nn
13+
14+
15+
@registry.register_transformer_head("mrfr")
class MRFR(nn.Module):
    """
    Masked Region Feature Regression transformer head,
    From uniter paper https://arxiv.org/pdf/1909.11740.pdf
    For an example usage take a look at the unit test.

    Regresses the original visual features of masked image regions from the
    transformer sequence output, tying the output projection to the image
    embedding weight. Forward returns ``{"losses": {loss_name: loss}}``.
    """

    def __init__(
        self,
        img_embedding_weight: nn.Parameter,
        hidden_size: int = 768,
        loss_name: str = "mrfr_loss",
        mrfr_target_key: str = "mrfr_region_target",
        mrfr_mask_key: str = "mrfr_region_mask",
        img_dim: int = 2048,
        eps: float = 1e-12,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.loss_name = loss_name
        self.mrfr_target_key = mrfr_target_key
        self.mrfr_mask_key = mrfr_mask_key

        # Head modules: the output projection weight is shared (tied) with
        # the image embedding, so its shape must match exactly.
        expected_shape = (hidden_size, img_dim)
        assert (
            img_embedding_weight is not None
            and tuple(img_embedding_weight.shape) == expected_shape
        ), (
            "MRFR head requires 'img_embedding_weight' with shape "
            + f"({hidden_size}, {img_dim})."
        )

        self.linear_proj_weight = img_embedding_weight
        self.linear_proj_bias = nn.Parameter(torch.zeros(img_dim))

        self.feat_regress = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.LayerNorm(hidden_size, eps=eps),
        )

    def forward(
        self,
        sequence_output: Tensor,
        processed_sample_list: Dict[str, Dict[str, Tensor]],
    ) -> Dict[str, Dict[str, Tensor]]:
        # Both the regression targets and the region mask must be supplied
        # by the sample-list preprocessing step.
        for key in (self.mrfr_target_key, self.mrfr_mask_key):
            assert (
                key in processed_sample_list
                and processed_sample_list[key] is not None
            ), (
                f"MRFR pretraining requires {key} to be in sample "
                + "list with value not None."
            )

        # (bs*num_feat, img_dim) Look at unit test for example usage!
        feat_targets = processed_sample_list[self.mrfr_target_key]
        # (bs, num_feat)
        image_region_masks = processed_sample_list[self.mrfr_mask_key]

        masked_hidden = compute_masked_hidden(sequence_output, image_region_masks)
        regressed = self.feat_regress(masked_hidden)
        # Project back to image-feature space using the transposed tied weight.
        predicted_feat = F.linear(
            regressed, self.linear_proj_weight.t(), self.linear_proj_bias
        )
        mrfr_loss = F.mse_loss(predicted_feat, feat_targets, reduction="mean")

        return {"losses": {self.loss_name: mrfr_loss}}

mmf/models/transformers/heads/utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,16 @@ def _process_head_output(
164164
)
165165
output = self.losses[loss_name](sample_list, {"scores": logits})
166166
return {"losses": output, "scores": logits}
167+
168+
169+
def compute_masked_hidden(hidden: Tensor, mask: Tensor) -> Tensor:
    """Get only the masked region.

    hidden: tensor, dim (bs, num_feat, feat_dim)
    mask: bool tensor, dim (bs, num_feat)
    Returns a tensor of dim (bs * num_feat_unmasked, feat_dim),
    containing the features in hidden that are True in the mask tensor.
    """
    feat_dim = hidden.size(-1)
    # Broadcast the (bs, num_feat) mask over the feature dimension, then
    # gather the selected values and restore the (rows, feat_dim) layout.
    expanded_mask = mask.unsqueeze(-1).expand_as(hidden)
    return hidden.masked_select(expanded_mask).view(-1, feat_dim)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
3+
# Initial version was taken from https://github.com/ChenRocks/UNITER/
4+
# and adapted for MMF.
5+
6+
from typing import Dict
7+
8+
from mmf.common.registry import registry
9+
from mmf.modules.ot import optimal_transport_dist
10+
from torch import Tensor, nn
11+
12+
13+
@registry.register_transformer_head("wra")
class WRA(nn.Module):
    """
    Word Region Alignment from UNITER.
    Optimal Transport (OT) distance between text and image
    features is used to optimize for WRA.
    OT transport plan (T) is approximated through IPOT.

    Forward returns ``{"losses": {loss_name: loss}}``.
    """

    def __init__(
        self,
        loss_name: str = "wra_loss",
        ot_inputs_key: str = "wra_info",
        wra_label_key: str = "is_correct",
        *args,
        **kwargs,
    ):
        super().__init__()
        self.loss_name = loss_name
        self.ot_inputs_key = ot_inputs_key
        self.wra_label_key = wra_label_key

    def forward(
        self,
        sequence_output: Tensor,
        processed_sample_list: Dict[str, Dict[str, Tensor]],
    ) -> Dict[str, Dict[str, Tensor]]:
        assert (
            self.ot_inputs_key in processed_sample_list
            and processed_sample_list[self.ot_inputs_key] is not None
        ), (
            f"WRA pretraining requires {self.ot_inputs_key} to be in sample "
            + "list with value not None."
        )
        ot_inputs = processed_sample_list[self.ot_inputs_key]

        assert (
            ot_inputs.get("txt_pad") is not None
            and ot_inputs.get("img_pad") is not None
        ), (
            "WRA pretraining requires 'txt_pad', and 'img_pad' to be in "
            + f"'processed_sample_list[{self.ot_inputs_key}]' with"
            + " values not None."
        )
        assert processed_sample_list.get(self.wra_label_key) is not None, (
            f"WRA pretraining requires {self.wra_label_key} to be in sample "
            + "list with value not None."
        )

        # Text tokens occupy the first `num_text` positions of the joint
        # sequence; image regions follow immediately after.
        num_text = processed_sample_list["input_ids"].size(1)
        num_img = processed_sample_list["image_feat"].size(1)
        txt_emb = sequence_output[:, :num_text, :]
        img_emb = sequence_output[:, num_text : num_text + num_img, :]

        txt_pad = ot_inputs["txt_pad"].bool()
        img_pad = ot_inputs["img_pad"].bool()
        itm_labels = processed_sample_list[self.wra_label_key]
        # NOTE: run in fp32 for stability
        ot_dist = optimal_transport_dist(
            txt_emb.float(), img_emb.float(), txt_pad, img_pad
        ).to(txt_emb)
        # Minimize OT distance for matched pairs, maximize it for mismatched.
        ot_pos = ot_dist.masked_select(itm_labels == 1)
        ot_neg = ot_dist.masked_select(itm_labels == 0)
        ot_loss = (ot_pos.sum() - ot_neg.sum()) / (ot_pos.size(0) + ot_neg.size(0))

        return {"losses": {self.loss_name: ot_loss}}

mmf/modules/ot.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
3+
"""
4+
Initial version was taken from https://github.com/ChenRocks/UNITER/
5+
Licensed under the MIT license.
6+
7+
Wasserstein Distance (Optimal Transport)
8+
"""
9+
10+
import torch
11+
from torch import Tensor
12+
from torch.nn import functional as F
13+
14+
15+
def cost_matrix_cosine(x: Tensor, y: Tensor, eps: float = 1e-5) -> Tensor:
    """Compute cosine distance across every pairs of x, y (batched)
    [B, L_x, D] [B, L_y, D] -> [B, Lx, Ly]"""
    assert x.dim() == y.dim()
    assert x.size(0) == y.size(0)
    assert x.size(2) == y.size(2)
    # Unit-normalize along the feature axis; a batched matmul of the
    # normalized tensors then yields cosine similarity for every pair.
    x_unit = F.normalize(x, p=2, dim=-1, eps=eps)
    y_unit = F.normalize(y, p=2, dim=-1, eps=eps)
    similarity = x_unit.matmul(y_unit.transpose(1, 2))
    # Cosine distance = 1 - cosine similarity.
    return 1 - similarity
26+
27+
28+
def trace(x: Tensor) -> Tensor:
    """Compute trace of input tensor (batched): [B, N, N] -> [B]."""
    b, m, n = x.size()
    assert m == n
    # Sum the main diagonal of each matrix in the batch.
    return torch.diagonal(x, dim1=-2, dim2=-1).sum(dim=-1)
35+
36+
37+
@torch.no_grad()
def ipot(
    C: Tensor,
    x_len: Tensor,
    x_pad: Tensor,
    y_len: Tensor,
    y_pad: Tensor,
    joint_pad: Tensor,
    beta: float,
    iteration: int,
    k: int,
) -> Tensor:
    """Approximate the optimal transport plan with IPOT-style iterations.

    [B, M, N], [B], [B, M], [B], [B, N], [B, M, N]

    C: cost matrix; x_len / y_len: effective (unpadded) lengths per batch
    element; x_pad / y_pad / joint_pad: True at padded positions.
    Returns the transport plan T of shape [B, N, M]. Runs under no_grad,
    so callers treat the plan as a constant.
    """
    b, m, n = C.size()
    # sigma starts as the uniform marginal over x positions (1 / x_len).
    sigma = torch.ones(b, m, dtype=C.dtype, device=C.device) / x_len.unsqueeze(1)
    T = torch.ones(b, n, m, dtype=C.dtype, device=C.device)
    # Kernel of the transposed cost matrix; beta controls the sharpness.
    A = torch.exp(-C.transpose(1, 2) / beta)

    # mask padded positions
    sigma.masked_fill_(x_pad, 0)
    joint_pad = joint_pad.transpose(1, 2)
    T.masked_fill_(joint_pad, 0)
    A.masked_fill_(joint_pad, 0)

    # broadcastable lengths
    x_len = x_len.unsqueeze(1).unsqueeze(2)
    y_len = y_len.unsqueeze(1).unsqueeze(2)

    # mask to zero out padding in delta and sigma
    # (the large additive constant drives the 1/(...) updates toward 0
    # at padded slots)
    x_mask = (x_pad.to(C.dtype) * 1e4).unsqueeze(1)
    y_mask = (y_pad.to(C.dtype) * 1e4).unsqueeze(1)

    for _ in range(iteration):
        Q = A * T  # bs * n * m
        sigma = sigma.view(b, m, 1)
        # k inner scaling updates alternating between the two marginals.
        for _ in range(k):
            delta = 1 / (y_len * Q.matmul(sigma).view(b, 1, n) + y_mask)
            sigma = 1 / (x_len * delta.matmul(Q) + x_mask)
        # Rescale the kernel by both scaling vectors to form the new plan.
        T = delta.view(b, n, 1) * Q * sigma
        T.masked_fill_(joint_pad, 0)
    return T
78+
79+
80+
def optimal_transport_dist(
    txt_emb: Tensor,
    img_emb: Tensor,
    txt_pad: Tensor,
    img_pad: Tensor,
    beta: float = 0.5,
    iteration: int = 50,
    k: int = 1,
) -> Tensor:
    """Optimal transport distance between text and image embeddings.

    [B, M, D], [B, N, D], [B, M], [B, N]
    txt_pad / img_pad are True at padded positions. Returns a [B] tensor.
    """
    cost = cost_matrix_cosine(txt_emb, img_emb)
    # mask the padded inputs: any pair touching padding contributes no cost
    joint_pad = txt_pad.unsqueeze(-1) | img_pad.unsqueeze(-2)
    cost.masked_fill_(joint_pad, 0)

    # Effective (unpadded) sequence lengths per batch element.
    txt_len = txt_pad.size(1) - txt_pad.sum(dim=1, keepdim=False)
    img_len = img_pad.size(1) - img_pad.sum(dim=1, keepdim=False)

    transport_plan = ipot(
        cost.detach(),
        txt_len.to(dtype=cost.dtype),
        txt_pad,
        img_len.to(dtype=cost.dtype),
        img_pad,
        joint_pad,
        beta,
        iteration,
        k,
    )
    # distance = <C, T> computed as trace(C @ T); the plan is detached so
    # gradients flow only through the cost matrix.
    return trace(cost.matmul(transport_plan.detach()))

0 commit comments

Comments
 (0)