From 90d4c769f6855de72698b321938d230eb450fed7 Mon Sep 17 00:00:00 2001
From: DougM
Date: Sun, 7 Feb 2021 22:25:45 -0800
Subject: [PATCH 1/5] fix bert input dimension to match Triton requirement

Signed-off-by: Dong Meng
Signed-off-by: Rajeev Rao
---
 demo/BERT/builder.py | 61 ++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/demo/BERT/builder.py b/demo/BERT/builder.py
index 76425cde4..5881e3367 100644
--- a/demo/BERT/builder.py
+++ b/demo/BERT/builder.py
@@ -528,39 +528,32 @@ def load_onnx_weights_and_quant(path, config):
     return weights_dict
 
 def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes):
-    if len(batch_sizes) > 1 or len(sequence_lengths) > 1:
-        # int8 only support some of the sequence length, we dynamic on sequence length is not allowed.
-        input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
-        segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
-        input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
-
-        # Specify profiles for the batch sizes we're interested in.
-        # Make sure the profile also works for all sizes not covered by the previous profile.
-        prev_batch_size = 0
-        for batch_size in sorted(batch_sizes):
-            if len(sequence_lengths) == 1:
-                min_shape = (sequence_lengths[0], prev_batch_size + 1)
-                shape = (sequence_lengths[0], batch_size)
+    # INT8 kernels only support specific sequence lengths, so a dynamic sequence-length dimension is not allowed.
+    input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
+    segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
+    input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
+
+    # Specify profiles for the batch sizes we're interested in.
+    # Make sure the profile also works for all sizes not covered by the previous profile.
+
+    for batch_size in sorted(batch_sizes):
+        if len(sequence_lengths) == 1:
+            profile = builder.create_optimization_profile()
+            min_shape = (1, sequence_lengths[0])
+            shape = (batch_size, sequence_lengths[0])
+            profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
+            profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
+            profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
+            builder_config.add_optimization_profile(profile)
+        else:
+            for sequence_length in sorted(sequence_lengths):
+                profile = builder.create_optimization_profile()
+                min_shape = (1, sequence_length)
+                shape = (batch_size, sequence_length)
                 profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
                 profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
                 profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
                 builder_config.add_optimization_profile(profile)
-        else:
-            prev_sequence_length = 0
-            for sequence_length in sorted(sequence_lengths):
-                profile = builder.create_optimization_profile()
-                min_shape = (prev_sequence_length + 1, prev_batch_size + 1)
-                shape = (sequence_length, batch_size)
-                profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
-                profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
-                profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
-                builder_config.add_optimization_profile(profile)
-                prev_sequence_length = sequence_length
-            prev_batch_size = batch_size
-    else:
-        input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
-        segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
-        input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
 
     wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"].numpy(), trt.PluginFieldType.FLOAT32)
     wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"].numpy(), trt.PluginFieldType.FLOAT32)
@@ -574,7 +567,15 @@ def emb_layernorm(builder, network, config, weights_dict, builder_config, sequen
     pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type])
     fn = emln_plg_creator.create_plugin("embeddings", pfc)
 
-    inputs = [input_ids, segment_ids, input_mask]
+    input_ids = network.add_shuffle(input_ids)
+    input_ids.second_transpose = (1, 0)
+    segment_ids = network.add_shuffle(segment_ids)
+    segment_ids.second_transpose = (1, 0)
+    input_mask = network.add_shuffle(input_mask)
+    input_mask.second_transpose = (1, 0)
+    inputs = [input_ids.get_output(0),
+              segment_ids.get_output(0),
+              input_mask.get_output(0)]
     emb_layer = network.add_plugin_v2(inputs, fn)
 
     if config.use_qat:
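
The profile loop above is the heart of this change: the network inputs are now batch-major, and each optimization profile accepts batch sizes from 1 up to its target at a fixed sequence length. A minimal, self-contained sketch of the same pattern — the helper name `add_batch_profiles` is illustrative, and `builder`, `builder_config`, and the three input names are assumed to match builder.py:

```python
import tensorrt as trt  # assumed available, as in builder.py

def add_batch_profiles(builder, builder_config, seq_len, batch_sizes):
    # One profile per target batch size; shapes are (batch, sequence),
    # the layout Triton expects for these bindings.
    for batch_size in sorted(batch_sizes):
        profile = builder.create_optimization_profile()
        min_shape = (1, seq_len)           # smallest batch the profile accepts
        opt_shape = (batch_size, seq_len)  # batch size TensorRT tunes kernels for
        for name in ("input_ids", "segment_ids", "input_mask"):
            profile.set_shape(name, min=min_shape, opt=opt_shape, max=opt_shape)
        builder_config.add_optimization_profile(profile)
```
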
From 6b8734219b64422f65201f21b8f010989381fbd8 Mon Sep 17 00:00:00 2001
From: jiahongl
Date: Mon, 10 May 2021 13:36:52 -0700
Subject: [PATCH 2/5] python_inference_fix

Signed-off-by: Rajeev Rao
---
 .../inference-checkpoint.ipynb | 357 ++++++++++++++++++
 demo/BERT/inference.ipynb      |   4 +-
 demo/BERT/inference.py         |  10 +-
 demo/BERT/perf.py              |  16 +-
 4 files changed, 372 insertions(+), 15 deletions(-)
 create mode 100644 demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb

diff --git a/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb b/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb
new file mode 100644
index 000000000..d015fd72e
--- /dev/null
+++ b/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb
@@ -0,0 +1,357 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# =============================================================================="
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "\n",
+    "# QA Inference on BERT using TensorRT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Overview\n",
+    "\n",
+    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
+    "\n",
+    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1.a Learning objectives\n",
+    "\n",
+    "This notebook demonstrates:\n",
+    "- Inference on the Question Answering (QA) task with a BERT Base/Large model\n",
+    "- The use of fine-tuned NVIDIA BERT models\n",
+    "- Use of the BERT model with TensorRT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Requirements\n",
+    "\n",
+    "Please refer to the README file."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. BERT Inference: Question Answering\n",
+    "\n",
+    "We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
+    "\n",
+    "Here we use a BERT model fine-tuned on the [SQuAD 2.0 dataset](https://rajpurkar.github.io/SQuAD-explorer/), which combines 100,000+ question-answer pairs on 500+ articles with over 50,000 new, unanswerable questions."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.a Paragraph and Queries\n",
+    "\n",
+    "The paragraph and the questions can be customized by changing the text below. Note that when using models with small sequence lengths, you should use a shorter paragraph:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Paragraph:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"\n",
+    "\n",
+    "# Short paragraph version for BERT models with max sequence length of 128\n",
+    "short_paragraph_text = \"The Apollo program was the third United States human spaceflight program. First conceived as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was dedicated to President John F. Kennedy's national goal of landing a man on the Moon. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972 followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with the Soviet Union in 1975.\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Question:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question_text = \"What project put the first Americans into space?\"\n",
+    "#question_text = \"What year did the first manned Apollo flight occur?\"\n",
+    "#question_text = \"What President is credited with the original notion of putting Americans in space?\"\n",
+    "#question_text = \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this example we ask our BERT model questions related to the following paragraph:\n",
+    "\n",
+    "**The Apollo Program**\n",
+    "_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
+    "\n",
+    "The questions and the expected answers are shown below:\n",
+    "\n",
+    " - **Q1:** \"What project put the first Americans into space?\" \n",
+    " - **A1:** \"Project Mercury\"\n",
+    " - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
+    " - **A2:** \"The Apollo program\"\n",
+    " - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
+    " - **A3:** \"1968\"\n",
+    " - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
+    " - **A4:** \"John F. Kennedy\"\n",
+    " - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
+    " - **A5:** \"Soviet Union\"\n",
+    " - **Q6:** \"How long did Project Apollo run?\"\n",
+    " - **A6:** \"1961 to 1972\"\n",
+    " - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
+    " - **A7:** \"Gemini Mission\"\n",
+    " - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
+    " - **A8:** \"Skylab\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Preprocessing\n",
+    "Let's convert the paragraph and the question to BERT input with the help of the tokenizer:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import helpers.data_processing as dp\n",
+    "import helpers.tokenization as tokenization\n",
+    "\n",
+    "tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt\", do_lower_case=True)\n",
+    "\n",
+    "# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.\n",
+    "max_query_length = 64\n",
+    "\n",
+    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
+    "doc_stride = 128\n",
+    "\n",
\n", + "# Sequences longer than this will be truncated, and sequences shorter \n", + "max_seq_length = 128\n", + "\n", + "# Extract tokens from the paragraph\n", + "doc_tokens = dp.convert_doc_tokens(short_paragraph_text)\n", + "\n", + "# Extract features from the paragraph and question\n", + "features = dp.convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TensorRT Inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorrt as trt\n", + "TRT_LOGGER = trt.Logger(trt.Logger.INFO)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ctypes\n", + "import os\n", + "\n", + "ctypes.CDLL(\"libnvinfer_plugin.so\", mode=ctypes.RTLD_GLOBAL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pycuda.driver as cuda\n", + "import pycuda.autoinit\n", + "import collections\n", + "import numpy as np\n", + "import time\n", + "\n", + "# Load the BERT-Large Engine\n", + "with open(\"/workspace/TensorRT/demo/BERT/engines/bert_large_128.engine\", \"rb\") as f, \\\n", + " trt.Runtime(TRT_LOGGER) as runtime, \\\n", + " runtime.deserialize_cuda_engine(f.read()) as engine, \\\n", + " engine.create_execution_context() as context:\n", + "\n", + " # We always use batch size 1.\n", + " input_shape = (1, max_seq_length)\n", + " input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n", + " \n", + " # Allocate device memory for inputs.\n", + " d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]\n", + " # Create a stream in which to copy inputs/outputs and run inference.\n", + " stream = cuda.Stream()\n", + "\n", + " # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n", + " # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n", + " for binding in range(3):\n", + " context.set_binding_shape(binding, input_shape)\n", + " assert context.all_binding_shapes_specified\n", + "\n", + " # Allocate output buffer by querying the size from the context. 
+    "    # Allocate output buffer by querying the size from the context. This may be different for different input shapes.\n",
+    "    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n",
+    "    d_output = cuda.mem_alloc(h_output.nbytes)\n",
+    "\n",
+    "    print(\"\\nRunning Inference...\")\n",
+    "\n",
+    "    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name\n",
+    "            \"NetworkOutput\",\n",
+    "            [\"start_logits\", \"end_logits\", \"feature_index\"])\n",
+    "    networkOutputs = []\n",
+    "\n",
+    "    eval_time_elapsed = 0\n",
+    "    for feature_index, feature in enumerate(features):\n",
+    "        # Copy inputs\n",
+    "        input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))\n",
+    "        segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.segment_ids.ravel()))\n",
+    "        input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.input_mask.ravel()))\n",
+    "\n",
+    "        eval_start_time = time.time()\n",
+    "        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)\n",
+    "        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)\n",
+    "        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)\n",
+    "\n",
+    "        # Run inference\n",
+    "        context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n",
+    "        # Synchronize the stream\n",
+    "        stream.synchronize()\n",
+    "        eval_time_elapsed += (time.time() - eval_start_time)\n",
+    "\n",
+    "        # Transfer predictions back from GPU\n",
+    "        cuda.memcpy_dtoh_async(h_output, d_output, stream)\n",
+    "        stream.synchronize()\n",
+    "\n",
+    "        for index, batch in enumerate(h_output):\n",
+    "            # Data Post-processing\n",
+    "            networkOutputs.append(_NetworkOutput(\n",
+    "                start_logits = np.array(batch.squeeze()[:, 0]),\n",
+    "                end_logits = np.array(batch.squeeze()[:, 1]),\n",
+    "                feature_index = feature_index\n",
+    "            ))\n",
+    "\n",
+    "    eval_time_elapsed /= len(features)\n",
+    "    \n",
+    "    print(\"-----------------------------\")\n",
+    "    print(\"Running Inference at {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n",
+    "    print(\"-----------------------------\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Post-Processing"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have the inference results, let's extract the actual answer to our question."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    # The total number of n-best predictions to generate in the nbest_predictions.json output file\n",
+    "    n_best_size = 20\n",
+    "\n",
+    "    # The maximum length of an answer that can be generated. This is needed\n",
+    "    # because the start and end predictions are not conditioned on one another\n",
+    "    max_answer_length = 30\n",
+    "\n",
+    "    prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,\n",
+    "            networkOutputs, n_best_size, max_answer_length)\n",
+    "    \n",
+    "    for index, output in enumerate(networkOutputs):\n",
+    "        print(\"Processing output\")\n",
+    "        print(\"Answer: '{}'\".format(prediction))\n",
+    "        print(\"with prob: {:.3f}%\".format(nbest_json[0]['probability'] * 100.0))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/demo/BERT/inference.ipynb b/demo/BERT/inference.ipynb
index 6633d6bae..d015fd72e 100644
--- a/demo/BERT/inference.ipynb
+++ b/demo/BERT/inference.ipynb
@@ -234,7 +234,7 @@
     "    engine.create_execution_context() as context:\n",
     "\n",
     "    # We always use batch size 1.\n",
-    "    input_shape = (max_seq_length, 1)\n",
+    "    input_shape = (1, max_seq_length)\n",
     "    input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n",
     "    \n",
     "    # Allocate device memory for inputs.\n",
@@ -349,7 +349,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.9"
+   "version": "3.8.3"
   }
  },
  "nbformat": 4,
diff --git a/demo/BERT/inference.py b/demo/BERT/inference.py
index 3759f2ce7..ab4a71239 100644
--- a/demo/BERT/inference.py
+++ b/demo/BERT/inference.py
@@ -130,7 +130,7 @@ def question_features(tokens, question):
     num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles
     for idx in range(engine.num_optimization_profiles):
         profile_shape = engine.get_profile_shape(profile_index = idx, binding = idx * num_binding_per_profile)
-        if profile_shape[0][1] <= args.batch_size and profile_shape[2][1] >= args.batch_size and profile_shape[0][0] <= max_seq_length and profile_shape[2][0] >= max_seq_length:
+        if profile_shape[0][0] <= args.batch_size and profile_shape[2][0] >= args.batch_size and profile_shape[0][1] <= max_seq_length and profile_shape[2][1] >= max_seq_length:
             selected_profile = idx
             break
     if selected_profile == -1:
@@ -141,7 +141,7 @@ def question_features(tokens, question):
 
     # Specify input shapes. These must be within the min/max bounds of the active profile
     # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
-    input_shape = (max_seq_length, args.batch_size)
+    input_shape = (args.batch_size, max_seq_length)
     input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
     for binding in range(3):
         context.set_binding_shape(binding_idx_offset + binding, input_shape)
@@ -168,9 +168,9 @@ def inference(features, tokens):
     eval_time_elapsed = 0
     for feature_index, feature in enumerate(features):
         # Copy inputs
-        input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
-        segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
-        input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()
+        input_ids_batch = np.repeat(np.expand_dims(feature.input_ids, 0), args.batch_size, axis=0)
+        segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0)
+        input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0)
 
         input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
         segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))
diff --git a/demo/BERT/perf.py b/demo/BERT/perf.py
index 28abec837..976afea3a 100644
--- a/demo/BERT/perf.py
+++ b/demo/BERT/perf.py
@@ -52,8 +52,8 @@ def main():
     with open(args.engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
 
         # Allocate buffers large enough to store the largest batch size
-        max_input_shape = (args.sequence_length, max(args.batch_size))
-        max_output_shape = (args.sequence_length, max(args.batch_size), 2, 1, 1)
+        max_input_shape = (max(args.batch_size), args.sequence_length)
+        max_output_shape = (max(args.batch_size), args.sequence_length, 2, 1, 1)
         buffers = [
             DeviceBuffer(max_input_shape),
             DeviceBuffer(max_input_shape),
@@ -65,9 +65,9 @@ def main():
     pseudo_vocab_size = 30522
     pseudo_type_vocab_size = 2
     np.random.seed(args.random_seed)
-    test_word_ids = np.random.randint(0, pseudo_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
-    test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
-    test_input_mask = np.ones((args.sequence_length, max(args.batch_size)), dtype=np.int32)
+    test_word_ids = np.random.randint(0, pseudo_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32)
+    test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32)
+    test_input_mask = np.ones((max(args.batch_size), args.sequence_length), dtype=np.int32)
 
     # Copy input h2d
     cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
@@ -86,9 +86,9 @@ def main():
         bindings = [0] * binding_idx_offset + [buf.binding() for buf in buffers]
 
         shapes = {
-            "input_ids": (args.sequence_length, batch_size),
-            "segment_ids": (args.sequence_length, batch_size),
-            "input_mask": (args.sequence_length, batch_size),
+            "input_ids": (batch_size, args.sequence_length),
+            "segment_ids": (batch_size, args.sequence_length),
+            "input_mask": (batch_size, args.sequence_length),
         }
 
         for binding, shape in shapes.items():
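
Beyond flipping the shape tuples, the inference.py hunk above swaps `np.dstack` for `np.repeat` when tiling a single feature into a batch; the difference is purely which axis the copies land on. A small runnable sketch of the two layouts, with illustrative sizes in place of the real feature arrays:

```python
import numpy as np

seq_len, batch_size = 128, 4
input_ids = np.arange(seq_len, dtype=np.int32)  # stand-in for feature.input_ids

old_batch = np.dstack([input_ids] * batch_size).squeeze()
new_batch = np.repeat(np.expand_dims(input_ids, 0), batch_size, axis=0)

print(old_batch.shape)  # (128, 4) -- sequence-major, matched the old (seq, batch) bindings
print(new_batch.shape)  # (4, 128) -- batch-major, matches the new (batch, seq) bindings
```
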
From 78e5c9a6ef39138e5932ccd5e17ce73084347b99 Mon Sep 17 00:00:00 2001
From: Jiahong-Nvidia <64861949+Jiahong-Nvidia@users.noreply.github.com>
Date: Mon, 10 May 2021 13:39:42 -0700
Subject: [PATCH 3/5] Delete demo/BERT/.ipynb_checkpoints directory

delete checkpoints

Signed-off-by: Rajeev Rao
---
 .../inference-checkpoint.ipynb | 357 ------------------
 1 file changed, 357 deletions(-)
 delete mode 100644 demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb

diff --git a/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb b/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb
deleted file mode 100644
index d015fd72e..000000000
--- a/demo/BERT/.ipynb_checkpoints/inference-checkpoint.ipynb
+++ /dev/null
@@ -1,357 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
-    "#\n",
-    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
-    "# you may not use this file except in compliance with the License.\n",
-    "# You may obtain a copy of the License at\n",
-    "#\n",
-    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
-    "#\n",
-    "# Unless required by applicable law or agreed to in writing, software\n",
-    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
-    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
-    "# See the License for the specific language governing permissions and\n",
-    "# limitations under the License.\n",
-    "# =============================================================================="
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "\n",
-    "# QA Inference on BERT using TensorRT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1. Overview\n",
-    "\n",
-    "Bidirectional Encoder Representations from Transformers (BERT) is a method of pre-training language representations which obtains state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks. \n",
-    "\n",
-    "The original paper can be found here: https://arxiv.org/abs/1810.04805.\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 1.a Learning objectives\n",
-    "\n",
-    "This notebook demonstrates:\n",
-    "- Inference on the Question Answering (QA) task with a BERT Base/Large model\n",
-    "- The use of fine-tuned NVIDIA BERT models\n",
-    "- Use of the BERT model with TensorRT"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2. Requirements\n",
-    "\n",
-    "Please refer to the README file."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. BERT Inference: Question Answering\n",
-    "\n",
-    "We can run inference on a fine-tuned BERT model for tasks like Question Answering.\n",
-    "\n",
-    "Here we use a BERT model fine-tuned on the [SQuAD 2.0 dataset](https://rajpurkar.github.io/SQuAD-explorer/), which combines 100,000+ question-answer pairs on 500+ articles with over 50,000 new, unanswerable questions."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### 3.a Paragraph and Queries\n",
-    "\n",
-    "The paragraph and the questions can be customized by changing the text below. Note that when using models with small sequence lengths, you should use a shorter paragraph:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Paragraph:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "paragraph_text = \"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"\n",
-    "\n",
-    "# Short paragraph version for BERT models with max sequence length of 128\n",
-    "short_paragraph_text = \"The Apollo program was the third United States human spaceflight program. First conceived as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was dedicated to President John F. Kennedy's national goal of landing a man on the Moon. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972 followed by the Apollo-Soyuz Test Project a joint Earth orbit mission with the Soviet Union in 1975.\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Question:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "question_text = \"What project put the first Americans into space?\"\n",
-    "#question_text = \"What year did the first manned Apollo flight occur?\"\n",
-    "#question_text = \"What President is credited with the original notion of putting Americans in space?\"\n",
-    "#question_text = \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In this example we ask our BERT model questions related to the following paragraph:\n",
-    "\n",
-    "**The Apollo Program**\n",
-    "_\"The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to follow the one-man Project Mercury which put the first Americans in space, Apollo was later dedicated to President John F. Kennedy's national goal of landing a man on the Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, and was supported by the two-man Gemini program which ran concurrently with it from 1962 to 1966. Gemini missions developed some of the space travel techniques that were necessary for the success of the Apollo missions. Apollo used Saturn family rockets as launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications Program, which consisted of Skylab, a space station that supported three manned missions in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the Soviet Union in 1975.\"_\n",
-    "\n",
-    "The questions and the expected answers are shown below:\n",
-    "\n",
-    " - **Q1:** \"What project put the first Americans into space?\" \n",
-    " - **A1:** \"Project Mercury\"\n",
-    " - **Q2:** \"What program was created to carry out these projects and missions?\"\n",
-    " - **A2:** \"The Apollo program\"\n",
-    " - **Q3:** \"What year did the first manned Apollo flight occur?\"\n",
-    " - **A3:** \"1968\"\n",
-    " - **Q4:** \"What President is credited with the original notion of putting Americans in space?\"\n",
-    " - **A4:** \"John F. Kennedy\"\n",
-    " - **Q5:** \"Who did the U.S. collaborate with on an Earth orbit mission in 1975?\"\n",
-    " - **A5:** \"Soviet Union\"\n",
-    " - **Q6:** \"How long did Project Apollo run?\"\n",
-    " - **A6:** \"1961 to 1972\"\n",
-    " - **Q7:** \"What program helped develop space travel techniques that Project Apollo used?\"\n",
-    " - **A7:** \"Gemini Mission\"\n",
-    " - **Q8:** \"What space station supported three manned missions in 1973-1974?\"\n",
-    " - **A8:** \"Skylab\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data Preprocessing\n",
-    "Let's convert the paragraph and the question to BERT input with the help of the tokenizer:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import helpers.data_processing as dp\n",
-    "import helpers.tokenization as tokenization\n",
-    "\n",
-    "tokenizer = tokenization.FullTokenizer(vocab_file=\"/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt\", do_lower_case=True)\n",
-    "\n",
-    "# The maximum number of tokens for the question. Questions longer than this will be truncated to this length.\n",
-    "max_query_length = 64\n",
-    "\n",
-    "# When splitting up a long document into chunks, how much stride to take between chunks.\n",
-    "doc_stride = 128\n",
-    "\n",
\n", - "# Sequences longer than this will be truncated, and sequences shorter \n", - "max_seq_length = 128\n", - "\n", - "# Extract tokens from the paragraph\n", - "doc_tokens = dp.convert_doc_tokens(short_paragraph_text)\n", - "\n", - "# Extract features from the paragraph and question\n", - "features = dp.convert_example_to_features(doc_tokens, question_text, tokenizer, max_seq_length, doc_stride, max_query_length)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TensorRT Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorrt as trt\n", - "TRT_LOGGER = trt.Logger(trt.Logger.INFO)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ctypes\n", - "import os\n", - "\n", - "ctypes.CDLL(\"libnvinfer_plugin.so\", mode=ctypes.RTLD_GLOBAL)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pycuda.driver as cuda\n", - "import pycuda.autoinit\n", - "import collections\n", - "import numpy as np\n", - "import time\n", - "\n", - "# Load the BERT-Large Engine\n", - "with open(\"/workspace/TensorRT/demo/BERT/engines/bert_large_128.engine\", \"rb\") as f, \\\n", - " trt.Runtime(TRT_LOGGER) as runtime, \\\n", - " runtime.deserialize_cuda_engine(f.read()) as engine, \\\n", - " engine.create_execution_context() as context:\n", - "\n", - " # We always use batch size 1.\n", - " input_shape = (1, max_seq_length)\n", - " input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n", - " \n", - " # Allocate device memory for inputs.\n", - " d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)]\n", - " # Create a stream in which to copy inputs/outputs and run inference.\n", - " stream = cuda.Stream()\n", - "\n", - " # Specify input shapes. These must be within the min/max bounds of the active profile (0th profile in this case)\n", - " # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.\n", - " for binding in range(3):\n", - " context.set_binding_shape(binding, input_shape)\n", - " assert context.all_binding_shapes_specified\n", - "\n", - " # Allocate output buffer by querying the size from the context. 
-    "    # Allocate output buffer by querying the size from the context. This may be different for different input shapes.\n",
-    "    h_output = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)\n",
-    "    d_output = cuda.mem_alloc(h_output.nbytes)\n",
-    "\n",
-    "    print(\"\\nRunning Inference...\")\n",
-    "\n",
-    "    _NetworkOutput = collections.namedtuple(  # pylint: disable=invalid-name\n",
-    "            \"NetworkOutput\",\n",
-    "            [\"start_logits\", \"end_logits\", \"feature_index\"])\n",
-    "    networkOutputs = []\n",
-    "\n",
-    "    eval_time_elapsed = 0\n",
-    "    for feature_index, feature in enumerate(features):\n",
-    "        # Copy inputs\n",
-    "        input_ids = cuda.register_host_memory(np.ascontiguousarray(feature.input_ids.ravel()))\n",
-    "        segment_ids = cuda.register_host_memory(np.ascontiguousarray(feature.segment_ids.ravel()))\n",
-    "        input_mask = cuda.register_host_memory(np.ascontiguousarray(feature.input_mask.ravel()))\n",
-    "\n",
-    "        eval_start_time = time.time()\n",
-    "        cuda.memcpy_htod_async(d_inputs[0], input_ids, stream)\n",
-    "        cuda.memcpy_htod_async(d_inputs[1], segment_ids, stream)\n",
-    "        cuda.memcpy_htod_async(d_inputs[2], input_mask, stream)\n",
-    "\n",
-    "        # Run inference\n",
-    "        context.execute_async_v2(bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output)], stream_handle=stream.handle)\n",
-    "        # Synchronize the stream\n",
-    "        stream.synchronize()\n",
-    "        eval_time_elapsed += (time.time() - eval_start_time)\n",
-    "\n",
-    "        # Transfer predictions back from GPU\n",
-    "        cuda.memcpy_dtoh_async(h_output, d_output, stream)\n",
-    "        stream.synchronize()\n",
-    "\n",
-    "        for index, batch in enumerate(h_output):\n",
-    "            # Data Post-processing\n",
-    "            networkOutputs.append(_NetworkOutput(\n",
-    "                start_logits = np.array(batch.squeeze()[:, 0]),\n",
-    "                end_logits = np.array(batch.squeeze()[:, 1]),\n",
-    "                feature_index = feature_index\n",
-    "            ))\n",
-    "\n",
-    "    eval_time_elapsed /= len(features)\n",
-    "    \n",
-    "    print(\"-----------------------------\")\n",
-    "    print(\"Running Inference at {:.3f} Sentences/Sec\".format(1.0/eval_time_elapsed))\n",
-    "    print(\"-----------------------------\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data Post-Processing"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now that we have the inference results, let's extract the actual answer to our question."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "    # The total number of n-best predictions to generate in the nbest_predictions.json output file\n",
-    "    n_best_size = 20\n",
-    "\n",
-    "    # The maximum length of an answer that can be generated. This is needed\n",
-    "    # because the start and end predictions are not conditioned on one another\n",
-    "    max_answer_length = 30\n",
-    "\n",
-    "    prediction, nbest_json, scores_diff_json = dp.get_predictions(doc_tokens, features,\n",
-    "            networkOutputs, n_best_size, max_answer_length)\n",
-    "    \n",
-    "    for index, output in enumerate(networkOutputs):\n",
-    "        print(\"Processing output\")\n",
-    "        print(\"Answer: '{}'\".format(prediction))\n",
-    "        print(\"with prob: {:.3f}%\".format(nbest_json[0]['probability'] * 100.0))"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From 718c13d0d00c6a9b7ee7df3621250bd957e0261b Mon Sep 17 00:00:00 2001
From: Rajeev Rao
Date: Mon, 10 May 2021 15:02:14 -0700
Subject: [PATCH 4/5] Use relative paths in demoBERT scripts

Signed-off-by: Rajeev Rao
---
 demo/BERT/README.md                      |  4 ++--
 demo/BERT/scripts/download_model.sh      |  5 +++--
 demo/BERT/scripts/download_squad.sh      |  5 +++--
 demo/BERT/scripts/inference_benchmark.sh | 10 +++++-----
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/demo/BERT/README.md b/demo/BERT/README.md
index 335308c6d..07d5d49f7 100644
--- a/demo/BERT/README.md
+++ b/demo/BERT/README.md
@@ -118,12 +118,12 @@ This demo BERT application can be run within the TensorRT OSS build container. I
 
 Download SQuAD v1.1 training and dev dataset.
 ```bash
-    sh ./scripts/download_squad.sh
+    bash ./scripts/download_squad.sh
 ```
 
 Download Tensorflow checkpoints for BERT large model with sequence length 128, fine-tuned for SQuAD v2.0.
 ```bash
-    sh scripts/download_model.sh
+    bash scripts/download_model.sh
 ```
 
 **Note:** Since the datasets and checkpoints are stored in the directory mounted from the host, they do *not* need to be downloaded each time the container is launched.
diff --git a/demo/BERT/scripts/download_model.sh b/demo/BERT/scripts/download_model.sh
index dda89a493..5339df9ff 100755
--- a/demo/BERT/scripts/download_model.sh
+++ b/demo/BERT/scripts/download_model.sh
@@ -51,8 +51,8 @@ do
 done
 
 # Prepare the download directory
-mkdir -p /workspace/TensorRT/demo/BERT/models/fine-tuned
-cd /workspace/TensorRT/demo/BERT/models/fine-tuned
+mkdir -p models/fine-tuned
+pushd models/fine-tuned
 
 # Download the BERT fine-tuned model
 echo "Downloading BERT-${FW} ${MODEL} checkpoints for sequence length ${SEQ_LEN} and fine-tuned for SQuAD ${SQUAD}."
@@ -78,3 +78,4 @@ if [ -n "$CKPT" ]; then
     ngc registry model download-version nvidia/${CKPT}:${CKPT_VERSION}
   fi
 fi
+popd
diff --git a/demo/BERT/scripts/download_squad.sh b/demo/BERT/scripts/download_squad.sh
index 842ca85eb..5653a7fff 100755
--- a/demo/BERT/scripts/download_squad.sh
+++ b/demo/BERT/scripts/download_squad.sh
@@ -37,7 +37,8 @@ done
 
 # Download the SQuAD training and dev datasets
 echo "Downloading SQuAD-${VERSION} training and dev datasets"
-mkdir -p /workspace/TensorRT/demo/BERT/squad
-cd /workspace/TensorRT/demo/BERT/squad
+mkdir -p squad
+pushd squad
 wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-${VERSION}.json
 wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-${VERSION}.json
+popd
diff --git a/demo/BERT/scripts/inference_benchmark.sh b/demo/BERT/scripts/inference_benchmark.sh
index 1df185a5a..e2bc0a9d5 100755
--- a/demo/BERT/scripts/inference_benchmark.sh
+++ b/demo/BERT/scripts/inference_benchmark.sh
@@ -24,12 +24,12 @@ SEQUENCE_LENGTH="${4}"
 MAX_BATCH="${5}"
 GPU_ARCH="${6}"
 
-CHECKPOINTS_DIR="/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_tf_ckpt_${MODEL_VARIANT}_qa_squad2_amp_${SEQUENCE_LENGTH}_v19.03.1"
-SQUAD_DIR="/workspace/TensorRT/demo/BERT/squad"
-ENGINE_NAME="/workspace/TensorRT/demo/BERT/engines/bert_${MODEL_VARIANT}_${PRECISION}_bs${MAX_BATCH}_seqlen${SEQUENCE_LENGTH}_benchmark.engine"
+CHECKPOINTS_DIR="models/fine-tuned/bert_tf_ckpt_${MODEL_VARIANT}_qa_squad2_amp_${SEQUENCE_LENGTH}_v19.03.1"
+SQUAD_DIR="BERT/squad"
+ENGINE_NAME="engines/bert_${MODEL_VARIANT}_${PRECISION}_bs${MAX_BATCH}_seqlen${SEQUENCE_LENGTH}_benchmark.engine"
 # QAT Checkpoint - available only for BERT-Large
-QAT_CHECKPOINT="/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_pyt_onnx_large_qa_squad11_amp_fake_quant_v1/bert_large_v1_1_fake_quant.onnx"
-CUDAGRAPH_PERFBIN="/workspace/TensorRT/demo/BERT/build/perf"
+QAT_CHECKPOINT="models/fine-tuned/bert_pyt_onnx_large_qa_squad11_amp_fake_quant_v1/bert_large_v1_1_fake_quant.onnx"
+CUDAGRAPH_PERFBIN="build/perf"
 
 echo "==== Benchmarking BERT ${MODEL_VARIANT} ${PRECISION} SEQLEN ${SEQUENCE_LENGTH} on ${GPU_ARCH} ===="
 
 if [ ! -f ${ENGINE_NAME} ]; then

From faced5806c8d272b8f71f33dc1dfe5bd32b3c906 Mon Sep 17 00:00:00 2001
From: Rajeev Rao
Date: Mon, 10 May 2021 22:27:36 -0700
Subject: [PATCH 5/5] Update demoBERT C++ inference for triton

Signed-off-by: Rajeev Rao
---
 demo/BERT/infer_c/bert_infer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/demo/BERT/infer_c/bert_infer.h b/demo/BERT/infer_c/bert_infer.h
index 881e70d22..f50776b9c 100644
--- a/demo/BERT/infer_c/bert_infer.h
+++ b/demo/BERT/infer_c/bert_infer.h
@@ -74,7 +74,7 @@ struct BertInference
         exit(-1);
     }
 
-    mEngine = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size(), nullptr));
+    mEngine = TrtUniquePtr<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size()));
     if (mEngine == nullptr)
     {
         gLogError << "Error deserializing CUDA engine\n";
@@ -175,7 +175,7 @@ struct BertInference
     {
         for (int i = 0; i < kBERT_INPUT_NUM; i++)
         {
-            mContext->setBindingDimensions(i + bindingIdxOffset, Dims2(mSeqLength, batchSize));
+            mContext->setBindingDimensions(i + bindingIdxOffset, Dims2(batchSize, mSeqLength));
         }
     }
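
The C++ change mirrors what the Python scripts already do after patch 2: every input binding is set to batch-major (batch, seq) dimensions before enqueueing. A hedged Python sketch of the same runtime contract — `context` is an `IExecutionContext`, the helper name is illustrative, and the three BERT inputs are assumed to occupy the first bindings of the active profile:

```python
def set_bert_input_shapes(context, batch_size, seq_length, num_inputs=3, binding_offset=0):
    # Batch-major (batch, seq), matching Dims2(batchSize, mSeqLength) in
    # bert_infer.h; before this series the order was (seq, batch).
    for i in range(num_inputs):
        context.set_binding_shape(binding_offset + i, (batch_size, seq_length))
    assert context.all_binding_shapes_specified
```
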