Skip to content

Commit ab20a8a

Browse files
authored
Update demoBERT input dimensions to match Triton requirement (#1051)
Signed-off-by: Rajeev Rao <rajeevrao@nvidia.com>
1 parent c9c1327 commit ab20a8a

9 files changed

Lines changed: 61 additions & 58 deletions

File tree

demo/BERT/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,12 @@ This demo BERT application can be run within the TensorRT OSS build container. I
118118

119119
Download the SQuAD v1.1 training and dev datasets.
120120
```bash
121-
sh ./scripts/download_squad.sh
121+
bash ./scripts/download_squad.sh
122122
```
123123

124124
Download the TensorFlow checkpoints for the BERT large model with sequence length 128, fine-tuned for SQuAD v2.0.
125125
```bash
126-
sh scripts/download_model.sh
126+
bash scripts/download_model.sh
127127
```
128128

129129
**Note:** Since the datasets and checkpoints are stored in the directory mounted from the host, they do *not* need to be downloaded each time the container is launched.

demo/BERT/builder.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -528,39 +528,32 @@ def load_onnx_weights_and_quant(path, config):
528528
return weights_dict
529529

530530
def emb_layernorm(builder, network, config, weights_dict, builder_config, sequence_lengths, batch_sizes):
531-
if len(batch_sizes) > 1 or len(sequence_lengths) > 1:
532-
# int8 only support some of the sequence length, we dynamic on sequence length is not allowed.
533-
input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
534-
segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
535-
input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1 if len(sequence_lengths) > 1 else sequence_lengths[0], -1 if len(batch_sizes) > 1 else batch_sizes[0]))
536-
537-
# Specify profiles for the batch sizes we're interested in.
538-
# Make sure the profile also works for all sizes not covered by the previous profile.
539-
prev_batch_size = 0
540-
for batch_size in sorted(batch_sizes):
541-
if len(sequence_lengths) == 1:
542-
min_shape = (sequence_lengths[0], prev_batch_size + 1)
543-
shape = (sequence_lengths[0], batch_size)
531+
# INT8 only supports some sequence lengths, so a dynamic sequence length is not allowed.
532+
input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
533+
segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
534+
input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(-1, -1 if len(sequence_lengths) > 1 else sequence_lengths[0]))
535+
536+
# Specify profiles for the batch sizes we're interested in.
537+
# Make sure the profile also works for all sizes not covered by the previous profile.
538+
539+
for batch_size in sorted(batch_sizes):
540+
if len(sequence_lengths) == 1:
541+
profile = builder.create_optimization_profile()
542+
min_shape = (1, sequence_lengths[0])
543+
shape = (batch_size, sequence_lengths[0])
544+
profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
545+
profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
546+
profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
547+
builder_config.add_optimization_profile(profile)
548+
else:
549+
for sequence_length in sorted(sequence_lengths):
550+
profile = builder.create_optimization_profile()
551+
min_shape = (1, sequence_length)
552+
shape = (batch_size, sequence_length)
544553
profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
545554
profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
546555
profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
547556
builder_config.add_optimization_profile(profile)
548-
else:
549-
prev_sequence_length = 0
550-
for sequence_length in sorted(sequence_lengths):
551-
profile = builder.create_optimization_profile()
552-
min_shape = (prev_sequence_length + 1, prev_batch_size + 1)
553-
shape = (sequence_length, batch_size)
554-
profile.set_shape("input_ids", min=min_shape, opt=shape, max=shape)
555-
profile.set_shape("segment_ids", min=min_shape, opt=shape, max=shape)
556-
profile.set_shape("input_mask", min=min_shape, opt=shape, max=shape)
557-
builder_config.add_optimization_profile(profile)
558-
prev_sequence_length = sequence_length
559-
prev_batch_size = batch_size
560-
else:
561-
input_ids = network.add_input(name="input_ids", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
562-
segment_ids = network.add_input(name="segment_ids", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
563-
input_mask = network.add_input(name="input_mask", dtype=trt.int32, shape=(sequence_lengths[0], batch_sizes[0]))
564557

565558
wbeta = trt.PluginField("bert_embeddings_layernorm_beta", weights_dict["bert_embeddings_layernorm_beta"].numpy(), trt.PluginFieldType.FLOAT32)
566559
wgamma = trt.PluginField("bert_embeddings_layernorm_gamma", weights_dict["bert_embeddings_layernorm_gamma"].numpy(), trt.PluginFieldType.FLOAT32)
@@ -574,7 +567,15 @@ def emb_layernorm(builder, network, config, weights_dict, builder_config, sequen
574567
pfc = trt.PluginFieldCollection([wbeta, wgamma, wwordemb, wtokemb, wposemb, output_fp16, mha_type])
575568
fn = emln_plg_creator.create_plugin("embeddings", pfc)
576569

577-
inputs = [input_ids, segment_ids, input_mask]
570+
input_ids = network.add_shuffle(input_ids)
571+
input_ids.second_transpose = (1, 0)
572+
segment_ids = network.add_shuffle(segment_ids)
573+
segment_ids.second_transpose = (1, 0)
574+
input_mask = network.add_shuffle(input_mask)
575+
input_mask.second_transpose = (1, 0)
576+
inputs = [input_ids.get_output(0),
577+
segment_ids.get_output(0),
578+
input_mask.get_output(0)]
578579
emb_layer = network.add_plugin_v2(inputs, fn)
579580

580581
if config.use_qat:

demo/BERT/infer_c/bert_infer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ struct BertInference
7474
exit(-1);
7575
}
7676

77-
mEngine = TrtUniquePtr<ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size(), nullptr));
77+
mEngine = TrtUniquePtr<ICudaEngine>(runtime->deserializeCudaEngine(bytes.data(), bytes.size()));
7878
if (mEngine == nullptr)
7979
{
8080
gLogError << "Error deserializing CUDA engine\n";
@@ -175,7 +175,7 @@ struct BertInference
175175
{
176176
for (int i = 0; i < kBERT_INPUT_NUM; i++)
177177
{
178-
mContext->setBindingDimensions(i + bindingIdxOffset, Dims2(mSeqLength, batchSize));
178+
mContext->setBindingDimensions(i + bindingIdxOffset, Dims2(batchSize, mSeqLength));
179179
}
180180
}
181181

demo/BERT/inference.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@
234234
" engine.create_execution_context() as context:\n",
235235
"\n",
236236
" # We always use batch size 1.\n",
237-
" input_shape = (max_seq_length, 1)\n",
237+
" input_shape = (1, max_seq_length)\n",
238238
" input_nbytes = trt.volume(input_shape) * trt.int32.itemsize\n",
239239
" \n",
240240
" # Allocate device memory for inputs.\n",
@@ -349,7 +349,7 @@
349349
"name": "python",
350350
"nbconvert_exporter": "python",
351351
"pygments_lexer": "ipython3",
352-
"version": "3.6.9"
352+
"version": "3.8.3"
353353
}
354354
},
355355
"nbformat": 4,

demo/BERT/inference.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ def question_features(tokens, question):
130130
num_binding_per_profile = engine.num_bindings // engine.num_optimization_profiles
131131
for idx in range(engine.num_optimization_profiles):
132132
profile_shape = engine.get_profile_shape(profile_index = idx, binding = idx * num_binding_per_profile)
133-
if profile_shape[0][1] <= args.batch_size and profile_shape[2][1] >= args.batch_size and profile_shape[0][0] <= max_seq_length and profile_shape[2][0] >= max_seq_length:
133+
if profile_shape[0][0] <= args.batch_size and profile_shape[2][0] >= args.batch_size and profile_shape[0][1] <= max_seq_length and profile_shape[2][1] >= max_seq_length:
134134
selected_profile = idx
135135
break
136136
if selected_profile == -1:
@@ -141,7 +141,7 @@ def question_features(tokens, question):
141141

142142
# Specify input shapes. These must be within the min/max bounds of the active profile
143143
# Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape.
144-
input_shape = (max_seq_length, args.batch_size)
144+
input_shape = (args.batch_size, max_seq_length)
145145
input_nbytes = trt.volume(input_shape) * trt.int32.itemsize
146146
for binding in range(3):
147147
context.set_binding_shape(binding_idx_offset + binding, input_shape)
@@ -168,9 +168,9 @@ def inference(features, tokens):
168168
eval_time_elapsed = 0
169169
for feature_index, feature in enumerate(features):
170170
# Copy inputs
171-
input_ids_batch = np.dstack([feature.input_ids] * args.batch_size).squeeze()
172-
segment_ids_batch = np.dstack([feature.segment_ids] * args.batch_size).squeeze()
173-
input_mask_batch = np.dstack([feature.input_mask] * args.batch_size).squeeze()
171+
input_ids_batch = np.repeat(np.expand_dims(feature.input_ids, 0), args.batch_size, axis=0)
172+
segment_ids_batch = np.repeat(np.expand_dims(feature.segment_ids, 0), args.batch_size, axis=0)
173+
input_mask_batch = np.repeat(np.expand_dims(feature.input_mask, 0), args.batch_size, axis=0)
174174

175175
input_ids = cuda.register_host_memory(np.ascontiguousarray(input_ids_batch.ravel()))
176176
segment_ids = cuda.register_host_memory(np.ascontiguousarray(segment_ids_batch.ravel()))

demo/BERT/perf.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def main():
5252

5353
with open(args.engine, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(f.read()) as engine, engine.create_execution_context() as context:
5454
# Allocate buffers large enough to store the largest batch size
55-
max_input_shape = (args.sequence_length, max(args.batch_size))
56-
max_output_shape = (args.sequence_length, max(args.batch_size), 2, 1, 1)
55+
max_input_shape = (max(args.batch_size), args.sequence_length)
56+
max_output_shape = (max(args.batch_size), args.sequence_length, 2, 1, 1)
5757
buffers = [
5858
DeviceBuffer(max_input_shape),
5959
DeviceBuffer(max_input_shape),
@@ -65,9 +65,9 @@ def main():
6565
pseudo_vocab_size = 30522
6666
pseudo_type_vocab_size = 2
6767
np.random.seed(args.random_seed)
68-
test_word_ids = np.random.randint(0, pseudo_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
69-
test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, (args.sequence_length, max(args.batch_size)), dtype=np.int32)
70-
test_input_mask = np.ones((args.sequence_length, max(args.batch_size)), dtype=np.int32)
68+
test_word_ids = np.random.randint(0, pseudo_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32)
69+
test_segment_ids = np.random.randint(0, pseudo_type_vocab_size, (max(args.batch_size), args.sequence_length), dtype=np.int32)
70+
test_input_mask = np.ones((max(args.batch_size), args.sequence_length), dtype=np.int32)
7171

7272
# Copy input h2d
7373
cuda.memcpy_htod(buffers[0].buf, test_word_ids.ravel())
@@ -86,9 +86,9 @@ def main():
8686
bindings = [0] * binding_idx_offset + [buf.binding() for buf in buffers]
8787

8888
shapes = {
89-
"input_ids": (args.sequence_length, batch_size),
90-
"segment_ids": (args.sequence_length, batch_size),
91-
"input_mask": (args.sequence_length, batch_size),
89+
"input_ids": (batch_size, args.sequence_length),
90+
"segment_ids": (batch_size, args.sequence_length),
91+
"input_mask": (batch_size, args.sequence_length),
9292
}
9393

9494
for binding, shape in shapes.items():

demo/BERT/scripts/download_model.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ do
5151
done
5252

5353
# Prepare the download directory
54-
mkdir -p /workspace/TensorRT/demo/BERT/models/fine-tuned
55-
cd /workspace/TensorRT/demo/BERT/models/fine-tuned
54+
mkdir -p models/fine-tuned
55+
pushd models/fine-tuned
5656

5757
# Download the BERT fine-tuned model
5858
echo "Downloading BERT-${FW} ${MODEL} checkpoints for sequence length ${SEQ_LEN} and fine-tuned for SQuAD ${SQUAD}."
@@ -78,3 +78,4 @@ if [ -n "$CKPT" ]; then
7878
ngc registry model download-version nvidia/${CKPT}:${CKPT_VERSION}
7979
fi
8080
fi
81+
popd

demo/BERT/scripts/download_squad.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ done
3737

3838
# Download the SQuAD training and dev datasets
3939
echo "Downloading SQuAD-${VERSION} training and dev datasets"
40-
mkdir -p /workspace/TensorRT/demo/BERT/squad
41-
cd /workspace/TensorRT/demo/BERT/squad
40+
mkdir -p squad
41+
pushd squad
4242
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-${VERSION}.json
4343
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-${VERSION}.json
44+
popd

demo/BERT/scripts/inference_benchmark.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,12 @@ SEQUENCE_LENGTH="${4}"
2424
MAX_BATCH="${5}"
2525
GPU_ARCH="${6}"
2626

27-
CHECKPOINTS_DIR="/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_tf_ckpt_${MODEL_VARIANT}_qa_squad2_amp_${SEQUENCE_LENGTH}_v19.03.1"
28-
SQUAD_DIR="/workspace/TensorRT/demo/BERT/squad"
29-
ENGINE_NAME="/workspace/TensorRT/demo/BERT/engines/bert_${MODEL_VARIANT}_${PRECISION}_bs${MAX_BATCH}_seqlen${SEQUENCE_LENGTH}_benchmark.engine"
27+
CHECKPOINTS_DIR="models/fine-tuned/bert_tf_ckpt_${MODEL_VARIANT}_qa_squad2_amp_${SEQUENCE_LENGTH}_v19.03.1"
28+
SQUAD_DIR="BERT/squad"
29+
ENGINE_NAME="engines/bert_${MODEL_VARIANT}_${PRECISION}_bs${MAX_BATCH}_seqlen${SEQUENCE_LENGTH}_benchmark.engine"
3030
# QAT Checkpoint - available only for BERT-Large
31-
QAT_CHECKPOINT="/workspace/TensorRT/demo/BERT/models/fine-tuned/bert_pyt_onnx_large_qa_squad11_amp_fake_quant_v1/bert_large_v1_1_fake_quant.onnx"
32-
CUDAGRAPH_PERFBIN="/workspace/TensorRT/demo/BERT/build/perf"
31+
QAT_CHECKPOINT="models/fine-tuned/bert_pyt_onnx_large_qa_squad11_amp_fake_quant_v1/bert_large_v1_1_fake_quant.onnx"
32+
CUDAGRAPH_PERFBIN="build/perf"
3333

3434
echo "==== Benchmarking BERT ${MODEL_VARIANT} ${PRECISION} SEQLEN ${SEQUENCE_LENGTH} on ${GPU_ARCH} ===="
3535
if [ ! -f ${ENGINE_NAME} ]; then

0 commit comments

Comments
 (0)