yoloserv/modules/paravision/liveness/tensorrt/utils.py

from collections import defaultdict

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt

# Per-binding numpy dtypes: float32 everywhere except the "NMS_1" output,
# which carries integer detection counts.
DTYPES = defaultdict(lambda: np.float32)
DTYPES["NMS_1"] = np.int32


def GiB(val):
    """Convert a size in gibibytes to bytes."""
    # Parenthesized for clarity: the original `val * 1 << 30` gave the same
    # result only because `*` binds tighter than `<<`, and failed for floats.
    return val * (1 << 30)
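# Usage sketch (an assumption, not from this module: the pre-TensorRT-8
# builder API, where the workspace limit is set directly on the builder):
#
#     builder.max_workspace_size = GiB(1)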
def do_inference(
    context, bindings, inputs, input_data, outputs, output_data, stream, batch_size=1
):
    """Run one asynchronous inference pass and return the host output arrays."""
    # Copy each host input array into its device buffer.
    for device_input, host_array in zip(inputs, input_data):
        cuda.memcpy_htod_async(device_input, host_array, stream)
    # Enqueue execution on the stream (implicit-batch API).
    context.execute_async(
        bindings=bindings, stream_handle=stream.handle, batch_size=batch_size
    )
    # Queue device-to-host copies into page-locked buffers, sized from the
    # (size, dtype) metadata recorded by allocate_buffers.
    data = []
    for device_output, (size, dtype) in zip(outputs, output_data):
        host_output = cuda.pagelocked_zeros(batch_size * size, dtype=dtype)
        cuda.memcpy_dtoh_async(host_output, device_output, stream)
        data.append(host_output)
    # Block until the transfers and the execution have completed.
    stream.synchronize()
    return data
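# Variant sketch (an assumption, not used by this module): engines built with
# an explicit batch dimension ignore batch_size and are enqueued with
# execute_async_v2 instead:
#
#     context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)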
def allocate_buffers(engine):
    """Allocate a device buffer for every engine binding.

    Returns (inputs, outputs, data, bindings), where `data` holds a
    (size, dtype) tuple per output so do_inference can allocate matching
    page-locked host buffers.
    """
    inputs = []
    outputs = []
    data = []
    bindings = []
    for binding in engine:
        shape = engine.get_binding_shape(binding)
        size = trt.volume(shape)
        dtype = DTYPES[str(binding)]
        # Device allocation sized for the engine's maximum batch.
        device_mem = cuda.mem_alloc(size * engine.max_batch_size * np.dtype(dtype).itemsize)
        # TensorRT takes the raw device pointers, in binding order.
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(device_mem)
        else:
            outputs.append(device_mem)
            # Record metadata for outputs only, keeping `data` aligned with
            # `outputs` as do_inference expects.
            data.append((size, dtype))
    return inputs, outputs, data, bindings
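
# Usage sketch, not part of the original module: a hedged example of the call
# pattern under the implicit-batch TensorRT API. The engine path and the
# zeroed input frame are hypothetical placeholders.
#
#     import pycuda.autoinit  # noqa: F401  (creates a default CUDA context)
#
#     with open("liveness.engine", "rb") as f:
#         runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
#         engine = runtime.deserialize_cuda_engine(f.read())
#     context = engine.create_execution_context()
#
#     inputs, outputs, output_data, bindings = allocate_buffers(engine)
#     stream = cuda.Stream()
#
#     frame = np.zeros(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)
#     results = do_inference(
#         context, bindings, inputs, [frame], outputs, output_data, stream
#     )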