Stereo Vision Hardware Function Optimization
The code for the existing hardware function stereo_remap_bm is shown below
with the optimization pragmas highlighted. Before reviewing the optimization directives, there
are a few things to note about the function.
- The hardware function contains the sub-functions readLRinput and writeDispOut, which have also been optimized.
- The hardware function also uses pre-optimized functions, prefixed with the namespace hls, from the Vivado HLS video library hls_video.h. These sub-functions use their own data type, hls::Mat.
#include "hls_video.h"
#include "top.h"
#include "transform.h"
// Splits a side-by-side input frame into a left and a right single-channel stream.
//
//   inLR       source buffer; element (row, col) is read from inLR[row*stride + col]
//   img_l      receives the low byte of every sample with col < width
//   img_r      receives the low byte of every sample with width <= col < dual_width
//   height     number of rows to process
//   dual_width first column that belongs to neither image
//   width      first column that belongs to the right image
//   stride     number of samples read per row
//
// Exactly height*stride samples are read from inLR; samples at columns
// >= dual_width are fetched and then dropped.
void readLRinput (yuv_t *inLR,
                  hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1>& img_l,
                  hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1>& img_r,
                  int height, int dual_width, int width, int stride)
{
    for (int row = 0; row < height; ++row) {
#pragma HLS loop_tripcount min=1080 max=1080 avg=1080
        for (int col = 0; col < stride; ++col) {
#pragma HLS loop_tripcount min=1920 max=1920 avg=1920
#pragma HLS PIPELINE
            // The read is unconditional so the whole row is consumed in order.
            yuv_t sample = inLR[row * stride + col];
            if (col < width) {
                // Left half of the row: keep only the low byte.
                img_l.write(sample & 0x00FF);
            } else if (col < dual_width) {
                // Right half of the row: keep only the low byte.
                img_r.write(sample & 0x00FF);
            }
        }
    }
}
// Drains the single-channel stream img_d into the output buffer dst.
//
//   img_d   source stream; one pixel is read for every (row, col) with col < width
//   dst     destination buffer; element (row, col) is written to dst[row*stride + col]
//   height  number of rows to write
//   width   number of columns per row that carry a pixel from img_d
//   stride  number of samples written per row
//
// Every written sample has 0x8000 OR-ed in; columns at or beyond width are
// written as 0x8000 alone.
void writeDispOut(hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1>& img_d,
                  yuv_t *dst,
                  int height, int width, int stride)
{
    for (int row = 0; row < height; ++row) {
#pragma HLS loop_tripcount min=1080 max=1080 avg=1080
        for (int col = 0; col < stride; ++col) {
#pragma HLS loop_tripcount min=960 max=960 avg=960
#pragma HLS PIPELINE
            // Start from the padding value and overlay a pixel when one exists.
            yuv_t outData = (yuv_t) 0x8000;
            if (col < width) {
                pix_t tmpOut = img_d.read().val[0];
                outData = ((yuv_t) 0x8000) | ((yuv_t) tmpOut);
            }
            dst[row * stride + col] = outData;
        }
    }
}
namespace hls {

// Converts a signed 16-bit single-channel stream into an unsigned 8-bit one.
//
//   src  input stream; src.rows * src.cols pixels are consumed
//   dst  output stream; one pixel is produced per consumed input pixel
//
// Each output pixel is the absolute value of the input, shifted right by one
// bit and truncated to unsigned char.
void SaveAsGray(
    Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16SC1>& src,
    Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1>& dst)
{
    int rows = src.rows;
    int cols = src.cols;
    for (int r = 0; r < rows; r++) {
#pragma HLS loop_tripcount min=1080 max=1080 avg=1080
        for (int c = 0; c < cols; c++) {
#pragma HLS loop_tripcount min=960 max=960 avg=960
#pragma HLS pipeline II=1
            Scalar<1, short> in_px;
            Scalar<1, unsigned char> out_px;
            src >> in_px;
            short magnitude = static_cast<short>(abs(static_cast<int>(in_px.val[0])));
            // Scale to avoid overflow. The right scaling here for a
            // good picture depends on the NDISP parameter during
            // block matching.
            // (Alternative without the abs(): shift in_px.val[0] directly.)
            out_px.val[0] = static_cast<unsigned char>(magnitude >> 1);
            dst << out_px;
        }
    }
}

} // namespace hls
// Stereo pipeline body: reads a side-by-side frame, remaps both halves,
// runs block matching on the remapped pair and writes the result out.
//
// All stages are connected through local hls::Mat objects and the function
// body is marked with the HLS DATAFLOW pragma.
//
//   img_data_lr    input buffer handed to readLRinput
//   img_data_disp  output buffer handed to writeDispOut
//   lcameraMA_l/r, lirA_l/r, ldistC_l/r
//                  per-side parameters forwarded unchanged to
//                  hls::InitUndistortRectifyMapInverse
//   height, dual_width, stride_in, stride_out
//                  frame geometry; each image is dual_width/2 columns wide
//
// Always returns 0.
int stereo_remap_bm_new(
yuv_t *img_data_lr,
yuv_t *img_data_disp,
hls::Window<3, 3, param_T > &lcameraMA_l,
hls::Window<3, 3, param_T > &lcameraMA_r,
hls::Window<3, 3, param_T > &lirA_l,
hls::Window<3, 3, param_T > &lirA_r,
param_T (&ldistC_l)[5],
param_T (&ldistC_r)[5],
int height, // 1080
int dual_width, // 1920 (two 960x1080 images side by side)
int stride_in, // 1920 (two 960x1080 images side by side)
int stride_out) // 960
{
int width = dual_width/2; // 960
#pragma HLS DATAFLOW
// Intermediate images passed between the stages below.
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1> img_l(height, width);
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1> img_r(height, width);
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1> img_l_remap(height, width); // remapped left image
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1> img_r_remap(height, width); // remapped right image
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_8UC1> img_d(height, width);
// Remap lookup maps produced by InitUndistortRectifyMapInverse, one pair per side.
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16SC2> map1_l(height, width);
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16SC2> map1_r(height, width);
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16UC2> map2_l(height, width);
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16UC2> map2_r(height, width);
// Signed 16-bit output of the block matcher, converted to 8 bits by SaveAsGray.
hls::Mat<IMG_HEIGHT, IMG_WIDTH, HLS_16SC1> img_disp(height, width);
// NOTE(review): template arguments are presumably window size and NDISP
// settings of the block matcher - confirm against hls_video.h.
hls::StereoBMState<15, 32, 32> state;
// ddr -> kernel streams: extract luma from left and right yuv images
// store it in single channel HLS_8UC1 left and right Mat's
readLRinput (img_data_lr, img_l, img_r, height, dual_width, width, stride_in);
//////////////////////// remap left and right images, all types are HLS_8UC1 //////////
hls::InitUndistortRectifyMapInverse(lcameraMA_l, ldistC_l, lirA_l, map1_l, map2_l);
hls::Remap<8>(img_l, img_l_remap, map1_l, map2_l, HLS_INTER_LINEAR);
hls::InitUndistortRectifyMapInverse(lcameraMA_r, ldistC_r, lirA_r, map1_r, map2_r);
hls::Remap<8>(img_r, img_r_remap, map1_r, map2_r, HLS_INTER_LINEAR);
////////// find disparity of remapped images //////////
hls::FindStereoCorrespondenceBM(img_l_remap, img_r_remap, img_disp, state);
// Reduce the 16-bit result to an 8-bit image for output.
hls::SaveAsGray(img_disp, img_d);
// kernel stream -> ddr : output single wide
writeDispOut (img_d, img_data_disp, height, width, stride_out);
return 0;
}
// Wrapper exposing only plain pointer and int arguments.
//
// Copies the calibration tables cameraMA_l/r, irA_l/r and distC_l/r (defined
// outside this function) into local hls::Window objects and arrays, then
// forwards everything to stereo_remap_bm_new and returns its result.
int stereo_remap_bm(
    yuv_t *img_data_lr,
    yuv_t *img_data_disp,
    int height,      // 1080
    int dual_width,  // 1920 (two 960x1080 images side by side)
    int stride_in,   // 1920 (two 960x1080 images side by side)
    int stride_out)  // 960
{
    //1920*1080
    //#pragma HLS interface m_axi port=img_data_lr depth=2073600
    //#pragma HLS interface m_axi port=img_data_disp depth=2073600
    hls::Window<3, 3, param_T > lcameraMA_l;
    hls::Window<3, 3, param_T > lcameraMA_r;
    hls::Window<3, 3, param_T > lirA_l;
    hls::Window<3, 3, param_T > lirA_r;
    param_T ldistC_l[5];
    param_T ldistC_r[5];

    // The 3x3 tables are stored flat, row after row.
    for (int r = 0; r < 3; r++) {
        for (int c = 0; c < 3; c++) {
            int flat = r*3 + c;
            lcameraMA_l.val[r][c] = cameraMA_l[flat];
            lirA_l.val[r][c] = irA_l[flat];
            lcameraMA_r.val[r][c] = cameraMA_r[flat];
            lirA_r.val[r][c] = irA_r[flat];
        }
    }

    // Five coefficients per side.
    for (int k = 0; k < 5; k++) {
        ldistC_l[k] = distC_l[k];
        ldistC_r[k] = distC_r[k];
    }

    return stereo_remap_bm_new(img_data_lr,
                               img_data_disp,
                               lcameraMA_l,
                               lcameraMA_r,
                               lirA_l,
                               lirA_r,
                               ldistC_l,
                               ldistC_r,
                               height,
                               dual_width,
                               stride_in,
                               stride_out);
}
As noted in Hardware Function Optimization Methodology, the primary optimization directives used are the PIPELINE and DATAFLOW directives. In addition, the LOOP_TRIPCOUNT directive is used.
In keeping with the recommendations for optimizing hardware functions which process frames of data, the PIPELINE directives are all applied to for-loops that process data at the sample level, or in this case, the pixel level. This ensures hardware pipelining is used to achieve the highest performing design.
The LOOP_TRIPCOUNT directives are used on for-loops for which the upper bound of the loop index is defined by a variable, the exact value of which is unknown at compile time. The estimated tripcount, or loop iteration count, allows the reports generated by Vivado HLS to include expected values for latency and initiation interval (II) instead of unknowns. This directive has no impact on the hardware created—it only impacts reporting.
The top-level function stereo_remap_bm is composed of the optimized
sub-functions and a number of functions from the Vivado HLS video library
(hls_video.h). Details of the library functions provided by Vivado HLS
are provided in Vivado Design Suite User Guide: High-Level Synthesis (UG902). The functions provided in the Vivado HLS video
library are already pre-optimized and contain all the optimization directives to ensure they
are implemented with the highest possible performance. The top-level function is therefore
composed of sub-functions that are all optimized, and it only requires the DATAFLOW directive
to ensure each sub-function starts to execute in hardware as soon as data becomes
available.
// Abbreviated excerpt: the argument list and local declarations are omitted
// so that only the DATAFLOW directive and the sequence of calls are shown.
int stereo_remap_bm(..) {
#pragma HLS DATAFLOW
readLRinput (img_data_lr, img_l, img_r, height, dual_width, width, stride);
hls::InitUndistortRectifyMapInverse(lcameraMA_l, ldistC_l, lirA_l, map1_l, map2_l);
hls::Remap<8>(img_l, img_l_remap, map1_l, map2_l, HLS_INTER_LINEAR);
hls::InitUndistortRectifyMapInverse(lcameraMA_r, ldistC_r, lirA_r, map1_r, map2_r);
hls::Remap<8>(img_r, img_r_remap, map1_r, map2_r, HLS_INTER_LINEAR);
hls::Duplicate(img_l_remap, img_l_remap_bm, img_l_remap_pt);
hls::FindStereoCorrespondenceBM(img_l_remap_bm, img_r_remap, img_disp, state);
hls::SaveAsGray(img_disp, img_d);
writeDispOut (img_l_remap_pt, img_d, img_data_disp, height, dual_width, width, stride);
}
In general, the DATAFLOW optimization is not required because the SDSoC environment
automatically ensures data is passed from one hardware function to the next as soon as it
becomes available. However, in this example the functions within
stereo_remap_bm are using a Vivado HLS data type
hls::stream which cannot be compiled on the ARM processor and cannot be
used in the hardware function interface in the SDSoC environment. For this reason, the
top-level hardware function must be stereo_remap_bm and thus, the DATAFLOW
directive is used to achieve high-performance transfers between the sub-functions. If this
were not the case, the DATAFLOW directive could be removed and each sub-function within
stereo_remap_bm could be specified as a hardware function.
The hardware functions in this design example use the data type Mat which is based on the
Vivado HLS data type hls::stream. The hls::stream data type
can only be accessed in a sequential manner. Data is pushed on and popped off.
- In software simulation, the hls::stream data type has infinite size.
- In hardware, the hls::stream data type is implemented as a single register and can only store one data value at a time, because it is expected that the streaming data is consumed before the previous value is overwritten.
By specifying the top-level function stereo_remap_bm as the hardware
function, the effects of these hardware types can be ignored in the software environment.
However, when these functions are incorporated into the SDSoC environment, they cannot be
compiled on the ARM processor, and the system can only be verified through hardware emulation,
executing on the target platform, or both.
The hls::stream data type is designed for use within
Vivado HLS, but is unsuitable for running software on embedded CPUs. Therefore, this type
should not be part of the top-level hardware function interface. If any of the arguments of the hardware function use any Vivado HLS specific data types, the function must be enclosed by a top-level C/C++ wrapper function that exposes only native C/C++ types in the function argument list.