Y
yuning he
Guest
Hello, here is my question:
Purpose: implement face detection on a Zynq-7020 SoC
Platform: Zedboard with OV5640 camera
Completed work: capturing video from camera, writing into DDR for storage and reading from DDR for display
Question: how can I implement a face detection IP whose throughput reaches 30 fps (at 320*240 pixels)?
Here is what I have done so far:
Based on the Viola-Jones algorithm, I am using an HLS (high-level synthesis) tool to generate a hardware IP from a C++ design.
And this is my reference: https://github.com/cornell-zhang/facedetect-fpga
I have simulated and synthesized it into a hardware IP, but its throughput does not reach the goal because the interval and latency are very large (latency: 338 (min) to 576593236 (max); interval: 336 (min) to 142310514002 (max)).
Looking into the code, I found that the latency is mainly caused by the following for loops, but I don't know how to reduce the latency while keeping an acceptable trade-off with area.
So it would help me a lot if you could answer the following:
1. Is there another way to implement face detection on the Zynq-7020?
2. How can I measure the throughput of my system, and how does the real throughput relate to the synthesis result?
3. Is there any way to optimize the following for loops?
Looking forward to your reply. Please feel free to contact me at any time.
Thanks.
----loop1:
/* Fill IMG1_data from Data. For every (i, j) inside the w2 x h2 region, copy the
 * Data element whose row/column is the destination index multiplied by the ratio
 * and shifted right by 16 bits. Positions outside that region are left untouched.
 * NOTE(review): y_ratio / x_ratio look like fixed-point factors scaled by 2^16 - confirm.
 * NOTE: the row subscript [i] of IMG1_data is written out here; the forum's markup
 * parser had swallowed it as an italic tag, leaving "IMG1_data[j]". */
imageScalerL1: for ( i = 0 ; i < IMAGE_HEIGHT ; i++ ){
  imageScalerL1_1: for ( j = 0 ; j < IMAGE_WIDTH ; j++ ){
    #pragma HLS pipeline
    if ( j < w2 && i < h2 )
      IMG1_data[i][j] = Data[(i*y_ratio)>>16][(j*x_ratio)>>16];
  }
}
----loop2:
/* Raster scan over the sum_row x sum_col image. Each iteration updates the integral
 * windows (II, SII), shifts the window buffers (I, SI) left by one column, loads a new
 * last column from the line buffer (L) and the current pixel, and - once enough elements
 * have been consumed - runs the cascade classifier on the current window.
 * NOTE: the first subscripts [u] and [i] are written out below; the forum's markup parser
 * had swallowed them as underline/italic tags (e.g. "II[v]" instead of "II[u][v]"). */
Pixely: for( y = 0; y < sum_row; y++ ){
  Pixelx: for ( x = 0; x < sum_col; x++ ){
    /* Updates for Integral Image Window Buffer (II) */
    SetIIu: for ( u = 0; u < WINDOW_SIZE; u++ ){
      #pragma HLS unroll
      SetIIj: for ( v = 0; v < WINDOW_SIZE; v++ ){
        #pragma HLS unroll
        /* Add the column entering the window and subtract the one leaving it (row u). */
        II[u][v] = II[u][v] + ( I[u][v+1] - I[u][0] );
      }
    }
    /* Updates for Square Image Window Buffer (SI): only the four corner sums are kept */
    SII[0][0] = SII[0][0] + ( SI[0][1] - SI[0][0] );
    SII[0][1] = SII[0][1] + ( SI[0][WINDOW_SIZE] - SI[0][0] );
    SII[1][0] = SII[1][0] + ( SI[WINDOW_SIZE-1][1] - SI[WINDOW_SIZE-1][0] );
    SII[1][1] = SII[1][1] + ( SI[WINDOW_SIZE-1][WINDOW_SIZE] - SI[WINDOW_SIZE-1][0] );
    /* Updates for Image Window Buffer (I) and Square Image Window Buffer (SI) */
    SetIj: for( j = 0; j < 2*WINDOW_SIZE-1; j++ ){
      #pragma HLS unroll
      SetIi: for( i = 0; i < WINDOW_SIZE; i++ ){
        #pragma HLS unroll
        if( i+j != 2*WINDOW_SIZE-1 ){
          /* Plain shift: take the value from the column to the right. */
          I[i][j] = I[i][j+1];
          SI[i][j] = SI[i][j+1];
        }
        else if ( i > 0 ){
          /* On the i+j == 2*WINDOW_SIZE-1 diagonal, also accumulate the element
           * from the row above in the column to the right. */
          I[i][j] = I[i][j+1] + I[i-1][j+1];
          SI[i][j] = SI[i][j+1] + SI[i-1][j+1];
        }
      }
    }
    // Last column of the I[][] and SI[][] matrix: rows 0..WINDOW_SIZE-2 come from the line buffer
    Ilast: for( i = 0; i < WINDOW_SIZE-1; i++ ){
      #pragma HLS unroll
      I[i][2*WINDOW_SIZE-1] = L[i][x];
      SI[i][2*WINDOW_SIZE-1] = L[i][x]*L[i][x];
    }
    /* The bottom row of the last column takes the current pixel and its square. */
    I[WINDOW_SIZE-1][2*WINDOW_SIZE-1] = IMG1_data[y][x];
    SI[WINDOW_SIZE-1][2*WINDOW_SIZE-1] = IMG1_data[y][x]*IMG1_data[y][x];
    /** Updates for Image Line Buffer (L): shift column x up by one row, append the current pixel **/
    LineBuf: for( k = 0; k < WINDOW_SIZE-2; k++ ){
      #pragma HLS unroll
      L[k][x] = L[k+1][x];
    }
    L[WINDOW_SIZE-2][x] = IMG1_data[y][x];
    /* Pass the Integral Image Window buffer through Cascaded Classifier. Only pass
     * when the integral image window buffer has flushed out the initial garbage data */
    if ( element_counter >= ( ( (WINDOW_SIZE-1)*sum_col + WINDOW_SIZE ) + WINDOW_SIZE -1 ) ) {
      /* Sliding Window should not go beyond the boundary */
      if ( x_index < ( sum_col - (WINDOW_SIZE-1) ) && y_index < ( sum_row - (WINDOW_SIZE-1) ) ){
        p.x = x_index;
        p.y = y_index;
        result = cascadeClassifier ( p, II, SII );
        if ( result > 0 )
        {
          /* Record the hit: window origin multiplied by factor and rounded, plus the window size. */
          MyRect r = {myRound(p.x*factor), myRound(p.y*factor), winSize.width, winSize.height};
          AllCandidates_x[*AllCandidates_size]=r.x;
          AllCandidates_y[*AllCandidates_size]=r.y;
          AllCandidates_w[*AllCandidates_size]=r.width;
          AllCandidates_h[*AllCandidates_size]=r.height;
          *AllCandidates_size=*AllCandidates_size+1;
        }
      }// inner if
      /* Advance the window origin in raster order, wrapping to the next row at the end of a line. */
      if ( x_index < sum_col-1 )
        x_index = x_index + 1;
      else{
        x_index = 0;
        y_index = y_index + 1;
      }
    } // outer if
    element_counter +=1;
  }
}
Purpose: implement face detection on a Zynq-7020 SoC
Platform: Zedboard with OV5640 camera
Completed work: capturing video from camera, writing into DDR for storage and reading from DDR for display
Question: how can I implement a face detection IP whose throughput reaches 30 fps (at 320*240 pixels)?
Here is what I have done so far:
Based on the Viola-Jones algorithm, I am using an HLS (high-level synthesis) tool to generate a hardware IP from a C++ design.
And this is my reference: https://github.com/cornell-zhang/facedetect-fpga
I have simulated and synthesized it into a hardware IP, but its throughput does not reach the goal because the interval and latency are very large (latency: 338 (min) to 576593236 (max); interval: 336 (min) to 142310514002 (max)).
Looking into the code, I found that the latency is mainly caused by the following for loops, but I don't know how to reduce the latency while keeping an acceptable trade-off with area.
So it would help me a lot if you could answer the following:
1. Is there another way to implement face detection on the Zynq-7020?
2. How can I measure the throughput of my system, and how does the real throughput relate to the synthesis result?
3. Is there any way to optimize the following for loops?
Looking forward to your reply. Please feel free to contact me at any time.
Thanks.
----loop1:
/* Fill IMG1_data from Data. For every (i, j) inside the w2 x h2 region, copy the
 * Data element whose row/column is the destination index multiplied by the ratio
 * and shifted right by 16 bits. Positions outside that region are left untouched.
 * NOTE(review): y_ratio / x_ratio look like fixed-point factors scaled by 2^16 - confirm.
 * NOTE: the row subscript [i] of IMG1_data is written out here; the forum's markup
 * parser had swallowed it as an italic tag, leaving "IMG1_data[j]". */
imageScalerL1: for ( i = 0 ; i < IMAGE_HEIGHT ; i++ ){
  imageScalerL1_1: for ( j = 0 ; j < IMAGE_WIDTH ; j++ ){
    #pragma HLS pipeline
    if ( j < w2 && i < h2 )
      IMG1_data[i][j] = Data[(i*y_ratio)>>16][(j*x_ratio)>>16];
  }
}
----loop2:
/* Raster scan over the sum_row x sum_col image. Each iteration updates the integral
 * windows (II, SII), shifts the window buffers (I, SI) left by one column, loads a new
 * last column from the line buffer (L) and the current pixel, and - once enough elements
 * have been consumed - runs the cascade classifier on the current window.
 * NOTE: the first subscripts [u] and [i] are written out below; the forum's markup parser
 * had swallowed them as underline/italic tags (e.g. "II[v]" instead of "II[u][v]"). */
Pixely: for( y = 0; y < sum_row; y++ ){
  Pixelx: for ( x = 0; x < sum_col; x++ ){
    /* Updates for Integral Image Window Buffer (II) */
    SetIIu: for ( u = 0; u < WINDOW_SIZE; u++ ){
      #pragma HLS unroll
      SetIIj: for ( v = 0; v < WINDOW_SIZE; v++ ){
        #pragma HLS unroll
        /* Add the column entering the window and subtract the one leaving it (row u). */
        II[u][v] = II[u][v] + ( I[u][v+1] - I[u][0] );
      }
    }
    /* Updates for Square Image Window Buffer (SI): only the four corner sums are kept */
    SII[0][0] = SII[0][0] + ( SI[0][1] - SI[0][0] );
    SII[0][1] = SII[0][1] + ( SI[0][WINDOW_SIZE] - SI[0][0] );
    SII[1][0] = SII[1][0] + ( SI[WINDOW_SIZE-1][1] - SI[WINDOW_SIZE-1][0] );
    SII[1][1] = SII[1][1] + ( SI[WINDOW_SIZE-1][WINDOW_SIZE] - SI[WINDOW_SIZE-1][0] );
    /* Updates for Image Window Buffer (I) and Square Image Window Buffer (SI) */
    SetIj: for( j = 0; j < 2*WINDOW_SIZE-1; j++ ){
      #pragma HLS unroll
      SetIi: for( i = 0; i < WINDOW_SIZE; i++ ){
        #pragma HLS unroll
        if( i+j != 2*WINDOW_SIZE-1 ){
          /* Plain shift: take the value from the column to the right. */
          I[i][j] = I[i][j+1];
          SI[i][j] = SI[i][j+1];
        }
        else if ( i > 0 ){
          /* On the i+j == 2*WINDOW_SIZE-1 diagonal, also accumulate the element
           * from the row above in the column to the right. */
          I[i][j] = I[i][j+1] + I[i-1][j+1];
          SI[i][j] = SI[i][j+1] + SI[i-1][j+1];
        }
      }
    }
    // Last column of the I[][] and SI[][] matrix: rows 0..WINDOW_SIZE-2 come from the line buffer
    Ilast: for( i = 0; i < WINDOW_SIZE-1; i++ ){
      #pragma HLS unroll
      I[i][2*WINDOW_SIZE-1] = L[i][x];
      SI[i][2*WINDOW_SIZE-1] = L[i][x]*L[i][x];
    }
    /* The bottom row of the last column takes the current pixel and its square. */
    I[WINDOW_SIZE-1][2*WINDOW_SIZE-1] = IMG1_data[y][x];
    SI[WINDOW_SIZE-1][2*WINDOW_SIZE-1] = IMG1_data[y][x]*IMG1_data[y][x];
    /** Updates for Image Line Buffer (L): shift column x up by one row, append the current pixel **/
    LineBuf: for( k = 0; k < WINDOW_SIZE-2; k++ ){
      #pragma HLS unroll
      L[k][x] = L[k+1][x];
    }
    L[WINDOW_SIZE-2][x] = IMG1_data[y][x];
    /* Pass the Integral Image Window buffer through Cascaded Classifier. Only pass
     * when the integral image window buffer has flushed out the initial garbage data */
    if ( element_counter >= ( ( (WINDOW_SIZE-1)*sum_col + WINDOW_SIZE ) + WINDOW_SIZE -1 ) ) {
      /* Sliding Window should not go beyond the boundary */
      if ( x_index < ( sum_col - (WINDOW_SIZE-1) ) && y_index < ( sum_row - (WINDOW_SIZE-1) ) ){
        p.x = x_index;
        p.y = y_index;
        result = cascadeClassifier ( p, II, SII );
        if ( result > 0 )
        {
          /* Record the hit: window origin multiplied by factor and rounded, plus the window size. */
          MyRect r = {myRound(p.x*factor), myRound(p.y*factor), winSize.width, winSize.height};
          AllCandidates_x[*AllCandidates_size]=r.x;
          AllCandidates_y[*AllCandidates_size]=r.y;
          AllCandidates_w[*AllCandidates_size]=r.width;
          AllCandidates_h[*AllCandidates_size]=r.height;
          *AllCandidates_size=*AllCandidates_size+1;
        }
      }// inner if
      /* Advance the window origin in raster order, wrapping to the next row at the end of a line. */
      if ( x_index < sum_col-1 )
        x_index = x_index + 1;
      else{
        x_index = 0;
        y_index = y_index + 1;
      }
    } // outer if
    element_counter +=1;
  }
}