函数原型
<code>
以下代码原型并非本人所写,仅拿来参考(copy)。
/*
 * Bilinear remap of the window stRect from a packed source image into a
 * packed 3-byte-per-pixel destination image, driven by two precomputed tables.
 *
 * src       source pixels; one row is Width * nchanner bytes.
 * Width     source width in pixels.
 * Height    not used by this routine.
 * table1    coordinate table, read as one 32-bit word per destination pixel:
 *           low 16 bits = source x, high 16 bits = source y.
 * table2    weight table, read as two 32-bit words per destination pixel.
 *           Word 0: low half weights (x, y), high half weights (x, y+1).
 *           Word 1: low half weights (x+1, y), high half weights (x+1, y+1).
 *           Each half is converted to short before use.
 * dstWidth  row length, in pixels, of dst and of both tables.
 * dstHeight not used by this routine.
 * nchanner  bytes per source pixel; only the first three bytes of each
 *           pixel are sampled.
 * dst       destination buffer, 3 bytes per pixel.
 * stRect    window to fill: columns [left, right), rows [top, bottom).
 *
 * Always returns 1.
 *
 * No bounds checking is done: for every table entry the pixels (x, y),
 * (x+1, y), (x, y+1) and (x+1, y+1) are read from src, so the caller must
 * supply tables whose coordinates keep all four reads inside the buffer.
 */
static LA_bool xxx(unsigned char *src, int Width, int Height, short *table1, short *table2,
                   int dstWidth, int dstHeight, int nchanner, unsigned char *dst, Rect_S stRect)
{
    const int stepSrc = Width * nchanner;      /* bytes per source row */
    const int *xy_tab = (const int *)table1;   /* one word per output pixel */
    const int *cof_tab = (const int *)table2;  /* two words per output pixel */
    int i, j;

    (void)Height;
    (void)dstHeight;

    for (j = stRect.top; j < stRect.bottom; ++j) {
        const int xyLineStart = j * dstWidth;
        const int cofLineStart = j * dstWidth * 2;
        unsigned char *pDtmp = dst + j * dstWidth * 3 + stRect.left * 3;

        for (i = stRect.left; i < stRect.right; ++i) {
            const int xyval = xy_tab[xyLineStart + i];
            const int sy = (xyval >> 16) & 0x0000ffff;
            const int sx = xyval & 0x0000ffff;

            const int coff = cofLineStart + (i << 1);
            const int cofval1 = cof_tab[coff];
            const int cofval2 = cof_tab[coff + 1];
            const short cof01 = (short)((cofval1 >> 16) & 0x0000ffff);
            const short cof00 = (short)(cofval1 & 0x0000ffff);
            const short cof11 = (short)((cofval2 >> 16) & 0x0000ffff);
            const short cof10 = (short)(cofval2 & 0x0000ffff);

            /*
             * The horizontal neighbour is one source pixel (nchanner bytes)
             * to the right, not a fixed 3 bytes, so sources with more than
             * three bytes per pixel are sampled correctly.
             */
            const unsigned char *p00 = src + (sy * stepSrc + sx * nchanner);
            const unsigned char *p01 = p00 + nchanner;
            const unsigned char *p10 = p00 + stepSrc;
            const unsigned char *p11 = p10 + nchanner;

            /* All source bytes are read before the destination is written. */
            const int c0 = (p00[0] * cof00 + p10[0] * cof01 + p01[0] * cof10 + p11[0] * cof11) >> BITOFF;
            const int c1 = (p00[1] * cof00 + p10[1] * cof01 + p01[1] * cof10 + p11[1] * cof11) >> BITOFF;
            const int c2 = (p00[2] * cof00 + p10[2] * cof01 + p01[2] * cof10 + p11[2] * cof11) >> BITOFF;

            *(pDtmp++) = (unsigned char)c0;
            *(pDtmp++) = (unsigned char)c1;
            *(pDtmp++) = (unsigned char)c2;
        }
    }
    return 1;
}
</code>
优化后代码
<code>
/* Remapped-image geometry: Load_Dot16_YUV422 uses these for table and
 * destination row strides and for per-image offsets. */
static int Img_w = 600;
static int Img_h = 800;
/* Input-image geometry: ImgIn_w * PixelSize is the byte step that
 * Load_Dot16_YUV422 uses to reach the next source row. */
static int ImgIn_w = 800;
static int ImgIn_h = 600;
static int PixelSize = 2;
// Remaps the window [stRect_left, stRect_right) x [stRect_top, stRect_bottom)
// of one image, two output pixels per inner-loop iteration, using TI C6000
// intrinsics (_mem8, _dpack2, _dpackh4, _ddotpsu4h, ...).
//
// Per pixel pair the code:
//   1. reads one 64-bit entry of the offset table and splits it into two
//      32-bit byte offsets (_loll / _hill);
//   2. loads 8 bytes at each offset from the source row (src) and from the
//      row ImgIn_w * PixelSize bytes further on (src1);
//   3. rearranges those bytes into 16-bit lanes, selecting lanes according to
//      bit 1 of each offset;
//   4. combines the lanes with two 64-bit coefficient entries via _ddotpsu4h,
//      shifts by BITOFF and packs the low bytes into one 32-bit word.
//
// ImageIndex selects which table set is used; ImageOutIndex selects the
// destination image. Nothing is returned; results are stored through pDtmp.
//
// The loop is software-pipelined by hand: offsets and source data are fetched
// one iteration ahead, and the shift/pack stage consumes the dot product of
// the previous iteration. Consequently the loop runs i_LoopNum + 1 times and
// the store is skipped in iteration 0.
//
// NOTE(review): pDtmp is not declared in this function, and pImgS1In,
// pImgS1Out, pParam_Loadx4XY, pParam_Dot16Cof, BITOFF and tu32 come from
// outside this excerpt -- confirm their definitions.
// NOTE(review): because of the look-ahead, each row reads i_LoopNum + 3
// offset-table entries and performs source loads for i_LoopNum + 2 of them,
// i.e. beyond the pixel pairs actually written. Confirm the tables are padded
// and the extra offsets are valid.
// NOTE(review): the variable names suggest Y/U/V lanes of a YUV422 image
// (Y0, Y1, U0, V1); the byte layout itself is not visible here.
static void Load_Dot16_YUV422(int stRect_left, int stRect_right, int stRect_top, int stRect_bottom, int ImageIndex, int ImageOutIndex)
{
// Base addresses are carried as 32-bit integers and adjusted arithmetically.
unsigned int src0 = pImgS1In;
unsigned int dst0 = pImgS1Out;
unsigned int xy_tab0 = pParam_Loadx4XY;
unsigned int cof_tab0 = pParam_Dot16Cof;
int Index_i, Index_j;
unsigned char* src;
unsigned char * src1;
register tu32 XY_tab_Addr;
unsigned long long *pXY_tab;
int i_LoopNum;
int j_offset;
unsigned long long offset_12;
int offset_load_1;
int offset_load_1_is;
unsigned long long data_load_A_1;
unsigned long long data_load_B_1;
int offset_load_2;
int offset_load_2_is;
unsigned long long data_load_A_2;
unsigned long long data_load_B_2;
unsigned long long *pCof_tab;
unsigned long long *pCof_tab1;
unsigned long long Cof_ABCD_1;
unsigned long long Cof_ABCD_2;
// Mask that keeps the low byte of each 16-bit lane.
unsigned long long Data64_AND_00FF = 0x00FF00FF00FF00FF;
unsigned int Data32_7654_A_1;
unsigned int Data32_7654_B_1;
unsigned int Data32_3210_A_2;
unsigned int Data32_3210_B_2;
unsigned long long Data64_DP2_76765454_1;
unsigned long long Data64_DP2_32321010_2;
unsigned long long Data64_DPH4_75753131_1;
unsigned long long Data64_DPH4_75753131_2;
unsigned long long Data64_MV55_75753131_1;
unsigned long long Data64_MV33_75753131_2;
unsigned long long Data64_SHFU_75753131_1;
unsigned long long Data64_SHFU_75753131_2;
unsigned int Data32_7575_1;
unsigned int Data32_3131_1;
unsigned int Data32_7575_2;
unsigned int Data32_3131_2;
int Shift_Num_1;
int Shift_Num_2;
//AB
//CD
unsigned long long Data64_Y0_0B0D0A0C;
unsigned long long Data64_Y1_0B0D0A0C;
unsigned long long Data64_U0_0B0D0A0C;
unsigned long long Data64_V1_0B0D0A0C;
__x128_t D128_Y1Y0_0B0D0A0C;
__x128_t D128_V1U0_0B0D0A0C;
__x128_t D128_C1C0_0B0D0A0C;
unsigned long long D64_Y1Y0_Dot16;
unsigned long long D64_V1U0_Dot16;
unsigned long long D64_Y1Y0_SHRU;
unsigned long long D64_V1U0_SHRU;
unsigned long long D64_V1_Y1_U0_Y0;
unsigned int D32_V1_Y1;
unsigned int D32_U0_Y0;
unsigned int D32_V1Y1U0Y0;
// src is the base row, src1 the row one source line (ImgIn_w * PixelSize bytes) later.
// NOTE(review): integer-to-pointer assignments without a cast.
src = src0;
src1 = src0 + ImgIn_w * PixelSize;
// Per-image offset: 4 bytes per pixel in the offset table, 8 bytes per pixel
// in the coefficient table, 2 bytes per pixel in the destination.
xy_tab0+=Img_h*Img_w*4*ImageIndex;
cof_tab0+=Img_h*Img_w*(1<<3)*ImageIndex;
dst0 +=Img_h*Img_w*2*ImageOutIndex;
// Window offset: skip the stRect_top rows above the window.
{
j_offset = stRect_top*Img_w;
xy_tab0 = xy_tab0 + (j_offset<<2);
cof_tab0= cof_tab0 + (j_offset<<3);
dst0 = dst0 + (j_offset<<1);
}
// Line offset: skip the stRect_left pixels to the left of the window.
{
j_offset = stRect_left;
xy_tab0 = xy_tab0 + (j_offset<<2);
cof_tab0= cof_tab0 + (j_offset<<3);
dst0 = dst0 + (j_offset<<1);
}
// Row loop. i_LoopNum is the number of pixel pairs per row (window width / 2).
i_LoopNum = stRect_right-stRect_left;
i_LoopNum = i_LoopNum>>1;
for (Index_j = stRect_top; Index_j <stRect_bottom; Index_j++)
{
//pre_init
// Latch this row's table/destination pointers, then advance the bases by one
// full image row (Img_w pixels) for the next Index_j.
XY_tab_Addr = xy_tab0;
pXY_tab = XY_tab_Addr;
xy_tab0+=(Img_w<<2);
pCof_tab = cof_tab0;
cof_tab0+=(Img_w<<3);
pDtmp = dst0;
dst0+=(Img_w<<1);
//pre_Loop
// Pipeline priming: fetch offsets and source data for the first pixel pair
// (tagged C1) and the offsets for the second pair (C2) before the loop starts.
{
pCof_tab1 = &pCof_tab[1];
offset_12 = *pXY_tab++;//C1
offset_load_1 = _loll(offset_12);//C1
offset_load_2 = _hill(offset_12);//C1
data_load_A_1 = _mem8(src+offset_load_1);//C1
data_load_B_1 = _mem8(src1+offset_load_1);//C1
data_load_A_2 = _mem8(src+offset_load_2);//C1
data_load_B_2 = _mem8(src1+offset_load_2);//C1
offset_load_1_is = offset_load_1&2;//C1
offset_load_2_is = offset_load_2&2;//C1
offset_12 = *pXY_tab++;//C2
}
// One extra iteration (i_LoopNum+1) drains the one-iteration result delay.
for (Index_i =0; Index_i <i_LoopNum+1; Index_i++)
{
// Pixel 1 uses the high 32 bits of its two loads, pixel 2 the low 32 bits;
// the A (src) and B (src1) words are combined with _dpack2.
Data32_7654_A_1 = _hill(data_load_A_1);
Data32_7654_B_1 = _hill(data_load_B_1);
Data64_DP2_76765454_1 = _dpack2(Data32_7654_B_1, Data32_7654_A_1);
Data32_3210_A_2 = _loll(data_load_A_2);
Data32_3210_B_2 = _loll(data_load_B_2);
Data64_DP2_32321010_2 = _dpack2(Data32_3210_B_2, Data32_3210_A_2);
Data64_DPH4_75753131_1 = _dpackh4(data_load_B_1, data_load_A_1); //76543210 76543210>7575 3131
Data64_DPH4_75753131_2 = _dpackh4(data_load_B_2, data_load_A_2); //76543210 76543210>7575 3131
// Pixel 1: choose which packed word and which shift (0 or 8) to use,
// depending on bit 1 of its source offset.
Data32_7575_1 = _hill(Data64_DPH4_75753131_1);
Data32_3131_1 = _loll(Data64_DPH4_75753131_1);
Shift_Num_1 = 8;
if(offset_load_1_is==0)
{
Data32_3131_1 = Data32_7575_1;
Shift_Num_1 = 0;
}
Data64_MV55_75753131_1 = _itoll(Data32_7575_1, Data32_3131_1);
Data64_SHFU_75753131_1 = _dshru(Data64_MV55_75753131_1, Shift_Num_1);
// Pixel 2: same selection, with the word choice and shift values mirrored.
Data32_7575_2 = _hill(Data64_DPH4_75753131_2);
Data32_3131_2 = _loll(Data64_DPH4_75753131_2);
Shift_Num_2 = 0;
if(offset_load_2_is==0)
{
Data32_7575_2 = Data32_3131_2;
Shift_Num_2 = 8;
}
Data64_MV33_75753131_2 = _itoll(Data32_7575_2, Data32_3131_2);
Data64_SHFU_75753131_2 = _dshru(Data64_MV33_75753131_2, Shift_Num_2);
// Keep only the low byte of every 16-bit lane.
Data64_Y0_0B0D0A0C = Data64_DP2_76765454_1 & Data64_AND_00FF;
Data64_Y1_0B0D0A0C = Data64_DP2_32321010_2 & Data64_AND_00FF;
Data64_U0_0B0D0A0C = Data64_SHFU_75753131_1 & Data64_AND_00FF;
Data64_V1_0B0D0A0C = Data64_SHFU_75753131_2 & Data64_AND_00FF;
// Loop optimization begin: fetch the next pixel pair's source data (C2) and
// the offsets of the pair after that (C3) while the current pair is processed.
offset_load_1 = _loll(offset_12);//C2
offset_load_2 = _hill(offset_12);//C2
data_load_A_1 = _mem8(src+offset_load_1);//C2
data_load_B_1 = _mem8(src1+offset_load_1);//C2
data_load_A_2 = _mem8(src+offset_load_2);//C2
data_load_B_2 = _mem8(src1+offset_load_2);//C2
offset_load_1_is = offset_load_1&2;//C2
offset_load_2_is = offset_load_2&2;//C2
offset_12 = *pXY_tab++;//C3
// Loop optimization end.
//-----------------
// Two consecutive 64-bit coefficient entries, one per pixel of the pair;
// both pointers step by 2 entries per iteration.
Cof_ABCD_1 = *pCof_tab;pCof_tab+=2;
Cof_ABCD_2 = *pCof_tab1;pCof_tab1+=2;
D128_C1C0_0B0D0A0C = _llto128(Cof_ABCD_2, Cof_ABCD_1);
D128_Y1Y0_0B0D0A0C = _llto128(Data64_Y1_0B0D0A0C, Data64_Y0_0B0D0A0C);
D128_V1U0_0B0D0A0C = _llto128(Data64_V1_0B0D0A0C, Data64_U0_0B0D0A0C);
// Dot-product delay: the shifts below deliberately use the dot products
// computed in the PREVIOUS iteration (they are reassigned two lines further
// down), so the result for a pixel pair is stored one iteration late.
// NOTE(review): in the very first iteration the *_Dot16 values have not been
// assigned yet; the shifted result is discarded by the if(Index_i) guard.
D64_Y1Y0_SHRU = _dshr(D64_Y1Y0_Dot16, BITOFF);// dot-product latency; output trails by one iteration
D64_V1U0_SHRU = _dshr(D64_V1U0_Dot16, BITOFF);
D64_Y1Y0_Dot16 = _ddotpsu4h(D128_Y1Y0_0B0D0A0C, D128_C1C0_0B0D0A0C);
D64_V1U0_Dot16 = _ddotpsu4h(D128_V1U0_0B0D0A0C, D128_C1C0_0B0D0A0C);
// Pack the four shifted results down to one byte each in a single 32-bit word.
D64_V1_Y1_U0_Y0 = _dpackl4(D64_V1U0_SHRU, D64_Y1Y0_SHRU);
D32_V1_Y1 = _hill(D64_V1_Y1_U0_Y0);
D32_U0_Y0 = _loll(D64_V1_Y1_U0_Y0);
D32_V1Y1U0Y0 = _packl4(D32_V1_Y1, D32_U0_Y0);
// Iteration 0 has no finished result yet, so nothing is stored.
if(Index_i)
{
*pDtmp++ = D32_V1Y1U0Y0;
}
//-----------------
}
}
return ;
}
</code>
优化后代码密度
<code>
;*----------------------------------------------------------------------------*
$C$L2: ; PIPED LOOP PROLOG
SPLOOPD 8 ;16 ; (P)
|| MV .L1X B10,A6
|| MV .L2X A3,B24
|| MVKH .S2 0xff00ff,B16
;** --------------------------------------------------------------------------*
$C$L3: ; PIPED LOOP KERNEL
MV .S1 A17,A7 ; |341| (P) <0,0>
|| DPACKH4 .L1 A7:A6,A5:A4,A9:A8 ; |303| (P) <0,0> ^
|| [!A1] MVK .D1 0x8,A3 ; |323| (P) <0,0>
|| [ A0] MVK .S2 0x8,B25 ; |307| (P) <0,0>
|| LDDW .D2T2 *B24++(16),B7:B6 ; |352| (P) <0,0>
SPMASK L2,S2
|| ZERO .L2 B0 ; |293|
|| MV .S2 B16,B17
|| LDDW .D1T1 *A20++,A17:A16 ; |345| (P) <0,1>
|| MV .S1 A9,A6 ; |317| (P) <0,1> ^
|| LDDW .D2T2 *B23++(16),B5:B4 ; |352| (P) <0,1>
|| DPACK2 .L1 A6,A4,A5:A4 ; |300| (P) <0,1>
LDNDW .D1T1 *+A7(A19),A7:A6 ; |342| (P) <0,2> ^
|| [!A1] MV .L1 A8,A6 ; |318| (P) <0,2> ^
|| [ A1] ZERO .S1 A3 ; |319| (P) <0,2>
|| DPACKH4 .L2 B7:B6,B5:B4,B21:B20 ; |305| (P) <0,2>
|| [!A0] ZERO .D2 B25 ; |311| (P) <0,2>
|| DSHR .S2 B19:B18,11,B19:B18 ; |356| (P) <0,2>
MV .L1 A16,A6 ; |339| (P) <0,3>
|| LDNDW .D1T1 *+A7(A18),A5:A4 ; |341| (P) <0,3> ^
|| MV .S1 A6,A5 ; |354| (P) <0,3> ^
|| MV .D2 B20,B4 ; |306| (P) <0,3>
|| AND .S2X B17:B16,A5:A4,B11:B10 ; |353| (P) <0,3>
|| DPACK2 .L2 B7,B5,B7:B6 ; |297| (P) <0,3>
AND .L1 2,A17,A1 ; |344| (P) <0,4>
|| LDNDW .D1T2 *+A6(A18),B5:B4 ; |339| (P) <0,4>
|| MV .S1 A8,A4 ; |318| (P) <0,4>
|| [!A0] MV .D2 B21,B4 ; |305| (P) <0,4>
|| DSHR .S2 B9:B8,11,B27:B26 ; |357| (P) <0,4>
|| AND .L2 B17:B16,B7:B6,B9:B8 ; |353| (P) <0,4>
AND .L1 2,A16,A0 ; |343| (P) <0,5>
|| LDNDW .D1T2 *+A6(A19),B7:B6 ; |340| (P) <0,5>
|| DSHRU .S1 A5:A4,A3,A5:A4 ; |354| (P) <0,5> ^
|| DPACKL4 .L2 B27:B26,B19:B18,B19:B18 ; |369| (P) <0,5>
|| MV .D2 B4,B20 ; |314| (P) <0,5>
SPMASK D2
|| MV .D2 B28,B22
|| DDOTPSU4H .M2 B11:B10:B9:B8,B7:B6:B5:B4,B19:B18 ; |359| (P) <0,6>
|| PACKL4 .L2 B19,B18,B18 ; |369| (P) <0,6>
|| DSHRU .S2 B21:B20,B25,B9:B8 ; |314| (P) <0,6>
[ B0] STW .D2T2 B18,*B22++ ; |369| (P) <0,7>
|| AND .L2 B17:B16,B9:B8,B9:B8 ; |354| (P) <0,7>
|| AND .S2X B17:B16,A5:A4,B11:B10 ; |354| (P) <0,7> ^
SPKERNEL 0,0
|| ADD .L2 1,B0,B0 ; |293| <0,8>
|| DDOTPSU4H .M2 B11:B10:B9:B8,B7:B6:B5:B4,B9:B8 ; |360| <0,8>
;** --------------------------------------------------------------------------*
</code>
|