/*
 * 3x3 filter weights consumed by the fir_float kernel.
 *
 * Every weight is a numerator multiplied by FW_SCALE (0.00390625 == 1/256).
 *
 * Name suffix, as the weights are applied in fir_float:
 *   first letter  - U: samples loaded at input + offset,
 *                   C: samples one `width` further on,
 *                   B: samples two `width`s further on
 *                   (presumably upper / centre / bottom row);
 *   second letter - L, M, R: the load shifted by 0, 1 and 2 elements.
 */
#define FW_SCALE 0.00390625f

/* Helper: turn a weight numerator into its scaled filter weight. */
#define FW_WEIGHT(numerator) ((numerator) * FW_SCALE)

/* Weights for the samples at input + offset. */
#define FW_UL FW_WEIGHT(30.0f)
#define FW_UM FW_WEIGHT(5.0f)
#define FW_UR FW_WEIGHT(6.0f)

/* Weights for the samples at input + offset + width. */
#define FW_CL FW_WEIGHT(19.0f)
#define FW_CM FW_WEIGHT(30.0f)
#define FW_CR FW_WEIGHT(9.0f)

/* Weights for the samples at input + offset + width * 2. */
#define FW_BL FW_WEIGHT(15.0f)
#define FW_BM FW_WEIGHT(5.0f)
#define FW_BR FW_WEIGHT(40.0f)
29 __kernel
void fir_float(__global
const float* restrict input,
30 __global
float* restrict output,
39 const int column = get_global_id(0) * 4;
40 const int row = get_global_id(1);
42 const int offset = row * width + column;
46 float4 accumulator = (float4)0.0f;
74 float4 data0 = vload4(0, input + offset);
75 float4 data2 = vload4(0, input + offset + 2);
76 float4 data1 = (float4)(data0.s12, data2.s12);
80 accumulator += data0 *
FW_UL;
81 accumulator += data1 *
FW_UM;
82 accumulator += data2 *
FW_UR;
87 data0 = vload4(0, input + offset + width);
88 data2 = vload4(0, input + offset + width + 2);
89 data1 = (float4)(data0.s12, data2.s12);
91 accumulator += data0 *
FW_CL;
92 accumulator += data1 *
FW_CM;
93 accumulator += data2 *
FW_CR;
96 data0 = vload4(0, input + offset + width * 2);
97 data2 = vload4(0, input + offset + width * 2 + 2);
98 data1 = (float4)(data0.s12, data2.s12);
100 accumulator += data0 *
FW_BL;
101 accumulator += data1 *
FW_BM;
102 accumulator += data2 *
FW_BR;
107 vstore4(accumulator, 0, output + offset);