Mali OpenCL SDK v1.1.0
 All Classes Files Functions Variables Macros Pages
fir_float.cl
Go to the documentation of this file.
1 /*
2  * This confidential and proprietary software may be used only as
3  * authorised by a licensing agreement from ARM Limited
4  * (C) COPYRIGHT 2013 ARM Limited
5  * ALL RIGHTS RESERVED
6  * The entire notice above must be reproduced on all authorised
7  * copies and copies may only be made to the extent permitted
8  * by a licensing agreement from ARM Limited.
9  */
10 
11 /* FW_SCALE = 1 / 256. */
12 #define FW_SCALE 0.00390625f
13 #define FW_UL (30.0f * FW_SCALE)
14 #define FW_UM (5.0f * FW_SCALE)
15 #define FW_UR (6.0f * FW_SCALE)
16 #define FW_CL (19.0f * FW_SCALE)
17 #define FW_CM (30.0f * FW_SCALE)
18 #define FW_CR (9.0f * FW_SCALE)
19 #define FW_BL (15.0f * FW_SCALE)
20 #define FW_BM (5.0f * FW_SCALE)
21 #define FW_BR (40.0f * FW_SCALE)
22 
29 __kernel void fir_float(__global const float* restrict input,
30  __global float* restrict output,
31  const int width)
32 {
33  /* [Kernel size] */
34  /*
35  * Each kernel calculates 4 output pixels in the same row (hence the '* 4').
36  * column is in the range [0, width] in steps of 4.
37  * row is in the range [0, height].
38  */
39  const int column = get_global_id(0) * 4;
40  const int row = get_global_id(1);
41  /* Offset calculates the position in the linear data for the row and the column. */
42  const int offset = row * width + column;
43  /* [Kernel size] */
44 
45  /* Accumulator array of 4 floats. */
46  float4 accumulator = (float4)0.0f;
47 
48  /*
49  * As mentioned before the kernel works on a 6x3 window.
50  * e.g. If the data in the input array is as follows:
51  * [0 1 2 3 4 5 ]
52  * [6 7 8 9 10 11]
53  * [12 13 14 15 16 17]
54  *
55  * We load overlapping data from the first row into 3 vectors:
56  *
57  * data0 = [0 1 2 3]
58  * data1 = [1 2 3 4]
59  * data2 = [2 3 4 5]
60  *
61  * Which means the first result of accumulator will be equal to:
62  * accumulator.s0 = data0.s0 * FW_UL + data1.s0 * FW_UM + data2.s0 * FW_UR
63  *
64  * The same is done for the second and third row, which makes acc.s0 the sum of 0, 1, 2, 6, 7, 8, 12, 13 and 14
65  * multiplied by the corresponding coefficients.
66  * If this indices are compared with the input array, we can see the 3x3 window of the FIR filter.
67  */
68 
69  /* [Load first row] */
70  /*
71  * Access the first row in the 6x3 window to apply FW_U coefficients.
72  * data1 can be constructed from the other vectors without doing an additional load.
73  */
74  float4 data0 = vload4(0, input + offset);
75  float4 data2 = vload4(0, input + offset + 2);
76  float4 data1 = (float4)(data0.s12, data2.s12);
77  /* [Load first row] */
78 
79  /* [Filter first row] */
80  accumulator += data0 * FW_UL;
81  accumulator += data1 * FW_UM;
82  accumulator += data2 * FW_UR;
83  /* [Filter first row] */
84 
85  /* [Load and filter second and third row] */
86  /* Access the second row in the 6x3 window and repeat the process, but with FW_C coefficients. */
87  data0 = vload4(0, input + offset + width);
88  data2 = vload4(0, input + offset + width + 2);
89  data1 = (float4)(data0.s12, data2.s12);
90 
91  accumulator += data0 * FW_CL;
92  accumulator += data1 * FW_CM;
93  accumulator += data2 * FW_CR;
94 
95  /* Access the third row in the 6x3 window and repeat the process, but with FW_B coefficients. */
96  data0 = vload4(0, input + offset + width * 2);
97  data2 = vload4(0, input + offset + width * 2 + 2);
98  data1 = (float4)(data0.s12, data2.s12);
99 
100  accumulator += data0 * FW_BL;
101  accumulator += data1 * FW_BM;
102  accumulator += data2 * FW_BR;
103  /* [Load and filter second and third row] */
104 
105  /* [Store] */
106  /* Store the accumulator. */
107  vstore4(accumulator, 0, output + offset);
108  /* [Store] */
109 }