Mali OpenCL SDK v1.1.0
 All Classes Files Functions Variables Macros Pages
64_bit_integer.cl
Go to the documentation of this file.
1 /*
2  * This confidential and proprietary software may be used only as
3  * authorised by a licensing agreement from ARM Limited
4  * (C) COPYRIGHT 2013 ARM Limited
5  * ALL RIGHTS RESERVED
6  * The entire notice above must be reproduced on all authorised
7  * copies and copies may only be made to the extent permitted
8  * by a licensing agreement from ARM Limited.
9  */
10 
/*
 * The kernel in this file calls atom_add on 64-bit (ulong) values. The
 * 64-bit atom_* functions are not part of core OpenCL C; they are provided
 * by the cl_khr_int64_base_atomics extension, which must be enabled before
 * first use.
 */
/* [Enable atom_add extension] */
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
/* [Enable atom_add extension] */
23 
/**
 * \brief Add the sum and the sum of squares of 8 pixels to two 64-bit accumulators.
 *
 * Each work-item loads 8 consecutive uchar pixels, starting at element
 * 8 * get_global_id(0) of imagePixels, and atomically adds
 *   - the sum of those 8 pixel values to sumOfPixels[0], and
 *   - the sum of their squares to squareOfPixels[0].
 *
 * There is no bounds check: imagePixels must hold at least
 * 8 * (global work size) elements.
 * The kernel only ever adds to the accumulators, so the caller is presumably
 * expected to zero them before enqueueing (NOTE(review): confirm in host code).
 *
 * \param[in]     imagePixels    Input pixel values, read 8 at a time.
 * \param[in,out] squareOfPixels Single accumulator for the sum of squared pixel values.
 * \param[in,out] sumOfPixels    Single accumulator for the sum of pixel values.
 */
__kernel void long_vectors(__global const uchar* restrict imagePixels,
                           __global ulong* restrict squareOfPixels,
                           __global ulong* restrict sumOfPixels)
{
    /*
     * Set i to be the ID of this work-item.
     * If the global work size (set by clEnqueueNDRangeKernel) is n,
     * then n work-items will be run and i will be in the range [0, n - 1].
     * get_global_id returns size_t, which is also the offset type vload8 expects,
     * so keep it as size_t rather than narrowing to a signed int.
     */
    const size_t i = get_global_id(0);

    /* [Squares and sums]*/
    /* Load 8 pixels (uchar) and widen them to ushort to calculate the square. */
    ushort8 pixelShort = convert_ushort8(vload8(i, imagePixels));
    /* 255 * 255 = 65025 < 2 ^ 16, so the square cannot overflow a ushort. */
    ushort8 newSquareShort = pixelShort * pixelShort;

    /*
     * Convert original pixel value and the square to longs to sum
     * all the vectors together and add the final values to the
     * respective accumulators.
     */
    ulong8 pixelLong = convert_ulong8(pixelShort);
    ulong8 newSquareLong = convert_ulong8(newSquareShort);

    /*
     * Use vector data type suffixes (.lo and .hi) to get smaller vector types,
     * halving the width at each step until we obtain one single value.
     */
    ulong4 sumLongPixels1 = pixelLong.hi + pixelLong.lo;
    ulong2 sumLongPixels2 = sumLongPixels1.hi + sumLongPixels1.lo;
    ulong sumLongPixels3 = sumLongPixels2.hi + sumLongPixels2.lo;

    ulong4 sumLongSquares1 = newSquareLong.hi + newSquareLong.lo;
    ulong2 sumLongSquares2 = sumLongSquares1.hi + sumLongSquares1.lo;
    ulong sumLongSquares3 = sumLongSquares2.hi + sumLongSquares2.lo;
    /* [Squares and sums]*/

    /*
     * All work-items update the same two accumulators (sumOfPixels and
     * squareOfPixels), so the additions are done with atom_add to make
     * each read-modify-write indivisible.
     * Atomic operations on a shared location are expensive,
     * so each work-item first reduces its 8 values to one
     * and issues only a single atom_add per accumulator.
     */
    /* [Atomic transaction] */
    atom_add(sumOfPixels, sumLongPixels3);
    atom_add(squareOfPixels, sumLongSquares3);
    /* [Atomic transaction] */
}