Mali OpenCL SDK v1.1.0
 All Classes Files Functions Variables Macros Pages
sobel.cl
Go to the documentation of this file.
1 /*
2  * This confidential and proprietary software may be used only as
3  * authorised by a licensing agreement from ARM Limited
4  * (C) COPYRIGHT 2013 ARM Limited
5  * ALL RIGHTS RESERVED
6  * The entire notice above must be reproduced on all authorised
7  * copies and copies may only be made to the extent permitted
8  * by a licensing agreement from ARM Limited.
9  */
10 
/*
 * Sobel filter kernel: each work-item computes the horizontal (dX) and vertical (dY)
 * gradients for 16 horizontally adjacent pixels of an 8-bit image.
 *
 * Sobel masks, as implemented by the arithmetic below (rows are first, second, third input row):
 *
 *        dX              dY
 *   -1   0  +1      +1  +2  +1
 *   -2   0  +2       0   0   0
 *   -1   0  +1      -1  -2  -1
 *
 * inputImage     8-bit unsigned input pixels, addressed as rows of 'width' elements.
 * width          Number of elements per row; the same value is used to address the input and both outputs.
 * outputImageDX  Receives the dX gradients, scaled down to signed 8-bit values.
 * outputImageDY  Receives the dY gradients, scaled down to signed 8-bit values.
 *
 * No bounds checks are performed: a work-item reads an 18-byte span from each of three
 * consecutive input rows and writes 16 bytes to each output, starting one row down and one
 * column right of the first byte it reads. The host is assumed to choose the global work size
 * and the buffer sizes so that these accesses stay in range - TODO confirm against the host code.
 */
18 __kernel void sobel(__global const uchar* restrict inputImage,
19  const int width,
20  __global char* restrict outputImageDX,
21  __global char* restrict outputImageDY)
22 {
23  /* [Kernel size] */
24  /*
25  * Each work-item (kernel instance) calculates 16 output pixels in the same row (hence the '* 16').
26  * column is a multiple of 16; presumably it covers [0, width) in steps of 16 - this depends on the global work size chosen by the host.
27  * row advances by one per work-item in dimension 1 (hence the '* 1'); its upper bound also depends on the global work size.
28  */
29  const int column = get_global_id(0) * 16;
30  const int row = get_global_id(1) * 1;
31 
32  /* Offset is the position in the linear data of the top-left pixel read by this work-item (row-major, 'width' elements per row). */
33  const int offset = row * width + column;
34  /* [Kernel size] */
35 
36  /* [Load row] */
37  /*
38  * First row of input.
39  * In a scalar Sobel calculation you would load 1 value for leftLoad, middleLoad and rightLoad.
40  * In the vector case we load 16 values for each.
41  * leftLoad, middleLoad and rightLoad each load 16 bytes of data from the first row.
42  * The data they load overlaps. e.g. for the first column and row, leftLoad is 0->15, middleLoad is 1->16 and rightLoad is 2->17.
43  * So we're actually loading 18 bytes of data from the first row.
44  */
45  uchar16 leftLoad = vload16(0, inputImage + (offset + 0));
46  uchar16 middleLoad = vload16(0, inputImage + (offset + 1));
47  uchar16 rightLoad = vload16(0, inputImage + (offset + 2));
48  /* [Load row] */
49 
50  /* [Convert data] */
51  /*
52  * Convert the data from unsigned chars to shorts (8-bit unsigned to 16-bit signed).
53  * The calculations can overflow 8 bits so we require larger intermediate storage.
54  * Additionally, the values can become negative so we need a signed type.
55  */
56  short16 leftData = convert_short16(leftLoad);
57  short16 middleData = convert_short16(middleLoad);
58  short16 rightData = convert_short16(rightLoad);
59  /* [Convert data] */
60 
61  /* [Calculation] */
62  /*
63  * Calculate the results for the first row.
64  * Looking at the Sobel masks above for the first line of input,
65  * the dX calculation is the sum of -1 * leftData, 0 * middleData, and 1 * rightData.
66  * The dY calculation is the sum of 1 * leftData, 2 * middleData, and 1 * rightData.
67  * This is what is being calculated below, except we have removed the
68  * unnecessary calculations (multiplications by 1 or 0) and we are calculating 16 values at once.
69  * This pattern repeats for the other 2 rows of data. The '(short)' cast matches the scalar 2 to the element type of the short16 vector.
70  */
71  short16 dx = rightData - leftData;
72  short16 dy = rightData + leftData + middleData * (short)2;
73  /* [Calculation] */
74 
75  /*
76  * Second row of input.
77  * By adding the 'width * 1' to the offset we get the next row of data at the same column position.
78  * middleData is not loaded because its coefficient is 0 in both masks for this row.
79  */
80  leftLoad = vload16(0, inputImage + (offset + width * 1 + 0));
81  rightLoad = vload16(0, inputImage + (offset + width * 1 + 2));
82 
83  leftData = convert_short16(leftLoad);
84  rightData = convert_short16(rightLoad);
85 
86  /*
87  * Calculate the results for the second row.
88  * The dX calculation is the sum of -2 * leftData, 0 * middleData, and 2 * rightData.
89  * There is no dY calculation to do: sum of 0 * leftData, 0 * middleData, and 0 * rightData.
90  */
91  dx += (rightData - leftData) * (short)2;
92 
93  /* Third row of input ('width * 2' moves two rows down from the first row). */
94  leftLoad = vload16(0, inputImage + (offset + width * 2 + 0));
95  middleLoad = vload16(0, inputImage + (offset + width * 2 + 1));
96  rightLoad = vload16(0, inputImage + (offset + width * 2 + 2));
97 
98  leftData = convert_short16(leftLoad);
99  middleData = convert_short16(middleLoad);
100  rightData = convert_short16(rightLoad);
101 
102  /*
103  * Calculate the results for the third row.
104  * The dX calculation is the sum of -1 * leftData, 0 * middleData, and 1 * rightData.
105  * The dY calculation is the sum of -1 * leftData, -2 * middleData, and -1 * rightData.
106  */
107  dx += rightData - leftData;
108  dy -= rightData + leftData + middleData * (short)2;
109 
110  /* [Store] */
111  /*
112  * Store the results at the centre pixel of each 3x3 window (hence the '+ width + 1': one row down, one column right).
113  * The range of outputs from our Sobel calculations is [-1020, 1020].
114  * In order to output this as an 8-bit signed char we shift right by 3 bits (a divide by 8 that rounds towards negative infinity).
115  * This gives the range [-128, 127], which fits in a signed char. Depending on what type of output you require,
116  * (signed/unsigned, separate/combined gradients) it is possible to do more of the calculations on the GPU using OpenCL.
117  * In this sample we're assuming that the application requires signed uncombined gradient outputs.
118  */
119  vstore16(convert_char16(dx >> 3), 0, outputImageDX + offset + width + 1);
120  vstore16(convert_char16(dy >> 3), 0, outputImageDY + offset + width + 1);
121  /* [Store] */
122 }