Page MenuHomePhabricator
Paste P105

SPIR-V -> NV50 IR: Texture R/W
ActivePublic

Authored by pmoreau on Sep 22 2016, 1:19 PM.
There are 1 platforms.
There are 1 GPU devices.
clCreateContext() succeeded.
clCreateCommandQueue() succeeded.
clCreateProgramWithIL() suceeded.
clBuildProgram() suceeded.
clCreateKernel() suceeded.
clCreateImage2D() succeeded.
clSetKernelArg() succeeded.
clCreateImage2D() succeeded.
clSetKernelArg() succeeded.
Parsing SPIR-V generated by 6 (version 14)
Version 1.0
ID bound: 47
Compiling for nve7
rw_image:13 ()
---
BB:0 (3 instructions) - df = { }
-> BB:2 (tree)
0: ld - %r2 c0[0x0] (0)
1: ld - %r3 c0[0x0] (0)
2: bra BB:2 (0)
---
<- BB:0 (tree)
BB:2 (47 instructions) - df = { }
-> BB:6 (tree)
-> BB:3 (tree)
3: rdsv u32 %r4 sv[TID:0] (0)
4: rdsv u32 %r5 sv[NTID:0] (0)
5: rdsv u32 %r6 sv[CTAID:0] (0)
6: add u32 %r7 %r4 0x00000000 (0)
7: mad u32 %r8 %r5 %r6 %r7 (0)
8: cvt u64 %r9d u32 %r8 (0)
9: rdsv u32 %r10 sv[TID:1] (0)
10: rdsv u32 %r11 sv[NTID:1] (0)
11: rdsv u32 %r12 sv[CTAID:1] (0)
12: add u32 %r13 %r10 0x00000000 (0)
13: mad u32 %r14 %r11 %r12 %r13 (0)
14: cvt u64 %r15d u32 %r14 (0)
15: rdsv u32 %r16 sv[TID:2] (0)
16: rdsv u32 %r17 sv[NTID:2] (0)
17: rdsv u32 %r18 sv[CTAID:2] (0)
18: add u32 %r19 %r16 0x00000000 (0)
19: mad u32 %r20 %r17 %r18 %r19 (0)
20: cvt u64 %r21d u32 %r20 (0)
21: mov u64 %r22d %r9d (0)
22: cvt u32 %r23 u64 %r22d (0)
23: mov u32 %r24 %r23 (0)
24: mov u32 %r25 %r1 (0)
25: rdsv u32 %r26 sv[TID:0] (0)
26: rdsv u32 %r27 sv[NTID:0] (0)
27: rdsv u32 %r28 sv[CTAID:0] (0)
28: add u32 %r29 %r26 0x00000000 (0)
29: mad u32 %r30 %r27 %r28 %r29 (0)
30: cvt u64 %r31d u32 %r30 (0)
31: rdsv u32 %r32 sv[TID:1] (0)
32: rdsv u32 %r33 sv[NTID:1] (0)
33: rdsv u32 %r34 sv[CTAID:1] (0)
34: add u32 %r35 %r32 0x00000000 (0)
35: mad u32 %r36 %r33 %r34 %r35 (0)
36: cvt u64 %r37d u32 %r36 (0)
37: rdsv u32 %r38 sv[TID:2] (0)
38: rdsv u32 %r39 sv[NTID:2] (0)
39: rdsv u32 %r40 sv[CTAID:2] (0)
40: add u32 %r41 %r38 0x00000000 (0)
41: mad u32 %r42 %r39 %r40 %r41 (0)
42: cvt u64 %r43d u32 %r42 (0)
43: mov u64 %r44d %r37d (0)
44: cvt u32 %r45 u64 %r44d (0)
45: mov u32 %r46 %r24 (0)
46: mov u32 %r47 %r45 (0)
47: mov u32 %r48 %r46 (0)
48: set u8 %p49 gt s32 %r48 4 (0)
49: %p49 bra BB:6 (0)
---
<- BB:2 (tree)
BB:3 (1 instructions) - df = { }
-> BB:4 (tree)
50: bra BB:4 (0)
---
<- BB:3 (tree)
BB:4 (3 instructions) - df = { }
-> BB:6 (forward)
-> BB:5 (tree)
51: mov u32 %r50 %r47 (0)
52: set u8 %p51 gt s32 %r50 4 (0)
53: %p51 bra BB:6 (0)
---
<- BB:4 (tree)
BB:5 (1 instructions) - df = { }
-> BB:7 (tree)
54: bra BB:7 (0)
---
<- BB:5 (tree)
BB:7 (7 instructions) - df = { }
-> BB:8 (forward)
55: suldp 1D $r0 $s0 f32 { %r52 %r53 %r54 %r55 } %r46 %r47 (0)
56: mul f32 %r56 %r52 2.000000 (0)
57: mul f32 %r57 %r53 2.000000 (0)
58: mul f32 %r58 %r54 2.000000 (0)
59: mul f32 %r59 %r55 2.000000 (0)
60: sustp 2D $r0 $s0 f32 # %r46 %r47 (0)
61: bra BB:8 (0)
---
<- BB:4 (forward)
<- BB:2 (tree)
BB:6 (1 instructions) - df = { }
-> BB:8 (tree)
62: bra BB:8 (0)
---
<- BB:7 (forward)
<- BB:6 (tree)
BB:8 (1 instructions) - df = { }
-> BB:1 (tree)
63: bra BB:1 (0)
---
<- BB:8 (tree)
BB:1 (1 instructions) - df = { }
64: exit - # (0)
MAIN:-1 ()
---
BB:0 (0 instructions) - df = { }
rw_image:13 ()
---
BB:0 (3 instructions) - df = { }
-> BB:2 (tree)
0: ld - %r2 c0[0x0] (0)
1: ld - %r3 c0[0x0] (0)
2: bra BB:2 (0)
---
<- BB:0 (tree)
BB:2 (47 instructions) - df = { }
-> BB:6 (tree)
-> BB:3 (tree)
3: rdsv u32 %r4 sv[TID:0] (0)
4: rdsv u32 %r5 sv[NTID:0] (0)
5: rdsv u32 %r6 sv[CTAID:0] (0)
6: add u32 %r7 %r4 0x00000000 (0)
7: mad u32 %r8 %r5 %r6 %r7 (0)
8: cvt u64 %r9d u32 %r8 (0)
9: rdsv u32 %r10 sv[TID:1] (0)
10: rdsv u32 %r11 sv[NTID:1] (0)
11: rdsv u32 %r12 sv[CTAID:1] (0)
12: add u32 %r13 %r10 0x00000000 (0)
13: mad u32 %r14 %r11 %r12 %r13 (0)
14: cvt u64 %r15d u32 %r14 (0)
15: rdsv u32 %r16 sv[TID:2] (0)
16: rdsv u32 %r17 sv[NTID:2] (0)
17: rdsv u32 %r18 sv[CTAID:2] (0)
18: add u32 %r19 %r16 0x00000000 (0)
19: mad u32 %r20 %r17 %r18 %r19 (0)
20: cvt u64 %r21d u32 %r20 (0)
21: mov u64 %r22d %r9d (0)
22: cvt u32 %r23 u64 %r22d (0)
23: mov u32 %r24 %r23 (0)
24: mov u32 %r25 %r1 (0)
25: rdsv u32 %r26 sv[TID:0] (0)
26: rdsv u32 %r27 sv[NTID:0] (0)
27: rdsv u32 %r28 sv[CTAID:0] (0)
28: add u32 %r29 %r26 0x00000000 (0)
29: mad u32 %r30 %r27 %r28 %r29 (0)
30: cvt u64 %r31d u32 %r30 (0)
31: rdsv u32 %r32 sv[TID:1] (0)
32: rdsv u32 %r33 sv[NTID:1] (0)
33: rdsv u32 %r34 sv[CTAID:1] (0)
34: add u32 %r35 %r32 0x00000000 (0)
35: mad u32 %r36 %r33 %r34 %r35 (0)
36: cvt u64 %r37d u32 %r36 (0)
37: rdsv u32 %r38 sv[TID:2] (0)
38: rdsv u32 %r39 sv[NTID:2] (0)
39: rdsv u32 %r40 sv[CTAID:2] (0)
40: add u32 %r41 %r38 0x00000000 (0)
41: mad u32 %r42 %r39 %r40 %r41 (0)
42: cvt u64 %r43d u32 %r42 (0)
43: mov u64 %r44d %r37d (0)
44: cvt u32 %r45 u64 %r44d (0)
45: mov u32 %r46 %r24 (0)
46: mov u32 %r47 %r45 (0)
47: mov u32 %r48 %r46 (0)
48: set u8 %p49 gt s32 %r48 4 (0)
49: %p49 bra BB:6 (0)
---
<- BB:2 (tree)
BB:3 (1 instructions) - df = { }
-> BB:4 (tree)
50: bra BB:4 (0)
---
<- BB:3 (tree)
BB:4 (3 instructions) - df = { }
-> BB:6 (forward)
-> BB:5 (tree)
51: mov u32 %r50 %r47 (0)
52: set u8 %p51 gt s32 %r50 4 (0)
53: %p51 bra BB:6 (0)
---
<- BB:4 (tree)
BB:5 (1 instructions) - df = { }
-> BB:7 (tree)
54: bra BB:7 (0)
---
<- BB:5 (tree)
BB:7 (7 instructions) - df = { }
-> BB:8 (forward)
55: suldp 1D $r0 $s0 f32 { %r52 %r53 %r54 %r55 } %r46 %r47 (0)
56: mul f32 %r56 %r52 2.000000 (0)
57: mul f32 %r57 %r53 2.000000 (0)
58: mul f32 %r58 %r54 2.000000 (0)
59: mul f32 %r59 %r55 2.000000 (0)
60: sustp 2D $r0 $s0 f32 # %r46 %r47 (0)
61: bra BB:8 (0)
---
<- BB:4 (forward)
<- BB:2 (tree)
BB:6 (1 instructions) - df = { }
-> BB:8 (tree)
62: bra BB:8 (0)
---
<- BB:7 (forward)
<- BB:6 (tree)
BB:8 (1 instructions) - df = { }
-> BB:1 (tree)
63: bra BB:1 (0)
---
<- BB:8 (tree)
BB:1 (1 instructions) - df = { }
64: exit - # (0)
MAIN:-1 ()
---
BB:0 (0 instructions) - df = { }
Program received signal SIGSEGV, Segmentation fault.
0x00007fffe6f74f8e in nv50_ir::RegAlloc::InsertConstraintsPass::condenseSrcs (this=0x7ffffff08b60, insn=0x67b7b0, a=3, b=6) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:2064
2064 size += insn->getSrc(s)->reg.size;
(gdb) bt
#0 0x00007fffe6f74f8e in nv50_ir::RegAlloc::InsertConstraintsPass::condenseSrcs (this=0x7ffffff08b60, insn=0x67b7b0, a=3, b=6) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:2064
#1 0x00007fffe6f7549e in nv50_ir::RegAlloc::InsertConstraintsPass::texConstraintNVE0 (this=0x7ffffff08b60, tex=0x67b7b0) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:2151
#2 0x00007fffe6f75b7f in nv50_ir::RegAlloc::InsertConstraintsPass::visit (this=0x7ffffff08b60, bb=0x67b310) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:2255
#3 0x00007fffe6ee8061 in nv50_ir::Pass::doRun (this=0x7ffffff08b60, func=0x674dc0, ordered=true, skipPhi=true) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp:495
#4 0x00007fffe6ee7f3b in nv50_ir::Pass::run (this=0x7ffffff08b60, func=0x674dc0, ordered=true, skipPhi=true) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp:477
#5 0x00007fffe6f74646 in nv50_ir::RegAlloc::InsertConstraintsPass::exec (this=0x7ffffff08b60, ir=0x674dc0) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:1926
#6 0x00007fffe6f73efd in nv50_ir::RegAlloc::execFunc (this=0x7ffffff08c30) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:1827
#7 0x00007fffe6f73cf9 in nv50_ir::RegAlloc::exec (this=0x7ffffff08c30) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:1799
#8 0x00007fffe6f745d8 in nv50_ir::Program::registerAllocation (this=0x666bf0) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp:1918
#9 0x00007fffe6edebb1 in nv50_ir_generate_code (info=0x667be0) at ../../../../../mesa/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp:1256
#10 0x00007fffe6feea5c in nvc0_program_translate (prog=prog@entry=0x667300, chipset=<optimized out>, debug=debug@entry=0x640558) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nvc0_program.c:604
#11 0x00007fffe6ff7c19 in nvc0_program_validate (prog=0x667300, nvc0=0x640220) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c:56
#12 nvc0_compprog_validate (nvc0=0x640220) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c:247
#13 0x00007fffe6fff965 in nvc0_state_validate (nvc0=nvc0@entry=0x640220, mask=mask@entry=4294967295, validate_list=validate_list@entry=0x7fffe7314e00 <validate_list_cp>, size=size@entry=8, dirty=dirty@entry=0x64061c, bufctx=0x644be0) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c:831
#14 0x00007fffe701303b in nve4_state_validate_cp (mask=4294967295, nvc0=0x640220) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c:475
#15 nve4_launch_grid (pipe=0x640220, info=0x7ffffff08e70) at ../../../../../mesa/src/gallium/drivers/nouveau/nvc0/nve4_compute.c:620
#16 0x00007ffff7a92883 in clover::kernel::launch (this=0x664950, q=..., grid_offset=std::vector of length 2, capacity 2 = {...}, grid_size=std::vector of length 2, capacity 2 = {...}, block_size=std::vector of length 2, capacity 2 = {...}) at ../../../../../mesa/src/gallium/state_trackers/clover/core/kernel.cpp:85
#17 0x00007ffff7a42f5a in <lambda(clover::event&)>::operator()(clover::event &) const (__closure=0x666450) at ../../../../../mesa/src/gallium/state_trackers/clover/api/kernel.cpp:292
#18 0x00007ffff7a44575 in std::_Function_handler<void(clover::event&), clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, _cl_event* const*, _cl_event**)::<lambda(clover::event&)> >::_M_invoke(const std::_Any_data &, clover::event &) (__functor=...,
__args#0=...) at /usr/include/c++/6.2.1/functional:1740
#19 0x00007ffff7a8bfed in std::function<void (clover::event&)>::operator()(clover::event&) const (this=0x665cb0, __args#0=...) at /usr/include/c++/6.2.1/functional:2136
#20 0x00007ffff7a8a06f in clover::event::trigger (this=0x665c70) at ../../../../../mesa/src/gallium/state_trackers/clover/core/event.cpp:55
#21 0x00007ffff7a8a74c in clover::hard_event::hard_event(clover::command_queue&, unsigned int, clover::ref_vector<clover::event> const&, std::function<void (clover::event&)>) (this=0x665c70, q=..., command=4592, deps=..., action=...) at ../../../../../mesa/src/gallium/state_trackers/clover/core/event.cpp:126
#22 0x00007ffff7a43966 in clover::create<clover::hard_event, clover::command_queue&, int, clover::ref_vector<clover::event>&, clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, _cl_event* const*, _cl_event**)::<lambda(clover::event&)> >(clover::command_queue &, <unknown type in /home/pmoreau/projects/nouveau/mesa_install/lib/libOpenCL.so.1, CU 0x9450f, DIE 0xf6619>, clover::ref_vector<clover::event> &, <unknown type in /home/pmoreau/projects/nouveau/mesa_install/lib/libOpenCL.so.1, CU 0x9450f, DIE 0xf6976>) (as#0=...,
as#1=<unknown type in /home/pmoreau/projects/nouveau/mesa_install/lib/libOpenCL.so.1, CU 0x9450f, DIE 0xf6619>, as#2=..., as#3=<unknown type in /home/pmoreau/projects/nouveau/mesa_install/lib/libOpenCL.so.1, CU 0x9450f, DIE 0xf6976>) at ../../../../../mesa/src/gallium/state_trackers/clover/util/pointer.hpp:230
#23 0x00007ffff7a4313a in clEnqueueNDRangeKernel (d_q=0x63ff18, d_kern=0x664958, dims=2, d_grid_offset=0x0, d_grid_size=0x7fffffffd870, d_block_size=0x7fffffffd860, num_deps=0, d_deps=0x0, rd_ev=0x0) at ../../../../../mesa/src/gallium/state_trackers/clover/api/kernel.cpp:293
#24 0x000000000040189d in main (argc=1, argv=0x7fffffffdbf8) at image2d_rw.c:214
(gdb)

Event Timeline

pmoreau created this paste.Sep 22 2016, 1:19 PM
pmoreau created this object with visibility "Public (No Login Required)".
pmoreau created this object with edit policy "Nouveau (Project)".
pmoreau added a comment.EditedSep 22 2016, 1:39 PM
__kernel void rw_image(__read_only image2d_t in, __write_only image2d_t out)
{
  int2 coords = (int2) (get_global_id(0), get_global_id(1));
  if (coords.x > 4 || coords.y > 4)
    return;

  sampler_t const smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  float4 val = read_imagef(in, smp, coords);
  write_imagef(out, coords, val * 2.0f);
}
pmoreau added a comment.EditedSep 22 2016, 1:40 PM
#include <stdio.h>
#include <stdlib.h>

#include <CL/cl.h>

#include "cl_util.h"


int main(int argc, char ** argv)
{
   cl_int error;

   cl_uint total_platforms;
   cl_platform_id platform_id;

   cl_uint total_gpu_devices;
   cl_device_id device_id;

   cl_context context;

   cl_command_queue command_queue;

   cl_program program;

   cl_kernel kernel;

   FILE *input;
   void *il;
   size_t length;

   cl_image_format format;

   float in_data[64];
   float out_data[64];
   cl_mem in_image;
   cl_mem out_image;
   size_t global_work_size[2] = { 4, 4 };
   size_t local_work_size[2] = { 1, 1 };

   size_t origin[3] = { 0, 0, 0 };
   size_t region[3] = { 4, 4, 1 };

   cl_int status;

   error = clGetPlatformIDs(
                           1, /* Max number of platform IDs to return */
                           &platform_id, /* Pointer to platform_id */
                           &total_platforms); /* Total number of platforms
                                               * found on the system */

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clGetPlatformIDs() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "There are %u platforms.\n", total_platforms);



   error = clGetDeviceIDs(platform_id,
                          CL_DEVICE_TYPE_GPU,
                          1,
                          &device_id,
                          &total_gpu_devices);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clGetDeviceIDs() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "There are %u GPU devices.\n", total_gpu_devices);

   context = clCreateContext(NULL, /* Properties */
                           1, /* Number of devices */
                           &device_id, /* Device pointer */
                           NULL, /* Callback for reporting errors */
                           NULL, /* User data to pass to error callback */
                           &error); /* Error code */

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clCreateContext() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clCreateContext() succeeded.\n");

   command_queue = clCreateCommandQueue(context,
                                        device_id,
                                        0, /* Command queue properties */
                                        &error); /* Error code */

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clCreateCommandQueue() failed: %s\n",
                      clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clCreateCommandQueue() succeeded.\n");

   input = fopen("image2d_rw.spv", "rb");
   if (!input) {
     fprintf(stderr, "failed to open image2d_rw.spv\n");
     return EXIT_FAILURE;
   }

   fseek(input, 0l, SEEK_END);
   length = ftell(input);
   rewind(input);

   il = calloc(length, sizeof(char));
   fread(il, length, 1, input);

//   program = clCreateProgramWithBinary(context, 1, &device_id, &length, (const unsigned char**)&il, &status, &error);
   program = clCreateProgramWithIL(context, il, length, &error);

   if (error != CL_SUCCESS) {
//     fprintf(stderr, "clCreateProgramWithBinary() failed: %s\n",
      fprintf(stderr, "clCreateProgramWithIL() failed: %s\n",
                      clUtilErrorString(error));
      return EXIT_FAILURE;
   }

//   fprintf(stderr, "clCreateProgramWithBinary() succeeded\n");
   fprintf(stderr, "clCreateProgramWithIL() suceeded.\n");

   error = clBuildProgram(program,
                          1, /* Number of devices */
                          &device_id,
                          NULL, /* options */
                          NULL, /* callback function when compile is complete */
                          NULL); /* user data for callback */


   if (error != CL_SUCCESS) {
      char build_str[1000000];
      error = clGetProgramBuildInfo(program,
                                    device_id,
                                    CL_PROGRAM_BUILD_LOG,
                                    1000000, /* Size of output string */
                                    build_str, /* pointer to write the log to */
                                    NULL); /* Number of bytes written to the log */
      if (error != CL_SUCCESS) {
         fprintf(stderr, "clGetProgramBuildInfo() failed: %s\n",
                          clUtilErrorString(error));
      } else {
         fprintf(stderr, "Build Log: \n%s\n\n", build_str);
      }
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clBuildProgram() suceeded.\n");

   kernel = clCreateKernel(program, "rw_image", &error);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clCreateKernel() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clCreateKernel() suceeded.\n");

   format.image_channel_order = CL_RGBA;
   format.image_channel_data_type = CL_FLOAT;

   for (int i = 0; i < 64; ++i)
     out_data[i] = 0.0f;

   out_image = clCreateImage2D(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, &format, 4, 4, 0, out_data, &error);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clCreateImage2D() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clCreateImage2D() succeeded.\n");

   error = clSetKernelArg(kernel,
                          0, /* Arg index */
                          sizeof(cl_mem),
                          &out_image);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clSetKernelArg failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clSetKernelArg() succeeded.\n");

   for (int i = 0; i < 64; ++i)
     in_data[i] = (float) i;

   in_image = clCreateImage2D(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, 4, 4, 0, in_data, &error);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clCreateImage2D() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clCreateImage2D() succeeded.\n");

   error = clSetKernelArg(kernel,
                          1, /* Arg index */
                          sizeof(cl_mem),
                          &in_image);

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clSetKernelArg failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clSetKernelArg() succeeded.\n");


   error = clEnqueueNDRangeKernel(command_queue,
                                  kernel,
                                  2, /* Number of dimensions */
                                  NULL, /* Global work offset */
                                  global_work_size,
                                  local_work_size, /* local work size */
                                  0, /* Events in wait list */
                                  NULL, /* Wait list */
                                  NULL); /* Event object for this event */

   if (error != CL_SUCCESS) {
      fprintf(stderr, "clEnqueueNDRangeKernel() failed: %s\n",
                      clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clEnqueueNDRangeKernel() suceeded.\n");

   error = clFinish(command_queue);
   if (error != CL_SUCCESS) {
      fprintf(stderr, "clFinish() failed: %s\n", clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clFinish() succeeded.\n");

   error = clEnqueueReadImage(command_queue,
                                out_image,
                                CL_TRUE, /* TRUE means it is a blocking read. */
                                origin,
                                region,
                                0,
                                0,
                                out_data, /* Pointer to store the data */
                                0, /* Events in wait list */
                                NULL, /* Wait list */
                                NULL); /* Event object */


   if (error != CL_SUCCESS) {
      fprintf(stderr, "clEnqueueReadImage() failed: %s\n",
                      clUtilErrorString(error));
      return EXIT_FAILURE;
   }

   fprintf(stderr, "clEnqueueReadImage() suceeded.\n");

   for (int i = 0; i < 64; ++i)
     if (out_data[i] != 2.0f * in_data[i])
       fprintf(stderr, "out_data[%d] = %f, expected %f\n", i, out_data[i], (2.0f * in_data[i]));
   return EXIT_SUCCESS;
}