// // Copyright (c) 2017, 2020 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "harness/compat.h" // Bug: Missing in spec: atomic_intptr_t is always supported if device is // 32-bits. // Bug: Missing in spec: CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE #define FLUSH fflush(stdout) #define MAX_STR 16 * 1024 #define ALIGNMENT 128 // NUM_ROUNDS must be at least 1. // It determines how many sets of random data we push through the global // variables. #define NUM_ROUNDS 1 // This is a shared property of the writer and reader kernels. #define NUM_TESTED_VALUES 5 // TODO: pointer-to-half (and its vectors) // TODO: union of... #include #include #include #include #include #include #include #include #include #include "harness/typeWrappers.h" #include "harness/errorHelpers.h" #include "harness/mt19937.h" #include "procs.h" //////////////////// // Device capabilities static int l_has_double = 0; static int l_has_half = 0; static int l_64bit_device = 0; static int l_has_int64_atomics = 0; static int l_has_intptr_atomics = 0; static int l_has_cles_int64 = 0; static int l_host_is_big_endian = 1; static size_t l_max_global_id0 = 0; static cl_bool l_linker_available = false; #define check_error(errCode, msg, ...) \ ((errCode != CL_SUCCESS) ? (log_error("ERROR: " msg "! (%s:%d)\n", \ ##__VA_ARGS__, __FILE__, __LINE__), \ 1) \ : 0) //////////////////// // Info about types we can use for program scope variables. class TypeInfo { public: TypeInfo() : name(""), m_elem_type(0), m_num_elem(0), m_is_vecbase(false), m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false), m_size(0), m_value_size(0), m_buf_elem_type("") {} TypeInfo(const char* name_arg) : name(name_arg), m_elem_type(0), m_num_elem(0), m_is_vecbase(false), m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false), m_size(0), m_value_size(0), m_buf_elem_type(name_arg) {} // Vectors TypeInfo(TypeInfo* elem_type, int num_elem) : m_elem_type(elem_type), m_num_elem(num_elem), m_is_vecbase(false), m_is_atomic(false), m_is_like_size_t(false), m_is_bool(false) { char the_name[10]; // long enough for longest vector type name "double16" snprintf(the_name, sizeof(the_name), "%s%d", elem_type->get_name_c_str(), m_num_elem); this->name = std::string(the_name); this->m_buf_elem_type = std::string(the_name); this->m_value_size = num_elem * elem_type->get_size(); if (m_num_elem == 3) { this->m_size = 4 * elem_type->get_size(); } else { this->m_size = num_elem * elem_type->get_size(); } } const std::string& get_name(void) const { return name; } const char* get_name_c_str(void) const { return name.c_str(); } TypeInfo& set_vecbase(void) { this->m_is_vecbase = true; return *this; } TypeInfo& set_atomic(void) { this->m_is_atomic = true; return *this; } TypeInfo& set_like_size_t(void) { this->m_is_like_size_t = true; this->set_size(l_64bit_device ? 8 : 4); this->m_buf_elem_type = l_64bit_device ? "ulong" : "uint"; return *this; } TypeInfo& set_bool(void) { this->m_is_bool = true; return *this; } TypeInfo& set_size(size_t n) { this->m_value_size = this->m_size = n; return *this; } TypeInfo& set_buf_elem_type(const char* name) { this->m_buf_elem_type = std::string(name); return *this; } const TypeInfo* elem_type(void) const { return m_elem_type; } int num_elem(void) const { return m_num_elem; } bool is_vecbase(void) const { return m_is_vecbase; } bool is_atomic(void) const { return m_is_atomic; } bool is_atomic_64bit(void) const { return m_is_atomic && m_size == 8; } bool is_like_size_t(void) const { return m_is_like_size_t; } bool is_bool(void) const { return m_is_bool; } size_t get_size(void) const { return m_size; } size_t get_value_size(void) const { return m_value_size; } // When passing values of this type to a kernel, what buffer type // should be used? const char* get_buf_elem_type(void) const { return m_buf_elem_type.c_str(); } std::string as_string(const cl_uchar* value_ptr) const { // This method would be shorter if I had a real handle to element // vector type. if (this->is_bool()) { std::string result(name); result += "<"; result += (*value_ptr ? "true" : "false"); result += ", "; char buf[10]; sprintf(buf, "%02x", *value_ptr); result += buf; result += ">"; return result; } else if (this->num_elem()) { std::string result(name); result += "<"; for (unsigned ielem = 0; ielem < this->num_elem(); ielem++) { char buf[MAX_STR]; if (ielem) result += ", "; for (unsigned ibyte = 0; ibyte < this->m_elem_type->get_size(); ibyte++) { sprintf(buf + 2 * ibyte, "%02x", value_ptr[ielem * this->m_elem_type->get_size() + ibyte]); } result += buf; } result += ">"; return result; } else { std::string result(name); result += "<"; char buf[MAX_STR]; for (unsigned ibyte = 0; ibyte < this->get_size(); ibyte++) { sprintf(buf + 2 * ibyte, "%02x", value_ptr[ibyte]); } result += buf; result += ">"; return result; } } // Initialize the given buffer to a constant value initialized as if it // were from the INIT_VAR macro below. // Only needs to support values 0 and 1. void init(cl_uchar* buf, cl_uchar val) const { if (this->num_elem()) { for (unsigned ielem = 0; ielem < this->num_elem(); ielem++) { // Delegate! this->init_elem( buf + ielem * this->get_value_size() / this->num_elem(), val); } } else { init_elem(buf, val); } } private: void init_elem(cl_uchar* buf, cl_uchar val) const { size_t elem_size = this->num_elem() ? this->get_value_size() / this->num_elem() : this->get_size(); memset(buf, 0, elem_size); if (val) { if (strstr(name.c_str(), "float")) { *(float*)buf = (float)val; return; } if (strstr(name.c_str(), "double")) { *(double*)buf = (double)val; return; } if (this->is_bool()) { *buf = (bool)val; return; } // Write a single character value to the correct spot, // depending on host endianness. if (l_host_is_big_endian) *(buf + elem_size - 1) = (cl_uchar)val; else *buf = (cl_uchar)val; } } public: void dump(FILE* fp) const { fprintf(fp, "Type %s : <%d,%d,%s> ", name.c_str(), (int)m_size, (int)m_value_size, m_buf_elem_type.c_str()); if (this->m_elem_type) fprintf(fp, " vec(%s,%d)", this->m_elem_type->get_name_c_str(), this->num_elem()); if (this->m_is_vecbase) fprintf(fp, " vecbase"); if (this->m_is_bool) fprintf(fp, " bool"); if (this->m_is_like_size_t) fprintf(fp, " like-size_t"); if (this->m_is_atomic) fprintf(fp, " atomic"); fprintf(fp, "\n"); fflush(fp); } private: std::string name; TypeInfo* m_elem_type; int m_num_elem; bool m_is_vecbase; bool m_is_atomic; bool m_is_like_size_t; bool m_is_bool; size_t m_size; // Number of bytes of storage occupied by this type. size_t m_value_size; // Number of bytes of value significant for this type. // Differs for vec3. // When passing values of this type to a kernel, what buffer type // should be used? // For most types, it's just itself. // Use a std::string so I don't have to make a copy constructor. std::string m_buf_elem_type; }; #define NUM_SCALAR_TYPES \ (8 + 2) // signed and unsigned integral types, float and double #define NUM_VECTOR_SIZES (5) // 2,3,4,8,16 #define NUM_PLAIN_TYPES \ 5 /*boolean and size_t family */ \ + NUM_SCALAR_TYPES + NUM_SCALAR_TYPES* NUM_VECTOR_SIZES \ + 10 /* atomic types */ // Need room for plain, array, pointer, struct #define MAX_TYPES (4 * NUM_PLAIN_TYPES) static TypeInfo type_info[MAX_TYPES]; static int num_type_info = 0; // Number of valid entries in type_info[] // A helper class to form kernel source arguments for clCreateProgramWithSource. class StringTable { public: StringTable(): m_strings(), m_c_strs(NULL), m_lengths(NULL), m_frozen(false) {} ~StringTable() { release_frozen(); } void add(std::string s) { release_frozen(); m_strings.push_back(s); } const size_t num_str() { freeze(); return m_strings.size(); } const char** strs() { freeze(); return m_c_strs; } const size_t* lengths() { freeze(); return m_lengths; } private: void freeze(void) { if (!m_frozen) { release_frozen(); m_c_strs = (const char**)malloc(sizeof(const char*) * m_strings.size()); m_lengths = (size_t*)malloc(sizeof(size_t) * m_strings.size()); assert(m_c_strs); assert(m_lengths); for (size_t i = 0; i < m_strings.size(); i++) { m_c_strs[i] = m_strings[i].c_str(); m_lengths[i] = strlen(m_c_strs[i]); } m_frozen = true; } } void release_frozen(void) { if (m_c_strs) { free(m_c_strs); m_c_strs = 0; } if (m_lengths) { free(m_lengths); m_lengths = 0; } m_frozen = false; } typedef std::vector strlist_t; strlist_t m_strings; const char** m_c_strs; size_t* m_lengths; bool m_frozen; }; //////////////////// // File scope function declarations static void l_load_abilities(cl_device_id device); static const char* l_get_fp64_pragma(void); static const char* l_get_cles_int64_pragma(void); static int l_build_type_table(cl_device_id device); static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret); static void l_set_randomly(cl_uchar* buf, size_t buf_size, RandomSeed& rand_state); static int l_compare(const char* test_name, const cl_uchar* expected, const cl_uchar* received, size_t num_values, const TypeInfo& ti); static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo& ti); static std::string conversion_functions(const TypeInfo& ti); static std::string global_decls(const TypeInfo& ti, bool with_init); static std::string global_check_function(const TypeInfo& ti); static std::string writer_function(const TypeInfo& ti); static std::string reader_function(const TypeInfo& ti); static int l_write_read(cl_device_id device, cl_context context, cl_command_queue queue); static int l_write_read_for_type(cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state); static int l_init_write_read(cl_device_id device, cl_context context, cl_command_queue queue); static int l_init_write_read_for_type(cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state); static int l_capacity(cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size); static int l_user_type(cl_device_id device, cl_context context, cl_command_queue queue, bool separate_compile); static std::string get_build_options(cl_device_id device); //////////////////// // File scope function definitions static cl_int print_build_log(cl_program program, cl_uint num_devices, cl_device_id* device_list, cl_uint count, const char** strings, const size_t* lengths, const char* options) { cl_uint i; cl_int error; BufferOwningPtr devices; if (num_devices == 0 || device_list == NULL) { error = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(num_devices), &num_devices, NULL); test_error(error, "clGetProgramInfo CL_PROGRAM_NUM_DEVICES failed"); device_list = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices); devices.reset(device_list); memset(device_list, 0, sizeof(cl_device_id) * num_devices); error = clGetProgramInfo(program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) * num_devices, device_list, NULL); test_error(error, "clGetProgramInfo CL_PROGRAM_DEVICES failed"); } cl_uint z; bool sourcePrinted = false; for (z = 0; z < num_devices; z++) { char deviceName[4096] = ""; error = clGetDeviceInfo(device_list[z], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); check_error(error, "Device \"%d\" failed to return a name. clGetDeviceInfo " "CL_DEVICE_NAME failed", z); cl_build_status buildStatus; error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_STATUS, sizeof(buildStatus), &buildStatus, NULL); check_error(error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_STATUS failed"); if (buildStatus != CL_BUILD_SUCCESS) { if (!sourcePrinted) { log_error("Build options: %s\n", options); if (count && strings) { log_error("Original source is: ------------\n"); for (i = 0; i < count; i++) log_error("%s", strings[i]); } sourcePrinted = true; } char statusString[64] = ""; if (buildStatus == (cl_build_status)CL_BUILD_SUCCESS) sprintf(statusString, "CL_BUILD_SUCCESS"); else if (buildStatus == (cl_build_status)CL_BUILD_NONE) sprintf(statusString, "CL_BUILD_NONE"); else if (buildStatus == (cl_build_status)CL_BUILD_ERROR) sprintf(statusString, "CL_BUILD_ERROR"); else if (buildStatus == (cl_build_status)CL_BUILD_IN_PROGRESS) sprintf(statusString, "CL_BUILD_IN_PROGRESS"); else sprintf(statusString, "UNKNOWN (%d)", buildStatus); log_error("Build not successful for device \"%s\", status: %s\n", deviceName, statusString); size_t paramSize = 0; error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, 0, NULL, ¶mSize); if (check_error( error, "clGetProgramBuildInfo CL_PROGRAM_BUILD_LOG failed")) break; std::string log; log.resize(paramSize / sizeof(char)); error = clGetProgramBuildInfo(program, device_list[z], CL_PROGRAM_BUILD_LOG, paramSize, &log[0], NULL); if (check_error(error, "Device %d (%s) failed to return a build log", z, deviceName)) break; if (log[0] == 0) log_error("clGetProgramBuildInfo returned an empty log.\n"); else { log_error("Build log for device \"%s\":\n", deviceName); log_error("%s\n", log.c_str()); } } } return error; } static void l_load_abilities(cl_device_id device) { l_has_half = is_extension_available(device, "cl_khr_fp16"); l_has_double = is_extension_available(device, "cl_khr_fp64"); l_has_cles_int64 = is_extension_available(device, "cles_khr_int64"); l_has_int64_atomics = is_extension_available(device, "cl_khr_int64_base_atomics") && is_extension_available(device, "cl_khr_int64_extended_atomics"); { int status = CL_SUCCESS; cl_uint addr_bits = 32; status = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, 0); l_64bit_device = (status == CL_SUCCESS && addr_bits == 64); } // 32-bit devices always have intptr atomics. l_has_intptr_atomics = !l_64bit_device || l_has_int64_atomics; union { char c[4]; int i; } probe; probe.i = 1; l_host_is_big_endian = !probe.c[0]; // Determine max global id. { int status = CL_SUCCESS; cl_uint max_dim = 0; status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_dim), &max_dim, 0); if (check_error(status, "clGetDeviceInfo for " "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return; assert(max_dim > 0); size_t max_id[3]; max_id[0] = 0; status = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, max_dim * sizeof(size_t), &max_id[0], 0); if (check_error(status, "clGetDeviceInfo for " "CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return; l_max_global_id0 = max_id[0]; } { // Is separate compilation supported? int status = CL_SUCCESS; l_linker_available = false; status = clGetDeviceInfo(device, CL_DEVICE_LINKER_AVAILABLE, sizeof(l_linker_available), &l_linker_available, 0); if (check_error(status, "clGetDeviceInfo for " "CL_DEVICE_LINKER_AVAILABLE failed.")) return; } } static const char* l_get_fp64_pragma(void) { return l_has_double ? "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" : ""; } static const char* l_get_cles_int64_pragma(void) { return l_has_cles_int64 ? "#pragma OPENCL EXTENSION cles_khr_int64 : enable\n" : ""; } static const char* l_get_int64_atomic_pragma(void) { return "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"; } static int l_build_type_table(cl_device_id device) { int status = CL_SUCCESS; size_t iscalar = 0; size_t ivecsize = 0; int vecsizes[] = { 2, 3, 4, 8, 16 }; const char* vecbase[] = { "uchar", "char", "ushort", "short", "uint", "int", "ulong", "long", "float", "double" }; int vecbase_size[] = { 1, 1, 2, 2, 4, 4, 8, 8, 4, 8 }; const char* like_size_t[] = { "intptr_t", "uintptr_t", "size_t", "ptrdiff_t" }; const char* atomics[] = { "atomic_int", "atomic_uint", "atomic_long", "atomic_ulong", "atomic_float", "atomic_double", }; int atomics_size[] = { 4, 4, 8, 8, 4, 8 }; const char* intptr_atomics[] = { "atomic_intptr_t", "atomic_uintptr_t", "atomic_size_t", "atomic_ptrdiff_t" }; l_load_abilities(device); num_type_info = 0; // Boolean. type_info[num_type_info++] = TypeInfo("bool").set_bool().set_size(1).set_buf_elem_type("uchar"); // Vector types, and the related scalar element types. for (iscalar = 0; iscalar < sizeof(vecbase) / sizeof(vecbase[0]); ++iscalar) { if (!gHasLong && strstr(vecbase[iscalar], "long")) continue; if (!l_has_double && strstr(vecbase[iscalar], "double")) continue; // Scalar TypeInfo* elem_type = type_info + num_type_info++; *elem_type = TypeInfo(vecbase[iscalar]) .set_vecbase() .set_size(vecbase_size[iscalar]); // Vector for (ivecsize = 0; ivecsize < sizeof(vecsizes) / sizeof(vecsizes[0]); ivecsize++) { type_info[num_type_info++] = TypeInfo(elem_type, vecsizes[ivecsize]); } } // Size_t-like types for (iscalar = 0; iscalar < sizeof(like_size_t) / sizeof(like_size_t[0]); ++iscalar) { type_info[num_type_info++] = TypeInfo(like_size_t[iscalar]).set_like_size_t(); } // Atomic types. for (iscalar = 0; iscalar < sizeof(atomics) / sizeof(atomics[0]); ++iscalar) { if (!l_has_int64_atomics && strstr(atomics[iscalar], "long")) continue; if (!(l_has_int64_atomics && l_has_double) && strstr(atomics[iscalar], "double")) continue; // The +7 is used to skip over the "atomic_" prefix. const char* buf_type = atomics[iscalar] + 7; type_info[num_type_info++] = TypeInfo(atomics[iscalar]) .set_atomic() .set_size(atomics_size[iscalar]) .set_buf_elem_type(buf_type); } if (l_has_intptr_atomics) { for (iscalar = 0; iscalar < sizeof(intptr_atomics) / sizeof(intptr_atomics[0]); ++iscalar) { type_info[num_type_info++] = TypeInfo(intptr_atomics[iscalar]) .set_atomic() .set_like_size_t(); } } assert(num_type_info <= MAX_TYPES); // or increase MAX_TYPES #if 0 for ( size_t i = 0 ; i < num_type_info ; i++ ) { type_info[ i ].dump(stdout); } exit(0); #endif return status; } static const TypeInfo& l_find_type(const char* name) { auto itr = std::find_if(type_info, type_info + num_type_info, [name](TypeInfo& ti) { return ti.get_name() == name; }); assert(itr != type_info + num_type_info); return *itr; } // Populate return parameters for max program variable size, preferred program // variable size. static int l_get_device_info(cl_device_id device, size_t* max_size_ret, size_t* pref_size_ret) { int err = CL_SUCCESS; size_t return_size = 0; err = clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, sizeof(*max_size_ret), max_size_ret, &return_size); if (err != CL_SUCCESS) { log_error("Error: Failed to get device info for " "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n"); return err; } if (return_size != sizeof(size_t)) { log_error("Error: Invalid size %d returned for " "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size); return 1; } if (return_size != sizeof(size_t)) { log_error("Error: Invalid size %d returned for " "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE\n", (int)return_size); return 1; } return_size = 0; err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE, sizeof(*pref_size_ret), pref_size_ret, &return_size); if (err != CL_SUCCESS) { log_error("Error: Failed to get device info for " "CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE: %d\n", err); return err; } if (return_size != sizeof(size_t)) { log_error("Error: Invalid size %d returned for " "CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE\n", (int)return_size); return 1; } return CL_SUCCESS; } static void l_set_randomly(cl_uchar* buf, size_t buf_size, RandomSeed& rand_state) { assert(0 == (buf_size % sizeof(cl_uint))); for (size_t i = 0; i < buf_size; i += sizeof(cl_uint)) { *((cl_uint*)(buf + i)) = genrand_int32(rand_state); } #if 0 for ( size_t i = 0; i < buf_size ; i++ ) { printf("%02x",buf[i]); } printf("\n"); #endif } // Return num_value values of the given type. // Returns CL_SUCCESS if they compared as equal. static int l_compare(const char* test_name, const cl_uchar* expected, const cl_uchar* received, size_t num_values, const TypeInfo& ti) { // Compare only the valid returned bytes. for (unsigned value_idx = 0; value_idx < num_values; value_idx++) { const cl_uchar* expv = expected + value_idx * ti.get_size(); const cl_uchar* gotv = received + value_idx * ti.get_size(); if (memcmp(expv, gotv, ti.get_value_size())) { std::string exp_str = ti.as_string(expv); std::string got_str = ti.as_string(gotv); log_error( "Error: %s test for type %s, at index %d: Expected %s got %s\n", test_name, ti.get_name_c_str(), value_idx, exp_str.c_str(), got_str.c_str()); return 1; } } return CL_SUCCESS; } // Copy a target value from src[idx] to dest[idx] static int l_copy(cl_uchar* dest, unsigned dest_idx, const cl_uchar* src, unsigned src_idx, const TypeInfo& ti) { cl_uchar* raw_dest = dest + dest_idx * ti.get_size(); const cl_uchar* raw_src = src + src_idx * ti.get_size(); memcpy(raw_dest, raw_src, ti.get_value_size()); return 0; } static std::string conversion_functions(const TypeInfo& ti) { std::string result; static char buf[MAX_STR]; int num_printed = 0; // The atomic types just use the base type. if (ti.is_atomic() || 0 == strcmp(ti.get_buf_elem_type(), ti.get_name_c_str())) { // The type is represented in a buffer by itself. num_printed = snprintf(buf, MAX_STR, "%s from_buf(%s a) { return a; }\n" "%s to_buf(%s a) { return a; }\n", ti.get_buf_elem_type(), ti.get_buf_elem_type(), ti.get_buf_elem_type(), ti.get_buf_elem_type()); } else { // Just use C-style cast. num_printed = snprintf(buf, MAX_STR, "%s from_buf(%s a) { return (%s)a; }\n" "%s to_buf(%s a) { return (%s)a; }\n", ti.get_name_c_str(), ti.get_buf_elem_type(), ti.get_name_c_str(), ti.get_buf_elem_type(), ti.get_name_c_str(), ti.get_buf_elem_type()); } // Add initializations. if (ti.is_atomic()) { num_printed += snprintf(buf + num_printed, MAX_STR - num_printed, "#define INIT_VAR(a) ATOMIC_VAR_INIT(a)\n"); } else { // This cast works even if the target type is a vector type. num_printed += snprintf(buf + num_printed, MAX_STR - num_printed, "#define INIT_VAR(a) ((%s)(a))\n", ti.get_name_c_str()); } assert(num_printed < MAX_STR); // or increase MAX_STR result = buf; return result; } static std::string global_decls(const TypeInfo& ti, bool with_init) { const char* tn = ti.get_name_c_str(); const char* vol = (ti.is_atomic() ? " volatile " : " "); static char decls[MAX_STR]; int num_printed = 0; if (with_init) { const char* decls_template_with_init = "%s %s var = INIT_VAR(0);\n" "global %s %s g_var = INIT_VAR(1);\n" "%s %s a_var[2] = { INIT_VAR(1), INIT_VAR(1) };\n" "volatile global %s %s* p_var = &a_var[1];\n\n"; num_printed = snprintf(decls, sizeof(decls), decls_template_with_init, vol, tn, vol, tn, vol, tn, vol, tn); } else { const char* decls_template_no_init = "%s %s var;\n" "global %s %s g_var;\n" "%s %s a_var[2];\n" "global %s %s* p_var;\n\n"; num_printed = snprintf(decls, sizeof(decls), decls_template_no_init, vol, tn, vol, tn, vol, tn, vol, tn); } assert(num_printed < sizeof(decls)); (void)num_printed; return std::string(decls); } // Return the source code for the "global_check" function for the given type. // This function checks that all program-scope variables have appropriate // initial values when no explicit initializer is used. If all tests pass the // kernel writes a non-zero value to its output argument, otherwise it writes // zero. static std::string global_check_function(const TypeInfo& ti) { const std::string type_name = ti.get_buf_elem_type(); // all() should only be used on vector inputs. For scalar comparison, the // result of the equality operator can be used as a bool value. const bool is_scalar = ti.num_elem() == 0; // 0 is used to represent scalar types, not 1. const std::string is_equality_true = is_scalar ? "" : "all"; std::string code = "kernel void global_check(global int* out) {\n"; code += " const " + type_name + " zero = ((" + type_name + ")0);\n"; code += " bool status = true;\n"; if (ti.is_atomic()) { code += " status &= " + is_equality_true + "(atomic_load(&var) == zero);\n"; code += " status &= " + is_equality_true + "(atomic_load(&g_var) == zero);\n"; code += " status &= " + is_equality_true + "(atomic_load(&a_var[0]) == zero);\n"; code += " status &= " + is_equality_true + "(atomic_load(&a_var[1]) == zero);\n"; } else { code += " status &= " + is_equality_true + "(var == zero);\n"; code += " status &= " + is_equality_true + "(g_var == zero);\n"; code += " status &= " + is_equality_true + "(a_var[0] == zero);\n"; code += " status &= " + is_equality_true + "(a_var[1] == zero);\n"; } code += " status &= (p_var == NULL);\n"; code += " *out = status ? 1 : 0;\n"; code += "}\n\n"; return code; } // Return the source text for the writer function for the given type. // For types that can't be passed as pointer-to-type as a kernel argument, // use a substitute base type of the same size. static std::string writer_function(const TypeInfo& ti) { static char writer_src[MAX_STR]; int num_printed = 0; if (!ti.is_atomic()) { const char* writer_template_normal = "kernel void writer( global %s* src, uint idx ) {\n" " var = from_buf(src[0]);\n" " g_var = from_buf(src[1]);\n" " a_var[0] = from_buf(src[2]);\n" " a_var[1] = from_buf(src[3]);\n" " p_var = a_var + idx;\n" "}\n\n"; num_printed = snprintf(writer_src, sizeof(writer_src), writer_template_normal, ti.get_buf_elem_type()); } else { const char* writer_template_atomic = "kernel void writer( global %s* src, uint idx ) {\n" " atomic_store( &var, from_buf(src[0]) );\n" " atomic_store( &g_var, from_buf(src[1]) );\n" " atomic_store( &a_var[0], from_buf(src[2]) );\n" " atomic_store( &a_var[1], from_buf(src[3]) );\n" " p_var = a_var + idx;\n" "}\n\n"; num_printed = snprintf(writer_src, sizeof(writer_src), writer_template_atomic, ti.get_buf_elem_type()); } assert(num_printed < sizeof(writer_src)); (void)num_printed; std::string result = writer_src; return result; } // Return source text for teh reader function for the given type. // For types that can't be passed as pointer-to-type as a kernel argument, // use a substitute base type of the same size. static std::string reader_function(const TypeInfo& ti) { static char reader_src[MAX_STR]; int num_printed = 0; if (!ti.is_atomic()) { const char* reader_template_normal = "kernel void reader( global %s* dest, %s ptr_write_val ) {\n" " *p_var = from_buf(ptr_write_val);\n" " dest[0] = to_buf(var);\n" " dest[1] = to_buf(g_var);\n" " dest[2] = to_buf(a_var[0]);\n" " dest[3] = to_buf(a_var[1]);\n" "}\n\n"; num_printed = snprintf(reader_src, sizeof(reader_src), reader_template_normal, ti.get_buf_elem_type(), ti.get_buf_elem_type()); } else { const char* reader_template_atomic = "kernel void reader( global %s* dest, %s ptr_write_val ) {\n" " atomic_store( p_var, from_buf(ptr_write_val) );\n" " dest[0] = to_buf( atomic_load( &var ) );\n" " dest[1] = to_buf( atomic_load( &g_var ) );\n" " dest[2] = to_buf( atomic_load( &a_var[0] ) );\n" " dest[3] = to_buf( atomic_load( &a_var[1] ) );\n" "}\n\n"; num_printed = snprintf(reader_src, sizeof(reader_src), reader_template_atomic, ti.get_buf_elem_type(), ti.get_buf_elem_type()); } assert(num_printed < sizeof(reader_src)); (void)num_printed; std::string result = reader_src; return result; } // Check that all globals where appropriately default-initialized. static int check_global_initialization(cl_context context, cl_program program, cl_command_queue queue) { int status = CL_SUCCESS; // Create a buffer on device to store a unique integer. cl_int is_init_valid = 0; clMemWrapper buffer( clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(is_init_valid), &is_init_valid, &status)); test_error_ret(status, "Failed to allocate buffer", status); // Create, setup and invoke kernel. clKernelWrapper global_check( clCreateKernel(program, "global_check", &status)); test_error_ret(status, "Failed to create global_check kernel", status); status = clSetKernelArg(global_check, 0, sizeof(cl_mem), &buffer); test_error_ret(status, "Failed to set up argument for the global_check kernel", status); const cl_uint work_dim = 1; const size_t global_work_offset[] = { 0 }; const size_t global_work_size[] = { 1 }; status = clEnqueueNDRangeKernel(queue, global_check, work_dim, global_work_offset, global_work_size, nullptr, 0, nullptr, nullptr); test_error_ret(status, "Failed to run global_check kernel", status); status = clFinish(queue); test_error_ret(status, "clFinish() failed", status); // Read back the memory buffer from the device. status = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(is_init_valid), &is_init_valid, 0, nullptr, nullptr); test_error_ret(status, "Failed to read buffer from device", status); if (is_init_valid == 0) { log_error("Unexpected default values were detected"); return 1; } return CL_SUCCESS; } // Check write-then-read. static int l_write_read(cl_device_id device, cl_context context, cl_command_queue queue) { int status = CL_SUCCESS; int itype; RandomSeed rand_state(gRandomSeed); for (itype = 0; itype < num_type_info; itype++) { status = status | l_write_read_for_type(device, context, queue, type_info[itype], rand_state); FLUSH; } return status; } static int l_write_read_for_type(cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state) { int err = CL_SUCCESS; std::string type_name(ti.get_name()); const char* tn = type_name.c_str(); log_info(" %s ", tn); StringTable ksrc; ksrc.add(l_get_fp64_pragma()); ksrc.add(l_get_cles_int64_pragma()); if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma()); ksrc.add(conversion_functions(ti)); ksrc.add(global_decls(ti, false)); ksrc.add(global_check_function(ti)); ksrc.add(writer_function(ti)); ksrc.add(reader_function(ti)); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper writer; status = create_single_kernel_helper(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer"); test_error_ret(status, "Failed to create program for read-after-write test", status); clKernelWrapper reader(clCreateKernel(program, "reader", &status)); test_error_ret(status, "Failed to create reader kernel for read-after-write test", status); // Check size query. size_t used_bytes = 0; status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0); test_error_ret(status, "Failed to query global variable total size", status); size_t expected_used_bytes = (NUM_TESTED_VALUES - 1) * ti.get_size() // Two regular variables and an array of 2 elements. + (l_64bit_device ? 8 : 4); // The pointer if (used_bytes < expected_used_bytes) { log_error("Error program query for global variable total size query " "failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes); err |= 1; } err |= check_global_initialization(context, program, queue); // We need to create 5 random values of the given type, // and read 4 of them back. const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16); const size_t read_data_size = (NUM_TESTED_VALUES - 1) * sizeof(cl_ulong16); cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT); cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT); clMemWrapper write_mem(clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status)); test_error_ret(status, "Failed to allocate write buffer", status); clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status)); test_error_ret(status, "Failed to allocate read buffer", status); status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem); test_error_ret(status, "set arg", status); status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem); test_error_ret(status, "set arg", status); // Boolean random data needs to be massaged a bit more. const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES) : NUM_ROUNDS; unsigned bool_iter = 0; for (int iround = 0; iround < num_rounds; iround++) { for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++) { // Index into array, to write via pointer // Generate new random data to push through. // Generate 5 * 128 bytes all the time, even though the test for // many types use less than all that. cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0); if (ti.is_bool()) { // For boolean, random data cast to bool isn't very random. // So use the bottom bit of bool_value_iter to get true // diversity. for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES; value_idx++) { write_data[value_idx] = (1 << value_idx) & bool_iter; // printf(" %s", (write_data[value_idx] ? "true" : "false" // )); } bool_iter++; } else { l_set_randomly(write_data, write_data_size, rand_state); } status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx); test_error_ret(status, "set arg", status); // The value to write via the pointer should be taken from the // 5th typed slot of the write_data. status = clSetKernelArg( reader, 1, ti.get_size(), write_data + (NUM_TESTED_VALUES - 1) * ti.get_size()); test_error_ret(status, "set arg", status); // Determine the expected values. cl_uchar expected[read_data_size]; memset(expected, -1, sizeof(expected)); l_copy(expected, 0, write_data, 0, ti); l_copy(expected, 1, write_data, 1, ti); l_copy(expected, 2, write_data, 2, ti); l_copy(expected, 3, write_data, 3, ti); // But we need to take into account the value from the pointer // write. The 2 represents where the "a" array values begin in our // read-back. l_copy(expected, 2 + iptr_idx, write_data, 4, ti); clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0); if (ti.is_bool()) { // Collapse down to one bit. for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) expected[i] = (bool)expected[i]; } cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); memset(read_data, -1, read_data_size); clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); // Now run the kernel const size_t one = 1; status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue writer", status); status = clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue reader", status); status = clFinish(queue); test_error_ret(status, "finish", status); read_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); if (ti.is_bool()) { // Collapse down to one bit. for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) read_data[i] = (bool)read_data[i]; } // Compare only the valid returned bytes. int compare_result = l_compare("read-after-write", expected, read_data, NUM_TESTED_VALUES - 1, ti); // log_info("Compared %d values each of size %llu. Result %d\n", // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), // compare_result ); err |= compare_result; clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); if (err) break; } } if (CL_SUCCESS == err) { log_info("OK\n"); FLUSH; } align_free(write_data); align_free(read_data); return err; } // Check initialization, then, read, then write, then read. static int l_init_write_read(cl_device_id device, cl_context context, cl_command_queue queue) { int status = CL_SUCCESS; int itype; RandomSeed rand_state(gRandomSeed); for (itype = 0; itype < num_type_info; itype++) { status = status | l_init_write_read_for_type(device, context, queue, type_info[itype], rand_state); } return status; } static int l_init_write_read_for_type(cl_device_id device, cl_context context, cl_command_queue queue, const TypeInfo& ti, RandomSeed& rand_state) { int err = CL_SUCCESS; std::string type_name(ti.get_name()); const char* tn = type_name.c_str(); log_info(" %s ", tn); StringTable ksrc; ksrc.add(l_get_fp64_pragma()); ksrc.add(l_get_cles_int64_pragma()); if (ti.is_atomic_64bit()) ksrc.add(l_get_int64_atomic_pragma()); ksrc.add(conversion_functions(ti)); ksrc.add(global_decls(ti, true)); ksrc.add(writer_function(ti)); ksrc.add(reader_function(ti)); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper writer; status = create_single_kernel_helper(context, &program, &writer, ksrc.num_str(), ksrc.strs(), "writer"); test_error_ret(status, "Failed to create program for init-read-after-write test", status); clKernelWrapper reader(clCreateKernel(program, "reader", &status)); test_error_ret( status, "Failed to create reader kernel for init-read-after-write test", status); // Check size query. size_t used_bytes = 0; status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0); test_error_ret(status, "Failed to query global variable total size", status); size_t expected_used_bytes = (NUM_TESTED_VALUES - 1) * ti.get_size() // Two regular variables and an array of 2 elements. + (l_64bit_device ? 8 : 4); // The pointer if (used_bytes < expected_used_bytes) { log_error("Error: program query for global variable total size query " "failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_used_bytes, (unsigned long long)used_bytes); err |= 1; } // We need to create 5 random values of the given type, // and read 4 of them back. const size_t write_data_size = NUM_TESTED_VALUES * sizeof(cl_ulong16); const size_t read_data_size = (NUM_TESTED_VALUES - 1) * sizeof(cl_ulong16); cl_uchar* write_data = (cl_uchar*)align_malloc(write_data_size, ALIGNMENT); cl_uchar* read_data = (cl_uchar*)align_malloc(read_data_size, ALIGNMENT); clMemWrapper write_mem(clCreateBuffer( context, CL_MEM_USE_HOST_PTR, write_data_size, write_data, &status)); test_error_ret(status, "Failed to allocate write buffer", status); clMemWrapper read_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, read_data_size, read_data, &status)); test_error_ret(status, "Failed to allocate read buffer", status); status = clSetKernelArg(writer, 0, sizeof(cl_mem), &write_mem); test_error_ret(status, "set arg", status); status = clSetKernelArg(reader, 0, sizeof(cl_mem), &read_mem); test_error_ret(status, "set arg", status); // Boolean random data needs to be massaged a bit more. const int num_rounds = ti.is_bool() ? (1 << NUM_TESTED_VALUES) : NUM_ROUNDS; unsigned bool_iter = 0; // We need to count iterations. We do something *different on the // first iteration, to ensure we actually pick up the initialized // values. unsigned iteration = 0; for (int iround = 0; iround < num_rounds; iround++) { for (cl_uint iptr_idx = 0; iptr_idx < 2; iptr_idx++) { // Index into array, to write via pointer // Generate new random data to push through. // Generate 5 * 128 bytes all the time, even though the test for // many types use less than all that. cl_uchar* write_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, write_mem, CL_TRUE, CL_MAP_WRITE, 0, write_data_size, 0, 0, 0, 0); if (ti.is_bool()) { // For boolean, random data cast to bool isn't very random. // So use the bottom bit of bool_value_iter to get true // diversity. for (unsigned value_idx = 0; value_idx < NUM_TESTED_VALUES; value_idx++) { write_data[value_idx] = (1 << value_idx) & bool_iter; // printf(" %s", (write_data[value_idx] ? "true" : "false" // )); } bool_iter++; } else { l_set_randomly(write_data, write_data_size, rand_state); } status = clSetKernelArg(writer, 1, sizeof(cl_uint), &iptr_idx); test_error_ret(status, "set arg", status); if (!iteration) { // On first iteration, the value we write via the last arg // to the "reader" function is 0. // It's way easier to code the test this way. ti.init(write_data + (NUM_TESTED_VALUES - 1) * ti.get_size(), 0); } // The value to write via the pointer should be taken from the // 5th typed slot of the write_data. status = clSetKernelArg( reader, 1, ti.get_size(), write_data + (NUM_TESTED_VALUES - 1) * ti.get_size()); test_error_ret(status, "set arg", status); // Determine the expected values. cl_uchar expected[read_data_size]; memset(expected, -1, sizeof(expected)); if (iteration) { l_copy(expected, 0, write_data, 0, ti); l_copy(expected, 1, write_data, 1, ti); l_copy(expected, 2, write_data, 2, ti); l_copy(expected, 3, write_data, 3, ti); // But we need to take into account the value from the pointer // write. The 2 represents where the "a" array values begin in // our read-back. But we need to take into account the value // from the pointer write. l_copy(expected, 2 + iptr_idx, write_data, 4, ti); } else { // On first iteration, expect these initialized values! // See the decls_template_with_init above. ti.init(expected, 0); ti.init(expected + ti.get_size(), 1); ti.init(expected + 2 * ti.get_size(), 1); // Emulate the effect of the write via the pointer. // The value is 0, not 1 (see above). // The pointer is always initialized to the second element // of the array. So it goes into slot 3 of the "expected" array. ti.init(expected + 3 * ti.get_size(), 0); } if (ti.is_bool()) { // Collapse down to one bit. for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) expected[i] = (bool)expected[i]; } clEnqueueUnmapMemObject(queue, write_mem, write_ptr, 0, 0, 0); cl_uchar* read_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); memset(read_data, -1, read_data_size); clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); // Now run the kernel const size_t one = 1; if (iteration) { status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue writer", status); } else { // On first iteration, we should be picking up the // initialized value. So don't enqueue the writer. } status = clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue reader", status); status = clFinish(queue); test_error_ret(status, "finish", status); read_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, read_mem, CL_TRUE, CL_MAP_READ, 0, read_data_size, 0, 0, 0, 0); if (ti.is_bool()) { // Collapse down to one bit. for (unsigned i = 0; i < NUM_TESTED_VALUES - 1; i++) read_data[i] = (bool)read_data[i]; } // Compare only the valid returned bytes. // log_info(" Round %d ptr_idx %u\n", iround, iptr_idx ); int compare_result = l_compare("init-write-read", expected, read_data, NUM_TESTED_VALUES - 1, ti); // log_info("Compared %d values each of size %llu. Result %d\n", // NUM_TESTED_VALUES-1, (unsigned long long)ti.get_value_size(), // compare_result ); err |= compare_result; clEnqueueUnmapMemObject(queue, read_mem, read_ptr, 0, 0, 0); if (err) break; iteration++; } } if (CL_SUCCESS == err) { log_info("OK\n"); FLUSH; } align_free(write_data); align_free(read_data); return err; } // Check that we can make at least one variable with size // max_size which is returned from the device info property : // CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE. static int l_capacity(cl_device_id device, cl_context context, cl_command_queue queue, size_t max_size) { int err = CL_SUCCESS; // Just test one type. const TypeInfo ti(l_find_type("uchar")); log_info(" l_capacity..."); const char prog_src_template[] = #if defined(_WIN32) "uchar var[%Iu];\n\n" #else "uchar var[%zu];\n\n" #endif "kernel void get_max_size( global ulong* size_ret ) {\n" #if defined(_WIN32) " *size_ret = (ulong)%Iu;\n" #else " *size_ret = (ulong)%zu;\n" #endif "}\n\n" "kernel void writer( global uchar* src ) {\n" " var[get_global_id(0)] = src[get_global_linear_id()];\n" "}\n\n" "kernel void reader( global uchar* dest ) {\n" " dest[get_global_linear_id()] = var[get_global_id(0)];\n" "}\n\n"; char prog_src[MAX_STR]; int num_printed = snprintf(prog_src, sizeof(prog_src), prog_src_template, max_size, max_size); assert(num_printed < MAX_STR); // or increase MAX_STR (void)num_printed; StringTable ksrc; ksrc.add(prog_src); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper get_max_size; status = create_single_kernel_helper(context, &program, &get_max_size, ksrc.num_str(), ksrc.strs(), "get_max_size"); test_error_ret(status, "Failed to create program for capacity test", status); // Check size query. size_t used_bytes = 0; status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0); test_error_ret(status, "Failed to query global variable total size", status); if (used_bytes < max_size) { log_error("Error: program query for global variable total size query " "failed: Expected at least %llu but got %llu\n", (unsigned long long)max_size, (unsigned long long)used_bytes); err |= 1; } // Prepare to execute clKernelWrapper writer(clCreateKernel(program, "writer", &status)); test_error_ret(status, "Failed to create writer kernel for capacity test", status); clKernelWrapper reader(clCreateKernel(program, "reader", &status)); test_error_ret(status, "Failed to create reader kernel for capacity test", status); cl_ulong max_size_ret = 0; const size_t arr_size = 10 * 1024 * 1024; cl_uchar* buffer = (cl_uchar*)align_malloc(arr_size, ALIGNMENT); if (!buffer) { log_error("Failed to allocate buffer\n"); return 1; } clMemWrapper max_size_ret_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(max_size_ret), &max_size_ret, &status)); test_error_ret(status, "Failed to allocate size query buffer", status); clMemWrapper buffer_mem( clCreateBuffer(context, CL_MEM_READ_WRITE, arr_size, 0, &status)); test_error_ret(status, "Failed to allocate write buffer", status); status = clSetKernelArg(get_max_size, 0, sizeof(cl_mem), &max_size_ret_mem); test_error_ret(status, "set arg", status); status = clSetKernelArg(writer, 0, sizeof(cl_mem), &buffer_mem); test_error_ret(status, "set arg", status); status = clSetKernelArg(reader, 0, sizeof(cl_mem), &buffer_mem); test_error_ret(status, "set arg", status); // Check the macro value of CL_DEVICE_MAX_GLOBAL_VARIABLE const size_t one = 1; status = clEnqueueNDRangeKernel(queue, get_max_size, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue size query", status); status = clFinish(queue); test_error_ret(status, "finish", status); cl_uchar* max_size_ret_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, max_size_ret_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(max_size_ret), 0, 0, 0, 0); if (max_size_ret != max_size) { log_error("Error: preprocessor definition for " "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE is %llu and does not " "match device query value %llu\n", (unsigned long long)max_size_ret, (unsigned long long)max_size); err |= 1; } clEnqueueUnmapMemObject(queue, max_size_ret_mem, max_size_ret_ptr, 0, 0, 0); RandomSeed rand_state_write(gRandomSeed); for (size_t offset = 0; offset < max_size; offset += arr_size) { size_t curr_size = (max_size - offset) < arr_size ? (max_size - offset) : arr_size; l_set_randomly(buffer, curr_size, rand_state_write); status = clEnqueueWriteBuffer(queue, buffer_mem, CL_TRUE, 0, curr_size, buffer, 0, 0, 0); test_error_ret(status, "populate buffer_mem object", status); status = clEnqueueNDRangeKernel(queue, writer, 1, &offset, &curr_size, 0, 0, 0, 0); test_error_ret(status, "enqueue writer", status); status = clFinish(queue); test_error_ret(status, "finish", status); } RandomSeed rand_state_read(gRandomSeed); for (size_t offset = 0; offset < max_size; offset += arr_size) { size_t curr_size = (max_size - offset) < arr_size ? (max_size - offset) : arr_size; status = clEnqueueNDRangeKernel(queue, reader, 1, &offset, &curr_size, 0, 0, 0, 0); test_error_ret(status, "enqueue reader", status); cl_uchar* read_mem_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, buffer_mem, CL_TRUE, CL_MAP_READ, 0, curr_size, 0, 0, 0, &status); test_error_ret(status, "map read data", status); l_set_randomly(buffer, curr_size, rand_state_read); err |= l_compare("capacity", buffer, read_mem_ptr, curr_size, ti); clEnqueueUnmapMemObject(queue, buffer_mem, read_mem_ptr, 0, 0, 0); } if (CL_SUCCESS == err) { log_info("OK\n"); FLUSH; } align_free(buffer); return err; } // Check operation on a user type. static int l_user_type(cl_device_id device, cl_context context, cl_command_queue queue, bool separate_compile) { int err = CL_SUCCESS; // Just test one type. const TypeInfo ti(l_find_type("uchar")); log_info(" l_user_type %s...", separate_compile ? "separate compilation" : "single source compilation"); if (separate_compile && !l_linker_available) { log_info("Separate compilation is not supported. Skipping test\n"); return err; } const char type_src[] = "typedef struct { uchar c; uint i; } my_struct_t;\n\n"; const char def_src[] = "my_struct_t var = { 'a', 42 };\n\n"; const char decl_src[] = "extern my_struct_t var;\n\n"; // Don't use a host struct. We can't guarantee that the host // compiler has the same structure layout as the device compiler. const char writer_src[] = "kernel void writer( uchar c, uint i ) {\n" " var.c = c;\n" " var.i = i;\n" "}\n\n"; const char reader_src[] = "kernel void reader( global uchar* C, global uint* I ) {\n" " *C = var.c;\n" " *I = var.i;\n" "}\n\n"; clProgramWrapper program; const std::string options = get_build_options(device); if (separate_compile) { // Separate compilation flow. StringTable wksrc; wksrc.add(type_src); wksrc.add(def_src); wksrc.add(writer_src); StringTable rksrc; rksrc.add(type_src); rksrc.add(decl_src); rksrc.add(reader_src); int status = CL_SUCCESS; clProgramWrapper writer_program(clCreateProgramWithSource( context, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), &status)); test_error_ret(status, "Failed to create writer program for user type test", status); status = clCompileProgram(writer_program, 1, &device, options.c_str(), 0, 0, 0, 0, 0); if (check_error( status, "Failed to compile writer program for user type test (%s)", IGetErrorString(status))) { print_build_log(writer_program, 1, &device, wksrc.num_str(), wksrc.strs(), wksrc.lengths(), options.c_str()); return status; } clProgramWrapper reader_program(clCreateProgramWithSource( context, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), &status)); test_error_ret(status, "Failed to create reader program for user type test", status); status = clCompileProgram(reader_program, 1, &device, options.c_str(), 0, 0, 0, 0, 0); if (check_error( status, "Failed to compile reader program for user type test (%s)", IGetErrorString(status))) { print_build_log(reader_program, 1, &device, rksrc.num_str(), rksrc.strs(), rksrc.lengths(), options.c_str()); return status; } cl_program progs[2]; progs[0] = writer_program; progs[1] = reader_program; program = clLinkProgram(context, 1, &device, "", 2, progs, 0, 0, &status); if (check_error(status, "Failed to link program for user type test (%s)", IGetErrorString(status))) { print_build_log(program, 1, &device, 0, NULL, NULL, ""); return status; } } else { // Single compilation flow. StringTable ksrc; ksrc.add(type_src); ksrc.add(def_src); ksrc.add(writer_src); ksrc.add(reader_src); int status = CL_SUCCESS; status = create_single_kernel_helper_create_program( context, &program, ksrc.num_str(), ksrc.strs(), options.c_str()); if (check_error(status, "Failed to build program for user type test (%s)", IGetErrorString(status))) { print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), options.c_str()); return status; } status = clBuildProgram(program, 1, &device, options.c_str(), 0, 0); if (check_error(status, "Failed to compile program for user type test (%s)", IGetErrorString(status))) { print_build_log(program, 1, &device, ksrc.num_str(), ksrc.strs(), ksrc.lengths(), options.c_str()); return status; } } // Check size query. size_t used_bytes = 0; int status = clGetProgramBuildInfo( program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0); test_error_ret(status, "Failed to query global variable total size", status); size_t expected_size = sizeof(cl_uchar) + sizeof(cl_uint); if (used_bytes < expected_size) { log_error("Error: program query for global variable total size query " "failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes); err |= 1; } // Prepare to execute clKernelWrapper writer(clCreateKernel(program, "writer", &status)); test_error_ret(status, "Failed to create writer kernel for user type test", status); clKernelWrapper reader(clCreateKernel(program, "reader", &status)); test_error_ret(status, "Failed to create reader kernel for user type test", status); // Set up data. cl_uchar* uchar_data = (cl_uchar*)align_malloc(sizeof(cl_uchar), ALIGNMENT); cl_uint* uint_data = (cl_uint*)align_malloc(sizeof(cl_uint), ALIGNMENT); clMemWrapper uchar_mem(clCreateBuffer( context, CL_MEM_USE_HOST_PTR, sizeof(cl_uchar), uchar_data, &status)); test_error_ret(status, "Failed to allocate uchar buffer", status); clMemWrapper uint_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(cl_uint), uint_data, &status)); test_error_ret(status, "Failed to allocate uint buffer", status); status = clSetKernelArg(reader, 0, sizeof(cl_mem), &uchar_mem); test_error_ret(status, "set arg", status); status = clSetKernelArg(reader, 1, sizeof(cl_mem), &uint_mem); test_error_ret(status, "set arg", status); cl_uchar expected_uchar = 'a'; cl_uint expected_uint = 42; for (unsigned iter = 0; iter < 5; iter++) { // Must go around at least twice // Read back data *uchar_data = -1; *uint_data = -1; const size_t one = 1; status = clEnqueueNDRangeKernel(queue, reader, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue reader", status); status = clFinish(queue); test_error_ret(status, "finish", status); cl_uchar* uint_data_ptr = (cl_uchar*)clEnqueueMapBuffer(queue, uint_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint), 0, 0, 0, 0); cl_uchar* uchar_data_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, uchar_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uchar), 0, 0, 0, 0); if (expected_uchar != *uchar_data || expected_uint != *uint_data) { log_error( "FAILED: Iteration %d Got (0x%2x,%d) but expected (0x%2x,%d)\n", iter, (int)*uchar_data, *uint_data, (int)expected_uchar, expected_uint); err |= 1; } clEnqueueUnmapMemObject(queue, uint_mem, uint_data_ptr, 0, 0, 0); clEnqueueUnmapMemObject(queue, uchar_mem, uchar_data_ptr, 0, 0, 0); // Mutate the data. expected_uchar++; expected_uint++; // Write the new values into persistent store. *uchar_data = expected_uchar; *uint_data = expected_uint; status = clSetKernelArg(writer, 0, sizeof(cl_uchar), uchar_data); test_error_ret(status, "set arg", status); status = clSetKernelArg(writer, 1, sizeof(cl_uint), uint_data); test_error_ret(status, "set arg", status); status = clEnqueueNDRangeKernel(queue, writer, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue writer", status); status = clFinish(queue); test_error_ret(status, "finish", status); } if (CL_SUCCESS == err) { log_info("OK\n"); FLUSH; } align_free(uchar_data); align_free(uint_data); return err; } static std::string get_build_options(cl_device_id device) { std::string options = "-cl-std=CL"; Version latest_cl_c_version = get_device_latest_cl_c_version(device); options += latest_cl_c_version.to_string(); return options; } // Determines whether its valid to skip this test based on the driver version // and the features it optionally supports. // Whether the test should be skipped is writen into the out paramter skip. // The check returns an error code for the clDeviceInfo query. static cl_int should_skip(cl_device_id device, cl_bool& skip) { // Assume we can't skip to begin with. skip = CL_FALSE; // Progvar tests are already skipped for OpenCL < 2.0, so here we only need // to test for 3.0 since that is when program scope global variables become // optional. if (get_device_cl_version(device) >= Version(3, 0)) { size_t max_global_variable_size{}; test_error(clGetDeviceInfo(device, CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE, sizeof(max_global_variable_size), &max_global_variable_size, nullptr), "clGetDeviceInfo failed"); skip = (max_global_variable_size != 0) ? CL_FALSE : CL_TRUE; } return CL_SUCCESS; } //////////////////// // Global functions // Test support for variables at program scope. Miscellaneous int test_progvar_prog_scope_misc(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); if (CL_SUCCESS != error) { return TEST_FAIL; } if (skip) { log_info("Skipping progvar_prog_scope_misc since it is optionally not " "supported on this device\n"); return TEST_SKIPPED_ITSELF; } size_t max_size = 0; size_t pref_size = 0; cl_int err = CL_SUCCESS; err = l_get_device_info(device, &max_size, &pref_size); err |= l_build_type_table(device); err |= l_capacity(device, context, queue, max_size); err |= l_user_type(device, context, queue, false); err |= l_user_type(device, context, queue, true); return err; } // Test support for variables at program scope. Unitialized data int test_progvar_prog_scope_uninit(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); if (CL_SUCCESS != error) { return TEST_FAIL; } if (skip) { log_info( "Skipping progvar_prog_scope_uninit since it is optionally not " "supported on this device\n"); return TEST_SKIPPED_ITSELF; } size_t max_size = 0; size_t pref_size = 0; cl_int err = CL_SUCCESS; err = l_get_device_info(device, &max_size, &pref_size); err |= l_build_type_table(device); err |= l_write_read(device, context, queue); return err; } // Test support for variables at program scope. Initialized data. int test_progvar_prog_scope_init(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); if (CL_SUCCESS != error) { return TEST_FAIL; } if (skip) { log_info("Skipping progvar_prog_scope_init since it is optionally not " "supported on this device\n"); return TEST_SKIPPED_ITSELF; } size_t max_size = 0; size_t pref_size = 0; cl_int err = CL_SUCCESS; err = l_get_device_info(device, &max_size, &pref_size); err |= l_build_type_table(device); err |= l_init_write_read(device, context, queue); return err; } // A simple test for support of static variables inside a kernel. int test_progvar_func_scope(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements) { cl_bool skip{ CL_FALSE }; auto error = should_skip(device, skip); if (CL_SUCCESS != error) { return TEST_FAIL; } if (skip) { log_info("Skipping progvar_func_scope since it is optionally not " "supported on this device\n"); return TEST_SKIPPED_ITSELF; } cl_int err = CL_SUCCESS; // Deliberately have two variables with the same name but in different // scopes. // Also, use a large initialized structure in both cases. // clang-format off const char prog_src[] = "typedef struct { char c; int16 i; } mystruct_t;\n" "kernel void test_bump(global int* value, int which) {\n" " if (which) {\n" // Explicit address space. // Last element set to 0 " static global mystruct_t persistent = { 'a', (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0) };\n" " *value = persistent.i.sf++;\n" " } else {\n" // Implicitly global // Last element set to 100 " static mystruct_t persistent = { 'b' , (int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,100) };\n" " *value = persistent.i.sf++;\n" " }\n" "}\n"; // clang-format on StringTable ksrc; ksrc.add(prog_src); int status = CL_SUCCESS; clProgramWrapper program; clKernelWrapper test_bump; status = create_single_kernel_helper(context, &program, &test_bump, ksrc.num_str(), ksrc.strs(), "test_bump"); test_error_ret(status, "Failed to create program for function static variable test", status); // Check size query. size_t used_bytes = 0; status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, sizeof(used_bytes), &used_bytes, 0); test_error_ret(status, "Failed to query global variable total size", status); size_t expected_size = 2 * sizeof(cl_int); // Two ints. if (used_bytes < expected_size) { log_error("Error: program query for global variable total size query " "failed: Expected at least %llu but got %llu\n", (unsigned long long)expected_size, (unsigned long long)used_bytes); err |= 1; } // Prepare the data. cl_int counter_value = 0; clMemWrapper counter_value_mem(clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(counter_value), &counter_value, &status)); test_error_ret(status, "Failed to allocate counter query buffer", status); status = clSetKernelArg(test_bump, 0, sizeof(cl_mem), &counter_value_mem); test_error_ret(status, "set arg", status); // Go a few rounds, alternating between the two counters in the kernel. // Same as initial values in kernel. // But "true" which increments the 0-based counter, and "false" which // increments the 100-based counter. cl_int expected_counter[2] = { 100, 0 }; const size_t one = 1; for (int iround = 0; iround < 5; iround++) { // Must go at least twice around for (int iwhich = 0; iwhich < 2; iwhich++) { // Cover both counters status = clSetKernelArg(test_bump, 1, sizeof(iwhich), &iwhich); test_error_ret(status, "set arg", status); status = clEnqueueNDRangeKernel(queue, test_bump, 1, 0, &one, 0, 0, 0, 0); test_error_ret(status, "enqueue test_bump", status); status = clFinish(queue); test_error_ret(status, "finish", status); cl_uchar* counter_value_ptr = (cl_uchar*)clEnqueueMapBuffer( queue, counter_value_mem, CL_TRUE, CL_MAP_READ, 0, sizeof(counter_value), 0, 0, 0, 0); if (counter_value != expected_counter[iwhich]) { log_error( "Error: Round %d on counter %d: Expected %d but got %d\n", iround, iwhich, expected_counter[iwhich], counter_value); err |= 1; } expected_counter[iwhich]++; // Emulate behaviour of the kernel. clEnqueueUnmapMemObject(queue, counter_value_mem, counter_value_ptr, 0, 0, 0); } } if (CL_SUCCESS == err) { log_info("OK\n"); FLUSH; } return err; }