//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "common.h"
#include "d3d12.h"
#include "mem_utils.h"
#include "misc_shared.h"

/// Align value to 128
///
/// @param value value to align
/// @return aligned value
GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }

GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
    return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
}

/// Finds the max used byte in a vertex buffer
///
/// @param indexBuffPtr pointer to index buffer
/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
/// @param IndexCount number of indices in index buffer
/// @param IndexFormat index format
/// @param VertexCount number of vertices in vertex buffer
/// @param VertexBufferByteStride vertex buffer byte stride
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel find_max_used_byte_in_buff(
    global void* indexBuffPtr,
    global uint* vertexBufferUsedByteEnd,
    dword IndexCount,
    dword IndexFormat,
    dword VertexCount,
    qword VertexBufferByteStride)
{
    local uint sgMax[16];
    uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);

    if (IndexFormat != INDEX_FORMAT_NONE)
    {
        uint endByte = 0;
        if (glob_id < IndexCount)
        {
            if (IndexFormat == INDEX_FORMAT_R16_UINT)
            {
                global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
                endByte = indexBuffPtrShort[glob_id];
            }
            else
            {
                global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
                endByte = indexBuffPtrUint[glob_id];
            }
        }

        // reduce within each sub-group first, then across sub-groups via SLM
        endByte = sub_group_reduce_max(endByte);

        if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }

        barrier(CLK_LOCAL_MEM_FENCE);

        if (get_sub_group_id() == 0)
        {
            endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
            if (get_sub_group_local_id() == 0)
            {
                // endByte holds the max referenced vertex index; convert it to
                // a byte count ((index + 1) vertices, times the stride)
                endByte = min(endByte, VertexCount);
                if (endByte < VertexCount && IndexCount != 0)
                    ++endByte;
                endByte *= (dword)VertexBufferByteStride;
                atomic_max(vertexBufferUsedByteEnd, endByte);
            }
        }
    }
    else if (glob_id == 0)
    {
        // non-indexed geometry: the whole vertex buffer is used
        uint endByte = VertexCount * VertexBufferByteStride;
        atomic_max(vertexBufferUsedByteEnd, endByte);
    }
}

/// Allocates buffer space for vertices
///
/// @param batchPtrs batch pointers struct
/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
/// @param vertexBufferOffset pointer to offsets to vertex buffers
/// @param numVertexBuffers number of vertex buffers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_linear_offsets_for_vertex_buffers(
    global InputBatchPtrs* batchPtrs,
    global uint* vertexBufferUsedByteEnd,
    global uint* vertexBufferOffset,
    dword numVertexBuffers)
{
    uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();

    if (glob_id < numVertexBuffers)
    {
        uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
        uint position = atomic_add_global(&batchPtrs->vertexBuffersSize, numBytes);
        vertexBufferOffset[glob_id] = position;
    }
}
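/// Illustrative sketch, not called by anything in this file: the linear
/// sub-allocation idiom the kernel above relies on. Each caller atomically
/// advances a shared running total by its (128-aligned) size and receives the
/// offset where its own block starts. The helper name is ours; only
/// atomic_add_global is assumed, and only with the old-value-returning
/// behavior already relied upon above.
GRL_INLINE uint LinearSubAllocate(global uint* runningTotal, uint numBytes)
{
    // old value = start of this caller's block; new value = next free byte
    return atomic_add_global(runningTotal, numBytes);
}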
/// Sets the dst data space for input dump of this batch
///
/// @param inputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param nonVertexSize size of non-vertex data
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_inputs(
    global DebugBufferHeader* inputDumpMainBuffer,
    global InputBatchPtrs* batchPtrs,
    uint nonVertexSize,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint vertexBufferSize = batchPtrs->vertexBuffersSize;
        uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;

        // a batch larger than half the ring can never fit: flag overflow and bail out
        if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
        {
            inputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->globalDumpBuffer = 0;
            batchPtrs->nonVertexDataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

        dword prevHead = inputDumpMainBuffer->gpuHead;
        dword newHead;
        bool circled;

        do
        {
            circled = false;
            newHead = prevHead + sizeOfThisBatch;
            dword bufferBegin = prevHead;
            if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
            {
                // not enough room before the end of the ring: wrap to headStart
                circled = true;
                newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
                bufferBegin = inputDumpMainBuffer->headStart;
            }
            dword bufferEnd = newHead + sizeof(InputBatch);

            uint tail;
            uint tail2 = 7;
            bool wait;
            do
            {
                wait = true;
                tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);

                // dead code, workaround so IGC won't move the tail load out of the loop
                if (tail > inputDumpMainBuffer->totalSize)
                {
                    store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
                    tail2 = tail;
                }

                if (prevHead >= tail)
                {
                    // collision example:
                    //  ----------T=======H------------
                    //  -------B=====E-----------------
                    //
                    if ((bufferEnd < tail) || (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
                else
                {
                    // collision example:
                    //  ==========H-------T============
                    //  B==============E---------------
                    // caution: we will never have H circled completely so that H == T
                    if ((bufferEnd < tail) && (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
            } while (wait);
        } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            // mark the unused space at the end of the ring so readers skip it
            global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
            prevHead = inputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
        batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
        batchPtrs->totalSize = sizeOfThisBatch;

        global InputBatch* batchOp = (global InputBatch*)thisBatchDump;
        batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->vertexBufferDataSize = vertexBufferSize;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
        batchOp->batchId = *batchIdPtr;
    }
}
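/// Illustrative sketch, not called by the kernels in this file: the free-range
/// test that the allocation kernel above and the one below perform inline. The
/// dump buffer is a ring; `head` is the producers' claim position, `tail` is
/// the consumer's read position, and [begin, end) is the span a producer wants
/// to claim. The helper name is ours; the predicate mirrors the inline logic.
GRL_INLINE bool DumpRangeIsFree(uint begin, uint end, uint head, uint tail)
{
    if (head >= tail)
    {
        // live data sits in [tail, head): the claim is safe if it ends before
        // the tail (it wrapped) or starts at/after the head
        return (end < tail) || (begin >= head);
    }
    // live data wraps past the end of the ring: only [head, tail) is free,
    // so the claim must sit entirely inside it
    return (end < tail) && (begin >= head);
}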
/// Sets the dst data space for output dump of this batch
///
/// @param outputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_outputs(
    global DebugBufferHeader* outputDumpMainBuffer,
    global OutputBatchPtrs* batchPtrs,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;

        // a batch larger than half the ring can never fit: flag overflow and bail out
        if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
        {
            outputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->dataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

        dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
        dword newHead;
        bool circled;

        do
        {
            //mem_fence_gpu_invalidate();
            //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
            circled = false;
            newHead = prevHead + sizeOfThisBatch;
            dword bufferBegin = prevHead;
            if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
            {
                // not enough room before the end of the ring: wrap to headStart
                circled = true;
                newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
                bufferBegin = outputDumpMainBuffer->headStart;
            }
            dword bufferEnd = newHead + sizeof(OutputBatch);

            uint tail;
            uint tail2 = 7;
            bool wait;
            do
            {
                wait = true;
                tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);

                // dead code, workaround so IGC won't move the tail load out of the loop
                if (tail > outputDumpMainBuffer->totalSize)
                {
                    store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
                    tail2 = tail;
                }

                if (prevHead >= tail)
                {
                    // collision example:
                    //  ----------T=======H------------
                    //  -------B=====E-----------------
                    //
                    if ((bufferEnd < tail) || (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
                else
                {
                    // collision example:
                    //  ==========H-------T============
                    //  B==============E---------------
                    // caution: we will never have H circled completely so that H == T
                    if ((bufferEnd < tail) && (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
            } while (wait);
        } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            // mark the unused space at the end of the ring so readers skip it
            global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
            prevHead = outputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
        batchPtrs->totalSize = sizeOfThisBatch;

        global OutputBatch* batchOp = (global OutputBatch*)thisBatchDump;
        batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
        batchOp->batchId = *batchIdPtr;
    }
}
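/// Illustrative worked example for calc_outputs_data_size below (values
/// assumed): with three outputs whose per-output sizes (serialized size plus
/// OutputData header, each rounded up to 128) come out to {384, 256, 512},
/// lanes 0..2 see sub_group_scan_exclusive_add(size) == {0, 384, 640}, so
/// destOffsets == {0, 384, 640}, and sub_group_reduce_add(size) == 1152
/// becomes the running offset for the next group of MAX_HW_SIMD_WIDTH outputs.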
/// Calculates sum of output sizes
///
/// @param pbi pointer to post build infos
/// @param destOffsets offsets in dest buffer
/// @param numOutputs number of outputs
/// @param batchPtrs batch pointers struct
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel calc_outputs_data_size(
    global PostbuildInfoSerializationDesc* pbi,
    global dword* destOffsets,
    qword numOutputs,
    global OutputBatchPtrs* batchPtrs)
{
    uint offset = 0;
    // process the outputs in MAX_HW_SIMD_WIDTH-sized chunks
    for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
    {
        uint size = 0;
        if (i < numOutputs)
        {
            size = AlignTo128(pbi[i].SerializedSizeInBytes);
            size += AlignTo128(sizeof(OutputData));
            destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
        }
        offset += sub_group_reduce_add(size);
    }
    if (get_sub_group_local_id() == 0)
        batchPtrs->dataSize = offset;
}

/// Adds output data operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param destOffset ptr to offset in dest buffer
/// @param src pointer to source bvh
/// @param pbi pointer to post build info
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_output_data_op(
    global OutputBatchPtrs* batchPtrs,
    global dword* destOffset,
    qword src,
    global PostbuildInfoSerializationDesc* pbi)
{
    if (batchPtrs->dataStart == 0)
        return;

    global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
    out->header.operationType = OUTPUT_DUMP_OP_DATA;
    out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
    out->srcBvhPtr = src;
}

/// Writes indices and transform or procedurals data
///
/// @param batchPtrs batch pointers struct
/// @param srcDesc description of source geometry
/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
/// @param pVertexBufferSize pointer to used vertex buffer size in bytes
/// @param dstDescOffset offset to dest geo desc
/// @param dstDataOffset offset to dest geo data
/// @param numThreads number of threads
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_geo_data(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
    global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
    global uint* pVertexBufferSize,
    qword dstDescOffset,
    qword dstDataOffset,
    dword numThreads)
{
    if (batchPtrs->dumpDst == 0) return;

    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();

    GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;

    global char* dstDataPtr = (global char*)(
        batchPtrs->nonVertexDataStart + dstDataOffset);

    global char* srcDataPtr;
    uint bytesToCopy = 0;

    if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
    {
        uint sizeOfMatrix = 0;

        if (geoDescToStore.Desc.Triangles.pTransformBuffer)
        {
            // copy the 3x4 transform ahead of the index data (lanes 0..11)
            sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
            if (glob_id < 12)
            {
                global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
                global float* matrixDst = (global float*)dstDataPtr;
                matrixDst[glob_id] = matrixSrc[glob_id];
                if (glob_id == 0)
                {
                    geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
                }
            }
        }

        dstDataPtr += sizeOfMatrix;
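        // Note (added commentary): IndexFormat's numeric value evidently
        // doubles as the per-index byte size here, since the product
        // IndexFormat * IndexCount below is treated as a byte count.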
        srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;

        bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);

        if (bytesToCopy && (glob_id == 0))
        {
            // indexed triangles: rebase both vertex and index buffer pointers
            qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
            // for this we remember offset relative to global debug buffer
            geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
        }
        else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
        {
            // non-indexed triangles: only the vertex buffer needs rebasing
            if (geoDescToStore.Desc.Triangles.pVertexBuffer)
            {
                qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
                // for this we remember offset relative to global debug buffer
                geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            }
        }
        else if (glob_id == 0)
        {
            // degenerate geometry: store an empty descriptor
            geoDescToStore.Desc.Triangles.IndexCount = 0;
            geoDescToStore.Desc.Triangles.VertexCount = 0;
            geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
            geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
        }
    }
    else
    {
        // procedural geometry: copy the AABB array and rebase its pointer
        srcDataPtr = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
        bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
        if (glob_id == 0)
        {
            geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
        }
    }

    if (bytesToCopy)
    {
        CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
    }

    if (glob_id == 0)
    {
        global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
            batchPtrs->nonVertexDataStart + dstDescOffset);
        *dstDescPtr = geoDescToStore;
    }
}
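/// Note on pointer rebasing (added commentary): inside the dumped descriptor,
/// GPU virtual addresses (pTransformBuffer, pVertexBuffer, pIndexBuffer,
/// pAABBs_GPUVA) are replaced by byte offsets relative to the start of the
/// global dump buffer, so a reader can relocate them by adding its own base
/// address for that buffer.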
/// Adds build operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param buildOpOffset offset in dst buffer
/// @param srcBvh address of src bvh (in case of update)
/// @param dstBvhAddr address of dest bvh buffer
/// @param offsetToEnd offset to end of this operation
/// @param flags build flags
/// @param numGeometries number of geometries in build
/// @param numInstances number of instances in build
/// @param instArrayOfPtrs nonzero if instances are given as an array of pointers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_input_build_op(
    global InputBatchPtrs* batchPtrs,
    qword buildOpOffset,
    qword srcBvh,
    qword dstBvhAddr,
    dword offsetToEnd,
    dword flags,
    dword numGeometries,
    dword numInstances,
    dword instArrayOfPtrs)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0) return;

    global InputBuild* buildOp = (global InputBuild*)(
        batchPtrs->nonVertexDataStart + buildOpOffset);
    buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
    buildOp->header.endOfData = offsetToEnd;
    buildOp->dstBvhPtr = dstBvhAddr;
    buildOp->srcBvhPtr = srcBvh;
    buildOp->flags = flags;
    buildOp->numGeos = numGeometries;
    buildOp->numInstances = numInstances;
    buildOp->instArrayOfPtrs = instArrayOfPtrs;
}

/// Copies instance descriptors
///
/// @param batchPtrs batch pointers struct
/// @param instanceDescArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_descriptors_array(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0) return;

    global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
        batchPtrs->nonVertexDataStart + offset);

    if (glob_id < numInstances)
    {
        dst[glob_id] = instanceDescArr[glob_id];
    }
}

/// Copies instance descriptors, array-of-pointers version
///
/// @param batchPtrs batch pointers struct
/// @param pInstanceDescPtrsArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_instance_descriptors_array_of_ptrs(
    global InputBatchPtrs* batchPtrs,
    global qword* pInstanceDescPtrsArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0) return;

    // save gpuva of instance descs for debug
    global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);

    // the dereferenced descriptors land after the 128-aligned pointer table
    global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
        batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
    global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC**)pInstanceDescPtrsArr;

    if (glob_id < numInstances)
    {
        gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
        dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
    }
}

/// Adds copy operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param offset offset in dst buffer
/// @param src copy source pointer
/// @param dst copy destination pointer
/// @param copyOpType copy type
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel insert_copy_op(
    global InputBatchPtrs* batchPtrs,
    qword offset,
    global void* src,
    global void* dst,
    uint copyOpType)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0) return;

    global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);

    copyOp->header.operationType = copyOpType;
    copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
    copyOp->srcBvhPtr = (qword)src;
    copyOp->dstBvhPtr = (qword)dst;
}
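/// Note for consumers of the dump stream (added commentary, inferred from the
/// writers in this file): every operation begins with an op header whose
/// endOfData field holds the 128-aligned byte size of the whole operation, so
/// the ops inside a batch can be walked with
///     op = (global char*)op + op->header.endOfData;
/// starting at the batch's firstContainedOpOffset.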
/// Copies vertex buffer
///
/// @param batchPtrs batch pointers struct
/// @param src input buffer
/// @param offset ptr to offset in dst buffer
/// @param size ptr to number of bytes to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_vertex_data(
    global InputBatchPtrs* batchPtrs,
    global const char* src,
    global const uint* offset,
    global const uint* size)
{
    if (batchPtrs->dumpDst == 0) return;

    global char* dst = (global char*)(GetVertexBuffersStart(batchPtrs) + *offset);
    // spread the copy over one participant per 64 bytes, rounded up
    uint numGroups = (*size >> 6) + 1;
    CopyMemory(dst, src, *size, numGroups);
}

/// Generates a unique batch id
///
/// @param batchIds array of unique batch ids
/// @param index index of batch id to generate
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel generate_unique_batch_id(global unsigned long* batchIds, unsigned int index)
{
    // bump the upper dword of the qword id (a per-slot generation counter,
    // assuming a little-endian layout)...
    global unsigned int* counterPtrs = (global unsigned int*)batchIds;
    atomic_add(&counterPtrs[index * 2 + 1], 1);
    // ...and keep the slot index in the lower bits
    batchIds[index] |= (unsigned long)index;
}

/// Sets batch as ready to read and moves cpuHead forward, inputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_inputs(
    global InputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;

    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);

    dword seven = 7;
    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}
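/// Note (added commentary): finish_batch_dump_inputs above and
/// finish_batch_dump_outputs below publish batches in ring order. A batch
/// spins until cpuHead reaches its own offset (or, when it sits at headStart,
/// until the op at cpuHead is an END_BUFFER marker, meaning the ring has
/// wrapped), evicts its writes to memory, and only then advances cpuHead past
/// itself, so the CPU reader never observes a half-written batch.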
/// Sets batch as ready to read and moves cpuHead forward, outputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_outputs(
    global OutputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;

    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);

    dword seven = 7;
    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}