//
// Copyright (C) 2009-2021 Intel Corporation
//
// SPDX-License-Identifier: MIT
//
//

#include "api_interface.h"
#include "common.h"
#include "d3d12.h"
#include "mem_utils.h"
#include "misc_shared.h"

/// Align value to 128
///
/// @param value value to align
/// @return aligned value
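/// e.g. AlignTo128(1) == 128, AlignTo128(128) == 128, AlignTo128(129) == 256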
GRL_INLINE ulong AlignTo128(ulong value) { return ((value + 127) / 128) * 128; }

GRL_INLINE char* GetVertexBuffersStart(global InputBatchPtrs* batchPtrs) {
    return (global char*)(batchPtrs->dumpDst + AlignTo128(sizeof(InputBatch)));
}

/// Finds max used byte in vertex buffer
///
/// @param indexBuffPtr pointer to index buffer
/// @param vertexBufferUsedByteEnd pointer to max used byte of vertex buffers
/// @param IndexCount number of indices in index buffer
/// @param IndexFormat index format
/// @param VertexCount number of vertices in vertex buffer
/// @param VertexBufferByteStride vertex buffer byte stride
__attribute__((reqd_work_group_size(256, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel find_max_used_byte_in_buff(
    global void* indexBuffPtr,
    global uint* vertexBufferUsedByteEnd,
    dword IndexCount,
    dword IndexFormat,
    dword VertexCount,
    qword VertexBufferByteStride)
{
    local uint sgMax[16];
    uint glob_id = get_group_id(0) * get_local_size(0) + get_local_id(0);

    if (IndexFormat != INDEX_FORMAT_NONE)
    {
        uint endByte = 0;
        if (glob_id < IndexCount)
        {
            if (IndexFormat == INDEX_FORMAT_R16_UINT)
            {
                global ushort* indexBuffPtrShort = (global ushort*) indexBuffPtr;
                endByte = indexBuffPtrShort[glob_id];
            }
            else
            {
                global uint* indexBuffPtrUint = (global uint*) indexBuffPtr;
                endByte = indexBuffPtrUint[glob_id];
            }
        }

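        // Two-level max reduction: reduce across the lanes of each sub-group,
        // then let sub-group 0 reduce the per-sub-group maxima staged in SLM
        // (sgMax has one slot per sub-group of the work group).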
        endByte = sub_group_reduce_max(endByte);

        if (get_sub_group_local_id() == 0) { sgMax[get_sub_group_id()] = endByte; }

        barrier(CLK_LOCAL_MEM_FENCE);

        if (get_sub_group_id() == 0)
        {
            endByte = sub_group_reduce_max(sgMax[get_sub_group_local_id()]);
            if (get_sub_group_local_id() == 0)
            {
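                // Convert the max index seen into a used-byte end: clamp to
                // VertexCount, step past the last referenced vertex (unless
                // the whole buffer is already covered), then scale by the
                // vertex stride.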
                endByte = min(endByte, VertexCount);
                if (endByte < VertexCount && IndexCount != 0)
                    ++endByte;
                endByte *= (dword)VertexBufferByteStride;
                atomic_max(vertexBufferUsedByteEnd, endByte);
            }
        }
    }
    else if (glob_id == 0)
    {
        uint endByte = VertexCount * VertexBufferByteStride;
        atomic_max(vertexBufferUsedByteEnd, endByte);
    }
}

/// Allocates linear offsets for vertex buffers
///
/// @param batchPtrs batch pointers struct
/// @param vertexBufferUsedByteEnd pointer to sizes of vertex buffers
/// @param vertexBufferOffset pointer to offsets to vertex buffers
/// @param numVertexBuffers number of vertex buffers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_linear_offsets_for_vertex_buffers(
    global InputBatchPtrs* batchPtrs,
    global uint* vertexBufferUsedByteEnd,
    global uint* vertexBufferOffset,
    dword numVertexBuffers)
{
    uint glob_id = get_group_id(0) * get_local_size(0) + get_sub_group_local_id();

    if (glob_id < numVertexBuffers)
    {
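        // atomic_add returns the pre-add value, so each vertex buffer gets a
        // unique, 128-byte-aligned slice of the batch's linearised vertex data.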
        uint numBytes = AlignTo128(vertexBufferUsedByteEnd[glob_id]);
        uint position = atomic_add_global( &batchPtrs->vertexBuffersSize, numBytes);
        vertexBufferOffset[glob_id] = position;
    }
}

/// Sets the dst data space for input dump of this batch
///
/// @param inputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param nonVertexSize size of non vertex data
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_inputs(
    global DebugBufferHeader* inputDumpMainBuffer,
    global InputBatchPtrs* batchPtrs,
    uint nonVertexSize,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint vertexBufferSize = batchPtrs->vertexBuffersSize;
        uint sizeOfThisBatch = vertexBufferSize + AlignTo128(sizeof(InputBatch)) + nonVertexSize;

        if ((sizeOfThisBatch + sizeof(InputBatch)) > ((inputDumpMainBuffer->totalSize - inputDumpMainBuffer->headStart) / 2))
        {
            inputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->globalDumpBuffer = 0;
            batchPtrs->nonVertexDataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

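        // Claim a region of the ring buffer: advance gpuHead by this batch's
        // size with a CAS loop, wrapping back to headStart when a trailing
        // InputBatch header would no longer fit before totalSize. Before each
        // CAS attempt, spin until the consumer's tail has moved far enough
        // that the claimed region no longer overlaps unread data.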
        dword prevHead = inputDumpMainBuffer->gpuHead;
        dword newHead;
        bool circled;

        do
        {
            circled = false;
            newHead = prevHead + sizeOfThisBatch;
            dword bufferBegin = prevHead;
            if ((newHead + sizeof(InputBatch)) > inputDumpMainBuffer->totalSize)
            {
                circled = true;
                newHead = inputDumpMainBuffer->headStart + sizeOfThisBatch;
                bufferBegin = inputDumpMainBuffer->headStart;
            }
            dword bufferEnd = newHead + sizeof(InputBatch);

            uint tail;
            uint tail2 = 7;
            bool wait;
            do
            {
                wait = true;
                tail = load_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0);

                // dead code, workaround so IGC won't move tail load out of loop
                if (tail > inputDumpMainBuffer->totalSize)
                {
                   store_uint_L1UC_L3UC(&inputDumpMainBuffer->tail, 0, tail + tail2);
                   tail2 = tail;
                }

                if( prevHead >= tail )
                {
                    //collision example:
                    //  ----------T=======H------------
                    //  -------B=====E-----------------
                    //
                    if((bufferEnd < tail) || (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
                else
                {
                    //collision example:
                    //  ==========H-------T============
                    //  B==============E---------------
                    // caution: we will never have H circled completely so that H == T
                    if((bufferEnd < tail) && (bufferBegin >= prevHead))
                    {
                        wait = false;
                    }
                }
            } while (wait);
        } while (!atomic_compare_exchange_global(&inputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            global InputBatch* endBufferOp = (global InputBatch*)(((global char*)inputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = INPUT_DUMP_OP_END_BUFFER;
            prevHead = inputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)inputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->globalDumpBuffer = (qword)inputDumpMainBuffer;
        batchPtrs->nonVertexDataStart = (qword)(thisBatchDump + AlignTo128(sizeof(InputBatch)) + vertexBufferSize);
        batchPtrs->totalSize = sizeOfThisBatch;

        global InputBatch* batchOp = (global InputBatch*) thisBatchDump;
        batchOp->header.opHeader.operationType = INPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->vertexBufferDataSize = vertexBufferSize;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(InputBatch)) + vertexBufferSize;
        batchOp->batchId = *batchIdPtr;
    }
}

/// Sets the dst data space for output dump of this batch
///
/// @param outputDumpMainBuffer pointer to main dump buffer
/// @param batchPtrs batch pointers struct
/// @param batchIdPtr pointer to batch id
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel allocate_data_space_for_outputs(
    global DebugBufferHeader* outputDumpMainBuffer,
    global OutputBatchPtrs* batchPtrs,
    global qword* batchIdPtr)
{
    if (get_sub_group_local_id() == 0)
    {
        uint sizeOfThisBatch = AlignTo128(sizeof(OutputBatch)) + batchPtrs->dataSize;

        if ((sizeOfThisBatch + sizeof(OutputBatch)) > ((outputDumpMainBuffer->totalSize - outputDumpMainBuffer->headStart) / 2))
        {
            outputDumpMainBuffer->overflow = 1;
            batchPtrs->dumpDst = 0;
            batchPtrs->dataStart = 0;
            batchPtrs->totalSize = 0;
            return;
        }

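        // Same ring-buffer claim protocol as in allocate_data_space_for_inputs,
        // with OutputBatch in place of InputBatch.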
244
245        dword prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
246        dword newHead;
247        bool circled;
248
249        do
250        {
251            //mem_fence_gpu_invalidate();
252            //prevHead = *((volatile global uint*)(&outputDumpMainBuffer->gpuHead));
253            circled = false;
254            newHead = prevHead + sizeOfThisBatch;
255            dword bufferBegin = prevHead;
256            if ((newHead + sizeof(OutputBatch)) > outputDumpMainBuffer->totalSize)
257            {
258                circled = true;
259                newHead = outputDumpMainBuffer->headStart + sizeOfThisBatch;
260                bufferBegin = outputDumpMainBuffer->headStart;
261            }
262            dword bufferEnd = newHead + sizeof(OutputBatch);
263
264            uint tail;
265            uint tail2 = 7;
266            bool wait;
267            do
268            {
269                wait = true;
270                tail = load_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0);
271
272                // dead code, workaround so IGC won't move tail load out of loop
273                if (tail > outputDumpMainBuffer->totalSize)
274                {
275                   store_uint_L1UC_L3UC(&outputDumpMainBuffer->tail, 0, tail + tail2);
276                   tail2 = tail;
277                }
278
279                if( prevHead >= tail )
280                {
281                    //colision example:
282                    //  ----------T=======H------------
283                    //  -------B=====E-----------------
284                    //
285                    if((bufferEnd < tail) || (bufferBegin >= prevHead))
286                    {
287                        wait = false;
288                    }
289                }
290                else
291                {
292                    //colision example:
293                    //  ==========H-------T============
294                    //  B==============E---------------
295                    // caution: we will never have H circled completely so that H == T
296                    if((bufferEnd < tail) && (bufferBegin >= prevHead))
297                    {
298                        wait = false;
299                    }
300                }
301            } while (wait);
302        } while (!atomic_compare_exchange_global(&outputDumpMainBuffer->gpuHead, &prevHead, newHead));

        if (circled)
        {
            global OutputBatch* endBufferOp = (global OutputBatch*)(((global char*)outputDumpMainBuffer) + prevHead);
            endBufferOp->header.opHeader.operationType = OUTPUT_DUMP_OP_END_BUFFER;
            prevHead = outputDumpMainBuffer->headStart;
        }

        global char* thisBatchDump = ((global char*)outputDumpMainBuffer) + prevHead;
        batchPtrs->dumpDst = (qword)thisBatchDump;
        batchPtrs->dataStart = (qword)(thisBatchDump + AlignTo128(sizeof(OutputBatch)));
        batchPtrs->totalSize = sizeOfThisBatch;

        global OutputBatch* batchOp = (global OutputBatch*) thisBatchDump;
        batchOp->header.opHeader.operationType = OUTPUT_DUMP_OP_BATCH;
        batchOp->header.opHeader.endOfData = sizeOfThisBatch;
        batchOp->firstContainedOpOffset = AlignTo128(sizeof(OutputBatch));
        batchOp->batchId = *batchIdPtr;
    }
}

/// Calculates sum of output sizes
///
/// @param pbi pointer to post build infos
/// @param destOffsets offsets in dest buffer
/// @param numOutputs number of outputs
/// @param batchPtrs batch pointers struct
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel calc_outputs_data_size(
    global PostbuildInfoSerializationDesc* pbi,
    global dword* destOffsets,
    qword numOutputs,
    global OutputBatchPtrs* batchPtrs)
{
    uint offset = 0;
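    // Strided loop over the outputs, one sub-group-width chunk per iteration:
    // the exclusive scan assigns each output its offset within the chunk, and
    // the reduce advances the running total for the next iteration.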
    for (uint i = get_sub_group_local_id(); i < numOutputs + (MAX_HW_SIMD_WIDTH - 1); i += MAX_HW_SIMD_WIDTH)
    {
        uint size = 0;
        if (i < numOutputs)
        {
            size = AlignTo128(pbi[i].SerializedSizeInBytes);
            size += AlignTo128(sizeof(OutputData));
            destOffsets[i] = offset + sub_group_scan_exclusive_add(size);
        }
        offset += sub_group_reduce_add(size);
    }
    if (get_sub_group_local_id() == 0)
        batchPtrs->dataSize = offset;
}

/// Adds output data operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param destOffset ptr to offset in dest buffer
/// @param src pointer to source bvh
/// @param pbi pointer to post build info
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_output_data_op(
    global OutputBatchPtrs* batchPtrs,
    global dword* destOffset,
    qword src,
    global PostbuildInfoSerializationDesc* pbi)
{
    if (batchPtrs->dataStart == 0)
        return;

    global OutputData* out = (global OutputData*)(batchPtrs->dataStart + *destOffset);
    out->header.operationType = OUTPUT_DUMP_OP_DATA;
    out->header.endOfData = AlignTo128(sizeof(OutputData)) + AlignTo128(pbi->SerializedSizeInBytes);
    out->srcBvhPtr = src;
}

/// Writes indices and transform or procedural data
///
/// @param batchPtrs batch pointers struct
/// @param srcDesc description of source geometry
/// @param pVertexBufferOffsetInLinearisedUniqueVertexBuffers pointer to offset to vertices in vertex buffer
/// @param pVertexBufferSize pointer to size of used vertex buffer data
/// @param dstDescOffset offset to dest geo desc
/// @param dstDataOffset offset to dest geo data
/// @param numThreads number of threads
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_geo_data(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_GEOMETRY_DESC* srcDesc,
    global uint* pVertexBufferOffsetInLinearisedUniqueVertexBuffers,
    global uint* pVertexBufferSize,
    qword dstDescOffset,
    qword dstDataOffset,
    dword numThreads)
{
    if (batchPtrs->dumpDst == 0) return;

    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();

    GRL_RAYTRACING_GEOMETRY_DESC geoDescToStore = *srcDesc;

    global char* dstDataPtr = (global char*)(
        batchPtrs->nonVertexDataStart + dstDataOffset);

    global char* srcDataPtr;
    global char* dstTransform;
    uint bytesToCopy = 0;

    if (geoDescToStore.Type == GEOMETRY_TYPE_TRIANGLES)
    {
        uint sizeOfMatrix = 0;

        if (geoDescToStore.Desc.Triangles.pTransformBuffer)
        {
            sizeOfMatrix = AlignTo128(4 * 3 * sizeof(float));
            if (glob_id < 12)
            {
                global float* matrixSrc = (global float*)geoDescToStore.Desc.Triangles.pTransformBuffer;
                global float* matrixDst = (global float*)dstDataPtr;
                matrixDst[glob_id] = matrixSrc[glob_id];
                if (glob_id == 0)
                {
                    geoDescToStore.Desc.Triangles.pTransformBuffer = ((qword)matrixDst) - batchPtrs->globalDumpBuffer;
                }
            }
        }

        dstDataPtr += sizeOfMatrix;
        srcDataPtr = (global char*)geoDescToStore.Desc.Triangles.pIndexBuffer;

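        // Note: the copy size below relies on the INDEX_FORMAT_* enum value
        // matching the size of one index in bytes (e.g. INDEX_FORMAT_R16_UINT
        // == 2), which appears to be how these constants are defined.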
        bytesToCopy = AlignTo128(geoDescToStore.Desc.Triangles.IndexFormat * geoDescToStore.Desc.Triangles.IndexCount);

        if (bytesToCopy && (glob_id == 0))
        {
            qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
            // for these we store the offset relative to the global debug buffer
            geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.pIndexBuffer = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
            geoDescToStore.Desc.Triangles.VertexCount = *pVertexBufferSize / geoDescToStore.Desc.Triangles.VertexBufferByteStride;
        }
        else if (geoDescToStore.Desc.Triangles.IndexFormat == INDEX_FORMAT_NONE && geoDescToStore.Desc.Triangles.VertexCount > 0 && glob_id == 0)
        {
            if (geoDescToStore.Desc.Triangles.pVertexBuffer)
            {
                qword vertBuff = (qword)(GetVertexBuffersStart(batchPtrs) + *pVertexBufferOffsetInLinearisedUniqueVertexBuffers);
                // for this we store the offset relative to the global debug buffer
                geoDescToStore.Desc.Triangles.pVertexBuffer = ((qword)vertBuff) - batchPtrs->globalDumpBuffer;
            }
        }
        else if (glob_id == 0)
        {
            geoDescToStore.Desc.Triangles.IndexCount = 0;
            geoDescToStore.Desc.Triangles.VertexCount = 0;
            geoDescToStore.Desc.Triangles.pVertexBuffer = 0;
            geoDescToStore.Desc.Triangles.pIndexBuffer = 0;
        }
    }
    else
    {
        srcDataPtr  = (global char*)geoDescToStore.Desc.Procedural.pAABBs_GPUVA;
        bytesToCopy = AlignTo128(geoDescToStore.Desc.Procedural.AABBByteStride * geoDescToStore.Desc.Procedural.AABBCount);
        if (glob_id == 0)
        {
            geoDescToStore.Desc.Procedural.pAABBs_GPUVA = ((qword)dstDataPtr) - batchPtrs->globalDumpBuffer;
        }
    }

    if (bytesToCopy)
    {
        CopyMemory(dstDataPtr, srcDataPtr, bytesToCopy, numThreads);
    }

    if (glob_id == 0)
    {
        global GRL_RAYTRACING_GEOMETRY_DESC* dstDescPtr = (global GRL_RAYTRACING_GEOMETRY_DESC*)(
            batchPtrs->nonVertexDataStart + dstDescOffset);
        *dstDescPtr = geoDescToStore;
    }
}

/// Adds build operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param buildOpOffset offset in dst buffer
/// @param srcBvh address of src bvh (in case of update)
/// @param dstBvhAddr address of dest bvh buffer
/// @param offsetToEnd offset to end of this operation
/// @param flags build flags
/// @param numGeometries number of geometries in build
/// @param numInstances number of instances in build
/// @param instArrayOfPtrs nonzero if instances are passed as an array of pointers
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel write_input_build_op(
    global InputBatchPtrs* batchPtrs,
    qword buildOpOffset,
    qword srcBvh,
    qword dstBvhAddr,
    dword offsetToEnd,
    dword flags,
    dword numGeometries,
    dword numInstances,
    dword instArrayOfPtrs)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0) return;

    global InputBuild* buildOp = (global InputBuild*)(
        batchPtrs->nonVertexDataStart + buildOpOffset);
    buildOp->header.operationType = srcBvh ? INPUT_DUMP_OP_UPDATE : INPUT_DUMP_OP_BUILD;
    buildOp->header.endOfData = offsetToEnd;
    buildOp->dstBvhPtr = dstBvhAddr;
    buildOp->srcBvhPtr = srcBvh;
    buildOp->flags = flags;
    buildOp->numGeos = numGeometries;
    buildOp->numInstances = numInstances;
    buildOp->instArrayOfPtrs = instArrayOfPtrs;
}

/// Copies instance descriptors
///
/// @param batchPtrs batch pointers struct
/// @param instanceDescArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
copy_instance_descriptors_array(
    global InputBatchPtrs* batchPtrs,
    global GRL_RAYTRACING_INSTANCE_DESC* instanceDescArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0) return;

    global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC* )(
        batchPtrs->nonVertexDataStart + offset);

    if (glob_id < numInstances)
    {
        dst[glob_id] = instanceDescArr[glob_id];
    }
}

/// Copies instance descriptors, array of pointers version
///
/// @param batchPtrs batch pointers struct
/// @param pInstanceDescPtrsArr inst desc source
/// @param offset offset in dst buffer
/// @param numInstances number of instances to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH))) void kernel
copy_instance_descriptors_array_of_ptrs(
    global InputBatchPtrs* batchPtrs,
    global qword* pInstanceDescPtrsArr,
    qword offset,
    dword numInstances)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0) return;

    // save gpuva of instance descs for debug
    global qword* gpuvaDst = (global qword*)(batchPtrs->nonVertexDataStart + offset);

    global GRL_RAYTRACING_INSTANCE_DESC* dst = (global GRL_RAYTRACING_INSTANCE_DESC*)(
        batchPtrs->nonVertexDataStart + AlignTo128(numInstances * sizeof(qword)) + offset);
    global GRL_RAYTRACING_INSTANCE_DESC** instanceDescPtrsArr = (global GRL_RAYTRACING_INSTANCE_DESC **)pInstanceDescPtrsArr;

    if (glob_id < numInstances)
    {
        gpuvaDst[glob_id] = (qword)instanceDescPtrsArr[glob_id];
        dst[glob_id] = *(instanceDescPtrsArr[glob_id]);
    }
}

/// Adds copy operation to batch
///
/// @param batchPtrs batch pointers struct
/// @param offset offset in dst buffer
/// @param src copy source pointer
/// @param dst copy destination pointer
/// @param copyOpType copy type
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel insert_copy_op(
    global InputBatchPtrs* batchPtrs,
    qword offset,
    global void* src,
    global void* dst,
    uint copyOpType)
{
    uint glob_id = get_group_id(0) * get_sub_group_size() + get_sub_group_local_id();
    if (batchPtrs->dumpDst == 0 || glob_id != 0) return;

    global InputCopy* copyOp = (global InputCopy*)(batchPtrs->nonVertexDataStart + offset);

    copyOp->header.operationType = copyOpType;
    copyOp->header.endOfData = AlignTo128(sizeof(InputCopy));
    copyOp->srcBvhPtr = (qword)src;
    copyOp->dstBvhPtr = (qword)dst;
}

/// Copies vertex buffer
///
/// @param batchPtrs batch pointers struct
/// @param src input buffer
/// @param offset ptr to offset in dst buffer
/// @param size ptr to number of bytes to copy
__attribute__((reqd_work_group_size(MAX_HW_SIMD_WIDTH, 1, 1)))
__attribute__((intel_reqd_sub_group_size(MAX_HW_SIMD_WIDTH)))
void kernel copy_vertex_data(
    global InputBatchPtrs* batchPtrs,
    global const char* src,
    global const uint* offset,
    global const uint* size)
{
    if (batchPtrs->dumpDst == 0) return;

    global char *dst = (global char *)(GetVertexBuffersStart(batchPtrs) + *offset);
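    // One copy group per 64-byte chunk plus one for the remainder (assumes
    // CopyMemory from mem_utils.h distributes work at that granularity).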
    uint numGroups = (*size >> 6) + 1;
    CopyMemory(dst, src, *size, numGroups);
}

/// Generate unique batch id
///
/// @param batchIds array of unique batch ids
/// @param index index of batch id to generate
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel generate_unique_batch_id(global unsigned long *batchIds, unsigned int index) {
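    // The qword id is treated as two dwords: the atomic bumps the high dword
    // (a per-slot generation counter) while the low dword carries the slot
    // index, making ids unique across reuses of the same slot (assumes
    // little-endian dword order within the qword).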
    global unsigned int *counterPtrs = (global unsigned int *)batchIds;
    atomic_add(&counterPtrs[index * 2 + 1], 1);
    batchIds[index] |= (unsigned long)index;
}

/// Sets batch as ready to read and moves cpuHead forward, inputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_inputs(
    global InputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global InputBatch* myBatchOp = (global InputBatch*)batchPtrs->dumpDst;

    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);

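    // Publish batches in allocation order: spin until cpuHead reaches this
    // batch's offset (or, if this batch wrapped to headStart, until the entry
    // at the current head is the END_BUFFER sentinel), then flush and advance
    // cpuHead past this batch so the CPU reader can consume it.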
    dword seven = 7;
    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            global InputBatch* curBatchOp = (global InputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == INPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}

/// Sets batch as ready to read and moves cpuHead forward, outputs case
///
/// @param batchPtrs batch pointers struct
/// @param dumpMainBuffer pointer to main dump buffer
__attribute__((reqd_work_group_size(1, 1, 1)))
void kernel finish_batch_dump_outputs(
    global OutputBatchPtrs* batchPtrs,
    global DebugBufferHeader* dumpMainBuffer)
{
    if (batchPtrs->dumpDst == 0)
        return;

    global OutputBatch* myBatchOp = (global OutputBatch*)batchPtrs->dumpDst;

    dword myDstOffset = (batchPtrs->dumpDst - (qword)dumpMainBuffer);

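    // Same in-order publication protocol as finish_batch_dump_inputs above.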
    dword seven = 7;
    while (true)
    {
        dword currentHead = load_uint_L1UC_L3C(&dumpMainBuffer->cpuHead, 0);
        if (currentHead > dumpMainBuffer->totalSize) // dead code - workaround so IGC won't move currentHead load out of loop
        {
            store_uint_L1UC_L3UC(&dumpMainBuffer->cpuHead, 0, currentHead + seven);
            currentHead = seven;
        }

        if (currentHead == myDstOffset)
        {
            mem_fence_evict_to_memory();
            dumpMainBuffer->cpuHead = currentHead + myBatchOp->header.opHeader.endOfData;
            break;
        }
        else if (myDstOffset == dumpMainBuffer->headStart)
        {
            global OutputBatch* curBatchOp = (global OutputBatch*)(((global char*)dumpMainBuffer) + currentHead);
            if (curBatchOp->header.opHeader.operationType == OUTPUT_DUMP_OP_END_BUFFER)
            {
                mem_fence_evict_to_memory();
                dumpMainBuffer->cpuHead = dumpMainBuffer->headStart + myBatchOp->header.opHeader.endOfData;
                break;
            }
        }
    }
}