From 4bd62ccbc0c82588cb6381620cec34a5ca9e0128 Mon Sep 17 00:00:00 2001 From: Ivan Avdeev Date: Fri, 31 Mar 2023 21:01:17 -0700 Subject: [PATCH] [draft] vk: start refactoring commandbuffer/gpu profiler Consolidate VkCommandBuffer management into a single entity. (somewhat done for framectl, not done for staging) Make sure that we pass enough metadata to be able to timestamp scopes in cmdbuf. It does compile, but it won't work: not all init code paths are used. Also, there are many changes, so other failure modes are totally possible. --- ref/vk/ray_pass.c | 28 ++++++--- ref/vk/ray_pass.h | 3 +- ref/vk/vk_combuf.c | 115 ++++++++++++++++++++++++++++++++++ ref/vk/vk_combuf.h | 26 ++++++++ ref/vk/vk_framectl.c | 139 ++++++++++++++++++++++-------------------- ref/vk/vk_framectl.h | 6 +- ref/vk/vk_gpurofl.c | 99 ++++++++++++++++++++++++++++++ ref/vk/vk_gpurofl.h | 30 +++++++++ ref/vk/vk_meatpipe.c | 4 +- ref/vk/vk_meatpipe.h | 3 +- ref/vk/vk_pipeline.c | 7 ++- ref/vk/vk_pipeline.h | 3 +- ref/vk/vk_querypool.c | 3 +- ref/vk/vk_querypool.h | 2 +- ref/vk/vk_render.c | 4 +- ref/vk/vk_render.h | 3 +- ref/vk/vk_rtx.c | 10 +-- ref/vk/vk_rtx.h | 2 +- ref/vk/vk_staging.c | 2 + 19 files changed, 392 insertions(+), 97 deletions(-) create mode 100644 ref/vk/vk_combuf.c create mode 100644 ref/vk/vk_combuf.h create mode 100644 ref/vk/vk_gpurofl.c create mode 100644 ref/vk/vk_gpurofl.h diff --git a/ref/vk/ray_pass.c b/ref/vk/ray_pass.c index 10c51b4f..d851abcf 100644 --- a/ref/vk/ray_pass.c +++ b/ref/vk/ray_pass.c @@ -3,6 +3,7 @@ #include "ray_resources.h" #include "vk_pipeline.h" #include "vk_descriptor.h" +#include "vk_combuf.h" // FIXME this is only needed for MAX_CONCURRENT_FRAMES // TODO specify it externally as ctor arg @@ -21,6 +22,7 @@ typedef struct ray_pass_s { ray_pass_type_t type; // TODO remove this in favor of VkPipelineStageFlagBits VkPipelineStageFlagBits pipeline_type; char debug_name[32]; + int gpurofl_scope_id; struct { int write_from; @@ -181,6 +183,7 @@ struct 
ray_pass_s *RayPassCreateTracing( const ray_pass_create_tracing_t *create Q_strncpy(header->debug_name, create->debug_name, sizeof(header->debug_name)); header->type = RayPassType_Tracing; header->pipeline_type = VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR; + header->gpurofl_scope_id = R_VkGpuScope_Register(create->debug_name); return header; } @@ -209,6 +212,7 @@ struct ray_pass_s *RayPassCreateCompute( const ray_pass_create_compute_t *create Q_strncpy(header->debug_name, create->debug_name, sizeof(header->debug_name)); header->type = RayPassType_Compute; header->pipeline_type = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + header->gpurofl_scope_id = R_VkGpuScope_Register(create->debug_name); return header; } @@ -235,23 +239,29 @@ void RayPassDestroy( struct ray_pass_s *pass ) { Mem_Free(pass); } -static void performTracing( VkCommandBuffer cmdbuf, int set_slot, const ray_pass_tracing_impl_t *tracing, int width, int height ) { +static void performTracing( vk_combuf_t* combuf, int set_slot, const ray_pass_tracing_impl_t *tracing, int width, int height, int scope_id ) { + const VkCommandBuffer cmdbuf = combuf->cmdbuf; + vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, tracing->pipeline.pipeline); vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, tracing->header.desc.riptors.pipeline_layout, 0, 1, tracing->header.desc.riptors.desc_sets + set_slot, 0, NULL); - VK_PipelineRayTracingTrace(cmdbuf, &tracing->pipeline, width, height); + VK_PipelineRayTracingTrace(combuf, &tracing->pipeline, width, height, scope_id); } -static void performCompute( VkCommandBuffer cmdbuf, int set_slot, const ray_pass_compute_impl_t *compute, int width, int height) { +static void performCompute( vk_combuf_t *combuf, int set_slot, const ray_pass_compute_impl_t *compute, int width, int height, int scope_id) { const uint32_t WG_W = 8; const uint32_t WG_H = 8; + const VkCommandBuffer cmdbuf = combuf->cmdbuf; vkCmdBindPipeline(cmdbuf, 
VK_PIPELINE_BIND_POINT_COMPUTE, compute->pipeline); vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, compute->header.desc.riptors.pipeline_layout, 0, 1, compute->header.desc.riptors.desc_sets + set_slot, 0, NULL); + + const int begin_id = R_VkCombufScopeBegin(combuf, scope_id); vkCmdDispatch(cmdbuf, (width + WG_W - 1) / WG_W, (height + WG_H - 1) / WG_H, 1); + R_VkCombufScopeEnd(combuf, begin_id, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT); } -void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_perform_args_t args ) { - R_VkResourcesPrepareDescriptorsValues(cmdbuf, +void RayPassPerform(struct ray_pass_s *pass, vk_combuf_t *combuf, ray_pass_perform_args_t args ) { + R_VkResourcesPrepareDescriptorsValues(combuf->cmdbuf, (vk_resources_write_descriptors_args_t){ .pipeline = pass->pipeline_type, .resources = args.resources, @@ -264,22 +274,22 @@ void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_pe VK_DescriptorsWrite(&pass->desc.riptors, args.frame_set_slot); - DEBUG_BEGIN(cmdbuf, pass->debug_name); + DEBUG_BEGIN(combuf->cmdbuf, pass->debug_name); switch (pass->type) { case RayPassType_Tracing: { ray_pass_tracing_impl_t *tracing = (ray_pass_tracing_impl_t*)pass; - performTracing(cmdbuf, args.frame_set_slot, tracing, args.width, args.height); + performTracing(combuf, args.frame_set_slot, tracing, args.width, args.height, pass->gpurofl_scope_id); break; } case RayPassType_Compute: { ray_pass_compute_impl_t *compute = (ray_pass_compute_impl_t*)pass; - performCompute(cmdbuf, args.frame_set_slot, compute, args.width, args.height); + performCompute(combuf, args.frame_set_slot, compute, args.width, args.height, pass->gpurofl_scope_id); break; } } - DEBUG_END(cmdbuf); + DEBUG_END(combuf->cmdbuf); } diff --git a/ref/vk/ray_pass.h b/ref/vk/ray_pass.h index 29da3df0..cce701c9 100644 --- a/ref/vk/ray_pass.h +++ b/ref/vk/ray_pass.h @@ -69,5 +69,6 @@ typedef struct ray_pass_perform_args_s { const int *resources_map; } 
ray_pass_perform_args_t; -void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_perform_args_t args ); +struct vk_combuf_s; +void RayPassPerform(struct ray_pass_s *pass, struct vk_combuf_s* combuf, ray_pass_perform_args_t args ); diff --git a/ref/vk/vk_combuf.c b/ref/vk/vk_combuf.c new file mode 100644 index 00000000..52219667 --- /dev/null +++ b/ref/vk/vk_combuf.c @@ -0,0 +1,115 @@ +#include "vk_combuf.h" +#include "vk_commandpool.h" + +#define MAX_COMMANDBUFFERS 4 +#define MAX_QUERY_COUNT 128 + +typedef struct { + vk_combuf_t public; + int used; + struct { + // First two is entire command buffer time [begin, end] + uint32_t timestamps_offset; + } profiler; +} vk_combuf_impl_t; + +static struct { + vk_command_pool_t pool; + + vk_combuf_impl_t combufs[MAX_COMMANDBUFFERS]; + + struct { + VkQueryPool pool; + uint64_t values[MAX_QUERY_COUNT * MAX_COMMANDBUFFERS]; + } timestamp; +} g_combuf; + +qboolean R_VkCombuf_Init( void ) { + g_combuf.pool = R_VkCommandPoolCreate(MAX_COMMANDBUFFERS); + if (!g_combuf.pool.pool) + return false; + + const VkQueryPoolCreateInfo qpci = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .pNext = NULL, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = COUNTOF(g_combuf.timestamp.values), + .flags = 0, + }; + + XVK_CHECK(vkCreateQueryPool(vk_core.device, &qpci, NULL, &g_combuf.timestamp.pool)); + + for (int i = 0; i < MAX_COMMANDBUFFERS; ++i) { + vk_combuf_impl_t *const cb = g_combuf.combufs + i; + + cb->public.cmdbuf = g_combuf.pool.buffers[i]; + SET_DEBUG_NAMEF(cb->public.cmdbuf, VK_OBJECT_TYPE_COMMAND_BUFFER, "cmdbuf[%d]", i); + + cb->profiler.timestamps_offset = i * MAX_QUERY_COUNT; + + /* for (int j = 0; j < COUNTOF(cb->public.sema_done); ++j) { */ + /* cb->public.sema_done[j] = R_VkSemaphoreCreate(); */ + /* ASSERT(cb->public.sema_done[j]); */ + /* SET_DEBUG_NAMEF(cb->public.sema_done[j], VK_OBJECT_TYPE_SEMAPHORE, "done[%d][%d]", i, j); */ + /* } */ + } + + return true; +} + +void 
R_VkCombuf_Destroy( void ) { + vkDestroyQueryPool(vk_core.device, g_combuf.timestamp.pool, NULL); + R_VkCommandPoolDestroy(&g_combuf.pool); +} + +vk_combuf_t* R_VkCombufOpen( void ) { + for (int i = 0; i < MAX_COMMANDBUFFERS; ++i) { + vk_combuf_impl_t *const cb = g_combuf.combufs + i; + if (!cb->used) { + cb->used = 1; + return &cb->public; + } + } + + return NULL; +} + +void R_VkCombufClose( vk_combuf_t* pub ) { + vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub; + cb->used = 0; + + // TODO synchronize? +} + +void R_VkCombufBegin( vk_combuf_t* pub ) { + vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub; + + const VkCommandBufferBeginInfo beginfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + XVK_CHECK(vkBeginCommandBuffer(cb->public.cmdbuf, &beginfo)); + + vkCmdResetQueryPool(cb->public.cmdbuf, g_combuf.timestamp.pool, cb->profiler.timestamps_offset, MAX_QUERY_COUNT); + vkCmdWriteTimestamp(cb->public.cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, g_combuf.timestamp.pool, cb->profiler.timestamps_offset + 0); +} + +void R_VkCombufEnd( vk_combuf_t* pub ) { + vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub; + vkCmdWriteTimestamp(cb->public.cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, g_combuf.timestamp.pool, cb->profiler.timestamps_offset + 1); + XVK_CHECK(vkEndCommandBuffer(cb->public.cmdbuf)); +} + +int R_VkGpuScope_Register(const char *name) { + // FIXME + return -1; +} + +int R_VkCombufScopeBegin(vk_combuf_t* combuf, int scope_id) { + // FIXME + return -1; +} + +void R_VkCombufScopeEnd(vk_combuf_t* combuf, int begin_index, VkPipelineStageFlagBits pipeline_stage) { + // FIXME +} diff --git a/ref/vk/vk_combuf.h b/ref/vk/vk_combuf.h new file mode 100644 index 00000000..bc15cca2 --- /dev/null +++ b/ref/vk/vk_combuf.h @@ -0,0 +1,26 @@ +#pragma once + +#include "vk_core.h" + +typedef struct vk_combuf_s { + VkCommandBuffer cmdbuf; + // VkSemaphore sema_done[2]; + // VkFence fence_done; +} 
vk_combuf_t; + +qboolean R_VkCombuf_Init( void ); +void R_VkCombuf_Destroy( void ); + +vk_combuf_t* R_VkCombufOpen( void ); +void R_VkCombufClose( vk_combuf_t* ); + +void R_VkCombufBegin( vk_combuf_t* ); +void R_VkCombufEnd( vk_combuf_t* ); + + +int R_VkGpuScope_Register(const char *name); + +int R_VkCombufScopeBegin(vk_combuf_t*, int scope_id); +void R_VkCombufScopeEnd(vk_combuf_t*, int begin_index, VkPipelineStageFlagBits pipeline_stage); + +// TODO r_vkgpu_scopes_t *R_VkGpuScopesGet( VkCommandBuffer cmdbuf ); diff --git a/ref/vk/vk_framectl.c b/ref/vk/vk_framectl.c index bafac1fc..fee90219 100644 --- a/ref/vk/vk_framectl.c +++ b/ref/vk/vk_framectl.c @@ -10,7 +10,7 @@ #include "vk_image.h" #include "vk_staging.h" #include "vk_commandpool.h" -#include "vk_querypool.h" +#include "vk_combuf.h" #include "profiler.h" #include "r_speeds.h" @@ -31,15 +31,22 @@ typedef enum { Phase_Submitted, } frame_phase_t; -static struct { - vk_command_pool_t command; - VkSemaphore sem_framebuffer_ready[MAX_CONCURRENT_FRAMES]; - VkSemaphore sem_done[MAX_CONCURRENT_FRAMES]; - VkSemaphore sem_done2[MAX_CONCURRENT_FRAMES]; - VkFence fence_done[MAX_CONCURRENT_FRAMES]; +typedef struct { + vk_combuf_t *combuf; + VkFence fence_done; + VkSemaphore sem_framebuffer_ready; + VkSemaphore sem_done; - // TODO these should be tightly coupled with commandbuffers - vk_query_pool_t qpools[MAX_CONCURRENT_FRAMES]; + // This extra semaphore is required because we need to synchronize 2 things on GPU: + // 1. swapchain + // 2. 
next frame command buffer + // Unfortunately waiting on semaphore also means resetting it when it is signaled + // so we can't reuse the same one for two purposes and need to multiply entities (i.e. keep a second semaphore) + VkSemaphore sem_done2; +} vk_framectl_frame_t; + +static struct { + vk_framectl_frame_t frames[MAX_CONCURRENT_FRAMES]; struct { int index; @@ -147,9 +154,10 @@ static VkRenderPass createRenderPass( VkFormat depth_format, qboolean ray_tracin static void waitForFrameFence( void ) { APROF_SCOPE_BEGIN(wait_for_frame_fence); + const VkFence fence_done[1] = {g_frame.frames[g_frame.current.index].fence_done}; for(qboolean loop = true; loop; ) { #define MAX_WAIT (10ull * 1000*1000*1000) - const VkResult fence_result = vkWaitForFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index, VK_TRUE, MAX_WAIT); + const VkResult fence_result = vkWaitForFences(vk_core.device, COUNTOF(fence_done), fence_done, VK_TRUE, MAX_WAIT); #undef MAX_WAIT switch (fence_result) { case VK_SUCCESS: @@ -163,7 +171,7 @@ static void waitForFrameFence( void ) { } } - XVK_CHECK(vkResetFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index)); + XVK_CHECK(vkResetFences(vk_core.device, COUNTOF(fence_done), fence_done)); APROF_SCOPE_END(wait_for_frame_fence); } @@ -193,15 +201,15 @@ void R_BeginFrame( qboolean clearScene ) { APROF_SCOPE_DECLARE_BEGIN(begin_frame_tail, "R_BeginFrame_tail"); ASSERT(g_frame.current.phase == Phase_Submitted || g_frame.current.phase == Phase_Idle); g_frame.current.index = (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES; - const VkCommandBuffer cmdbuf = vk_frame.cmdbuf = g_frame.command.buffers[g_frame.current.index]; - vk_query_pool_t *const qpool = g_frame.qpools + g_frame.current.index; + + vk_framectl_frame_t *const frame = g_frame.frames + g_frame.current.index; { waitForFrameFence(); // Current command buffer is done and available // Previous might still be in flight - R_VkQueryPoolGetFrameResults(g_frame.qpools + g_frame.current.index); + // TODO
R_VkQueryPoolGetFrameResults(g_frame.qpools + g_frame.current.index); } APROF_SCOPE_END(begin_frame_tail); @@ -213,8 +221,8 @@ void R_BeginFrame( qboolean clearScene ) { { // FIXME collect and show more gpu profiling data - const uint64_t gpu_time_begin_ns = (qpool->used) ? qpool->results[0] : 0; - const uint64_t gpu_time_end_ns = (qpool->used) ? qpool->results[1] : 0; + const uint64_t gpu_time_begin_ns = 0;// FIXME (qpool->used) ? qpool->results[0] : 0; + const uint64_t gpu_time_end_ns = 0;// FIXME (qpool->used) ? qpool->results[1] : 0; R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_begin_ns, gpu_time_end_ns); } @@ -229,21 +237,13 @@ void R_BeginFrame( qboolean clearScene ) { R_VkStagingFrameBegin(); - g_frame.current.framebuffer = R_VkSwapchainAcquire( g_frame.sem_framebuffer_ready[g_frame.current.index] ); + g_frame.current.framebuffer = R_VkSwapchainAcquire( frame->sem_framebuffer_ready ); vk_frame.width = g_frame.current.framebuffer.width; vk_frame.height = g_frame.current.framebuffer.height; VK_RenderBegin( vk_frame.rtx_enabled ); - { - const VkCommandBufferBeginInfo beginfo = { - .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, - .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, - }; - XVK_CHECK(vkBeginCommandBuffer(cmdbuf, &beginfo)); - } - - R_VkQueryPoolBegin(g_frame.qpools + g_frame.current.index, cmdbuf); + R_VkCombufBegin( frame->combuf ); g_frame.current.phase = Phase_FrameBegan; APROF_SCOPE_END(begin_frame); @@ -256,7 +256,7 @@ void VK_RenderFrame( const struct ref_viewpass_s *rvp ) APROF_SCOPE_END(render_frame); } -static void enqueueRendering( VkCommandBuffer cmdbuf ) { +static void enqueueRendering( vk_combuf_t* combuf ) { const VkClearValue clear_value[] = { {.color = {{1., 0., 0., 0.}}}, {.depthStencil = {1., 0.}} // TODO reverse-z @@ -264,10 +264,11 @@ static void enqueueRendering( VkCommandBuffer cmdbuf ) { ASSERT(g_frame.current.phase == Phase_FrameBegan); + const VkCommandBuffer cmdbuf = combuf->cmdbuf; 
VK_Render_FIXME_Barrier(cmdbuf); if (vk_frame.rtx_enabled) - VK_RenderEndRTX( cmdbuf, g_frame.current.framebuffer.view, g_frame.current.framebuffer.image, g_frame.current.framebuffer.width, g_frame.current.framebuffer.height ); + VK_RenderEndRTX( combuf, g_frame.current.framebuffer.view, g_frame.current.framebuffer.image, g_frame.current.framebuffer.width, g_frame.current.framebuffer.height ); { VkRenderPassBeginInfo rpbi = { @@ -305,11 +306,15 @@ static void enqueueRendering( VkCommandBuffer cmdbuf ) { g_frame.current.phase = Phase_RenderingEnqueued; } -static void submit( VkCommandBuffer cmdbuf, qboolean wait ) { +static void submit( vk_combuf_t* combuf, qboolean wait ) { ASSERT(g_frame.current.phase == Phase_RenderingEnqueued); - R_VkQueryPoolEnd(g_frame.qpools + g_frame.current.index, cmdbuf); - XVK_CHECK(vkEndCommandBuffer(cmdbuf)); + const VkCommandBuffer cmdbuf = combuf->cmdbuf; + + vk_framectl_frame_t *const frame = g_frame.frames + g_frame.current.index; + vk_framectl_frame_t *const prev_frame = g_frame.frames + (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES; + + R_VkCombufEnd(combuf); const VkCommandBuffer cmdbufs[] = { R_VkStagingFrameEnd(), @@ -321,13 +326,16 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) { VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, }; + + // TODO for RT renderer we only touch framebuffer at the very end of rendering/cmdbuf. + // Can we postpone waiting for framebuffer semaphore until we actually need it?
const VkSemaphore waitophores[] = { - g_frame.sem_framebuffer_ready[g_frame.current.index], - g_frame.sem_done2[(g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES], + frame->sem_framebuffer_ready, + prev_frame->sem_done2, }; const VkSemaphore signalphores[] = { - g_frame.sem_done[g_frame.current.index], - g_frame.sem_done2[g_frame.current.index], + frame->sem_done, + frame->sem_done2, }; const VkSubmitInfo subinfo = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, @@ -341,16 +349,16 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) { .pSignalSemaphores = signalphores, }; //gEngine.Con_Printf("SYNC: wait for semaphore %d, signal semaphore %d\n", (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES, g_frame.current.index); - XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, g_frame.fence_done[g_frame.current.index])); + XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, frame->fence_done)); g_frame.current.phase = Phase_Submitted; } - R_VkSwapchainPresent(g_frame.current.framebuffer.index, g_frame.sem_done[g_frame.current.index]); + R_VkSwapchainPresent(g_frame.current.framebuffer.index, frame->sem_done); g_frame.current.framebuffer = (r_vk_swapchain_framebuffer_t){0}; if (wait) { APROF_SCOPE_BEGIN(frame_gpu_wait); - XVK_CHECK(vkWaitForFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index, VK_TRUE, INT64_MAX)); + XVK_CHECK(vkWaitForFences(vk_core.device, 1, &frame->fence_done, VK_TRUE, INT64_MAX)); APROF_SCOPE_END(frame_gpu_wait); /* if (vk_core.debug) { */ @@ -362,7 +370,7 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) { } inline static VkCommandBuffer currentCommandBuffer( void ) { - return g_frame.command.buffers[g_frame.current.index]; + return g_frame.frames[g_frame.current.index].combuf->cmdbuf; } void R_EndFrame( void ) @@ -370,12 +378,10 @@ void R_EndFrame( void ) APROF_SCOPE_BEGIN_EARLY(end_frame); if (g_frame.current.phase == Phase_FrameBegan) { - const VkCommandBuffer cmdbuf = currentCommandBuffer(); - 
enqueueRendering( cmdbuf ); - submit( cmdbuf, false ); + vk_combuf_t *const combuf = g_frame.frames[g_frame.current.index].combuf; + enqueueRendering( combuf ); + submit( combuf, false ); //submit( cmdbuf, true ); - - vk_frame.cmdbuf = VK_NULL_HANDLE; } APROF_SCOPE_END(end_frame); @@ -395,8 +401,6 @@ qboolean VK_FrameCtlInit( void ) const VkFormat depth_format = findSupportedImageFormat(depth_formats, VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT); - g_frame.command = R_VkCommandPoolCreate( MAX_CONCURRENT_FRAMES ); - // FIXME move this out to renderers vk_frame.render_pass.raster = createRenderPass(depth_format, false); if (vk_core.rtx) @@ -406,16 +410,17 @@ qboolean VK_FrameCtlInit( void ) return false; for (int i = 0; i < MAX_CONCURRENT_FRAMES; ++i) { - g_frame.sem_framebuffer_ready[i] = R_VkSemaphoreCreate(); - SET_DEBUG_NAMEF(g_frame.sem_framebuffer_ready[i], VK_OBJECT_TYPE_SEMAPHORE, "framebuffer_ready[%d]", i); - g_frame.sem_done[i] = R_VkSemaphoreCreate(); - SET_DEBUG_NAMEF(g_frame.sem_done[i], VK_OBJECT_TYPE_SEMAPHORE, "done[%d]", i); - g_frame.sem_done2[i] = R_VkSemaphoreCreate(); - SET_DEBUG_NAMEF(g_frame.sem_done2[i], VK_OBJECT_TYPE_SEMAPHORE, "done2[%d]", i); - g_frame.fence_done[i] = R_VkFenceCreate(true); - SET_DEBUG_NAMEF(g_frame.fence_done[i], VK_OBJECT_TYPE_FENCE, "done[%d]", i); + vk_framectl_frame_t *const frame = g_frame.frames + i; + frame->combuf = R_VkCombufOpen(); - R_VkQueryPoolInit(g_frame.qpools + i); + frame->sem_framebuffer_ready = R_VkSemaphoreCreate(); + SET_DEBUG_NAMEF(frame->sem_framebuffer_ready, VK_OBJECT_TYPE_SEMAPHORE, "framebuffer_ready[%d]", i); + frame->sem_done = R_VkSemaphoreCreate(); + SET_DEBUG_NAMEF(frame->sem_done, VK_OBJECT_TYPE_SEMAPHORE, "done[%d]", i); + frame->sem_done2 = R_VkSemaphoreCreate(); + SET_DEBUG_NAMEF(frame->sem_done2, VK_OBJECT_TYPE_SEMAPHORE, "done2[%d]", i); + frame->fence_done = R_VkFenceCreate(true); + SET_DEBUG_NAMEF(frame->fence_done, VK_OBJECT_TYPE_FENCE, "done[%d]", i); 
} // Signal first frame semaphore as done @@ -430,10 +435,9 @@ qboolean VK_FrameCtlInit( void ) .pWaitSemaphores = NULL, .pWaitDstStageMask = &stageflags, .signalSemaphoreCount = 1, - .pSignalSemaphores = g_frame.sem_done2 + 0, + .pSignalSemaphores = &g_frame.frames[0].sem_done2, }; XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, VK_NULL_HANDLE)); - //gEngine.Con_Printf("SYNC: signal semaphore %d\n", 0); } vk_frame.rtx_enabled = vk_core.rtx; @@ -447,11 +451,12 @@ qboolean VK_FrameCtlInit( void ) void VK_FrameCtlShutdown( void ) { for (int i = 0; i < MAX_CONCURRENT_FRAMES; ++i) { - R_VkSemaphoreDestroy(g_frame.sem_framebuffer_ready[i]); - R_VkSemaphoreDestroy(g_frame.sem_done[i]); - R_VkSemaphoreDestroy(g_frame.sem_done2[i]); - R_VkFenceDestroy(g_frame.fence_done[i]); - R_VkQueryPoolDestroy(g_frame.qpools + i); + vk_framectl_frame_t *const frame = g_frame.frames + i; + R_VkCombufClose(frame->combuf); + R_VkSemaphoreDestroy(frame->sem_framebuffer_ready); + R_VkSemaphoreDestroy(frame->sem_done); + R_VkSemaphoreDestroy(frame->sem_done2); + R_VkFenceDestroy(frame->fence_done); } R_VkSwapchainShutdown(); @@ -459,8 +464,6 @@ void VK_FrameCtlShutdown( void ) { vkDestroyRenderPass(vk_core.device, vk_frame.render_pass.raster, NULL); if (vk_core.rtx) vkDestroyRenderPass(vk_core.device, vk_frame.render_pass.after_ray_tracing, NULL); - - R_VkCommandPoolDestroy( &g_frame.command ); } static qboolean canBlitFromSwapchainToFormat( VkFormat dest_format ) { @@ -487,7 +490,9 @@ static rgbdata_t *XVK_ReadPixels( void ) { const VkImage frame_image = g_frame.current.framebuffer.image; rgbdata_t *r_shot = NULL; qboolean blit = canBlitFromSwapchainToFormat( dest_format ); - const VkCommandBuffer cmdbuf = currentCommandBuffer(); + + vk_combuf_t *const combuf = g_frame.frames[g_frame.current.index].combuf; + const VkCommandBuffer cmdbuf = combuf->cmdbuf; if (frame_image == VK_NULL_HANDLE) { gEngine.Con_Printf(S_ERROR "no current image, can't take screenshot\n"); @@ -513,7 +518,7 @@ 
static rgbdata_t *XVK_ReadPixels( void ) { } // Make sure that all rendering ops are enqueued - enqueueRendering( cmdbuf ); + enqueueRendering( combuf ); { // Barrier 1: dest image @@ -618,7 +623,7 @@ static rgbdata_t *XVK_ReadPixels( void ) { 0, 0, NULL, 0, NULL, ARRAYSIZE(image_barrier), image_barrier); } - submit( cmdbuf, true ); + submit( combuf, true ); // copy bytes to buffer { diff --git a/ref/vk/vk_framectl.h b/ref/vk/vk_framectl.h index 79dc8b91..47e8dac1 100644 --- a/ref/vk/vk_framectl.h +++ b/ref/vk/vk_framectl.h @@ -5,13 +5,11 @@ #define MAX_CONCURRENT_FRAMES 2 +// TODO most of the things below should not be global. Instead, they should be passed as an argument/context to all the drawing functions that want this info typedef struct vk_framectl_s { - // TODO only used from 2d, remove + // TODO only used from 2d and r_speeds, remove uint32_t width, height; - // FIXME - VkCommandBuffer cmdbuf; - // TODO move these into renderer and 2d struct { // Used when the entire rendering is traditional triangle rasterization diff --git a/ref/vk/vk_gpurofl.c b/ref/vk/vk_gpurofl.c new file mode 100644 index 00000000..c782500f --- /dev/null +++ b/ref/vk/vk_gpurofl.c @@ -0,0 +1,99 @@ +#include "vk_gpurofl.h" +#include "vk_querypool.h" + +#define MAX_SCOPES 64 +#define MAX_COMMANDBUFFERS 8 + +typedef struct { + const char *name; +} r_vkgpu_scope_t; + +#define EVENT_BEGIN 0x100 + +// B....E +// B....E +// -> B..B.E..E +// -> B.......E +// -> B.E + +typedef struct { + VkCommandBuffer cmdbuf; + vk_query_pool_t *qpool; + + uint32_t events[MAX_QUERY_COUNT]; +} r_vkgpu_cmdbuf_assoc_t; + +static struct { + r_vkgpu_scope_t scopes[MAX_SCOPES]; + int scopes_count; + + // FIXME couple these more tightly + r_vkgpu_cmdbuf_assoc_t assocs[MAX_COMMANDBUFFERS]; + + r_vkgpu_scopes_t last_frame; +} g_purofl; + +int R_VkGpuScopeRegister(const char *name) { + if (g_purofl.scopes_count == MAX_SCOPES) { + gEngine.Con_Printf(S_ERROR "Cannot register GPU profiler scope \"%s\": max number of scope 
%d reached\n", name, MAX_SCOPES); + return -1; + } + + g_purofl.scopes[g_purofl.scopes_count].name = name; + + return g_purofl.scopes_count++; +} + +void R_VkGpuBegin(VkCommandBuffer cmdbuf, vk_query_pool_t *qpool) { + for (int i = 0; i < MAX_COMMANDBUFFERS; ++i) { + r_vkgpu_cmdbuf_assoc_t *const assoc = g_purofl.assocs + i; + if (!assoc->cmdbuf) { + assoc->cmdbuf = cmdbuf; + assoc->qpool = qpool; + return; + } + + if (assoc->cmdbuf == cmdbuf) { + assoc->qpool = qpool; + return; + } + } + + ASSERT(!"FIXME Cannot associate cmdbuf with query pool, slots exceeded"); +} + +static vk_query_pool_t *getQueryPool(VkCommandBuffer cmdbuf) { + for (int i = 0; i < MAX_COMMANDBUFFERS; ++i) { + r_vkgpu_cmdbuf_assoc_t *const assoc = g_purofl.assocs + i; + if (!assoc->cmdbuf) + break; + + if (assoc->cmdbuf == cmdbuf) + return assoc->qpool; + } + + return NULL; +} + +static void writeTimestamp(VkCommandBuffer cmdbuf, int scope_id, VkPipelineStageFlagBits stage, int begin) { + if (scope_id < 0) + return; + + // 1. Find query pool for the cmdbuf + vk_query_pool_t *const qpool = getQueryPool(cmdbuf); + if (!qpool) // TODO complain? + return; + + // 2. Write timestamp + const int timestamp_id = R_VkQueryPoolTimestamp(qpool, cmdbuf, stage); + + // 3. Associate timestamp index with scope_begin +} + +/* int R_VkGpuScopeBegin(VkCommandBuffer cmdbuf, int scope_id) { */ +/* writeTimestamp(cmdbuf, scope_id, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 1); */ +/* } */ +/* */ +/* void R_VkGpuScopeEnd(VkCommandBuffer cmdbuf, int begin_index, VkPipelineStageFlagBits pipeline_stage) { */ +/* writeTimestamp(cmdbuf, scope_id, pipeline_stage, 0); */ +/* } */ diff --git a/ref/vk/vk_gpurofl.h b/ref/vk/vk_gpurofl.h new file mode 100644 index 00000000..35f653cb --- /dev/null +++ b/ref/vk/vk_gpurofl.h @@ -0,0 +1,30 @@ +#pragma once + +#include "vk_core.h" + +// Return scope_id for the new scope. 
-1 if failed +// name is expected to be statically allocated +int R_VkGpuScopeRegister(const char *name); + +typedef struct vk_query_pool_s vk_query_pool_t; +void R_VkGpuBegin(VkCommandBuffer cmdbuf, vk_query_pool_t *qpool); + +// Returns begin_index to use in R_VkGpuScopeEnd +int R_VkGpuScopeBegin(VkCommandBuffer cmdbuf, int scope_id); + +void R_VkGpuScopeEnd(VkCommandBuffer cmdbuf, int begin_index, VkPipelineStageFlagBits pipeline_stage); + +typedef struct { + const char *name; + uint64_t begin_ns, end_ns; +} r_vkgpu_scope_entry_t; + +typedef struct { + r_vkgpu_scope_entry_t *scopes; + int scopes_count; +} r_vkgpu_scopes_t; + +// Reads all the scope timing data (timestamp queries) and returns a list of things happened this frame. +// Prerequisite: all relevant recorded command buffers should've been completed and waited on already. +// The returned pointer remains valid until any next R_VkGpu*() call. +r_vkgpu_scopes_t *R_VkGpuScopesGet( VkCommandBuffer cmdbuf ); diff --git a/ref/vk/vk_meatpipe.c b/ref/vk/vk_meatpipe.c index d43bb5f5..022e48fa 100644 --- a/ref/vk/vk_meatpipe.c +++ b/ref/vk/vk_meatpipe.c @@ -421,10 +421,10 @@ void R_VkMeatpipeDestroy(vk_meatpipe_t *mp) { Mem_Free(mp); } -void R_VkMeatpipePerform(vk_meatpipe_t *mp, VkCommandBuffer cmdbuf, vk_meatpipe_perfrom_args_t args) { +void R_VkMeatpipePerform(vk_meatpipe_t *mp, struct vk_combuf_s *combuf, vk_meatpipe_perfrom_args_t args) { for (int i = 0; i < mp->passes_count; ++i) { const vk_meatpipe_pass_t *pass = mp->passes + i; - RayPassPerform(pass->pass, cmdbuf, + RayPassPerform(pass->pass, combuf, (ray_pass_perform_args_t){ .frame_set_slot = args.frame_set_slot, .width = args.width, diff --git a/ref/vk/vk_meatpipe.h b/ref/vk/vk_meatpipe.h index 68327e9d..606e61e5 100644 --- a/ref/vk/vk_meatpipe.h +++ b/ref/vk/vk_meatpipe.h @@ -42,4 +42,5 @@ typedef struct vk_meatpipe_perfrom_args_s { const vk_resource_p *resources; } vk_meatpipe_perfrom_args_t; -void R_VkMeatpipePerform(vk_meatpipe_t *mp, 
VkCommandBuffer cmdbuf, vk_meatpipe_perfrom_args_t args); +struct vk_combuf_s; +void R_VkMeatpipePerform(vk_meatpipe_t *mp, struct vk_combuf_s *combuf, vk_meatpipe_perfrom_args_t args); diff --git a/ref/vk/vk_pipeline.c b/ref/vk/vk_pipeline.c index 438fdd84..d637d39a 100644 --- a/ref/vk/vk_pipeline.c +++ b/ref/vk/vk_pipeline.c @@ -1,6 +1,7 @@ #include "vk_pipeline.h" #include "vk_framectl.h" // VkRenderPass +#include "vk_combuf.h" #include "eiface.h" @@ -369,7 +370,9 @@ void VK_PipelineRayTracingDestroy(vk_pipeline_ray_t* pipeline) { pipeline->pipeline = VK_NULL_HANDLE; } -void VK_PipelineRayTracingTrace(VkCommandBuffer cmdbuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height) { +void VK_PipelineRayTracingTrace(vk_combuf_t *combuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height, int scope_id) { // TODO bind this and accepts descriptors as args? vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, pipeline->pipeline); - vkCmdTraceRaysKHR(cmdbuf, &pipeline->sbt.raygen, &pipeline->sbt.miss, &pipeline->sbt.hit, &pipeline->sbt.callable, width, height, 1 ); + const int begin_id = R_VkCombufScopeBegin(combuf, scope_id); + vkCmdTraceRaysKHR(combuf->cmdbuf, &pipeline->sbt.raygen, &pipeline->sbt.miss, &pipeline->sbt.hit, &pipeline->sbt.callable, width, height, 1 ); + R_VkCombufScopeEnd(combuf, begin_id, VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR); } diff --git a/ref/vk/vk_pipeline.h b/ref/vk/vk_pipeline.h index 6ae85119..8f48d343 100644 --- a/ref/vk/vk_pipeline.h +++ b/ref/vk/vk_pipeline.h @@ -80,7 +80,8 @@ typedef struct { } vk_pipeline_ray_t; vk_pipeline_ray_t VK_PipelineRayTracingCreate(const vk_pipeline_ray_create_info_t *create); -void VK_PipelineRayTracingTrace(VkCommandBuffer cmdbuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height); +struct vk_combuf_s; +void VK_PipelineRayTracingTrace(struct vk_combuf_s *combuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height, int scope_id); 
void VK_PipelineRayTracingDestroy(vk_pipeline_ray_t* pipeline); diff --git a/ref/vk/vk_querypool.c b/ref/vk/vk_querypool.c index dcaad97d..e392df7e 100644 --- a/ref/vk/vk_querypool.c +++ b/ref/vk/vk_querypool.c @@ -80,9 +80,10 @@ void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) { vkGetQueryPoolResults(vk_core.device, pool->pool, 0, pool->used, pool->used * sizeof(uint64_t), pool->results, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); const uint64_t timestamp_offset_ns = getGpuTimestampOffsetNs( pool ); + const double timestamp_period = vk_core.physical_device.properties.limits.timestampPeriod; for (int i = 0; i < pool->used; ++i) { - const uint64_t gpu_ns = pool->results[i] * (double)vk_core.physical_device.properties.limits.timestampPeriod; + const uint64_t gpu_ns = pool->results[i] * timestamp_period; pool->results[i] = timestamp_offset_ns + gpu_ns; } } diff --git a/ref/vk/vk_querypool.h b/ref/vk/vk_querypool.h index cd6a5b9c..f7c828f0 100644 --- a/ref/vk/vk_querypool.h +++ b/ref/vk/vk_querypool.h @@ -4,7 +4,7 @@ #define MAX_QUERY_COUNT 128 -typedef struct { +typedef struct vk_query_pool_s { VkQueryPool pool; int used; uint64_t results[MAX_QUERY_COUNT]; diff --git a/ref/vk/vk_render.c b/ref/vk/vk_render.c index 1ec7d76e..3ea04182 100644 --- a/ref/vk/vk_render.c +++ b/ref/vk/vk_render.c @@ -649,14 +649,14 @@ void VK_RenderDebugLabelEnd( void ) drawCmdPushDebugLabelEnd(); } -void VK_RenderEndRTX( VkCommandBuffer cmdbuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h ) +void VK_RenderEndRTX( struct vk_combuf_s* combuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h ) { const VkBuffer geom_buffer = R_GeometryBuffer_Get(); ASSERT(vk_core.rtx); { const vk_ray_frame_render_args_t args = { - .cmdbuf = cmdbuf, + .combuf = combuf, .dst = { .image_view = img_dst_view, .image = img_dst, diff --git a/ref/vk/vk_render.h b/ref/vk/vk_render.h index 4b3fc438..8bd1f2be 100644 --- a/ref/vk/vk_render.h +++ 
b/ref/vk/vk_render.h @@ -112,6 +112,7 @@ void VK_RenderDebugLabelEnd( void ); void VK_RenderBegin( qboolean ray_tracing ); void VK_RenderEnd( VkCommandBuffer cmdbuf ); -void VK_RenderEndRTX( VkCommandBuffer cmdbuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h ); +struct vk_combuf_s; +void VK_RenderEndRTX( struct vk_combuf_s* combuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h ); void VK_Render_FIXME_Barrier( VkCommandBuffer cmdbuf ); diff --git a/ref/vk/vk_rtx.c b/ref/vk/vk_rtx.c index cb14dd3c..2c340fb7 100644 --- a/ref/vk/vk_rtx.c +++ b/ref/vk/vk_rtx.c @@ -16,6 +16,7 @@ #include "vk_staging.h" #include "vk_textures.h" #include "vk_previous_frame.h" +#include "vk_combuf.h" #include "alolcator.h" @@ -173,7 +174,8 @@ typedef struct { const vk_lights_bindings_t *light_bindings; } perform_tracing_args_t; -static void performTracing(VkCommandBuffer cmdbuf, const perform_tracing_args_t* args) { +static void performTracing( vk_combuf_t *combuf, const perform_tracing_args_t* args) { + const VkCommandBuffer cmdbuf = combuf->cmdbuf; // TODO move this to "TLAS producer" g_rtx.res[ExternalResource_tlas].resource = (vk_resource_t){ .type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR, @@ -325,7 +327,7 @@ static void performTracing(VkCommandBuffer cmdbuf, const perform_tracing_args_t* g_rtx.mainpipe_resources[i]->value.image_object = &res->image; } - R_VkMeatpipePerform(g_rtx.mainpipe, cmdbuf, (vk_meatpipe_perfrom_args_t) { + R_VkMeatpipePerform(g_rtx.mainpipe, combuf, (vk_meatpipe_perfrom_args_t) { .frame_set_slot = args->frame_index, .width = FRAME_WIDTH, .height = FRAME_HEIGHT, @@ -533,7 +535,7 @@ fail: void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args) { - const VkCommandBuffer cmdbuf = args->cmdbuf; + const VkCommandBuffer cmdbuf = args->combuf->cmdbuf; // const xvk_ray_frame_images_t* current_frame = g_rtx.frames + (g_rtx.frame_number % 2); ASSERT(vk_core.rtx); @@ -587,7 +589,7 @@ void VK_RayFrameEnd(const 
vk_ray_frame_render_args_t* args) .fov_angle_y = args->fov_angle_y, .light_bindings = &light_bindings, }; - performTracing( cmdbuf, &trace_args ); + performTracing( args->combuf, &trace_args ); } } diff --git a/ref/vk/vk_rtx.h b/ref/vk/vk_rtx.h index 45386cfd..6350a0c6 100644 --- a/ref/vk/vk_rtx.h +++ b/ref/vk/vk_rtx.h @@ -26,7 +26,7 @@ typedef struct { } vk_buffer_region_t; typedef struct { - VkCommandBuffer cmdbuf; + struct vk_combuf_s *combuf; struct { VkImageView image_view; diff --git a/ref/vk/vk_staging.c b/ref/vk/vk_staging.c index 3a23146d..1c2a9509 100644 --- a/ref/vk/vk_staging.c +++ b/ref/vk/vk_staging.c @@ -69,6 +69,8 @@ void R_VkStagingShutdown(void) { R_VkCommandPoolDestroy( &g_staging.upload_pool ); } +// FIXME There's a severe race condition here. Submitting things manually and prematurely (before framectl had a chance to synchronize with the previous frame) +// may lead to data races and memory corruption (e.g. writing into memory that's being read in some pipeline stage still going) void R_VkStagingFlushSync( void ) { APROF_SCOPE_DECLARE_BEGIN(function, __FUNCTION__);