[draft] vk: start refactoring commandbuffer/gpu profiler

Consolidate VkCommandBuffer management into a single entity (partially
done for framectl, not yet done for staging).

Make sure that we pass enough metadata to be able to timestamp scopes in
cmdbuf.

It compiles, but it won't work yet: not all init code paths are used.
Also, there are many changes, so other failure modes are entirely
possible.
This commit is contained in:
Ivan Avdeev 2023-03-31 21:01:17 -07:00 committed by Ivan Avdeev
parent 5c7bd9d285
commit 4bd62ccbc0
19 changed files with 392 additions and 97 deletions

View File

@ -3,6 +3,7 @@
#include "ray_resources.h"
#include "vk_pipeline.h"
#include "vk_descriptor.h"
#include "vk_combuf.h"
// FIXME this is only needed for MAX_CONCURRENT_FRAMES
// TODO specify it externally as ctor arg
@ -21,6 +22,7 @@ typedef struct ray_pass_s {
ray_pass_type_t type; // TODO remove this in favor of VkPipelineStageFlagBits
VkPipelineStageFlagBits pipeline_type;
char debug_name[32];
int gpurofl_scope_id;
struct {
int write_from;
@ -181,6 +183,7 @@ struct ray_pass_s *RayPassCreateTracing( const ray_pass_create_tracing_t *create
Q_strncpy(header->debug_name, create->debug_name, sizeof(header->debug_name));
header->type = RayPassType_Tracing;
header->pipeline_type = VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR;
header->gpurofl_scope_id = R_VkGpuScope_Register(create->debug_name);
return header;
}
@ -209,6 +212,7 @@ struct ray_pass_s *RayPassCreateCompute( const ray_pass_create_compute_t *create
Q_strncpy(header->debug_name, create->debug_name, sizeof(header->debug_name));
header->type = RayPassType_Compute;
header->pipeline_type = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
header->gpurofl_scope_id = R_VkGpuScope_Register(create->debug_name);
return header;
}
@ -235,23 +239,29 @@ void RayPassDestroy( struct ray_pass_s *pass ) {
Mem_Free(pass);
}
static void performTracing( VkCommandBuffer cmdbuf, int set_slot, const ray_pass_tracing_impl_t *tracing, int width, int height ) {
static void performTracing( vk_combuf_t* combuf, int set_slot, const ray_pass_tracing_impl_t *tracing, int width, int height, int scope_id ) {
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, tracing->pipeline.pipeline);
vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, tracing->header.desc.riptors.pipeline_layout, 0, 1, tracing->header.desc.riptors.desc_sets + set_slot, 0, NULL);
VK_PipelineRayTracingTrace(cmdbuf, &tracing->pipeline, width, height);
VK_PipelineRayTracingTrace(combuf, &tracing->pipeline, width, height, scope_id);
}
static void performCompute( VkCommandBuffer cmdbuf, int set_slot, const ray_pass_compute_impl_t *compute, int width, int height) {
static void performCompute( vk_combuf_t *combuf, int set_slot, const ray_pass_compute_impl_t *compute, int width, int height, int scope_id) {
const uint32_t WG_W = 8;
const uint32_t WG_H = 8;
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, compute->pipeline);
vkCmdBindDescriptorSets(cmdbuf, VK_PIPELINE_BIND_POINT_COMPUTE, compute->header.desc.riptors.pipeline_layout, 0, 1, compute->header.desc.riptors.desc_sets + set_slot, 0, NULL);
const int begin_id = R_VkCombufScopeBegin(combuf, scope_id);
vkCmdDispatch(cmdbuf, (width + WG_W - 1) / WG_W, (height + WG_H - 1) / WG_H, 1);
R_VkCombufScopeEnd(combuf, begin_id, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT);
}
void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_perform_args_t args ) {
R_VkResourcesPrepareDescriptorsValues(cmdbuf,
void RayPassPerform(struct ray_pass_s *pass, vk_combuf_t *combuf, ray_pass_perform_args_t args ) {
R_VkResourcesPrepareDescriptorsValues(combuf->cmdbuf,
(vk_resources_write_descriptors_args_t){
.pipeline = pass->pipeline_type,
.resources = args.resources,
@ -264,22 +274,22 @@ void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_pe
VK_DescriptorsWrite(&pass->desc.riptors, args.frame_set_slot);
DEBUG_BEGIN(cmdbuf, pass->debug_name);
DEBUG_BEGIN(combuf->cmdbuf, pass->debug_name);
switch (pass->type) {
case RayPassType_Tracing:
{
ray_pass_tracing_impl_t *tracing = (ray_pass_tracing_impl_t*)pass;
performTracing(cmdbuf, args.frame_set_slot, tracing, args.width, args.height);
performTracing(combuf, args.frame_set_slot, tracing, args.width, args.height, pass->gpurofl_scope_id);
break;
}
case RayPassType_Compute:
{
ray_pass_compute_impl_t *compute = (ray_pass_compute_impl_t*)pass;
performCompute(cmdbuf, args.frame_set_slot, compute, args.width, args.height);
performCompute(combuf, args.frame_set_slot, compute, args.width, args.height, pass->gpurofl_scope_id);
break;
}
}
DEBUG_END(cmdbuf);
DEBUG_END(combuf->cmdbuf);
}

View File

@ -69,5 +69,6 @@ typedef struct ray_pass_perform_args_s {
const int *resources_map;
} ray_pass_perform_args_t;
void RayPassPerform(struct ray_pass_s *pass, VkCommandBuffer cmdbuf, ray_pass_perform_args_t args );
struct vk_combuf_s;
void RayPassPerform(struct ray_pass_s *pass, struct vk_combuf_s* combuf, ray_pass_perform_args_t args );

115
ref/vk/vk_combuf.c Normal file
View File

@ -0,0 +1,115 @@
#include "vk_combuf.h"
#include "vk_commandpool.h"
#define MAX_COMMANDBUFFERS 4
#define MAX_QUERY_COUNT 128
// Private per-command-buffer state that wraps the public vk_combuf_t handle.
typedef struct {
// Must stay the first member: the R_VkCombuf*() functions cast vk_combuf_t* back to vk_combuf_impl_t*.
vk_combuf_t public;
// Non-zero while the slot is handed out by R_VkCombufOpen(); cleared by R_VkCombufClose().
int used;
struct {
// Index of this command buffer's first query in the shared timestamp query pool.
// The first two queries (offset + 0, offset + 1) hold the entire command buffer time [begin, end].
uint32_t timestamps_offset;
} profiler;
} vk_combuf_impl_t;
// Module-wide state: one command pool, a fixed set of command buffer slots and the timestamp query pool.
static struct {
vk_command_pool_t pool;
vk_combuf_impl_t combufs[MAX_COMMANDBUFFERS];
struct {
// Single query pool; every command buffer slot gets its own MAX_QUERY_COUNT-sized range of queries in it.
VkQueryPool pool;
// Sized to match the query pool (queryCount is COUNTOF(values)).
// NOTE(review): not read or written by the code visible here yet.
uint64_t values[MAX_QUERY_COUNT * MAX_COMMANDBUFFERS];
} timestamp;
} g_combuf;
// Creates the shared command pool and the timestamp query pool, then binds every
// command buffer slot to its VkCommandBuffer and to its own range of timestamp queries.
// Returns false if the command pool could not be created, true otherwise.
qboolean R_VkCombuf_Init( void ) {
	g_combuf.pool = R_VkCommandPoolCreate(MAX_COMMANDBUFFERS);
	if (!g_combuf.pool.pool)
		return false;

	{
		// One pool holds the queries of all command buffers: MAX_QUERY_COUNT per slot.
		const VkQueryPoolCreateInfo qpci = {
			.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
			.pNext = NULL,
			.flags = 0,
			.queryType = VK_QUERY_TYPE_TIMESTAMP,
			.queryCount = COUNTOF(g_combuf.timestamp.values),
		};
		XVK_CHECK(vkCreateQueryPool(vk_core.device, &qpci, NULL, &g_combuf.timestamp.pool));
	}

	for (int slot = 0; slot < MAX_COMMANDBUFFERS; ++slot) {
		vk_combuf_impl_t *const combuf = &g_combuf.combufs[slot];

		combuf->profiler.timestamps_offset = slot * MAX_QUERY_COUNT;
		combuf->public.cmdbuf = g_combuf.pool.buffers[slot];
		SET_DEBUG_NAMEF(combuf->public.cmdbuf, VK_OBJECT_TYPE_COMMAND_BUFFER, "cmdbuf[%d]", slot);

		// TODO per-command-buffer completion semaphores (public.sema_done[]) are not created yet.
	}

	return true;
}
// Destroys the timestamp query pool and then the command pool created by R_VkCombuf_Init().
// NOTE(review): the per-slot `used` flags are not touched here — confirm that callers close their combufs first.
void R_VkCombuf_Destroy( void ) {
vkDestroyQueryPool(vk_core.device, g_combuf.timestamp.pool, NULL);
R_VkCommandPoolDestroy(&g_combuf.pool);
}
// Hands out the first free command buffer slot and marks it as used.
// Returns NULL when all MAX_COMMANDBUFFERS slots are already taken.
vk_combuf_t* R_VkCombufOpen( void ) {
	vk_combuf_impl_t *const slots_end = g_combuf.combufs + MAX_COMMANDBUFFERS;

	for (vk_combuf_impl_t *slot = g_combuf.combufs; slot != slots_end; ++slot) {
		if (slot->used)
			continue;

		slot->used = 1;
		return &slot->public;
	}

	return NULL;
}
// Returns a command buffer obtained from R_VkCombufOpen() to the set of free slots.
// pub may be NULL (R_VkCombufOpen() returns NULL when no slot is free); that case is a no-op
// instead of a NULL dereference.
void R_VkCombufClose( vk_combuf_t* pub ) {
	if (!pub)
		return;

	// Valid because vk_combuf_t is the first member of vk_combuf_impl_t.
	vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub;
	cb->used = 0;

	// TODO synchronize?
}
// Starts recording a one-time-submit command buffer, resets this command buffer's
// range of timestamp queries and writes the "begin" timestamp into the first query of that range.
void R_VkCombufBegin( vk_combuf_t* pub ) {
	vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub;

	{
		const VkCommandBufferBeginInfo beginfo = {
			.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
			.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
		};
		XVK_CHECK(vkBeginCommandBuffer(cb->public.cmdbuf, &beginfo));
	}

	const VkCommandBuffer cmdbuf = cb->public.cmdbuf;
	const VkQueryPool query_pool = g_combuf.timestamp.pool;
	const uint32_t first_query = cb->profiler.timestamps_offset;

	// The reset is recorded before the first timestamp write to this range.
	vkCmdResetQueryPool(cmdbuf, query_pool, first_query, MAX_QUERY_COUNT);
	vkCmdWriteTimestamp(cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, query_pool, first_query);
}
// Writes the "end" timestamp into the second query of this command buffer's range
// and finishes recording the command buffer.
void R_VkCombufEnd( vk_combuf_t* pub ) {
	vk_combuf_impl_t *const cb = (vk_combuf_impl_t*)pub;
	const uint32_t end_query = cb->profiler.timestamps_offset + 1;

	vkCmdWriteTimestamp(cb->public.cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, g_combuf.timestamp.pool, end_query);

	XVK_CHECK(vkEndCommandBuffer(cb->public.cmdbuf));
}
// Placeholder: GPU scope registration is not implemented yet.
// Always returns -1, regardless of name.
int R_VkGpuScope_Register(const char *name) {
	(void)name; // FIXME unused until scope registration is implemented
	return -1;
}
// Placeholder: opening a profiling scope is not implemented yet.
// Always returns -1; no commands are recorded into combuf.
int R_VkCombufScopeBegin(vk_combuf_t* combuf, int scope_id) {
	// FIXME both arguments are unused until scopes are implemented
	(void)combuf;
	(void)scope_id;
	return -1;
}
// Placeholder: closing a profiling scope is not implemented yet.
// Does nothing; no commands are recorded into combuf.
void R_VkCombufScopeEnd(vk_combuf_t* combuf, int begin_index, VkPipelineStageFlagBits pipeline_stage) {
	// FIXME all arguments are unused until scopes are implemented
	(void)combuf;
	(void)begin_index;
	(void)pipeline_stage;
}

26
ref/vk/vk_combuf.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include "vk_core.h"
// Public handle of a command buffer managed by the vk_combuf module.
typedef struct vk_combuf_s {
// Raw Vulkan command buffer to record commands into.
VkCommandBuffer cmdbuf;
// Not enabled yet:
// VkSemaphore sema_done[2];
// VkFence fence_done;
} vk_combuf_t;
// Creates the command pool and the timestamp query pool. Returns false if the command pool cannot be created.
qboolean R_VkCombuf_Init( void );
// Destroys the timestamp query pool and the command pool.
void R_VkCombuf_Destroy( void );
// Returns a free command buffer slot and marks it as used, or returns NULL if all slots are taken.
vk_combuf_t* R_VkCombufOpen( void );
// Marks the slot as free again.
void R_VkCombufClose( vk_combuf_t* );
// Begins recording (one-time submit), resets this command buffer's timestamp queries and writes the begin timestamp.
void R_VkCombufBegin( vk_combuf_t* );
// Writes the end timestamp and ends recording.
void R_VkCombufEnd( vk_combuf_t* );
// GPU profiler scopes. The definitions in vk_combuf.c are FIXME stubs for now:
// R_VkGpuScope_Register() and R_VkCombufScopeBegin() always return -1, R_VkCombufScopeEnd() does nothing.
int R_VkGpuScope_Register(const char *name);
int R_VkCombufScopeBegin(vk_combuf_t*, int scope_id);
void R_VkCombufScopeEnd(vk_combuf_t*, int begin_index, VkPipelineStageFlagBits pipeline_stage);
// TODO r_vkgpu_scopes_t *R_VkGpuScopesGet( VkCommandBuffer cmdbuf );

View File

@ -10,7 +10,7 @@
#include "vk_image.h"
#include "vk_staging.h"
#include "vk_commandpool.h"
#include "vk_querypool.h"
#include "vk_combuf.h"
#include "profiler.h"
#include "r_speeds.h"
@ -31,15 +31,22 @@ typedef enum {
Phase_Submitted,
} frame_phase_t;
static struct {
vk_command_pool_t command;
VkSemaphore sem_framebuffer_ready[MAX_CONCURRENT_FRAMES];
VkSemaphore sem_done[MAX_CONCURRENT_FRAMES];
VkSemaphore sem_done2[MAX_CONCURRENT_FRAMES];
VkFence fence_done[MAX_CONCURRENT_FRAMES];
typedef struct {
vk_combuf_t *combuf;
VkFence fence_done;
VkSemaphore sem_framebuffer_ready;
VkSemaphore sem_done;
// TODO these should be tightly coupled with commandbuffers
vk_query_pool_t qpools[MAX_CONCURRENT_FRAMES];
// This extra semaphore is required because we need to synchronize 2 things on GPU:
// 1. swapchain
// 2. next frame command buffer
// Unfortunately, waiting on a semaphore also resets it once it is signaled,
// so we can't reuse the same one for both purposes and have to multiply entities (i.e. add a second semaphore)
VkSemaphore sem_done2;
} vk_framectl_frame_t;
static struct {
vk_framectl_frame_t frames[MAX_CONCURRENT_FRAMES];
struct {
int index;
@ -147,9 +154,10 @@ static VkRenderPass createRenderPass( VkFormat depth_format, qboolean ray_tracin
static void waitForFrameFence( void ) {
APROF_SCOPE_BEGIN(wait_for_frame_fence);
const VkFence fence_done[1] = {g_frame.frames[g_frame.current.index].fence_done};
for(qboolean loop = true; loop; ) {
#define MAX_WAIT (10ull * 1000*1000*1000)
const VkResult fence_result = vkWaitForFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index, VK_TRUE, MAX_WAIT);
const VkResult fence_result = vkWaitForFences(vk_core.device, COUNTOF(fence_done), fence_done, VK_TRUE, MAX_WAIT);
#undef MAX_WAIT
switch (fence_result) {
case VK_SUCCESS:
@ -163,7 +171,7 @@ static void waitForFrameFence( void ) {
}
}
XVK_CHECK(vkResetFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index));
XVK_CHECK(vkResetFences(vk_core.device, COUNTOF(fence_done), fence_done));
APROF_SCOPE_END(wait_for_frame_fence);
}
@ -193,15 +201,15 @@ void R_BeginFrame( qboolean clearScene ) {
APROF_SCOPE_DECLARE_BEGIN(begin_frame_tail, "R_BeginFrame_tail");
ASSERT(g_frame.current.phase == Phase_Submitted || g_frame.current.phase == Phase_Idle);
g_frame.current.index = (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES;
const VkCommandBuffer cmdbuf = vk_frame.cmdbuf = g_frame.command.buffers[g_frame.current.index];
vk_query_pool_t *const qpool = g_frame.qpools + g_frame.current.index;
vk_framectl_frame_t *const frame = g_frame.frames + g_frame.current.index;
{
waitForFrameFence();
// Current command buffer is done and available
// Previous might still be in flight
R_VkQueryPoolGetFrameResults(g_frame.qpools + g_frame.current.index);
// TODO R_VkQueryPoolGetFrameResults(g_frame.qpools + g_frame.current.index);
}
APROF_SCOPE_END(begin_frame_tail);
@ -213,8 +221,8 @@ void R_BeginFrame( qboolean clearScene ) {
{
// FIXME collect and show more gpu profiling data
const uint64_t gpu_time_begin_ns = (qpool->used) ? qpool->results[0] : 0;
const uint64_t gpu_time_end_ns = (qpool->used) ? qpool->results[1] : 0;
const uint64_t gpu_time_begin_ns = 0;// FIXME (qpool->used) ? qpool->results[0] : 0;
const uint64_t gpu_time_end_ns = 0;// FIXME (qpool->used) ? qpool->results[1] : 0;
R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_begin_ns, gpu_time_end_ns);
}
@ -229,21 +237,13 @@ void R_BeginFrame( qboolean clearScene ) {
R_VkStagingFrameBegin();
g_frame.current.framebuffer = R_VkSwapchainAcquire( g_frame.sem_framebuffer_ready[g_frame.current.index] );
g_frame.current.framebuffer = R_VkSwapchainAcquire( frame->sem_framebuffer_ready );
vk_frame.width = g_frame.current.framebuffer.width;
vk_frame.height = g_frame.current.framebuffer.height;
VK_RenderBegin( vk_frame.rtx_enabled );
{
const VkCommandBufferBeginInfo beginfo = {
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
};
XVK_CHECK(vkBeginCommandBuffer(cmdbuf, &beginfo));
}
R_VkQueryPoolBegin(g_frame.qpools + g_frame.current.index, cmdbuf);
R_VkCombufBegin( frame->combuf );
g_frame.current.phase = Phase_FrameBegan;
APROF_SCOPE_END(begin_frame);
@ -256,7 +256,7 @@ void VK_RenderFrame( const struct ref_viewpass_s *rvp )
APROF_SCOPE_END(render_frame);
}
static void enqueueRendering( VkCommandBuffer cmdbuf ) {
static void enqueueRendering( vk_combuf_t* combuf ) {
const VkClearValue clear_value[] = {
{.color = {{1., 0., 0., 0.}}},
{.depthStencil = {1., 0.}} // TODO reverse-z
@ -264,10 +264,11 @@ static void enqueueRendering( VkCommandBuffer cmdbuf ) {
ASSERT(g_frame.current.phase == Phase_FrameBegan);
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
VK_Render_FIXME_Barrier(cmdbuf);
if (vk_frame.rtx_enabled)
VK_RenderEndRTX( cmdbuf, g_frame.current.framebuffer.view, g_frame.current.framebuffer.image, g_frame.current.framebuffer.width, g_frame.current.framebuffer.height );
VK_RenderEndRTX( combuf, g_frame.current.framebuffer.view, g_frame.current.framebuffer.image, g_frame.current.framebuffer.width, g_frame.current.framebuffer.height );
{
VkRenderPassBeginInfo rpbi = {
@ -305,11 +306,15 @@ static void enqueueRendering( VkCommandBuffer cmdbuf ) {
g_frame.current.phase = Phase_RenderingEnqueued;
}
static void submit( VkCommandBuffer cmdbuf, qboolean wait ) {
static void submit( vk_combuf_t* combuf, qboolean wait ) {
ASSERT(g_frame.current.phase == Phase_RenderingEnqueued);
R_VkQueryPoolEnd(g_frame.qpools + g_frame.current.index, cmdbuf);
XVK_CHECK(vkEndCommandBuffer(cmdbuf));
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
vk_framectl_frame_t *const frame = g_frame.frames + g_frame.current.index;
vk_framectl_frame_t *const prev_frame = g_frame.frames + (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES;
R_VkCombufEnd(combuf);
const VkCommandBuffer cmdbufs[] = {
R_VkStagingFrameEnd(),
@ -321,13 +326,16 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) {
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
};
// TODO for RT renderer we only touch framebuffer at the very end of rendering/cmdbuf.
// Can we postpone waiting for the framebuffer semaphore until we actually need it?
const VkSemaphore waitophores[] = {
g_frame.sem_framebuffer_ready[g_frame.current.index],
g_frame.sem_done2[(g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES],
frame->sem_framebuffer_ready,
prev_frame->sem_done2,
};
const VkSemaphore signalphores[] = {
g_frame.sem_done[g_frame.current.index],
g_frame.sem_done2[g_frame.current.index],
frame->sem_done,
frame->sem_done2,
};
const VkSubmitInfo subinfo = {
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
@ -341,16 +349,16 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) {
.pSignalSemaphores = signalphores,
};
//gEngine.Con_Printf("SYNC: wait for semaphore %d, signal semaphore %d\n", (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES, g_frame.current.index);
XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, g_frame.fence_done[g_frame.current.index]));
XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, frame->fence_done));
g_frame.current.phase = Phase_Submitted;
}
R_VkSwapchainPresent(g_frame.current.framebuffer.index, g_frame.sem_done[g_frame.current.index]);
R_VkSwapchainPresent(g_frame.current.framebuffer.index, frame->sem_done);
g_frame.current.framebuffer = (r_vk_swapchain_framebuffer_t){0};
if (wait) {
APROF_SCOPE_BEGIN(frame_gpu_wait);
XVK_CHECK(vkWaitForFences(vk_core.device, 1, g_frame.fence_done + g_frame.current.index, VK_TRUE, INT64_MAX));
XVK_CHECK(vkWaitForFences(vk_core.device, 1, &frame->fence_done, VK_TRUE, INT64_MAX));
APROF_SCOPE_END(frame_gpu_wait);
/* if (vk_core.debug) { */
@ -362,7 +370,7 @@ static void submit( VkCommandBuffer cmdbuf, qboolean wait ) {
}
inline static VkCommandBuffer currentCommandBuffer( void ) {
return g_frame.command.buffers[g_frame.current.index];
return g_frame.frames[g_frame.current.index].combuf->cmdbuf;
}
void R_EndFrame( void )
@ -370,12 +378,10 @@ void R_EndFrame( void )
APROF_SCOPE_BEGIN_EARLY(end_frame);
if (g_frame.current.phase == Phase_FrameBegan) {
const VkCommandBuffer cmdbuf = currentCommandBuffer();
enqueueRendering( cmdbuf );
submit( cmdbuf, false );
vk_combuf_t *const combuf = g_frame.frames[g_frame.current.index].combuf;
enqueueRendering( combuf );
submit( combuf, false );
//submit( cmdbuf, true );
vk_frame.cmdbuf = VK_NULL_HANDLE;
}
APROF_SCOPE_END(end_frame);
@ -395,8 +401,6 @@ qboolean VK_FrameCtlInit( void )
const VkFormat depth_format = findSupportedImageFormat(depth_formats, VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT);
g_frame.command = R_VkCommandPoolCreate( MAX_CONCURRENT_FRAMES );
// FIXME move this out to renderers
vk_frame.render_pass.raster = createRenderPass(depth_format, false);
if (vk_core.rtx)
@ -406,16 +410,17 @@ qboolean VK_FrameCtlInit( void )
return false;
for (int i = 0; i < MAX_CONCURRENT_FRAMES; ++i) {
g_frame.sem_framebuffer_ready[i] = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(g_frame.sem_framebuffer_ready[i], VK_OBJECT_TYPE_SEMAPHORE, "framebuffer_ready[%d]", i);
g_frame.sem_done[i] = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(g_frame.sem_done[i], VK_OBJECT_TYPE_SEMAPHORE, "done[%d]", i);
g_frame.sem_done2[i] = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(g_frame.sem_done2[i], VK_OBJECT_TYPE_SEMAPHORE, "done2[%d]", i);
g_frame.fence_done[i] = R_VkFenceCreate(true);
SET_DEBUG_NAMEF(g_frame.fence_done[i], VK_OBJECT_TYPE_FENCE, "done[%d]", i);
vk_framectl_frame_t *const frame = g_frame.frames + i;
frame->combuf = R_VkCombufOpen();
R_VkQueryPoolInit(g_frame.qpools + i);
frame->sem_framebuffer_ready = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(frame->sem_framebuffer_ready, VK_OBJECT_TYPE_SEMAPHORE, "framebuffer_ready[%d]", i);
frame->sem_done = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(frame->sem_done, VK_OBJECT_TYPE_SEMAPHORE, "done[%d]", i);
frame->sem_done2 = R_VkSemaphoreCreate();
SET_DEBUG_NAMEF(frame->sem_done2, VK_OBJECT_TYPE_SEMAPHORE, "done2[%d]", i);
frame->fence_done = R_VkFenceCreate(true);
SET_DEBUG_NAMEF(frame->fence_done, VK_OBJECT_TYPE_FENCE, "done[%d]", i);
}
// Signal first frame semaphore as done
@ -430,10 +435,9 @@ qboolean VK_FrameCtlInit( void )
.pWaitSemaphores = NULL,
.pWaitDstStageMask = &stageflags,
.signalSemaphoreCount = 1,
.pSignalSemaphores = g_frame.sem_done2 + 0,
.pSignalSemaphores = &g_frame.frames[0].sem_done2,
};
XVK_CHECK(vkQueueSubmit(vk_core.queue, 1, &subinfo, VK_NULL_HANDLE));
//gEngine.Con_Printf("SYNC: signal semaphore %d\n", 0);
}
vk_frame.rtx_enabled = vk_core.rtx;
@ -447,11 +451,12 @@ qboolean VK_FrameCtlInit( void )
void VK_FrameCtlShutdown( void ) {
for (int i = 0; i < MAX_CONCURRENT_FRAMES; ++i) {
R_VkSemaphoreDestroy(g_frame.sem_framebuffer_ready[i]);
R_VkSemaphoreDestroy(g_frame.sem_done[i]);
R_VkSemaphoreDestroy(g_frame.sem_done2[i]);
R_VkFenceDestroy(g_frame.fence_done[i]);
R_VkQueryPoolDestroy(g_frame.qpools + i);
vk_framectl_frame_t *const frame = g_frame.frames + i;
R_VkCombufClose(frame->combuf);
R_VkSemaphoreDestroy(frame->sem_framebuffer_ready);
R_VkSemaphoreDestroy(frame->sem_done);
R_VkSemaphoreDestroy(frame->sem_done2);
R_VkFenceDestroy(frame->fence_done);
}
R_VkSwapchainShutdown();
@ -459,8 +464,6 @@ void VK_FrameCtlShutdown( void ) {
vkDestroyRenderPass(vk_core.device, vk_frame.render_pass.raster, NULL);
if (vk_core.rtx)
vkDestroyRenderPass(vk_core.device, vk_frame.render_pass.after_ray_tracing, NULL);
R_VkCommandPoolDestroy( &g_frame.command );
}
static qboolean canBlitFromSwapchainToFormat( VkFormat dest_format ) {
@ -487,7 +490,9 @@ static rgbdata_t *XVK_ReadPixels( void ) {
const VkImage frame_image = g_frame.current.framebuffer.image;
rgbdata_t *r_shot = NULL;
qboolean blit = canBlitFromSwapchainToFormat( dest_format );
const VkCommandBuffer cmdbuf = currentCommandBuffer();
vk_combuf_t *const combuf = g_frame.frames[g_frame.current.index].combuf;
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
if (frame_image == VK_NULL_HANDLE) {
gEngine.Con_Printf(S_ERROR "no current image, can't take screenshot\n");
@ -513,7 +518,7 @@ static rgbdata_t *XVK_ReadPixels( void ) {
}
// Make sure that all rendering ops are enqueued
enqueueRendering( cmdbuf );
enqueueRendering( combuf );
{
// Barrier 1: dest image
@ -618,7 +623,7 @@ static rgbdata_t *XVK_ReadPixels( void ) {
0, 0, NULL, 0, NULL, ARRAYSIZE(image_barrier), image_barrier);
}
submit( cmdbuf, true );
submit( combuf, true );
// copy bytes to buffer
{

View File

@ -5,13 +5,11 @@
#define MAX_CONCURRENT_FRAMES 2
// TODO most of the things below should not be global. Instead, they should be passed as an argument/context to all the drawing functions that want this info
typedef struct vk_framectl_s {
// TODO only used from 2d, remove
// TODO only used from 2d and r_speeds, remove
uint32_t width, height;
// FIXME
VkCommandBuffer cmdbuf;
// TODO move these into renderer and 2d
struct {
// Used when the entire rendering is traditional triangle rasterization

99
ref/vk/vk_gpurofl.c Normal file
View File

@ -0,0 +1,99 @@
#include "vk_gpurofl.h"
#include "vk_querypool.h"
#define MAX_SCOPES 64
#define MAX_COMMANDBUFFERS 8
// A registered GPU profiler scope.
typedef struct {
// Stored exactly as passed to R_VkGpuScopeRegister(), not copied, so the string must outlive the scope.
const char *name;
} r_vkgpu_scope_t;
#define EVENT_BEGIN 0x100
// B....E
// B....E
// -> B..B.E..E
// -> B.......E
// -> B.E
// Associates a command buffer with the query pool that its timestamps are written to.
typedef struct {
// A null handle marks the slot as unused (see R_VkGpuBegin() and getQueryPool()).
VkCommandBuffer cmdbuf;
vk_query_pool_t *qpool;
// NOTE(review): not accessed by the code visible here; presumably one entry per timestamp query — confirm.
uint32_t events[MAX_QUERY_COUNT];
} r_vkgpu_cmdbuf_assoc_t;
// Module-wide GPU profiler state.
static struct {
// Registered scopes; a scope id is an index into this array. scopes_count is the number of valid entries.
r_vkgpu_scope_t scopes[MAX_SCOPES];
int scopes_count;
// FIXME couple these more tightly
// Command buffer to query pool associations, filled front to back by R_VkGpuBegin().
r_vkgpu_cmdbuf_assoc_t assocs[MAX_COMMANDBUFFERS];
// NOTE(review): not accessed by the code visible here.
r_vkgpu_scopes_t last_frame;
} g_purofl;
// Registers a new GPU profiler scope and returns its id (its index in g_purofl.scopes).
// Returns -1 and prints an error if MAX_SCOPES scopes are already registered.
// name is stored by pointer, not copied.
int R_VkGpuScopeRegister(const char *name) {
	const int scope_id = g_purofl.scopes_count;

	if (scope_id == MAX_SCOPES) {
		gEngine.Con_Printf(S_ERROR "Cannot register GPU profiler scope \"%s\": max number of scope %d reached\n", name, MAX_SCOPES);
		return -1;
	}

	g_purofl.scopes[scope_id].name = name;
	g_purofl.scopes_count = scope_id + 1;
	return scope_id;
}
// Associates cmdbuf with qpool: reuses the slot that already holds cmdbuf,
// or claims the first empty slot. Asserts if no slot is available.
void R_VkGpuBegin(VkCommandBuffer cmdbuf, vk_query_pool_t *qpool) {
	for (int i = 0; i < MAX_COMMANDBUFFERS; ++i) {
		r_vkgpu_cmdbuf_assoc_t *const assoc = &g_purofl.assocs[i];

		// An empty slot is claimed; a slot that already holds cmdbuf only gets its pool replaced
		// (re-storing the same handle is a no-op).
		if (!assoc->cmdbuf || assoc->cmdbuf == cmdbuf) {
			assoc->cmdbuf = cmdbuf;
			assoc->qpool = qpool;
			return;
		}
	}

	ASSERT(!"FIXME Cannot associate cmdbuf with query pool, slots exceeded");
}
// Looks up the query pool associated with cmdbuf by R_VkGpuBegin().
// Returns NULL if cmdbuf has no association.
static vk_query_pool_t *getQueryPool(VkCommandBuffer cmdbuf) {
	// Slots are filled front to back by R_VkGpuBegin(), so the first empty one ends the search.
	for (int i = 0; i < MAX_COMMANDBUFFERS && g_purofl.assocs[i].cmdbuf; ++i) {
		if (g_purofl.assocs[i].cmdbuf == cmdbuf)
			return g_purofl.assocs[i].qpool;
	}

	return NULL;
}
// Writes a timestamp for the given scope into the query pool associated with cmdbuf.
// Does nothing if scope_id is negative (e.g. a failed scope registration) or if cmdbuf
// has no associated query pool.
// begin is meant to tell scope begin from scope end; it is not used yet (see step 3).
static void writeTimestamp(VkCommandBuffer cmdbuf, int scope_id, VkPipelineStageFlagBits stage, int begin) {
	if (scope_id < 0)
		return;

	// 1. Find query pool for the cmdbuf
	vk_query_pool_t *const qpool = getQueryPool(cmdbuf);
	if (!qpool) // TODO complain?
		return;

	// 2. Write timestamp
	const int timestamp_id = R_VkQueryPoolTimestamp(qpool, cmdbuf, stage);

	// 3. TODO Associate timestamp index with scope_begin.
	// Until that is implemented, mark the inputs of this step as intentionally unused.
	(void)timestamp_id;
	(void)begin;
}
/* int R_VkGpuScopeBegin(VkCommandBuffer cmdbuf, int scope_id) { */
/* writeTimestamp(cmdbuf, scope_id, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 1); */
/* } */
/* */
/* void R_VkGpuScopeEnd(VkCommandBuffer cmdbuf, int begin_index, VkPipelineStageFlagBits pipeline_stage) { */
/* writeTimestamp(cmdbuf, scope_id, pipeline_stage, 0); */
/* } */

30
ref/vk/vk_gpurofl.h Normal file
View File

@ -0,0 +1,30 @@
#pragma once
#include "vk_core.h"
// Return scope_id for the new scope. -1 if failed
// name is expected to be statically allocated
int R_VkGpuScopeRegister(const char *name);
typedef struct vk_query_pool_s vk_query_pool_t;
void R_VkGpuBegin(VkCommandBuffer cmdbuf, vk_query_pool_t *qpool);
// Returns begin_index to use in R_VkGpuScopeEnd
// NOTE(review): the definitions of R_VkGpuScopeBegin()/R_VkGpuScopeEnd() in vk_gpurofl.c are currently commented out.
int R_VkGpuScopeBegin(VkCommandBuffer cmdbuf, int scope_id);
// Ends the scope started by the matching R_VkGpuScopeBegin() call; begin_index is that call's return value.
void R_VkGpuScopeEnd(VkCommandBuffer cmdbuf, int begin_index, VkPipelineStageFlagBits pipeline_stage);
// One timed occurrence of a GPU profiler scope.
typedef struct {
const char *name;
// Scope begin/end times; the _ns suffix suggests nanoseconds — TODO confirm the time base.
uint64_t begin_ns, end_ns;
} r_vkgpu_scope_entry_t;
// List of scope entries, as returned by R_VkGpuScopesGet().
typedef struct {
// Presumably an array of scopes_count entries — confirm once R_VkGpuScopesGet() is implemented.
r_vkgpu_scope_entry_t *scopes;
int scopes_count;
} r_vkgpu_scopes_t;
// Reads all the scope timing data (timestamp queries) and returns a list of things happened this frame.
// Prerequisite: all relevant recorded command buffers should've been completed and waited on already.
// The returned pointer remains valid until any next R_VkGpu*() call.
r_vkgpu_scopes_t *R_VkGpuScopesGet( VkCommandBuffer cmdbuf );

View File

@ -421,10 +421,10 @@ void R_VkMeatpipeDestroy(vk_meatpipe_t *mp) {
Mem_Free(mp);
}
void R_VkMeatpipePerform(vk_meatpipe_t *mp, VkCommandBuffer cmdbuf, vk_meatpipe_perfrom_args_t args) {
void R_VkMeatpipePerform(vk_meatpipe_t *mp, struct vk_combuf_s *combuf, vk_meatpipe_perfrom_args_t args) {
for (int i = 0; i < mp->passes_count; ++i) {
const vk_meatpipe_pass_t *pass = mp->passes + i;
RayPassPerform(pass->pass, cmdbuf,
RayPassPerform(pass->pass, combuf,
(ray_pass_perform_args_t){
.frame_set_slot = args.frame_set_slot,
.width = args.width,

View File

@ -42,4 +42,5 @@ typedef struct vk_meatpipe_perfrom_args_s {
const vk_resource_p *resources;
} vk_meatpipe_perfrom_args_t;
void R_VkMeatpipePerform(vk_meatpipe_t *mp, VkCommandBuffer cmdbuf, vk_meatpipe_perfrom_args_t args);
struct vk_combuf_s;
void R_VkMeatpipePerform(vk_meatpipe_t *mp, struct vk_combuf_s *combuf, vk_meatpipe_perfrom_args_t args);

View File

@ -1,6 +1,7 @@
#include "vk_pipeline.h"
#include "vk_framectl.h" // VkRenderPass
#include "vk_combuf.h"
#include "eiface.h"
@ -369,7 +370,9 @@ void VK_PipelineRayTracingDestroy(vk_pipeline_ray_t* pipeline) {
pipeline->pipeline = VK_NULL_HANDLE;
}
void VK_PipelineRayTracingTrace(VkCommandBuffer cmdbuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height) {
void VK_PipelineRayTracingTrace(vk_combuf_t *combuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height, int scope_id) {
// TODO bind this and accepts descriptors as args? vkCmdBindPipeline(cmdbuf, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, pipeline->pipeline);
vkCmdTraceRaysKHR(cmdbuf, &pipeline->sbt.raygen, &pipeline->sbt.miss, &pipeline->sbt.hit, &pipeline->sbt.callable, width, height, 1 );
const int begin_id = R_VkCombufScopeBegin(combuf, scope_id);
vkCmdTraceRaysKHR(combuf->cmdbuf, &pipeline->sbt.raygen, &pipeline->sbt.miss, &pipeline->sbt.hit, &pipeline->sbt.callable, width, height, 1 );
R_VkCombufScopeEnd(combuf, begin_id, VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR);
}

View File

@ -80,7 +80,8 @@ typedef struct {
} vk_pipeline_ray_t;
vk_pipeline_ray_t VK_PipelineRayTracingCreate(const vk_pipeline_ray_create_info_t *create);
void VK_PipelineRayTracingTrace(VkCommandBuffer cmdbuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height);
struct vk_combuf_s;
void VK_PipelineRayTracingTrace(struct vk_combuf_s *combuf, const vk_pipeline_ray_t *pipeline, uint32_t width, uint32_t height, int scope_id);
void VK_PipelineRayTracingDestroy(vk_pipeline_ray_t* pipeline);

View File

@ -80,9 +80,10 @@ void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) {
vkGetQueryPoolResults(vk_core.device, pool->pool, 0, pool->used, pool->used * sizeof(uint64_t), pool->results, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
const uint64_t timestamp_offset_ns = getGpuTimestampOffsetNs( pool );
const double timestamp_period = vk_core.physical_device.properties.limits.timestampPeriod;
for (int i = 0; i < pool->used; ++i) {
const uint64_t gpu_ns = pool->results[i] * (double)vk_core.physical_device.properties.limits.timestampPeriod;
const uint64_t gpu_ns = pool->results[i] * timestamp_period;
pool->results[i] = timestamp_offset_ns + gpu_ns;
}
}

View File

@ -4,7 +4,7 @@
#define MAX_QUERY_COUNT 128
typedef struct {
typedef struct vk_query_pool_s {
VkQueryPool pool;
int used;
uint64_t results[MAX_QUERY_COUNT];

View File

@ -649,14 +649,14 @@ void VK_RenderDebugLabelEnd( void )
drawCmdPushDebugLabelEnd();
}
void VK_RenderEndRTX( VkCommandBuffer cmdbuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h )
void VK_RenderEndRTX( struct vk_combuf_s* combuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h )
{
const VkBuffer geom_buffer = R_GeometryBuffer_Get();
ASSERT(vk_core.rtx);
{
const vk_ray_frame_render_args_t args = {
.cmdbuf = cmdbuf,
.combuf = combuf,
.dst = {
.image_view = img_dst_view,
.image = img_dst,

View File

@ -112,6 +112,7 @@ void VK_RenderDebugLabelEnd( void );
void VK_RenderBegin( qboolean ray_tracing );
void VK_RenderEnd( VkCommandBuffer cmdbuf );
void VK_RenderEndRTX( VkCommandBuffer cmdbuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h );
struct vk_combuf_s;
void VK_RenderEndRTX( struct vk_combuf_s* combuf, VkImageView img_dst_view, VkImage img_dst, uint32_t w, uint32_t h );
void VK_Render_FIXME_Barrier( VkCommandBuffer cmdbuf );

View File

@ -16,6 +16,7 @@
#include "vk_staging.h"
#include "vk_textures.h"
#include "vk_previous_frame.h"
#include "vk_combuf.h"
#include "alolcator.h"
@ -173,7 +174,8 @@ typedef struct {
const vk_lights_bindings_t *light_bindings;
} perform_tracing_args_t;
static void performTracing(VkCommandBuffer cmdbuf, const perform_tracing_args_t* args) {
static void performTracing( vk_combuf_t *combuf, const perform_tracing_args_t* args) {
const VkCommandBuffer cmdbuf = combuf->cmdbuf;
// TODO move this to "TLAS producer"
g_rtx.res[ExternalResource_tlas].resource = (vk_resource_t){
.type = VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR,
@ -325,7 +327,7 @@ static void performTracing(VkCommandBuffer cmdbuf, const perform_tracing_args_t*
g_rtx.mainpipe_resources[i]->value.image_object = &res->image;
}
R_VkMeatpipePerform(g_rtx.mainpipe, cmdbuf, (vk_meatpipe_perfrom_args_t) {
R_VkMeatpipePerform(g_rtx.mainpipe, combuf, (vk_meatpipe_perfrom_args_t) {
.frame_set_slot = args->frame_index,
.width = FRAME_WIDTH,
.height = FRAME_HEIGHT,
@ -533,7 +535,7 @@ fail:
void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
{
const VkCommandBuffer cmdbuf = args->cmdbuf;
const VkCommandBuffer cmdbuf = args->combuf->cmdbuf;
// const xvk_ray_frame_images_t* current_frame = g_rtx.frames + (g_rtx.frame_number % 2);
ASSERT(vk_core.rtx);
@ -587,7 +589,7 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
.fov_angle_y = args->fov_angle_y,
.light_bindings = &light_bindings,
};
performTracing( cmdbuf, &trace_args );
performTracing( args->combuf, &trace_args );
}
}

View File

@ -26,7 +26,7 @@ typedef struct {
} vk_buffer_region_t;
typedef struct {
VkCommandBuffer cmdbuf;
struct vk_combuf_s *combuf;
struct {
VkImageView image_view;

View File

@ -69,6 +69,8 @@ void R_VkStagingShutdown(void) {
R_VkCommandPoolDestroy( &g_staging.upload_pool );
}
// FIXME There's a severe race condition here. Submitting things manually and prematurely (before framectl had a chance to synchronize with the previous frame)
// may lead to data races and memory corruption (e.g. writing into memory that's being read in some pipeline stage still going)
void R_VkStagingFlushSync( void ) {
APROF_SCOPE_DECLARE_BEGIN(function, __FUNCTION__);