vk: display gpu time taken for a previous frame

its value will be incorrect for non-nv cards which have non-1 timestampPeriod
This commit is contained in:
Ivan 'provod' Avdeev 2023-03-18 13:19:50 -07:00 committed by Ivan Avdeev
parent be95b65b22
commit 93a539df74
6 changed files with 105 additions and 19 deletions

View File

@ -108,10 +108,13 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin
}
// FIXME move this to r_speeds or something like that
void R_ShowExtendedProfilingData(uint32_t prev_frame_index) {
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns) {
APROF_SCOPE_DECLARE_BEGIN(__FUNCTION__, __FUNCTION__);
int line = 4;
{
const int dirty = g_lights.stats.dirty_cells;
gEngine.Con_NPrintf(4, "Dirty light cells: %d, size = %dKiB, ranges = %d\n", dirty, (int)(dirty * sizeof(struct LightCluster) / 1024), g_lights.stats.ranges_uploaded);
gEngine.Con_NPrintf(line++, "Dirty light cells: %d, size = %dKiB, ranges = %d\n", dirty, (int)(dirty * sizeof(struct LightCluster) / 1024), g_lights.stats.ranges_uploaded);
}
const uint32_t events = g_aprof.events_last_frame - prev_frame_index;
@ -119,13 +122,14 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index) {
const unsigned long long delta_ns = APROF_EVENT_TIMESTAMP(g_aprof.events[g_aprof.events_last_frame]) - frame_begin_time;
const float frame_time = delta_ns / 1e6;
gEngine.Con_NPrintf(5, "aprof events this frame: %u, wraps: %d, frame time: %.03fms\n", events, g_aprof.current_frame_wraparounds, frame_time);
gEngine.Con_NPrintf(line++, "GPU frame time: %.03fms\n", gpu_time_ns * 1e-6);
gEngine.Con_NPrintf(line++, "aprof events this frame: %u, wraps: %d, frame time: %.03fms\n", events, g_aprof.current_frame_wraparounds, frame_time);
g_slows.frame_times[g_slows.frame_num] = frame_time;
g_slows.frame_num = (g_slows.frame_num + 1) % MAX_FRAMES_HISTORY;
const float width = (float)vk_frame.width / MAX_FRAMES_HISTORY;
const int frame_bar_y = 100;
const int frame_bar_y = 100; // TODO font_height * scale * (line + 1)
const float frame_bar_y_scale = 2.f; // ms to pixels
// 60fps
@ -173,16 +177,7 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index) {
drawProfilerScopes(events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
}
/* gEngine.Con_NPrintf(5, "Perf scopes:"); */
/* for (int i = 0; i < g_aprof.num_scopes; ++i) { */
/* const aprof_scope_t *const scope = g_aprof.scopes + i; */
/* gEngine.Con_NPrintf(6 + i, "%s: c%d t%.03f(%.03f)ms s%.03f(%.03f)ms", scope->name, */
/* scope->frame.count, */
/* scope->frame.duration / 1e6, */
/* (scope->frame.duration / 1e6) / scope->frame.count, */
/* (scope->frame.duration - scope->frame.duration_children) / 1e6, */
/* (scope->frame.duration - scope->frame.duration_children) / 1e6 / scope->frame.count); */
/* } */
APROF_SCOPE_END(__FUNCTION__);
}
static void togglePause( void ) {

View File

@ -3,4 +3,4 @@
void R_SlowsInit( void );
void R_ShowExtendedProfilingData(uint32_t prev_frame_event_index);
void R_ShowExtendedProfilingData(uint32_t prev_frame_event_index, uint64_t gpu_time_ns);

View File

@ -246,6 +246,11 @@ do { \
X(vkGetImageSubresourceLayout) \
X(vkCmdSetCheckpointNV) \
X(vkGetQueueCheckpointDataNV) \
X(vkCreateQueryPool) \
X(vkDestroyQueryPool) \
X(vkCmdResetQueryPool) \
X(vkCmdWriteTimestamp) \
X(vkGetQueryPoolResults) \
#define DEVICE_FUNCS_RTX(X) \
X(vkGetAccelerationStructureBuildSizesKHR) \

View File

@ -10,6 +10,7 @@
#include "vk_image.h"
#include "vk_staging.h"
#include "vk_commandpool.h"
#include "vk_querypool.h"
#include "profiler.h"
#include "r_slows.h"
@ -22,6 +23,7 @@ extern ref_globals_t *gpGlobals;
vk_framectl_t vk_frame = {0};
// Phase tracking is needed for getting screenshots. Basically, getting a screenshot does the same things as R_EndFrame, and they need to be congruent.
typedef enum {
Phase_Idle,
Phase_FrameBegan,
@ -36,6 +38,9 @@ static struct {
VkSemaphore sem_done2[MAX_CONCURRENT_FRAMES];
VkFence fence_done[MAX_CONCURRENT_FRAMES];
// TODO these should be tightly coupled with commandbuffers
vk_query_pool_t qpools[MAX_CONCURRENT_FRAMES];
qboolean rtx_enabled;
struct {
@ -187,19 +192,33 @@ static void updateGamma( void ) {
}
void R_BeginFrame( qboolean clearScene ) {
APROF_SCOPE_DECLARE_BEGIN(begin_frame_tail, "R_BeginFrame_tail");
ASSERT(g_frame.current.phase == Phase_Submitted || g_frame.current.phase == Phase_Idle);
const int prev_frame_index = g_frame.current.index % MAX_CONCURRENT_FRAMES;
g_frame.current.index = (g_frame.current.index + 1) % MAX_CONCURRENT_FRAMES;
const VkCommandBuffer cmdbuf = vk_frame.cmdbuf = g_frame.command.buffers[g_frame.current.index];
vk_query_pool_t *const qpool = g_frame.qpools + g_frame.current.index;
{
const uint32_t prev_frame_event_index = aprof_scope_frame();
R_ShowExtendedProfilingData(prev_frame_event_index);
waitForFrameFence();
// Current command buffer is done and available
// Previous might still be in flight
R_VkQueryPoolGetFrameResults(g_frame.qpools + g_frame.current.index);
}
APROF_SCOPE_END(begin_frame_tail);
const uint32_t prev_frame_event_index = aprof_scope_frame();
APROF_SCOPE_BEGIN(frame);
APROF_SCOPE_BEGIN(begin_frame);
{
// FIXME correct only for nvidia where timestamps are ns
const uint64_t gpu_time_ns_fixme = (qpool->used) ? qpool->results[1] - qpool->results[0] : 0;
R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_ns_fixme);
}
if (vk_core.rtx && FBitSet( vk_rtx->flags, FCVAR_CHANGED )) {
g_frame.rtx_enabled = CVAR_TO_BOOL( vk_rtx );
}
@ -209,7 +228,6 @@ void R_BeginFrame( qboolean clearScene ) {
ASSERT(!g_frame.current.framebuffer.framebuffer);
waitForFrameFence();
R_VkStagingFrameBegin();
g_frame.current.framebuffer = R_VkSwapchainAcquire( g_frame.sem_framebuffer_ready[g_frame.current.index] );
@ -226,6 +244,8 @@ void R_BeginFrame( qboolean clearScene ) {
XVK_CHECK(vkBeginCommandBuffer(cmdbuf, &beginfo));
}
R_VkQueryPoolBegin(g_frame.qpools + g_frame.current.index, cmdbuf);
g_frame.current.phase = Phase_FrameBegan;
APROF_SCOPE_END(begin_frame);
}
@ -289,6 +309,7 @@ static void enqueueRendering( VkCommandBuffer cmdbuf ) {
static void submit( VkCommandBuffer cmdbuf, qboolean wait ) {
ASSERT(g_frame.current.phase == Phase_RenderingEnqueued);
R_VkQueryPoolEnd(g_frame.qpools + g_frame.current.index, cmdbuf);
XVK_CHECK(vkEndCommandBuffer(cmdbuf));
const VkCommandBuffer cmdbufs[] = {
@ -394,6 +415,8 @@ qboolean VK_FrameCtlInit( void )
SET_DEBUG_NAMEF(g_frame.sem_done2[i], VK_OBJECT_TYPE_SEMAPHORE, "done2[%d]", i);
g_frame.fence_done[i] = R_VkFenceCreate(true);
SET_DEBUG_NAMEF(g_frame.fence_done[i], VK_OBJECT_TYPE_FENCE, "done[%d]", i);
R_VkQueryPoolInit(g_frame.qpools + i);
}
// Signal first frame semaphore as done
@ -429,6 +452,7 @@ void VK_FrameCtlShutdown( void ) {
R_VkSemaphoreDestroy(g_frame.sem_done[i]);
R_VkSemaphoreDestroy(g_frame.sem_done2[i]);
R_VkFenceDestroy(g_frame.fence_done[i]);
R_VkQueryPoolDestroy(g_frame.qpools + i);
}
R_VkSwapchainShutdown();

43
ref/vk/vk_querypool.c Normal file
View File

@ -0,0 +1,43 @@
#include "vk_querypool.h"
qboolean R_VkQueryPoolInit( vk_query_pool_t* pool ) {
const VkQueryPoolCreateInfo qpci = {
.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
.pNext = NULL,
.queryType = VK_QUERY_TYPE_TIMESTAMP,
.queryCount = MAX_QUERY_COUNT,
.flags = 0,
};
XVK_CHECK(vkCreateQueryPool(vk_core.device, &qpci, NULL, &pool->pool));
return true;
}
void R_VkQueryPoolDestroy( vk_query_pool_t *pool ) {
vkDestroyQueryPool(vk_core.device, pool->pool, NULL);
}
int R_VkQueryPoolTimestamp( vk_query_pool_t *pool, VkCommandBuffer cmdbuf, VkPipelineStageFlagBits stage) {
if (pool->used >= MAX_QUERY_COUNT)
return -1;
vkCmdWriteTimestamp(cmdbuf, stage, pool->pool, pool->used);
return pool->used++;
}
void R_VkQueryPoolBegin( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) {
pool->used = 0;
vkCmdResetQueryPool(cmdbuf, pool->pool, 0, MAX_QUERY_COUNT);
R_VkQueryPoolTimestamp(pool, cmdbuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
}
void R_VkQueryPoolEnd( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) {
R_VkQueryPoolTimestamp(pool, cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
}
void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) {
if (!pool->used)
return;
vkGetQueryPoolResults(vk_core.device, pool->pool, 0, pool->used, pool->used * sizeof(uint64_t), pool->results, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
}

19
ref/vk/vk_querypool.h Normal file
View File

@ -0,0 +1,19 @@
#pragma once
#include "vk_core.h"
#define MAX_QUERY_COUNT 128
typedef struct {
VkQueryPool pool;
int used;
uint64_t results[MAX_QUERY_COUNT];
} vk_query_pool_t;
qboolean R_VkQueryPoolInit( vk_query_pool_t *pool );
void R_VkQueryPoolDestroy( vk_query_pool_t *pool );
int R_VkQueryPoolTimestamp( vk_query_pool_t *pool, VkCommandBuffer cmdbuf, VkPipelineStageFlagBits stage);
void R_VkQueryPoolBegin( vk_query_pool_t *pool, VkCommandBuffer cmdbuf );
void R_VkQueryPoolEnd( vk_query_pool_t *pool, VkCommandBuffer cmdbuf );
void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool );