vk: profiler: draw total gpu time

Use VK_EXT_calibrated_timestamps to convert to host time.
This commit is contained in:
Ivan Avdeev 2023-03-18 15:53:19 -07:00 committed by Ivan Avdeev
parent a66c44a266
commit 22d4202ad9
8 changed files with 118 additions and 24 deletions

View File

@ -40,6 +40,12 @@ void aprof_scope_event(aprof_scope_id_t, int begin);
uint32_t aprof_scope_frame( void );
uint64_t aprof_time_now_ns( void );
#ifdef WIN32
uint64_t aprof_time_now_native_divider( void );
#else
#define aprof_time_now_native_divider() 1
#endif
typedef struct {
const char *name;
} aprof_scope_t;
@ -81,6 +87,8 @@ enum {
typedef uint64_t aprof_event_t;
typedef struct {
uint64_t time_begin_ns;
aprof_scope_t scopes[APROF_MAX_SCOPES];
int num_scopes;
@ -114,6 +122,9 @@ uint64_t aprof_time_now_ns( void ) {
QueryPerformanceCounter(&pc);
return pc.QuadPart * 1000000000ull / _aprof_frequency.QuadPart;
}
uint64_t aprof_time_now_native_divider( void ) {
return _aprof_frequency.QuadPart;
}
#else
#error aprof is not implemented for this os
#endif
@ -126,6 +137,9 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) {
QueryPerformanceFrequency(&_aprof_frequency);
#endif
if (!g_aprof.time_begin_ns)
g_aprof.time_begin_ns = aprof_time_now_ns();
if (g_aprof.num_scopes == APROF_MAX_SCOPES)
return -1;
@ -134,7 +148,7 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) {
}
void aprof_scope_event(aprof_scope_id_t scope_id, int begin) {
const uint64_t now = aprof_time_now_ns();
const uint64_t now = aprof_time_now_ns() - g_aprof.time_begin_ns;
if (scope_id < 0 || scope_id >= g_aprof.num_scopes)
return;
@ -146,7 +160,7 @@ void aprof_scope_event(aprof_scope_id_t scope_id, int begin) {
}
uint32_t aprof_scope_frame( void ) {
const uint64_t now = aprof_time_now_ns();
const uint64_t now = aprof_time_now_ns() - g_aprof.time_begin_ns;
const uint32_t previous_frame = g_aprof.events_last_frame;
g_aprof.events_last_frame = g_aprof.events_write;

View File

@ -36,7 +36,22 @@ static uint32_t getHash(const char *s) {
return CRC32_Final(crc);
}
static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
static void drawTimeBar(uint64_t begin_time_ns, float time_scale_ms, int64_t begin_ns, int64_t end_ns, int y, int height, int glyph_width, const char *label, const rgba_t color) {
const float delta_ms = (end_ns - begin_ns) * 1e-6;
const int width = delta_ms * time_scale_ms;
const int x = (begin_ns - begin_time_ns) * 1e-6 * time_scale_ms;
rgba_t text_color = {255-color[0], 255-color[1], 255-color[2], 255};
CL_FillRGBA(x, y, width, height, color[0], color[1], color[2], color[3]);
// Tweak this if scope names escape the block boundaries
char tmp[64];
tmp[0] = '\0';
Q_snprintf(tmp, Q_min(sizeof(tmp), width / glyph_width), "%s %.3fms", label, delta_ms);
gEngine.Con_DrawString(x, y, tmp, text_color);
}
static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
#define MAX_STACK_DEPTH 16
// hidpi scaling
@ -46,6 +61,7 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin
// TODO "20" is fine for the "default" font. Unfortunately we don't have any access to font metrics from here, ref_api_t doesn't give us anything about fonts. ;_;
const int height = 20 * scale;
const int estimated_glyph_width = 8 * scale;
struct {
int scope_id;
@ -77,24 +93,11 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin
ASSERT(stack[depth].scope_id == scope_id);
const float delta_ms = (timestamp_ns - stack[depth].begin_ns) * 1e-6;
const int width = delta_ms * time_scale_ms;
const int x = (stack[depth].begin_ns - frame_begin_time) * 1e-6 * time_scale_ms;
const int yy = y + depth * height;
const char *scope_name = g_aprof.scopes[scope_id].name;
const uint32_t hash = getHash(scope_name);
rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127};
rgba_t text_color = {255-color[0], 255-color[1], 255-color[2], 255};
CL_FillRGBA(x, yy, width, height, color[0], color[1], color[2], color[3]);
// Tweak this if scope names escape the block boundaries
const int estimated_glyph_width = 8;
char tmp[64];
tmp[0] = '\0';
Q_snprintf(tmp, Q_min(sizeof(tmp), width / estimated_glyph_width), "%s %.3fms", scope_name, delta_ms);
gEngine.Con_DrawString(x, yy, tmp, text_color);
const rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127};
drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * height, height, estimated_glyph_width, scope_name, color);
break;
}
@ -108,7 +111,7 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin
}
// FIXME move this to r_speeds or something like that
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns) {
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns) {
APROF_SCOPE_DECLARE_BEGIN(__FUNCTION__, __FUNCTION__);
int line = 4;
@ -122,7 +125,9 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns
const unsigned long long delta_ns = APROF_EVENT_TIMESTAMP(g_aprof.events[g_aprof.events_last_frame]) - frame_begin_time;
const float frame_time = delta_ns / 1e6;
const uint64_t gpu_time_ns = gpu_frame_end_ns - gpu_frame_begin_ns;
gEngine.Con_NPrintf(line++, "GPU frame time: %.03fms\n", gpu_time_ns * 1e-6);
gEngine.Con_NPrintf(line++, "aprof events this frame: %u, wraps: %d, frame time: %.03fms\n", events, g_aprof.current_frame_wraparounds, frame_time);
g_slows.frame_times[g_slows.frame_num] = frame_time;
@ -167,6 +172,23 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns
{
const int y = frame_bar_y + frame_bar_y_scale * TARGET_FRAME_TIME * 2 + 10;
// Draw latest 2 frames; find their boundaries
uint32_t rewind_frame = prev_frame_index;
for (int frame = 1; frame < 2;) {
rewind_frame = (rewind_frame - 1) % APROF_EVENT_BUFFER_SIZE; // NOTE: only correct for power-of-2 buffer sizes
const aprof_event_t event = g_aprof.events[rewind_frame];
// Exhausted all events
if (event == 0 || rewind_frame == g_aprof.events_write)
break;
// Note the frame
if (APROF_EVENT_TYPE(event) == APROF_EVENT_FRAME_BOUNDARY) {
++frame;
prev_frame_index = rewind_frame;
}
}
const aprof_event_t *const events = g_slows.paused_events ? g_slows.paused_events : g_aprof.events;
const int event_begin = g_slows.paused_events ? 0 : prev_frame_index;
const int event_end = g_slows.paused_events ? g_slows.paused_events_count - 1 : g_aprof.events_last_frame;
@ -175,6 +197,9 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns
const uint64_t delta_ns = frame_end_time - frame_begin_time;
const float time_scale_ms = (double)vk_frame.width / (delta_ns / 1e6);
drawProfilerScopes(events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
const rgba_t color = {255, 255, 0, 127};
drawTimeBar(frame_begin_time, time_scale_ms, gpu_frame_begin_ns, gpu_frame_end_ns, 10, 20, 8, "GPU TIME", color);
}
APROF_SCOPE_END(__FUNCTION__);

View File

@ -3,4 +3,4 @@
void R_SlowsInit( void );
void R_ShowExtendedProfilingData(uint32_t prev_frame_event_index, uint64_t gpu_time_ns);
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns);

View File

@ -103,6 +103,10 @@ static const char* device_extensions_nv_checkpoint[] = {
VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME,
};
static const char* device_extensions_extra[] = {
VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME,
};
VkBool32 VKAPI_PTR debugCallback(
VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
VkDebugUtilsMessageTypeFlagsEXT messageTypes,
@ -303,6 +307,7 @@ typedef struct {
qboolean anisotropy;
qboolean ray_tracing;
qboolean nv_checkpoint;
qboolean calibrated_timestamps;
} vk_available_device_t;
static int enumerateDevices( vk_available_device_t **available_devices ) {
@ -404,6 +409,7 @@ static int enumerateDevices( vk_available_device_t **available_devices ) {
devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_req, ARRAYSIZE(device_extensions_req));
devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_rt, ARRAYSIZE(device_extensions_rt));
devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint));
devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_extra, ARRAYSIZE(device_extensions_extra));
vkGetPhysicalDeviceFeatures2(physical_devices[i], &features);
this_device->anisotropy = features.features.samplerAnisotropy;
@ -415,6 +421,8 @@ static int enumerateDevices( vk_available_device_t **available_devices ) {
this_device->nv_checkpoint = vk_core.debug && deviceSupportsExtensions(extensions, num_device_extensions, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint));
gEngine.Con_Printf("\t\tNV checkpoints supported: %d\n", this_device->nv_checkpoint);
this_device->calibrated_timestamps = deviceSupportsExtensions(extensions, num_device_extensions, device_extensions_extra, ARRAYSIZE(device_extensions_extra));
Mem_Free(extensions);
}
@ -557,6 +565,9 @@ static qboolean createDevice( void ) {
if (candidate_device->nv_checkpoint)
device_extensions_count = appendDeviceExtensions(device_extensions, device_extensions_count, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint));
if (candidate_device->calibrated_timestamps)
device_extensions_count = appendDeviceExtensions(device_extensions, device_extensions_count, device_extensions_extra, ARRAYSIZE(device_extensions_extra));
VkDeviceCreateInfo create_info = {
.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
.pNext = head,

View File

@ -251,6 +251,7 @@ do { \
X(vkCmdResetQueryPool) \
X(vkCmdWriteTimestamp) \
X(vkGetQueryPoolResults) \
X(vkGetCalibratedTimestampsEXT) \
#define DEVICE_FUNCS_RTX(X) \
X(vkGetAccelerationStructureBuildSizesKHR) \

View File

@ -214,9 +214,10 @@ void R_BeginFrame( qboolean clearScene ) {
APROF_SCOPE_BEGIN(begin_frame);
{
// FIXME correct only for nvidia where timestamps are ns
const uint64_t gpu_time_ns_fixme = (qpool->used) ? qpool->results[1] - qpool->results[0] : 0;
R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_ns_fixme);
// FIXME collect and show more gpu profiling data
const uint64_t gpu_time_begin_ns = (qpool->used) ? qpool->results[0] : 0;
const uint64_t gpu_time_end_ns = (qpool->used) ? qpool->results[1] : 0;
R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_begin_ns, gpu_time_end_ns);
}
if (vk_core.rtx && FBitSet( vk_rtx->flags, FCVAR_CHANGED )) {

View File

@ -1,4 +1,5 @@
#include "vk_querypool.h"
#include "profiler.h" // for aprof_time_now_ns()
qboolean R_VkQueryPoolInit( vk_query_pool_t* pool ) {
const VkQueryPoolCreateInfo qpci = {
@ -33,6 +34,43 @@ void R_VkQueryPoolBegin( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) {
void R_VkQueryPoolEnd( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) {
R_VkQueryPoolTimestamp(pool, cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
pool->end_timestamp_ns = aprof_time_now_ns();
}
static uint64_t getGpuTimestampOffsetNs( const vk_query_pool_t *pool ) {
// FIXME this is an incorrect check, we need to carry per-device extensions availability somehow. vk_core-vs-device refactoring pending
if (!vkGetCalibratedTimestampsEXT) {
// Estimate based on supposed submission time, assuming that we submit, and it starts computing right after cmdbuffer closure
// which may not be true. But it's all we got
// TODO alternative approach: estimate based on end timestamp
const uint64_t gpu_begin_ns = (double)pool->results[0] * vk_core.physical_device.properties.limits.timestampPeriod;
return pool->end_timestamp_ns - gpu_begin_ns;
}
const VkCalibratedTimestampInfoEXT cti[2] = {
{
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT,
.pNext = NULL,
.timeDomain = VK_TIME_DOMAIN_DEVICE_EXT,
},
{
.sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT,
.pNext = NULL,
#ifdef WIN32
.timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT,
#else
.timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT,
#endif
},
};
uint64_t timestamps[2] = {0};
uint64_t max_deviation[2] = {0};
vkGetCalibratedTimestampsEXT(vk_core.device, 2, cti, timestamps, max_deviation);
const uint64_t cpu = (double)timestamps[1] / aprof_time_now_native_divider();
const uint64_t gpu = (double)timestamps[0] * vk_core.physical_device.properties.limits.timestampPeriod;
return cpu - gpu;
}
void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) {
@ -41,7 +79,10 @@ void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) {
vkGetQueryPoolResults(vk_core.device, pool->pool, 0, pool->used, pool->used * sizeof(uint64_t), pool->results, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
const uint64_t timestamp_offset_ns = getGpuTimestampOffsetNs( pool ) - g_aprof.time_begin_ns;
for (int i = 0; i < pool->used; ++i) {
pool->results[i] *= vk_core.physical_device.properties.limits.timestampPeriod;
const uint64_t gpu_ns = pool->results[i] * (double)vk_core.physical_device.properties.limits.timestampPeriod;
pool->results[i] = timestamp_offset_ns + gpu_ns;
}
}

View File

@ -8,6 +8,7 @@ typedef struct {
VkQueryPool pool;
int used;
uint64_t results[MAX_QUERY_COUNT];
uint64_t end_timestamp_ns;
} vk_query_pool_t;
qboolean R_VkQueryPoolInit( vk_query_pool_t *pool );