From 22d4202ad96f3617a01aa89b221d0cf0940437c9 Mon Sep 17 00:00:00 2001 From: Ivan Avdeev Date: Sat, 18 Mar 2023 15:53:19 -0700 Subject: [PATCH] vk: profiler: draw total gpu time Use VK_EXT_calibrated_timestamps to convert to host time. --- ref/vk/profiler.h | 18 +++++++++++-- ref/vk/r_slows.c | 59 ++++++++++++++++++++++++++++++------------- ref/vk/r_slows.h | 2 +- ref/vk/vk_core.c | 11 ++++++++ ref/vk/vk_core.h | 1 + ref/vk/vk_framectl.c | 7 ++--- ref/vk/vk_querypool.c | 43 ++++++++++++++++++++++++++++++- ref/vk/vk_querypool.h | 1 + 8 files changed, 118 insertions(+), 24 deletions(-) diff --git a/ref/vk/profiler.h b/ref/vk/profiler.h index 5612bac3..0a0eab17 100644 --- a/ref/vk/profiler.h +++ b/ref/vk/profiler.h @@ -40,6 +40,12 @@ void aprof_scope_event(aprof_scope_id_t, int begin); uint32_t aprof_scope_frame( void ); uint64_t aprof_time_now_ns( void ); +#ifdef WIN32 +uint64_t aprof_time_now_native_divider( void ); +#else +#define aprof_time_now_native_divider() 1 +#endif + typedef struct { const char *name; } aprof_scope_t; @@ -81,6 +87,8 @@ enum { typedef uint64_t aprof_event_t; typedef struct { + uint64_t time_begin_ns; + aprof_scope_t scopes[APROF_MAX_SCOPES]; int num_scopes; @@ -114,6 +122,9 @@ uint64_t aprof_time_now_ns( void ) { QueryPerformanceCounter(&pc); return pc.QuadPart * 1000000000ull / _aprof_frequency.QuadPart; } +uint64_t aprof_time_now_native_divider( void ) { + return _aprof_frequency.QuadPart; +} #else #error aprof is not implemented for this os #endif @@ -126,6 +137,9 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) { QueryPerformanceFrequency(&_aprof_frequency); #endif + if (!g_aprof.time_begin_ns) + g_aprof.time_begin_ns = aprof_time_now_ns(); + if (g_aprof.num_scopes == APROF_MAX_SCOPES) return -1; @@ -134,7 +148,7 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) { } void aprof_scope_event(aprof_scope_id_t scope_id, int begin) { - const uint64_t now = aprof_time_now_ns(); + const uint64_t now = aprof_time_now_ns() - g_aprof.time_begin_ns; if (scope_id < 0 || scope_id >= g_aprof.num_scopes) return; @@ -146,7 +160,7 @@ void aprof_scope_event(aprof_scope_id_t scope_id, int begin) { } uint32_t aprof_scope_frame( void ) { - const uint64_t now = aprof_time_now_ns(); + const uint64_t now = aprof_time_now_ns() - g_aprof.time_begin_ns; const uint32_t previous_frame = g_aprof.events_last_frame; g_aprof.events_last_frame = g_aprof.events_write; diff --git a/ref/vk/r_slows.c b/ref/vk/r_slows.c index 43e162fc..f6599d98 100644 --- a/ref/vk/r_slows.c +++ b/ref/vk/r_slows.c @@ -36,7 +36,22 @@ static uint32_t getHash(const char *s) { return CRC32_Final(crc); } -static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) { +static void drawTimeBar(uint64_t begin_time_ns, float time_scale_ms, int64_t begin_ns, int64_t end_ns, int y, int height, int glyph_width, const char *label, const rgba_t color) { + const float delta_ms = (end_ns - begin_ns) * 1e-6; + const int width = delta_ms * time_scale_ms; + const int x = (begin_ns - begin_time_ns) * 1e-6 * time_scale_ms; + + rgba_t text_color = {255-color[0], 255-color[1], 255-color[2], 255}; + CL_FillRGBA(x, y, width, height, color[0], color[1], color[2], color[3]); + + // Tweak this if scope names escape the block boundaries + char tmp[64]; + tmp[0] = '\0'; + Q_snprintf(tmp, Q_min(sizeof(tmp), width / glyph_width), "%s %.3fms", label, delta_ms); + gEngine.Con_DrawString(x, y, tmp, text_color); +} + +static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) { #define MAX_STACK_DEPTH 16 // hidpi scaling @@ -46,6 +61,7 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin // TODO "20" is fine for the "default" font. Unfortunately we don't have any access to font metrics from here, ref_api_t doesn't give us anything about fonts. ;_; const int height = 20 * scale; + const int estimated_glyph_width = 8 * scale; struct { int scope_id; @@ -77,24 +93,11 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin ASSERT(stack[depth].scope_id == scope_id); - const float delta_ms = (timestamp_ns - stack[depth].begin_ns) * 1e-6; - const int width = delta_ms * time_scale_ms; - const int x = (stack[depth].begin_ns - frame_begin_time) * 1e-6 * time_scale_ms; - - const int yy = y + depth * height; const char *scope_name = g_aprof.scopes[scope_id].name; const uint32_t hash = getHash(scope_name); - rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127}; - rgba_t text_color = {255-color[0], 255-color[1], 255-color[2], 255}; - CL_FillRGBA(x, yy, width, height, color[0], color[1], color[2], color[3]); - - // Tweak this if scope names escape the block boundaries - const int estimated_glyph_width = 8; - char tmp[64]; - tmp[0] = '\0'; - Q_snprintf(tmp, Q_min(sizeof(tmp), width / estimated_glyph_width), "%s %.3fms", scope_name, delta_ms); - gEngine.Con_DrawString(x, yy, tmp, text_color); + const rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127}; + drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * height, height, estimated_glyph_width, scope_name, color); break; } @@ -108,7 +111,7 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t frame_begin } // FIXME move this to r_speeds or something like that -void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns) { +void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns) { APROF_SCOPE_DECLARE_BEGIN(__FUNCTION__, __FUNCTION__); int line = 4; @@ -122,7 +125,9 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns const unsigned long long delta_ns = APROF_EVENT_TIMESTAMP(g_aprof.events[g_aprof.events_last_frame]) - frame_begin_time; const float frame_time = delta_ns / 1e6; + const uint64_t gpu_time_ns = gpu_frame_end_ns - gpu_frame_begin_ns; gEngine.Con_NPrintf(line++, "GPU frame time: %.03fms\n", gpu_time_ns * 1e-6); + gEngine.Con_NPrintf(line++, "aprof events this frame: %u, wraps: %d, frame time: %.03fms\n", events, g_aprof.current_frame_wraparounds, frame_time); g_slows.frame_times[g_slows.frame_num] = frame_time; @@ -167,6 +172,23 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns { const int y = frame_bar_y + frame_bar_y_scale * TARGET_FRAME_TIME * 2 + 10; + // Draw latest 2 frames; find their boundaries + uint32_t rewind_frame = prev_frame_index; + for (int frame = 1; frame < 2;) { + rewind_frame = (rewind_frame - 1) % APROF_EVENT_BUFFER_SIZE; // NOTE: only correct for power-of-2 buffer sizes + const aprof_event_t event = g_aprof.events[rewind_frame]; + + // Exhausted all events + if (event == 0 || rewind_frame == g_aprof.events_write) + break; + + // Note the frame + if (APROF_EVENT_TYPE(event) == APROF_EVENT_FRAME_BOUNDARY) { + ++frame; + prev_frame_index = rewind_frame; + } + } + const aprof_event_t *const events = g_slows.paused_events ? g_slows.paused_events : g_aprof.events; const int event_begin = g_slows.paused_events ? 0 : prev_frame_index; const int event_end = g_slows.paused_events ? g_slows.paused_events_count - 1 : g_aprof.events_last_frame; @@ -175,6 +197,9 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_time_ns const uint64_t delta_ns = frame_end_time - frame_begin_time; const float time_scale_ms = (double)vk_frame.width / (delta_ns / 1e6); drawProfilerScopes(events, frame_begin_time, time_scale_ms, event_begin, event_end, y); + + const rgba_t color = {255, 255, 0, 127}; + drawTimeBar(frame_begin_time, time_scale_ms, gpu_frame_begin_ns, gpu_frame_end_ns, 10, 20, 8, "GPU TIME", color); } APROF_SCOPE_END(__FUNCTION__); diff --git a/ref/vk/r_slows.h b/ref/vk/r_slows.h index 7290b527..61c16e82 100644 --- a/ref/vk/r_slows.h +++ b/ref/vk/r_slows.h @@ -3,4 +3,4 @@ void R_SlowsInit( void ); -void R_ShowExtendedProfilingData(uint32_t prev_frame_event_index, uint64_t gpu_time_ns); +void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns); diff --git a/ref/vk/vk_core.c b/ref/vk/vk_core.c index b640093e..6a544782 100644 --- a/ref/vk/vk_core.c +++ b/ref/vk/vk_core.c @@ -103,6 +103,10 @@ static const char* device_extensions_nv_checkpoint[] = { VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, }; +static const char* device_extensions_extra[] = { + VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME, +}; + VkBool32 VKAPI_PTR debugCallback( VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, VkDebugUtilsMessageTypeFlagsEXT messageTypes, @@ -303,6 +307,7 @@ typedef struct { qboolean anisotropy; qboolean ray_tracing; qboolean nv_checkpoint; + qboolean calibrated_timestamps; } vk_available_device_t; static int enumerateDevices( vk_available_device_t **available_devices ) { @@ -404,6 +409,7 @@ static int enumerateDevices( vk_available_device_t **available_devices ) { devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_req, ARRAYSIZE(device_extensions_req)); devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_rt, ARRAYSIZE(device_extensions_rt)); devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint)); + devicePrintExtensionsFromList(extensions, num_device_extensions, device_extensions_extra, ARRAYSIZE(device_extensions_extra)); vkGetPhysicalDeviceFeatures2(physical_devices[i], &features); this_device->anisotropy = features.features.samplerAnisotropy; @@ -415,6 +421,8 @@ static int enumerateDevices( vk_available_device_t **available_devices ) { this_device->nv_checkpoint = vk_core.debug && deviceSupportsExtensions(extensions, num_device_extensions, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint)); gEngine.Con_Printf("\t\tNV checkpoints supported: %d\n", this_device->nv_checkpoint); + this_device->calibrated_timestamps = deviceSupportsExtensions(extensions, num_device_extensions, device_extensions_extra, ARRAYSIZE(device_extensions_extra)); + Mem_Free(extensions); } @@ -557,6 +565,9 @@ static qboolean createDevice( void ) { if (candidate_device->nv_checkpoint) device_extensions_count = appendDeviceExtensions(device_extensions, device_extensions_count, device_extensions_nv_checkpoint, ARRAYSIZE(device_extensions_nv_checkpoint)); + if (candidate_device->calibrated_timestamps) + device_extensions_count = appendDeviceExtensions(device_extensions, device_extensions_count, device_extensions_extra, ARRAYSIZE(device_extensions_extra)); + VkDeviceCreateInfo create_info = { .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .pNext = head, diff --git a/ref/vk/vk_core.h b/ref/vk/vk_core.h index d8b780f9..a423b07c 100644 --- a/ref/vk/vk_core.h +++ b/ref/vk/vk_core.h @@ -251,6 +251,7 @@ do { \ X(vkCmdResetQueryPool) \ X(vkCmdWriteTimestamp) \ X(vkGetQueryPoolResults) \ + X(vkGetCalibratedTimestampsEXT) \ #define DEVICE_FUNCS_RTX(X) \ X(vkGetAccelerationStructureBuildSizesKHR) \ diff --git a/ref/vk/vk_framectl.c b/ref/vk/vk_framectl.c index b21c49ba..14df47cc 100644 --- a/ref/vk/vk_framectl.c +++ b/ref/vk/vk_framectl.c @@ -214,9 +214,10 @@ void R_BeginFrame( qboolean clearScene ) { APROF_SCOPE_BEGIN(begin_frame); { - // FIXME correct only for nvidia where timestamps are ns - const uint64_t gpu_time_ns_fixme = (qpool->used) ? qpool->results[1] - qpool->results[0] : 0; - R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_ns_fixme); + // FIXME collect and show more gpu profiling data + const uint64_t gpu_time_begin_ns = (qpool->used) ? qpool->results[0] : 0; + const uint64_t gpu_time_end_ns = (qpool->used) ? qpool->results[1] : 0; + R_ShowExtendedProfilingData(prev_frame_event_index, gpu_time_begin_ns, gpu_time_end_ns); } if (vk_core.rtx && FBitSet( vk_rtx->flags, FCVAR_CHANGED )) { diff --git a/ref/vk/vk_querypool.c b/ref/vk/vk_querypool.c index 34a43a3f..3de66245 100644 --- a/ref/vk/vk_querypool.c +++ b/ref/vk/vk_querypool.c @@ -1,4 +1,5 @@ #include "vk_querypool.h" +#include "profiler.h" // for aprof_time_now_ns() qboolean R_VkQueryPoolInit( vk_query_pool_t* pool ) { const VkQueryPoolCreateInfo qpci = { @@ -33,6 +34,43 @@ void R_VkQueryPoolBegin( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) { void R_VkQueryPoolEnd( vk_query_pool_t *pool, VkCommandBuffer cmdbuf ) { R_VkQueryPoolTimestamp(pool, cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT); + pool->end_timestamp_ns = aprof_time_now_ns(); +} + +static uint64_t getGpuTimestampOffsetNs( const vk_query_pool_t *pool ) { + // FIXME this is an incorrect check, we need to carry per-device extensions availability somehow. vk_core-vs-device refactoring pending + if (!vkGetCalibratedTimestampsEXT) { + // Estimate based on supposed submission time, assuming that we submit, and it starts computing right after cmdbuffer closure + // which may not be true. But it's all we got + // TODO alternative approach: estimate based on end timestamp + const uint64_t gpu_begin_ns = (double)pool->results[0] * vk_core.physical_device.properties.limits.timestampPeriod; + return pool->end_timestamp_ns - gpu_begin_ns; + } + + const VkCalibratedTimestampInfoEXT cti[2] = { + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, + .pNext = NULL, + .timeDomain = VK_TIME_DOMAIN_DEVICE_EXT, + }, + { + .sType = VK_STRUCTURE_TYPE_CALIBRATED_TIMESTAMP_INFO_EXT, + .pNext = NULL, +#ifdef WIN32 + .timeDomain = VK_TIME_DOMAIN_QUERY_PERFORMANCE_COUNTER_EXT, +#else + .timeDomain = VK_TIME_DOMAIN_CLOCK_MONOTONIC_EXT, +#endif + }, + }; + + uint64_t timestamps[2] = {0}; + uint64_t max_deviation[2] = {0}; + vkGetCalibratedTimestampsEXT(vk_core.device, 2, cti, timestamps, max_deviation); + + const uint64_t cpu = (double)timestamps[1] / aprof_time_now_native_divider(); + const uint64_t gpu = (double)timestamps[0] * vk_core.physical_device.properties.limits.timestampPeriod; + return cpu - gpu; } void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) { @@ -41,7 +79,10 @@ void R_VkQueryPoolGetFrameResults( vk_query_pool_t *pool ) { vkGetQueryPoolResults(vk_core.device, pool->pool, 0, pool->used, pool->used * sizeof(uint64_t), pool->results, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT); + const uint64_t timestamp_offset_ns = getGpuTimestampOffsetNs( pool ) - g_aprof.time_begin_ns; + for (int i = 0; i < pool->used; ++i) { - pool->results[i] *= vk_core.physical_device.properties.limits.timestampPeriod; + const uint64_t gpu_ns = pool->results[i] * (double)vk_core.physical_device.properties.limits.timestampPeriod; + pool->results[i] = timestamp_offset_ns + gpu_ns; } } diff --git a/ref/vk/vk_querypool.h b/ref/vk/vk_querypool.h index b2e81a29..cd6a5b9c 100644 --- a/ref/vk/vk_querypool.h +++ b/ref/vk/vk_querypool.h @@ -8,6 +8,7 @@ typedef struct { VkQueryPool pool; int used; uint64_t results[MAX_QUERY_COUNT]; + uint64_t end_timestamp_ns; } vk_query_pool_t; qboolean R_VkQueryPoolInit( vk_query_pool_t *pool );