vk: profiler: collect and display ref cpu busy and waiting times

This commit is contained in:
Ivan Avdeev 2023-03-21 11:31:27 -07:00 committed by Ivan Avdeev
parent 5d23494cfc
commit e1afb2a9de
4 changed files with 116 additions and 44 deletions

View File

@ -12,19 +12,23 @@
static aprof_scope_id_t _aprof_scope_id_##scope = -1
// scope_name is expected to be static and alive for the entire duration of the program
#define APROF_SCOPE_INIT(scope, scope_name) \
_aprof_scope_id_##scope = aprof_scope_init(scope_name)
#define APROF_SCOPE_INIT_EX(scope, scope_name, flags) \
_aprof_scope_id_##scope = aprof_scope_init(scope_name, flags)
#define APROF_SCOPE_INIT(scope, scope_name) APROF_SCOPE_INIT_EX(scope, scope_name, 0)
#define APROF_SCOPE_BEGIN(scope) \
aprof_scope_event(_aprof_scope_id_##scope, 1)
#define APROF_SCOPE_DECLARE_BEGIN(scope, scope_name) \
#define APROF_SCOPE_DECLARE_BEGIN_EX(scope, scope_name, flags) \
static aprof_scope_id_t _aprof_scope_id_##scope = -1; \
if (_aprof_scope_id_##scope == -1) { \
_aprof_scope_id_##scope = aprof_scope_init(scope_name); \
_aprof_scope_id_##scope = aprof_scope_init(scope_name, flags); \
} \
aprof_scope_event(_aprof_scope_id_##scope, 1)
#define APROF_SCOPE_DECLARE_BEGIN(scope, scope_name) APROF_SCOPE_DECLARE_BEGIN_EX(scope, scope_name, 0)
#define APROF_TOKENPASTE(x, y) x ## y
#define APROF_TOKENPASTE2(x, y) APROF_TOKENPASTE(x, y)
@ -38,7 +42,7 @@
typedef int aprof_scope_id_t;
// scope_name should be static const, and not on stack
aprof_scope_id_t aprof_scope_init(const char *scope_name);
aprof_scope_id_t aprof_scope_init(const char *scope_name, uint32_t flags);
void aprof_scope_event(aprof_scope_id_t, int begin);
// Returns event index for previous frame
uint32_t aprof_scope_frame( void );
@ -50,8 +54,19 @@ uint64_t aprof_time_platform_to_ns( uint64_t );
#define aprof_time_platform_to_ns(time) ((time) - g_aprof.time_begin_ns)
#endif
enum {
// This scope covers waiting for external event.
// Its entire subtree shouldn't count towards CPU usage by external analyzers.
APROF_SCOPE_FLAG_WAIT = (1<<0),
// This scope itself doesn't really mean CPU work, and used only as a frame structure decoration.
// External tools shouldn't count it itself as CPU time used, but its children should be.
APROF_SCOPE_FLAG_DECOR = (1<<1),
};
typedef struct {
const char *name;
uint32_t flags;
} aprof_scope_t;
#define APROF_MAX_SCOPES 256
@ -130,7 +145,7 @@ uint64_t aprof_time_platform_to_ns( uint64_t platform_time ) {
aprof_state_t g_aprof = {0};
aprof_scope_id_t aprof_scope_init(const char *scope_name) {
aprof_scope_id_t aprof_scope_init(const char *scope_name, uint32_t flags) {
#if defined(_WIN32)
if (_aprof_frequency.QuadPart == 0)
QueryPerformanceFrequency(&_aprof_frequency);
@ -143,6 +158,7 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) {
return -1;
g_aprof.scopes[g_aprof.num_scopes].name = scope_name;
g_aprof.scopes[g_aprof.num_scopes].flags = flags;
return g_aprof.num_scopes++;
}

View File

@ -38,7 +38,11 @@ static struct {
int glyph_width, glyph_height;
} font_metrics;
char message[MAX_SPEEDS_MESSAGE];
struct {
uint64_t ref_cpu_time;
uint64_t ref_cpu_wait_time;
char message[MAX_SPEEDS_MESSAGE];
} frame;
} g_slows;
static void speedsPrintf( const char *msg, ... ) {
@ -49,7 +53,7 @@ static void speedsPrintf( const char *msg, ... ) {
Q_vsprintf( text, msg, argptr );
va_end( argptr );
Q_strncat( g_slows.message, text, sizeof( g_slows.message ));
Q_strncat( g_slows.frame.message, text, sizeof( g_slows.frame.message ));
}
static float linearstep(float min, float max, float v) {
@ -83,7 +87,7 @@ static void drawTimeBar(uint64_t begin_time_ns, float time_scale_ms, int64_t beg
gEngine.Con_DrawString(x, y, tmp, text_color);
}
static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
static void drawProfilerScopes(int draw, const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
#define MAX_STACK_DEPTH 16
struct {
int scope_id;
@ -92,12 +96,22 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
int depth = 0;
int max_depth = 0;
int under_waiting = 0;
g_slows.frame.ref_cpu_time = 0;
g_slows.frame.ref_cpu_wait_time = 0;
for (; begin != end; begin = (begin + 1) % APROF_EVENT_BUFFER_SIZE) {
const aprof_event_t event = events[begin];
const int event_type = APROF_EVENT_TYPE(event);
const uint64_t timestamp_ns = APROF_EVENT_TIMESTAMP(event);
const int scope_id = APROF_EVENT_SCOPE_ID(event);
switch (event_type) {
case APROF_EVENT_FRAME_BOUNDARY:
g_slows.frame.ref_cpu_time = 0;
g_slows.frame.ref_cpu_wait_time = 0;
under_waiting = 0;
break;
case APROF_EVENT_SCOPE_BEGIN: {
if (depth < MAX_STACK_DEPTH) {
stack[depth].begin_ns = timestamp_ns;
@ -106,6 +120,11 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
++depth;
if (max_depth < depth)
max_depth = depth;
const aprof_scope_t *const scope = g_aprof.scopes + scope_id;
if (scope->flags & APROF_SCOPE_FLAG_WAIT)
under_waiting++;
break;
}
@ -115,12 +134,37 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
ASSERT(stack[depth].scope_id == scope_id);
const char *scope_name = g_aprof.scopes[scope_id].name;
const uint32_t hash = getHash(scope_name);
const aprof_scope_t *const scope = g_aprof.scopes + scope_id;
const uint32_t hash = getHash(scope->name);
const rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127};
const int bar_height = g_slows.font_metrics.glyph_height;
drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * bar_height, bar_height, scope_name, color);
const uint64_t delta_ns = timestamp_ns - stack[depth].begin_ns;
// This is a top level scope that should be counter towards cpu usage
const int is_top_level = ((scope->flags & APROF_SCOPE_FLAG_DECOR) == 0) && (depth == 0 || (g_aprof.scopes[stack[depth-1].scope_id].flags & APROF_SCOPE_FLAG_DECOR));
// Only count top level scopes towards CPU time, and only if it's not waiting
if (is_top_level && under_waiting == 0)
g_slows.frame.ref_cpu_time += delta_ns;
// If this is a top level waiting scope (under any depth)
if (under_waiting == 1) {
// Count it towards waiting time
g_slows.frame.ref_cpu_wait_time += delta_ns;
// If this is not a top level scope, then we might count its top level parent
// towards cpu usage time, which is not correct. Subtract this waiting time from it.
if (!is_top_level)
g_slows.frame.ref_cpu_time -= delta_ns;
}
if (scope->flags & APROF_SCOPE_FLAG_WAIT)
under_waiting--;
if (draw) {
const rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127};
const int bar_height = g_slows.font_metrics.glyph_height;
drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * bar_height, bar_height, scope->name, color);
}
break;
}
@ -174,7 +218,7 @@ static int drawFrameTimeGraph( const int frame_bar_y, const float frame_bar_y_sc
return frame_bar_y + frame_bar_y_scale * TARGET_FRAME_TIME * 2;
}
static int drawFrames( uint32_t prev_frame_index, int y, const uint64_t gpu_frame_begin_ns, const uint64_t gpu_frame_end_ns ) {
static int drawFrames( int draw, uint32_t prev_frame_index, int y, const uint64_t gpu_frame_begin_ns, const uint64_t gpu_frame_end_ns ) {
// Draw latest 2 frames; find their boundaries
uint32_t rewind_frame = prev_frame_index;
const int max_frames_to_draw = 2;
@ -200,12 +244,15 @@ static int drawFrames( uint32_t prev_frame_index, int y, const uint64_t gpu_fram
const uint64_t frame_end_time = APROF_EVENT_TIMESTAMP(events[event_end]);
const uint64_t delta_ns = frame_end_time - frame_begin_time;
const float time_scale_ms = (double)vk_frame.width / (delta_ns / 1e6);
drawProfilerScopes(events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
drawProfilerScopes(draw, events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
y += g_slows.font_metrics.glyph_height * 6;
const int bar_height = g_slows.font_metrics.glyph_height;
const rgba_t color = {255, 255, 0, 127};
drawTimeBar(frame_begin_time, time_scale_ms, gpu_frame_begin_ns, gpu_frame_end_ns, y, bar_height, "GPU TIME", color);
// Draw GPU last frame bar
if (draw) {
y += g_slows.font_metrics.glyph_height * 6;
const int bar_height = g_slows.font_metrics.glyph_height;
const rgba_t color = {255, 255, 0, 127};
drawTimeBar(frame_begin_time, time_scale_ms, gpu_frame_begin_ns, gpu_frame_end_ns, y, bar_height, "GPU TIME", color);
}
return y;
}
@ -224,20 +271,39 @@ static void getCurrentFontMetrics(void) {
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns) {
APROF_SCOPE_DECLARE_BEGIN(__FUNCTION__, __FUNCTION__);
g_slows.message[0] = '\0';
g_slows.frame.message[0] = '\0';
const uint32_t speeds_bits = r_speeds->value;
speedsPrintf( "Renderer: ^1Vulkan%s^7\n", vk_frame.rtx_enabled ? " RT" : "" );
if (speeds_bits)
speedsPrintf( "Renderer: ^1Vulkan%s^7\n", vk_frame.rtx_enabled ? " RT" : "" );
const uint32_t events = g_aprof.events_last_frame - prev_frame_index;
const uint64_t frame_begin_time = APROF_EVENT_TIMESTAMP(g_aprof.events[prev_frame_index]);
const unsigned long long delta_ns = APROF_EVENT_TIMESTAMP(g_aprof.events[g_aprof.events_last_frame]) - frame_begin_time;
const float frame_time = delta_ns * 1e-6;
g_slows.frame_times[g_slows.frame_num] = frame_time;
g_slows.frame_num = (g_slows.frame_num + 1) % MAX_FRAMES_HISTORY;
handlePause( prev_frame_index );
{
getCurrentFontMetrics();
int y = 100;
const float frame_bar_y_scale = 2.f; // ms to pixels (somehow)
//
const int draw = speeds_bits & SPEEDS_BIT_FRAME;
if (draw)
y = drawFrameTimeGraph( y, frame_bar_y_scale ) + 20;
y = drawFrames( draw, prev_frame_index, y, gpu_frame_begin_ns, gpu_frame_end_ns );
}
if (speeds_bits & SPEEDS_BIT_SIMPLE) {
const uint64_t gpu_time_ns = gpu_frame_end_ns - gpu_frame_begin_ns;
speedsPrintf("frame: %.03fms GPU: %.03fms\n", frame_time, gpu_time_ns * 1e-6);
speedsPrintf(" (ref) CPU: %.03fms wait: %.03fms\n", g_slows.frame.ref_cpu_time * 1e-6, g_slows.frame.ref_cpu_wait_time * 1e-6);
}
if (speeds_bits & SPEEDS_BIT_STATS) {
@ -246,19 +312,6 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_b
speedsPrintf("Dirty light cells: %d\n\tsize = %dKiB, ranges = %d\n", dirty, (int)(dirty * sizeof(struct LightCluster) / 1024), g_lights.stats.ranges_uploaded);
}
g_slows.frame_times[g_slows.frame_num] = frame_time;
g_slows.frame_num = (g_slows.frame_num + 1) % MAX_FRAMES_HISTORY;
handlePause( prev_frame_index );
if (speeds_bits & SPEEDS_BIT_FRAME) {
getCurrentFontMetrics();
int y = 100;
const float frame_bar_y_scale = 2.f; // ms to pixels (somehow)
y = drawFrameTimeGraph( y, frame_bar_y_scale ) + 20;
y = drawFrames( prev_frame_index, y, gpu_frame_begin_ns, gpu_frame_end_ns );
}
APROF_SCOPE_END(__FUNCTION__);
}
@ -291,7 +344,7 @@ qboolean R_SpeedsMessage( char *out, size_t size )
if( r_speeds->value <= 0 ) return false;
if( !out || !size ) return false;
Q_strncpy( out, g_slows.message, size );
Q_strncpy( out, g_slows.frame.message, size );
return true;
}

View File

@ -49,14 +49,14 @@ static struct {
} g_frame;
#define PROFILER_SCOPES(X) \
X(frame, "Frame"); \
X(begin_frame, "R_BeginFrame"); \
X(render_frame, "VK_RenderFrame"); \
X(end_frame, "R_EndFrame"); \
X(frame_gpu_wait, "Wait for GPU"); \
X(wait_for_frame_fence, "waitForFrameFence"); \
X(frame, "Frame", APROF_SCOPE_FLAG_DECOR); \
X(begin_frame, "R_BeginFrame", 0); \
X(render_frame, "VK_RenderFrame", 0); \
X(end_frame, "R_EndFrame", 0); \
X(frame_gpu_wait, "Wait for GPU", APROF_SCOPE_FLAG_WAIT); \
X(wait_for_frame_fence, "waitForFrameFence", APROF_SCOPE_FLAG_WAIT); \
#define SCOPE_DECLARE(scope, name) APROF_SCOPE_DECLARE(scope)
#define SCOPE_DECLARE(scope, name, flags) APROF_SCOPE_DECLARE(scope)
PROFILER_SCOPES(SCOPE_DECLARE)
#undef SCOPE_DECLARE
@ -391,7 +391,7 @@ static void toggleRaytracing( void ) {
qboolean VK_FrameCtlInit( void )
{
PROFILER_SCOPES(APROF_SCOPE_INIT);
PROFILER_SCOPES(APROF_SCOPE_INIT_EX);
const VkFormat depth_format = findSupportedImageFormat(depth_formats, VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT);

View File

@ -202,7 +202,10 @@ r_vk_swapchain_framebuffer_t R_VkSwapchainAcquire( VkSemaphore sem_image_availa
// Check that swapchain has the same size
recreateSwapchain(force_recreate);
APROF_SCOPE_DECLARE_BEGIN_EX(vkAcquireNextImageKHR, "vkAcquireNextImageKHR", APROF_SCOPE_FLAG_WAIT);
const VkResult acquire_result = vkAcquireNextImageKHR(vk_core.device, g_swapchain.swapchain, UINT64_MAX, sem_image_available, VK_NULL_HANDLE, &ret.index);
APROF_SCOPE_END(vkAcquireNextImageKHR);
switch (acquire_result) {
case VK_SUCCESS:
break;