vk: profiler: collect and display ref cpu busy and waiting times
This commit is contained in:
parent
5d23494cfc
commit
e1afb2a9de
|
@ -12,19 +12,23 @@
|
|||
static aprof_scope_id_t _aprof_scope_id_##scope = -1
|
||||
|
||||
// scope_name is expected to be static and alive for the entire duration of the program
|
||||
#define APROF_SCOPE_INIT(scope, scope_name) \
|
||||
_aprof_scope_id_##scope = aprof_scope_init(scope_name)
|
||||
#define APROF_SCOPE_INIT_EX(scope, scope_name, flags) \
|
||||
_aprof_scope_id_##scope = aprof_scope_init(scope_name, flags)
|
||||
|
||||
#define APROF_SCOPE_INIT(scope, scope_name) APROF_SCOPE_INIT_EX(scope, scope_name, 0)
|
||||
|
||||
#define APROF_SCOPE_BEGIN(scope) \
|
||||
aprof_scope_event(_aprof_scope_id_##scope, 1)
|
||||
|
||||
#define APROF_SCOPE_DECLARE_BEGIN(scope, scope_name) \
|
||||
#define APROF_SCOPE_DECLARE_BEGIN_EX(scope, scope_name, flags) \
|
||||
static aprof_scope_id_t _aprof_scope_id_##scope = -1; \
|
||||
if (_aprof_scope_id_##scope == -1) { \
|
||||
_aprof_scope_id_##scope = aprof_scope_init(scope_name); \
|
||||
_aprof_scope_id_##scope = aprof_scope_init(scope_name, flags); \
|
||||
} \
|
||||
aprof_scope_event(_aprof_scope_id_##scope, 1)
|
||||
|
||||
#define APROF_SCOPE_DECLARE_BEGIN(scope, scope_name) APROF_SCOPE_DECLARE_BEGIN_EX(scope, scope_name, 0)
|
||||
|
||||
#define APROF_TOKENPASTE(x, y) x ## y
|
||||
#define APROF_TOKENPASTE2(x, y) APROF_TOKENPASTE(x, y)
|
||||
|
||||
|
@ -38,7 +42,7 @@
|
|||
typedef int aprof_scope_id_t;
|
||||
|
||||
// scope_name should be static const, and not on stack
|
||||
aprof_scope_id_t aprof_scope_init(const char *scope_name);
|
||||
aprof_scope_id_t aprof_scope_init(const char *scope_name, uint32_t flags);
|
||||
void aprof_scope_event(aprof_scope_id_t, int begin);
|
||||
// Returns event index for previous frame
|
||||
uint32_t aprof_scope_frame( void );
|
||||
|
@ -50,8 +54,19 @@ uint64_t aprof_time_platform_to_ns( uint64_t );
|
|||
#define aprof_time_platform_to_ns(time) ((time) - g_aprof.time_begin_ns)
|
||||
#endif
|
||||
|
||||
enum {
|
||||
// This scope covers waiting for external event.
|
||||
// Its entire subtree shouldn't count towards CPU usage by external analyzers.
|
||||
APROF_SCOPE_FLAG_WAIT = (1<<0),
|
||||
|
||||
// This scope itself doesn't really mean CPU work, and used only as a frame structure decoration.
|
||||
// External tools shouldn't count it itself as CPU time used, but its children should be.
|
||||
APROF_SCOPE_FLAG_DECOR = (1<<1),
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
uint32_t flags;
|
||||
} aprof_scope_t;
|
||||
|
||||
#define APROF_MAX_SCOPES 256
|
||||
|
@ -130,7 +145,7 @@ uint64_t aprof_time_platform_to_ns( uint64_t platform_time ) {
|
|||
|
||||
aprof_state_t g_aprof = {0};
|
||||
|
||||
aprof_scope_id_t aprof_scope_init(const char *scope_name) {
|
||||
aprof_scope_id_t aprof_scope_init(const char *scope_name, uint32_t flags) {
|
||||
#if defined(_WIN32)
|
||||
if (_aprof_frequency.QuadPart == 0)
|
||||
QueryPerformanceFrequency(&_aprof_frequency);
|
||||
|
@ -143,6 +158,7 @@ aprof_scope_id_t aprof_scope_init(const char *scope_name) {
|
|||
return -1;
|
||||
|
||||
g_aprof.scopes[g_aprof.num_scopes].name = scope_name;
|
||||
g_aprof.scopes[g_aprof.num_scopes].flags = flags;
|
||||
return g_aprof.num_scopes++;
|
||||
}
|
||||
|
||||
|
|
|
@ -38,7 +38,11 @@ static struct {
|
|||
int glyph_width, glyph_height;
|
||||
} font_metrics;
|
||||
|
||||
struct {
|
||||
uint64_t ref_cpu_time;
|
||||
uint64_t ref_cpu_wait_time;
|
||||
char message[MAX_SPEEDS_MESSAGE];
|
||||
} frame;
|
||||
} g_slows;
|
||||
|
||||
static void speedsPrintf( const char *msg, ... ) {
|
||||
|
@ -49,7 +53,7 @@ static void speedsPrintf( const char *msg, ... ) {
|
|||
Q_vsprintf( text, msg, argptr );
|
||||
va_end( argptr );
|
||||
|
||||
Q_strncat( g_slows.message, text, sizeof( g_slows.message ));
|
||||
Q_strncat( g_slows.frame.message, text, sizeof( g_slows.frame.message ));
|
||||
}
|
||||
|
||||
static float linearstep(float min, float max, float v) {
|
||||
|
@ -83,7 +87,7 @@ static void drawTimeBar(uint64_t begin_time_ns, float time_scale_ms, int64_t beg
|
|||
gEngine.Con_DrawString(x, y, tmp, text_color);
|
||||
}
|
||||
|
||||
static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
|
||||
static void drawProfilerScopes(int draw, const aprof_event_t *events, uint64_t begin_time, float time_scale_ms, uint32_t begin, uint32_t end, int y) {
|
||||
#define MAX_STACK_DEPTH 16
|
||||
struct {
|
||||
int scope_id;
|
||||
|
@ -92,12 +96,22 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
|
|||
int depth = 0;
|
||||
int max_depth = 0;
|
||||
|
||||
int under_waiting = 0;
|
||||
g_slows.frame.ref_cpu_time = 0;
|
||||
g_slows.frame.ref_cpu_wait_time = 0;
|
||||
|
||||
for (; begin != end; begin = (begin + 1) % APROF_EVENT_BUFFER_SIZE) {
|
||||
const aprof_event_t event = events[begin];
|
||||
const int event_type = APROF_EVENT_TYPE(event);
|
||||
const uint64_t timestamp_ns = APROF_EVENT_TIMESTAMP(event);
|
||||
const int scope_id = APROF_EVENT_SCOPE_ID(event);
|
||||
switch (event_type) {
|
||||
case APROF_EVENT_FRAME_BOUNDARY:
|
||||
g_slows.frame.ref_cpu_time = 0;
|
||||
g_slows.frame.ref_cpu_wait_time = 0;
|
||||
under_waiting = 0;
|
||||
break;
|
||||
|
||||
case APROF_EVENT_SCOPE_BEGIN: {
|
||||
if (depth < MAX_STACK_DEPTH) {
|
||||
stack[depth].begin_ns = timestamp_ns;
|
||||
|
@ -106,6 +120,11 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
|
|||
++depth;
|
||||
if (max_depth < depth)
|
||||
max_depth = depth;
|
||||
|
||||
const aprof_scope_t *const scope = g_aprof.scopes + scope_id;
|
||||
if (scope->flags & APROF_SCOPE_FLAG_WAIT)
|
||||
under_waiting++;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -115,12 +134,37 @@ static void drawProfilerScopes(const aprof_event_t *events, uint64_t begin_time,
|
|||
|
||||
ASSERT(stack[depth].scope_id == scope_id);
|
||||
|
||||
const char *scope_name = g_aprof.scopes[scope_id].name;
|
||||
const uint32_t hash = getHash(scope_name);
|
||||
const aprof_scope_t *const scope = g_aprof.scopes + scope_id;
|
||||
const uint32_t hash = getHash(scope->name);
|
||||
|
||||
const uint64_t delta_ns = timestamp_ns - stack[depth].begin_ns;
|
||||
|
||||
// This is a top level scope that should be counter towards cpu usage
|
||||
const int is_top_level = ((scope->flags & APROF_SCOPE_FLAG_DECOR) == 0) && (depth == 0 || (g_aprof.scopes[stack[depth-1].scope_id].flags & APROF_SCOPE_FLAG_DECOR));
|
||||
|
||||
// Only count top level scopes towards CPU time, and only if it's not waiting
|
||||
if (is_top_level && under_waiting == 0)
|
||||
g_slows.frame.ref_cpu_time += delta_ns;
|
||||
|
||||
// If this is a top level waiting scope (under any depth)
|
||||
if (under_waiting == 1) {
|
||||
// Count it towards waiting time
|
||||
g_slows.frame.ref_cpu_wait_time += delta_ns;
|
||||
|
||||
// If this is not a top level scope, then we might count its top level parent
|
||||
// towards cpu usage time, which is not correct. Subtract this waiting time from it.
|
||||
if (!is_top_level)
|
||||
g_slows.frame.ref_cpu_time -= delta_ns;
|
||||
}
|
||||
|
||||
if (scope->flags & APROF_SCOPE_FLAG_WAIT)
|
||||
under_waiting--;
|
||||
|
||||
if (draw) {
|
||||
const rgba_t color = {hash >> 24, (hash>>16)&0xff, hash&0xff, 127};
|
||||
const int bar_height = g_slows.font_metrics.glyph_height;
|
||||
drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * bar_height, bar_height, scope_name, color);
|
||||
drawTimeBar(begin_time, time_scale_ms, stack[depth].begin_ns, timestamp_ns, y + depth * bar_height, bar_height, scope->name, color);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -174,7 +218,7 @@ static int drawFrameTimeGraph( const int frame_bar_y, const float frame_bar_y_sc
|
|||
return frame_bar_y + frame_bar_y_scale * TARGET_FRAME_TIME * 2;
|
||||
}
|
||||
|
||||
static int drawFrames( uint32_t prev_frame_index, int y, const uint64_t gpu_frame_begin_ns, const uint64_t gpu_frame_end_ns ) {
|
||||
static int drawFrames( int draw, uint32_t prev_frame_index, int y, const uint64_t gpu_frame_begin_ns, const uint64_t gpu_frame_end_ns ) {
|
||||
// Draw latest 2 frames; find their boundaries
|
||||
uint32_t rewind_frame = prev_frame_index;
|
||||
const int max_frames_to_draw = 2;
|
||||
|
@ -200,12 +244,15 @@ static int drawFrames( uint32_t prev_frame_index, int y, const uint64_t gpu_fram
|
|||
const uint64_t frame_end_time = APROF_EVENT_TIMESTAMP(events[event_end]);
|
||||
const uint64_t delta_ns = frame_end_time - frame_begin_time;
|
||||
const float time_scale_ms = (double)vk_frame.width / (delta_ns / 1e6);
|
||||
drawProfilerScopes(events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
|
||||
drawProfilerScopes(draw, events, frame_begin_time, time_scale_ms, event_begin, event_end, y);
|
||||
|
||||
// Draw GPU last frame bar
|
||||
if (draw) {
|
||||
y += g_slows.font_metrics.glyph_height * 6;
|
||||
const int bar_height = g_slows.font_metrics.glyph_height;
|
||||
const rgba_t color = {255, 255, 0, 127};
|
||||
drawTimeBar(frame_begin_time, time_scale_ms, gpu_frame_begin_ns, gpu_frame_end_ns, y, bar_height, "GPU TIME", color);
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
|
@ -224,10 +271,11 @@ static void getCurrentFontMetrics(void) {
|
|||
void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_begin_ns, uint64_t gpu_frame_end_ns) {
|
||||
APROF_SCOPE_DECLARE_BEGIN(__FUNCTION__, __FUNCTION__);
|
||||
|
||||
g_slows.message[0] = '\0';
|
||||
g_slows.frame.message[0] = '\0';
|
||||
|
||||
const uint32_t speeds_bits = r_speeds->value;
|
||||
|
||||
if (speeds_bits)
|
||||
speedsPrintf( "Renderer: ^1Vulkan%s^7\n", vk_frame.rtx_enabled ? " RT" : "" );
|
||||
|
||||
const uint32_t events = g_aprof.events_last_frame - prev_frame_index;
|
||||
|
@ -235,9 +283,27 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_b
|
|||
const unsigned long long delta_ns = APROF_EVENT_TIMESTAMP(g_aprof.events[g_aprof.events_last_frame]) - frame_begin_time;
|
||||
const float frame_time = delta_ns * 1e-6;
|
||||
|
||||
g_slows.frame_times[g_slows.frame_num] = frame_time;
|
||||
g_slows.frame_num = (g_slows.frame_num + 1) % MAX_FRAMES_HISTORY;
|
||||
|
||||
handlePause( prev_frame_index );
|
||||
|
||||
{
|
||||
getCurrentFontMetrics();
|
||||
int y = 100;
|
||||
const float frame_bar_y_scale = 2.f; // ms to pixels (somehow)
|
||||
//
|
||||
const int draw = speeds_bits & SPEEDS_BIT_FRAME;
|
||||
if (draw)
|
||||
y = drawFrameTimeGraph( y, frame_bar_y_scale ) + 20;
|
||||
|
||||
y = drawFrames( draw, prev_frame_index, y, gpu_frame_begin_ns, gpu_frame_end_ns );
|
||||
}
|
||||
|
||||
if (speeds_bits & SPEEDS_BIT_SIMPLE) {
|
||||
const uint64_t gpu_time_ns = gpu_frame_end_ns - gpu_frame_begin_ns;
|
||||
speedsPrintf("frame: %.03fms GPU: %.03fms\n", frame_time, gpu_time_ns * 1e-6);
|
||||
speedsPrintf(" (ref) CPU: %.03fms wait: %.03fms\n", g_slows.frame.ref_cpu_time * 1e-6, g_slows.frame.ref_cpu_wait_time * 1e-6);
|
||||
}
|
||||
|
||||
if (speeds_bits & SPEEDS_BIT_STATS) {
|
||||
|
@ -246,19 +312,6 @@ void R_ShowExtendedProfilingData(uint32_t prev_frame_index, uint64_t gpu_frame_b
|
|||
speedsPrintf("Dirty light cells: %d\n\tsize = %dKiB, ranges = %d\n", dirty, (int)(dirty * sizeof(struct LightCluster) / 1024), g_lights.stats.ranges_uploaded);
|
||||
}
|
||||
|
||||
g_slows.frame_times[g_slows.frame_num] = frame_time;
|
||||
g_slows.frame_num = (g_slows.frame_num + 1) % MAX_FRAMES_HISTORY;
|
||||
|
||||
handlePause( prev_frame_index );
|
||||
|
||||
if (speeds_bits & SPEEDS_BIT_FRAME) {
|
||||
getCurrentFontMetrics();
|
||||
int y = 100;
|
||||
const float frame_bar_y_scale = 2.f; // ms to pixels (somehow)
|
||||
y = drawFrameTimeGraph( y, frame_bar_y_scale ) + 20;
|
||||
y = drawFrames( prev_frame_index, y, gpu_frame_begin_ns, gpu_frame_end_ns );
|
||||
}
|
||||
|
||||
APROF_SCOPE_END(__FUNCTION__);
|
||||
}
|
||||
|
||||
|
@ -291,7 +344,7 @@ qboolean R_SpeedsMessage( char *out, size_t size )
|
|||
if( r_speeds->value <= 0 ) return false;
|
||||
if( !out || !size ) return false;
|
||||
|
||||
Q_strncpy( out, g_slows.message, size );
|
||||
Q_strncpy( out, g_slows.frame.message, size );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -49,14 +49,14 @@ static struct {
|
|||
} g_frame;
|
||||
|
||||
#define PROFILER_SCOPES(X) \
|
||||
X(frame, "Frame"); \
|
||||
X(begin_frame, "R_BeginFrame"); \
|
||||
X(render_frame, "VK_RenderFrame"); \
|
||||
X(end_frame, "R_EndFrame"); \
|
||||
X(frame_gpu_wait, "Wait for GPU"); \
|
||||
X(wait_for_frame_fence, "waitForFrameFence"); \
|
||||
X(frame, "Frame", APROF_SCOPE_FLAG_DECOR); \
|
||||
X(begin_frame, "R_BeginFrame", 0); \
|
||||
X(render_frame, "VK_RenderFrame", 0); \
|
||||
X(end_frame, "R_EndFrame", 0); \
|
||||
X(frame_gpu_wait, "Wait for GPU", APROF_SCOPE_FLAG_WAIT); \
|
||||
X(wait_for_frame_fence, "waitForFrameFence", APROF_SCOPE_FLAG_WAIT); \
|
||||
|
||||
#define SCOPE_DECLARE(scope, name) APROF_SCOPE_DECLARE(scope)
|
||||
#define SCOPE_DECLARE(scope, name, flags) APROF_SCOPE_DECLARE(scope)
|
||||
PROFILER_SCOPES(SCOPE_DECLARE)
|
||||
#undef SCOPE_DECLARE
|
||||
|
||||
|
@ -391,7 +391,7 @@ static void toggleRaytracing( void ) {
|
|||
|
||||
qboolean VK_FrameCtlInit( void )
|
||||
{
|
||||
PROFILER_SCOPES(APROF_SCOPE_INIT);
|
||||
PROFILER_SCOPES(APROF_SCOPE_INIT_EX);
|
||||
|
||||
const VkFormat depth_format = findSupportedImageFormat(depth_formats, VK_IMAGE_TILING_OPTIMAL, VK_FORMAT_FEATURE_DEPTH_STENCIL_ATTACHMENT_BIT);
|
||||
|
||||
|
|
|
@ -202,7 +202,10 @@ r_vk_swapchain_framebuffer_t R_VkSwapchainAcquire( VkSemaphore sem_image_availa
|
|||
// Check that swapchain has the same size
|
||||
recreateSwapchain(force_recreate);
|
||||
|
||||
APROF_SCOPE_DECLARE_BEGIN_EX(vkAcquireNextImageKHR, "vkAcquireNextImageKHR", APROF_SCOPE_FLAG_WAIT);
|
||||
const VkResult acquire_result = vkAcquireNextImageKHR(vk_core.device, g_swapchain.swapchain, UINT64_MAX, sem_image_available, VK_NULL_HANDLE, &ret.index);
|
||||
APROF_SCOPE_END(vkAcquireNextImageKHR);
|
||||
|
||||
switch (acquire_result) {
|
||||
case VK_SUCCESS:
|
||||
break;
|
||||
|
|
Loading…
Reference in New Issue