vk: rt: extract raw shader clock values

also add observations to the shader, what values do we see
This commit is contained in:
Ivan Avdeev 2023-12-07 12:38:19 -05:00
parent 1fcd64b65c
commit cbdbda3549
7 changed files with 141 additions and 23 deletions

View File

@ -1011,3 +1011,16 @@ This would need the same as above, plus:
- A: probably should still do it on GPU lol
This would also allow passing arbitrary per-pixel data from shaders, which would make shader debugging much much easier.
# 2023-12-07 E343
## What do we really need for shader profiling
### Optimizing polygon light sampling
- Per-pixel numbers:
- Total shader time
- Sampling selection time (Σ)
- Selecting lights to sample (+count)
- Selecting light point to sample
- Vertices count
- Ray tracing time (Σ, +count)
- Aggregate numbers:
- TODO: what does VK_KHR_performance_query give us? Regs usage, etc.

View File

@ -1,3 +1,9 @@
# 2023-12-07 E343
- [x] extract raw shader clock
- [ ] display times as scopes somewhere
- [ ] extract sampling times
- [ ] extract ray times
# 2023-12-05 E342
- [x] tone down the specular indirect blur
- [-] try func_wall static light opt, #687

View File

@ -21,13 +21,15 @@
#define vec3 vec3_t
#define vec4 vec4_t
#define mat4 matrix4x4
typedef int ivec3[3];
typedef int ivec2[2];
typedef int32_t ivec3[3];
typedef int32_t ivec2[2];
typedef uint32_t uvec2[2];
typedef uint32_t uvec3[3];
typedef uint32_t uvec4[4];
#define TOKENPASTE(x, y) x ## y
#define TOKENPASTE2(x, y) TOKENPASTE(x, y)
#define PAD(x) float TOKENPASTE2(pad_, __LINE__)[x];
#define STRUCT struct
enum {
#define DECLARE_SPECIALIZATION_CONSTANT(index, type, name, default_value) \
SPEC_##name##_INDEX = index,
@ -207,6 +209,10 @@ struct UniformBuffer {
uint debug_display_only;
};
// Per-pixel GPU profiling record; this header is shared between the C host code and the GLSL shaders.
// In the code visible here only data[0] is used: the polygon-light shader stores
// uvec4(time_begin, time_end) into it, i.e. two 2x32-bit clock samples laid out as
// (begin.lo, begin.hi, end.lo, end.hi), and the host-side dump reassembles them into 64-bit values.
// NOTE(review): data[1..3] look reserved for further per-pixel counters -- confirm intended use.
struct ProfilingStruct {
uvec4 data[4];
};
#undef PAD
#undef STRUCT

View File

@ -1,6 +1,12 @@
#extension GL_ARB_shader_clock: enable
#extension GL_EXT_shader_realtime_clock: enable
#define PROF_USE_REALTIME
#ifdef PROF_USE_REALTIME
#extension GL_ARB_gpu_shader_int64: enable
#extension GL_EXT_shader_realtime_clock: enable
#else
#extension GL_ARB_shader_clock: enable
#endif
#include "utils.glsl"
#include "noise.glsl"
@ -16,11 +22,25 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) {
shading_normal = normalDecode(n.zw);
}
//#define timeNow clockRealtimeEXT
#define timeNow clockARB
#ifdef PROF_USE_REALTIME
// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
// it does flicker a lot.
// Deltas are about 30000-100000 parrots
#define timeNow clockRealtime2x32EXT
#else
// clockARB doesn't give directly usable time values on mesa+amdgpu;
// even deltas between them are not meaningful enough.
// On mesa+amdgpu clockARB() values are limited to the lower 20 bits, and they wrap around a lot.
// Absolute difference values are often 30-50% of the available range, so it's not that far off from wrapping around
// multiple times, rendering the value completely useless.
// Deltas are around 300000-500000 parrots.
// Other than that, the values seem uniform across the screen (as compared to the realtime clock, which has a clearly
// visible gradient: top differences are larger than bottom ones).
#define timeNow clock2x32ARB
#endif
void main() {
const uint64_t time_begin = timeNow();
const uvec2 time_begin = timeNow();
#ifdef RAY_TRACE
const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.;
@ -59,20 +79,32 @@ void main() {
vec3 diffuse = vec3(0.), specular = vec3(0.);
computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular);
const uint64_t time_end = timeNow();
const uint64_t time_diff = time_end - time_begin;
const uvec2 time_end = timeNow();
//const uint64_t time_diff = time_end - time_begin;
//const uint time_diff = time_begin.x - time_end.x;
const float time_diff_f = float(time_diff) / 1e6;////float(time_diff >> 60);// / 1e6;
#ifdef PROF_USE_REALTIME
	// The realtime clock is sampled as two 32-bit halves (x = low, y = high); reassemble them into
	// 64-bit timestamps before subtracting so that a carry between the halves is handled correctly.
	const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32);
	const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32);
	const uint64_t time_diff = end64 - begin64;
	// Scale down so the value is convenient to look at when stored in the alpha channel.
	const float time_diff_f = float(time_diff) / 1e5;
#else
	// Elapsed ticks are end - begin. Unsigned subtraction followed by the 20-bit mask gives the delta
	// modulo 2^20, which stays correct across a single counter wrap-around. (The operands used to be
	// swapped, which produced the 2^20 complement of the elapsed time instead.)
	const uint time_diff = time_end.x - time_begin.x;
	const float time_diff_f = float(time_diff & 0xfffffu) / 1e6;
#endif
const uint prof_index = pix.x + pix.y * ubo.ubo.res.x;
#if LIGHT_POINT
imageStore(out_light_point_diffuse, pix, vec4(diffuse, time_diff_f));
imageStore(out_light_point_specular, pix, vec4(specular, 0.f));
//imageStore(out_light_point_profile, pix, profile);
//prof_direct_point[prof_index].data[0] = vec4(time_begin, time_end);
#endif
#if LIGHT_POLYGON
imageStore(out_light_poly_diffuse, pix, vec4(diffuse, time_diff_f));
imageStore(out_light_poly_specular, pix, vec4(specular, 0.f));
//imageStore(out_light_poly_profile, pix, profile);
prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end);
#endif
}

View File

@ -32,6 +32,8 @@ layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; }
layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices;
//layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_point_profile;
//layout(set = 0, binding = 34, std430) writeonly buffer Profiling { ProfilingStruct prof_direct_point[]; };
#define RAY_QUERY
#define LIGHT_POINT 1

View File

@ -31,7 +31,8 @@ layout(set = 0, binding = 31, std430) readonly buffer Kusochki { Kusok a[]; } ku
layout(set = 0, binding = 32, std430) readonly buffer Indices { uint16_t a[]; } indices;
layout(set = 0, binding = 33, std430) readonly buffer Vertices { Vertex a[]; } vertices;
layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_poly_profile;
//layout(set = 0, binding = 34, rgba16f) uniform writeonly image2D out_light_poly_profile;
layout(set = 0, binding = 34, std430) writeonly restrict buffer Profiling { ProfilingStruct a[]; } prof_direct_poly;
#define RAY_QUERY
#define LIGHT_POLYGON 1

View File

@ -45,7 +45,8 @@
X(Buffer, light_grid) \
X(Texture, textures) \
X(Texture, skybox) \
X(Texture, blue_noise_texture)
X(Texture, blue_noise_texture) \
X(Buffer, prof_direct_poly) \
enum {
#define RES_ENUM(type, name) ExternalResource_##name,
@ -80,10 +81,13 @@ static struct {
matrix4x4 prev_inv_proj, prev_inv_view;
qboolean reload_pipeline;
qboolean dump_profiling_data;
qboolean discontinuity;
int max_frame_width, max_frame_height;
vk_buffer_t prof_direct_poly;
struct {
cvar_t *rt_debug_display_only;
uint32_t rt_debug_display_only_value;
@ -267,6 +271,9 @@ static void performTracing( vk_combuf_t *combuf, const perform_tracing_args_t* a
RES_SET_SBUFFER_FULL(indices, args->render_args->geometry_data);
RES_SET_SBUFFER_FULL(vertices, args->render_args->geometry_data);
// TODO register this one properly
RES_SET_SBUFFER_FULL(prof_direct_poly, g_rtx.prof_direct_poly);
// TODO move this to lights
RES_SET_BUFFER(lights, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, args->light_bindings->buffer, args->light_bindings->metadata.offset, args->light_bindings->metadata.size);
RES_SET_BUFFER(light_grid, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, args->light_bindings->buffer, args->light_bindings->grid.offset, args->light_bindings->grid.size);
@ -593,6 +600,35 @@ fail:
R_VkMeatpipeDestroy(newpipe);
}
// Writes the per-pixel shader clock samples from the host-mapped prof_direct_poly buffer into the
// text file "gpuprof.dump" in the current working directory, one line per pixel.
//
// w, h: dimensions (in pixels) of the region to dump. Rows are addressed with a stride of
//       g_rtx.max_frame_width records, so w must not exceed that stride.
//
// Each line contains the begin/end timestamps (hex and decimal) and their difference.
// If the file cannot be opened, the dump is skipped and an error is printed to stderr.
// NOTE(review): the buffer must have been allocated with the current max_frame_width/height;
// the caller is expected to guarantee the GPU is idle before this reads the mapping -- confirm.
static void dumpProfilingData(int w, int h) {
	const char *const filename = "gpuprof.dump";
	FILE *f = fopen(filename, "w");
	if (!f) {
		// Previously a failed fopen() led to fprintf()/fclose() on a NULL stream.
		fprintf(stderr, "Failed to open %s for writing profiling data\n", filename);
		return;
	}

	for (int y = 0; y < h; ++y) {
		// Start of row y in the mapped buffer.
		const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width;
		for (int x = 0; x < w; ++x, ++s) {
			// data[0] = (begin.lo, begin.hi, end.lo, end.hi): rebuild the 64-bit timestamps.
			const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
			const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32);
			fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
		}
	}

	fclose(f);
}
// (Re)creates the resources whose size depends on g_rtx.max_frame_width/max_frame_height.
// Currently that is only the prof_direct_poly storage buffer, which holds one ProfilingStruct
// per pixel and lives in host-visible, host-coherent memory.
// Raises a host error if the buffer cannot be created.
static void resizeResources(void) {
	// TODO resize textures/images

	// Release the previous allocation, if there is one.
	if (g_rtx.prof_direct_poly.buffer != VK_NULL_HANDLE)
		VK_BufferDestroy(&g_rtx.prof_direct_poly);

	// One profiling record per pixel of the largest frame seen so far.
	const uint32_t bytes = sizeof(struct ProfilingStruct) * g_rtx.max_frame_width * g_rtx.max_frame_height;

	if (VK_BufferCreate("prof_direct_poly", &g_rtx.prof_direct_poly, bytes,
			VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
			VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
		return;

	gEngine.Host_Error("Failed to recreate prof_direct_poly");
}
void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
{
APROF_SCOPE_DECLARE_BEGIN(ray_frame_end, __FUNCTION__);
@ -612,26 +648,40 @@ void VK_RayFrameEnd(const vk_ray_frame_render_args_t* args)
// if (vk_core.debug)
// XVK_RayModel_Validate();
qboolean need_reload = g_rtx.reload_pipeline;
qboolean resize = false;
if (g_rtx.max_frame_width < args->dst.width) {
g_rtx.max_frame_width = ALIGN_UP(args->dst.width, 16);
WARN("Increasing max_frame_width to %d", g_rtx.max_frame_width);
// TODO only reload resources, no need to reload the entire pipeline
need_reload = true;
resize = true;
}
if (g_rtx.max_frame_height < args->dst.height) {
g_rtx.max_frame_height = ALIGN_UP(args->dst.height, 16);
WARN("Increasing max_frame_height to %d", g_rtx.max_frame_height);
// TODO only reload resources, no need to reload the entire pipeline
need_reload = true;
resize = true;
}
// TODO only reload resources, no need to reload the entire pipeline
const qboolean need_reload = g_rtx.reload_pipeline || resize;
if (need_reload || resize || g_rtx.dump_profiling_data) {
WARN("Reloading RTX shaders/pipelines");
XVK_CHECK(vkDeviceWaitIdle(vk_core.device));
}
if (g_rtx.dump_profiling_data) {
// TODO using next frame w/h is not correct, need previous w/h
dumpProfilingData(args->dst.width, args->dst.height);
g_rtx.dump_profiling_data = false;
}
if (resize) {
// Resize after wait idle
resizeResources();
}
if (need_reload) {
WARN("Reloading RTX shaders/pipelines");
XVK_CHECK(vkDeviceWaitIdle(vk_core.device));
reloadMainpipe();
g_rtx.reload_pipeline = false;
@ -692,10 +742,14 @@ tail:
g_rtx.discontinuity = false;
}
static void reloadPipeline( void ) {
// Console command handler (registered as "rt_debug_reload_pipelines").
// Only raises a flag; VK_RayFrameEnd checks g_rtx.reload_pipeline and performs the actual reload.
static void cmdReloadPipeline( void ) {
g_rtx.reload_pipeline = true;
}
// Console command handler (registered as "rt_debug_prof_dump").
// Only raises a flag; VK_RayFrameEnd checks g_rtx.dump_profiling_data, waits for the device to go
// idle, writes the dump and then clears the flag.
static void cmdDumpProfilingData( void ) {
g_rtx.dump_profiling_data = true;
}
qboolean VK_RayInit( void )
{
ASSERT(vk_core.rtx);
@ -754,7 +808,10 @@ qboolean VK_RayInit( void )
RT_RayModel_Clear();
gEngine.Cmd_AddCommand("rt_debug_reload_pipelines", reloadPipeline, "Reload RT pipelines");
resizeResources();
gEngine.Cmd_AddCommand("rt_debug_reload_pipelines", cmdReloadPipeline, "Reload RT pipelines");
gEngine.Cmd_AddCommand("rt_debug_prof_dump", cmdDumpProfilingData, "Dump profiling data");
#define X(name) #name ", "
g_rtx.debug.rt_debug_display_only = gEngine.Cvar_Get("rt_debug_display_only", "", FCVAR_GLCONFIG,
@ -775,6 +832,7 @@ void VK_RayShutdown( void ) {
VK_BufferDestroy(&g_ray_model_state.model_headers_buffer);
VK_BufferDestroy(&g_ray_model_state.kusochki_buffer);
VK_BufferDestroy(&g_rtx.uniform_buffer);
VK_BufferDestroy(&g_rtx.prof_direct_poly);
RT_VkAccelShutdown();
RT_DynamicModelShutdown();