vk: rt: add more shader clock timing experiments and results

See notes and stream E344 for data and comments
This commit is contained in:
Ivan Avdeev 2023-12-08 13:20:12 -05:00
parent cbdbda3549
commit 6b51fde5b2
6 changed files with 261 additions and 44 deletions

View File

@ -1024,3 +1024,79 @@ This would also allow passing arbitrary per-pixel data from shaders, which would
- Ray tracing time (Σ, +count)
- Aggregate numbers:
- TODO: what does VK_KHR_performance_query give us? Regs usage, etc.
# 2023-12-08 E344
## Experiment one
some save in test_brush2, default PROJECTED sampling
0-5%: s=0(0, 0.00%) r=133495(133495, 13.18%)
5-10%: s=0(0, 0.00%) r=204348(337843, 33.35%)
10-15%: s=0(0, 0.00%) r=92673(430516, 42.49%)
15-20%: s=0(0, 0.00%) r=25196(455712, 44.98%)
20-25%: s=67(67, 0.01%) r=3529(459241, 45.33%)
25-30%: s=253(320, 0.03%) r=12018(471259, 46.52%)
30-35%: s=319(639, 0.06%) r=39805(511064, 50.45%)
35-40%: s=2096(2735, 0.27%) r=178843(689907, 68.10%)
40-45%: s=8753(11488, 1.13%) r=270099(960006, 94.76%)
45-50%: s=53958(65446, 6.46%) r=44000(1004006, 99.10%)
50-55%: s=275150(340596, 33.62%) r=8256(1012262, 99.92%)
55-60%: s=183343(523939, 51.72%) r=838(1013100, 100.00%)
60-65%: s=38922(562861, 55.56%) r=0(1013100, 100.00%)
65-70%: s=6603(569464, 56.21%) r=0(1013100, 100.00%)
70-75%: s=14101(583565, 57.60%) r=0(1013100, 100.00%)
75-80%: s=85481(669046, 66.04%) r=0(1013100, 100.00%)
80-85%: s=150723(819769, 80.92%) r=0(1013100, 100.00%)
85-90%: s=107863(927632, 91.56%) r=0(1013100, 100.00%)
90-95%: s=85468(1013100, 100.00%) r=0(1013100, 100.00%)
95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
(shader clock) percentiles:
99%: ray=393223 sampling=496965
95%: ray=321478 sampling=365558
90%: ray=276235 sampling=356515
75%: ray=250132 sampling=349746
50%: ray=184418 sampling=337416
PROJECTED + REALTIME clock:
percentiles:
99%: ray=45580 sampling=62860
95%: ray=30324 sampling=60136
90%: ray=19736 sampling=58868
75%: ray=13688 sampling=48068
50%: ray=10784 sampling=22500
## Two
SOLID sampling
0-5%: s=6(6, 0.00%) r=0(0, 0.00%)
5-10%: s=3361(3367, 0.33%) r=0(0, 0.00%)
10-15%: s=89901(93268, 9.21%) r=0(0, 0.00%)
15-20%: s=681106(774374, 76.44%) r=0(0, 0.00%)
20-25%: s=125185(899559, 88.79%) r=0(0, 0.00%)
25-30%: s=111237(1010796, 99.77%) r=222(222, 0.02%)
30-35%: s=2080(1012876, 99.98%) r=667(889, 0.09%)
35-40%: s=224(1013100, 100.00%) r=621(1510, 0.15%)
40-45%: s=0(1013100, 100.00%) r=1049(2559, 0.25%)
45-50%: s=0(1013100, 100.00%) r=2071(4630, 0.46%)
50-55%: s=0(1013100, 100.00%) r=1494(6124, 0.60%)
55-60%: s=0(1013100, 100.00%) r=7099(13223, 1.31%)
60-65%: s=0(1013100, 100.00%) r=80026(93249, 9.20%)
65-70%: s=0(1013100, 100.00%) r=152210(245459, 24.23%)
70-75%: s=0(1013100, 100.00%) r=466794(712253, 70.30%)
75-80%: s=0(1013100, 100.00%) r=276878(989131, 97.63%)
80-85%: s=0(1013100, 100.00%) r=22822(1011953, 99.89%)
85-90%: s=0(1013100, 100.00%) r=1147(1013100, 100.00%)
90-95%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
(shader clock) percentiles:
99%: ray=565588 sampling=105982
95%: ray=422982 sampling=102555
90%: ray=391055 sampling=100877
75%: ray=350036 sampling=97007
50%: ray=290830 sampling=76574
SOLID + REALTIME clock
99%: ray=97056 sampling=20328
95%: ray=78988 sampling=19532
90%: ray=74236 sampling=19076
75%: ray=51580 sampling=17512
50%: ray=29268 sampling=6648

View File

@ -1,8 +1,17 @@
# Next
- [ ] performance query
# 2023-12-08 E344
- [x] measure percentage of direct light shader phases:
- [x] how long does sampling take -- ~~supposedly ALU-bound, need clockARB()?~~
- [x] how long does ray tracing take -- ~~supposedly mem-bound, clockRealtimeEXT()?~~
- [ ] try mapping shader realtime clock values to calibrated vk compute dispatch timestamps
Similar to what RGP draws, but not quite the same. Not sure if this is valuable, but it would be neat
# 2023-12-07 E343
- [x] extract raw shader clock
- [ ] display times as scopes somewhere
- [ ] extract sampling times
- [ ] extract ray times
- [-] display times as scopes somewhere
→ tried chrome trace in ff profiler, it broke (200k scopes in 1k threads is too much)
# 2023-12-05 E342
- [x] tone down the specular indirect blur

View File

@ -5,6 +5,10 @@
#include "noise.glsl"
#include "utils.glsl"
#include "time.glsl"
uint prof_sampling = 0, prof_ray = 0;
uint prof_lights = 0, prof_samples = 0, prof_rays = 0;
#define DO_ALL_IN_CLUSTER 1
@ -232,9 +236,14 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
const float plane_dist = dot(poly.plane, vec4(P, 1.f));
++prof_lights;
if (plane_dist < 0.)
continue;
++prof_samples;
const time_t prof_sampling_begin = timeNow();
#ifdef PROJECTED
const vec4 light_sample_dir = getPolygonLightSampleProjected(view_dir, ctx, poly);
#elif defined(SOLID)
@ -244,6 +253,7 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
#else
const vec4 light_sample_dir = getPolygonLightSampleSimple(P, view_dir, poly);
#endif
prof_sampling += timeDelta(prof_sampling_begin, timeNow());
if (light_sample_dir.w <= 0.)
continue;
@ -251,7 +261,12 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
const float dist = - plane_dist / dot(light_sample_dir.xyz, poly.plane.xyz);
const vec3 emissive = poly.emissive;
if (!shadowed(P, light_sample_dir.xyz, dist)) {
const time_t prof_ray_begin = timeNow();
const bool shadow = shadowed(P, light_sample_dir.xyz, dist);
prof_ray += timeDelta(prof_ray_begin, timeNow());
++prof_rays;
if (!shadow) {
//const float estimate = total_contrib;
const float estimate = light_sample_dir.w;
vec3 poly_diffuse = vec3(0.), poly_specular = vec3(0.);

View File

@ -1,14 +1,6 @@
#define PROF_USE_REALTIME
#ifdef PROF_USE_REALTIME
#extension GL_ARB_gpu_shader_int64: enable
#extension GL_EXT_shader_realtime_clock: enable
#else
#extension GL_ARB_shader_clock: enable
#endif
#include "utils.glsl"
#include "noise.glsl"
#include "time.glsl"
#include "ray_kusochki.glsl"
@ -22,25 +14,8 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) {
shading_normal = normalDecode(n.zw);
}
#ifdef PROF_USE_REALTIME
// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
// it does flicker a lot.
// Deltas are about 30000-100000 parrots
#define timeNow clockRealtime2x32EXT
#else
// clockARB doesn't give directly usable time values on mesa+amdgpu
// even deltas between them are not meaningful enough.
// On mesa+amdgpu clockARB() values are limited to lower 20bits, and they wrap around a lot.
// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
// multiple times, rendering the value completely useless.
// Deltas are around 300000-500000 parrots.
// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
// visible gradient: top differences are larger than bottom ones.
#define timeNow clock2x32ARB
#endif
void main() {
const uvec2 time_begin = timeNow();
const time_t time_begin = timeNow();
#ifdef RAY_TRACE
const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.;
@ -79,19 +54,12 @@ void main() {
vec3 diffuse = vec3(0.), specular = vec3(0.);
computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular);
const uvec2 time_end = timeNow();
const time_t time_end = timeNow();
//const uint64_t time_diff = time_end - time_begin;
//const uint time_diff = time_begin.x - time_end.x;
#ifdef PROF_USE_REALTIME
const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32);
const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32);
const uint64_t time_diff = end64 - begin64;
const float time_diff_f = float(time_diff) / 1e5;
#else
const uint time_diff = time_begin.x - time_end.x;
const float time_diff_f = float(time_diff & 0xfffffu) / 1e6;
#endif
const uint time_diff = timeDelta(time_begin, time_end);
const float time_diff_f = float(time_diff) / 1e6;
const uint prof_index = pix.x + pix.y * ubo.ubo.res.x;
#if LIGHT_POINT
@ -106,5 +74,9 @@ void main() {
imageStore(out_light_poly_specular, pix, vec4(specular, 0.f));
//imageStore(out_light_poly_profile, pix, profile);
prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end);
prof_direct_poly.a[prof_index].data[1] = uvec4(timeDelta(time_begin, time_end), prof_sampling, prof_ray, 0);
prof_direct_poly.a[prof_index].data[2] = uvec4(prof_lights, prof_samples, prof_rays, 0);
//prof_direct_poly.a[prof_index].data[2] = uvec4(prof_ray_begin, prof_ray_end);
//prof_direct_poly.a[prof_index].data[3] = uvec4(prof_light_count, 0, 0, 0);
#endif
}

53
ref/vk/shaders/time.glsl Normal file
View File

@ -0,0 +1,53 @@
#ifndef TIME_GLSL_INCLUDED
#define TIME_GLSL_INCLUDED
//#define PROF_USE_REALTIME
#ifdef PROF_USE_REALTIME
#extension GL_ARB_gpu_shader_int64: enable
#extension GL_EXT_shader_realtime_clock: enable
#else
#extension GL_ARB_shader_clock: enable
#endif
#define time_t uvec2
#define T0 uvec2(0)
#define clockRealtime clockRealtime2x32EXT
#define clockShader clock2x32ARB
#define clockRealtime64 clockRealtimeEXT
#define clockShader64 clockARB
#ifdef PROF_USE_REALTIME
uint clockRealtimeDelta(time_t begin, time_t end) {
const uint64_t begin64 = begin.x | (uint64_t(begin.y) << 32);
const uint64_t end64 = end.x | (uint64_t(end.y) << 32);
const uint64_t time_diff = end64 - begin64;
return uint(time_diff);
}
#endif
uint clockShaderDelta(time_t begin, time_t end) {
// AMD RNDA2 SHADER_CYCLES reg is limited to 20 bits
return (end.x - begin.x) & 0xfffffu;
}
#ifdef PROF_USE_REALTIME
// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
// it does flicker a lot.
// Deltas are about 30000-100000 parrots
#define timeNow clockRealtime
#define timeDelta clockRealtimeDelta
#else
// clockARB doesn't give directly usable time values on mesa+amdgpu
// even deltas between them are not meaningful enough.
// On AMD clockARB() values are limited to lower 20 bits (see RDNA 2 ISA SHADER_CYCLES reg), and they wrap around a lot.
// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
// multiple times, rendering the value completely useless.
// Deltas are around 300000-500000 parrots.
// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
// visible gradient: top differences are larger than bottom ones.
#define timeNow clockShader
#define timeDelta clockShaderDelta
#endif
#endif //ifndef TIME_GLSL_INCLUDED

View File

@ -25,6 +25,7 @@
#include "xash3d_mathlib.h"
#include <string.h>
#include <stdlib.h>
#define LOG_MODULE rt
@ -600,16 +601,107 @@ fail:
R_VkMeatpipeDestroy(newpipe);
}
static int u32cmp(const void *l, const void *r) {
return *(const uint32_t*)l - *(const uint32_t*)r;
}
static void dumpProfilingData(int w, int h) {
uint64_t min = UINT64_MAX;
const struct ProfilingStruct *const prof = (const struct ProfilingStruct*)g_rtx.prof_direct_poly.mapped;
for (int i = 0; i < w * h; ++i) {
const struct ProfilingStruct *s = prof + i;
// TODO: time can wrap
const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
min = Q_min(begin, min);
}
int sampling_hist[20] = {0};
int ray_hist[20] = {0};
int total = 0;
uint32_t *all_samples = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
uint32_t *all_rays = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
FILE *f = fopen("gpuprof.dump", "w");
for (int y = 0; y < h; ++y) {
const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width;
for (int x = 0; x < w; ++x, ++s) {
const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32);
fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
#define TIME_READ(c, i) ((uint64_t)s->data[c][i*2+0] | ((uint64_t)(s->data[c][i*2+1]) << 32))
const unsigned long long begin = TIME_READ(0, 0);
const unsigned long long end = TIME_READ(0, 1);
const uint32_t delta = end - begin;
uint32_t shader_delta = s->data[1][0];
const uint32_t sampling_delta = s->data[1][1];
const uint32_t ray_delta = s->data[1][2];
const uint32_t lights = s->data[2][0];
const uint32_t samples = s->data[2][1];
const uint32_t rays = s->data[2][2];
while (shader_delta < sampling_delta || shader_delta < ray_delta) {
ERR("shader_delta wraparound detected, fixing");
shader_delta += 0x100000u;
}
/* if (shader_delta != delta) */
/* ERR("Shader and local deltas don't match: d=%d", shader_delta - delta); */
if (!shader_delta)
continue;
const float sampling_pct = sampling_delta * 100.f / shader_delta;
const float ray_pct = ray_delta * 100.f / shader_delta;
fprintf(f, "total=%d lights=%d sampling=%d(%.02f%%; %d) ray=%d(%.02f%%; %d)\n",
shader_delta, lights,
sampling_delta, sampling_pct, samples,
ray_delta, ray_pct, rays);
assert(sampling_pct < 100.);
sampling_hist[(int)(sampling_pct / 5.)]++;
assert(ray_pct < 100.);
ray_hist[(int)(ray_pct / 5.)]++;
all_rays[total] = ray_delta;
all_samples[total] = sampling_delta;
++total;
//fwrite(&begin, sizeof(begin), 1, f);
//fwrite(&end, sizeof(end), 1, f);
//fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
//fprintf(f, "%016llx %016llx\n", begin, end);
//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"B\",\"ts\":%llu,\"tid\":%d}\n", (begin - min), tid);
//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"E\",\"ts\":%llu,\"tid\":%d}\n", (end - min), tid);
}
}
for (int i = 0, s = 0, r = 0; i < 20; ++i) {
s += sampling_hist[i];
r += ray_hist[i];
fprintf(f, "%d-%d%%:\ts=%d(%d, %.02f%%)\tr=%d(%d, %.02f%%)\n",
i*5, i*5+5,
sampling_hist[i], s, s * 100.f / total,
ray_hist[i], r, r * 100.f / total);
}
qsort(all_rays, total, sizeof(all_rays[0]), u32cmp);
qsort(all_samples, total, sizeof(all_samples[0]), u32cmp);
fprintf(f, "percentiles:\n");
static const int pcts[] = {99, 95, 90, 75, 50};
for (int i = 0; i < COUNTOF(pcts); ++i) {
const int pct = pcts[i];
const int index = total * pct / 100;
fprintf(f, "%d%%: ray=%d sampling=%d\n", pct,
all_rays[index], all_samples[index]
);
}
Mem_Free(all_rays);
Mem_Free(all_samples);
fclose(f);
}