vk: rt: add more shader clock timing experiments and results

See notes and stream E344 for data and comments
2024-12-04 16:00:22 +01:00 · 2023-12-08 13:20:12 -05:00 · 2023-12-08 13:20:12 -05:00 · 6b51fde5b2
commit 6b51fde5b2
parent cbdbda3549
6 changed files with 261 additions and 44 deletions
--- a/ref/vk/NOTES.md
+++ b/ref/vk/NOTES.md
@ -1024,3 +1024,79 @@ This would also allow passing arbitrary per-pixel data from shaders, which would
 	- Ray tracing time (Σ, +count)
 - Aggregate numbers:
 	- TODO: what does VK_KHR_performance_query give us? Regs usage, etc.
+
+# 2023-12-08 E344
+## Experiment one
+some save in test_brush2, default PROJECTED sampling
+	0-5%:	s=0(0, 0.00%)	r=133495(133495, 13.18%)
+	5-10%:	s=0(0, 0.00%)	r=204348(337843, 33.35%)
+	10-15%:	s=0(0, 0.00%)	r=92673(430516, 42.49%)
+	15-20%:	s=0(0, 0.00%)	r=25196(455712, 44.98%)
+	20-25%:	s=67(67, 0.01%)	r=3529(459241, 45.33%)
+	25-30%:	s=253(320, 0.03%)	r=12018(471259, 46.52%)
+	30-35%:	s=319(639, 0.06%)	r=39805(511064, 50.45%)
+	35-40%:	s=2096(2735, 0.27%)	r=178843(689907, 68.10%)
+	40-45%:	s=8753(11488, 1.13%)	r=270099(960006, 94.76%)
+	45-50%:	s=53958(65446, 6.46%)	r=44000(1004006, 99.10%)
+	50-55%:	s=275150(340596, 33.62%)	r=8256(1012262, 99.92%)
+	55-60%:	s=183343(523939, 51.72%)	r=838(1013100, 100.00%)
+	60-65%:	s=38922(562861, 55.56%)	r=0(1013100, 100.00%)
+	65-70%:	s=6603(569464, 56.21%)	r=0(1013100, 100.00%)
+	70-75%:	s=14101(583565, 57.60%)	r=0(1013100, 100.00%)
+	75-80%:	s=85481(669046, 66.04%)	r=0(1013100, 100.00%)
+	80-85%:	s=150723(819769, 80.92%)	r=0(1013100, 100.00%)
+	85-90%:	s=107863(927632, 91.56%)	r=0(1013100, 100.00%)
+	90-95%:	s=85468(1013100, 100.00%)	r=0(1013100, 100.00%)
+	95-100%:	s=0(1013100, 100.00%)	r=0(1013100, 100.00%)
+
+(shader clock) percentiles:
+99%: ray=393223 sampling=496965
+95%: ray=321478 sampling=365558
+90%: ray=276235 sampling=356515
+75%: ray=250132 sampling=349746
+50%: ray=184418 sampling=337416
+
+PROJECTED + REALTIME clock:
+percentiles:
+99%: ray=45580 sampling=62860
+95%: ray=30324 sampling=60136
+90%: ray=19736 sampling=58868
+75%: ray=13688 sampling=48068
+50%: ray=10784 sampling=22500
+
+## Two
+SOLID sampling
+	0-5%:	s=6(6, 0.00%)	r=0(0, 0.00%)
+	5-10%:	s=3361(3367, 0.33%)	r=0(0, 0.00%)
+	10-15%:	s=89901(93268, 9.21%)	r=0(0, 0.00%)
+	15-20%:	s=681106(774374, 76.44%)	r=0(0, 0.00%)
+	20-25%:	s=125185(899559, 88.79%)	r=0(0, 0.00%)
+	25-30%:	s=111237(1010796, 99.77%)	r=222(222, 0.02%)
+	30-35%:	s=2080(1012876, 99.98%)	r=667(889, 0.09%)
+	35-40%:	s=224(1013100, 100.00%)	r=621(1510, 0.15%)
+	40-45%:	s=0(1013100, 100.00%)	r=1049(2559, 0.25%)
+	45-50%:	s=0(1013100, 100.00%)	r=2071(4630, 0.46%)
+	50-55%:	s=0(1013100, 100.00%)	r=1494(6124, 0.60%)
+	55-60%:	s=0(1013100, 100.00%)	r=7099(13223, 1.31%)
+	60-65%:	s=0(1013100, 100.00%)	r=80026(93249, 9.20%)
+	65-70%:	s=0(1013100, 100.00%)	r=152210(245459, 24.23%)
+	70-75%:	s=0(1013100, 100.00%)	r=466794(712253, 70.30%)
+	75-80%:	s=0(1013100, 100.00%)	r=276878(989131, 97.63%)
+	80-85%:	s=0(1013100, 100.00%)	r=22822(1011953, 99.89%)
+	85-90%:	s=0(1013100, 100.00%)	r=1147(1013100, 100.00%)
+	90-95%:	s=0(1013100, 100.00%)	r=0(1013100, 100.00%)
+	95-100%:	s=0(1013100, 100.00%)	r=0(1013100, 100.00%)
+
+(shader clock) percentiles:
+99%: ray=565588 sampling=105982
+95%: ray=422982 sampling=102555
+90%: ray=391055 sampling=100877
+75%: ray=350036 sampling=97007
+50%: ray=290830 sampling=76574
+
+SOLID + REALTIME clock
+99%: ray=97056 sampling=20328
+95%: ray=78988 sampling=19532
+90%: ray=74236 sampling=19076
+75%: ray=51580 sampling=17512
+50%: ray=29268 sampling=6648
--- a/ref/vk/TODO.md
+++ b/ref/vk/TODO.md
@ -1,8 +1,17 @@
+# Next
+- [ ] performance query
+
+# 2023-12-08 E344
+- [x] measure percentage of direct light shader phases:
+	- [x] how long does sampling take -- ~~supposedly ALU-bound, need clockARB()?~~
+	- [x] how long does ray tracing take -- ~~supposedly mem-bound, clockRealtimeEXT()?~~
+- [ ] try mapping shader realtime clock values to calibrated vk compute dispatch timestamps
+	Similar to what RGP draws, but not quite the same. Not sure if this is valuable, but it would be neat
+
 # 2023-12-07 E343
 - [x] extract raw shader clock
- [ ] display times as scopes somewhere
- [ ] extract sampling times
- [ ] extract ray times
+- [-] display times as scopes somewhere
+	→ tried chrome trace in ff profiler, it broke (200k scopes in 1k threads is too much)

 # 2023-12-05 E342
 - [x] tone down the specular indirect blur
--- a/ref/vk/shaders/light_polygon.glsl
+++ b/ref/vk/shaders/light_polygon.glsl
@ -5,6 +5,10 @@

 #include "noise.glsl"
 #include "utils.glsl"
+#include "time.glsl"
+
+uint prof_sampling = 0, prof_ray = 0;
+uint prof_lights = 0, prof_samples = 0, prof_rays = 0;

 #define DO_ALL_IN_CLUSTER 1

@ -232,9 +236,14 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate

 		const float plane_dist = dot(poly.plane, vec4(P, 1.f));

+		++prof_lights;
+
 		if (plane_dist < 0.)
 			continue;

+		++prof_samples;
+
+		const time_t prof_sampling_begin = timeNow();
 #ifdef PROJECTED
 		const vec4 light_sample_dir = getPolygonLightSampleProjected(view_dir, ctx, poly);
 #elif defined(SOLID)
@ -244,6 +253,7 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
 #else
 		const vec4 light_sample_dir = getPolygonLightSampleSimple(P, view_dir, poly);
 #endif
+		prof_sampling += timeDelta(prof_sampling_begin, timeNow());

 		if (light_sample_dir.w <= 0.)
 			continue;
@ -251,7 +261,12 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
 		const float dist = - plane_dist / dot(light_sample_dir.xyz, poly.plane.xyz);
 		const vec3 emissive = poly.emissive;

-		if (!shadowed(P, light_sample_dir.xyz, dist)) {
+		const time_t prof_ray_begin = timeNow();
+		const bool shadow = shadowed(P, light_sample_dir.xyz, dist);
+		prof_ray += timeDelta(prof_ray_begin, timeNow());
+		++prof_rays;
+
+		if (!shadow) {
 			//const float estimate = total_contrib;
 			const float estimate = light_sample_dir.w;
 			vec3 poly_diffuse = vec3(0.), poly_specular = vec3(0.);
--- a/ref/vk/shaders/ray_light_direct.glsl
+++ b/ref/vk/shaders/ray_light_direct.glsl
@ -1,14 +1,6 @@
-#define PROF_USE_REALTIME
-#ifdef PROF_USE_REALTIME
-#extension GL_ARB_gpu_shader_int64: enable
-#extension GL_EXT_shader_realtime_clock: enable
-#else
-#extension GL_ARB_shader_clock: enable
-#endif
-
-
 #include "utils.glsl"
 #include "noise.glsl"
+#include "time.glsl"

 #include "ray_kusochki.glsl"

@ -22,25 +14,8 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) {
 	shading_normal = normalDecode(n.zw);
 }

-#ifdef PROF_USE_REALTIME
-// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
-// it does flicker a lot.
-// Deltas are about 30000-100000 parrots
-#define timeNow clockRealtime2x32EXT
-#else
-// clockARB doesn't give directly usable time values on mesa+amdgpu
-// even deltas between them are not meaningful enough.
-// On mesa+amdgpu clockARB() values are limited to lower 20bits, and they wrap around a lot.
-// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
-// multiple times, rendering the value completely useless.
-// Deltas are around 300000-500000 parrots.
-// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
-// visible gradient: top differences are larger than bottom ones.
-#define timeNow clock2x32ARB
-#endif
-
 void main() {
-	const uvec2 time_begin = timeNow();
+	const time_t time_begin = timeNow();

 #ifdef RAY_TRACE
 	const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.;
@ -79,19 +54,12 @@ void main() {
 	vec3 diffuse = vec3(0.), specular = vec3(0.);
 	computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular);

-	const uvec2 time_end = timeNow();
+	const time_t time_end = timeNow();
 	//const uint64_t time_diff = time_end - time_begin;
 	//const uint time_diff = time_begin.x - time_end.x;

-#ifdef PROF_USE_REALTIME
-	const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32);
-	const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32);
-	const uint64_t time_diff = end64 - begin64;
-	const float time_diff_f = float(time_diff) / 1e5;
-#else
-	const uint time_diff = time_begin.x - time_end.x;
-	const float time_diff_f = float(time_diff & 0xfffffu) / 1e6;
-#endif
+	const uint time_diff = timeDelta(time_begin, time_end);
+	const float time_diff_f = float(time_diff) / 1e6;

 	const uint prof_index = pix.x + pix.y * ubo.ubo.res.x;
 #if LIGHT_POINT
@ -106,5 +74,9 @@ void main() {
 	imageStore(out_light_poly_specular, pix, vec4(specular, 0.f));
 	//imageStore(out_light_poly_profile, pix, profile);
 	prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end);
+	prof_direct_poly.a[prof_index].data[1] = uvec4(timeDelta(time_begin, time_end), prof_sampling, prof_ray, 0);
+	prof_direct_poly.a[prof_index].data[2] = uvec4(prof_lights, prof_samples, prof_rays, 0);
+	//prof_direct_poly.a[prof_index].data[2] = uvec4(prof_ray_begin, prof_ray_end);
+	//prof_direct_poly.a[prof_index].data[3] = uvec4(prof_light_count, 0, 0, 0);
 #endif
 }
--- a/ref/vk/shaders/time.glsl
+++ b/ref/vk/shaders/time.glsl
@ -0,0 +1,53 @@
+#ifndef TIME_GLSL_INCLUDED
+#define TIME_GLSL_INCLUDED
+
+//#define PROF_USE_REALTIME
+#ifdef PROF_USE_REALTIME
+#extension GL_ARB_gpu_shader_int64: enable
+#extension GL_EXT_shader_realtime_clock: enable
+#else
+#extension GL_ARB_shader_clock: enable
+#endif
+
+#define time_t uvec2
+#define T0 uvec2(0)
+#define clockRealtime clockRealtime2x32EXT
+#define clockShader clock2x32ARB
+
+#define clockRealtime64 clockRealtimeEXT
+#define clockShader64 clockARB
+
+#ifdef PROF_USE_REALTIME
+uint clockRealtimeDelta(time_t begin, time_t end) {
+	const uint64_t begin64 = begin.x | (uint64_t(begin.y) << 32);
+	const uint64_t end64 = end.x | (uint64_t(end.y) << 32);
+	const uint64_t time_diff = end64 - begin64;
+	return uint(time_diff);
+}
+#endif
+
+uint clockShaderDelta(time_t begin, time_t end) {
+	// AMD RNDA2 SHADER_CYCLES reg is limited to 20 bits
+	return (end.x - begin.x) & 0xfffffu;
+}
+
+#ifdef PROF_USE_REALTIME
+// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
+// it does flicker a lot.
+// Deltas are about 30000-100000 parrots
+#define timeNow clockRealtime
+#define timeDelta clockRealtimeDelta
+#else
+// clockARB doesn't give directly usable time values on mesa+amdgpu
+// even deltas between them are not meaningful enough.
+// On AMD clockARB() values are limited to lower 20 bits (see RDNA 2 ISA SHADER_CYCLES reg), and they wrap around a lot.
+// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
+// multiple times, rendering the value completely useless.
+// Deltas are around 300000-500000 parrots.
+// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
+// visible gradient: top differences are larger than bottom ones.
+#define timeNow clockShader
+#define timeDelta clockShaderDelta
+#endif
+
+#endif //ifndef TIME_GLSL_INCLUDED
--- a/ref/vk/vk_rtx.c
+++ b/ref/vk/vk_rtx.c
@ -25,6 +25,7 @@
 #include "xash3d_mathlib.h"

 #include <string.h>
+#include <stdlib.h>

 #define LOG_MODULE rt

@ -600,16 +601,107 @@ fail:
 	R_VkMeatpipeDestroy(newpipe);
 }

+static int u32cmp(const void *l, const void *r) {
+	return *(const uint32_t*)l - *(const uint32_t*)r;
+}
+
 static void dumpProfilingData(int w, int h) {
+	uint64_t min = UINT64_MAX;
+	const struct ProfilingStruct *const prof = (const struct ProfilingStruct*)g_rtx.prof_direct_poly.mapped;
+	for (int i = 0; i < w * h; ++i) {
+		const struct ProfilingStruct *s = prof + i;
+		// TODO: time can wrap
+		const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
+		min = Q_min(begin, min);
+	}
+
+	int sampling_hist[20] = {0};
+	int ray_hist[20] = {0};
+	int total = 0;
+
+	uint32_t *all_samples = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
+	uint32_t *all_rays = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
+
 	FILE *f = fopen("gpuprof.dump", "w");
 	for (int y = 0; y < h; ++y) {
 		const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width;
 		for (int x = 0; x < w; ++x, ++s) {
-			const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
-			const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32);
-			fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
+#define TIME_READ(c, i) ((uint64_t)s->data[c][i*2+0] | ((uint64_t)(s->data[c][i*2+1]) << 32))
+			const unsigned long long begin = TIME_READ(0, 0);
+			const unsigned long long end = TIME_READ(0, 1);
+			const uint32_t delta = end - begin;
+
+			uint32_t shader_delta = s->data[1][0];
+			const uint32_t sampling_delta = s->data[1][1];
+			const uint32_t ray_delta = s->data[1][2];
+
+			const uint32_t lights = s->data[2][0];
+			const uint32_t samples = s->data[2][1];
+			const uint32_t rays = s->data[2][2];
+
+			while (shader_delta < sampling_delta || shader_delta < ray_delta) {
+				ERR("shader_delta wraparound detected, fixing");
+				shader_delta += 0x100000u;
+			}
+
+			/* if (shader_delta != delta) */
+			/* 	ERR("Shader and local deltas don't match: d=%d", shader_delta - delta); */
+
+			if (!shader_delta)
+				continue;
+
+			const float sampling_pct = sampling_delta * 100.f / shader_delta;
+			const float ray_pct = ray_delta * 100.f / shader_delta;
+
+			fprintf(f, "total=%d lights=%d sampling=%d(%.02f%%; %d) ray=%d(%.02f%%; %d)\n",
+				shader_delta, lights,
+				sampling_delta, sampling_pct, samples,
+				ray_delta, ray_pct, rays);
+
+			assert(sampling_pct < 100.);
+			sampling_hist[(int)(sampling_pct / 5.)]++;
+
+			assert(ray_pct < 100.);
+			ray_hist[(int)(ray_pct / 5.)]++;
+
+			all_rays[total] = ray_delta;
+			all_samples[total] = sampling_delta;
+			++total;
+
+			//fwrite(&begin, sizeof(begin), 1, f);
+			//fwrite(&end, sizeof(end), 1, f);
+			//fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
+			//fprintf(f, "%016llx %016llx\n", begin, end);
+			//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"B\",\"ts\":%llu,\"tid\":%d}\n", (begin - min), tid);
+			//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"E\",\"ts\":%llu,\"tid\":%d}\n", (end - min), tid);
 		}
 	}
+
+	for (int i = 0, s = 0, r = 0; i < 20; ++i) {
+		s += sampling_hist[i];
+		r += ray_hist[i];
+		fprintf(f, "%d-%d%%:\ts=%d(%d, %.02f%%)\tr=%d(%d, %.02f%%)\n",
+			i*5, i*5+5,
+			sampling_hist[i], s, s * 100.f / total,
+			ray_hist[i], r, r * 100.f / total);
+	}
+
+	qsort(all_rays, total, sizeof(all_rays[0]), u32cmp);
+	qsort(all_samples, total, sizeof(all_samples[0]), u32cmp);
+
+	fprintf(f, "percentiles:\n");
+	static const int pcts[] = {99, 95, 90, 75, 50};
+	for (int i = 0; i < COUNTOF(pcts); ++i) {
+		const int pct = pcts[i];
+		const int index = total * pct / 100;
+		fprintf(f, "%d%%: ray=%d sampling=%d\n", pct,
+			all_rays[index], all_samples[index]
+		);
+	}
+
+	Mem_Free(all_rays);
+	Mem_Free(all_samples);
+
 	fclose(f);
 }