mirror of
https://github.com/w23/xash3d-fwgs
synced 2024-12-04 16:00:22 +01:00
vk: rt: add more shader clock timing experiments and results
See notes and stream E344 for data and comments
This commit is contained in:
parent
cbdbda3549
commit
6b51fde5b2
@ -1024,3 +1024,79 @@ This would also allow passing arbitrary per-pixel data from shaders, which would
|
||||
- Ray tracing time (Σ, +count)
|
||||
- Aggregate numbers:
|
||||
- TODO: what does VK_KHR_performance_query give us? Regs usage, etc.
|
||||
|
||||
# 2023-12-08 E344
|
||||
## Experiment one
|
||||
some save in test_brush2, default PROJECTED sampling
|
||||
0-5%: s=0(0, 0.00%) r=133495(133495, 13.18%)
|
||||
5-10%: s=0(0, 0.00%) r=204348(337843, 33.35%)
|
||||
10-15%: s=0(0, 0.00%) r=92673(430516, 42.49%)
|
||||
15-20%: s=0(0, 0.00%) r=25196(455712, 44.98%)
|
||||
20-25%: s=67(67, 0.01%) r=3529(459241, 45.33%)
|
||||
25-30%: s=253(320, 0.03%) r=12018(471259, 46.52%)
|
||||
30-35%: s=319(639, 0.06%) r=39805(511064, 50.45%)
|
||||
35-40%: s=2096(2735, 0.27%) r=178843(689907, 68.10%)
|
||||
40-45%: s=8753(11488, 1.13%) r=270099(960006, 94.76%)
|
||||
45-50%: s=53958(65446, 6.46%) r=44000(1004006, 99.10%)
|
||||
50-55%: s=275150(340596, 33.62%) r=8256(1012262, 99.92%)
|
||||
55-60%: s=183343(523939, 51.72%) r=838(1013100, 100.00%)
|
||||
60-65%: s=38922(562861, 55.56%) r=0(1013100, 100.00%)
|
||||
65-70%: s=6603(569464, 56.21%) r=0(1013100, 100.00%)
|
||||
70-75%: s=14101(583565, 57.60%) r=0(1013100, 100.00%)
|
||||
75-80%: s=85481(669046, 66.04%) r=0(1013100, 100.00%)
|
||||
80-85%: s=150723(819769, 80.92%) r=0(1013100, 100.00%)
|
||||
85-90%: s=107863(927632, 91.56%) r=0(1013100, 100.00%)
|
||||
90-95%: s=85468(1013100, 100.00%) r=0(1013100, 100.00%)
|
||||
95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
|
||||
|
||||
(shader clock) percentiles:
|
||||
99%: ray=393223 sampling=496965
|
||||
95%: ray=321478 sampling=365558
|
||||
90%: ray=276235 sampling=356515
|
||||
75%: ray=250132 sampling=349746
|
||||
50%: ray=184418 sampling=337416
|
||||
|
||||
PROJECTED + REALTIME clock:
|
||||
percentiles:
|
||||
99%: ray=45580 sampling=62860
|
||||
95%: ray=30324 sampling=60136
|
||||
90%: ray=19736 sampling=58868
|
||||
75%: ray=13688 sampling=48068
|
||||
50%: ray=10784 sampling=22500
|
||||
|
||||
## Two
|
||||
SOLID sampling
|
||||
0-5%: s=6(6, 0.00%) r=0(0, 0.00%)
|
||||
5-10%: s=3361(3367, 0.33%) r=0(0, 0.00%)
|
||||
10-15%: s=89901(93268, 9.21%) r=0(0, 0.00%)
|
||||
15-20%: s=681106(774374, 76.44%) r=0(0, 0.00%)
|
||||
20-25%: s=125185(899559, 88.79%) r=0(0, 0.00%)
|
||||
25-30%: s=111237(1010796, 99.77%) r=222(222, 0.02%)
|
||||
30-35%: s=2080(1012876, 99.98%) r=667(889, 0.09%)
|
||||
35-40%: s=224(1013100, 100.00%) r=621(1510, 0.15%)
|
||||
40-45%: s=0(1013100, 100.00%) r=1049(2559, 0.25%)
|
||||
45-50%: s=0(1013100, 100.00%) r=2071(4630, 0.46%)
|
||||
50-55%: s=0(1013100, 100.00%) r=1494(6124, 0.60%)
|
||||
55-60%: s=0(1013100, 100.00%) r=7099(13223, 1.31%)
|
||||
60-65%: s=0(1013100, 100.00%) r=80026(93249, 9.20%)
|
||||
65-70%: s=0(1013100, 100.00%) r=152210(245459, 24.23%)
|
||||
70-75%: s=0(1013100, 100.00%) r=466794(712253, 70.30%)
|
||||
75-80%: s=0(1013100, 100.00%) r=276878(989131, 97.63%)
|
||||
80-85%: s=0(1013100, 100.00%) r=22822(1011953, 99.89%)
|
||||
85-90%: s=0(1013100, 100.00%) r=1147(1013100, 100.00%)
|
||||
90-95%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
|
||||
95-100%: s=0(1013100, 100.00%) r=0(1013100, 100.00%)
|
||||
|
||||
(shader clock) percentiles:
|
||||
99%: ray=565588 sampling=105982
|
||||
95%: ray=422982 sampling=102555
|
||||
90%: ray=391055 sampling=100877
|
||||
75%: ray=350036 sampling=97007
|
||||
50%: ray=290830 sampling=76574
|
||||
|
||||
SOLID + REALTIME clock
|
||||
99%: ray=97056 sampling=20328
|
||||
95%: ray=78988 sampling=19532
|
||||
90%: ray=74236 sampling=19076
|
||||
75%: ray=51580 sampling=17512
|
||||
50%: ray=29268 sampling=6648
|
||||
|
@ -1,8 +1,17 @@
|
||||
# Next
|
||||
- [ ] performance query
|
||||
|
||||
# 2023-12-08 E344
|
||||
- [x] measure percentage of direct light shader phases:
|
||||
- [x] how long does sampling take -- ~~supposedly ALU-bound, need clockARB()?~~
|
||||
- [x] how long does ray tracing take -- ~~supposedly mem-bound, clockRealtimeEXT()?~~
|
||||
- [ ] try mapping shader realtime clock values to calibrated vk compute dispatch timestamps
|
||||
Similar to what RGP draws, but not quite the same. Not sure if this is valuable, but it would be neat
|
||||
|
||||
# 2023-12-07 E343
|
||||
- [x] extract raw shader clock
|
||||
- [ ] display times as scopes somewhere
|
||||
- [ ] extract sampling times
|
||||
- [ ] extract ray times
|
||||
- [-] display times as scopes somewhere
|
||||
→ tried chrome trace in ff profiler, it broke (200k scopes in 1k threads is too much)
|
||||
|
||||
# 2023-12-05 E342
|
||||
- [x] tone down the specular indirect blur
|
||||
|
@ -5,6 +5,10 @@
|
||||
|
||||
#include "noise.glsl"
|
||||
#include "utils.glsl"
|
||||
#include "time.glsl"
|
||||
|
||||
uint prof_sampling = 0, prof_ray = 0;
|
||||
uint prof_lights = 0, prof_samples = 0, prof_rays = 0;
|
||||
|
||||
#define DO_ALL_IN_CLUSTER 1
|
||||
|
||||
@ -232,9 +236,14 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
|
||||
|
||||
const float plane_dist = dot(poly.plane, vec4(P, 1.f));
|
||||
|
||||
++prof_lights;
|
||||
|
||||
if (plane_dist < 0.)
|
||||
continue;
|
||||
|
||||
++prof_samples;
|
||||
|
||||
const time_t prof_sampling_begin = timeNow();
|
||||
#ifdef PROJECTED
|
||||
const vec4 light_sample_dir = getPolygonLightSampleProjected(view_dir, ctx, poly);
|
||||
#elif defined(SOLID)
|
||||
@ -244,6 +253,7 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
|
||||
#else
|
||||
const vec4 light_sample_dir = getPolygonLightSampleSimple(P, view_dir, poly);
|
||||
#endif
|
||||
prof_sampling += timeDelta(prof_sampling_begin, timeNow());
|
||||
|
||||
if (light_sample_dir.w <= 0.)
|
||||
continue;
|
||||
@ -251,7 +261,12 @@ void sampleEmissiveSurfaces(vec3 P, vec3 N, vec3 throughput, vec3 view_dir, Mate
|
||||
const float dist = - plane_dist / dot(light_sample_dir.xyz, poly.plane.xyz);
|
||||
const vec3 emissive = poly.emissive;
|
||||
|
||||
if (!shadowed(P, light_sample_dir.xyz, dist)) {
|
||||
const time_t prof_ray_begin = timeNow();
|
||||
const bool shadow = shadowed(P, light_sample_dir.xyz, dist);
|
||||
prof_ray += timeDelta(prof_ray_begin, timeNow());
|
||||
++prof_rays;
|
||||
|
||||
if (!shadow) {
|
||||
//const float estimate = total_contrib;
|
||||
const float estimate = light_sample_dir.w;
|
||||
vec3 poly_diffuse = vec3(0.), poly_specular = vec3(0.);
|
||||
|
@ -1,14 +1,6 @@
|
||||
#define PROF_USE_REALTIME
|
||||
#ifdef PROF_USE_REALTIME
|
||||
#extension GL_ARB_gpu_shader_int64: enable
|
||||
#extension GL_EXT_shader_realtime_clock: enable
|
||||
#else
|
||||
#extension GL_ARB_shader_clock: enable
|
||||
#endif
|
||||
|
||||
|
||||
#include "utils.glsl"
|
||||
#include "noise.glsl"
|
||||
#include "time.glsl"
|
||||
|
||||
#include "ray_kusochki.glsl"
|
||||
|
||||
@ -22,25 +14,8 @@ void readNormals(ivec2 uv, out vec3 geometry_normal, out vec3 shading_normal) {
|
||||
shading_normal = normalDecode(n.zw);
|
||||
}
|
||||
|
||||
#ifdef PROF_USE_REALTIME
|
||||
// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
|
||||
// it does flicker a lot.
|
||||
// Deltas are about 30000-100000 parrots
|
||||
#define timeNow clockRealtime2x32EXT
|
||||
#else
|
||||
// clockARB doesn't give directly usable time values on mesa+amdgpu
|
||||
// even deltas between them are not meaningful enough.
|
||||
// On mesa+amdgpu clockARB() values are limited to lower 20bits, and they wrap around a lot.
|
||||
// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
|
||||
// multiple times, rendering the value completely useless.
|
||||
// Deltas are around 300000-500000 parrots.
|
||||
// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
|
||||
// visible gradient: top differences are larger than bottom ones.
|
||||
#define timeNow clock2x32ARB
|
||||
#endif
|
||||
|
||||
void main() {
|
||||
const uvec2 time_begin = timeNow();
|
||||
const time_t time_begin = timeNow();
|
||||
|
||||
#ifdef RAY_TRACE
|
||||
const vec2 uv = (gl_LaunchIDEXT.xy + .5) / gl_LaunchSizeEXT.xy * 2. - 1.;
|
||||
@ -79,19 +54,12 @@ void main() {
|
||||
vec3 diffuse = vec3(0.), specular = vec3(0.);
|
||||
computeLighting(pos + geometry_normal * .001, shading_normal, throughput, -direction, material, diffuse, specular);
|
||||
|
||||
const uvec2 time_end = timeNow();
|
||||
const time_t time_end = timeNow();
|
||||
//const uint64_t time_diff = time_end - time_begin;
|
||||
//const uint time_diff = time_begin.x - time_end.x;
|
||||
|
||||
#ifdef PROF_USE_REALTIME
|
||||
const uint64_t begin64 = time_begin.x | (uint64_t(time_begin.y) << 32);
|
||||
const uint64_t end64 = time_end.x | (uint64_t(time_end.y) << 32);
|
||||
const uint64_t time_diff = end64 - begin64;
|
||||
const float time_diff_f = float(time_diff) / 1e5;
|
||||
#else
|
||||
const uint time_diff = time_begin.x - time_end.x;
|
||||
const float time_diff_f = float(time_diff & 0xfffffu) / 1e6;
|
||||
#endif
|
||||
const uint time_diff = timeDelta(time_begin, time_end);
|
||||
const float time_diff_f = float(time_diff) / 1e6;
|
||||
|
||||
const uint prof_index = pix.x + pix.y * ubo.ubo.res.x;
|
||||
#if LIGHT_POINT
|
||||
@ -106,5 +74,9 @@ void main() {
|
||||
imageStore(out_light_poly_specular, pix, vec4(specular, 0.f));
|
||||
//imageStore(out_light_poly_profile, pix, profile);
|
||||
prof_direct_poly.a[prof_index].data[0] = uvec4(time_begin, time_end);
|
||||
prof_direct_poly.a[prof_index].data[1] = uvec4(timeDelta(time_begin, time_end), prof_sampling, prof_ray, 0);
|
||||
prof_direct_poly.a[prof_index].data[2] = uvec4(prof_lights, prof_samples, prof_rays, 0);
|
||||
//prof_direct_poly.a[prof_index].data[2] = uvec4(prof_ray_begin, prof_ray_end);
|
||||
//prof_direct_poly.a[prof_index].data[3] = uvec4(prof_light_count, 0, 0, 0);
|
||||
#endif
|
||||
}
|
||||
|
53
ref/vk/shaders/time.glsl
Normal file
53
ref/vk/shaders/time.glsl
Normal file
@ -0,0 +1,53 @@
|
||||
#ifndef TIME_GLSL_INCLUDED
|
||||
#define TIME_GLSL_INCLUDED
|
||||
|
||||
//#define PROF_USE_REALTIME
|
||||
#ifdef PROF_USE_REALTIME
|
||||
#extension GL_ARB_gpu_shader_int64: enable
|
||||
#extension GL_EXT_shader_realtime_clock: enable
|
||||
#else
|
||||
#extension GL_ARB_shader_clock: enable
|
||||
#endif
|
||||
|
||||
#define time_t uvec2
|
||||
#define T0 uvec2(0)
|
||||
#define clockRealtime clockRealtime2x32EXT
|
||||
#define clockShader clock2x32ARB
|
||||
|
||||
#define clockRealtime64 clockRealtimeEXT
|
||||
#define clockShader64 clockARB
|
||||
|
||||
#ifdef PROF_USE_REALTIME
|
||||
uint clockRealtimeDelta(time_t begin, time_t end) {
|
||||
const uint64_t begin64 = begin.x | (uint64_t(begin.y) << 32);
|
||||
const uint64_t end64 = end.x | (uint64_t(end.y) << 32);
|
||||
const uint64_t time_diff = end64 - begin64;
|
||||
return uint(time_diff);
|
||||
}
|
||||
#endif
|
||||
|
||||
uint clockShaderDelta(time_t begin, time_t end) {
|
||||
// AMD RNDA2 SHADER_CYCLES reg is limited to 20 bits
|
||||
return (end.x - begin.x) & 0xfffffu;
|
||||
}
|
||||
|
||||
#ifdef PROF_USE_REALTIME
|
||||
// On mesa+amdgpu there's a clear gradient: pixels on top of screen take 2-3x longer to compute than bottom ones. Also,
|
||||
// it does flicker a lot.
|
||||
// Deltas are about 30000-100000 parrots
|
||||
#define timeNow clockRealtime
|
||||
#define timeDelta clockRealtimeDelta
|
||||
#else
|
||||
// clockARB doesn't give directly usable time values on mesa+amdgpu
|
||||
// even deltas between them are not meaningful enough.
|
||||
// On AMD clockARB() values are limited to lower 20 bits (see RDNA 2 ISA SHADER_CYCLES reg), and they wrap around a lot.
|
||||
// Absolute difference value are often 30-50% of the available range, so it's not that far off from wrapping around
|
||||
// multiple times, rendering the value completely useless.
|
||||
// Deltas are around 300000-500000 parrots.
|
||||
// Other than that, the values seem uniform across the screen (as compared to realtime clock, which has a clearly
|
||||
// visible gradient: top differences are larger than bottom ones.
|
||||
#define timeNow clockShader
|
||||
#define timeDelta clockShaderDelta
|
||||
#endif
|
||||
|
||||
#endif //ifndef TIME_GLSL_INCLUDED
|
@ -25,6 +25,7 @@
|
||||
#include "xash3d_mathlib.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define LOG_MODULE rt
|
||||
|
||||
@ -600,16 +601,107 @@ fail:
|
||||
R_VkMeatpipeDestroy(newpipe);
|
||||
}
|
||||
|
||||
static int u32cmp(const void *l, const void *r) {
|
||||
return *(const uint32_t*)l - *(const uint32_t*)r;
|
||||
}
|
||||
|
||||
static void dumpProfilingData(int w, int h) {
|
||||
uint64_t min = UINT64_MAX;
|
||||
const struct ProfilingStruct *const prof = (const struct ProfilingStruct*)g_rtx.prof_direct_poly.mapped;
|
||||
for (int i = 0; i < w * h; ++i) {
|
||||
const struct ProfilingStruct *s = prof + i;
|
||||
// TODO: time can wrap
|
||||
const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
|
||||
min = Q_min(begin, min);
|
||||
}
|
||||
|
||||
int sampling_hist[20] = {0};
|
||||
int ray_hist[20] = {0};
|
||||
int total = 0;
|
||||
|
||||
uint32_t *all_samples = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
|
||||
uint32_t *all_rays = Mem_Calloc(vk_core.pool, w * h * sizeof(uint32_t));
|
||||
|
||||
FILE *f = fopen("gpuprof.dump", "w");
|
||||
for (int y = 0; y < h; ++y) {
|
||||
const struct ProfilingStruct *s = ((const struct ProfilingStruct*)(g_rtx.prof_direct_poly.mapped)) + y * g_rtx.max_frame_width;
|
||||
for (int x = 0; x < w; ++x, ++s) {
|
||||
const unsigned long long begin = s->data[0][0] | ((uint64_t)(s->data[0][1]) << 32);
|
||||
const unsigned long long end = s->data[0][2] | ((uint64_t)(s->data[0][3]) << 32);
|
||||
fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
|
||||
#define TIME_READ(c, i) ((uint64_t)s->data[c][i*2+0] | ((uint64_t)(s->data[c][i*2+1]) << 32))
|
||||
const unsigned long long begin = TIME_READ(0, 0);
|
||||
const unsigned long long end = TIME_READ(0, 1);
|
||||
const uint32_t delta = end - begin;
|
||||
|
||||
uint32_t shader_delta = s->data[1][0];
|
||||
const uint32_t sampling_delta = s->data[1][1];
|
||||
const uint32_t ray_delta = s->data[1][2];
|
||||
|
||||
const uint32_t lights = s->data[2][0];
|
||||
const uint32_t samples = s->data[2][1];
|
||||
const uint32_t rays = s->data[2][2];
|
||||
|
||||
while (shader_delta < sampling_delta || shader_delta < ray_delta) {
|
||||
ERR("shader_delta wraparound detected, fixing");
|
||||
shader_delta += 0x100000u;
|
||||
}
|
||||
|
||||
/* if (shader_delta != delta) */
|
||||
/* ERR("Shader and local deltas don't match: d=%d", shader_delta - delta); */
|
||||
|
||||
if (!shader_delta)
|
||||
continue;
|
||||
|
||||
const float sampling_pct = sampling_delta * 100.f / shader_delta;
|
||||
const float ray_pct = ray_delta * 100.f / shader_delta;
|
||||
|
||||
fprintf(f, "total=%d lights=%d sampling=%d(%.02f%%; %d) ray=%d(%.02f%%; %d)\n",
|
||||
shader_delta, lights,
|
||||
sampling_delta, sampling_pct, samples,
|
||||
ray_delta, ray_pct, rays);
|
||||
|
||||
assert(sampling_pct < 100.);
|
||||
sampling_hist[(int)(sampling_pct / 5.)]++;
|
||||
|
||||
assert(ray_pct < 100.);
|
||||
ray_hist[(int)(ray_pct / 5.)]++;
|
||||
|
||||
all_rays[total] = ray_delta;
|
||||
all_samples[total] = sampling_delta;
|
||||
++total;
|
||||
|
||||
//fwrite(&begin, sizeof(begin), 1, f);
|
||||
//fwrite(&end, sizeof(end), 1, f);
|
||||
//fprintf(f, "x=%d y=%d %016llx %016llx %llu %llu d=%llu\n", x, y, begin, end, begin, end, end - begin);
|
||||
//fprintf(f, "%016llx %016llx\n", begin, end);
|
||||
//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"B\",\"ts\":%llu,\"tid\":%d}\n", (begin - min), tid);
|
||||
//fprintf(f, "{\"name\":\"dpoly\",\"cat\":\"shader\",\"ph\":\"E\",\"ts\":%llu,\"tid\":%d}\n", (end - min), tid);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0, s = 0, r = 0; i < 20; ++i) {
|
||||
s += sampling_hist[i];
|
||||
r += ray_hist[i];
|
||||
fprintf(f, "%d-%d%%:\ts=%d(%d, %.02f%%)\tr=%d(%d, %.02f%%)\n",
|
||||
i*5, i*5+5,
|
||||
sampling_hist[i], s, s * 100.f / total,
|
||||
ray_hist[i], r, r * 100.f / total);
|
||||
}
|
||||
|
||||
qsort(all_rays, total, sizeof(all_rays[0]), u32cmp);
|
||||
qsort(all_samples, total, sizeof(all_samples[0]), u32cmp);
|
||||
|
||||
fprintf(f, "percentiles:\n");
|
||||
static const int pcts[] = {99, 95, 90, 75, 50};
|
||||
for (int i = 0; i < COUNTOF(pcts); ++i) {
|
||||
const int pct = pcts[i];
|
||||
const int index = total * pct / 100;
|
||||
fprintf(f, "%d%%: ray=%d sampling=%d\n", pct,
|
||||
all_rays[index], all_samples[index]
|
||||
);
|
||||
}
|
||||
|
||||
Mem_Free(all_rays);
|
||||
Mem_Free(all_samples);
|
||||
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user