lccrt/tools/lccrt_s/src/lccrt_vec.c

219 lines
5.8 KiB
C
Raw Blame History

/**
* Part of the Lccrt Project, under the Apache License v2.0
* See http://www.apache.org/licenses/LICENSE-2.0.txt for license information.
* SPDX-License-Identifier: Apache-2.0
*/
#include "lccrt_s.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/**
 * Pick element k from the concatenation of vectors x and y, where n is the
 * split point: indices below n read x.a[k], all other indices read
 * y.a[k - n].
 * The arguments n and k are evaluated more than once, and k is not
 * range-checked.
 */
#define __lccrt_shuffle_get( n, k, x, y) (((k) < (n)) ? (x).a[k] : (y).a[(k) - (n)])
/**
 * Define the function __lccopt_shuffle_<suffix>( x, y, c).
 * The generated function fills len elements of the result: element i is
 * taken, by index c.a[i], from the concatenation of x and y via
 * __lccrt_shuffle_get.
 *   suffix   - name suffix of the generated function;
 *   arg_type - type of both data operands and of the result;
 *   ind_type - type of the index vector c;
 *   len      - number of result elements and split point between x and y.
 * NOTE(review): indices are not range-checked; presumably callers guarantee
 * 0 <= c.a[i] < 2*len - confirm.
 */
#define __lccopt_vec_shuffle( suffix, arg_type, ind_type, len) \
arg_type \
__lccopt_shuffle_##suffix( arg_type x, arg_type y, ind_type c) \
{ \
int i; \
arg_type r; \
\
for ( i = 0; i < len; ++i ) \
{ \
r.a[i] = __lccrt_shuffle_get( len, c.a[i], x, y); \
} \
\
return (r); \
} /* __lccopt_shuffle_##suffix */
/**
 * Map a bit width to the size in bytes of the smallest power-of-two sized
 * container (1, 2, 4, 8 or 16 bytes) that can hold it.
 * Widths of 8 bits or less (including non-positive values) map to 1.
 * Widths above 128 bits trigger an assertion; if assertions are disabled
 * the result is 0.
 */
static int __lccrt_bitwidth_bytesize( int bitsize) {
    int nbytes;

    /* Try container sizes 1, 2, 4, 8, 16 and stop at the first that fits. */
    for ( nbytes = 1; nbytes <= 16; nbytes *= 2 ) {
        if ( bitsize <= 8 * nbytes ) {
            return (nbytes);
        }
    }

    assert( 0);

    return (0);
}
/**
 * Store the low bytesize bytes of v at address dst, using the host
 * representation of the unsigned integer type of that size.
 *
 * dst      - destination address; no particular alignment is required;
 * v        - value to store; it is truncated to bytesize bytes;
 * bytesize - number of bytes to write: 1, 2, 4 or 8. Any other value
 *            triggers an assertion and nothing is written.
 */
void
__lccrt_store_bytes( void *dst, uint64_t v, int64_t bytesize) {
    /* memcpy is used instead of a store through a cast pointer so that the
       write is valid for any alignment and any effective type of *dst. */
    if ( bytesize == 1 ) {
        uint8_t v8 = (uint8_t)v;
        memcpy( dst, &v8, sizeof( v8));
    } else if ( bytesize == 2 ) {
        uint16_t v16 = (uint16_t)v;
        memcpy( dst, &v16, sizeof( v16));
    } else if ( bytesize == 4 ) {
        uint32_t v32 = (uint32_t)v;
        memcpy( dst, &v32, sizeof( v32));
    } else if ( bytesize == 8 ) {
        memcpy( dst, &v, sizeof( v));
    } else {
        assert( 0);
    }
}
/**
 * Load an unsigned integer of bytesize bytes from address src and return it
 * zero-extended to 64 bits.
 *
 * src      - source address; no particular alignment is required;
 * bytesize - number of bytes to read: 1, 2, 4 or 8. Any other value
 *            triggers an assertion; if assertions are disabled 0 is returned.
 */
uint64_t
__lccrt_load_bytes( void *src, int64_t bytesize) {
    uint64_t r = 0;

    /* memcpy is used instead of a load through a cast pointer so that the
       read is valid for any alignment and any effective type of *src. */
    if ( bytesize == 1 ) {
        uint8_t v8;
        memcpy( &v8, src, sizeof( v8));
        r = v8;
    } else if ( bytesize == 2 ) {
        uint16_t v16;
        memcpy( &v16, src, sizeof( v16));
        r = v16;
    } else if ( bytesize == 4 ) {
        uint32_t v32;
        memcpy( &v32, src, sizeof( v32));
        r = v32;
    } else if ( bytesize == 8 ) {
        memcpy( &r, src, sizeof( r));
    } else {
        assert( 0);
    }

    return (r);
}
/*
 * Instantiate __lccopt_shuffle_<suffix> for each vector shape listed below.
 * Every instantiation passes __lccrt_vec_si( 32, N) as the index vector
 * type, with N equal to the len argument.
 * NOTE(review): the last suffix, "v32v32i8", uses the same operand types as
 * "v32i8"; presumably a deliberate alias - confirm before renaming.
 */
__lccopt_vec_shuffle( v16i8, __lccrt_vec_si( 8, 16), __lccrt_vec_si( 32, 16), 16)
__lccopt_vec_shuffle( v8i16, __lccrt_vec_si( 16, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4i32, __lccrt_vec_si( 32, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v2i64, __lccrt_vec_si( 64, 2), __lccrt_vec_si( 32, 2), 2)
__lccopt_vec_shuffle( v32i8, __lccrt_vec_si( 8, 32), __lccrt_vec_si( 32, 32), 32)
__lccopt_vec_shuffle( v16i16, __lccrt_vec_si( 16, 16), __lccrt_vec_si( 32, 16), 16)
__lccopt_vec_shuffle( v8i32, __lccrt_vec_si( 32, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4i64, __lccrt_vec_si( 64, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v4f32, __lccrt_vec_f( 32, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v2f64, __lccrt_vec_f( 64, 2), __lccrt_vec_si( 32, 2), 2)
__lccopt_vec_shuffle( v8f32, __lccrt_vec_f( 32, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4f64, __lccrt_vec_f( 64, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v32v32i8, __lccrt_vec_si( 8, 32), __lccrt_vec_si( 32, 32), 32)
/**
 * Element-wise product of two four-element integer vectors:
 * r.a[i] = a.a[i] * b.a[i] for i = 0..3.
 *
 * The multiplication is carried out in uint32_t so that a product that does
 * not fit in 32 bits wraps modulo 2^32 instead of being a signed overflow
 * (undefined behavior in C). Results that do fit are unchanged.
 * NOTE(review): assumes the elements of __lccrt_vec_si( 32, 4) are 32-bit
 * signed integers - confirm against lccrt_s.h.
 */
__lccrt_vec_si( 32, 4)
__lccrt_mul_v4i32( __lccrt_vec_si( 32, 4) a, __lccrt_vec_si( 32, 4) b)
{
    __lccrt_vec_si( 32, 4) r;

    for ( int i = 0; i < 4; ++i )
    {
        r.a[i] = (int32_t)((uint32_t)a.a[i] * (uint32_t)b.a[i]);
    }

    return (r);
}
/**
 * Element-wise quotient of two four-element floating-point vectors:
 * result.a[k] = a.a[k] / b.a[k] for k = 0..3.
 */
__lccrt_vec_f( 32, 4)
__lccrt_fdiv_v4f32( __lccrt_vec_f( 32, 4) a, __lccrt_vec_f( 32, 4) b)
{
    __lccrt_vec_f( 32, 4) quot;
    int lane = 0;

    while ( lane < 4 )
    {
        quot.a[lane] = a.a[lane] / b.a[lane];
        ++lane;
    }

    return (quot);
}
/**
 * Lane-wise select over four lanes of integer vectors: where the condition
 * lane a.a[k] is non-zero the result takes b.a[k], otherwise c.a[k].
 */
__lccrt_vec_si( 32, 4)
__lccrt_select_v4i32_t( __lccrt_vec_si( 32, 4) a,
                        __lccrt_vec_si( 32, 4) b,
                        __lccrt_vec_si( 32, 4) c)
{
    __lccrt_vec_si( 32, 4) picked;
    int k;

    for ( k = 0; k != 4; k++ )
    {
        if ( a.a[k] )
        {
            picked.a[k] = b.a[k];
        } else
        {
            picked.a[k] = c.a[k];
        }
    }

    return (picked);
}
/**
 * Lane-wise select over four lanes of floating-point vectors, driven by an
 * integer condition vector: where a.a[k] is non-zero the result takes
 * b.a[k], otherwise c.a[k].
 */
__lccrt_vec_f( 32, 4)
__lccrt_select_v4f32_t( __lccrt_vec_si( 32, 4) a,
                        __lccrt_vec_f( 32, 4) b,
                        __lccrt_vec_f( 32, 4) c)
{
    __lccrt_vec_f( 32, 4) picked;
    int k;

    for ( k = 0; k != 4; k++ )
    {
        if ( a.a[k] )
        {
            picked.a[k] = b.a[k];
        } else
        {
            picked.a[k] = c.a[k];
        }
    }

    return (picked);
}
/**
 * Element-wise square root of a four-element floating-point vector, computed
 * with __builtin_sqrtf on each element.
 */
__lccrt_vec_f( 32, 4)
__lccrt_sqrt_v4f32( __lccrt_vec_f( 32, 4) a)
{
    __lccrt_vec_f( 32, 4) roots;
    int lane = 0;

    while ( lane < 4 )
    {
        roots.a[lane] = __builtin_sqrtf( a.a[lane]);
        ++lane;
    }

    return (roots);
}
/**
 * Pack the elements of an integer vector into a single bit-packed integer.
 * The total bit width of the vector must not exceed 64.
 * Example: <3 x i6> -> i18.
 *
 * dst         - receives the packed value, written as one integer of
 *               __lccrt_bitwidth_bytesize( veclen*elembitsize) bytes;
 * src         - vector elements, each occupying
 *               __lccrt_bitwidth_bytesize( elembitsize) bytes;
 * veclen      - number of elements (>= 0);
 * elembitsize - number of significant bits per element (> 0).
 *
 * Element 0 ends up in the least significant bits of the result; bits of a
 * source element above elembitsize are ignored.
 */
void
__lccrt_vecbitpack( void *dst, void *src, int64_t veclen, int64_t elembitsize) {
    uint64_t r = 0;
    int ebytes;
    int maskshift;
    int totalbitsize;

    /* The limit is checked by division so that veclen*elembitsize cannot
       overflow or be truncated before it is compared with 64. */
    assert( (0 <= veclen) && (0 < elembitsize) && (veclen <= 64 / elembitsize));

    ebytes = __lccrt_bitwidth_bytesize( elembitsize);
    maskshift = 64 - elembitsize;
    totalbitsize = veclen*elembitsize;

    /* Walk from the last element down so that element 0 lands in the low bits. */
    for ( int i = veclen - 1; i >= 0; --i ) {
        uint64_t si = __lccrt_load_bytes( (char *)src + i*ebytes, ebytes);

        /* Clear the bits above elembitsize. */
        si = (si << maskshift) >> maskshift;
        /* A 64-bit element fills the whole result; shifting a 64-bit value
           by 64 would be undefined, so that case is handled explicitly. */
        r = ((elembitsize < 64) ? (r << elembitsize) : 0) | si;
    }

    __lccrt_store_bytes( dst, r, __lccrt_bitwidth_bytesize( totalbitsize));

    return;
}
/**
 * Unpack a bit-packed integer into the elements of an integer vector.
 * The total bit width of the vector must not exceed 64.
 * Example: i18 -> <3 x i6>.
 *
 * dst         - receives the vector elements, each written as an integer of
 *               __lccrt_bitwidth_bytesize( elembitsize) bytes;
 * src         - packed value, read as one integer of
 *               __lccrt_bitwidth_bytesize( veclen*elembitsize) bytes;
 * veclen      - number of elements (>= 0);
 * elembitsize - number of significant bits per element (> 0).
 *
 * Element 0 is taken from the least significant bits of the packed value;
 * each stored element is zero-extended to its container size.
 */
void
__lccrt_vecbitunpack( void *dst, void *src, int64_t veclen, int64_t elembitsize) {
    int ebytes;
    int maskshift;
    uint64_t v;

    /* Validate before touching memory. The limit is checked by division so
       that veclen*elembitsize cannot overflow or be truncated. */
    assert( (0 <= veclen) && (0 < elembitsize) && (veclen <= 64 / elembitsize));

    ebytes = __lccrt_bitwidth_bytesize( elembitsize);
    maskshift = 64 - elembitsize;
    v = __lccrt_load_bytes( src, __lccrt_bitwidth_bytesize( veclen*elembitsize));

    for ( int i = 0; i < veclen; ++i ) {
        /* Low elembitsize bits of the remaining value form element i. */
        uint64_t si = (v << maskshift) >> maskshift;

        /* Shifting a 64-bit value by 64 would be undefined; a 64-bit element
           consumes the whole value. */
        v = (elembitsize < 64) ? (v >> elembitsize) : 0;
        __lccrt_store_bytes( (char *)dst + i*ebytes, si, ebytes);
    }

    return;
}