/**
 * Part of the Lccrt Project, under the Apache License v2.0
 * See http://www.apache.org/licenses/LICENSE-2.0.txt for license information.
 * SPDX-License-Identifier: Apache-2.0
 */
|
||
|
||
#include "lccrt_s.h"
|
||
|
||
#include <stdlib.h>
|
||
#include <string.h>
|
||
#include <assert.h>
|
||
|
||
/* Element selector for two-source shuffles: index k addresses the logical
 * concatenation of x and y (each of n lanes); k < n reads x.a[k], otherwise
 * y.a[k - n]. No range check — k is assumed to be in [0, 2*n). */
#define __lccrt_shuffle_get( n, k, x, y) (((k) < (n)) ? (x).a[k] : (y).a[(k) - (n)])

/* Instantiates a generic two-source vector shuffle
 * __lccopt_shuffle_<suffix>( x, y, c): lane i of the result is taken from
 * the concatenation of x and y at index c.a[i]. `len` is the lane count of
 * arg_type; ind_type holds one index per lane. */
#define __lccopt_vec_shuffle( suffix, arg_type, ind_type, len) \
arg_type \
__lccopt_shuffle_##suffix( arg_type x, arg_type y, ind_type c) \
{ \
    int i; \
    arg_type r; \
\
    for ( i = 0; i < len; ++i ) \
    { \
        r.a[i] = __lccrt_shuffle_get( len, c.a[i], x, y); \
    } \
\
    return (r); \
} /* __lccopt_shuffle_##suffix */
|
||
|
||
/**
 * Returns the smallest power-of-two byte size whose bit width can hold
 * `bitsize` bits (1, 2, 4, 8 or 16 bytes). Asserts for bitsize > 128.
 */
static int __lccrt_bitwidth_bytesize( int bitsize) {
    int bits = 8;
    int bytes = 1;

    /* Walk the supported widths 8, 16, 32, 64, 128 in order. */
    while ( bits <= 128 ) {
        if ( bitsize <= bits ) {
            return (bytes);
        }

        bits *= 2;
        bytes *= 2;
    }

    assert( 0);

    return (0);
}
|
||
|
||
/**
 * Stores the low `bytesize` bytes of v at dst in native byte order.
 * Supported sizes: 1, 2, 4, 8; any other size asserts.
 * Uses memcpy instead of casted pointer stores: dereferencing dst through
 * an integer pointer type is undefined behavior when dst is misaligned
 * (and violates strict aliasing); memcpy is always defined and compiles to
 * the same single store on targets with unaligned access.
 */
void
__lccrt_store_bytes( void *dst, uint64_t v, int64_t bytesize) {
    if ( bytesize == 1 ) {
        uint8_t v8 = (uint8_t)v;

        memcpy( dst, &v8, sizeof( v8));
    } else if ( bytesize == 2 ) {
        uint16_t v16 = (uint16_t)v;

        memcpy( dst, &v16, sizeof( v16));
    } else if ( bytesize == 4 ) {
        uint32_t v32 = (uint32_t)v;

        memcpy( dst, &v32, sizeof( v32));
    } else if ( bytesize == 8 ) {
        memcpy( dst, &v, sizeof( v));
    } else {
        assert( 0);
    }
}
|
||
|
||
/**
 * Loads `bytesize` bytes from src in native byte order, zero-extended to
 * uint64_t. Supported sizes: 1, 2, 4, 8; any other size asserts.
 * Uses memcpy instead of casted pointer loads: dereferencing src through
 * an integer pointer type is undefined behavior when src is misaligned
 * (and violates strict aliasing); memcpy is always defined and compiles to
 * the same single load on targets with unaligned access.
 */
uint64_t
__lccrt_load_bytes( void *src, int64_t bytesize) {
    uint64_t r = 0;

    if ( bytesize == 1 ) {
        uint8_t v8;

        memcpy( &v8, src, sizeof( v8));
        r = v8;
    } else if ( bytesize == 2 ) {
        uint16_t v16;

        memcpy( &v16, src, sizeof( v16));
        r = v16;
    } else if ( bytesize == 4 ) {
        uint32_t v32;

        memcpy( &v32, src, sizeof( v32));
        r = v32;
    } else if ( bytesize == 8 ) {
        memcpy( &r, src, sizeof( r));
    } else {
        assert( 0);
    }

    return (r);
}
|
||
|
||
/* Concrete shuffle instantiations. Index vectors always use 32-bit lanes
 * with the same lane count as the data vector. */

/* 128-bit integer vectors. */
__lccopt_vec_shuffle( v16i8, __lccrt_vec_si( 8, 16), __lccrt_vec_si( 32, 16), 16)
__lccopt_vec_shuffle( v8i16, __lccrt_vec_si( 16, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4i32, __lccrt_vec_si( 32, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v2i64, __lccrt_vec_si( 64, 2), __lccrt_vec_si( 32, 2), 2)

/* 256-bit integer vectors. */
__lccopt_vec_shuffle( v32i8, __lccrt_vec_si( 8, 32), __lccrt_vec_si( 32, 32), 32)
__lccopt_vec_shuffle( v16i16, __lccrt_vec_si( 16, 16), __lccrt_vec_si( 32, 16), 16)
__lccopt_vec_shuffle( v8i32, __lccrt_vec_si( 32, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4i64, __lccrt_vec_si( 64, 4), __lccrt_vec_si( 32, 4), 4)

/* 128-bit floating-point vectors. */
__lccopt_vec_shuffle( v4f32, __lccrt_vec_f( 32, 4), __lccrt_vec_si( 32, 4), 4)
__lccopt_vec_shuffle( v2f64, __lccrt_vec_f( 64, 2), __lccrt_vec_si( 32, 2), 2)

/* 256-bit floating-point vectors. */
__lccopt_vec_shuffle( v8f32, __lccrt_vec_f( 32, 8), __lccrt_vec_si( 32, 8), 8)
__lccopt_vec_shuffle( v4f64, __lccrt_vec_f( 64, 4), __lccrt_vec_si( 32, 4), 4)

/* NOTE(review): same type layout as the v32i8 variant above, under a
 * different exported name — presumably kept for ABI compatibility; confirm. */
__lccopt_vec_shuffle( v32v32i8, __lccrt_vec_si( 8, 32), __lccrt_vec_si( 32, 32), 32)
|
||
|
||
/* Lanewise multiplication of two <4 x i32> vectors. */
__lccrt_vec_si( 32, 4)
__lccrt_mul_v4i32( __lccrt_vec_si( 32, 4) a, __lccrt_vec_si( 32, 4) b)
{
    __lccrt_vec_si( 32, 4) prod;
    int lane;

    for ( lane = 0; lane < 4; ++lane ) {
        prod.a[lane] = a.a[lane] * b.a[lane];
    }

    return (prod);
}
|
||
|
||
/* Lanewise division of two <4 x f32> vectors. */
__lccrt_vec_f( 32, 4)
__lccrt_fdiv_v4f32( __lccrt_vec_f( 32, 4) a, __lccrt_vec_f( 32, 4) b)
{
    __lccrt_vec_f( 32, 4) quot;
    int lane;

    for ( lane = 0; lane < 4; ++lane ) {
        quot.a[lane] = a.a[lane] / b.a[lane];
    }

    return (quot);
}
|
||
|
||
/* Lanewise select on <4 x i32>: result lane i is b.a[i] when a.a[i] is
 * non-zero, otherwise c.a[i]. */
__lccrt_vec_si( 32, 4)
__lccrt_select_v4i32_t( __lccrt_vec_si( 32, 4) a,
                        __lccrt_vec_si( 32, 4) b,
                        __lccrt_vec_si( 32, 4) c)
{
    __lccrt_vec_si( 32, 4) res;
    int lane;

    for ( lane = 0; lane < 4; ++lane ) {
        if ( a.a[lane] ) {
            res.a[lane] = b.a[lane];
        } else {
            res.a[lane] = c.a[lane];
        }
    }

    return (res);
}
|
||
|
||
/* Lanewise select on <4 x f32>: result lane i is b.a[i] when the integer
 * mask lane a.a[i] is non-zero, otherwise c.a[i]. */
__lccrt_vec_f( 32, 4)
__lccrt_select_v4f32_t( __lccrt_vec_si( 32, 4) a,
                        __lccrt_vec_f( 32, 4) b,
                        __lccrt_vec_f( 32, 4) c)
{
    __lccrt_vec_f( 32, 4) res;
    int lane;

    for ( lane = 0; lane < 4; ++lane ) {
        if ( a.a[lane] ) {
            res.a[lane] = b.a[lane];
        } else {
            res.a[lane] = c.a[lane];
        }
    }

    return (res);
}
|
||
|
||
/* Lanewise single-precision square root of a <4 x f32> vector. */
__lccrt_vec_f( 32, 4)
__lccrt_sqrt_v4f32( __lccrt_vec_f( 32, 4) a)
{
    __lccrt_vec_f( 32, 4) res;
    int lane;

    for ( lane = 0; lane < 4; ++lane ) {
        res.a[lane] = __builtin_sqrtf( a.a[lane]);
    }

    return (res);
}
|
||
|
||
/**
 * Packs a vector of narrow elements into a single (bit-packed) integer
 * value. The total bit size of the result must not exceed 64.
 * Example: <3 x i6> -> i18.
 * (Header comment reconstructed/translated from a mojibake original.)
 */
void
__lccrt_vecbitpack( void *dst, void *src, int64_t veclen, int64_t elembitsize) {
    uint64_t r = 0;
    int ebytes = __lccrt_bitwidth_bytesize( elembitsize);
    int maskshift = 64 - elembitsize;
    int totalbitsize = veclen*elembitsize;

    assert( (0 <= veclen) && (0 < elembitsize) && (totalbitsize <= 64));
    /* Walk elements from last to first so that element 0 ends up in the
       least significant bits of the result. */
    for ( int i = veclen - 1; i >= 0; --i ) {
        uint64_t si = __lccrt_load_bytes( (char *)src + i*ebytes, ebytes);

        /* Clear any bits above the element's significant width. */
        si = (si << maskshift) >> maskshift;
        /* Guard the shift: with veclen == 1 and elembitsize == 64 (allowed
           by the assert) `r << 64` would be undefined behavior (C11 6.5.7);
           in that case the result is just the single element. */
        r = (elembitsize < 64) ? ((r << elembitsize) | si) : si;
    }

    __lccrt_store_bytes( dst, r, __lccrt_bitwidth_bytesize( totalbitsize));

    return;
}
|
||
|
||
/**
 * Unpacks a single (bit-packed) integer value into a vector of narrow
 * elements. The total bit size of the source must not exceed 64.
 * Example: i18 -> <3 x i6>.
 * (Header comment reconstructed/translated from a mojibake original.)
 */
void
__lccrt_vecbitunpack( void *dst, void *src, int64_t veclen, int64_t elembitsize) {
    int ebytes = __lccrt_bitwidth_bytesize( elembitsize);
    int maskshift = 64 - elembitsize;
    int totalbitsize = veclen*elembitsize;
    uint64_t v = __lccrt_load_bytes( src, __lccrt_bitwidth_bytesize( totalbitsize));

    assert( (0 <= veclen) && (0 < elembitsize) && (totalbitsize <= 64));
    for ( int i = 0; i < veclen; ++i ) {
        /* Element i currently occupies the low elembitsize bits of v. */
        uint64_t si = (v << maskshift) >> maskshift;

        /* Guard the shift: with elembitsize == 64 (allowed by the assert)
           `v >> 64` would be undefined behavior (C11 6.5.7); the single
           element has then been consumed entirely. */
        v = (elembitsize < 64) ? (v >> elembitsize) : 0;
        __lccrt_store_bytes( (char *)dst + i*ebytes, si, ebytes);
    }

    return;
}
|