linux/arch/mips
Zhaoxiu Zeng fff7fb0b2d lib/GCD.c: use binary GCD algorithm instead of Euclidean
The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-05-20 17:58:30 -07:00
..
alchemy Merge branch 'upstream' of git://git.linux-mips.org/pub/scm/ralf/upstream-linus 2016-05-19 10:02:26 -07:00
ar7 MIPS: ar7: use gpiochip data pointer 2016-02-19 09:51:43 +01:00
ath25 genirq: Remove irq argument from irq flow handlers 2015-09-16 15:47:51 +02:00
ath79 MIPS: ath79: fix regression in PCI window initialization 2016-05-17 11:13:28 +02:00
bcm47xx MIPS: BCM47xx: Move SPROM driver to drivers/firmware/ 2016-05-13 14:01:43 +02:00
bcm63xx Merge branch 'ib-mfd-regulator-gpio-4.6' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd into devel 2016-03-09 17:40:37 +07:00
bmips MIPS: BMIPS: Add support for BCM63268 2016-05-13 14:02:09 +02:00
boot MIPS: JZ4740: Probe OHCI platform device via DT 2016-05-13 19:09:24 +02:00
cavium-octeon MIPS: Octeon: Mark some functions __init in smp.c 2016-05-13 14:02:10 +02:00
cobalt MIPS: Cobalt Don't use module_init in non-modular MTD registration. 2015-06-21 22:14:30 +02:00
configs MIPS: BMIPS: Enable partition parser in defconfig 2016-05-13 14:02:06 +02:00
dec MIPS: DEC: Export `ioasic_ssr_lock' to modules 2016-05-17 11:03:54 +02:00
emma
fw MIPS: Fix misspellings in comments. 2016-04-03 12:32:09 +02:00
include lib/GCD.c: use binary GCD algorithm instead of Euclidean 2016-05-20 17:58:30 -07:00
jazz MIPS: Jazz: Migrate to new 'set-state' interface 2015-09-03 12:07:50 +02:00
jz4740 MIPS: JZ4740: Probe OHCI platform device via DT 2016-05-13 19:09:24 +02:00
kernel exit_thread: remove empty bodies 2016-05-20 17:58:30 -07:00
kvm Small release overall. 2016-05-19 11:27:09 -07:00
lantiq MIPS: Change my email address 2016-05-13 14:02:18 +02:00
lasat [mips] switch pvc_proc_cleanup() to remove_proc_subtree() 2015-12-23 10:41:38 -05:00
lib MIPS: Print GuestCtl1 on machine check exception 2016-05-13 15:30:25 +02:00
loongson32 MIPS: Loongson1B: Some updates/fixes for LS1B 2016-05-13 14:02:05 +02:00
loongson64 MIPS: Loongson: Add Loongson-3A R2 basic support 2016-05-13 14:02:14 +02:00
math-emu MIPS: math-emu: Fix jalr emulation when rd == $0 2016-05-13 14:02:24 +02:00
mm Merge branch 'akpm' (patches from Andrew) 2016-05-19 20:00:06 -07:00
mti-malta MIPS: malta-time: Take seconds into account 2016-05-13 15:30:25 +02:00
mti-sead3 MIPS: Introduce plat_get_fdt a platform API to retrieve the FDT 2016-05-13 14:02:03 +02:00
net net: filter: make JITs zero A for SKF_AD_ALU_XOR_X 2016-01-06 00:43:52 -05:00
netlogic MIPS: Define & use CP0_EBase bit definitions 2016-05-13 15:30:25 +02:00
oprofile MIPS: Add perf counter feature 2016-05-13 15:30:25 +02:00
paravirt MIPS: SMP: Don't increment irq_count multiple times for call function IPIs 2015-08-03 09:25:12 +02:00
pci MIPS: Add & use CP0_EntryHi ASID definitions 2016-05-13 14:02:18 +02:00
pic32 Merge branch 'upstream' of git://git.linux-mips.org/pub/scm/ralf/upstream-linus 2016-05-19 10:02:26 -07:00
pistachio MIPS: pistachio: Determine SoC revision during boot 2016-05-13 17:47:47 +02:00
pmcs-msp71xx MIPS: MSP71xx: Use __flush_cache_all instead of flush_cache_all. 2016-05-13 14:01:38 +02:00
pnx833x MIPS: make PCI_DMA_BUS_IS_PHYS=1 constant 2016-05-13 14:02:17 +02:00
power MIPS: Hibernate: flush TLB entries earlier 2015-04-10 15:41:52 +02:00
ralink mips: mt7620: fallback to SDRAM when syscfg0 does not have a valid value for the memory type 2016-05-13 15:39:43 +02:00
rb532 MIPS: rb532: use gpiochip data pointer 2016-02-19 09:51:44 +01:00
sgi-ip22
sgi-ip27 MIPS: Fix misspellings in comments. 2016-04-03 12:32:09 +02:00
sgi-ip32 MIPS: IP32: Fix build errors in reset code in DS1685 platform hook. 2015-05-13 00:01:41 +02:00
sibyte MIPS: Sibyte: Fix Kconfig dependencies of SIBYTE_BUS_WATCHER 2016-05-09 12:00:03 +02:00
sni MIPS: Fix build error due to unused variables. 2015-12-22 15:21:18 +01:00
txx9 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next 2016-03-19 10:05:34 -07:00
vdso MIPS: Fix genvdso error on rebuild 2016-05-13 22:47:39 +02:00
vr41xx MIPS: VR41xx: Use __flush_cache_all instead of flush_cache_all. 2016-05-13 14:01:38 +02:00
xilfpga Merge branch 'upstream' of git://git.linux-mips.org/pub/scm/ralf/upstream-linus 2015-11-15 09:10:53 -08:00
Kbuild MIPS: Initial implementation of a VDSO 2015-11-11 08:36:36 +01:00
Kbuild.platforms MIPS: Add support for PIC32MZDA platform 2016-01-24 02:53:28 +01:00
Kconfig printk/nmi: generic solution for safe printk in NMI 2016-05-20 17:58:30 -07:00
Kconfig.debug MIPS: CPS: Early debug using an ns16550-compatible UART 2015-11-11 08:34:25 +01:00
Makefile MIPS: Fix VZ probe gas errors with binutils <2.24 2016-05-17 11:06:04 +02:00