/* 
 * given two words a and b, return the high bits that they have in
 * common, with the highest bit where they differ set, and all
 * remaining bits clear. So 10101 and 10011 yields 10100. Then do the
 * same thing, only flipped, so we keep low-order identical bits and
 * mask out the high ones.
 */

#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <stdlib.h>
#include <time.h>

uint64_t high_common_bits_reference(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t high_common_bits_reference(uint64_t a, uint64_t b)
{
  uint64_t mask = 0x8000000000000000LLU;
  uint64_t output = 0;
  int i;
  for (i = 63; i >= 0; i--) {
    if ((a & mask) == (b & mask)) {
      output |= (a & mask);
    } else {
      output |= mask;
      goto out;
    }
    mask >>= 1;
  }
 out:
  return output;
}

uint64_t low_common_bits_reference(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t low_common_bits_reference(uint64_t a, uint64_t b)
{
  uint64_t mask = 1;
  uint64_t output = 0;
  int i;
  for (i = 0; i < 64; i++) {
    if ((a & mask) == (b & mask)) {
      output |= (a & mask);
    } else {
      output |= mask;
      goto out;
    }
    mask <<= 1;
  }
 out:
  return output;
}

uint64_t high_common_bits_portable(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t high_common_bits_portable(uint64_t a, uint64_t b)
{
  uint64_t x = a ^ b;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  x |= x >> 32;
  return (a & ~x) | (x & ~(x >> 1));
}

uint64_t low_common_bits_portable(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t low_common_bits_portable(uint64_t a, uint64_t b)
{
  uint64_t x = (a ^ b) & -(a ^ b);
  return (a & (x - 1)) | x;
}

uint64_t high_common_bits_intrinsic(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t high_common_bits_intrinsic(uint64_t a, uint64_t b)
{
  if (a == b) return a;
  uint64_t bit = 1ULL << 63 - __builtin_clzll(a ^ b);
  return (a & ~(bit - 1)) | bit;
}

uint64_t low_common_bits_intrinsic(uint64_t a, uint64_t b) __attribute__ ((noinline));
uint64_t low_common_bits_intrinsic(uint64_t a, uint64_t b)
{
  uint64_t x=  a ^ b;
  if (x == 0) return a;
  uint64_t bit = 1ULL << __builtin_ctzll(x);
  return (a & bit - 1) | bit;
}

uint64_t rand64(void)
{
  uint64_t a = rand() & 0xffff;
  uint64_t b = rand() & 0xffff;
  uint64_t c = rand() & 0xffff;
  uint64_t d = rand() & 0xffff;
  return (a << 48) | (b << 32) | (c << 16) | d;
}

uint64_t flip(uint64_t x, int k)
{
  uint64_t mask = 1ULL << k;
  x ^= mask;
  return x;
}

uint64_t mutate(uint64_t a)
{
  uint64_t b = a;
  int i;
  int n = rand() % 64;
  for (i = 0; i < n; i++) {
    int k = rand() % 64;
    b = flip(b, k);
  }
  return b;
}

typedef unsigned long long ticks;

static __inline__ ticks start(void)
{
  unsigned cycles_low, cycles_high;
  asm volatile ("CPUID\n\t"
		"RDTSC\n\t"
		"mov %%edx, %0\n\t"
		"mov %%eax, %1\n\t":"=r" (cycles_high),
		"=r"(cycles_low):: "%rax", "%rbx", "%rcx", "%rdx");
  return ((ticks) cycles_high << 32) | cycles_low;
}

static __inline__ ticks stop(void)
{
  unsigned cycles_low, cycles_high;
  asm volatile ("RDTSCP\n\t"
		"mov %%edx, %0\n\t"
		"mov %%eax, %1\n\t"
		"CPUID\n\t":"=r" (cycles_high), "=r"(cycles_low)::"%rax",
		"%rbx", "%rcx", "%rdx");
  return ((ticks) cycles_high << 32) | cycles_low;
}

#define N 1000
#define REPS 5

uint64_t a[N];
uint64_t b[N];

typedef uint64_t(*fp_t) (uint64_t, uint64_t);

void perftest(fp_t fp, const char *str)
{
  int rep;
  uint64_t output = 0;
  ticks best = 1ULL << 60;
  for (rep = 0; rep < REPS; rep++) {
    int i;
    ticks t1 = start();
    for (i = 0; i < N; i++) {
      output += fp(a[i], b[i]);
    }
    ticks t2 = stop();
    ticks diff = t2 - t1;
    if (diff < best) {
      best = diff;
    }
  }
  printf("%s : time = %5.1lf (ignore this: %llu)\n", 
	 str, 
	 (double) best / N,
	 output);
}

int main(void)
{
  srand(time(NULL));
  int i;
  for (i = 0; i < N; i++) {
    a[i] = rand64();
    b[i] = mutate(a[i]);
  }
  perftest(high_common_bits_reference, "high slow     ");
  perftest(high_common_bits_portable,  "high portable ");
  perftest(high_common_bits_intrinsic, "high intrinsic");
  perftest(low_common_bits_reference,  "low slow      ");
  perftest(low_common_bits_portable,   "low portable  ");
  perftest(low_common_bits_intrinsic,  "low intrinsic ");
  return 0;
}

