bit hacks

2022-02-10 · 4 min read

#perf

Collection of bit twiddling hacks with helpful explanations : )

Links #

utils #

Just some convenient functions for debugging.

fn u32_into_nib_le(x: u32) -> [u8; 8] {
	let [b0, b1, b2, b3] = x.to_le_bytes();
	[
		b0 & 0x0f,
		(b0 >> 4) & 0x0f,
		b1 & 0x0f,
		(b1 >> 4) & 0x0f,
		b2 & 0x0f,
		(b2 >> 4) & 0x0f,
		b3 & 0x0f,
		(b3 >> 4) & 0x0f,
	]
}

fn dbg_u8(lbl: &str, x: u8) -> u8 {
	println!("{:>12}  {:08b}", lbl, x);
	x
}

fn dbg_u32(lbl: &str, x: u32) -> u32 {
	println!("{:>12}  {:032b}", lbl, x);
	x
}

fn dbg_u32_byte(lbl: &str, x: u32) -> u32 {
	let [b0, b1, b2, b3] = x.to_le_bytes();
	println!("{:>12}  {:08b} {:08b} {:08b} {:08b}", lbl, b3, b2, b1, b0);
	x
}

fn dbg_u32_nib(lbl: &str, x: u32) -> u32 {
	let [n0, n1, n2, n3, n4, n5, n6, n7] = u32_into_nib_le(x);
	println!(
		"{:>12}  {:04b} {:04b} {:04b} {:04b} {:04b} {:04b} {:04b} {:04b}",
		lbl, n7, n6, n5, n4, n3, n2, n1, n0,
	);
	x
}

fn dbg_u64_byte(lbl: &str, x: u64) -> u64 {
    let [b0, b1, b2, b3, b4, b5, b6, b7] = x.to_le_bytes();
    println!(
        "{:>12}  {:08b} {:08b} {:08b} {:08b} {:08b} {:08b} {:08b} {:08b}",
        lbl, b7, b6, b5, b4, b3, b2, b1, b0,
    );
    x
}

Next k-bits Permutation Algorithm #

Suppose we have an integer with 4-bits, xs = 00101101. We can get the next lexicographic bitvec next(xs) = 00101110 via the following algorithm:

// given a k-bit vector `x` (i.e., `x.count_ones() == k`),
// this fn returns the next k-bit vector permutation in
// lexicographic order.
//
// ex:
//      00001111 -> 00010111
// 		00110101 -> 00101110
//      00101110 -> 00110011
//
// notice that `x.count_ones() == next(x).count_ones()`
fn next_kbit_perm(x: u32) -> u32 {
	// this is `x` but with trailing zeros set to ones
	//
	// ex:
	// 		00101101 -> 00101101
	//      00101100 -> 00101111
	let t = x | x.wrapping_sub(1);
	
	// this selects only the trailing ones in `t`.
	// note: t always has at least one trailing one.
	//
	// ex:
	//      00001111 -> 00001111
	// 	 	00101111 -> 00001111
	//      00101101 -> 00000001
	// alt. ((!t & (!t).wrapping_neg()).wrapping_sub(1))
	let u = (1_u8 << (t.trailing_ones())).wrapping_sub(1);
	
	// this sets the most significant bit that needs to
	// change (but leaves some trailing zeros that we
	// need to fixup later).
	//
	// ex:
	//      00001111 -> 00010000
	// 		00101011 -> 00101100
	//      00101111 -> 00110000
	let v = t.wrapping_add(1);
	
	// the ones we need to add back into v to "fix" it
	//
	// ex:
	// 		x=00011101, u=00000001 -> 00000000
	//      x=00011110, u=00011111 -> 00000111
	// 		x=00101011, u=00000011 -> 00000001
	let w = u.wrapping_shr(x.trailing_zeros() + 1);
	
	// the next permutation
	(v | w)
}

we can use this to generate all k-combinations of a given set, by representing each element of the set x_i as bit b_i.
- For example, let's say we have the set {A,B,C}. The combination {A,C} would be represented by the bitstring 101. Applying next_kbit_perm would then give us the next 2-combination 110 or {B,C}.

Num leading/trailing zero bytes #

fn u64_leading_zero_bytes_ref(x: u64) -> u32 {
	x.to_be_bytes().into_iter().take_while(|&b| b == 0).count() as u32
}

/// Return the number of leading bytes == 0x00
fn u64_leading_zero_bytes(x: u64) -> u32 {
    x.leading_zeros() >> 3
}

fn u64_trailing_zero_bytes_ref(x: u64) -> u32 {
	x.to_le_bytes().into_iter().take_while(|&b| b == 0).count() as u32
}

/// Return the number of trailing bytes == 0x00
fn u64_trailing_zero_bytes(x: u64) -> u32 {
    x.trailing_zeros() >> 3
}

godbolt: https://godbolt.org/z/81a7E6f7M

Any byte == 0 #

Returns true if any byte in x is zero.

fn u32_any_byte_eqz_ref(x: u32) -> bool {
	x.to_ne_bytes()
		.into_iter()
		.any(|b| b == 0)
}

fn u32_any_byte_eqz(x: u32) -> bool {
	// subtract each individual byte by 1. if a byte is 0x00, this will
	// "underflow" to 0xff.
	//
	// for reference:
	// 0x0101_0101  00000001 00000001 00000001 00000001
	let t = x.wrapping_sub(0x0101_0101);
	// for each byte in u, the upper bit is 1 if that corresponding bit
	// is 0 in x
	// 
	// ex:
	// 	         x	0XXXXXXX 1XXXXXXX 1XXXXXXX 0XXXXXXX
	// 0x8080_8080 	10000000 10000000 10000000 10000000
	//           u  00000000 10000000 10000000 00000000
	let u = !x & 0x8080_8080;
	// leave a 0x80 in a byte if the upper bit is 1
	//
	// this can only happen if
	// 	 1. the value in the byte has its msb set (after subtracting 1)
	//   2. the msb was not already 1 before subtracting
	//
	// (2.) implies the byte was in the range (0b00000000..=0b01111111).
	//
	// therefore the only value which will have its msb set after
	// subtracting is 0, since it will "underflow" to 0b11111111
	let v = t & u;
	
	v != 0
}

godbolt: https://godbolt.org/z/4Evq88seo

Any nibble == 0 #

Returns true if any nibble in x is zero. This is just Any byte 0 but with the constants adjusted to work on every nibble rather than every byte.

fn u32_any_nib_eqz_ref(x: u32) -> bool {
	u32_into_nib_le(x)
		.into_iter()
		.any(|nb| nb == 0)
}

fn u32_any_nib_eqz(x: u32) -> bool {
	// 0x1111_1111  0001 0001 0001 0001 0001 0001 0001 0001
	// 0x8888_8888  1000 1000 1000 1000 1000 1000 1000 1000
	let t = x.wrapping_sub(0x1111_1111);
	let u = !x & 0x8888_8888;
	let v = t & u;
	
	v != 0
}

godbolt: https://godbolt.org/z/fnf78rKvz

Any specific bytes == 0 #

Returns true if any of a specific subset of the bytes in x are zero.

mask is a bit-mask where each byte we want to select is equal to 0x01, for example, mask = 0x0100_0100 would select the 2nd and 4th bytes (1-indexed).

fn u32_any_spec_nib_eqz_ref(x: u32, mask: u32) -> bool {
	x.to_ne_bytes().into_iter()
		.zip(mask.to_ne_bytes().into_iter())
		.any(|(b_x, b_mask)| (b_mask == 0x01) && (b_x == 0x00))
}

fn u32_any_spec_nib_eqz(x: u32, mask: u32) -> bool {
	// this is just 0x8888_8888, but only 8 where mask has a 1-nibble
	let mask_hi = mask.wrapping_mul(0x8);
	
	let t = x.wrapping_sub(mask);
	let u = !x & mask_hi;
	let v = t & u;
	
	v != 0
}

Any specific nibbles == 0 #

Returns true if any of a specific subset of the nibbles in x are zero.

mask is a bit-mask where each nibble we want to select is equal to 0001, for example, mask = 0x1100_0010 would select the 2nd, 7th, and 8th nibbles (1-indexed).

fn u32_any_spec_nib_eqz_ref(x: u32, mask: u32) -> bool {
	u32_into_nib_le(x)
		.into_iter()
		.zip(u32_into_nib_le(mask).into_iter())
		.any(|(nb_x, nb_mask)| (nb_mask == 0x1) && (nb_x == 0x0))
}

fn u32_any_spec_nib_eqz(x: u32, mask: u32) -> bool {
	// this is just 0x8888_8888, but only 8 where mask has a 1-nibble
	let mask_hi = mask.wrapping_mul(0x8);
	
	let t = x.wrapping_sub(mask);
	let u = !x & mask_hi;
	let v = t & u;
	
	v != 0
}

godbolt: https://godbolt.org/z/Tfn9fbKKG

Any byte b in range m < b < n #

where 0 <= m <= 127 and 0 <= n <= 128.

Returns true if any byte b in x is in the range m < b < n.

fn u32_has_byte_between_ref(x: u32, m: u32, n: u32) -> bool {
	assert!((0..=127).contains(&m));
	assert!((0..=128).contains(&n));
	
	let m = m as u8;
	let n = n as u8;
	
	x.to_ne_bytes().into_iter().any(|b| (m < b) && (b < n))
}

fn u32_has_byte_between(x: u32, m: u32, n: u32) -> bool {
	assert!((0..=127).contains(&m));
	assert!((0..=128).contains(&n));
	
	let mask = 0x0101_0101;
	let s = mask * 127;
	let y = mask * 128;
	let w = mask * (127 + n);
	let t = mask * (127 - m);
	
	let u = x & s;
	let z = (w - u) & (!x) & (u + t) & y;
	
	z != 0
}

godbolt: https://godbolt.org/z/a5911ra73

Any specific bytes b in range m < b < n #

where 0 <= m <= 127 and 0 <= n <= 128.

Returns true if any specific bytes b in x are in the range m < b < n, selected by a bit-mask mask.

mask is a bit-mask where each byte we want to select is equal to 0x01, for example, mask = 0x0100_0100 would select the 2nd and 4th bytes (1-indexed).

fn u32_has_spec_byte_between_ref(x: u32, mask: u32, m: u32, n: u32) -> bool {
	assert!((0..=127).contains(&m));
	assert!((0..=128).contains(&n));
	
	let m = m as u8;
	let n = n as u8;
	
	x.to_ne_bytes()
		.into_iter()
		.zip(mask.to_ne_bytes().into_iter())
		.any(|(b_x, b_mask)| (b_mask == 0x01) && (m < b_x) && (b_x < n))
}

fn u32_has_spec_byte_between(x: u32, mask: u32, m: u32, n: u32) -> bool {
	assert!((0..=127).contains(&m));
	assert!((0..=128).contains(&n));

	let s = mask * 127;
	let y = mask * 128;
	let w = mask * (127 + n);
	let t = mask * (127 - m);
	
	let u = x & s;
	let z = (w - u) & (!x) & (u + t) & y;
	
	z != 0
}

godbolt: https://godbolt.org/z/aq8T9seEs

Any specific nibbles nb in range m < nb < n #

where 0 <= m <= 7 and 0 <= n <= 8.

Returns true if any specific nibbles nb in x are in the range m < nb < n, selected by a bit-mask mask.

mask is a bit-mask where each nibble we want to select is equal to 0001, for example, mask = 0x1100_0010 would select the 2nd, 7th, and 8th nibbles (1-indexed).

fn u32_has_spec_nib_between_ref(x: u32, mask: u32, m: u32, n: u32) -> bool {
	assert!((0..=7).contains(&m));
	assert!((0..=8).contains(&n));
	
	let m = m as u8;
	let n = n as u8;
	
	u32_into_nib_le(x)
		.into_iter()
		.zip(u32_into_nib_le(mask).into_iter())
		.any(|(nb_x, nb_mask)| {
			(nb_mask == 0x1) && (m < nb_x) && (nb_x < n)
		})
}

fn u32_has_spec_nib_between(x: u32, mask: u32, m: u32, n: u32) -> bool {
	assert!((0..=7).contains(&m));
	assert!((0..=8).contains(&n));

	let s = mask * 7;
	let y = mask * 8;
	let w = mask * (7 + n);
	let t = mask * (7 - m);
	
	let u = x & s;
	let z = (w - u) & (!x) & (u + t) & y;
	
	z != 0
}

godbolt: https://godbolt.org/z/PGfvT39z5

Element-wise bytes less-than #

fn u64_elementwise_bytes_lt_ref(x: u64, y: u64) -> u64 {
	assert!(x & 0x8080_8080_8080_8080_u64 == 0);
	let bytes = x.to_le_bytes()
		.into_iter()
		.zip(y.to_le_bytes().into_iter())
		.map(|(bx, by)| if bx < by { 0x80_u8 } else { 0x00_u8 })
		.collect::<Vec<_>>();
	u64::from_le_bytes(<[u8; 8]>::try_from(&bytes[..]).unwrap())
}

/// Do an element-wise less-than comparison across each byte in `x` and `y`.
/// Returns a mask where each byte is `0x80` if `bx < by` and `0x00` otherwise.
/// Requires that each byte in `x` is in the range `0 <= bx <= 0x7f (127)`.
fn u64_elementwise_bytes_lt(x: u64, y: u64) -> u64 {
    // ensure no elements in x are > 0x7f
    debug_assert!((x & 0x8080_8080_8080_8080) == 0);

    let a = 0x7f7f_7f7f_7f7f_7f7f_u64;
    let b = 0x8080_8080_8080_8080_u64;
    let c = a - x;

    // a = (0x7f << 56) + (0x7f << 48) + .. + (0x7f << 0)
    // x = (b7 << 56) + (b6 << 48) + .. + (b0 << 0)
    // c = ((0x7f - b7) << 56) + .. + ((0x7f - b0) << 0)
	//     iff. bi <= 0x7f forall i

    // (((by & 0x7f) + (0x7f - bx) > 0x7f) || (by > 0x7f)
    // by' + 0x7f - bx > 0x7f
    // by' - bx > 0x7f
    // (by' > bx) || (by > 0x7f)

    ((y & a).wrapping_add(c) | y) & b
}

godbolt: https://godbolt.org/z/fzGT4bonz

Leading byte index less-than #

fn u64_leading_byte_idx_lt_ref(x: u64, y: u64) -> Option<u8> {
	x.to_le_bytes().into_iter()
		.zip(y.to_le_bytes().into_iter())
		.rposition(|(bx, by)| bx < by)
		.map(|idx| idx as u8)
}

/// Given two u64's, `x` and `y`, where all bytes `bx` in `x` are in the range
/// `0 <= bx <= 0x7f (127)`, return the _byte index_ (i.e., idx = 0 = least
/// significant byte, idx = 7 = most-significant byte)  of the first leading byte
/// `by` in `y` where `by > bx`. (i.e., starting from the most-significant byte).
///
/// Returns `None` if there are no bytes in `y` greater than their corresponding
/// byte in `x`.
fn u64_leading_byte_idx_lt(x: u64, y: u64) -> Option<u8> {
    let mask_0x80 = u64_elementwise_bytes_lt(x, y);

    if mask_0x80 != 0 {
        Some((7 - u64_leading_zero_bytes(mask_0x80)) as u8)
    } else {
        None
    }
}

godbolt: https://godbolt.org/z/aadbsqjve

Trailing byte index single byte less-than #

fn u64_trailing_byte_idx_lt_ref(n: u8, y: u64) -> Option<u8> {
	y.to_le_bytes()
		.into_iter()
		.position(|by| n < by)
		.map(|idx| idx as u8)
}

/// Return the _byte index_ of the first _trailing_ byte `by` in `y` where
/// `n < by`.
fn u64_trailing_byte_idx_lt(n: u8, y: u64) -> Option<u8> {
    // broadcast byte `n` across all bytes
    let x = 0x0101_0101_0101_0101_u64 * (n as u64);
    // set 0x80 in all bytes where `n < by`
    let mask_0x80 = u64_elementwise_bytes_lt(x, y);

    if mask_0x80 != 0 {
        Some(u64_trailing_zero_bytes(mask_0x80) as u8)
    } else {
        None
    }
}

godbolt: https://godbolt.org/z/3n5s1TMnW

pdep - parallel bit deposit #

Like "spreading" out compact bits onto a mask.

See: https://github.com/phlip9/fastperm_benches/blob/master/src/select64.rs#L123

select - get the index of the i'th 1-bit in a mask #

See: https://github.com/phlip9/fastperm_benches/blob/master/src/select64.rs#L19

Generate all possible masks #

A proptest strategy for generating all possible combinations of bits, mask that match a "meta" mask. By "match" I mean meta_mask & mask == mask, i.e., mask is a subset of meta_mask.

fn u32_arb_mask(meta_mask: u32) -> impl Strategy<Value = u32> {
	let nmasks: u32 = 1 << meta_mask.count_ones();
	
	(0..=nmasks)
		.prop_map(move |mask_idx| pdep32(mask_idx, meta_mask))
}

Sum all nibbles #

Given x = AAAA BBBB CCCC DDDD EEEE FFFF GGGG HHHH, returns the sum AAAA + BBBB + CCCC + DDDD + EEEE + FFFF + GGGG + HHHH.

fn u32_sum_nibs_ref(x: u32) -> u32 {
	u32_into_nib_le(x)
		.into_iter()
		.map(|x| x as u32)
		.sum()
}

// this implementation is just a truncated popcnt/count_ones.
// it's truncated in that we skip horizontal summing the bits and
// half-nibbles and just dive right into horizontal summing the
// nibbles.
//
// visually:
//
//   x  AAAA BBBB CCCC DDDD EEEE FFFF GGGG HHHH
//   y  000A BABA 000C DCDC 000E FEFE 000G HGHG  (1.)
//   z  0000 0000 00AB CDAB 0000 0000 00EF GHEF  (2.)
//   w  0000 0000 0000 0000 0000 0000 0ABC DEFG  (3.)
//
// (1.) note that any horizontal sum AAAA + BBBB will fill at most
//      _5_ bits. (log2(16 + 16) == 5)
// (2.) each will fill at most _6_ bits. (log2(32 + 32) == 6)
// (3.) the result will fill at most _7_ bits. (log2(64 + 64) == 7)
fn u32_sum_nibs_simple(x: u32) -> u32 {
    const NIBS_0246: u32 = 0x0f0f_0f0f;
    const NIBS_0145: u32 = 0x00ff_00ff;
    const NIBS_0123: u32 = 0x0000_ffff;

	// like taking each individual byte and replacing it with the sum
	// of its hi and lo nibbles.
    let y = (x & NIBS_0246) + ((x >> 4) & NIBS_0246);
	// for each individual half-word, replace it with the sum of its
	// hi and lo bytes.
	let z = (y & NIBS_0145) + ((y >> 8) & NIBS_0145);
	// our result is then the sum of the remaining hi and lo
	// half-words!
    let w = (z & NIBS_0123) + ((z >> 16) & NIBS_0123);

	w
}

// this one's pretty cool and compact : )
//
// it starts off like the one before, horizontal summing the adjacent
// nibbles in each byte.
//
// the trick here is to somehow get the horizontal sum of all bytes
// into the _last_ byte, then shift down so only the last byte is left.
//
// we can decompose any 32-bit integer into base 256, where the
// ith coefficient is the ith byte:
//
// B = 2^8
// y = (b3 * B^3) + (b2 * B^2) + (b1 * B^1) + (b0 * B^0)     (mod 2^32)
//   = (b3 * 2^24) + (b2 * 2^16) + (b1 * 2^8) + (b0 * 2^0)   (mod 2^32)
//
// we want to reach the following:
//
// z = ((b3 + b2 + b1 + b0) * 2^24) + (XXX)   (mod 2^32)
// 	   where 0 <= b0, b1, b2, b3 < 32
//       and (XXX) < 0x00ff_ffff (so it doesn't overflow into the
// 								  part care about)
//
// consider one of the individual bytes multiplied by 0x0101_0101
//
// y2 = b2 * 2^16
// y2 * 0x0101_0101 = b2 * (0x0101_0101 * 2^16)
// 					= b2 * (0x0101_0101 << 16)
// 					= b2 * 0x0101_0000
// 					= 0xb2b2_0000
//
// then, expanding all byte sections
//
// y0 = b0 * 2^0
// y1 = b1 * 2^8
// y2 = b2 * 2^16
// y3 = b3 * 2^24
//
// y = y0 + y1 + y2 + y3
//
// y0 * 0x0101_0101 = b0 * (0x0101_0101 << 0)
//					= b0 * 0x0101_0101
//                  = 0xb0b0_b0b0
// y1 * 0x0101_0101 = b1 * (0x0101_0101 << 8)
//					= b1 * 0x0101_0100
//                  = 0xb1b1_b100
// y2 * 0x0101_0101 = b2 * (0x0101_0101 << 16)
//					= b2 * 0x0101_0000
//                  = 0xb2b2_0000
// y3 * 0x0101_0101 = b3 * (0x0101_0101 << 24)
//					= b3 * 0x0100_0000
//                  = 0xb300_0000
// y * 0x0101_0101 = (y0 + y1 + y2 + y3) * 0x0101_0101
//                 = 0xb0b0_b0b0 + 0xb1b1_b100 +
//	                 0xb2b2_0000 + 0xb300_0000
//                 = [b0, b0+b1, b0+b1+b2, b0+b1+b2+b3]
//
// which has exactly what we want, b0+b1+b2+b3 in the last byte!
// (with some leftover junk in the lower bytes, but we can just
// ignore that when we down shift).
//
// since we're doing a horizontal sum to get `y`, and each nibble
// must be in the range 0 <= nb < 16, it follows that each byte
// (which is a sum of two nibbles here) must be in the range:
// 0 <= b < 32. this means none of the byte sums will overflow,
// since 32 * 4 = 128 <= 256
fn u32_sum_all_nibs(x: u32) -> u32 {
    // a mask that selects the lo nibble in each byte.
    const NIBS_0246: u32 = 0x0f0f_0f0f;

    // horizontal sum hi and lo nibbles in each byte, placing in the
	// lo nibble.
    let y = (x & NIBS_0246) + ((x >> 4) & NIBS_0246);

    // if y = [y0, y1, y2, y3] bytes and each byte b is in the range
	// 0 <= b < 64, then multiplying by 0x0101_0101 will yield
    // z = [y0, y0 + y1, y0 + y1 + y2, y0 + y1 + y2 + y3] without
	// any overflows.
    //
    // since each byte in y is the sum of two nibbles nb where
	// 0 <= nb < 16, it follows that 0 <= b = nb_lo + nb_hi < 32 < 64
	// so we won't have any overflows
    let z = y.wrapping_mul(0x0101_0101);

    // select the last byte in z, which contains our desired sum:
    // z3 = y0 + y1 + y2 + y3
    //    = (nb0 + nb1) + (nb2 + nb3) + (nb4 + nb5) + (nb6 + nb7)
    z >> 24
}

godbolt: https://godbolt.org/z/4EcYae1fj