将uint32
解释为16对比特的序列(虚构的类型uint2
),有没有比下面更有效的算法来添加这些模4?
(x + (x>>2) + (x>>4) + (x>>6) + (x>>8) + (x>>10) + ... + (x>>30)) & 3
此外,我希望避免分支指令,使其可并行化.
将uint32
解释为16对比特的序列(虚构的类型uint2
),有没有比下面更有效的算法来添加这些模4?
(x + (x>>2) + (x>>4) + (x>>6) + (x>>8) + (x>>10) + ... + (x>>30)) & 3
此外,我希望避免分支指令,使其可并行化.
这样做的一种方式是在最高有效的两位而不是最低有效的两位中累积.然后可以按字节形成部分和,然后用一次乘法对四个部分字节和求和.这种方法是无分支的,并且保留了相当多的指令级并行性.整数乘法在大多数现代处理器上都很快,甚至在嵌入式应用中使用的许多低端CPU也是如此.
尤其是对于具有低指令级并行度的处理器,首先按半字节累加可能是有利的,因为这可能以稍微长一些的依赖链为代价来最小化总的指令计数.在下面的代码中,被ACCUMULATE_NIBBLE_WISE=1
选中.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define ACCUMULATE_NIBBLE_WISE (1)
uint32_t func_ref (uint32_t x)
{
return ((x >> 0) + (x >> 2) + (x >> 4) + (x >> 6) +
(x >> 8) + (x >> 10) + (x >> 12) + (x >> 14) +
(x >> 16) + (x >> 18) + (x >> 20) + (x >> 22) +
(x >> 24) + (x >> 26) + (x >> 28) + (x >> 30)) & 3;
}
uint32_t func (uint32_t x)
{
uint32_t mult = 0x01010101;
#if ACCUMULATE_NIBBLE_WISE
uint32_t nibble_sum = (x & 0xcccccccc) + ((x << 2) & 0xcccccccc);
uint32_t byte_sum = (nibble_sum + (nibble_sum << 4)) & 0xc0c0c0c0;
#else // !ACCUMULATE_NIBBLE_WISE
uint32_t lolo = x & 0x03030303;
uint32_t hilo = x & 0x0c0c0c0c;
uint32_t lohi = x & 0x30303030;
uint32_t hihi = x & 0xc0c0c0c0;
uint32_t byte_sum = (lolo << 6) + (hilo << 4) + (lohi << 2) + hihi;
#endif // ACCUMULATE_NIBBLE_WISE
return ((uint32_t)(byte_sum * mult)) >> 30;
}
int main (void)
{
uint32_t a = 0;
do {
uint32_t ref = func_ref (a);
uint32_t res = func (a);
if (res != ref) {
printf ("error: a=%08x res=%08x ref=%08x\n", a, res, ref);
return EXIT_FAILURE;
}
a++;
} while (a);
return EXIT_SUCCESS;
}