CUDA有一个很好的SIMD instructions for integers集,允许高效的SIMD计算.在这些函数中,有一些计算每字节或每半个字(like __vadd2 and __vadd4)的加减运算,然而,我找不到类似的函数来计算32位寄存器的每字节乘法.如果有人能帮我找到合适的解决方案,我将不胜感激.

推荐答案

然而,我找不到类似的函数来计算32位寄存器的每字节乘法.

没有人会退回这4种产品.

最接近的是__dp4a() intrinsic,它以32位整数返回4个乘积的总和.

您可以编写一个8位压缩无符号乘法,其饱和如下:

$ cat t2048.cu
#include <cstdio>
#include <cstdint>

__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){

  const unsigned sv = 255;
  uchar4 result;
  unsigned t;
  t = a.x*b.x;
  if (t > sv) t = sv;
  result.x = t;
  t = a.y*b.y;
  if (t > sv) t = sv;
  result.y = t;
  t = a.z*b.z;
  if (t > sv) t = sv;
  result.z = t;
  t = a.w*b.w;
  if (t > sv) t = sv;
  result.w = t;
  return result;
}

__global__ void k(uchar4 a, uchar4 b, uchar4 *c){

  *c = u8mulsat(a, b);
}

int main(){

  uchar4 a,b,c, *d_c;
  cudaMalloc(&d_c, sizeof(uchar4));
  a.x = 1;
  a.y = 2;
  a.z = 4;
  a.w = 8;
  b.x = 64;
  b.y = 64;
  b.z = 64;
  b.w = 1;
  k<<<1,1>>>(a, b, d_c);
  cudaMemcpy(&c, d_c, sizeof(uchar4), cudaMemcpyDeviceToHost);
  printf("c.x = %u\n", (unsigned)c.x);
  printf("c.y = %u\n", (unsigned)c.y);
  printf("c.z = %u\n", (unsigned)c.z);
  printf("c.w = %u\n", (unsigned)c.w);
}
$ nvcc -o t2048 t2048.cu
$ compute-sanitizer ./t2048
========= COMPUTE-SANITIZER
c.x = 64
c.y = 128
c.z = 255
c.w = 8
========= ERROR SUMMARY: 0 errors
$ cuobjdump -sass ./t2048

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_52

Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_52
                Function : _Z1k6uchar4S_PS_
        .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                 /* 0x001c4400e22007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;        /* 0x4c98078000870001 */
        /*0010*/                   LDC.U8 R0, c[0x0][0x140] ;    /* 0xef9000001407ff00 */
        /*0018*/                   LDC.U8 R2, c[0x0][0x144] ;    /* 0xef9000001447ff02 */
                                                                 /* 0x001d4400e6200731 */
        /*0028*/                   LDC.U8 R3, c[0x0][0x141] ;    /* 0xef9000001417ff03 */
        /*0030*/                   LDC.U8 R4, c[0x0][0x145] ;    /* 0xef9000001457ff04 */
        /*0038*/                   LDC.U8 R5, c[0x0][0x142] ;    /* 0xef9000001427ff05 */
                                                                 /* 0x001dfc00ee200751 */
        /*0048*/                   LDC.U8 R6, c[0x0][0x146] ;    /* 0xef9000001467ff06 */
        /*0050*/                   LDC.U8 R7, c[0x0][0x143] ;    /* 0xef9000001437ff07 */
        /*0058*/                   LDC.U8 R8, c[0x0][0x147] ;    /* 0xef9000001477ff08 */
                                                                 /* 0x009fd002fe200fe1 */
        /*0068*/                   XMAD R0, R2, R0, RZ ;         /* 0x5b007f8000070200 */
        /*0070*/                   XMAD R2, R4, R3, RZ ;         /* 0x5b007f8000370402 */
        /*0078*/                   XMAD R3, R6, R5, RZ ;         /* 0x5b007f8000570603 */
                                                                 /* 0x001fc408fe2007f1 */
        /*0088*/                   IMNMX.U32 R0, R0, 0xff, PT ;  /* 0x382003800ff70000 */
        /*0090*/                   XMAD R4, R8, R7, RZ ;         /* 0x5b007f8000770804 */
        /*0098*/                   IMNMX.U32 R2, R2, 0xff, PT ;  /* 0x382003800ff70202 */
                                                                 /* 0x001fc400fe2007e4 */
        /*00a8*/                   IMNMX.U32 R3, R3, 0xff, PT ;  /* 0x382003800ff70303 */
        /*00b0*/                   IMNMX.U32 R4, R4, 0xff, PT ;  /* 0x382003800ff70404 */
        /*00b8*/                   BFI R0, R2, 0x808, R0 ;       /* 0x36f0000080870200 */
                                                                 /* 0x001fd400fe2007f5 */
        /*00c8*/                   MOV R2, c[0x0][0x148] ;       /* 0x4c98078005270002 */
        /*00d0*/                   BFI R5, R3, 0x810, R0 ;       /* 0x36f0000081070305 */
        /*00d8*/                   MOV R3, c[0x0][0x14c] ;       /* 0x4c98078005370003 */
                                                                 /* 0x001ffc00fe2007e2 */
        /*00e8*/                   BFI R4, R4, 0x818, R5 ;       /* 0x36f0028081870404 */
        /*00f0*/                   STG.E [R2], R4 ;              /* 0xeedc200000070204 */
        /*00f8*/                   EXIT ;                        /* 0xe30000000007000f */
                                                                 /* 0x001f8000fc0007ff */
        /*0108*/                   BRA 0x100 ;                   /* 0xe2400fffff07000f */
        /*0110*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0118*/                   NOP;                          /* 0x50b0000000070f00 */
                                                                 /* 0x001f8000fc0007e0 */
        /*0128*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0130*/                   NOP;                          /* 0x50b0000000070f00 */
        /*0138*/                   NOP;                          /* 0x50b0000000070f00 */
                ..........



Fatbin ptx code:
================
arch = sm_52
code version = [7,4]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$

SASS代码似乎与我预期的差不多,与C++代码的长度大致相同,忽略了LDCSTG条指令.

FWIW、特斯拉Vimplementation、CUDA 11.4、njuffa和mine的implementation在寄存器使用(njuffa:16,mine:17)和性能(njuffa大约快1%)方面非常接近:

$ cat t2048.cu
#include <iostream>
#include <cstdint>

__device__ unsigned int vmulus4 (unsigned int a, unsigned int b)
{
    unsigned int plo, phi, res;
    // compute products
    plo = ((a & 0x000000ff) * (b & 0x000000ff) +
           (a & 0x0000ff00) * (b & 0x0000ff00));
    phi = (__umulhi (a & 0x00ff0000, b & 0x00ff0000) +
           __umulhi (a & 0xff000000, b & 0xff000000));
    // clamp products to 255
    plo |= __vcmpne2 (plo & 0xff00ff00, 0x00000000);
    phi |= __vcmpne2 (phi & 0xff00ff00, 0x00000000);
    // extract least significant eight bits of each product
    res = __byte_perm (plo, phi, 0x6420);
    return res;
}

__host__ __device__ uchar4 u8mulsat(const uchar4 &a, const uchar4 &b){

  const unsigned sv = 255;
  uchar4 result;
  unsigned t;
  t = a.x*b.x;
  if (t > sv) t = sv;
  result.x = t;
  t = a.y*b.y;
  if (t > sv) t = sv;
  result.y = t;
  t = a.z*b.z;
  if (t > sv) t = sv;
  result.z = t;
  t = a.w*b.w;
  if (t > sv) t = sv;
  result.w = t;
  return result;
}

__global__ void k(const uchar4 * __restrict__ a, const uchar4 * __restrict__ b, uchar4 * __restrict__ c, unsigned N){

  unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
  if (idx < N)
    c[idx] = u8mulsat(a[idx], b[idx]);
}

__global__ void k1(const unsigned * __restrict__ a, const unsigned * __restrict__ b, unsigned * __restrict__ c, unsigned N){

  unsigned idx = blockIdx.x*blockDim.x+threadIdx.x;
  if (idx < N)
    c[idx] = vmulus4(a[idx], b[idx]);
}

int main(){
  unsigned N = 256U*80U*8U*400U;
  uchar4 *d_a,*d_b, *d_c;
  cudaMalloc(&d_c, sizeof(uchar4)*N);
  cudaMalloc(&d_a, sizeof(uchar4)*N);
  cudaMalloc(&d_b, sizeof(uchar4)*N);
  for (int i = 0; i < 100; i++) {
    k<<<N/256,256>>>(d_a, d_b, d_c, N);
    k1<<<N/256,256>>>((unsigned *)d_a, (unsigned *)d_b, (unsigned *)d_c, N);}
  cudaDeviceSynchronize();
}
$ nvcc -o t2048 t2048.cu -arch=sm_70 -Xptxas -v
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z2k1PKjS0_Pjj' for 'sm_70'
ptxas info    : Function properties for _Z2k1PKjS0_Pjj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 16 registers, 380 bytes cmem[0]
ptxas info    : Compiling entry function '_Z1kPK6uchar4S1_PS_j' for 'sm_70'
ptxas info    : Function properties for _Z1kPK6uchar4S1_PS_j
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 380 bytes cmem[0]
$ nvprof ./t2048
==2696== NVPROF is profiling process 2696, command: ./t2048
==2696== Profiling application: ./t2048
==2696== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   50.21%  100.24ms       100  1.0024ms  998.26us  1.0084ms  k(uchar4 const *, uchar4 const *, uchar4*, unsigned int)
                   49.79%  99.412ms       100  994.12us  990.33us  1.0015ms  k1(unsigned int const *, unsigned int const *, unsigned int*, unsigned int)
      API calls:   57.39%  279.76ms         3  93.254ms  557.75us  278.64ms  cudaMalloc
                   40.69%  198.31ms         1  198.31ms  198.31ms  198.31ms  cudaDeviceSynchronize
                    1.03%  5.0147ms         4  1.2537ms  589.80us  3.2328ms  cuDeviceTotalMem
                    0.51%  2.4799ms       404  6.1380us     333ns  272.34us  cuDeviceGetAttribute
                    0.30%  1.4715ms       200  7.3570us  6.5220us  68.684us  cudaLaunchKernel
                    0.07%  354.69us         4  88.672us  61.927us  166.60us  cuDeviceGetName
                    0.00%  20.956us         4  5.2390us  3.1200us  7.8000us  cuDeviceGetPCIBusId
                    0.00%  10.445us         8  1.3050us     522ns  4.9100us  cuDeviceGet
                    0.00%  3.7970us         4     949ns     780ns  1.2230us  cuDeviceGetUuid
                    0.00%  3.2030us         3  1.0670us     751ns  1.5050us  cuDeviceGetCount
$

C++相关问答推荐

如何创建由符号组成的垂直结果图形?

显式地将值转换为它从函数返回的类型的含义是什么?

为什么在此程序中必须使用Volatile关键字?

在编译时参数化类型定义

在C23中使用_GENERIC实现带有右值的IS_POINTER(P)?

S将C语言宏定义为自身的目的是什么?(在glibc标题中看到)

如何在C中只对字符串(包含数字、单词等)中的数字进行重复操作?

<;unistd.h>;和<;sys/unistd.h>;之间有什么区别?

S在本文中的价值观到底出了什么问题?

如何使用唯一数字对整型进行分区

当内存来自Malloc时,将char*转换为另一个指针类型是否违反了严格的别名规则?

条件跳转或移动取决于未初始化值(S)/未初始化值由堆分配创建(Realloc)

表达式x&;&;(~x)应该返回1还是0?它依赖于编译器吗?

在文件描述符上设置FD_CLOEXEC与将其传递给POSIX_SPOWN_FILE_ACTIONS_ADCLOSE有区别吗?

C中的数组下标和指针算法给出了不同的结果

将数组返回到链表

为什么需要struct in_addr

char* 上的 free() 被 valgrind 识别为无效

在 C 语言中,为什么 10/3 应该给出 3.333 却给出 3.000? (保存 10 和 3 的变量被声明为double)

C++11 的 constexpr 和 C23 的 [[reproducible]] 有什么区别?