我有一个矩阵乘法内核,在 CUDA C 中计时时,它比通过 Rust FFI 调用同一个函数快 10 倍。
我可以直接使用 cuBLAS,但我把这当作练习来学习更高级的 CUDA 优化。
在 NVIDIA 的 Nsight Systems 中查看时,我发现 Rust 版本中内核本身耗时非常长,而两个测试基本上都没有额外的调用开销。这很令人困惑:同一个内核在 Rust 下怎么会跑得更久?
这让我怀疑问题出在我为 Rust 编译这个库的方式上,尤其是因为改用 cuBLAS 时两个测试的计时完全相同。
下面是矩阵库的 build.rs:
use cc;
use std::{env, path::Path};

/// Build script: compiles the CUDA kernels with nvcc and links the CUDA
/// runtime (statically) plus cuBLAS (dynamically).
fn main() {
    println!("cargo:rerun-if-changed=cuda_kernels/cuda_kernels.cu");
    cc::Build::new()
        .cuda(true)
        .cudart("static")
        // IMPORTANT: when the cargo profile has debug info enabled (the
        // default for `cargo test` / dev builds), the cc crate passes `-G`
        // to nvcc. `-G` disables ALL device-code optimization, which makes
        // hand-written kernels roughly an order of magnitude slower — this
        // is why the identical kernel was ~10x slower through the Rust FFI
        // benchmark, while prebuilt cuBLAS was unaffected. Force debug info
        // off and full optimization on so kernels are always built fast.
        .debug(false)
        .opt_level(3)
        .file("cuda_kernels/cuda_kernels.cu")
        .compile("cuda_kernels");
    // Locate the CUDA libraries: honor CUDA_HOME if set, otherwise fall
    // back to the conventional Linux install path.
    if let Ok(cuda_path) = env::var("CUDA_HOME") {
        println!("cargo:rustc-link-search=native={}/lib64", cuda_path);
    } else {
        println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
    }
    println!("cargo:rustc-link-lib=dylib=cudart");
    println!("cargo:rustc-link-lib=dylib=cublas");
}
下面是我用来测试的代码。
首先是我的矩阵库的 CUDA 头文件:
#include <cublas_v2.h>
#include <cuda.h>
// C-linkage API surface of the matrix library. `extern "C"` prevents C++
// name mangling so the symbols can be declared and called from Rust FFI.
extern "C" {
// Misc
// Blocks the host until all previously launched GPU work has finished
// (presumably wraps cudaDeviceSynchronize — confirm in the .cu file).
void cuda_synchronize();
// Matrix Setup API
// Copies `rows * cols` floats from `data` to the device and returns an
// opaque handle (id) used by the other calls.
size_t register_matrix(float* data, size_t rows, size_t cols);
// Releases the device storage associated with `mat_id`.
void unregister_matrix(size_t mat_id);
// Copies the matrix back from the device into `data_buffer`, which the
// caller must have sized to at least `rows * cols` floats.
// NOTE(review): `rows`/`cols` are `int` here but `size_t` everywhere else
// in this API — consider unifying to `size_t` for consistency.
void get_matrix_data(size_t mat_id, int rows, int cols, float* data_buffer);
// Matrix operation API
// Multiplies mat1 (rows1 x cols1) by mat2 (rows2 x cols2) on the GPU and
// returns the handle of a freshly registered result matrix; the caller
// owns it and must unregister_matrix() it.
size_t cuda_matrix_multiply(size_t mat1_id, size_t mat1_rows, size_t mat1_cols, size_t mat2_id, size_t mat2_rows, size_t mat2_cols);
}
CUDA C测试
#include <chrono>
#include <iostream>  // was missing: std::cout / std::endl are used below
#include <vector>
using namespace std::chrono;
#include "../cuda_kernels.cuh"

// Timing-only benchmark for the matrix-multiply kernel; assumes the
// kernels themselves are correct.
int main() {
    const int dim = 4096;
    // Fill a dim x dim buffer with a fixed value (vector fill constructor
    // replaces the push_back loop).
    std::vector<float> data(static_cast<size_t>(dim) * dim, 23.47f);

    // Register the same host buffer as two 4096 x 4096 device matrices.
    // register_matrix returns size_t — keep it as size_t instead of
    // truncating into an int.
    size_t mat1 = register_matrix(data.data(), dim, dim);
    size_t mat2 = register_matrix(data.data(), dim, dim);

    auto start_host = high_resolution_clock::now();

    cudaEvent_t start;
    cudaEvent_t end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start);

    const int num_iter = 100;
    for (int i = 0; i < num_iter; i++) {
        // Perform multiplication, then free the result so device memory
        // does not grow across iterations.
        size_t result_id = cuda_matrix_multiply(mat1, dim, dim, mat2, dim, dim);
        cuda_synchronize();
        unregister_matrix(result_id);
    }

    cudaEventRecord(end);
    // Only the end event needs a host-side wait: by the time `end` has
    // completed, `start` (recorded earlier on the same stream) has too.
    cudaEventSynchronize(end);
    float gpu_time = 0;
    cudaEventElapsedTime(&gpu_time, start, end);

    auto end_host = high_resolution_clock::now();
    auto cpu_time = duration_cast<milliseconds>(end_host - start_host);

    std::cout << "Average gpu function time was: " << gpu_time / num_iter << " ms" << std::endl;
    std::cout << "Including overhead was: " << (float)cpu_time.count() / num_iter << " ms" << std::endl;

    // Clean up: release the input matrices and the timing events.
    unregister_matrix(mat1);
    unregister_matrix(mat2);
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    return 0;
}
在 Rust 这边,下面是这些 CUDA 函数的 FFI 绑定。
bindings.rs:
use std::ffi::c_float;

// FFI declarations matching the `extern "C"` block in cuda_kernels.cuh.
// These signatures must mirror the C side exactly: a mismatch is undefined
// behavior even if the code appears to work.
extern "C" {
    /// Blocks until all outstanding GPU work completes.
    pub fn cuda_synchronize();
    /// Uploads `rows * cols` floats and returns an opaque matrix handle.
    pub fn register_matrix(data: *const c_float, rows: usize, cols: usize) -> usize;
    /// Frees the device matrix behind `mat_id`.
    ///
    /// BUGFIX: the C header declares this `void`, but the binding said
    /// `-> usize` — a mismatched FFI signature (UB). It now returns unit.
    pub fn unregister_matrix(mat_id: usize);
    /// Multiplies two registered matrices on the GPU and returns the handle
    /// of a newly registered result matrix (caller must unregister it).
    pub fn cuda_matrix_multiply(
        mat1_id: usize,
        mat1_rows: usize,
        mat1_cols: usize,
        // renamed from `mat2_buffer` to match the C header's `mat2_id`
        mat2_id: usize,
        mat2_rows: usize,
        mat2_cols: usize,
    ) -> usize;
}
与 CUDA C 版本等价的 Rust 基准测试。
rust_base_test.rs:
// Benchmark mirroring the CUDA C++ one, to chase the ~13x FFI discrepancy.
//
// NOTE: run this with `cargo test --release -- --nocapture`. A plain
// `cargo test` uses the dev profile, and a debug-profile build of the CUDA
// kernels (nvcc `-G`) disables device optimization — the likely source of
// the slowdown being measured here.
use std::time::Instant;
use matrix_lib::bindings::{
    cuda_matrix_multiply, cuda_synchronize, register_matrix, unregister_matrix,
};

#[test]
fn mat_mult_benchmark() {
    let mat_dim = 4096;

    // Keep the host buffers in named bindings so the pointers passed over
    // FFI are unambiguously valid for the calls (clearer than relying on
    // temporary-lifetime rules with `vec![..].as_ptr()`).
    let data_1 = vec![0.0f32; mat_dim * mat_dim];
    let data_2 = vec![0.0f32; mat_dim * mat_dim];
    let (id_1, id_2) = unsafe {
        (
            register_matrix(data_1.as_ptr(), mat_dim, mat_dim),
            register_matrix(data_2.as_ptr(), mat_dim, mat_dim),
        )
    };

    let num_iterations = 100;
    let start = Instant::now();
    let mut result_id = 0;
    for _ in 0..num_iterations {
        unsafe {
            // Multiply, wait for the GPU, then free the result each round.
            result_id = cuda_matrix_multiply(id_1, mat_dim, mat_dim, id_2, mat_dim, mat_dim);
            cuda_synchronize();
            unregister_matrix(result_id);
        }
    }
    unsafe { cuda_synchronize() }
    let elapsed = start.elapsed();
    println!(
        "\n=================================\nTime per iteration: {} ms\n=================================",
        elapsed.as_millis() as f64 / num_iterations as f64
    );
    // Use result_id so the loop cannot be optimized away.
    print!("{}", result_id);

    // Release the input matrices (the C++ benchmark leaks these; no reason
    // to repeat that here).
    unsafe {
        unregister_matrix(id_1);
        unregister_matrix(id_2);
    }
    // Removed the deliberate `assert!(false)` that forced cargo to show
    // stdout — pass `-- --nocapture` to see the timings instead.
}