How C++ uses cuBLAS to accelerate Matrix Multiplication 02/14 Update SLTechnology News&Howtos

How C++ uses cuBLAS to accelerate Matrix Multiplication

2026-02-14 Update From: SLTechnology News&Howtos shulou NAV: SLTechnology News&Howtos > Development >

Shulou(Shulou.com)06/03 Report--

这篇文章主要讲解了"C++如何使用cuBLAS加速矩阵乘法运算"，文中的讲解内容简单清晰，易于学习与理解，下面请大家跟着小编的思路慢慢深入，一起来研究和学习"C++如何使用cuBLAS加速矩阵乘法运算"吧!

test.cpp

#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <climits>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
using namespace std;

// Runs the single-precision product C = A * B on the GPU through cuBLAS.
//
// h_A and h_B are host buffers holding A (rowSizeA x colSizeA) and
// B (colSizeA x colSizeB) in row-major order. h_C receives the
// rowSizeA x colSizeB result in COLUMN-major order (element (i, j) lives at
// h_C[j * rowSizeA + i]). All three dimensions must be positive.
//
// Returns true on success. On any CUDA/cuBLAS failure it reports the failing
// step on stderr, releases whatever device memory was allocated, and returns
// false; h_C is then left unspecified.
static bool gemmOnDevice(cublasHandle_t cuHandle,
                         const float *h_A, const float *h_B, float *h_C,
                         int rowSizeA, int colSizeA, int colSizeB) {
    // Widen before multiplying so large matrices cannot overflow int.
    const size_t countA = static_cast<size_t>(rowSizeA) * colSizeA;
    const size_t countB = static_cast<size_t>(colSizeA) * colSizeB;
    const size_t countC = static_cast<size_t>(rowSizeA) * colSizeB;

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    const char *failedStep = NULL;

    // Allocate device memory for the two operands and the result.
    if (cudaMalloc((void **)&d_A, countA * sizeof(float)) != cudaSuccess ||
        cudaMalloc((void **)&d_B, countB * sizeof(float)) != cudaSuccess ||
        cudaMalloc((void **)&d_C, countC * sizeof(float)) != cudaSuccess) {
        failedStep = "cudaMalloc";
    }

    // Copy the operands into the device buffers (contiguous, stride 1 on both sides).
    if (!failedStep &&
        (cublasSetVector(static_cast<int>(countA), sizeof(float), h_A, 1, d_A, 1) != CUBLAS_STATUS_SUCCESS ||
         cublasSetVector(static_cast<int>(countB), sizeof(float), h_B, 1, d_B, 1) != CUBLAS_STATUS_SUCCESS)) {
        failedStep = "cublasSetVector";
    }

    if (!failedStep) {
        // C = alpha * op(A) * op(B) + beta * C, with alpha = 1 and beta = 0.
        const float alpha = 1.0f;
        const float beta = 0.0f;
        // cuBLAS always interprets buffers as column-major. A row-major
        // rowSizeA x colSizeA buffer is therefore seen as a colSizeA x rowSizeA
        // matrix with leading dimension colSizeA; CUBLAS_OP_T turns it back
        // into A. The same holds for B. The result is written column-major
        // with leading dimension rowSizeA.
        if (cublasSgemm(cuHandle,
                        CUBLAS_OP_T,   // op(A) = transpose of the stored buffer
                        CUBLAS_OP_T,   // op(B) = transpose of the stored buffer
                        rowSizeA,      // rows of op(A) and C
                        colSizeB,      // columns of op(B) and C
                        colSizeA,      // columns of op(A) == rows of op(B)
                        &alpha,
                        d_A, colSizeA, // lda
                        d_B, colSizeB, // ldb
                        &beta,
                        d_C, rowSizeA  // ldc
                        ) != CUBLAS_STATUS_SUCCESS) {
            failedStep = "cublasSgemm";
        }
    }

    // Fetch the result back to the host.
    if (!failedStep &&
        cublasGetVector(static_cast<int>(countC), sizeof(float), d_C, 1, h_C, 1) != CUBLAS_STATUS_SUCCESS) {
        failedStep = "cublasGetVector";
    }

    // cudaFree on a null pointer is a no-op, so this is safe after a partial allocation.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    if (failedStep) {
        cerr << "matMult_cuBLAS: " << failedStep << " failed" << endl;
        return false;
    }
    return true;
}

// Matrix multiplication implemented with cuBLAS.
//
// Multiplies A (rowSizeA x colSizeA) by B (colSizeA x colSizeB), both given as
// arrays of row pointers, and returns a newly allocated rowSizeA x colSizeB
// matrix in the same representation. The caller owns the result and must
// delete[] every row and then the row-pointer array.
//
// The computation is carried out in single precision, so products or sums
// whose magnitude exceeds 2^24 may not be represented exactly.
//
// If any dimension is not positive, or if a CUDA/cuBLAS call fails (a message
// is then written to stderr), the returned matrix is all zeros.
int **matMult_cuBLAS(int **A, int **B, int rowSizeA, int colSizeA, int colSizeB, cublasHandle_t cuHandle) {
    // Result matrix, zero-initialised.
    int **C = new int *[rowSizeA];
    for (int i = 0; i < rowSizeA; i++) {
        C[i] = new int[colSizeB];
        for (int j = 0; j < colSizeB; j++) {
            C[i][j] = 0;
        }
    }

    // Nothing to compute; also avoids passing a zero leading dimension to cuBLAS.
    if (rowSizeA <= 0 || colSizeA <= 0 || colSizeB <= 0) {
        return C;
    }

    const size_t countA = static_cast<size_t>(rowSizeA) * colSizeA;
    const size_t countB = static_cast<size_t>(colSizeA) * colSizeB;
    const size_t countC = static_cast<size_t>(rowSizeA) * colSizeB;

    // cublasSetVector / cublasGetVector take the element count as an int.
    const size_t maxCount = static_cast<size_t>(INT_MAX);
    if (countA > maxCount || countB > maxCount || countC > maxCount) {
        cerr << "matMult_cuBLAS: matrix too large for cublasSetVector" << endl;
        return C;
    }

    // Host staging buffers for the operands and the result (released automatically).
    vector<float> h_A(countA), h_B(countB), h_C(countC, 0.0f);

    // Flatten A and B into contiguous row-major float buffers.
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeA; j++) {
            h_A[static_cast<size_t>(i) * colSizeA + j] = (float)A[i][j];
        }
    }
    for (int i = 0; i < colSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            h_B[static_cast<size_t>(i) * colSizeB + j] = (float)B[i][j];
        }
    }

    if (!gemmOnDevice(cuHandle, &h_A[0], &h_B[0], &h_C[0], rowSizeA, colSizeA, colSizeB)) {
        return C;  // still all zeros
    }

    // h_C is column-major: element (i, j) is at j * rowSizeA + i.
    for (int i = 0; i < rowSizeA; i++) {
        for (int j = 0; j < colSizeB; j++) {
            C[i][j] = (int)h_C[static_cast<size_t>(j) * rowSizeA + i];
        }
    }
    return C;
}

// Builds a random two-dimensional array (matrix).
//
// Returns a newly allocated rowSize x colSize matrix whose entries are drawn
// with rand() from the inclusive range [minValue, maxValue]. The caller owns
// the result. Requires maxValue >= minValue (the range width is used as a
// modulus).
int **uniformMat(int rowSize, int colSize, int minValue, int maxValue) {
    // Seed from the system clock exactly once. Reseeding on every call makes
    // two matrices generated within the same second identical.
    static bool seeded = false;
    if (!seeded) {
        // srand(1024);
        srand((unsigned)time(NULL));
        seeded = true;
    }

    int **mat = new int *[rowSize];
    for (int i = 0; i < rowSize; i++) {
        mat[i] = new int[colSize];
        for (int j = 0; j < colSize; j++) {
            mat[i][j] = (int)(rand() % (maxValue - minValue + 1)) + minValue;
        }
    }
    return mat;
}

int main(void) {
    // Create and initialise the cuBLAS library handle.
    // If the handle is initialised in main and cuBLAS routines are called from
    // another function, cuHandle must be passed into that function and a status
    // object created inside it.
    cublasHandle_t cuHandle;
    cublasStatus_t status = cublasCreate(&cuHandle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
            cout

Subscribe to "Shulou Technology Information" to get the latest news, interesting stories, and hot topics in the IT industry, and to keep up with the latest Internet news, technology news, and IT industry trends.

*The comments in the above article only represent the author's personal views and do not represent the views and positions of this website. If you have more insights, please feel free to contribute and share.