when we have a Fortran source code as given below, named test_gemm.cuf

The code is coming from here.

module precision
! Precision control

integer, parameter, public :: Single = kind(0.0) ! Single precision
integer, parameter, public :: Double = kind(0.0d0) ! Double precision

integer, parameter, public :: fp_kind = Double
!integer, parameter, public :: fp_kind = Single

end module precision


module cublas
!
! Define the INTERFACE to the NVIDIA C code cublasSgemm and cublasDgemm
!
interface cuda_gemm
!
! void cublasSgemm (char transa, char transb, int m, int n,
! int k, float alpha, const float *A, int lda,
! const float *B, int ldb, float beta, float *C, int ldc)
!
subroutine cuda_sgemm(cta, ctb, m, n, k,&
alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasSgemm')
use iso_c_binding
character(1,c_char),value :: cta, ctb
integer(c_int),value :: m,n,k,lda,ldb,ldc
real(c_float),value :: alpha,beta
real(c_float), device, dimension(lda,*) :: A
real(c_float), device, dimension(ldb,*) :: B
real(c_float), device, dimension(ldc,*) :: C
end subroutine cuda_sgemm

!
! void cublasDgemm (char transa, char transb, int m, int n,
! int k, double alpha, const double *A, int lda,
! const double *B, int ldb, double beta, double *C, int ldc)
!
subroutine cuda_dgemm(cta, ctb, m, n, k,&
alpha, A, lda, B, ldb, beta, c, ldc) bind(C,name='cublasDgemm')
use iso_c_binding
character(1,c_char),value :: cta, ctb
integer(c_int),value :: m,n,k,lda,ldb,ldc
real(c_double),value :: alpha,beta
real(c_double), device, dimension(lda,*) :: A
real(c_double), device, dimension(ldb,*) :: B
real(c_double), device, dimension(ldc,*) :: C
end subroutine cuda_dgemm

end interface

end module cublas


program gemm_test
use precision
use cublas
real(fp_kind) ,allocatable:: a(:,:),b(:,:),c(:,:)
real(fp_kind),device,allocatable:: a_d(:,:),b_d(:,:),c_d(:,:)
real(fp_kind):: alpha,beta
integer:: n,m,k

n=4
m=4
k=4
alpha=1._fp_kind
beta=2._fp_kind

! allocate arrays on the host
allocate (a(m,k))
allocate (b(k,n))
allocate (c(m,n))

! allocate arrays on the device
allocate (a_d(m,k))
allocate (b_d(k,n))
allocate (c_d(m,n))

!initialize arrays on host
a=1
b=2
c=3

!copy arrays to device
a_d=a
b_d=b
c_d=c


print *, "Matrix A:"
print *, a

print *, "Matrix B:"
print *, b
print *, "Matrix C:"
print *, c

call cuda_gemm ('N','N',m,n,k,alpha,a_d,m,b_d,k,beta,c_d,m)

c=c_d
print *, "Matrix C = alpha A*B+ beta C"
print *, c

!release memory on the host
deallocate (a,b,c)

!release memory on the device
deallocate (a_d,b_d,c_d)

end program gemm_test

Compile it from PGI BASH, using the command below will give you LINK Error: LNK1104: Can not open "libcublas.lib"

pgfortran -Mcuda -o test_gemm test_gemm.cuf -lcublas

After carefully checking, we already have the NVIDIA CUDA library installed. In PGI we also have they ready at C:Program FilesMATLABR2019bsyscudawin64cudalibx64 in my case.

Such error simply implies that the linker is trying to find the lib file named after libcublas.lib rather than cublas.lib, the latter of which is the one really exists.

The solution is, go to the directory where the library files asides, using soft link or hard link to make a copy using the name that can comfort the linker like below

for i in `ls *.lib`
do ln -s $i lib$i
done

Then the problem is solved. The linker now can find the library files it needs.