#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H

namespace Eigen {

// Interface that GpuDevice uses to abstract the underlying CUDA stream and
// the memory allocations performed on the device that runs the stream.
class StreamInterface {
 public:
  virtual ~StreamInterface() {}

  virtual const cudaStream_t& stream() const = 0;
  virtual const cudaDeviceProp& deviceProperties() const = 0;

  // Allocate memory on the actual device where the computation will run.
  virtual void* allocate(size_t num_bytes) const = 0;
  virtual void deallocate(void* buffer) const = 0;
};
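// Illustrative sketch (not part of the original header): an implementation of
// StreamInterface only has to expose a stream, the properties of the device
// the stream runs on, and a pair of allocation routines. CudaStreamDevice
// below is the stock implementation; a hypothetical pool-backed one could
// look like this (PoolAllocator and its take/give_back methods are assumed,
// not real Eigen or CUDA API):
//
//   class PooledStreamDevice : public StreamInterface {
//    public:
//     const cudaStream_t& stream() const { return *stream_; }
//     const cudaDeviceProp& deviceProperties() const { return props_; }
//     void* allocate(size_t num_bytes) const { return pool_->take(num_bytes); }
//     void deallocate(void* buffer) const { pool_->give_back(buffer); }
//    private:
//     const cudaStream_t* stream_;
//     cudaDeviceProp props_;
//     PoolAllocator* pool_;
//   };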
// Properties of every visible CUDA device, lazily initialized.
static cudaDeviceProp* m_deviceProperties;
static bool m_devicePropInitialized = false;

static void initializeDeviceProp() {
  if (!m_devicePropInitialized) {
    int num_devices;
    cudaError_t status = cudaGetDeviceCount(&num_devices);
    EIGEN_UNUSED_VARIABLE(status)
    assert(status == cudaSuccess);
    m_deviceProperties = new cudaDeviceProp[num_devices];
    for (int i = 0; i < num_devices; ++i) {
      status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
      assert(status == cudaSuccess);
    }
    m_devicePropInitialized = true;
  }
}

static const cudaStream_t default_stream = cudaStreamDefault;
class CudaStreamDevice : public StreamInterface {
 public:
  // Use the default stream on the current device.
  CudaStreamDevice() : stream_(&default_stream) {
    cudaGetDevice(&device_);
    initializeDeviceProp();
  }
  // Use the default stream on the specified device.
  CudaStreamDevice(int device) : stream_(&default_stream), device_(device) {
    initializeDeviceProp();
  }
  // Use the specified stream. It is the caller's responsibility to ensure
  // that the stream can run on the specified device; if no device is given,
  // the stream is assumed to belong to the current device.
  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
      : stream_(stream), device_(device) {
    if (device < 0) {
      cudaGetDevice(&device_);
    } else {
      int num_devices;
      cudaError_t err = cudaGetDeviceCount(&num_devices);
      EIGEN_UNUSED_VARIABLE(err)
      assert(err == cudaSuccess);
      assert(device < num_devices);
      device_ = device;
    }
    initializeDeviceProp();
  }
  const cudaStream_t& stream() const { return *stream_; }
  const cudaDeviceProp& deviceProperties() const {
    return m_deviceProperties[device_];
  }
  virtual void* allocate(size_t num_bytes) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    void* result;
    err = cudaMalloc(&result, num_bytes);
    assert(err == cudaSuccess);
    assert(result != NULL);
    return result;
  }
  virtual void deallocate(void* buffer) const {
    cudaError_t err = cudaSetDevice(device_);
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
    assert(buffer != NULL);
    err = cudaFree(buffer);
    assert(err == cudaSuccess);
  }

 private:
  const cudaStream_t* stream_;
  int device_;
};
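// Example (sketch): wrap an explicitly created CUDA stream so that all work
// is queued on it. The stream must outlive the CudaStreamDevice, and the
// device index passed here (0) is just an illustration.
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   {
//     Eigen::CudaStreamDevice stream_device(&stream, /*device=*/0);
//     // ... use stream_device, e.g. through a GpuDevice ...
//   }
//   cudaStreamDestroy(stream);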
// GpuDevice does not own the StreamInterface: the caller is responsible for
// its initialization and eventual destruction.
struct GpuDevice {
  explicit GpuDevice(const StreamInterface* stream) : stream_(stream) {
    eigen_assert(stream);
  }

  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
    return stream_->stream();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
#ifndef __CUDA_ARCH__
    return stream_->allocate(num_bytes);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
    return NULL;
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
#ifndef __CUDA_ARCH__
    stream_->deallocate(buffer);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  // Device-to-device copy, asynchronous on the underlying stream.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
                                      stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err =
        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef __CUDA_ARCH__
    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
    // FIXME
    return 32;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
    // FIXME
    return 48*1024;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on CUDA devices.
    return firstLevelCacheSize();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
    cudaError_t err = cudaStreamSynchronize(stream_->stream());
    EIGEN_UNUSED_VARIABLE(err)
    assert(err == cudaSuccess);
#else
    assert(false && "The default device should be used instead to generate kernel code");
#endif
  }
  inline int getNumCudaMultiProcessors() const {
    return stream_->deviceProperties().multiProcessorCount;
  }
  inline int maxCudaThreadsPerBlock() const {
    return stream_->deviceProperties().maxThreadsPerBlock;
  }
  inline int maxCudaThreadsPerMultiProcessor() const {
    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
  }
  inline int sharedMemPerBlock() const {
    return stream_->deviceProperties().sharedMemPerBlock;
  }
  inline int majorDeviceVersion() const {
    return stream_->deviceProperties().major;
  }
  // Returns true if the underlying stream has not recorded an error:
  // cudaErrorNotReady only means there is still work in flight.
  inline bool ok() const {
#ifdef __CUDACC__
    cudaError_t error = cudaStreamQuery(stream_->stream());
    return (error == cudaSuccess) || (error == cudaErrorNotReady);
#else
    return false;
#endif
  }

 private:
  const StreamInterface* stream_;
};
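// Example (sketch): drive a GpuDevice through a CudaStreamDevice. h_buf and
// n are assumed to be a host buffer and its element count; all operations
// are asynchronous on the wrapped stream, so synchronize() is required
// before the host reads the results.
//
//   Eigen::CudaStreamDevice stream_device;  // default stream, current device
//   Eigen::GpuDevice gpu_device(&stream_device);
//
//   float* d_buf = static_cast<float*>(gpu_device.allocate(n * sizeof(float)));
//   gpu_device.memcpyHostToDevice(d_buf, h_buf, n * sizeof(float));
//   // ... evaluate work on gpu_device ...
//   gpu_device.memcpyDeviceToHost(h_buf, d_buf, n * sizeof(float));
//   gpu_device.synchronize();
//   assert(gpu_device.ok());
//   gpu_device.deallocate(d_buf);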
#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)           \
  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
  assert(cudaGetLastError() == cudaSuccess);

// FIXME: Should be device and kernel specific.
#ifdef __CUDACC__
static inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
  EIGEN_UNUSED_VARIABLE(status)
  assert(status == cudaSuccess);
}
#endif
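// Example (sketch): LAUNCH_CUDA_KERNEL queues a __global__ kernel on the
// device's stream and asserts that the launch succeeded. my_kernel,
// num_blocks and block_size are placeholders, not part of this header:
//
//   LAUNCH_CUDA_KERNEL(my_kernel, num_blocks, block_size, /*sharedmem=*/0,
//                      gpu_device, d_buf, n);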
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H