10 #ifndef EIGEN_PACKET_MATH_CUDA_H 11 #define EIGEN_PACKET_MATH_CUDA_H 20 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU) 21 template<>
struct is_arithmetic<float4> {
enum { value =
true }; };
22 template<>
struct is_arithmetic<double2> {
enum { value =
true }; };
25 template<>
struct packet_traits<float> : default_packet_traits
50 template<>
struct packet_traits<double> : default_packet_traits
74 template<>
struct unpacket_traits<float4> {
typedef float type;
enum {size=4, alignment=
Aligned16};
typedef float4 half; };
75 template<>
struct unpacket_traits<double2> {
typedef double type;
enum {size=2, alignment=
Aligned16};
typedef double2 half; };
77 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(
const float& from) {
78 return make_float4(from, from, from, from);
80 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
81 return make_double2(from, from);
85 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(
const float& a) {
86 return make_float4(a, a+1, a+2, a+3);
88 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(
const double& a) {
89 return make_double2(a, a+1);
92 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a,
const float4& b) {
93 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
95 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a,
const double2& b) {
96 return make_double2(a.x+b.x, a.y+b.y);
99 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a,
const float4& b) {
100 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
102 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a,
const double2& b) {
103 return make_double2(a.x-b.x, a.y-b.y);
106 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(
const float4& a) {
107 return make_float4(-a.x, -a.y, -a.z, -a.w);
109 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(
const double2& a) {
110 return make_double2(-a.x, -a.y);
113 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(
const float4& a) {
return a; }
114 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(
const double2& a) {
return a; }
116 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a,
const float4& b) {
117 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
119 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a,
const double2& b) {
120 return make_double2(a.x*b.x, a.y*b.y);
123 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a,
const float4& b) {
124 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
126 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a,
const double2& b) {
127 return make_double2(a.x/b.x, a.y/b.y);
130 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a,
const float4& b) {
131 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
133 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a,
const double2& b) {
134 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
137 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a,
const float4& b) {
138 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
140 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a,
const double2& b) {
141 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
144 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(
const float* from) {
145 return *
reinterpret_cast<const float4*
>(from);
148 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
149 return *
reinterpret_cast<const double2*
>(from);
152 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
153 return make_float4(from[0], from[1], from[2], from[3]);
155 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
156 return make_double2(from[0], from[1]);
159 template<> EIGEN_STRONG_INLINE float4 ploaddup<float4>(
const float* from) {
160 return make_float4(from[0], from[0], from[1], from[1]);
162 template<> EIGEN_STRONG_INLINE double2 ploaddup<double2>(
const double* from) {
163 return make_double2(from[0], from[0]);
166 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const float4& from) {
167 *
reinterpret_cast<float4*
>(to) = from;
170 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const double2& from) {
171 *
reinterpret_cast<double2*
>(to) = from;
174 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const float4& from) {
181 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const double2& from) {
186 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 188 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
189 return __ldg((
const float4*)from);
192 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
193 return __ldg((
const double2*)from);
197 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
198 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
201 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
202 return make_double2(__ldg(from+0), __ldg(from+1));
206 template<> EIGEN_DEVICE_FUNC
inline float4 pgather<float, float4>(
const float* from, Index stride) {
207 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
210 template<> EIGEN_DEVICE_FUNC
inline double2 pgather<double, double2>(
const double* from, Index stride) {
211 return make_double2(from[0*stride], from[1*stride]);
214 template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, float4>(
float* to,
const float4& from, Index stride) {
215 to[stride*0] = from.x;
216 to[stride*1] = from.y;
217 to[stride*2] = from.z;
218 to[stride*3] = from.w;
220 template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, double2>(
double* to,
const double2& from, Index stride) {
221 to[stride*0] = from.x;
222 to[stride*1] = from.y;
225 template<> EIGEN_DEVICE_FUNC
inline float pfirst<float4>(
const float4& a) {
228 template<> EIGEN_DEVICE_FUNC
inline double pfirst<double2>(
const double2& a) {
232 template<> EIGEN_DEVICE_FUNC
inline float predux<float4>(
const float4& a) {
233 return a.x + a.y + a.z + a.w;
235 template<> EIGEN_DEVICE_FUNC
inline double predux<double2>(
const double2& a) {
239 template<> EIGEN_DEVICE_FUNC
inline float predux_max<float4>(
const float4& a) {
240 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
242 template<> EIGEN_DEVICE_FUNC
inline double predux_max<double2>(
const double2& a) {
243 return fmax(a.x, a.y);
246 template<> EIGEN_DEVICE_FUNC
inline float predux_min<float4>(
const float4& a) {
247 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
249 template<> EIGEN_DEVICE_FUNC
inline double predux_min<double2>(
const double2& a) {
250 return fmin(a.x, a.y);
253 template<> EIGEN_DEVICE_FUNC
inline float predux_mul<float4>(
const float4& a) {
254 return a.x * a.y * a.z * a.w;
256 template<> EIGEN_DEVICE_FUNC
inline double predux_mul<double2>(
const double2& a) {
260 template<> EIGEN_DEVICE_FUNC
inline float4 pabs<float4>(
const float4& a) {
261 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
263 template<> EIGEN_DEVICE_FUNC
inline double2 pabs<double2>(
const double2& a) {
264 return make_double2(fabs(a.x), fabs(a.y));
268 EIGEN_DEVICE_FUNC
inline void 269 ptranspose(PacketBlock<float4,4>& kernel) {
270 double tmp = kernel.packet[0].y;
271 kernel.packet[0].y = kernel.packet[1].x;
272 kernel.packet[1].x = tmp;
274 tmp = kernel.packet[0].z;
275 kernel.packet[0].z = kernel.packet[2].x;
276 kernel.packet[2].x = tmp;
278 tmp = kernel.packet[0].w;
279 kernel.packet[0].w = kernel.packet[3].x;
280 kernel.packet[3].x = tmp;
282 tmp = kernel.packet[1].z;
283 kernel.packet[1].z = kernel.packet[2].y;
284 kernel.packet[2].y = tmp;
286 tmp = kernel.packet[1].w;
287 kernel.packet[1].w = kernel.packet[3].y;
288 kernel.packet[3].y = tmp;
290 tmp = kernel.packet[2].w;
291 kernel.packet[2].w = kernel.packet[3].z;
292 kernel.packet[3].z = tmp;
295 EIGEN_DEVICE_FUNC
inline void 296 ptranspose(PacketBlock<double2,2>& kernel) {
297 double tmp = kernel.packet[0].y;
298 kernel.packet[0].y = kernel.packet[1].x;
299 kernel.packet[1].x = tmp;
309 #endif // EIGEN_PACKET_MATH_CUDA_H Definition: Constants.h:230
Definition: Eigen_Colamd.h:54