// Element-wise multiplication of two 16-bit integral SIMD vectors of the same type T.
// Both operands are unwrapped with operator~ and their raw `value` registers are handed to the
// mullo_epi16 intrinsic (keeps the low 16 bits of each product): the 512-bit form under
// BLAZE_AVX512BW_MODE, followed by a 256-bit and a 128-bit fallback.
// NOTE(review): the include guard opening shares the first physical line of this listing, and
// the return type, the braces and the #elif/#else/#endif lines are missing here (the fallbacks
// presumably use the BLAZE_AVX2_MODE / BLAZE_SSE2_MODE guards seen on the sibling overloads) -
// restore from the original header.
35 #ifndef _BLAZE_MATH_SIMD_MULT_H_ 36 #define _BLAZE_MATH_SIMD_MULT_H_ 66 template<
typename T >
68 operator*(
const SIMDi16<T>& a,
const SIMDi16<T>& b ) noexcept
69 #if BLAZE_AVX512BW_MODE 71 return _mm512_mullo_epi16( (~a).value, (~b).value );
75 return _mm256_mullo_epi16( (~a).value, (~b).value );
79 return _mm_mullo_epi16( (~a).value, (~b).value );
// Element-wise multiplication of two 16-bit integral SIMD vectors of different types T1 and T2.
// Dispatches on the instruction-set macro: 512-bit mullo_epi16 under BLAZE_AVX512BW_MODE,
// 256-bit under BLAZE_AVX2_MODE, 128-bit under BLAZE_SSE2_MODE.
// NOTE(review): the template header declaring T1/T2, the return type, the braces and the
// #else/#endif lines are missing from this listing - restore from the original header.
100 operator*(
const SIMDi16<T1>& a,
const SIMDi16<T2>& b ) noexcept
101 #if BLAZE_AVX512BW_MODE 103 return _mm512_mullo_epi16( (~a).value, (~b).value );
105 #elif BLAZE_AVX2_MODE 107 return _mm256_mullo_epi16( (~a).value, (~b).value );
109 #elif BLAZE_SSE2_MODE 111 return _mm_mullo_epi16( (~a).value, (~b).value );
// Multiplication of a signed 16-bit complex SIMD vector (a) by a signed 16-bit integral SIMD
// vector (b). The raw registers are multiplied lane by lane with mullo_epi16, so every 16-bit
// lane of a is multiplied by the lane of b at the same position (presumably real and imaginary
// parts occupy adjacent lanes - verify against the SIMDcint16 definition).
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
130 operator*(
const SIMDcint16& a,
const SIMDint16& b ) noexcept
131 #if BLAZE_AVX512BW_MODE 133 return _mm512_mullo_epi16( (~a).value, (~b).value );
135 #elif BLAZE_AVX2_MODE 137 return _mm256_mullo_epi16( (~a).value, (~b).value );
139 #elif BLAZE_SSE2_MODE 141 return _mm_mullo_epi16( (~a).value, (~b).value );
// Multiplication of an unsigned 16-bit complex SIMD vector (a) by an unsigned 16-bit integral
// SIMD vector (b). The raw registers are multiplied lane by lane with mullo_epi16; the same
// intrinsic serves signed and unsigned data because only the low 16 bits of each product are kept.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
160 operator*(
const SIMDcuint16& a,
const SIMDuint16& b ) noexcept
161 #if BLAZE_AVX512BW_MODE 163 return _mm512_mullo_epi16( (~a).value, (~b).value );
165 #elif BLAZE_AVX2_MODE 167 return _mm256_mullo_epi16( (~a).value, (~b).value );
169 #elif BLAZE_SSE2_MODE 171 return _mm_mullo_epi16( (~a).value, (~b).value );
// Multiplication of a signed 16-bit integral SIMD vector (a) by a signed 16-bit complex SIMD
// vector (b); mirror image of the (SIMDcint16, SIMDint16) overload with the operands swapped.
// The raw registers are multiplied lane by lane with mullo_epi16.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
190 operator*(
const SIMDint16& a,
const SIMDcint16& b ) noexcept
191 #if BLAZE_AVX512BW_MODE 193 return _mm512_mullo_epi16( (~a).value, (~b).value );
195 #elif BLAZE_AVX2_MODE 197 return _mm256_mullo_epi16( (~a).value, (~b).value );
199 #elif BLAZE_SSE2_MODE 201 return _mm_mullo_epi16( (~a).value, (~b).value );
// Multiplication of an unsigned 16-bit integral SIMD vector (a) by an unsigned 16-bit complex
// SIMD vector (b); mirror image of the (SIMDcuint16, SIMDuint16) overload with the operands
// swapped. The raw registers are multiplied lane by lane with mullo_epi16.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
220 operator*(
const SIMDuint16& a,
const SIMDcuint16& b ) noexcept
221 #if BLAZE_AVX512BW_MODE 223 return _mm512_mullo_epi16( (~a).value, (~b).value );
225 #elif BLAZE_AVX2_MODE 227 return _mm256_mullo_epi16( (~a).value, (~b).value );
229 #elif BLAZE_SSE2_MODE 231 return _mm_mullo_epi16( (~a).value, (~b).value );
// Multiplication of two 16-bit integral complex SIMD vectors of the same type T.
// The shuffle controls imply that each complex value occupies two adjacent 16-bit lanes
// (real, imaginary). Every branch computes
//    ( a_r*b_r - a_i*b_i ,  a_r*b_i + a_i*b_r )
// as  a_rr*b + sign( a_ii*b_ri )  with
//    a_rr = (a_r, a_r)   shuffle control 0xA0
//    a_ii = (a_i, a_i)   shuffle control 0xF5
//    b_ri = (b_i, b_r)   shuffle control 0xB1
// and sign() negating the even (real) lanes only. shufflelo and shufflehi are applied back to
// back so that both the lower and the upper four 16-bit lanes of every 128-bit block are permuted.
// AVX-512BW branch: the binary literals are the controls 0xF5, 0xB1 and 0xA0 listed above; the
// negation is a masked subtraction from zero, mask 0x55555555 selecting the even lanes.
// NOTE(review): the binary literals are broken across physical lines in this listing, and the
// return type, the braces, the declarations of x, y, z and the #else/#endif are missing.
249 template<
typename T >
251 operator*(
const SIMDci16<T>& a,
const SIMDci16<T>& b ) noexcept
252 #if BLAZE_AVX512BW_MODE 254 __m512i a_ii = _mm512_shufflelo_epi16( (~a).value, 0b11
'11'01
'01 ); 255 a_ii = _mm512_shufflehi_epi16( a_ii, 0b11'11
'01'01 );
257 __m512i b_ri = _mm512_shufflelo_epi16( (~b).value, 0b10
'11'00
'01 ); 258 b_ri = _mm512_shufflehi_epi16( b_ri, 0b10'11
'00'01 );
260 __m512i a_rr = _mm512_shufflelo_epi16( (~a).value, 0b10
'10'00
'00 ); 261 a_rr = _mm512_shufflehi_epi16( a_rr, 0b10'10
'00'00 );
263 const __m512i a_rr_b = _mm512_mullo_epi16( a_rr, (~b).value );
264 const __m512i a_ii_b_ri = _mm512_mullo_epi16( a_ii, b_ri );
265 const __m512i a_ii_b_ri_signed = _mm512_mask_sub_epi16( a_ii_b_ri, 0x55555555,
266 _mm512_setzero_si512(), a_ii_b_ri );
267 return _mm512_add_epi16( a_rr_b, a_ii_b_ri_signed );
// AVX2 branch: z = a_rr*b, y = a_ii*b_ri. The sign flip is a multiplication by `neg`, whose even
// lanes hold -1 (set_epi16 lists its arguments from the highest lane down to lane 0).
269 #elif BLAZE_AVX2_MODE 272 const __m256i neg( _mm256_set_epi16( 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 ) );
274 x = _mm256_shufflelo_epi16( (~a).value, 0xA0 );
275 x = _mm256_shufflehi_epi16( x, 0xA0 );
276 z = _mm256_mullo_epi16( x, (~b).value );
277 x = _mm256_shufflelo_epi16( (~a).value, 0xF5 );
278 x = _mm256_shufflehi_epi16( x, 0xF5 );
279 y = _mm256_shufflelo_epi16( (~b).value, 0xB1 );
280 y = _mm256_shufflehi_epi16( y, 0xB1 );
281 y = _mm256_mullo_epi16( x, y );
282 y = _mm256_mullo_epi16( y, neg );
283 return _mm256_add_epi16( z, y );
// SSE2 branch: same sequence as the AVX2 branch on 128-bit registers.
285 #elif BLAZE_SSE2_MODE 288 const __m128i neg( _mm_set_epi16( 1, -1, 1, -1, 1, -1, 1, -1 ) );
290 x = _mm_shufflelo_epi16( (~a).value, 0xA0 );
291 x = _mm_shufflehi_epi16( x, 0xA0 );
292 z = _mm_mullo_epi16( x, (~b).value );
293 x = _mm_shufflelo_epi16( (~a).value, 0xF5 );
294 x = _mm_shufflehi_epi16( x, 0xF5 );
295 y = _mm_shufflelo_epi16( (~b).value, 0xB1 );
296 y = _mm_shufflehi_epi16( y, 0xB1 );
297 y = _mm_mullo_epi16( x, y );
298 y = _mm_mullo_epi16( y, neg );
299 return _mm_add_epi16( z, y );
// Element-wise multiplication of two 32-bit integral SIMD vectors of the same type T.
// Uses mullo_epi32 (keeps the low 32 bits of each product): 512-bit under BLAZE_AVX512F_MODE or
// BLAZE_MIC_MODE, 256-bit under BLAZE_AVX2_MODE, 128-bit under BLAZE_SSE4_MODE.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
325 template<
typename T >
327 operator*(
const SIMDi32<T>& a,
const SIMDi32<T>& b ) noexcept
328 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 330 return _mm512_mullo_epi32( (~a).value, (~b).value );
332 #elif BLAZE_AVX2_MODE 334 return _mm256_mullo_epi32( (~a).value, (~b).value );
336 #elif BLAZE_SSE4_MODE 338 return _mm_mullo_epi32( (~a).value, (~b).value );
// Element-wise multiplication of two 32-bit integral SIMD vectors of different types T1 and T2.
// Same dispatch as the single-type overload: 512-bit mullo_epi32 under BLAZE_AVX512F_MODE or
// BLAZE_MIC_MODE, 256-bit under BLAZE_AVX2_MODE, 128-bit under BLAZE_SSE4_MODE.
// NOTE(review): the `typename T2` line of the template header, the return type, the braces and
// the #else/#endif are missing from this listing.
356 template<
typename T1
359 operator*(
const SIMDi32<T1>& a,
const SIMDi32<T2>& b ) noexcept
360 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 362 return _mm512_mullo_epi32( (~a).value, (~b).value );
364 #elif BLAZE_AVX2_MODE 366 return _mm256_mullo_epi32( (~a).value, (~b).value );
368 #elif BLAZE_SSE4_MODE 370 return _mm_mullo_epi32( (~a).value, (~b).value );
// Multiplication of a signed 32-bit complex SIMD vector (a) by a signed 32-bit integral SIMD
// vector (b). The raw registers are multiplied lane by lane with mullo_epi32, so every 32-bit
// lane of a is multiplied by the lane of b at the same position.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
389 operator*(
const SIMDcint32& a,
const SIMDint32& b ) noexcept
390 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 392 return _mm512_mullo_epi32( (~a).value, (~b).value );
394 #elif BLAZE_AVX2_MODE 396 return _mm256_mullo_epi32( (~a).value, (~b).value );
398 #elif BLAZE_SSE4_MODE 400 return _mm_mullo_epi32( (~a).value, (~b).value );
// Multiplication of an unsigned 32-bit complex SIMD vector (a) by an unsigned 32-bit integral
// SIMD vector (b). The raw registers are multiplied lane by lane with mullo_epi32.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
419 operator*(
const SIMDcuint32& a,
const SIMDuint32& b ) noexcept
420 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 422 return _mm512_mullo_epi32( (~a).value, (~b).value );
424 #elif BLAZE_AVX2_MODE 426 return _mm256_mullo_epi32( (~a).value, (~b).value );
428 #elif BLAZE_SSE4_MODE 430 return _mm_mullo_epi32( (~a).value, (~b).value );
448 template<
typename T1
451 operator*(
const SIMDint32& a,
const SIMDcint32& b ) noexcept
452 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 454 return _mm512_mullo_epi32( (~a).value, (~b).value );
456 #elif BLAZE_AVX2_MODE 458 return _mm256_mullo_epi32( (~a).value, (~b).value );
460 #elif BLAZE_SSE4_MODE 462 return _mm_mullo_epi32( (~a).value, (~b).value );
480 template<
typename T1
483 operator*(
const SIMDuint32& a,
const SIMDcuint32& b ) noexcept
484 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 486 return _mm512_mullo_epi32( (~a).value, (~b).value );
488 #elif BLAZE_AVX2_MODE 490 return _mm256_mullo_epi32( (~a).value, (~b).value );
492 #elif BLAZE_SSE4_MODE 494 return _mm_mullo_epi32( (~a).value, (~b).value );
// Multiplication of two 32-bit integral complex SIMD vectors of the same type T.
// The shuffle controls imply that each complex value occupies two adjacent 32-bit lanes
// (real, imaginary). Every branch computes
//    ( a_r*b_r - a_i*b_i ,  a_r*b_i + a_i*b_r )
// as  a_rr*b + sign( a_ii*b_ri )  with a_rr = (a_r, a_r), a_ii = (a_i, a_i), b_ri = (b_i, b_r)
// and sign() negating the even (real) lanes only.
// AVX-512F branch: the permutations are spelled with the _MM_PERM_* enumerators and the negation
// is a masked subtraction from zero, the 16-bit mask selecting the even lanes.
// NOTE(review): unlike the plain 32-bit overloads this branch is not enabled for BLAZE_MIC_MODE;
// the return type, the braces, the declarations of x, y, z and the #else/#endif are missing
// from this listing.
512 template<
typename T >
514 operator*(
const SIMDci32<T>& a,
const SIMDci32<T>& b ) noexcept
515 #if BLAZE_AVX512F_MODE 517 const __m512i a_ii = _mm512_shuffle_epi32( (~a).value, _MM_PERM_DDBB );
518 const __m512i b_ri = _mm512_shuffle_epi32( (~b).value, _MM_PERM_CDAB );
519 const __m512i a_rr = _mm512_shuffle_epi32( (~a).value, _MM_PERM_CCAA );
521 const __m512i a_rr_b = _mm512_mullo_epi32( a_rr, (~b).value );
522 const __m512i a_ii_b_ri = _mm512_mullo_epi32( a_ii, b_ri );
523 const __m512i a_ii_b_ri_signed = _mm512_mask_sub_epi32( a_ii_b_ri, 0b0101010101010101,
524 _mm512_setzero_si512(), a_ii_b_ri );
525 return _mm512_add_epi32( a_rr_b, a_ii_b_ri_signed );
// AVX2 branch: 0xA0 duplicates the real parts, 0xF5 duplicates the imaginary parts, 0xB1 swaps
// real and imaginary parts; `neg` holds -1 in the even lanes (set_epi32 lists the highest lane
// first), so the multiplication by it flips the sign of the a_i*b_i terms.
527 #elif BLAZE_AVX2_MODE 530 const __m256i neg( _mm256_set_epi32( 1, -1, 1, -1, 1, -1, 1, -1 ) );
532 x = _mm256_shuffle_epi32( (~a).value, 0xA0 );
533 z = _mm256_mullo_epi32( x, (~b).value );
534 x = _mm256_shuffle_epi32( (~a).value, 0xF5 );
535 y = _mm256_shuffle_epi32( (~b).value, 0xB1 );
536 y = _mm256_mullo_epi32( x, y );
537 y = _mm256_mullo_epi32( y, neg );
538 return _mm256_add_epi32( z, y );
// SSE4 branch: same sequence as the AVX2 branch on 128-bit registers.
540 #elif BLAZE_SSE4_MODE 543 const __m128i neg( _mm_set_epi32( 1, -1, 1, -1 ) );
545 x = _mm_shuffle_epi32( (~a).value, 0xA0 );
546 z = _mm_mullo_epi32( x, (~b).value );
547 x = _mm_shuffle_epi32( (~a).value, 0xF5 );
548 y = _mm_shuffle_epi32( (~b).value, 0xB1 );
549 y = _mm_mullo_epi32( x, y );
550 y = _mm_mullo_epi32( y, neg );
551 return _mm_add_epi32( z, y );
// Element-wise multiplication of two 64-bit integral SIMD vectors of the same type T.
// Only a 512-bit implementation is visible: _mm512_mullo_epi64 (keeps the low 64 bits of each
// product), enabled under BLAZE_AVX512DQ_MODE.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
577 template<
typename T >
579 operator*(
const SIMDi64<T>& a,
const SIMDi64<T>& b ) noexcept
580 #if BLAZE_AVX512DQ_MODE 582 return _mm512_mullo_epi64( (~a).value, (~b).value );
// Element-wise multiplication of two 64-bit integral SIMD vectors of different types T1 and T2.
// Only a 512-bit implementation is visible: _mm512_mullo_epi64 under BLAZE_AVX512DQ_MODE.
// NOTE(review): the `typename T2` line of the template header, the return type, the braces and
// the #else/#endif are missing from this listing.
600 template<
typename T1
603 operator*(
const SIMDi64<T1>& a,
const SIMDi64<T2>& b ) noexcept
604 #if BLAZE_AVX512DQ_MODE 606 return _mm512_mullo_epi64( (~a).value, (~b).value );
// Multiplication of a signed 64-bit complex SIMD vector (a) by a signed 64-bit integral SIMD
// vector (b). The raw registers are multiplied lane by lane with _mm512_mullo_epi64, available
// under BLAZE_AVX512DQ_MODE only.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
625 operator*(
const SIMDcint64& a,
const SIMDint64& b ) noexcept
626 #if BLAZE_AVX512DQ_MODE 628 return _mm512_mullo_epi64( (~a).value, (~b).value );
// Multiplication of an unsigned 64-bit complex SIMD vector (a) by an unsigned 64-bit integral
// SIMD vector (b). The raw registers are multiplied lane by lane with _mm512_mullo_epi64,
// available under BLAZE_AVX512DQ_MODE only.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
647 operator*(
const SIMDcuint64& a,
const SIMDuint64& b ) noexcept
648 #if BLAZE_AVX512DQ_MODE 650 return _mm512_mullo_epi64( (~a).value, (~b).value );
668 template<
typename T1
671 operator*(
const SIMDint64& a,
const SIMDcint64& b ) noexcept
672 #if BLAZE_AVX512DQ_MODE 674 return _mm512_mullo_epi64( (~a).value, (~b).value );
692 template<
typename T1
695 operator*(
const SIMDuint64& a,
const SIMDcuint64& b ) noexcept
696 #if BLAZE_AVX512DQ_MODE || BLAZE_MIC_MODE 698 return _mm512_mullo_epi64( (~a).value, (~b).value );
// NOTE(review): from here to the end of the 32-bit floating point section the extraction fused
// several definitions onto shared physical lines (and the fused lines contain `//` separators,
// so most of their text is commented out as written). The span holds, in order:
//   1. operator*( SIMDci64<T>, SIMDci64<T> )  - complex 64-bit integral product
//   2. struct SIMDf32MultExpr<T1,T2>          - expression object for 32-bit float products
//   3. operator*( SIMDf32<T1>, SIMDf32<T2> )  - returns SIMDf32MultExpr<T1,T2>( ~a, ~b )
//   4. operator*( SIMDcfloat, SIMDfloat ) and operator*( SIMDfloat, SIMDcfloat )
//   5. operator*( SIMDcfloat, SIMDcfloat )    - complex 32-bit float product
//
// (1) Complex 64-bit integral product, AVX-512DQ only. Each complex value spans one 128-bit block
// (four 32-bit elements), so _mm512_shuffle_epi32 is used to move whole 64-bit halves: a_ii
// duplicates the imaginary part, b_ri swaps real and imaginary part, a_rr duplicates the real
// part. The result is a_rr*b + sign( a_ii*b_ri ), where the masked subtraction from zero
// (mask 0b01010101) negates the even, i.e. real, 64-bit lanes.
716 template<
typename T >
718 operator*(
const SIMDci64<T>& a,
const SIMDci64<T>& b ) noexcept
719 #if BLAZE_AVX512DQ_MODE 721 const __m512i a_ii = _mm512_shuffle_epi32( (~a).value, 0b11
'10'11
'10 ); 722 const __m512i b_ri = _mm512_shuffle_epi32( (~b).value, 0b01'00
'11'10 );
723 const __m512i a_rr = _mm512_shuffle_epi32( (~a).value, 0b01
'00'01
'00 ); 725 const __m512i a_rr_b = _mm512_mullo_epi64( a_rr, (~b).value ); 726 const __m512i a_ii_b_ri = _mm512_mullo_epi64( a_ii, b_ri ); 727 const __m512i a_ii_b_ri_signed = _mm512_mask_sub_epi64( a_ii_b_ri, 0b01010101, 728 _mm512_setzero_si512(), a_ii_b_ri ); 729 return _mm512_add_epi64( a_rr_b, a_ii_b_ri_signed ); 734 //************************************************************************************************* 739 //================================================================================================= 741 // 32-BIT FLOATING POINT SIMD TYPES 743 //================================================================================================= 745 //************************************************************************************************* 752 template< typename T1 // Type of the left-hand side operand 753 , typename T2 > // Type of the right-hand side operand 754 struct SIMDf32MultExpr 755 : public SIMDf32< SIMDf32MultExpr<T1,T2> > 757 //**Type definitions**************************************************************************** 758 using This = SIMDf32MultExpr<T1,T2>; //!< Type of this SIMDf32MultExpr instance. 759 using BaseType = SIMDf32<This>; //!< Base type of this SIMDf32MultExpr instance. 
// (2) SIMDf32MultExpr, continued: the constructor copies both operands into the members a_ and
// b_; eval() evaluates both operands and multiplies their registers with mul_ps at the width
// selected by the instruction-set macro (512-bit under BLAZE_AVX512F_MODE or BLAZE_MIC_MODE, then
// 256-bit and 128-bit fallbacks whose guards are missing from this listing).
760 //********************************************************************************************** 762 //**Constructor********************************************************************************* 768 explicit BLAZE_ALWAYS_INLINE SIMDf32MultExpr( const T1& a, const T2& b ) 769 : a_( a ) // The left-hand side operand for the multiplication 770 , b_( b ) // The right-hand side operand for the multiplication 772 //********************************************************************************************** 774 //**Evaluation function************************************************************************* 779 BLAZE_ALWAYS_INLINE const SIMDfloat eval() const noexcept 780 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 782 return _mm512_mul_ps( a_.eval().value, b_.eval().value ); 786 return _mm256_mul_ps( a_.eval().value, b_.eval().value ); 790 return _mm_mul_ps( a_.eval().value, b_.eval().value ); 795 //********************************************************************************************** 797 //**Member variables**************************************************************************** 798 const T1 a_; //!< The left-hand side operand for the multiplication. 799 const T2 b_; //!< The right-hand side operand for the multiplication. 
// (3)-(5) The generic operator only wraps its operands in a SIMDf32MultExpr. The two mixed
// real/complex operators multiply the raw registers lane by lane with mul_ps. The complex/complex
// operator computes ( a_r*b_r - a_i*b_i , a_r*b_i + a_i*b_r ): in the AVX-512F branch as
// fmaddsub( a_rr, b, a_ii*b_ri ), which subtracts in the even lanes and adds in the odd lanes.
800 //********************************************************************************************** 802 //************************************************************************************************* 805 //************************************************************************************************* 815 template< typename T1 // Type of the left-hand side operand 816 , typename T2 > // Type of the right-hand side operand 817 BLAZE_ALWAYS_INLINE const SIMDf32MultExpr<T1,T2> 818 operator*( const SIMDf32<T1>& a, const SIMDf32<T2>& b ) noexcept 820 return SIMDf32MultExpr<T1,T2>( ~a, ~b ); 822 //************************************************************************************************* 825 //************************************************************************************************* 835 BLAZE_ALWAYS_INLINE const SIMDcfloat 836 operator*( const SIMDcfloat& a, const SIMDfloat& b ) noexcept 837 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 839 return _mm512_mul_ps( a.value, b.value ); 843 return _mm256_mul_ps( a.value, b.value ); 847 return _mm_mul_ps( a.value, b.value ); 852 //************************************************************************************************* 855 //************************************************************************************************* 865 BLAZE_ALWAYS_INLINE const SIMDcfloat 866 operator*( const SIMDfloat& a, const SIMDcfloat& b ) noexcept 867 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 869 return _mm512_mul_ps( a.value, b.value ); 873 return _mm256_mul_ps( a.value, b.value ); 877 return _mm_mul_ps( a.value, b.value ); 882 //************************************************************************************************* 885 //************************************************************************************************* 895 BLAZE_ALWAYS_INLINE const SIMDcfloat 896 operator*( const SIMDcfloat& a, const SIMDcfloat& b ) noexcept 897 #if BLAZE_AVX512F_MODE 899 const __m512 a_ii = _mm512_permute_ps( a.value, 0b11'11
'01'01 );
900 const __m512 b_ri = _mm512_permute_ps( b.value, 0b10
'11'00
'01 ); 901 const __m512 a_rr = _mm512_permute_ps( a.value, 0b10'10
'00'00 );
902 return _mm512_fmaddsub_ps( a_rr, b.value, _mm512_mul_ps( a_ii, b_ri ) );
// 256-bit fallback of the complex/complex product (its guard line is missing from this listing):
// z = a_rr*b, y = a_ii*b_ri, combined with addsub (subtract in even lanes, add in odd lanes).
908 x = _mm256_shuffle_ps( a.value, a.value, 0xA0 );
909 z = _mm256_mul_ps( x, b.value );
910 x = _mm256_shuffle_ps( a.value, a.value, 0xF5 );
911 y = _mm256_shuffle_ps( b.value, b.value, 0xB1 );
912 y = _mm256_mul_ps( x, y );
913 return _mm256_addsub_ps( z, y );
// 128-bit fallback; guarded by BLAZE_SSE3_MODE because addsub_ps is an SSE3 instruction.
915 #elif BLAZE_SSE3_MODE 919 x = _mm_shuffle_ps( a.value, a.value, 0xA0 );
920 z = _mm_mul_ps( x, b.value );
921 x = _mm_shuffle_ps( a.value, a.value, 0xF5 );
922 y = _mm_shuffle_ps( b.value, b.value, 0xB1 );
923 y = _mm_mul_ps( x, y );
924 return _mm_addsub_ps( z, y );
// Fragments of the SIMDf64MultExpr<T1,T2> expression object for 64-bit floating point products.
// Visible here: the start of the template header, the base class SIMDf64< SIMDf64MultExpr<T1,T2> >
// and the three bodies of eval(), which evaluate both operands (a_ and b_) and multiply their
// registers with mul_pd: 512-bit under BLAZE_AVX512F_MODE or BLAZE_MIC_MODE, then a 256-bit
// fallback (guard line missing, presumably BLAZE_AVX_MODE as on the operators below - confirm),
// then 128-bit under BLAZE_SSE2_MODE.
// NOTE(review): the struct name line, the type aliases, the constructor, the eval() signature and
// the member declarations are missing from this listing - restore from the original header.
947 template<
typename T1
950 :
public SIMDf64< SIMDf64MultExpr<T1,T2> >
975 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 977 return _mm512_mul_pd(
a_.eval().value,
b_.eval().value );
981 return _mm256_mul_pd(
a_.eval().value,
b_.eval().value );
983 #elif BLAZE_SSE2_MODE 985 return _mm_mul_pd(
a_.eval().value,
b_.eval().value );
// Multiplication operator for two 64-bit floating point SIMD operands of types T1 and T2.
// NOTE(review): the `typename T2` line, the return type and the body are missing from this
// listing. The 32-bit counterpart in this file returns SIMDf32MultExpr<T1,T2>( ~a, ~b ), so this
// overload presumably returns the analogous SIMDf64MultExpr<T1,T2> - confirm against the header.
1010 template<
typename T1
1013 operator*(
const SIMDf64<T1>& a,
const SIMDf64<T2>& b ) noexcept
// Multiplication of a double precision complex SIMD vector (a) by a double precision SIMD
// vector (b). The raw registers are multiplied lane by lane with mul_pd: 512-bit under
// BLAZE_AVX512F_MODE or BLAZE_MIC_MODE, 256-bit under BLAZE_AVX_MODE, 128-bit under
// BLAZE_SSE2_MODE. The operands are concrete types, so `value` is read directly.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
1031 operator*(
const SIMDcdouble& a,
const SIMDdouble& b ) noexcept
1032 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 1034 return _mm512_mul_pd( a.value, b.value );
1036 #elif BLAZE_AVX_MODE 1038 return _mm256_mul_pd( a.value, b.value );
1040 #elif BLAZE_SSE2_MODE 1042 return _mm_mul_pd( a.value, b.value );
// Multiplication of a double precision SIMD vector (a) by a double precision complex SIMD
// vector (b); mirror image of the (SIMDcdouble, SIMDdouble) overload with the operands swapped.
// The raw registers are multiplied lane by lane with mul_pd at the width selected by the
// instruction-set macro.
// NOTE(review): return type, braces and #else/#endif are missing from this listing.
1061 operator*(
const SIMDdouble& a,
const SIMDcdouble& b ) noexcept
1062 #if BLAZE_AVX512F_MODE || BLAZE_MIC_MODE 1064 return _mm512_mul_pd( a.value, b.value );
1066 #elif BLAZE_AVX_MODE 1068 return _mm256_mul_pd( a.value, b.value );
1070 #elif BLAZE_SSE2_MODE 1072 return _mm_mul_pd( a.value, b.value );
// Multiplication of two double precision complex SIMD vectors.
// The permutation controls imply that each complex value occupies two adjacent 64-bit lanes
// (real, imaginary). Every branch computes
//    ( a_r*b_r - a_i*b_i ,  a_r*b_i + a_i*b_r )
// from a_rr = (a_r, a_r), a_ii = (a_i, a_i) and b_ri = (b_i, b_r).
// AVX-512F branch: permute_pd uses one control bit per lane (all ones duplicates the imaginary
// parts, alternating 0/1 swaps the halves, zero duplicates the real parts); fmaddsub computes
// a_rr*b minus a_ii*b_ri in the even lanes and plus in the odd lanes.
// NOTE(review): the binary literals are broken across physical lines in this listing, and the
// return type, the braces, the declarations of x, y, z and the #else/#endif are missing.
1091 operator*(
const SIMDcdouble& a,
const SIMDcdouble& b ) noexcept
1092 #if BLAZE_AVX512F_MODE 1094 const __m512d a_ii = _mm512_permute_pd( a.value, 0b1
'1'1
'1'1
'1'1
'1 ); 1095 const __m512d b_ri = _mm512_permute_pd( b.value, 0b0'1
'0'1
'0'1
'0'1 );
1096 const __m512d a_rr = _mm512_permute_pd( a.value, 0 );
1097 return _mm512_fmaddsub_pd( a_rr, b.value, _mm512_mul_pd( a_ii, b_ri ) );
// AVX branch: control 0 duplicates the real parts, 15 duplicates the imaginary parts, 5 swaps
// real and imaginary parts; addsub subtracts in the even lanes and adds in the odd lanes.
1099 #elif BLAZE_AVX_MODE 1103 x = _mm256_shuffle_pd( a.value, a.value, 0 );
1104 z = _mm256_mul_pd( x, b.value );
1105 x = _mm256_shuffle_pd( a.value, a.value, 15 );
1106 y = _mm256_shuffle_pd( b.value, b.value, 5 );
1107 y = _mm256_mul_pd( x, y );
1108 return _mm256_addsub_pd( z, y );
// SSE3 branch: same scheme on a single complex value (controls 0, 3 and 1).
1110 #elif BLAZE_SSE3_MODE 1114 x = _mm_shuffle_pd( a.value, a.value, 0 );
1115 z = _mm_mul_pd( x, b.value );
1116 x = _mm_shuffle_pd( a.value, a.value, 3 );
1117 y = _mm_shuffle_pd( b.value, b.value, 1 );
1118 y = _mm_mul_pd( x, y );
1119 return _mm_addsub_pd( z, y );
BLAZE_ALWAYS_INLINE const SIMDdouble eval() const noexcept=delete
Evaluation of the expression object.
Expression object for 64-bit floating point multiplication operations. The SIMDf64MultExpr class repre...
Definition: Mult.h:949
SIMD type for 64-bit double precision floating point data values.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
#define BLAZE_ALWAYS_INLINE
Platform dependent setup of an enforced inline keyword.
Definition: Inline.h:85
decltype(auto) operator *(const DenseMatrix< MT1, false > &lhs, const DenseMatrix< MT2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices (\(A = B * C\)).
Definition: DMatDMatMultExpr.h:9091
const T2 b_
The right-hand side operand for the multiplication.
Definition: Mult.h:994
Header file for the basic SIMD types.
BLAZE_ALWAYS_INLINE SIMDf64MultExpr(const T1 &a, const T2 &b)
Constructor for the SIMDf64MultExpr class.
Definition: Mult.h:963
System settings for the SSE mode.
SIMDf64< This > BaseType
Base type of this SIMDf64MultExpr instance.
Definition: Mult.h:954
System settings for the inline keywords.
const T1 a_
The left-hand side operand for the multiplication.
Definition: Mult.h:993