// Include guard for this header, followed by the head of the TDVecTDMatMultExpr class template.
// The class derives publicly from TVecMatMultExpr< DenseVector<...,true> > (CRTP-style: the class
// passes itself as the template argument) and privately from Computation.
// NOTE(review): the second template parameter (MT) is used below but its declaration line is not
// part of this excerpt.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_ 118 template<
typename VT
120 class TDVecTDMatMultExpr
121 :
public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
122 ,
private Computation
// Compile-time helper: `value` is true when at least one of the two operands has to be
// evaluated (evaluateVector / evaluateMatrix are defined outside this excerpt).
// The template parameter T1 does not appear in the visible condition.
151 template<
typename T1 >
152 struct UseSMPAssign {
153 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Compile-time helper selecting the BLAS kernel for the three involved types.
// Only part of the condition is visible here: it requires all three types to be SIMD-enabled.
// NOTE(review): the remaining terms of the condition are not part of this excerpt.
163 template<
typename T1,
typename T2,
typename T3 >
164 struct UseBlasKernel {
170 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time helper selecting the vectorized default kernel for the three involved types.
// The visible part of the condition requires useOptimizedKernels and SIMD support on all three
// types; further terms follow on lines that are not part of this excerpt.
186 template<
typename T1,
typename T2,
typename T3 >
187 struct UseVectorizedDefaultKernel {
188 enum :
bool { value = useOptimizedKernels &&
190 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Fragment of a compile-time flag that depends on SIMD support of both operand types.
// NOTE(review): the enum this line belongs to is not visible here (presumably simdEnabled).
226 VT::simdEnabled && MT::simdEnabled &&
// smpAssignable: true only if neither operand needs an intermediate evaluation and both
// operand types are themselves SMP-assignable.
231 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
232 !evaluateMatrix && MT::smpAssignable };
// Element access fragment: returns the product of the vector element and the diagonal matrix
// element at `index`.
// NOTE(review): the branch guarding this return is not visible; presumably the diagonal-matrix
// shortcut of the subscript operator - confirm.
265 return vec_[index] *
mat_(index,index);
// Checked element access fragment: an index at or beyond the number of matrix columns enters the
// error branch (its body is not part of this excerpt); otherwise the call is forwarded to the
// subscript operator of this expression.
293 if( index >=
mat_.columns() ) {
296 return (*
this)[index];
// Returns the size of the resulting vector, which equals the number of columns of the matrix
// operand. Declared noexcept.
305 inline size_t size() const noexcept {
306 return mat_.columns();
// Returns whether the expression can alias with the given address.
// The result is true if either operand reports isAliased() for `alias`
// (note: the operands are queried via isAliased(), not canAlias()).
336 template<
typename T >
337 inline bool canAlias(
const T* alias )
const noexcept {
338 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Returns whether the expression is aliased with the given address: true if either the vector
// operand or the matrix operand reports isAliased() for `alias`.
348 template<
typename T >
349 inline bool isAliased(
const T* alias )
const noexcept {
350 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Alignment query fragment: the expression counts as aligned only if both operands are aligned.
360 return vec_.isAligned() &&
mat_.isAligned();
// Fragment of the SMP-assignment predicate. Visible part: one alternative of the first group is
// that the matrix has fewer than TDVECTDMATMULT_THRESHOLD elements (rows * columns); in addition
// the result size must exceed SMP_TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the leading alternatives of the first parenthesized group are not shown here.
374 (
mat_.rows() *
mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
375 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Assignment of the vector/matrix product to a dense vector (fragment).
// Empty operands are special-cased first (zero rows, then zero columns; the branch bodies are
// not part of this excerpt); otherwise the work is handed to selectAssignKernel.
// NOTE(review): `x` and `A` are declared on lines not shown - presumably the evaluated operands.
398 template<
typename VT1 >
405 if( rhs.mat_.rows() == 0UL ) {
409 else if( rhs.mat_.columns() == 0UL ) {
421 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel dispatch for the assignment y = x * A.
// Two targets are visible: the "small" kernel and the BLAS kernel; the visible tail of the
// condition compares rows * columns against TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the start of the condition and the `else` line are not part of this excerpt.
437 template<
typename VT1
440 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
444 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
445 selectSmallAssignKernel( y, x, A );
447 selectBlasAssignKernel( y, x, A );
// Default (non-specialized) assignment kernel for y = x * A.
// NOTE(review): only the signature is visible in this excerpt; the body is not shown.
466 template<
typename VT1
469 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Fallback overload of the "small" assignment kernel: simply forwards to the default kernel.
// NOTE(review): the enabling condition (return-type line) is not visible; presumably this
// overload is chosen when the vectorized kernel cannot be used.
490 template<
typename VT1
494 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
496 selectDefaultAssignKernel( y, x, A );
// Vectorized "small" assignment kernel for y = x * A (fragment).
// Columns of A are processed in groups of 8, 4, 3, 2 and finally 1. Within each group the rows
// are traversed SIMDSIZE elements at a time, accumulating into one SIMD register per column;
// the horizontal sum of each register is then stored into y, and a scalar loop handles the
// rows left over when `remainder` is set.
// NOTE(review): `j`, `i`, `iend`, `remainder` and `x1` are declared on lines not shown here;
// x1 is presumably the SIMD load of x at row i.
515 template<
typename VT1
519 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
523 const size_t M( A.rows() );
524 const size_t N( A.columns() );
// --- 8 columns per iteration ---
528 for( ; (j+8UL) <= N; j+=8UL )
// ipos is the end of the SIMD loop: iend masked with -SIMDSIZE when a remainder loop is used
// (the assert states this equals iend rounded down to a multiple of SIMDSIZE), else iend itself.
538 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
539 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One accumulator per column of the current group.
541 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
544 for( ; i<ipos; i+=SIMDSIZE ) {
546 xmm1 += x1 * A.load(i,j );
547 xmm2 += x1 * A.load(i,j+1UL);
548 xmm3 += x1 * A.load(i,j+2UL);
549 xmm4 += x1 * A.load(i,j+3UL);
550 xmm5 += x1 * A.load(i,j+4UL);
551 xmm6 += x1 * A.load(i,j+5UL);
552 xmm7 += x1 * A.load(i,j+6UL);
553 xmm8 += x1 * A.load(i,j+7UL);
// Plain assignment (not +=): the reduced accumulators overwrite the target elements.
557 y[j+1UL] =
sum( xmm2 );
558 y[j+2UL] =
sum( xmm3 );
559 y[j+3UL] =
sum( xmm4 );
560 y[j+4UL] =
sum( xmm5 );
561 y[j+5UL] =
sum( xmm6 );
562 y[j+6UL] =
sum( xmm7 );
563 y[j+7UL] =
sum( xmm8 );
// Scalar tail: adds the contributions of the rows not covered by the SIMD loop.
565 for( ; remainder && i<iend; ++i ) {
566 y[j ] += x[i] * A(i,j );
567 y[j+1UL] += x[i] * A(i,j+1UL);
568 y[j+2UL] += x[i] * A(i,j+2UL);
569 y[j+3UL] += x[i] * A(i,j+3UL);
570 y[j+4UL] += x[i] * A(i,j+4UL);
571 y[j+5UL] += x[i] * A(i,j+5UL);
572 y[j+6UL] += x[i] * A(i,j+6UL);
573 y[j+7UL] += x[i] * A(i,j+7UL);
// --- 4 columns per iteration ---
577 for( ; (j+4UL) <= N; j+=4UL )
587 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
588 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
593 for( ; i<ipos; i+=SIMDSIZE ) {
595 xmm1 += x1 * A.load(i,j );
596 xmm2 += x1 * A.load(i,j+1UL);
597 xmm3 += x1 * A.load(i,j+2UL);
598 xmm4 += x1 * A.load(i,j+3UL);
602 y[j+1UL] =
sum( xmm2 );
603 y[j+2UL] =
sum( xmm3 );
604 y[j+3UL] =
sum( xmm4 );
606 for( ; remainder && i<iend; ++i ) {
607 y[j ] += x[i] * A(i,j );
608 y[j+1UL] += x[i] * A(i,j+1UL);
609 y[j+2UL] += x[i] * A(i,j+2UL);
610 y[j+3UL] += x[i] * A(i,j+3UL);
// --- 3 columns per iteration ---
614 for( ; (j+3UL) <= N; j+=3UL )
624 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
625 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
630 for( ; i<ipos; i+=SIMDSIZE ) {
632 xmm1 += x1 * A.load(i,j );
633 xmm2 += x1 * A.load(i,j+1UL);
634 xmm3 += x1 * A.load(i,j+2UL);
638 y[j+1UL] =
sum( xmm2 );
639 y[j+2UL] =
sum( xmm3 );
641 for( ; remainder && i<iend; ++i ) {
642 y[j ] += x[i] * A(i,j );
643 y[j+1UL] += x[i] * A(i,j+1UL);
644 y[j+2UL] += x[i] * A(i,j+2UL);
// --- 2 columns per iteration ---
648 for( ; (j+2UL) <= N; j+=2UL )
658 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
659 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
664 for( ; i<ipos; i+=SIMDSIZE ) {
666 xmm1 += x1 * A.load(i,j );
667 xmm2 += x1 * A.load(i,j+1UL);
671 y[j+1UL] =
sum( xmm2 );
673 for( ; remainder && i<iend; ++i ) {
674 y[j ] += x[i] * A(i,j );
675 y[j+1UL] += x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
689 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
690 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
695 for( ; i<ipos; i+=SIMDSIZE ) {
696 xmm1 += x.load(i) * A.load(i,j);
701 for( ; remainder && i<iend; ++i ) {
702 y[j] += x[i] * A(i,j);
// Fallback overload of the "large" assignment kernel: forwards to the default kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
723 template<
typename VT1
727 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
729 selectDefaultAssignKernel( y, x, A );
// Vectorized "large" assignment kernel for y = x * A (fragment).
// Columns are processed in groups of 8, 4, 2 and 1. For each group the rows are traversed in
// three SIMD stages - four SIMD vectors per step, then two, then one - followed by a scalar
// loop for leftover rows. Every stage adds its partial sums directly onto y with +=.
// NOTE(review): since only += is visible, y is presumably reset before these loops on a line
// not shown here - confirm. `j`, `i`, `iend`, `remainder` and x1..x4 are also declared on
// lines outside this excerpt.
748 template<
typename VT1
752 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
756 const size_t M( A.rows() );
757 const size_t N( A.columns() );
// --- 8 columns per iteration ---
763 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD stages (iend masked with -SIMDSIZE if a remainder loop is used).
773 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
774 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Stage 1: four SIMD-wide row chunks (at i, i1, i2, i3) per step.
778 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
779 const size_t i1( i+SIMDSIZE );
780 const size_t i2( i+SIMDSIZE*2UL );
781 const size_t i3( i+SIMDSIZE*3UL );
786 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
787 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
788 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
789 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
790 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
791 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
792 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
793 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Stage 2: two SIMD-wide row chunks per step.
796 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
797 const size_t i1( i+SIMDSIZE );
800 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
801 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
802 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
803 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
804 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
805 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
806 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
807 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Stage 3: one SIMD-wide row chunk per step.
810 for( ; i<ipos; i+=SIMDSIZE ) {
812 y[j ] +=
sum( x1 * A.load(i,j ) );
813 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
814 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
815 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
816 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
817 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
818 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
819 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail for rows not covered by the SIMD stages.
822 for( ; remainder && i<iend; ++i ) {
823 y[j ] += x[i] * A(i,j );
824 y[j+1UL] += x[i] * A(i,j+1UL);
825 y[j+2UL] += x[i] * A(i,j+2UL);
826 y[j+3UL] += x[i] * A(i,j+3UL);
827 y[j+4UL] += x[i] * A(i,j+4UL);
828 y[j+5UL] += x[i] * A(i,j+5UL);
829 y[j+6UL] += x[i] * A(i,j+6UL);
830 y[j+7UL] += x[i] * A(i,j+7UL);
// --- 4 columns per iteration (same three SIMD stages plus scalar tail) ---
834 for( ; (j+4UL) <= N; j+=4UL )
844 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
845 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
849 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
850 const size_t i1( i+SIMDSIZE );
851 const size_t i2( i+SIMDSIZE*2UL );
852 const size_t i3( i+SIMDSIZE*3UL );
857 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
858 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
859 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
860 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
863 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
864 const size_t i1( i+SIMDSIZE );
867 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
868 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
869 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
870 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
873 for( ; i<ipos; i+=SIMDSIZE ) {
875 y[j ] +=
sum( x1 * A.load(i,j ) );
876 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
877 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
878 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
881 for( ; remainder && i<iend; ++i ) {
882 y[j ] += x[i] * A(i,j );
883 y[j+1UL] += x[i] * A(i,j+1UL);
884 y[j+2UL] += x[i] * A(i,j+2UL);
885 y[j+3UL] += x[i] * A(i,j+3UL);
// --- 2 columns per iteration ---
889 for( ; (j+2UL) <= N; j+=2UL )
899 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
900 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
904 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
905 const size_t i1( i+SIMDSIZE );
906 const size_t i2( i+SIMDSIZE*2UL );
907 const size_t i3( i+SIMDSIZE*3UL );
912 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
913 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
916 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
917 const size_t i1( i+SIMDSIZE );
920 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
921 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
924 for( ; i<ipos; i+=SIMDSIZE ) {
926 y[j ] +=
sum( x1 * A.load(i,j ) );
927 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
930 for( ; remainder && i<iend; ++i ) {
931 y[j ] += x[i] * A(i,j );
932 y[j+1UL] += x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
946 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
947 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
951 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
952 const size_t i1( i+SIMDSIZE );
953 const size_t i2( i+SIMDSIZE*2UL );
954 const size_t i3( i+SIMDSIZE*3UL );
959 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
962 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
963 const size_t i1( i+SIMDSIZE );
966 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
969 for( ; i<ipos; i+=SIMDSIZE ) {
971 y[j] +=
sum( x1 * A.load(i,j) );
974 for( ; remainder && i<iend; ++i ) {
975 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS assignment kernel: forwards to the "large" kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
996 template<
typename VT1
1000 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1002 selectLargeAssignKernel( y, x, A );
// BLAS-based assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// The visible call passes the scalars ET(1) and ET(0) to gemv.
// NOTE(review): presumably alpha = 1 and beta = 0, i.e. y is overwritten - confirm against the
// gemv wrapper's parameter order.
1008 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1022 template<
typename VT1
1026 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1035 gemv( y, x, A, ET(1), ET(0) );
// Assignment fragment that goes through a temporary: the visible statement assigns `tmp` to the
// left-hand side.
// NOTE(review): the function signature and the construction of `tmp` are not part of this
// excerpt; presumably tmp holds the evaluated expression.
1055 template<
typename VT1 >
1067 assign( ~lhs, tmp );
// Addition assignment fragment: an operand with zero rows or zero columns is special-cased
// (branch body not shown); otherwise the work is handed to selectAddAssignKernel.
// NOTE(review): `x` and `A` are declared on lines not shown in this excerpt.
1085 template<
typename VT1 >
1092 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1104 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Kernel dispatch for the addition assignment y += x * A.
// Two targets are visible: the "small" kernel and the BLAS kernel; the visible tail of the
// condition compares rows * columns against TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the start of the condition and the `else` line are not part of this excerpt.
1120 template<
typename VT1
1123 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1127 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1128 selectSmallAddAssignKernel( y, x, A );
1130 selectBlasAddAssignKernel( y, x, A );
// Default addition assignment kernel: delegates to the target vector's addAssign() with the
// product expression x * A.
1149 template<
typename VT1
1152 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1154 y.addAssign( x * A );
// Fallback overload of the "small" addition assignment kernel: forwards to the default kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
1173 template<
typename VT1
1177 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1179 selectDefaultAddAssignKernel( y, x, A );
// Vectorized "small" addition assignment kernel for y += x * A (fragment).
// Columns of A are processed in groups of 8, 4, 3, 2 and 1. Per group the rows are traversed
// SIMDSIZE elements at a time into one SIMD accumulator per column; the horizontal sums are
// added onto y, and a scalar loop handles rows left over when `remainder` is set.
// NOTE(review): `j`, `i`, `iend`, `remainder` and `x1` are declared on lines not shown here.
1199 template<
typename VT1
1203 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1207 const size_t M( A.rows() );
1208 const size_t N( A.columns() );
// --- 8 columns per iteration ---
1212 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD loop (iend masked with -SIMDSIZE if a remainder loop is used).
1222 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1223 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1225 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1228 for( ; i<ipos; i+=SIMDSIZE ) {
1230 xmm1 += x1 * A.load(i,j );
1231 xmm2 += x1 * A.load(i,j+1UL);
1232 xmm3 += x1 * A.load(i,j+2UL);
1233 xmm4 += x1 * A.load(i,j+3UL);
1234 xmm5 += x1 * A.load(i,j+4UL);
1235 xmm6 += x1 * A.load(i,j+5UL);
1236 xmm7 += x1 * A.load(i,j+6UL);
1237 xmm8 += x1 * A.load(i,j+7UL);
// The reduced accumulators are added to the existing values of y.
1240 y[j ] +=
sum( xmm1 );
1241 y[j+1UL] +=
sum( xmm2 );
1242 y[j+2UL] +=
sum( xmm3 );
1243 y[j+3UL] +=
sum( xmm4 );
1244 y[j+4UL] +=
sum( xmm5 );
1245 y[j+5UL] +=
sum( xmm6 );
1246 y[j+6UL] +=
sum( xmm7 );
1247 y[j+7UL] +=
sum( xmm8 );
// Scalar tail for rows not covered by the SIMD loop.
1249 for( ; remainder && i<iend; ++i ) {
1250 y[j ] += x[i] * A(i,j );
1251 y[j+1UL] += x[i] * A(i,j+1UL);
1252 y[j+2UL] += x[i] * A(i,j+2UL);
1253 y[j+3UL] += x[i] * A(i,j+3UL);
1254 y[j+4UL] += x[i] * A(i,j+4UL);
1255 y[j+5UL] += x[i] * A(i,j+5UL);
1256 y[j+6UL] += x[i] * A(i,j+6UL);
1257 y[j+7UL] += x[i] * A(i,j+7UL);
// --- 4 columns per iteration ---
1261 for( ; (j+4UL) <= N; j+=4UL )
1271 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1272 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1277 for( ; i<ipos; i+=SIMDSIZE ) {
1279 xmm1 += x1 * A.load(i,j );
1280 xmm2 += x1 * A.load(i,j+1UL);
1281 xmm3 += x1 * A.load(i,j+2UL);
1282 xmm4 += x1 * A.load(i,j+3UL);
1285 y[j ] +=
sum( xmm1 );
1286 y[j+1UL] +=
sum( xmm2 );
1287 y[j+2UL] +=
sum( xmm3 );
1288 y[j+3UL] +=
sum( xmm4 );
1290 for( ; remainder && i<iend; ++i ) {
1291 y[j ] += x[i] * A(i,j );
1292 y[j+1UL] += x[i] * A(i,j+1UL);
1293 y[j+2UL] += x[i] * A(i,j+2UL);
1294 y[j+3UL] += x[i] * A(i,j+3UL);
// --- 3 columns per iteration ---
1298 for( ; (j+3UL) <= N; j+=3UL )
1308 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1309 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1314 for( ; i<ipos; i+=SIMDSIZE ) {
1316 xmm1 += x1 * A.load(i,j );
1317 xmm2 += x1 * A.load(i,j+1UL);
1318 xmm3 += x1 * A.load(i,j+2UL);
1321 y[j ] +=
sum( xmm1 );
1322 y[j+1UL] +=
sum( xmm2 );
1323 y[j+2UL] +=
sum( xmm3 );
1325 for( ; remainder && i<iend; ++i ) {
1326 y[j ] += x[i] * A(i,j );
1327 y[j+1UL] += x[i] * A(i,j+1UL);
1328 y[j+2UL] += x[i] * A(i,j+2UL);
// --- 2 columns per iteration ---
1332 for( ; (j+2UL) <= N; j+=2UL )
1342 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1343 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1348 for( ; i<ipos; i+=SIMDSIZE ) {
1350 xmm1 += x1 * A.load(i,j );
1351 xmm2 += x1 * A.load(i,j+1UL);
1354 y[j ] +=
sum( xmm1 );
1355 y[j+1UL] +=
sum( xmm2 );
1357 for( ; remainder && i<iend; ++i ) {
1358 y[j ] += x[i] * A(i,j );
1359 y[j+1UL] += x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
1373 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1374 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1379 for( ; i<ipos; i+=SIMDSIZE ) {
1380 xmm1 += A.load(i,j) * x.load(i);
1383 y[j] +=
sum( xmm1 );
1385 for( ; remainder && i<iend; ++i ) {
1386 y[j] += x[i] * A(i,j);
// Fallback overload of the "large" addition assignment kernel: forwards to the default kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
1407 template<
typename VT1
1411 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1413 selectDefaultAddAssignKernel( y, x, A );
// Vectorized "large" addition assignment kernel for y += x * A (fragment).
// Columns are processed in groups of 8, 4, 2 and 1. For each group the rows are traversed in
// three SIMD stages - four SIMD vectors per step, then two, then one - followed by a scalar
// loop for leftover rows. Every stage adds its partial sums directly onto y.
// NOTE(review): `j`, `i`, `iend`, `remainder` and x1..x4 are declared on lines not shown here.
1433 template<
typename VT1
1437 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1441 const size_t M( A.rows() );
1442 const size_t N( A.columns() );
// --- 8 columns per iteration ---
1446 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD stages (iend masked with -SIMDSIZE if a remainder loop is used).
1456 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1457 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Stage 1: four SIMD-wide row chunks (at i, i1, i2, i3) per step.
1461 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1462 const size_t i1( i+SIMDSIZE );
1463 const size_t i2( i+SIMDSIZE*2UL );
1464 const size_t i3( i+SIMDSIZE*3UL );
1469 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1470 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1471 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1472 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1473 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1474 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1475 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1476 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Stage 2: two SIMD-wide row chunks per step.
1479 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1480 const size_t i1( i+SIMDSIZE );
1483 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1484 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1485 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1486 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1487 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1488 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1489 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1490 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Stage 3: one SIMD-wide row chunk per step.
1493 for( ; i<ipos; i+=SIMDSIZE ) {
1495 y[j ] +=
sum( x1 * A.load(i,j ) );
1496 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1497 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1498 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1499 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1500 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1501 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1502 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail for rows not covered by the SIMD stages.
1505 for( ; remainder && i<iend; ++i ) {
1506 y[j ] += x[i] * A(i,j );
1507 y[j+1UL] += x[i] * A(i,j+1UL);
1508 y[j+2UL] += x[i] * A(i,j+2UL);
1509 y[j+3UL] += x[i] * A(i,j+3UL);
1510 y[j+4UL] += x[i] * A(i,j+4UL);
1511 y[j+5UL] += x[i] * A(i,j+5UL);
1512 y[j+6UL] += x[i] * A(i,j+6UL);
1513 y[j+7UL] += x[i] * A(i,j+7UL);
// --- 4 columns per iteration (same three SIMD stages plus scalar tail) ---
1517 for( ; (j+4UL) <= N; j+=4UL )
1527 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1528 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1532 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1533 const size_t i1( i+SIMDSIZE );
1534 const size_t i2( i+SIMDSIZE*2UL );
1535 const size_t i3( i+SIMDSIZE*3UL );
1540 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1541 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1542 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1543 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1546 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1547 const size_t i1( i+SIMDSIZE );
1550 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1551 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1552 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1553 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1556 for( ; i<ipos; i+=SIMDSIZE ) {
1558 y[j ] +=
sum( x1 * A.load(i,j ) );
1559 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1560 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1561 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1564 for( ; remainder && i<iend; ++i ) {
1565 y[j ] += x[i] * A(i,j );
1566 y[j+1UL] += x[i] * A(i,j+1UL);
1567 y[j+2UL] += x[i] * A(i,j+2UL);
1568 y[j+3UL] += x[i] * A(i,j+3UL);
// --- 2 columns per iteration ---
1572 for( ; (j+2UL) <= N; j+=2UL )
1582 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1583 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1587 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1588 const size_t i1( i+SIMDSIZE );
1589 const size_t i2( i+SIMDSIZE*2UL );
1590 const size_t i3( i+SIMDSIZE*3UL );
1595 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1596 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1599 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1600 const size_t i1( i+SIMDSIZE );
1603 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1604 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1607 for( ; i<ipos; i+=SIMDSIZE ) {
1609 y[j ] +=
sum( x1 * A.load(i,j ) );
1610 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1613 for( ; remainder && i<iend; ++i ) {
1614 y[j ] += x[i] * A(i,j );
1615 y[j+1UL] += x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
1629 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1630 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1634 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1635 const size_t i1( i+SIMDSIZE );
1636 const size_t i2( i+SIMDSIZE*2UL );
1637 const size_t i3( i+SIMDSIZE*3UL );
1642 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1645 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1646 const size_t i1( i+SIMDSIZE );
1649 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1652 for( ; i<ipos; i+=SIMDSIZE ) {
1654 y[j] +=
sum( x1 * A.load(i,j) );
1657 for( ; remainder && i<iend; ++i ) {
1658 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS addition assignment kernel: forwards to the "large" kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
1679 template<
typename VT1
1683 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1685 selectLargeAddAssignKernel( y, x, A );
// BLAS-based addition assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
// Two paths are visible: one adds a temporary `tmp` to y, the other calls gemv with the scalars
// ET(1), ET(1).
// NOTE(review): the condition choosing between the two paths and the construction of `tmp` are
// not shown; presumably alpha = 1, beta = 1 so gemv accumulates onto y - confirm.
1691 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1705 template<
typename VT1
1709 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1716 addAssign( y, tmp );
1719 gemv( y, x, A, ET(1), ET(1) );
// Subtraction assignment fragment: an operand with zero rows or zero columns is special-cased
// (branch body not shown); otherwise the work is handed to selectSubAssignKernel.
// NOTE(review): `x` and `A` are declared on lines not shown in this excerpt.
1743 template<
typename VT1 >
1750 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1762 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel dispatch for the subtraction assignment y -= x * A.
// Two targets are visible: the "small" kernel and the BLAS kernel; the visible tail of the
// condition compares rows * columns against TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the start of the condition and the `else` line are not part of this excerpt.
1778 template<
typename VT1
1781 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1785 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1786 selectSmallSubAssignKernel( y, x, A );
1788 selectBlasSubAssignKernel( y, x, A );
// Default subtraction assignment kernel: delegates to the target vector's subAssign() with the
// product expression x * A.
1807 template<
typename VT1
1810 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1812 y.subAssign( x * A );
// Fallback overload of the "small" subtraction assignment kernel: forwards to the default kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
1831 template<
typename VT1
1835 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1837 selectDefaultSubAssignKernel( y, x, A );
// Vectorized "small" subtraction assignment kernel for y -= x * A (fragment).
// Columns of A are processed in groups of 8, 4, 3, 2 and 1. Per group the rows are traversed
// SIMDSIZE elements at a time into one SIMD accumulator per column; the horizontal sums are
// subtracted from y, and a scalar loop handles rows left over when `remainder` is set.
// NOTE(review): `j`, `i`, `iend`, `remainder` and `x1` are declared on lines not shown here.
1857 template<
typename VT1
1861 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1865 const size_t M( A.rows() );
1866 const size_t N( A.columns() );
// --- 8 columns per iteration ---
1870 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD loop (iend masked with -SIMDSIZE if a remainder loop is used).
1880 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1881 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1883 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1886 for( ; i<ipos; i+=SIMDSIZE ) {
1888 xmm1 += x1 * A.load(i,j );
1889 xmm2 += x1 * A.load(i,j+1UL);
1890 xmm3 += x1 * A.load(i,j+2UL);
1891 xmm4 += x1 * A.load(i,j+3UL);
1892 xmm5 += x1 * A.load(i,j+4UL);
1893 xmm6 += x1 * A.load(i,j+5UL);
1894 xmm7 += x1 * A.load(i,j+6UL);
1895 xmm8 += x1 * A.load(i,j+7UL);
// The reduced accumulators are subtracted from the existing values of y.
1898 y[j ] -=
sum( xmm1 );
1899 y[j+1UL] -=
sum( xmm2 );
1900 y[j+2UL] -=
sum( xmm3 );
1901 y[j+3UL] -=
sum( xmm4 );
1902 y[j+4UL] -=
sum( xmm5 );
1903 y[j+5UL] -=
sum( xmm6 );
1904 y[j+6UL] -=
sum( xmm7 );
1905 y[j+7UL] -=
sum( xmm8 );
// Scalar tail for rows not covered by the SIMD loop.
1907 for( ; remainder && i<iend; ++i ) {
1908 y[j ] -= x[i] * A(i,j );
1909 y[j+1UL] -= x[i] * A(i,j+1UL);
1910 y[j+2UL] -= x[i] * A(i,j+2UL);
1911 y[j+3UL] -= x[i] * A(i,j+3UL);
1912 y[j+4UL] -= x[i] * A(i,j+4UL);
1913 y[j+5UL] -= x[i] * A(i,j+5UL);
1914 y[j+6UL] -= x[i] * A(i,j+6UL);
1915 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- 4 columns per iteration ---
1919 for( ; (j+4UL) <= N; j+=4UL )
1929 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1930 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1935 for( ; i<ipos; i+=SIMDSIZE ) {
1937 xmm1 += x1 * A.load(i,j );
1938 xmm2 += x1 * A.load(i,j+1UL);
1939 xmm3 += x1 * A.load(i,j+2UL);
1940 xmm4 += x1 * A.load(i,j+3UL);
1943 y[j ] -=
sum( xmm1 );
1944 y[j+1UL] -=
sum( xmm2 );
1945 y[j+2UL] -=
sum( xmm3 );
1946 y[j+3UL] -=
sum( xmm4 );
1948 for( ; remainder && i<iend; ++i ) {
1949 y[j ] -= x[i] * A(i,j );
1950 y[j+1UL] -= x[i] * A(i,j+1UL);
1951 y[j+2UL] -= x[i] * A(i,j+2UL);
1952 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- 3 columns per iteration ---
1956 for( ; (j+3UL) <= N; j+=3UL )
1966 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1967 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1972 for( ; i<ipos; i+=SIMDSIZE ) {
1974 xmm1 += x1 * A.load(i,j );
1975 xmm2 += x1 * A.load(i,j+1UL);
1976 xmm3 += x1 * A.load(i,j+2UL);
1979 y[j ] -=
sum( xmm1 );
1980 y[j+1UL] -=
sum( xmm2 );
1981 y[j+2UL] -=
sum( xmm3 );
1983 for( ; remainder && i<iend; ++i ) {
1984 y[j ] -= x[i] * A(i,j );
1985 y[j+1UL] -= x[i] * A(i,j+1UL);
1986 y[j+2UL] -= x[i] * A(i,j+2UL);
// --- 2 columns per iteration ---
1990 for( ; (j+2UL) <= N; j+=2UL )
2000 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2001 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2006 for( ; i<ipos; i+=SIMDSIZE ) {
2008 xmm1 += x1 * A.load(i,j );
2009 xmm2 += x1 * A.load(i,j+1UL);
2012 y[j ] -=
sum( xmm1 );
2013 y[j+1UL] -=
sum( xmm2 );
2015 for( ; remainder && i<iend; ++i ) {
2016 y[j ] -= x[i] * A(i,j );
2017 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
2031 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2032 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2037 for( ; i<ipos; i+=SIMDSIZE ) {
2038 xmm1 += A.load(i,j) * x.load(i);
2041 y[j] -=
sum( xmm1 );
2043 for( ; remainder && i<iend; ++i ) {
2044 y[j] -= x[i] * A(i,j);
// Fallback overload of the "large" subtraction assignment kernel: forwards to the default kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
2065 template<
typename VT1
2069 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2071 selectDefaultSubAssignKernel( y, x, A );
// Vectorized "large" subtraction assignment kernel for y -= x * A (fragment).
// Columns are processed in groups of 8, 4, 2 and 1. For each group the rows are traversed in
// three SIMD stages - four SIMD vectors per step, then two, then one - followed by a scalar
// loop for leftover rows. Every stage subtracts its partial sums directly from y.
// NOTE(review): `j`, `i`, `iend`, `remainder` and x1..x4 are declared on lines not shown here.
2091 template<
typename VT1
2095 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2099 const size_t M( A.rows() );
2100 const size_t N( A.columns() );
// --- 8 columns per iteration ---
2104 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD stages (iend masked with -SIMDSIZE if a remainder loop is used).
2114 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2115 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Stage 1: four SIMD-wide row chunks (at i, i1, i2, i3) per step.
2119 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2120 const size_t i1( i+SIMDSIZE );
2121 const size_t i2( i+SIMDSIZE*2UL );
2122 const size_t i3( i+SIMDSIZE*3UL );
2127 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2128 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2129 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2130 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2131 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2132 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2133 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2134 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Stage 2: two SIMD-wide row chunks per step.
2137 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2138 const size_t i1( i+SIMDSIZE );
2141 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2142 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2143 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2144 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2145 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2146 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2147 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2148 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Stage 3: one SIMD-wide row chunk per step.
2151 for( ; i<ipos; i+=SIMDSIZE ) {
2153 y[j ] -=
sum( x1 * A.load(i,j ) );
2154 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2155 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2156 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2157 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2158 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2159 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2160 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail for rows not covered by the SIMD stages.
2163 for( ; remainder && i<iend; ++i ) {
2164 y[j ] -= x[i] * A(i,j );
2165 y[j+1UL] -= x[i] * A(i,j+1UL);
2166 y[j+2UL] -= x[i] * A(i,j+2UL);
2167 y[j+3UL] -= x[i] * A(i,j+3UL);
2168 y[j+4UL] -= x[i] * A(i,j+4UL);
2169 y[j+5UL] -= x[i] * A(i,j+5UL);
2170 y[j+6UL] -= x[i] * A(i,j+6UL);
2171 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- 4 columns per iteration (same three SIMD stages plus scalar tail) ---
2175 for( ; (j+4UL) <= N; j+=4UL )
2185 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2186 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2190 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2191 const size_t i1( i+SIMDSIZE );
2192 const size_t i2( i+SIMDSIZE*2UL );
2193 const size_t i3( i+SIMDSIZE*3UL );
2198 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2199 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2200 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2201 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2204 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2205 const size_t i1( i+SIMDSIZE );
2208 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2209 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2210 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2211 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2214 for( ; i<ipos; i+=SIMDSIZE ) {
2216 y[j ] -=
sum( x1 * A.load(i,j ) );
2217 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2218 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2219 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2222 for( ; remainder && i<iend; ++i ) {
2223 y[j ] -= x[i] * A(i,j );
2224 y[j+1UL] -= x[i] * A(i,j+1UL);
2225 y[j+2UL] -= x[i] * A(i,j+2UL);
2226 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- 2 columns per iteration ---
2230 for( ; (j+2UL) <= N; j+=2UL )
2240 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2241 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2245 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2246 const size_t i1( i+SIMDSIZE );
2247 const size_t i2( i+SIMDSIZE*2UL );
2248 const size_t i3( i+SIMDSIZE*3UL );
2253 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2254 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2257 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2258 const size_t i1( i+SIMDSIZE );
2261 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2262 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2265 for( ; i<ipos; i+=SIMDSIZE ) {
2267 y[j ] -=
sum( x1 * A.load(i,j ) );
2268 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2271 for( ; remainder && i<iend; ++i ) {
2272 y[j ] -= x[i] * A(i,j );
2273 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- single column (the loop header is not part of this excerpt) ---
2287 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2288 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2292 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2293 const size_t i1( i+SIMDSIZE );
2294 const size_t i2( i+SIMDSIZE*2UL );
2295 const size_t i3( i+SIMDSIZE*3UL );
2300 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2303 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2304 const size_t i1( i+SIMDSIZE );
2307 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2310 for( ; i<ipos; i+=SIMDSIZE ) {
2312 y[j] -=
sum( x1 * A.load(i,j) );
2315 for( ; remainder && i<iend; ++i ) {
2316 y[j] -= x[i] * A(i,j);
// Fallback overload of the BLAS subtraction assignment kernel: forwards to the "large" kernel.
// NOTE(review): the enabling condition is on a line not shown in this excerpt.
2337 template<
typename VT1
2341 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2343 selectLargeSubAssignKernel( y, x, A );
// BLAS-based overload of selectBlasSubAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
2349 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2363 template<
typename VT1
2367 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Path that subtracts a temporary from y. NOTE(review): the construction of
// tmp and the condition guarding this path are not visible here.
2374 subAssign( y, tmp );
// Path that calls gemv with the factors ET(-1) and ET(1) -- presumably
// y = -1*(x*A) + 1*y; confirm against gemv's parameter order.
2377 gemv( y, x, A, ET(-1), ET(1) );
// Multiplication-assignment path: applies multAssign to the unwrapped target
// (~lhs) with a temporary tmp.
// NOTE(review): the function header and the construction of tmp are not
// visible in this excerpt -- presumably tmp holds the evaluated product.
2401 template<
typename VT1 >
2413 multAssign( ~lhs, tmp );
// Division-assignment path: applies divAssign to the unwrapped target (~lhs)
// with a temporary tmp.
// NOTE(review): the function header and the construction of tmp are not
// visible in this excerpt -- presumably tmp holds the evaluated product.
2435 template<
typename VT1 >
2447 divAssign( ~lhs, tmp );
2471 template<
typename VT1 >
2479 if( rhs.mat_.rows() == 0UL ) {
2483 else if( rhs.mat_.columns() == 0UL ) {
2515 template<
typename VT1 >
2548 template<
typename VT1 >
2556 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2592 template<
typename VT1 >
2600 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2636 template<
typename VT1 >
2673 template<
typename VT1 >
2724 template<
typename VT
2728 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
// Compile-time switch for the SMP assignment paths: value is true only if
// T1::smpAssignable holds and at least one of the flags evaluateVector /
// evaluateMatrix is set.
2759 template<
typename T1 >
2760 struct UseSMPAssign {
2761 enum :
bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
// Compile-time predicate over four types. The only part of its condition
// visible in this excerpt requires T1, T2 and T3 to be SIMD-enabled; the
// remaining terms (including any use of T4) are not shown here.
2769 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2770 struct UseBlasKernel {
2776 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Compile-time predicate over four types. The visible part of the condition
// requires useOptimizedKernels and SIMD support in T1, T2 and T3; further
// terms follow the trailing '&&' but are not visible in this excerpt.
2791 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2792 struct UseVectorizedDefaultKernel {
2793 enum :
bool { value = useOptimizedKernels &&
2795 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2831 VT::simdEnabled && MT::simdEnabled &&
// SMP-assignability flag of the expression: set only when neither
// evaluateVector nor evaluateMatrix is set and both operand types VT and MT
// are themselves SMP-assignable.
2837 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2838 !evaluateMatrix && MT::smpAssignable };
// Element access: the index-th element of the wrapped vector expression
// multiplied by the stored scalar.
2866 return vector_[index] * scalar_;
// Checked element access: an index at or beyond vector_.size() enters the
// error branch (its body is not visible in this excerpt); otherwise the call
// is forwarded to the subscript operator.
2878 if( index >= vector_.size() ) {
2881 return (*
this)[index];
// Size of the scaled expression: identical to the size of the wrapped vector
// expression, since the scalar factor does not change the element count.
2890 inline size_t size()
const {
2891 return vector_.size();
// Alias query: delegates to the wrapped vector expression's canAlias() for
// the given address.
2921 template<
typename T >
2922 inline bool canAlias(
const T* alias )
const {
2923 return vector_.canAlias( alias );
// Alias query: delegates to the wrapped vector expression's isAliased() for
// the given address.
2933 template<
typename T >
2934 inline bool isAliased(
const T* alias )
const {
2935 return vector_.isAliased( alias );
// Alignment query: delegates to the wrapped vector expression.
2945 return vector_.isAligned();
// Tail of a boolean condition (its beginning is not visible in this excerpt):
// a matrix element count below TDVECTDMATMULT_THRESHOLD is one accepted
// alternative, and in addition size() must exceed
// SMP_TDVECTDMATMULT_THRESHOLD.
2960 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2961 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Assignment of the scaled vector/matrix product to a target vector.
// A matrix with zero rows and a matrix with zero columns are handled by
// dedicated branches (bodies not visible in this excerpt); otherwise the work
// is dispatched to selectAssignKernel with the unwrapped target, the operands
// x and A, and the scalar factor of the expression.
2983 template<
typename VT1
2994 if( right.rows() == 0UL ) {
2998 else if( right.columns() == 0UL ) {
3010 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled assignment: chooses between
// selectSmallAssignKernel and selectBlasAssignKernel.
// NOTE(review): only the last term of the selection condition is visible in
// this excerpt.
3025 template<
typename VT1
3029 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Visible size test: matrices with fewer than TDVECTDMATMULT_THRESHOLD
// elements qualify for the small kernel.
3033 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3034 selectSmallAssignKernel( y, x, A, scalar );
3036 selectBlasAssignKernel( y, x, A, scalar );
// Default kernel for the scaled assignment: re-forms the expression
// x * A * scalar and hands it to the target's own assign() member.
3054 template<
typename VT1
3058 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3060 y.assign( x * A * scalar );
// Fallback overload of the small assignment kernel: forwards all arguments
// unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
3078 template<
typename VT1
3083 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3085 selectDefaultAssignKernel( y, x, A, scalar );
3104 template<
typename VT1
3109 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3113 const size_t M( A.rows() );
3114 const size_t N( A.columns() );
3118 for( ; (j+8UL) <= N; j+=8UL )
3128 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3129 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3131 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3134 for( ; i<ipos; i+=SIMDSIZE ) {
3136 xmm1 += x1 * A.load(i,j );
3137 xmm2 += x1 * A.load(i,j+1UL);
3138 xmm3 += x1 * A.load(i,j+2UL);
3139 xmm4 += x1 * A.load(i,j+3UL);
3140 xmm5 += x1 * A.load(i,j+4UL);
3141 xmm6 += x1 * A.load(i,j+5UL);
3142 xmm7 += x1 * A.load(i,j+6UL);
3143 xmm8 += x1 * A.load(i,j+7UL);
3146 y[j ] =
sum( xmm1 ) * scalar;
3147 y[j+1UL] =
sum( xmm2 ) * scalar;
3148 y[j+2UL] =
sum( xmm3 ) * scalar;
3149 y[j+3UL] =
sum( xmm4 ) * scalar;
3150 y[j+4UL] =
sum( xmm5 ) * scalar;
3151 y[j+5UL] =
sum( xmm6 ) * scalar;
3152 y[j+6UL] =
sum( xmm7 ) * scalar;
3153 y[j+7UL] =
sum( xmm8 ) * scalar;
3155 for( ; remainder && i<iend; ++i ) {
3156 y[j ] += x[i] * A(i,j ) * scalar;
3157 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3158 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3159 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3160 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3161 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3162 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3163 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3167 for( ; (j+4UL) <= N; j+=4UL )
3177 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3178 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3183 for( ; i<ipos; i+=SIMDSIZE ) {
3185 xmm1 += x1 * A.load(i,j );
3186 xmm2 += x1 * A.load(i,j+1UL);
3187 xmm3 += x1 * A.load(i,j+2UL);
3188 xmm4 += x1 * A.load(i,j+3UL);
3191 y[j ] =
sum( xmm1 ) * scalar;
3192 y[j+1UL] =
sum( xmm2 ) * scalar;
3193 y[j+2UL] =
sum( xmm3 ) * scalar;
3194 y[j+3UL] =
sum( xmm4 ) * scalar;
3196 for( ; remainder && i<iend; ++i ) {
3197 y[j ] += x[i] * A(i,j ) * scalar;
3198 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3199 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3200 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3204 for( ; (j+3UL) <= N; j+=3UL )
3214 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3215 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3220 for( ; i<ipos; i+=SIMDSIZE ) {
3222 xmm1 += x1 * A.load(i,j );
3223 xmm2 += x1 * A.load(i,j+1UL);
3224 xmm3 += x1 * A.load(i,j+2UL);
3227 y[j ] =
sum( xmm1 ) * scalar;
3228 y[j+1UL] =
sum( xmm2 ) * scalar;
3229 y[j+2UL] =
sum( xmm3 ) * scalar;
3231 for( ; remainder && i<iend; ++i ) {
3232 y[j ] += x[i] * A(i,j ) * scalar;
3233 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3234 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3238 for( ; (j+2UL) <= N; j+=2UL )
3248 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3249 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3254 for( ; i<ipos; i+=SIMDSIZE ) {
3256 xmm1 += x1 * A.load(i,j );
3257 xmm2 += x1 * A.load(i,j+1UL);
3260 y[j ] =
sum( xmm1 ) * scalar;
3261 y[j+1UL] =
sum( xmm2 ) * scalar;
3263 for( ; remainder && i<iend; ++i ) {
3264 y[j ] += x[i] * A(i,j ) * scalar;
3265 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3279 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3280 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3285 for( ; i<ipos; i+=SIMDSIZE ) {
3286 xmm1 += A.load(i,j) * x.load(i);
3289 y[j] =
sum( xmm1 ) * scalar;
3291 for( ; remainder && i<iend; ++i ) {
3292 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the large assignment kernel: forwards all arguments
// unchanged to selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
3312 template<
typename VT1
3317 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3319 selectDefaultAssignKernel( y, x, A, scalar );
3338 template<
typename VT1
3343 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3347 const size_t M( A.rows() );
3348 const size_t N( A.columns() );
3354 for( ; (j+8UL) <= N; j+=8UL )
3364 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3365 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3369 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3370 const size_t i1( i+SIMDSIZE );
3371 const size_t i2( i+SIMDSIZE*2UL );
3372 const size_t i3( i+SIMDSIZE*3UL );
3377 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3378 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3379 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3380 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3381 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3382 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3383 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3384 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3387 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3388 const size_t i1( i+SIMDSIZE );
3391 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3392 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3393 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3394 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3395 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3396 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3397 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3398 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3401 for( ; i<ipos; i+=SIMDSIZE ) {
3403 y[j ] +=
sum( x1 * A.load(i,j ) );
3404 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3405 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3406 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3407 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3408 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3409 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3410 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
3413 for( ; remainder && i<iend; ++i ) {
3414 y[j ] += x[i] * A(i,j );
3415 y[j+1UL] += x[i] * A(i,j+1UL);
3416 y[j+2UL] += x[i] * A(i,j+2UL);
3417 y[j+3UL] += x[i] * A(i,j+3UL);
3418 y[j+4UL] += x[i] * A(i,j+4UL);
3419 y[j+5UL] += x[i] * A(i,j+5UL);
3420 y[j+6UL] += x[i] * A(i,j+6UL);
3421 y[j+7UL] += x[i] * A(i,j+7UL);
3434 for( ; (j+4UL) <= N; j+=4UL )
3444 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3445 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3449 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3450 const size_t i1( i+SIMDSIZE );
3451 const size_t i2( i+SIMDSIZE*2UL );
3452 const size_t i3( i+SIMDSIZE*3UL );
3457 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3458 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3459 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3460 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3463 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3464 const size_t i1( i+SIMDSIZE );
3467 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3468 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3469 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3470 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3473 for( ; i<ipos; i+=SIMDSIZE ) {
3475 y[j ] +=
sum( x1 * A.load(i,j ) );
3476 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3477 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3478 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3481 for( ; remainder && i<iend; ++i ) {
3482 y[j ] += x[i] * A(i,j );
3483 y[j+1UL] += x[i] * A(i,j+1UL);
3484 y[j+2UL] += x[i] * A(i,j+2UL);
3485 y[j+3UL] += x[i] * A(i,j+3UL);
3494 for( ; (j+2UL) <= N; j+=2UL )
3504 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3505 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3509 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3510 const size_t i1( i+SIMDSIZE );
3511 const size_t i2( i+SIMDSIZE*2UL );
3512 const size_t i3( i+SIMDSIZE*3UL );
3517 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3518 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3521 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3522 const size_t i1( i+SIMDSIZE );
3525 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3526 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3529 for( ; i<ipos; i+=SIMDSIZE ) {
3531 y[j ] +=
sum( x1 * A.load(i,j ) );
3532 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3535 for( ; remainder && i<iend; ++i ) {
3536 y[j ] += x[i] * A(i,j );
3537 y[j+1UL] += x[i] * A(i,j+1UL);
3554 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3555 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3559 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3560 const size_t i1( i+SIMDSIZE );
3561 const size_t i2( i+SIMDSIZE*2UL );
3562 const size_t i3( i+SIMDSIZE*3UL );
3567 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3570 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3571 const size_t i1( i+SIMDSIZE );
3574 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3577 for( ; i<ipos; i+=SIMDSIZE ) {
3579 y[j] +=
sum( x1 * A.load(i,j) );
3582 for( ; remainder && i<iend; ++i ) {
3583 y[j] += x[i] * A(i,j);
// Fallback overload of selectBlasAssignKernel: forwards all arguments
// unchanged to selectLargeAssignKernel.
// NOTE(review): the condition that selects this overload over the BLAS one is
// not visible in this excerpt.
3604 template<
typename VT1
3609 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3611 selectLargeAssignKernel( y, x, A, scalar );
// BLAS-based overload of selectBlasAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
3616 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3630 template<
typename VT1
3635 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Path that first stores the scaled vector scalar * x in y.
// NOTE(review): the guarding condition and any follow-up statement of this
// path are not visible in this excerpt.
3640 assign( y, scalar * x );
// Path that calls gemv with the factors ET(scalar) and ET(0) -- presumably
// y = scalar*(x*A) + 0*y; confirm against gemv's parameter order.
3644 gemv( y, x, A, ET(scalar), ET(0) );
// Assignment path that goes through a temporary: tmp is assigned to the
// unwrapped target (~lhs).
// NOTE(review): the function header and the construction of tmp are not
// visible in this excerpt.
3662 template<
typename VT1
3675 assign( ~lhs, tmp );
// Addition assignment of the scaled vector/matrix product to a target vector.
// A matrix with zero rows or zero columns takes the guarded branch (body not
// visible in this excerpt); otherwise the work is dispatched to
// selectAddAssignKernel with the unwrapped target, the operands x and A, and
// the scalar factor of the expression.
3691 template<
typename VT1
3702 if( right.rows() == 0UL || right.columns() == 0UL ) {
3714 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled addition assignment y += x * A * scalar:
// chooses between selectSmallAddAssignKernel and selectBlasAddAssignKernel.
// NOTE(review): only the last term of the selection condition is visible in
// this excerpt.
3729 template<
typename VT1
3733 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// The size test uses TDVECTDMATMULT_THRESHOLD, the threshold belonging to
// this expression, so that the add-assign dispatch switches kernels at the
// same matrix size as selectAssignKernel and the SMP check.
3737 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3738 selectSmallAddAssignKernel( y, x, A, scalar );
3740 selectBlasAddAssignKernel( y, x, A, scalar );
// Default kernel for the scaled addition assignment: re-forms the expression
// x * A * scalar and hands it to the target's own addAssign() member.
3758 template<
typename VT1
3762 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3764 y.addAssign( x * A * scalar );
// Fallback overload of the small addition-assignment kernel: forwards all
// arguments unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
3782 template<
typename VT1
3787 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3789 selectDefaultAddAssignKernel( y, x, A, scalar );
3808 template<
typename VT1
3813 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3817 const size_t M( A.rows() );
3818 const size_t N( A.columns() );
3822 for( ; (j+8UL) <= N; j+=8UL )
3832 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3833 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3835 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3838 for( ; i<ipos; i+=SIMDSIZE ) {
3840 xmm1 += x1 * A.load(i,j );
3841 xmm2 += x1 * A.load(i,j+1UL);
3842 xmm3 += x1 * A.load(i,j+2UL);
3843 xmm4 += x1 * A.load(i,j+3UL);
3844 xmm5 += x1 * A.load(i,j+4UL);
3845 xmm6 += x1 * A.load(i,j+5UL);
3846 xmm7 += x1 * A.load(i,j+6UL);
3847 xmm8 += x1 * A.load(i,j+7UL);
3850 y[j ] +=
sum( xmm1 ) * scalar;
3851 y[j+1UL] +=
sum( xmm2 ) * scalar;
3852 y[j+2UL] +=
sum( xmm3 ) * scalar;
3853 y[j+3UL] +=
sum( xmm4 ) * scalar;
3854 y[j+4UL] +=
sum( xmm5 ) * scalar;
3855 y[j+5UL] +=
sum( xmm6 ) * scalar;
3856 y[j+6UL] +=
sum( xmm7 ) * scalar;
3857 y[j+7UL] +=
sum( xmm8 ) * scalar;
3859 for( ; remainder && i<iend; ++i ) {
3860 y[j ] += x[i] * A(i,j ) * scalar;
3861 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3862 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3863 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3864 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3865 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3866 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3867 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3871 for( ; (j+4UL) <= N; j+=4UL )
3881 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3882 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3887 for( ; i<ipos; i+=SIMDSIZE ) {
3889 xmm1 += x1 * A.load(i,j );
3890 xmm2 += x1 * A.load(i,j+1UL);
3891 xmm3 += x1 * A.load(i,j+2UL);
3892 xmm4 += x1 * A.load(i,j+3UL);
3895 y[j ] +=
sum( xmm1 ) * scalar;
3896 y[j+1UL] +=
sum( xmm2 ) * scalar;
3897 y[j+2UL] +=
sum( xmm3 ) * scalar;
3898 y[j+3UL] +=
sum( xmm4 ) * scalar;
3900 for( ; remainder && i<iend; ++i ) {
3901 y[j ] += x[i] * A(i,j ) * scalar;
3902 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3903 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3904 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3908 for( ; (j+3UL) <= N; j+=3UL )
3918 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3919 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3924 for( ; i<ipos; i+=SIMDSIZE ) {
3926 xmm1 += x1 * A.load(i,j );
3927 xmm2 += x1 * A.load(i,j+1UL);
3928 xmm3 += x1 * A.load(i,j+2UL);
3931 y[j ] +=
sum( xmm1 ) * scalar;
3932 y[j+1UL] +=
sum( xmm2 ) * scalar;
3933 y[j+2UL] +=
sum( xmm3 ) * scalar;
3935 for( ; remainder && i<iend; ++i ) {
3936 y[j ] += x[i] * A(i,j ) * scalar;
3937 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3938 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3942 for( ; (j+2UL) <= N; j+=2UL )
3952 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3953 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3958 for( ; i<ipos; i+=SIMDSIZE ) {
3960 xmm1 += x1 * A.load(i,j );
3961 xmm2 += x1 * A.load(i,j+1UL);
3964 y[j ] +=
sum( xmm1 ) * scalar;
3965 y[j+1UL] +=
sum( xmm2 ) * scalar;
3967 for( ; remainder && i<iend; ++i ) {
3968 y[j ] += x[i] * A(i,j ) * scalar;
3969 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3983 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3984 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3989 for( ; i<ipos; i+=SIMDSIZE ) {
3990 xmm1 += A.load(i,j) * x.load(i);
3993 y[j] +=
sum( xmm1 ) * scalar;
3995 for( ; remainder && i<iend; ++i ) {
3996 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the large addition-assignment kernel: forwards all
// arguments unchanged to selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
4016 template<
typename VT1
4021 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4023 selectDefaultAddAssignKernel( y, x, A, scalar );
4042 template<
typename VT1
4047 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4051 const size_t M( A.rows() );
4052 const size_t N( A.columns() );
4056 for( ; (j+8UL) <= N; j+=8UL )
4066 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4067 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4071 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4072 const size_t i1( i+SIMDSIZE );
4073 const size_t i2( i+SIMDSIZE*2UL );
4074 const size_t i3( i+SIMDSIZE*3UL );
4079 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4080 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4081 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4082 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4083 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4084 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4085 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4086 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4089 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4090 const size_t i1( i+SIMDSIZE );
4093 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4094 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4095 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4096 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4097 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4098 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4099 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4100 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4103 for( ; i<ipos; i+=SIMDSIZE ) {
4105 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4106 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4107 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4108 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4109 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4110 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4111 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4112 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4115 for( ; remainder && i<iend; ++i ) {
4116 y[j ] += x[i] * A(i,j ) * scalar;
4117 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4118 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4119 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4120 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4121 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4122 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4123 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4127 for( ; (j+4UL) <= N; j+=4UL )
4137 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4138 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4142 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4143 const size_t i1( i+SIMDSIZE );
4144 const size_t i2( i+SIMDSIZE*2UL );
4145 const size_t i3( i+SIMDSIZE*3UL );
4150 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4151 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4152 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4153 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4156 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4157 const size_t i1( i+SIMDSIZE );
4160 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4161 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4162 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4163 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4166 for( ; i<ipos; i+=SIMDSIZE ) {
4168 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4169 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4170 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4171 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4174 for( ; remainder && i<iend; ++i ) {
4175 y[j ] += x[i] * A(i,j ) * scalar;
4176 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4177 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4178 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4182 for( ; (j+2UL) <= N; j+=2UL )
4192 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4193 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4197 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4198 const size_t i1( i+SIMDSIZE );
4199 const size_t i2( i+SIMDSIZE*2UL );
4200 const size_t i3( i+SIMDSIZE*3UL );
4205 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4206 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4209 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4210 const size_t i1( i+SIMDSIZE );
4213 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4214 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4217 for( ; i<ipos; i+=SIMDSIZE ) {
4219 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4220 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4223 for( ; remainder && i<iend; ++i ) {
4224 y[j ] += x[i] * A(i,j ) * scalar;
4225 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4239 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4240 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4244 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4245 const size_t i1( i+SIMDSIZE );
4246 const size_t i2( i+SIMDSIZE*2UL );
4247 const size_t i3( i+SIMDSIZE*3UL );
4252 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4255 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4256 const size_t i1( i+SIMDSIZE );
4259 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4262 for( ; i<ipos; i+=SIMDSIZE ) {
4264 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
4267 for( ; remainder && i<iend; ++i ) {
4268 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of selectBlasAddAssignKernel: forwards all arguments
// unchanged to selectLargeAddAssignKernel.
// NOTE(review): the condition that selects this overload over the BLAS one is
// not visible in this excerpt.
4289 template<
typename VT1
4294 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4296 selectLargeAddAssignKernel( y, x, A, scalar );
// BLAS-based overload of selectBlasAddAssignKernel; only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled.
4301 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4315 template<
typename VT1
4320 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Path that adds a temporary to y. NOTE(review): the construction of tmp and
// the condition guarding this path are not visible here.
4327 addAssign( y, tmp );
// Path that calls gemv with the factors ET(scalar) and ET(1) -- presumably
// y = scalar*(x*A) + 1*y; confirm against gemv's parameter order.
4330 gemv( y, x, A, ET(scalar), ET(1) );
// Subtraction assignment of the scaled vector/matrix product to a target
// vector. A matrix with zero rows or zero columns takes the guarded branch
// (body not visible in this excerpt); otherwise the work is dispatched to
// selectSubAssignKernel with the unwrapped target, the operands x and A, and
// the scalar factor of the expression.
4352 template<
typename VT1
4363 if( right.rows() == 0UL || right.columns() == 0UL ) {
4375 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled subtraction assignment y -= x * A * scalar:
// chooses between selectSmallSubAssignKernel and selectBlasSubAssignKernel.
// NOTE(review): only the last term of the selection condition is visible in
// this excerpt.
4390 template<
typename VT1
4394 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// The size test uses TDVECTDMATMULT_THRESHOLD, the threshold belonging to
// this expression, so that the sub-assign dispatch switches kernels at the
// same matrix size as selectAssignKernel and the SMP check.
4398 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
4399 selectSmallSubAssignKernel( y, x, A, scalar );
4401 selectBlasSubAssignKernel( y, x, A, scalar );
// Default kernel for the scaled subtraction assignment: re-forms the
// expression x * A * scalar and hands it to the target's own subAssign()
// member.
4419 template<
typename VT1
4423 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4425 y.subAssign( x * A * scalar );
// Fallback overload of the small subtraction-assignment kernel: forwards all
// arguments unchanged to selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
4443 template<
typename VT1
4448 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4450 selectDefaultSubAssignKernel( y, x, A, scalar );
4469 template<
typename VT1
4474 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4478 const size_t M( A.rows() );
4479 const size_t N( A.columns() );
4483 for( ; (j+8UL) <= N; j+=8UL )
4493 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4494 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4496 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4499 for( ; i<ipos; i+=SIMDSIZE ) {
4501 xmm1 += x1 * A.load(i,j );
4502 xmm2 += x1 * A.load(i,j+1UL);
4503 xmm3 += x1 * A.load(i,j+2UL);
4504 xmm4 += x1 * A.load(i,j+3UL);
4505 xmm5 += x1 * A.load(i,j+4UL);
4506 xmm6 += x1 * A.load(i,j+5UL);
4507 xmm7 += x1 * A.load(i,j+6UL);
4508 xmm8 += x1 * A.load(i,j+7UL);
4511 y[j ] -=
sum( xmm1 ) * scalar;
4512 y[j+1UL] -=
sum( xmm2 ) * scalar;
4513 y[j+2UL] -=
sum( xmm3 ) * scalar;
4514 y[j+3UL] -=
sum( xmm4 ) * scalar;
4515 y[j+4UL] -=
sum( xmm5 ) * scalar;
4516 y[j+5UL] -=
sum( xmm6 ) * scalar;
4517 y[j+6UL] -=
sum( xmm7 ) * scalar;
4518 y[j+7UL] -=
sum( xmm8 ) * scalar;
4520 for( ; remainder && i<iend; ++i ) {
4521 y[j ] -= x[i] * A(i,j ) * scalar;
4522 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4523 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4524 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4525 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4526 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4527 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4528 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4532 for( ; (j+4UL) <= N; j+=4UL )
4542 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4543 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4548 for( ; i<ipos; i+=SIMDSIZE ) {
4550 xmm1 += x1 * A.load(i,j );
4551 xmm2 += x1 * A.load(i,j+1UL);
4552 xmm3 += x1 * A.load(i,j+2UL);
4553 xmm4 += x1 * A.load(i,j+3UL);
4556 y[j ] -=
sum( xmm1 ) * scalar;
4557 y[j+1UL] -=
sum( xmm2 ) * scalar;
4558 y[j+2UL] -=
sum( xmm3 ) * scalar;
4559 y[j+3UL] -=
sum( xmm4 ) * scalar;
4561 for( ; remainder && i<iend; ++i ) {
4562 y[j ] -= x[i] * A(i,j ) * scalar;
4563 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4564 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4565 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4569 for( ; (j+3UL) <= N; j+=3UL )
4579 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4580 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4585 for( ; i<ipos; i+=SIMDSIZE ) {
4587 xmm1 += x1 * A.load(i,j );
4588 xmm2 += x1 * A.load(i,j+1UL);
4589 xmm3 += x1 * A.load(i,j+2UL);
4592 y[j ] -=
sum( xmm1 ) * scalar;
4593 y[j+1UL] -=
sum( xmm2 ) * scalar;
4594 y[j+2UL] -=
sum( xmm3 ) * scalar;
4596 for( ; remainder && i<iend; ++i ) {
4597 y[j ] -= x[i] * A(i,j ) * scalar;
4598 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4599 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4603 for( ; (j+2UL) <= N; j+=2UL )
4613 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4614 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4619 for( ; i<ipos; i+=SIMDSIZE ) {
4621 xmm1 += x1 * A.load(i,j );
4622 xmm2 += x1 * A.load(i,j+1UL);
4625 y[j ] -=
sum( xmm1 ) * scalar;
4626 y[j+1UL] -=
sum( xmm2 ) * scalar;
4628 for( ; remainder && i<iend; ++i ) {
4629 y[j ] -= x[i] * A(i,j ) * scalar;
4630 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4644 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4645 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4650 for( ; i<ipos; i+=SIMDSIZE ) {
4651 xmm1 += A.load(i,j) * x.load(i);
4654 y[j] -=
sum( xmm1 ) * scalar;
4656 for( ; remainder && i<iend; ++i ) {
4657 y[j] -= x[i] * A(i,j) * scalar;
// Fallback overload of the large subtraction-assignment kernel: forwards all
// arguments unchanged to selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload over the vectorized
// one is not visible in this excerpt.
4677 template<
typename VT1
4682 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4684 selectDefaultSubAssignKernel( y, x, A, scalar );
4703 template<
typename VT1
4708 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4712 const size_t M( A.rows() );
4713 const size_t N( A.columns() );
4717 for( ; (j+8UL) <= N; j+=8UL )
4727 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4728 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4732 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4733 const size_t i1( i+SIMDSIZE );
4734 const size_t i2( i+SIMDSIZE*2UL );
4735 const size_t i3( i+SIMDSIZE*3UL );
4740 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4741 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4742 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4743 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4744 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4745 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4746 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4747 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4750 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4751 const size_t i1( i+SIMDSIZE );
4754 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4755 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4756 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4757 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4758 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4759 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4760 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4761 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4764 for( ; i<ipos; i+=SIMDSIZE ) {
4766 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4767 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4768 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4769 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4770 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4771 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4772 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4773 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4776 for( ; remainder && i<iend; ++i ) {
4777 y[j ] -= x[i] * A(i,j ) * scalar;
4778 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4779 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4780 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4781 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4782 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4783 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4784 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4788 for( ; (j+4UL) <= N; j+=4UL )
4798 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4799 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4803 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4804 const size_t i1( i+SIMDSIZE );
4805 const size_t i2( i+SIMDSIZE*2UL );
4806 const size_t i3( i+SIMDSIZE*3UL );
4811 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4812 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4813 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4814 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4817 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4818 const size_t i1( i+SIMDSIZE );
4821 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4822 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4823 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4824 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4827 for( ; i<ipos; i+=SIMDSIZE ) {
4829 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4830 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4831 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4832 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4835 for( ; remainder && i<iend; ++i ) {
4836 y[j ] -= x[i] * A(i,j ) * scalar;
4837 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4838 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4839 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4843 for( ; (j+2UL) <= N; j+=2UL )
4853 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4854 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4858 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4859 const size_t i1( i+SIMDSIZE );
4860 const size_t i2( i+SIMDSIZE*2UL );
4861 const size_t i3( i+SIMDSIZE*3UL );
4866 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4867 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4870 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4871 const size_t i1( i+SIMDSIZE );
4874 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4875 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4878 for( ; i<ipos; i+=SIMDSIZE ) {
4880 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4881 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4884 for( ; remainder && i<iend; ++i ) {
4885 y[j ] -= x[i] * A(i,j ) * scalar;
4886 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4900 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4901 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4905 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4906 const size_t i1( i+SIMDSIZE );
4907 const size_t i2( i+SIMDSIZE*2UL );
4908 const size_t i3( i+SIMDSIZE*3UL );
4913 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4916 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4917 const size_t i1( i+SIMDSIZE );
4920 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4923 for( ; i<ipos; i+=SIMDSIZE ) {
4925 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
4928 for( ; remainder && i<iend; ++i ) {
4929 y[j] -= x[i] * A(i,j) * scalar;
4950 template<
typename VT1
4955 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4957 selectLargeSubAssignKernel( y, x, A, scalar );
4962 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4976 template<
typename VT1
4981 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4988 subAssign( y, tmp );
4991 gemv( y, x, A, ET(-scalar), ET(1) );
5013 template<
typename VT1
5026 multAssign( ~lhs, tmp );
5046 template<
typename VT1
5059 divAssign( ~lhs, tmp );
5081 template<
typename VT1
5093 if( right.rows() == 0UL ) {
5097 else if( right.columns() == 0UL ) {
5127 template<
typename VT1
5159 template<
typename VT1
5171 if( right.rows() == 0UL || right.columns() == 0UL ) {
5205 template<
typename VT1
5217 if( right.rows() == 0UL || right.columns() == 0UL ) {
5251 template<
typename VT1
5287 template<
typename VT1
5363 template<
typename VT
5365 inline decltype(
auto)
5372 if( (~vec).
size() != (~mat).
rows() ) {
5392 template<
typename VT,
typename MT >
5393 struct Size< TDVecTDMatMultExpr<VT,MT> >
5410 template<
typename VT,
typename MT >
5411 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5412 :
public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:158
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:206
Subvector< VT, AF > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:322
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:203
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Generic wrapper for a compile time constant integral value. The IntegralConstant class template repres...
Definition: IntegralConstant.h:71
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:220
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:198
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:369
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:560
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:246
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:260
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:521
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:381
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:250
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:325
Column< MT > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:124
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:78
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types. This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:305
Header file for the DisableIf class template.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:208
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:127
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:214
Compile time check for data types with padding. This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:349
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:126
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:88
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
Header file for the IsTriangular type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:129
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:315
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions. The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:593
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates. The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for run time assertion macros.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:292
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:382
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types. This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:819
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template. The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:217
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:324
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3082
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:109
Compile time check for complex types. This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications. The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:106
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of a vector. The Size type trait evaluates the size of the given v...
Definition: Size.h:74
Base class for sparse vectors. The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:205
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Compile time evaluation of the number of columns of a matrix. The Columns type trait evaluates the num...
Definition: Columns.h:75
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:130
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions. The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:131
Constraint on the data type.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:337
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:128
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:207
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:359
Header file for the Size type trait.
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:211
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.