35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_ 119 template<
typename VT
121 class TDVecTDMatMultExpr
122 :
public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
123 ,
private Computation
// Helper trait: `value` is the logical OR of evaluateVector and evaluateMatrix.
// The template parameter T1 does not appear in the visible expression.
152 template<
typename T1 >
153 struct UseSMPAssign {
154 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Helper trait over three types. The only part of its condition visible in
// this excerpt requires simdEnabled on T1, T2 and T3; the remaining terms of
// the condition are not shown here.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseBlasKernel {
171 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Helper trait over three types. The visible part of `value` requires
// useOptimizedKernels and simdEnabled on T1, T2 and T3; the expression
// continues with further terms that are not shown in this excerpt.
187 template<
typename T1,
typename T2,
typename T3 >
188 struct UseVectorizedDefaultKernel {
189 enum :
bool { value = useOptimizedKernels &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
227 VT::simdEnabled && MT::simdEnabled &&
// Compile-time flag: true only if neither operand needs evaluation
// (!evaluateVector, !evaluateMatrix) and both VT and MT report smpAssignable.
232 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
233 !evaluateMatrix && MT::smpAssignable };
266 return vec_[index] *
mat_(index,index);
296 if( index >=
mat_.columns() ) {
299 return (*
this)[index];
// Returns the size of the resulting vector, which is the number of columns
// of the matrix operand mat_. Declared noexcept.
308 inline size_t size() const noexcept {
309 return mat_.columns();
// Returns true if either operand (vec_ or mat_) reports being aliased with
// the given address. Note that the visible body queries isAliased() on both
// operands, i.e. it is identical to the body of isAliased() in this class.
339 template<
typename T >
340 inline bool canAlias(
const T* alias )
const noexcept {
341 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Returns true if either operand (vec_ or mat_) reports being aliased with
// the given address; the check is delegated to the operands' isAliased().
351 template<
typename T >
352 inline bool isAliased(
const T* alias )
const noexcept {
353 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
363 return vec_.isAligned() &&
mat_.isAligned();
377 (
mat_.rows() *
mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
378 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
401 template<
typename VT1 >
408 if( rhs.mat_.rows() == 0UL ) {
412 else if( rhs.mat_.columns() == 0UL ) {
424 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the assignment y = x * A. Two call targets are
// visible: selectSmallAssignKernel and selectBlasAssignKernel. The visible
// part of the condition compares the element count A.rows() * A.columns()
// against TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the head of the condition (and any else keyword) is not
// visible in this excerpt, so further terms may take part in the decision.
440 template<
typename VT1
443 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
447 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
448 selectSmallAssignKernel( y, x, A );
450 selectBlasAssignKernel( y, x, A );
// Default assignment kernel taking the target vector y (mutable) and the
// operands x and A (const). The fallback overloads of the small and large
// assignment kernels forward to this function.
// NOTE(review): the body is not visible in this excerpt.
469 template<
typename VT1
472 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Fallback overload of the "small" assignment kernel: forwards unchanged to
// selectDefaultAssignKernel. The condition that selects this overload over
// the vectorized one is not visible in this excerpt.
493 template<
typename VT1
497 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
499 selectDefaultAssignKernel( y, x, A );
// Vectorized overload of the "small" assignment kernel computing y = x * A,
// i.e. y[j] = sum over i of x[i] * A(i,j). Columns of A are handled in groups
// of 8, 4, 3 and 2, followed by a single-column section. For every group the
// SIMD accumulators are reduced with sum() and assigned to y, then a scalar
// loop adds the contribution of any tail rows.
// NOTE(review): the declarations of i, j, iend, remainder and x1 are not
// visible in this excerpt; x1 is presumably the SIMD load of x at i.
518 template<
typename VT1
522 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
526 const size_t M( A.rows() );
527 const size_t N( A.columns() );
// --- Eight columns per iteration ---
531 for( ; (j+8UL) <= N; j+=8UL )
// ipos is the end of the SIMD-processed row range: when `remainder` is set,
// iend is masked down to a multiple of SIMDSIZE (the assertion below spells
// out the intended result), otherwise the whole range up to iend is used.
541 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
542 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One SIMD accumulator per column of the group.
544 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
547 for( ; i<ipos; i+=SIMDSIZE ) {
549 xmm1 += x1 * A.load(i,j );
550 xmm2 += x1 * A.load(i,j+1UL);
551 xmm3 += x1 * A.load(i,j+2UL);
552 xmm4 += x1 * A.load(i,j+3UL);
553 xmm5 += x1 * A.load(i,j+4UL);
554 xmm6 += x1 * A.load(i,j+5UL);
555 xmm7 += x1 * A.load(i,j+6UL);
556 xmm8 += x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, assigned (not added) to y.
560 y[j+1UL] =
sum( xmm2 );
561 y[j+2UL] =
sum( xmm3 );
562 y[j+3UL] =
sum( xmm4 );
563 y[j+4UL] =
sum( xmm5 );
564 y[j+5UL] =
sum( xmm6 );
565 y[j+6UL] =
sum( xmm7 );
566 y[j+7UL] =
sum( xmm8 );
// Scalar tail: rows in [ipos, iend) are only processed when `remainder` is set.
568 for( ; remainder && i<iend; ++i ) {
569 y[j ] += x[i] * A(i,j );
570 y[j+1UL] += x[i] * A(i,j+1UL);
571 y[j+2UL] += x[i] * A(i,j+2UL);
572 y[j+3UL] += x[i] * A(i,j+3UL);
573 y[j+4UL] += x[i] * A(i,j+4UL);
574 y[j+5UL] += x[i] * A(i,j+5UL);
575 y[j+6UL] += x[i] * A(i,j+6UL);
576 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Four columns per iteration (same scheme as above) ---
580 for( ; (j+4UL) <= N; j+=4UL )
590 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
591 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
596 for( ; i<ipos; i+=SIMDSIZE ) {
598 xmm1 += x1 * A.load(i,j );
599 xmm2 += x1 * A.load(i,j+1UL);
600 xmm3 += x1 * A.load(i,j+2UL);
601 xmm4 += x1 * A.load(i,j+3UL);
605 y[j+1UL] =
sum( xmm2 );
606 y[j+2UL] =
sum( xmm3 );
607 y[j+3UL] =
sum( xmm4 );
609 for( ; remainder && i<iend; ++i ) {
610 y[j ] += x[i] * A(i,j );
611 y[j+1UL] += x[i] * A(i,j+1UL);
612 y[j+2UL] += x[i] * A(i,j+2UL);
613 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Three columns per iteration ---
617 for( ; (j+3UL) <= N; j+=3UL )
627 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
628 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
633 for( ; i<ipos; i+=SIMDSIZE ) {
635 xmm1 += x1 * A.load(i,j );
636 xmm2 += x1 * A.load(i,j+1UL);
637 xmm3 += x1 * A.load(i,j+2UL);
641 y[j+1UL] =
sum( xmm2 );
642 y[j+2UL] =
sum( xmm3 );
644 for( ; remainder && i<iend; ++i ) {
645 y[j ] += x[i] * A(i,j );
646 y[j+1UL] += x[i] * A(i,j+1UL);
647 y[j+2UL] += x[i] * A(i,j+2UL);
// --- Two columns per iteration ---
651 for( ; (j+2UL) <= N; j+=2UL )
661 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
662 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
667 for( ; i<ipos; i+=SIMDSIZE ) {
669 xmm1 += x1 * A.load(i,j );
670 xmm2 += x1 * A.load(i,j+1UL);
674 y[j+1UL] =
sum( xmm2 );
676 for( ; remainder && i<iend; ++i ) {
677 y[j ] += x[i] * A(i,j );
678 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
692 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
693 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
698 for( ; i<ipos; i+=SIMDSIZE ) {
// x is loaded directly here instead of through a cached x1.
699 xmm1 += x.load(i) * A.load(i,j);
704 for( ; remainder && i<iend; ++i ) {
705 y[j] += x[i] * A(i,j);
// Fallback overload of the "large" assignment kernel: forwards unchanged to
// selectDefaultAssignKernel. The condition that selects this overload over
// the vectorized one is not visible in this excerpt.
726 template<
typename VT1
730 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
732 selectDefaultAssignKernel( y, x, A );
// Vectorized overload of the "large" assignment kernel for y = x * A.
// Columns of A are handled in groups of 8, 4 and 2, followed by a
// single-column section. Within each group the rows are traversed by three
// SIMD loops of decreasing unroll factor (4*SIMDSIZE, 2*SIMDSIZE, SIMDSIZE
// rows per step) and a final scalar loop for tail rows. Every step reduces
// its partial products with sum() and accumulates them into y with +=.
// NOTE(review): since all updates use +=, y must be initialised before the
// loops; that initialisation, as well as the declarations of i, j, iend,
// remainder and x1..x4, is not visible in this excerpt.
751 template<
typename VT1
755 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
759 const size_t M( A.rows() );
760 const size_t N( A.columns() );
// --- Eight columns per iteration ---
766 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD-processed row range; with `remainder` set, iend is
// masked down to a multiple of SIMDSIZE (checked by the assertion below).
776 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
777 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per step (row offsets i, i1, i2, i3).
781 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
782 const size_t i1( i+SIMDSIZE );
783 const size_t i2( i+SIMDSIZE*2UL );
784 const size_t i3( i+SIMDSIZE*3UL );
789 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
790 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
791 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
792 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
793 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
794 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
795 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
796 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per step.
799 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
800 const size_t i1( i+SIMDSIZE );
803 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
804 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
805 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
806 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
807 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
808 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
809 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
810 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per step.
813 for( ; i<ipos; i+=SIMDSIZE ) {
815 y[j ] +=
sum( x1 * A.load(i,j ) );
816 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
817 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
818 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
819 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
820 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
821 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
822 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail rows, processed only when `remainder` is set.
825 for( ; remainder && i<iend; ++i ) {
826 y[j ] += x[i] * A(i,j );
827 y[j+1UL] += x[i] * A(i,j+1UL);
828 y[j+2UL] += x[i] * A(i,j+2UL);
829 y[j+3UL] += x[i] * A(i,j+3UL);
830 y[j+4UL] += x[i] * A(i,j+4UL);
831 y[j+5UL] += x[i] * A(i,j+5UL);
832 y[j+6UL] += x[i] * A(i,j+6UL);
833 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Four columns per iteration (same row scheme as above) ---
837 for( ; (j+4UL) <= N; j+=4UL )
847 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
848 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
852 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
853 const size_t i1( i+SIMDSIZE );
854 const size_t i2( i+SIMDSIZE*2UL );
855 const size_t i3( i+SIMDSIZE*3UL );
860 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
861 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
862 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
863 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
866 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
867 const size_t i1( i+SIMDSIZE );
870 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
871 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
872 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
873 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
876 for( ; i<ipos; i+=SIMDSIZE ) {
878 y[j ] +=
sum( x1 * A.load(i,j ) );
879 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
880 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
881 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
884 for( ; remainder && i<iend; ++i ) {
885 y[j ] += x[i] * A(i,j );
886 y[j+1UL] += x[i] * A(i,j+1UL);
887 y[j+2UL] += x[i] * A(i,j+2UL);
888 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Two columns per iteration ---
892 for( ; (j+2UL) <= N; j+=2UL )
902 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
903 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
907 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
908 const size_t i1( i+SIMDSIZE );
909 const size_t i2( i+SIMDSIZE*2UL );
910 const size_t i3( i+SIMDSIZE*3UL );
915 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
916 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
919 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
920 const size_t i1( i+SIMDSIZE );
923 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
924 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
927 for( ; i<ipos; i+=SIMDSIZE ) {
929 y[j ] +=
sum( x1 * A.load(i,j ) );
930 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
933 for( ; remainder && i<iend; ++i ) {
934 y[j ] += x[i] * A(i,j );
935 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
949 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
950 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
954 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
955 const size_t i1( i+SIMDSIZE );
956 const size_t i2( i+SIMDSIZE*2UL );
957 const size_t i3( i+SIMDSIZE*3UL );
962 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
965 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
966 const size_t i1( i+SIMDSIZE );
969 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
972 for( ; i<ipos; i+=SIMDSIZE ) {
974 y[j] +=
sum( x1 * A.load(i,j) );
977 for( ; remainder && i<iend; ++i ) {
978 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS assignment kernel: forwards unchanged to
// selectLargeAssignKernel. The condition that selects this overload is not
// visible in this excerpt.
999 template<
typename VT1
1003 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1005 selectLargeAssignKernel( y, x, A );
// BLAS-backed overload of the assignment kernel; compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are non-zero.
// The visible call passes the scalars ET(1) and ET(0) to gemv.
// NOTE(review): presumably these are alpha = 1 and beta = 0, i.e. y = x * A
// with the previous content of y discarded - confirm against gemv's signature.
1011 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1025 template<
typename VT1
1029 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1038 gemv( y, x, A, ET(1), ET(0) );
1058 template<
typename VT1 >
1070 assign( ~lhs, tmp );
1088 template<
typename VT1 >
1095 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1107 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the addition assignment y += x * A. Two call targets
// are visible: selectSmallAddAssignKernel and selectBlasAddAssignKernel. The
// visible part of the condition compares A.rows() * A.columns() against
// TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the head of the condition (and any else keyword) is not
// visible in this excerpt, so further terms may take part in the decision.
1123 template<
typename VT1
1126 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1130 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1131 selectSmallAddAssignKernel( y, x, A );
1133 selectBlasAddAssignKernel( y, x, A );
// Default addition-assignment kernel: hands the product expression x * A to
// the target's addAssign() member.
1152 template<
typename VT1
1155 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1157 y.addAssign( x * A );
// Fallback overload of the "small" addition-assignment kernel: forwards
// unchanged to selectDefaultAddAssignKernel. The condition that selects this
// overload over the vectorized one is not visible in this excerpt.
1176 template<
typename VT1
1180 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1182 selectDefaultAddAssignKernel( y, x, A );
// Vectorized overload of the "small" addition-assignment kernel computing
// y += x * A. Columns of A are handled in groups of 8, 4, 3 and 2, followed
// by a single-column section. For every group the SIMD accumulators are
// reduced with sum() and added to y, then a scalar loop adds the contribution
// of any tail rows.
// NOTE(review): the declarations of i, j, iend, remainder and x1 are not
// visible in this excerpt; x1 is presumably the SIMD load of x at i.
1202 template<
typename VT1
1206 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1210 const size_t M( A.rows() );
1211 const size_t N( A.columns() );
// --- Eight columns per iteration ---
1215 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD-processed row range; with `remainder` set, iend is
// masked down to a multiple of SIMDSIZE (checked by the assertion below).
1225 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1226 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One SIMD accumulator per column of the group.
1228 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1231 for( ; i<ipos; i+=SIMDSIZE ) {
1233 xmm1 += x1 * A.load(i,j );
1234 xmm2 += x1 * A.load(i,j+1UL);
1235 xmm3 += x1 * A.load(i,j+2UL);
1236 xmm4 += x1 * A.load(i,j+3UL);
1237 xmm5 += x1 * A.load(i,j+4UL);
1238 xmm6 += x1 * A.load(i,j+5UL);
1239 xmm7 += x1 * A.load(i,j+6UL);
1240 xmm8 += x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, added to the existing y values.
1243 y[j ] +=
sum( xmm1 );
1244 y[j+1UL] +=
sum( xmm2 );
1245 y[j+2UL] +=
sum( xmm3 );
1246 y[j+3UL] +=
sum( xmm4 );
1247 y[j+4UL] +=
sum( xmm5 );
1248 y[j+5UL] +=
sum( xmm6 );
1249 y[j+6UL] +=
sum( xmm7 );
1250 y[j+7UL] +=
sum( xmm8 );
// Scalar tail rows, processed only when `remainder` is set.
1252 for( ; remainder && i<iend; ++i ) {
1253 y[j ] += x[i] * A(i,j );
1254 y[j+1UL] += x[i] * A(i,j+1UL);
1255 y[j+2UL] += x[i] * A(i,j+2UL);
1256 y[j+3UL] += x[i] * A(i,j+3UL);
1257 y[j+4UL] += x[i] * A(i,j+4UL);
1258 y[j+5UL] += x[i] * A(i,j+5UL);
1259 y[j+6UL] += x[i] * A(i,j+6UL);
1260 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Four columns per iteration (same scheme as above) ---
1264 for( ; (j+4UL) <= N; j+=4UL )
1274 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1275 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1280 for( ; i<ipos; i+=SIMDSIZE ) {
1282 xmm1 += x1 * A.load(i,j );
1283 xmm2 += x1 * A.load(i,j+1UL);
1284 xmm3 += x1 * A.load(i,j+2UL);
1285 xmm4 += x1 * A.load(i,j+3UL);
1288 y[j ] +=
sum( xmm1 );
1289 y[j+1UL] +=
sum( xmm2 );
1290 y[j+2UL] +=
sum( xmm3 );
1291 y[j+3UL] +=
sum( xmm4 );
1293 for( ; remainder && i<iend; ++i ) {
1294 y[j ] += x[i] * A(i,j );
1295 y[j+1UL] += x[i] * A(i,j+1UL);
1296 y[j+2UL] += x[i] * A(i,j+2UL);
1297 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Three columns per iteration ---
1301 for( ; (j+3UL) <= N; j+=3UL )
1311 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1312 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1317 for( ; i<ipos; i+=SIMDSIZE ) {
1319 xmm1 += x1 * A.load(i,j );
1320 xmm2 += x1 * A.load(i,j+1UL);
1321 xmm3 += x1 * A.load(i,j+2UL);
1324 y[j ] +=
sum( xmm1 );
1325 y[j+1UL] +=
sum( xmm2 );
1326 y[j+2UL] +=
sum( xmm3 );
1328 for( ; remainder && i<iend; ++i ) {
1329 y[j ] += x[i] * A(i,j );
1330 y[j+1UL] += x[i] * A(i,j+1UL);
1331 y[j+2UL] += x[i] * A(i,j+2UL);
// --- Two columns per iteration ---
1335 for( ; (j+2UL) <= N; j+=2UL )
1345 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1346 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1351 for( ; i<ipos; i+=SIMDSIZE ) {
1353 xmm1 += x1 * A.load(i,j );
1354 xmm2 += x1 * A.load(i,j+1UL);
1357 y[j ] +=
sum( xmm1 );
1358 y[j+1UL] +=
sum( xmm2 );
1360 for( ; remainder && i<iend; ++i ) {
1361 y[j ] += x[i] * A(i,j );
1362 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
1376 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1377 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1382 for( ; i<ipos; i+=SIMDSIZE ) {
// x is loaded directly here instead of through a cached x1.
1383 xmm1 += A.load(i,j) * x.load(i);
1386 y[j] +=
sum( xmm1 );
1388 for( ; remainder && i<iend; ++i ) {
1389 y[j] += x[i] * A(i,j);
// Fallback overload of the "large" addition-assignment kernel: forwards
// unchanged to selectDefaultAddAssignKernel. The condition that selects this
// overload over the vectorized one is not visible in this excerpt.
1410 template<
typename VT1
1414 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1416 selectDefaultAddAssignKernel( y, x, A );
// Vectorized overload of the "large" addition-assignment kernel for
// y += x * A. Columns of A are handled in groups of 8, 4 and 2, followed by a
// single-column section. Within each group the rows are traversed by three
// SIMD loops of decreasing unroll factor (4*SIMDSIZE, 2*SIMDSIZE, SIMDSIZE
// rows per step) and a final scalar loop for tail rows. Every step reduces
// its partial products with sum() and adds them to y.
// NOTE(review): the declarations of i, j, iend, remainder and x1..x4 are not
// visible in this excerpt.
1436 template<
typename VT1
1440 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1444 const size_t M( A.rows() );
1445 const size_t N( A.columns() );
// --- Eight columns per iteration ---
1449 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD-processed row range; with `remainder` set, iend is
// masked down to a multiple of SIMDSIZE (checked by the assertion below).
1459 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1460 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per step (row offsets i, i1, i2, i3).
1464 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1465 const size_t i1( i+SIMDSIZE );
1466 const size_t i2( i+SIMDSIZE*2UL );
1467 const size_t i3( i+SIMDSIZE*3UL );
1472 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1473 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1474 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1475 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1476 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1477 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1478 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1479 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per step.
1482 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1483 const size_t i1( i+SIMDSIZE );
1486 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1487 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1488 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1489 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1490 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1491 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1492 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1493 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per step.
1496 for( ; i<ipos; i+=SIMDSIZE ) {
1498 y[j ] +=
sum( x1 * A.load(i,j ) );
1499 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1500 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1501 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1502 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1503 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1504 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1505 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail rows, processed only when `remainder` is set.
1508 for( ; remainder && i<iend; ++i ) {
1509 y[j ] += x[i] * A(i,j );
1510 y[j+1UL] += x[i] * A(i,j+1UL);
1511 y[j+2UL] += x[i] * A(i,j+2UL);
1512 y[j+3UL] += x[i] * A(i,j+3UL);
1513 y[j+4UL] += x[i] * A(i,j+4UL);
1514 y[j+5UL] += x[i] * A(i,j+5UL);
1515 y[j+6UL] += x[i] * A(i,j+6UL);
1516 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Four columns per iteration (same row scheme as above) ---
1520 for( ; (j+4UL) <= N; j+=4UL )
1530 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1531 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1535 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1536 const size_t i1( i+SIMDSIZE );
1537 const size_t i2( i+SIMDSIZE*2UL );
1538 const size_t i3( i+SIMDSIZE*3UL );
1543 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1544 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1545 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1546 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1549 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1550 const size_t i1( i+SIMDSIZE );
1553 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1554 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1555 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1556 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1559 for( ; i<ipos; i+=SIMDSIZE ) {
1561 y[j ] +=
sum( x1 * A.load(i,j ) );
1562 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1563 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1564 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1567 for( ; remainder && i<iend; ++i ) {
1568 y[j ] += x[i] * A(i,j );
1569 y[j+1UL] += x[i] * A(i,j+1UL);
1570 y[j+2UL] += x[i] * A(i,j+2UL);
1571 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Two columns per iteration ---
1575 for( ; (j+2UL) <= N; j+=2UL )
1585 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1586 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1590 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1591 const size_t i1( i+SIMDSIZE );
1592 const size_t i2( i+SIMDSIZE*2UL );
1593 const size_t i3( i+SIMDSIZE*3UL );
1598 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1599 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1602 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1603 const size_t i1( i+SIMDSIZE );
1606 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1607 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1610 for( ; i<ipos; i+=SIMDSIZE ) {
1612 y[j ] +=
sum( x1 * A.load(i,j ) );
1613 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1616 for( ; remainder && i<iend; ++i ) {
1617 y[j ] += x[i] * A(i,j );
1618 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
1632 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1633 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1637 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
1638 const size_t i1( i+SIMDSIZE );
1639 const size_t i2( i+SIMDSIZE*2UL );
1640 const size_t i3( i+SIMDSIZE*3UL );
1645 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1648 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
1649 const size_t i1( i+SIMDSIZE );
1652 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1655 for( ; i<ipos; i+=SIMDSIZE ) {
1657 y[j] +=
sum( x1 * A.load(i,j) );
1660 for( ; remainder && i<iend; ++i ) {
1661 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS addition-assignment kernel: forwards
// unchanged to selectLargeAddAssignKernel. The condition that selects this
// overload is not visible in this excerpt.
1682 template<
typename VT1
1686 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1688 selectLargeAddAssignKernel( y, x, A );
// BLAS-backed overload of the addition-assignment kernel; compiled only when
// both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are
// non-zero. Two paths are visible: one adds a temporary `tmp` to y through
// addAssign(), the other calls gemv with the scalars ET(1) and ET(1).
// NOTE(review): the condition separating the two paths and the construction
// of `tmp` are not visible here. Presumably the gemv scalars are alpha = 1
// and beta = 1 (y = x * A + y) - confirm against gemv's signature.
1694 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1708 template<
typename VT1
1712 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1719 addAssign( y, tmp );
1722 gemv( y, x, A, ET(1), ET(1) );
1746 template<
typename VT1 >
1753 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1765 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel dispatcher for the subtraction assignment y -= x * A. Two call
// targets are visible: selectSmallSubAssignKernel and
// selectBlasSubAssignKernel. The visible part of the condition compares
// A.rows() * A.columns() against TDVECTDMATMULT_THRESHOLD.
// NOTE(review): the head of the condition (and any else keyword) is not
// visible in this excerpt, so further terms may take part in the decision.
1781 template<
typename VT1
1784 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1788 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1789 selectSmallSubAssignKernel( y, x, A );
1791 selectBlasSubAssignKernel( y, x, A );
// Default subtraction-assignment kernel: hands the product expression x * A
// to the target's subAssign() member.
1810 template<
typename VT1
1813 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1815 y.subAssign( x * A );
// Fallback overload of the "small" subtraction-assignment kernel: forwards
// unchanged to selectDefaultSubAssignKernel. The condition that selects this
// overload over the vectorized one is not visible in this excerpt.
1834 template<
typename VT1
1838 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1840 selectDefaultSubAssignKernel( y, x, A );
// Vectorized overload of the "small" subtraction-assignment kernel computing
// y -= x * A. Columns of A are handled in groups of 8, 4, 3 and 2, followed
// by a single-column section. For every group the SIMD accumulators are
// reduced with sum() and subtracted from y, then a scalar loop subtracts the
// contribution of any tail rows.
// NOTE(review): the declarations of i, j, iend, remainder and x1 are not
// visible in this excerpt; x1 is presumably the SIMD load of x at i.
1860 template<
typename VT1
1864 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1868 const size_t M( A.rows() );
1869 const size_t N( A.columns() );
// --- Eight columns per iteration ---
1873 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD-processed row range; with `remainder` set, iend is
// masked down to a multiple of SIMDSIZE (checked by the assertion below).
1883 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1884 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// One SIMD accumulator per column of the group.
1886 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1889 for( ; i<ipos; i+=SIMDSIZE ) {
1891 xmm1 += x1 * A.load(i,j );
1892 xmm2 += x1 * A.load(i,j+1UL);
1893 xmm3 += x1 * A.load(i,j+2UL);
1894 xmm4 += x1 * A.load(i,j+3UL);
1895 xmm5 += x1 * A.load(i,j+4UL);
1896 xmm6 += x1 * A.load(i,j+5UL);
1897 xmm7 += x1 * A.load(i,j+6UL);
1898 xmm8 += x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, subtracted from y.
1901 y[j ] -=
sum( xmm1 );
1902 y[j+1UL] -=
sum( xmm2 );
1903 y[j+2UL] -=
sum( xmm3 );
1904 y[j+3UL] -=
sum( xmm4 );
1905 y[j+4UL] -=
sum( xmm5 );
1906 y[j+5UL] -=
sum( xmm6 );
1907 y[j+6UL] -=
sum( xmm7 );
1908 y[j+7UL] -=
sum( xmm8 );
// Scalar tail rows, processed only when `remainder` is set.
1910 for( ; remainder && i<iend; ++i ) {
1911 y[j ] -= x[i] * A(i,j );
1912 y[j+1UL] -= x[i] * A(i,j+1UL);
1913 y[j+2UL] -= x[i] * A(i,j+2UL);
1914 y[j+3UL] -= x[i] * A(i,j+3UL);
1915 y[j+4UL] -= x[i] * A(i,j+4UL);
1916 y[j+5UL] -= x[i] * A(i,j+5UL);
1917 y[j+6UL] -= x[i] * A(i,j+6UL);
1918 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- Four columns per iteration (same scheme as above) ---
1922 for( ; (j+4UL) <= N; j+=4UL )
1932 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1933 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1938 for( ; i<ipos; i+=SIMDSIZE ) {
1940 xmm1 += x1 * A.load(i,j );
1941 xmm2 += x1 * A.load(i,j+1UL);
1942 xmm3 += x1 * A.load(i,j+2UL);
1943 xmm4 += x1 * A.load(i,j+3UL);
1946 y[j ] -=
sum( xmm1 );
1947 y[j+1UL] -=
sum( xmm2 );
1948 y[j+2UL] -=
sum( xmm3 );
1949 y[j+3UL] -=
sum( xmm4 );
1951 for( ; remainder && i<iend; ++i ) {
1952 y[j ] -= x[i] * A(i,j );
1953 y[j+1UL] -= x[i] * A(i,j+1UL);
1954 y[j+2UL] -= x[i] * A(i,j+2UL);
1955 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- Three columns per iteration ---
1959 for( ; (j+3UL) <= N; j+=3UL )
1969 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1970 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
1975 for( ; i<ipos; i+=SIMDSIZE ) {
1977 xmm1 += x1 * A.load(i,j );
1978 xmm2 += x1 * A.load(i,j+1UL);
1979 xmm3 += x1 * A.load(i,j+2UL);
1982 y[j ] -=
sum( xmm1 );
1983 y[j+1UL] -=
sum( xmm2 );
1984 y[j+2UL] -=
sum( xmm3 );
1986 for( ; remainder && i<iend; ++i ) {
1987 y[j ] -= x[i] * A(i,j );
1988 y[j+1UL] -= x[i] * A(i,j+1UL);
1989 y[j+2UL] -= x[i] * A(i,j+2UL);
// --- Two columns per iteration ---
1993 for( ; (j+2UL) <= N; j+=2UL )
2003 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2004 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2009 for( ; i<ipos; i+=SIMDSIZE ) {
2011 xmm1 += x1 * A.load(i,j );
2012 xmm2 += x1 * A.load(i,j+1UL);
2015 y[j ] -=
sum( xmm1 );
2016 y[j+1UL] -=
sum( xmm2 );
2018 for( ; remainder && i<iend; ++i ) {
2019 y[j ] -= x[i] * A(i,j );
2020 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
2034 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2035 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2040 for( ; i<ipos; i+=SIMDSIZE ) {
// x is loaded directly here instead of through a cached x1.
2041 xmm1 += A.load(i,j) * x.load(i);
2044 y[j] -=
sum( xmm1 );
2046 for( ; remainder && i<iend; ++i ) {
2047 y[j] -= x[i] * A(i,j);
// Fallback overload of the "large" subtraction-assignment kernel: forwards
// unchanged to selectDefaultSubAssignKernel. The condition that selects this
// overload over the vectorized one is not visible in this excerpt.
2068 template<
typename VT1
2072 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2074 selectDefaultSubAssignKernel( y, x, A );
// Vectorized overload of the "large" subtraction-assignment kernel for
// y -= x * A. Columns of A are handled in groups of 8, 4 and 2, followed by a
// single-column section. Within each group the rows are traversed by three
// SIMD loops of decreasing unroll factor (4*SIMDSIZE, 2*SIMDSIZE, SIMDSIZE
// rows per step) and a final scalar loop for tail rows. Every step reduces
// its partial products with sum() and subtracts them from y.
// NOTE(review): the declarations of i, j, iend, remainder and x1..x4 are not
// visible in this excerpt.
2094 template<
typename VT1
2098 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2102 const size_t M( A.rows() );
2103 const size_t N( A.columns() );
// --- Eight columns per iteration ---
2107 for( ; (j+8UL) <= N; j+=8UL )
// ipos: end of the SIMD-processed row range; with `remainder` set, iend is
// masked down to a multiple of SIMDSIZE (checked by the assertion below).
2117 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2118 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
// Four SIMD vectors of rows per step (row offsets i, i1, i2, i3).
2122 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2123 const size_t i1( i+SIMDSIZE );
2124 const size_t i2( i+SIMDSIZE*2UL );
2125 const size_t i3( i+SIMDSIZE*3UL );
2130 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2131 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2132 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2133 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2134 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2135 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2136 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2137 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two SIMD vectors of rows per step.
2140 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2141 const size_t i1( i+SIMDSIZE );
2144 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2145 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2146 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2147 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2148 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2149 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2150 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2151 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One SIMD vector of rows per step.
2154 for( ; i<ipos; i+=SIMDSIZE ) {
2156 y[j ] -=
sum( x1 * A.load(i,j ) );
2157 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2158 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2159 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2160 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2161 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2162 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2163 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// Scalar tail rows, processed only when `remainder` is set.
2166 for( ; remainder && i<iend; ++i ) {
2167 y[j ] -= x[i] * A(i,j );
2168 y[j+1UL] -= x[i] * A(i,j+1UL);
2169 y[j+2UL] -= x[i] * A(i,j+2UL);
2170 y[j+3UL] -= x[i] * A(i,j+3UL);
2171 y[j+4UL] -= x[i] * A(i,j+4UL);
2172 y[j+5UL] -= x[i] * A(i,j+5UL);
2173 y[j+6UL] -= x[i] * A(i,j+6UL);
2174 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- Four columns per iteration (same row scheme as above) ---
2178 for( ; (j+4UL) <= N; j+=4UL )
2188 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2189 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2193 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2194 const size_t i1( i+SIMDSIZE );
2195 const size_t i2( i+SIMDSIZE*2UL );
2196 const size_t i3( i+SIMDSIZE*3UL );
2201 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2202 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2203 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2204 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2207 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2208 const size_t i1( i+SIMDSIZE );
2211 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2212 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2213 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2214 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2217 for( ; i<ipos; i+=SIMDSIZE ) {
2219 y[j ] -=
sum( x1 * A.load(i,j ) );
2220 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2221 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2222 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2225 for( ; remainder && i<iend; ++i ) {
2226 y[j ] -= x[i] * A(i,j );
2227 y[j+1UL] -= x[i] * A(i,j+1UL);
2228 y[j+2UL] -= x[i] * A(i,j+2UL);
2229 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- Two columns per iteration ---
2233 for( ; (j+2UL) <= N; j+=2UL )
2243 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2244 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2248 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2249 const size_t i1( i+SIMDSIZE );
2250 const size_t i2( i+SIMDSIZE*2UL );
2251 const size_t i3( i+SIMDSIZE*3UL );
2256 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2257 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2260 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2261 const size_t i1( i+SIMDSIZE );
2264 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2265 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2268 for( ; i<ipos; i+=SIMDSIZE ) {
2270 y[j ] -=
sum( x1 * A.load(i,j ) );
2271 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2274 for( ; remainder && i<iend; ++i ) {
2275 y[j ] -= x[i] * A(i,j );
2276 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- Single-column section (its enclosing condition is not visible here) ---
2290 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2291 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
2295 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
2296 const size_t i1( i+SIMDSIZE );
2297 const size_t i2( i+SIMDSIZE*2UL );
2298 const size_t i3( i+SIMDSIZE*3UL );
2303 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2306 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
2307 const size_t i1( i+SIMDSIZE );
2310 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2313 for( ; i<ipos; i+=SIMDSIZE ) {
2315 y[j] -=
sum( x1 * A.load(i,j) );
2318 for( ; remainder && i<iend; ++i ) {
2319 y[j] -= x[i] * A(i,j);
// Fallback overload of the BLAS subtraction-assignment kernel: forwards
// unchanged to selectLargeSubAssignKernel. The condition that selects this
// overload is not visible in this excerpt.
2340 template<
typename VT1
2344 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2346 selectLargeSubAssignKernel( y, x, A );
2352 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2366 template<
typename VT1
2370 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2377 subAssign( y, tmp );
2380 gemv( y, x, A, ET(-1), ET(1) );
2404 template<
typename VT1 >
2416 multAssign( ~lhs, tmp );
2438 template<
typename VT1 >
2450 divAssign( ~lhs, tmp );
2474 template<
typename VT1 >
2482 if( rhs.mat_.rows() == 0UL ) {
2486 else if( rhs.mat_.columns() == 0UL ) {
2518 template<
typename VT1 >
2551 template<
typename VT1 >
2559 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2595 template<
typename VT1 >
2603 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2639 template<
typename VT1 >
2676 template<
typename VT1 >
2727 template<
typename VT
2731 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
// Compile-time helper: 'value' is true only when the type T1 reports smpAssignable
// and at least one of the class constants evaluateVector / evaluateMatrix is set.
2762 template<
typename T1 >
2763 struct UseSMPAssign {
2764 enum :
bool { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
2772 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2773 struct UseBlasKernel {
2779 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2794 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2795 struct UseVectorizedDefaultKernel {
2796 enum :
bool { value = useOptimizedKernels &&
2798 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2834 VT::simdEnabled && MT::simdEnabled &&
2840 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2841 !evaluateMatrix && MT::smpAssignable };
2869 return vector_[index] * scalar_;
2881 if( index >= vector_.size() ) {
2884 return (*
this)[index];
2893 inline size_t size()
const {
2894 return vector_.size();
2924 template<
typename T >
2925 inline bool canAlias(
const T* alias )
const {
2926 return vector_.canAlias( alias );
2936 template<
typename T >
2937 inline bool isAliased(
const T* alias )
const {
2938 return vector_.isAliased( alias );
2948 return vector_.isAligned();
2963 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2964 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
2986 template<
typename VT1
2997 if( right.rows() == 0UL ) {
3001 else if( right.columns() == 0UL ) {
3013 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled assignment (y, x, A, scalar).
// The visible part of the condition compares A.rows() * A.columns() against
// TDVECTDMATMULT_THRESHOLD; the two calls forward to the small kernel and to the
// BLAS kernel. Presumably the small kernel is taken when the condition holds and the
// BLAS kernel otherwise (the if/else lines are not visible here - confirm).
3028 template<
typename VT1
3032 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3036 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3037 selectSmallAssignKernel( y, x, A, scalar );
3039 selectBlasAssignKernel( y, x, A, scalar );
// Default assignment kernel: hands the expression x * A * scalar to y.assign(),
// i.e. no hand-written loop is used on this path.
3057 template<
typename VT1
3061 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3063 y.assign( x * A * scalar );
// Overload of selectSmallAssignKernel that forwards its arguments unchanged to
// selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
3081 template<
typename VT1
3086 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3088 selectDefaultAssignKernel( y, x, A, scalar );
3107 template<
typename VT1
3112 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3116 const size_t M( A.rows() );
3117 const size_t N( A.columns() );
3121 for( ; (j+8UL) <= N; j+=8UL )
3131 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3132 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3134 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3137 for( ; i<ipos; i+=SIMDSIZE ) {
3139 xmm1 += x1 * A.load(i,j );
3140 xmm2 += x1 * A.load(i,j+1UL);
3141 xmm3 += x1 * A.load(i,j+2UL);
3142 xmm4 += x1 * A.load(i,j+3UL);
3143 xmm5 += x1 * A.load(i,j+4UL);
3144 xmm6 += x1 * A.load(i,j+5UL);
3145 xmm7 += x1 * A.load(i,j+6UL);
3146 xmm8 += x1 * A.load(i,j+7UL);
3149 y[j ] =
sum( xmm1 ) * scalar;
3150 y[j+1UL] =
sum( xmm2 ) * scalar;
3151 y[j+2UL] =
sum( xmm3 ) * scalar;
3152 y[j+3UL] =
sum( xmm4 ) * scalar;
3153 y[j+4UL] =
sum( xmm5 ) * scalar;
3154 y[j+5UL] =
sum( xmm6 ) * scalar;
3155 y[j+6UL] =
sum( xmm7 ) * scalar;
3156 y[j+7UL] =
sum( xmm8 ) * scalar;
3158 for( ; remainder && i<iend; ++i ) {
3159 y[j ] += x[i] * A(i,j ) * scalar;
3160 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3161 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3162 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3163 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3164 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3165 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3166 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3170 for( ; (j+4UL) <= N; j+=4UL )
3180 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3181 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3186 for( ; i<ipos; i+=SIMDSIZE ) {
3188 xmm1 += x1 * A.load(i,j );
3189 xmm2 += x1 * A.load(i,j+1UL);
3190 xmm3 += x1 * A.load(i,j+2UL);
3191 xmm4 += x1 * A.load(i,j+3UL);
3194 y[j ] =
sum( xmm1 ) * scalar;
3195 y[j+1UL] =
sum( xmm2 ) * scalar;
3196 y[j+2UL] =
sum( xmm3 ) * scalar;
3197 y[j+3UL] =
sum( xmm4 ) * scalar;
3199 for( ; remainder && i<iend; ++i ) {
3200 y[j ] += x[i] * A(i,j ) * scalar;
3201 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3202 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3203 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3207 for( ; (j+3UL) <= N; j+=3UL )
3217 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3218 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3223 for( ; i<ipos; i+=SIMDSIZE ) {
3225 xmm1 += x1 * A.load(i,j );
3226 xmm2 += x1 * A.load(i,j+1UL);
3227 xmm3 += x1 * A.load(i,j+2UL);
3230 y[j ] =
sum( xmm1 ) * scalar;
3231 y[j+1UL] =
sum( xmm2 ) * scalar;
3232 y[j+2UL] =
sum( xmm3 ) * scalar;
3234 for( ; remainder && i<iend; ++i ) {
3235 y[j ] += x[i] * A(i,j ) * scalar;
3236 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3237 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3241 for( ; (j+2UL) <= N; j+=2UL )
3251 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3252 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3257 for( ; i<ipos; i+=SIMDSIZE ) {
3259 xmm1 += x1 * A.load(i,j );
3260 xmm2 += x1 * A.load(i,j+1UL);
3263 y[j ] =
sum( xmm1 ) * scalar;
3264 y[j+1UL] =
sum( xmm2 ) * scalar;
3266 for( ; remainder && i<iend; ++i ) {
3267 y[j ] += x[i] * A(i,j ) * scalar;
3268 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3282 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3283 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3288 for( ; i<ipos; i+=SIMDSIZE ) {
3289 xmm1 += A.load(i,j) * x.load(i);
3292 y[j] =
sum( xmm1 ) * scalar;
3294 for( ; remainder && i<iend; ++i ) {
3295 y[j] += x[i] * A(i,j) * scalar;
// Overload of selectLargeAssignKernel that forwards its arguments unchanged to
// selectDefaultAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
3315 template<
typename VT1
3320 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3322 selectDefaultAssignKernel( y, x, A, scalar );
3341 template<
typename VT1
3346 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3350 const size_t M( A.rows() );
3351 const size_t N( A.columns() );
3357 for( ; (j+8UL) <= N; j+=8UL )
3367 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3368 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3372 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3373 const size_t i1( i+SIMDSIZE );
3374 const size_t i2( i+SIMDSIZE*2UL );
3375 const size_t i3( i+SIMDSIZE*3UL );
3380 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3381 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3382 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3383 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3384 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3385 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3386 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3387 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
3390 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3391 const size_t i1( i+SIMDSIZE );
3394 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3395 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3396 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3397 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3398 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3399 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3400 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3401 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
3404 for( ; i<ipos; i+=SIMDSIZE ) {
3406 y[j ] +=
sum( x1 * A.load(i,j ) );
3407 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3408 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3409 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3410 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3411 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3412 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3413 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
3416 for( ; remainder && i<iend; ++i ) {
3417 y[j ] += x[i] * A(i,j );
3418 y[j+1UL] += x[i] * A(i,j+1UL);
3419 y[j+2UL] += x[i] * A(i,j+2UL);
3420 y[j+3UL] += x[i] * A(i,j+3UL);
3421 y[j+4UL] += x[i] * A(i,j+4UL);
3422 y[j+5UL] += x[i] * A(i,j+5UL);
3423 y[j+6UL] += x[i] * A(i,j+6UL);
3424 y[j+7UL] += x[i] * A(i,j+7UL);
3437 for( ; (j+4UL) <= N; j+=4UL )
3447 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3448 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3452 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3453 const size_t i1( i+SIMDSIZE );
3454 const size_t i2( i+SIMDSIZE*2UL );
3455 const size_t i3( i+SIMDSIZE*3UL );
3460 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3461 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3462 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3463 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3466 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3467 const size_t i1( i+SIMDSIZE );
3470 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3471 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3472 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3473 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3476 for( ; i<ipos; i+=SIMDSIZE ) {
3478 y[j ] +=
sum( x1 * A.load(i,j ) );
3479 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3480 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3481 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3484 for( ; remainder && i<iend; ++i ) {
3485 y[j ] += x[i] * A(i,j );
3486 y[j+1UL] += x[i] * A(i,j+1UL);
3487 y[j+2UL] += x[i] * A(i,j+2UL);
3488 y[j+3UL] += x[i] * A(i,j+3UL);
3497 for( ; (j+2UL) <= N; j+=2UL )
3507 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3508 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3512 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3513 const size_t i1( i+SIMDSIZE );
3514 const size_t i2( i+SIMDSIZE*2UL );
3515 const size_t i3( i+SIMDSIZE*3UL );
3520 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3521 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3524 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3525 const size_t i1( i+SIMDSIZE );
3528 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3529 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3532 for( ; i<ipos; i+=SIMDSIZE ) {
3534 y[j ] +=
sum( x1 * A.load(i,j ) );
3535 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3538 for( ; remainder && i<iend; ++i ) {
3539 y[j ] += x[i] * A(i,j );
3540 y[j+1UL] += x[i] * A(i,j+1UL);
3557 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3558 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3562 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
3563 const size_t i1( i+SIMDSIZE );
3564 const size_t i2( i+SIMDSIZE*2UL );
3565 const size_t i3( i+SIMDSIZE*3UL );
3570 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
3573 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
3574 const size_t i1( i+SIMDSIZE );
3577 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
3580 for( ; i<ipos; i+=SIMDSIZE ) {
3582 y[j] +=
sum( x1 * A.load(i,j) );
3585 for( ; remainder && i<iend; ++i ) {
3586 y[j] += x[i] * A(i,j);
// Overload of selectBlasAssignKernel that forwards its arguments unchanged to
// selectLargeAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
3607 template<
typename VT1
3612 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3614 selectLargeAssignKernel( y, x, A, scalar );
3619 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3633 template<
typename VT1
3638 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3643 assign( y, scalar * x );
3647 gemv( y, x, A, ET(scalar), ET(0) );
3665 template<
typename VT1
3678 assign( ~lhs, tmp );
3694 template<
typename VT1
3705 if( right.rows() == 0UL || right.columns() == 0UL ) {
3717 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled addition assignment (y, x, A, scalar).
// The visible part of the condition is a size test on A.rows() * A.columns(); the two
// calls forward to the small add-assign kernel and to the BLAS add-assign kernel.
3732 template<
typename VT1
3736 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Use the threshold of this expression type (TDVECTDMATMULT_THRESHOLD, as in
// selectAssignKernel and canSMPAssign of this class), not the TDVECDMATMULT_THRESHOLD
// constant that belongs to a different expression type.
3740 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3741 selectSmallAddAssignKernel( y, x, A, scalar );
3743 selectBlasAddAssignKernel( y, x, A, scalar );
// Default addition assignment kernel: hands the expression x * A * scalar to
// y.addAssign(), i.e. no hand-written loop is used on this path.
3761 template<
typename VT1
3765 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3767 y.addAssign( x * A * scalar );
// Overload of selectSmallAddAssignKernel that forwards its arguments unchanged to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
3785 template<
typename VT1
3790 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3792 selectDefaultAddAssignKernel( y, x, A, scalar );
3811 template<
typename VT1
3816 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3820 const size_t M( A.rows() );
3821 const size_t N( A.columns() );
3825 for( ; (j+8UL) <= N; j+=8UL )
3835 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3836 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3838 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3841 for( ; i<ipos; i+=SIMDSIZE ) {
3843 xmm1 += x1 * A.load(i,j );
3844 xmm2 += x1 * A.load(i,j+1UL);
3845 xmm3 += x1 * A.load(i,j+2UL);
3846 xmm4 += x1 * A.load(i,j+3UL);
3847 xmm5 += x1 * A.load(i,j+4UL);
3848 xmm6 += x1 * A.load(i,j+5UL);
3849 xmm7 += x1 * A.load(i,j+6UL);
3850 xmm8 += x1 * A.load(i,j+7UL);
3853 y[j ] +=
sum( xmm1 ) * scalar;
3854 y[j+1UL] +=
sum( xmm2 ) * scalar;
3855 y[j+2UL] +=
sum( xmm3 ) * scalar;
3856 y[j+3UL] +=
sum( xmm4 ) * scalar;
3857 y[j+4UL] +=
sum( xmm5 ) * scalar;
3858 y[j+5UL] +=
sum( xmm6 ) * scalar;
3859 y[j+6UL] +=
sum( xmm7 ) * scalar;
3860 y[j+7UL] +=
sum( xmm8 ) * scalar;
3862 for( ; remainder && i<iend; ++i ) {
3863 y[j ] += x[i] * A(i,j ) * scalar;
3864 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3865 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3866 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3867 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3868 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3869 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3870 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
3874 for( ; (j+4UL) <= N; j+=4UL )
3884 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3885 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3890 for( ; i<ipos; i+=SIMDSIZE ) {
3892 xmm1 += x1 * A.load(i,j );
3893 xmm2 += x1 * A.load(i,j+1UL);
3894 xmm3 += x1 * A.load(i,j+2UL);
3895 xmm4 += x1 * A.load(i,j+3UL);
3898 y[j ] +=
sum( xmm1 ) * scalar;
3899 y[j+1UL] +=
sum( xmm2 ) * scalar;
3900 y[j+2UL] +=
sum( xmm3 ) * scalar;
3901 y[j+3UL] +=
sum( xmm4 ) * scalar;
3903 for( ; remainder && i<iend; ++i ) {
3904 y[j ] += x[i] * A(i,j ) * scalar;
3905 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3906 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3907 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3911 for( ; (j+3UL) <= N; j+=3UL )
3921 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3922 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3927 for( ; i<ipos; i+=SIMDSIZE ) {
3929 xmm1 += x1 * A.load(i,j );
3930 xmm2 += x1 * A.load(i,j+1UL);
3931 xmm3 += x1 * A.load(i,j+2UL);
3934 y[j ] +=
sum( xmm1 ) * scalar;
3935 y[j+1UL] +=
sum( xmm2 ) * scalar;
3936 y[j+2UL] +=
sum( xmm3 ) * scalar;
3938 for( ; remainder && i<iend; ++i ) {
3939 y[j ] += x[i] * A(i,j ) * scalar;
3940 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3941 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3945 for( ; (j+2UL) <= N; j+=2UL )
3955 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3956 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3961 for( ; i<ipos; i+=SIMDSIZE ) {
3963 xmm1 += x1 * A.load(i,j );
3964 xmm2 += x1 * A.load(i,j+1UL);
3967 y[j ] +=
sum( xmm1 ) * scalar;
3968 y[j+1UL] +=
sum( xmm2 ) * scalar;
3970 for( ; remainder && i<iend; ++i ) {
3971 y[j ] += x[i] * A(i,j ) * scalar;
3972 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3986 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3987 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
3992 for( ; i<ipos; i+=SIMDSIZE ) {
3993 xmm1 += A.load(i,j) * x.load(i);
3996 y[j] +=
sum( xmm1 ) * scalar;
3998 for( ; remainder && i<iend; ++i ) {
3999 y[j] += x[i] * A(i,j) * scalar;
// Overload of selectLargeAddAssignKernel that forwards its arguments unchanged to
// selectDefaultAddAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
4019 template<
typename VT1
4024 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4026 selectDefaultAddAssignKernel( y, x, A, scalar );
4045 template<
typename VT1
4050 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4054 const size_t M( A.rows() );
4055 const size_t N( A.columns() );
4059 for( ; (j+8UL) <= N; j+=8UL )
4069 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4070 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4074 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4075 const size_t i1( i+SIMDSIZE );
4076 const size_t i2( i+SIMDSIZE*2UL );
4077 const size_t i3( i+SIMDSIZE*3UL );
4082 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4083 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4084 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4085 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4086 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4087 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4088 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4089 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4092 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4093 const size_t i1( i+SIMDSIZE );
4096 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4097 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4098 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4099 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4100 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4101 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4102 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4103 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4106 for( ; i<ipos; i+=SIMDSIZE ) {
4108 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4109 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4110 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4111 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4112 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4113 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4114 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4115 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4118 for( ; remainder && i<iend; ++i ) {
4119 y[j ] += x[i] * A(i,j ) * scalar;
4120 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4121 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4122 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4123 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4124 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4125 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4126 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4130 for( ; (j+4UL) <= N; j+=4UL )
4140 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4141 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4145 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4146 const size_t i1( i+SIMDSIZE );
4147 const size_t i2( i+SIMDSIZE*2UL );
4148 const size_t i3( i+SIMDSIZE*3UL );
4153 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4154 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4155 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4156 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4159 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4160 const size_t i1( i+SIMDSIZE );
4163 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4164 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4165 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4166 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4169 for( ; i<ipos; i+=SIMDSIZE ) {
4171 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4172 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4173 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4174 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4177 for( ; remainder && i<iend; ++i ) {
4178 y[j ] += x[i] * A(i,j ) * scalar;
4179 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4180 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4181 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4185 for( ; (j+2UL) <= N; j+=2UL )
4195 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4196 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4200 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4201 const size_t i1( i+SIMDSIZE );
4202 const size_t i2( i+SIMDSIZE*2UL );
4203 const size_t i3( i+SIMDSIZE*3UL );
4208 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4209 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4212 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4213 const size_t i1( i+SIMDSIZE );
4216 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4217 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4220 for( ; i<ipos; i+=SIMDSIZE ) {
4222 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4223 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4226 for( ; remainder && i<iend; ++i ) {
4227 y[j ] += x[i] * A(i,j ) * scalar;
4228 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4242 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4243 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4247 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4248 const size_t i1( i+SIMDSIZE );
4249 const size_t i2( i+SIMDSIZE*2UL );
4250 const size_t i3( i+SIMDSIZE*3UL );
4255 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4258 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4259 const size_t i1( i+SIMDSIZE );
4262 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4265 for( ; i<ipos; i+=SIMDSIZE ) {
4267 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
4270 for( ; remainder && i<iend; ++i ) {
4271 y[j] += x[i] * A(i,j) * scalar;
// Overload of selectBlasAddAssignKernel that forwards its arguments unchanged to
// selectLargeAddAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
4292 template<
typename VT1
4297 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4299 selectLargeAddAssignKernel( y, x, A, scalar );
4304 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4318 template<
typename VT1
4323 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4330 addAssign( y, tmp );
4333 gemv( y, x, A, ET(scalar), ET(1) );
4355 template<
typename VT1
4366 if( right.rows() == 0UL || right.columns() == 0UL ) {
4378 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.
scalar_ );
// Kernel dispatch for the scaled subtraction assignment (y, x, A, scalar).
// The visible part of the condition is a size test on A.rows() * A.columns(); the two
// calls forward to the small sub-assign kernel and to the BLAS sub-assign kernel.
4393 template<
typename VT1
4397 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Use the threshold of this expression type (TDVECTDMATMULT_THRESHOLD, as in
// selectAssignKernel and canSMPAssign of this class), not the TDVECDMATMULT_THRESHOLD
// constant that belongs to a different expression type.
4401 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
4402 selectSmallSubAssignKernel( y, x, A, scalar );
4404 selectBlasSubAssignKernel( y, x, A, scalar );
// Default subtraction assignment kernel: hands the expression x * A * scalar to
// y.subAssign(), i.e. no hand-written loop is used on this path.
4422 template<
typename VT1
4426 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4428 y.subAssign( x * A * scalar );
// Overload of selectSmallSubAssignKernel that forwards its arguments unchanged to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
4446 template<
typename VT1
4451 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4453 selectDefaultSubAssignKernel( y, x, A, scalar );
4472 template<
typename VT1
4477 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4481 const size_t M( A.rows() );
4482 const size_t N( A.columns() );
4486 for( ; (j+8UL) <= N; j+=8UL )
4496 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4497 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4499 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4502 for( ; i<ipos; i+=SIMDSIZE ) {
4504 xmm1 += x1 * A.load(i,j );
4505 xmm2 += x1 * A.load(i,j+1UL);
4506 xmm3 += x1 * A.load(i,j+2UL);
4507 xmm4 += x1 * A.load(i,j+3UL);
4508 xmm5 += x1 * A.load(i,j+4UL);
4509 xmm6 += x1 * A.load(i,j+5UL);
4510 xmm7 += x1 * A.load(i,j+6UL);
4511 xmm8 += x1 * A.load(i,j+7UL);
4514 y[j ] -=
sum( xmm1 ) * scalar;
4515 y[j+1UL] -=
sum( xmm2 ) * scalar;
4516 y[j+2UL] -=
sum( xmm3 ) * scalar;
4517 y[j+3UL] -=
sum( xmm4 ) * scalar;
4518 y[j+4UL] -=
sum( xmm5 ) * scalar;
4519 y[j+5UL] -=
sum( xmm6 ) * scalar;
4520 y[j+6UL] -=
sum( xmm7 ) * scalar;
4521 y[j+7UL] -=
sum( xmm8 ) * scalar;
4523 for( ; remainder && i<iend; ++i ) {
4524 y[j ] -= x[i] * A(i,j ) * scalar;
4525 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4526 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4527 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4528 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4529 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4530 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4531 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4535 for( ; (j+4UL) <= N; j+=4UL )
4545 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4546 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4551 for( ; i<ipos; i+=SIMDSIZE ) {
4553 xmm1 += x1 * A.load(i,j );
4554 xmm2 += x1 * A.load(i,j+1UL);
4555 xmm3 += x1 * A.load(i,j+2UL);
4556 xmm4 += x1 * A.load(i,j+3UL);
4559 y[j ] -=
sum( xmm1 ) * scalar;
4560 y[j+1UL] -=
sum( xmm2 ) * scalar;
4561 y[j+2UL] -=
sum( xmm3 ) * scalar;
4562 y[j+3UL] -=
sum( xmm4 ) * scalar;
4564 for( ; remainder && i<iend; ++i ) {
4565 y[j ] -= x[i] * A(i,j ) * scalar;
4566 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4567 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4568 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4572 for( ; (j+3UL) <= N; j+=3UL )
4582 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4583 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4588 for( ; i<ipos; i+=SIMDSIZE ) {
4590 xmm1 += x1 * A.load(i,j );
4591 xmm2 += x1 * A.load(i,j+1UL);
4592 xmm3 += x1 * A.load(i,j+2UL);
4595 y[j ] -=
sum( xmm1 ) * scalar;
4596 y[j+1UL] -=
sum( xmm2 ) * scalar;
4597 y[j+2UL] -=
sum( xmm3 ) * scalar;
4599 for( ; remainder && i<iend; ++i ) {
4600 y[j ] -= x[i] * A(i,j ) * scalar;
4601 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4602 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4606 for( ; (j+2UL) <= N; j+=2UL )
4616 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4617 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4622 for( ; i<ipos; i+=SIMDSIZE ) {
4624 xmm1 += x1 * A.load(i,j );
4625 xmm2 += x1 * A.load(i,j+1UL);
4628 y[j ] -=
sum( xmm1 ) * scalar;
4629 y[j+1UL] -=
sum( xmm2 ) * scalar;
4631 for( ; remainder && i<iend; ++i ) {
4632 y[j ] -= x[i] * A(i,j ) * scalar;
4633 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4647 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4648 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4653 for( ; i<ipos; i+=SIMDSIZE ) {
4654 xmm1 += A.load(i,j) * x.load(i);
4657 y[j] -=
sum( xmm1 ) * scalar;
4659 for( ; remainder && i<iend; ++i ) {
4660 y[j] -= x[i] * A(i,j) * scalar;
// Overload of selectLargeSubAssignKernel that forwards its arguments unchanged to
// selectDefaultSubAssignKernel.
// NOTE(review): the condition that selects this overload is on lines not visible here.
4680 template<
typename VT1
4685 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4687 selectDefaultSubAssignKernel( y, x, A, scalar );
4706 template<
typename VT1
4711 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4715 const size_t M( A.rows() );
4716 const size_t N( A.columns() );
4720 for( ; (j+8UL) <= N; j+=8UL )
4730 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4731 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4735 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4736 const size_t i1( i+SIMDSIZE );
4737 const size_t i2( i+SIMDSIZE*2UL );
4738 const size_t i3( i+SIMDSIZE*3UL );
4743 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4744 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4745 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4746 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4747 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4748 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4749 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4750 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4753 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4754 const size_t i1( i+SIMDSIZE );
4757 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4758 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4759 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4760 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4761 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4762 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4763 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4764 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4767 for( ; i<ipos; i+=SIMDSIZE ) {
4769 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4770 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4771 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4772 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4773 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4774 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4775 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4776 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4779 for( ; remainder && i<iend; ++i ) {
4780 y[j ] -= x[i] * A(i,j ) * scalar;
4781 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4782 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4783 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4784 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4785 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4786 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4787 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4791 for( ; (j+4UL) <= N; j+=4UL )
4801 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4802 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4806 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4807 const size_t i1( i+SIMDSIZE );
4808 const size_t i2( i+SIMDSIZE*2UL );
4809 const size_t i3( i+SIMDSIZE*3UL );
4814 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4815 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4816 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4817 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4820 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4821 const size_t i1( i+SIMDSIZE );
4824 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4825 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4826 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4827 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4830 for( ; i<ipos; i+=SIMDSIZE ) {
4832 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4833 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4834 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4835 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4838 for( ; remainder && i<iend; ++i ) {
4839 y[j ] -= x[i] * A(i,j ) * scalar;
4840 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4841 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4842 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4846 for( ; (j+2UL) <= N; j+=2UL )
4856 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4857 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4861 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4862 const size_t i1( i+SIMDSIZE );
4863 const size_t i2( i+SIMDSIZE*2UL );
4864 const size_t i3( i+SIMDSIZE*3UL );
4869 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4870 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4873 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4874 const size_t i1( i+SIMDSIZE );
4877 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4878 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4881 for( ; i<ipos; i+=SIMDSIZE ) {
4883 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4884 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4887 for( ; remainder && i<iend; ++i ) {
4888 y[j ] -= x[i] * A(i,j ) * scalar;
4889 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4903 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4904 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % (SIMDSIZE) ) ) == ipos,
"Invalid end calculation" );
4908 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL ) {
4909 const size_t i1( i+SIMDSIZE );
4910 const size_t i2( i+SIMDSIZE*2UL );
4911 const size_t i3( i+SIMDSIZE*3UL );
4916 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4919 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL ) {
4920 const size_t i1( i+SIMDSIZE );
4923 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4926 for( ; i<ipos; i+=SIMDSIZE ) {
4928 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
4931 for( ; remainder && i<iend; ++i ) {
4932 y[j] -= x[i] * A(i,j) * scalar;
4953 template<
typename VT1
4958 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4960 selectLargeSubAssignKernel( y, x, A, scalar );
4965 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4979 template<
typename VT1
4984 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4991 subAssign( y, tmp );
4994 gemv( y, x, A, ET(-scalar), ET(1) );
5016 template<
typename VT1
5029 multAssign( ~lhs, tmp );
5049 template<
typename VT1
5062 divAssign( ~lhs, tmp );
5084 template<
typename VT1
5096 if( right.rows() == 0UL ) {
5100 else if( right.columns() == 0UL ) {
5130 template<
typename VT1
5162 template<
typename VT1
5174 if( right.rows() == 0UL || right.columns() == 0UL ) {
5208 template<
typename VT1
5220 if( right.rows() == 0UL || right.columns() == 0UL ) {
5254 template<
typename VT1
5290 template<
typename VT1
5366 template<
typename VT
5368 inline decltype(
auto)
5375 if( (~vec).
size() != (~mat).
rows() ) {
5395 template<
typename VT,
typename MT >
5396 struct Size< TDVecTDMatMultExpr<VT,MT>, 0UL >
5397 :
public Size<MT,1UL>
5413 template<
typename VT,
typename MT >
5414 struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
5415 :
public And< IsAligned<VT>, IsAligned<MT> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:158
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:131
Header file for the blaze::checked and blaze::unchecked instances.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:71
Compile time check for low-level access to constant data.This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:86
Header file for basic type definitions.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:207
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:164
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:221
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:364
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:172
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:316
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:372
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:588
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:247
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:261
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template.The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:224
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:519
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:87
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:171
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:384
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:291
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:328
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:140
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:87
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Compile time check for data types.This type trait tests whether or not the given types can be combine...
Definition: IsSIMDCombinable.h:120
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:343
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:133
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Compile time check for the alignment of data types.This type trait tests whether the given data type ...
Definition: IsAligned.h:87
Constraint on the data type.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:71
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:308
Compile time check for the memory layout of data types.This type trait tests whether the given data t...
Definition: IsContiguous.h:86
Header file for the DisableIf class template.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:209
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:128
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_< IsExpression< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:215
Compile time check for data types with padding.This type trait tests whether the given data type empl...
Definition: IsPadded.h:76
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:102
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:352
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:127
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:89
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:87
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:79
Header file for the IsTriangular type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:205
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:130
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:318
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:590
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
typename T::LeftOperand LeftOperand_
Alias declaration for nested LeftOperand type definitions.The LeftOperand_ alias declaration provides...
Definition: Aliases.h:203
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
System settings for the BLAS mode.
Base class for all vector/scalar multiplication expression templates.The VecScalarMultExpr class serv...
Definition: VecScalarMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
BLAZE_ALWAYS_INLINE ValueType_< T > sum(const SIMDi8< T > &a) noexcept
Returns the sum of all elements in the 8-bit integral SIMD vector.
Definition: Reduction.h:65
Header file for run time assertion macros.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:295
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:385
Header file for the IsContiguous type trait.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:154
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Compile time check for built-in data types.This type trait tests whether or not the given template pa...
Definition: IsBuiltin.h:75
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:816
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
typename EnableIf< Condition, T >::Type EnableIf_
Auxiliary alias declaration for the EnableIf class template.The EnableIf_ alias declaration provides ...
Definition: EnableIf.h:224
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:218
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:490
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3080
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:108
Compile time check for complex types.This type trait tests whether or not the given template paramete...
Definition: IsComplex.h:76
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Expression object for dense vector-scalar multiplications.The DVecScalarMultExpr class represents the...
Definition: DVecScalarMultExpr.h:104
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
Compile time evaluation of the size of vectors and matrices.The Size type trait evaluates the size of...
Definition: Size.h:80
Base class for sparse vectors.The SparseVector class is a base class for all arbitrarily sized (N-dim...
Definition: Forward.h:130
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:206
Header file for the IsComplexFloat type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:131
Header file for the IsComplex type trait.
Compile time logical 'and' evaluation.The And alias declaration performs at compile time a logical 'a...
Definition: And.h:76
Header file for the complex data type.
typename T::RightOperand RightOperand_
Alias declaration for nested RightOperand type definitions.The RightOperand_ alias declaration provid...
Definition: Aliases.h:383
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:423
Header file for the IsUpper type trait.
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:132
Constraint on the data type.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:340
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:129
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:208
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:362
Header file for the Size type trait.
If_< IsExpression< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:212
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.