35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
145 template<
typename MT1
// Compile-time helper exposing a boolean `value`.
// In the visible expression `value` is true when `evaluateLeft` or `evaluateRight` is set; both
// flags are declared outside this excerpt. T1, T2 and T3 do not appear in the visible condition.
177 template<
typename T1,
typename T2,
typename T3 >
178 struct IsEvaluationRequired {
179 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate over the target type T1 and the two operand types T2 and T3.
// NOTE(review): the line that opens the enum (and any condition preceding the first visible
// term) is missing from this excerpt; only the conjunction below can be confirmed.
189 template<
typename T1,
typename T2,
typename T3 >
190 struct UseBlasKernel {
// The target must expose mutable raw data, both operands constant raw data.
192 HasMutableDataAccess<T1>::value &&
193 HasConstDataAccess<T2>::value &&
194 HasConstDataAccess<T3>::value &&
// Neither operand may be diagonal.
195 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
196 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All three element types must be BLAS compatible ...
197 IsBLASCompatible< ElementType_<T1> >::value &&
198 IsBLASCompatible< ElementType_<T2> >::value &&
199 IsBLASCompatible< ElementType_<T3> >::value &&
// ... and identical to the element type of the target.
200 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
201 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
// Compile-time predicate over the target type T1 and the two operand types T2 and T3.
// NOTE(review): the enum opening and one argument line of AreSIMDCombinable are missing from
// this excerpt; only the visible terms are described.
211 template<
typename T1,
typename T2,
typename T3 >
212 struct UseVectorizedDefaultKernel {
// Neither operand may be diagonal and all three types must be SIMD enabled.
214 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
215 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// The element types must be combinable in SIMD operations.
216 AreSIMDCombinable< ElementType_<T1>
218 , ElementType_<T3> >::value &&
// SIMD addition and multiplication must exist for the two operand element types.
219 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
220 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Fragment of a compile-time flag that requires both operand types to be SIMD enabled
// (the beginning and the end of this enum are not visible in this excerpt).
252 MT1::simdEnabled && MT2::simdEnabled &&
// smpAssignable: true only if neither operand needs evaluation (evaluateLeft/evaluateRight are
// declared outside this excerpt) and both operand types are themselves SMP assignable.
257 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
258 !evaluateRight && MT2::smpAssignable };
// Fragment of an element-access computation; the enclosing function header and the expression
// that ends with the left operand's column count are not visible in this excerpt.
313 :(
lhs_.columns() ) ) );
// n is the length of the half-open index range [begin,end).
317 const size_t n(
end - begin );
// Element access with index checks: row index i is compared against the row count of the left
// operand, column index j against the column count of the right operand.
// NOTE(review): the bodies of both checks and the final return are missing from this excerpt
// (presumably an exception is raised for invalid indices — confirm).
335 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index out of range?
336 if( i >=
lhs_.rows() ) {
// Column index out of range?
339 if( j >=
rhs_.columns() ) {
// Header of rows(); its body is not visible in this excerpt.
351 inline size_t rows() const noexcept {
// Return statement of a different function whose header is missing here (presumably columns()):
// it returns the column count of the right-hand side operand.
362 return rhs_.columns();
// Returns true if the expression can alias with the given address.
// \param alias The address to check.
// The check asks each operand's isAliased() (not canAlias()) and reports true if either operand
// answers true.
392 template<
typename T >
393 inline bool canAlias(
const T* alias )
const noexcept {
394 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true if the expression is aliased with the given address.
// \param alias The address to check.
// True as soon as the left or the right operand reports an alias via its own isAliased().
404 template<
typename T >
405 inline bool isAliased(
const T* alias )
const noexcept {
406 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Return statement of a function whose header is not visible here: the result is true only if
// both operands report isAligned().
416 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a boolean return expression; the function header and the first alternative of the
// leading parenthesized group are missing from this excerpt.
// Visible part of the first group: the element count of the result (rows * columns) is below
// DMATTDMATMULT_THRESHOLD.
427 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
// In addition the element count must reach SMP_DMATTDMATMULT_THRESHOLD.
428 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
// Fragment of the assignment of the multiplication expression to a target `lhs`.
// NOTE(review): the signature, the branch bodies and the evaluation of the operands A and B are
// missing from this excerpt.
451 template<
typename MT
// Target without rows or without columns: handled by a separate branch (body not visible).
460 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Inner dimension of the product is zero (body not visible; presumably the target is reset —
// confirm).
463 else if( rhs.
lhs_.columns() == 0UL ) {
// Forward the target and the operands A and B to the kernel selection.
478 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Selects the kernel for the assignment C = A * B.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): the first part of the condition is missing from this excerpt; the corresponding
// add-assign selection in this file additionally tests for diagonal operands.
494 template<
typename MT3
497 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Results with fewer elements than DMATTDMATMULT_THRESHOLD go to the small kernel ...
500 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
501 selectSmallAssignKernel( C, A, B );
// ... everything else goes to the BLAS kernel selection.
503 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel C = A * B for a dense target with storage-order flag `false`
// (row-major in Blaze's convention — confirm) when neither A nor B is diagonal.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// The triangular-structure traits of A and B are used to shrink the i, j and k ranges.
// NOTE(review): braces, the fallback arms of several conditional expressions and the bodies of the
// loops outside the computed ranges are missing from this excerpt (presumably those loops reset
// the corresponding elements — confirm).
522 template<
typename MT3
525 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
526 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
528 const size_t M( A.rows() );
529 const size_t N( B.columns() );
530 const size_t K( A.columns() );
// First row handled by the main loop: 1 for strictly lower A, 2 if B is strictly lower as well
// and there is more than one row.
532 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
533 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
// End of the main row range: the last one or two rows are excluded for strictly upper operands.
535 const size_t iend( ( IsStrictlyUpper<MT4>::value )
536 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
// Rows [0,ibegin) over all columns (loop body not visible).
540 for(
size_t i=0UL; i<ibegin; ++i ) {
541 for(
size_t j=0UL; j<N; ++j ) {
// Main row range.
545 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i derived from the upper/lower structure of both operands.
547 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
548 ?( ( IsStrictlyUpper<MT4>::value )
549 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
550 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
551 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
552 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
553 ?( ( IsStrictlyLower<MT4>::value )
554 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
555 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
556 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
// Columns [0,jbegin) of row i (loop body not visible).
559 for(
size_t j=0UL; j<jbegin; ++j ) {
562 for(
size_t j=jbegin; j<jend; ++j )
// Inner index range [kbegin,kend): starts at the larger of the bounds imposed by an upper A
// (depends on i) and a lower B (depends on j) ...
564 const size_t kbegin( ( IsUpper<MT4>::value )
565 ?( ( IsLower<MT5>::value )
566 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
567 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
568 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
569 :( ( IsLower<MT5>::value )
570 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// ... and ends at the smaller of the bounds imposed by a lower A and an upper B.
572 const size_t kend( ( IsLower<MT4>::value )
573 ?( ( IsUpper<MT5>::value )
574 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
575 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
576 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
577 :( ( IsUpper<MT5>::value )
578 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j); the remaining products are accumulated.
582 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
583 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
584 (~C)(i,j) += A(i,k) * B(k,j);
// Columns [jend,N) of row i (loop body not visible).
587 for(
size_t j=jend; j<N; ++j ) {
// Rows [iend,M) over all columns (loop body not visible).
591 for(
size_t i=iend; i<M; ++i ) {
592 for(
size_t j=0UL; j<N; ++j ) {
// Default (scalar) assignment kernel C = A * B for a dense target with storage-order flag `true`
// (column-major in Blaze's convention — confirm) when neither A nor B is diagonal.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// Same scheme as the overload for flag `false`, but columns are traversed in the outer loop.
// NOTE(review): braces, the fallback arms of several conditional expressions and the bodies of the
// loops outside the computed ranges are missing from this excerpt (presumably those loops reset
// the corresponding elements — confirm).
614 template<
typename MT3
617 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
618 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
620 const size_t M( A.rows() );
621 const size_t N( B.columns() );
622 const size_t K( A.columns() );
// Main column range [jbegin,jend): leading/trailing columns are excluded for strictly upper
// respectively strictly lower operands.
624 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
625 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
627 const size_t jend( ( IsStrictlyLower<MT5>::value )
628 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
// Columns [0,jbegin) over all rows (loop body not visible).
632 for(
size_t j=0UL; j<jbegin; ++j ) {
633 for(
size_t i=0UL; i<M; ++i ) {
// Main column range.
637 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j derived from the lower/upper structure of both operands.
639 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
640 ?( ( IsStrictlyLower<MT4>::value )
641 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
642 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
643 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
644 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
645 ?( ( IsStrictlyUpper<MT4>::value )
646 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
647 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
648 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
// Rows [0,ibegin) of column j (loop body not visible).
651 for(
size_t i=0UL; i<ibegin; ++i ) {
654 for(
size_t i=ibegin; i<iend; ++i )
// Inner index range [kbegin,kend) restricted by an upper/lower A (depends on i) and a
// lower/upper B (depends on j).
656 const size_t kbegin( ( IsUpper<MT4>::value )
657 ?( ( IsLower<MT5>::value )
658 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
659 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
660 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
661 :( ( IsLower<MT5>::value )
662 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
664 const size_t kend( ( IsLower<MT4>::value )
665 ?( ( IsUpper<MT5>::value )
666 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
667 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
668 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
669 :( ( IsUpper<MT5>::value )
670 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j); the remaining products are accumulated.
674 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
675 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
676 (~C)(i,j) += A(i,k) * B(k,j);
// Rows [iend,M) of column j (loop body not visible).
679 for(
size_t i=iend; i<M; ++i ) {
// Columns [jend,N) over all rows (loop body not visible).
683 for(
size_t j=jend; j<N; ++j ) {
684 for(
size_t i=0UL; i<M; ++i ) {
// Default assignment kernel C = A * B for a dense target with storage-order flag `false` when
// only B is diagonal: each result element is the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side (non-diagonal) operand.
// \param B The right-hand side diagonal operand.
// NOTE(review): braces, the fallback arms of the range expressions and the bodies of the loops
// outside [jbegin,jend) are missing from this excerpt (presumably they reset elements — confirm).
706 template<
typename MT3
709 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
710 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
712 const size_t M( A.rows() );
713 const size_t N( B.columns() );
715 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, limited by the upper/lower structure of A.
717 const size_t jbegin( ( IsUpper<MT4>::value )
718 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
720 const size_t jend( ( IsLower<MT4>::value )
721 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Only for upper A: columns [0,jbegin) of row i (loop body not visible).
725 if( IsUpper<MT4>::value ) {
726 for(
size_t j=0UL; j<jbegin; ++j ) {
// Scale A(i,j) by the j-th diagonal element of B.
730 for(
size_t j=jbegin; j<jend; ++j ) {
731 (~C)(i,j) = A(i,j) * B(j,j);
// Only for lower A: columns [jend,N) of row i (loop body not visible).
733 if( IsLower<MT4>::value ) {
734 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B for a dense target with storage-order flag `true` when only
// B is diagonal. The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE; each result
// element is the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side (non-diagonal) operand.
// \param B The right-hand side diagonal operand.
// NOTE(review): braces, the fallback arms of the range expressions and the bodies of the loops
// outside [ibegin,ipos) are missing from this excerpt (presumably they reset elements — confirm).
757 template<
typename MT3
760 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
761 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
763 const size_t M( A.rows() );
764 const size_t N( B.columns() );
766 const size_t block( BLOCK_SIZE );
// Column tiles [jj,jend) and row tiles [ii,iend), both clipped to the matrix size.
768 for(
size_t jj=0UL; jj<N; jj+=block ) {
769 const size_t jend(
min( N, jj+block ) );
770 for(
size_t ii=0UL; ii<M; ii+=block ) {
771 const size_t iend(
min( M, ii+block ) );
772 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current tile, limited by the lower/upper structure of A.
774 const size_t ibegin( ( IsLower<MT4>::value )
775 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
777 const size_t ipos( ( IsUpper<MT4>::value )
778 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
// Only for lower A: rows [ii,ibegin) of column j (loop body not visible).
781 if( IsLower<MT4>::value ) {
782 for(
size_t i=ii; i<ibegin; ++i ) {
// Scale A(i,j) by the j-th diagonal element of B.
786 for(
size_t i=ibegin; i<ipos; ++i ) {
787 (~C)(i,j) = A(i,j) * B(j,j);
// Only for upper A: rows [ipos,iend) of column j (loop body not visible).
789 if( IsUpper<MT4>::value ) {
790 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel C = A * B for a dense target with storage-order flag `false` when
// only A is diagonal. The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE; each
// result element is the single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side diagonal operand.
// \param B The right-hand side (non-diagonal) operand.
// NOTE(review): braces, the fallback arms of the range expressions and the bodies of the loops
// outside [jbegin,jpos) are missing from this excerpt (presumably they reset elements — confirm).
815 template<
typename MT3
818 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
819 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
821 const size_t M( A.rows() );
822 const size_t N( B.columns() );
824 const size_t block( BLOCK_SIZE );
// Row tiles [ii,iend) and column tiles [jj,jend), both clipped to the matrix size.
826 for(
size_t ii=0UL; ii<M; ii+=block ) {
827 const size_t iend(
min( M, ii+block ) );
828 for(
size_t jj=0UL; jj<N; jj+=block ) {
829 const size_t jend(
min( N, jj+block ) );
830 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current tile, limited by the upper/lower structure of B.
832 const size_t jbegin( ( IsUpper<MT5>::value )
833 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
835 const size_t jpos( ( IsLower<MT5>::value )
836 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
// Only for upper B: columns [jj,jbegin) of row i (loop body not visible).
839 if( IsUpper<MT5>::value ) {
840 for(
size_t j=jj; j<jbegin; ++j ) {
// Scale B(i,j) by the i-th diagonal element of A.
844 for(
size_t j=jbegin; j<jpos; ++j ) {
845 (~C)(i,j) = A(i,i) * B(i,j);
// Only for lower B: columns [jpos,jend) of row i (loop body not visible).
847 if( IsLower<MT5>::value ) {
848 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel C = A * B for a dense target with storage-order flag `true` when only
// A is diagonal: each result element is the single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side diagonal operand.
// \param B The right-hand side (non-diagonal) operand.
// NOTE(review): braces, the fallback arms of the range expressions and the bodies of the loops
// outside [ibegin,iend) are missing from this excerpt (presumably they reset elements — confirm).
873 template<
typename MT3
876 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
877 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
879 const size_t M( A.rows() );
880 const size_t N( B.columns() );
882 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, limited by the lower/upper structure of B.
884 const size_t ibegin( ( IsLower<MT5>::value )
885 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
887 const size_t iend( ( IsUpper<MT5>::value )
888 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Only for lower B: rows [0,ibegin) of column j (loop body not visible).
892 if( IsLower<MT5>::value ) {
893 for(
size_t i=0UL; i<ibegin; ++i ) {
// Scale B(i,j) by the i-th diagonal element of A.
897 for(
size_t i=ibegin; i<iend; ++i ) {
898 (~C)(i,j) = A(i,i) * B(i,j);
// Only for upper B: rows [iend,M) of column j (loop body not visible).
900 if( IsUpper<MT5>::value ) {
901 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B when both A and B are diagonal: only the diagonal of C is
// written, element i being A(i,i) * B(i,i).
// \param C The target matrix.
// \param A The left-hand side diagonal operand.
// \param B The right-hand side diagonal operand.
// NOTE(review): lines between the signature and the loop are missing from this excerpt
// (presumably C is reset first — confirm).
924 template<
typename MT3
927 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
928 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
932 for(
size_t i=0UL; i<A.rows(); ++i ) {
933 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel for type combinations for which
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false: forwards to the default kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
953 template<
typename MT3
956 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
957 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
959 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix assignment kernel C = A * B for a dense target with storage-order flag
// `false`, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// C is computed in register tiles: 2x4, 2x2 and 2x1 elements while two rows remain, then 1x4, 1x2
// and 1x1. For every tile the inner index k runs in SIMD steps of SIMDSIZE; the accumulators are
// reduced with sum(), and a scalar loop adds the remaining products when `remainder` is set.
// NOTE(review): braces and the declarations of i, j, k and some accumulators are missing from this
// excerpt, as are the statements guarding the single-column/single-row tails.
979 template<
typename MT3
982 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
983 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
985 const size_t M( A.rows() );
986 const size_t N( B.columns() );
987 const size_t K( A.columns() );
// A scalar tail loop is needed unless both operands are padded.
989 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows at a time ----
993 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile.
997 for( ; (j+4UL) <= N; j+=4UL )
// Start of the k range, masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (for a power-of-two SIMDSIZE).
999 const size_t kbegin( ( IsUpper<MT4>::value )
1000 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1001 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// End of the k range, limited by a lower A and/or an upper B for the rows/columns of this tile.
1002 const size_t kend( ( IsLower<MT4>::value )
1003 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1004 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// kpos: end of the SIMD loop; with `remainder` it is kend rounded down to a multiple of SIMDSIZE.
1006 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1007 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the tile.
1009 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1012 for( ; k<kpos; k+=SIMDSIZE ) {
1013 const SIMDType a1( A.load(i ,k) );
1014 const SIMDType a2( A.load(i+1UL,k) );
1015 const SIMDType b1( B.load(k,j ) );
1016 const SIMDType b2( B.load(k,j+1UL) );
1017 const SIMDType b3( B.load(k,j+2UL) );
1018 const SIMDType b4( B.load(k,j+3UL) );
1019 xmm1 = xmm1 + a1 * b1;
1020 xmm2 = xmm2 + a1 * b2;
1021 xmm3 = xmm3 + a1 * b3;
1022 xmm4 = xmm4 + a1 * b4;
1023 xmm5 = xmm5 + a2 * b1;
1024 xmm6 = xmm6 + a2 * b2;
1025 xmm7 = xmm7 + a2 * b3;
1026 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator to a scalar and store it.
1029 (~C)(i ,j ) =
sum( xmm1 );
1030 (~C)(i ,j+1UL) =
sum( xmm2 );
1031 (~C)(i ,j+2UL) =
sum( xmm3 );
1032 (~C)(i ,j+3UL) =
sum( xmm4 );
1033 (~C)(i+1UL,j ) =
sum( xmm5 );
1034 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
1035 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
1036 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// Scalar tail for the k values not covered by the SIMD loop.
1038 for( ; remainder && k<kend; ++k ) {
1039 (~C)(i ,j ) += A(i ,k) * B(k,j );
1040 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1041 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1042 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1043 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1044 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1045 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1046 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
1050 for( ; (j+2UL) <= N; j+=2UL )
1052 const size_t kbegin( ( IsUpper<MT4>::value )
1053 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1054 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1055 const size_t kend( ( IsLower<MT4>::value )
1056 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1057 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1059 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1060 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1062 SIMDType xmm1, xmm2, xmm3, xmm4;
1065 for( ; k<kpos; k+=SIMDSIZE ) {
1066 const SIMDType a1( A.load(i ,k) );
1067 const SIMDType a2( A.load(i+1UL,k) );
1068 const SIMDType b1( B.load(k,j ) );
1069 const SIMDType b2( B.load(k,j+1UL) );
1070 xmm1 = xmm1 + a1 * b1;
1071 xmm2 = xmm2 + a1 * b2;
1072 xmm3 = xmm3 + a2 * b1;
1073 xmm4 = xmm4 + a2 * b2;
1076 (~C)(i ,j ) =
sum( xmm1 );
1077 (~C)(i ,j+1UL) =
sum( xmm2 );
1078 (~C)(i+1UL,j ) =
sum( xmm3 );
1079 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1081 for( ; remainder && k<kend; ++k ) {
1082 (~C)(i ,j ) += A(i ,k) * B(k,j );
1083 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1084 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1085 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile for a single remaining column (the guarding statement is not visible here).
1091 const size_t kbegin( ( IsUpper<MT4>::value )
1092 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1093 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1094 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1096 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1097 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1099 SIMDType xmm1, xmm2;
1102 for( ; k<kpos; k+=SIMDSIZE ) {
1103 const SIMDType b1( B.load(k,j) );
1104 xmm1 = xmm1 + A.load(i ,k) * b1;
1105 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1108 (~C)(i ,j) =
sum( xmm1 );
1109 (~C)(i+1UL,j) =
sum( xmm2 );
1111 for( ; remainder && k<kend; ++k ) {
1112 (~C)(i ,j) += A(i ,k) * B(k,j);
1113 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single remaining row (the guarding statement is not visible here) ----
// 1x4 tile.
1122 for( ; (j+4UL) <= N; j+=4UL )
1124 const size_t kbegin( ( IsUpper<MT4>::value )
1125 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1126 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1127 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
1129 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1130 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1132 SIMDType xmm1, xmm2, xmm3, xmm4;
1135 for( ; k<kpos; k+=SIMDSIZE ) {
1136 const SIMDType a1( A.load(i,k) );
1137 xmm1 = xmm1 + a1 * B.load(k,j );
1138 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1139 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1140 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1143 (~C)(i,j ) =
sum( xmm1 );
1144 (~C)(i,j+1UL) =
sum( xmm2 );
1145 (~C)(i,j+2UL) =
sum( xmm3 );
1146 (~C)(i,j+3UL) =
sum( xmm4 );
1148 for( ; remainder && k<kend; ++k ) {
1149 (~C)(i,j ) += A(i,k) * B(k,j );
1150 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
1151 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
1152 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile.
1156 for( ; (j+2UL) <= N; j+=2UL )
1158 const size_t kbegin( ( IsUpper<MT4>::value )
1159 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1160 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1161 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1163 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1164 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1166 SIMDType xmm1, xmm2;
1169 for( ; k<kpos; k+=SIMDSIZE ) {
1170 const SIMDType a1( A.load(i,k) );
1171 xmm1 = xmm1 + a1 * B.load(k,j );
1172 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1175 (~C)(i,j ) =
sum( xmm1 );
1176 (~C)(i,j+1UL) =
sum( xmm2 );
1178 for( ; remainder && k<kend; ++k ) {
1179 (~C)(i,j ) += A(i,k) * B(k,j );
1180 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile for the last element (the guarding statement is not visible here); the k range
// always ends at K.
1186 const size_t kbegin( ( IsUpper<MT4>::value )
1187 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1188 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1190 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1191 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1196 for( ; k<kpos; k+=SIMDSIZE ) {
1197 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1200 (~C)(i,j) =
sum( xmm1 );
1202 for( ; remainder && k<K; ++k ) {
1203 (~C)(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix assignment kernel C = A * B for a dense target with storage-order flag
// `true`, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// C is computed in register tiles: 4x2 and 4x1 elements while four rows remain, 2x2 and 2x1 while
// two rows remain, then 1x2 and 1x1. For every tile the inner index k runs in SIMD steps of
// SIMDSIZE; the accumulators are reduced with sum(), and a scalar loop adds the remaining
// products when `remainder` is set.
// NOTE(review): braces and the declarations of i, j, k and some accumulators are missing from this
// excerpt, as are the statements guarding the single-column/single-row tails.
1226 template<
typename MT3
1229 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1230 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1232 const size_t M( A.rows() );
1233 const size_t N( B.columns() );
1234 const size_t K( A.columns() );
// A scalar tail loop is needed unless both operands are padded.
1236 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows at a time ----
1240 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile.
1244 for( ; (j+2UL) <= N; j+=2UL )
// Start of the k range, masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (for a power-of-two SIMDSIZE).
1246 const size_t kbegin( ( IsUpper<MT4>::value )
1247 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1248 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// End of the k range, limited by a lower A and/or an upper B for the rows/columns of this tile.
1249 const size_t kend( ( IsLower<MT4>::value )
1250 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1251 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// kpos: end of the SIMD loop; with `remainder` it is kend rounded down to a multiple of SIMDSIZE.
1253 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1254 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the tile.
1256 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1259 for( ; k<kpos; k+=SIMDSIZE ) {
1260 const SIMDType a1( A.load(i ,k) );
1261 const SIMDType a2( A.load(i+1UL,k) );
1262 const SIMDType a3( A.load(i+2UL,k) );
1263 const SIMDType a4( A.load(i+3UL,k) );
1264 const SIMDType b1( B.load(k,j ) );
1265 const SIMDType b2( B.load(k,j+1UL) );
1266 xmm1 = xmm1 + a1 * b1;
1267 xmm2 = xmm2 + a1 * b2;
1268 xmm3 = xmm3 + a2 * b1;
1269 xmm4 = xmm4 + a2 * b2;
1270 xmm5 = xmm5 + a3 * b1;
1271 xmm6 = xmm6 + a3 * b2;
1272 xmm7 = xmm7 + a4 * b1;
1273 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator to a scalar and store it.
1276 (~C)(i ,j ) =
sum( xmm1 );
1277 (~C)(i ,j+1UL) =
sum( xmm2 );
1278 (~C)(i+1UL,j ) =
sum( xmm3 );
1279 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1280 (~C)(i+2UL,j ) =
sum( xmm5 );
1281 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
1282 (~C)(i+3UL,j ) =
sum( xmm7 );
1283 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// Scalar tail for the k values not covered by the SIMD loop.
1285 for( ; remainder && k<kend; ++k ) {
1286 (~C)(i ,j ) += A(i ,k) * B(k,j );
1287 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1288 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1289 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1290 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1291 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1292 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1293 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile for a single remaining column (the guarding statement is not visible here).
1299 const size_t kbegin( ( IsUpper<MT4>::value )
1300 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1301 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1302 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
1304 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1305 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1307 SIMDType xmm1, xmm2, xmm3, xmm4;
1310 for( ; k<kpos; k+=SIMDSIZE ) {
1311 const SIMDType b1( B.load(k,j) );
1312 xmm1 = xmm1 + A.load(i ,k) * b1;
1313 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1314 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1315 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1318 (~C)(i ,j) =
sum( xmm1 );
1319 (~C)(i+1UL,j) =
sum( xmm2 );
1320 (~C)(i+2UL,j) =
sum( xmm3 );
1321 (~C)(i+3UL,j) =
sum( xmm4 );
1323 for( ; remainder && k<kend; ++k ) {
1324 (~C)(i ,j) += A(i ,k) * B(k,j);
1325 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
1326 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
1327 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// ---- Two rows at a time ----
1332 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile.
1336 for( ; (j+2UL) <= N; j+=2UL )
1338 const size_t kbegin( ( IsUpper<MT4>::value )
1339 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1340 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1341 const size_t kend( ( IsLower<MT4>::value )
1342 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1343 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1345 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1346 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1348 SIMDType xmm1, xmm2, xmm3, xmm4;
1351 for( ; k<kpos; k+=SIMDSIZE ) {
1352 const SIMDType a1( A.load(i ,k) );
1353 const SIMDType a2( A.load(i+1UL,k) );
1354 const SIMDType b1( B.load(k,j ) );
1355 const SIMDType b2( B.load(k,j+1UL) );
1356 xmm1 = xmm1 + a1 * b1;
1357 xmm2 = xmm2 + a1 * b2;
1358 xmm3 = xmm3 + a2 * b1;
1359 xmm4 = xmm4 + a2 * b2;
1362 (~C)(i ,j ) =
sum( xmm1 );
1363 (~C)(i ,j+1UL) =
sum( xmm2 );
1364 (~C)(i+1UL,j ) =
sum( xmm3 );
1365 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1367 for( ; remainder && k<kend; ++k ) {
1368 (~C)(i ,j ) += A(i ,k) * B(k,j );
1369 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1370 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1371 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile for a single remaining column (the guarding statement is not visible here).
1377 const size_t kbegin( ( IsUpper<MT4>::value )
1378 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1379 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1380 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1382 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1383 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1385 SIMDType xmm1, xmm2;
1388 for( ; k<kpos; k+=SIMDSIZE ) {
1389 const SIMDType b1( B.load(k,j) );
1390 xmm1 = xmm1 + A.load(i ,k) * b1;
1391 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1394 (~C)(i ,j) =
sum( xmm1 );
1395 (~C)(i+1UL,j) =
sum( xmm2 );
1397 for( ; remainder && k<kend; ++k ) {
1398 (~C)(i ,j) += A(i ,k) * B(k,j);
1399 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single remaining row (the guarding statement is not visible here) ----
// 1x2 tile.
1408 for( ; (j+2UL) <= N; j+=2UL )
1410 const size_t kbegin( ( IsUpper<MT4>::value )
1411 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1412 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1413 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1415 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
1416 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1418 SIMDType xmm1, xmm2;
1421 for( ; k<kpos; k+=SIMDSIZE ) {
1422 const SIMDType a1( A.load(i,k) );
1423 xmm1 = xmm1 + a1 * B.load(k,j );
1424 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1427 (~C)(i,j ) =
sum( xmm1 );
1428 (~C)(i,j+1UL) =
sum( xmm2 );
1430 for( ; remainder && k<kend; ++k ) {
1431 (~C)(i,j ) += A(i,k) * B(k,j );
1432 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile for the last element (the guarding statement is not visible here); the k range
// always ends at K.
1438 const size_t kbegin( ( IsUpper<MT4>::value )
1439 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
1440 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
1442 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
1443 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
1448 for( ; k<kpos; k+=SIMDSIZE ) {
1449 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1452 (~C)(i,j) =
sum( xmm1 );
1454 for( ; remainder && k<K; ++k ) {
1455 (~C)(i,j) += A(i,k) * B(k,j);
// Large-matrix assignment kernel for type combinations for which
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false: forwards to the default kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1477 template<
typename MT3
1480 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1481 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1483 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel for a dense target with storage-order flag `false`, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// The only visible statement reuses the small vectorized kernel (lines between the signature and
// this call are missing from this excerpt).
1503 template<
typename MT3
1506 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1507 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1510 selectSmallAssignKernel( ~C, A, B );
// Large-matrix assignment kernel for a dense target with storage-order flag `true`, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// The only visible statement reuses the small vectorized kernel (lines between the signature and
// this call are missing from this excerpt).
1530 template<
typename MT3
1533 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1534 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1537 selectSmallAssignKernel( ~C, A, B );
// Fallback of the BLAS kernel selection for type combinations for which
// UseBlasKernel<MT3,MT4,MT5> is false: forwards to the large-matrix kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1556 template<
typename MT3
1559 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
1560 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1562 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero and enabled when
// UseBlasKernel<MT3,MT4,MT5> is true.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces, the statements preceding each trmm() call and the closing #endif are
// missing from this excerpt (presumably the non-triangular operand is assigned to C before
// trmm() multiplies in place — confirm).
1568 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
1582 template<
typename MT3
1585 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
1586 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1588 typedef ElementType_<MT3> ET;
// Triangular A: triangular multiplication from the left with scaling factor 1.
1590 if( IsTriangular<MT4>::value ) {
1592 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular B: triangular multiplication from the right with scaling factor 1.
1594 else if( IsTriangular<MT5>::value ) {
1596 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
// General case: gemm() with the scalar arguments ET(1) and ET(0).
1599 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix.
// \param lhs The target sparse matrix.
// \param rhs The multiplication expression to be assigned.
// The expression is first evaluated serially into a temporary, which is then assigned to lhs.
// NOTE(review): braces and the checks between the typedef and the evaluation are missing from
// this excerpt.
1619 template<
typename MT
1621 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Temporary type: OppositeType if SO is true, ResultType otherwise.
1625 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
1637 const TmpType tmp(
serial( rhs ) );
1638 assign( ~lhs, tmp );
// Addition assignment of the multiplication expression to a dense matrix.
// \param lhs The target dense matrix.
// \param rhs The multiplication expression to be added.
// NOTE(review): braces, the body of the early-exit branch and the checks before the kernel call
// are missing from this excerpt.
1656 template<
typename MT
1658 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Nothing to add if the target or the inner dimension of the product is empty (branch body not
// visible; presumably an early return — confirm).
1665 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Serial evaluation of both operands (LT and RT are declared outside this excerpt).
1669 LT A(
serial( rhs.lhs_ ) );
1670 RT B(
serial( rhs.rhs_ ) );
1679 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Selects the kernel for the addition assignment C += A * B.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1695 template<
typename MT3
1698 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A diagonal operand or a result with fewer elements than DMATTDMATMULT_THRESHOLD selects the
// small kernel ...
1700 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
1701 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1702 selectSmallAddAssignKernel( C, A, B );
// ... everything else goes to the BLAS kernel selection (the `else` is not visible here).
1704 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition assignment kernel C += A * B for a dense target with storage-order
// flag `false` when neither A nor B is diagonal.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// Only the index ranges allowed by the triangular structure of A and B are visited; elements
// outside these ranges are not touched by the visible code. The inner loop is unrolled by two.
// NOTE(review): braces and the fallback arms of several conditional expressions are missing from
// this excerpt.
1723 template<
typename MT3
1726 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1727 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1729 const size_t M( A.rows() );
1730 const size_t N( B.columns() );
1731 const size_t K( A.columns() );
// Row range [ibegin,iend): leading/trailing rows are excluded for strictly lower respectively
// strictly upper operands.
1733 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
1734 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
1736 const size_t iend( ( IsStrictlyUpper<MT4>::value )
1737 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
1741 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i derived from the upper/lower structure of both operands.
1743 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1744 ?( ( IsStrictlyUpper<MT4>::value )
1745 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
1746 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
1747 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
1748 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
1749 ?( ( IsStrictlyLower<MT4>::value )
1750 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
1751 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
1752 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
1755 for(
size_t j=jbegin; j<jend; ++j )
// Inner index range [kbegin,kend) restricted by an upper/lower A (depends on i) and a
// lower/upper B (depends on j).
1757 const size_t kbegin( ( IsUpper<MT4>::value )
1758 ?( ( IsLower<MT5>::value )
1759 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1760 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1761 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1762 :( ( IsLower<MT5>::value )
1763 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1765 const size_t kend( ( IsLower<MT4>::value )
1766 ?( ( IsUpper<MT5>::value )
1767 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1768 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1769 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1770 :( ( IsUpper<MT5>::value )
1771 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum products in total; kpos marks the end of the part processed in pairs (knum rounded down
// to an even count).
1775 const size_t knum( kend - kbegin );
1776 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1778 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1779 (~C)(i,j) += A(i,k ) * B(k ,j);
1780 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Remaining single product for an odd knum (the guarding condition is not visible here;
// presumably `kpos < kend` — confirm).
1783 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default (scalar) addition assignment kernel C += A * B for a dense target with storage-order
// flag `true` when neither A nor B is diagonal.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// Same scheme as the overload for flag `false`, but columns are traversed in the outer loop. The
// inner loop is unrolled by two.
// NOTE(review): braces and the fallback arms of several conditional expressions are missing from
// this excerpt.
1805 template<
typename MT3
1808 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1809 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1811 const size_t M( A.rows() );
1812 const size_t N( B.columns() );
1813 const size_t K( A.columns() );
// Column range [jbegin,jend): leading/trailing columns are excluded for strictly upper
// respectively strictly lower operands.
1815 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
1816 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
1818 const size_t jend( ( IsStrictlyLower<MT5>::value )
1819 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
1823 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j derived from the lower/upper structure of both operands.
1825 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
1826 ?( ( IsStrictlyLower<MT4>::value )
1827 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
1828 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1829 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
1830 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1831 ?( ( IsStrictlyUpper<MT4>::value )
1832 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
1833 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
1834 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
1837 for(
size_t i=ibegin; i<iend; ++i )
// Inner index range [kbegin,kend) restricted by an upper/lower A (depends on i) and a
// lower/upper B (depends on j).
1839 const size_t kbegin( ( IsUpper<MT4>::value )
1840 ?( ( IsLower<MT5>::value )
1841 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1842 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1843 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1844 :( ( IsLower<MT5>::value )
1845 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1847 const size_t kend( ( IsLower<MT4>::value )
1848 ?( ( IsUpper<MT5>::value )
1849 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1850 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1851 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1852 :( ( IsUpper<MT5>::value )
1853 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum products in total; kpos marks the end of the part processed in pairs (knum rounded down
// to an even count).
1857 const size_t knum( kend - kbegin );
1858 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1860 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1861 (~C)(i,j) += A(i,k ) * B(k ,j);
1862 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Remaining single product for an odd knum (the guarding condition is not visible here;
// presumably `kpos < kend` — confirm).
1865 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default addition assignment kernel C += A * B for a dense target with storage-order flag
// `false` when only B is diagonal: each visited element receives the single product
// A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side (non-diagonal) operand.
// \param B The right-hand side diagonal operand.
// NOTE(review): braces and the fallback arms of the range expressions are missing from this
// excerpt.
1887 template<
typename MT3
1890 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1891 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1893 const size_t M( A.rows() );
1894 const size_t N( B.columns() );
1896 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, limited by the upper/lower structure of A.
1898 const size_t jbegin( ( IsUpper<MT4>::value )
1899 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1901 const size_t jend( ( IsLower<MT4>::value )
1902 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jnum columns in total; jpos marks the end of the part processed in pairs.
1906 const size_t jnum( jend - jbegin );
1907 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1909 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1910 (~C)(i,j ) += A(i,j ) * B(j ,j );
1911 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Remaining single column for an odd jnum (the guarding condition is not visible here;
// presumably `jpos < jend` — confirm).
1914 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition assignment kernel C += A * B for a dense target with storage-order flag `true`
// when only B is diagonal. The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE; each
// visited element receives the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side (non-diagonal) operand.
// \param B The right-hand side diagonal operand.
// NOTE(review): braces and the fallback arms of the range expressions are missing from this
// excerpt.
1935 template<
typename MT3
1938 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1939 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1941 const size_t M( A.rows() );
1942 const size_t N( B.columns() );
1944 const size_t block( BLOCK_SIZE );
// Column tiles [jj,jend) and row tiles [ii,iend), both clipped to the matrix size.
1946 for(
size_t jj=0UL; jj<N; jj+=block ) {
1947 const size_t jend(
min( N, jj+block ) );
1948 for(
size_t ii=0UL; ii<M; ii+=block ) {
1949 const size_t iend(
min( M, ii+block ) );
1950 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current tile, limited by the lower/upper structure of A.
1952 const size_t ibegin( ( IsLower<MT4>::value )
1953 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
1955 const size_t ipos( ( IsUpper<MT4>::value )
1956 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
// Add A(i,j) scaled by the j-th diagonal element of B.
1959 for(
size_t i=ibegin; i<ipos; ++i ) {
1960 (~C)(i,j) += A(i,j) * B(j,j);
// Default addition-assignment kernel for C += A * B where A is diagonal and B is not
// diagonal; the target is a DenseMatrix with storage-order flag 'false'.
// The index space is walked in BLOCK_SIZE x BLOCK_SIZE tiles (ii/jj are the tile
// origins); with a diagonal A only k == i contributes, so each element update is
// C(i,j) += A(i,i) * B(i,j).
// NOTE(review): some lines of the function (braces, ternary fallback branches) are not
// present in this excerpt - confirm against the complete file.
1983 template<
typename MT3
1986 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1987 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1989 const size_t M( A.rows() );
1990 const size_t N( B.columns() );
1992 const size_t block( BLOCK_SIZE );
1994 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the tile's row end to the matrix extent.
1995 const size_t iend(
min( M, ii+block ) );
1996 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the tile's column end to the matrix extent.
1997 const size_t jend(
min( N, jj+block ) );
1998 for(
size_t i=ii; i<iend; ++i )
// For an upper B, columns left of the diagonal of row i are skipped; the start is
// additionally clamped to the tile origin jj.
2000 const size_t jbegin( ( IsUpper<MT5>::value )
2001 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// For a lower B, columns right of the diagonal of row i are skipped; the end is
// additionally clamped to the tile end jend.
2003 const size_t jpos( ( IsLower<MT5>::value )
2004 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
2007 for(
size_t j=jbegin; j<jpos; ++j ) {
2008 (~C)(i,j) += A(i,i) * B(i,j);
// Default addition-assignment kernel for C += A * B where A is diagonal and B is not
// diagonal; the target is a DenseMatrix with storage-order flag 'true'.
// With a diagonal A only the term k == i contributes, so each update below is
// C(i,j) += A(i,i) * B(i,j).
// NOTE(review): this excerpt omits some lines of the function (braces, the fallback
// branches of the ternaries, and the guard before the trailing update) - confirm
// against the complete file.
2031 template<
typename MT3
2034 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2035 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2037 const size_t M( A.rows() );
2038 const size_t N( B.columns() );
2040 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of B: a lower B
// starts at the diagonal (one past it if strictly lower).
2042 const size_t ibegin( ( IsLower<MT5>::value )
2043 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// An upper B ends at the diagonal (excluding it if strictly upper).
2045 const size_t iend( ( IsUpper<MT5>::value )
2046 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
// number so the loop below can handle two rows per iteration.
2050 const size_t inum( iend - ibegin );
2051 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2053 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2054 (~C)(i ,j) += A(i ,i ) * B(i ,j);
2055 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at ipos (presumably only executed when the count is odd; the
// guarding condition is not visible here).
2058 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition-assignment kernel for C += A * B where both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) for every row of A.
2079 template<
typename MT3
2082 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2083 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2085 for(
size_t i=0UL; i<A.rows(); ++i ) {
2086 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition-assignment kernel, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false: forwards unchanged to the default
// (scalar) addition-assignment kernel.
2106 template<
typename MT3
2109 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2110 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2112 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel (C += A * B), selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true; the target is a DenseMatrix with
// storage-order flag 'false'.
// Every element of C is formed as a dot product along k: A is loaded along k within a
// row (A.load(i,k)) and B along k within a column (B.load(k,j)). The products are
// accumulated in SIMD registers, reduced with sum(), and any k not covered by the SIMD
// loop is handled by a scalar loop.
// C is processed in tiles: 2 rows x 4 columns, 2 x 2, 2 x 1, then for a remaining
// single row 1 x 4, 1 x 2 and 1 x 1.
// NOTE(review): this excerpt omits some lines of the function (braces, declarations
// and initialisation of i, j, k, and the guards in front of the leftover sections) -
// confirm against the complete file.
2132 template<
typename MT3
2135 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2136 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2138 const size_t M( A.rows() );
2139 const size_t N( B.columns() );
2140 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is not padded.
2142 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// --- Strips of two rows -------------------------------------------------------------
2146 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 4 tile.
2150 for( ; (j+4UL) <= N; j+=4UL )
// First k that is needed given an upper A and/or lower B, masked with
// size_t(-SIMDSIZE) (rounds down to a SIMDSIZE multiple, assuming SIMDSIZE is a
// power of two).
2152 const size_t kbegin( ( IsUpper<MT4>::value )
2153 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2154 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// One past the last k needed by any element of the tile, given a lower A and/or
// upper B; K if neither applies.
2155 const size_t kend( ( IsLower<MT4>::value )
2156 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
2157 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the SIMD loop: kend masked down when a scalar tail is required.
2159 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2160 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2 x 4 tile.
2162 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2165 for( ; k<kpos; k+=SIMDSIZE ) {
2166 const SIMDType a1( A.load(i ,k) );
2167 const SIMDType a2( A.load(i+1UL,k) );
2168 const SIMDType b1( B.load(k,j ) );
2169 const SIMDType b2( B.load(k,j+1UL) );
2170 const SIMDType b3( B.load(k,j+2UL) );
2171 const SIMDType b4( B.load(k,j+3UL) );
2172 xmm1 = xmm1 + a1 * b1;
2173 xmm2 = xmm2 + a1 * b2;
2174 xmm3 = xmm3 + a1 * b3;
2175 xmm4 = xmm4 + a1 * b4;
2176 xmm5 = xmm5 + a2 * b1;
2177 xmm6 = xmm6 + a2 * b2;
2178 xmm7 = xmm7 + a2 * b3;
2179 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator with sum() and add the result to C.
2182 (~C)(i ,j ) +=
sum( xmm1 );
2183 (~C)(i ,j+1UL) +=
sum( xmm2 );
2184 (~C)(i ,j+2UL) +=
sum( xmm3 );
2185 (~C)(i ,j+3UL) +=
sum( xmm4 );
2186 (~C)(i+1UL,j ) +=
sum( xmm5 );
2187 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
2188 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
2189 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend).
2191 for( ; remainder && k<kend; ++k ) {
2192 (~C)(i ,j ) += A(i ,k) * B(k,j );
2193 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2194 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2195 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2196 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2197 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2198 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2199 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2 x 2 tile for the columns left over by the 4-column loop.
2203 for( ; (j+2UL) <= N; j+=2UL )
2205 const size_t kbegin( ( IsUpper<MT4>::value )
2206 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2207 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2208 const size_t kend( ( IsLower<MT4>::value )
2209 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2210 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2212 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2213 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2215 SIMDType xmm1, xmm2, xmm3, xmm4;
2218 for( ; k<kpos; k+=SIMDSIZE ) {
2219 const SIMDType a1( A.load(i ,k) );
2220 const SIMDType a2( A.load(i+1UL,k) );
2221 const SIMDType b1( B.load(k,j ) );
2222 const SIMDType b2( B.load(k,j+1UL) );
2223 xmm1 = xmm1 + a1 * b1;
2224 xmm2 = xmm2 + a1 * b2;
2225 xmm3 = xmm3 + a2 * b1;
2226 xmm4 = xmm4 + a2 * b2;
2229 (~C)(i ,j ) +=
sum( xmm1 );
2230 (~C)(i ,j+1UL) +=
sum( xmm2 );
2231 (~C)(i+1UL,j ) +=
sum( xmm3 );
2232 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2234 for( ; remainder && k<kend; ++k ) {
2235 (~C)(i ,j ) += A(i ,k) * B(k,j );
2236 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2237 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2238 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2 x 1 tile for a single leftover column (the guard for this section is not
// visible in this excerpt).
2244 const size_t kbegin( ( IsUpper<MT4>::value )
2245 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2246 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2247 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2249 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2250 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2252 SIMDType xmm1, xmm2;
2255 for( ; k<kpos; k+=SIMDSIZE ) {
2256 const SIMDType b1( B.load(k,j) );
2257 xmm1 = xmm1 + A.load(i ,k) * b1;
2258 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2261 (~C)(i ,j) +=
sum( xmm1 );
2262 (~C)(i+1UL,j) +=
sum( xmm2 );
2264 for( ; remainder && k<kend; ++k ) {
2265 (~C)(i ,j) += A(i ,k) * B(k,j);
2266 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// --- Single leftover row (guard not visible in this excerpt) ------------------------
// 1 x 4 tile.
2274 for( ; (j+4UL) <= N; j+=4UL )
2276 const size_t kbegin( ( IsUpper<MT4>::value )
2277 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2278 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2279 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
2281 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2282 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2284 SIMDType xmm1, xmm2, xmm3, xmm4;
2287 for( ; k<kpos; k+=SIMDSIZE ) {
2288 const SIMDType a1( A.load(i,k) );
2289 xmm1 = xmm1 + a1 * B.load(k,j );
2290 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2291 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2292 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2295 (~C)(i,j ) +=
sum( xmm1 );
2296 (~C)(i,j+1UL) +=
sum( xmm2 );
2297 (~C)(i,j+2UL) +=
sum( xmm3 );
2298 (~C)(i,j+3UL) +=
sum( xmm4 );
2300 for( ; remainder && k<kend; ++k ) {
2301 (~C)(i,j ) += A(i,k) * B(k,j );
2302 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2303 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL);
2304 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1 x 2 tile.
2308 for( ; (j+2UL) <= N; j+=2UL )
2310 const size_t kbegin( ( IsUpper<MT4>::value )
2311 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2312 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2313 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2315 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2316 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2318 SIMDType xmm1, xmm2;
2321 for( ; k<kpos; k+=SIMDSIZE ) {
2322 const SIMDType a1( A.load(i,k) );
2323 xmm1 = xmm1 + a1 * B.load(k,j );
2324 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2327 (~C)(i,j ) +=
sum( xmm1 );
2328 (~C)(i,j+1UL) +=
sum( xmm2 );
2330 for( ; remainder && k<kend; ++k ) {
2331 (~C)(i,j ) += A(i,k) * B(k,j );
2332 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1 x 1 tile: last element of the leftover row; the k range ends at K here.
2338 const size_t kbegin( ( IsUpper<MT4>::value )
2339 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2340 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2342 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2343 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2348 for( ; k<kpos; k+=SIMDSIZE ) {
2349 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2352 (~C)(i,j) +=
sum( xmm1 );
2354 for( ; remainder && k<K; ++k ) {
2355 (~C)(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel (C += A * B), selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true; the target is a DenseMatrix with
// storage-order flag 'true'.
// Every element of C is formed as a dot product along k: A is loaded along k within a
// row (A.load(i,k)) and B along k within a column (B.load(k,j)). The products are
// accumulated in SIMD registers, reduced with sum(), and any k not covered by the SIMD
// loop is handled by a scalar loop.
// C is processed in tiles: 4 rows x 2 columns, 4 x 1, then 2 x 2, 2 x 1, and for a
// remaining single row 1 x 2 and 1 x 1.
// NOTE(review): this excerpt omits some lines of the function (braces, declarations
// and initialisation of i, j, k, and the guards in front of the leftover sections) -
// confirm against the complete file.
2378 template<
typename MT3
2381 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2382 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2384 const size_t M( A.rows() );
2385 const size_t N( B.columns() );
2386 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is not padded.
2388 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// --- Strips of four rows ------------------------------------------------------------
2392 for( ; (i+4UL) <= M; i+=4UL )
// 4 x 2 tile.
2396 for( ; (j+2UL) <= N; j+=2UL )
// First k that is needed given an upper A and/or lower B, masked with
// size_t(-SIMDSIZE) (rounds down to a SIMDSIZE multiple, assuming SIMDSIZE is a
// power of two).
2398 const size_t kbegin( ( IsUpper<MT4>::value )
2399 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2400 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// One past the last k needed by any element of the tile, given a lower A and/or
// upper B; K if neither applies.
2401 const size_t kend( ( IsLower<MT4>::value )
2402 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
2403 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// End of the SIMD loop: kend masked down when a scalar tail is required.
2405 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2406 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4 x 2 tile.
2408 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2411 for( ; k<kpos; k+=SIMDSIZE ) {
2412 const SIMDType a1( A.load(i ,k) );
2413 const SIMDType a2( A.load(i+1UL,k) );
2414 const SIMDType a3( A.load(i+2UL,k) );
2415 const SIMDType a4( A.load(i+3UL,k) );
2416 const SIMDType b1( B.load(k,j ) );
2417 const SIMDType b2( B.load(k,j+1UL) );
2418 xmm1 = xmm1 + a1 * b1;
2419 xmm2 = xmm2 + a1 * b2;
2420 xmm3 = xmm3 + a2 * b1;
2421 xmm4 = xmm4 + a2 * b2;
2422 xmm5 = xmm5 + a3 * b1;
2423 xmm6 = xmm6 + a3 * b2;
2424 xmm7 = xmm7 + a4 * b1;
2425 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator with sum() and add the result to C.
2428 (~C)(i ,j ) +=
sum( xmm1 );
2429 (~C)(i ,j+1UL) +=
sum( xmm2 );
2430 (~C)(i+1UL,j ) +=
sum( xmm3 );
2431 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2432 (~C)(i+2UL,j ) +=
sum( xmm5 );
2433 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
2434 (~C)(i+3UL,j ) +=
sum( xmm7 );
2435 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend).
2437 for( ; remainder && k<kend; ++k ) {
2438 (~C)(i ,j ) += A(i ,k) * B(k,j );
2439 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2440 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2441 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2442 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2443 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2444 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2445 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4 x 1 tile for a single leftover column (the guard for this section is not
// visible in this excerpt).
2451 const size_t kbegin( ( IsUpper<MT4>::value )
2452 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2453 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2454 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
2456 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2457 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2459 SIMDType xmm1, xmm2, xmm3, xmm4;
2462 for( ; k<kpos; k+=SIMDSIZE ) {
2463 const SIMDType b1( B.load(k,j) );
2464 xmm1 = xmm1 + A.load(i ,k) * b1;
2465 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2466 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2467 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2470 (~C)(i ,j) +=
sum( xmm1 );
2471 (~C)(i+1UL,j) +=
sum( xmm2 );
2472 (~C)(i+2UL,j) +=
sum( xmm3 );
2473 (~C)(i+3UL,j) +=
sum( xmm4 );
2475 for( ; remainder && k<kend; ++k ) {
2476 (~C)(i ,j) += A(i ,k) * B(k,j);
2477 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2478 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j);
2479 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j);
// --- Strips of two rows for what the 4-row loop left over ---------------------------
2484 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 2 tile.
2488 for( ; (j+2UL) <= N; j+=2UL )
2490 const size_t kbegin( ( IsUpper<MT4>::value )
2491 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2492 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2493 const size_t kend( ( IsLower<MT4>::value )
2494 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2495 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2497 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2498 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2500 SIMDType xmm1, xmm2, xmm3, xmm4;
2503 for( ; k<kpos; k+=SIMDSIZE ) {
2504 const SIMDType a1( A.load(i ,k) );
2505 const SIMDType a2( A.load(i+1UL,k) );
2506 const SIMDType b1( B.load(k,j ) );
2507 const SIMDType b2( B.load(k,j+1UL) );
2508 xmm1 = xmm1 + a1 * b1;
2509 xmm2 = xmm2 + a1 * b2;
2510 xmm3 = xmm3 + a2 * b1;
2511 xmm4 = xmm4 + a2 * b2;
2514 (~C)(i ,j ) +=
sum( xmm1 );
2515 (~C)(i ,j+1UL) +=
sum( xmm2 );
2516 (~C)(i+1UL,j ) +=
sum( xmm3 );
2517 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2519 for( ; remainder && k<kend; ++k ) {
2520 (~C)(i ,j ) += A(i ,k) * B(k,j );
2521 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2522 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2523 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2 x 1 tile for a single leftover column (guard not visible in this excerpt).
2529 const size_t kbegin( ( IsUpper<MT4>::value )
2530 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2531 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2532 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2534 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2535 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2537 SIMDType xmm1, xmm2;
2540 for( ; k<kpos; k+=SIMDSIZE ) {
2541 const SIMDType b1( B.load(k,j) );
2542 xmm1 = xmm1 + A.load(i ,k) * b1;
2543 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2546 (~C)(i ,j) +=
sum( xmm1 );
2547 (~C)(i+1UL,j) +=
sum( xmm2 );
2549 for( ; remainder && k<kend; ++k ) {
2550 (~C)(i ,j) += A(i ,k) * B(k,j);
2551 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// --- Single leftover row (guard not visible in this excerpt) ------------------------
// 1 x 2 tile.
2560 for( ; (j+2UL) <= N; j+=2UL )
2562 const size_t kbegin( ( IsUpper<MT4>::value )
2563 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2564 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2565 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2567 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
2568 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2570 SIMDType xmm1, xmm2;
2573 for( ; k<kpos; k+=SIMDSIZE ) {
2574 const SIMDType a1( A.load(i,k) );
2575 xmm1 = xmm1 + a1 * B.load(k,j );
2576 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2579 (~C)(i,j ) +=
sum( xmm1 );
2580 (~C)(i,j+1UL) +=
sum( xmm2 );
2582 for( ; remainder && k<kend; ++k ) {
2583 (~C)(i,j ) += A(i,k) * B(k,j );
2584 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1 x 1 tile: last element of the leftover row; the k range ends at K here.
2590 const size_t kbegin( ( IsUpper<MT4>::value )
2591 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
2592 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
2594 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
2595 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
2600 for( ; k<kpos; k+=SIMDSIZE ) {
2601 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2604 (~C)(i,j) +=
sum( xmm1 );
2606 for( ; remainder && k<K; ++k ) {
2607 (~C)(i,j) += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false: forwards unchanged to the default
// (scalar) addition-assignment kernel.
2629 template<
typename MT3
2632 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2633 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2635 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel for a vectorizable target with storage-order
// flag 'false'. As far as this excerpt shows, it simply delegates to the small-matrix
// kernel, passing the derived matrix obtained via operator~.
2655 template<
typename MT3
2658 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2659 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2662 selectSmallAddAssignKernel( ~C, A, B );
// Large-matrix addition-assignment kernel for a vectorizable target with storage-order
// flag 'true'. As far as this excerpt shows, it simply delegates to the small-matrix
// kernel, passing the derived matrix obtained via operator~.
2682 template<
typename MT3
2685 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2686 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2689 selectSmallAddAssignKernel( ~C, A, B );
// BLAS-path addition-assignment kernel, selected when UseBlasKernel<MT3,MT4,MT5> is
// false: no BLAS routine is used and the call is forwarded to the large-matrix kernel.
2708 template<
typename MT3
2711 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2712 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2714 selectLargeAddAssignKernel( C, A, B );
2720 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel (C += A * B), selected when
// UseBlasKernel<MT3,MT4,MT5> is true.
//  - Triangular A: a temporary is initialised from B, multiplied from the left by A
//    via trmm(), and then added to C.
//  - Triangular B: a temporary is initialised from A, multiplied from the right by B
//    via trmm(), and then added to C.
//  - Otherwise gemm() is called on C directly with both scalar arguments ET(1).
// NOTE(review): the final 'else' and the braces are not present in this excerpt -
// confirm against the complete file.
2734 template<
typename MT3
2737 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2738 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2740 typedef ElementType_<MT3> ET;
2742 if( IsTriangular<MT4>::value ) {
2743 ResultType_<MT3> tmp(
serial( B ) );
// The lower/upper flag passed to trmm() follows the structure of A.
2744 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2745 addAssign( C, tmp );
2747 else if( IsTriangular<MT5>::value ) {
2748 ResultType_<MT3> tmp(
serial( A ) );
// The lower/upper flag passed to trmm() follows the structure of B.
2749 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2750 addAssign( C, tmp );
2753 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of the multiplication expression to a dense matrix
// (lhs -= rhs.lhs_ * rhs.rhs_).
// Both operands are wrapped in serial() and bound to LT/RT objects A and B, then the
// work is handed to selectSubAssignKernel().
// NOTE(review): the body of the early-out 'if' and any lines between the numbered
// statements are not present in this excerpt - confirm against the complete file.
2777 template<
typename MT
2779 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Degenerate case: empty target or zero inner dimension.
2786 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2790 LT A(
serial( rhs.lhs_ ) );
2791 RT B(
serial( rhs.rhs_ ) );
2800 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A * B.
// The small-matrix kernel is used when either operand is diagonal or when the number
// of elements of C (rows * columns) is below DMATTDMATMULT_THRESHOLD; the BLAS kernel
// selector is the alternative.
// NOTE(review): the 'else' between the two calls is not present in this excerpt -
// confirm against the complete file.
2816 template<
typename MT3
2819 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2821 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
2822 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2823 selectSmallSubAssignKernel( C, A, B );
2825 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel for C -= A * B where neither A nor B is
// diagonal; the target is a DenseMatrix with storage-order flag 'false'.
// Loops over i (rows), j (columns) and k (inner dimension). All three ranges are
// narrowed at compile time from the (strictly) lower/upper traits of A and B, so
// elements known to be zero are skipped. The k loop is unrolled by two.
// NOTE(review): this excerpt omits some lines of the function (braces, the fallback
// branches of several ternaries, early-continue checks and the guard before the
// trailing update) - confirm against the complete file.
2844 template<
typename MT3
2847 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2848 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2850 const size_t M( A.rows() );
2851 const size_t N( B.columns() );
2852 const size_t K( A.columns() );
// Row range: the first row(s) are skipped for a strictly lower A (two if B is
// strictly lower as well and M > 1).
2854 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
2855 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
// The last row(s) are skipped for a strictly upper A (two if B is strictly upper as
// well and M > 1).
2857 const size_t iend( ( IsStrictlyUpper<MT4>::value )
2858 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
2862 for(
size_t i=ibegin; i<iend; ++i )
// Column range for row i: if both operands are upper, columns start at the
// diagonal, shifted right by one for each strictly upper operand.
2864 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2865 ?( ( IsStrictlyUpper<MT4>::value )
2866 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
2867 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
2868 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
// If both operands are lower, columns end at the diagonal, shifted left by one for
// each strictly lower operand.
2869 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
2870 ?( ( IsStrictlyLower<MT4>::value )
2871 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
2872 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
2873 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
2876 for(
size_t j=jbegin; j<jend; ++j )
// Inner range start: bounded below by row i for an upper A and by column j for a
// lower B (one further for the strict variants).
2878 const size_t kbegin( ( IsUpper<MT4>::value )
2879 ?( ( IsLower<MT5>::value )
2880 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2881 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2882 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2883 :( ( IsLower<MT5>::value )
2884 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Inner range end: bounded above by row i for a lower A and by column j for an
// upper B (one less for the strict variants).
2886 const size_t kend( ( IsLower<MT4>::value )
2887 ?( ( IsUpper<MT5>::value )
2888 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2889 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2890 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2891 :( ( IsUpper<MT5>::value )
2892 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
// number so the loop below can handle two k values per iteration.
2896 const size_t knum( kend - kbegin );
2897 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2899 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2900 (~C)(i,j) -= A(i,k ) * B(k ,j);
2901 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably only executed when the count is odd; the
// guarding condition is not visible here).
2904 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction-assignment kernel for C -= A * B where neither A nor B is
// diagonal; the target is a DenseMatrix with storage-order flag 'true'.
// Loops over j (columns), i (rows) and k (inner dimension). All three ranges are
// narrowed at compile time from the (strictly) lower/upper traits of A and B, so
// elements known to be zero are skipped. The k loop is unrolled by two.
// NOTE(review): this excerpt omits some lines of the function (braces, the fallback
// branches of several ternaries, early-continue checks and the guard before the
// trailing update) - confirm against the complete file.
2926 template<
typename MT3
2929 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2930 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2932 const size_t M( A.rows() );
2933 const size_t N( B.columns() );
2934 const size_t K( A.columns() );
// Column range: the first column(s) are skipped for a strictly upper B (two if A is
// strictly upper as well and N > 1).
2936 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
2937 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
// The last column(s) are skipped for a strictly lower B (two if A is strictly lower
// as well and N > 1).
2939 const size_t jend( ( IsStrictlyLower<MT5>::value )
2940 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
2944 for(
size_t j=jbegin; j<jend; ++j )
// Row range for column j: if both operands are lower, rows start at the diagonal,
// shifted down by one for each strictly lower operand.
2946 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
2947 ?( ( IsStrictlyLower<MT4>::value )
2948 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
2949 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2950 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
// If both operands are upper, rows end at the diagonal, shifted up by one for each
// strictly upper operand.
2951 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2952 ?( ( IsStrictlyUpper<MT4>::value )
2953 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
2954 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
2955 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
2958 for(
size_t i=ibegin; i<iend; ++i )
// Inner range start: bounded below by row i for an upper A and by column j for a
// lower B (one further for the strict variants).
2960 const size_t kbegin( ( IsUpper<MT4>::value )
2961 ?( ( IsLower<MT5>::value )
2962 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2963 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2964 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2965 :( ( IsLower<MT5>::value )
2966 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Inner range end: bounded above by row i for a lower A and by column j for an
// upper B (one less for the strict variants).
2968 const size_t kend( ( IsLower<MT4>::value )
2969 ?( ( IsUpper<MT5>::value )
2970 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2971 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2972 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2973 :( ( IsUpper<MT5>::value )
2974 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
// number so the loop below can handle two k values per iteration.
2978 const size_t knum( kend - kbegin );
2979 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2981 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2982 (~C)(i,j) -= A(i,k ) * B(k ,j);
2983 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably only executed when the count is odd; the
// guarding condition is not visible here).
2986 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction-assignment kernel for C -= A * B where A is not diagonal and B
// is diagonal; the target is a DenseMatrix with storage-order flag 'false'.
// With a diagonal B only the term k == j contributes, so each update below is
// C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): this excerpt omits some lines of the function (braces, the fallback
// branches of the ternaries, and the guard before the trailing update) - confirm
// against the complete file.
3008 template<
typename MT3
3011 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3012 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3014 const size_t M( A.rows() );
3015 const size_t N( B.columns() );
3017 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure of A: an upper A
// starts at the diagonal (one past it if strictly upper).
3019 const size_t jbegin( ( IsUpper<MT4>::value )
3020 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// A lower A ends at the diagonal (excluding it if strictly lower).
3022 const size_t jend( ( IsLower<MT4>::value )
3023 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
// number so the loop below can handle two columns per iteration.
3027 const size_t jnum( jend - jbegin );
3028 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3030 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3031 (~C)(i,j ) -= A(i,j ) * B(j ,j );
3032 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at jpos (presumably only executed when the count is odd; the
// guarding condition is not visible here).
3035 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel for C -= A * B where A is not diagonal and B
// is diagonal; the target is a DenseMatrix with storage-order flag 'true'.
// The index space is walked in BLOCK_SIZE x BLOCK_SIZE tiles (jj/ii are the tile
// origins); each element update is C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): some lines of the function (braces, ternary fallback branches) are not
// present in this excerpt - confirm against the complete file.
3056 template<
typename MT3
3059 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
3060 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3062 const size_t M( A.rows() );
3063 const size_t N( B.columns() );
3065 const size_t block( BLOCK_SIZE );
3067 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the tile's column end to the matrix extent.
3068 const size_t jend(
min( N, jj+block ) );
3069 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the tile's row end to the matrix extent.
3070 const size_t iend(
min( M, ii+block ) );
3071 for(
size_t j=jj; j<jend; ++j )
// For a lower A, rows above the diagonal of column j are skipped; the start is
// additionally clamped to the tile origin ii.
3073 const size_t ibegin( ( IsLower<MT4>::value )
3074 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// For an upper A, rows below the diagonal of column j are skipped; the end is
// additionally clamped to the tile end iend.
3076 const size_t ipos( ( IsUpper<MT4>::value )
3077 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
3080 for(
size_t i=ibegin; i<ipos; ++i ) {
3081 (~C)(i,j) -= A(i,j) * B(j,j);
// Default subtraction-assignment kernel for C -= A * B where A is diagonal and B is
// not diagonal; the target is a DenseMatrix with storage-order flag 'false'.
// The index space is walked in BLOCK_SIZE x BLOCK_SIZE tiles (ii/jj are the tile
// origins); with a diagonal A only k == i contributes, so each element update is
// C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): some lines of the function (braces, ternary fallback branches) are not
// present in this excerpt - confirm against the complete file.
3104 template<
typename MT3
3107 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
3108 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3110 const size_t M( A.rows() );
3111 const size_t N( B.columns() );
3113 const size_t block( BLOCK_SIZE );
3115 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the tile's row end to the matrix extent.
3116 const size_t iend(
min( M, ii+block ) );
3117 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the tile's column end to the matrix extent.
3118 const size_t jend(
min( N, jj+block ) );
3119 for(
size_t i=ii; i<iend; ++i )
// For an upper B, columns left of the diagonal of row i are skipped; the start is
// additionally clamped to the tile origin jj.
3121 const size_t jbegin( ( IsUpper<MT5>::value )
3122 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// For a lower B, columns right of the diagonal of row i are skipped; the end is
// additionally clamped to the tile end jend.
3124 const size_t jpos( ( IsLower<MT5>::value )
3125 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
3128 for(
size_t j=jbegin; j<jpos; ++j ) {
3129 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel for C -= A * B where A is diagonal and B is
// not diagonal; the target is a DenseMatrix with storage-order flag 'true'.
// With a diagonal A only the term k == i contributes, so each update below is
// C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): this excerpt omits some lines of the function (braces, the fallback
// branches of the ternaries, and the guard before the trailing update) - confirm
// against the complete file.
3152 template<
typename MT3
3155 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
3156 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3158 const size_t M( A.rows() );
3159 const size_t N( B.columns() );
3161 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of B: a lower B
// starts at the diagonal (one past it if strictly lower).
3163 const size_t ibegin( ( IsLower<MT5>::value )
3164 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// An upper B ends at the diagonal (excluding it if strictly upper).
3166 const size_t iend( ( IsUpper<MT5>::value )
3167 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. rounds the count down to an even
// number so the loop below can handle two rows per iteration.
3171 const size_t inum( iend - ibegin );
3172 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3174 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3175 (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3176 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at ipos (presumably only executed when the count is odd; the
// guarding condition is not visible here).
3179 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel for C -= A * B where both A and B are
// diagonal. Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every
// row of A.
3200 template<
typename MT3
3203 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
3204 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3206 for(
size_t i=0UL; i<A.rows(); ++i ) {
3207 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false: forwards to the default (scalar)
// subtraction-assignment kernel.
// NOTE(review): C is passed through operator~ here, whereas the addition-assignment
// counterpart passes C directly - presumably equivalent for a dense matrix type;
// confirm.
3227 template<
typename MT3
3230 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3231 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3233 selectDefaultSubAssignKernel( ~C, A, B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B), selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true; the target is a DenseMatrix with
// storage-order flag 'false'.
// Every element of the product is formed as a dot product along k: A is loaded along k
// within a row (A.load(i,k)) and B along k within a column (B.load(k,j)). The products
// are accumulated in SIMD registers, reduced with sum() and subtracted from C; any k
// not covered by the SIMD loop is handled by a scalar loop.
// C is processed in tiles: 2 rows x 4 columns, 2 x 2, 2 x 1, then for a remaining
// single row 1 x 4, 1 x 2 and 1 x 1.
// NOTE(review): this excerpt omits some lines of the function (braces, declarations
// and initialisation of i, j, k, and the guards in front of the leftover sections) -
// confirm against the complete file.
3253 template<
typename MT3
3256 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3257 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3259 const size_t M( A.rows() );
3260 const size_t N( B.columns() );
3261 const size_t K( A.columns() );
// A scalar tail over k is only needed if at least one operand is not padded.
3263 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// --- Strips of two rows -------------------------------------------------------------
3267 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 4 tile.
3271 for( ; (j+4UL) <= N; j+=4UL )
// First k that is needed given an upper A and/or lower B, masked with
// size_t(-SIMDSIZE) (rounds down to a SIMDSIZE multiple, assuming SIMDSIZE is a
// power of two).
3273 const size_t kbegin( ( IsUpper<MT4>::value )
3274 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3275 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// One past the last k needed by any element of the tile, given a lower A and/or
// upper B; K if neither applies.
3276 const size_t kend( ( IsLower<MT4>::value )
3277 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3278 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the SIMD loop: kend masked down when a scalar tail is required.
3280 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3281 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2 x 4 tile.
3283 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3286 for( ; k<kpos; k+=SIMDSIZE ) {
3287 const SIMDType a1( A.load(i ,k) );
3288 const SIMDType a2( A.load(i+1UL,k) );
3289 const SIMDType b1( B.load(k,j ) );
3290 const SIMDType b2( B.load(k,j+1UL) );
3291 const SIMDType b3( B.load(k,j+2UL) );
3292 const SIMDType b4( B.load(k,j+3UL) );
3293 xmm1 = xmm1 + a1 * b1;
3294 xmm2 = xmm2 + a1 * b2;
3295 xmm3 = xmm3 + a1 * b3;
3296 xmm4 = xmm4 + a1 * b4;
3297 xmm5 = xmm5 + a2 * b1;
3298 xmm6 = xmm6 + a2 * b2;
3299 xmm7 = xmm7 + a2 * b3;
3300 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator with sum() and subtract the result from C.
3303 (~C)(i ,j ) -=
sum( xmm1 );
3304 (~C)(i ,j+1UL) -=
sum( xmm2 );
3305 (~C)(i ,j+2UL) -=
sum( xmm3 );
3306 (~C)(i ,j+3UL) -=
sum( xmm4 );
3307 (~C)(i+1UL,j ) -=
sum( xmm5 );
3308 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
3309 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
3310 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar tail for the k values in [kpos, kend).
3312 for( ; remainder && k<kend; ++k ) {
3313 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3314 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3315 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3316 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3317 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3318 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3319 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3320 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2 x 2 tile for the columns left over by the 4-column loop.
3324 for( ; (j+2UL) <= N; j+=2UL )
3326 const size_t kbegin( ( IsUpper<MT4>::value )
3327 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3328 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3329 const size_t kend( ( IsLower<MT4>::value )
3330 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3331 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3333 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3334 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3336 SIMDType xmm1, xmm2, xmm3, xmm4;
3339 for( ; k<kpos; k+=SIMDSIZE ) {
3340 const SIMDType a1( A.load(i ,k) );
3341 const SIMDType a2( A.load(i+1UL,k) );
3342 const SIMDType b1( B.load(k,j ) );
3343 const SIMDType b2( B.load(k,j+1UL) );
3344 xmm1 = xmm1 + a1 * b1;
3345 xmm2 = xmm2 + a1 * b2;
3346 xmm3 = xmm3 + a2 * b1;
3347 xmm4 = xmm4 + a2 * b2;
3350 (~C)(i ,j ) -=
sum( xmm1 );
3351 (~C)(i ,j+1UL) -=
sum( xmm2 );
3352 (~C)(i+1UL,j ) -=
sum( xmm3 );
3353 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3355 for( ; remainder && k<kend; ++k ) {
3356 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3357 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3358 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3359 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2 x 1 tile for a single leftover column (the guard for this section is not
// visible in this excerpt).
3365 const size_t kbegin( ( IsUpper<MT4>::value )
3366 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3367 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3368 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3370 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3371 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3373 SIMDType xmm1, xmm2;
3376 for( ; k<kpos; k+=SIMDSIZE ) {
3377 const SIMDType b1( B.load(k,j) );
3378 xmm1 = xmm1 + A.load(i ,k) * b1;
3379 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3382 (~C)(i ,j) -=
sum( xmm1 );
3383 (~C)(i+1UL,j) -=
sum( xmm2 );
3385 for( ; remainder && k<kend; ++k ) {
3386 (~C)(i ,j) -= A(i ,k) * B(k,j);
3387 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// --- Single leftover row (guard not visible in this excerpt) ------------------------
// 1 x 4 tile.
3396 for( ; (j+4UL) <= N; j+=4UL )
3398 const size_t kbegin( ( IsUpper<MT4>::value )
3399 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3400 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3401 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
3403 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3404 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3406 SIMDType xmm1, xmm2, xmm3, xmm4;
3409 for( ; k<kpos; k+=SIMDSIZE ) {
3410 const SIMDType a1( A.load(i,k) );
3411 xmm1 = xmm1 + a1 * B.load(k,j );
3412 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3413 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3414 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3417 (~C)(i,j ) -=
sum( xmm1 );
3418 (~C)(i,j+1UL) -=
sum( xmm2 );
3419 (~C)(i,j+2UL) -=
sum( xmm3 );
3420 (~C)(i,j+3UL) -=
sum( xmm4 );
3422 for( ; remainder && k<kend; ++k ) {
3423 (~C)(i,j ) -= A(i,k) * B(k,j );
3424 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3425 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3426 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1 x 2 tile.
3430 for( ; (j+2UL) <= N; j+=2UL )
3432 const size_t kbegin( ( IsUpper<MT4>::value )
3433 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3434 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3435 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3437 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3438 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3440 SIMDType xmm1, xmm2;
3443 for( ; k<kpos; k+=SIMDSIZE ) {
3444 const SIMDType a1( A.load(i,k) );
3445 xmm1 = xmm1 + a1 * B.load(k,j );
3446 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3449 (~C)(i,j ) -=
sum( xmm1 );
3450 (~C)(i,j+1UL) -=
sum( xmm2 );
3452 for( ; remainder && k<kend; ++k ) {
3453 (~C)(i,j ) -= A(i,k) * B(k,j );
3454 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1 x 1 tile: last element of the leftover row; the k range ends at K here.
3460 const size_t kbegin( ( IsUpper<MT4>::value )
3461 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3462 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3464 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3465 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3470 for( ; k<kpos; k+=SIMDSIZE ) {
3471 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3474 (~C)(i,j) -=
sum( xmm1 );
3476 for( ; remainder && k<K; ++k ) {
3477 (~C)(i,j) -= A(i,k) * B(k,j);
// Vectorized "small" kernel for the subtraction assignment C -= A * B.
// Selected via EnableIf_ when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and the target is a
// DenseMatrix<MT3,true> (NOTE(review): 'true' presumably denotes a column-major target - confirm).
//
// Every element C(i,j) is updated with the dot product over k of A(i,k) and B(k,j). The bulk of
// the k range is processed with SIMD loads (A.load(i,k), B.load(k,j)) accumulated in xmm registers
// and reduced with sum(); a scalar loop handles the remaining k values when 'remainder' is set.
// The (i,j) space is tiled as 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 blocks.
//
// NOTE(review): this listing omits several original lines (braces, the declarations of i, j and k,
// and the guards in front of the single-column / single-row tiles); comments below only describe
// what the visible lines do.
3500 template<
typename MT3
3503 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3504 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// M x K times K x N problem dimensions.
3506 const size_t M( A.rows() );
3507 const size_t N( B.columns() );
3508 const size_t K( A.columns() );
// A scalar tail loop over k is needed unless both operands are padded.
3510 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows of C at a time ----
3514 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
3518 for( ; (j+2UL) <= N; j+=2UL )
// First k index: i for an upper A, max(i,j) if B is additionally lower, j for a lower B only,
// otherwise 0. The mask size_t(-SIMDSIZE) rounds the index down to a multiple of SIMDSIZE
// (assuming SIMDSIZE is a power of two, as the assertion on kpos below implies).
3520 const size_t kbegin( ( IsUpper<MT4>::value )
3521 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3522 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// One-past-last k index: bounded by the tile's last row (lower A) and/or last column (upper B),
// otherwise the full inner dimension K.
3523 const size_t kend( ( IsLower<MT4>::value )
3524 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3525 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// End of the SIMD part: kend rounded down to a multiple of SIMDSIZE when a scalar tail is needed.
3527 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3528 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
// NOTE(review): presumably zero-initialized by SIMDType's default constructor - confirm.
3530 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// SIMD part (the declaration of k is not visible in this excerpt).
3533 for( ; k<kpos; k+=SIMDSIZE ) {
3534 const SIMDType a1( A.load(i ,k) );
3535 const SIMDType a2( A.load(i+1UL,k) );
3536 const SIMDType a3( A.load(i+2UL,k) );
3537 const SIMDType a4( A.load(i+3UL,k) );
3538 const SIMDType b1( B.load(k,j ) );
3539 const SIMDType b2( B.load(k,j+1UL) );
3540 xmm1 = xmm1 + a1 * b1;
3541 xmm2 = xmm2 + a1 * b2;
3542 xmm3 = xmm3 + a2 * b1;
3543 xmm4 = xmm4 + a2 * b2;
3544 xmm5 = xmm5 + a3 * b1;
3545 xmm6 = xmm6 + a3 * b2;
3546 xmm7 = xmm7 + a4 * b1;
3547 xmm8 = xmm8 + a4 * b2;
// Horizontal reduction of each accumulator, subtracted from the target element.
3550 (~C)(i ,j ) -=
sum( xmm1 );
3551 (~C)(i ,j+1UL) -=
sum( xmm2 );
3552 (~C)(i+1UL,j ) -=
sum( xmm3 );
3553 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3554 (~C)(i+2UL,j ) -=
sum( xmm5 );
3555 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
3556 (~C)(i+3UL,j ) -=
sum( xmm7 );
3557 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar tail for k in [kpos,kend); only executed for unpadded operands.
3559 for( ; remainder && k<kend; ++k ) {
3560 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3561 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3562 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3563 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3564 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3565 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3566 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3567 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: rows i..i+3, single column j (the enclosing guard is not visible in this excerpt).
3573 const size_t kbegin( ( IsUpper<MT4>::value )
3574 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3575 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3576 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
3578 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3579 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3581 SIMDType xmm1, xmm2, xmm3, xmm4;
3584 for( ; k<kpos; k+=SIMDSIZE ) {
3585 const SIMDType b1( B.load(k,j) );
3586 xmm1 = xmm1 + A.load(i ,k) * b1;
3587 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3588 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3589 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3592 (~C)(i ,j) -=
sum( xmm1 );
3593 (~C)(i+1UL,j) -=
sum( xmm2 );
3594 (~C)(i+2UL,j) -=
sum( xmm3 );
3595 (~C)(i+3UL,j) -=
sum( xmm4 );
3597 for( ; remainder && k<kend; ++k ) {
3598 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3599 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3600 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3601 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// ---- Two rows of C at a time ----
3606 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
3610 for( ; (j+2UL) <= N; j+=2UL )
3612 const size_t kbegin( ( IsUpper<MT4>::value )
3613 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3614 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3615 const size_t kend( ( IsLower<MT4>::value )
3616 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3617 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3619 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3620 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3622 SIMDType xmm1, xmm2, xmm3, xmm4;
3625 for( ; k<kpos; k+=SIMDSIZE ) {
3626 const SIMDType a1( A.load(i ,k) );
3627 const SIMDType a2( A.load(i+1UL,k) );
3628 const SIMDType b1( B.load(k,j ) );
3629 const SIMDType b2( B.load(k,j+1UL) );
3630 xmm1 = xmm1 + a1 * b1;
3631 xmm2 = xmm2 + a1 * b2;
3632 xmm3 = xmm3 + a2 * b1;
3633 xmm4 = xmm4 + a2 * b2;
3636 (~C)(i ,j ) -=
sum( xmm1 );
3637 (~C)(i ,j+1UL) -=
sum( xmm2 );
3638 (~C)(i+1UL,j ) -=
sum( xmm3 );
3639 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3641 for( ; remainder && k<kend; ++k ) {
3642 (~C)(i ,j ) -= A(i ,k) * B(k,j );
3643 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3644 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3645 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: rows i..i+1, single column j (the enclosing guard is not visible in this excerpt).
3651 const size_t kbegin( ( IsUpper<MT4>::value )
3652 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3653 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3654 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3656 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3657 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3659 SIMDType xmm1, xmm2;
3662 for( ; k<kpos; k+=SIMDSIZE ) {
3663 const SIMDType b1( B.load(k,j) );
3664 xmm1 = xmm1 + A.load(i ,k) * b1;
3665 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3668 (~C)(i ,j) -=
sum( xmm1 );
3669 (~C)(i+1UL,j) -=
sum( xmm2 );
3671 for( ; remainder && k<kend; ++k ) {
3672 (~C)(i ,j) -= A(i ,k) * B(k,j);
3673 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single remaining row of C (the enclosing guard is not visible in this excerpt) ----
// 1x2 tile: row i, columns j..j+1.
3681 for( ; (j+2UL) <= N; j+=2UL )
3683 const size_t kbegin( ( IsUpper<MT4>::value )
3684 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3685 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3686 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3688 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
3689 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3691 SIMDType xmm1, xmm2;
3694 for( ; k<kpos; k+=SIMDSIZE ) {
3695 const SIMDType a1( A.load(i,k) );
3696 xmm1 = xmm1 + a1 * B.load(k,j );
3697 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3700 (~C)(i,j ) -=
sum( xmm1 );
3701 (~C)(i,j+1UL) -=
sum( xmm2 );
3703 for( ; remainder && k<kend; ++k ) {
3704 (~C)(i,j ) -= A(i,k) * B(k,j );
3705 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: the single remaining element C(i,j); the k range always ends at K here.
3711 const size_t kbegin( ( IsUpper<MT4>::value )
3712 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
3713 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
3715 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
3716 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
3721 for( ; k<kpos; k+=SIMDSIZE ) {
3722 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3725 (~C)(i,j) -=
sum( xmm1 );
3727 for( ; remainder && k<K; ++k ) {
3728 (~C)(i,j) -= A(i,k) * B(k,j);
// Fallback "large" subtraction-assignment kernel (C -= A * B).
// Enabled (DisableIf_) when UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold; it simply
// forwards to the default kernel. Part of the template parameter list is not visible in this excerpt.
3750 template<
typename MT3
3753 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3754 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3756 selectDefaultSubAssignKernel( ~C, A, B );
// Vectorized "large" subtraction-assignment kernel (C -= A * B) for a DenseMatrix<MT3,false> target
// (NOTE(review): 'false' presumably denotes row-major storage - confirm).
// The visible body does not implement a dedicated large kernel: it delegates to the small
// vectorized kernel for the same target.
3776 template<
typename MT3
3779 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3780 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3783 selectSmallSubAssignKernel( ~C, A, B );
// Vectorized "large" subtraction-assignment kernel (C -= A * B) for a DenseMatrix<MT3,true> target
// (NOTE(review): 'true' presumably denotes column-major storage - confirm).
// The visible body does not implement a dedicated large kernel: it delegates to the small
// vectorized kernel for the same target.
3803 template<
typename MT3
3806 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3807 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3810 selectSmallSubAssignKernel( ~C, A, B );
// Fallback for the BLAS-based subtraction-assignment kernel (C -= A * B).
// Enabled (DisableIf_) when UseBlasKernel<MT3,MT4,MT5> does NOT hold; forwards to the large kernel.
3829 template<
typename MT3
3832 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3833 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3835 selectLargeSubAssignKernel( C, A, B );
3841 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel (C -= A * B), enabled when UseBlasKernel<MT3,MT4,MT5>
// holds. Three cases are visible:
//  - A triangular: a temporary copy of B is multiplied in place from the left by A via trmm()
//    and then subtracted from C.
//  - B triangular: a temporary copy of A is multiplied in place from the right by B via trmm()
//    and then subtracted from C.
//  - otherwise: a single gemm() call with the scalars ET(-1) and ET(1).
// NOTE(review): the 'else' in front of the gemm() call and the closing braces are not visible in
// this excerpt; ET(-1)/ET(1) are presumably the alpha/beta factors (C = -A*B + C) - confirm
// against the gemm() wrapper's signature.
3855 template<
typename MT3
3858 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3859 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Element type of the target, used to build the scalar factors.
3861 typedef ElementType_<MT3> ET;
3863 if( IsTriangular<MT4>::value ) {
// tmp starts as a (serially evaluated) copy of B and is overwritten by the triangular product.
3864 ResultType_<MT3> tmp(
serial( B ) );
3865 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3866 subAssign( C, tmp );
3868 else if( IsTriangular<MT5>::value ) {
// tmp starts as a (serially evaluated) copy of A and is overwritten by the triangular product.
3869 ResultType_<MT3> tmp(
serial( A ) );
3870 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3871 subAssign( C, tmp );
3874 gemm( C, A, B, ET(-1), ET(1) );
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are not visible in this excerpt (presumably
// the SMP assignment of the product to a dense matrix - confirm).
// Visible logic: a target with zero rows or columns is handled first; otherwise the case of a
// zero inner dimension (rhs.lhs_.columns() == 0) is handled separately. Both branch bodies are
// not shown.
3908 template<
typename MT
3910 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3918 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3921 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are not visible in this excerpt (presumably
// the SMP assignment of the product to a sparse matrix - confirm).
// Visible logic: the expression is first evaluated into a temporary whose type is OppositeType
// when SO is true and ResultType otherwise; how tmp is used afterwards is not shown.
3956 template<
typename MT
3958 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3963 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
3975 const TmpType tmp( rhs );
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are not visible in this excerpt (presumably
// the SMP addition assignment of the product to a dense matrix - confirm).
// Visible logic: special handling when the target has zero rows or columns or when the inner
// dimension (rhs.lhs_.columns()) is zero; the branch body is not shown.
3997 template<
typename MT
3999 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
4007 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are not visible in this excerpt (presumably
// the SMP subtraction assignment of the product to a dense matrix - confirm).
// Visible logic: special handling when the target has zero rows or columns or when the inner
// dimension (rhs.lhs_.columns()) is zero; the branch body is not shown.
4046 template<
typename MT
4048 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
4056 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4116 template<
typename MT1
4120 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
4121 ,
private MatScalarMultExpr
4122 ,
private Computation
// Shorthand type aliases used throughout this specialization.
// MMM: the wrapped matrix/matrix multiplication expression; RES: its result type.
4126 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
4127 typedef ResultType_<MMM> RES;
// RT1/RT2: result types of the left/right matrix operand.
4128 typedef ResultType_<MT1>
RT1;
4129 typedef ResultType_<MT2>
RT2;
// ET1/ET2: element types of those result types.
4130 typedef ElementType_<RT1>
ET1;
4131 typedef ElementType_<RT2>
ET2;
// CT1/CT2: composite types of the left/right matrix operand.
4132 typedef CompositeType_<MT1>
CT1;
4133 typedef CompositeType_<MT2>
CT2;
// Compile-time flag: true if the left operand MT1 is a computation or requires evaluation.
4138 enum :
bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Compile-time flag: true if the right operand MT2 is a computation or requires evaluation.
4143 enum :
bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper predicate used in EnableIf_ conditions: 'value' is true when at least one of the two
// operands must be evaluated (evaluateLeft || evaluateRight). The template parameters T1..T3 are
// not referenced by the visible definition.
4151 template<
typename T1,
typename T2,
typename T3 >
4152 struct IsEvaluationRequired {
4153 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate deciding whether the BLAS kernel may be used for target type T1,
// operand types T2/T3 and scalar type T4.
// NOTE(review): the line opening the 'value' enumerator is not visible in this excerpt, so the
// conjunction below may have additional leading terms.
// Visible requirements:
//  - T1 offers mutable data access, T2 and T3 offer const data access;
//  - neither operand is diagonal;
//  - all three types are SIMD-enabled;
//  - all three element types are BLAS compatible and identical;
//  - the combination "builtin element type of T1 with a complex scalar T4" is excluded.
4161 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4162 struct UseBlasKernel {
4164 HasMutableDataAccess<T1>::value &&
4165 HasConstDataAccess<T2>::value &&
4166 HasConstDataAccess<T3>::value &&
4167 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4168 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4169 IsBLASCompatible< ElementType_<T1> >::value &&
4170 IsBLASCompatible< ElementType_<T2> >::value &&
4171 IsBLASCompatible< ElementType_<T3> >::value &&
4172 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
4173 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
4174 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Compile-time predicate deciding whether the vectorized default kernels may be used for target
// type T1, operand types T2/T3 and scalar type T4.
// NOTE(review): the line opening the 'value' enumerator and the remaining arguments of
// AreSIMDCombinable are not visible in this excerpt.
// Visible requirements: neither operand is diagonal, all three types are SIMD-enabled, the element
// types are SIMD combinable, and SIMD addition and multiplication exist for the operand element types.
4182 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4183 struct UseVectorizedDefaultKernel {
4185 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4186 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4187 AreSIMDCombinable< ElementType_<T1>
4191 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
4192 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Type of this expression specialization (scaled matrix/matrix product).
4198 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// SIMD pack type corresponding to ElementType (ElementType itself is declared outside this excerpt).
4203 typedef SIMDTrait_<ElementType>
SIMDType;
// Type of the wrapped matrix product operand (stored in matrix_).
4208 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left matrix operand inside the kernels: an evaluated result (const RT1) if
// evaluation is required, the composite type CT1 otherwise.
4214 typedef IfTrue_< evaluateLeft, const RT1, CT1 >
LT;
// Same selection for the right matrix operand.
4217 typedef IfTrue_< evaluateRight, const RT2, CT2 >
RT;
// Compile-time flag for SIMD support: requires two non-diagonal, SIMD-enabled operands whose
// element types (together with the scalar type ST) are SIMD combinable and support SIMD
// addition and multiplication.
4222 enum :
bool { simdEnabled = !IsDiagonal<MT1>::value && !IsDiagonal<MT2>::value &&
4223 MT1::simdEnabled && MT2::simdEnabled &&
4224 AreSIMDCombinable<ET1,ET2,ST>::value &&
4225 HasSIMDAdd<ET1,ET2>::value &&
4226 HasSIMDMult<ET1,ET2>::value };
// Compile-time flag for SMP assignment: requires that neither operand needs evaluation and
// that both operand types are themselves SMP assignable.
4229 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4230 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix product expression and the scalar factor.
// NOTE(review): the member initializer list and body are not visible in this excerpt; presumably
// the arguments initialize matrix_ and scalar_ - confirm.
4244 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// 2D element access: returns element (i,j) of the wrapped matrix product multiplied by the scalar.
// No bounds check is visible in this excerpt.
4257 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4260 return matrix_(i,j) * scalar_;
// Checked element access: validates i against the number of rows and j against the number of
// columns before forwarding to operator().
// NOTE(review): the bodies of the two range checks are not visible in this excerpt (presumably
// they throw an out-of-range exception - confirm).
4272 inline ReturnType
at(
size_t i,
size_t j )
const {
4273 if( i >= matrix_.rows() ) {
4276 if( j >= matrix_.columns() ) {
4279 return (*
this)(i,j);
// Returns the number of rows, forwarded from the wrapped matrix product.
4288 inline size_t rows()
const {
4289 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped matrix product.
4298 inline size_t columns()
const {
4299 return matrix_.columns();
// Returns whether the expression can alias with the given address; the query is forwarded
// unchanged to the wrapped matrix product.
4329 template<
typename T >
4330 inline bool canAlias(
const T* alias )
const {
4331 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address; the query is forwarded
// unchanged to the wrapped matrix product.
4341 template<
typename T >
4342 inline bool isAliased(
const T* alias )
const {
4343 return matrix_.isAliased( alias );
// Body fragment (enclosing signature not visible in this excerpt): forwards the alignment query
// to the wrapped matrix product.
4353 return matrix_.isAligned();
// Expression fragment (enclosing signature and the start of the condition are not visible in
// this excerpt; presumably the return statement of canSMPAssign() - confirm).
// Visible terms: the element count rows()*columns() is compared against DMATTDMATMULT_THRESHOLD
// (below) as part of a parenthesized sub-condition, and the whole condition additionally
// requires the element count to reach SMP_DMATTDMATMULT_THRESHOLD.
4364 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4365 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
// Operands of the scaled expression: the wrapped matrix product and the scalar factor.
// (RightOperand is declared outside this excerpt.)
4371 LeftOperand matrix_;
4372 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix: lhs = (A * B) * scalar.
// Visible steps:
//  1. fetch the left and right operands of the wrapped product;
//  2. handle a target with zero rows or columns;
//  3. handle a zero inner dimension (left.columns() == 0) separately;
//  4. dispatch to selectAssignKernel() with the operands A, B and the scalar.
// NOTE(review): the branch bodies and the definitions of A and B are not visible in this
// excerpt (presumably A and B are the evaluated/composite forms of left and right - confirm).
4387 template<
typename MT
4389 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4396 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4397 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
4399 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4402 else if( left.columns() == 0UL ) {
4417 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for C = (A * B) * scalar.
// The small kernel is chosen when either operand is diagonal or when the number of elements of C
// is below DMATTDMATMULT_THRESHOLD; the BLAS kernel path is chosen otherwise.
// NOTE(review): the 'else' between the two calls is not visible in this excerpt.
4432 template<
typename MT3
4436 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4438 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
4439 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4440 selectSmallAssignKernel( C, A, B, scalar );
4442 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for C = (A * B) * scalar, enabled when neither MT4 nor MT5 is
// diagonal, for a DenseMatrix<MT3,false> target (NOTE(review): presumably row-major - confirm).
// Iterates row by row; the structure traits (IsUpper/IsLower/IsStrictly...) of the operands
// narrow the row range [ibegin,iend), the column range [jbegin,jend) and the inner range
// [kbegin,kend) that are actually computed.
// NOTE(review): several original lines are missing from this excerpt: braces, the ':' alternatives
// of some conditional expressions, and the bodies of the loops over the excluded rows/columns
// (presumably they reset the corresponding elements of C - confirm).
4460 template<
typename MT3
4464 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4465 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4467 const size_t M( A.rows() );
4468 const size_t N( B.columns() );
4469 const size_t K( A.columns() );
// First computed row: 1 for a strictly lower A, 2 if B is strictly lower as well (and M > 1).
4471 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
4472 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
// One-past-last computed row: M-1 for a strictly upper A, M-2 if B is strictly upper as well (and M > 1).
4474 const size_t iend( ( IsStrictlyUpper<MT4>::value )
4475 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
// Leading rows [0,ibegin) (loop body not visible).
4479 for(
size_t i=0UL; i<ibegin; ++i ) {
4480 for(
size_t j=0UL; j<N; ++j ) {
// Computed rows.
4484 for(
size_t i=ibegin; i<iend; ++i )
// Column range for row i: starts at or after the diagonal if both operands are upper,
// ends at or before the diagonal if both operands are lower.
4486 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4487 ?( ( IsStrictlyUpper<MT4>::value )
4488 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
4489 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
4490 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
4491 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4492 ?( ( IsStrictlyLower<MT4>::value )
4493 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
4494 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
4495 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
// Leading columns [0,jbegin) of row i (loop body not visible).
4498 for(
size_t j=0UL; j<jbegin; ++j ) {
4501 for(
size_t j=jbegin; j<jend; ++j )
// Inner range for element (i,j), restricted by the structure of row i of A and column j of B.
4503 const size_t kbegin( ( IsUpper<MT4>::value )
4504 ?( ( IsLower<MT5>::value )
4505 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4506 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4507 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4508 :( ( IsLower<MT5>::value )
4509 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4511 const size_t kend( ( IsLower<MT4>::value )
4512 ?( ( IsUpper<MT5>::value )
4513 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4514 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4515 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4516 :( ( IsUpper<MT5>::value )
4517 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Dot product: initialize with the first term, accumulate the rest, then scale once.
4521 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4522 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4523 (~C)(i,j) += A(i,k) * B(k,j);
4525 (~C)(i,j) *= scalar;
// Trailing columns [jend,N) of row i (loop body not visible).
4527 for(
size_t j=jend; j<N; ++j ) {
// Trailing rows [iend,M) (loop body not visible).
4531 for(
size_t i=iend; i<M; ++i ) {
4532 for(
size_t j=0UL; j<N; ++j ) {
// Default (non-vectorized) kernel for C = (A * B) * scalar, enabled when neither MT4 nor MT5 is
// diagonal, for a DenseMatrix<MT3,true> target (NOTE(review): presumably column-major - confirm).
// Iterates column by column; the structure traits of the operands narrow the column range
// [jbegin,jend), the row range [ibegin,iend) and the inner range [kbegin,kend) that are computed.
// NOTE(review): several original lines are missing from this excerpt: braces, the ':' alternatives
// of some conditional expressions, and the bodies of the loops over the excluded rows/columns
// (presumably they reset the corresponding elements of C - confirm).
4553 template<
typename MT3
4557 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4558 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4560 const size_t M( A.rows() );
4561 const size_t N( B.columns() );
4562 const size_t K( A.columns() );
// First computed column: 1 for a strictly upper B, 2 if A is strictly upper as well (and N > 1).
4564 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
4565 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
// One-past-last computed column: N-1 for a strictly lower B, N-2 if A is strictly lower as well (and N > 1).
4567 const size_t jend( ( IsStrictlyLower<MT5>::value )
4568 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
// Leading columns [0,jbegin) (loop body not visible).
4572 for(
size_t j=0UL; j<jbegin; ++j ) {
4573 for(
size_t i=0UL; i<M; ++i ) {
// Computed columns.
4577 for(
size_t j=jbegin; j<jend; ++j )
// Row range for column j: starts at or below the diagonal if both operands are lower,
// ends at or above the diagonal if both operands are upper.
4579 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4580 ?( ( IsStrictlyLower<MT4>::value )
4581 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
4582 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4583 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
4584 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4585 ?( ( IsStrictlyUpper<MT4>::value )
4586 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
4587 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
4588 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
// Leading rows [0,ibegin) of column j (loop body not visible).
4591 for(
size_t i=0UL; i<ibegin; ++i ) {
4594 for(
size_t i=ibegin; i<iend; ++i )
// Inner range for element (i,j), restricted by the structure of row i of A and column j of B.
4596 const size_t kbegin( ( IsUpper<MT4>::value )
4597 ?( ( IsLower<MT5>::value )
4598 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4599 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4600 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4601 :( ( IsLower<MT5>::value )
4602 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4604 const size_t kend( ( IsLower<MT4>::value )
4605 ?( ( IsUpper<MT5>::value )
4606 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4607 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4608 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4609 :( ( IsUpper<MT5>::value )
4610 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Dot product: initialize with the first term, accumulate the rest, then scale once.
4614 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4615 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4616 (~C)(i,j) += A(i,k) * B(k,j);
4618 (~C)(i,j) *= scalar;
// Trailing rows [iend,M) of column j (loop body not visible).
4620 for(
size_t i=iend; i<M; ++i ) {
// Trailing columns [jend,N) (loop body not visible).
4624 for(
size_t j=jend; j<N; ++j ) {
4625 for(
size_t i=0UL; i<M; ++i ) {
// Default kernel for C = (A * B) * scalar when B is diagonal and A is not, for a
// DenseMatrix<MT3,false> target (NOTE(review): presumably row-major - confirm).
// With a diagonal B each element reduces to C(i,j) = A(i,j) * B(j,j) * scalar.
// The column range per row is narrowed by the triangular structure of A.
// NOTE(review): the ':' alternatives of jbegin/jend and the bodies of the loops over the excluded
// columns are not visible in this excerpt (presumably those elements are reset - confirm).
4646 template<
typename MT3
4650 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4651 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4653 const size_t M( A.rows() );
4654 const size_t N( B.columns() );
4656 for(
size_t i=0UL; i<M; ++i )
// For an upper A the computed columns start at the diagonal (one past it if strictly upper).
4658 const size_t jbegin( ( IsUpper<MT4>::value )
4659 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// For a lower A the computed columns end at the diagonal (before it if strictly lower).
4661 const size_t jend( ( IsLower<MT4>::value )
4662 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Columns left of the computed range only exist for an upper A.
4666 if( IsUpper<MT4>::value ) {
4667 for(
size_t j=0UL; j<jbegin; ++j ) {
4671 for(
size_t j=jbegin; j<jend; ++j ) {
4672 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Columns right of the computed range only exist for a lower A.
4674 if( IsLower<MT4>::value ) {
4675 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = (A * B) * scalar when B is diagonal and A is not, for a
// DenseMatrix<MT3,true> target (NOTE(review): presumably column-major - confirm).
// With a diagonal B each element reduces to C(i,j) = A(i,j) * B(j,j) * scalar.
// The (i,j) space is traversed in square blocks of edge length BLOCK_SIZE; inside each block
// the row range per column is narrowed by the triangular structure of A.
// NOTE(review): the ':' alternatives of ibegin/ipos and the bodies of the loops over the excluded
// rows are not visible in this excerpt (presumably those elements are reset - confirm).
4697 template<
typename MT3
4701 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4702 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4704 const size_t M( A.rows() );
4705 const size_t N( B.columns() );
4707 const size_t block( BLOCK_SIZE );
// Column blocks [jj,jend).
4709 for(
size_t jj=0UL; jj<N; jj+=block ) {
4710 const size_t jend(
min( N, jj+block ) );
// Row blocks [ii,iend).
4711 for(
size_t ii=0UL; ii<M; ii+=block ) {
4712 const size_t iend(
min( M, ii+block ) );
4713 for(
size_t j=jj; j<jend; ++j )
// For a lower A the computed rows start at the diagonal (below it if strictly lower),
// but never before the block start ii.
4715 const size_t ibegin( ( IsLower<MT4>::value )
4716 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// For an upper A the computed rows end at the diagonal (above it if strictly upper),
// but never past the block end iend.
4718 const size_t ipos( ( IsUpper<MT4>::value )
4719 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
// Rows of the block above the computed range only exist for a lower A.
4722 if( IsLower<MT4>::value ) {
4723 for(
size_t i=ii; i<ibegin; ++i ) {
4727 for(
size_t i=ibegin; i<ipos; ++i ) {
4728 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Rows of the block below the computed range only exist for an upper A.
4730 if( IsUpper<MT4>::value ) {
4731 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for C = (A * B) * scalar when A is diagonal and B is not, for a
// DenseMatrix<MT3,false> target (NOTE(review): presumably row-major - confirm).
// With a diagonal A each element reduces to C(i,j) = A(i,i) * B(i,j) * scalar.
// The (i,j) space is traversed in square blocks of edge length BLOCK_SIZE; inside each block
// the column range per row is narrowed by the triangular structure of B.
// NOTE(review): the ':' alternatives of jbegin/jpos and the bodies of the loops over the excluded
// columns are not visible in this excerpt (presumably those elements are reset - confirm).
4755 template<
typename MT3
4759 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4760 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4762 const size_t M( A.rows() );
4763 const size_t N( B.columns() );
4765 const size_t block( BLOCK_SIZE );
// Row blocks [ii,iend).
4767 for(
size_t ii=0UL; ii<M; ii+=block ) {
4768 const size_t iend(
min( M, ii+block ) );
// Column blocks [jj,jend).
4769 for(
size_t jj=0UL; jj<N; jj+=block ) {
4770 const size_t jend(
min( N, jj+block ) );
4771 for(
size_t i=ii; i<iend; ++i )
// For an upper B the computed columns start at the diagonal (right of it if strictly upper),
// but never before the block start jj.
4773 const size_t jbegin( ( IsUpper<MT5>::value )
4774 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// For a lower B the computed columns end at the diagonal (left of it if strictly lower),
// but never past the block end jend.
4776 const size_t jpos( ( IsLower<MT5>::value )
4777 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
// Columns of the block left of the computed range only exist for an upper B.
4780 if( IsUpper<MT5>::value ) {
4781 for(
size_t j=jj; j<jbegin; ++j ) {
4785 for(
size_t j=jbegin; j<jpos; ++j ) {
4786 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Columns of the block right of the computed range only exist for a lower B.
4788 if( IsLower<MT5>::value ) {
4789 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for C = (A * B) * scalar when A is diagonal and B is not, for a
// DenseMatrix<MT3,true> target (NOTE(review): presumably column-major - confirm).
// With a diagonal A each element reduces to C(i,j) = A(i,i) * B(i,j) * scalar.
// The row range per column is narrowed by the triangular structure of B.
// NOTE(review): the ':' alternatives of ibegin/iend and the bodies of the loops over the excluded
// rows are not visible in this excerpt (presumably those elements are reset - confirm).
4813 template<
typename MT3
4817 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4818 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4820 const size_t M( A.rows() );
4821 const size_t N( B.columns() );
4823 for(
size_t j=0UL; j<N; ++j )
// For a lower B the computed rows start at the diagonal (below it if strictly lower).
4825 const size_t ibegin( ( IsLower<MT5>::value )
4826 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// For an upper B the computed rows end at the diagonal (above it if strictly upper).
4828 const size_t iend( ( IsUpper<MT5>::value )
4829 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Rows above the computed range only exist for a lower B.
4833 if( IsLower<MT5>::value ) {
4834 for(
size_t i=0UL; i<ibegin; ++i ) {
4838 for(
size_t i=ibegin; i<iend; ++i ) {
4839 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Rows below the computed range only exist for an upper B.
4841 if( IsUpper<MT5>::value ) {
4842 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = (A * B) * scalar when both A and B are diagonal.
// Only the diagonal of C is written: C(i,i) = A(i,i) * B(i,i) * scalar for every row of A.
// NOTE(review): lines between the signature and the loop are not visible in this excerpt
// (presumably C is reset beforehand so that off-diagonal elements are zero - confirm).
4864 template<
typename MT3
4868 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4869 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4873 for(
size_t i=0UL; i<A.rows(); ++i ) {
4874 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback "small" assignment kernel for C = (A * B) * scalar.
// Enabled (DisableIf_) when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold; it simply
// forwards to the default kernel.
4893 template<
typename MT3
4897 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4898 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4900 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized "small" kernel for the scaled assignment C = (A * B) * scalar.
// Selected via EnableIf_ when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and the target is
// a DenseMatrix<MT3,false> (NOTE(review): 'false' presumably denotes a row-major target - confirm).
//
// Every element C(i,j) is the dot product over k of A(i,k) and B(k,j). The bulk of the k range is
// processed with SIMD loads (A.load(i,k), B.load(k,j)) accumulated in xmm registers; the reduced
// sum is scaled and ASSIGNED to C, after which a scalar loop ADDS the scaled products of the
// remaining k values when 'remainder' is set.
// The (i,j) space is tiled as 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 blocks.
//
// NOTE(review): this listing omits several original lines (braces, the declarations of i, j and k,
// and the guards in front of the single-column / single-row tiles); comments below only describe
// what the visible lines do.
4919 template<
typename MT3
4923 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4924 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M x K times K x N problem dimensions.
4926 const size_t M( A.rows() );
4927 const size_t N( B.columns() );
4928 const size_t K( A.columns() );
// A scalar tail loop over k is needed unless both operands are padded.
4930 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows of C at a time ----
4934 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
4938 for( ; (j+4UL) <= N; j+=4UL )
// First k index: i for an upper A, max(i,j) if B is additionally lower, j for a lower B only,
// otherwise 0. The mask size_t(-SIMDSIZE) rounds the index down to a multiple of SIMDSIZE
// (assuming SIMDSIZE is a power of two, as the assertion on kpos below implies).
4940 const size_t kbegin( ( IsUpper<MT4>::value )
4941 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
4942 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
// One-past-last k index: bounded by the tile's last row (lower A) and/or last column (upper B),
// otherwise the full inner dimension K.
4943 const size_t kend( ( IsLower<MT4>::value )
4944 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
4945 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the SIMD part: kend rounded down to a multiple of SIMDSIZE when a scalar tail is needed.
4947 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
4948 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
// NOTE(review): presumably zero-initialized by SIMDType's default constructor - confirm.
4950 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// SIMD part (the declaration of k is not visible in this excerpt).
4953 for( ; k<kpos; k+=SIMDSIZE ) {
4954 const SIMDType a1( A.load(i ,k) );
4955 const SIMDType a2( A.load(i+1UL,k) );
4956 const SIMDType b1( B.load(k,j ) );
4957 const SIMDType b2( B.load(k,j+1UL) );
4958 const SIMDType b3( B.load(k,j+2UL) );
4959 const SIMDType b4( B.load(k,j+3UL) );
4960 xmm1 = xmm1 + a1 * b1;
4961 xmm2 = xmm2 + a1 * b2;
4962 xmm3 = xmm3 + a1 * b3;
4963 xmm4 = xmm4 + a1 * b4;
4964 xmm5 = xmm5 + a2 * b1;
4965 xmm6 = xmm6 + a2 * b2;
4966 xmm7 = xmm7 + a2 * b3;
4967 xmm8 = xmm8 + a2 * b4;
// Horizontal reduction of each accumulator, scaled and assigned (overwriting C).
4970 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
4971 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
4972 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
4973 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
4974 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
4975 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
4976 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
4977 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail for k in [kpos,kend); only executed for unpadded operands.
4979 for( ; remainder && k<kend; ++k ) {
4980 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
4981 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
4982 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
4983 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
4984 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
4985 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
4986 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
4987 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile: rows i..i+1, columns j..j+1.
4991 for( ; (j+2UL) <= N; j+=2UL )
4993 const size_t kbegin( ( IsUpper<MT4>::value )
4994 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
4995 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
4996 const size_t kend( ( IsLower<MT4>::value )
4997 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
4998 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5000 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5001 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5003 SIMDType xmm1, xmm2, xmm3, xmm4;
5006 for( ; k<kpos; k+=SIMDSIZE ) {
5007 const SIMDType a1( A.load(i ,k) );
5008 const SIMDType a2( A.load(i+1UL,k) );
5009 const SIMDType b1( B.load(k,j ) );
5010 const SIMDType b2( B.load(k,j+1UL) );
5011 xmm1 = xmm1 + a1 * b1;
5012 xmm2 = xmm2 + a1 * b2;
5013 xmm3 = xmm3 + a2 * b1;
5014 xmm4 = xmm4 + a2 * b2;
5017 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5018 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5019 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5020 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5022 for( ; remainder && k<kend; ++k ) {
5023 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5024 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5025 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5026 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i..i+1, single column j (the enclosing guard is not visible in this excerpt).
5032 const size_t kbegin( ( IsUpper<MT4>::value )
5033 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5034 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5035 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5037 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5038 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5040 SIMDType xmm1, xmm2;
5043 for( ; k<kpos; k+=SIMDSIZE ) {
5044 const SIMDType b1( B.load(k,j) );
5045 xmm1 = xmm1 + A.load(i ,k) * b1;
5046 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5049 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5050 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5052 for( ; remainder && k<kend; ++k ) {
5053 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5054 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single remaining row of C (the enclosing guard is not visible in this excerpt) ----
// 1x4 tile: row i, columns j..j+3.
5063 for( ; (j+4UL) <= N; j+=4UL )
5065 const size_t kbegin( ( IsUpper<MT4>::value )
5066 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5067 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5068 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
5070 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5071 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5073 SIMDType xmm1, xmm2, xmm3, xmm4;
5076 for( ; k<kpos; k+=SIMDSIZE ) {
5077 const SIMDType a1( A.load(i,k) );
5078 xmm1 = xmm1 + a1 * B.load(k,j );
5079 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5080 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
5081 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
5084 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5085 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5086 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
5087 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
5089 for( ; remainder && k<kend; ++k ) {
5090 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5091 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5092 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5093 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile: row i, columns j..j+1.
5097 for( ; (j+2UL) <= N; j+=2UL )
5099 const size_t kbegin( ( IsUpper<MT4>::value )
5100 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5101 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5102 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5104 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5105 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5107 SIMDType xmm1, xmm2;
5110 for( ; k<kpos; k+=SIMDSIZE ) {
5111 const SIMDType a1( A.load(i,k) );
5112 xmm1 = xmm1 + a1 * B.load(k,j );
5113 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5116 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5117 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5119 for( ; remainder && k<kend; ++k ) {
5120 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5121 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single remaining element C(i,j); the k range always ends at K here.
5127 const size_t kbegin( ( IsUpper<MT4>::value )
5128 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5129 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5131 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5132 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5137 for( ; k<kpos; k+=SIMDSIZE ) {
5138 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5141 (~C)(i,j) =
sum( xmm1 ) * scalar;
5143 for( ; remainder && k<K; ++k ) {
5144 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix kernel for the scaled assignment C = scalar * A * B where the
// target C is column-major (DenseMatrix<MT3,true>). The overload is enabled through
// EnableIf_ when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// C is processed in tiles of 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements. For every tile the
// products A(i,k)*B(k,j) are accumulated along k in SIMD registers, reduced with sum(),
// multiplied by scalar and stored with '='; a scalar loop then adds the remaining k values.
//
// NOTE(review): this listing omits some lines of the original file (braces and the
// declarations/initialisation of i, j and k are not visible here).
5166 template<
typename MT3
5170 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5171 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5173 const size_t M( A.rows() );
5174 const size_t N( B.columns() );
5175 const size_t K( A.columns() );
// A scalar k-tail is only required if at least one of the two operands is not padded.
5177 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows of C at a time ----
5181 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
5185 for( ; (j+2UL) <= N; j+=2UL )
// k-range of the tile. It is narrowed when A and/or B is lower/upper triangular; the start
// is masked with size_t(-SIMDSIZE) (the assertion below equates that mask with rounding
// down to a multiple of SIMDSIZE).
5187 const size_t kbegin( ( IsUpper<MT4>::value )
5188 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5189 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5190 const size_t kend( ( IsLower<MT4>::value )
5191 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5192 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// kpos is the end of the vectorized part; with padded operands the SIMD loop covers the
// whole range up to kend.
5194 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5195 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
5197 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5200 for( ; k<kpos; k+=SIMDSIZE ) {
5201 const SIMDType a1( A.load(i ,k) );
5202 const SIMDType a2( A.load(i+1UL,k) );
5203 const SIMDType a3( A.load(i+2UL,k) );
5204 const SIMDType a4( A.load(i+3UL,k) );
5205 const SIMDType b1( B.load(k,j ) );
5206 const SIMDType b2( B.load(k,j+1UL) );
5207 xmm1 = xmm1 + a1 * b1;
5208 xmm2 = xmm2 + a1 * b2;
5209 xmm3 = xmm3 + a2 * b1;
5210 xmm4 = xmm4 + a2 * b2;
5211 xmm5 = xmm5 + a3 * b1;
5212 xmm6 = xmm6 + a3 * b2;
5213 xmm7 = xmm7 + a4 * b1;
5214 xmm8 = xmm8 + a4 * b2;
// Horizontal reduction of each accumulator, scaled and stored (plain assignment).
5217 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5218 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5219 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5220 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5221 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
5222 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5223 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
5224 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail for k in [kpos,kend), added on top of the stored values.
5226 for( ; remainder && k<kend; ++k ) {
5227 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5228 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5229 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5230 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5231 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5232 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5233 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5234 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile: rows i..i+3 of a single column j.
// NOTE(review): the condition guarding this section is not visible in this listing.
5240 const size_t kbegin( ( IsUpper<MT4>::value )
5241 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5242 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5243 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
5245 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5246 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5248 SIMDType xmm1, xmm2, xmm3, xmm4;
5251 for( ; k<kpos; k+=SIMDSIZE ) {
5252 const SIMDType b1( B.load(k,j) );
5253 xmm1 = xmm1 + A.load(i ,k) * b1;
5254 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5255 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
5256 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
5259 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5260 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5261 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
5262 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
5264 for( ; remainder && k<kend; ++k ) {
5265 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5266 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5267 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5268 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows of C at a time ----
5273 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
5277 for( ; (j+2UL) <= N; j+=2UL )
5279 const size_t kbegin( ( IsUpper<MT4>::value )
5280 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5281 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5282 const size_t kend( ( IsLower<MT4>::value )
5283 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5284 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5286 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5287 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5289 SIMDType xmm1, xmm2, xmm3, xmm4;
5292 for( ; k<kpos; k+=SIMDSIZE ) {
5293 const SIMDType a1( A.load(i ,k) );
5294 const SIMDType a2( A.load(i+1UL,k) );
5295 const SIMDType b1( B.load(k,j ) );
5296 const SIMDType b2( B.load(k,j+1UL) );
5297 xmm1 = xmm1 + a1 * b1;
5298 xmm2 = xmm2 + a1 * b2;
5299 xmm3 = xmm3 + a2 * b1;
5300 xmm4 = xmm4 + a2 * b2;
5303 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5304 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5305 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5306 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5308 for( ; remainder && k<kend; ++k ) {
5309 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5310 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5311 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5312 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i..i+1 of a single column j.
// NOTE(review): the condition guarding this section is not visible in this listing.
5318 const size_t kbegin( ( IsUpper<MT4>::value )
5319 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5320 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5321 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5323 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5324 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5326 SIMDType xmm1, xmm2;
5329 for( ; k<kpos; k+=SIMDSIZE ) {
5330 const SIMDType b1( B.load(k,j) );
5331 xmm1 = xmm1 + A.load(i ,k) * b1;
5332 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5335 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5336 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5338 for( ; remainder && k<kend; ++k ) {
5339 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
5340 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row of C ----
// 1x2 tile: row i, columns j..j+1.
// NOTE(review): the condition guarding the single-row section is not visible in this listing.
5349 for( ; (j+2UL) <= N; j+=2UL )
5351 const size_t kbegin( ( IsUpper<MT4>::value )
5352 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5353 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5354 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5356 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5357 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5359 SIMDType xmm1, xmm2;
5362 for( ; k<kpos; k+=SIMDSIZE ) {
5363 const SIMDType a1( A.load(i,k) );
5364 xmm1 = xmm1 + a1 * B.load(k,j );
5365 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5368 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5369 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5371 for( ; remainder && k<kend; ++k ) {
5372 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
5373 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j); here the k-range always ends at K.
5379 const size_t kbegin( ( IsUpper<MT4>::value )
5380 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5381 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5383 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
5384 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
5389 for( ; k<kpos; k+=SIMDSIZE ) {
5390 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5393 (~C)(i,j) =
sum( xmm1 ) * scalar;
5395 for( ; remainder && k<K; ++k ) {
5396 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Fallback "large" kernel for the scaled assignment: selected through DisableIf_ when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments
// unchanged to selectDefaultAssignKernel().
5417 template<
typename MT3
5421 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5422 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5424 selectDefaultAssignKernel( C, A, B, scalar );
// "Large" kernel for the scaled assignment to a row-major dense target
// (DenseMatrix<MT3,false>), enabled through EnableIf_ when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The only statement visible here forwards to selectSmallAssignKernel() on the unwrapped
// target (~C).
// NOTE(review): original lines around that call are missing from this listing, so the
// call may be one branch of a larger body - confirm against the full file.
5443 template<
typename MT3
5447 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5448 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5451 selectSmallAssignKernel( ~C, A, B, scalar );
// "Large" kernel for the scaled assignment to a column-major dense target
// (DenseMatrix<MT3,true>), enabled through EnableIf_ when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The only statement visible here forwards to selectSmallAssignKernel() on the unwrapped
// target (~C).
// NOTE(review): original lines around that call are missing from this listing, so the
// call may be one branch of a larger body - confirm against the full file.
5470 template<
typename MT3
5474 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5475 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5478 selectSmallAssignKernel( ~C, A, B, scalar );
// Non-BLAS fallback for the scaled assignment: selected through DisableIf_ when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments unchanged to
// selectLargeAssignKernel().
5496 template<
typename MT3
5500 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5501 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5503 selectLargeAssignKernel( C, A, B, scalar );
5508 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based kernel for the scaled assignment, enabled through EnableIf_ when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds. The scalar is converted to the element type of C
// before it is handed to the BLAS wrappers.
//  - A triangular: trmm() from the left with A, using CblasLower/CblasUpper according to
//    IsLower<MT4>.
//  - otherwise B triangular: trmm() from the right with B, using CblasLower/CblasUpper
//    according to IsLower<MT5>.
//  - otherwise: gemm() with the factors ET(scalar) and ET(0) (presumably alpha and beta
//    in the usual BLAS sense - confirm against the gemm wrapper).
// NOTE(review): lines of the original between the visible statements are missing from this
// listing (e.g. whatever prepares C before the trmm() calls).
5522 template<
typename MT3
5526 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5527 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5529 typedef ElementType_<MT3> ET;
5531 if( IsTriangular<MT4>::value ) {
5533 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5535 else if( IsTriangular<MT5>::value ) {
5537 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5540 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated serially into a dense temporary and that temporary is
// then assigned to the unwrapped target (~lhs). The temporary's type is picked by
// IfTrue_ from OppositeType and ResultType depending on the storage-order flag SO.
5558 template<
typename MT
5560 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5564 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
5576 const TmpType tmp(
serial( rhs ) );
5577 assign( ~lhs, tmp );
// Addition assignment of the scaled matrix product to a dense matrix (lhs += rhs).
// Fetches both operands of the wrapped multiplication expression, tests for an empty
// problem (no rows or columns in lhs, or a zero inner dimension) and finally dispatches to
// selectAddAssignKernel() with the scalar stored in rhs.
// NOTE(review): the body of the emptiness check and the definitions of A and B are not
// visible in this listing.
5593 template<
typename MT
5595 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5602 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5603 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5605 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5619 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled addition assignment C += scalar * A * B.
// The small kernel is used when either operand is diagonal or when the number of elements
// of C is below DMATTDMATMULT_THRESHOLD; the BLAS kernel selection is the alternative.
// NOTE(review): the 'else' joining the two calls is not visible in this listing.
5634 template<
typename MT3
5638 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5640 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
5641 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5642 selectSmallAddAssignKernel( C, A, B, scalar );
5644 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the case that neither A nor B is diagonal.
// The complete scaled product A * B * scalar is evaluated serially into a temporary of
// type ResultType, which is then added to C.
5662 template<
typename MT3
5666 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
5667 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5669 const ResultType tmp(
serial( A * B * scalar ) );
5670 addAssign( C, tmp );
// Default addition-assignment kernel for a general A and a diagonal B, row-major target.
// Only the diagonal of B is read, so every update is
// C(i,j) += A(i,j) * B(j,j) * scalar.
// Rows are traversed one by one; the column range [jbegin,jend) depends on the triangular
// structure of A and is walked in steps of two, followed by one update at jpos.
// NOTE(review): the fall-back branches of jbegin/jend and the condition guarding the final
// jpos update are not visible in this listing.
5688 template<
typename MT3
5692 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5693 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5695 const size_t M( A.rows() );
5696 const size_t N( B.columns() );
5698 for(
size_t i=0UL; i<M; ++i )
5700 const size_t jbegin( ( IsUpper<MT4>::value )
5701 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5703 const size_t jend( ( IsLower<MT4>::value )
5704 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin plus the column count rounded down to an even number (mask size_t(-2)).
5708 const size_t jnum( jend - jbegin );
5709 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5711 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5712 (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5713 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
5716 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition-assignment kernel for a general A and a diagonal B, column-major target.
// Performs C(i,j) += A(i,j) * B(j,j) * scalar inside a blocked double loop: the column and
// row ranges are cut into chunks of BLOCK_SIZE, and within each chunk the row range of a
// column j is narrowed according to the triangular structure of A.
// NOTE(review): the fall-back branches of ibegin/ipos are not visible in this listing.
5736 template<
typename MT3
5740 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5741 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5743 const size_t M( A.rows() );
5744 const size_t N( B.columns() );
5746 const size_t block( BLOCK_SIZE );
5748 for(
size_t jj=0UL; jj<N; jj+=block ) {
5749 const size_t jend(
min( N, jj+block ) );
5750 for(
size_t ii=0UL; ii<M; ii+=block ) {
5751 const size_t iend(
min( M, ii+block ) );
5752 for(
size_t j=jj; j<jend; ++j )
5754 const size_t ibegin( ( IsLower<MT4>::value )
5755 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
5757 const size_t ipos( ( IsUpper<MT4>::value )
5758 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
5761 for(
size_t i=ibegin; i<ipos; ++i ) {
5762 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition-assignment kernel for a diagonal A and a general B, row-major target.
// Only the diagonal of A is read, so every update is
// C(i,j) += A(i,i) * B(i,j) * scalar.
// The row and column ranges are cut into chunks of BLOCK_SIZE; within each chunk the column
// range of a row i is narrowed according to the triangular structure of B.
// NOTE(review): the fall-back branches of jbegin/jpos are not visible in this listing.
5784 template<
typename MT3
5788 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5789 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5791 const size_t M( A.rows() );
5792 const size_t N( B.columns() );
5794 const size_t block( BLOCK_SIZE );
5796 for(
size_t ii=0UL; ii<M; ii+=block ) {
5797 const size_t iend(
min( M, ii+block ) );
5798 for(
size_t jj=0UL; jj<N; jj+=block ) {
5799 const size_t jend(
min( N, jj+block ) );
5800 for(
size_t i=ii; i<iend; ++i )
5802 const size_t jbegin( ( IsUpper<MT5>::value )
5803 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
5805 const size_t jpos( ( IsLower<MT5>::value )
5806 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
5809 for(
size_t j=jbegin; j<jpos; ++j ) {
5810 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition-assignment kernel for a diagonal A and a general B, column-major target.
// Performs C(i,j) += A(i,i) * B(i,j) * scalar column by column. The row range
// [ibegin,iend) of a column depends on the triangular structure of B and is walked in
// steps of two, followed by one update at ipos.
// NOTE(review): the fall-back branches of ibegin/iend and the condition guarding the final
// ipos update are not visible in this listing.
5832 template<
typename MT3
5836 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5837 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5839 const size_t M( A.rows() );
5840 const size_t N( B.columns() );
5842 for(
size_t j=0UL; j<N; ++j )
5844 const size_t ibegin( ( IsLower<MT5>::value )
5845 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5847 const size_t iend( ( IsUpper<MT5>::value )
5848 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number (mask size_t(-2)).
5852 const size_t inum( iend - ibegin );
5853 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5855 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5856 (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5857 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
5860 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition-assignment kernel for the case that both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
5880 template<
typename MT3
5884 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5885 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5887 for(
size_t i=0UL; i<A.rows(); ++i ) {
5888 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback "small" kernel for the scaled addition assignment: selected through DisableIf_
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all
// arguments unchanged to selectDefaultAddAssignKernel().
5907 template<
typename MT3
5911 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5912 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5914 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the scaled addition assignment C += scalar * A * B
// where the target C is row-major (DenseMatrix<MT3,false>). The overload is enabled through
// EnableIf_ when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// C is processed in tiles of 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 elements. For every tile the
// products A(i,k)*B(k,j) are accumulated along k in SIMD registers, reduced with sum(),
// multiplied by scalar and added to C; a scalar loop then adds the remaining k values.
//
// NOTE(review): this listing omits some lines of the original file (braces and the
// declarations/initialisation of i, j and k are not visible here).
5933 template<
typename MT3
5937 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5938 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5940 const size_t M( A.rows() );
5941 const size_t N( B.columns() );
5942 const size_t K( A.columns() );
// A scalar k-tail is only required if at least one of the two operands is not padded.
5944 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Two rows of C at a time ----
5948 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
5952 for( ; (j+4UL) <= N; j+=4UL )
// k-range of the tile. It is narrowed when A and/or B is lower/upper triangular; the start
// is masked with size_t(-SIMDSIZE) (the assertion below equates that mask with rounding
// down to a multiple of SIMDSIZE).
5954 const size_t kbegin( ( IsUpper<MT4>::value )
5955 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
5956 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
5957 const size_t kend( ( IsLower<MT4>::value )
5958 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5959 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// kpos is the end of the vectorized part; with padded operands the SIMD loop covers the
// whole range up to kend.
5961 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
5962 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
5964 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5967 for( ; k<kpos; k+=SIMDSIZE ) {
5968 const SIMDType a1( A.load(i ,k) );
5969 const SIMDType a2( A.load(i+1UL,k) );
5970 const SIMDType b1( B.load(k,j ) );
5971 const SIMDType b2( B.load(k,j+1UL) );
5972 const SIMDType b3( B.load(k,j+2UL) );
5973 const SIMDType b4( B.load(k,j+3UL) );
5974 xmm1 = xmm1 + a1 * b1;
5975 xmm2 = xmm2 + a1 * b2;
5976 xmm3 = xmm3 + a1 * b3;
5977 xmm4 = xmm4 + a1 * b4;
5978 xmm5 = xmm5 + a2 * b1;
5979 xmm6 = xmm6 + a2 * b2;
5980 xmm7 = xmm7 + a2 * b3;
5981 xmm8 = xmm8 + a2 * b4;
// Horizontal reduction of each accumulator, scaled and added to the existing C values.
5984 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
5985 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
5986 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
5987 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
5988 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
5989 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
5990 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
5991 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// Scalar tail for k in [kpos,kend).
5993 for( ; remainder && k<kend; ++k ) {
5994 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5995 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5996 (~C)(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5997 (~C)(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5998 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5999 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6000 (~C)(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6001 (~C)(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile: rows i..i+1, columns j..j+1.
6005 for( ; (j+2UL) <= N; j+=2UL )
6007 const size_t kbegin( ( IsUpper<MT4>::value )
6008 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6009 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6010 const size_t kend( ( IsLower<MT4>::value )
6011 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6012 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6014 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6015 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6017 SIMDType xmm1, xmm2, xmm3, xmm4;
6020 for( ; k<kpos; k+=SIMDSIZE ) {
6021 const SIMDType a1( A.load(i ,k) );
6022 const SIMDType a2( A.load(i+1UL,k) );
6023 const SIMDType b1( B.load(k,j ) );
6024 const SIMDType b2( B.load(k,j+1UL) );
6025 xmm1 = xmm1 + a1 * b1;
6026 xmm2 = xmm2 + a1 * b2;
6027 xmm3 = xmm3 + a2 * b1;
6028 xmm4 = xmm4 + a2 * b2;
6031 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6032 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6033 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6034 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6036 for( ; remainder && k<kend; ++k ) {
6037 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6038 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6039 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6040 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i..i+1 of a single column j.
// NOTE(review): the condition guarding this section is not visible in this listing.
6046 const size_t kbegin( ( IsUpper<MT4>::value )
6047 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6048 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6049 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6051 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6052 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6054 SIMDType xmm1, xmm2;
6057 for( ; k<kpos; k+=SIMDSIZE ) {
6058 const SIMDType b1( B.load(k,j) );
6059 xmm1 = xmm1 + A.load(i ,k) * b1;
6060 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6063 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6064 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6066 for( ; remainder && k<kend; ++k ) {
6067 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6068 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row of C ----
// 1x4 tile: row i, columns j..j+3.
// NOTE(review): the condition guarding the single-row section is not visible in this listing.
6077 for( ; (j+4UL) <= N; j+=4UL )
6079 const size_t kbegin( ( IsUpper<MT4>::value )
6080 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6081 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6082 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
6084 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6085 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6087 SIMDType xmm1, xmm2, xmm3, xmm4;
6090 for( ; k<kpos; k+=SIMDSIZE ) {
6091 const SIMDType a1( A.load(i,k) );
6092 xmm1 = xmm1 + a1 * B.load(k,j );
6093 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6094 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
6095 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
6098 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6099 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6100 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
6101 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
6103 for( ; remainder && k<kend; ++k ) {
6104 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6105 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6106 (~C)(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6107 (~C)(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile: row i, columns j..j+1.
6111 for( ; (j+2UL) <= N; j+=2UL )
6113 const size_t kbegin( ( IsUpper<MT4>::value )
6114 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6115 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6116 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6118 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6119 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6121 SIMDType xmm1, xmm2;
6124 for( ; k<kpos; k+=SIMDSIZE ) {
6125 const SIMDType a1( A.load(i,k) );
6126 xmm1 = xmm1 + a1 * B.load(k,j );
6127 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6130 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6131 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6133 for( ; remainder && k<kend; ++k ) {
6134 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6135 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j); here the k-range always ends at K.
6141 const size_t kbegin( ( IsUpper<MT4>::value )
6142 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6143 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6145 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6146 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6151 for( ; k<kpos; k+=SIMDSIZE ) {
6152 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6155 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6157 for( ; remainder && k<K; ++k ) {
6158 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix kernel for the scaled addition assignment C += scalar * A * B
// where the target C is column-major (DenseMatrix<MT3,true>). The overload is enabled
// through EnableIf_ when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// C is processed in tiles of 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements. For every tile the
// products A(i,k)*B(k,j) are accumulated along k in SIMD registers, reduced with sum(),
// multiplied by scalar and added to C; a scalar loop then adds the remaining k values.
//
// NOTE(review): this listing omits some lines of the original file (braces and the
// declarations/initialisation of i, j and k are not visible here).
6180 template<
typename MT3
6184 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6185 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6187 const size_t M( A.rows() );
6188 const size_t N( B.columns() );
6189 const size_t K( A.columns() );
// A scalar k-tail is only required if at least one of the two operands is not padded.
6191 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// ---- Four rows of C at a time ----
6195 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
6199 for( ; (j+2UL) <= N; j+=2UL )
// k-range of the tile. It is narrowed when A and/or B is lower/upper triangular; the start
// is masked with size_t(-SIMDSIZE) (the assertion below equates that mask with rounding
// down to a multiple of SIMDSIZE).
6201 const size_t kbegin( ( IsUpper<MT4>::value )
6202 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6203 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6204 const size_t kend( ( IsLower<MT4>::value )
6205 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
6206 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// kpos is the end of the vectorized part; with padded operands the SIMD loop covers the
// whole range up to kend.
6208 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6209 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
6211 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6214 for( ; k<kpos; k+=SIMDSIZE ) {
6215 const SIMDType a1( A.load(i ,k) );
6216 const SIMDType a2( A.load(i+1UL,k) );
6217 const SIMDType a3( A.load(i+2UL,k) );
6218 const SIMDType a4( A.load(i+3UL,k) );
6219 const SIMDType b1( B.load(k,j ) );
6220 const SIMDType b2( B.load(k,j+1UL) );
6221 xmm1 = xmm1 + a1 * b1;
6222 xmm2 = xmm2 + a1 * b2;
6223 xmm3 = xmm3 + a2 * b1;
6224 xmm4 = xmm4 + a2 * b2;
6225 xmm5 = xmm5 + a3 * b1;
6226 xmm6 = xmm6 + a3 * b2;
6227 xmm7 = xmm7 + a4 * b1;
6228 xmm8 = xmm8 + a4 * b2;
// Horizontal reduction of each accumulator, scaled and added to the existing C values.
6231 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6232 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6233 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6234 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6235 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6236 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6237 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6238 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Scalar tail for k in [kpos,kend).
6240 for( ; remainder && k<kend; ++k ) {
6241 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6242 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6243 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6244 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6245 (~C)(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6246 (~C)(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6247 (~C)(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6248 (~C)(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile: rows i..i+3 of a single column j.
// NOTE(review): the condition guarding this section is not visible in this listing.
6254 const size_t kbegin( ( IsUpper<MT4>::value )
6255 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6256 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6257 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
6259 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6260 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6262 SIMDType xmm1, xmm2, xmm3, xmm4;
6265 for( ; k<kpos; k+=SIMDSIZE ) {
6266 const SIMDType b1( B.load(k,j) );
6267 xmm1 = xmm1 + A.load(i ,k) * b1;
6268 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6269 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
6270 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
6273 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6274 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6275 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
6276 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
6278 for( ; remainder && k<kend; ++k ) {
6279 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6280 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6281 (~C)(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6282 (~C)(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows of C at a time ----
6287 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
6291 for( ; (j+2UL) <= N; j+=2UL )
6293 const size_t kbegin( ( IsUpper<MT4>::value )
6294 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6295 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6296 const size_t kend( ( IsLower<MT4>::value )
6297 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6298 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6300 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6301 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6303 SIMDType xmm1, xmm2, xmm3, xmm4;
6306 for( ; k<kpos; k+=SIMDSIZE ) {
6307 const SIMDType a1( A.load(i ,k) );
6308 const SIMDType a2( A.load(i+1UL,k) );
6309 const SIMDType b1( B.load(k,j ) );
6310 const SIMDType b2( B.load(k,j+1UL) );
6311 xmm1 = xmm1 + a1 * b1;
6312 xmm2 = xmm2 + a1 * b2;
6313 xmm3 = xmm3 + a2 * b1;
6314 xmm4 = xmm4 + a2 * b2;
6317 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6318 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6319 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6320 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6322 for( ; remainder && k<kend; ++k ) {
6323 (~C)(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6324 (~C)(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6325 (~C)(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6326 (~C)(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile: rows i..i+1 of a single column j.
// NOTE(review): the condition guarding this section is not visible in this listing.
6332 const size_t kbegin( ( IsUpper<MT4>::value )
6333 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6334 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6335 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6337 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6338 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6340 SIMDType xmm1, xmm2;
6343 for( ; k<kpos; k+=SIMDSIZE ) {
6344 const SIMDType b1( B.load(k,j) );
6345 xmm1 = xmm1 + A.load(i ,k) * b1;
6346 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6349 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6350 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6352 for( ; remainder && k<kend; ++k ) {
6353 (~C)(i ,j) += A(i ,k) * B(k,j) * scalar;
6354 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single row of C ----
// 1x2 tile: row i, columns j..j+1.
// NOTE(review): the condition guarding the single-row section is not visible in this listing.
6363 for( ; (j+2UL) <= N; j+=2UL )
6365 const size_t kbegin( ( IsUpper<MT4>::value )
6366 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6367 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6368 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6370 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6371 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6373 SIMDType xmm1, xmm2;
6376 for( ; k<kpos; k+=SIMDSIZE ) {
6377 const SIMDType a1( A.load(i,k) );
6378 xmm1 = xmm1 + a1 * B.load(k,j );
6379 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6382 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6383 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6385 for( ; remainder && k<kend; ++k ) {
6386 (~C)(i,j ) += A(i,k) * B(k,j ) * scalar;
6387 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the single element (i,j); here the k-range always ends at K.
6393 const size_t kbegin( ( IsUpper<MT4>::value )
6394 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6395 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6397 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
6398 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
6403 for( ; k<kpos; k+=SIMDSIZE ) {
6404 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6407 (~C)(i,j) +=
sum( xmm1 ) * scalar;
6409 for( ; remainder && k<K; ++k ) {
6410 (~C)(i,j) += A(i,k) * B(k,j) * scalar;
// Fallback "large" kernel for the scaled addition assignment: selected through DisableIf_
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all
// arguments unchanged to selectDefaultAddAssignKernel().
6431 template<
typename MT3
6435 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6436 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6438 selectDefaultAddAssignKernel( C, A, B, scalar );
// "Large" kernel for the scaled addition assignment to a row-major dense target
// (DenseMatrix<MT3,false>), enabled through EnableIf_ when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The only statement visible here forwards to selectSmallAddAssignKernel() on the
// unwrapped target (~C).
// NOTE(review): original lines around that call are missing from this listing, so the
// call may be one branch of a larger body - confirm against the full file.
6457 template<
typename MT3
6461 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6462 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6465 selectSmallAddAssignKernel( ~C, A, B, scalar );
// "Large" kernel for the scaled addition assignment to a column-major dense target
// (DenseMatrix<MT3,true>), enabled through EnableIf_ when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The only statement visible here forwards to selectSmallAddAssignKernel() on the
// unwrapped target (~C).
// NOTE(review): original lines around that call are missing from this listing, so the
// call may be one branch of a larger body - confirm against the full file.
6484 template<
typename MT3
6488 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6489 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6492 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Non-BLAS fallback for the scaled addition assignment: selected through DisableIf_ when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments unchanged to
// selectLargeAddAssignKernel().
6510 template<
typename MT3
6514 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6515 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6517 selectLargeAddAssignKernel( C, A, B, scalar );
6522 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based kernel for the scaled addition assignment, enabled through EnableIf_ when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds. The scalar is converted to the element type of C
// before it is handed to the BLAS wrappers.
//  - A triangular: a temporary is created from a serial evaluation of B, trmm() is applied
//    to it from the left with A, and the temporary is then added to C.
//  - otherwise B triangular: a temporary is created from a serial evaluation of A, trmm()
//    is applied to it from the right with B, and the temporary is then added to C.
//  - otherwise: gemm() is called directly on C with the factors ET(scalar) and ET(1)
//    (presumably alpha and beta in the usual BLAS sense - confirm against the gemm wrapper).
// NOTE(review): the 'else' introducing the gemm() call is not visible in this listing.
6536 template<
typename MT3
6540 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6541 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6543 typedef ElementType_<MT3> ET;
6545 if( IsTriangular<MT4>::value ) {
6546 ResultType_<MT3> tmp(
serial( B ) );
6547 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6548 addAssign( C, tmp );
6550 else if( IsTriangular<MT5>::value ) {
6551 ResultType_<MT3> tmp(
serial( A ) );
6552 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6553 addAssign( C, tmp );
6556 gemm( C, A, B, ET(scalar), ET(1) );
// Subtraction assignment of the scaled matrix product to a dense matrix (lhs -= rhs).
// Fetches both operands of the wrapped multiplication expression, tests for an empty
// problem (no rows or columns in lhs, or a zero inner dimension) and finally dispatches to
// selectSubAssignKernel() with the scalar stored in rhs.
// NOTE(review): the body of the emptiness check and the definitions of A and B are not
// visible in this listing.
6578 template<
typename MT
6580 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6587 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6588 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6590 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6604 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled subtraction assignment C -= scalar * A * B.
// The small kernel is used when either operand is diagonal or when the number of elements
// of C is below DMATTDMATMULT_THRESHOLD; the BLAS kernel selection is the alternative.
// NOTE(review): the 'else' joining the two calls is not visible in this listing.
6619 template<
typename MT3
6623 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6625 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
6626 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6627 selectSmallSubAssignKernel( C, A, B, scalar );
6629 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the case that neither A nor B is diagonal.
// The complete scaled product A * B * scalar is evaluated serially into a temporary of
// type ResultType, which is then subtracted from C.
6647 template<
typename MT3
6651 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6652 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6654 const ResultType tmp(
serial( A * B * scalar ) );
6655 subAssign( C, tmp );
// Default subtraction-assignment kernel for a general A and a diagonal B, row-major target.
// Only the diagonal of B is read, so every update is
// C(i,j) -= A(i,j) * B(j,j) * scalar.
// Rows are traversed one by one; the column range [jbegin,jend) depends on the triangular
// structure of A and is walked in steps of two, followed by one update at jpos.
// NOTE(review): the fall-back branches of jbegin/jend and the condition guarding the final
// jpos update are not visible in this listing.
6673 template<
typename MT3
6677 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6678 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6680 const size_t M( A.rows() );
6681 const size_t N( B.columns() );
6683 for(
size_t i=0UL; i<M; ++i )
6685 const size_t jbegin( ( IsUpper<MT4>::value )
6686 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6688 const size_t jend( ( IsLower<MT4>::value )
6689 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin plus the column count rounded down to an even number (mask size_t(-2)).
6693 const size_t jnum( jend - jbegin );
6694 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6696 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6697 (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6698 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
6701 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel for a general A and a diagonal B, column-major
// target. Performs C(i,j) -= A(i,j) * B(j,j) * scalar inside a blocked double loop: the
// column and row ranges are cut into chunks of BLOCK_SIZE, and within each chunk the row
// range of a column j is narrowed according to the triangular structure of A.
// NOTE(review): the fall-back branches of ibegin/ipos are not visible in this listing.
6721 template<
typename MT3
6725 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6726 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6728 const size_t M( A.rows() );
6729 const size_t N( B.columns() );
6731 const size_t block( BLOCK_SIZE );
6733 for(
size_t jj=0UL; jj<N; jj+=block ) {
6734 const size_t jend(
min( N, jj+block ) );
6735 for(
size_t ii=0UL; ii<M; ii+=block ) {
6736 const size_t iend(
min( M, ii+block ) );
6737 for(
size_t j=jj; j<jend; ++j )
6739 const size_t ibegin( ( IsLower<MT4>::value )
6740 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
6742 const size_t ipos( ( IsUpper<MT4>::value )
6743 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
6746 for(
size_t i=ibegin; i<ipos; ++i ) {
6747 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for C -= A * B * scalar with a
// row-major target C (DenseMatrix<MT3,false>), a diagonal A and a
// non-diagonal B. Because A is diagonal, only A(i,i) contributes to row i:
// C(i,j) -= A(i,i) * B(i,j) * scalar.
6770 template<
typename MT3
6774 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6775 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6777 const size_t M( A.rows() );
6778 const size_t N( B.columns() );
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE:
// row tiles [ii,iend) in the outer loop, column tiles [jj,jend) inside.
6780 const size_t block( BLOCK_SIZE );
6782 for(
size_t ii=0UL; ii<M; ii+=block ) {
6783 const size_t iend(
min( M, ii+block ) );
6784 for(
size_t jj=0UL; jj<N; jj+=block ) {
6785 const size_t jend(
min( N, jj+block ) );
6786 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current tile, narrowed when B is
// (strictly) upper or lower triangular. NOTE(review): the alternatives
// for the non-triangular case are on listing lines not visible here.
6788 const size_t jbegin( ( IsUpper<MT5>::value )
6789 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
6791 const size_t jpos( ( IsLower<MT5>::value )
6792 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
6795 for(
size_t j=jbegin; j<jpos; ++j ) {
6796 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction assignment kernel for C -= A * B * scalar with a
// column-major target C (DenseMatrix<MT3,true>), a diagonal A and a
// non-diagonal B: C(i,j) -= A(i,i) * B(i,j) * scalar.
6819 template<
typename MT3
6823 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6824 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6826 const size_t M( A.rows() );
6827 const size_t N( B.columns() );
// Column-wise traversal of the target matrix.
6829 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when B is (strictly) lower or upper
// triangular. NOTE(review): the alternatives for the non-triangular case
// are on listing lines that are not visible here.
6831 const size_t ibegin( ( IsLower<MT5>::value )
6832 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6834 const size_t iend( ( IsUpper<MT5>::value )
6835 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even
// number of rows that can be processed two at a time.
6839 const size_t inum( iend - ibegin );
6840 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6842 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6843 (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6844 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover row at ipos. NOTE(review): the condition guarding this
// statement is not visible in this listing.
6847 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel for C -= A * B * scalar when both
// A and B are diagonal. Only the diagonal of C is updated:
// C(i,i) -= A(i,i) * B(i,i) * scalar.
6867 template<
typename MT3
6871 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6872 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6874 for(
size_t i=0UL; i<A.rows(); ++i ) {
6875 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel for C -= A * B * scalar used
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold: the work
// is forwarded unchanged to the default kernel.
6894 template<
typename MT3
6898 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6899 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6901 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for C -= A * B * scalar with a row-major
// target C (DenseMatrix<MT3,false>). Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every target element is updated with the scaled sum of a SIMD accumulator
// that collects A.load(i,k) * B.load(k,j) over k, followed by an optional
// scalar tail loop. C is processed in tiles of 2x4, 2x2, 2x1, 1x4, 1x2 and
// 1x1 elements.
// NOTE(review): the declarations/initializations of i, j and k, several
// braces and the guards of the leftover sections are on listing lines that
// are not visible here.
6920 template<
typename MT3
6924 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6925 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6927 const size_t M( A.rows() );
6928 const size_t N( B.columns() );
6929 const size_t K( A.columns() );
// The scalar tail loops only run when at least one operand is not padded.
6931 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// Two rows of C per iteration.
6935 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile of C.
6939 for( ; (j+4UL) <= N; j+=4UL )
// k range of the tile, narrowed when A is upper/lower or B is lower/upper
// triangular; kbegin is masked with size_t(-SIMDSIZE).
6941 const size_t kbegin( ( IsUpper<MT4>::value )
6942 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6943 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6944 const size_t kend( ( IsLower<MT4>::value )
6945 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
6946 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// End of the SIMD part: kend masked with size_t(-SIMDSIZE) if a tail loop
// is required, kend itself otherwise.
6948 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
6949 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 2x4 tile.
6951 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6954 for( ; k<kpos; k+=SIMDSIZE ) {
6955 const SIMDType a1( A.load(i ,k) );
6956 const SIMDType a2( A.load(i+1UL,k) );
6957 const SIMDType b1( B.load(k,j ) );
6958 const SIMDType b2( B.load(k,j+1UL) );
6959 const SIMDType b3( B.load(k,j+2UL) );
6960 const SIMDType b4( B.load(k,j+3UL) );
6961 xmm1 = xmm1 + a1 * b1;
6962 xmm2 = xmm2 + a1 * b2;
6963 xmm3 = xmm3 + a1 * b3;
6964 xmm4 = xmm4 + a1 * b4;
6965 xmm5 = xmm5 + a2 * b1;
6966 xmm6 = xmm6 + a2 * b2;
6967 xmm7 = xmm7 + a2 * b3;
6968 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator and subtract the scaled sum from C.
6971 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
6972 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
6973 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
6974 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
6975 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
6976 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
6977 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
6978 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// Scalar tail for the k values up to kend that the SIMD loop did not cover.
6980 for( ; remainder && k<kend; ++k ) {
6981 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
6982 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
6983 (~C)(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
6984 (~C)(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
6985 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
6986 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
6987 (~C)(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
6988 (~C)(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile of C.
6992 for( ; (j+2UL) <= N; j+=2UL )
6994 const size_t kbegin( ( IsUpper<MT4>::value )
6995 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
6996 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
6997 const size_t kend( ( IsLower<MT4>::value )
6998 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6999 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7001 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7002 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7004 SIMDType xmm1, xmm2, xmm3, xmm4;
7007 for( ; k<kpos; k+=SIMDSIZE ) {
7008 const SIMDType a1( A.load(i ,k) );
7009 const SIMDType a2( A.load(i+1UL,k) );
7010 const SIMDType b1( B.load(k,j ) );
7011 const SIMDType b2( B.load(k,j+1UL) );
7012 xmm1 = xmm1 + a1 * b1;
7013 xmm2 = xmm2 + a1 * b2;
7014 xmm3 = xmm3 + a2 * b1;
7015 xmm4 = xmm4 + a2 * b2;
7018 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7019 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7020 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7021 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7023 for( ; remainder && k<kend; ++k ) {
7024 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7025 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7026 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7027 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile of C. NOTE(review): presumably the leftover column after the
// column loops above; the guarding condition is not visible in this listing.
7033 const size_t kbegin( ( IsUpper<MT4>::value )
7034 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7035 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7036 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7038 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7039 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7041 SIMDType xmm1, xmm2;
7044 for( ; k<kpos; k+=SIMDSIZE ) {
7045 const SIMDType b1( B.load(k,j) );
7046 xmm1 = xmm1 + A.load(i ,k) * b1;
7047 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7050 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7051 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7053 for( ; remainder && k<kend; ++k ) {
7054 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7055 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// From here on only a single row i of C is updated. NOTE(review): presumably
// the leftover row after the paired-row loop; the guard is not visible here.
// 1x4 tile of C.
7064 for( ; (j+4UL) <= N; j+=4UL )
7066 const size_t kbegin( ( IsUpper<MT4>::value )
7067 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7068 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7069 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
7071 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7072 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7074 SIMDType xmm1, xmm2, xmm3, xmm4;
7077 for( ; k<kpos; k+=SIMDSIZE ) {
7078 const SIMDType a1( A.load(i,k) );
7079 xmm1 = xmm1 + a1 * B.load(k,j );
7080 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7081 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
7082 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
7085 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7086 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7087 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
7088 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
7090 for( ; remainder && k<kend; ++k ) {
7091 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7092 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7093 (~C)(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7094 (~C)(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile of C.
7098 for( ; (j+2UL) <= N; j+=2UL )
7100 const size_t kbegin( ( IsUpper<MT4>::value )
7101 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7102 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7103 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7105 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7106 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7108 SIMDType xmm1, xmm2;
7111 for( ; k<kpos; k+=SIMDSIZE ) {
7112 const SIMDType a1( A.load(i,k) );
7113 xmm1 = xmm1 + a1 * B.load(k,j );
7114 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7117 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7118 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7120 for( ; remainder && k<kend; ++k ) {
7121 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7122 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// Single element C(i,j); the k range ends at K here. NOTE(review): the
// guarding condition is not visible in this listing.
7128 const size_t kbegin( ( IsUpper<MT4>::value )
7129 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7130 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7132 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7133 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7138 for( ; k<kpos; k+=SIMDSIZE ) {
7139 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7142 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7144 for( ; remainder && k<K; ++k ) {
7145 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix kernel for C -= A * B * scalar with a column-major
// target C (DenseMatrix<MT3,true>). Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every target element is updated with the scaled sum of a SIMD accumulator
// that collects A.load(i,k) * B.load(k,j) over k, followed by an optional
// scalar tail loop. C is processed in tiles of 4x2, 4x1, 2x2, 2x1, 1x2 and
// 1x1 elements.
// NOTE(review): the declarations/initializations of i, j and k, several
// braces and the guards of the leftover sections are on listing lines that
// are not visible here.
7167 template<
typename MT3
7171 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7172 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7174 const size_t M( A.rows() );
7175 const size_t N( B.columns() );
7176 const size_t K( A.columns() );
// The scalar tail loops only run when at least one operand is not padded.
7178 const bool remainder( !IsPadded<MT4>::value || !IsPadded<MT5>::value );
// Four rows of C per iteration.
7182 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile of C.
7186 for( ; (j+2UL) <= N; j+=2UL )
// k range of the tile, narrowed when A is upper/lower or B is lower/upper
// triangular; kbegin is masked with size_t(-SIMDSIZE).
7188 const size_t kbegin( ( IsUpper<MT4>::value )
7189 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7190 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7191 const size_t kend( ( IsLower<MT4>::value )
7192 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
7193 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// End of the SIMD part: kend masked with size_t(-SIMDSIZE) if a tail loop
// is required, kend itself otherwise.
7195 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7196 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
// One accumulator per element of the 4x2 tile.
7198 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7201 for( ; k<kpos; k+=SIMDSIZE )
7203 const SIMDType a1( A.load(i ,k) );
7204 const SIMDType a2( A.load(i+1UL,k) );
7205 const SIMDType a3( A.load(i+2UL,k) );
7206 const SIMDType a4( A.load(i+3UL,k) );
7207 const SIMDType b1( B.load(k,j ) );
7208 const SIMDType b2( B.load(k,j+1UL) );
7209 xmm1 = xmm1 + a1 * b1;
7210 xmm2 = xmm2 + a1 * b2;
7211 xmm3 = xmm3 + a2 * b1;
7212 xmm4 = xmm4 + a2 * b2;
7213 xmm5 = xmm5 + a3 * b1;
7214 xmm6 = xmm6 + a3 * b2;
7215 xmm7 = xmm7 + a4 * b1;
7216 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator and subtract the scaled sum from C.
7219 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7220 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7221 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7222 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7223 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7224 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7225 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7226 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// Scalar tail for the k values up to kend that the SIMD loop did not cover.
7228 for( ; remainder && k<kend; ++k ) {
7229 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7230 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7231 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7232 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7233 (~C)(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7234 (~C)(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7235 (~C)(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7236 (~C)(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile of C. NOTE(review): presumably the leftover column after the
// column loop above; the guarding condition is not visible in this listing.
7242 const size_t kbegin( ( IsUpper<MT4>::value )
7243 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7244 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7245 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
7247 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7248 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7250 SIMDType xmm1, xmm2, xmm3, xmm4;
7253 for( ; k<kpos; k+=SIMDSIZE ) {
7254 const SIMDType b1( B.load(k,j) );
7255 xmm1 = xmm1 + A.load(i ,k) * b1;
7256 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7257 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
7258 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
7261 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7262 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7263 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
7264 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
7266 for( ; remainder && k<kend; ++k ) {
7267 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7268 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7269 (~C)(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7270 (~C)(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
// Two rows of C per iteration.
7275 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile of C.
7279 for( ; (j+2UL) <= N; j+=2UL )
7281 const size_t kbegin( ( IsUpper<MT4>::value )
7282 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7283 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7284 const size_t kend( ( IsLower<MT4>::value )
7285 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7286 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7288 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7289 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7291 SIMDType xmm1, xmm2, xmm3, xmm4;
7294 for( ; k<kpos; k+=SIMDSIZE ) {
7295 const SIMDType a1( A.load(i ,k) );
7296 const SIMDType a2( A.load(i+1UL,k) );
7297 const SIMDType b1( B.load(k,j ) );
7298 const SIMDType b2( B.load(k,j+1UL) );
7299 xmm1 = xmm1 + a1 * b1;
7300 xmm2 = xmm2 + a1 * b2;
7301 xmm3 = xmm3 + a2 * b1;
7302 xmm4 = xmm4 + a2 * b2;
7305 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7306 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7307 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7308 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7310 for( ; remainder && k<kend; ++k ) {
7311 (~C)(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7312 (~C)(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7313 (~C)(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7314 (~C)(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile of C. NOTE(review): presumably the leftover column after the
// column loop above; the guarding condition is not visible in this listing.
7320 const size_t kbegin( ( IsUpper<MT4>::value )
7321 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7322 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7323 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7325 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7326 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7328 SIMDType xmm1, xmm2;
7331 for( ; k<kpos; k+=SIMDSIZE ) {
7332 const SIMDType b1( B.load(k,j) );
7333 xmm1 = xmm1 + A.load(i ,k) * b1;
7334 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7337 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7338 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7340 for( ; remainder && k<kend; ++k ) {
7341 (~C)(i ,j) -= A(i ,k) * B(k,j) * scalar;
7342 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// From here on only a single row i of C is updated. NOTE(review): presumably
// the leftover row after the row loops above; the guard is not visible here.
// 1x2 tile of C.
7351 for( ; (j+2UL) <= N; j+=2UL )
7353 const size_t kbegin( ( IsUpper<MT4>::value )
7354 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7355 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7356 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7358 const size_t kpos( remainder ? ( kend &
size_t(-SIMDSIZE) ) : kend );
7359 BLAZE_INTERNAL_ASSERT( !remainder || ( kend - ( kend % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7361 SIMDType xmm1, xmm2;
7364 for( ; k<kpos; k+=SIMDSIZE ) {
7365 const SIMDType a1( A.load(i,k) );
7366 xmm1 = xmm1 + a1 * B.load(k,j );
7367 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7370 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7371 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7373 for( ; remainder && k<kend; ++k ) {
7374 (~C)(i,j ) -= A(i,k) * B(k,j ) * scalar;
7375 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// Single element C(i,j); the k range ends at K here. NOTE(review): the
// guarding condition is not visible in this listing.
7381 const size_t kbegin( ( IsUpper<MT4>::value )
7382 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-SIMDSIZE) )
7383 :( IsLower<MT5>::value ? ( j &
size_t(-SIMDSIZE) ) : 0UL ) );
7385 const size_t kpos( remainder ? ( K &
size_t(-SIMDSIZE) ) : K );
7386 BLAZE_INTERNAL_ASSERT( !remainder || ( K - ( K % (SIMDSIZE) ) ) == kpos,
"Invalid end calculation" );
7391 for( ; k<kpos; k+=SIMDSIZE ) {
7392 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7395 (~C)(i,j) -=
sum( xmm1 ) * scalar;
7397 for( ; remainder && k<K; ++k ) {
7398 (~C)(i,j) -= A(i,k) * B(k,j) * scalar;
// Large-matrix subtraction assignment kernel for C -= A * B * scalar used
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold: the work
// is forwarded unchanged to the default kernel.
7419 template<
typename MT3
7423 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7424 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7426 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction assignment kernel for C -= A * B * scalar with a
// row-major target C (DenseMatrix<MT3,false>), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): only the call to the small kernel is visible in this
// listing; neighbouring lines (7452, 7454 ff.) are missing, so the call may
// be conditional - confirm against the full source.
7445 template<
typename MT3
7449 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7450 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7453 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Large-matrix subtraction assignment kernel for C -= A * B * scalar with a
// column-major target C (DenseMatrix<MT3,true>), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): only the call to the small kernel is visible in this
// listing; neighbouring lines (7479, 7481 ff.) are missing, so the call may
// be conditional - confirm against the full source.
7472 template<
typename MT3
7476 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7477 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7480 selectSmallSubAssignKernel( ~C, A, B, scalar );
// BLAS kernel selection for C -= A * B * scalar used when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not hold: the work is forwarded
// unchanged to the large-matrix kernel.
7498 template<
typename MT3
7502 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7503 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7505 selectLargeSubAssignKernel( C, A, B, scalar );
7510 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based kernel for C -= A * B * scalar, enabled when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds. The scalar is converted to the
// element type ET of the target matrix before it is passed on.
7524 template<
typename MT3
7528 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7529 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7531 typedef ElementType_<MT3> ET;
// Triangular A: B is copied serially into a temporary, trmm() is applied
// with A from the left (CblasLeft, lower or upper depending on A) and the
// temporary is subtracted from C.
7533 if( IsTriangular<MT4>::value ) {
7534 ResultType_<MT3> tmp(
serial( B ) );
7535 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7536 subAssign( C, tmp );
// Triangular B: A is copied serially into a temporary, trmm() is applied
// with B from the right (CblasRight) and the temporary is subtracted from C.
7538 else if( IsTriangular<MT5>::value ) {
7539 ResultType_<MT3> tmp(
serial( A ) );
7540 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7541 subAssign( C, tmp );
// General case: a single gemm() call with the negated scalar and ET(1).
// NOTE(review): presumably these are the alpha and beta factors, i.e.
// C = 1*C - scalar*A*B - confirm against the gemm() wrapper.
7544 gemm( C, A, B, ET(-scalar), ET(1) );
// SMP assignment of a scaled matrix product (DMatScalarMultExpr) to a dense
// matrix. Enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): most of the body is not visible in this listing; only the
// operand extraction and the two size checks below can be seen.
7577 template<
typename MT
7579 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7580 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7587 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7588 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Special cases: an empty target matrix, or a product with an inner
// dimension of zero (left.columns() == 0).
7590 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7593 else if( left.columns() == 0UL ) {
// SMP assignment of a scaled matrix product (DMatScalarMultExpr) to a sparse
// matrix. Enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the remainder of the body is not visible in this listing.
7627 template<
typename MT
7629 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7630 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The temporary uses OppositeType when SO is true and ResultType otherwise.
7634 typedef IfTrue_< SO, OppositeType, ResultType > TmpType;
// The expression is evaluated into the temporary.
7646 const TmpType tmp( rhs );
// SMP addition assignment of a scaled matrix product (DMatScalarMultExpr)
// to a dense matrix. Enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): most of the body is not visible in this listing.
7666 template<
typename MT
7668 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7669 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7676 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7677 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Special case: empty target matrix or an inner dimension of zero.
7679 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix product (DMatScalarMultExpr)
// to a dense matrix. Enabled only when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): most of the body is not visible in this listing.
7716 template<
typename MT
7718 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7719 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix product.
7726 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7727 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Special case: empty target matrix or an inner dimension of zero.
7729 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7811 template<
typename T1
7813 inline const DMatTDMatMultExpr<T1,T2>
7837 template<
typename MT1,
typename MT2 >
7854 template<
typename MT1,
typename MT2 >
7871 template<
typename MT1,
typename MT2 >
7873 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7889 template<
typename MT1,
typename MT2 >
7891 :
public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
7907 template<
typename MT1,
typename MT2 >
7909 :
public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7925 template<
typename MT1,
typename MT2 >
7927 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7928 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7944 template<
typename MT1,
typename MT2 >
7946 :
public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7962 template<
typename MT1,
typename MT2 >
7964 :
public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
7980 template<
typename MT1,
typename MT2 >
7982 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7983 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
7999 template<
typename MT1,
typename MT2,
typename VT >
8004 using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8005 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8006 , IsDenseVector<VT>, IsColumnVector<VT> >
8007 , DMatDVecMultExprTrait_< MT1, TDMatDVecMultExprTrait_<MT2,VT> >
8017 template<
typename MT1,
typename MT2,
typename VT >
8022 using Type = If_< And< IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8023 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8024 , IsSparseVector<VT>, IsColumnVector<VT> >
8025 , DMatDVecMultExprTrait_< MT1, TDMatSVecMultExprTrait_<MT2,VT> >
8035 template<
typename VT,
typename MT1,
typename MT2 >
8040 using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
8041 , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8042 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8043 , TDVecTDMatMultExprTrait_< TDVecDMatMultExprTrait_<VT,MT1>, MT2 >
8053 template<
typename VT,
typename MT1,
typename MT2 >
8058 using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
8059 , IsDenseMatrix<MT1>, IsRowMajorMatrix<MT1>
8060 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8061 , TDVecTDMatMultExprTrait_< TSVecDMatMultExprTrait_<VT,MT1>, MT2 >
8071 template<
typename MT1,
typename MT2,
bool AF >
8076 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
8077 , SubmatrixExprTrait_<const MT2,AF> >;
8086 template<
typename MT1,
typename MT2 >
8091 using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
8100 template<
typename MT1,
typename MT2 >
8105 using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:229
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:243
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:240
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:7800
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE const complex< int8_t > sum(const SIMDcint8 &a) noexcept
Returns the sum of all elements in the 8-bit integral complex SIMD vector.
Definition: Reduction.h:63
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:287
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:371
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:351
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
Header file for the IsRowVector type trait.
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:227
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:335
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:405
Header file for the RequiresEvaluation type trait.
ElementType_< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:231
System settings for performance optimizations.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:361
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:232
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:234
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:155
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:230
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:425
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:228
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:237
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:246
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:415
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:157
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:158
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:393
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions. The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
Header file for the AreSIMDCombinable type trait.
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:272
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:233
Header file for the TDMatDVecMultExprTrait class template.
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:434
Header file for the complex data type.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:381
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:147
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:435
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_INTERNAL_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:156