35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
130 template<
typename MT1
// Compile-time flag: `value` is true when either `evaluateLeft` or
// `evaluateRight` is set. The template parameters T1..T3 do not appear in the
// visible expression. NOTE(review): evaluateLeft/evaluateRight are defined
// outside this excerpt; presumably they mark operands that need an
// intermediate evaluation -- confirm.
162 template<
typename T1,
typename T2,
typename T3 >
163 struct IsEvaluationRequired {
164 enum { value = ( evaluateLeft || evaluateRight ) };
// Trait gating the single-precision BLAS kernel overload of
// selectBlasAssignKernel (it is instantiated there as <MT3,MT4,MT5>, i.e. with
// the target type as T1 and the two operand types as T2/T3).
// The opening of the `value` expression is not visible in this excerpt; the
// conjuncts shown require:
//  - mutable data access on T1 and const data access on T2 and T3,
//  - neither T2 nor T3 being diagonal,
//  - all three types being vectorizable,
//  - all three element types satisfying IsFloat.
174 template<
typename T1,
typename T2,
typename T3 >
175 struct UseSinglePrecisionKernel {
177 HasMutableDataAccess<T1>::value &&
178 HasConstDataAccess<T2>::value &&
179 HasConstDataAccess<T3>::value &&
180 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
181 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
182 IsFloat<typename T1::ElementType>::value &&
183 IsFloat<typename T2::ElementType>::value &&
184 IsFloat<typename T3::ElementType>::value };
// Trait gating the double-precision BLAS kernel overload of
// selectBlasAssignKernel (instantiated there as <MT3,MT4,MT5>).
// The opening of the `value` expression is not visible in this excerpt; the
// conjuncts shown require:
//  - mutable data access on T1 and const data access on T2 and T3,
//  - neither T2 nor T3 being diagonal,
//  - all three types being vectorizable,
//  - all three element types satisfying IsDouble.
194 template<
typename T1,
typename T2,
typename T3 >
195 struct UseDoublePrecisionKernel {
197 HasMutableDataAccess<T1>::value &&
198 HasConstDataAccess<T2>::value &&
199 HasConstDataAccess<T3>::value &&
200 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
201 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
202 IsDouble<typename T1::ElementType>::value &&
203 IsDouble<typename T2::ElementType>::value &&
204 IsDouble<typename T3::ElementType>::value };
// Trait gating the single-precision complex BLAS kernel overload of
// selectBlasAssignKernel (instantiated there as <MT3,MT4,MT5>).
// The opening of the `value` expression is not visible in this excerpt; the
// conjuncts shown require data access, non-diagonal operands, vectorizable
// types, and that all three element types are exactly complex<float>.
215 template<
typename T1,
typename T2,
typename T3 >
216 struct UseSinglePrecisionComplexKernel {
// Element type every operand must match exactly (checked via IsSame below).
217 typedef complex<float> Type;
219 HasMutableDataAccess<T1>::value &&
220 HasConstDataAccess<T2>::value &&
221 HasConstDataAccess<T3>::value &&
222 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
223 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
224 IsSame<typename T1::ElementType,Type>::value &&
225 IsSame<typename T2::ElementType,Type>::value &&
226 IsSame<typename T3::ElementType,Type>::value };
// Trait gating the double-precision complex BLAS kernel overload of
// selectBlasAssignKernel (instantiated there as <MT3,MT4,MT5>).
// The opening of the `value` expression is not visible in this excerpt; the
// conjuncts shown require data access, non-diagonal operands, vectorizable
// types, and that all three element types are exactly complex<double>.
237 template<
typename T1,
typename T2,
typename T3 >
238 struct UseDoublePrecisionComplexKernel {
// Element type every operand must match exactly (checked via IsSame below).
239 typedef complex<double> Type;
241 HasMutableDataAccess<T1>::value &&
242 HasConstDataAccess<T2>::value &&
243 HasConstDataAccess<T3>::value &&
244 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
245 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
246 IsSame<typename T1::ElementType,Type>::value &&
247 IsSame<typename T2::ElementType,Type>::value &&
248 IsSame<typename T3::ElementType,Type>::value };
// Trait gating the non-BLAS overload of selectBlasAssignKernel.
// `value` is true when BLAZE_BLAS_MODE is off, or when none of the four
// BLAS kernel traits (single, double, complex<float>, complex<double>)
// applies to the type triple <T1,T2,T3>.
258 template<
typename T1,
typename T2,
typename T3 >
259 struct UseDefaultKernel {
260 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
261 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
262 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
263 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Trait selecting the vectorized (intrinsics-based) default kernels.
// `value` is true only if all of the following hold:
//  - neither T2 nor T3 is diagonal,
//  - T1, T2 and T3 are all vectorizable,
//  - T2 and T3 have exactly the same element type as T1,
//  - IntrinsicTrait of that element type reports both `addition` and
//    `multiplication`.
273 template<
typename T1,
typename T2,
typename T3 >
274 struct UseVectorizedDefaultKernel {
275 enum { value = !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
276 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
277 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
278 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
279 IntrinsicTrait<typename T1::ElementType>::addition &&
280 IntrinsicTrait<typename T1::ElementType>::multiplication };
312 MT1::vectorizable && MT2::vectorizable &&
// Compile-time flag: true only when neither operand is marked for evaluation
// (evaluateLeft / evaluateRight are both false) and both operand types MT1 and
// MT2 report smpAssignable themselves.
318 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
319 !evaluateRight && MT2::smpAssignable };
362 :(
lhs_.columns() ) ) );
364 if(
lhs_.columns() == 0UL ||
374 const size_t knum( kend - kbegin );
375 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
377 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
379 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
381 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
407 return rhs_.columns();
437 template<
typename T >
439 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
449 template<
typename T >
451 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
461 return lhs_.isAligned() &&
rhs_.isAligned();
496 template<
typename MT
505 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
508 else if( rhs.
lhs_.columns() == 0UL ) {
523 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Entry point for choosing the assignment kernel for C = A * B.
// Forwards (C, A, B) either to selectSmallAssignKernel or to
// selectBlasAssignKernel. NOTE(review): the condition that chooses between the
// two calls is not visible in this excerpt.
539 template<
typename MT3
542 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
546 selectSmallAssignKernel( C, A, B );
548 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel C = A * B for a DenseMatrix<MT3,false>
// target (presumably the row-major storage order -- confirm), enabled only when
// neither MT4 nor MT5 is diagonal.
//
// Iterates rows i in the outer loop and columns j in the inner loop. All loop
// bounds are narrowed at compile time from the triangular structure of the
// operands (IsUpper / IsLower / IsStrictlyUpper / IsStrictlyLower traits) so
// that only index ranges that can hold non-zero products are multiplied.
// NOTE(review): the bodies of the loops over the excluded ranges
// ([0,ibegin), [iend,M), [0,jbegin), [jend,N)) and the fallback branches of
// several conditionals are not visible in this excerpt; presumably the excluded
// elements of C are reset -- confirm against the full source.
567 template<
typename MT3
570 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
571 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K the shared inner dimension.
573 const size_t M( A.rows() );
574 const size_t N( B.columns() );
575 const size_t K( A.columns() );
// First row of the multiplied range: 1 for strictly lower A, 2 if B is strictly
// lower as well (and M > 1).
577 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
578 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
// One past the last row of the multiplied range: M-1 for strictly upper A,
// M-2 if B is strictly upper as well (and M > 1).
580 const size_t iend( ( IsStrictlyUpper<MT4>::value )
581 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the multiplied range.
585 for(
size_t i=0UL; i<ibegin; ++i ) {
586 for(
size_t j=0UL; j<N; ++j ) {
// Rows of the multiplied range.
590 for(
size_t i=ibegin; i<iend; ++i )
// Column range for row i: starts at (or just after) the diagonal when both
// operands are upper, ends at (or just before) it when both are lower.
592 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
593 ?( ( IsStrictlyUpper<MT4>::value )
594 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
595 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
596 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
597 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
598 ?( ( IsStrictlyLower<MT4>::value )
599 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
600 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
601 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
// Columns in front of the multiplied range of row i.
604 for(
size_t j=0UL; j<jbegin; ++j ) {
607 for(
size_t j=jbegin; j<jend; ++j )
// Inner-dimension range [kbegin,kend) for element (i,j): upper A restricts k
// from below via i, lower B restricts k from below via j.
609 const size_t kbegin( ( IsUpper<MT4>::value )
610 ?( ( IsLower<MT5>::value )
611 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
612 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
613 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
614 :( ( IsLower<MT5>::value )
615 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Lower A restricts k from above via i, upper B restricts k from above via j.
617 const size_t kend( ( IsLower<MT4>::value )
618 ?( ( IsUpper<MT5>::value )
619 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
620 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
621 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
622 :( ( IsUpper<MT5>::value )
623 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j) by assignment; the remaining products
// are accumulated onto it.
627 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
628 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
629 (~C)(i,j) += A(i,k) * B(k,j);
// Columns behind the multiplied range of row i.
632 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the multiplied range.
636 for(
size_t i=iend; i<M; ++i ) {
637 for(
size_t j=0UL; j<N; ++j ) {
// Default (scalar) assignment kernel C = A * B for a DenseMatrix<MT3,true>
// target (presumably the column-major storage order -- confirm), enabled only
// when neither MT4 nor MT5 is diagonal.
//
// Mirror image of the DenseMatrix<MT3,false> overload: columns j are iterated
// in the outer loop and rows i in the inner loop. Loop bounds are narrowed at
// compile time from the triangular structure traits of the operands.
// NOTE(review): the bodies of the loops over the excluded ranges and the
// fallback branches of several conditionals are not visible in this excerpt;
// presumably the excluded elements of C are reset -- confirm.
659 template<
typename MT3
662 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
663 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K the shared inner dimension.
665 const size_t M( A.rows() );
666 const size_t N( B.columns() );
667 const size_t K( A.columns() );
// First column of the multiplied range: 1 for strictly upper B, 2 if A is
// strictly upper as well (and N > 1).
669 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
670 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
// One past the last column of the multiplied range: N-1 for strictly lower B,
// N-2 if A is strictly lower as well (and N > 1).
672 const size_t jend( ( IsStrictlyLower<MT5>::value )
673 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the multiplied range.
677 for(
size_t j=0UL; j<jbegin; ++j ) {
678 for(
size_t i=0UL; i<M; ++i ) {
// Columns of the multiplied range.
682 for(
size_t j=jbegin; j<jend; ++j )
// Row range for column j: starts at (or just after) the diagonal when both
// operands are lower, ends at (or just before) it when both are upper.
684 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
685 ?( ( IsStrictlyLower<MT4>::value )
686 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
687 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
688 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
689 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
690 ?( ( IsStrictlyUpper<MT4>::value )
691 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
692 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
693 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
// Rows in front of the multiplied range of column j.
696 for(
size_t i=0UL; i<ibegin; ++i ) {
699 for(
size_t i=ibegin; i<iend; ++i )
// Inner-dimension range [kbegin,kend) for element (i,j): upper A restricts k
// from below via i, lower B restricts k from below via j.
701 const size_t kbegin( ( IsUpper<MT4>::value )
702 ?( ( IsLower<MT5>::value )
703 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
704 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
705 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
706 :( ( IsLower<MT5>::value )
707 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Lower A restricts k from above via i, upper B restricts k from above via j.
709 const size_t kend( ( IsLower<MT4>::value )
710 ?( ( IsUpper<MT5>::value )
711 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
712 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
713 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
714 :( ( IsUpper<MT5>::value )
715 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// The first product initializes C(i,j) by assignment; the remaining products
// are accumulated onto it.
719 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
720 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
721 (~C)(i,j) += A(i,k) * B(k,j);
// Rows behind the multiplied range of column j.
724 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the multiplied range.
728 for(
size_t j=jend; j<N; ++j ) {
729 for(
size_t i=0UL; i<M; ++i ) {
// Default assignment kernel C = A * B for a DenseMatrix<MT3,false> target when
// A (MT4) is not diagonal and B (MT5) is diagonal.
// With a diagonal right-hand operand each result element reduces to a single
// product: C(i,j) = A(i,j) * B(j,j). The column range of each row is narrowed
// by A's triangular structure.
// NOTE(review): the bodies of the loops over [0,jbegin) and [jend,N) and the
// fallback branches of jbegin/jend are not visible in this excerpt; presumably
// those elements of C are reset -- confirm.
751 template<
typename MT3
754 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
755 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
757 const size_t M( A.rows() );
758 const size_t N( B.columns() );
760 for(
size_t i=0UL; i<M; ++i )
// Upper A: columns start at the diagonal (one past it if strictly upper).
762 const size_t jbegin( ( IsUpper<MT4>::value )
763 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// Lower A: columns end at the diagonal (one before it if strictly lower).
765 const size_t jend( ( IsLower<MT4>::value )
766 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Leading columns need separate handling only when A is upper.
770 if( IsUpper<MT4>::value ) {
771 for(
size_t j=0UL; j<jbegin; ++j ) {
775 for(
size_t j=jbegin; j<jend; ++j ) {
776 (~C)(i,j) = A(i,j) * B(j,j);
// Trailing columns need separate handling only when A is lower.
778 if( IsLower<MT4>::value ) {
779 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B for a DenseMatrix<MT3,true> target when
// A (MT4) is not diagonal and B (MT5) is diagonal.
// Each result element is a single product C(i,j) = A(i,j) * B(j,j). The
// iteration is tiled into blocks of 16 x 16 elements (jj/ii step by `block`),
// and within each tile the row range of column j is narrowed by A's
// triangular structure.
// NOTE(review): the bodies of the loops over [ii,ibegin) and [ipos,iend) and
// the fallback branches of ibegin/ipos are not visible in this excerpt;
// presumably those elements of C are reset -- confirm.
802 template<
typename MT3
805 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
806 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
808 const size_t M( A.rows() );
809 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
811 const size_t block( 16UL );
813 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the tile to the matrix bounds.
814 const size_t jend(
min( N, jj+block ) );
815 for(
size_t ii=0UL; ii<M; ii+=block ) {
816 const size_t iend(
min( M, ii+block ) );
817 for(
size_t j=jj; j<jend; ++j )
// Lower A: rows start at the diagonal (one past it if strictly lower), but
// never before the tile's first row.
819 const size_t ibegin( ( IsLower<MT4>::value )
820 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// Upper A: rows end at the diagonal (one before it if strictly upper), but
// never beyond the tile's last row.
822 const size_t ipos( ( IsUpper<MT4>::value )
823 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
826 if( IsLower<MT4>::value ) {
827 for(
size_t i=ii; i<ibegin; ++i ) {
831 for(
size_t i=ibegin; i<ipos; ++i ) {
832 (~C)(i,j) = A(i,j) * B(j,j);
834 if( IsUpper<MT4>::value ) {
835 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel C = A * B for a DenseMatrix<MT3,false> target when
// A (MT4) is diagonal and B (MT5) is not.
// Each result element is a single product C(i,j) = A(i,i) * B(i,j). The
// iteration is tiled into blocks of 16 x 16 elements (ii/jj step by `block`),
// and within each tile the column range of row i is narrowed by B's
// triangular structure.
// NOTE(review): the bodies of the loops over [jj,jbegin) and [jpos,jend) and
// the fallback branches of jbegin/jpos are not visible in this excerpt;
// presumably those elements of C are reset -- confirm.
860 template<
typename MT3
863 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
864 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
866 const size_t M( A.rows() );
867 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
869 const size_t block( 16UL );
871 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the tile to the matrix bounds.
872 const size_t iend(
min( M, ii+block ) );
873 for(
size_t jj=0UL; jj<N; jj+=block ) {
874 const size_t jend(
min( N, jj+block ) );
875 for(
size_t i=ii; i<iend; ++i )
// Upper B: columns start at the diagonal (one past it if strictly upper), but
// never before the tile's first column.
877 const size_t jbegin( ( IsUpper<MT5>::value )
878 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// Lower B: columns end at the diagonal (one before it if strictly lower), but
// never beyond the tile's last column.
880 const size_t jpos( ( IsLower<MT5>::value )
881 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
884 if( IsUpper<MT5>::value ) {
885 for(
size_t j=jj; j<jbegin; ++j ) {
889 for(
size_t j=jbegin; j<jpos; ++j ) {
890 (~C)(i,j) = A(i,i) * B(i,j);
892 if( IsLower<MT5>::value ) {
893 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel C = A * B for a DenseMatrix<MT3,true> target when
// A (MT4) is diagonal and B (MT5) is not.
// Each result element is a single product C(i,j) = A(i,i) * B(i,j); the row
// range of each column is narrowed by B's triangular structure.
// NOTE(review): the bodies of the loops over [0,ibegin) and [iend,M) and the
// fallback branches of ibegin/iend are not visible in this excerpt; presumably
// those elements of C are reset -- confirm.
918 template<
typename MT3
921 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
922 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
924 const size_t M( A.rows() );
925 const size_t N( B.columns() );
927 for(
size_t j=0UL; j<N; ++j )
// Lower B: rows start at the diagonal (one past it if strictly lower).
929 const size_t ibegin( ( IsLower<MT5>::value )
930 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Upper B: rows end at the diagonal (one before it if strictly upper).
932 const size_t iend( ( IsUpper<MT5>::value )
933 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Leading rows need separate handling only when B is lower.
937 if( IsLower<MT5>::value ) {
938 for(
size_t i=0UL; i<ibegin; ++i ) {
942 for(
size_t i=ibegin; i<iend; ++i ) {
943 (~C)(i,j) = A(i,i) * B(i,j);
// Trailing rows need separate handling only when B is upper.
945 if( IsUpper<MT5>::value ) {
946 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B when both A (MT4) and B (MT5) are
// diagonal: only the diagonal of C is written, C(i,i) = A(i,i) * B(i,i), for
// i in [0, A.rows()).
// NOTE(review): lines between the signature and the loop are not visible in
// this excerpt; whether C is cleared beforehand cannot be told from here.
969 template<
typename MT3
972 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
973 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
977 for(
size_t i=0UL; i<A.rows(); ++i ) {
978 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, non-vectorized fallback: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAssignKernel.
998 template<
typename MT3
1001 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1002 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1004 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix assignment kernel C = A * B for a
// DenseMatrix<MT3,false> target, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
//
// C is computed in register tiles: pairs of rows are combined with groups of
// 4, 2 and 1 columns, and the visible code after that handles a single row i
// with 4, 2 and 1 columns. For every tile the inner dimension k is traversed in
// steps of IT::size using A.load(row,k) and B.load(k,column); the per-element
// accumulators are finally passed through sum() and stored into C.
//
// The k range of each tile is narrowed by the triangular structure of A and B.
// kbegin is masked with size_t(-IT::size), which rounds it down to a multiple
// of IT::size when IT::size is a power of two.
// NOTE(review): the accumulators are declared without an initializer and rely
// on IntrinsicType's default construction for their starting value (presumably
// zero -- confirm). The declarations of i and j and the guards around the
// remainder sections are not visible in this excerpt.
1024 template<
typename MT3
1027 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1028 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1030 typedef IntrinsicTrait<ElementType> IT;
1032 const size_t M( A.rows() );
1033 const size_t N( B.columns() );
1034 const size_t K( A.columns() );
// Main loop: two rows of C at a time.
1038 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 4 tile: rows i, i+1 and columns j .. j+3.
1042 for( ; (j+4UL) <= N; j+=4UL )
1044 const size_t kbegin( ( IsUpper<MT4>::value )
1045 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1046 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// Upper bound of k: limited by the last row of the tile for lower A and by the
// last column of the tile for upper B, otherwise the full inner dimension K.
1047 const size_t kend( ( IsLower<MT4>::value )
1048 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1049 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// One accumulator per element of the 2 x 4 tile.
1051 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1053 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1054 const IntrinsicType a1( A.load(i ,k) );
1055 const IntrinsicType a2( A.load(i+1UL,k) );
1056 const IntrinsicType b1( B.load(k,j ) );
1057 const IntrinsicType b2( B.load(k,j+1UL) );
1058 const IntrinsicType b3( B.load(k,j+2UL) );
1059 const IntrinsicType b4( B.load(k,j+3UL) );
1060 xmm1 = xmm1 + a1 * b1;
1061 xmm2 = xmm2 + a1 * b2;
1062 xmm3 = xmm3 + a1 * b3;
1063 xmm4 = xmm4 + a1 * b4;
1064 xmm5 = xmm5 + a2 * b1;
1065 xmm6 = xmm6 + a2 * b2;
1066 xmm7 = xmm7 + a2 * b3;
1067 xmm8 = xmm8 + a2 * b4;
// Store the sum() of each accumulator into its element of C.
1070 (~C)(i ,j ) =
sum( xmm1 );
1071 (~C)(i ,j+1UL) =
sum( xmm2 );
1072 (~C)(i ,j+2UL) =
sum( xmm3 );
1073 (~C)(i ,j+3UL) =
sum( xmm4 );
1074 (~C)(i+1UL,j ) =
sum( xmm5 );
1075 (~C)(i+1UL,j+1UL) =
sum( xmm6 );
1076 (~C)(i+1UL,j+2UL) =
sum( xmm7 );
1077 (~C)(i+1UL,j+3UL) =
sum( xmm8 );
// 2 x 2 tile: rows i, i+1 and columns j, j+1.
1080 for( ; (j+2UL) <= N; j+=2UL )
1082 const size_t kbegin( ( IsUpper<MT4>::value )
1083 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1084 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1085 const size_t kend( ( IsLower<MT4>::value )
1086 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1087 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1089 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1091 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1092 const IntrinsicType a1( A.load(i ,k) );
1093 const IntrinsicType a2( A.load(i+1UL,k) );
1094 const IntrinsicType b1( B.load(k,j ) );
1095 const IntrinsicType b2( B.load(k,j+1UL) );
1096 xmm1 = xmm1 + a1 * b1;
1097 xmm2 = xmm2 + a1 * b2;
1098 xmm3 = xmm3 + a2 * b1;
1099 xmm4 = xmm4 + a2 * b2;
1102 (~C)(i ,j ) =
sum( xmm1 );
1103 (~C)(i ,j+1UL) =
sum( xmm2 );
1104 (~C)(i+1UL,j ) =
sum( xmm3 );
1105 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// 2 x 1 remainder: rows i, i+1 and the single column j.
1110 const size_t kbegin( ( IsUpper<MT4>::value )
1111 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1112 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1113 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1115 IntrinsicType xmm1, xmm2;
1117 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1118 const IntrinsicType b1( B.load(k,j) );
1119 xmm1 = xmm1 + A.load(i ,k) * b1;
1120 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1123 (~C)(i ,j) =
sum( xmm1 );
1124 (~C)(i+1UL,j) =
sum( xmm2 );
// Single-row sections follow. 1 x 4 tile: row i and columns j .. j+3.
1132 for( ; (j+4UL) <= N; j+=4UL )
1134 const size_t kbegin( ( IsUpper<MT4>::value )
1135 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1136 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1137 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
1139 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1141 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1142 const IntrinsicType a1( A.load(i,k) );
1143 xmm1 = xmm1 + a1 * B.load(k,j );
1144 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1145 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
1146 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
1149 (~C)(i,j ) =
sum( xmm1 );
1150 (~C)(i,j+1UL) =
sum( xmm2 );
1151 (~C)(i,j+2UL) =
sum( xmm3 );
1152 (~C)(i,j+3UL) =
sum( xmm4 );
// 1 x 2 tile: row i and columns j, j+1.
1155 for( ; (j+2UL) <= N; j+=2UL )
1157 const size_t kbegin( ( IsUpper<MT4>::value )
1158 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1159 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1160 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1162 IntrinsicType xmm1, xmm2;
1164 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1165 const IntrinsicType a1( A.load(i,k) );
1166 xmm1 = xmm1 + a1 * B.load(k,j );
1167 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1170 (~C)(i,j ) =
sum( xmm1 );
1171 (~C)(i,j+1UL) =
sum( xmm2 );
// 1 x 1 remainder: the single element (i,j); k runs up to the full K.
1176 const size_t kbegin( ( IsUpper<MT4>::value )
1177 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1178 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1182 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
1183 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1186 (~C)(i,j) =
sum( xmm1 );
// Vectorized small-matrix assignment kernel C = A * B for a
// DenseMatrix<MT3,true> target, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
//
// C is computed in register tiles: groups of 4 rows with 2 and 1 columns, then
// groups of 2 rows with 2 and 1 columns, and the visible code after that
// handles a single row i with 2 and 1 columns. For every tile the inner
// dimension k is traversed in steps of IT::size using A.load(row,k) and
// B.load(k,column); the per-element accumulators are finally passed through
// sum() and stored into C.
//
// The k range of each tile is narrowed by the triangular structure of A and B.
// kbegin is masked with size_t(-IT::size), which rounds it down to a multiple
// of IT::size when IT::size is a power of two.
// NOTE(review): the accumulators are declared without an initializer and rely
// on IntrinsicType's default construction for their starting value (presumably
// zero -- confirm). The declarations of i and j and the guards around the
// remainder sections are not visible in this excerpt.
1208 template<
typename MT3
1211 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1212 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1214 typedef IntrinsicTrait<ElementType> IT;
1216 const size_t M( A.rows() );
1217 const size_t N( B.columns() );
1218 const size_t K( A.columns() );
// Main loop: four rows of C at a time.
1222 for( ; (i+4UL) <= M; i+=4UL )
// 4 x 2 tile: rows i .. i+3 and columns j, j+1.
1226 for( ; (j+2UL) <= N; j+=2UL )
1228 const size_t kbegin( ( IsUpper<MT4>::value )
1229 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1230 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// Upper bound of k: limited by the last row of the tile for lower A and by the
// last column of the tile for upper B, otherwise the full inner dimension K.
1231 const size_t kend( ( IsLower<MT4>::value )
1232 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1233 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// One accumulator per element of the 4 x 2 tile.
1235 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1237 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1238 const IntrinsicType a1( A.load(i ,k) );
1239 const IntrinsicType a2( A.load(i+1UL,k) );
1240 const IntrinsicType a3( A.load(i+2UL,k) );
1241 const IntrinsicType a4( A.load(i+3UL,k) );
1242 const IntrinsicType b1( B.load(k,j ) );
1243 const IntrinsicType b2( B.load(k,j+1UL) );
1244 xmm1 = xmm1 + a1 * b1;
1245 xmm2 = xmm2 + a1 * b2;
1246 xmm3 = xmm3 + a2 * b1;
1247 xmm4 = xmm4 + a2 * b2;
1248 xmm5 = xmm5 + a3 * b1;
1249 xmm6 = xmm6 + a3 * b2;
1250 xmm7 = xmm7 + a4 * b1;
1251 xmm8 = xmm8 + a4 * b2;
// Store the sum() of each accumulator into its element of C.
1254 (~C)(i ,j ) =
sum( xmm1 );
1255 (~C)(i ,j+1UL) =
sum( xmm2 );
1256 (~C)(i+1UL,j ) =
sum( xmm3 );
1257 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
1258 (~C)(i+2UL,j ) =
sum( xmm5 );
1259 (~C)(i+2UL,j+1UL) =
sum( xmm6 );
1260 (~C)(i+3UL,j ) =
sum( xmm7 );
1261 (~C)(i+3UL,j+1UL) =
sum( xmm8 );
// 4 x 1 remainder: rows i .. i+3 and the single column j.
1266 const size_t kbegin( ( IsUpper<MT4>::value )
1267 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1268 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1269 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
1271 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1273 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1274 const IntrinsicType b1( B.load(k,j) );
1275 xmm1 = xmm1 + A.load(i ,k) * b1;
1276 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1277 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
1278 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
1281 (~C)(i ,j) =
sum( xmm1 );
1282 (~C)(i+1UL,j) =
sum( xmm2 );
1283 (~C)(i+2UL,j) =
sum( xmm3 );
1284 (~C)(i+3UL,j) =
sum( xmm4 );
// Second loop: two rows of C at a time.
1288 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 2 tile: rows i, i+1 and columns j, j+1.
1292 for( ; (j+2UL) <= N; j+=2UL )
1294 const size_t kbegin( ( IsUpper<MT4>::value )
1295 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1296 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1297 const size_t kend( ( IsLower<MT4>::value )
1298 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1299 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
1301 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1303 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1304 const IntrinsicType a1( A.load(i ,k) );
1305 const IntrinsicType a2( A.load(i+1UL,k) );
1306 const IntrinsicType b1( B.load(k,j ) );
1307 const IntrinsicType b2( B.load(k,j+1UL) );
1308 xmm1 = xmm1 + a1 * b1;
1309 xmm2 = xmm2 + a1 * b2;
1310 xmm3 = xmm3 + a2 * b1;
1311 xmm4 = xmm4 + a2 * b2;
1314 (~C)(i ,j ) =
sum( xmm1 );
1315 (~C)(i ,j+1UL) =
sum( xmm2 );
1316 (~C)(i+1UL,j ) =
sum( xmm3 );
1317 (~C)(i+1UL,j+1UL) =
sum( xmm4 );
// 2 x 1 remainder: rows i, i+1 and the single column j.
1322 const size_t kbegin( ( IsUpper<MT4>::value )
1323 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1324 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1325 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
1327 IntrinsicType xmm1, xmm2;
1329 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1330 const IntrinsicType b1( B.load(k,j) );
1331 xmm1 = xmm1 + A.load(i ,k) * b1;
1332 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
1335 (~C)(i ,j) =
sum( xmm1 );
1336 (~C)(i+1UL,j) =
sum( xmm2 );
// Single-row sections follow. 1 x 2 tile: row i and columns j, j+1.
1344 for( ; (j+2UL) <= N; j+=2UL )
1346 const size_t kbegin( ( IsUpper<MT4>::value )
1347 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1348 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1349 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
1351 IntrinsicType xmm1, xmm2;
1353 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
1354 const IntrinsicType a1( A.load(i,k) );
1355 xmm1 = xmm1 + a1 * B.load(k,j );
1356 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
1359 (~C)(i,j ) =
sum( xmm1 );
1360 (~C)(i,j+1UL) =
sum( xmm2 );
// 1 x 1 remainder: the single element (i,j); k runs up to the full K.
1365 const size_t kbegin( ( IsUpper<MT4>::value )
1366 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
1367 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
1371 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
1372 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
1375 (~C)(i,j) =
sum( xmm1 );
// Large-matrix assignment kernel, non-vectorized fallback: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAssignKernel.
1396 template<
typename MT3
1399 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1400 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1402 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel for a DenseMatrix<MT3,false> target when the
// vectorized default kernel applies. The visible body contains no dedicated
// large-matrix algorithm: it forwards to selectSmallAssignKernel, passing the
// target through operator~.
1422 template<
typename MT3
1425 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1426 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1429 selectSmallAssignKernel( ~C, A, B );
// Large-matrix assignment kernel for a DenseMatrix<MT3,true> target when the
// vectorized default kernel applies. The visible body contains no dedicated
// large-matrix algorithm: it forwards to selectSmallAssignKernel, passing the
// target through operator~.
1449 template<
typename MT3
1452 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1453 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1456 selectSmallAssignKernel( ~C, A, B );
// BLAS-stage assignment kernel, fallback overload: selected when
// UseDefaultKernel<MT3,MT4,MT5> is true (BLAS mode off or no BLAS kernel
// applicable to these types) and forwards to selectLargeAssignKernel.
1475 template<
typename MT3
1478 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1479 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1481 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel for single-precision operands, selected when
// UseSinglePrecisionKernel<MT3,MT4,MT5> is true.
//  - triangular A: strmm with A on the left (CblasLeft), lower/upper chosen
//    from IsLower<MT4>, scaling factor 1.0F;
//  - otherwise triangular B: strmm with B on the right (CblasRight),
//    lower/upper chosen from IsLower<MT5>, scaling factor 1.0F;
//  - the sgemm call uses the factors 1.0F and 0.0F.
// NOTE(review): the statements preceding each strmm call and the branch
// enclosing sgemm are not visible in this excerpt; presumably C is first
// assigned the non-triangular operand and sgemm is the final `else` -- confirm.
1501 template<
typename MT3
1504 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1505 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1507 if( IsTriangular<MT4>::value ) {
1509 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
1511 else if( IsTriangular<MT5>::value ) {
1513 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
1516 sgemm( C, A, B, 1.0F, 0.0F );
// BLAS-based assignment kernel for double-precision operands, selected when
// UseDoublePrecisionKernel<MT3,MT4,MT5> is true.
//  - triangular A: dtrmm with A on the left (CblasLeft), lower/upper chosen
//    from IsLower<MT4>, scaling factor 1.0;
//  - otherwise triangular B: dtrmm with B on the right (CblasRight),
//    lower/upper chosen from IsLower<MT5>, scaling factor 1.0;
//  - the dgemm call uses the factors 1.0 and 0.0.
// NOTE(review): the statements preceding each dtrmm call and the branch
// enclosing dgemm are not visible in this excerpt; presumably C is first
// assigned the non-triangular operand and dgemm is the final `else` -- confirm.
1538 template<
typename MT3
1541 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1542 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1544 if( IsTriangular<MT4>::value ) {
1546 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
1548 else if( IsTriangular<MT5>::value ) {
1550 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
1553 dgemm( C, A, B, 1.0, 0.0 );
// BLAS-based assignment kernel for complex<float> operands, selected when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is true.
//  - triangular A: ctrmm with A on the left (CblasLeft), lower/upper chosen
//    from IsLower<MT4>, scaling factor (1,0);
//  - otherwise triangular B: ctrmm with B on the right (CblasRight),
//    lower/upper chosen from IsLower<MT5>, scaling factor (1,0);
//  - the cgemm call uses the factors (1,0) and (0,0).
// NOTE(review): the statements preceding each ctrmm call and the branch
// enclosing cgemm are not visible in this excerpt; presumably C is first
// assigned the non-triangular operand and cgemm is the final `else` -- confirm.
1575 template<
typename MT3
1578 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1579 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1581 if( IsTriangular<MT4>::value ) {
1583 ctrmm( C, A, CblasLeft,
1584 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
1585 complex<float>( 1.0F, 0.0F ) );
1587 else if( IsTriangular<MT5>::value ) {
1589 ctrmm( C, B, CblasRight,
1590 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
1591 complex<float>( 1.0F, 0.0F ) );
1594 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS-based assignment kernel for complex<double> operands, selected when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is true.
//  - triangular A: ztrmm with A on the left (CblasLeft), lower/upper chosen
//    from IsLower<MT4>, scaling factor (1,0);
//  - otherwise triangular B: ztrmm with B on the right (CblasRight),
//    lower/upper chosen from IsLower<MT5>, scaling factor (1,0);
//  - the zgemm call uses the factors (1,0) and (0,0).
// NOTE(review): the statements preceding each ztrmm call and the branch
// enclosing zgemm are not visible in this excerpt; presumably C is first
// assigned the non-triangular operand and zgemm is the final `else` -- confirm.
1616 template<
typename MT3
1619 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1620 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1622 if( IsTriangular<MT4>::value ) {
1624 ztrmm( C, A, CblasLeft,
1625 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
1626 complex<double>( 1.0, 0.0 ) );
1628 else if( IsTriangular<MT5>::value ) {
1630 ztrmm( C, B, CblasRight,
1631 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
1632 complex<double>( 1.0, 0.0 ) );
1635 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
1655 template<
typename MT
1661 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1673 const TmpType tmp(
serial( rhs ) );
1692 template<
typename MT
1701 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1705 LT A(
serial( rhs.lhs_ ) );
1706 RT B(
serial( rhs.rhs_ ) );
1715 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Entry point for choosing the addition-assignment kernel for C += A * B.
// Forwards (C, A, B) either to selectSmallAddAssignKernel or to
// selectBlasAddAssignKernel. The visible part of the condition selects the
// small kernel whenever A or B is diagonal.
// NOTE(review): the remaining operand(s) of the `||` condition are not visible
// in this excerpt.
1731 template<
typename MT3
1734 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1736 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
1738 selectSmallAddAssignKernel( C, A, B );
1740 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition-assignment kernel C += A * B for a
// DenseMatrix<MT3,false> target, enabled only when neither MT4 nor MT5 is
// diagonal.
//
// Rows i are iterated in the outer loop and columns j in the inner loop; only
// the index ranges that can hold non-zero products (derived at compile time
// from the triangular structure traits) are visited, since nothing has to be
// written outside of them for an addition assignment. The k loop is unrolled
// by a factor of two.
// NOTE(review): the fallback branches of several conditionals and the guard in
// front of the trailing single-k update are not visible in this excerpt.
1759 template<
typename MT3
1762 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1763 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K the shared inner dimension.
1765 const size_t M( A.rows() );
1766 const size_t N( B.columns() );
1767 const size_t K( A.columns() );
// Row range [ibegin,iend) trimmed for strictly lower / strictly upper A
// (one more row trimmed if B is strict in the same direction and M > 1).
1769 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
1770 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
1772 const size_t iend( ( IsStrictlyUpper<MT4>::value )
1773 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
1777 for(
size_t i=ibegin; i<iend; ++i )
// Column range for row i: starts at (or just after) the diagonal when both
// operands are upper, ends at (or just before) it when both are lower.
1779 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1780 ?( ( IsStrictlyUpper<MT4>::value )
1781 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
1782 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
1783 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
1784 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
1785 ?( ( IsStrictlyLower<MT4>::value )
1786 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
1787 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
1788 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
1791 for(
size_t j=jbegin; j<jend; ++j )
// Inner-dimension range [kbegin,kend) for element (i,j).
1793 const size_t kbegin( ( IsUpper<MT4>::value )
1794 ?( ( IsLower<MT5>::value )
1795 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1796 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1797 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1798 :( ( IsLower<MT5>::value )
1799 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1801 const size_t kend( ( IsLower<MT4>::value )
1802 ?( ( IsUpper<MT5>::value )
1803 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1804 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1805 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1806 :( ( IsUpper<MT5>::value )
1807 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// kpos = kbegin + (knum rounded down to an even count): end of the part of the
// k range that is processed two steps at a time.
1811 const size_t knum( kend - kbegin );
1812 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1814 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1815 (~C)(i,j) += A(i,k ) * B(k ,j);
1816 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Trailing update for the leftover k index (presumably guarded by a
// kpos < kend check on a line not shown here -- confirm).
1819 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default (scalar) addition-assignment kernel C += A * B for a
// DenseMatrix<MT3,true> target, enabled only when neither MT4 nor MT5 is
// diagonal.
//
// Mirror image of the DenseMatrix<MT3,false> overload: columns j are iterated
// in the outer loop and rows i in the inner loop. Only the index ranges that
// can hold non-zero products (derived at compile time from the triangular
// structure traits) are visited. The k loop is unrolled by a factor of two.
// NOTE(review): the fallback branches of several conditionals and the guard in
// front of the trailing single-k update are not visible in this excerpt.
1841 template<
typename MT3
1844 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1845 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// M x K times K x N: M rows of A, N columns of B, K the shared inner dimension.
1847 const size_t M( A.rows() );
1848 const size_t N( B.columns() );
1849 const size_t K( A.columns() );
// Column range [jbegin,jend) trimmed for strictly upper / strictly lower B
// (one more column trimmed if A is strict in the same direction and N > 1).
1851 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
1852 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
1854 const size_t jend( ( IsStrictlyLower<MT5>::value )
1855 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
1859 for(
size_t j=jbegin; j<jend; ++j )
// Row range for column j: starts at (or just after) the diagonal when both
// operands are lower, ends at (or just before) it when both are upper.
1861 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
1862 ?( ( IsStrictlyLower<MT4>::value )
1863 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
1864 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1865 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
1866 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
1867 ?( ( IsStrictlyUpper<MT4>::value )
1868 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
1869 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
1870 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
1873 for(
size_t i=ibegin; i<iend; ++i )
// Inner-dimension range [kbegin,kend) for element (i,j).
1875 const size_t kbegin( ( IsUpper<MT4>::value )
1876 ?( ( IsLower<MT5>::value )
1877 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1878 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1879 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1880 :( ( IsLower<MT5>::value )
1881 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1883 const size_t kend( ( IsLower<MT4>::value )
1884 ?( ( IsUpper<MT5>::value )
1885 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
1886 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1887 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1888 :( ( IsUpper<MT5>::value )
1889 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// kpos = kbegin + (knum rounded down to an even count): end of the part of the
// k range that is processed two steps at a time.
1893 const size_t knum( kend - kbegin );
1894 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1896 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1897 (~C)(i,j) += A(i,k ) * B(k ,j);
1898 (~C)(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Trailing update for the leftover k index (presumably guarded by a
// kpos < kend check on a line not shown here -- confirm).
1901 (~C)(i,j) += A(i,kpos) * B(kpos,j);
// Default addition-assignment kernel C += A * B for a DenseMatrix<MT3,false>
// target when A (MT4) is not diagonal and B (MT5) is diagonal.
// Each update is a single product: C(i,j) += A(i,j) * B(j,j). The column range
// of each row is narrowed by A's triangular structure and processed two
// columns at a time.
// NOTE(review): the fallback branches of jbegin/jend and the guard in front of
// the trailing single-column update are not visible in this excerpt.
1923 template<
typename MT3
1926 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1927 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1929 const size_t M( A.rows() );
1930 const size_t N( B.columns() );
1932 for(
size_t i=0UL; i<M; ++i )
// Upper A: columns start at the diagonal (one past it if strictly upper).
1934 const size_t jbegin( ( IsUpper<MT4>::value )
1935 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// Lower A: columns end at the diagonal (one before it if strictly lower).
1937 const size_t jend( ( IsLower<MT4>::value )
1938 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin + (jnum rounded down to an even count): end of the part of the
// column range that is processed in pairs.
1942 const size_t jnum( jend - jbegin );
1943 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1945 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1946 (~C)(i,j ) += A(i,j ) * B(j ,j );
1947 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Trailing update for the leftover column (presumably guarded by a
// jpos < jend check on a line not shown here -- confirm).
1950 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition-assignment kernel C += A * B for a DenseMatrix<MT3,true>
// target when A (MT4) is not diagonal and B (MT5) is diagonal.
// Each update is a single product C(i,j) += A(i,j) * B(j,j). The iteration is
// tiled into blocks of 16 x 16 elements (jj/ii step by `block`), and within
// each tile the row range of column j is narrowed by A's triangular structure.
// NOTE(review): the fallback branches of ibegin/ipos are not visible in this
// excerpt.
1971 template<
typename MT3
1974 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1975 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1977 const size_t M( A.rows() );
1978 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
1980 const size_t block( 16UL );
1982 for(
size_t jj=0UL; jj<N; jj+=block ) {
// Clamp the tile to the matrix bounds.
1983 const size_t jend(
min( N, jj+block ) );
1984 for(
size_t ii=0UL; ii<M; ii+=block ) {
1985 const size_t iend(
min( M, ii+block ) );
1986 for(
size_t j=jj; j<jend; ++j )
// Lower A: rows start at the diagonal (one past it if strictly lower), but
// never before the tile's first row.
1988 const size_t ibegin( ( IsLower<MT4>::value )
1989 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// Upper A: rows end at the diagonal (one before it if strictly upper), but
// never beyond the tile's last row.
1991 const size_t ipos( ( IsUpper<MT4>::value )
1992 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
1995 for(
size_t i=ibegin; i<ipos; ++i ) {
1996 (~C)(i,j) += A(i,j) * B(j,j);
// Default addition-assignment kernel C += A * B for a DenseMatrix<MT3,false>
// target when A (MT4) is diagonal and B (MT5) is not.
// Each update is a single product C(i,j) += A(i,i) * B(i,j). The iteration is
// tiled into blocks of 16 x 16 elements (ii/jj step by `block`), and within
// each tile the column range of row i is narrowed by B's triangular structure.
// NOTE(review): the fallback branches of jbegin/jpos are not visible in this
// excerpt.
2019 template<
typename MT3
2022 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2023 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2025 const size_t M( A.rows() );
2026 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
2028 const size_t block( 16UL );
2030 for(
size_t ii=0UL; ii<M; ii+=block ) {
// Clamp the tile to the matrix bounds.
2031 const size_t iend(
min( M, ii+block ) );
2032 for(
size_t jj=0UL; jj<N; jj+=block ) {
2033 const size_t jend(
min( N, jj+block ) );
2034 for(
size_t i=ii; i<iend; ++i )
// Upper B: columns start at the diagonal (one past it if strictly upper), but
// never before the tile's first column.
2036 const size_t jbegin( ( IsUpper<MT5>::value )
2037 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// Lower B: columns end at the diagonal (one before it if strictly lower), but
// never beyond the tile's last column.
2039 const size_t jpos( ( IsLower<MT5>::value )
2040 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
2043 for(
size_t j=jbegin; j<jpos; ++j ) {
2044 (~C)(i,j) += A(i,i) * B(i,j);
// Default addition-assignment kernel C += A * B for a DenseMatrix<MT3,true>
// target when A (MT4) is diagonal and B (MT5) is not.
// Each update is a single product: C(i,j) += A(i,i) * B(i,j). The row range of
// each column is narrowed by B's triangular structure and processed two rows
// at a time.
// NOTE(review): the fallback branches of ibegin/iend and the guard in front of
// the trailing single-row update are not visible in this excerpt.
2067 template<
typename MT3
2070 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2071 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2073 const size_t M( A.rows() );
2074 const size_t N( B.columns() );
2076 for(
size_t j=0UL; j<N; ++j )
// Lower B: rows start at the diagonal (one past it if strictly lower).
2078 const size_t ibegin( ( IsLower<MT5>::value )
2079 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Upper B: rows end at the diagonal (one before it if strictly upper).
2081 const size_t iend( ( IsUpper<MT5>::value )
2082 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin + (inum rounded down to an even count): end of the part of the
// row range that is processed in pairs.
2086 const size_t inum( iend - ibegin );
2087 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2089 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2090 (~C)(i ,j) += A(i ,i ) * B(i ,j);
2091 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Trailing update for the leftover row (presumably guarded by an
// ipos < iend check on a line not shown here -- confirm).
2094 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default kernel for C += A * B, enabled when both MT4 (A) and MT5 (B) are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) for i in [0, A.rows()).
2115 template<
typename MT3
2118 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2119 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2121 for(
size_t i=0UL; i<A.rows(); ++i ) {
2122 C(i,i) += A(i,i) * B(i,i);
// Small-matrix kernel for C += A * B, selected when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold (DisableIf): forwards to the default (scalar) addition-assignment kernel.
2142 template<
typename MT3
2145 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2146 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2148 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix kernel for C += A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Overload for a target passed as
// DenseMatrix<MT3,false> (storage-order flag; presumably row-major -- confirm).
// C is processed in register tiles (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1); each tile accumulates
// vector products A.load(i,k) * B.load(k,j) along k in steps of IT::size and adds the
// result of sum() to the matching element of C.
// NOTE(review): the accumulators are default-constructed and never explicitly cleared here;
// this relies on IntrinsicType's default constructor yielding zero -- confirm.
2168 template<
typename MT3
2171 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2172 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2174 typedef IntrinsicTrait<ElementType> IT;
2176 const size_t M( A.rows() );
2177 const size_t N( B.columns() );
2178 const size_t K( A.columns() );
// Row pairs of C.
2182 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
2186 for( ; (j+4UL) <= N; j+=4UL )
// kbegin: first k that can contribute (i for upper A, j for lower B, max of both if both),
// masked with size_t(-IT::size), i.e. rounded down to a multiple of IT::size provided
// IT::size is a power of two.
2188 const size_t kbegin( ( IsUpper<MT4>::value )
2189 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2190 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// kend: one past the last k that can contribute (i+2 for lower A, j+4 for upper B,
// the smaller of both if both, otherwise K).
2191 const size_t kend( ( IsLower<MT4>::value )
2192 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
2193 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// One accumulator per element of the 2x4 tile.
2195 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2197 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2198 const IntrinsicType a1( A.load(i ,k) );
2199 const IntrinsicType a2( A.load(i+1UL,k) );
2200 const IntrinsicType b1( B.load(k,j ) );
2201 const IntrinsicType b2( B.load(k,j+1UL) );
2202 const IntrinsicType b3( B.load(k,j+2UL) );
2203 const IntrinsicType b4( B.load(k,j+3UL) );
2204 xmm1 = xmm1 + a1 * b1;
2205 xmm2 = xmm2 + a1 * b2;
2206 xmm3 = xmm3 + a1 * b3;
2207 xmm4 = xmm4 + a1 * b4;
2208 xmm5 = xmm5 + a2 * b1;
2209 xmm6 = xmm6 + a2 * b2;
2210 xmm7 = xmm7 + a2 * b3;
2211 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator with sum() and add it to its element of C.
2214 (~C)(i ,j ) +=
sum( xmm1 );
2215 (~C)(i ,j+1UL) +=
sum( xmm2 );
2216 (~C)(i ,j+2UL) +=
sum( xmm3 );
2217 (~C)(i ,j+3UL) +=
sum( xmm4 );
2218 (~C)(i+1UL,j ) +=
sum( xmm5 );
2219 (~C)(i+1UL,j+1UL) +=
sum( xmm6 );
2220 (~C)(i+1UL,j+2UL) +=
sum( xmm7 );
2221 (~C)(i+1UL,j+3UL) +=
sum( xmm8 );
// 2x2 tile: rows i..i+1, columns j..j+1.
2224 for( ; (j+2UL) <= N; j+=2UL )
2226 const size_t kbegin( ( IsUpper<MT4>::value )
2227 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2228 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2229 const size_t kend( ( IsLower<MT4>::value )
2230 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2231 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2233 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2235 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2236 const IntrinsicType a1( A.load(i ,k) );
2237 const IntrinsicType a2( A.load(i+1UL,k) );
2238 const IntrinsicType b1( B.load(k,j ) );
2239 const IntrinsicType b2( B.load(k,j+1UL) );
2240 xmm1 = xmm1 + a1 * b1;
2241 xmm2 = xmm2 + a1 * b2;
2242 xmm3 = xmm3 + a2 * b1;
2243 xmm4 = xmm4 + a2 * b2;
2246 (~C)(i ,j ) +=
sum( xmm1 );
2247 (~C)(i ,j+1UL) +=
sum( xmm2 );
2248 (~C)(i+1UL,j ) +=
sum( xmm3 );
2249 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// 2x1 tile: rows i..i+1 of a single column j (presumably the leftover column -- confirm the guard).
2254 const size_t kbegin( ( IsUpper<MT4>::value )
2255 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2256 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2257 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2259 IntrinsicType xmm1, xmm2;
2261 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2262 const IntrinsicType b1( B.load(k,j) );
2263 xmm1 = xmm1 + A.load(i ,k) * b1;
2264 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2267 (~C)(i ,j) +=
sum( xmm1 );
2268 (~C)(i+1UL,j) +=
sum( xmm2 );
// From here on a single row i is processed (presumably the leftover row -- confirm the guard).
// 1x4 tile: columns j..j+3.
2275 for( ; (j+4UL) <= N; j+=4UL )
2277 const size_t kbegin( ( IsUpper<MT4>::value )
2278 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2279 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2280 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
2282 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2284 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2285 const IntrinsicType a1( A.load(i,k) );
2286 xmm1 = xmm1 + a1 * B.load(k,j );
2287 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2288 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
2289 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
2292 (~C)(i,j ) +=
sum( xmm1 );
2293 (~C)(i,j+1UL) +=
sum( xmm2 );
2294 (~C)(i,j+2UL) +=
sum( xmm3 );
2295 (~C)(i,j+3UL) +=
sum( xmm4 );
// 1x2 tile: columns j..j+1.
2298 for( ; (j+2UL) <= N; j+=2UL )
2300 const size_t kbegin( ( IsUpper<MT4>::value )
2301 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2302 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2303 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2305 IntrinsicType xmm1, xmm2;
2307 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2308 const IntrinsicType a1( A.load(i,k) );
2309 xmm1 = xmm1 + a1 * B.load(k,j );
2310 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2313 (~C)(i,j ) +=
sum( xmm1 );
2314 (~C)(i,j+1UL) +=
sum( xmm2 );
// 1x1 tile: single element C(i,j); the k loop runs up to K.
2319 const size_t kbegin( ( IsUpper<MT4>::value )
2320 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2321 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2325 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
2326 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2329 (~C)(i,j) +=
sum( xmm1 );
// Vectorized small-matrix kernel for C += A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Overload for a target passed as
// DenseMatrix<MT3,true> (storage-order flag; presumably column-major -- confirm).
// C is processed in register tiles (4x2, 4x1, 2x2, 2x1, then 1x2, 1x1); each tile accumulates
// vector products A.load(i,k) * B.load(k,j) along k in steps of IT::size and adds the
// result of sum() to the matching element of C.
// NOTE(review): the accumulators are default-constructed and never explicitly cleared here;
// this relies on IntrinsicType's default constructor yielding zero -- confirm.
2351 template<
typename MT3
2354 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2355 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2357 typedef IntrinsicTrait<ElementType> IT;
2359 const size_t M( A.rows() );
2360 const size_t N( B.columns() );
2361 const size_t K( A.columns() );
// Groups of four rows of C.
2365 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
2369 for( ; (j+2UL) <= N; j+=2UL )
// kbegin: first k that can contribute (i for upper A, j for lower B, max of both if both),
// masked with size_t(-IT::size), i.e. rounded down to a multiple of IT::size provided
// IT::size is a power of two.
2371 const size_t kbegin( ( IsUpper<MT4>::value )
2372 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2373 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// kend: one past the last k that can contribute (i+4 for lower A, j+2 for upper B,
// the smaller of both if both, otherwise K).
2374 const size_t kend( ( IsLower<MT4>::value )
2375 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
2376 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// One accumulator per element of the 4x2 tile.
2378 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2380 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2381 const IntrinsicType a1( A.load(i ,k) );
2382 const IntrinsicType a2( A.load(i+1UL,k) );
2383 const IntrinsicType a3( A.load(i+2UL,k) );
2384 const IntrinsicType a4( A.load(i+3UL,k) );
2385 const IntrinsicType b1( B.load(k,j ) );
2386 const IntrinsicType b2( B.load(k,j+1UL) );
2387 xmm1 = xmm1 + a1 * b1;
2388 xmm2 = xmm2 + a1 * b2;
2389 xmm3 = xmm3 + a2 * b1;
2390 xmm4 = xmm4 + a2 * b2;
2391 xmm5 = xmm5 + a3 * b1;
2392 xmm6 = xmm6 + a3 * b2;
2393 xmm7 = xmm7 + a4 * b1;
2394 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator with sum() and add it to its element of C.
2397 (~C)(i ,j ) +=
sum( xmm1 );
2398 (~C)(i ,j+1UL) +=
sum( xmm2 );
2399 (~C)(i+1UL,j ) +=
sum( xmm3 );
2400 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
2401 (~C)(i+2UL,j ) +=
sum( xmm5 );
2402 (~C)(i+2UL,j+1UL) +=
sum( xmm6 );
2403 (~C)(i+3UL,j ) +=
sum( xmm7 );
2404 (~C)(i+3UL,j+1UL) +=
sum( xmm8 );
// 4x1 tile: rows i..i+3 of a single column j (presumably the leftover column -- confirm the guard).
2409 const size_t kbegin( ( IsUpper<MT4>::value )
2410 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2411 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2412 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
2414 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2416 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2417 const IntrinsicType b1( B.load(k,j) );
2418 xmm1 = xmm1 + A.load(i ,k) * b1;
2419 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2420 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
2421 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
2424 (~C)(i ,j) +=
sum( xmm1 );
2425 (~C)(i+1UL,j) +=
sum( xmm2 );
2426 (~C)(i+2UL,j) +=
sum( xmm3 );
2427 (~C)(i+3UL,j) +=
sum( xmm4 );
// Row pairs of C left over after the groups of four.
2431 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
2435 for( ; (j+2UL) <= N; j+=2UL )
2437 const size_t kbegin( ( IsUpper<MT4>::value )
2438 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2439 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2440 const size_t kend( ( IsLower<MT4>::value )
2441 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2442 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
2444 IntrinsicType xmm1, xmm2, xmm3, xmm4;
2446 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2447 const IntrinsicType a1( A.load(i ,k) );
2448 const IntrinsicType a2( A.load(i+1UL,k) );
2449 const IntrinsicType b1( B.load(k,j ) );
2450 const IntrinsicType b2( B.load(k,j+1UL) );
2451 xmm1 = xmm1 + a1 * b1;
2452 xmm2 = xmm2 + a1 * b2;
2453 xmm3 = xmm3 + a2 * b1;
2454 xmm4 = xmm4 + a2 * b2;
2457 (~C)(i ,j ) +=
sum( xmm1 );
2458 (~C)(i ,j+1UL) +=
sum( xmm2 );
2459 (~C)(i+1UL,j ) +=
sum( xmm3 );
2460 (~C)(i+1UL,j+1UL) +=
sum( xmm4 );
// 2x1 tile: rows i..i+1 of a single column j (presumably the leftover column -- confirm the guard).
2465 const size_t kbegin( ( IsUpper<MT4>::value )
2466 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2467 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2468 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
2470 IntrinsicType xmm1, xmm2;
2472 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2473 const IntrinsicType b1( B.load(k,j) );
2474 xmm1 = xmm1 + A.load(i ,k) * b1;
2475 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
2478 (~C)(i ,j) +=
sum( xmm1 );
2479 (~C)(i+1UL,j) +=
sum( xmm2 );
// From here on a single row i is processed (presumably the leftover row -- confirm the guard).
// 1x2 tile: columns j..j+1.
2487 for( ; (j+2UL) <= N; j+=2UL )
2489 const size_t kbegin( ( IsUpper<MT4>::value )
2490 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2491 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2492 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
2494 IntrinsicType xmm1, xmm2;
2496 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
2497 const IntrinsicType a1( A.load(i,k) );
2498 xmm1 = xmm1 + a1 * B.load(k,j );
2499 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
2502 (~C)(i,j ) +=
sum( xmm1 );
2503 (~C)(i,j+1UL) +=
sum( xmm2 );
// 1x1 tile: single element C(i,j); the k loop runs up to K.
2508 const size_t kbegin( ( IsUpper<MT4>::value )
2509 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
2510 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
2514 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
2515 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
2518 (~C)(i,j) +=
sum( xmm1 );
// Large-matrix kernel for C += A * B, selected when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold (DisableIf): forwards to the default (scalar) addition-assignment kernel.
2539 template<
typename MT3
2542 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2543 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2545 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix kernel for C += A * B when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds;
// overload for a target passed as DenseMatrix<MT3,false>.
// The visible body simply forwards to selectSmallAddAssignKernel on the unwrapped target (~C).
2565 template<
typename MT3
2568 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2569 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2572 selectSmallAddAssignKernel( ~C, A, B );
// Large-matrix kernel for C += A * B when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds;
// overload for a target passed as DenseMatrix<MT3,true>.
// The visible body simply forwards to selectSmallAddAssignKernel on the unwrapped target (~C).
2592 template<
typename MT3
2595 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2596 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2599 selectSmallAddAssignKernel( ~C, A, B );
// BLAS-dispatch fallback for C += A * B: enabled when UseDefaultKernel<MT3,MT4,MT5> holds
// and forwards to the large-matrix kernel instead of a BLAS routine.
2618 template<
typename MT3
2621 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2622 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2624 selectLargeAddAssignKernel( C, A, B );
// BLAS kernel for C += A * B, enabled by UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Uses strmm when one operand is triangular and sgemm otherwise.
// NOTE(review): `tmp` is presumably a temporary copy of the non-triangular operand that strmm
// multiplies in place and that is afterwards added to C -- confirm against the full body.
2644 template<
typename MT3
2647 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2648 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: applied from the left, with the lower/upper flag taken from IsLower<MT4>.
2650 if( IsTriangular<MT4>::value ) {
2652 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Triangular B: applied from the right, with the lower/upper flag taken from IsLower<MT5>.
2655 else if( IsTriangular<MT5>::value ) {
2657 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// General case; the two scalars are presumably alpha and beta (C = 1*A*B + 1*C) -- confirm.
2661 sgemm( C, A, B, 1.0F, 1.0F );
// BLAS kernel for C += A * B, enabled by UseDoublePrecisionKernel<MT3,MT4,MT5>.
// Uses dtrmm when one operand is triangular and dgemm otherwise.
// NOTE(review): `tmp` is presumably a temporary copy of the non-triangular operand that dtrmm
// multiplies in place and that is afterwards added to C -- confirm against the full body.
2683 template<
typename MT3
2686 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2687 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: applied from the left, with the lower/upper flag taken from IsLower<MT4>.
2689 if( IsTriangular<MT4>::value ) {
2691 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Triangular B: applied from the right, with the lower/upper flag taken from IsLower<MT5>.
2694 else if( IsTriangular<MT5>::value ) {
2696 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// General case; the two scalars are presumably alpha and beta (C = 1*A*B + 1*C) -- confirm.
2700 dgemm( C, A, B, 1.0, 1.0 );
// BLAS kernel for C += A * B, enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Uses ctrmm when one operand is triangular and cgemm otherwise; all scalars are 1+0i.
// NOTE(review): `tmp` is presumably a temporary copy of the non-triangular operand that ctrmm
// multiplies in place and that is afterwards added to C -- confirm against the full body.
2722 template<
typename MT3
2725 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2726 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: applied from the left, with the lower/upper flag taken from IsLower<MT4>.
2728 if( IsTriangular<MT4>::value ) {
2730 ctrmm( tmp, A, CblasLeft,
2731 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2732 complex<float>( 1.0F, 0.0F ) );
// Triangular B: applied from the right, with the lower/upper flag taken from IsLower<MT5>.
2735 else if( IsTriangular<MT5>::value ) {
2737 ctrmm( tmp, B, CblasRight,
2738 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2739 complex<float>( 1.0F, 0.0F ) );
// General case; the two scalars are presumably alpha and beta (C = 1*A*B + 1*C) -- confirm.
2743 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS kernel for C += A * B, enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Uses ztrmm when one operand is triangular and zgemm otherwise; all scalars are 1+0i.
// NOTE(review): `tmp` is presumably a temporary copy of the non-triangular operand that ztrmm
// multiplies in place and that is afterwards added to C -- confirm against the full body.
2765 template<
typename MT3
2768 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2769 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: applied from the left, with the lower/upper flag taken from IsLower<MT4>.
2771 if( IsTriangular<MT4>::value ) {
2773 ztrmm( tmp, A, CblasLeft,
2774 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2775 complex<double>( 1.0, 0.0 ) );
// Triangular B: applied from the right, with the lower/upper flag taken from IsLower<MT5>.
2778 else if( IsTriangular<MT5>::value ) {
2780 ztrmm( tmp, B, CblasRight,
2781 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2782 complex<double>( 1.0, 0.0 ) );
// General case; the two scalars are presumably alpha and beta (C = 1*A*B + 1*C) -- confirm.
2786 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Entry point that subtracts the product expression `rhs` from the dense target `lhs`
// by dispatching to selectSubAssignKernel.
// NOTE(review): presumably the subAssign() overload of this expression -- confirm the signature.
2810 template<
typename MT
// Degenerate sizes (empty target, or zero inner dimension of the product) are special-cased.
2819 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Materialize both operands through serial() into the local operand types LT and RT.
2823 LT A(
serial( rhs.lhs_ ) );
2824 RT B(
serial( rhs.rhs_ ) );
2833 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the kernel for C -= A * B.
// A diagonal operand (or a further condition OR-ed onto it) selects the small-matrix kernel;
// selectBlasSubAssignKernel is the alternative path.
2849 template<
typename MT3
2852 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2854 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
2856 selectSmallSubAssignKernel( C, A, B );
2858 selectBlasSubAssignKernel( C, A, B );
// Default kernel for C -= A * B, enabled when neither MT4 (A) nor MT5 (B) is diagonal.
// Overload for a target passed as DenseMatrix<MT3,false> (storage-order flag; presumably
// row-major -- confirm). Iterates i (rows), j (columns), k (inner index) and restricts each
// range according to the lower/upper/strictly-triangular traits of A and B.
2877 template<
typename MT3
2880 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2881 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2883 const size_t M( A.rows() );
2884 const size_t N( B.columns() );
2885 const size_t K( A.columns() );
// Strictly lower A: skip the first row (the first two if B is strictly lower as well and M > 1).
2887 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
2888 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
// Strictly upper A: skip the last row (the last two if B is strictly upper as well and M > 1).
2890 const size_t iend( ( IsStrictlyUpper<MT4>::value )
2891 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
2895 for(
size_t i=ibegin; i<iend; ++i )
// Both operands upper: columns start at the diagonal, shifted right by one for each strictly
// upper operand. Otherwise only a strictly upper B removes column 0.
2897 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2898 ?( ( IsStrictlyUpper<MT4>::value )
2899 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
2900 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
2901 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
// Both operands lower: columns end at the diagonal, shifted left by one for each strictly
// lower operand. Otherwise only a strictly lower B removes the last column.
2902 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
2903 ?( ( IsStrictlyLower<MT4>::value )
2904 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
2905 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
2906 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
2909 for(
size_t j=jbegin; j<jend; ++j )
// kbegin: start of the inner range, bounded below by row i of an upper A and/or
// column j of a lower B (one further for the strict variants).
2911 const size_t kbegin( ( IsUpper<MT4>::value )
2912 ?( ( IsLower<MT5>::value )
2913 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2914 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2915 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2916 :( ( IsLower<MT5>::value )
2917 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// kend: end of the inner range, bounded above by row i of a lower A and/or
// column j of an upper B (one less for the strict variants).
2919 const size_t kend( ( IsLower<MT4>::value )
2920 ?( ( IsUpper<MT5>::value )
2921 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
2922 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2923 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2924 :( ( IsUpper<MT5>::value )
2925 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, so [kbegin,kpos) holds an even number of terms.
2929 const size_t knum( kend - kbegin );
2930 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two inner-product terms per iteration.
2932 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2933 (~C)(i,j) -= A(i,k ) * B(k ,j);
2934 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably executed only when knum is odd -- confirm the guard).
2937 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default kernel for C -= A * B, enabled when neither MT4 (A) nor MT5 (B) is diagonal.
// Overload for a target passed as DenseMatrix<MT3,true> (storage-order flag; presumably
// column-major -- confirm). Iterates j (columns), i (rows), k (inner index) and restricts each
// range according to the lower/upper/strictly-triangular traits of A and B.
2959 template<
typename MT3
2962 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2963 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2965 const size_t M( A.rows() );
2966 const size_t N( B.columns() );
2967 const size_t K( A.columns() );
// Strictly upper B: skip the first column (the first two if A is strictly upper as well and N > 1).
2969 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
2970 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
// Strictly lower B: skip the last column (the last two if A is strictly lower as well and N > 1).
2972 const size_t jend( ( IsStrictlyLower<MT5>::value )
2973 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
2977 for(
size_t j=jbegin; j<jend; ++j )
// Both operands lower: rows start at the diagonal, shifted down by one for each strictly
// lower operand. Otherwise only a strictly lower A removes row 0.
2979 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
2980 ?( ( IsStrictlyLower<MT4>::value )
2981 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
2982 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2983 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
// Both operands upper: rows end at the diagonal, shifted up by one for each strictly
// upper operand. Otherwise only a strictly upper A removes the last row.
2984 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
2985 ?( ( IsStrictlyUpper<MT4>::value )
2986 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
2987 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
2988 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
2991 for(
size_t i=ibegin; i<iend; ++i )
// kbegin: start of the inner range, bounded below by row i of an upper A and/or
// column j of a lower B (one further for the strict variants).
2993 const size_t kbegin( ( IsUpper<MT4>::value )
2994 ?( ( IsLower<MT5>::value )
2995 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2996 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2997 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2998 :( ( IsLower<MT5>::value )
2999 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// kend: end of the inner range, bounded above by row i of a lower A and/or
// column j of an upper B (one less for the strict variants).
3001 const size_t kend( ( IsLower<MT4>::value )
3002 ?( ( IsUpper<MT5>::value )
3003 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
3004 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3005 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
3006 :( ( IsUpper<MT5>::value )
3007 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, so [kbegin,kpos) holds an even number of terms.
3011 const size_t knum( kend - kbegin );
3012 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two inner-product terms per iteration.
3014 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3015 (~C)(i,j) -= A(i,k ) * B(k ,j);
3016 (~C)(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at kpos (presumably executed only when knum is odd -- confirm the guard).
3019 (~C)(i,j) -= A(i,kpos) * B(kpos,j);
// Default kernel for C -= A * B, enabled when MT4 (A) is non-diagonal and MT5 (B) is diagonal.
// Overload for a target passed as DenseMatrix<MT3,false> (storage-order flag; presumably
// row-major -- confirm). Traverses C row by row: C(i,j) -= A(i,j) * B(j,j).
3041 template<
typename MT3
3044 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3045 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3047 const size_t M( A.rows() );
3048 const size_t N( B.columns() );
3050 for(
size_t i=0UL; i<M; ++i )
// Upper A: the column range of row i starts at the diagonal (i+1 if strictly upper).
3052 const size_t jbegin( ( IsUpper<MT4>::value )
3053 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// Lower A: the column range of row i ends at the diagonal (excluded if strictly lower).
3055 const size_t jend( ( IsLower<MT4>::value )
3056 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even number of columns.
3060 const size_t jnum( jend - jbegin );
3061 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
3063 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3064 (~C)(i,j ) -= A(i,j ) * B(j ,j );
3065 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at jpos (presumably executed only when jnum is odd -- confirm the guard).
3068 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default kernel for C -= A * B, enabled when MT4 (A) is non-diagonal and MT5 (B) is diagonal.
// Overload for a target passed as DenseMatrix<MT3,true> (storage-order flag; presumably
// column-major -- confirm). Only B's diagonal is read: C(i,j) -= A(i,j) * B(j,j).
3089 template<
typename MT3
3092 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
3093 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3095 const size_t M( A.rows() );
3096 const size_t N( B.columns() );
// Edge length of the square tiles in which C is traversed.
3098 const size_t block( 16UL );
// jj/ii step over tiles; jend/iend clamp the last tile to the matrix bounds.
3100 for(
size_t jj=0UL; jj<N; jj+=block ) {
3101 const size_t jend(
min( N, jj+block ) );
3102 for(
size_t ii=0UL; ii<M; ii+=block ) {
3103 const size_t iend(
min( M, ii+block ) );
3104 for(
size_t j=jj; j<jend; ++j )
// Lower A: rows start at the diagonal (j+1 if strictly lower), but not before the tile start ii.
3106 const size_t ibegin( ( IsLower<MT4>::value )
3107 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
// Upper A: rows stop at the diagonal (excluded if strictly upper), but not past the tile end iend.
3109 const size_t ipos( ( IsUpper<MT4>::value )
3110 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
3113 for(
size_t i=ibegin; i<ipos; ++i ) {
3114 (~C)(i,j) -= A(i,j) * B(j,j);
// Default kernel for C -= A * B, enabled when MT4 (A) is diagonal and MT5 (B) is non-diagonal.
// Overload for a target passed as DenseMatrix<MT3,false> (storage-order flag; presumably
// row-major -- confirm). Only A's diagonal is read: C(i,j) -= A(i,i) * B(i,j).
3137 template<
typename MT3
3140 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3141 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3143 const size_t M( A.rows() );
3144 const size_t N( B.columns() );
// Edge length of the square tiles in which C is traversed.
3146 const size_t block( 16UL );
// ii/jj step over tiles; iend/jend clamp the last tile to the matrix bounds.
3148 for(
size_t ii=0UL; ii<M; ii+=block ) {
3149 const size_t iend(
min( M, ii+block ) );
3150 for(
size_t jj=0UL; jj<N; jj+=block ) {
3151 const size_t jend(
min( N, jj+block ) );
3152 for(
size_t i=ii; i<iend; ++i )
// Upper B: columns start at the diagonal (i+1 if strictly upper), but not before the tile start jj.
3154 const size_t jbegin( ( IsUpper<MT5>::value )
3155 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
// Lower B: columns stop at the diagonal (excluded if strictly lower), but not past the tile end jend.
3157 const size_t jpos( ( IsLower<MT5>::value )
3158 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
3161 for(
size_t j=jbegin; j<jpos; ++j ) {
3162 (~C)(i,j) -= A(i,i) * B(i,j);
// Default kernel for C -= A * B, enabled when MT4 (A) is diagonal and MT5 (B) is non-diagonal.
// Overload for a target passed as DenseMatrix<MT3,true> (storage-order flag; presumably
// column-major -- confirm). Traverses C column by column: C(i,j) -= A(i,i) * B(i,j).
3185 template<
typename MT3
3188 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
3189 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3191 const size_t M( A.rows() );
3192 const size_t N( B.columns() );
3194 for(
size_t j=0UL; j<N; ++j )
// Lower B: the row range of column j starts at the diagonal (j+1 if strictly lower).
3196 const size_t ibegin( ( IsLower<MT5>::value )
3197 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Upper B: the row range of column j ends at the diagonal (excluded if strictly upper).
3199 const size_t iend( ( IsUpper<MT5>::value )
3200 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even number of rows.
3204 const size_t inum( iend - ibegin );
3205 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3207 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3208 (~C)(i ,j) -= A(i ,i ) * B(i ,j);
3209 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at ipos (presumably executed only when inum is odd -- confirm the guard).
3212 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default kernel for C -= A * B, enabled when both MT4 (A) and MT5 (B) are diagonal.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for i in [0, A.rows()).
3233 template<
typename MT3
3236 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
3237 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3239 for(
size_t i=0UL; i<A.rows(); ++i ) {
3240 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix kernel for C -= A * B, selected when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold (DisableIf): forwards to the default (scalar) subtraction-assignment kernel,
// passing the target through operator~.
3260 template<
typename MT3
3263 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3264 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3266 selectDefaultSubAssignKernel( ~C, A, B );
// Vectorized small-matrix kernel for C -= A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Overload for a target passed as
// DenseMatrix<MT3,false> (storage-order flag; presumably row-major -- confirm).
// C is processed in register tiles (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1); each tile accumulates
// vector products A.load(i,k) * B.load(k,j) along k in steps of IT::size and subtracts the
// result of sum() from the matching element of C.
// NOTE(review): the accumulators are default-constructed and never explicitly cleared here;
// this relies on IntrinsicType's default constructor yielding zero -- confirm.
3286 template<
typename MT3
3289 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3290 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3292 typedef IntrinsicTrait<ElementType> IT;
3294 const size_t M( A.rows() );
3295 const size_t N( B.columns() );
3296 const size_t K( A.columns() );
// Row pairs of C.
3300 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile: rows i..i+1, columns j..j+3.
3304 for( ; (j+4UL) <= N; j+=4UL )
// kbegin: first k that can contribute (i for upper A, j for lower B, max of both if both),
// masked with size_t(-IT::size), i.e. rounded down to a multiple of IT::size provided
// IT::size is a power of two.
3306 const size_t kbegin( ( IsUpper<MT4>::value )
3307 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3308 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// kend: one past the last k that can contribute (i+2 for lower A, j+4 for upper B,
// the smaller of both if both, otherwise K).
3309 const size_t kend( ( IsLower<MT4>::value )
3310 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3311 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// One accumulator per element of the 2x4 tile.
3313 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3315 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3316 const IntrinsicType a1( A.load(i ,k) );
3317 const IntrinsicType a2( A.load(i+1UL,k) );
3318 const IntrinsicType b1( B.load(k,j ) );
3319 const IntrinsicType b2( B.load(k,j+1UL) );
3320 const IntrinsicType b3( B.load(k,j+2UL) );
3321 const IntrinsicType b4( B.load(k,j+3UL) );
3322 xmm1 = xmm1 + a1 * b1;
3323 xmm2 = xmm2 + a1 * b2;
3324 xmm3 = xmm3 + a1 * b3;
3325 xmm4 = xmm4 + a1 * b4;
3326 xmm5 = xmm5 + a2 * b1;
3327 xmm6 = xmm6 + a2 * b2;
3328 xmm7 = xmm7 + a2 * b3;
3329 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator with sum() and subtract it from its element of C.
3332 (~C)(i ,j ) -=
sum( xmm1 );
3333 (~C)(i ,j+1UL) -=
sum( xmm2 );
3334 (~C)(i ,j+2UL) -=
sum( xmm3 );
3335 (~C)(i ,j+3UL) -=
sum( xmm4 );
3336 (~C)(i+1UL,j ) -=
sum( xmm5 );
3337 (~C)(i+1UL,j+1UL) -=
sum( xmm6 );
3338 (~C)(i+1UL,j+2UL) -=
sum( xmm7 );
3339 (~C)(i+1UL,j+3UL) -=
sum( xmm8 );
// 2x2 tile: rows i..i+1, columns j..j+1.
3342 for( ; (j+2UL) <= N; j+=2UL )
3344 const size_t kbegin( ( IsUpper<MT4>::value )
3345 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3346 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3347 const size_t kend( ( IsLower<MT4>::value )
3348 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3349 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3351 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3353 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3354 const IntrinsicType a1( A.load(i ,k) );
3355 const IntrinsicType a2( A.load(i+1UL,k) );
3356 const IntrinsicType b1( B.load(k,j ) );
3357 const IntrinsicType b2( B.load(k,j+1UL) );
3358 xmm1 = xmm1 + a1 * b1;
3359 xmm2 = xmm2 + a1 * b2;
3360 xmm3 = xmm3 + a2 * b1;
3361 xmm4 = xmm4 + a2 * b2;
3364 (~C)(i ,j ) -=
sum( xmm1 );
3365 (~C)(i ,j+1UL) -=
sum( xmm2 );
3366 (~C)(i+1UL,j ) -=
sum( xmm3 );
3367 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// 2x1 tile: rows i..i+1 of a single column j (presumably the leftover column -- confirm the guard).
3372 const size_t kbegin( ( IsUpper<MT4>::value )
3373 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3374 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3375 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3377 IntrinsicType xmm1, xmm2;
3379 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3380 const IntrinsicType b1( B.load(k,j) );
3381 xmm1 = xmm1 + A.load(i ,k) * b1;
3382 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3385 (~C)(i ,j) -=
sum( xmm1 );
3386 (~C)(i+1UL,j) -=
sum( xmm2 );
// From here on a single row i is processed (presumably the leftover row -- confirm the guard).
// 1x4 tile: columns j..j+3.
3394 for( ; (j+4UL) <= N; j+=4UL )
3396 const size_t kbegin( ( IsUpper<MT4>::value )
3397 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3398 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3399 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
3401 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3403 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3404 const IntrinsicType a1( A.load(i,k) );
3405 xmm1 = xmm1 + a1 * B.load(k,j );
3406 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3407 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
3408 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
3411 (~C)(i,j ) -=
sum( xmm1 );
3412 (~C)(i,j+1UL) -=
sum( xmm2 );
3413 (~C)(i,j+2UL) -=
sum( xmm3 );
3414 (~C)(i,j+3UL) -=
sum( xmm4 );
// 1x2 tile: columns j..j+1.
3417 for( ; (j+2UL) <= N; j+=2UL )
3419 const size_t kbegin( ( IsUpper<MT4>::value )
3420 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3421 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3422 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3424 IntrinsicType xmm1, xmm2;
3426 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3427 const IntrinsicType a1( A.load(i,k) );
3428 xmm1 = xmm1 + a1 * B.load(k,j );
3429 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3432 (~C)(i,j ) -=
sum( xmm1 );
3433 (~C)(i,j+1UL) -=
sum( xmm2 );
// 1x1 tile: single element C(i,j); the k loop runs up to K.
3438 const size_t kbegin( ( IsUpper<MT4>::value )
3439 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3440 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3444 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
3445 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3448 (~C)(i,j) -=
sum( xmm1 );
// Vectorized small-matrix kernel for C -= A * B, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. Overload for a target passed as
// DenseMatrix<MT3,true> (storage-order flag; presumably column-major -- confirm).
// C is processed in register tiles (4x2, 4x1, 2x2, 2x1, then 1x2, 1x1); each tile accumulates
// vector products A.load(i,k) * B.load(k,j) along k in steps of IT::size and subtracts the
// result of sum() from the matching element of C.
// NOTE(review): the accumulators are default-constructed and never explicitly cleared here;
// this relies on IntrinsicType's default constructor yielding zero -- confirm.
3470 template<
typename MT3
3473 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3474 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3476 typedef IntrinsicTrait<ElementType> IT;
3478 const size_t M( A.rows() );
3479 const size_t N( B.columns() );
3480 const size_t K( A.columns() );
// Groups of four rows of C.
3484 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile: rows i..i+3, columns j..j+1.
3488 for( ; (j+2UL) <= N; j+=2UL )
// kbegin: first k that can contribute (i for upper A, j for lower B, max of both if both),
// masked with size_t(-IT::size), i.e. rounded down to a multiple of IT::size provided
// IT::size is a power of two.
3490 const size_t kbegin( ( IsUpper<MT4>::value )
3491 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3492 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
// kend: one past the last k that can contribute (i+4 for lower A, j+2 for upper B,
// the smaller of both if both, otherwise K).
3493 const size_t kend( ( IsLower<MT4>::value )
3494 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3495 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// One accumulator per element of the 4x2 tile.
3497 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3499 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3500 const IntrinsicType a1( A.load(i ,k) );
3501 const IntrinsicType a2( A.load(i+1UL,k) );
3502 const IntrinsicType a3( A.load(i+2UL,k) );
3503 const IntrinsicType a4( A.load(i+3UL,k) );
3504 const IntrinsicType b1( B.load(k,j ) );
3505 const IntrinsicType b2( B.load(k,j+1UL) );
3506 xmm1 = xmm1 + a1 * b1;
3507 xmm2 = xmm2 + a1 * b2;
3508 xmm3 = xmm3 + a2 * b1;
3509 xmm4 = xmm4 + a2 * b2;
3510 xmm5 = xmm5 + a3 * b1;
3511 xmm6 = xmm6 + a3 * b2;
3512 xmm7 = xmm7 + a4 * b1;
3513 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator with sum() and subtract it from its element of C.
3516 (~C)(i ,j ) -=
sum( xmm1 );
3517 (~C)(i ,j+1UL) -=
sum( xmm2 );
3518 (~C)(i+1UL,j ) -=
sum( xmm3 );
3519 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
3520 (~C)(i+2UL,j ) -=
sum( xmm5 );
3521 (~C)(i+2UL,j+1UL) -=
sum( xmm6 );
3522 (~C)(i+3UL,j ) -=
sum( xmm7 );
3523 (~C)(i+3UL,j+1UL) -=
sum( xmm8 );
// 4x1 tile: rows i..i+3 of a single column j (presumably the leftover column -- confirm the guard).
3528 const size_t kbegin( ( IsUpper<MT4>::value )
3529 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3530 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3531 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
3533 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3535 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3536 const IntrinsicType b1( B.load(k,j) );
3537 xmm1 = xmm1 + A.load(i ,k) * b1;
3538 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3539 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
3540 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
3543 (~C)(i ,j) -=
sum( xmm1 );
3544 (~C)(i+1UL,j) -=
sum( xmm2 );
3545 (~C)(i+2UL,j) -=
sum( xmm3 );
3546 (~C)(i+3UL,j) -=
sum( xmm4 );
// Row pairs of C left over after the groups of four.
3550 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile: rows i..i+1, columns j..j+1.
3554 for( ; (j+2UL) <= N; j+=2UL )
3556 const size_t kbegin( ( IsUpper<MT4>::value )
3557 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3558 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3559 const size_t kend( ( IsLower<MT4>::value )
3560 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3561 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
3563 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3565 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3566 const IntrinsicType a1( A.load(i ,k) );
3567 const IntrinsicType a2( A.load(i+1UL,k) );
3568 const IntrinsicType b1( B.load(k,j ) );
3569 const IntrinsicType b2( B.load(k,j+1UL) );
3570 xmm1 = xmm1 + a1 * b1;
3571 xmm2 = xmm2 + a1 * b2;
3572 xmm3 = xmm3 + a2 * b1;
3573 xmm4 = xmm4 + a2 * b2;
3576 (~C)(i ,j ) -=
sum( xmm1 );
3577 (~C)(i ,j+1UL) -=
sum( xmm2 );
3578 (~C)(i+1UL,j ) -=
sum( xmm3 );
3579 (~C)(i+1UL,j+1UL) -=
sum( xmm4 );
// 2x1 tile: rows i..i+1 of a single column j (presumably the leftover column -- confirm the guard).
3584 const size_t kbegin( ( IsUpper<MT4>::value )
3585 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3586 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3587 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
3589 IntrinsicType xmm1, xmm2;
3591 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3592 const IntrinsicType b1( B.load(k,j) );
3593 xmm1 = xmm1 + A.load(i ,k) * b1;
3594 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
3597 (~C)(i ,j) -=
sum( xmm1 );
3598 (~C)(i+1UL,j) -=
sum( xmm2 );
// From here on a single row i is processed (presumably the leftover row -- confirm the guard).
// 1x2 tile: columns j..j+1.
3605 for( ; (j+2UL) <= N; j+=2UL )
3607 const size_t kbegin( ( IsUpper<MT4>::value )
3608 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3609 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3610 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
3612 IntrinsicType xmm1, xmm2;
3614 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
3615 const IntrinsicType a1( A.load(i,k) );
3616 xmm1 = xmm1 + a1 * B.load(k,j );
3617 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
3620 (~C)(i,j ) -=
sum( xmm1 );
3621 (~C)(i,j+1UL) -=
sum( xmm2 );
// 1x1 tile: single element C(i,j); the k loop runs up to K.
3626 const size_t kbegin( ( IsUpper<MT4>::value )
3627 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
3628 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
3632 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
3633 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
3636 (~C)(i,j) -=
sum( xmm1 );
// Large-matrix kernel for C -= A * B, selected when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold (DisableIf): forwards to the default (scalar) subtraction-assignment kernel,
// passing the target through operator~.
3657 template<
typename MT3
3660 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3661 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3663 selectDefaultSubAssignKernel( ~C, A, B );
// Large-matrix subtraction assignment kernel for DenseMatrix<MT3,false> targets,
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The visible body delegates to selectSmallSubAssignKernel.
// NOTE(review): 'false' is presumably the row-major storage order flag - confirm.
3683 template<
typename MT3
3686 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3687 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3690 selectSmallSubAssignKernel( ~C, A, B );
// Large-matrix subtraction assignment kernel for DenseMatrix<MT3,true> targets,
// enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The visible body delegates to selectSmallSubAssignKernel.
// NOTE(review): 'true' is presumably the column-major storage order flag - confirm.
3710 template<
typename MT3
3713 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3714 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3717 selectSmallSubAssignKernel( ~C, A, B );
// BLAS-level subtraction assignment kernel, default overload.
// Enabled when UseDefaultKernel<MT3,MT4,MT5> holds; it forwards unchanged to
// selectLargeSubAssignKernel instead of calling a BLAS routine.
3736 template<
typename MT3
3739 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
3740 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3742 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction assignment kernel for single precision operands
// (enabled by UseSinglePrecisionKernel<MT3,MT4,MT5>).
//  - A triangular: strmm is applied to a temporary 'tmp' with A on the left side.
//  - B triangular: strmm is applied to 'tmp' with B on the right side.
//  - otherwise:    sgemm is called on C, A, B with the factors -1.0F and 1.0F.
// NOTE(review): the declaration of 'tmp' and the statement that combines it with C
// are on lines not shown here; the sgemm factors are presumably alpha/beta, which
// would make the call compute C -= A*B - confirm against the sgemm wrapper.
3762 template<
typename MT3
3765 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
3766 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3768 if( IsTriangular<MT4>::value ) {
3770 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
3773 else if( IsTriangular<MT5>::value ) {
3775 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
3779 sgemm( C, A, B, -1.0F, 1.0F );
// BLAS-based subtraction assignment kernel for double precision operands
// (enabled by UseDoublePrecisionKernel<MT3,MT4,MT5>).
//  - A triangular: dtrmm is applied to a temporary 'tmp' with A on the left side.
//  - B triangular: dtrmm is applied to 'tmp' with B on the right side.
//  - otherwise:    dgemm is called on C, A, B with the factors -1.0 and 1.0.
// NOTE(review): the declaration of 'tmp' and the statement that combines it with C
// are on lines not shown here; the dgemm factors are presumably alpha/beta - confirm.
3801 template<
typename MT3
3804 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
3805 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3807 if( IsTriangular<MT4>::value ) {
3809 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
3812 else if( IsTriangular<MT5>::value ) {
3814 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
3818 dgemm( C, A, B, -1.0, 1.0 );
// BLAS-based subtraction assignment kernel for single precision complex operands
// (enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
//  - A triangular: ctrmm is applied to a temporary 'tmp' with A on the left side.
//  - B triangular: ctrmm is applied to 'tmp' with B on the right side.
//  - otherwise:    cgemm is called on C, A, B with the factors (-1,0) and (1,0).
// All scaling factors are passed as complex<float>.
// NOTE(review): the declaration of 'tmp' and the statement that combines it with C
// are on lines not shown here; the cgemm factors are presumably alpha/beta - confirm.
3840 template<
typename MT3
3843 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3844 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3846 if( IsTriangular<MT4>::value ) {
3848 ctrmm( tmp, A, CblasLeft,
3849 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3850 complex<float>( 1.0F, 0.0F ) );
3853 else if( IsTriangular<MT5>::value ) {
3855 ctrmm( tmp, B, CblasRight,
3856 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3857 complex<float>( 1.0F, 0.0F ) );
3861 cgemm( C, A, B, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based subtraction assignment kernel for double precision complex operands
// (enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
//  - A triangular: ztrmm is applied to a temporary 'tmp' with A on the left side.
//  - B triangular: ztrmm is applied to 'tmp' with B on the right side.
//  - otherwise:    zgemm is called on C, A, B with the factors (-1,0) and (1,0).
// All scaling factors are passed as complex<double>, matching the element type this
// kernel is selected for and the zgemm call below (the ztrmm factors were previously
// spelled complex<float>, relying on an implicit widening conversion).
// NOTE(review): the declaration of 'tmp' and the statement that combines it with C
// are on lines not shown here; the zgemm factors are presumably alpha/beta - confirm.
3883 template<
typename MT3
3886 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3887 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3889 if( IsTriangular<MT4>::value ) {
3891 ztrmm( tmp, A, CblasLeft,
3892 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3893 complex<double>( 1.0, 0.0 ) );
3896 else if( IsTriangular<MT5>::value ) {
3898 ztrmm( tmp, B, CblasRight,
3899 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3900 complex<double>( 1.0, 0.0 ) );
3904 zgemm( C, A, B, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Four friend function templates, each enabled only when
// IsEvaluationRequired<MT,MT1,MT2> holds. Their names and parameter lists are on
// lines that are not part of this excerpt, so only the visible statements are
// described below.

// Fragment 1: takes one branch when the target has zero rows or zero columns and a
// separate branch when the left operand of rhs has zero columns.
3938 template<
typename MT
3940 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3948 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3951 else if( rhs.lhs_.columns() == 0UL ) {
// Fragment 2: selects OppositeType or ResultType via SelectType on SO as the
// temporary type and materializes rhs into a const temporary of that type.
3986 template<
typename MT
3988 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3993 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
4005 const TmpType tmp( rhs );
// Fragment 3: a single guard covering an empty target (zero rows or columns) and a
// left operand with zero columns.
4027 template<
typename MT
4029 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4037 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fragment 4: same guard as fragment 3.
4076 template<
typename MT
4078 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4086 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4146 template<
typename MT1
4150 :
public DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2>, ST, false >, false >
4151 ,
private MatScalarMultExpr
4152 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
4156 typedef DMatTDMatMultExpr<MT1,MT2> MMM;
// Compile-time flag: set when MT1 is a computation or requires evaluation.
4168 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Compile-time flag: set when MT2 is a computation or requires evaluation.
4173 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper trait whose value is set when either operand must be evaluated
// (evaluateLeft || evaluateRight). The visible expression does not use the
// template parameters T1, T2 and T3.
4181 template<
typename T1,
typename T2,
typename T3 >
4182 struct IsEvaluationRequired {
4183 enum { value = ( evaluateLeft || evaluateRight ) };
// Helper trait used to select the single precision BLAS kernel.
// Visible conditions (all joined by &&): T1 offers mutable data access, T2 and T3
// offer const data access, neither T2 nor T3 is diagonal, all three types are
// vectorizable, all three element types are float, and T4 is not complex.
// NOTE(review): the line that opens the enum (and any leading condition on it) is
// not part of this excerpt.
4192 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4193 struct UseSinglePrecisionKernel {
4195 HasMutableDataAccess<T1>::value &&
4196 HasConstDataAccess<T2>::value &&
4197 HasConstDataAccess<T3>::value &&
4198 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4199 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4200 IsFloat<typename T1::ElementType>::value &&
4201 IsFloat<typename T2::ElementType>::value &&
4202 IsFloat<typename T3::ElementType>::value &&
4203 !IsComplex<T4>::value };
// Helper trait used to select the double precision BLAS kernel.
// Visible conditions (all joined by &&): T1 offers mutable data access, T2 and T3
// offer const data access, neither T2 nor T3 is diagonal, all three types are
// vectorizable, all three element types are double, and T4 is not complex.
// NOTE(review): the line that opens the enum (and any leading condition on it) is
// not part of this excerpt.
4212 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4213 struct UseDoublePrecisionKernel {
4215 HasMutableDataAccess<T1>::value &&
4216 HasConstDataAccess<T2>::value &&
4217 HasConstDataAccess<T3>::value &&
4218 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4219 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4220 IsDouble<typename T1::ElementType>::value &&
4221 IsDouble<typename T2::ElementType>::value &&
4222 IsDouble<typename T3::ElementType>::value &&
4223 !IsComplex<T4>::value };
// Helper trait used to select the single precision complex BLAS kernel.
// Visible conditions (all joined by &&): T1 offers mutable data access, T2 and T3
// offer const data access, neither T2 nor T3 is diagonal, all three types are
// vectorizable, and all three element types are exactly complex<float>.
// NOTE(review): the line that opens the enum (and any leading condition on it) is
// not part of this excerpt.
4232 template<
typename T1,
typename T2,
typename T3 >
4233 struct UseSinglePrecisionComplexKernel {
4234 typedef complex<float> Type;
4236 HasMutableDataAccess<T1>::value &&
4237 HasConstDataAccess<T2>::value &&
4238 HasConstDataAccess<T3>::value &&
4239 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4240 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4241 IsSame<typename T1::ElementType,Type>::value &&
4242 IsSame<typename T2::ElementType,Type>::value &&
4243 IsSame<typename T3::ElementType,Type>::value };
// Helper trait used to select the double precision complex BLAS kernel.
// Visible conditions (all joined by &&): T1 offers mutable data access, T2 and T3
// offer const data access, neither T2 nor T3 is diagonal, all three types are
// vectorizable, and all three element types are exactly complex<double>.
// NOTE(review): the line that opens the enum (and any leading condition on it) is
// not part of this excerpt.
4252 template<
typename T1,
typename T2,
typename T3 >
4253 struct UseDoublePrecisionComplexKernel {
4254 typedef complex<double> Type;
4256 HasMutableDataAccess<T1>::value &&
4257 HasConstDataAccess<T2>::value &&
4258 HasConstDataAccess<T3>::value &&
4259 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4260 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4261 IsSame<typename T1::ElementType,Type>::value &&
4262 IsSame<typename T2::ElementType,Type>::value &&
4263 IsSame<typename T3::ElementType,Type>::value };
// Helper trait that selects the non-BLAS default kernel.
// Its value is set when BLAZE_BLAS_MODE is off, or when none of the four BLAS
// selection traits (single, double, single complex, double complex) applies.
4271 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4272 struct UseDefaultKernel {
4273 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
4274 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
4275 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
4276 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Helper trait that selects the vectorized default kernel.
// Requires: neither T2 nor T3 is diagonal, all three matrix types are vectorizable,
// the element types of T1, T2, T3 and the scalar type T4 are all the same type, and
// IntrinsicTrait of that element type reports both addition and multiplication.
4284 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4285 struct UseVectorizedDefaultKernel {
4286 enum { value = !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4287 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4288 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4289 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4290 IsSame<typename T1::ElementType,T4>::value &&
4291 IntrinsicTrait<typename T1::ElementType>::addition &&
4292 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type aliases of the scaled multiplication expression.
// NOTE(review): RES, RT1, RT2, CT1, CT2, ET1, ET2 and ElementType are declared on
// lines that are not part of this excerpt.
// Type of this expression instance.
4298 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type, obtained from MultTrait of RES and the scalar type ST.
4299 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType through IntrinsicTrait.
4303 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped matrix multiplication expression (held as const).
4308 typedef const DMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left operand inside the kernels: const RT1 when the operand
// must be evaluated (evaluateLeft), CT1 otherwise.
4314 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Type used for the right operand inside the kernels: const RT2 when the operand
// must be evaluated (evaluateRight), CT2 otherwise.
4317 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Compile-time flag: set when neither operand is diagonal, both operand types are
// vectorizable, ET1, ET2 and ST are the same type, and IntrinsicTrait<ET1> reports
// addition and multiplication.
4322 enum { vectorizable = !IsDiagonal<MT1>::value && !IsDiagonal<MT2>::value &&
4323 MT1::vectorizable && MT2::vectorizable &&
4324 IsSame<ET1,ET2>::value &&
4325 IsSame<ET1,ST>::value &&
4326 IntrinsicTrait<ET1>::addition &&
4327 IntrinsicTrait<ET1>::multiplication };
// Compile-time flag: set only when neither operand needs evaluation and both
// operand types are themselves smpAssignable.
4330 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4331 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix multiplication expression and the scalar factor.
// NOTE(review): the initializer list and body are not part of this excerpt.
4340 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped multiplication expression
// multiplied by the stored scalar.
// NOTE(review): any index checks between the signature and the return statement are
// on lines not shown here.
4353 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4356 return matrix_(i,j) * scalar_;
// Returns the number of rows, taken from the wrapped multiplication expression.
4365 inline size_t rows()
const {
4366 return matrix_.rows();
// Returns the number of columns, taken from the wrapped multiplication expression.
4375 inline size_t columns()
const {
4376 return matrix_.columns();
// Forwards the canAlias query for the given address to the wrapped multiplication
// expression and returns its answer.
4406 template<
typename T >
4407 inline bool canAlias(
const T* alias )
const {
4408 return matrix_.canAlias( alias );
// Forwards the isAliased query for the given address to the wrapped multiplication
// expression and returns its answer.
4418 template<
typename T >
4419 inline bool isAliased(
const T* alias )
const {
4420 return matrix_.isAliased( alias );
// Fragment: forwards the alignment query to the wrapped multiplication expression
// (the enclosing signature is not part of this excerpt).
4430 return matrix_.isAligned();
// Fragment: binds the left operand of the wrapped multiplication expression to a
// local named A (the enclosing function is not part of this excerpt).
4440 typename MMM::LeftOperand A( matrix_.leftOperand() );
// Data members: the wrapped multiplication expression and the scalar factor.
4449 LeftOperand matrix_;
4450 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix.
// Fetches both operands of the wrapped multiplication, takes an early branch for an
// empty target (zero rows or columns) and a separate branch for a left operand with
// zero columns, and otherwise dispatches to selectAssignKernel with the scalar.
// NOTE(review): the bodies of the early branches and the construction of A and B
// from 'left' and 'right' are on lines not shown here.
4465 template<
typename MT
4467 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4474 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4475 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4477 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4480 else if( left.columns() == 0UL ) {
4495 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the scaled assignment C = scalar * A * B.
// Chooses between selectSmallAssignKernel and selectBlasAssignKernel; a diagonal
// operand is one of the conditions of the visible test.
// NOTE(review): the rest of the condition and the else keyword are on lines not
// shown here, so the exact selection rule cannot be stated from this excerpt.
4510 template<
typename MT3
4514 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4516 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
4518 selectSmallAssignKernel( C, A, B, scalar );
4520 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-vectorized) kernel for C = scalar * A * B with a
// DenseMatrix<MT3,false> target and two non-diagonal operands.
// Iterates rows in the outer loop; the row range [ibegin,iend), the column range
// [jbegin,jend) and the summation range [kbegin,kend) are narrowed at compile time
// according to the (strictly) lower/upper traits of MT4 and MT5.
// NOTE(review): the loops over the ranges outside [ibegin,iend) and [jbegin,jend)
// have their bodies on lines not shown here (presumably they reset those elements -
// confirm), as do the fallback values of the range expressions.
4538 template<
typename MT3
4542 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4543 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4545 const size_t M( A.rows() );
4546 const size_t N( B.columns() );
4547 const size_t K( A.columns() );
4549 const size_t ibegin( ( IsStrictlyLower<MT4>::value )
4550 ?( ( IsStrictlyLower<MT5>::value && M > 1UL ) ? 2UL : 1UL )
4552 const size_t iend( ( IsStrictlyUpper<MT4>::value )
4553 ?( ( IsStrictlyUpper<MT5>::value && M > 1UL ) ? M-2UL : M-1UL )
4557 for(
size_t i=0UL; i<ibegin; ++i ) {
4558 for(
size_t j=0UL; j<N; ++j ) {
4562 for(
size_t i=ibegin; i<iend; ++i )
4564 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4565 ?( ( IsStrictlyUpper<MT4>::value )
4566 ?( IsStrictlyUpper<MT5>::value ? i+2UL : i+1UL )
4567 :( IsStrictlyUpper<MT5>::value ? i+1UL : i ) )
4568 :( IsStrictlyUpper<MT5>::value ? 1UL : 0UL ) );
4569 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4570 ?( ( IsStrictlyLower<MT4>::value )
4571 ?( IsStrictlyLower<MT5>::value ? i-1UL : i )
4572 :( IsStrictlyLower<MT5>::value ? i : i+1UL ) )
4573 :( IsStrictlyLower<MT5>::value ? N-1UL : N ) );
4576 for(
size_t j=0UL; j<jbegin; ++j ) {
4579 for(
size_t j=jbegin; j<jend; ++j )
4581 const size_t kbegin( ( IsUpper<MT4>::value )
4582 ?( ( IsLower<MT5>::value )
4583 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4584 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4585 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4586 :( ( IsLower<MT5>::value )
4587 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4589 const size_t kend( ( IsLower<MT4>::value )
4590 ?( ( IsUpper<MT5>::value )
4591 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4592 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4593 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4594 :( ( IsUpper<MT5>::value )
4595 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Seed the element with the first product term, accumulate the remaining terms,
// then apply the scalar once to the finished sum.
4599 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4600 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4601 (~C)(i,j) += A(i,k) * B(k,j);
4603 (~C)(i,j) *= scalar;
4605 for(
size_t j=jend; j<N; ++j ) {
4609 for(
size_t i=iend; i<M; ++i ) {
4610 for(
size_t j=0UL; j<N; ++j ) {
// Default (scalar, non-vectorized) kernel for C = scalar * A * B with a
// DenseMatrix<MT3,true> target and two non-diagonal operands.
// Iterates columns in the outer loop; the column range [jbegin,jend), the row range
// [ibegin,iend) and the summation range [kbegin,kend) are narrowed at compile time
// according to the (strictly) lower/upper traits of MT4 and MT5.
// NOTE(review): the loops over the ranges outside [jbegin,jend) and [ibegin,iend)
// have their bodies on lines not shown here (presumably they reset those elements -
// confirm), as do the fallback values of the range expressions.
4631 template<
typename MT3
4635 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4636 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4638 const size_t M( A.rows() );
4639 const size_t N( B.columns() );
4640 const size_t K( A.columns() );
4642 const size_t jbegin( ( IsStrictlyUpper<MT5>::value )
4643 ?( ( IsStrictlyUpper<MT4>::value && N > 1UL ) ? 2UL : 1UL )
4645 const size_t jend( ( IsStrictlyLower<MT5>::value )
4646 ?( ( IsStrictlyLower<MT4>::value && N > 1UL ) ? N-2UL : N-1UL )
4650 for(
size_t j=0UL; j<jbegin; ++j ) {
4651 for(
size_t i=0UL; i<M; ++i ) {
4655 for(
size_t j=jbegin; j<jend; ++j )
4657 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4658 ?( ( IsStrictlyLower<MT4>::value )
4659 ?( IsStrictlyLower<MT5>::value ? j+2UL : j+1UL )
4660 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4661 :( IsStrictlyLower<MT4>::value ? 1UL : 0UL ) );
4662 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4663 ?( ( IsStrictlyUpper<MT4>::value )
4664 ?( ( IsStrictlyUpper<MT5>::value )?( j-1UL ):( j ) )
4665 :( ( IsStrictlyUpper<MT5>::value )?( j ):( j+1UL ) ) )
4666 :( IsStrictlyUpper<MT4>::value ? M-1UL : M ) );
4669 for(
size_t i=0UL; i<ibegin; ++i ) {
4672 for(
size_t i=ibegin; i<iend; ++i )
4674 const size_t kbegin( ( IsUpper<MT4>::value )
4675 ?( ( IsLower<MT5>::value )
4676 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4677 , ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4678 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4679 :( ( IsLower<MT5>::value )
4680 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4682 const size_t kend( ( IsLower<MT4>::value )
4683 ?( ( IsUpper<MT5>::value )
4684 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL )
4685 , ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4686 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4687 :( ( IsUpper<MT5>::value )
4688 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Seed the element with the first product term, accumulate the remaining terms,
// then apply the scalar once to the finished sum.
4692 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
4693 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4694 (~C)(i,j) += A(i,k) * B(k,j);
4696 (~C)(i,j) *= scalar;
4698 for(
size_t i=iend; i<M; ++i ) {
4702 for(
size_t j=jend; j<N; ++j ) {
4703 for(
size_t i=0UL; i<M; ++i ) {
// Default kernel for C = scalar * A * B where B is diagonal and A is not, with a
// DenseMatrix<MT3,false> target.
// With a diagonal B each element reduces to A(i,j) * B(j,j) * scalar. Per row the
// column range [jbegin,jend) is narrowed by the lower/upper traits of MT4.
// NOTE(review): the bodies of the two guarded loops outside [jbegin,jend) and the
// fallback values of jbegin/jend are on lines not shown here.
4724 template<
typename MT3
4728 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4729 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4731 const size_t M( A.rows() );
4732 const size_t N( B.columns() );
4734 for(
size_t i=0UL; i<M; ++i )
4736 const size_t jbegin( ( IsUpper<MT4>::value )
4737 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4739 const size_t jend( ( IsLower<MT4>::value )
4740 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4744 if( IsUpper<MT4>::value ) {
4745 for(
size_t j=0UL; j<jbegin; ++j ) {
4749 for(
size_t j=jbegin; j<jend; ++j ) {
4750 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4752 if( IsLower<MT4>::value ) {
4753 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = scalar * A * B where B is diagonal and A is not, with a
// DenseMatrix<MT3,true> target.
// The index space is traversed in tiles of 16 x 16 elements (constant 'block');
// inside a tile each element reduces to A(i,j) * B(j,j) * scalar. Per column the
// row range [ibegin,ipos) is narrowed by the lower/upper traits of MT4 and clamped
// to the current tile.
// NOTE(review): the bodies of the two guarded loops outside [ibegin,ipos) and the
// fallback values of ibegin/ipos are on lines not shown here.
4775 template<
typename MT3
4779 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4780 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4782 const size_t M( A.rows() );
4783 const size_t N( B.columns() );
4785 const size_t block( 16UL );
4787 for(
size_t jj=0UL; jj<N; jj+=block ) {
4788 const size_t jend(
min( N, jj+block ) );
4789 for(
size_t ii=0UL; ii<M; ii+=block ) {
4790 const size_t iend(
min( M, ii+block ) );
4791 for(
size_t j=jj; j<jend; ++j )
4793 const size_t ibegin( ( IsLower<MT4>::value )
4794 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
4796 const size_t ipos( ( IsUpper<MT4>::value )
4797 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
4800 if( IsLower<MT4>::value ) {
4801 for(
size_t i=ii; i<ibegin; ++i ) {
4805 for(
size_t i=ibegin; i<ipos; ++i ) {
4806 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
4808 if( IsUpper<MT4>::value ) {
4809 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for C = scalar * A * B where A is diagonal and B is not, with a
// DenseMatrix<MT3,false> target.
// The index space is traversed in tiles of 16 x 16 elements (constant 'block');
// inside a tile each element reduces to A(i,i) * B(i,j) * scalar. Per row the
// column range [jbegin,jpos) is narrowed by the lower/upper traits of MT5 and
// clamped to the current tile.
// NOTE(review): the bodies of the two guarded loops outside [jbegin,jpos) and the
// fallback values of jbegin/jpos are on lines not shown here.
4833 template<
typename MT3
4837 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4838 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4840 const size_t M( A.rows() );
4841 const size_t N( B.columns() );
4843 const size_t block( 16UL );
4845 for(
size_t ii=0UL; ii<M; ii+=block ) {
4846 const size_t iend(
min( M, ii+block ) );
4847 for(
size_t jj=0UL; jj<N; jj+=block ) {
4848 const size_t jend(
min( N, jj+block ) );
4849 for(
size_t i=ii; i<iend; ++i )
4851 const size_t jbegin( ( IsUpper<MT5>::value )
4852 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
4854 const size_t jpos( ( IsLower<MT5>::value )
4855 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
4858 if( IsUpper<MT5>::value ) {
4859 for(
size_t j=jj; j<jbegin; ++j ) {
4863 for(
size_t j=jbegin; j<jpos; ++j ) {
4864 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4866 if( IsLower<MT5>::value ) {
4867 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for C = scalar * A * B where A is diagonal and B is not, with a
// DenseMatrix<MT3,true> target.
// With a diagonal A each element reduces to A(i,i) * B(i,j) * scalar. Per column
// the row range [ibegin,iend) is narrowed by the lower/upper traits of MT5.
// NOTE(review): the bodies of the two guarded loops outside [ibegin,iend) and the
// fallback values of ibegin/iend are on lines not shown here.
4891 template<
typename MT3
4895 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4896 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4898 const size_t M( A.rows() );
4899 const size_t N( B.columns() );
4901 for(
size_t j=0UL; j<N; ++j )
4903 const size_t ibegin( ( IsLower<MT5>::value )
4904 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4906 const size_t iend( ( IsUpper<MT5>::value )
4907 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4911 if( IsLower<MT5>::value ) {
4912 for(
size_t i=0UL; i<ibegin; ++i ) {
4916 for(
size_t i=ibegin; i<iend; ++i ) {
4917 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
4919 if( IsUpper<MT5>::value ) {
4920 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * A * B where both A and B are diagonal.
// The visible loop writes only the diagonal: C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): whatever precedes the loop (e.g. a reset of C) is on lines not
// shown here.
4942 template<
typename MT3
4946 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4947 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4951 for(
size_t i=0UL; i<A.rows(); ++i ) {
4952 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, fallback overload.
// Guarded by DisableIf on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>; it only
// forwards to selectDefaultAssignKernel.
4971 template<
typename MT3
4975 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4976 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4978 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for C = scalar * A * B with a
// DenseMatrix<MT3,false> target (enabled by UseVectorizedDefaultKernel).
// The result is computed in register tiles: 2 rows x 4 columns, 2 x 2 and 2 x 1 for
// pairs of rows, then 1 x 4, 1 x 2 and 1 x 1 for a remaining single row. Each tile
// accumulates products of packed loads from A and B along k in steps of IT::size,
// reduces every accumulator with sum() and multiplies the result by the scalar.
// kbegin is masked with size_t(-IT::size), which rounds it down to a multiple of
// IT::size when IT::size is a power of two; kbegin/kend are narrowed according to
// the lower/upper traits of MT4 and MT5.
// NOTE(review): the declarations of i and j, the guards around the remainder tiles
// and the closing braces are on lines not shown here.
4997 template<
typename MT3
5001 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5002 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5004 typedef IntrinsicTrait<ElementType> IT;
5006 const size_t M( A.rows() );
5007 const size_t N( B.columns() );
5008 const size_t K( A.columns() );
// Pairs of rows.
5012 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 4 tile.
5016 for( ; (j+4UL) <= N; j+=4UL )
5018 const size_t kbegin( ( IsUpper<MT4>::value )
5019 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5020 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5021 const size_t kend( ( IsLower<MT4>::value )
5022 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5023 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
5025 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5027 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5028 const IntrinsicType a1( A.load(i ,k) );
5029 const IntrinsicType a2( A.load(i+1UL,k) );
5030 const IntrinsicType b1( B.load(k,j ) );
5031 const IntrinsicType b2( B.load(k,j+1UL) );
5032 const IntrinsicType b3( B.load(k,j+2UL) );
5033 const IntrinsicType b4( B.load(k,j+3UL) );
5034 xmm1 = xmm1 + a1 * b1;
5035 xmm2 = xmm2 + a1 * b2;
5036 xmm3 = xmm3 + a1 * b3;
5037 xmm4 = xmm4 + a1 * b4;
5038 xmm5 = xmm5 + a2 * b1;
5039 xmm6 = xmm6 + a2 * b2;
5040 xmm7 = xmm7 + a2 * b3;
5041 xmm8 = xmm8 + a2 * b4;
5044 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5045 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5046 (~C)(i ,j+2UL) =
sum( xmm3 ) * scalar;
5047 (~C)(i ,j+3UL) =
sum( xmm4 ) * scalar;
5048 (~C)(i+1UL,j ) =
sum( xmm5 ) * scalar;
5049 (~C)(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
5050 (~C)(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
5051 (~C)(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// 2 x 2 tile.
5054 for( ; (j+2UL) <= N; j+=2UL )
5056 const size_t kbegin( ( IsUpper<MT4>::value )
5057 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5058 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5059 const size_t kend( ( IsLower<MT4>::value )
5060 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5061 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5063 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5065 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5066 const IntrinsicType a1( A.load(i ,k) );
5067 const IntrinsicType a2( A.load(i+1UL,k) );
5068 const IntrinsicType b1( B.load(k,j ) );
5069 const IntrinsicType b2( B.load(k,j+1UL) );
5070 xmm1 = xmm1 + a1 * b1;
5071 xmm2 = xmm2 + a1 * b2;
5072 xmm3 = xmm3 + a2 * b1;
5073 xmm4 = xmm4 + a2 * b2;
5076 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5077 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5078 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5079 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2 x 1 tile (one remaining column for the current pair of rows).
5084 const size_t kbegin( ( IsUpper<MT4>::value )
5085 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5086 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5087 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5089 IntrinsicType xmm1, xmm2;
5091 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5092 const IntrinsicType b1( B.load(k,j) );
5093 xmm1 = xmm1 + A.load(i ,k) * b1;
5094 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5097 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5098 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single remaining row: 1 x 4 tile.
5106 for( ; (j+4UL) <= N; j+=4UL )
5108 const size_t kbegin( ( IsUpper<MT4>::value )
5109 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5110 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5111 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
5113 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5115 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5116 const IntrinsicType a1( A.load(i,k) );
5117 xmm1 = xmm1 + a1 * B.load(k,j );
5118 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5119 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
5120 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
5123 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5124 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
5125 (~C)(i,j+2UL) =
sum( xmm3 ) * scalar;
5126 (~C)(i,j+3UL) =
sum( xmm4 ) * scalar;
// Single remaining row: 1 x 2 tile.
5129 for( ; (j+2UL) <= N; j+=2UL )
5131 const size_t kbegin( ( IsUpper<MT4>::value )
5132 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5133 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5134 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5136 IntrinsicType xmm1, xmm2;
5138 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5139 const IntrinsicType a1( A.load(i,k) );
5140 xmm1 = xmm1 + a1 * B.load(k,j );
5141 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5144 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5145 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single remaining row: 1 x 1 tile.
5150 const size_t kbegin( ( IsUpper<MT4>::value )
5151 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5152 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5156 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
5157 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5160 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Vectorized small-matrix kernel for C = scalar * A * B with a
// DenseMatrix<MT3,true> target (enabled by UseVectorizedDefaultKernel).
// The result is computed in register tiles: 4 rows x 2 columns and 4 x 1 for groups
// of four rows, 2 x 2 and 2 x 1 for pairs of rows, then 1 x 2 and 1 x 1 for a
// remaining single row. Each tile accumulates products of packed loads from A and B
// along k in steps of IT::size, reduces every accumulator with sum() and multiplies
// the result by the scalar.
// kbegin is masked with size_t(-IT::size), which rounds it down to a multiple of
// IT::size when IT::size is a power of two; kbegin/kend are narrowed according to
// the lower/upper traits of MT4 and MT5.
// NOTE(review): the declarations of i and j, the guards around the remainder tiles
// and the closing braces are on lines not shown here.
5181 template<
typename MT3
5185 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5186 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5188 typedef IntrinsicTrait<ElementType> IT;
5190 const size_t M( A.rows() );
5191 const size_t N( B.columns() );
5192 const size_t K( A.columns() );
// Groups of four rows.
5196 for( ; (i+4UL) <= M; i+=4UL )
// 4 x 2 tile.
5200 for( ; (j+2UL) <= N; j+=2UL )
5202 const size_t kbegin( ( IsUpper<MT4>::value )
5203 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5204 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5205 const size_t kend( ( IsLower<MT4>::value )
5206 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5207 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5209 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5211 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5212 const IntrinsicType a1( A.load(i ,k) );
5213 const IntrinsicType a2( A.load(i+1UL,k) );
5214 const IntrinsicType a3( A.load(i+2UL,k) );
5215 const IntrinsicType a4( A.load(i+3UL,k) );
5216 const IntrinsicType b1( B.load(k,j ) );
5217 const IntrinsicType b2( B.load(k,j+1UL) );
5218 xmm1 = xmm1 + a1 * b1;
5219 xmm2 = xmm2 + a1 * b2;
5220 xmm3 = xmm3 + a2 * b1;
5221 xmm4 = xmm4 + a2 * b2;
5222 xmm5 = xmm5 + a3 * b1;
5223 xmm6 = xmm6 + a3 * b2;
5224 xmm7 = xmm7 + a4 * b1;
5225 xmm8 = xmm8 + a4 * b2;
5228 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5229 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5230 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5231 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5232 (~C)(i+2UL,j ) =
sum( xmm5 ) * scalar;
5233 (~C)(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5234 (~C)(i+3UL,j ) =
sum( xmm7 ) * scalar;
5235 (~C)(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// 4 x 1 tile (one remaining column for the current group of four rows).
5240 const size_t kbegin( ( IsUpper<MT4>::value )
5241 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5242 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5243 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
5245 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5247 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5248 const IntrinsicType b1( B.load(k,j) );
5249 xmm1 = xmm1 + A.load(i ,k) * b1;
5250 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5251 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
5252 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
5255 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5256 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
5257 (~C)(i+2UL,j) =
sum( xmm3 ) * scalar;
5258 (~C)(i+3UL,j) =
sum( xmm4 ) * scalar;
// Pairs of rows.
5262 for( ; (i+2UL) <= M; i+=2UL )
// 2 x 2 tile.
5266 for( ; (j+2UL) <= N; j+=2UL )
5268 const size_t kbegin( ( IsUpper<MT4>::value )
5269 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5270 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5271 const size_t kend( ( IsLower<MT4>::value )
5272 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5273 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
5275 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5277 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5278 const IntrinsicType a1( A.load(i ,k) );
5279 const IntrinsicType a2( A.load(i+1UL,k) );
5280 const IntrinsicType b1( B.load(k,j ) );
5281 const IntrinsicType b2( B.load(k,j+1UL) );
5282 xmm1 = xmm1 + a1 * b1;
5283 xmm2 = xmm2 + a1 * b2;
5284 xmm3 = xmm3 + a2 * b1;
5285 xmm4 = xmm4 + a2 * b2;
5288 (~C)(i ,j ) =
sum( xmm1 ) * scalar;
5289 (~C)(i ,j+1UL) =
sum( xmm2 ) * scalar;
5290 (~C)(i+1UL,j ) =
sum( xmm3 ) * scalar;
5291 (~C)(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// 2 x 1 tile (one remaining column for the current pair of rows).
5296 const size_t kbegin( ( IsUpper<MT4>::value )
5297 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5298 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5299 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
5301 IntrinsicType xmm1, xmm2;
5303 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5304 const IntrinsicType b1( B.load(k,j) );
5305 xmm1 = xmm1 + A.load(i ,k) * b1;
5306 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
5309 (~C)(i ,j) =
sum( xmm1 ) * scalar;
5310 (~C)(i+1UL,j) =
sum( xmm2 ) * scalar;
// Single remaining row: 1 x 2 tile.
5318 for( ; (j+2UL) <= N; j+=2UL )
5320 const size_t kbegin( ( IsUpper<MT4>::value )
5321 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5322 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5323 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
5325 IntrinsicType xmm1, xmm2;
5327 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
5328 const IntrinsicType a1( A.load(i,k) );
5329 xmm1 = xmm1 + a1 * B.load(k,j );
5330 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
5333 (~C)(i,j ) =
sum( xmm1 ) * scalar;
5334 (~C)(i,j+1UL) =
sum( xmm2 ) * scalar;
// Single remaining row: 1 x 1 tile.
5339 const size_t kbegin( ( IsUpper<MT4>::value )
5340 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
5341 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
5345 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
5346 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
5349 (~C)(i,j) =
sum( xmm1 ) * scalar;
// Large-matrix assignment kernel, fallback overload.
// Guarded by DisableIf on UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>; it only
// forwards to selectDefaultAssignKernel.
5369 template<
typename MT3
5373 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5374 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5376 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel for DenseMatrix<MT3,false> targets, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The visible body delegates to selectSmallAssignKernel.
5395 template<
typename MT3
5399 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5400 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5403 selectSmallAssignKernel( ~C, A, B, scalar );
// Large-matrix assignment kernel for DenseMatrix<MT3,true> targets, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The visible body delegates to selectSmallAssignKernel.
5422 template<
typename MT3
5426 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5427 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5430 selectSmallAssignKernel( ~C, A, B, scalar );
// BLAS-level assignment kernel, default overload.
// Enabled when UseDefaultKernel<MT3,MT4,MT5,ST2> holds; it forwards unchanged to
// selectLargeAssignKernel instead of calling a BLAS routine.
5448 template<
typename MT3
5452 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5453 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5455 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel for single precision operands
// (enabled by UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>).
//  - A triangular: strmm is applied to C with A on the left side and 'scalar'.
//  - B triangular: strmm is applied to C with B on the right side and 'scalar'.
//  - otherwise:    sgemm is called on C, A, B with the factors 'scalar' and 0.0F.
// NOTE(review): the statements that initialize C before each strmm call are on
// lines not shown here; the sgemm factors are presumably alpha/beta - confirm.
5474 template<
typename MT3
5478 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
5479 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5481 if( IsTriangular<MT4>::value ) {
5483 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
5485 else if( IsTriangular<MT5>::value ) {
5487 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
5490 sgemm( C, A, B, scalar, 0.0F );
// BLAS-based assignment kernel for double precision operands
// (enabled by UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>).
//  - A triangular: dtrmm is applied to C with A on the left side and 'scalar'.
//  - B triangular: dtrmm is applied to C with B on the right side and 'scalar'.
//  - otherwise:    dgemm is called on C, A, B with the factors 'scalar' and 0.0.
// NOTE(review): the statements that initialize C before each dtrmm call are on
// lines not shown here; the dgemm factors are presumably alpha/beta - confirm.
5511 template<
typename MT3
5515 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
5516 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5518 if( IsTriangular<MT4>::value ) {
5520 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
5522 else if( IsTriangular<MT5>::value ) {
5524 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
5527 dgemm( C, A, B, scalar, 0.0 );
// BLAS-based assignment kernel for single precision complex operands
// (enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
// The scalar is wrapped as complex<float>( scalar, 0.0F ) for every BLAS call.
//  - A triangular: ctrmm is applied to C with A on the left side.
//  - B triangular: ctrmm is applied to C with B on the right side.
//  - otherwise:    cgemm is called on C, A, B with the wrapped scalar and (0,0).
// NOTE(review): the statements that initialize C before each ctrmm call are on
// lines not shown here; the cgemm factors are presumably alpha/beta - confirm.
5548 template<
typename MT3
5552 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5553 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5555 if( IsTriangular<MT4>::value ) {
5557 ctrmm( C, A, CblasLeft,
5558 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5559 complex<float>( scalar, 0.0F ) );
5561 else if( IsTriangular<MT5>::value ) {
5563 ctrmm( C, B, CblasRight,
5564 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5565 complex<float>( scalar, 0.0F ) );
5568 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS-based assignment kernel for double precision complex operands
// (enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
// The scalar is wrapped as complex<double>( scalar, 0.0 ) for every BLAS call.
//  - A triangular: ztrmm is applied to C with A on the left side.
//  - B triangular: ztrmm is applied to C with B on the right side.
//  - otherwise:    zgemm is called on C, A, B with the wrapped scalar and (0,0).
// NOTE(review): the statements that initialize C before each ztrmm call are on
// lines not shown here; the zgemm factors are presumably alpha/beta - confirm.
5589 template<
typename MT3
5593 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5594 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5596 if( IsTriangular<MT4>::value ) {
5598 ztrmm( C, A, CblasLeft,
5599 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5600 complex<double>( scalar, 0.0 ) );
5602 else if( IsTriangular<MT5>::value ) {
5604 ztrmm( C, B, CblasRight,
5605 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5606 complex<double>( scalar, 0.0 ) );
5609 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Assignment of the scaled matrix product to a sparse matrix.
// Selects OppositeType or ResultType via SelectType on SO as the temporary type and
// materializes serial( rhs ) into a const temporary of that type.
// NOTE(review): the statement that transfers 'tmp' into lhs is on a line not shown
// here.
5627 template<
typename MT
5629 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5633 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
5645 const TmpType tmp(
serial( rhs ) );
// Addition assignment of the scaled matrix product to a dense matrix.
// Fetches both operands of the wrapped multiplication, takes an early branch when
// the target is empty (zero rows or columns) or the left operand has zero columns,
// and otherwise dispatches to selectAddAssignKernel with the scalar.
// NOTE(review): the body of the early branch and the construction of A and B from
// 'left' and 'right' are on lines not shown here.
5662 template<
typename MT
5664 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5671 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5672 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
5674 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
5688 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the scaled addition assignment C += scalar * A * B.
// Chooses between selectSmallAddAssignKernel and selectBlasAddAssignKernel; a
// diagonal operand is one of the conditions of the visible test.
// NOTE(review): the rest of the condition and the else keyword are on lines not
// shown here, so the exact selection rule cannot be stated from this excerpt.
5703 template<
typename MT3
5707 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5709 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
5711 selectSmallAddAssignKernel( C, A, B, scalar );
5713 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither A nor B is diagonal.
// NOTE(review): only the evaluation of the temporary is visible in this excerpt; the
// statement that adds tmp to C is not shown.
5731 template<
typename MT3
5735 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5736 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The scaled product is wrapped in serial() and evaluated into a ResultType temporary.
5738 const ResultType tmp(
serial( A * B * scalar ) );
// Default addition assignment kernel for a non-diagonal A and a diagonal B, for a
// target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Since B is diagonal, each element reduces to C(i,j) += A(i,j) * B(j,j) * scalar.
// NOTE(review): the alternative branches of jbegin/jend, the braces and the guard
// in front of the remainder statement are not visible in this excerpt.
5757 template<
typename MT3
5761 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5762 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5764 const size_t M( A.rows() );
5765 const size_t N( B.columns() );
// Traverse C row by row.
5767 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when A is (strictly) upper or lower.
5769 const size_t jbegin( ( IsUpper<MT4>::value )
5770 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5772 const size_t jend( ( IsLower<MT4>::value )
5773 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that can be processed in pairs: masking with
// size_t(-2) clears the lowest bit, i.e. rounds jnum down to an even count.
5777 const size_t jnum( jend - jbegin );
5778 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
5780 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5781 (~C)(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5782 (~C)(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Remaining single column at jpos.
5785 (~C)(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel for a non-diagonal A and a diagonal B, for a
// target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Since B is diagonal, each element reduces to C(i,j) += A(i,j) * B(j,j) * scalar.
// The index space is traversed in tiles of 'block' x 'block' elements.
// NOTE(review): the alternative branches of ibegin/ipos and the braces are not
// visible in this excerpt.
5805 template<
typename MT3
5809 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5810 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5812 const size_t M( A.rows() );
5813 const size_t N( B.columns() );
// Edge length of a tile.
5815 const size_t block( 16UL );
// Outer loops over column tiles [jj,jend) and row tiles [ii,iend).
5817 for(
size_t jj=0UL; jj<N; jj+=block ) {
5818 const size_t jend(
min( N, jj+block ) );
5819 for(
size_t ii=0UL; ii<M; ii+=block ) {
5820 const size_t iend(
min( M, ii+block ) );
// Columns of the current tile.
5821 for(
size_t j=jj; j<jend; ++j )
// Row range within the tile, narrowed when A is (strictly) lower or upper.
5823 const size_t ibegin( ( IsLower<MT4>::value )
5824 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
5826 const size_t ipos( ( IsUpper<MT4>::value )
5827 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
5830 for(
size_t i=ibegin; i<ipos; ++i ) {
5831 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel for a diagonal A and a non-diagonal B, for a
// target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Since A is diagonal, each element reduces to C(i,j) += A(i,i) * B(i,j) * scalar.
// The index space is traversed in tiles of 'block' x 'block' elements.
// NOTE(review): the alternative branches of jbegin/jpos and the braces are not
// visible in this excerpt.
5853 template<
typename MT3
5857 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5858 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5860 const size_t M( A.rows() );
5861 const size_t N( B.columns() );
// Edge length of a tile.
5863 const size_t block( 16UL );
// Outer loops over row tiles [ii,iend) and column tiles [jj,jend).
5865 for(
size_t ii=0UL; ii<M; ii+=block ) {
5866 const size_t iend(
min( M, ii+block ) );
5867 for(
size_t jj=0UL; jj<N; jj+=block ) {
5868 const size_t jend(
min( N, jj+block ) );
// Rows of the current tile.
5869 for(
size_t i=ii; i<iend; ++i )
// Column range within the tile, narrowed when B is (strictly) upper or lower.
5871 const size_t jbegin( ( IsUpper<MT5>::value )
5872 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
5874 const size_t jpos( ( IsLower<MT5>::value )
5875 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
5878 for(
size_t j=jbegin; j<jpos; ++j ) {
5879 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition assignment kernel for a diagonal A and a non-diagonal B, for a
// target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Since A is diagonal, each element reduces to C(i,j) += A(i,i) * B(i,j) * scalar.
// NOTE(review): the alternative branches of ibegin/iend, the braces and the guard
// in front of the remainder statement are not visible in this excerpt.
5901 template<
typename MT3
5905 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5906 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5908 const size_t M( A.rows() );
5909 const size_t N( B.columns() );
// Traverse C column by column.
5911 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when B is (strictly) lower or upper.
5913 const size_t ibegin( ( IsLower<MT5>::value )
5914 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5916 const size_t iend( ( IsUpper<MT5>::value )
5917 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos marks the end of the part that can be processed in pairs: masking with
// size_t(-2) clears the lowest bit, i.e. rounds inum down to an even count.
5921 const size_t inum( iend - ibegin );
5922 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
5924 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5925 (~C)(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5926 (~C)(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Remaining single row at ipos.
5929 (~C)(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition assignment kernel for the case that both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar.
// NOTE(review): the remaining template parameters and braces are not visible here.
5949 template<
typename MT3
5953 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5954 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One update per row of A.
5956 for(
size_t i=0UL; i<A.rows(); ++i ) {
5957 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel for the non-vectorized case: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
// NOTE(review): the remaining template parameters and braces are not visible here.
5976 template<
typename MT3
5980 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5981 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Simply forwards to the default kernel.
5983 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix addition assignment kernel, C += ( A * B ) * scalar, for
// a target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every element of C is a dot product over k that is accumulated IT::size elements
// at a time via A.load()/B.load(), reduced with sum(), scaled and added to C.
// NOTE(review): the declarations of i and j, the braces, the guards in front of the
// single-column remainders and the remaining template parameters are not visible in
// this excerpt. The accumulators are not explicitly initialized in the visible code;
// presumably IntrinsicType default-constructs to zero -- TODO confirm.
6002 template<
typename MT3
6006 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6007 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6009 typedef IntrinsicTrait<ElementType> IT;
// M x N is the extent of the product, K the shared inner dimension.
6011 const size_t M( A.rows() );
6012 const size_t N( B.columns() );
6013 const size_t K( A.columns() );
// Main part: two rows of C per iteration.
6017 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile of C.
6021 for( ; (j+4UL) <= N; j+=4UL )
// k range narrowed by the triangular structure of A and B. The lower bound is
// masked with size_t(-IT::size), which rounds it down to a multiple of IT::size
// (assuming IT::size is a power of two -- TODO confirm).
6023 const size_t kbegin( ( IsUpper<MT4>::value )
6024 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6025 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6026 const size_t kend( ( IsLower<MT4>::value )
6027 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
6028 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// One accumulator per element of the 2x4 tile.
6030 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6032 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6033 const IntrinsicType a1( A.load(i ,k) );
6034 const IntrinsicType a2( A.load(i+1UL,k) );
6035 const IntrinsicType b1( B.load(k,j ) );
6036 const IntrinsicType b2( B.load(k,j+1UL) );
6037 const IntrinsicType b3( B.load(k,j+2UL) );
6038 const IntrinsicType b4( B.load(k,j+3UL) );
6039 xmm1 = xmm1 + a1 * b1;
6040 xmm2 = xmm2 + a1 * b2;
6041 xmm3 = xmm3 + a1 * b3;
6042 xmm4 = xmm4 + a1 * b4;
6043 xmm5 = xmm5 + a2 * b1;
6044 xmm6 = xmm6 + a2 * b2;
6045 xmm7 = xmm7 + a2 * b3;
6046 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator, scale it and add it to the matching element of C.
6049 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6050 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6051 (~C)(i ,j+2UL) +=
sum( xmm3 ) * scalar;
6052 (~C)(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6053 (~C)(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6054 (~C)(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6055 (~C)(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6056 (~C)(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// 2x2 tile of C for the columns left over by the 4-column loop.
6059 for( ; (j+2UL) <= N; j+=2UL )
6061 const size_t kbegin( ( IsUpper<MT4>::value )
6062 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6063 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6064 const size_t kend( ( IsLower<MT4>::value )
6065 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6066 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6068 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6070 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6071 const IntrinsicType a1( A.load(i ,k) );
6072 const IntrinsicType a2( A.load(i+1UL,k) );
6073 const IntrinsicType b1( B.load(k,j ) );
6074 const IntrinsicType b2( B.load(k,j+1UL) );
6075 xmm1 = xmm1 + a1 * b1;
6076 xmm2 = xmm2 + a1 * b2;
6077 xmm3 = xmm3 + a2 * b1;
6078 xmm4 = xmm4 + a2 * b2;
6081 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6082 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6083 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6084 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// Single remaining column: 2x1 tile (its guard is not visible in this excerpt).
6089 const size_t kbegin( ( IsUpper<MT4>::value )
6090 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6091 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6092 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6094 IntrinsicType xmm1, xmm2;
6096 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6097 const IntrinsicType b1( B.load(k,j) );
6098 xmm1 = xmm1 + A.load(i ,k) * b1;
6099 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6102 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6103 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Remaining single row of C: 1x4 tile.
6111 for( ; (j+4UL) <= N; j+=4UL )
6113 const size_t kbegin( ( IsUpper<MT4>::value )
6114 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6115 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6116 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
6118 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6120 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6121 const IntrinsicType a1( A.load(i,k) );
6122 xmm1 = xmm1 + a1 * B.load(k,j );
6123 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6124 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
6125 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
6128 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6129 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
6130 (~C)(i,j+2UL) +=
sum( xmm3 ) * scalar;
6131 (~C)(i,j+3UL) +=
sum( xmm4 ) * scalar;
// Remaining single row of C: 1x2 tile.
6134 for( ; (j+2UL) <= N; j+=2UL )
6136 const size_t kbegin( ( IsUpper<MT4>::value )
6137 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6138 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6139 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6141 IntrinsicType xmm1, xmm2;
6143 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6144 const IntrinsicType a1( A.load(i,k) );
6145 xmm1 = xmm1 + a1 * B.load(k,j );
6146 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6149 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6150 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Last single element (1x1); here the k loop always runs up to K.
6155 const size_t kbegin( ( IsUpper<MT4>::value )
6156 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6157 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6161 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
6162 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6165 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Vectorized small-matrix addition assignment kernel, C += ( A * B ) * scalar, for
// a target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every element of C is a dot product over k that is accumulated IT::size elements
// at a time via A.load()/B.load(), reduced with sum(), scaled and added to C.
// NOTE(review): the declarations of i and j, the braces, the guards in front of the
// single-column remainders and the remaining template parameters are not visible in
// this excerpt. The accumulators are not explicitly initialized in the visible code;
// presumably IntrinsicType default-constructs to zero -- TODO confirm.
6186 template<
typename MT3
6190 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6191 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6193 typedef IntrinsicTrait<ElementType> IT;
// M x N is the extent of the product, K the shared inner dimension.
6195 const size_t M( A.rows() );
6196 const size_t N( B.columns() );
6197 const size_t K( A.columns() );
// Main part: four rows of C per iteration.
6201 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile of C.
6205 for( ; (j+2UL) <= N; j+=2UL )
// k range narrowed by the triangular structure of A and B. The lower bound is
// masked with size_t(-IT::size), which rounds it down to a multiple of IT::size
// (assuming IT::size is a power of two -- TODO confirm).
6207 const size_t kbegin( ( IsUpper<MT4>::value )
6208 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6209 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6210 const size_t kend( ( IsLower<MT4>::value )
6211 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
6212 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// One accumulator per element of the 4x2 tile.
6214 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6216 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6217 const IntrinsicType a1( A.load(i ,k) );
6218 const IntrinsicType a2( A.load(i+1UL,k) );
6219 const IntrinsicType a3( A.load(i+2UL,k) );
6220 const IntrinsicType a4( A.load(i+3UL,k) );
6221 const IntrinsicType b1( B.load(k,j ) );
6222 const IntrinsicType b2( B.load(k,j+1UL) );
6223 xmm1 = xmm1 + a1 * b1;
6224 xmm2 = xmm2 + a1 * b2;
6225 xmm3 = xmm3 + a2 * b1;
6226 xmm4 = xmm4 + a2 * b2;
6227 xmm5 = xmm5 + a3 * b1;
6228 xmm6 = xmm6 + a3 * b2;
6229 xmm7 = xmm7 + a4 * b1;
6230 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator, scale it and add it to the matching element of C.
6233 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6234 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6235 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6236 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6237 (~C)(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6238 (~C)(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6239 (~C)(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6240 (~C)(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Single remaining column: 4x1 tile (its guard is not visible in this excerpt).
6245 const size_t kbegin( ( IsUpper<MT4>::value )
6246 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6247 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6248 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
6250 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6252 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6253 const IntrinsicType b1( B.load(k,j) );
6254 xmm1 = xmm1 + A.load(i ,k) * b1;
6255 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6256 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
6257 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
6260 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6261 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
6262 (~C)(i+2UL,j) +=
sum( xmm3 ) * scalar;
6263 (~C)(i+3UL,j) +=
sum( xmm4 ) * scalar;
// Rows left over by the 4-row loop: two rows of C per iteration.
6267 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile of C.
6271 for( ; (j+2UL) <= N; j+=2UL )
6273 const size_t kbegin( ( IsUpper<MT4>::value )
6274 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6275 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6276 const size_t kend( ( IsLower<MT4>::value )
6277 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6278 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
6280 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6282 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6283 const IntrinsicType a1( A.load(i ,k) );
6284 const IntrinsicType a2( A.load(i+1UL,k) );
6285 const IntrinsicType b1( B.load(k,j ) );
6286 const IntrinsicType b2( B.load(k,j+1UL) );
6287 xmm1 = xmm1 + a1 * b1;
6288 xmm2 = xmm2 + a1 * b2;
6289 xmm3 = xmm3 + a2 * b1;
6290 xmm4 = xmm4 + a2 * b2;
6293 (~C)(i ,j ) +=
sum( xmm1 ) * scalar;
6294 (~C)(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6295 (~C)(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6296 (~C)(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
// Single remaining column: 2x1 tile (its guard is not visible in this excerpt).
6301 const size_t kbegin( ( IsUpper<MT4>::value )
6302 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6303 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6304 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
6306 IntrinsicType xmm1, xmm2;
6308 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6309 const IntrinsicType b1( B.load(k,j) );
6310 xmm1 = xmm1 + A.load(i ,k) * b1;
6311 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
6314 (~C)(i ,j) +=
sum( xmm1 ) * scalar;
6315 (~C)(i+1UL,j) +=
sum( xmm2 ) * scalar;
// Remaining single row of C: 1x2 tile.
6323 for( ; (j+2UL) <= N; j+=2UL )
6325 const size_t kbegin( ( IsUpper<MT4>::value )
6326 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6327 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6328 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
6330 IntrinsicType xmm1, xmm2;
6332 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
6333 const IntrinsicType a1( A.load(i,k) );
6334 xmm1 = xmm1 + a1 * B.load(k,j );
6335 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
6338 (~C)(i,j ) +=
sum( xmm1 ) * scalar;
6339 (~C)(i,j+1UL) +=
sum( xmm2 ) * scalar;
// Last single element (1x1); here the k loop always runs up to K.
6344 const size_t kbegin( ( IsUpper<MT4>::value )
6345 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
6346 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
6350 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
6351 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
6354 (~C)(i,j) +=
sum( xmm1 ) * scalar;
// Large-matrix addition assignment kernel for the non-vectorized case: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
// NOTE(review): the remaining template parameters and braces are not visible here.
6374 template<
typename MT3
6378 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6379 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Simply forwards to the default kernel.
6381 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel for the vectorized case and a target with
// storage order flag 'false' (presumably row-major -- TODO confirm).
// NOTE(review): the remaining template parameters and braces are not visible here.
6400 template<
typename MT3
6404 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6405 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No dedicated large kernel in the visible code: the small kernel is reused.
6408 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Large-matrix addition assignment kernel for the vectorized case and a target with
// storage order flag 'true' (presumably column-major -- TODO confirm).
// NOTE(review): the remaining template parameters and braces are not visible here.
6427 template<
typename MT3
6431 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6432 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No dedicated large kernel in the visible code: the small kernel is reused.
6435 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Fallback of the BLAS addition assignment kernel selection: active when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the remaining template parameters and braces are not visible here.
6453 template<
typename MT3
6457 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6458 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Forwards to the large-matrix kernel selection.
6460 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition assignment kernel for single precision operands.
// Selected through EnableIf when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the declaration of 'tmp', the statements that add tmp to C, the
// braces and the final 'else' are not visible in this excerpt.
6479 template<
typename MT3
6483 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
6484 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is triangular: strmm operates on tmp with A and CblasLeft.
6486 if( IsTriangular<MT4>::value ) {
6488 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// B is triangular: strmm operates on tmp with B and CblasRight.
6491 else if( IsTriangular<MT5>::value ) {
6493 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: sgemm with the scalar and 1.0F as last argument (presumably the BLAS
// alpha and beta factors, i.e. the product is accumulated into C -- TODO confirm).
6497 sgemm( C, A, B, scalar, 1.0F );
// BLAS-based addition assignment kernel for double precision operands.
// Selected through EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the declaration of 'tmp', the statements that add tmp to C, the
// braces and the final 'else' are not visible in this excerpt.
6518 template<
typename MT3
6522 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
6523 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is triangular: dtrmm operates on tmp with A and CblasLeft.
6525 if( IsTriangular<MT4>::value ) {
6527 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// B is triangular: dtrmm operates on tmp with B and CblasRight.
6530 else if( IsTriangular<MT5>::value ) {
6532 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: dgemm with the scalar and 1.0 as last argument (presumably the BLAS
// alpha and beta factors, i.e. the product is accumulated into C -- TODO confirm).
6536 dgemm( C, A, B, scalar, 1.0 );
// BLAS-based addition assignment kernel for single precision complex operands.
// Selected through EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is passed on as complex<float>( scalar, 0.0F ).
// NOTE(review): the declaration of 'tmp', the statements that add tmp to C, the
// braces and the final 'else' are not visible in this excerpt.
6557 template<
typename MT3
6561 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
6562 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is triangular: ctrmm operates on tmp with A and CblasLeft.
6564 if( IsTriangular<MT4>::value ) {
6566 ctrmm( tmp, A, CblasLeft,
6567 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
6568 complex<float>( scalar, 0.0F ) );
// B is triangular: ctrmm operates on tmp with B and CblasRight.
6571 else if( IsTriangular<MT5>::value ) {
6573 ctrmm( tmp, B, CblasRight,
6574 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
6575 complex<float>( scalar, 0.0F ) );
// General case: cgemm with the scalar and (1,0) as last argument (presumably the BLAS
// alpha and beta factors, i.e. the product is accumulated into C -- TODO confirm).
6579 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based addition assignment kernel for double precision complex operands.
// Selected through EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is passed on as complex<double>( scalar, 0.0 ).
// NOTE(review): the declaration of 'tmp', the statements that add tmp to C, the
// braces and the final 'else' are not visible in this excerpt.
6600 template<
typename MT3
6604 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
6605 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is triangular: ztrmm operates on tmp with A and CblasLeft.
6607 if( IsTriangular<MT4>::value ) {
6609 ztrmm( tmp, A, CblasLeft,
6610 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
6611 complex<double>( scalar, 0.0 ) );
// B is triangular: ztrmm operates on tmp with B and CblasRight.
6614 else if( IsTriangular<MT5>::value ) {
6616 ztrmm( tmp, B, CblasRight,
6617 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
6618 complex<double>( scalar, 0.0 ) );
// General case: zgemm with the scalar and (1,0) as last argument (presumably the BLAS
// alpha and beta factors, i.e. the product is accumulated into C -- TODO confirm).
6622 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Subtraction assignment of the scaled matrix product expression (rhs) to a dense
// matrix (lhs). Acts as the entry point that forwards to selectSubAssignKernel().
// NOTE(review): the definitions of A and B (presumably the evaluated operands) and
// the body of the early-exit branch are not visible in this excerpt.
6644 template<
typename MT
6646 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
6653 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6654 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for the degenerate case: empty target or empty inner dimension.
6656 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection together with the scaling factor.
6670 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= ( A * B ) * scalar.
// NOTE(review): the second half of the condition and the 'else' are not visible in
// this excerpt.
6685 template<
typename MT3
6689 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A diagonal operand (or the further condition that is not visible here) selects
// the small kernel ...
6691 if( ( IsDiagonal<MT4>::value || IsDiagonal<MT5>::value ) ||
6693 selectSmallSubAssignKernel( C, A, B, scalar );
// ... every other case goes to the BLAS kernel selection.
6695 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the case that neither A nor B is diagonal.
// NOTE(review): only the evaluation of the temporary is visible in this excerpt; the
// statement that subtracts tmp from C is not shown.
6713 template<
typename MT3
6717 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6718 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The scaled product is wrapped in serial() and evaluated into a ResultType temporary.
6720 const ResultType tmp(
serial( A * B * scalar ) );
// Default subtraction assignment kernel for a non-diagonal A and a diagonal B, for a
// target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Since B is diagonal, each element reduces to C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): the alternative branches of jbegin/jend, the braces and the guard
// in front of the remainder statement are not visible in this excerpt.
6739 template<
typename MT3
6743 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6744 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6746 const size_t M( A.rows() );
6747 const size_t N( B.columns() );
// Traverse C row by row.
6749 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when A is (strictly) upper or lower.
6751 const size_t jbegin( ( IsUpper<MT4>::value )
6752 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6754 const size_t jend( ( IsLower<MT4>::value )
6755 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that can be processed in pairs: masking with
// size_t(-2) clears the lowest bit, i.e. rounds jnum down to an even count.
6759 const size_t jnum( jend - jbegin );
6760 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
6762 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6763 (~C)(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6764 (~C)(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Remaining single column at jpos.
6767 (~C)(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction assignment kernel for a non-diagonal A and a diagonal B, for a
// target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Since B is diagonal, each element reduces to C(i,j) -= A(i,j) * B(j,j) * scalar.
// The index space is traversed in tiles of 'block' x 'block' elements.
// NOTE(review): the alternative branches of ibegin/ipos and the braces are not
// visible in this excerpt.
6787 template<
typename MT3
6791 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6792 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6794 const size_t M( A.rows() );
6795 const size_t N( B.columns() );
// Edge length of a tile.
6797 const size_t block( 16UL );
// Outer loops over column tiles [jj,jend) and row tiles [ii,iend).
6799 for(
size_t jj=0UL; jj<N; jj+=block ) {
6800 const size_t jend(
min( N, jj+block ) );
6801 for(
size_t ii=0UL; ii<M; ii+=block ) {
6802 const size_t iend(
min( M, ii+block ) );
// Columns of the current tile.
6803 for(
size_t j=jj; j<jend; ++j )
// Row range within the tile, narrowed when A is (strictly) lower or upper.
6805 const size_t ibegin( ( IsLower<MT4>::value )
6806 ?(
max( ( IsStrictlyLower<MT4>::value ? j+1UL : j ), ii ) )
6808 const size_t ipos( ( IsUpper<MT4>::value )
6809 ?(
min( ( IsStrictlyUpper<MT4>::value ? j : j+1UL ), iend ) )
6812 for(
size_t i=ibegin; i<ipos; ++i ) {
6813 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a diagonal A and a non-diagonal B, for a
// target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Since A is diagonal, each element reduces to C(i,j) -= A(i,i) * B(i,j) * scalar.
// The index space is traversed in tiles of 'block' x 'block' elements.
// NOTE(review): the alternative branches of jbegin/jpos and the braces are not
// visible in this excerpt.
6836 template<
typename MT3
6840 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6841 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6843 const size_t M( A.rows() );
6844 const size_t N( B.columns() );
// Edge length of a tile.
6846 const size_t block( 16UL );
// Outer loops over row tiles [ii,iend) and column tiles [jj,jend).
6848 for(
size_t ii=0UL; ii<M; ii+=block ) {
6849 const size_t iend(
min( M, ii+block ) );
6850 for(
size_t jj=0UL; jj<N; jj+=block ) {
6851 const size_t jend(
min( N, jj+block ) );
// Rows of the current tile.
6852 for(
size_t i=ii; i<iend; ++i )
// Column range within the tile, narrowed when B is (strictly) upper or lower.
6854 const size_t jbegin( ( IsUpper<MT5>::value )
6855 ?(
max( ( IsStrictlyUpper<MT5>::value ? i+1UL : i ), jj ) )
6857 const size_t jpos( ( IsLower<MT5>::value )
6858 ?(
min( ( IsStrictlyLower<MT5>::value ? i : i+1UL ), jend ) )
6861 for(
size_t j=jbegin; j<jpos; ++j ) {
6862 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction assignment kernel for a diagonal A and a non-diagonal B, for a
// target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Since A is diagonal, each element reduces to C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the alternative branches of ibegin/iend, the braces and the guard
// in front of the remainder statement are not visible in this excerpt.
6885 template<
typename MT3
6889 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6890 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6892 const size_t M( A.rows() );
6893 const size_t N( B.columns() );
// Traverse C column by column.
6895 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when B is (strictly) lower or upper.
6897 const size_t ibegin( ( IsLower<MT5>::value )
6898 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6900 const size_t iend( ( IsUpper<MT5>::value )
6901 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos marks the end of the part that can be processed in pairs: masking with
// size_t(-2) clears the lowest bit, i.e. rounds inum down to an even count.
6905 const size_t inum( iend - ibegin );
6906 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6908 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6909 (~C)(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6910 (~C)(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Remaining single row at ipos.
6913 (~C)(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel for the case that both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar.
// NOTE(review): the remaining template parameters and braces are not visible here.
6933 template<
typename MT3
6937 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6938 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One update per row of A.
6940 for(
size_t i=0UL; i<A.rows(); ++i ) {
6941 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel for the non-vectorized case: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
// NOTE(review): the remaining template parameters and braces are not visible here.
6960 template<
typename MT3
6964 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6965 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Simply forwards to the default kernel.
6967 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix subtraction assignment kernel, C -= ( A * B ) * scalar, for
// a target with storage order flag 'false' (presumably row-major -- TODO confirm).
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every element of C is a dot product over k that is accumulated IT::size elements
// at a time via A.load()/B.load(), reduced with sum(), scaled and subtracted from C.
// NOTE(review): the declarations of i and j, the braces, the guards in front of the
// single-column remainders and the remaining template parameters are not visible in
// this excerpt. The accumulators are not explicitly initialized in the visible code;
// presumably IntrinsicType default-constructs to zero -- TODO confirm.
6986 template<
typename MT3
6990 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6991 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6993 typedef IntrinsicTrait<ElementType> IT;
// M x N is the extent of the product, K the shared inner dimension.
6995 const size_t M( A.rows() );
6996 const size_t N( B.columns() );
6997 const size_t K( A.columns() );
// Main part: two rows of C per iteration.
7001 for( ; (i+2UL) <= M; i+=2UL )
// 2x4 tile of C.
7005 for( ; (j+4UL) <= N; j+=4UL )
// k range narrowed by the triangular structure of A and B. The lower bound is
// masked with size_t(-IT::size), which rounds it down to a multiple of IT::size
// (assuming IT::size is a power of two -- TODO confirm).
7007 const size_t kbegin( ( IsUpper<MT4>::value )
7008 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7009 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7010 const size_t kend( ( IsLower<MT4>::value )
7011 ?( IsUpper<MT5>::value ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
7012 :( IsUpper<MT5>::value ? ( j+4UL ) : K ) );
// One accumulator per element of the 2x4 tile.
7014 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7016 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7017 const IntrinsicType a1( A.load(i ,k) );
7018 const IntrinsicType a2( A.load(i+1UL,k) );
7019 const IntrinsicType b1( B.load(k,j ) );
7020 const IntrinsicType b2( B.load(k,j+1UL) );
7021 const IntrinsicType b3( B.load(k,j+2UL) );
7022 const IntrinsicType b4( B.load(k,j+3UL) );
7023 xmm1 = xmm1 + a1 * b1;
7024 xmm2 = xmm2 + a1 * b2;
7025 xmm3 = xmm3 + a1 * b3;
7026 xmm4 = xmm4 + a1 * b4;
7027 xmm5 = xmm5 + a2 * b1;
7028 xmm6 = xmm6 + a2 * b2;
7029 xmm7 = xmm7 + a2 * b3;
7030 xmm8 = xmm8 + a2 * b4;
// Reduce each accumulator, scale it and subtract it from the matching element of C.
7033 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7034 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7035 (~C)(i ,j+2UL) -=
sum( xmm3 ) * scalar;
7036 (~C)(i ,j+3UL) -=
sum( xmm4 ) * scalar;
7037 (~C)(i+1UL,j ) -=
sum( xmm5 ) * scalar;
7038 (~C)(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
7039 (~C)(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
7040 (~C)(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// 2x2 tile of C for the columns left over by the 4-column loop.
7043 for( ; (j+2UL) <= N; j+=2UL )
7045 const size_t kbegin( ( IsUpper<MT4>::value )
7046 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7047 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7048 const size_t kend( ( IsLower<MT4>::value )
7049 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7050 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7052 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7054 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7055 const IntrinsicType a1( A.load(i ,k) );
7056 const IntrinsicType a2( A.load(i+1UL,k) );
7057 const IntrinsicType b1( B.load(k,j ) );
7058 const IntrinsicType b2( B.load(k,j+1UL) );
7059 xmm1 = xmm1 + a1 * b1;
7060 xmm2 = xmm2 + a1 * b2;
7061 xmm3 = xmm3 + a2 * b1;
7062 xmm4 = xmm4 + a2 * b2;
7065 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7066 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7067 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7068 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// Single remaining column: 2x1 tile (its guard is not visible in this excerpt).
7073 const size_t kbegin( ( IsUpper<MT4>::value )
7074 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7075 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7076 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7078 IntrinsicType xmm1, xmm2;
7080 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7081 const IntrinsicType b1( B.load(k,j) );
7082 xmm1 = xmm1 + A.load(i ,k) * b1;
7083 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7086 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7087 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Remaining single row of C: 1x4 tile.
7095 for( ; (j+4UL) <= N; j+=4UL )
7097 const size_t kbegin( ( IsUpper<MT4>::value )
7098 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7099 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7100 const size_t kend( ( IsUpper<MT5>::value )?( j+4UL ):( K ) );
7102 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7104 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7105 const IntrinsicType a1( A.load(i,k) );
7106 xmm1 = xmm1 + a1 * B.load(k,j );
7107 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7108 xmm3 = xmm3 + a1 * B.load(k,j+2UL);
7109 xmm4 = xmm4 + a1 * B.load(k,j+3UL);
7112 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7113 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
7114 (~C)(i,j+2UL) -=
sum( xmm3 ) * scalar;
7115 (~C)(i,j+3UL) -=
sum( xmm4 ) * scalar;
// Remaining single row of C: 1x2 tile.
7118 for( ; (j+2UL) <= N; j+=2UL )
7120 const size_t kbegin( ( IsUpper<MT4>::value )
7121 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7122 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7123 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7125 IntrinsicType xmm1, xmm2;
7127 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7128 const IntrinsicType a1( A.load(i,k) );
7129 xmm1 = xmm1 + a1 * B.load(k,j );
7130 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7133 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7134 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Last single element (1x1); here the k loop always runs up to K.
7139 const size_t kbegin( ( IsUpper<MT4>::value )
7140 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7141 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7145 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
7146 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7149 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Vectorized small-matrix subtraction assignment kernel, C -= ( A * B ) * scalar, for
// a target with storage order flag 'true' (presumably column-major -- TODO confirm).
// Active when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Every element of C is a dot product over k that is accumulated IT::size elements
// at a time via A.load()/B.load(), reduced with sum(), scaled and subtracted from C.
// NOTE(review): the declarations of i and j, the braces, the guards in front of the
// single-column remainders and the remaining template parameters are not visible in
// this excerpt. The accumulators are not explicitly initialized in the visible code;
// presumably IntrinsicType default-constructs to zero -- TODO confirm.
7170 template<
typename MT3
7174 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7175 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7177 typedef IntrinsicTrait<ElementType> IT;
// M x N is the extent of the product, K the shared inner dimension.
7179 const size_t M( A.rows() );
7180 const size_t N( B.columns() );
7181 const size_t K( A.columns() );
// Main part: four rows of C per iteration.
7185 for( ; (i+4UL) <= M; i+=4UL )
// 4x2 tile of C.
7189 for( ; (j+2UL) <= N; j+=2UL )
// k range narrowed by the triangular structure of A and B. The lower bound is
// masked with size_t(-IT::size), which rounds it down to a multiple of IT::size
// (assuming IT::size is a power of two -- TODO confirm).
7191 const size_t kbegin( ( IsUpper<MT4>::value )
7192 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7193 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7194 const size_t kend( ( IsLower<MT4>::value )
7195 ?( IsUpper<MT5>::value ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
7196 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
// One accumulator per element of the 4x2 tile.
7198 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7200 for(
size_t k=kbegin; k<kend; k+=
IT::size )
7202 const IntrinsicType a1( A.load(i ,k) );
7203 const IntrinsicType a2( A.load(i+1UL,k) );
7204 const IntrinsicType a3( A.load(i+2UL,k) );
7205 const IntrinsicType a4( A.load(i+3UL,k) );
7206 const IntrinsicType b1( B.load(k,j ) );
7207 const IntrinsicType b2( B.load(k,j+1UL) );
7208 xmm1 = xmm1 + a1 * b1;
7209 xmm2 = xmm2 + a1 * b2;
7210 xmm3 = xmm3 + a2 * b1;
7211 xmm4 = xmm4 + a2 * b2;
7212 xmm5 = xmm5 + a3 * b1;
7213 xmm6 = xmm6 + a3 * b2;
7214 xmm7 = xmm7 + a4 * b1;
7215 xmm8 = xmm8 + a4 * b2;
// Reduce each accumulator, scale it and subtract it from the matching element of C.
7218 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7219 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7220 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7221 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7222 (~C)(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7223 (~C)(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7224 (~C)(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7225 (~C)(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// Single remaining column: 4x1 tile (its guard is not visible in this excerpt).
7230 const size_t kbegin( ( IsUpper<MT4>::value )
7231 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7232 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7233 const size_t kend( ( IsLower<MT4>::value )?( i+4UL ):( K ) );
7235 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7237 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7238 const IntrinsicType b1( B.load(k,j) );
7239 xmm1 = xmm1 + A.load(i ,k) * b1;
7240 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7241 xmm3 = xmm3 + A.load(i+2UL,k) * b1;
7242 xmm4 = xmm4 + A.load(i+3UL,k) * b1;
7245 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7246 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
7247 (~C)(i+2UL,j) -=
sum( xmm3 ) * scalar;
7248 (~C)(i+3UL,j) -=
sum( xmm4 ) * scalar;
// Rows left over by the 4-row loop: two rows of C per iteration.
7252 for( ; (i+2UL) <= M; i+=2UL )
// 2x2 tile of C.
7256 for( ; (j+2UL) <= N; j+=2UL )
7258 const size_t kbegin( ( IsUpper<MT4>::value )
7259 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7260 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7261 const size_t kend( ( IsLower<MT4>::value )
7262 ?( IsUpper<MT5>::value ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7263 :( IsUpper<MT5>::value ? ( j+2UL ) : K ) );
7265 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7267 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7268 const IntrinsicType a1( A.load(i ,k) );
7269 const IntrinsicType a2( A.load(i+1UL,k) );
7270 const IntrinsicType b1( B.load(k,j ) );
7271 const IntrinsicType b2( B.load(k,j+1UL) );
7272 xmm1 = xmm1 + a1 * b1;
7273 xmm2 = xmm2 + a1 * b2;
7274 xmm3 = xmm3 + a2 * b1;
7275 xmm4 = xmm4 + a2 * b2;
7278 (~C)(i ,j ) -=
sum( xmm1 ) * scalar;
7279 (~C)(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7280 (~C)(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7281 (~C)(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
// Single remaining column: 2x1 tile (its guard is not visible in this excerpt).
7286 const size_t kbegin( ( IsUpper<MT4>::value )
7287 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7288 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7289 const size_t kend( ( IsLower<MT4>::value )?( i+2UL ):( K ) );
7291 IntrinsicType xmm1, xmm2;
7293 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7294 const IntrinsicType b1( B.load(k,j) );
7295 xmm1 = xmm1 + A.load(i ,k) * b1;
7296 xmm2 = xmm2 + A.load(i+1UL,k) * b1;
7299 (~C)(i ,j) -=
sum( xmm1 ) * scalar;
7300 (~C)(i+1UL,j) -=
sum( xmm2 ) * scalar;
// Remaining single row of C: 1x2 tile.
7308 for( ; (j+2UL) <= N; j+=2UL )
7310 const size_t kbegin( ( IsUpper<MT4>::value )
7311 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7312 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7313 const size_t kend( ( IsUpper<MT5>::value )?( j+2UL ):( K ) );
7315 IntrinsicType xmm1, xmm2;
7317 for(
size_t k=kbegin; k<kend; k+=
IT::size ) {
7318 const IntrinsicType a1( A.load(i,k) );
7319 xmm1 = xmm1 + a1 * B.load(k,j );
7320 xmm2 = xmm2 + a1 * B.load(k,j+1UL);
7323 (~C)(i,j ) -=
sum( xmm1 ) * scalar;
7324 (~C)(i,j+1UL) -=
sum( xmm2 ) * scalar;
// Last single element (1x1); here the k loop always runs up to K.
7329 const size_t kbegin( ( IsUpper<MT4>::value )
7330 ?( ( IsLower<MT5>::value ?
max( i, j ) : i ) &
size_t(-
IT::size) )
7331 :( IsLower<MT5>::value ? ( j &
size_t(-
IT::size) ) : 0UL ) );
7335 for(
size_t k=kbegin; k<K; k+=
IT::size ) {
7336 xmm1 = xmm1 + A.load(i,k) * B.load(k,j);
7339 (~C)(i,j) -=
sum( xmm1 ) * scalar;
// Large-matrix subtraction assignment kernel for the non-vectorized case: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
// NOTE(review): the remaining template parameters and braces are not visible here.
7359 template<
typename MT3
7363 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7364 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Simply forwards to the default kernel.
7366 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-kernel subtraction assignment, vectorized overload for a
// DenseMatrix<MT3,false> target. Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
// The visible call unwraps the target with ~C and hands the work to
// selectSmallSubAssignKernel(), i.e. no dedicated large kernel is visible here.
// NOTE(review): lines between the signature and the call are elided in this
// listing; confirm nothing else runs before the forward.
7385 template<
typename MT3
7389 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7390 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7393 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Large-kernel subtraction assignment, vectorized overload for a
// DenseMatrix<MT3,true> target. Enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
// As in the DenseMatrix<MT3,false> overload, the visible call unwraps the
// target with ~C and forwards to selectSmallSubAssignKernel().
// NOTE(review): lines between the signature and the call are elided in this
// listing; confirm nothing else runs before the forward.
7412 template<
typename MT3
7416 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7417 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7420 selectSmallSubAssignKernel( ~C, A, B, scalar );
// BLAS-kernel dispatch, default overload: enabled when
// UseDefaultKernel<MT3,MT4,MT5,ST2> holds (EnableIf). No BLAS routine is called
// here; the visible call forwards C, A, B and scalar to
// selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters and the braces are not
// visible in this listing.
7438 template<
typename MT3
7442 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7443 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7445 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based subtraction assignment for single precision operands; enabled when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
//  - A triangular (IsTriangular<MT4>): strmm() on tmp with A on the left side,
//    lower/upper chosen via IsLower<MT4>, scaled by scalar.
//  - else B triangular (IsTriangular<MT5>): strmm() on tmp with B on the right
//    side, lower/upper chosen via IsLower<MT5>, scaled by scalar.
//  - sgemm() is called with -scalar and 1.0F as trailing scalar arguments
//    (presumably alpha/beta of the general fallback branch -- confirm against
//    the BLAS wrapper).
// NOTE(review): the declaration of tmp, the statements that apply tmp to C, and
// the else/brace lines are elided in this listing.
7464 template<
typename MT3
7468 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7469 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7471 if( IsTriangular<MT4>::value ) {
7473 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
7476 else if( IsTriangular<MT5>::value ) {
7478 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
7482 sgemm( C, A, B, -scalar, 1.0F );
// BLAS-based subtraction assignment for double precision operands; enabled when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds (EnableIf).
//  - A triangular (IsTriangular<MT4>): dtrmm() on tmp with A on the left side,
//    lower/upper chosen via IsLower<MT4>, scaled by scalar.
//  - else B triangular (IsTriangular<MT5>): dtrmm() on tmp with B on the right
//    side, lower/upper chosen via IsLower<MT5>, scaled by scalar.
//  - dgemm() is called with -scalar and 1.0 as trailing scalar arguments
//    (presumably alpha/beta of the general fallback branch -- confirm against
//    the BLAS wrapper).
// NOTE(review): the declaration of tmp, the statements that apply tmp to C, and
// the else/brace lines are elided in this listing.
7503 template<
typename MT3
7507 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7508 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7510 if( IsTriangular<MT4>::value ) {
7512 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
7515 else if( IsTriangular<MT5>::value ) {
7517 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
7521 dgemm( C, A, B, -scalar, 1.0 );
// BLAS-based subtraction assignment for single precision complex operands;
// enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds (EnableIf).
// The real-valued scalar is widened to complex<float>( scalar, 0.0F ) before it
// is handed to the complex BLAS routines.
//  - A triangular (IsTriangular<MT4>): ctrmm() on tmp with A on the left side.
//  - else B triangular (IsTriangular<MT5>): ctrmm() on tmp with B on the right side.
//  - cgemm() is called with complex<float>( -scalar, 0.0F ) and
//    complex<float>( 1.0F, 0.0F ) (presumably alpha/beta of the general
//    fallback branch -- confirm against the BLAS wrapper).
// NOTE(review): the declaration of tmp, the statements that apply tmp to C, and
// the else/brace lines are elided in this listing.
7542 template<
typename MT3
7546 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7547 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7549 if( IsTriangular<MT4>::value ) {
7551 ctrmm( tmp, A, CblasLeft,
7552 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7553 complex<float>( scalar, 0.0F ) );
7556 else if( IsTriangular<MT5>::value ) {
7558 ctrmm( tmp, B, CblasRight,
7559 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7560 complex<float>( scalar, 0.0F ) );
7564 cgemm( C, A, B, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based subtraction assignment for double precision complex operands;
// enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds (EnableIf).
// The real-valued scalar is widened to complex<double>( scalar, 0.0 ) before it
// is handed to the complex BLAS routines. It must be complex<double>, not
// complex<float>: building a complex<float> first would round the scaling
// factor to single precision before ztrmm() sees it, and would disagree with
// the zgemm() call below, which already uses complex<double>.
//  - A triangular (IsTriangular<MT4>): ztrmm() on tmp with A on the left side.
//  - else B triangular (IsTriangular<MT5>): ztrmm() on tmp with B on the right side.
//  - zgemm() is called with complex<double>( -scalar, 0.0 ) and
//    complex<double>( 1.0, 0.0 ) (presumably alpha/beta of the general
//    fallback branch -- confirm against the BLAS wrapper).
// NOTE(review): the declaration of tmp, the statements that apply tmp to C, and
// the else/brace lines are elided in this listing.
7585 template<
typename MT3
7589 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7590 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7592 if( IsTriangular<MT4>::value ) {
7594 ztrmm( tmp, A, CblasLeft,
7595 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7596 complex<double>( scalar, 0.0 ) );
7599 else if( IsTriangular<MT5>::value ) {
7601 ztrmm( tmp, B, CblasRight,
7602 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7603 complex<double>( scalar, 0.0 ) );
7607 zgemm( C, A, B, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
7640 template<
typename MT
7642 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7643 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7650 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7651 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7653 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7656 else if( left.columns() == 0UL ) {
7690 template<
typename MT
7692 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7693 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7697 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
7709 const TmpType tmp( rhs );
7729 template<
typename MT
7731 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7732 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7739 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7740 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7742 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7779 template<
typename MT
7781 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7782 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7789 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7790 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7792 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7874 template<
typename T1
7876 inline const DMatTDMatMultExpr<T1,T2>
7882 throw std::invalid_argument(
"Matrix sizes do not match" );
7899 template<
typename MT1,
typename MT2 >
7917 template<
typename MT1,
typename MT2 >
7919 :
public Columns<MT2>
7935 template<
typename MT1,
typename MT2 >
7937 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7953 template<
typename MT1,
typename MT2 >
7955 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7971 template<
typename MT1,
typename MT2 >
7973 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7974 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7990 template<
typename MT1,
typename MT2 >
7992 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
8008 template<
typename MT1,
typename MT2 >
8010 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
8026 template<
typename MT1,
typename MT2 >
8028 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8029 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
8045 template<
typename MT1,
typename MT2,
typename VT >
8050 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8051 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8052 IsDenseVector<VT>::value && IsColumnVector<VT>::value
8053 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8054 , INVALID_TYPE >::Type Type;
8063 template<
typename MT1,
typename MT2,
typename VT >
8068 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8069 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8070 IsSparseVector<VT>::value && IsColumnVector<VT>::value
8071 ,
typename DMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8072 , INVALID_TYPE >::Type Type;
8081 template<
typename VT,
typename MT1,
typename MT2 >
8086 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8087 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8088 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8089 ,
typename TDVecTDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8090 , INVALID_TYPE >::Type Type;
8099 template<
typename VT,
typename MT1,
typename MT2 >
8104 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8105 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8106 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8107 ,
typename TDVecTDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8108 , INVALID_TYPE >::Type Type;
8117 template<
typename MT1,
typename MT2,
bool AF >
8122 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8123 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8132 template<
typename MT1,
typename MT2 >
8137 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8146 template<
typename MT1,
typename MT2 >
8151 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:470
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:289
BLAZE_ALWAYS_INLINE int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1649
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for mathematical functions.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:297
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:8247
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:291
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:343
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:460
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
DMatTDMatMultExpr< MT1, MT2 > This
Type of this DMatTDMatMultExpr instance.
Definition: DMatTDMatMultExpr.h:287
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:259
Header file for the TDVecSMatMultExprTrait class template.
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:140
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:142
Constraint on the data type.
Header file for the MultExprTrait class template.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:328
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:406
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:294
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
const size_t SMP_DMATTDMATMULT_THRESHOLD
SMP row-major dense matrix/column-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:857
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:450
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1602
Header file for the DenseMatrix base class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:139
Header file for the Not class template.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
Header file for BLAS level 3 functions.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:143
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatTDMatMultExpr.h:292
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:396
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:150
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:288
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
Header file for the HasMutableDataAccess type trait.
const size_t DMATTDMATMULT_THRESHOLD
Row-major dense matrix/column-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:142
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:438
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:141
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:416
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:293
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:479
Header file for the complex data type.
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:132
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:138
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:480
Header file for the IsUpper type trait.
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:426
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:290
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:303
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:306
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:300
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:849