35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
130 template<
typename MT1
132 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
133 ,
private MatMatMultExpr
134 ,
private Computation
// Compile-time helper trait. The nested `value` is non-zero when T1 is a
// row-major matrix and at least one of T2 / T3 is symmetric.
// NOTE(review): the closing brace of the struct is not visible in this excerpt.
164 template<
typename T1,
typename T2,
typename T3 >
165 struct CanExploitSymmetry {
166 enum { value = IsRowMajorMatrix<T1>::value &&
167 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time helper trait. The nested `value` is non-zero when at least one
// of the class-level flags evaluateLeft / evaluateRight is set AND
// CanExploitSymmetry<T1,T2,T3>::value is non-zero.
// NOTE(review): the overloads that consume this trait are outside this excerpt;
// confirm that the symmetry test is intentionally un-negated here.
178 template<
typename T1,
typename T2,
typename T3 >
179 struct IsEvaluationRequired {
180 enum { value = ( evaluateLeft || evaluateRight ) &&
181 CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time helper trait used to enable the single-precision BLAS kernel
// overload (see the EnableIf on the strmm/sgemm overload of selectBlasAssignKernel).
// The visible part of the condition requires:
//   - mutable data access on T1 and const data access on T2 and T3,
//   - neither T2 nor T3 is diagonal,
//   - all three types are vectorizable,
//   - the element type of all three types satisfies IsFloat.
// NOTE(review): the line opening the `enum { value = ...` expression is missing
// from this excerpt, so any leading term of the conjunction is not shown.
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseSinglePrecisionKernel {
194 HasMutableDataAccess<T1>::value &&
195 HasConstDataAccess<T2>::value &&
196 HasConstDataAccess<T3>::value &&
197 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
198 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
199 IsFloat<typename T1::ElementType>::value &&
200 IsFloat<typename T2::ElementType>::value &&
201 IsFloat<typename T3::ElementType>::value };
// Compile-time helper trait used to enable the double-precision BLAS kernel
// overload (see the EnableIf on the dtrmm/dgemm overload of selectBlasAssignKernel).
// The visible part of the condition requires:
//   - mutable data access on T1 and const data access on T2 and T3,
//   - neither T2 nor T3 is diagonal,
//   - all three types are vectorizable,
//   - the element type of all three types satisfies IsDouble.
// NOTE(review): the line opening the `enum { value = ...` expression is missing
// from this excerpt, so any leading term of the conjunction is not shown.
211 template<
typename T1,
typename T2,
typename T3 >
212 struct UseDoublePrecisionKernel {
214 HasMutableDataAccess<T1>::value &&
215 HasConstDataAccess<T2>::value &&
216 HasConstDataAccess<T3>::value &&
217 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
218 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
219 IsDouble<typename T1::ElementType>::value &&
220 IsDouble<typename T2::ElementType>::value &&
221 IsDouble<typename T3::ElementType>::value };
// Compile-time helper trait used to enable the single-precision complex BLAS
// kernel overload (see the EnableIf on the ctrmm/cgemm overload of
// selectBlasAssignKernel). The visible part of the condition requires:
//   - mutable data access on T1 and const data access on T2 and T3,
//   - neither T2 nor T3 is diagonal,
//   - all three types are vectorizable,
//   - the element type of all three types is exactly complex<float>.
// NOTE(review): the line opening the `enum { value = ...` expression is missing
// from this excerpt, so any leading term of the conjunction is not shown.
232 template<
typename T1,
typename T2,
typename T3 >
233 struct UseSinglePrecisionComplexKernel {
// Element type that all three matrix types must share.
234 typedef complex<float> Type;
236 HasMutableDataAccess<T1>::value &&
237 HasConstDataAccess<T2>::value &&
238 HasConstDataAccess<T3>::value &&
239 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
240 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
241 IsSame<typename T1::ElementType,Type>::value &&
242 IsSame<typename T2::ElementType,Type>::value &&
243 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper trait used to enable the double-precision complex BLAS
// kernel overload (see the EnableIf on the ztrmm/zgemm overload of
// selectBlasAssignKernel). The visible part of the condition requires:
//   - mutable data access on T1 and const data access on T2 and T3,
//   - neither T2 nor T3 is diagonal,
//   - all three types are vectorizable,
//   - the element type of all three types is exactly complex<double>.
// NOTE(review): the line opening the `enum { value = ...` expression is missing
// from this excerpt, so any leading term of the conjunction is not shown.
254 template<
typename T1,
typename T2,
typename T3 >
255 struct UseDoublePrecisionComplexKernel {
// Element type that all three matrix types must share.
256 typedef complex<double> Type;
258 HasMutableDataAccess<T1>::value &&
259 HasConstDataAccess<T2>::value &&
260 HasConstDataAccess<T3>::value &&
261 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
262 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
263 IsSame<typename T1::ElementType,Type>::value &&
264 IsSame<typename T2::ElementType,Type>::value &&
265 IsSame<typename T3::ElementType,Type>::value };
// Compile-time helper trait. The nested `value` is non-zero when BLAZE_BLAS_MODE
// is zero, or when none of the four BLAS kernel traits (single, double,
// single complex, double complex) applies to the given type combination.
// It enables the selectBlasAssignKernel overload that forwards to
// selectLargeAssignKernel.
275 template<
typename T1,
typename T2,
typename T3 >
276 struct UseDefaultKernel {
277 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
278 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
279 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
280 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time helper trait that selects the vectorized small/large kernels
// (used in their EnableIf / DisableIf conditions). `value` is non-zero when:
//   - T2 is not diagonal,
//   - all three types are vectorizable,
//   - T1, T2 and T3 share the same element type,
//   - IntrinsicTrait of that element type reports addition, subtraction and
//     multiplication as available.
290 template<
typename T1,
typename T2,
typename T3 >
291 struct UseVectorizedDefaultKernel {
292 enum { value = !IsDiagonal<T2>::value &&
293 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
294 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
295 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
296 IntrinsicTrait<typename T1::ElementType>::addition &&
297 IntrinsicTrait<typename T1::ElementType>::subtraction &&
298 IntrinsicTrait<typename T1::ElementType>::multiplication };
330 MT1::vectorizable && MT2::vectorizable &&
336 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
337 !evaluateRight && MT2::smpAssignable };
380 :(
lhs_.columns() ) ) );
382 if(
lhs_.columns() == 0UL ||
392 const size_t knum( kend - kbegin );
393 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
395 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
397 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
399 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
425 return rhs_.columns();
455 template<
typename T >
457 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
467 template<
typename T >
469 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
479 return lhs_.isAligned() &&
rhs_.isAligned();
514 template<
typename MT
524 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
527 else if( rhs.lhs_.columns() == 0UL ) {
532 LT A(
serial( rhs.lhs_ ) );
533 RT B(
serial( rhs.rhs_ ) );
542 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the assignment C = A * B.
// Forwards to either selectSmallAssignKernel or selectBlasAssignKernel.
// NOTE(review): the condition choosing between the two calls, and the template
// parameters MT4 / MT5, are on lines missing from this excerpt.
558 template<
typename MT3
561 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
565 selectSmallAssignKernel( C, A, B );
567 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel C = A * B, enabled when neither A (MT4)
// nor B (MT5) is diagonal. Iterates column by column over C.
// NOTE(review): several lines (else-branches of the range expressions, the
// bodies of the reset loops, closing braces) are missing from this excerpt.
586 template<
typename MT3
589 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
590 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N product dimensions.
592 const size_t M( A.rows() );
593 const size_t N( B.columns() );
594 const size_t K( A.columns() );
596 for(
size_t j=0UL; j<N; ++j )
// Range of k in column j of B, narrowed when B is (strictly) lower/upper.
598 const size_t kbegin( ( IsLower<MT5>::value )
599 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
601 const size_t kend( ( IsUpper<MT5>::value )
602 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Empty k-range for a strictly triangular B: the whole column is handled in
// the loop over all rows below (loop body not visible here).
606 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
607 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first k (k == kbegin), narrowed when A is (strictly)
// lower/upper.
614 const size_t ibegin( ( IsLower<MT4>::value )
615 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
617 const size_t iend( ( IsUpper<MT4>::value )
618 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows above ibegin (handling not visible in this excerpt).
622 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
623 for(
size_t i=0UL; i<ibegin; ++i ) {
627 else if( IsStrictlyLower<MT4>::value ) {
// First contribution overwrites C(i,j) (plain assignment, not +=).
630 for(
size_t i=ibegin; i<iend; ++i ) {
631 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend downwards (handling not visible in this excerpt).
633 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
634 for(
size_t i=iend; i<M; ++i ) {
638 else if( IsStrictlyUpper<MT4>::value ) {
// Remaining k values accumulate onto the already initialized column.
643 for(
size_t k=kbegin+1UL; k<kend; ++k )
645 const size_t ibegin( ( IsLower<MT4>::value )
646 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
648 const size_t iend( ( IsUpper<MT4>::value )
649 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
653 for(
size_t i=ibegin; i<iend; ++i ) {
654 C(i,j) += A(i,k) * B(k,j);
// For upper A the element at row iend is assigned (not accumulated) here.
656 if( IsUpper<MT4>::value ) {
657 C(iend,j) = A(iend,k) * B(k,j);
// Default assignment kernel C = A * B for a non-diagonal A (MT4) and a
// diagonal B (MT5): each element is the single product A(i,j) * B(j,j).
// NOTE(review): else-branches of the range expressions and the bodies of the
// first and last row loops are missing from this excerpt.
679 template<
typename MT3
682 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
683 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
687 const size_t M( A.rows() );
688 const size_t N( B.columns() );
690 for(
size_t j=0UL; j<N; ++j )
// Row range in column j of A, narrowed when A is (strictly) lower/upper.
692 const size_t ibegin( ( IsLower<MT4>::value )
693 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
695 const size_t iend( ( IsUpper<MT4>::value )
696 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows before ibegin for a lower A (loop body not visible here).
700 if( IsLower<MT4>::value ) {
701 for(
size_t i=0UL; i<ibegin; ++i ) {
705 for(
size_t i=ibegin; i<iend; ++i ) {
706 C(i,j) = A(i,j) * B(j,j);
// Rows from iend on for an upper A (loop body not visible here).
708 if( IsUpper<MT4>::value ) {
709 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B for a diagonal A (MT4) and a
// non-diagonal B (MT5): each element is the single product A(i,i) * B(i,j).
// NOTE(review): else-branches of the range expressions and the bodies of the
// first and last row loops are missing from this excerpt. The row range is
// derived from MT5 while the two guards below test MT4 — confirm intended.
732 template<
typename MT3
735 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
736 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
740 const size_t M( A.rows() );
741 const size_t N( B.columns() );
743 for(
size_t j=0UL; j<N; ++j )
// Row range in column j of B, narrowed when B is (strictly) lower/upper.
745 const size_t ibegin( ( IsLower<MT5>::value )
746 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
748 const size_t iend( ( IsUpper<MT5>::value )
749 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
753 if( IsLower<MT4>::value ) {
754 for(
size_t i=0UL; i<ibegin; ++i ) {
758 for(
size_t i=ibegin; i<iend; ++i ) {
759 C(i,j) = A(i,i) * B(i,j);
761 if( IsUpper<MT4>::value ) {
762 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B for two diagonal operands: only the
// diagonal of C is written, with C(i,i) = A(i,i) * B(i,i).
// NOTE(review): statements preceding the loop (original lines 791-794) are not
// visible in this excerpt.
785 template<
typename MT3
788 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
789 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
795 for(
size_t i=0UL; i<A.rows(); ++i ) {
796 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, non-vectorized case: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAssignKernel.
816 template<
typename MT3
819 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
820 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
822 selectDefaultAssignKernel( C, A, B );
// Small-matrix assignment kernel, vectorized case with a row-major target
// (DenseMatrix<MT3,false>). The visible branch conditions choose a strategy
// based on which operand type is resizable and, otherwise, on whether B has
// no more elements than A.
// NOTE(review): the body of every branch is missing from this excerpt.
842 template<
typename MT3
845 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
846 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
853 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
857 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
861 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
// Small-matrix assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>). Computes C = A * B by accumulating products of
// packed column segments of A (A.load) with broadcast elements of B (set) and
// storing the accumulators into C.
// The visible fragments form a cascade that processes IT::size*8, IT::size*4,
// IT::size*2 and finally IT::size rows of C per step.
// NOTE(review): the outer loops over the row index i, several store calls and
// all closing braces are missing from this excerpt.
888 template<
typename MT3
891 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
892 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
894 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
896 const size_t M( A.rows() );
897 const size_t N( B.columns() );
898 const size_t K( A.columns() );
// --- Stage: IT::size*8 rows, one column of C per iteration ---
903 for(
size_t j=0UL; j<N; ++j )
// k-range restricted by the lower/upper structure of A (MT4) and B (MT5).
905 const size_t kbegin( ( IsLower<MT5>::value )
906 ?( ( IsUpper<MT4>::value )
907 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
908 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
909 :( IsUpper<MT4>::value ? i : 0UL ) );
910 const size_t kend( ( IsUpper<MT5>::value )
911 ?( ( IsLower<MT4>::value )
912 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
913 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
914 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Eight accumulators, one per IT::size-row segment of the column.
916 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
918 for(
size_t k=kbegin; k<kend; ++k ) {
919 const IntrinsicType b1(
set( B(k,j) ) );
920 xmm1 = xmm1 + A.load(i ,k) * b1;
921 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
922 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
923 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
924 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
925 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
926 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
927 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
930 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size*4 rows, two columns of C per iteration ---
945 for( ; (j+2UL) <= N; j+=2UL )
947 const size_t kbegin( ( IsLower<MT5>::value )
948 ?( ( IsUpper<MT4>::value )
949 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
950 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
951 :( IsUpper<MT4>::value ? i : 0UL ) );
952 const size_t kend( ( IsUpper<MT5>::value )
953 ?( ( IsLower<MT4>::value )
954 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
955 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
956 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
958 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
960 for(
size_t k=kbegin; k<kend; ++k ) {
961 const IntrinsicType a1( A.load(i ,k) );
962 const IntrinsicType a2( A.load(i+
IT::size ,k) );
963 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
964 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
965 const IntrinsicType b1(
set( B(k,j ) ) );
966 const IntrinsicType b2(
set( B(k,j+1UL) ) );
967 xmm1 = xmm1 + a1 * b1;
968 xmm2 = xmm2 + a2 * b1;
969 xmm3 = xmm3 + a3 * b1;
970 xmm4 = xmm4 + a4 * b1;
971 xmm5 = xmm5 + a1 * b2;
972 xmm6 = xmm6 + a2 * b2;
973 xmm7 = xmm7 + a3 * b2;
974 xmm8 = xmm8 + a4 * b2;
977 (~C).
store( i , j , xmm1 );
981 (~C).
store( i , j+1UL, xmm5 );
// Single remaining column for the IT::size*4 stage (its guard is not visible).
989 const size_t kbegin( ( IsLower<MT5>::value )
990 ?( ( IsUpper<MT4>::value )
991 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
992 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
993 :( IsUpper<MT4>::value ? i : 0UL ) );
994 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
996 IntrinsicType xmm1, xmm2, xmm3, xmm4;
998 for(
size_t k=kbegin; k<kend; ++k ) {
999 const IntrinsicType b1(
set( B(k,j) ) );
1000 xmm1 = xmm1 + A.load(i ,k) * b1;
1001 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1002 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1003 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1006 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size*2 rows, two columns of C per iteration ---
1017 for( ; (j+2UL) <= N; j+=2UL )
1019 const size_t kbegin( ( IsLower<MT5>::value )
1020 ?( ( IsUpper<MT4>::value )
1021 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1022 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1023 :( IsUpper<MT4>::value ? i : 0UL ) );
1024 const size_t kend( ( IsUpper<MT5>::value )
1025 ?( ( IsLower<MT4>::value )
1026 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1027 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1028 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
1030 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1032 for(
size_t k=kbegin; k<kend; ++k ) {
1033 const IntrinsicType a1( A.load(i ,k) );
1034 const IntrinsicType a2( A.load(i+
IT::size,k) );
1035 const IntrinsicType b1(
set( B(k,j ) ) );
1036 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1037 xmm1 = xmm1 + a1 * b1;
1038 xmm2 = xmm2 + a2 * b1;
1039 xmm3 = xmm3 + a1 * b2;
1040 xmm4 = xmm4 + a2 * b2;
1043 (~C).
store( i , j , xmm1 );
1045 (~C).
store( i , j+1UL, xmm3 );
// Single remaining column for the IT::size*2 stage (its guard is not visible).
1051 const size_t kbegin( ( IsLower<MT5>::value )
1052 ?( ( IsUpper<MT4>::value )
1053 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1054 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1055 :( IsUpper<MT4>::value ? i : 0UL ) );
1056 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
1058 IntrinsicType xmm1, xmm2;
1060 for(
size_t k=kbegin; k<kend; ++k ) {
1061 const IntrinsicType b1(
set( B(k,j) ) );
1062 xmm1 = xmm1 + A.load(i ,k) * b1;
1063 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1066 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size rows, two columns of C per iteration ---
1075 for( ; (j+2UL) <= N; j+=2UL )
1077 const size_t kbegin( ( IsLower<MT5>::value )
1078 ?( ( IsUpper<MT4>::value )
1079 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1080 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1081 :( IsUpper<MT4>::value ? i : 0UL ) );
1082 const size_t kend( ( IsUpper<MT5>::value )
1083 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1086 IntrinsicType xmm1, xmm2;
1088 for(
size_t k=kbegin; k<kend; ++k ) {
1089 const IntrinsicType a1( A.load(i,k) );
1090 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1091 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1094 (~C).
store( i, j , xmm1 );
1095 (~C).
store( i, j+1UL, xmm2 );
// Single remaining column for the IT::size stage; k runs up to K here.
1100 const size_t kbegin( ( IsLower<MT5>::value )
1101 ?( ( IsUpper<MT4>::value )
1102 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1103 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1104 :( IsUpper<MT4>::value ? i : 0UL ) );
1108 for(
size_t k=kbegin; k<K; ++k ) {
1109 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1112 (~C).
store( i, j, xmm1 );
// Large-matrix assignment kernel, non-vectorized case: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAssignKernel.
1133 template<
typename MT3
1136 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1137 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1139 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized case with a row-major target
// (DenseMatrix<MT3,false>): forwards to selectSmallAssignKernel on the
// derived matrix type obtained via ~C.
1159 template<
typename MT3
1162 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1163 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1165 selectSmallAssignKernel( ~C, A, B );
// Large-matrix assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>). The iteration space is tiled into blocks of
// iblock x jblock x kblock; for each k-block the kernel loads the current
// values of C, accumulates A.load(...) * set(B(k,j)) products, and stores
// the result back.
// NOTE(review): the loops over the row index i, the definitions of i1/i2/i3,
// the body of the initial (j,i) loop nest and all closing braces are missing
// from this excerpt.
1185 template<
typename MT3
1188 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1189 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1191 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
1193 const size_t M( A.rows() );
1194 const size_t N( B.columns() );
1195 const size_t K( A.columns() );
// Tile sizes along the i, j and k dimensions.
1197 const size_t iblock( 128UL );
1198 const size_t jblock( 64UL );
1199 const size_t kblock( 128UL );
1201 for(
size_t ii=0UL; ii<M; ii+=iblock )
// Row tile end, clamped to M.
1203 const size_t iend(
min( ii+iblock, M ) );
1205 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Column tile end, clamped to N.
1207 const size_t jend(
min( jj+jblock, N ) );
// Pass over every element of the current C tile before accumulation starts
// (loop body not visible here; since the kernels below load from C first,
// presumably a reset of C(i,j) — TODO confirm).
1209 for(
size_t j=jj; j<jend; ++j ) {
1210 for(
size_t i=ii; i<iend; ++i ) {
1215 for(
size_t kk=0UL; kk<K; kk+=kblock )
// k tile end, clamped to K.
1217 const size_t ktmp(
min( kk+kblock, K ) );
// --- IT::size*4 rows, two columns per iteration ---
1229 for( ; (j+2UL) <= jend; j+=2UL )
// k-range: the current k tile, narrowed by the structure of A and B.
1231 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1232 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1233 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1234 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Accumulators start from the values currently stored in C.
1236 IntrinsicType xmm1( (~C).
load(i ,j ) );
1237 IntrinsicType xmm2( (~C).
load(i1,j ) );
1238 IntrinsicType xmm3( (~C).
load(i2,j ) );
1239 IntrinsicType xmm4( (~C).
load(i3,j ) );
1240 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
1241 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
1242 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
1243 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
1245 for(
size_t k=kbegin; k<kend; ++k ) {
1246 const IntrinsicType a1( A.load(i ,k) );
1247 const IntrinsicType a2( A.load(i1,k) );
1248 const IntrinsicType a3( A.load(i2,k) );
1249 const IntrinsicType a4( A.load(i3,k) );
1250 const IntrinsicType b1(
set( B(k,j ) ) );
1251 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1252 xmm1 = xmm1 + a1 * b1;
1253 xmm2 = xmm2 + a2 * b1;
1254 xmm3 = xmm3 + a3 * b1;
1255 xmm4 = xmm4 + a4 * b1;
1256 xmm5 = xmm5 + a1 * b2;
1257 xmm6 = xmm6 + a2 * b2;
1258 xmm7 = xmm7 + a3 * b2;
1259 xmm8 = xmm8 + a4 * b2;
1262 (~C).
store( i , j , xmm1 );
1263 (~C).
store( i1, j , xmm2 );
1264 (~C).
store( i2, j , xmm3 );
1265 (~C).
store( i3, j , xmm4 );
1266 (~C).
store( i , j+1UL, xmm5 );
1267 (~C).
store( i1, j+1UL, xmm6 );
1268 (~C).
store( i2, j+1UL, xmm7 );
1269 (~C).
store( i3, j+1UL, xmm8 );
// Single remaining column for the IT::size*4 stage (its guard is not visible).
1274 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1275 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1276 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1277 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1279 IntrinsicType xmm1( (~C).
load(i ,j) );
1280 IntrinsicType xmm2( (~C).
load(i1,j) );
1281 IntrinsicType xmm3( (~C).
load(i2,j) );
1282 IntrinsicType xmm4( (~C).
load(i3,j) );
1284 for(
size_t k=kbegin; k<kend; ++k ) {
1285 const IntrinsicType b1(
set( B(k,j) ) );
1286 xmm1 = xmm1 + A.load(i ,k) * b1;
1287 xmm2 = xmm2 + A.load(i1,k) * b1;
1288 xmm3 = xmm3 + A.load(i2,k) * b1;
1289 xmm4 = xmm4 + A.load(i3,k) * b1;
1292 (~C).
store( i , j, xmm1 );
1293 (~C).
store( i1, j, xmm2 );
1294 (~C).
store( i2, j, xmm3 );
1295 (~C).
store( i3, j, xmm4 );
// --- IT::size*2 rows, four columns per iteration ---
1305 for( ; (j+4UL) <= jend; j+=4UL )
1307 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1308 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1309 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1310 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column for columns j .. j+3.
1312 IntrinsicType xmm1( (~C).
load(i ,j ) );
1313 IntrinsicType xmm2( (~C).
load(i1,j ) );
1314 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
1315 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
1316 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
1317 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
1318 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
1319 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
1321 for(
size_t k=kbegin; k<kend; ++k ) {
1322 const IntrinsicType a1( A.load(i ,k) );
1323 const IntrinsicType a2( A.load(i1,k) );
1324 const IntrinsicType b1(
set( B(k,j ) ) );
1325 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1326 const IntrinsicType b3(
set( B(k,j+2UL) ) );
1327 const IntrinsicType b4(
set( B(k,j+3UL) ) );
1328 xmm1 = xmm1 + a1 * b1;
1329 xmm2 = xmm2 + a2 * b1;
1330 xmm3 = xmm3 + a1 * b2;
1331 xmm4 = xmm4 + a2 * b2;
1332 xmm5 = xmm5 + a1 * b3;
1333 xmm6 = xmm6 + a2 * b3;
1334 xmm7 = xmm7 + a1 * b4;
1335 xmm8 = xmm8 + a2 * b4;
1338 (~C).
store( i , j , xmm1 );
1339 (~C).
store( i1, j , xmm2 );
1340 (~C).
store( i , j+1UL, xmm3 );
1341 (~C).
store( i1, j+1UL, xmm4 );
1342 (~C).
store( i , j+2UL, xmm5 );
1343 (~C).
store( i1, j+2UL, xmm6 );
1344 (~C).
store( i , j+3UL, xmm7 );
1345 (~C).
store( i1, j+3UL, xmm8 );
// --- IT::size*2 rows, two columns per iteration ---
1348 for( ; (j+2UL) <= jend; j+=2UL )
1350 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1351 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1352 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1353 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1355 IntrinsicType xmm1( (~C).
load(i ,j ) );
1356 IntrinsicType xmm2( (~C).
load(i1,j ) );
1357 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
1358 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
1360 for(
size_t k=kbegin; k<kend; ++k ) {
1361 const IntrinsicType a1( A.load(i ,k) );
1362 const IntrinsicType a2( A.load(i1,k) );
1363 const IntrinsicType b1(
set( B(k,j ) ) );
1364 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1365 xmm1 = xmm1 + a1 * b1;
1366 xmm2 = xmm2 + a2 * b1;
1367 xmm3 = xmm3 + a1 * b2;
1368 xmm4 = xmm4 + a2 * b2;
1371 (~C).
store( i , j , xmm1 );
1372 (~C).
store( i1, j , xmm2 );
1373 (~C).
store( i , j+1UL, xmm3 );
1374 (~C).
store( i1, j+1UL, xmm4 );
// Single remaining column for the IT::size*2 stage (its guard is not visible).
1379 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1380 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1381 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1382 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1384 IntrinsicType xmm1( (~C).
load(i ,j) );
1385 IntrinsicType xmm2( (~C).
load(i1,j) );
1387 for(
size_t k=kbegin; k<kend; ++k ) {
1388 const IntrinsicType b1(
set( B(k,j) ) );
1389 xmm1 = xmm1 + A.load(i ,k) * b1;
1390 xmm2 = xmm2 + A.load(i1,k) * b1;
1393 (~C).
store( i , j, xmm1 );
1394 (~C).
store( i1, j, xmm2 );
// --- IT::size rows, one column per iteration ---
1400 for(
size_t j=jj; j<jend; ++j )
1402 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1403 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1404 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
1405 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1407 IntrinsicType xmm1( (~C).
load(i,j) );
1409 for(
size_t k=kbegin; k<kend; ++k ) {
1410 const IntrinsicType b1(
set( B(k,j) ) );
1411 xmm1 = xmm1 + A.load(i,k) * b1;
1414 (~C).
store( i, j, xmm1 );
// BLAS-stage assignment kernel, fallback case: selected when
// UseDefaultKernel<MT3,MT4,MT5> is true (BLAS mode off or no BLAS kernel
// applicable), and forwards to selectLargeAssignKernel.
1438 template<
typename MT3
1441 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
1442 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1444 selectLargeAssignKernel( C, A, B );
// BLAS-stage assignment kernel for single-precision operands (enabled by
// UseSinglePrecisionKernel). Uses strmm when A or B is triangular — applying
// the triangular operand from the left (A) or from the right (B) — and
// sgemm with factors 1.0F / 0.0F otherwise.
// NOTE(review): the statements preceding each strmm call and the final else
// are on lines missing from this excerpt.
1464 template<
typename MT3
1467 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
1468 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1470 if( IsTriangular<MT4>::value ) {
1472 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
1474 else if( IsTriangular<MT5>::value ) {
1476 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
1479 sgemm( C, A, B, 1.0F, 0.0F );
// BLAS-stage assignment kernel for double-precision operands (enabled by
// UseDoublePrecisionKernel). Uses dtrmm when A or B is triangular — applying
// the triangular operand from the left (A) or from the right (B) — and
// dgemm with factors 1.0 / 0.0 otherwise.
// NOTE(review): the statements preceding each dtrmm call and the final else
// are on lines missing from this excerpt.
1501 template<
typename MT3
1504 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
1505 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1507 if( IsTriangular<MT4>::value ) {
1509 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
1511 else if( IsTriangular<MT5>::value ) {
1513 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
1516 dgemm( C, A, B, 1.0, 0.0 );
// BLAS-stage assignment kernel for complex<float> operands (enabled by
// UseSinglePrecisionComplexKernel). Uses ctrmm when A or B is triangular —
// applying the triangular operand from the left (A) or from the right (B) —
// and cgemm with factors (1,0) / (0,0) otherwise.
// NOTE(review): the statements preceding each ctrmm call and the final else
// are on lines missing from this excerpt.
1538 template<
typename MT3
1541 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1542 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1544 if( IsTriangular<MT4>::value ) {
1546 ctrmm( C, A, CblasLeft,
1547 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
1548 complex<float>( 1.0F, 0.0F ) );
1550 else if( IsTriangular<MT5>::value ) {
1552 ctrmm( C, B, CblasRight,
1553 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
1554 complex<float>( 1.0F, 0.0F ) );
1557 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS-stage assignment kernel for complex<double> operands (enabled by
// UseDoublePrecisionComplexKernel). Uses ztrmm when A or B is triangular —
// applying the triangular operand from the left (A) or from the right (B) —
// and zgemm with factors (1,0) / (0,0) otherwise.
// NOTE(review): the statements preceding each ztrmm call and the final else
// are on lines missing from this excerpt.
1579 template<
typename MT3
1582 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
1583 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1585 if( IsTriangular<MT4>::value ) {
1587 ztrmm( C, A, CblasLeft,
1588 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
1589 complex<double>( 1.0, 0.0 ) );
1591 else if( IsTriangular<MT5>::value ) {
1593 ztrmm( C, B, CblasRight,
1594 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
1595 complex<double>( 1.0, 0.0 ) );
1598 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
1618 template<
typename MT
1620 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1625 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1637 const TmpType tmp(
serial( rhs ) );
1658 template<
typename MT >
1659 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1669 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1671 else if( IsSymmetric<MT1>::value )
1692 template<
typename MT
1694 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1702 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1706 LT A(
serial( rhs.lhs_ ) );
1707 RT B(
serial( rhs.rhs_ ) );
1716 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the addition assignment C += A * B.
// Forwards to either selectSmallAddAssignKernel or selectBlasAddAssignKernel;
// a diagonal A (MT4) is one of the conditions tested for the first call.
// NOTE(review): the remainder of the condition and the template parameters
// MT4 / MT5 are on lines missing from this excerpt.
1732 template<
typename MT3
1735 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1737 if( ( IsDiagonal<MT4>::value ) ||
1739 selectSmallAddAssignKernel( C, A, B );
1741 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition-assignment kernel C += A * B, enabled when
// neither A (MT4) nor B (MT5) is diagonal. The innermost loop over i is
// unrolled by two, followed by a statement for the element at ipos.
// NOTE(review): else-branches of the range expressions, the guard in front of
// the final ipos statement and all closing braces are missing from this
// excerpt.
1760 template<
typename MT3
1763 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1764 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M x K times K x N product dimensions.
1766 const size_t M( A.rows() );
1767 const size_t N( B.columns() );
1768 const size_t K( A.columns() );
1770 for(
size_t j=0UL; j<N; ++j )
// Range of k in column j of B, narrowed when B is (strictly) lower/upper.
1772 const size_t kbegin( ( IsLower<MT5>::value )
1773 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1775 const size_t kend( ( IsUpper<MT5>::value )
1776 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1780 for(
size_t k=kbegin; k<kend; ++k )
// Row range in column k of A, narrowed when A is (strictly) lower/upper.
1782 const size_t ibegin( ( IsLower<MT4>::value )
1783 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
1785 const size_t iend( ( IsUpper<MT4>::value )
1786 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin plus the row count rounded down to an even number
// (size_t(-2) clears the lowest bit).
1790 const size_t inum( iend - ibegin );
1791 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1793 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1794 C(i ,j) += A(i ,k) * B(k,j);
1795 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Leftover row when the row count is odd (guard not visible here).
1798 C(ipos,j) += A(ipos,k) * B(k,j);
// Default addition-assignment kernel C += A * B for a non-diagonal A (MT4)
// and a diagonal B (MT5): each update is C(i,j) += A(i,j) * B(j,j), with the
// row loop unrolled by two.
// NOTE(review): else-branches of the range expressions and the guard in front
// of the final ipos statement are missing from this excerpt.
1820 template<
typename MT3
1823 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1824 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1828 const size_t M( A.rows() );
1829 const size_t N( B.columns() );
1831 for(
size_t j=0UL; j<N; ++j )
// Row range in column j of A, narrowed when A is (strictly) lower/upper.
1833 const size_t ibegin( ( IsLower<MT4>::value )
1834 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1836 const size_t iend( ( IsUpper<MT4>::value )
1837 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number.
1841 const size_t inum( iend - ibegin );
1842 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1844 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1845 C(i ,j) += A(i ,j) * B(j,j);
1846 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Leftover row when the row count is odd (guard not visible here).
1849 C(ipos,j) += A(ipos,j) * B(j,j);
// Default addition-assignment kernel C += A * B for a diagonal A (MT4) and a
// non-diagonal B (MT5): each update is C(i,j) += A(i,i) * B(i,j), with the
// row loop unrolled by two.
// NOTE(review): else-branches of the range expressions and the guard in front
// of the final ipos statement are missing from this excerpt.
1870 template<
typename MT3
1873 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1874 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1878 const size_t M( A.rows() );
1879 const size_t N( B.columns() );
1881 for(
size_t j=0UL; j<N; ++j )
// Row range in column j of B, narrowed when B is (strictly) lower/upper.
1883 const size_t ibegin( ( IsLower<MT5>::value )
1884 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1886 const size_t iend( ( IsUpper<MT5>::value )
1887 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number.
1891 const size_t inum( iend - ibegin );
1892 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1894 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1895 C(i ,j) += A(i ,i ) * B(i ,j);
1896 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row when the row count is odd (guard not visible here).
1899 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition-assignment kernel C += A * B for two diagonal operands:
// only the diagonal of C is updated, with C(i,i) += A(i,i) * B(i,i).
// NOTE(review): statements preceding the loop (original lines 1926-1927) are
// not visible in this excerpt.
1920 template<
typename MT3
1923 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
1924 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1928 for(
size_t i=0UL; i<A.rows(); ++i ) {
1929 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition-assignment kernel, non-vectorized case: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAddAssignKernel.
1949 template<
typename MT3
1952 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1953 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1955 selectDefaultAddAssignKernel( C, A, B );
// Small-matrix addition-assignment kernel, vectorized case with a row-major
// target (DenseMatrix<MT3,false>). The visible branch conditions choose a
// strategy based on which operand type is resizable and, otherwise, on
// whether B has no more elements than A.
// NOTE(review): the body of every branch is missing from this excerpt.
1975 template<
typename MT3
1978 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1979 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1986 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1990 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1994 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
// Small-matrix addition-assignment kernel, vectorized case with a
// column-major target (DenseMatrix<MT3,true>). Computes C += A * B: the
// accumulators are initialized from the current contents of C via (~C).load,
// products of A.load(...) with broadcast elements of B are added, and the
// result is stored back into C.
// The visible fragments form a cascade that processes IT::size*8, IT::size*4,
// IT::size*2 and finally IT::size rows of C per step.
// NOTE(review): the outer loops over the row index i, most accumulator
// declarations, several store calls and all closing braces are missing from
// this excerpt.
2021 template<
typename MT3
2024 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2025 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2027 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
2029 const size_t M( A.rows() );
2030 const size_t N( B.columns() );
2031 const size_t K( A.columns() );
// --- Stage: IT::size*8 rows, one column of C per iteration ---
2036 for(
size_t j=0UL; j<N; ++j )
// k-range restricted by the lower/upper structure of A (MT4) and B (MT5).
2038 const size_t kbegin( ( IsLower<MT5>::value )
2039 ?( ( IsUpper<MT4>::value )
2040 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2041 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2042 :( IsUpper<MT4>::value ? i : 0UL ) );
2043 const size_t kend( ( IsUpper<MT5>::value )
2044 ?( ( IsLower<MT4>::value )
2045 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2046 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
2047 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Accumulator starts from the value currently stored in C.
2049 IntrinsicType xmm1( (~C).
load(i ,j) );
2058 for(
size_t k=kbegin; k<kend; ++k ) {
2059 const IntrinsicType b1(
set( B(k,j) ) );
2060 xmm1 = xmm1 + A.load(i ,k) * b1;
2061 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
2062 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
2063 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
2064 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
2065 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
2066 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
2067 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
2070 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size*4 rows, two columns of C per iteration ---
2085 for( ; (j+2UL) <= N; j+=2UL )
2087 const size_t kbegin( ( IsLower<MT5>::value )
2088 ?( ( IsUpper<MT4>::value )
2089 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2090 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2091 :( IsUpper<MT4>::value ? i : 0UL ) );
2092 const size_t kend( ( IsUpper<MT5>::value )
2093 ?( ( IsLower<MT4>::value )
2094 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2095 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2096 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
2098 IntrinsicType xmm1( (~C).
load(i ,j ) );
2102 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
2107 for(
size_t k=kbegin; k<kend; ++k ) {
2108 const IntrinsicType a1( A.load(i ,k) );
2109 const IntrinsicType a2( A.load(i+
IT::size ,k) );
2110 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
2111 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
2112 const IntrinsicType b1(
set( B(k,j ) ) );
2113 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2114 xmm1 = xmm1 + a1 * b1;
2115 xmm2 = xmm2 + a2 * b1;
2116 xmm3 = xmm3 + a3 * b1;
2117 xmm4 = xmm4 + a4 * b1;
2118 xmm5 = xmm5 + a1 * b2;
2119 xmm6 = xmm6 + a2 * b2;
2120 xmm7 = xmm7 + a3 * b2;
2121 xmm8 = xmm8 + a4 * b2;
2124 (~C).
store( i , j , xmm1 );
2128 (~C).
store( i , j+1UL, xmm5 );
// Single remaining column for the IT::size*4 stage (its guard is not visible).
2136 const size_t kbegin( ( IsLower<MT5>::value )
2137 ?( ( IsUpper<MT4>::value )
2138 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2139 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2140 :( IsUpper<MT4>::value ? i : 0UL ) );
2141 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
2143 IntrinsicType xmm1( (~C).
load(i ,j) );
2148 for(
size_t k=kbegin; k<kend; ++k ) {
2149 const IntrinsicType b1(
set( B(k,j) ) );
2150 xmm1 = xmm1 + A.load(i ,k) * b1;
2151 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
2152 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
2153 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
2156 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size*2 rows, two columns of C per iteration ---
2167 for( ; (j+2UL) <= N; j+=2UL )
2169 const size_t kbegin( ( IsLower<MT5>::value )
2170 ?( ( IsUpper<MT4>::value )
2171 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2172 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2173 :( IsUpper<MT4>::value ? i : 0UL ) );
2174 const size_t kend( ( IsUpper<MT5>::value )
2175 ?( ( IsLower<MT4>::value )
2176 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2177 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2178 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
2180 IntrinsicType xmm1( (~C).
load(i ,j ) );
2182 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
2185 for(
size_t k=kbegin; k<kend; ++k ) {
2186 const IntrinsicType a1( A.load(i ,k) );
2187 const IntrinsicType a2( A.load(i+
IT::size,k) );
2188 const IntrinsicType b1(
set( B(k,j ) ) );
2189 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2190 xmm1 = xmm1 + a1 * b1;
2191 xmm2 = xmm2 + a2 * b1;
2192 xmm3 = xmm3 + a1 * b2;
2193 xmm4 = xmm4 + a2 * b2;
2196 (~C).
store( i , j , xmm1 );
2198 (~C).
store( i , j+1UL, xmm3 );
// Single remaining column for the IT::size*2 stage (its guard is not visible).
2204 const size_t kbegin( ( IsLower<MT5>::value )
2205 ?( ( IsUpper<MT4>::value )
2206 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2207 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2208 :( IsUpper<MT4>::value ? i : 0UL ) );
2209 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
2211 IntrinsicType xmm1( (~C).
load(i ,j) );
2214 for(
size_t k=kbegin; k<kend; ++k ) {
2215 const IntrinsicType b1(
set( B(k,j) ) );
2216 xmm1 = xmm1 + A.load(i ,k) * b1;
2217 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
2220 (~C).
store( i , j, xmm1 );
// --- Stage: IT::size rows, two columns of C per iteration ---
2229 for( ; (j+2UL) <= N; j+=2UL )
2231 const size_t kbegin( ( IsLower<MT5>::value )
2232 ?( ( IsUpper<MT4>::value )
2233 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2234 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2235 :( IsUpper<MT4>::value ? i : 0UL ) );
2236 const size_t kend( ( IsUpper<MT5>::value )
2237 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2240 IntrinsicType xmm1( (~C).
load(i,j ) );
2241 IntrinsicType xmm2( (~C).
load(i,j+1UL) );
2243 for(
size_t k=kbegin; k<kend; ++k ) {
2244 const IntrinsicType a1( A.load(i,k) );
2245 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2246 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2249 (~C).
store( i, j , xmm1 );
2250 (~C).
store( i, j+1UL, xmm2 );
// Single remaining column for the IT::size stage; k runs up to K here.
2255 const size_t kbegin( ( IsLower<MT5>::value )
2256 ?( ( IsUpper<MT4>::value )
2257 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2258 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2259 :( IsUpper<MT4>::value ? i : 0UL ) );
2261 IntrinsicType xmm1( (~C).
load(i,j) );
2263 for(
size_t k=kbegin; k<K; ++k ) {
2264 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
2267 (~C).
store( i, j, xmm1 );
// Large-matrix addition-assignment kernel, non-vectorized case: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false, and simply forwards to
// selectDefaultAddAssignKernel.
2288 template<
typename MT3
2291 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2292 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2294 selectDefaultAddAssignKernel( C, A, B );
// Large-kernel addition assignment for targets of type DenseMatrix<MT3,false>
// (presumably the row-major target case -- TODO confirm the meaning of the
// storage-order flag). Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
2314 template<
typename MT3
2317 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2318 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Delegates to the small kernel; ~C unwraps the DenseMatrix wrapper to the
// concrete matrix type before forwarding.
2320 selectSmallAddAssignKernel( ~C, A, B );
2340 template<
typename MT3
2343 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2344 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2346 typedef IntrinsicTrait<ElementType> IT;
2348 const size_t M( A.rows() );
2349 const size_t N( B.columns() );
2350 const size_t K( A.columns() );
2352 const size_t iblock( 128UL );
2353 const size_t jblock( 64UL );
2354 const size_t kblock( 128UL );
2356 for(
size_t ii=0UL; ii<M; ii+=iblock )
2358 const size_t iend(
min( ii+iblock, M ) );
2360 for(
size_t jj=0UL; jj<N; jj+=jblock )
2362 const size_t jend(
min( jj+jblock, N ) );
2364 for(
size_t kk=0UL; kk<K; kk+=kblock )
2366 const size_t ktmp(
min( kk+kblock, K ) );
2378 for( ; (j+2UL) <= jend; j+=2UL )
2380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2382 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
2383 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2385 IntrinsicType xmm1( (~C).
load(i ,j ) );
2386 IntrinsicType xmm2( (~C).
load(i1,j ) );
2387 IntrinsicType xmm3( (~C).
load(i2,j ) );
2388 IntrinsicType xmm4( (~C).
load(i3,j ) );
2389 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
2390 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
2391 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
2392 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
2394 for(
size_t k=kbegin; k<kend; ++k ) {
2395 const IntrinsicType a1( A.load(i ,k) );
2396 const IntrinsicType a2( A.load(i1,k) );
2397 const IntrinsicType a3( A.load(i2,k) );
2398 const IntrinsicType a4( A.load(i3,k) );
2399 const IntrinsicType b1(
set( B(k,j ) ) );
2400 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2401 xmm1 = xmm1 + a1 * b1;
2402 xmm2 = xmm2 + a2 * b1;
2403 xmm3 = xmm3 + a3 * b1;
2404 xmm4 = xmm4 + a4 * b1;
2405 xmm5 = xmm5 + a1 * b2;
2406 xmm6 = xmm6 + a2 * b2;
2407 xmm7 = xmm7 + a3 * b2;
2408 xmm8 = xmm8 + a4 * b2;
2411 (~C).
store( i , j , xmm1 );
2412 (~C).
store( i1, j , xmm2 );
2413 (~C).
store( i2, j , xmm3 );
2414 (~C).
store( i3, j , xmm4 );
2415 (~C).
store( i , j+1UL, xmm5 );
2416 (~C).
store( i1, j+1UL, xmm6 );
2417 (~C).
store( i2, j+1UL, xmm7 );
2418 (~C).
store( i3, j+1UL, xmm8 );
2423 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2424 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2425 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
2426 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2428 IntrinsicType xmm1( (~C).
load(i ,j) );
2429 IntrinsicType xmm2( (~C).
load(i1,j) );
2430 IntrinsicType xmm3( (~C).
load(i2,j) );
2431 IntrinsicType xmm4( (~C).
load(i3,j) );
2433 for(
size_t k=kbegin; k<kend; ++k ) {
2434 const IntrinsicType b1(
set( B(k,j) ) );
2435 xmm1 = xmm1 + A.load(i ,k) * b1;
2436 xmm2 = xmm2 + A.load(i1,k) * b1;
2437 xmm3 = xmm3 + A.load(i2,k) * b1;
2438 xmm4 = xmm4 + A.load(i3,k) * b1;
2441 (~C).
store( i , j, xmm1 );
2442 (~C).
store( i1, j, xmm2 );
2443 (~C).
store( i2, j, xmm3 );
2444 (~C).
store( i3, j, xmm4 );
2454 for( ; (j+4UL) <= jend; j+=4UL )
2456 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2457 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2458 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2459 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2461 IntrinsicType xmm1( (~C).
load(i ,j ) );
2462 IntrinsicType xmm2( (~C).
load(i1,j ) );
2463 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
2464 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
2465 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
2466 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
2467 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
2468 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
2470 for(
size_t k=kbegin; k<kend; ++k ) {
2471 const IntrinsicType a1( A.load(i ,k) );
2472 const IntrinsicType a2( A.load(i1,k) );
2473 const IntrinsicType b1(
set( B(k,j ) ) );
2474 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2475 const IntrinsicType b3(
set( B(k,j+2UL) ) );
2476 const IntrinsicType b4(
set( B(k,j+3UL) ) );
2477 xmm1 = xmm1 + a1 * b1;
2478 xmm2 = xmm2 + a2 * b1;
2479 xmm3 = xmm3 + a1 * b2;
2480 xmm4 = xmm4 + a2 * b2;
2481 xmm5 = xmm5 + a1 * b3;
2482 xmm6 = xmm6 + a2 * b3;
2483 xmm7 = xmm7 + a1 * b4;
2484 xmm8 = xmm8 + a2 * b4;
2487 (~C).
store( i , j , xmm1 );
2488 (~C).
store( i1, j , xmm2 );
2489 (~C).
store( i , j+1UL, xmm3 );
2490 (~C).
store( i1, j+1UL, xmm4 );
2491 (~C).
store( i , j+2UL, xmm5 );
2492 (~C).
store( i1, j+2UL, xmm6 );
2493 (~C).
store( i , j+3UL, xmm7 );
2494 (~C).
store( i1, j+3UL, xmm8 );
2497 for( ; (j+2UL) <= jend; j+=2UL )
2499 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2500 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2501 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2502 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2504 IntrinsicType xmm1( (~C).
load(i ,j ) );
2505 IntrinsicType xmm2( (~C).
load(i1,j ) );
2506 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
2507 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
2509 for(
size_t k=kbegin; k<kend; ++k ) {
2510 const IntrinsicType a1( A.load(i ,k) );
2511 const IntrinsicType a2( A.load(i1,k) );
2512 const IntrinsicType b1(
set( B(k,j ) ) );
2513 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2514 xmm1 = xmm1 + a1 * b1;
2515 xmm2 = xmm2 + a2 * b1;
2516 xmm3 = xmm3 + a1 * b2;
2517 xmm4 = xmm4 + a2 * b2;
2520 (~C).
store( i , j , xmm1 );
2521 (~C).
store( i1, j , xmm2 );
2522 (~C).
store( i , j+1UL, xmm3 );
2523 (~C).
store( i1, j+1UL, xmm4 );
2528 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2529 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2530 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2531 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2533 IntrinsicType xmm1( (~C).
load(i ,j) );
2534 IntrinsicType xmm2( (~C).
load(i1,j) );
2536 for(
size_t k=kbegin; k<kend; ++k ) {
2537 const IntrinsicType b1(
set( B(k,j) ) );
2538 xmm1 = xmm1 + A.load(i ,k) * b1;
2539 xmm2 = xmm2 + A.load(i1,k) * b1;
2542 (~C).
store( i , j, xmm1 );
2543 (~C).
store( i1, j, xmm2 );
2549 for(
size_t j=jj; j<jend; ++j )
2551 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2552 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2553 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
2554 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2556 IntrinsicType xmm1( (~C).
load(i,j) );
2558 for(
size_t k=kbegin; k<kend; ++k ) {
2559 const IntrinsicType b1(
set( B(k,j) ) );
2560 xmm1 = xmm1 + A.load(i,k) * b1;
2563 (~C).
store( i, j, xmm1 );
// BLAS-level addition assignment, fallback overload.
// Enabled when UseDefaultKernel<MT3,MT4,MT5> holds, in which case no BLAS
// routine is called from this overload.
2587 template<
typename MT3
2590 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2591 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Forward to the large (blocked) addition assignment kernel.
2593 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition assignment for single precision operands.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
2613 template<
typename MT3
2616 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2617 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left;
// the lower/upper flag follows IsLower<MT4>.
2619 if( IsTriangular<MT4>::value ) {
2621 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Case 2: B is triangular -> triangular multiply with B applied from the right;
// the lower/upper flag follows IsLower<MT5>.
2624 else if( IsTriangular<MT5>::value ) {
2626 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Case 3: general product. The two trailing scalars are presumably the alpha and
// beta factors of the GEMM operation (both 1 here) -- TODO confirm against the wrapper.
2630 sgemm( C, A, B, 1.0F, 1.0F );
// BLAS-based addition assignment for double precision operands.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
2652 template<
typename MT3
2655 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2656 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left.
2658 if( IsTriangular<MT4>::value ) {
2660 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
2663 else if( IsTriangular<MT5>::value ) {
2665 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Case 3: general product. The two trailing scalars are presumably the alpha and
// beta factors of the GEMM operation (both 1 here) -- TODO confirm against the wrapper.
2669 dgemm( C, A, B, 1.0, 1.0 );
// BLAS-based addition assignment for single precision complex operands.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
2691 template<
typename MT3
2694 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2695 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left,
// using the complex unit (1,0) as scaling factor.
2697 if( IsTriangular<MT4>::value ) {
2699 ctrmm( tmp, A, CblasLeft,
2700 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2701 complex<float>( 1.0F, 0.0F ) );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
2704 else if( IsTriangular<MT5>::value ) {
2706 ctrmm( tmp, B, CblasRight,
2707 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2708 complex<float>( 1.0F, 0.0F ) );
// Case 3: general product. The two trailing scalars are presumably the alpha and
// beta factors of the GEMM operation (both (1,0) here) -- TODO confirm.
2712 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based addition assignment for double precision complex operands.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
2734 template<
typename MT3
2737 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2738 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left,
// using the complex unit (1,0) as scaling factor.
2740 if( IsTriangular<MT4>::value ) {
2742 ztrmm( tmp, A, CblasLeft,
2743 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2744 complex<double>( 1.0, 0.0 ) );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
2747 else if( IsTriangular<MT5>::value ) {
2749 ztrmm( tmp, B, CblasRight,
2750 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2751 complex<double>( 1.0, 0.0 ) );
// Case 3: general product. The two trailing scalars are presumably the alpha and
// beta factors of the GEMM operation (both (1,0) here) -- TODO confirm.
2755 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
2777 template<
typename MT >
2778 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2788 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2790 else if( IsSymmetric<MT1>::value )
2815 template<
typename MT
2817 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
2825 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2829 LT A(
serial( rhs.lhs_ ) );
2830 RT B(
serial( rhs.rhs_ ) );
2839 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Entry point for choosing the subtraction assignment kernel.
// Picks between the small kernel and the BLAS kernel for the given operands.
// The condition starts with "A is diagonal"; its remaining operands are on a
// line not visible in this excerpt.
//   C : target matrix that is updated in place
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
2855 template<
typename MT3
2858 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2860 if( ( IsDiagonal<MT4>::value ) ||
2862 selectSmallSubAssignKernel( C, A, B );
2864 selectBlasSubAssignKernel( C, A, B );
// Default subtraction assignment kernel for the case that neither A nor B is
// diagonal. Subtracts the product element-wise via C(i,j) -= A(i,k) * B(k,j).
// NOTE(review): the alternative branches of the range conditionals and the
// guard in front of the final remainder statement are on lines not visible here.
2883 template<
typename MT3
2886 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2887 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B, K = columns of A (the contraction length).
2889 const size_t M( A.rows() );
2890 const size_t N( B.columns() );
2891 const size_t K( A.columns() );
// Outer loop over the columns of B.
2893 for(
size_t j=0UL; j<N; ++j )
// Range of k for column j, narrowed by the structure of B: a lower B starts at
// j (j+1 if strictly lower), an upper B ends at j+1 (j if strictly upper).
2895 const size_t kbegin( ( IsLower<MT5>::value )
2896 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2898 const size_t kend( ( IsUpper<MT5>::value )
2899 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2903 for(
size_t k=kbegin; k<kend; ++k )
// Range of i for column k of A, narrowed by the structure of A in the same way.
2905 const size_t ibegin( ( IsLower<MT4>::value )
2906 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2908 const size_t iend( ( IsUpper<MT4>::value )
2909 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin plus the row count rounded down to an even number (the mask
// size_t(-2) clears the lowest bit), so the loop below can handle two rows per step.
2913 const size_t inum( iend - ibegin );
2914 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2916 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2917 C(i ,j) -= A(i ,k) * B(k,j);
2918 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Remainder element at row ipos (left over when the row count is odd).
2921 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction assignment kernel for a non-diagonal A and a diagonal B.
// Only the diagonal element B(j,j) is read, so column j of C is updated with
// column j of A scaled by B(j,j).
// NOTE(review): the alternative branches of the range conditionals and the
// guard in front of the final remainder statement are on lines not visible here.
2943 template<
typename MT3
2946 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2947 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B.
2951 const size_t M( A.rows() );
2952 const size_t N( B.columns() );
2954 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the structure of A: a lower A starts at
// j (j+1 if strictly lower), an upper A ends at j+1 (j if strictly upper).
2956 const size_t ibegin( ( IsLower<MT4>::value )
2957 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2959 const size_t iend( ( IsUpper<MT4>::value )
2960 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number; the loop
// below handles two rows per step.
2964 const size_t inum( iend - ibegin );
2965 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2967 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2968 C(i ,j) -= A(i ,j) * B(j,j);
2969 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Remainder element at row ipos (left over when the row count is odd).
2972 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction assignment kernel for a diagonal A and a non-diagonal B.
// Only the diagonal element A(i,i) is read, so row i of B is scaled by A(i,i)
// before being subtracted from C.
// NOTE(review): the alternative branches of the range conditionals and the
// guard in front of the final remainder statement are on lines not visible here.
2993 template<
typename MT3
2996 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2997 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B.
3001 const size_t M( A.rows() );
3002 const size_t N( B.columns() );
3004 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the structure of B: a lower B starts at
// j (j+1 if strictly lower), an upper B ends at j+1 (j if strictly upper).
3006 const size_t ibegin( ( IsLower<MT5>::value )
3007 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
3009 const size_t iend( ( IsUpper<MT5>::value )
3010 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number; the loop
// below handles two rows per step.
3014 const size_t inum( iend - ibegin );
3015 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3017 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3018 C(i ,j) -= A(i ,i ) * B(i ,j);
3019 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Remainder element at row ipos (left over when the row count is odd).
3022 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction assignment kernel for the case that both A and B are
// diagonal. Only diagonal elements are touched.
3043 template<
typename MT3
3046 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
3047 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// One update per diagonal position i, for i in [0, A.rows()).
3051 for(
size_t i=0UL; i<A.rows(); ++i ) {
3052 C(i,i) -= A(i,i) * B(i,i);
// Small-kernel subtraction assignment, fallback overload.
// Selected through DisableIf, i.e. when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold for the given target/operand types.
3072 template<
typename MT3
3075 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3076 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Forward unchanged to the default subtraction assignment kernel.
3078 selectDefaultSubAssignKernel( C, A, B );
// Small-kernel subtraction assignment for targets of type DenseMatrix<MT3,false>
// (presumably the row-major target case -- TODO confirm the meaning of the
// storage-order flag). Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The visible code is a case distinction on the operands:
//   1. A is resizable and B is not,
//   2. B is resizable and A is not,
//   3. B has no more elements (rows * columns) than A.
// NOTE(review): the bodies of these branches and any final else branch are on
// lines not visible in this excerpt.
3098 template<
typename MT3
3101 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3102 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3109 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3113 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3117 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3144 template<
typename MT3
3147 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3148 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3150 typedef IntrinsicTrait<ElementType> IT;
3152 const size_t M( A.rows() );
3153 const size_t N( B.columns() );
3154 const size_t K( A.columns() );
3159 for(
size_t j=0UL; j<N; ++j )
3161 const size_t kbegin( ( IsLower<MT5>::value )
3162 ?( ( IsUpper<MT4>::value )
3163 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3164 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3165 :( IsUpper<MT4>::value ? i : 0UL ) );
3166 const size_t kend( ( IsUpper<MT5>::value )
3167 ?( ( IsLower<MT4>::value )
3168 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3169 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3170 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
3172 IntrinsicType xmm1( (~C).
load(i ,j) );
3181 for(
size_t k=kbegin; k<kend; ++k ) {
3182 const IntrinsicType b1(
set( B(k,j) ) );
3183 xmm1 = xmm1 - A.load(i ,k) * b1;
3184 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
3185 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
3186 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
3187 xmm5 = xmm5 - A.load(i+
IT::size*4UL,k) * b1;
3188 xmm6 = xmm6 - A.load(i+
IT::size*5UL,k) * b1;
3189 xmm7 = xmm7 - A.load(i+
IT::size*6UL,k) * b1;
3190 xmm8 = xmm8 - A.load(i+
IT::size*7UL,k) * b1;
3193 (~C).
store( i , j, xmm1 );
3208 for( ; (j+2UL) <= N; j+=2UL )
3210 const size_t kbegin( ( IsLower<MT5>::value )
3211 ?( ( IsUpper<MT4>::value )
3212 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3213 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3214 :( IsUpper<MT4>::value ? i : 0UL ) );
3215 const size_t kend( ( IsUpper<MT5>::value )
3216 ?( ( IsLower<MT4>::value )
3217 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3218 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3219 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
3221 IntrinsicType xmm1( (~C).
load(i ,j ) );
3225 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
3230 for(
size_t k=kbegin; k<kend; ++k ) {
3231 const IntrinsicType a1( A.load(i ,k) );
3232 const IntrinsicType a2( A.load(i+
IT::size ,k) );
3233 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
3234 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
3235 const IntrinsicType b1(
set( B(k,j ) ) );
3236 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3237 xmm1 = xmm1 - a1 * b1;
3238 xmm2 = xmm2 - a2 * b1;
3239 xmm3 = xmm3 - a3 * b1;
3240 xmm4 = xmm4 - a4 * b1;
3241 xmm5 = xmm5 - a1 * b2;
3242 xmm6 = xmm6 - a2 * b2;
3243 xmm7 = xmm7 - a3 * b2;
3244 xmm8 = xmm8 - a4 * b2;
3247 (~C).
store( i , j , xmm1 );
3251 (~C).
store( i , j+1UL, xmm5 );
3259 const size_t kbegin( ( IsLower<MT5>::value )
3260 ?( ( IsUpper<MT4>::value )
3261 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3262 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3263 :( IsUpper<MT4>::value ? i : 0UL ) );
3264 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
3266 IntrinsicType xmm1( (~C).
load(i ,j) );
3271 for(
size_t k=kbegin; k<kend; ++k ) {
3272 const IntrinsicType b1(
set( B(k,j) ) );
3273 xmm1 = xmm1 - A.load(i ,k) * b1;
3274 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
3275 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
3276 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
3279 (~C).
store( i , j, xmm1 );
3290 for( ; (j+2UL) <= N; j+=2UL )
3292 const size_t kbegin( ( IsLower<MT5>::value )
3293 ?( ( IsUpper<MT4>::value )
3294 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3295 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3296 :( IsUpper<MT4>::value ? i : 0UL ) );
3297 const size_t kend( ( IsUpper<MT5>::value )
3298 ?( ( IsLower<MT4>::value )
3299 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3300 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3301 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
3303 IntrinsicType xmm1( (~C).
load(i ,j ) );
3305 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3308 for(
size_t k=kbegin; k<kend; ++k ) {
3309 const IntrinsicType a1( A.load(i ,k) );
3310 const IntrinsicType a2( A.load(i+
IT::size,k) );
3311 const IntrinsicType b1(
set( B(k,j ) ) );
3312 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3313 xmm1 = xmm1 - a1 * b1;
3314 xmm2 = xmm2 - a2 * b1;
3315 xmm3 = xmm3 - a1 * b2;
3316 xmm4 = xmm4 - a2 * b2;
3319 (~C).
store( i , j , xmm1 );
3321 (~C).
store( i , j+1UL, xmm3 );
3327 const size_t kbegin( ( IsLower<MT5>::value )
3328 ?( ( IsUpper<MT4>::value )
3329 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3330 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3331 :( IsUpper<MT4>::value ? i : 0UL ) );
3332 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
3334 IntrinsicType xmm1( (~C).
load(i ,j) );
3337 for(
size_t k=kbegin; k<kend; ++k ) {
3338 const IntrinsicType b1(
set( B(k,j) ) );
3339 xmm1 = xmm1 - A.load(i ,k) * b1;
3340 xmm2 = xmm2 - A.load(i+
IT::size,k) * b1;
3343 (~C).
store( i , j, xmm1 );
3352 for( ; (j+2UL) <= N; j+=2UL )
3354 const size_t kbegin( ( IsLower<MT5>::value )
3355 ?( ( IsUpper<MT4>::value )
3356 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3357 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3358 :( IsUpper<MT4>::value ? i : 0UL ) );
3359 const size_t kend( ( IsUpper<MT5>::value )
3360 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3363 IntrinsicType xmm1( (~C).
load(i,j ) );
3364 IntrinsicType xmm2( (~C).
load(i,j+1UL) );
3366 for(
size_t k=kbegin; k<kend; ++k ) {
3367 const IntrinsicType a1( A.load(i,k) );
3368 xmm1 = xmm1 - a1 *
set( B(k,j ) );
3369 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
3372 (~C).
store( i, j , xmm1 );
3373 (~C).
store( i, j+1UL, xmm2 );
3378 const size_t kbegin( ( IsLower<MT5>::value )
3379 ?( ( IsUpper<MT4>::value )
3380 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3381 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3382 :( IsUpper<MT4>::value ? i : 0UL ) );
3384 IntrinsicType xmm1( (~C).
load(i,j) );
3386 for(
size_t k=kbegin; k<K; ++k ) {
3387 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
3390 (~C).
store( i, j, xmm1 );
// Large-kernel subtraction assignment, fallback overload.
// Selected through DisableIf, i.e. when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does not hold for the given target/operand types.
3411 template<
typename MT3
3414 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3415 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Forward unchanged to the default subtraction assignment kernel.
3417 selectDefaultSubAssignKernel( C, A, B );
// Large-kernel subtraction assignment for targets of type DenseMatrix<MT3,false>
// (presumably the row-major target case -- TODO confirm the meaning of the
// storage-order flag). Enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
3437 template<
typename MT3
3440 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3441 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Delegates to the small kernel; ~C unwraps the DenseMatrix wrapper to the
// concrete matrix type before forwarding.
3443 selectSmallSubAssignKernel( ~C, A, B );
3463 template<
typename MT3
3466 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3467 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3469 typedef IntrinsicTrait<ElementType> IT;
3471 const size_t M( A.rows() );
3472 const size_t N( B.columns() );
3473 const size_t K( A.columns() );
3475 const size_t iblock( 128UL );
3476 const size_t jblock( 64UL );
3477 const size_t kblock( 128UL );
3479 for(
size_t ii=0UL; ii<M; ii+=iblock )
3481 const size_t iend(
min( ii+iblock, M ) );
3483 for(
size_t jj=0UL; jj<N; jj+=jblock )
3485 const size_t jend(
min( jj+jblock, N ) );
3487 for(
size_t kk=0UL; kk<K; kk+=kblock )
3489 const size_t ktmp(
min( kk+kblock, K ) );
3501 for( ; (j+2UL) <= jend; j+=2UL )
3503 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3504 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3505 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3506 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3508 IntrinsicType xmm1( (~C).
load(i ,j ) );
3509 IntrinsicType xmm2( (~C).
load(i1,j ) );
3510 IntrinsicType xmm3( (~C).
load(i2,j ) );
3511 IntrinsicType xmm4( (~C).
load(i3,j ) );
3512 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
3513 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
3514 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
3515 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
3517 for(
size_t k=kbegin; k<kend; ++k ) {
3518 const IntrinsicType a1( A.load(i ,k) );
3519 const IntrinsicType a2( A.load(i1,k) );
3520 const IntrinsicType a3( A.load(i2,k) );
3521 const IntrinsicType a4( A.load(i3,k) );
3522 const IntrinsicType b1(
set( B(k,j ) ) );
3523 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3524 xmm1 = xmm1 - a1 * b1;
3525 xmm2 = xmm2 - a2 * b1;
3526 xmm3 = xmm3 - a3 * b1;
3527 xmm4 = xmm4 - a4 * b1;
3528 xmm5 = xmm5 - a1 * b2;
3529 xmm6 = xmm6 - a2 * b2;
3530 xmm7 = xmm7 - a3 * b2;
3531 xmm8 = xmm8 - a4 * b2;
3534 (~C).
store( i , j , xmm1 );
3535 (~C).
store( i1, j , xmm2 );
3536 (~C).
store( i2, j , xmm3 );
3537 (~C).
store( i3, j , xmm4 );
3538 (~C).
store( i , j+1UL, xmm5 );
3539 (~C).
store( i1, j+1UL, xmm6 );
3540 (~C).
store( i2, j+1UL, xmm7 );
3541 (~C).
store( i3, j+1UL, xmm8 );
3546 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3547 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3548 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3549 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3551 IntrinsicType xmm1( (~C).
load(i ,j) );
3552 IntrinsicType xmm2( (~C).
load(i1,j) );
3553 IntrinsicType xmm3( (~C).
load(i2,j) );
3554 IntrinsicType xmm4( (~C).
load(i3,j) );
3556 for(
size_t k=kbegin; k<kend; ++k ) {
3557 const IntrinsicType b1(
set( B(k,j) ) );
3558 xmm1 = xmm1 - A.load(i ,k) * b1;
3559 xmm2 = xmm2 - A.load(i1,k) * b1;
3560 xmm3 = xmm3 - A.load(i2,k) * b1;
3561 xmm4 = xmm4 - A.load(i3,k) * b1;
3564 (~C).
store( i , j, xmm1 );
3565 (~C).
store( i1, j, xmm2 );
3566 (~C).
store( i2, j, xmm3 );
3567 (~C).
store( i3, j, xmm4 );
3577 for( ; (j+4UL) <= jend; j+=4UL )
3579 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3580 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3581 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3582 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3584 IntrinsicType xmm1( (~C).
load(i ,j ) );
3585 IntrinsicType xmm2( (~C).
load(i1,j ) );
3586 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3587 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
3588 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
3589 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
3590 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
3591 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
3593 for(
size_t k=kbegin; k<kend; ++k ) {
3594 const IntrinsicType a1( A.load(i ,k) );
3595 const IntrinsicType a2( A.load(i1,k) );
3596 const IntrinsicType b1(
set( B(k,j ) ) );
3597 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3598 const IntrinsicType b3(
set( B(k,j+2UL) ) );
3599 const IntrinsicType b4(
set( B(k,j+3UL) ) );
3600 xmm1 = xmm1 - a1 * b1;
3601 xmm2 = xmm2 - a2 * b1;
3602 xmm3 = xmm3 - a1 * b2;
3603 xmm4 = xmm4 - a2 * b2;
3604 xmm5 = xmm5 - a1 * b3;
3605 xmm6 = xmm6 - a2 * b3;
3606 xmm7 = xmm7 - a1 * b4;
3607 xmm8 = xmm8 - a2 * b4;
3610 (~C).
store( i , j , xmm1 );
3611 (~C).
store( i1, j , xmm2 );
3612 (~C).
store( i , j+1UL, xmm3 );
3613 (~C).
store( i1, j+1UL, xmm4 );
3614 (~C).
store( i , j+2UL, xmm5 );
3615 (~C).
store( i1, j+2UL, xmm6 );
3616 (~C).
store( i , j+3UL, xmm7 );
3617 (~C).
store( i1, j+3UL, xmm8 );
3620 for( ; (j+2UL) <= jend; j+=2UL )
3622 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3623 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3624 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3625 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3627 IntrinsicType xmm1( (~C).
load(i ,j ) );
3628 IntrinsicType xmm2( (~C).
load(i1,j ) );
3629 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3630 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
3632 for(
size_t k=kbegin; k<kend; ++k ) {
3633 const IntrinsicType a1( A.load(i ,k) );
3634 const IntrinsicType a2( A.load(i1,k) );
3635 const IntrinsicType b1(
set( B(k,j ) ) );
3636 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3637 xmm1 = xmm1 - a1 * b1;
3638 xmm2 = xmm2 - a2 * b1;
3639 xmm3 = xmm3 - a1 * b2;
3640 xmm4 = xmm4 - a2 * b2;
3643 (~C).
store( i , j , xmm1 );
3644 (~C).
store( i1, j , xmm2 );
3645 (~C).
store( i , j+1UL, xmm3 );
3646 (~C).
store( i1, j+1UL, xmm4 );
3651 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3652 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3653 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3654 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3656 IntrinsicType xmm1( (~C).
load(i ,j) );
3657 IntrinsicType xmm2( (~C).
load(i1,j) );
3659 for(
size_t k=kbegin; k<kend; ++k ) {
3660 const IntrinsicType b1(
set( B(k,j) ) );
3661 xmm1 = xmm1 - A.load(i ,k) * b1;
3662 xmm2 = xmm2 - A.load(i1,k) * b1;
3665 (~C).
store( i , j, xmm1 );
3666 (~C).
store( i1, j, xmm2 );
3672 for(
size_t j=jj; j<jend; ++j )
3674 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3675 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3676 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
3677 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3679 IntrinsicType xmm1( (~C).
load(i,j) );
3681 for(
size_t k=kbegin; k<kend; ++k ) {
3682 const IntrinsicType b1(
set( B(k,j) ) );
3683 xmm1 = xmm1 - A.load(i,k) * b1;
3686 (~C).
store( i, j, xmm1 );
// BLAS-level subtraction assignment, fallback overload.
// Enabled when UseDefaultKernel<MT3,MT4,MT5> holds, in which case no BLAS
// routine is called from this overload.
3710 template<
typename MT3
3713 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
3714 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Forward to the large (blocked) subtraction assignment kernel.
3716 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction assignment for single precision operands.
// Enabled when UseSinglePrecisionKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
3736 template<
typename MT3
3739 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
3740 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left.
// The scaling factor stays +1 here; the subtraction is presumably applied when
// tmp is combined with C (not visible) -- TODO confirm.
3742 if( IsTriangular<MT4>::value ) {
3744 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
3747 else if( IsTriangular<MT5>::value ) {
3749 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Case 3: general product. The trailing scalars are presumably alpha = -1 and
// beta = 1 of the GEMM operation -- TODO confirm against the wrapper.
3753 sgemm( C, A, B, -1.0F, 1.0F );
// BLAS-based subtraction assignment for double precision operands.
// Enabled when UseDoublePrecisionKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
3775 template<
typename MT3
3778 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
3779 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left.
// The scaling factor stays +1 here; the subtraction is presumably applied when
// tmp is combined with C (not visible) -- TODO confirm.
3781 if( IsTriangular<MT4>::value ) {
3783 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
3786 else if( IsTriangular<MT5>::value ) {
3788 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Case 3: general product. The trailing scalars are presumably alpha = -1 and
// beta = 1 of the GEMM operation -- TODO confirm against the wrapper.
3792 dgemm( C, A, B, -1.0, 1.0 );
// BLAS-based subtraction assignment for single precision complex operands.
// Enabled when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
3814 template<
typename MT3
3817 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3818 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left,
// using the complex unit (1,0) as scaling factor.
3820 if( IsTriangular<MT4>::value ) {
3822 ctrmm( tmp, A, CblasLeft,
3823 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3824 complex<float>( 1.0F, 0.0F ) );
// Case 2: B is triangular -> triangular multiply with B applied from the right.
3827 else if( IsTriangular<MT5>::value ) {
3829 ctrmm( tmp, B, CblasRight,
3830 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3831 complex<float>( 1.0F, 0.0F ) );
// Case 3: general product. The trailing scalars are presumably alpha = (-1,0)
// and beta = (1,0) of the GEMM operation -- TODO confirm against the wrapper.
3835 cgemm( C, A, B, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based subtraction assignment for double precision complex operands.
// Enabled when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
//   C : target matrix that is updated in place
//   A : left-hand side operand of the product
//   B : right-hand side operand of the product
// NOTE(review): the declaration of 'tmp' and the statements that fold it into C
// are on lines not visible in this excerpt.
3857 template<
typename MT3
3860 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3861 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Case 1: A is triangular -> triangular multiply with A applied from the left.
// The unit scaling factor is spelled as complex<double> so that it has the same
// precision as the scalars handed to zgemm below (no detour through complex<float>).
3863 if( IsTriangular<MT4>::value ) {
3865 ztrmm( tmp, A, CblasLeft,
3866 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3867 complex<double>( 1.0, 0.0 ) );
// Case 2: B is triangular -> triangular multiply with B applied from the right,
// again with a complex<double> unit scaling factor.
3870 else if( IsTriangular<MT5>::value ) {
3872 ztrmm( tmp, B, CblasRight,
3873 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3874 complex<double>( 1.0, 0.0 ) );
// Case 3: general product. The trailing scalars are presumably alpha = (-1,0)
// and beta = (1,0) of the GEMM operation -- TODO confirm against the wrapper.
3878 zgemm( C, A, B, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
3901 template<
typename MT >
3902 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3912 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3914 else if( IsSymmetric<MT1>::value )
3950 template<
typename MT
3952 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3960 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3963 else if( rhs.lhs_.columns() == 0UL ) {
3999 template<
typename MT
4001 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4006 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
4018 const TmpType tmp( rhs );
4039 template<
typename MT >
4040 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4050 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4052 else if( IsSymmetric<MT1>::value )
4076 template<
typename MT
4078 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4086 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4121 template<
typename MT >
4122 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4132 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4134 else if( IsSymmetric<MT1>::value )
4162 template<
typename MT
4164 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
4172 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4207 template<
typename MT >
4208 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4218 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4220 else if( IsSymmetric<MT1>::value )
4269 template<
typename MT1
4273 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
4274 ,
private MatScalarMultExpr
4275 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
4279 typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
// Compile-time flag: nonzero when the left operand type MT1 is a computation
// or requires an intermediate evaluation.
4291 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Compile-time flag: nonzero when the right operand type MT2 is a computation
// or requires an intermediate evaluation.
4296 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time predicate. value is nonzero when the target type T1 is a
// row-major matrix and at least one of the operand types T2 / T3 is symmetric.
// It is used below to enable/disable the assignment overloads via
// EnableIf / DisableIf.
4306 template<
typename T1,
typename T2,
typename T3 >
4307 struct CanExploitSymmetry {
4308 enum { value = IsRowMajorMatrix<T1>::value &&
4309 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time predicate. value is nonzero when at least one operand has to
// be evaluated (evaluateLeft / evaluateRight) and the symmetry shortcut
// described by CanExploitSymmetry<T1,T2,T3> is NOT applicable.
4318 template<
typename T1,
typename T2,
typename T3 >
4319 struct IsEvaluationRequired {
4320 enum { value = ( evaluateLeft || evaluateRight ) &&
4321 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time selector for the single-precision (float) BLAS kernel.
// T1 is the target type, T2 / T3 the operand types, T4 the scalar type.
// Conditions visible here: T1 offers mutable data access, T2 and T3 offer
// const data access, neither operand is diagonal, all three types are
// vectorizable, all three element types are float, and T4 is not complex.
// NOTE(review): the opening of the enum (the line before 4333) is not part of
// this excerpt; any additional leading condition must be checked there.
4330 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4331 struct UseSinglePrecisionKernel {
4333 HasMutableDataAccess<T1>::value &&
4334 HasConstDataAccess<T2>::value &&
4335 HasConstDataAccess<T3>::value &&
4336 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4337 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4338 IsFloat<typename T1::ElementType>::value &&
4339 IsFloat<typename T2::ElementType>::value &&
4340 IsFloat<typename T3::ElementType>::value &&
4341 !IsComplex<T4>::value };
// Compile-time selector for the double-precision (double) BLAS kernel.
// T1 is the target type, T2 / T3 the operand types, T4 the scalar type.
// Conditions visible here: T1 offers mutable data access, T2 and T3 offer
// const data access, neither operand is diagonal, all three types are
// vectorizable, all three element types are double, and T4 is not complex.
// NOTE(review): the opening of the enum (the line before 4353) is not part of
// this excerpt; any additional leading condition must be checked there.
4350 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4351 struct UseDoublePrecisionKernel {
4353 HasMutableDataAccess<T1>::value &&
4354 HasConstDataAccess<T2>::value &&
4355 HasConstDataAccess<T3>::value &&
4356 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4357 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4358 IsDouble<typename T1::ElementType>::value &&
4359 IsDouble<typename T2::ElementType>::value &&
4360 IsDouble<typename T3::ElementType>::value &&
4361 !IsComplex<T4>::value };
// Compile-time selector for the single-precision complex BLAS kernel.
// Conditions visible here: T1 offers mutable data access, T2 and T3 offer
// const data access, neither operand is diagonal, all three types are
// vectorizable, and all three element types are exactly complex<float>.
// NOTE(review): the opening of the enum (the line before 4374) is not part of
// this excerpt; any additional leading condition must be checked there.
4370 template<
typename T1,
typename T2,
typename T3 >
4371 struct UseSinglePrecisionComplexKernel {
// Element type all three matrices must share.
4372 typedef complex<float> Type;
4374 HasMutableDataAccess<T1>::value &&
4375 HasConstDataAccess<T2>::value &&
4376 HasConstDataAccess<T3>::value &&
4377 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4378 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4379 IsSame<typename T1::ElementType,Type>::value &&
4380 IsSame<typename T2::ElementType,Type>::value &&
4381 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the double-precision complex BLAS kernel.
// Conditions visible here: T1 offers mutable data access, T2 and T3 offer
// const data access, neither operand is diagonal, all three types are
// vectorizable, and all three element types are exactly complex<double>.
// NOTE(review): the opening of the enum (the line before 4394) is not part of
// this excerpt; any additional leading condition must be checked there.
4390 template<
typename T1,
typename T2,
typename T3 >
4391 struct UseDoublePrecisionComplexKernel {
// Element type all three matrices must share.
4392 typedef complex<double> Type;
4394 HasMutableDataAccess<T1>::value &&
4395 HasConstDataAccess<T2>::value &&
4396 HasConstDataAccess<T3>::value &&
4397 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4398 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4399 IsSame<typename T1::ElementType,Type>::value &&
4400 IsSame<typename T2::ElementType,Type>::value &&
4401 IsSame<typename T3::ElementType,Type>::value };
// Compile-time selector for the non-BLAS fallback. value is nonzero when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel selectors
// (single, double, single complex, double complex) applies to the given
// type combination.
4409 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4410 struct UseDefaultKernel {
4411 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
4412 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
4413 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
4414 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time selector for the hand-vectorised default kernels.
// value is nonzero when T2 (the left operand type) is not diagonal, all three
// matrix types are vectorizable, the element types of T1, T2, T3 and the
// scalar type T4 are all the same type, and IntrinsicTrait reports addition,
// subtraction and multiplication for that element type.
4422 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4423 struct UseVectorizedDefaultKernel {
4424 enum { value = !IsDiagonal<T2>::value &&
4425 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4426 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4427 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4428 IsSame<typename T1::ElementType,T4>::value &&
4429 IntrinsicTrait<typename T1::ElementType>::addition &&
4430 IntrinsicTrait<typename T1::ElementType>::subtraction &&
4431 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this scaled multiplication expression.
4437 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type: the product of RES and the scalar type ST as given by MultTrait.
4438 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
4442 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped matrix/matrix multiplication expression.
4447 typedef const TDMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left matrix operand: const RT1 if it has to be
// evaluated (evaluateLeft), otherwise CT1.
4453 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Type used for the right matrix operand: const RT2 if it has to be
// evaluated (evaluateRight), otherwise CT2.
4456 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Compile-time flag: the expression is vectorizable when MT1 is not diagonal,
// both operand types are vectorizable, ET1, ET2 and ST are the same type, and
// IntrinsicTrait reports addition and multiplication for ET1.
4461 enum { vectorizable = !IsDiagonal<MT1>::value &&
4462 MT1::vectorizable && MT2::vectorizable &&
4463 IsSame<ET1,ET2>::value &&
4464 IsSame<ET1,ST>::value &&
4465 IntrinsicTrait<ET1>::addition &&
4466 IntrinsicTrait<ET1>::multiplication };
// Compile-time flag: SMP assignment is possible only when neither operand
// needs evaluation and both operand types are themselves SMP-assignable.
4469 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4470 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix multiplication expression and the scalar.
// NOTE(review): the initialiser list / body is not part of this excerpt;
// presumably it stores both arguments in matrix_ and scalar_ - confirm.
4479 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// 2D element access.
// \param i Row index passed on to the wrapped expression.
// \param j Column index passed on to the wrapped expression.
// \return Element (i,j) of the wrapped product expression times the scalar.
4492 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4495 return matrix_(i,j) * scalar_;
// Returns the number of rows, forwarded from the wrapped product expression.
4504 inline size_t rows()
const {
4505 return matrix_.rows();
// Returns the number of columns, forwarded from the wrapped product expression.
4514 inline size_t columns()
const {
4515 return matrix_.columns();
// Alias query: delegates to the wrapped product expression.
// \param alias Address to be checked.
// \return Whatever matrix_.canAlias( alias ) reports.
4545 template<
typename T >
4546 inline bool canAlias(
const T* alias )
const {
4547 return matrix_.canAlias( alias );
// Alias query: delegates to the wrapped product expression.
// \param alias Address to be checked.
// \return Whatever matrix_.isAliased( alias ) reports.
4557 template<
typename T >
4558 inline bool isAliased(
const T* alias )
const {
4559 return matrix_.isAliased( alias );
4569 return matrix_.isAligned();
4579 typename MMM::RightOperand B( matrix_.rightOperand() );
// The wrapped matrix/matrix multiplication expression (left operand of the scaling).
4588 LeftOperand matrix_;
// The scalar factor (right operand of the scaling).
4589 RightOperand scalar_;
// Assignment of the scaled product expression to a dense matrix.
// Only participates in overload resolution when the symmetry shortcut
// (CanExploitSymmetry<MT,MT1,MT2>) does not apply.
// \param lhs Target dense matrix.
// \param rhs Scaled multiplication expression to be assigned.
4604 template<
typename MT
4606 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4607 assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two matrix operands of the wrapped product expression.
4614 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4615 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: empty target, or an inner dimension of zero.
// (The bodies of both branches are not part of this excerpt.)
4617 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4620 else if( left.columns() == 0UL ) {
// Dispatch to the kernel selection with the operands A and B.
// NOTE(review): A and B are declared on lines not visible here; presumably
// they are the (possibly evaluated) left / right operands - confirm.
4635 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for C = A * B * scalar.
// Chooses between the small-matrix kernel and the BLAS-dispatching kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
4650 template<
typename MT3
4654 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A diagonal left operand is one of the conditions for the small kernel;
// the remaining part of the condition is not part of this excerpt.
4656 if( ( IsDiagonal<MT4>::value ) ||
4658 selectSmallAssignKernel( C, A, B, scalar );
4660 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-vectorised) kernel for C = A * B * scalar when neither
// A nor B is diagonal. Works column by column of C and narrows the k and i
// ranges according to the lower/upper structure of the operand types.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
4678 template<
typename MT3
4682 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4683 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M x K times K x N product.
4685 const size_t M( A.rows() );
4686 const size_t N( B.columns() );
4687 const size_t K( A.columns() );
4689 for(
size_t j=0UL; j<N; ++j )
// k range for column j, derived from whether B is (strictly) lower / upper.
4691 const size_t kbegin( ( IsLower<MT5>::value )
4692 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4694 const size_t kend( ( IsUpper<MT5>::value )
4695 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Strictly triangular B with an empty k range: the whole column j is handled
// in a loop over all rows (loop body not visible in this excerpt).
4699 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
4700 for(
size_t i=0UL; i<M; ++i ) {
// First k iteration (k == kbegin): row range derived from the structure of A.
4707 const size_t ibegin( ( IsLower<MT4>::value )
4708 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
4710 const size_t iend( ( IsUpper<MT4>::value )
4711 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows in front of ibegin (loop body not visible in this excerpt).
4715 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4716 for(
size_t i=0UL; i<ibegin; ++i ) {
4720 else if( IsStrictlyLower<MT4>::value ) {
// Initialise C(i,j) with the first product term (plain assignment, not +=).
4723 for(
size_t i=ibegin; i<iend; ++i ) {
4724 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body not visible in this excerpt).
4726 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4727 for(
size_t i=iend; i<M; ++i ) {
4731 else if( IsStrictlyUpper<MT4>::value ) {
// Strictly upper A: the last row of column j is reset.
4732 reset( C(M-1UL,j) );
// Remaining k iterations: accumulate into C(i,j).
4736 for(
size_t k=kbegin+1UL; k<kend; ++k )
4738 const size_t ibegin( ( IsLower<MT4>::value )
4739 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4741 const size_t iend( ( IsUpper<MT4>::value )
4742 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
4746 for(
size_t i=ibegin; i<iend; ++i ) {
4747 C(i,j) += A(i,k) * B(k,j);
// For upper A the element in row iend receives its first contribution here,
// hence the plain assignment.
4749 if( IsUpper<MT4>::value ) {
4750 C(iend,j) = A(iend,k) * B(k,j);
// Final pass over the row range of column j that is determined by the
// combined structure of A and B.
// NOTE(review): the loop body is not visible in this excerpt; presumably this
// is where the column is multiplied by scalar - confirm.
4755 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4756 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
4758 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4759 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
4763 for(
size_t i=ibegin; i<iend; ++i ) {
// Default kernel for C = A * B * scalar when B is diagonal and A is not.
// Each element reduces to a single product: C(i,j) = A(i,j) * B(j,j) * scalar.
// \param C      Target matrix.
// \param A      Left (general) matrix operand.
// \param B      Right (diagonal) matrix operand.
// \param scalar Scaling factor.
4785 template<
typename MT3
4789 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4790 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4794 const size_t M( A.rows() );
4795 const size_t N( B.columns() );
4797 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from whether A is (strictly) lower / upper.
4799 const size_t ibegin( ( IsLower<MT4>::value )
4800 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4802 const size_t iend( ( IsUpper<MT4>::value )
4803 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows in front of ibegin for lower A (loop body not visible in this excerpt).
4807 if( IsLower<MT4>::value ) {
4808 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows inside [ibegin,iend): one product per element.
4812 for(
size_t i=ibegin; i<iend; ++i ) {
4813 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on for upper A (loop body not visible in this excerpt).
4815 if( IsUpper<MT4>::value ) {
4816 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B * scalar when A is diagonal and B is not.
// Each element reduces to a single product: C(i,j) = A(i,i) * B(i,j) * scalar.
// \param C      Target matrix.
// \param A      Left (diagonal) matrix operand.
// \param B      Right (general) matrix operand.
// \param scalar Scaling factor.
4838 template<
typename MT3
4842 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4843 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4847 const size_t M( A.rows() );
4848 const size_t N( B.columns() );
4850 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from whether B is (strictly) lower / upper.
4852 const size_t ibegin( ( IsLower<MT5>::value )
4853 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4855 const size_t iend( ( IsUpper<MT5>::value )
4856 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// NOTE(review): the range above is computed from MT5, but this guard and the
// one further down test MT4 (the diagonal operand) - confirm this is intended.
// Rows in front of ibegin (loop body not visible in this excerpt).
4860 if( IsLower<MT4>::value ) {
4861 for(
size_t i=0UL; i<ibegin; ++i ) {
// Rows inside [ibegin,iend): one product per element.
4865 for(
size_t i=ibegin; i<iend; ++i ) {
4866 C(i,j) = A(i,i) * B(i,j) * scalar;
// Rows from iend on (loop body not visible in this excerpt).
4868 if( IsUpper<MT4>::value ) {
4869 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B * scalar when both A and B are diagonal.
// Only the diagonal elements of C are written in the visible loop.
// \param C      Target matrix.
// \param A      Left (diagonal) matrix operand.
// \param B      Right (diagonal) matrix operand.
// \param scalar Scaling factor.
4891 template<
typename MT3
4895 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4896 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4902 for(
size_t i=0UL; i<A.rows(); ++i ) {
4903 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix kernel, non-vectorised case: selected when
// UseVectorizedDefaultKernel does not apply; simply forwards to the default
// kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
4922 template<
typename MT3
4926 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4927 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4929 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix kernel, vectorised case with a row-major target
// (DenseMatrix<MT3,false>). One of the two operands is replaced by a
// temporary `tmp` and the product is re-issued through assign().
// NOTE(review): the declarations of `tmp` are not part of this excerpt;
// presumably tmp is a storage-order-converted copy of B (first and third
// branch) or of A (second and fourth branch) - confirm.
// \param C      Target matrix (row-major).
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
4948 template<
typename MT3
4952 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4953 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable: the temporary stands in for B.
4960 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4962 assign( ~C, A * tmp * scalar );
// Only B is resizable: the temporary stands in for A.
4964 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4966 assign( ~C, tmp * B * scalar );
// Otherwise decide by element count: B has no more elements than A.
4968 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4970 assign( ~C, A * tmp * scalar );
// Remaining case: A has fewer elements than B.
4974 assign( ~C, tmp * B * scalar );
// Small-matrix kernel, vectorised case with a column-major target
// (DenseMatrix<MT3,true>). Computes C = A * B * scalar using intrinsic
// vectors: several IT::size-wide row chunks of A are loaded per k, multiplied
// with a broadcast element of B, accumulated, and finally scaled by `factor`
// when stored to C.
// NOTE(review): the enclosing loops over the row index i, the declarations of
// i and j for the `for( ; ...)` loops, and most store statements are not part
// of this excerpt; the section comments below are derived from the A.load()
// offsets that are visible.
// NOTE(review): the xmm accumulators are default-constructed; presumably
// IntrinsicType zero-initialises itself - confirm.
// \param C      Target matrix (column-major).
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
4994 template<
typename MT3
4998 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4999 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5001 typedef IntrinsicTrait<ElementType> IT;
5003 const size_t M( A.rows() );
5004 const size_t N( B.columns() );
5005 const size_t K( A.columns() );
// Scalar broadcast into an intrinsic vector; applied once at store time.
5007 const IntrinsicType factor(
set( scalar ) );
// --- Row panel of 8 intrinsic vectors, one column j per iteration ---
5012 for(
size_t j=0UL; j<N; ++j )
// k range narrowed by the lower/upper structure of A (MT4) and B (MT5).
5014 const size_t kbegin( ( IsLower<MT5>::value )
5015 ?( ( IsUpper<MT4>::value )
5016 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5017 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5018 :( IsUpper<MT4>::value ? i : 0UL ) );
5019 const size_t kend( ( IsUpper<MT5>::value )
5020 ?( ( IsLower<MT4>::value )
5021 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5022 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5023 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
5025 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate 8 row chunks of column j: xmmN += A(chunk N, k) * B(k,j).
5027 for(
size_t k=kbegin; k<kend; ++k ) {
5028 const IntrinsicType b1(
set( B(k,j) ) );
5029 xmm1 = xmm1 + A.load(i ,k) * b1;
5030 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
5031 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
5032 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
5033 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
5034 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
5035 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
5036 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Store the scaled result (stores for xmm2..xmm8 are not visible here).
5039 (~C).
store( i , j, xmm1 * factor );
// --- Row panel of 4 intrinsic vectors, two columns (j, j+1) per iteration ---
5054 for( ; (j+2UL) <= N; j+=2UL )
5056 const size_t kbegin( ( IsLower<MT5>::value )
5057 ?( ( IsUpper<MT4>::value )
5058 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5059 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5060 :( IsUpper<MT4>::value ? i : 0UL ) );
5061 const size_t kend( ( IsUpper<MT5>::value )
5062 ?( ( IsLower<MT4>::value )
5063 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5064 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5065 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
5067 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..4 accumulate column j, xmm5..8 accumulate column j+1.
5069 for(
size_t k=kbegin; k<kend; ++k ) {
5070 const IntrinsicType a1( A.load(i ,k) );
5071 const IntrinsicType a2( A.load(i+
IT::size ,k) );
5072 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
5073 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
5074 const IntrinsicType b1(
set( B(k,j ) ) );
5075 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5076 xmm1 = xmm1 + a1 * b1;
5077 xmm2 = xmm2 + a2 * b1;
5078 xmm3 = xmm3 + a3 * b1;
5079 xmm4 = xmm4 + a4 * b1;
5080 xmm5 = xmm5 + a1 * b2;
5081 xmm6 = xmm6 + a2 * b2;
5082 xmm7 = xmm7 + a3 * b2;
5083 xmm8 = xmm8 + a4 * b2;
5086 (~C).
store( i , j , xmm1 * factor );
5090 (~C).
store( i , j+1UL, xmm5 * factor );
// --- 4-vector panel, remaining single column ---
5098 const size_t kbegin( ( IsLower<MT5>::value )
5099 ?( ( IsUpper<MT4>::value )
5100 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5101 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5102 :( IsUpper<MT4>::value ? i : 0UL ) );
5103 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
5105 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5107 for(
size_t k=kbegin; k<kend; ++k ) {
5108 const IntrinsicType b1(
set( B(k,j) ) );
5109 xmm1 = xmm1 + A.load(i ,k) * b1;
5110 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
5111 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
5112 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
5115 (~C).
store( i , j, xmm1 * factor );
// --- Row panel of 2 intrinsic vectors, two columns (j, j+1) per iteration ---
5126 for( ; (j+2UL) <= N; j+=2UL )
5128 const size_t kbegin( ( IsLower<MT5>::value )
5129 ?( ( IsUpper<MT4>::value )
5130 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5131 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5132 :( IsUpper<MT4>::value ? i : 0UL ) );
5133 const size_t kend( ( IsUpper<MT5>::value )
5134 ?( ( IsLower<MT4>::value )
5135 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5136 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5137 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
5139 IntrinsicType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
5141 for(
size_t k=kbegin; k<kend; ++k ) {
5142 const IntrinsicType a1( A.load(i ,k) );
5143 const IntrinsicType a2( A.load(i+
IT::size,k) );
5144 const IntrinsicType b1(
set( B(k,j ) ) );
5145 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5146 xmm1 = xmm1 + a1 * b1;
5147 xmm2 = xmm2 + a2 * b1;
5148 xmm3 = xmm3 + a1 * b2;
5149 xmm4 = xmm4 + a2 * b2;
5152 (~C).
store( i , j , xmm1 * factor );
5154 (~C).
store( i , j+1UL, xmm3 * factor );
// --- 2-vector panel, remaining single column ---
5160 const size_t kbegin( ( IsLower<MT5>::value )
5161 ?( ( IsUpper<MT4>::value )
5162 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5163 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5164 :( IsUpper<MT4>::value ? i : 0UL ) );
5165 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
5167 IntrinsicType xmm1, xmm2;
5169 for(
size_t k=kbegin; k<kend; ++k ) {
5170 const IntrinsicType b1(
set( B(k,j) ) );
5171 xmm1 = xmm1 + A.load(i ,k) * b1;
5172 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
5175 (~C).
store( i , j, xmm1 * factor );
// --- Single intrinsic vector, two columns (j, j+1) per iteration ---
5184 for( ; (j+2UL) <= N; j+=2UL )
5186 const size_t kbegin( ( IsLower<MT5>::value )
5187 ?( ( IsUpper<MT4>::value )
5188 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5189 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5190 :( IsUpper<MT4>::value ? i : 0UL ) );
5191 const size_t kend( ( IsUpper<MT5>::value )
5192 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5195 IntrinsicType xmm1, xmm2;
5197 for(
size_t k=kbegin; k<kend; ++k ) {
5198 const IntrinsicType a1( A.load(i,k) );
5199 xmm1 = xmm1 + a1 *
set( B(k,j ) );
5200 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
5203 (~C).
store( i, j , xmm1 * factor );
5204 (~C).
store( i, j+1UL, xmm2 * factor );
// --- Single intrinsic vector, remaining single column (k runs up to K) ---
5209 const size_t kbegin( ( IsLower<MT5>::value )
5210 ?( ( IsUpper<MT4>::value )
5211 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5212 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5213 :( IsUpper<MT4>::value ? i : 0UL ) );
5217 for(
size_t k=kbegin; k<K; ++k ) {
5218 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
5221 (~C).
store( i, j, xmm1 * factor );
// Large-matrix kernel, non-vectorised case: selected when
// UseVectorizedDefaultKernel does not apply; simply forwards to the default
// kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5241 template<
typename MT3
5245 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5246 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5248 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix kernel, vectorised case with a row-major target
// (DenseMatrix<MT3,false>): there is no dedicated implementation, the call is
// forwarded to the small-matrix kernel.
// \param C      Target matrix (row-major).
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5267 template<
typename MT3
5271 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5272 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5274 selectSmallAssignKernel( ~C, A, B, scalar );
// Large-matrix kernel, vectorised case with a column-major target
// (DenseMatrix<MT3,true>). Computes C = A * B * scalar with cache blocking:
// the i, j and k ranges are tiled with block sizes iblock, jblock and kblock,
// and inside each tile intrinsic-vector panels of A are combined with
// broadcast elements of B.
// NOTE(review): the loops over the row index i inside a tile, the
// declarations of i, j, i1, i2 and i3, and the body of the tile
// initialisation loop are not part of this excerpt. Presumably i1/i2/i3 are
// i plus multiples of IT::size and the initialisation loop resets C - confirm.
// NOTE(review): every visible section loads the current C values into the xmm
// accumulators, adds the products of the current k block, and stores
// `xmm * factor`. If these sections run once per kk block (as the ktmp-based
// k bounds suggest), partial sums stored by earlier kk blocks are multiplied
// by `factor` again whenever K > kblock. Verify against the complete source;
// a store of the form `load(C) + xmm * factor` with fresh accumulators would
// not have this property.
// \param C      Target matrix (column-major).
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5293 template<
typename MT3
5297 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5298 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5300 typedef IntrinsicTrait<ElementType> IT;
5302 const size_t M( A.rows() );
5303 const size_t N( B.columns() );
5304 const size_t K( A.columns() );
// Tile sizes for the row (i), column (j) and inner (k) dimensions.
5306 const size_t iblock( 128UL );
5307 const size_t jblock( 64UL );
5308 const size_t kblock( 128UL );
// Scalar broadcast into an intrinsic vector.
5310 const IntrinsicType factor(
set( scalar ) );
// Loop over row tiles [ii, iend).
5312 for(
size_t ii=0UL; ii<M; ii+=iblock )
5314 const size_t iend(
min( ii+iblock, M ) );
// Loop over column tiles [jj, jend).
5316 for(
size_t jj=0UL; jj<N; jj+=jblock )
5318 const size_t jend(
min( jj+jblock, N ) );
// Visit every element of the current C tile (body not visible in this excerpt).
5320 for(
size_t j=jj; j<jend; ++j ) {
5321 for(
size_t i=ii; i<iend; ++i ) {
// Loop over k blocks [kk, ktmp).
5326 for(
size_t kk=0UL; kk<K; kk+=kblock )
5328 const size_t ktmp(
min( kk+kblock, K ) );
// --- Panel of 4 intrinsic vectors (i, i1, i2, i3), two columns per iteration ---
5340 for( ; (j+2UL) <= jend; j+=2UL )
// k range: the current block intersected with the structure of A and B.
5342 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5343 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5344 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5345 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Start from the values currently stored in C.
5347 IntrinsicType xmm1( (~C).
load(i ,j ) );
5348 IntrinsicType xmm2( (~C).
load(i1,j ) );
5349 IntrinsicType xmm3( (~C).
load(i2,j ) );
5350 IntrinsicType xmm4( (~C).
load(i3,j ) );
5351 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
5352 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
5353 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
5354 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
5356 for(
size_t k=kbegin; k<kend; ++k ) {
5357 const IntrinsicType a1( A.load(i ,k) );
5358 const IntrinsicType a2( A.load(i1,k) );
5359 const IntrinsicType a3( A.load(i2,k) );
5360 const IntrinsicType a4( A.load(i3,k) );
5361 const IntrinsicType b1(
set( B(k,j ) ) );
5362 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5363 xmm1 = xmm1 + a1 * b1;
5364 xmm2 = xmm2 + a2 * b1;
5365 xmm3 = xmm3 + a3 * b1;
5366 xmm4 = xmm4 + a4 * b1;
5367 xmm5 = xmm5 + a1 * b2;
5368 xmm6 = xmm6 + a2 * b2;
5369 xmm7 = xmm7 + a3 * b2;
5370 xmm8 = xmm8 + a4 * b2;
// Write back, scaling the whole accumulator by factor.
5373 (~C).
store( i , j , xmm1 * factor );
5374 (~C).
store( i1, j , xmm2 * factor );
5375 (~C).
store( i2, j , xmm3 * factor );
5376 (~C).
store( i3, j , xmm4 * factor );
5377 (~C).
store( i , j+1UL, xmm5 * factor );
5378 (~C).
store( i1, j+1UL, xmm6 * factor );
5379 (~C).
store( i2, j+1UL, xmm7 * factor );
5380 (~C).
store( i3, j+1UL, xmm8 * factor );
// --- 4-vector panel, remaining single column ---
5385 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5386 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5387 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5388 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5390 IntrinsicType xmm1( (~C).
load(i ,j) );
5391 IntrinsicType xmm2( (~C).
load(i1,j) );
5392 IntrinsicType xmm3( (~C).
load(i2,j) );
5393 IntrinsicType xmm4( (~C).
load(i3,j) );
5395 for(
size_t k=kbegin; k<kend; ++k ) {
5396 const IntrinsicType b1(
set( B(k,j) ) );
5397 xmm1 = xmm1 + A.load(i ,k) * b1;
5398 xmm2 = xmm2 + A.load(i1,k) * b1;
5399 xmm3 = xmm3 + A.load(i2,k) * b1;
5400 xmm4 = xmm4 + A.load(i3,k) * b1;
5403 (~C).
store( i , j, xmm1 * factor );
5404 (~C).
store( i1, j, xmm2 * factor );
5405 (~C).
store( i2, j, xmm3 * factor );
5406 (~C).
store( i3, j, xmm4 * factor );
// --- Panel of 2 intrinsic vectors (i, i1), four columns per iteration ---
5416 for( ; (j+4UL) <= jend; j+=4UL )
5418 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5419 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5420 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5421 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5423 IntrinsicType xmm1( (~C).
load(i ,j ) );
5424 IntrinsicType xmm2( (~C).
load(i1,j ) );
5425 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
5426 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
5427 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
5428 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
5429 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
5430 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
5432 for(
size_t k=kbegin; k<kend; ++k ) {
5433 const IntrinsicType a1( A.load(i ,k) );
5434 const IntrinsicType a2( A.load(i1,k) );
5435 const IntrinsicType b1(
set( B(k,j ) ) );
5436 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5437 const IntrinsicType b3(
set( B(k,j+2UL) ) );
5438 const IntrinsicType b4(
set( B(k,j+3UL) ) );
5439 xmm1 = xmm1 + a1 * b1;
5440 xmm2 = xmm2 + a2 * b1;
5441 xmm3 = xmm3 + a1 * b2;
5442 xmm4 = xmm4 + a2 * b2;
5443 xmm5 = xmm5 + a1 * b3;
5444 xmm6 = xmm6 + a2 * b3;
5445 xmm7 = xmm7 + a1 * b4;
5446 xmm8 = xmm8 + a2 * b4;
5449 (~C).
store( i , j , xmm1 * factor );
5450 (~C).
store( i1, j , xmm2 * factor );
5451 (~C).
store( i , j+1UL, xmm3 * factor );
5452 (~C).
store( i1, j+1UL, xmm4 * factor );
5453 (~C).
store( i , j+2UL, xmm5 * factor );
5454 (~C).
store( i1, j+2UL, xmm6 * factor );
5455 (~C).
store( i , j+3UL, xmm7 * factor );
5456 (~C).
store( i1, j+3UL, xmm8 * factor );
// --- 2-vector panel, two columns per iteration ---
5459 for( ; (j+2UL) <= jend; j+=2UL )
5461 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5462 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5463 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5464 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5466 IntrinsicType xmm1( (~C).
load(i ,j ) );
5467 IntrinsicType xmm2( (~C).
load(i1,j ) );
5468 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
5469 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
5471 for(
size_t k=kbegin; k<kend; ++k ) {
5472 const IntrinsicType a1( A.load(i ,k) );
5473 const IntrinsicType a2( A.load(i1,k) );
5474 const IntrinsicType b1(
set( B(k,j ) ) );
5475 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5476 xmm1 = xmm1 + a1 * b1;
5477 xmm2 = xmm2 + a2 * b1;
5478 xmm3 = xmm3 + a1 * b2;
5479 xmm4 = xmm4 + a2 * b2;
5482 (~C).
store( i , j , xmm1 * factor );
5483 (~C).
store( i1, j , xmm2 * factor );
5484 (~C).
store( i , j+1UL, xmm3 * factor );
5485 (~C).
store( i1, j+1UL, xmm4 * factor );
// --- 2-vector panel, remaining single column ---
5490 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5491 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5492 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5493 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5495 IntrinsicType xmm1( (~C).
load(i ,j) );
5496 IntrinsicType xmm2( (~C).
load(i1,j) );
5498 for(
size_t k=kbegin; k<kend; ++k ) {
5499 const IntrinsicType b1(
set( B(k,j) ) );
5500 xmm1 = xmm1 + A.load(i ,k) * b1;
5501 xmm2 = xmm2 + A.load(i1,k) * b1;
5504 (~C).
store( i , j, xmm1 * factor );
5505 (~C).
store( i1, j, xmm2 * factor );
// --- Single intrinsic vector, one column per iteration ---
5511 for(
size_t j=jj; j<jend; ++j )
5513 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5514 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5515 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
5516 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5518 IntrinsicType xmm1( (~C).
load(i,j) );
5520 for(
size_t k=kbegin; k<kend; ++k ) {
5521 const IntrinsicType b1(
set( B(k,j) ) );
5522 xmm1 = xmm1 + A.load(i,k) * b1;
5525 (~C).
store( i, j, xmm1 * factor );
// BLAS dispatch, fallback case: selected when UseDefaultKernel applies (BLAS
// mode off or no matching BLAS kernel); forwards to the large-matrix kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5548 template<
typename MT3
5552 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5553 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5555 selectLargeAssignKernel( C, A, B, scalar );
// BLAS dispatch for single-precision (float) matrices.
// Triangular operands use strmm, everything else uses sgemm with beta = 0.
// NOTE(review): the statements that precede each strmm call are not part of
// this excerpt; presumably C is first filled with the non-triangular operand
// - confirm.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor passed to BLAS as alpha.
5574 template<
typename MT3
5578 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
5579 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: multiply from the left.
5581 if( IsTriangular<MT4>::value ) {
5583 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular B: multiply from the right.
5585 else if( IsTriangular<MT5>::value ) {
5587 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: alpha = scalar, beta = 0.
5590 sgemm( C, A, B, scalar, 0.0F );
// BLAS dispatch for double-precision (double) matrices.
// Triangular operands use dtrmm, everything else uses dgemm with beta = 0.
// NOTE(review): the statements that precede each dtrmm call are not part of
// this excerpt; presumably C is first filled with the non-triangular operand
// - confirm.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor passed to BLAS as alpha.
5611 template<
typename MT3
5615 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
5616 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: multiply from the left.
5618 if( IsTriangular<MT4>::value ) {
5620 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular B: multiply from the right.
5622 else if( IsTriangular<MT5>::value ) {
5624 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: alpha = scalar, beta = 0.
5627 dgemm( C, A, B, scalar, 0.0 );
// BLAS dispatch for single-precision complex (complex<float>) matrices.
// Triangular operands use ctrmm, everything else uses cgemm with beta = 0.
// The scalar is widened to complex<float> with a zero imaginary part.
// NOTE(review): the statements that precede each ctrmm call are not part of
// this excerpt; presumably C is first filled with the non-triangular operand
// - confirm.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor used as the real part of alpha.
5648 template<
typename MT3
5652 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5653 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: multiply from the left.
5655 if( IsTriangular<MT4>::value ) {
5657 ctrmm( C, A, CblasLeft,
5658 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5659 complex<float>( scalar, 0.0F ) );
// Triangular B: multiply from the right.
5661 else if( IsTriangular<MT5>::value ) {
5663 ctrmm( C, B, CblasRight,
5664 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5665 complex<float>( scalar, 0.0F ) );
// General case: alpha = (scalar, 0), beta = (0, 0).
5668 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS dispatch for double-precision complex (complex<double>) matrices.
// Triangular operands use ztrmm, everything else uses zgemm with beta = 0.
// The scalar is widened to complex<double> with a zero imaginary part.
// NOTE(review): the statements that precede each ztrmm call are not part of
// this excerpt; presumably C is first filled with the non-triangular operand
// - confirm.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor used as the real part of alpha.
5689 template<
typename MT3
5693 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5694 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: multiply from the left.
5696 if( IsTriangular<MT4>::value ) {
5698 ztrmm( C, A, CblasLeft,
5699 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5700 complex<double>( scalar, 0.0 ) );
// Triangular B: multiply from the right.
5702 else if( IsTriangular<MT5>::value ) {
5704 ztrmm( C, B, CblasRight,
5705 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5706 complex<double>( scalar, 0.0 ) );
// General case: alpha = (scalar, 0), beta = (0, 0).
5709 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Assignment of the scaled product expression to a sparse matrix.
// Only participates in overload resolution when the symmetry shortcut
// (CanExploitSymmetry<MT,MT1,MT2>) does not apply.
// \param lhs Target sparse matrix.
// \param rhs Scaled multiplication expression to be assigned.
5727 template<
typename MT
5729 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5730 assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type: ResultType when SO is true, OppositeType otherwise.
5734 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the expression serially into a dense temporary.
// (The subsequent use of tmp is not part of this excerpt.)
5746 const TmpType tmp(
serial( rhs ) );
// Assignment of the scaled product expression to a row-major matrix when the
// symmetry of an operand can be exploited (CanExploitSymmetry<MT,MT1,MT2>).
// A symmetric operand is wrapped in trans() and the scaled product is
// re-assigned in that restructured form.
// \param lhs Target row-major matrix.
// \param rhs Scaled multiplication expression to be assigned.
5765 template<
typename MT >
5766 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5767 assign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two matrix operands of the wrapped product expression.
5776 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5777 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Both operands symmetric (the statement of this branch is not visible here).
5779 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
// Only the left operand is symmetric: transpose the left operand.
5781 else if( IsSymmetric<MT1>::value )
5782 assign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: transpose the right operand.
5784 assign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Addition assignment of the scaled product expression to a dense matrix.
// Only participates in overload resolution when the symmetry shortcut
// (CanExploitSymmetry<MT,MT1,MT2>) does not apply.
// \param lhs Target dense matrix.
// \param rhs Scaled multiplication expression to be added.
5800 template<
typename MT
5802 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5803 addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two matrix operands of the wrapped product expression.
5810 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5811 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: empty target or an inner dimension of zero.
// (The body of this branch is not part of this excerpt.)
5813 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the operands A and B.
// NOTE(review): A and B are declared on lines not visible here; presumably
// they are the (possibly evaluated) left / right operands - confirm.
5827 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for C += A * B * scalar.
// Chooses between the small-matrix kernel and the BLAS-dispatching kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5842 template<
typename MT3
5846 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A diagonal left operand is one of the conditions for the small kernel;
// the remaining part of the condition is not part of this excerpt.
5848 if( ( IsDiagonal<MT4>::value ) ||
5850 selectSmallAddAssignKernel( C, A, B, scalar );
5852 selectBlasAddAssignKernel( C, A, B, scalar );
// Default kernel for C += A * B * scalar when neither A nor B is diagonal.
// The scaled product is first evaluated serially into a temporary of type
// ResultType.
// NOTE(review): the statement that adds tmp to C is not part of this excerpt.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
5870 template<
typename MT3
5874 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5875 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5877 const ResultType tmp(
serial( A * B * scalar ) );
// Default kernel for C += A * B * scalar when B is diagonal and A is not.
// Each update is a single product: C(i,j) += A(i,j) * B(j,j) * scalar.
// The row loop is unrolled by two.
// \param C      Target matrix.
// \param A      Left (general) matrix operand.
// \param B      Right (diagonal) matrix operand.
// \param scalar Scaling factor.
5896 template<
typename MT3
5900 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5901 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5905 const size_t M( A.rows() );
5906 const size_t N( B.columns() );
5908 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from whether A is (strictly) lower / upper.
5910 const size_t ibegin( ( IsLower<MT4>::value )
5911 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5913 const size_t iend( ( IsUpper<MT4>::value )
5914 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
5918 const size_t inum( iend - ibegin );
5919 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Unrolled pairs of rows.
5921 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5922 C(i ,j) += A(i ,j) * B(j,j) * scalar;
5923 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Leftover row when the count is odd (the guarding condition is not visible
// in this excerpt).
5926 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default kernel for C += A * B * scalar when A is diagonal and B is not.
// Each update is a single product: C(i,j) += A(i,i) * B(i,j) * scalar.
// The row loop is unrolled by two.
// \param C      Target matrix.
// \param A      Left (diagonal) matrix operand.
// \param B      Right (general) matrix operand.
// \param scalar Scaling factor.
5946 template<
typename MT3
5950 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5951 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5955 const size_t M( A.rows() );
5956 const size_t N( B.columns() );
5958 for(
size_t j=0UL; j<N; ++j )
// Row range of column j derived from whether B is (strictly) lower / upper.
5960 const size_t ibegin( ( IsLower<MT5>::value )
5961 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5963 const size_t iend( ( IsUpper<MT5>::value )
5964 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
5968 const size_t inum( iend - ibegin );
5969 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Unrolled pairs of rows.
5971 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5972 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5973 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover row when the count is odd (the guarding condition is not visible
// in this excerpt).
5976 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default kernel for C += A * B * scalar when both A and B are diagonal.
// Only the diagonal elements of C are updated.
// \param C      Target matrix.
// \param A      Left (diagonal) matrix operand.
// \param B      Right (diagonal) matrix operand.
// \param scalar Scaling factor.
5996 template<
typename MT3
6000 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6001 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6005 for(
size_t i=0UL; i<A.rows(); ++i ) {
6006 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition kernel, non-vectorised case: selected when
// UseVectorizedDefaultKernel does not apply; simply forwards to the default
// addition kernel.
// \param C      Target matrix.
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
6025 template<
typename MT3
6029 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6030 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6032 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition kernel, vectorised case with a row-major target
// (DenseMatrix<MT3,false>). The branch is chosen from the resizability of the
// operand types and, failing that, from the operands' element counts.
// NOTE(review): the bodies of all branches are not part of this excerpt.
// \param C      Target matrix (row-major).
// \param A      Left matrix operand.
// \param B      Right matrix operand.
// \param scalar Scaling factor.
6051 template<
typename MT3
6055 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6056 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only A is resizable.
6063 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
// Only B is resizable.
6067 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
// Otherwise decide by element count: B has no more elements than A.
6071 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
// Small-matrix addition-assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>).
//
// Every visited part of the target is updated as
//    C.store( i, j, C.load(i,j) + xmm * factor )
// where xmm accumulates A.load(i,k) * set( B(k,j) ) over k and factor is the scalar
// broadcast via set(). The sections below differ in how many intrinsic vectors of rows
// (offsets i, i+IT::size, ...) and how many columns are processed together: 8x1, 4x2,
// 4x1, 2x2, 2x1, 1x2 and 1x1.
//
// kbegin/kend restrict the k range according to the compile-time triangular structure
// of A (MT4) and B (MT5).
//
// NOTE(review): the enclosing loops over the row index i, the declarations of j for the
// two-column loops, and most of the store statements of each section are not shown here.
6097 template<
typename MT3
6101 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6102 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6104 typedef IntrinsicTrait<ElementType> IT;
6106 const size_t M( A.rows() );
6107 const size_t N( B.columns() );
6108 const size_t K( A.columns() );
// The scalar is broadcast once and reused by all stores.
6110 const IntrinsicType factor(
set( scalar ) );
// --- Eight intrinsic vectors of rows, one column at a time ---
6115 for(
size_t j=0UL; j<N; ++j )
6117 const size_t kbegin( ( IsLower<MT5>::value )
6118 ?( ( IsUpper<MT4>::value )
6119 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6120 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6121 :( IsUpper<MT4>::value ? i : 0UL ) );
6122 const size_t kend( ( IsUpper<MT5>::value )
6123 ?( ( IsLower<MT4>::value )
6124 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6125 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6126 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// The accumulators are default-constructed and then summed into.
6128 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6130 for(
size_t k=kbegin; k<kend; ++k ) {
6131 const IntrinsicType b1(
set( B(k,j) ) );
6132 xmm1 = xmm1 + A.load(i ,k) * b1;
6133 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
6134 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
6135 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
6136 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
6137 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
6138 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
6139 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
6142 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- Four intrinsic vectors of rows, two columns at a time ---
6157 for( ; (j+2UL) <= N; j+=2UL )
6159 const size_t kbegin( ( IsLower<MT5>::value )
6160 ?( ( IsUpper<MT4>::value )
6161 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6162 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6163 :( IsUpper<MT4>::value ? i : 0UL ) );
6164 const size_t kend( ( IsUpper<MT5>::value )
6165 ?( ( IsLower<MT4>::value )
6166 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6167 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6168 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
6170 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6172 for(
size_t k=kbegin; k<kend; ++k ) {
6173 const IntrinsicType a1( A.load(i ,k) );
6174 const IntrinsicType a2( A.load(i+
IT::size ,k) );
6175 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
6176 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
6177 const IntrinsicType b1(
set( B(k,j ) ) );
6178 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6179 xmm1 = xmm1 + a1 * b1;
6180 xmm2 = xmm2 + a2 * b1;
6181 xmm3 = xmm3 + a3 * b1;
6182 xmm4 = xmm4 + a4 * b1;
6183 xmm5 = xmm5 + a1 * b2;
6184 xmm6 = xmm6 + a2 * b2;
6185 xmm7 = xmm7 + a3 * b2;
6186 xmm8 = xmm8 + a4 * b2;
6189 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6193 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
// --- Four intrinsic vectors of rows, single remaining column ---
6201 const size_t kbegin( ( IsLower<MT5>::value )
6202 ?( ( IsUpper<MT4>::value )
6203 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6204 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6205 :( IsUpper<MT4>::value ? i : 0UL ) );
6206 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
6208 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6210 for(
size_t k=kbegin; k<kend; ++k ) {
6211 const IntrinsicType b1(
set( B(k,j) ) );
6212 xmm1 = xmm1 + A.load(i ,k) * b1;
6213 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
6214 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
6215 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
6218 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- Two intrinsic vectors of rows, two columns at a time ---
6229 for( ; (j+2UL) <= N; j+=2UL )
6231 const size_t kbegin( ( IsLower<MT5>::value )
6232 ?( ( IsUpper<MT4>::value )
6233 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6234 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6235 :( IsUpper<MT4>::value ? i : 0UL ) );
6236 const size_t kend( ( IsUpper<MT5>::value )
6237 ?( ( IsLower<MT4>::value )
6238 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6239 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6240 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1.
6242 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6244 for(
size_t k=kbegin; k<kend; ++k ) {
6245 const IntrinsicType a1( A.load(i ,k) );
6246 const IntrinsicType a2( A.load(i+
IT::size,k) );
6247 const IntrinsicType b1(
set( B(k,j ) ) );
6248 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6249 xmm1 = xmm1 + a1 * b1;
6250 xmm2 = xmm2 + a2 * b1;
6251 xmm3 = xmm3 + a1 * b2;
6252 xmm4 = xmm4 + a2 * b2;
6255 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6257 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
// --- Two intrinsic vectors of rows, single remaining column ---
6263 const size_t kbegin( ( IsLower<MT5>::value )
6264 ?( ( IsUpper<MT4>::value )
6265 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6266 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6267 :( IsUpper<MT4>::value ? i : 0UL ) );
6268 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
6270 IntrinsicType xmm1, xmm2;
6272 for(
size_t k=kbegin; k<kend; ++k ) {
6273 const IntrinsicType b1(
set( B(k,j) ) );
6274 xmm1 = xmm1 + A.load(i ,k) * b1;
6275 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
6278 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- One intrinsic vector of rows, two columns at a time ---
6287 for( ; (j+2UL) <= N; j+=2UL )
6289 const size_t kbegin( ( IsLower<MT5>::value )
6290 ?( ( IsUpper<MT4>::value )
6291 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6292 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6293 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the fallback branch of this conditional is not shown here.
6294 const size_t kend( ( IsUpper<MT5>::value )
6295 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6298 IntrinsicType xmm1, xmm2;
6300 for(
size_t k=kbegin; k<kend; ++k ) {
6301 const IntrinsicType a1( A.load(i,k) );
6302 xmm1 = xmm1 + a1 *
set( B(k,j ) );
6303 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
6306 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
6307 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// --- One intrinsic vector of rows, single remaining column ---
6312 const size_t kbegin( ( IsLower<MT5>::value )
6313 ?( ( IsUpper<MT4>::value )
6314 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6315 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6316 :( IsUpper<MT4>::value ? i : 0UL ) );
// No upper limit other than K is applied in this last case.
6320 for(
size_t k=kbegin; k<K; ++k ) {
6321 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
6324 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Large-matrix addition-assignment kernel, non-vectorized case.
// This overload is selected when UseVectorizedDefaultKernel does not apply (DisableIf)
// and simply forwards all arguments to the default kernel.
6344 template<
typename MT3
6348 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6349 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6351 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition-assignment kernel, vectorized case with a row-major target
// (DenseMatrix<MT3,false>).
// There is no dedicated large kernel for this storage order: the call is forwarded to
// the small kernel, passing the target as its concrete type via ~C.
6370 template<
typename MT3
6374 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6375 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6377 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Large-matrix addition-assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>).
//
// The iteration space is tiled into blocks of iblock x jblock x kblock elements. Within
// a tile the rows are processed in groups of four, two and one intrinsic vector(s)
// (row offsets i, i1, i2, i3), combined with one, two or four columns of B.
//
// For every k-block the partial product is accumulated in zero-initialised registers
// and then added to the target as
//    C.store( i, j, C.load(i,j) + xmm * factor )
// which is the same update the small kernel performs. Seeding the accumulators with the
// current C values and storing xmm * factor instead would scale the existing content of
// C by the scalar once per k-block, i.e. compute (C + A*B) * scalar rather than
// C + (A*B) * scalar.
//
// kbegin/kend restrict the k range of the current block according to the compile-time
// triangular structure of A (MT4) and B (MT5).
//
// NOTE(review): the loops over the row index i and the definitions of i1, i2, i3 and j
// are not shown here.
6396 template<
typename MT3
6400 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6401 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6403 typedef IntrinsicTrait<ElementType> IT;
6405 const size_t M( A.rows() );
6406 const size_t N( B.columns() );
6407 const size_t K( A.columns() );
// Tile sizes in row, column and inner-product direction.
6409 const size_t iblock( 128UL );
6410 const size_t jblock( 64UL );
6411 const size_t kblock( 128UL );
// The scalar is broadcast once and reused by all stores.
6413 const IntrinsicType factor(
set( scalar ) );
6415 for(
size_t ii=0UL; ii<M; ii+=iblock )
6417 const size_t iend(
min( ii+iblock, M ) );
6419 for(
size_t jj=0UL; jj<N; jj+=jblock )
6421 const size_t jend(
min( jj+jblock, N ) );
6423 for(
size_t kk=0UL; kk<K; kk+=kblock )
6425 const size_t ktmp(
min( kk+kblock, K ) );
// --- Four intrinsic vectors of rows, two columns at a time ---
6437 for( ; (j+2UL) <= jend; j+=2UL )
6439 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6440 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6441 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
6442 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Accumulators start empty; xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
6444 IntrinsicType xmm1;
6445 IntrinsicType xmm2;
6446 IntrinsicType xmm3;
6447 IntrinsicType xmm4;
6448 IntrinsicType xmm5;
6449 IntrinsicType xmm6;
6450 IntrinsicType xmm7;
6451 IntrinsicType xmm8;
6453 for(
size_t k=kbegin; k<kend; ++k ) {
6454 const IntrinsicType a1( A.load(i ,k) );
6455 const IntrinsicType a2( A.load(i1,k) );
6456 const IntrinsicType a3( A.load(i2,k) );
6457 const IntrinsicType a4( A.load(i3,k) );
6458 const IntrinsicType b1(
set( B(k,j ) ) );
6459 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6460 xmm1 = xmm1 + a1 * b1;
6461 xmm2 = xmm2 + a2 * b1;
6462 xmm3 = xmm3 + a3 * b1;
6463 xmm4 = xmm4 + a4 * b1;
6464 xmm5 = xmm5 + a1 * b2;
6465 xmm6 = xmm6 + a2 * b2;
6466 xmm7 = xmm7 + a3 * b2;
6467 xmm8 = xmm8 + a4 * b2;
// Only the partial product of this k-block is scaled; the existing C values are kept.
6470 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6471 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6472 (~C).
store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
6473 (~C).
store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
6474 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6475 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
6476 (~C).
store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
6477 (~C).
store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// --- Four intrinsic vectors of rows, single remaining column ---
6482 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6483 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6484 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
6485 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6487 IntrinsicType xmm1;
6488 IntrinsicType xmm2;
6489 IntrinsicType xmm3;
6490 IntrinsicType xmm4;
6492 for(
size_t k=kbegin; k<kend; ++k ) {
6493 const IntrinsicType b1(
set( B(k,j) ) );
6494 xmm1 = xmm1 + A.load(i ,k) * b1;
6495 xmm2 = xmm2 + A.load(i1,k) * b1;
6496 xmm3 = xmm3 + A.load(i2,k) * b1;
6497 xmm4 = xmm4 + A.load(i3,k) * b1;
6500 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
6501 (~C).
store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6502 (~C).
store( i2, j, (~C).load(i2,j) + xmm3 * factor );
6503 (~C).
store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// --- Two intrinsic vectors of rows, four columns at a time ---
6513 for( ; (j+4UL) <= jend; j+=4UL )
6515 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6516 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6517 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6518 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (xmm1,xmm2) -> j, (xmm3,xmm4) -> j+1, and so on.
6520 IntrinsicType xmm1;
6521 IntrinsicType xmm2;
6522 IntrinsicType xmm3;
6523 IntrinsicType xmm4;
6524 IntrinsicType xmm5;
6525 IntrinsicType xmm6;
6526 IntrinsicType xmm7;
6527 IntrinsicType xmm8;
6529 for(
size_t k=kbegin; k<kend; ++k ) {
6530 const IntrinsicType a1( A.load(i ,k) );
6531 const IntrinsicType a2( A.load(i1,k) );
6532 const IntrinsicType b1(
set( B(k,j ) ) );
6533 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6534 const IntrinsicType b3(
set( B(k,j+2UL) ) );
6535 const IntrinsicType b4(
set( B(k,j+3UL) ) );
6536 xmm1 = xmm1 + a1 * b1;
6537 xmm2 = xmm2 + a2 * b1;
6538 xmm3 = xmm3 + a1 * b2;
6539 xmm4 = xmm4 + a2 * b2;
6540 xmm5 = xmm5 + a1 * b3;
6541 xmm6 = xmm6 + a2 * b3;
6542 xmm7 = xmm7 + a1 * b4;
6543 xmm8 = xmm8 + a2 * b4;
6546 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6547 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6548 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6549 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6550 (~C).
store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6551 (~C).
store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
6552 (~C).
store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6553 (~C).
store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// --- Two intrinsic vectors of rows, two columns at a time ---
6556 for( ; (j+2UL) <= jend; j+=2UL )
6558 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6559 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6560 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6561 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6563 IntrinsicType xmm1;
6564 IntrinsicType xmm2;
6565 IntrinsicType xmm3;
6566 IntrinsicType xmm4;
6568 for(
size_t k=kbegin; k<kend; ++k ) {
6569 const IntrinsicType a1( A.load(i ,k) );
6570 const IntrinsicType a2( A.load(i1,k) );
6571 const IntrinsicType b1(
set( B(k,j ) ) );
6572 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6573 xmm1 = xmm1 + a1 * b1;
6574 xmm2 = xmm2 + a2 * b1;
6575 xmm3 = xmm3 + a1 * b2;
6576 xmm4 = xmm4 + a2 * b2;
6579 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6580 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6581 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6582 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// --- Two intrinsic vectors of rows, single remaining column ---
6587 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6588 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6589 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6590 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6592 IntrinsicType xmm1;
6593 IntrinsicType xmm2;
6595 for(
size_t k=kbegin; k<kend; ++k ) {
6596 const IntrinsicType b1(
set( B(k,j) ) );
6597 xmm1 = xmm1 + A.load(i ,k) * b1;
6598 xmm2 = xmm2 + A.load(i1,k) * b1;
6601 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
6602 (~C).
store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// --- One intrinsic vector of rows, one column at a time ---
6608 for(
size_t j=jj; j<jend; ++j )
6610 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6611 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6612 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
6613 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6615 IntrinsicType xmm1;
6617 for(
size_t k=kbegin; k<kend; ++k ) {
6618 const IntrinsicType b1(
set( B(k,j) ) );
6619 xmm1 = xmm1 + A.load(i,k) * b1;
6622 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// BLAS-based addition-assignment kernel, default case.
// Selected when UseDefaultKernel applies; no BLAS routine is called and the arguments
// are forwarded unchanged to the large kernel.
6646 template<
typename MT3
6650 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6651 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6653 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition-assignment kernel for single precision operands
// (selected via UseSinglePrecisionKernel).
// A triangular operand is handled with strmm on a temporary named tmp; otherwise sgemm
// is called directly on C.
// NOTE(review): the creation of tmp and the statement that adds it to C are not shown
// here.
6672 template<
typename MT3
6676 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
6677 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: A is applied from the left (CblasLeft); the lower/upper flag
// follows IsLower<MT4>.
6679 if( IsTriangular<MT4>::value ) {
6681 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular right operand: B is applied from the right (CblasRight).
6684 else if( IsTriangular<MT5>::value ) {
6686 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case. The last two arguments are presumably alpha (= scalar) and beta (= 1,
// i.e. keep the current content of C) - confirm against the sgemm wrapper.
6690 sgemm( C, A, B, scalar, 1.0F );
// BLAS-based addition-assignment kernel for double precision operands
// (selected via UseDoublePrecisionKernel).
// A triangular operand is handled with dtrmm on a temporary named tmp; otherwise dgemm
// is called directly on C.
// NOTE(review): the creation of tmp and the statement that adds it to C are not shown
// here.
6711 template<
typename MT3
6715 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
6716 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: A is applied from the left (CblasLeft); the lower/upper flag
// follows IsLower<MT4>.
6718 if( IsTriangular<MT4>::value ) {
6720 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular right operand: B is applied from the right (CblasRight).
6723 else if( IsTriangular<MT5>::value ) {
6725 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case. The last two arguments are presumably alpha (= scalar) and beta (= 1,
// i.e. keep the current content of C) - confirm against the dgemm wrapper.
6729 dgemm( C, A, B, scalar, 1.0 );
// BLAS-based addition-assignment kernel for single precision complex operands
// (selected via UseSinglePrecisionComplexKernel).
// The scalar is passed to the BLAS wrappers as complex<float>( scalar, 0.0F ), i.e. with
// a zero imaginary part. A triangular operand is handled with ctrmm on a temporary named
// tmp; otherwise cgemm is called directly on C.
// NOTE(review): the creation of tmp and the statement that adds it to C are not shown
// here.
6750 template<
typename MT3
6754 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
6755 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: A is applied from the left (CblasLeft).
6757 if( IsTriangular<MT4>::value ) {
6759 ctrmm( tmp, A, CblasLeft,
6760 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
6761 complex<float>( scalar, 0.0F ) );
// Triangular right operand: B is applied from the right (CblasRight).
6764 else if( IsTriangular<MT5>::value ) {
6766 ctrmm( tmp, B, CblasRight,
6767 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
6768 complex<float>( scalar, 0.0F ) );
// General case. The last two arguments are presumably alpha (= scalar) and beta (= 1) -
// confirm against the cgemm wrapper.
6772 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based addition-assignment kernel for double precision complex operands
// (selected via UseDoublePrecisionComplexKernel).
// The scalar is passed to the BLAS wrappers as complex<double>( scalar, 0.0 ), i.e. with
// a zero imaginary part. A triangular operand is handled with ztrmm on a temporary named
// tmp; otherwise zgemm is called directly on C.
// NOTE(review): the creation of tmp and the statement that adds it to C are not shown
// here.
6793 template<
typename MT3
6797 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
6798 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: A is applied from the left (CblasLeft).
6800 if( IsTriangular<MT4>::value ) {
6802 ztrmm( tmp, A, CblasLeft,
6803 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
6804 complex<double>( scalar, 0.0 ) );
// Triangular right operand: B is applied from the right (CblasRight).
6807 else if( IsTriangular<MT5>::value ) {
6809 ztrmm( tmp, B, CblasRight,
6810 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
6811 complex<double>( scalar, 0.0 ) );
// General case. The last two arguments are presumably alpha (= scalar) and beta (= 1) -
// confirm against the zgemm wrapper.
6815 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Addition assignment of a scaled matrix product to a row-major matrix
// (Matrix<MT,false>), enabled only when CanExploitSymmetry<MT,MT1,MT2> holds.
//   lhs  target matrix
//   rhs  scaled product expression; its operands are taken from rhs.matrix_
// The visible code distinguishes three cases: both operands symmetric, only the left
// operand symmetric, and the remaining case.
// NOTE(review): the statements executed in each case are not shown here.
6836 template<
typename MT >
6837 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6838 addAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Left and right operand of the wrapped matrix product.
6847 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6848 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
6850 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6852 else if( IsSymmetric<MT1>::value )
// Subtraction assignment of a scaled matrix product to a dense matrix, used when
// CanExploitSymmetry<MT,MT1,MT2> does not hold (DisableIf).
//   lhs  target dense matrix
//   rhs  scaled product expression; its operands are taken from rhs.matrix_
6875 template<
typename MT
6877 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6878 subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Left and right operand of the wrapped matrix product.
6885 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6886 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target or the inner dimension is empty.
// NOTE(review): the body of this branch is not shown here (presumably an early return).
6888 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scalar of the expression.
// NOTE(review): the definitions of A and B are not shown here.
6902 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Selects the kernel used for the subtraction assignment of the scaled product.
// The small kernel is chosen when the condition below holds (it includes the case of a
// diagonal left operand); otherwise the BLAS kernel is chosen.
// NOTE(review): the remaining part of the condition is not shown here.
6917 template<
typename MT3
6921 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6923 if( ( IsDiagonal<MT4>::value ) ||
6925 selectSmallSubAssignKernel( C, A, B, scalar );
6927 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the scaled product when neither operand is
// diagonal (see the EnableIf condition below).
// The scaled product is first evaluated via serial() into a temporary of type
// ResultType.
// NOTE(review): the statement that subtracts tmp from C is not shown here.
6945 template<
typename MT3
6949 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6950 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6952 const ResultType tmp(
serial( A * B * scalar ) );
// Default subtraction-assignment kernel for the scaled product when the right operand B
// is diagonal and the left operand A is not (see the EnableIf condition below).
// Since B is diagonal, only B(j,j) contributes to column j, so every visited element is
// updated as C(i,j) -= A(i,j) * B(j,j) * scalar.
//   C       target matrix that is subtracted from
//   A, B    left/right operand of the product
//   scalar  scaling factor applied to every product term
6971 template<
typename MT3
6975 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6976 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6980 const size_t M( A.rows() );
6981 const size_t N( B.columns() );
// The target is traversed one column at a time.
6983 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the compile-time triangular structure of A.
// NOTE(review): the fallback branches of both conditionals are not shown here.
6985 const size_t ibegin( ( IsLower<MT4>::value )
6986 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6988 const size_t iend( ( IsUpper<MT4>::value )
6989 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. rounds the row count down to an even
// number, so that the loop below can handle two rows per iteration.
6993 const size_t inum( iend - ibegin );
6994 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6996 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6997 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6998 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the row at ipos (the row left over by the pairwise loop).
// NOTE(review): the condition guarding this statement is not shown here.
7001 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for the scaled product when the left operand A
// is diagonal and the right operand B is not (see the EnableIf condition below).
// Since A is diagonal, only A(i,i) contributes to row i, so every visited element is
// updated as C(i,j) -= A(i,i) * B(i,j) * scalar.
//   C       target matrix that is subtracted from
//   A, B    left/right operand of the product
//   scalar  scaling factor applied to every product term
7021 template<
typename MT3
7025 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
7026 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7030 const size_t M( A.rows() );
7031 const size_t N( B.columns() );
// The target is traversed one column at a time.
7033 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the compile-time triangular structure of B.
// NOTE(review): the fallback branches of both conditionals are not shown here.
7035 const size_t ibegin( ( IsLower<MT5>::value )
7036 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
7038 const size_t iend( ( IsUpper<MT5>::value )
7039 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. rounds the row count down to an even
// number, so that the loop below can handle two rows per iteration.
7043 const size_t inum( iend - ibegin );
7044 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7046 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7047 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7048 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the row at ipos (the row left over by the pairwise loop).
// NOTE(review): the condition guarding this statement is not shown here.
7051 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction-assignment kernel for the scaled product when both operands are
// diagonal (see the EnableIf condition below). Only the diagonal of the target is
// touched: C(i,i) -= A(i,i) * B(i,i) * scalar for every row index i of A.
7071 template<
typename MT3
7075 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
7076 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7080 for(
size_t i=0UL; i<A.rows(); ++i ) {
7081 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel, non-vectorized case.
// This overload is selected when UseVectorizedDefaultKernel does not apply (DisableIf)
// and simply forwards all arguments to the default kernel.
7100 template<
typename MT3
7104 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7105 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7107 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix subtraction-assignment kernel, vectorized case with a row-major target
// (DenseMatrix<MT3,false>).
// The visible code chooses between several strategies: first by whether exactly one of
// the two operand types is resizable, then by comparing the element counts of B and A.
// NOTE(review): the bodies of the branches (and the final else) are not shown here, so
// what each strategy actually does must be confirmed against the full source.
7126 template<
typename MT3
7130 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7131 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Case 1: A is resizable, B is not.
7138 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
// Case 2: B is resizable, A is not.
7142 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
// Case 3: B holds at most as many elements as A.
7146 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
// Small-matrix subtraction-assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>).
//
// Every visited part of the target is updated as
//    C.store( i, j, C.load(i,j) - xmm * factor )
// where xmm accumulates A.load(i,k) * set( B(k,j) ) over k and factor is the scalar
// broadcast via set(). The sections below differ in how many intrinsic vectors of rows
// (offsets i, i+IT::size, ...) and how many columns are processed together: 8x1, 4x2,
// 4x1, 2x2, 2x1, 1x2 and 1x1.
//
// kbegin/kend restrict the k range according to the compile-time triangular structure
// of A (MT4) and B (MT5).
//
// NOTE(review): the enclosing loops over the row index i, the declarations of j for the
// two-column loops, and most of the store statements of each section are not shown here.
7172 template<
typename MT3
7176 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7177 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7179 typedef IntrinsicTrait<ElementType> IT;
7181 const size_t M( A.rows() );
7182 const size_t N( B.columns() );
7183 const size_t K( A.columns() );
// The scalar is broadcast once and reused by all stores.
7185 const IntrinsicType factor(
set( scalar ) );
// --- Eight intrinsic vectors of rows, one column at a time ---
7190 for(
size_t j=0UL; j<N; ++j )
7192 const size_t kbegin( ( IsLower<MT5>::value )
7193 ?( ( IsUpper<MT4>::value )
7194 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7195 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7196 :( IsUpper<MT4>::value ? i : 0UL ) );
7197 const size_t kend( ( IsUpper<MT5>::value )
7198 ?( ( IsLower<MT4>::value )
7199 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7200 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7201 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// The accumulators are default-constructed and then summed into.
7203 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7205 for(
size_t k=kbegin; k<kend; ++k ) {
7206 const IntrinsicType b1(
set( B(k,j) ) );
7207 xmm1 = xmm1 + A.load(i ,k) * b1;
7208 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7209 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7210 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7211 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
7212 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
7213 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
7214 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
7217 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// --- Four intrinsic vectors of rows, two columns at a time ---
7232 for( ; (j+2UL) <= N; j+=2UL )
7234 const size_t kbegin( ( IsLower<MT5>::value )
7235 ?( ( IsUpper<MT4>::value )
7236 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7237 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7238 :( IsUpper<MT4>::value ? i : 0UL ) );
7239 const size_t kend( ( IsUpper<MT5>::value )
7240 ?( ( IsLower<MT4>::value )
7241 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7242 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7243 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
7245 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7247 for(
size_t k=kbegin; k<kend; ++k ) {
7248 const IntrinsicType a1( A.load(i ,k) );
7249 const IntrinsicType a2( A.load(i+
IT::size ,k) );
7250 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
7251 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
7252 const IntrinsicType b1(
set( B(k,j ) ) );
7253 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7254 xmm1 = xmm1 + a1 * b1;
7255 xmm2 = xmm2 + a2 * b1;
7256 xmm3 = xmm3 + a3 * b1;
7257 xmm4 = xmm4 + a4 * b1;
7258 xmm5 = xmm5 + a1 * b2;
7259 xmm6 = xmm6 + a2 * b2;
7260 xmm7 = xmm7 + a3 * b2;
7261 xmm8 = xmm8 + a4 * b2;
7264 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7268 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
// --- Four intrinsic vectors of rows, single remaining column ---
7276 const size_t kbegin( ( IsLower<MT5>::value )
7277 ?( ( IsUpper<MT4>::value )
7278 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7279 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7280 :( IsUpper<MT4>::value ? i : 0UL ) );
7281 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
7283 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7285 for(
size_t k=kbegin; k<kend; ++k ) {
7286 const IntrinsicType b1(
set( B(k,j) ) );
7287 xmm1 = xmm1 + A.load(i ,k) * b1;
7288 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7289 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7290 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7293 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// --- Two intrinsic vectors of rows, two columns at a time ---
7304 for( ; (j+2UL) <= N; j+=2UL )
7306 const size_t kbegin( ( IsLower<MT5>::value )
7307 ?( ( IsUpper<MT4>::value )
7308 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7309 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7310 :( IsUpper<MT4>::value ? i : 0UL ) );
7311 const size_t kend( ( IsUpper<MT5>::value )
7312 ?( ( IsLower<MT4>::value )
7313 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7314 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7315 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 belong to column j, xmm3/xmm4 to column j+1.
7317 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7319 for(
size_t k=kbegin; k<kend; ++k ) {
7320 const IntrinsicType a1( A.load(i ,k) );
7321 const IntrinsicType a2( A.load(i+
IT::size,k) );
7322 const IntrinsicType b1(
set( B(k,j ) ) );
7323 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7324 xmm1 = xmm1 + a1 * b1;
7325 xmm2 = xmm2 + a2 * b1;
7326 xmm3 = xmm3 + a1 * b2;
7327 xmm4 = xmm4 + a2 * b2;
7330 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7332 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
// --- Two intrinsic vectors of rows, single remaining column ---
7338 const size_t kbegin( ( IsLower<MT5>::value )
7339 ?( ( IsUpper<MT4>::value )
7340 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7341 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7342 :( IsUpper<MT4>::value ? i : 0UL ) );
7343 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
7345 IntrinsicType xmm1, xmm2;
7347 for(
size_t k=kbegin; k<kend; ++k ) {
7348 const IntrinsicType b1(
set( B(k,j) ) );
7349 xmm1 = xmm1 + A.load(i ,k) * b1;
7350 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
7353 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// --- One intrinsic vector of rows, two columns at a time ---
7362 for( ; (j+2UL) <= N; j+=2UL )
7364 const size_t kbegin( ( IsLower<MT5>::value )
7365 ?( ( IsUpper<MT4>::value )
7366 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7367 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7368 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the fallback branch of this conditional is not shown here.
7369 const size_t kend( ( IsUpper<MT5>::value )
7370 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7373 IntrinsicType xmm1, xmm2;
7375 for(
size_t k=kbegin; k<kend; ++k ) {
7376 const IntrinsicType a1( A.load(i,k) );
7377 xmm1 = xmm1 + a1 *
set( B(k,j ) );
7378 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
7381 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
7382 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// --- One intrinsic vector of rows, single remaining column ---
7387 const size_t kbegin( ( IsLower<MT5>::value )
7388 ?( ( IsUpper<MT4>::value )
7389 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7390 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7391 :( IsUpper<MT4>::value ? i : 0UL ) );
// No upper limit other than K is applied in this last case.
7395 for(
size_t k=kbegin; k<K; ++k ) {
7396 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
7399 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Large-matrix subtraction-assignment kernel, non-vectorized case.
// This overload is selected when UseVectorizedDefaultKernel does not apply (DisableIf)
// and simply forwards all arguments to the default kernel.
7419 template<
typename MT3
7423 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7424 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7426 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel, vectorized case with a row-major target
// (DenseMatrix<MT3,false>).
// There is no dedicated large kernel for this storage order: the call is forwarded to
// the small kernel, passing the target as its concrete type via ~C.
7445 template<
typename MT3
7449 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7450 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7452 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Large-matrix subtraction-assignment kernel, vectorized case with a column-major target
// (DenseMatrix<MT3,true>).
//
// The iteration space is tiled into blocks of iblock x jblock x kblock elements. Within
// a tile the rows are processed in groups of four, two and one intrinsic vector(s)
// (row offsets i, i1, i2, i3), combined with one, two or four columns of B.
//
// For every k-block the partial product is accumulated in zero-initialised registers
// and then subtracted from the target as
//    C.store( i, j, C.load(i,j) - xmm * factor )
// which is the same update the small kernel performs. Seeding the accumulators with the
// current C values and storing xmm * factor instead would scale the existing content of
// C by the scalar once per k-block, i.e. compute (C - A*B) * scalar rather than
// C - (A*B) * scalar.
//
// kbegin/kend restrict the k range of the current block according to the compile-time
// triangular structure of A (MT4) and B (MT5).
//
// NOTE(review): the loops over the row index i and the definitions of i1, i2, i3 and j
// are not shown here.
7471 template<
typename MT3
7475 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7476 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7478 typedef IntrinsicTrait<ElementType> IT;
7480 const size_t M( A.rows() );
7481 const size_t N( B.columns() );
7482 const size_t K( A.columns() );
// Tile sizes in row, column and inner-product direction.
7484 const size_t iblock( 128UL );
7485 const size_t jblock( 64UL );
7486 const size_t kblock( 128UL );
// The scalar is broadcast once and reused by all stores.
7488 const IntrinsicType factor(
set( scalar ) );
7490 for(
size_t ii=0UL; ii<M; ii+=iblock )
7492 const size_t iend(
min( ii+iblock, M ) );
7494 for(
size_t jj=0UL; jj<N; jj+=jblock )
7496 const size_t jend(
min( jj+jblock, N ) );
7498 for(
size_t kk=0UL; kk<K; kk+=kblock )
7500 const size_t ktmp(
min( kk+kblock, K ) );
// --- Four intrinsic vectors of rows, two columns at a time ---
7512 for( ; (j+2UL) <= jend; j+=2UL )
7514 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7515 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7516 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7517 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Accumulators start empty; xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
7519 IntrinsicType xmm1;
7520 IntrinsicType xmm2;
7521 IntrinsicType xmm3;
7522 IntrinsicType xmm4;
7523 IntrinsicType xmm5;
7524 IntrinsicType xmm6;
7525 IntrinsicType xmm7;
7526 IntrinsicType xmm8;
7528 for(
size_t k=kbegin; k<kend; ++k ) {
7529 const IntrinsicType a1( A.load(i ,k) );
7530 const IntrinsicType a2( A.load(i1,k) );
7531 const IntrinsicType a3( A.load(i2,k) );
7532 const IntrinsicType a4( A.load(i3,k) );
7533 const IntrinsicType b1(
set( B(k,j ) ) );
7534 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7535 xmm1 = xmm1 + a1 * b1;
7536 xmm2 = xmm2 + a2 * b1;
7537 xmm3 = xmm3 + a3 * b1;
7538 xmm4 = xmm4 + a4 * b1;
7539 xmm5 = xmm5 + a1 * b2;
7540 xmm6 = xmm6 + a2 * b2;
7541 xmm7 = xmm7 + a3 * b2;
7542 xmm8 = xmm8 + a4 * b2;
// Only the partial product of this k-block is scaled; the existing C values are kept.
7545 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7546 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7547 (~C).
store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
7548 (~C).
store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
7549 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7550 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
7551 (~C).
store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
7552 (~C).
store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
// --- Four intrinsic vectors of rows, single remaining column ---
7557 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7558 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7559 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7560 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7562 IntrinsicType xmm1;
7563 IntrinsicType xmm2;
7564 IntrinsicType xmm3;
7565 IntrinsicType xmm4;
7567 for(
size_t k=kbegin; k<kend; ++k ) {
7568 const IntrinsicType b1(
set( B(k,j) ) );
7569 xmm1 = xmm1 + A.load(i ,k) * b1;
7570 xmm2 = xmm2 + A.load(i1,k) * b1;
7571 xmm3 = xmm3 + A.load(i2,k) * b1;
7572 xmm4 = xmm4 + A.load(i3,k) * b1;
7575 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
7576 (~C).
store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7577 (~C).
store( i2, j, (~C).load(i2,j) - xmm3 * factor );
7578 (~C).
store( i3, j, (~C).load(i3,j) - xmm4 * factor );
// --- Two intrinsic vectors of rows, four columns at a time ---
7588 for( ; (j+4UL) <= jend; j+=4UL )
7590 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7591 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7592 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7593 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (xmm1,xmm2) -> j, (xmm3,xmm4) -> j+1, and so on.
7595 IntrinsicType xmm1;
7596 IntrinsicType xmm2;
7597 IntrinsicType xmm3;
7598 IntrinsicType xmm4;
7599 IntrinsicType xmm5;
7600 IntrinsicType xmm6;
7601 IntrinsicType xmm7;
7602 IntrinsicType xmm8;
7604 for(
size_t k=kbegin; k<kend; ++k ) {
7605 const IntrinsicType a1( A.load(i ,k) );
7606 const IntrinsicType a2( A.load(i1,k) );
7607 const IntrinsicType b1(
set( B(k,j ) ) );
7608 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7609 const IntrinsicType b3(
set( B(k,j+2UL) ) );
7610 const IntrinsicType b4(
set( B(k,j+3UL) ) );
7611 xmm1 = xmm1 + a1 * b1;
7612 xmm2 = xmm2 + a2 * b1;
7613 xmm3 = xmm3 + a1 * b2;
7614 xmm4 = xmm4 + a2 * b2;
7615 xmm5 = xmm5 + a1 * b3;
7616 xmm6 = xmm6 + a2 * b3;
7617 xmm7 = xmm7 + a1 * b4;
7618 xmm8 = xmm8 + a2 * b4;
7621 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7622 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7623 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7624 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7625 (~C).
store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7626 (~C).
store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
7627 (~C).
store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7628 (~C).
store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
// --- Two intrinsic vectors of rows, two columns at a time ---
7631 for( ; (j+2UL) <= jend; j+=2UL )
7633 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7634 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7635 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7636 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7638 IntrinsicType xmm1;
7639 IntrinsicType xmm2;
7640 IntrinsicType xmm3;
7641 IntrinsicType xmm4;
7643 for(
size_t k=kbegin; k<kend; ++k ) {
7644 const IntrinsicType a1( A.load(i ,k) );
7645 const IntrinsicType a2( A.load(i1,k) );
7646 const IntrinsicType b1(
set( B(k,j ) ) );
7647 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7648 xmm1 = xmm1 + a1 * b1;
7649 xmm2 = xmm2 + a2 * b1;
7650 xmm3 = xmm3 + a1 * b2;
7651 xmm4 = xmm4 + a2 * b2;
7654 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7655 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7656 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7657 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
// --- Two intrinsic vectors of rows, single remaining column ---
7662 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7663 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7664 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7665 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7667 IntrinsicType xmm1;
7668 IntrinsicType xmm2;
7670 for(
size_t k=kbegin; k<kend; ++k ) {
7671 const IntrinsicType b1(
set( B(k,j) ) );
7672 xmm1 = xmm1 + A.load(i ,k) * b1;
7673 xmm2 = xmm2 + A.load(i1,k) * b1;
7676 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
7677 (~C).
store( i1, j, (~C).load(i1,j) - xmm2 * factor );
// --- One intrinsic vector of rows, one column at a time ---
7683 for(
size_t j=jj; j<jend; ++j )
7685 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7686 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7687 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
7688 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7690 IntrinsicType xmm1;
7692 for(
size_t k=kbegin; k<kend; ++k ) {
7693 const IntrinsicType b1(
set( B(k,j) ) );
7694 xmm1 = xmm1 + A.load(i,k) * b1;
7697 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// BLAS-based subtraction-assignment kernel, default case.
// Selected when UseDefaultKernel applies; no BLAS routine is called and the arguments
// are forwarded unchanged to the large kernel.
7721 template<
typename MT3
7725 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7726 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7728 selectLargeSubAssignKernel( C, A, B, scalar );
// Single-precision overload of the BLAS-based subtraction assignment kernel for a scaled
// product; enabled via EnableIf when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the declaration of 'tmp' and whatever is done with it after strmm() are
// on lines that this listing does not show.
7747 template<
typename MT3
7751 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7752 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: strmm() with A passed as the left-hand operand (CblasLeft); the
// lower/upper flag is chosen from IsLower<MT4>, and 'scalar' is passed as the factor.
7754 if( IsTriangular<MT4>::value ) {
7756 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular B: the same call with B passed as the right-hand operand (CblasRight).
7759 else if( IsTriangular<MT5>::value ) {
7761 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// Neither operand triangular: sgemm() with the factors -scalar and 1.0F
// (presumably alpha = -scalar, beta = 1, i.e. C = C - scalar*A*B; confirm against the wrapper).
7765 sgemm( C, A, B, -scalar, 1.0F );
// Double-precision overload of the BLAS-based subtraction assignment kernel for a scaled
// product; enabled via EnableIf when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the declaration of 'tmp' and whatever is done with it after dtrmm() are
// on lines that this listing does not show.
7786 template<
typename MT3
7790 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7791 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: dtrmm() with A passed as the left-hand operand (CblasLeft); the
// lower/upper flag is chosen from IsLower<MT4>, and 'scalar' is passed as the factor.
7793 if( IsTriangular<MT4>::value ) {
7795 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
// Triangular B: the same call with B passed as the right-hand operand (CblasRight).
7798 else if( IsTriangular<MT5>::value ) {
7800 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// Neither operand triangular: dgemm() with the factors -scalar and 1.0
// (presumably alpha = -scalar, beta = 1, i.e. C = C - scalar*A*B; confirm against the wrapper).
7804 dgemm( C, A, B, -scalar, 1.0 );
// Single-precision complex overload of the BLAS-based subtraction assignment kernel for a
// scaled product; enabled via EnableIf when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is wrapped into complex<float> with a zero imaginary part for every BLAS call.
// NOTE(review): the declaration of 'tmp' and whatever is done with it after ctrmm() are
// on lines that this listing does not show.
7825 template<
typename MT3
7829 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7830 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: ctrmm() with A passed as the left-hand operand (CblasLeft).
7832 if( IsTriangular<MT4>::value ) {
7834 ctrmm( tmp, A, CblasLeft,
7835 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7836 complex<float>( scalar, 0.0F ) );
// Triangular B: ctrmm() with B passed as the right-hand operand (CblasRight).
7839 else if( IsTriangular<MT5>::value ) {
7841 ctrmm( tmp, B, CblasRight,
7842 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7843 complex<float>( scalar, 0.0F ) );
// Neither operand triangular: cgemm() with the factors (-scalar,0) and (1,0)
// (presumably alpha and beta, i.e. C = C - scalar*A*B; confirm against the wrapper).
7847 cgemm( C, A, B, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Double-precision complex overload of the BLAS-based subtraction assignment kernel for a
// scaled product; enabled via EnableIf when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is wrapped into complex<double> with a zero imaginary part for every BLAS
// call. The factor must be built as complex<double>: going through complex<float> would
// round 'scalar' to single precision before the double-precision routine ever sees it.
// NOTE(review): the declaration of 'tmp' and whatever is done with it after ztrmm() are
// on lines that this listing does not show.
7868 template<
typename MT3
7872 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7873 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: ztrmm() with A passed as the left-hand operand (CblasLeft).
7875 if( IsTriangular<MT4>::value ) {
7877 ztrmm( tmp, A, CblasLeft,
7878 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7879 complex<double>( scalar, 0.0 ) );
// Triangular B: ztrmm() with B passed as the right-hand operand (CblasRight).
7882 else if( IsTriangular<MT5>::value ) {
7884 ztrmm( tmp, B, CblasRight,
7885 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7886 complex<double>( scalar, 0.0 ) );
// Neither operand triangular: zgemm() with the factors (-scalar,0) and (1,0)
// (presumably alpha and beta, i.e. C = C - scalar*A*B; confirm against the wrapper).
7890 zgemm( C, A, B, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Subtraction assignment of the scaled product to a matrix taken as Matrix<MT,false>;
// enabled via EnableIf when CanExploitSymmetry<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and then branches at
// compile time on which of MT1/MT2 is symmetric.
// NOTE(review): the bodies of the branches (and any final else) are not shown in this listing.
7910 template<
typename MT >
7911 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7912 subAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
7921 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7922 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7924 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7926 else if( IsSymmetric<MT1>::value )
// SMP assignment of the scaled product to a dense matrix; enabled via EnableIf when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_), then special-cases an
// empty target (zero rows or zero columns) and, separately, a zero inner dimension
// (left.columns() == 0).
// NOTE(review): what each special case does, and the general path, are not shown in this listing.
7960 template<
typename MT
7962 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7963 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7970 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7971 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7973 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7976 else if( left.columns() == 0UL ) {
// SMP assignment of the scaled product to a sparse matrix; enabled via EnableIf when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// TmpType is chosen by SelectType between ResultType and OppositeType based on the
// target's storage-order flag SO, and the expression is first materialized into a
// temporary of that type.
// NOTE(review): the use of 'tmp' after its construction is not shown in this listing.
8010 template<
typename MT
8012 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
8013 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8017 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
8029 const TmpType tmp( rhs );
// SMP assignment of the scaled product to a matrix taken as Matrix<MT,false>; enabled
// via EnableIf when CanExploitSymmetry<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and then branches at
// compile time on which of MT1/MT2 is symmetric.
// NOTE(review): the bodies of the branches (and any final else) are not shown in this listing.
8048 template<
typename MT >
8049 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
8050 smpAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
8059 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
8060 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
8062 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
8064 else if( IsSymmetric<MT1>::value )
// SMP addition assignment of the scaled product to a dense matrix; enabled via EnableIf
// when IsEvaluationRequired<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and tests a single
// combined condition: target has zero rows, zero columns, or the inner dimension
// (left.columns()) is zero.
// NOTE(review): the body of that branch and the general path are not shown in this
// listing; presumably the branch is an early return - confirm.
8086 template<
typename MT
8088 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
8089 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8096 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
8097 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
8099 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP addition assignment of the scaled product to a matrix taken as Matrix<MT,false>;
// enabled via EnableIf when CanExploitSymmetry<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and then branches at
// compile time on which of MT1/MT2 is symmetric.
// NOTE(review): the bodies of the branches (and any final else) are not shown in this listing.
8132 template<
typename MT >
8133 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
8134 smpAddAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
8143 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
8144 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
8146 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
8148 else if( IsSymmetric<MT1>::value )
// SMP subtraction assignment of the scaled product to a dense matrix; enabled via
// EnableIf when IsEvaluationRequired<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and tests a single
// combined condition: target has zero rows, zero columns, or the inner dimension
// (left.columns()) is zero.
// NOTE(review): the body of that branch and the general path are not shown in this
// listing; presumably the branch is an early return - confirm.
8174 template<
typename MT
8176 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
8177 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8184 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
8185 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
8187 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of the scaled product to a matrix taken as Matrix<MT,false>;
// enabled via EnableIf when CanExploitSymmetry<MT,MT1,MT2> holds.
// It fetches both operands of the wrapped product (rhs.matrix_) and then branches at
// compile time on which of MT1/MT2 is symmetric.
// NOTE(review): the bodies of the branches (and any final else) are not shown in this listing.
8220 template<
typename MT >
8221 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
8222 smpSubAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
8231 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
8232 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
8234 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
8236 else if( IsSymmetric<MT1>::value )
// Free function template returning a const TDMatTDMatMultExpr<T1,T2> expression object.
// It throws std::invalid_argument with the message "Matrix sizes do not match".
// NOTE(review): the function name, its parameters and the condition guarding the throw
// are not shown in this listing; presumably this is the multiplication operator for two
// column-major dense matrices and the check compares lhs.columns() with rhs.rows() - confirm.
8304 template<
typename T1
8306 inline const TDMatTDMatMultExpr<T1,T2>
8312 throw std::invalid_argument(
"Matrix sizes do not match" );
// Template header of a two-parameter (MT1, MT2) specialization.
// NOTE(review): the specialized entity and its base are not shown in this listing;
// presumably the Rows trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8329 template<
typename MT1,
typename MT2 >
// Two-parameter (MT1, MT2) specialization that publicly derives from Columns<MT2>,
// i.e. it takes its column information from the right-hand operand type.
// NOTE(review): the struct header is not shown in this listing; presumably the Columns
// trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8347 template<
typename MT1,
typename MT2 >
8349 :
public Columns<MT2>
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when both IsLower<MT1> and IsLower<MT2> hold.
// NOTE(review): the struct header is not shown in this listing; presumably the IsLower
// trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8365 template<
typename MT1,
typename MT2 >
8367 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when both IsUniLower<MT1> and IsUniLower<MT2> hold.
// NOTE(review): the struct header is not shown in this listing; presumably the IsUniLower
// trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8383 template<
typename MT1,
typename MT2 >
8385 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when one operand type is strictly lower and the other one is lower (either way round).
// NOTE(review): the struct header is not shown in this listing; presumably the
// IsStrictlyLower trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8401 template<
typename MT1,
typename MT2 >
8403 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
8404 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when both IsUpper<MT1> and IsUpper<MT2> hold.
// NOTE(review): the struct header is not shown in this listing; presumably the IsUpper
// trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8420 template<
typename MT1,
typename MT2 >
8422 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when both IsUniUpper<MT1> and IsUniUpper<MT2> hold.
// NOTE(review): the struct header is not shown in this listing; presumably the IsUniUpper
// trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8438 template<
typename MT1,
typename MT2 >
8440 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Two-parameter (MT1, MT2) specialization deriving from IsTrue<...>: the flag is set
// when one operand type is strictly upper and the other one is upper (either way round).
// NOTE(review): the struct header is not shown in this listing; presumably the
// IsStrictlyUpper trait for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8456 template<
typename MT1,
typename MT2 >
8458 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8459 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Expression trait over (MT1, MT2, VT). When MT1 and MT2 are column-major dense matrices
// and VT is a dense column vector, Type is the result of multiplying MT1 with the result
// of MT2 * VT (the nested TDMatDVecMultExprTrait), i.e. the product is regrouped as
// MT1 * ( MT2 * VT ); otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is not shown in this listing; presumably the
// TDMatDVecMultExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8475 template<
typename MT1,
typename MT2,
typename VT >
8480 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8481 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8482 IsDenseVector<VT>::value && IsColumnVector<VT>::value
8483 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8484 , INVALID_TYPE >::Type Type;
// Expression trait over (MT1, MT2, VT). When MT1 and MT2 are column-major dense matrices
// and VT is a sparse column vector, Type is the result of multiplying MT1 (via
// TDMatDVecMultExprTrait) with the result of MT2 * VT (via TDMatSVecMultExprTrait),
// i.e. the product is regrouped as MT1 * ( MT2 * VT ); otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is not shown in this listing; presumably the
// TDMatSVecMultExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8493 template<
typename MT1,
typename MT2,
typename VT >
8498 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8499 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8500 IsSparseVector<VT>::value && IsColumnVector<VT>::value
8501 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8502 , INVALID_TYPE >::Type Type;
// Expression trait over (VT, MT1, MT2). When VT is a dense row vector and MT1 and MT2
// are column-major dense matrices, Type is the result of multiplying the result of
// VT * MT1 with MT2 (nested TDVecTDMatMultExprTrait), i.e. the product is regrouped as
// ( VT * MT1 ) * MT2; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is not shown in this listing; presumably the
// TDVecTDMatMultExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8511 template<
typename VT,
typename MT1,
typename MT2 >
8516 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8517 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8518 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8519 ,
typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8520 , INVALID_TYPE >::Type Type;
// Expression trait over (VT, MT1, MT2). When VT is a sparse row vector and MT1 and MT2
// are column-major dense matrices, Type is the result of multiplying the result of
// VT * MT1 (via TSVecTDMatMultExprTrait) with MT2 (via TDVecTDMatMultExprTrait),
// i.e. the product is regrouped as ( VT * MT1 ) * MT2; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is not shown in this listing; presumably the
// TSVecTDMatMultExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8529 template<
typename VT,
typename MT1,
typename MT2 >
8534 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8535 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8536 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8537 ,
typename TDVecTDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8538 , INVALID_TYPE >::Type Type;
// Expression trait over (MT1, MT2, AF). Type is the multiplication result type of the
// submatrix types of const MT1 and const MT2, both taken with the same flag AF.
// NOTE(review): the struct header is not shown in this listing; presumably the
// SubmatrixExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2>, with AF being the
// alignment flag - confirm.
8547 template<
typename MT1,
typename MT2,
bool AF >
8552 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8553 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Expression trait over (MT1, MT2). Type is the multiplication result type of the row
// type of const MT1 with MT2, i.e. a row of the product is expressed as
// ( row of MT1 ) * MT2.
// NOTE(review): the struct header is not shown in this listing; presumably the
// RowExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8562 template<
typename MT1,
typename MT2 >
8567 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression trait over (MT1, MT2). Type is the multiplication result type of MT1 with
// the column type of const MT2, i.e. a column of the product is expressed as
// MT1 * ( column of MT2 ).
// NOTE(review): the struct header is not shown in this listing; presumably the
// ColumnExprTrait specialization for TDMatTDMatMultExpr<MT1,MT2> - confirm.
8576 template<
typename MT1,
typename MT2 >
8581 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:361
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1649
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:312
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:318
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A=B*C$).
Definition: DMatDMatMultExpr.h:8247
Compile time check for triangular matrix types.This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:315
Header file for basic type definitions.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:306
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:140
const size_t TDMATTDMATMULT_THRESHOLD
Column-major dense matrix/column-major dense matrix multiplication threshold.This setting specifies t...
Definition: Thresholds.h:176
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatTDMatMultExpr.h:310
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
Header file for the And class template.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:497
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:259
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:131
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:307
Header file for the IsUniLower type trait.
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2503
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:143
TDMatTDMatMultExpr< MT1, MT2 > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:305
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:424
const size_t SMP_TDMATTDMATMULT_THRESHOLD
SMP column-major dense matrix/column-major dense matrix multiplication threshold.This threshold speci...
Definition: Thresholds.h:903
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:444
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > > >::Type store(T *address, const sse_int16_t &value)
Aligned store of a vector of 2-byte integral values.
Definition: Store.h:80
Header file for the TDMatSVecMultExprTrait class template.
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1602
Header file for the DenseMatrix base class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
Header file for the Not class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:311
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:478
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
Header file for BLAS level 3 functions.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:468
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type load(const T *address)
Loads a vector of 2-byte integral values.
Definition: Load.h:79
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:138
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:346
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:139
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:488
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:150
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
Constraint on the data type.
Header file for the HasMutableDataAccess type trait.
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:141
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:414
Header file for all intrinsic functionality.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:456
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:309
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:498
Header file for the IsRowMajorMatrix type trait.
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:937
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:434
Header file for the complex data type.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:324
Header file for the IsUpper type trait.
Header file for the IsColumnVector type trait.
Constraint on the data type.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:308
Header file for the IsResizable type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:321
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:849
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:142