35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
145 template<
typename MT1
147 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
148 ,
private MatMatMultExpr
149 ,
private Computation
179 template<
typename T1,
typename T2,
typename T3 >
180 struct CanExploitSymmetry {
181 enum :
bool { value = IsRowMajorMatrix<T1>::value &&
182 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
193 template<
typename T1,
typename T2,
typename T3 >
194 struct IsEvaluationRequired {
195 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
196 CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time predicate that gates the BLAS-backed kernel overloads: it is the condition of the
// EnableIf_/DisableIf_ return types on the two selectBlasAssignKernel overloads.
// NOTE(review): the line that opens the value expression (fused number 208) is not visible
// here, so there may be one more leading condition than the ones listed below.
206 template<
typename T1,
typename T2,
typename T3 >
207 struct UseBlasKernel {
// T1 needs mutable data access, T2 and T3 need constant data access.
209 HasMutableDataAccess<T1>::value &&
210 HasConstDataAccess<T2>::value &&
211 HasConstDataAccess<T3>::value &&
// Neither T2 nor T3 may be a diagonal matrix type.
212 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
213 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All three element types must be BLAS compatible and identical to each other.
214 IsBLASCompatible< ElementType_<T1> >::value &&
215 IsBLASCompatible< ElementType_<T2> >::value &&
216 IsBLASCompatible< ElementType_<T3> >::value &&
217 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
218 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
// Compile-time predicate that selects between the vectorized and the plain default kernels: it
// is the condition of the EnableIf_/DisableIf_ return types on the selectSmall*/selectLarge*
// kernel overloads.
// NOTE(review): the lines with fused numbers 230 and 234 are not visible here, so the start of
// the value expression and the middle argument of AreSIMDCombinable cannot be read from this view.
228 template<
typename T1,
typename T2,
typename T3 >
229 struct UseVectorizedDefaultKernel {
// Only T2 is tested for being diagonal in the visible part of the condition.
231 !IsDiagonal<T2>::value &&
232 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
233 AreSIMDCombinable< ElementType_<T1>
235 , ElementType_<T3> >::value &&
// SIMD addition and multiplication must be available for the two operand element types.
236 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
237 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
269 MT1::simdEnabled && MT2::simdEnabled &&
// Compilation flag smpAssignable: true only if both evaluateLeft and evaluateRight are false
// and both operand types MT1 and MT2 report smpAssignable themselves.
274 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
275 !evaluateRight && MT2::smpAssignable };
330 :(
lhs_.columns() ) ) );
334 const size_t n(
end - begin );
// Element access taking a row index i and a column index j.
// The two visible guards compare i against lhs_.rows() and j against rhs_.columns().
// NOTE(review): the statements inside the guards and the final return are not visible in this
// view — presumably they reject the out-of-range index and then forward to the unchecked
// access; confirm against the full file.
352 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index outside the left-hand side operand.
353 if( i >=
lhs_.rows() ) {
// Column index outside the right-hand side operand.
356 if( j >=
rhs_.columns() ) {
// Accessor for the number of rows of the expression; declared const and noexcept.
// NOTE(review): the body of rows() is not visible in this view.
368 inline size_t rows() const noexcept {
// NOTE(review): the fused number jumps from 368 to 379, so this return statement presumably
// belongs to a separate accessor that follows rows() (likely columns()), not to rows() itself.
379 return rhs_.columns();
409 template<
typename T >
410 inline bool canAlias(
const T* alias )
const noexcept {
411 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
421 template<
typename T >
422 inline bool isAliased(
const T* alias )
const noexcept {
423 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
433 return lhs_.isAligned() &&
rhs_.isAligned();
444 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
445 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
// Entry point that evaluates the product into the target ~lhs and hands the work to
// selectAssignKernel.
// NOTE(review): the function name and parameter list are not visible in this view; the name
// "assign" is inferred from the selectAssignKernel call below — confirm.
468 template<
typename MT
// Nothing to compute for a target without rows or columns.
478 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Inner dimension of zero: handled separately (the statements of this branch are not visible).
481 else if( rhs.lhs_.columns() == 0UL ) {
// Both operands are evaluated via serial() into the local operands A and B.
486 LT A(
serial( rhs.lhs_ ) );
487 RT B(
serial( rhs.rhs_ ) );
496 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for the assignment C = A * B.
// The visible part of the condition routes targets with fewer than TDMATTDMATMULT_THRESHOLD
// elements (rows * columns of C) to selectSmallAssignKernel; the other visible call is
// selectBlasAssignKernel.
// NOTE(review): the first part of the condition and the line separating the two calls are not
// visible here.
512 template<
typename MT3
515 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
518 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
519 selectSmallAssignKernel( C, A, B );
521 selectBlasAssignKernel( C, A, B );
// Default (non-vectorized) assignment kernel C = A * B, enabled when neither MT4 nor MT5 is a
// diagonal matrix type. The computation proceeds column by column (index j) and narrows the
// k and i ranges according to the lower/upper properties of the operand types.
// NOTE(review): several lines (else-branches of the range expressions, reset statements,
// braces) are not visible in this view; comments below describe only what is visible.
540 template<
typename MT3
543 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
544 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
546 const size_t M( A.rows() );
547 const size_t N( B.columns() );
548 const size_t K( A.columns() );
550 for(
size_t j=0UL; j<N; ++j )
// Range of k for column j, restricted when MT5 is lower or upper (strict variants shift by one).
552 const size_t kbegin( ( IsLower<MT5>::value )
553 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
555 const size_t kend( ( IsUpper<MT5>::value )
556 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Empty k range for a strictly triangular MT5: the whole column j is visited row by row
// (loop body not visible — presumably the elements are reset; confirm).
560 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
561 for(
size_t i=0UL; i<M; ++i ) {
// First k (k == kbegin): row range derived from the structure of MT4.
568 const size_t ibegin( ( IsLower<MT4>::value )
569 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
571 const size_t iend( ( IsUpper<MT4>::value )
572 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows in front of ibegin (loop body not visible).
576 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
577 for(
size_t i=0UL; i<ibegin; ++i ) {
581 else if( IsStrictlyLower<MT4>::value ) {
// The first contribution is written with plain assignment, which initializes C(i,j).
584 for(
size_t i=ibegin; i<iend; ++i ) {
585 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body not visible).
587 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
588 for(
size_t i=iend; i<M; ++i ) {
592 else if( IsStrictlyUpper<MT4>::value ) {
// Remaining k values are accumulated onto the already initialized elements.
597 for(
size_t k=kbegin+1UL; k<kend; ++k )
599 const size_t ibegin( ( IsLower<MT4>::value )
600 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
602 const size_t iend( ( IsUpper<MT4>::value )
603 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
607 for(
size_t i=ibegin; i<iend; ++i ) {
608 C(i,j) += A(i,k) * B(k,j);
// For an upper MT4 the row iend is written with plain assignment rather than accumulated,
// which differs from the += used for the rows in front of it.
610 if( IsUpper<MT4>::value ) {
611 C(iend,j) = A(iend,k) * B(k,j);
// Default assignment kernel C = A * B for a non-diagonal MT4 and a diagonal MT5.
// Each visible product uses only the diagonal element B(j,j): C(i,j) = A(i,j) * B(j,j).
// NOTE(review): the else-branches of the range expressions and the bodies of the first and
// last row loops are not visible in this view.
633 template<
typename MT3
636 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
637 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
641 const size_t M( A.rows() );
642 const size_t N( B.columns() );
644 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, restricted by the lower/upper property of MT4.
646 const size_t ibegin( ( IsLower<MT4>::value )
647 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
649 const size_t iend( ( IsUpper<MT4>::value )
650 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows in front of ibegin (loop body not visible).
654 if( IsLower<MT4>::value ) {
655 for(
size_t i=0UL; i<ibegin; ++i ) {
659 for(
size_t i=ibegin; i<iend; ++i ) {
660 C(i,j) = A(i,j) * B(j,j);
// Rows from iend on (loop body not visible).
662 if( IsUpper<MT4>::value ) {
663 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B for a diagonal MT4 and a non-diagonal MT5.
// Each visible product uses only the diagonal element A(i,i): C(i,j) = A(i,i) * B(i,j).
// NOTE(review): the row range is derived from MT5, while the two guards around the outer row
// loops test MT4 (the diagonal operand). The loop bodies are not visible here, so whether the
// guards are intended to test MT4 or MT5 cannot be confirmed from this view.
686 template<
typename MT3
689 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
690 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
694 const size_t M( A.rows() );
695 const size_t N( B.columns() );
697 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, restricted by the lower/upper property of MT5.
699 const size_t ibegin( ( IsLower<MT5>::value )
700 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
702 const size_t iend( ( IsUpper<MT5>::value )
703 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
707 if( IsLower<MT4>::value ) {
708 for(
size_t i=0UL; i<ibegin; ++i ) {
712 for(
size_t i=ibegin; i<iend; ++i ) {
713 C(i,j) = A(i,i) * B(i,j);
715 if( IsUpper<MT4>::value ) {
716 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B for the case that both MT4 and MT5 are diagonal.
// The visible loop writes only the diagonal of C: C(i,i) = A(i,i) * B(i,i).
// NOTE(review): statements in front of the loop are not visible in this view; the off-diagonal
// elements of C are not touched by the visible code.
739 template<
typename MT3
742 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
743 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
749 for(
size_t i=0UL; i<A.rows(); ++i ) {
750 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false:
// it simply forwards to selectDefaultAssignKernel.
770 template<
typename MT3
773 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
774 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
776 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix assignment kernel for a target of type DenseMatrix<MT3,false>.
// Instead of computing the product directly, one operand is converted to its OppositeType
// (evaluated via serial()) and the resulting product expression is assigned to ~C.
// Which operand is converted:
//   - B if only MT4 is resizable,
//   - A if only MT5 is resizable,
//   - otherwise B if it has no more elements than A, else A.
// NOTE(review): the line introducing the final branch is not visible in this view.
796 template<
typename MT3
799 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
800 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
807 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
808 const OppositeType_<MT5> tmp(
serial( B ) );
809 assign( ~C, A * tmp );
811 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
812 const OppositeType_<MT4> tmp(
serial( A ) );
813 assign( ~C, tmp * B );
// Both or neither resizable: convert the operand with fewer elements (B on a tie).
815 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
816 const OppositeType_<MT5> tmp(
serial( B ) );
817 assign( ~C, A * tmp );
820 const OppositeType_<MT4> tmp(
serial( A ) );
821 assign( ~C, tmp * B );
// Vectorized small-matrix assignment kernel for a target of type DenseMatrix<MT3,true>.
// Rows of C are processed in SIMD-wide strips of decreasing height (8, 4, 2 and 1 times
// SIMDSIZE), followed by a scalar loop for the rows that are left when "remainder" is set.
// Inside each strip the k range is narrowed according to the lower/upper properties of MT4
// and MT5, products are accumulated in SIMD registers and then stored to C.
// NOTE(review): several lines (declarations of i and j, else-branches of range expressions,
// some accumulator declarations, braces) are not visible in this view.
842 template<
typename MT3
845 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
846 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
848 const size_t M( A.rows() );
849 const size_t N( B.columns() );
850 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both MT3 and MT4 are padded.
852 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a remainder, ipos is M with the low bits masked off via size_t(-SIMDSIZE).
854 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Strip of 8*SIMDSIZE rows, one column of C at a time.
859 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
860 for(
size_t j=0UL; j<N; ++j )
862 const size_t kbegin( ( IsLower<MT5>::value )
863 ?( ( IsUpper<MT4>::value )
864 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
865 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
866 :( IsUpper<MT4>::value ? i : 0UL ) );
867 const size_t kend( ( IsUpper<MT5>::value )
868 ?( ( IsLower<MT4>::value )
869 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
870 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
871 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
873 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
875 for(
size_t k=kbegin; k<kend; ++k ) {
// B(k,j) is expanded to a SIMD value via set() and multiplied with eight loads from column k of A.
876 const SIMDType b1(
set( B(k,j) ) );
877 xmm1 = xmm1 + A.load(i ,k) * b1;
878 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
879 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
880 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
881 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
882 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
883 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
884 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
887 (~C).store( i , j, xmm1 );
888 (~C).store( i+SIMDSIZE , j, xmm2 );
889 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
890 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
891 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
892 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
893 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
894 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Strip of 4*SIMDSIZE rows: two columns of C at a time, then a single leftover column.
898 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
902 for( ; (j+2UL) <= N; j+=2UL )
904 const size_t kbegin( ( IsLower<MT5>::value )
905 ?( ( IsUpper<MT4>::value )
906 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
907 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
908 :( IsUpper<MT4>::value ? i : 0UL ) );
909 const size_t kend( ( IsUpper<MT5>::value )
910 ?( ( IsLower<MT4>::value )
911 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
912 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
913 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
915 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
917 for(
size_t k=kbegin; k<kend; ++k ) {
918 const SIMDType a1( A.load(i ,k) );
919 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
920 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
921 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
922 const SIMDType b1(
set( B(k,j ) ) );
923 const SIMDType b2(
set( B(k,j+1UL) ) );
924 xmm1 = xmm1 + a1 * b1;
925 xmm2 = xmm2 + a2 * b1;
926 xmm3 = xmm3 + a3 * b1;
927 xmm4 = xmm4 + a4 * b1;
928 xmm5 = xmm5 + a1 * b2;
929 xmm6 = xmm6 + a2 * b2;
930 xmm7 = xmm7 + a3 * b2;
931 xmm8 = xmm8 + a4 * b2;
934 (~C).store( i , j , xmm1 );
935 (~C).store( i+SIMDSIZE , j , xmm2 );
936 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
937 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
938 (~C).store( i , j+1UL, xmm5 );
939 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
940 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
941 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single leftover column of the 4*SIMDSIZE strip (its guard is not visible).
946 const size_t kbegin( ( IsLower<MT5>::value )
947 ?( ( IsUpper<MT4>::value )
948 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
949 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
950 :( IsUpper<MT4>::value ? i : 0UL ) );
951 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
953 SIMDType xmm1, xmm2, xmm3, xmm4;
955 for(
size_t k=kbegin; k<kend; ++k ) {
956 const SIMDType b1(
set( B(k,j) ) );
957 xmm1 = xmm1 + A.load(i ,k) * b1;
958 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
959 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
960 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
963 (~C).store( i , j, xmm1 );
964 (~C).store( i+SIMDSIZE , j, xmm2 );
965 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
966 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Strip of 2*SIMDSIZE rows: two columns of C at a time, then a single leftover column.
970 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
974 for( ; (j+2UL) <= N; j+=2UL )
976 const size_t kbegin( ( IsLower<MT5>::value )
977 ?( ( IsUpper<MT4>::value )
978 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
979 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
980 :( IsUpper<MT4>::value ? i : 0UL ) );
981 const size_t kend( ( IsUpper<MT5>::value )
982 ?( ( IsLower<MT4>::value )
983 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
984 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
985 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
987 SIMDType xmm1, xmm2, xmm3, xmm4;
989 for(
size_t k=kbegin; k<kend; ++k ) {
990 const SIMDType a1( A.load(i ,k) );
991 const SIMDType a2( A.load(i+SIMDSIZE,k) );
992 const SIMDType b1(
set( B(k,j ) ) );
993 const SIMDType b2(
set( B(k,j+1UL) ) );
994 xmm1 = xmm1 + a1 * b1;
995 xmm2 = xmm2 + a2 * b1;
996 xmm3 = xmm3 + a1 * b2;
997 xmm4 = xmm4 + a2 * b2;
1000 (~C).store( i , j , xmm1 );
1001 (~C).store( i+SIMDSIZE, j , xmm2 );
1002 (~C).store( i , j+1UL, xmm3 );
1003 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Single leftover column of the 2*SIMDSIZE strip (its guard is not visible).
1008 const size_t kbegin( ( IsLower<MT5>::value )
1009 ?( ( IsUpper<MT4>::value )
1010 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1011 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1012 :( IsUpper<MT4>::value ? i : 0UL ) );
1013 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
1015 SIMDType xmm1, xmm2;
1017 for(
size_t k=kbegin; k<kend; ++k ) {
1018 const SIMDType b1(
set( B(k,j) ) );
1019 xmm1 = xmm1 + A.load(i ,k) * b1;
1020 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
1023 (~C).store( i , j, xmm1 );
1024 (~C).store( i+SIMDSIZE, j, xmm2 );
// Strip of a single SIMD vector of rows: two columns of C at a time, then a leftover column.
1028 for( ; i<ipos; i+=SIMDSIZE )
1032 for( ; (j+2UL) <= N; j+=2UL )
1034 const size_t kbegin( ( IsLower<MT5>::value )
1035 ?( ( IsUpper<MT4>::value )
1036 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1037 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1038 :( IsUpper<MT4>::value ? i : 0UL ) );
1039 const size_t kend( ( IsUpper<MT5>::value )
1040 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1043 SIMDType xmm1, xmm2;
1045 for(
size_t k=kbegin; k<kend; ++k ) {
1046 const SIMDType a1( A.load(i,k) );
1047 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1048 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1051 (~C).store( i, j , xmm1 );
1052 (~C).store( i, j+1UL, xmm2 );
// Single leftover column of the one-vector strip; here k runs up to K.
1057 const size_t kbegin( ( IsLower<MT5>::value )
1058 ?( ( IsUpper<MT4>::value )
1059 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1060 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1061 :( IsUpper<MT4>::value ? i : 0UL ) );
1065 for(
size_t k=kbegin; k<K; ++k ) {
1066 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1069 (~C).store( i, j, xmm1 );
// Scalar loop over the rows between ipos and M; only entered when "remainder" is set.
1073 for( ; remainder && i<M; ++i )
1077 for( ; (j+2UL) <= N; j+=2UL )
1079 const size_t kbegin( ( IsLower<MT5>::value )
1080 ?( ( IsUpper<MT4>::value )
1081 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1082 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1083 :( IsUpper<MT4>::value ? i : 0UL ) );
1084 const size_t kend( ( IsUpper<MT5>::value )
1085 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1091 for(
size_t k=kbegin; k<kend; ++k ) {
1092 value1 += A(i,k) * B(k,j );
1093 value2 += A(i,k) * B(k,j+1UL);
1096 (~C)(i,j ) = value1;
1097 (~C)(i,j+1UL) = value2;
// Single leftover column of the scalar loop (its guard and the final store are not visible).
1102 const size_t kbegin( ( IsLower<MT5>::value )
1103 ?( ( IsUpper<MT4>::value )
1104 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1105 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1106 :( IsUpper<MT4>::value ? i : 0UL ) );
1110 for(
size_t k=kbegin; k<K; ++k ) {
1111 value += A(i,k) * B(k,j);
// Large-matrix assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> is false:
// it simply forwards to selectDefaultAssignKernel.
1135 template<
typename MT3
1138 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1139 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1141 selectDefaultAssignKernel( C, A, B );
// Vectorized large-matrix assignment kernel for a target of type DenseMatrix<MT3,false>:
// there is no dedicated implementation, the call is forwarded to selectSmallAssignKernel.
1161 template<
typename MT3
1164 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1165 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1167 selectSmallAssignKernel( ~C, A, B );
// Vectorized large-matrix assignment kernel for a target of type DenseMatrix<MT3,true>.
// The iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE, TDMATTDMATMULT_JBLOCK_SIZE and
// TDMATTDMATMULT_KBLOCK_SIZE. Within each (ii,jj) tile the target elements are visited once
// up front (loop body not visible — presumably they are reset; confirm), and afterwards every
// kk tile loads the current values of C, accumulates its contribution and stores them back.
// Rows are handled in strips of 4, 2 and 1 times SIMDSIZE plus a scalar remainder loop.
// NOTE(review): declarations of i and j, several braces and some final stores are not visible
// in this view.
1187 template<
typename MT3
1190 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1191 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1193 const size_t M( A.rows() );
1194 const size_t N( B.columns() );
1195 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both MT3 and MT4 are padded.
1197 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1199 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
1201 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// End of the SIMD-processed rows of this row tile; the assertion checks the mask arithmetic.
1203 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1204 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1206 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
1208 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Visit every element of the current (ii,jj) tile before accumulating (body not visible).
1210 for(
size_t j=jj; j<jend; ++j ) {
1211 for(
size_t i=ii; i<iend; ++i ) {
1216 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
1218 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Strip of 4*SIMDSIZE rows: two columns at a time, then a single leftover column.
1222 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1224 const size_t i1( i+SIMDSIZE );
1225 const size_t i2( i+SIMDSIZE*2UL );
1226 const size_t i3( i+SIMDSIZE*3UL );
1230 for( ; (j+2UL) <= jend; j+=2UL )
// k range of this kk tile, narrowed by the upper/lower properties of MT4 and MT5.
1232 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1233 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1234 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1235 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Accumulators start from the values currently stored in C.
1237 SIMDType xmm1( (~C).load(i ,j ) );
1238 SIMDType xmm2( (~C).load(i1,j ) );
1239 SIMDType xmm3( (~C).load(i2,j ) );
1240 SIMDType xmm4( (~C).load(i3,j ) );
1241 SIMDType xmm5( (~C).load(i ,j+1UL) );
1242 SIMDType xmm6( (~C).load(i1,j+1UL) );
1243 SIMDType xmm7( (~C).load(i2,j+1UL) );
1244 SIMDType xmm8( (~C).load(i3,j+1UL) );
1246 for(
size_t k=kbegin; k<kend; ++k ) {
1247 const SIMDType a1( A.load(i ,k) );
1248 const SIMDType a2( A.load(i1,k) );
1249 const SIMDType a3( A.load(i2,k) );
1250 const SIMDType a4( A.load(i3,k) );
1251 const SIMDType b1(
set( B(k,j ) ) );
1252 const SIMDType b2(
set( B(k,j+1UL) ) );
1253 xmm1 = xmm1 + a1 * b1;
1254 xmm2 = xmm2 + a2 * b1;
1255 xmm3 = xmm3 + a3 * b1;
1256 xmm4 = xmm4 + a4 * b1;
1257 xmm5 = xmm5 + a1 * b2;
1258 xmm6 = xmm6 + a2 * b2;
1259 xmm7 = xmm7 + a3 * b2;
1260 xmm8 = xmm8 + a4 * b2;
1263 (~C).store( i , j , xmm1 );
1264 (~C).store( i1, j , xmm2 );
1265 (~C).store( i2, j , xmm3 );
1266 (~C).store( i3, j , xmm4 );
1267 (~C).store( i , j+1UL, xmm5 );
1268 (~C).store( i1, j+1UL, xmm6 );
1269 (~C).store( i2, j+1UL, xmm7 );
1270 (~C).store( i3, j+1UL, xmm8 );
// Single leftover column of the 4*SIMDSIZE strip (its guard is not visible).
1275 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1276 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1277 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1278 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1280 SIMDType xmm1( (~C).load(i ,j) );
1281 SIMDType xmm2( (~C).load(i1,j) );
1282 SIMDType xmm3( (~C).load(i2,j) );
1283 SIMDType xmm4( (~C).load(i3,j) );
1285 for(
size_t k=kbegin; k<kend; ++k ) {
1286 const SIMDType b1(
set( B(k,j) ) );
1287 xmm1 = xmm1 + A.load(i ,k) * b1;
1288 xmm2 = xmm2 + A.load(i1,k) * b1;
1289 xmm3 = xmm3 + A.load(i2,k) * b1;
1290 xmm4 = xmm4 + A.load(i3,k) * b1;
1293 (~C).store( i , j, xmm1 );
1294 (~C).store( i1, j, xmm2 );
1295 (~C).store( i2, j, xmm3 );
1296 (~C).store( i3, j, xmm4 );
// Strip of 2*SIMDSIZE rows: four columns, then two columns, then a single leftover column.
1300 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1302 const size_t i1( i+SIMDSIZE );
1306 for( ; (j+4UL) <= jend; j+=4UL )
1308 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1309 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1310 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1311 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1313 SIMDType xmm1( (~C).load(i ,j ) );
1314 SIMDType xmm2( (~C).load(i1,j ) );
1315 SIMDType xmm3( (~C).load(i ,j+1UL) );
1316 SIMDType xmm4( (~C).load(i1,j+1UL) );
1317 SIMDType xmm5( (~C).load(i ,j+2UL) );
1318 SIMDType xmm6( (~C).load(i1,j+2UL) );
1319 SIMDType xmm7( (~C).load(i ,j+3UL) );
1320 SIMDType xmm8( (~C).load(i1,j+3UL) );
1322 for(
size_t k=kbegin; k<kend; ++k ) {
1323 const SIMDType a1( A.load(i ,k) );
1324 const SIMDType a2( A.load(i1,k) );
1325 const SIMDType b1(
set( B(k,j ) ) );
1326 const SIMDType b2(
set( B(k,j+1UL) ) );
1327 const SIMDType b3(
set( B(k,j+2UL) ) );
1328 const SIMDType b4(
set( B(k,j+3UL) ) );
1329 xmm1 = xmm1 + a1 * b1;
1330 xmm2 = xmm2 + a2 * b1;
1331 xmm3 = xmm3 + a1 * b2;
1332 xmm4 = xmm4 + a2 * b2;
1333 xmm5 = xmm5 + a1 * b3;
1334 xmm6 = xmm6 + a2 * b3;
1335 xmm7 = xmm7 + a1 * b4;
1336 xmm8 = xmm8 + a2 * b4;
1339 (~C).store( i , j , xmm1 );
1340 (~C).store( i1, j , xmm2 );
1341 (~C).store( i , j+1UL, xmm3 );
1342 (~C).store( i1, j+1UL, xmm4 );
1343 (~C).store( i , j+2UL, xmm5 );
1344 (~C).store( i1, j+2UL, xmm6 );
1345 (~C).store( i , j+3UL, xmm7 );
1346 (~C).store( i1, j+3UL, xmm8 );
1349 for( ; (j+2UL) <= jend; j+=2UL )
1351 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1352 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1353 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1354 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1356 SIMDType xmm1( (~C).load(i ,j ) );
1357 SIMDType xmm2( (~C).load(i1,j ) );
1358 SIMDType xmm3( (~C).load(i ,j+1UL) );
1359 SIMDType xmm4( (~C).load(i1,j+1UL) );
1361 for(
size_t k=kbegin; k<kend; ++k ) {
1362 const SIMDType a1( A.load(i ,k) );
1363 const SIMDType a2( A.load(i1,k) );
1364 const SIMDType b1(
set( B(k,j ) ) );
1365 const SIMDType b2(
set( B(k,j+1UL) ) );
1366 xmm1 = xmm1 + a1 * b1;
1367 xmm2 = xmm2 + a2 * b1;
1368 xmm3 = xmm3 + a1 * b2;
1369 xmm4 = xmm4 + a2 * b2;
1372 (~C).store( i , j , xmm1 );
1373 (~C).store( i1, j , xmm2 );
1374 (~C).store( i , j+1UL, xmm3 );
1375 (~C).store( i1, j+1UL, xmm4 );
// Single leftover column of the 2*SIMDSIZE strip (its guard is not visible).
1380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1382 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1383 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1385 SIMDType xmm1( (~C).load(i ,j) );
1386 SIMDType xmm2( (~C).load(i1,j) );
1388 for(
size_t k=kbegin; k<kend; ++k ) {
1389 const SIMDType b1(
set( B(k,j) ) );
1390 xmm1 = xmm1 + A.load(i ,k) * b1;
1391 xmm2 = xmm2 + A.load(i1,k) * b1;
1394 (~C).store( i , j, xmm1 );
1395 (~C).store( i1, j, xmm2 );
// Strip of a single SIMD vector of rows, one column at a time.
1399 for( ; i<ipos; i+=SIMDSIZE )
1401 for(
size_t j=jj; j<jend; ++j )
1403 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1404 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1405 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
1406 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1408 SIMDType xmm1( (~C).load(i,j) );
1410 for(
size_t k=kbegin; k<kend; ++k ) {
1411 const SIMDType b1(
set( B(k,j) ) );
1412 xmm1 = xmm1 + A.load(i,k) * b1;
1415 (~C).store( i, j, xmm1 );
// Scalar loop over the rows between ipos and iend; only entered when "remainder" is set.
1419 for( ; remainder && i<iend; ++i )
1421 for(
size_t j=jj; j<jend; ++j )
1423 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1424 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1425 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
1426 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// Scalar accumulator starts from the current value of C (the write-back is not visible).
1428 ElementType value( (~C)(i,j) );
1430 for(
size_t k=kbegin; k<kend; ++k ) {
1431 value += A(i,k) * B(k,j);
// BLAS assignment kernel fallback used when UseBlasKernel<MT3,MT4,MT5> is false:
// it simply forwards to selectLargeAssignKernel.
1458 template<
typename MT3
1461 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
1462 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1464 selectLargeAssignKernel( C, A, B );
1470 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5> is true.
// A triangular A is multiplied from the left via trmm(), a triangular B from the right via
// trmm(); otherwise gemm() is called with the factors ET(1) and ET(0).
// NOTE(review): the statements in front of the two trmm() calls and the line introducing the
// gemm() branch are not visible in this view — presumably C is first filled with the other
// operand before trmm() is applied; confirm.
1484 template<
typename MT3
1487 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
1488 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1490 typedef ElementType_<MT3> ET;
// Triangular left operand: CblasLower or CblasUpper is chosen from IsLower<MT4>.
1492 if( IsTriangular<MT4>::value ) {
1494 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular right operand: CblasLower or CblasUpper is chosen from IsLower<MT5>.
1496 else if( IsTriangular<MT5>::value ) {
1498 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1501 gemm( C, A, B, ET(1), ET(0) );
// Overload that is disabled when CanExploitSymmetry<MT,MT1,MT2> is true.
// The visible statements evaluate rhs via serial() into a temporary of type TmpType and then
// assign that temporary to ~lhs.
// NOTE(review): the function name and parameter list are not visible in this view; the name
// "assign" is inferred from the call below. TmpType is selected by IfTrue_ from the flag SO —
// presumably ResultType when SO is true and OppositeType otherwise; confirm.
1521 template<
typename MT
1523 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1528 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
1540 const TmpType tmp(
serial( rhs ) );
1541 assign( ~lhs, tmp );
// Restructuring overload, enabled when CanExploitSymmetry<MT,MT1,MT2> is true.
// The product is rewritten with trans() applied to every symmetric operand and then assigned:
//   - both operands symmetric: trans(lhs_) * trans(rhs_)
//   - only MT1 symmetric:      trans(lhs_) * rhs_
//   - otherwise:               lhs_ * trans(rhs_)
// NOTE(review): the function name and parameter list are not visible in this view; the name
// "assign" is inferred from the calls below. The line introducing the last branch is not
// visible either.
1561 template<
typename MT >
1562 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
1572 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1573 assign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
1574 else if( IsSymmetric<MT1>::value )
1575 assign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
1577 assign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Overload that is disabled when CanExploitSymmetry<MT,MT1,MT2> is true; it evaluates both
// operands and hands the work to selectAddAssignKernel.
// NOTE(review): the function name and parameter list are not visible in this view; the name
// "addAssign" is inferred from the selectAddAssignKernel call below — confirm.
1595 template<
typename MT
1597 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// A single guard covers an empty target and an inner dimension of zero (its body is not visible).
1605 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Both operands are evaluated via serial() into the local operands A and B.
1609 LT A(
serial( rhs.lhs_ ) );
1610 RT B(
serial( rhs.rhs_ ) );
1619 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for the addition assignment C += A * B.
// selectSmallAddAssignKernel is used when MT4 is diagonal or when C has fewer than
// TDMATTDMATMULT_THRESHOLD elements (rows * columns); the other visible call is
// selectBlasAddAssignKernel.
// NOTE(review): the line separating the two calls is not visible in this view.
1635 template<
typename MT3
1638 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1640 if( ( IsDiagonal<MT4>::value ) ||
1641 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1642 selectSmallAddAssignKernel( C, A, B );
1644 selectBlasAddAssignKernel( C, A, B );
// Default (non-vectorized) addition assignment kernel C += A * B, enabled when neither MT4 nor
// MT5 is a diagonal matrix type. For every column j and every k in the structure-restricted
// range, the rows are processed two at a time, followed by one trailing row at index ipos.
// NOTE(review): the else-branches of the range expressions and the guard in front of the
// trailing-row statement are not visible in this view.
1663 template<
typename MT3
1666 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
1667 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1669 const size_t M( A.rows() );
1670 const size_t N( B.columns() );
1671 const size_t K( A.columns() );
1673 for(
size_t j=0UL; j<N; ++j )
// Range of k for column j, restricted when MT5 is lower or upper.
1675 const size_t kbegin( ( IsLower<MT5>::value )
1676 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1678 const size_t kend( ( IsUpper<MT5>::value )
1679 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1683 for(
size_t k=kbegin; k<kend; ++k )
// Range of rows for this k, restricted when MT4 is lower or upper.
1685 const size_t ibegin( ( IsLower<MT4>::value )
1686 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
1688 const size_t iend( ( IsUpper<MT4>::value )
1689 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos is ibegin plus the row count with its lowest bit cleared, i.e. the end of the paired part.
1693 const size_t inum( iend - ibegin );
1694 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1696 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1697 C(i ,j) += A(i ,k) * B(k,j);
1698 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Trailing row of an odd-sized range.
1701 C(ipos,j) += A(ipos,k) * B(k,j);
// Default addition assignment kernel C += A * B for a non-diagonal MT4 and a diagonal MT5.
// Each visible product uses only the diagonal element B(j,j); rows are processed two at a
// time, followed by one trailing row at index ipos.
// NOTE(review): the else-branches of the range expressions and the guard in front of the
// trailing-row statement are not visible in this view.
1723 template<
typename MT3
1726 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
1727 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1731 const size_t M( A.rows() );
1732 const size_t N( B.columns() );
1734 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, restricted by the lower/upper property of MT4.
1736 const size_t ibegin( ( IsLower<MT4>::value )
1737 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1739 const size_t iend( ( IsUpper<MT4>::value )
1740 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos is ibegin plus the row count with its lowest bit cleared, i.e. the end of the paired part.
1744 const size_t inum( iend - ibegin );
1745 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1747 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1748 C(i ,j) += A(i ,j) * B(j,j);
1749 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Trailing row of an odd-sized range.
1752 C(ipos,j) += A(ipos,j) * B(j,j);
// Default addition assignment kernel C += A * B for a diagonal MT4 and a non-diagonal MT5.
// Each visible product uses only the diagonal element A(i,i); rows are processed two at a
// time, followed by one trailing row at index ipos.
// NOTE(review): the else-branches of the range expressions and the guard in front of the
// trailing-row statement are not visible in this view.
1773 template<
typename MT3
1776 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
1777 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1781 const size_t M( A.rows() );
1782 const size_t N( B.columns() );
1784 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, restricted by the lower/upper property of MT5.
1786 const size_t ibegin( ( IsLower<MT5>::value )
1787 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1789 const size_t iend( ( IsUpper<MT5>::value )
1790 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos is ibegin plus the row count with its lowest bit cleared, i.e. the end of the paired part.
1794 const size_t inum( iend - ibegin );
1795 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1797 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1798 C(i ,j) += A(i ,i ) * B(i ,j);
1799 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Trailing row of an odd-sized range.
1802 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition assignment kernel C += A * B for the case that both MT4 and MT5 are
// diagonal: only the diagonal of C is updated, C(i,i) += A(i,i) * B(i,i).
1823 template<
typename MT3
1826 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
1827 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1831 for(
size_t i=0UL; i<A.rows(); ++i ) {
1832 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// is false: it simply forwards to selectDefaultAddAssignKernel.
1852 template<
typename MT3
1855 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1856 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1858 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition assignment kernel for a target of type
// DenseMatrix<MT3,false>. Instead of computing the product directly, one operand is converted
// to its OppositeType (evaluated via serial()) and the resulting product expression is
// add-assigned to ~C.
// Which operand is converted:
//   - B if only MT4 is resizable,
//   - A if only MT5 is resizable,
//   - otherwise B if it has no more elements than A, else A.
// NOTE(review): the line introducing the final branch is not visible in this view.
1878 template<
typename MT3
1881 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1882 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1889 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1890 const OppositeType_<MT5> tmp(
serial( B ) );
1891 addAssign( ~C, A * tmp );
1893 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1894 const OppositeType_<MT4> tmp(
serial( A ) );
1895 addAssign( ~C, tmp * B );
// Both or neither resizable: convert the operand with fewer elements (B on a tie).
1897 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1898 const OppositeType_<MT5> tmp(
serial( B ) );
1899 addAssign( ~C, A * tmp );
1902 const OppositeType_<MT4> tmp(
serial( A ) );
1903 addAssign( ~C, tmp * B );
// Vectorized small-matrix addition-assignment kernel (C += A * B) for a target
// passed as DenseMatrix<MT3,true>; enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// Rows of C and A are accessed SIMDSIZE elements at a time through
// load()/store(); each scalar B(k,j) is turned into a SIMDType with set()
// (presumably a broadcast -- confirm against the SIMD helpers). The row index
// i is shared by all row loops (their init clauses are empty), so the tiers
// below consume the rows in order: 8, 4, 2, then 1 SIMD vector(s) per step,
// followed by a scalar loop for rows at and above ipos.
//
// kbegin/kend narrow the summation range over k depending on the compile-time
// IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of MT4 and MT5
// (presumably to skip structurally zero elements).
//
// C: target matrix, updated in place. A: left operand. B: right operand.
1924 template<
typename MT3
1927 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1928 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1930 const size_t M( A.rows() );
1931 const size_t N( B.columns() );
1932 const size_t K( A.columns() );
// A scalar tail over the rows is needed unless both C and A are padded.
1934 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a tail, the SIMD part stops at M with its low bits cleared, which is M
// rounded down to a multiple of SIMDSIZE when SIMDSIZE is a power of two.
1936 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier 1: 8 SIMD vectors of rows, one column of C at a time.
1941 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1942 for(
size_t j=0UL; j<N; ++j )
1944 const size_t kbegin( ( IsLower<MT5>::value )
1945 ?( ( IsUpper<MT4>::value )
1946 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1947 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1948 :( IsUpper<MT4>::value ? i : 0UL ) );
1949 const size_t kend( ( IsUpper<MT5>::value )
1950 ?( ( IsLower<MT4>::value )
1951 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1952 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1953 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// Accumulators start from the current contents of C (this is an add-assign).
1955 SIMDType xmm1( (~C).load(i ,j) );
1956 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
1957 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
1958 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
1959 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
1960 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
1961 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
1962 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
1964 for(
size_t k=kbegin; k<kend; ++k ) {
1965 const SIMDType b1(
set( B(k,j) ) );
1966 xmm1 = xmm1 + A.load(i ,k) * b1;
1967 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1968 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1969 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1970 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
1971 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
1972 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
1973 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
1976 (~C).store( i , j, xmm1 );
1977 (~C).store( i+SIMDSIZE , j, xmm2 );
1978 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1979 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1980 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1981 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1982 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1983 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier 2: 4 SIMD vectors of rows; columns are taken two at a time first.
1987 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1991 for( ; (j+2UL) <= N; j+=2UL )
1993 const size_t kbegin( ( IsLower<MT5>::value )
1994 ?( ( IsUpper<MT4>::value )
1995 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1996 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1997 :( IsUpper<MT4>::value ? i : 0UL ) );
1998 const size_t kend( ( IsUpper<MT5>::value )
1999 ?( ( IsLower<MT4>::value )
2000 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2001 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2002 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
2004 SIMDType xmm1( (~C).load(i ,j ) );
2005 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
2006 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
2007 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
2008 SIMDType xmm5( (~C).load(i ,j+1UL) );
2009 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
2010 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
2011 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
2013 for(
size_t k=kbegin; k<kend; ++k ) {
2014 const SIMDType a1( A.load(i ,k) );
2015 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
2016 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
2017 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
2018 const SIMDType b1(
set( B(k,j ) ) );
2019 const SIMDType b2(
set( B(k,j+1UL) ) );
2020 xmm1 = xmm1 + a1 * b1;
2021 xmm2 = xmm2 + a2 * b1;
2022 xmm3 = xmm3 + a3 * b1;
2023 xmm4 = xmm4 + a4 * b1;
2024 xmm5 = xmm5 + a1 * b2;
2025 xmm6 = xmm6 + a2 * b2;
2026 xmm7 = xmm7 + a3 * b2;
2027 xmm8 = xmm8 + a4 * b2;
2030 (~C).store( i , j , xmm1 );
2031 (~C).store( i+SIMDSIZE , j , xmm2 );
2032 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
2033 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
2034 (~C).store( i , j+1UL, xmm5 );
2035 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
2036 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
2037 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Tier 2, single-column variant (the guarding condition is not part of this
// listing). Here kend depends only on MT4.
2042 const size_t kbegin( ( IsLower<MT5>::value )
2043 ?( ( IsUpper<MT4>::value )
2044 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2045 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2046 :( IsUpper<MT4>::value ? i : 0UL ) );
2047 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
2049 SIMDType xmm1( (~C).load(i ,j) );
2050 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
2051 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
2052 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
2054 for(
size_t k=kbegin; k<kend; ++k ) {
2055 const SIMDType b1(
set( B(k,j) ) );
2056 xmm1 = xmm1 + A.load(i ,k) * b1;
2057 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
2058 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
2059 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
2062 (~C).store( i , j, xmm1 );
2063 (~C).store( i+SIMDSIZE , j, xmm2 );
2064 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
2065 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier 3: 2 SIMD vectors of rows; columns two at a time first.
2069 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2073 for( ; (j+2UL) <= N; j+=2UL )
2075 const size_t kbegin( ( IsLower<MT5>::value )
2076 ?( ( IsUpper<MT4>::value )
2077 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2078 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2079 :( IsUpper<MT4>::value ? i : 0UL ) );
2080 const size_t kend( ( IsUpper<MT5>::value )
2081 ?( ( IsLower<MT4>::value )
2082 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2083 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2084 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
2086 SIMDType xmm1( (~C).load(i ,j ) );
2087 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
2088 SIMDType xmm3( (~C).load(i ,j+1UL) );
2089 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
2091 for(
size_t k=kbegin; k<kend; ++k ) {
2092 const SIMDType a1( A.load(i ,k) );
2093 const SIMDType a2( A.load(i+SIMDSIZE,k) );
2094 const SIMDType b1(
set( B(k,j ) ) );
2095 const SIMDType b2(
set( B(k,j+1UL) ) );
2096 xmm1 = xmm1 + a1 * b1;
2097 xmm2 = xmm2 + a2 * b1;
2098 xmm3 = xmm3 + a1 * b2;
2099 xmm4 = xmm4 + a2 * b2;
2102 (~C).store( i , j , xmm1 );
2103 (~C).store( i+SIMDSIZE, j , xmm2 );
2104 (~C).store( i , j+1UL, xmm3 );
2105 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Tier 3, single-column variant (guarding condition not part of this listing).
2110 const size_t kbegin( ( IsLower<MT5>::value )
2111 ?( ( IsUpper<MT4>::value )
2112 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2113 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2114 :( IsUpper<MT4>::value ? i : 0UL ) );
2115 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
2117 SIMDType xmm1( (~C).load(i ,j) );
2118 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
2120 for(
size_t k=kbegin; k<kend; ++k ) {
2121 const SIMDType b1(
set( B(k,j) ) );
2122 xmm1 = xmm1 + A.load(i ,k) * b1;
2123 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
2126 (~C).store( i , j, xmm1 );
2127 (~C).store( i+SIMDSIZE, j, xmm2 );
// Tier 4: a single SIMD vector of rows; columns two at a time first.
2131 for( ; i<ipos; i+=SIMDSIZE )
2135 for( ; (j+2UL) <= N; j+=2UL )
2137 const size_t kbegin( ( IsLower<MT5>::value )
2138 ?( ( IsUpper<MT4>::value )
2139 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2140 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2141 :( IsUpper<MT4>::value ? i : 0UL ) );
2142 const size_t kend( ( IsUpper<MT5>::value )
2143 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2146 SIMDType xmm1( (~C).load(i,j ) );
2147 SIMDType xmm2( (~C).load(i,j+1UL) );
2149 for(
size_t k=kbegin; k<kend; ++k ) {
2150 const SIMDType a1( A.load(i,k) );
2151 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2152 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2155 (~C).store( i, j , xmm1 );
2156 (~C).store( i, j+1UL, xmm2 );
// Tier 4, single-column variant; the k loop runs up to K without a kend bound.
2161 const size_t kbegin( ( IsLower<MT5>::value )
2162 ?( ( IsUpper<MT4>::value )
2163 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2164 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2165 :( IsUpper<MT4>::value ? i : 0UL ) );
2167 SIMDType xmm1( (~C).load(i,j) );
2169 for(
size_t k=kbegin; k<K; ++k ) {
2170 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
2173 (~C).store( i, j, xmm1 );
// Scalar tail: rows from ipos up to M, processed element-wise and only when
// 'remainder' is set; columns two at a time first.
2177 for( ; remainder && i<M; ++i )
2181 for( ; (j+2UL) <= N; j+=2UL )
2183 const size_t kbegin( ( IsLower<MT5>::value )
2184 ?( ( IsUpper<MT4>::value )
2185 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2186 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2187 :( IsUpper<MT4>::value ? i : 0UL ) );
2188 const size_t kend( ( IsUpper<MT5>::value )
2189 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2192 ElementType value1( (~C)(i,j ) );
2193 ElementType value2( (~C)(i,j+1UL) );
2195 for(
size_t k=kbegin; k<kend; ++k ) {
2196 value1 += A(i,k) * B(k,j );
2197 value2 += A(i,k) * B(k,j+1UL);
2200 (~C)(i,j ) = value1;
2201 (~C)(i,j+1UL) = value2;
// Scalar tail, single-column variant; the k loop runs up to K.
2206 const size_t kbegin( ( IsLower<MT5>::value )
2207 ?( ( IsUpper<MT4>::value )
2208 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2209 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2210 :( IsUpper<MT4>::value ? i : 0UL ) );
2212 ElementType value( (~C)(i,j) );
2214 for(
size_t k=kbegin; k<K; ++k ) {
2215 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf_).
// It forwards unchanged to the default kernel.
//
// C: target matrix, A: left operand, B: right operand.
2239 template<
typename MT3
2242 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2243 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2245 selectDefaultAddAssignKernel( C, A, B );
// Vectorized large-matrix addition-assignment kernel for a target passed as
// DenseMatrix<MT3,false>. There is no dedicated large-matrix implementation
// for this target type here: the call is forwarded to the small kernel.
//
// C: target matrix, A: left operand, B: right operand.
2265 template<
typename MT3
2268 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2269 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2271 selectSmallAddAssignKernel( ~C, A, B );
// Vectorized, blocked large-matrix addition-assignment kernel (C += A * B) for
// a target passed as DenseMatrix<MT3,true>; enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// The iteration space is tiled by three nested block loops over rows (ii),
// columns (jj) and the summation index (kk), with tile extents given by
// TDMATTDMATMULT_IBLOCK_SIZE, TDMATTDMATMULT_JBLOCK_SIZE and
// TDMATTDMATMULT_KBLOCK_SIZE. Inside a tile, every micro-kernel loads the
// affected part of C, adds the contributions for k in [kbegin,kend) and stores
// it back, so partial sums accumulate in C across successive kk tiles.
//
// kbegin/kend clamp the k range to the current kk tile and additionally narrow
// it according to the IsUpper/IsLower traits of MT4 and MT5.
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2291 template<
typename MT3
2294 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2295 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2297 const size_t M( A.rows() );
2298 const size_t N( B.columns() );
2299 const size_t K( A.columns() );
// A scalar tail over the rows is needed unless both C and A are padded.
2301 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Row tiles.
2303 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
2305 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// End of the SIMD-processed rows of this tile; the assertion below checks that
// the bit mask equals iend rounded down to a multiple of SIMDSIZE.
2307 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
2308 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Column tiles.
2310 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
2312 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Summation-index tiles; ktmp is the exclusive end of the current k tile.
2314 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
2316 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Micro-kernel A: 4 SIMD vectors of rows (i, i1, i2, i3), two columns at a time.
2320 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2322 const size_t i1( i+SIMDSIZE );
2323 const size_t i2( i+SIMDSIZE*2UL );
2324 const size_t i3( i+SIMDSIZE*3UL );
2328 for( ; (j+2UL) <= jend; j+=2UL )
2330 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2331 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2332 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
2333 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
2335 SIMDType xmm1( (~C).load(i ,j ) );
2336 SIMDType xmm2( (~C).load(i1,j ) );
2337 SIMDType xmm3( (~C).load(i2,j ) );
2338 SIMDType xmm4( (~C).load(i3,j ) );
2339 SIMDType xmm5( (~C).load(i ,j+1UL) );
2340 SIMDType xmm6( (~C).load(i1,j+1UL) );
2341 SIMDType xmm7( (~C).load(i2,j+1UL) );
2342 SIMDType xmm8( (~C).load(i3,j+1UL) );
2344 for(
size_t k=kbegin; k<kend; ++k ) {
2345 const SIMDType a1( A.load(i ,k) );
2346 const SIMDType a2( A.load(i1,k) );
2347 const SIMDType a3( A.load(i2,k) );
2348 const SIMDType a4( A.load(i3,k) );
2349 const SIMDType b1(
set( B(k,j ) ) );
2350 const SIMDType b2(
set( B(k,j+1UL) ) );
2351 xmm1 = xmm1 + a1 * b1;
2352 xmm2 = xmm2 + a2 * b1;
2353 xmm3 = xmm3 + a3 * b1;
2354 xmm4 = xmm4 + a4 * b1;
2355 xmm5 = xmm5 + a1 * b2;
2356 xmm6 = xmm6 + a2 * b2;
2357 xmm7 = xmm7 + a3 * b2;
2358 xmm8 = xmm8 + a4 * b2;
2361 (~C).store( i , j , xmm1 );
2362 (~C).store( i1, j , xmm2 );
2363 (~C).store( i2, j , xmm3 );
2364 (~C).store( i3, j , xmm4 );
2365 (~C).store( i , j+1UL, xmm5 );
2366 (~C).store( i1, j+1UL, xmm6 );
2367 (~C).store( i2, j+1UL, xmm7 );
2368 (~C).store( i3, j+1UL, xmm8 );
// Micro-kernel A, single-column variant (guarding condition not part of this
// listing).
2373 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2374 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2375 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
2376 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2378 SIMDType xmm1( (~C).load(i ,j) );
2379 SIMDType xmm2( (~C).load(i1,j) );
2380 SIMDType xmm3( (~C).load(i2,j) );
2381 SIMDType xmm4( (~C).load(i3,j) );
2383 for(
size_t k=kbegin; k<kend; ++k ) {
2384 const SIMDType b1(
set( B(k,j) ) );
2385 xmm1 = xmm1 + A.load(i ,k) * b1;
2386 xmm2 = xmm2 + A.load(i1,k) * b1;
2387 xmm3 = xmm3 + A.load(i2,k) * b1;
2388 xmm4 = xmm4 + A.load(i3,k) * b1;
2391 (~C).store( i , j, xmm1 );
2392 (~C).store( i1, j, xmm2 );
2393 (~C).store( i2, j, xmm3 );
2394 (~C).store( i3, j, xmm4 );
// Micro-kernel B: 2 SIMD vectors of rows (i, i1); columns four at a time,
// then two at a time, then a single column.
2398 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2400 const size_t i1( i+SIMDSIZE );
2404 for( ; (j+4UL) <= jend; j+=4UL )
2406 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2407 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2408 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2409 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2411 SIMDType xmm1( (~C).load(i ,j ) );
2412 SIMDType xmm2( (~C).load(i1,j ) );
2413 SIMDType xmm3( (~C).load(i ,j+1UL) );
2414 SIMDType xmm4( (~C).load(i1,j+1UL) );
2415 SIMDType xmm5( (~C).load(i ,j+2UL) );
2416 SIMDType xmm6( (~C).load(i1,j+2UL) );
2417 SIMDType xmm7( (~C).load(i ,j+3UL) );
2418 SIMDType xmm8( (~C).load(i1,j+3UL) );
2420 for(
size_t k=kbegin; k<kend; ++k ) {
2421 const SIMDType a1( A.load(i ,k) );
2422 const SIMDType a2( A.load(i1,k) );
2423 const SIMDType b1(
set( B(k,j ) ) );
2424 const SIMDType b2(
set( B(k,j+1UL) ) );
2425 const SIMDType b3(
set( B(k,j+2UL) ) );
2426 const SIMDType b4(
set( B(k,j+3UL) ) );
2427 xmm1 = xmm1 + a1 * b1;
2428 xmm2 = xmm2 + a2 * b1;
2429 xmm3 = xmm3 + a1 * b2;
2430 xmm4 = xmm4 + a2 * b2;
2431 xmm5 = xmm5 + a1 * b3;
2432 xmm6 = xmm6 + a2 * b3;
2433 xmm7 = xmm7 + a1 * b4;
2434 xmm8 = xmm8 + a2 * b4;
2437 (~C).store( i , j , xmm1 );
2438 (~C).store( i1, j , xmm2 );
2439 (~C).store( i , j+1UL, xmm3 );
2440 (~C).store( i1, j+1UL, xmm4 );
2441 (~C).store( i , j+2UL, xmm5 );
2442 (~C).store( i1, j+2UL, xmm6 );
2443 (~C).store( i , j+3UL, xmm7 );
2444 (~C).store( i1, j+3UL, xmm8 );
// Micro-kernel B, two columns at a time.
2447 for( ; (j+2UL) <= jend; j+=2UL )
2449 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2450 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2451 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2452 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2454 SIMDType xmm1( (~C).load(i ,j ) );
2455 SIMDType xmm2( (~C).load(i1,j ) );
2456 SIMDType xmm3( (~C).load(i ,j+1UL) );
2457 SIMDType xmm4( (~C).load(i1,j+1UL) );
2459 for(
size_t k=kbegin; k<kend; ++k ) {
2460 const SIMDType a1( A.load(i ,k) );
2461 const SIMDType a2( A.load(i1,k) );
2462 const SIMDType b1(
set( B(k,j ) ) );
2463 const SIMDType b2(
set( B(k,j+1UL) ) );
2464 xmm1 = xmm1 + a1 * b1;
2465 xmm2 = xmm2 + a2 * b1;
2466 xmm3 = xmm3 + a1 * b2;
2467 xmm4 = xmm4 + a2 * b2;
2470 (~C).store( i , j , xmm1 );
2471 (~C).store( i1, j , xmm2 );
2472 (~C).store( i , j+1UL, xmm3 );
2473 (~C).store( i1, j+1UL, xmm4 );
// Micro-kernel B, single-column variant (guarding condition not part of this
// listing).
2478 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2479 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2480 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2481 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2483 SIMDType xmm1( (~C).load(i ,j) );
2484 SIMDType xmm2( (~C).load(i1,j) );
2486 for(
size_t k=kbegin; k<kend; ++k ) {
2487 const SIMDType b1(
set( B(k,j) ) );
2488 xmm1 = xmm1 + A.load(i ,k) * b1;
2489 xmm2 = xmm2 + A.load(i1,k) * b1;
2492 (~C).store( i , j, xmm1 );
2493 (~C).store( i1, j, xmm2 );
// Micro-kernel C: a single SIMD vector of rows, one column at a time.
2497 for( ; i<ipos; i+=SIMDSIZE )
2499 for(
size_t j=jj; j<jend; ++j )
2501 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2502 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2503 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
2504 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2506 SIMDType xmm1( (~C).load(i,j) );
2508 for(
size_t k=kbegin; k<kend; ++k ) {
2509 const SIMDType b1(
set( B(k,j) ) );
2510 xmm1 = xmm1 + A.load(i,k) * b1;
2513 (~C).store( i, j, xmm1 );
// Scalar tail: rows from ipos up to iend of this tile, element-wise, only when
// 'remainder' is set.
2517 for( ; remainder && i<iend; ++i )
2519 for(
size_t j=jj; j<jend; ++j )
2521 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2522 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2523 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
2524 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2526 ElementType value( (~C)(i,j) );
2528 for(
size_t k=kbegin; k<kend; ++k ) {
2529 value += A(i,k) * B(k,j);
// BLAS-path addition-assignment kernel used when UseBlasKernel<MT3,MT4,MT5>
// does NOT hold (DisableIf_). It forwards to the large-matrix kernel.
//
// C: target matrix, A: left operand, B: right operand.
2556 template<
typename MT3
2559 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2560 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2562 selectLargeAddAssignKernel( C, A, B );
2568 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel, enabled only if
// UseBlasKernel<MT3,MT4,MT5> holds. Three visible cases:
//   - A triangular: B is copied into a temporary, trmm() is applied with A on
//     the left, and the temporary is added to C.
//   - B triangular: A is copied into a temporary, trmm() is applied with B on
//     the right, and the temporary is added to C.
//   - last case: a single gemm() call with both scalar factors ET(1).
// NOTE(review): the trmm()/gemm() wrappers are defined outside this listing;
// presumably trmm overwrites tmp with the triangular product and gemm computes
// C = 1*A*B + 1*C -- confirm against the wrapper documentation.
//
// C: target matrix, A: left operand, B: right operand.
2582 template<
typename MT3
2585 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2586 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Element type of the target; used to build the scalar factors below.
2588 typedef ElementType_<MT3> ET;
// Triangular left operand.
2590 if( IsTriangular<MT4>::value ) {
2591 ResultType_<MT3> tmp(
serial( B ) );
2592 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2593 addAssign( C, tmp );
// Triangular right operand.
2595 else if( IsTriangular<MT5>::value ) {
2596 ResultType_<MT3> tmp(
serial( A ) );
2597 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2598 addAssign( C, tmp );
// Neither operand is triangular.
2601 gemm( C, A, B, ET(1), ET(1) );
// Addition assignment that restructures the product by exploiting symmetry;
// enabled only if CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the line carrying the function name and parameters is not
// part of this listing; judging by the calls in the body this is the
// addAssign() friend taking 'lhs' (target) and 'rhs' (this expression).
//
// The symmetric operand(s) are wrapped in trans() and the resulting product
// expression is passed on to addAssign():
//   - both operands symmetric: trans(lhs_) * trans(rhs_)
//   - only the left operand symmetric: trans(lhs_) * rhs_
//   - remaining case: lhs_ * trans(rhs_)
2623 template<
typename MT >
2624 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
2634 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2635 addAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
2636 else if( IsSymmetric<MT1>::value )
2637 addAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
2639 addAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Subtraction assignment of the matrix product to a dense target; enabled only
// if CanExploitSymmetry<MT,MT1,MT2> does NOT hold (DisableIf_).
// NOTE(review): the line carrying the function name and parameters is not
// part of this listing; judging by the kernel it dispatches to, this is the
// subAssign() friend taking 'lhs' (target) and 'rhs' (this expression).
2661 template<
typename MT
2663 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// Degenerate sizes: the target has no rows or columns, or the inner dimension
// is zero (the statement executed in this branch is not part of this listing).
2671 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Evaluate both operands serially into local operands A and B.
2675 LT A(
serial( rhs.lhs_ ) );
2676 RT B(
serial( rhs.rhs_ ) );
// Hand over to the kernel selection for the actual computation.
2685 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Entry point of the kernel selection for the subtraction assignment
// (C -= A * B).
//
// The small kernel is chosen if the left operand type MT4 is diagonal or if
// the number of elements of C is below TDMATTDMATMULT_THRESHOLD; the BLAS
// kernel selection is the alternative (the 'else' line itself is not part of
// this listing).
//
// C: target matrix, A: left operand, B: right operand.
2701 template<
typename MT3
2704 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2706 if( ( IsDiagonal<MT4>::value ) ||
2707 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2708 selectSmallSubAssignKernel( C, A, B );
2710 selectBlasSubAssignKernel( C, A, B );
// Default (non-vectorized) subtraction-assignment kernel (C -= A * B) for the
// case that neither operand is diagonal.
//
// Loop order is j (columns of B/C), then k (summation index), then i (rows of
// A/C). The k range is limited by the lower/upper traits of MT5 and the i
// range by the lower/upper traits of MT4; the fallback values of these ranges
// are on lines that are not part of this listing.
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2729 template<
typename MT3
2732 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2733 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2735 const size_t M( A.rows() );
2736 const size_t N( B.columns() );
2737 const size_t K( A.columns() );
2739 for(
size_t j=0UL; j<N; ++j )
// For lower B, column j starts at row j (j+1 if strictly lower).
2741 const size_t kbegin( ( IsLower<MT5>::value )
2742 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// For upper B, column j ends after row j (before row j if strictly upper).
2744 const size_t kend( ( IsUpper<MT5>::value )
2745 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2749 for(
size_t k=kbegin; k<kend; ++k )
// Same idea for the rows of column k of A.
2751 const size_t ibegin( ( IsLower<MT4>::value )
2752 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2754 const size_t iend( ( IsUpper<MT4>::value )
2755 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// The row range is processed in pairs; ipos marks the end of the even-sized
// part (inum with its lowest bit cleared).
2759 const size_t inum( iend - ibegin );
2760 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2762 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2763 C(i ,j) -= A(i ,k) * B(k,j);
2764 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Update for the row at ipos (the guard for this statement is not part of
// this listing).
2767 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel (C -= A * B) for a non-diagonal left
// operand and a diagonal right operand.
//
// Only B(j,j) is read from B, so column j of C is reduced by column j of A
// scaled with B(j,j). The row range is limited by the lower/upper traits of
// MT4; the fallback values are on lines that are not part of this listing.
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2789 template<
typename MT3
2792 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2793 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2797 const size_t M( A.rows() );
2798 const size_t N( B.columns() );
2800 for(
size_t j=0UL; j<N; ++j )
2802 const size_t ibegin( ( IsLower<MT4>::value )
2803 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2805 const size_t iend( ( IsUpper<MT4>::value )
2806 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
2810 const size_t inum( iend - ibegin );
2811 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2813 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2814 C(i ,j) -= A(i ,j) * B(j,j);
2815 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Update for the row at ipos (guard not part of this listing).
2818 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel (C -= A * B) for a diagonal left
// operand and a non-diagonal right operand.
//
// Only A(i,i) is read from A, so row i of C is reduced by row i of B scaled
// with A(i,i). The row range per column is limited by the lower/upper traits
// of MT5; the fallback values are on lines that are not part of this listing.
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2839 template<
typename MT3
2842 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2843 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2847 const size_t M( A.rows() );
2848 const size_t N( B.columns() );
2850 for(
size_t j=0UL; j<N; ++j )
2852 const size_t ibegin( ( IsLower<MT5>::value )
2853 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2855 const size_t iend( ( IsUpper<MT5>::value )
2856 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
2860 const size_t inum( iend - ibegin );
2861 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2863 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2864 C(i ,j) -= A(i ,i ) * B(i ,j);
2865 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Update for the row at ipos (guard not part of this listing).
2868 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel (C -= A * B) for the case that both
// operands are diagonal; enabled only if IsDiagonal<MT4> and IsDiagonal<MT5>
// both hold.
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2889 template<
typename MT3
2892 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2893 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Only the diagonal positions (i,i) of C are updated, once per row of A.
2897 for(
size_t i=0UL; i<A.rows(); ++i ) {
2898 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf_).
// It forwards unchanged to the default kernel.
//
// C: target matrix, A: left operand, B: right operand.
2918 template<
typename MT3
2921 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2922 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2924 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel for a target passed as
// DenseMatrix<MT3,false> (presumably the row-major case -- confirm against the
// DenseMatrix declaration). Enabled only if UseVectorizedDefaultKernel holds.
//
// Exactly one operand is copied into its OppositeType_ (evaluated via
// serial()) and the resulting product expression is handed to subAssign().
// The four visible cases pick the operand to convert:
//   1. A resizable, B not resizable  -> convert B
//   2. A not resizable, B resizable  -> convert A
//   3. B has at most as many elements as A -> convert B
//   4. remaining case                -> convert A
//
// C: target matrix, A: left operand, B: right operand.
2944 template<
typename MT3
2947 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2948 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Case 1: only A is resizable, so the copy is made of B.
2955 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2956 const OppositeType_<MT5> tmp(
serial( B ) );
2957 subAssign( ~C, A * tmp );
// Case 2: only B is resizable, so the copy is made of A.
2959 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2960 const OppositeType_<MT4> tmp(
serial( A ) );
2961 subAssign( ~C, tmp * B );
// Case 3: tie on resizability; copy B when it has no more elements than A.
2963 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2964 const OppositeType_<MT5> tmp(
serial( B ) );
2965 subAssign( ~C, A * tmp );
// Case 4: copy A.
2968 const OppositeType_<MT4> tmp(
serial( A ) );
2969 subAssign( ~C, tmp * B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B) for a
// target passed as DenseMatrix<MT3,true>; enabled only if
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// Rows of C and A are accessed SIMDSIZE elements at a time through
// load()/store(); each scalar B(k,j) is turned into a SIMDType with set()
// (presumably a broadcast -- confirm against the SIMD helpers). The row index
// i is shared by all row loops (their init clauses are empty), so the tiers
// below consume the rows in order: 8, 4, 2, then 1 SIMD vector(s) per step,
// followed by a scalar loop for rows at and above ipos.
//
// kbegin/kend narrow the summation range over k depending on the compile-time
// IsLower/IsUpper/IsStrictlyLower/IsStrictlyUpper traits of MT4 and MT5
// (presumably to skip structurally zero elements).
//
// C: target matrix, updated in place. A: left operand. B: right operand.
2990 template<
typename MT3
2993 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2994 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2996 const size_t M( A.rows() );
2997 const size_t N( B.columns() );
2998 const size_t K( A.columns() );
// A scalar tail over the rows is needed unless both C and A are padded.
3000 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a tail, the SIMD part stops at M with its low bits cleared, which is M
// rounded down to a multiple of SIMDSIZE when SIMDSIZE is a power of two.
3002 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier 1: 8 SIMD vectors of rows, one column of C at a time.
3007 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3008 for(
size_t j=0UL; j<N; ++j )
3010 const size_t kbegin( ( IsLower<MT5>::value )
3011 ?( ( IsUpper<MT4>::value )
3012 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3013 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3014 :( IsUpper<MT4>::value ? i : 0UL ) );
3015 const size_t kend( ( IsUpper<MT5>::value )
3016 ?( ( IsLower<MT4>::value )
3017 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3018 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3019 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// Accumulators start from the current contents of C (this is a sub-assign).
3021 SIMDType xmm1( (~C).load(i ,j) );
3022 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3023 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3024 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3025 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3026 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3027 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3028 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3030 for(
size_t k=kbegin; k<kend; ++k ) {
3031 const SIMDType b1(
set( B(k,j) ) );
3032 xmm1 = xmm1 - A.load(i ,k) * b1;
3033 xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
3034 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
3035 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
3036 xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,k) * b1;
3037 xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,k) * b1;
3038 xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,k) * b1;
3039 xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,k) * b1;
3042 (~C).store( i , j, xmm1 );
3043 (~C).store( i+SIMDSIZE , j, xmm2 );
3044 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3045 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3046 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3047 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3048 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3049 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier 2: 4 SIMD vectors of rows; columns are taken two at a time first.
3053 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3057 for( ; (j+2UL) <= N; j+=2UL )
3059 const size_t kbegin( ( IsLower<MT5>::value )
3060 ?( ( IsUpper<MT4>::value )
3061 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3062 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3063 :( IsUpper<MT4>::value ? i : 0UL ) );
3064 const size_t kend( ( IsUpper<MT5>::value )
3065 ?( ( IsLower<MT4>::value )
3066 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3067 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3068 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
3070 SIMDType xmm1( (~C).load(i ,j ) );
3071 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3072 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3073 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3074 SIMDType xmm5( (~C).load(i ,j+1UL) );
3075 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3076 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3077 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3079 for(
size_t k=kbegin; k<kend; ++k ) {
3080 const SIMDType a1( A.load(i ,k) );
3081 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3082 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3083 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3084 const SIMDType b1(
set( B(k,j ) ) );
3085 const SIMDType b2(
set( B(k,j+1UL) ) );
3086 xmm1 = xmm1 - a1 * b1;
3087 xmm2 = xmm2 - a2 * b1;
3088 xmm3 = xmm3 - a3 * b1;
3089 xmm4 = xmm4 - a4 * b1;
3090 xmm5 = xmm5 - a1 * b2;
3091 xmm6 = xmm6 - a2 * b2;
3092 xmm7 = xmm7 - a3 * b2;
3093 xmm8 = xmm8 - a4 * b2;
3096 (~C).store( i , j , xmm1 );
3097 (~C).store( i+SIMDSIZE , j , xmm2 );
3098 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3099 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3100 (~C).store( i , j+1UL, xmm5 );
3101 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3102 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3103 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Tier 2, single-column variant (the guarding condition is not part of this
// listing). Here kend depends only on MT4.
3108 const size_t kbegin( ( IsLower<MT5>::value )
3109 ?( ( IsUpper<MT4>::value )
3110 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3111 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3112 :( IsUpper<MT4>::value ? i : 0UL ) );
3113 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
3115 SIMDType xmm1( (~C).load(i ,j) );
3116 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3117 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3118 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3120 for(
size_t k=kbegin; k<kend; ++k ) {
3121 const SIMDType b1(
set( B(k,j) ) );
3122 xmm1 = xmm1 - A.load(i ,k) * b1;
3123 xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
3124 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
3125 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
3128 (~C).store( i , j, xmm1 );
3129 (~C).store( i+SIMDSIZE , j, xmm2 );
3130 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3131 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier 3: 2 SIMD vectors of rows; columns two at a time first.
3135 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3139 for( ; (j+2UL) <= N; j+=2UL )
3141 const size_t kbegin( ( IsLower<MT5>::value )
3142 ?( ( IsUpper<MT4>::value )
3143 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3144 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3145 :( IsUpper<MT4>::value ? i : 0UL ) );
3146 const size_t kend( ( IsUpper<MT5>::value )
3147 ?( ( IsLower<MT4>::value )
3148 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3149 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3150 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
3152 SIMDType xmm1( (~C).load(i ,j ) );
3153 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3154 SIMDType xmm3( (~C).load(i ,j+1UL) );
3155 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3157 for(
size_t k=kbegin; k<kend; ++k ) {
3158 const SIMDType a1( A.load(i ,k) );
3159 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3160 const SIMDType b1(
set( B(k,j ) ) );
3161 const SIMDType b2(
set( B(k,j+1UL) ) );
3162 xmm1 = xmm1 - a1 * b1;
3163 xmm2 = xmm2 - a2 * b1;
3164 xmm3 = xmm3 - a1 * b2;
3165 xmm4 = xmm4 - a2 * b2;
3168 (~C).store( i , j , xmm1 );
3169 (~C).store( i+SIMDSIZE, j , xmm2 );
3170 (~C).store( i , j+1UL, xmm3 );
3171 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Tier 3, single-column variant (guarding condition not part of this listing).
3176 const size_t kbegin( ( IsLower<MT5>::value )
3177 ?( ( IsUpper<MT4>::value )
3178 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3179 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3180 :( IsUpper<MT4>::value ? i : 0UL ) );
3181 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
3183 SIMDType xmm1( (~C).load(i ,j) );
3184 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3186 for(
size_t k=kbegin; k<kend; ++k ) {
3187 const SIMDType b1(
set( B(k,j) ) );
3188 xmm1 = xmm1 - A.load(i ,k) * b1;
3189 xmm2 = xmm2 - A.load(i+SIMDSIZE,k) * b1;
3192 (~C).store( i , j, xmm1 );
3193 (~C).store( i+SIMDSIZE, j, xmm2 );
// Tier 4: a single SIMD vector of rows; columns two at a time first.
3197 for( ; i<ipos; i+=SIMDSIZE )
3201 for( ; (j+2UL) <= N; j+=2UL )
3203 const size_t kbegin( ( IsLower<MT5>::value )
3204 ?( ( IsUpper<MT4>::value )
3205 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3206 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3207 :( IsUpper<MT4>::value ? i : 0UL ) );
3208 const size_t kend( ( IsUpper<MT5>::value )
3209 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3212 SIMDType xmm1( (~C).load(i,j ) );
3213 SIMDType xmm2( (~C).load(i,j+1UL) );
3215 for(
size_t k=kbegin; k<kend; ++k ) {
3216 const SIMDType a1( A.load(i,k) );
3217 xmm1 = xmm1 - a1 *
set( B(k,j ) );
3218 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
3221 (~C).store( i, j , xmm1 );
3222 (~C).store( i, j+1UL, xmm2 );
// Tier 4, single-column variant; the k loop runs up to K without a kend bound.
3227 const size_t kbegin( ( IsLower<MT5>::value )
3228 ?( ( IsUpper<MT4>::value )
3229 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3230 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3231 :( IsUpper<MT4>::value ? i : 0UL ) );
3233 SIMDType xmm1( (~C).load(i,j) );
3235 for(
size_t k=kbegin; k<K; ++k ) {
3236 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
3239 (~C).store( i, j, xmm1 );
// Scalar tail: rows from ipos up to M, processed element-wise and only when
// 'remainder' is set; columns two at a time first.
3243 for( ; remainder && i<M; ++i )
3247 for( ; (j+2UL) <= N; j+=2UL )
3249 const size_t kbegin( ( IsLower<MT5>::value )
3250 ?( ( IsUpper<MT4>::value )
3251 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3252 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3253 :( IsUpper<MT4>::value ? i : 0UL ) );
3254 const size_t kend( ( IsUpper<MT5>::value )
3255 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3258 ElementType value1( (~C)(i,j ) );
3259 ElementType value2( (~C)(i,j+1UL) );
3261 for(
size_t k=kbegin; k<kend; ++k ) {
3262 value1 -= A(i,k) * B(k,j );
3263 value2 -= A(i,k) * B(k,j+1UL);
3266 (~C)(i,j ) = value1;
3267 (~C)(i,j+1UL) = value2;
// Scalar tail, single-column variant; the k loop runs up to K.
3272 const size_t kbegin( ( IsLower<MT5>::value )
3273 ?( ( IsUpper<MT4>::value )
3274 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3275 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3276 :( IsUpper<MT4>::value ? i : 0UL ) );
3278 ElementType value( (~C)(i,j) );
3280 for(
size_t k=kbegin; k<K; ++k ) {
3281 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf_).
// It forwards unchanged to the default kernel.
//
// C: target matrix, A: left operand, B: right operand.
3305 template<
typename MT3
3308 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3309 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3311 selectDefaultSubAssignKernel( C, A, B );
// Vectorized large-matrix subtraction-assignment kernel for a target passed as
// DenseMatrix<MT3,false>. There is no dedicated large-matrix implementation
// for this target type here: the call is forwarded to the small kernel.
//
// C: target matrix, A: left operand, B: right operand.
3331 template<
typename MT3
3334 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3335 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3337 selectSmallSubAssignKernel( ~C, A, B );
// Vectorized, blocked subtraction-assignment kernel (C -= A * B) for a
// column-major target (DenseMatrix<MT3,true>).
//
//  C  target matrix; accessed with SIMD load/store along i and element-wise in
//     the scalar remainder loop
//  A  left operand; accessed with SIMD loads A.load(i,k) and element-wise in the
//     scalar remainder loop
//  B  right operand; only ever accessed element-wise (B(k,j)) and broadcast via set()
//
// The iteration space is tiled with TDMATTDMATMULT_{I,J,K}BLOCK_SIZE. Inside a
// tile the rows are processed in strips of 4, 2 and 1 SIMD vectors; rows that do
// not fill a SIMD vector are handled by a scalar loop when 'remainder' is set.
3357 template<
typename MT3
3360 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3361 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3363 const size_t M( A.rows() );
3364 const size_t N( B.columns() );
3365 const size_t K( A.columns() );
// SIMD accesses run along the rows of C and A only, so the scalar remainder loop
// is required unless BOTH of them are padded. B is read element-wise and its
// padding is irrelevant (testing MT5 here would allow SIMD loads past the rows
// of an unpadded A).
3367 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Tile over the rows of C.
3369 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
3371 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// ipos: end of the SIMD-processable rows of this tile (iend rounded down to a
// multiple of SIMDSIZE when a remainder loop is needed).
3373 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3374 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Tile over the columns of C.
3376 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
3378 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Tile over the inner dimension.
3380 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
3382 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Strip of 4 SIMD vectors (rows i .. i+4*SIMDSIZE) ...
3386 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3388 const size_t i1( i+SIMDSIZE );
3389 const size_t i2( i+SIMDSIZE*2UL );
3390 const size_t i3( i+SIMDSIZE*3UL );
// ... combined with 2 columns at a time.
3394 for( ; (j+2UL) <= jend; j+=2UL )
// The k range is clipped to the current K tile and, for triangular operands,
// to the part of the range that can hold non-zero elements.
3396 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3397 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3398 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3399 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3401 SIMDType xmm1( (~C).load(i ,j ) );
3402 SIMDType xmm2( (~C).load(i1,j ) );
3403 SIMDType xmm3( (~C).load(i2,j ) );
3404 SIMDType xmm4( (~C).load(i3,j ) );
3405 SIMDType xmm5( (~C).load(i ,j+1UL) );
3406 SIMDType xmm6( (~C).load(i1,j+1UL) );
3407 SIMDType xmm7( (~C).load(i2,j+1UL) );
3408 SIMDType xmm8( (~C).load(i3,j+1UL) );
3410 for(
size_t k=kbegin; k<kend; ++k ) {
3411 const SIMDType a1( A.load(i ,k) );
3412 const SIMDType a2( A.load(i1,k) );
3413 const SIMDType a3( A.load(i2,k) );
3414 const SIMDType a4( A.load(i3,k) );
3415 const SIMDType b1(
set( B(k,j ) ) );
3416 const SIMDType b2(
set( B(k,j+1UL) ) );
3417 xmm1 = xmm1 - a1 * b1;
3418 xmm2 = xmm2 - a2 * b1;
3419 xmm3 = xmm3 - a3 * b1;
3420 xmm4 = xmm4 - a4 * b1;
3421 xmm5 = xmm5 - a1 * b2;
3422 xmm6 = xmm6 - a2 * b2;
3423 xmm7 = xmm7 - a3 * b2;
3424 xmm8 = xmm8 - a4 * b2;
3427 (~C).store( i , j , xmm1 );
3428 (~C).store( i1, j , xmm2 );
3429 (~C).store( i2, j , xmm3 );
3430 (~C).store( i3, j , xmm4 );
3431 (~C).store( i , j+1UL, xmm5 );
3432 (~C).store( i1, j+1UL, xmm6 );
3433 (~C).store( i2, j+1UL, xmm7 );
3434 (~C).store( i3, j+1UL, xmm8 );
// Single leftover column of the 4-vector strip (its guard is elided in this listing).
3439 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3440 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3441 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3442 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3444 SIMDType xmm1( (~C).load(i ,j) );
3445 SIMDType xmm2( (~C).load(i1,j) );
3446 SIMDType xmm3( (~C).load(i2,j) );
3447 SIMDType xmm4( (~C).load(i3,j) );
3449 for(
size_t k=kbegin; k<kend; ++k ) {
3450 const SIMDType b1(
set( B(k,j) ) );
3451 xmm1 = xmm1 - A.load(i ,k) * b1;
3452 xmm2 = xmm2 - A.load(i1,k) * b1;
3453 xmm3 = xmm3 - A.load(i2,k) * b1;
3454 xmm4 = xmm4 - A.load(i3,k) * b1;
3457 (~C).store( i , j, xmm1 );
3458 (~C).store( i1, j, xmm2 );
3459 (~C).store( i2, j, xmm3 );
3460 (~C).store( i3, j, xmm4 );
// Strip of 2 SIMD vectors, combined with 4, 2 and finally 1 column(s).
3464 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3466 const size_t i1( i+SIMDSIZE );
3470 for( ; (j+4UL) <= jend; j+=4UL )
3472 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3473 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3474 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3475 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3477 SIMDType xmm1( (~C).load(i ,j ) );
3478 SIMDType xmm2( (~C).load(i1,j ) );
3479 SIMDType xmm3( (~C).load(i ,j+1UL) );
3480 SIMDType xmm4( (~C).load(i1,j+1UL) );
3481 SIMDType xmm5( (~C).load(i ,j+2UL) );
3482 SIMDType xmm6( (~C).load(i1,j+2UL) );
3483 SIMDType xmm7( (~C).load(i ,j+3UL) );
3484 SIMDType xmm8( (~C).load(i1,j+3UL) );
3486 for(
size_t k=kbegin; k<kend; ++k ) {
3487 const SIMDType a1( A.load(i ,k) );
3488 const SIMDType a2( A.load(i1,k) );
3489 const SIMDType b1(
set( B(k,j ) ) );
3490 const SIMDType b2(
set( B(k,j+1UL) ) );
3491 const SIMDType b3(
set( B(k,j+2UL) ) );
3492 const SIMDType b4(
set( B(k,j+3UL) ) );
3493 xmm1 = xmm1 - a1 * b1;
3494 xmm2 = xmm2 - a2 * b1;
3495 xmm3 = xmm3 - a1 * b2;
3496 xmm4 = xmm4 - a2 * b2;
3497 xmm5 = xmm5 - a1 * b3;
3498 xmm6 = xmm6 - a2 * b3;
3499 xmm7 = xmm7 - a1 * b4;
3500 xmm8 = xmm8 - a2 * b4;
3503 (~C).store( i , j , xmm1 );
3504 (~C).store( i1, j , xmm2 );
3505 (~C).store( i , j+1UL, xmm3 );
3506 (~C).store( i1, j+1UL, xmm4 );
3507 (~C).store( i , j+2UL, xmm5 );
3508 (~C).store( i1, j+2UL, xmm6 );
3509 (~C).store( i , j+3UL, xmm7 );
3510 (~C).store( i1, j+3UL, xmm8 );
3513 for( ; (j+2UL) <= jend; j+=2UL )
3515 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3516 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3517 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3518 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3520 SIMDType xmm1( (~C).load(i ,j ) );
3521 SIMDType xmm2( (~C).load(i1,j ) );
3522 SIMDType xmm3( (~C).load(i ,j+1UL) );
3523 SIMDType xmm4( (~C).load(i1,j+1UL) );
3525 for(
size_t k=kbegin; k<kend; ++k ) {
3526 const SIMDType a1( A.load(i ,k) );
3527 const SIMDType a2( A.load(i1,k) );
3528 const SIMDType b1(
set( B(k,j ) ) );
3529 const SIMDType b2(
set( B(k,j+1UL) ) );
3530 xmm1 = xmm1 - a1 * b1;
3531 xmm2 = xmm2 - a2 * b1;
3532 xmm3 = xmm3 - a1 * b2;
3533 xmm4 = xmm4 - a2 * b2;
3536 (~C).store( i , j , xmm1 );
3537 (~C).store( i1, j , xmm2 );
3538 (~C).store( i , j+1UL, xmm3 );
3539 (~C).store( i1, j+1UL, xmm4 );
3544 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3545 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3546 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3547 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3549 SIMDType xmm1( (~C).load(i ,j) );
3550 SIMDType xmm2( (~C).load(i1,j) );
3552 for(
size_t k=kbegin; k<kend; ++k ) {
3553 const SIMDType b1(
set( B(k,j) ) );
3554 xmm1 = xmm1 - A.load(i ,k) * b1;
3555 xmm2 = xmm2 - A.load(i1,k) * b1;
3558 (~C).store( i , j, xmm1 );
3559 (~C).store( i1, j, xmm2 );
// Strip of a single SIMD vector, one column at a time.
3563 for( ; i<ipos; i+=SIMDSIZE )
3565 for(
size_t j=jj; j<jend; ++j )
3567 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3568 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3569 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
3570 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3572 SIMDType xmm1( (~C).load(i,j) );
3574 for(
size_t k=kbegin; k<kend; ++k ) {
3575 const SIMDType b1(
set( B(k,j) ) );
3576 xmm1 = xmm1 - A.load(i,k) * b1;
3579 (~C).store( i, j, xmm1 );
// Scalar remainder: rows in [ipos,iend) that do not fill a SIMD vector. Only
// executed when C or A is unpadded.
3583 for( ; remainder && i<iend; ++i )
3585 for(
size_t j=jj; j<jend; ++j )
3587 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3588 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3589 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
3590 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3592 ElementType value( (~C)(i,j) );
3594 for(
size_t k=kbegin; k<kend; ++k ) {
3595 value -= A(i,k) * B(k,j);
// Fallback overload of the BLAS-based subtraction-assignment kernel: selected
// when UseBlasKernel<MT3,MT4,MT5> does not hold (DisableIf_).
3622 template<
typename MT3
3625 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3626 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// BLAS cannot be used for this operand combination; use the large kernel instead.
3628 selectLargeSubAssignKernel( C, A, B );
3634 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel (C -= A * B), enabled when
// UseBlasKernel<MT3,MT4,MT5> holds.
//  C  target matrix
//  A  left multiplication operand
//  B  right multiplication operand
3648 template<
typename MT3
3651 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3652 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3654 typedef ElementType_<MT3> ET;
// Triangular left operand: copy B into a temporary, let trmm combine it with A
// from the left (presumably tmp = A * tmp -- confirm against the trmm wrapper),
// then subtract the temporary from C.
3656 if( IsTriangular<MT4>::value ) {
3657 ResultType_<MT3> tmp(
serial( B ) );
3658 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3659 subAssign( C, tmp );
// Triangular right operand: same scheme with A copied and B applied from the right.
3661 else if( IsTriangular<MT5>::value ) {
3662 ResultType_<MT3> tmp(
serial( A ) );
3663 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3664 subAssign( C, tmp );
// General case: a single gemm call with scaling factors ET(-1) and ET(1)
// (presumably C = -1*A*B + 1*C -- confirm against the gemm wrapper).
3667 gemm( C, A, B, ET(-1), ET(1) );
// Restructuring subtraction assignment, enabled when CanExploitSymmetry<MT,MT1,MT2>
// holds (the function name and parameter list are elided in this listing; the body
// refers to the operands as 'lhs' and 'rhs').
// The symmetric operand(s) are wrapped in trans() and the subtraction assignment
// is re-issued on the restructured product.
3690 template<
typename MT >
3691 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
// Both operands symmetric: transpose both.
3701 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3702 subAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
// Only the left operand symmetric: transpose the left operand.
3703 else if( IsSymmetric<MT1>::value )
3704 subAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
// Otherwise (only the right operand symmetric): transpose the right operand.
3706 subAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
3739 template<
typename MT
3741 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3749 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3752 else if( rhs.lhs_.columns() == 0UL ) {
3788 template<
typename MT
3790 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3795 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
3807 const TmpType tmp( rhs );
3828 template<
typename MT >
3829 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3839 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3841 else if( IsSymmetric<MT1>::value )
3865 template<
typename MT
3867 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3875 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3910 template<
typename MT >
3911 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
3921 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3923 else if( IsSymmetric<MT1>::value )
3951 template<
typename MT
3953 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
3961 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3996 template<
typename MT >
3997 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
4007 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4009 else if( IsSymmetric<MT1>::value )
4058 template<
typename MT1
4062 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
4063 ,
private MatScalarMultExpr
4064 ,
private Computation
// Type of the wrapped matrix/matrix multiplication expression.
4068 typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
// Result type of the wrapped multiplication expression.
4069 typedef ResultType_<MMM> RES;
// Result types of the left and right matrix operands.
4070 typedef ResultType_<MT1>
RT1;
4071 typedef ResultType_<MT2>
RT2;
// Element types of the left and right result types.
4072 typedef ElementType_<RT1>
ET1;
4073 typedef ElementType_<RT2>
ET2;
// Composite types of the left and right matrix operands.
4074 typedef CompositeType_<MT1>
CT1;
4075 typedef CompositeType_<MT2>
CT2;
// evaluateLeft: set when the left operand is itself a computation or requires
// an intermediate evaluation.
4080 enum :
bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// evaluateRight: the same test for the right operand.
4085 enum :
bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time switch: value is true when the target type T1 is a row-major
// matrix and at least one of the operand types T2/T3 is symmetric.
4095 template<
typename T1,
typename T2,
typename T3 >
4096 struct CanExploitSymmetry {
4097 enum :
bool { value = IsRowMajorMatrix<T1>::value &&
4098 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time switch: value is true when at least one operand has to be
// evaluated (evaluateLeft/evaluateRight) and the symmetry-based restructuring
// (CanExploitSymmetry) does not apply.
4107 template<
typename T1,
typename T2,
typename T3 >
4108 struct IsEvaluationRequired {
4109 enum :
bool { value = ( evaluateLeft || evaluateRight ) &&
4110 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time switch for the BLAS kernels of the scaled multiplication.
// T1 is the target type, T2/T3 the operand types, T4 the scalar type.
// (The first line of the condition is elided in this listing.)
4118 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4119 struct UseBlasKernel {
// Direct data access is required on the target (mutable) and both operands (const).
4121 HasMutableDataAccess<T1>::value &&
4122 HasConstDataAccess<T2>::value &&
4123 HasConstDataAccess<T3>::value &&
// Diagonal operands are excluded.
4124 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4125 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All three element types must be BLAS compatible and identical.
4126 IsBLASCompatible< ElementType_<T1> >::value &&
4127 IsBLASCompatible< ElementType_<T2> >::value &&
4128 IsBLASCompatible< ElementType_<T3> >::value &&
4129 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
4130 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
// A complex scalar combined with a built-in element type rules out the BLAS path.
4131 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Compile-time switch for the vectorized default kernels of the scaled
// multiplication. T1 is the target type, T2/T3 the operand types, T4 the scalar
// type. (Parts of the condition are elided in this listing.)
4139 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4140 struct UseVectorizedDefaultKernel {
// A diagonal left operand is excluded from the vectorized kernels.
4142 !IsDiagonal<T2>::value &&
4143 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4144 AreSIMDCombinable< ElementType_<T1>
// The kernels multiply elements of the left operand with elements of the right
// operand and accumulate the products, so SIMD addition and multiplication are
// checked for the operand pair (T2,T3), matching the unscaled expression's trait
// and the class' simdEnabled flag.
4148 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
4149 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Type of this DMatScalarMultExpr instance.
4155 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// SIMD type corresponding to the expression's element type.
4160 typedef SIMDTrait_<ElementType>
SIMDType;
// Type of the left-hand side operand: the wrapped matrix multiplication expression.
4165 typedef const TDMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// LT/RT: types used for the two matrix operands inside the kernels; the
// (const) result type if the operand has to be evaluated, its composite type otherwise.
4171 typedef IfTrue_< evaluateLeft, const RT1, CT1 >
LT;
4174 typedef IfTrue_< evaluateRight, const RT2, CT2 >
RT;
// simdEnabled: neither a diagonal left operand nor non-SIMD operands, and the
// element types (including the scalar type) must be SIMD-combinable with SIMD
// addition and multiplication available.
4179 enum :
bool { simdEnabled = !IsDiagonal<MT1>::value &&
4180 MT1::simdEnabled && MT2::simdEnabled &&
4181 AreSIMDCombinable<ET1,ET2,ST>::value &&
4182 HasSIMDAdd<ET1,ET2>::value &&
4183 HasSIMDMult<ET1,ET2>::value };
// smpAssignable: both operands must be SMP-assignable and must not require an
// intermediate evaluation.
4186 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4187 !evaluateRight && MT2::smpAssignable };
// Constructor taking the wrapped matrix multiplication expression and the
// scalar factor (initializer list and body are elided in this listing;
// presumably they initialize matrix_ and scalar_ -- confirm).
4201 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// 2D element access: returns element (i,j) of the wrapped multiplication
// expression multiplied by the scalar. No bounds check is visible here; use
// at() for the checked variant.
4214 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4217 return matrix_(i,j) * scalar_;
// Checked 2D element access: validates i against the row count and j against
// the column count (the handling of a failed check is elided in this listing),
// then delegates to operator().
4229 inline ReturnType
at(
size_t i,
size_t j )
const {
4230 if( i >= matrix_.rows() ) {
4233 if( j >= matrix_.columns() ) {
4236 return (*
this)(i,j);
// Returns the number of rows, i.e. the row count of the wrapped multiplication expression.
4245 inline size_t rows()
const {
4246 return matrix_.rows();
// Returns the number of columns, i.e. the column count of the wrapped multiplication expression.
4255 inline size_t columns()
const {
4256 return matrix_.columns();
// Reports whether the expression can alias with the given address; the query is
// forwarded to the wrapped multiplication expression (the scalar is not consulted).
4286 template<
typename T >
4287 inline bool canAlias(
const T* alias )
const {
4288 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the given address; the query is
// forwarded to the wrapped multiplication expression (the scalar is not consulted).
4298 template<
typename T >
4299 inline bool isAliased(
const T* alias )
const {
4300 return matrix_.isAliased( alias );
4310 return matrix_.isAligned();
4321 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4322 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
// Left-hand side operand: the wrapped matrix multiplication expression.
4328 LeftOperand matrix_;
// Right-hand side operand: the scalar factor.
4329 RightOperand scalar_;
// Assignment of the scaled matrix multiplication to a dense matrix, used when
// the symmetry-based restructuring does not apply (DisableIf_ on CanExploitSymmetry).
//  lhs  target dense matrix
//  rhs  scaled multiplication expression to be assigned
4344 template<
typename MT
4346 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
4347 assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two matrix operands of the wrapped multiplication expression.
4354 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
4355 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Special cases: empty target, or an inner dimension of zero (the actions taken
// for both cases are elided in this listing).
4357 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4360 else if( left.columns() == 0UL ) {
// Dispatch to the kernel selection with the operands A and B (their definitions
// are elided in this listing) and the scalar factor.
4375 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for C = scalar * A * B.
// A diagonal left operand or a target with fewer than TDMATTDMATMULT_THRESHOLD
// elements selects the small kernel; everything else goes to the BLAS kernel
// selection.
4390 template<
typename MT3
4394 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4396 if( ( IsDiagonal<MT4>::value ) ||
4397 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4398 selectSmallAssignKernel( C, A, B, scalar );
4400 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for C = scalar * A * B when neither operand is
// diagonal. The result is built column by column: the first relevant k
// initializes column j of C, the remaining k values accumulate into it, and a
// final loop over the populated rows follows (its body is elided in this
// listing; presumably it applies the scalar -- confirm).
4418 template<
typename MT3
4422 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4423 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4425 const size_t M( A.rows() );
4426 const size_t N( B.columns() );
4427 const size_t K( A.columns() );
4429 for(
size_t j=0UL; j<N; ++j )
// [kbegin,kend): rows of column j of B that are relevant given B's
// (strictly) lower/upper structure (the non-triangular branches are elided).
4431 const size_t kbegin( ( IsLower<MT5>::value )
4432 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4434 const size_t kend( ( IsUpper<MT5>::value )
4435 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Empty k range for a strictly triangular B: the whole column is handled in the
// loop below (loop body elided in this listing).
4439 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
4440 for(
size_t i=0UL; i<M; ++i ) {
// [ibegin,iend): rows of column kbegin of A that are relevant given A's structure.
4447 const size_t ibegin( ( IsLower<MT4>::value )
4448 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
4450 const size_t iend( ( IsUpper<MT4>::value )
4451 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows above ibegin (loop body elided in this listing).
4455 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4456 for(
size_t i=0UL; i<ibegin; ++i ) {
4460 else if( IsStrictlyLower<MT4>::value ) {
// First k: plain assignment initializes C(i,j).
4463 for(
size_t i=ibegin; i<iend; ++i ) {
4464 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on (loop body elided); for a strictly upper A only the last
// row is reset.
4466 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4467 for(
size_t i=iend; i<M; ++i ) {
4471 else if( IsStrictlyUpper<MT4>::value ) {
4472 reset( C(M-1UL,j) );
// Remaining k values accumulate into the already initialized elements.
4476 for(
size_t k=kbegin+1UL; k<kend; ++k )
4478 const size_t ibegin( ( IsLower<MT4>::value )
4479 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4481 const size_t iend( ( IsUpper<MT4>::value )
4482 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
4486 for(
size_t i=ibegin; i<iend; ++i ) {
4487 C(i,j) += A(i,k) * B(k,j);
// For an upper A, element (iend,j) is written for the first time here, hence
// the plain assignment instead of +=.
4489 if( IsUpper<MT4>::value ) {
4490 C(iend,j) = A(iend,k) * B(k,j);
// Row range of column j of C populated by the product, followed by the final
// pass over that range (body elided in this listing).
4495 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4496 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
4498 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4499 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
4503 for(
size_t i=ibegin; i<iend; ++i ) {
// Default kernel for C = scalar * A * B with a general A and a diagonal B.
// With a diagonal B only B(j,j) contributes to column j, so
// C(i,j) = A(i,j) * B(j,j) * scalar.
4525 template<
typename MT3
4529 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4530 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4534 const size_t M( A.rows() );
4535 const size_t N( B.columns() );
4537 for(
size_t j=0UL; j<N; ++j )
// [ibegin,iend): rows of column j of A that are relevant given A's structure
// (the non-triangular branches are elided in this listing).
4539 const size_t ibegin( ( IsLower<MT4>::value )
4540 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4542 const size_t iend( ( IsUpper<MT4>::value )
4543 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows above ibegin for a lower A (loop body elided in this listing).
4547 if( IsLower<MT4>::value ) {
4548 for(
size_t i=0UL; i<ibegin; ++i ) {
4552 for(
size_t i=ibegin; i<iend; ++i ) {
4553 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on for an upper A (loop body elided in this listing).
4555 if( IsUpper<MT4>::value ) {
4556 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * A * B with a diagonal A and a general B.
// With a diagonal A only A(i,i) contributes to row i, so
// C(i,j) = A(i,i) * B(i,j) * scalar.
4578 template<
typename MT3
4582 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4583 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4587 const size_t M( A.rows() );
4588 const size_t N( B.columns() );
4590 for(
size_t j=0UL; j<N; ++j )
// [ibegin,iend): rows of column j of B that are relevant given B's structure
// (the non-triangular branches are elided in this listing).
4592 const size_t ibegin( ( IsLower<MT5>::value )
4593 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4595 const size_t iend( ( IsUpper<MT5>::value )
4596 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Rows above ibegin (loop body elided in this listing). The guard tests MT4,
// the diagonal operand of this overload.
4600 if( IsLower<MT4>::value ) {
4601 for(
size_t i=0UL; i<ibegin; ++i ) {
4605 for(
size_t i=ibegin; i<iend; ++i ) {
4606 C(i,j) = A(i,i) * B(i,j) * scalar;
// Rows from iend on (loop body elided in this listing).
4608 if( IsUpper<MT4>::value ) {
4609 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = scalar * A * B with two diagonal operands: only the
// diagonal of C is computed here, C(i,i) = A(i,i) * B(i,i) * scalar
// (any preparation of C before the loop is elided in this listing).
4631 template<
typename MT3
4635 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4636 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4642 for(
size_t i=0UL; i<A.rows(); ++i ) {
4643 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the small assignment kernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_).
4662 template<
typename MT3
4666 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4667 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No vectorized path is available, so forward to the default kernel.
4669 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). One of the two operands is converted to its
// opposite storage order and the scaled product is re-assigned with the
// converted operand.
4688 template<
typename MT3
4692 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4693 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A resizable, B not: convert B.
4700 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4701 const OppositeType_<MT5> tmp(
serial( B ) );
4702 assign( ~C, A * tmp * scalar );
// B resizable, A not: convert A.
4704 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4705 const OppositeType_<MT4> tmp(
serial( A ) );
4706 assign( ~C, tmp * B * scalar );
// Otherwise convert the operand with fewer elements (B on a tie).
4708 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4709 const OppositeType_<MT5> tmp(
serial( B ) );
4710 assign( ~C, A * tmp * scalar );
4713 const OppositeType_<MT4> tmp(
serial( A ) );
4714 assign( ~C, tmp * B * scalar );
// Vectorized small assignment kernel (C = scalar * A * B) for a column-major
// target (DenseMatrix<MT3,true>).
//
//  C       target matrix; written with SIMD stores along i (scalar in the remainder loop)
//  A       left operand; read with SIMD loads A.load(i,k)
//  B       right operand; read element-wise and broadcast via set()
//  scalar  scaling factor, applied when the accumulated values are stored
//
// Rows are processed in strips of 8, 4, 2 and 1 SIMD vectors; leftover rows are
// handled by a scalar loop when 'remainder' is set.
4734 template<
typename MT3
4738 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
4739 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4741 const size_t M( A.rows() );
4742 const size_t N( B.columns() );
4743 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and A (the two matrices
// accessed with SIMD operations) are padded.
4745 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: end of the SIMD-processable rows (M rounded down to a multiple of
// SIMDSIZE when a remainder loop is needed).
4747 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Scalar broadcast into a SIMD vector.
4750 const SIMDType factor(
set( scalar ) );
// Strip of 8 SIMD vectors, one column at a time.
4754 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4755 for(
size_t j=0UL; j<N; ++j )
// [kbegin,kend): part of the inner dimension that can hold non-zero elements
// given the lower/upper structure of A and B.
4757 const size_t kbegin( ( IsLower<MT5>::value )
4758 ?( ( IsUpper<MT4>::value )
4759 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4760 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4761 :( IsUpper<MT4>::value ? i : 0UL ) );
4762 const size_t kend( ( IsUpper<MT5>::value )
4763 ?( ( IsLower<MT4>::value )
4764 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4765 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4766 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// Accumulators (presumably zero-initialized by SIMDType's default constructor -- confirm).
4768 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4770 for(
size_t k=kbegin; k<kend; ++k ) {
4771 const SIMDType b1(
set( B(k,j) ) );
4772 xmm1 = xmm1 + A.load(i ,k) * b1;
4773 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
4774 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
4775 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
4776 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
4777 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
4778 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
4779 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
// The scaling is applied once per element when storing.
4782 (~C).store( i , j, xmm1 * factor );
4783 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4784 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4785 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
4786 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
4787 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
4788 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
4789 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// Strip of 4 SIMD vectors, two columns at a time.
4793 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4797 for( ; (j+2UL) <= N; j+=2UL )
4799 const size_t kbegin( ( IsLower<MT5>::value )
4800 ?( ( IsUpper<MT4>::value )
4801 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4802 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4803 :( IsUpper<MT4>::value ? i : 0UL ) );
4804 const size_t kend( ( IsUpper<MT5>::value )
4805 ?( ( IsLower<MT4>::value )
4806 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4807 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4808 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
4810 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4812 for(
size_t k=kbegin; k<kend; ++k ) {
4813 const SIMDType a1( A.load(i ,k) );
4814 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4815 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4816 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4817 const SIMDType b1(
set( B(k,j ) ) );
4818 const SIMDType b2(
set( B(k,j+1UL) ) );
4819 xmm1 = xmm1 + a1 * b1;
4820 xmm2 = xmm2 + a2 * b1;
4821 xmm3 = xmm3 + a3 * b1;
4822 xmm4 = xmm4 + a4 * b1;
4823 xmm5 = xmm5 + a1 * b2;
4824 xmm6 = xmm6 + a2 * b2;
4825 xmm7 = xmm7 + a3 * b2;
4826 xmm8 = xmm8 + a4 * b2;
4829 (~C).store( i , j , xmm1 * factor );
4830 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
4831 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
4832 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
4833 (~C).store( i , j+1UL, xmm5 * factor );
4834 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
4835 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
4836 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single leftover column of the 4-vector strip (its guard is elided in this listing).
4841 const size_t kbegin( ( IsLower<MT5>::value )
4842 ?( ( IsUpper<MT4>::value )
4843 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4844 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4845 :( IsUpper<MT4>::value ? i : 0UL ) );
4846 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
4848 SIMDType xmm1, xmm2, xmm3, xmm4;
4850 for(
size_t k=kbegin; k<kend; ++k ) {
4851 const SIMDType b1(
set( B(k,j) ) );
4852 xmm1 = xmm1 + A.load(i ,k) * b1;
4853 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
4854 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
4855 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
4858 (~C).store( i , j, xmm1 * factor );
4859 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
4860 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
4861 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// Strip of 2 SIMD vectors, two columns at a time.
4865 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4869 for( ; (j+2UL) <= N; j+=2UL )
4871 const size_t kbegin( ( IsLower<MT5>::value )
4872 ?( ( IsUpper<MT4>::value )
4873 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4874 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4875 :( IsUpper<MT4>::value ? i : 0UL ) );
4876 const size_t kend( ( IsUpper<MT5>::value )
4877 ?( ( IsLower<MT4>::value )
4878 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4879 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4880 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
4882 SIMDType xmm1, xmm2, xmm3, xmm4;
4884 for(
size_t k=kbegin; k<kend; ++k ) {
4885 const SIMDType a1( A.load(i ,k) );
4886 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4887 const SIMDType b1(
set( B(k,j ) ) );
4888 const SIMDType b2(
set( B(k,j+1UL) ) );
4889 xmm1 = xmm1 + a1 * b1;
4890 xmm2 = xmm2 + a2 * b1;
4891 xmm3 = xmm3 + a1 * b2;
4892 xmm4 = xmm4 + a2 * b2;
4895 (~C).store( i , j , xmm1 * factor );
4896 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
4897 (~C).store( i , j+1UL, xmm3 * factor );
4898 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
// Single leftover column of the 2-vector strip (its guard is elided in this listing).
4903 const size_t kbegin( ( IsLower<MT5>::value )
4904 ?( ( IsUpper<MT4>::value )
4905 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4906 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4907 :( IsUpper<MT4>::value ? i : 0UL ) );
4908 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
4910 SIMDType xmm1, xmm2;
4912 for(
size_t k=kbegin; k<kend; ++k ) {
4913 const SIMDType b1(
set( B(k,j) ) );
4914 xmm1 = xmm1 + A.load(i ,k) * b1;
4915 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
4918 (~C).store( i , j, xmm1 * factor );
4919 (~C).store( i+SIMDSIZE, j, xmm2 * factor );
// Strip of a single SIMD vector, two columns at a time.
4923 for( ; i<ipos; i+=SIMDSIZE )
4927 for( ; (j+2UL) <= N; j+=2UL )
4929 const size_t kbegin( ( IsLower<MT5>::value )
4930 ?( ( IsUpper<MT4>::value )
4931 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4932 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4933 :( IsUpper<MT4>::value ? i : 0UL ) );
4934 const size_t kend( ( IsUpper<MT5>::value )
4935 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4938 SIMDType xmm1, xmm2;
4940 for(
size_t k=kbegin; k<kend; ++k ) {
4941 const SIMDType a1( A.load(i,k) );
4942 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4943 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4946 (~C).store( i, j , xmm1 * factor );
4947 (~C).store( i, j+1UL, xmm2 * factor );
// Single leftover column of the 1-vector strip (its guard and the accumulator
// declaration are elided in this listing).
4952 const size_t kbegin( ( IsLower<MT5>::value )
4953 ?( ( IsUpper<MT4>::value )
4954 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4955 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4956 :( IsUpper<MT4>::value ? i : 0UL ) );
4960 for(
size_t k=kbegin; k<K; ++k ) {
4961 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4964 (~C).store( i, j, xmm1 * factor );
// Scalar remainder: rows in [ipos,M) that do not fill a SIMD vector. Only
// executed when C or A is unpadded.
4968 for( ; remainder && i<M; ++i )
4972 for( ; (j+2UL) <= N; j+=2UL )
4974 const size_t kbegin( ( IsLower<MT5>::value )
4975 ?( ( IsUpper<MT4>::value )
4976 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4977 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4978 :( IsUpper<MT4>::value ? i : 0UL ) );
4979 const size_t kend( ( IsUpper<MT5>::value )
4980 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// value1/value2 are the scalar accumulators (their declarations are elided in this listing).
4986 for(
size_t k=kbegin; k<kend; ++k ) {
4987 value1 += A(i,k) * B(k,j );
4988 value2 += A(i,k) * B(k,j+1UL);
4991 (~C)(i,j ) = value1 * scalar;
4992 (~C)(i,j+1UL) = value2 * scalar;
// Single leftover column of the scalar remainder (its guard is elided in this listing).
4997 const size_t kbegin( ( IsLower<MT5>::value )
4998 ?( ( IsUpper<MT4>::value )
4999 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5000 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5001 :( IsUpper<MT4>::value ? i : 0UL ) );
5005 for(
size_t k=kbegin; k<K; ++k ) {
5006 value += A(i,k) * B(k,j);
5009 (~C)(i,j) = value * scalar;
// Fallback overload of the large assignment kernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_).
5029 template<
typename MT3
5033 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5034 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No vectorized path is available, so forward to the default kernel.
5036 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized large assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). There is no dedicated large kernel for this storage
// order here; the call is forwarded to the small kernel.
5055 template<
typename MT3
5059 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5060 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// ~C unwraps the DenseMatrix base reference before forwarding.
5062 selectSmallAssignKernel( ~C, A, B, scalar );
// Blocked, vectorized large-assignment kernel for targets of type DenseMatrix<MT3,true>
// (enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true).
// Computes C = (A*B)*scalar by tiling the iteration space into i/j/k blocks of
// TDMATTDMATMULT_IBLOCK_SIZE / _JBLOCK_SIZE / _KBLOCK_SIZE and accumulating the contribution
// of every k-block into C. SIMD vectors run along the row index i (A.load(i,k)), while the
// elements of B are broadcast with set().
// NOTE(review): this listing omits short lines (braces, loop-index declarations, single
// accumulator declarations); the comments below describe only what is visible.
5081 template<
typename MT3
5085 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5086 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the contraction index k runs over K.
5088 const size_t M( A.rows() );
5089 const size_t N( B.columns() );
5090 const size_t K( A.columns() );
// A scalar clean-up over the trailing rows is needed unless both C and A are padded.
5092 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// The scalar factor broadcast into a SIMD vector; applied once per store.
5094 const SIMDType factor(
set( scalar ) );
// Outer loop over row blocks [ii,iend).
5096 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
5098 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// ipos: end of the vectorizable row range; iend rounded down to a multiple of SIMDSIZE
// when a scalar remainder has to be handled.
5100 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
5101 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Loop over column blocks [jj,jend).
5103 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
5105 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Pass over every element of the current C tile before accumulation starts.
// NOTE(review): the loop body is not visible in this listing; presumably it clears
// C(i,j), since all stores below add to the existing value of C -- confirm.
5107 for(
size_t j=jj; j<jend; ++j ) {
5108 for(
size_t i=ii; i<iend; ++i ) {
// Loop over k blocks [kk,ktmp); each block adds its partial products to C.
5113 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
5115 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// --- Row strips of 4 SIMD vectors --------------------------------------------------
5119 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5121 const size_t i1( i+SIMDSIZE );
5122 const size_t i2( i+SIMDSIZE*2UL );
5123 const size_t i3( i+SIMDSIZE*3UL );
// 4 SIMD rows x 2 columns per iteration.
5127 for( ; (j+2UL) <= jend; j+=2UL )
// k range clipped to the current k block and narrowed by the structure traits:
// IsUpper<MT4> raises the start to i, IsLower<MT5> raises it to j; IsLower<MT4> caps
// the end at the last row of the strip, IsUpper<MT5> caps it at the last column + 1.
5129 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5130 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5131 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5132 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// NOTE(review): the accumulators carry no explicit initializer; this relies on
// SIMDType's default construction (presumably zero) -- confirm.
5134 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5136 for(
size_t k=kbegin; k<kend; ++k ) {
5137 const SIMDType a1( A.load(i ,k) );
5138 const SIMDType a2( A.load(i1,k) );
5139 const SIMDType a3( A.load(i2,k) );
5140 const SIMDType a4( A.load(i3,k) );
5141 const SIMDType b1(
set( B(k,j ) ) );
5142 const SIMDType b2(
set( B(k,j+1UL) ) );
5143 xmm1 = xmm1 + a1 * b1;
5144 xmm2 = xmm2 + a2 * b1;
5145 xmm3 = xmm3 + a3 * b1;
5146 xmm4 = xmm4 + a4 * b1;
5147 xmm5 = xmm5 + a1 * b2;
5148 xmm6 = xmm6 + a2 * b2;
5149 xmm7 = xmm7 + a3 * b2;
5150 xmm8 = xmm8 + a4 * b2;
// Add the scaled partial sums of this k block to the values already stored in C.
5153 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5154 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5155 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
5156 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
5157 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5158 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
5159 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
5160 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Single leftover column of the tile: 4 SIMD rows x 1 column.
5165 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5166 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5167 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5168 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5170 SIMDType xmm1, xmm2, xmm3, xmm4;
5172 for(
size_t k=kbegin; k<kend; ++k ) {
5173 const SIMDType b1(
set( B(k,j) ) );
5174 xmm1 = xmm1 + A.load(i ,k) * b1;
5175 xmm2 = xmm2 + A.load(i1,k) * b1;
5176 xmm3 = xmm3 + A.load(i2,k) * b1;
5177 xmm4 = xmm4 + A.load(i3,k) * b1;
5180 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5181 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5182 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
5183 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// --- Row strips of 2 SIMD vectors --------------------------------------------------
5187 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5189 const size_t i1( i+SIMDSIZE );
// 2 SIMD rows x 4 columns per iteration.
5193 for( ; (j+4UL) <= jend; j+=4UL )
5195 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5196 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5197 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5198 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5200 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5202 for(
size_t k=kbegin; k<kend; ++k ) {
5203 const SIMDType a1( A.load(i ,k) );
5204 const SIMDType a2( A.load(i1,k) );
5205 const SIMDType b1(
set( B(k,j ) ) );
5206 const SIMDType b2(
set( B(k,j+1UL) ) );
5207 const SIMDType b3(
set( B(k,j+2UL) ) );
5208 const SIMDType b4(
set( B(k,j+3UL) ) );
5209 xmm1 = xmm1 + a1 * b1;
5210 xmm2 = xmm2 + a2 * b1;
5211 xmm3 = xmm3 + a1 * b2;
5212 xmm4 = xmm4 + a2 * b2;
5213 xmm5 = xmm5 + a1 * b3;
5214 xmm6 = xmm6 + a2 * b3;
5215 xmm7 = xmm7 + a1 * b4;
5216 xmm8 = xmm8 + a2 * b4;
5219 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5220 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5221 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5222 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5223 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
5224 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
5225 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
5226 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// 2 SIMD rows x 2 columns for the columns left over by the 4-column loop.
5229 for( ; (j+2UL) <= jend; j+=2UL )
5231 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5232 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5233 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5234 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5236 SIMDType xmm1, xmm2, xmm3, xmm4;
5238 for(
size_t k=kbegin; k<kend; ++k ) {
5239 const SIMDType a1( A.load(i ,k) );
5240 const SIMDType a2( A.load(i1,k) );
5241 const SIMDType b1(
set( B(k,j ) ) );
5242 const SIMDType b2(
set( B(k,j+1UL) ) );
5243 xmm1 = xmm1 + a1 * b1;
5244 xmm2 = xmm2 + a2 * b1;
5245 xmm3 = xmm3 + a1 * b2;
5246 xmm4 = xmm4 + a2 * b2;
5249 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5250 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5251 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5252 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Single leftover column: 2 SIMD rows x 1 column.
5257 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5258 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5259 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5260 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5262 SIMDType xmm1, xmm2;
5264 for(
size_t k=kbegin; k<kend; ++k ) {
5265 const SIMDType b1(
set( B(k,j) ) );
5266 xmm1 = xmm1 + A.load(i ,k) * b1;
5267 xmm2 = xmm2 + A.load(i1,k) * b1;
5270 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5271 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// --- Row strips of a single SIMD vector, one column at a time ----------------------
5275 for( ; i<ipos; i+=SIMDSIZE )
5277 for(
size_t j=jj; j<jend; ++j )
5279 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5280 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5281 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
5282 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5286 for(
size_t k=kbegin; k<kend; ++k ) {
5287 const SIMDType b1(
set( B(k,j) ) );
5288 xmm1 = xmm1 + A.load(i,k) * b1;
5291 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// --- Scalar clean-up for the rows in [ipos,iend); only runs when 'remainder' is set -
5295 for( ; remainder && i<iend; ++i )
5297 for(
size_t j=jj; j<jend; ++j )
5299 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5300 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5301 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
5302 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5306 for(
size_t k=kbegin; k<kend; ++k ) {
5307 value += A(i,k) * B(k,j);
5310 (~C)(i,j) += value * scalar;
// BLAS-assignment kernel, fallback overload.
// Enabled (DisableIf_) only when UseBlasKernel<MT3,MT4,MT5,ST2> is false; in that case no
// BLAS routine is called and the work is handed to selectLargeAssignKernel() unchanged.
5333 template<
typename MT3
5337 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5338 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5340 selectLargeAssignKernel( C, A, B, scalar );
// The BLAS-backed overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
5345 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5,ST2> is true.
// Dispatches on the compile-time structure of the operands:
//  - A (MT4) triangular: trmm() with A applied from the left (CblasLeft),
//  - B (MT5) triangular: trmm() with B applied from the right (CblasRight),
//  - otherwise         : gemm() with alpha = scalar and beta = 0.
// The lower/upper flag passed to trmm() follows IsLower<> of the triangular operand.
// NOTE(review): the listing skips one line before each trmm() call and before gemm();
// presumably C is first filled with the non-triangular operand and gemm() sits in a final
// else branch -- confirm against the complete file.
5359 template<
typename MT3
5363 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
5364 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Element type of the target; the scalar is converted to it for the BLAS calls.
5366 typedef ElementType_<MT3> ET;
5368 if( IsTriangular<MT4>::value ) {
5370 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5372 else if( IsTriangular<MT5>::value ) {
5374 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
5377 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product to a sparse matrix.
// Enabled (DisableIf_) only when CanExploitSymmetry<MT,MT1,MT2> is false.
// The expression is first evaluated serially into a temporary, which is then assigned to
// the sparse target.
5395 template<
typename MT
5397 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5398 assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Type of the temporary: chosen between ResultType and OppositeType via IfTrue_ on the
// storage-order flag SO of the target.
5402 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
// serial() evaluation of the whole expression into the temporary.
5414 const TmpType tmp(
serial( rhs ) );
5415 assign( ~lhs, tmp );
// Restructuring assignment to a Matrix<MT,false> target.
// Enabled when CanExploitSymmetry<MT,MT1,MT2> is true. Every operand that is symmetric is
// replaced by its transpose before the scaled product is assigned:
//  - both operands symmetric -> trans(left) * trans(right) * scalar
//  - only the left symmetric -> trans(left) * right        * scalar
//  - remaining case          -> left        * trans(right) * scalar
5433 template<
typename MT >
5434 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5435 assign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix multiplication expression.
5444 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5445 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
5447 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5448 assign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
5449 else if( IsSymmetric<MT1>::value )
5450 assign( ~lhs,
trans( left ) * right * rhs.scalar_ );
5452 assign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Addition assignment of the scaled matrix product to a dense matrix.
// Enabled (DisableIf_) only when CanExploitSymmetry<MT,MT1,MT2> is false.
// Fetches both operands of the wrapped multiplication, checks for an empty problem and
// then dispatches to selectAddAssignKernel() with the scalar of the expression.
5468 template<
typename MT
5470 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
5471 addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
5478 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
5479 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target or empty contraction dimension.
// NOTE(review): the body of this guard is not visible in the listing; presumably an early
// return -- confirm.
5481 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are declared on lines missing from this listing; presumably the
// evaluated left and right operands -- confirm.
5495 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += (A*B)*scalar.
// The small kernel is used when the left operand type MT4 is diagonal or when the target
// has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns); the BLAS kernel entry
// point is the alternative.
// NOTE(review): the 'else' between the two calls is on a line missing from this listing.
5510 template<
typename MT3
5514 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5516 if( ( IsDiagonal<MT4>::value ) ||
5517 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5518 selectSmallAddAssignKernel( C, A, B, scalar );
5520 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the case that neither MT4 nor MT5 is diagonal.
// The scaled product A * B * scalar is evaluated serially into a ResultType temporary,
// which is then added to C through addAssign().
5538 template<
typename MT3
5542 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
5543 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5545 const ResultType tmp(
serial( A * B * scalar ) );
5546 addAssign( C, tmp );
// Default addition-assignment kernel for a non-diagonal A (MT4) and a diagonal B (MT5).
// Since only B(j,j) contributes to column j, the update is
//    C(i,j) += A(i,j) * B(j,j) * scalar
// performed column by column, with the row loop unrolled by two.
5564 template<
typename MT3
5568 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
5569 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5573 const size_t M( A.rows() );
5574 const size_t N( B.columns() );
5576 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of A.
// NOTE(review): the fallback alternatives of both conditionals are on lines missing from
// this listing (presumably 0UL and M) -- confirm.
5578 const size_t ibegin( ( IsLower<MT4>::value )
5579 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5581 const size_t iend( ( IsUpper<MT4>::value )
5582 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos: end of the part of the row range that can be processed in pairs
// (the row count rounded down to an even number).
5586 const size_t inum( iend - ibegin );
5587 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5589 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5590 C(i ,j) += A(i ,j) * B(j,j) * scalar;
5591 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at ipos left over by the paired loop.
// NOTE(review): the guarding condition is not visible in this listing.
5594 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition-assignment kernel for a diagonal A (MT4) and a non-diagonal B (MT5).
// Since only A(i,i) contributes to row i, the update is
//    C(i,j) += A(i,i) * B(i,j) * scalar
// performed column by column, with the row loop unrolled by two.
5614 template<
typename MT3
5618 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
5619 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5623 const size_t M( A.rows() );
5624 const size_t N( B.columns() );
5626 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of B.
// NOTE(review): the fallback alternatives of both conditionals are on lines missing from
// this listing (presumably 0UL and M) -- confirm.
5628 const size_t ibegin( ( IsLower<MT5>::value )
5629 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5631 const size_t iend( ( IsUpper<MT5>::value )
5632 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos: end of the part of the row range that can be processed in pairs
// (the row count rounded down to an even number).
5636 const size_t inum( iend - ibegin );
5637 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5639 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5640 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5641 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single row at ipos left over by the paired loop.
// NOTE(review): the guarding condition is not visible in this listing.
5644 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition-assignment kernel for two diagonal operands (MT4 and MT5).
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
5664 template<
typename MT3
5668 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
5669 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5673 for(
size_t i=0UL; i<A.rows(); ++i ) {
5674 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small addition-assignment kernel, non-vectorized fallback.
// Enabled (DisableIf_) only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false;
// forwards all arguments unchanged to selectDefaultAddAssignKernel().
5693 template<
typename MT3
5697 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5698 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5700 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small addition-assignment kernel for targets of type DenseMatrix<MT3,false> when the
// vectorized default kernel is usable.
// Instead of computing the product directly, one operand is copied serially into a
// temporary of its OppositeType and the resulting mixed product is added to C via
// addAssign(). Which operand is converted is decided as follows:
//  - A resizable, B not resizable -> convert B
//  - A not resizable, B resizable -> convert A
//  - B has at most as many elements as A (rows * columns) -> convert B
//  - remaining case -> convert A
5719 template<
typename MT3
5723 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5724 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5731 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5732 const OppositeType_<MT5> tmp(
serial( B ) );
5733 addAssign( ~C, A * tmp * scalar );
5735 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
5736 const OppositeType_<MT4> tmp(
serial( A ) );
5737 addAssign( ~C, tmp * B * scalar );
5739 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5740 const OppositeType_<MT5> tmp(
serial( B ) );
5741 addAssign( ~C, A * tmp * scalar );
5744 const OppositeType_<MT4> tmp(
serial( A ) );
5745 addAssign( ~C, tmp * B * scalar );
// Vectorized small addition-assignment kernel for targets of type DenseMatrix<MT3,true>
// (enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true).
// Computes C += (A*B)*scalar without blocking. SIMD vectors run along the row index i
// (A.load(i,k)); elements of B are broadcast with set(). The rows are processed in strips
// of 8, 4, 2 and 1 SIMD vectors, followed by a scalar loop over the trailing rows.
// NOTE(review): this listing omits short lines (braces, loop-index declarations, single
// accumulator declarations, some conditional alternatives); the comments below describe
// only what is visible.
5765 template<
typename MT3
5769 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
5770 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the contraction index k runs over K.
5772 const size_t M( A.rows() );
5773 const size_t N( B.columns() );
5774 const size_t K( A.columns() );
// A scalar clean-up over the trailing rows is needed unless both C and A are padded.
5776 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: end of the vectorizable row range; M rounded down to a multiple of SIMDSIZE when
// a scalar remainder has to be handled.
5778 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scalar factor broadcast into a SIMD vector; applied once per store.
5781 const SIMDType factor(
set( scalar ) );
// --- Row strips of 8 SIMD vectors, one column at a time ----------------------------
5785 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
5786 for(
size_t j=0UL; j<N; ++j )
// k range narrowed by the structure traits: the start depends on IsLower<MT5> /
// IsStrictlyLower<MT5> (column j) and IsUpper<MT4> (row i); the end depends on
// IsUpper<MT5> / IsStrictlyUpper<MT5> (column j) and IsLower<MT4> (end of the strip).
5788 const size_t kbegin( ( IsLower<MT5>::value )
5789 ?( ( IsUpper<MT4>::value )
5790 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5791 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5792 :( IsUpper<MT4>::value ? i : 0UL ) );
5793 const size_t kend( ( IsUpper<MT5>::value )
5794 ?( ( IsLower<MT4>::value )
5795 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5796 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5797 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// NOTE(review): the accumulators carry no explicit initializer; this relies on
// SIMDType's default construction (presumably zero) -- confirm.
5799 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5801 for(
size_t k=kbegin; k<kend; ++k ) {
5802 const SIMDType b1(
set( B(k,j) ) );
5803 xmm1 = xmm1 + A.load(i ,k) * b1;
5804 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
5805 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
5806 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
5807 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
5808 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
5809 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
5810 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
// Add the scaled sums to the values already stored in C.
5813 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5814 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5815 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5816 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
5817 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
5818 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
5819 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
5820 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// --- Row strips of 4 SIMD vectors --------------------------------------------------
5824 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// 4 SIMD rows x 2 columns per iteration.
5828 for( ; (j+2UL) <= N; j+=2UL )
5830 const size_t kbegin( ( IsLower<MT5>::value )
5831 ?( ( IsUpper<MT4>::value )
5832 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5833 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5834 :( IsUpper<MT4>::value ? i : 0UL ) );
5835 const size_t kend( ( IsUpper<MT5>::value )
5836 ?( ( IsLower<MT4>::value )
5837 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5838 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5839 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
5841 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5843 for(
size_t k=kbegin; k<kend; ++k ) {
5844 const SIMDType a1( A.load(i ,k) );
5845 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
5846 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
5847 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
5848 const SIMDType b1(
set( B(k,j ) ) );
5849 const SIMDType b2(
set( B(k,j+1UL) ) );
5850 xmm1 = xmm1 + a1 * b1;
5851 xmm2 = xmm2 + a2 * b1;
5852 xmm3 = xmm3 + a3 * b1;
5853 xmm4 = xmm4 + a4 * b1;
5854 xmm5 = xmm5 + a1 * b2;
5855 xmm6 = xmm6 + a2 * b2;
5856 xmm7 = xmm7 + a3 * b2;
5857 xmm8 = xmm8 + a4 * b2;
5860 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5861 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
5862 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
5863 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
5864 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5865 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
5866 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
5867 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single leftover column: 4 SIMD rows x 1 column. The end of the k range is bounded by
// the strip only (IsLower<MT4>), not by the column index.
5872 const size_t kbegin( ( IsLower<MT5>::value )
5873 ?( ( IsUpper<MT4>::value )
5874 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5875 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5876 :( IsUpper<MT4>::value ? i : 0UL ) );
5877 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
5879 SIMDType xmm1, xmm2, xmm3, xmm4;
5881 for(
size_t k=kbegin; k<kend; ++k ) {
5882 const SIMDType b1(
set( B(k,j) ) );
5883 xmm1 = xmm1 + A.load(i ,k) * b1;
5884 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
5885 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
5886 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
5889 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5890 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
5891 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
5892 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// --- Row strips of 2 SIMD vectors --------------------------------------------------
5896 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// 2 SIMD rows x 2 columns per iteration.
5900 for( ; (j+2UL) <= N; j+=2UL )
5902 const size_t kbegin( ( IsLower<MT5>::value )
5903 ?( ( IsUpper<MT4>::value )
5904 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5905 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5906 :( IsUpper<MT4>::value ? i : 0UL ) );
5907 const size_t kend( ( IsUpper<MT5>::value )
5908 ?( ( IsLower<MT4>::value )
5909 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5910 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5911 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
5913 SIMDType xmm1, xmm2, xmm3, xmm4;
5915 for(
size_t k=kbegin; k<kend; ++k ) {
5916 const SIMDType a1( A.load(i ,k) );
5917 const SIMDType a2( A.load(i+SIMDSIZE,k) );
5918 const SIMDType b1(
set( B(k,j ) ) );
5919 const SIMDType b2(
set( B(k,j+1UL) ) );
5920 xmm1 = xmm1 + a1 * b1;
5921 xmm2 = xmm2 + a2 * b1;
5922 xmm3 = xmm3 + a1 * b2;
5923 xmm4 = xmm4 + a2 * b2;
5926 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5927 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
5928 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5929 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
// Single leftover column: 2 SIMD rows x 1 column.
5934 const size_t kbegin( ( IsLower<MT5>::value )
5935 ?( ( IsUpper<MT4>::value )
5936 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5937 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5938 :( IsUpper<MT4>::value ? i : 0UL ) );
5939 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
5941 SIMDType xmm1, xmm2;
5943 for(
size_t k=kbegin; k<kend; ++k ) {
5944 const SIMDType b1(
set( B(k,j) ) );
5945 xmm1 = xmm1 + A.load(i ,k) * b1;
5946 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
5949 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5950 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
// --- Row strips of a single SIMD vector --------------------------------------------
5954 for( ; i<ipos; i+=SIMDSIZE )
// 1 SIMD row x 2 columns per iteration.
5958 for( ; (j+2UL) <= N; j+=2UL )
5960 const size_t kbegin( ( IsLower<MT5>::value )
5961 ?( ( IsUpper<MT4>::value )
5962 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5963 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5964 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the fallback alternative of this conditional is on a line missing from
// the listing (presumably K) -- confirm.
5965 const size_t kend( ( IsUpper<MT5>::value )
5966 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
5969 SIMDType xmm1, xmm2;
5971 for(
size_t k=kbegin; k<kend; ++k ) {
5972 const SIMDType a1( A.load(i,k) );
5973 xmm1 = xmm1 + a1 *
set( B(k,j ) );
5974 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
5977 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5978 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Single leftover column: 1 SIMD row x 1 column; k runs up to K.
5983 const size_t kbegin( ( IsLower<MT5>::value )
5984 ?( ( IsUpper<MT4>::value )
5985 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5986 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5987 :( IsUpper<MT4>::value ? i : 0UL ) );
5991 for(
size_t k=kbegin; k<K; ++k ) {
5992 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
5995 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// --- Scalar clean-up for the rows in [ipos,M); only runs when 'remainder' is set ----
5999 for( ; remainder && i<M; ++i )
// Two columns per iteration.
6003 for( ; (j+2UL) <= N; j+=2UL )
6005 const size_t kbegin( ( IsLower<MT5>::value )
6006 ?( ( IsUpper<MT4>::value )
6007 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6008 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6009 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the fallback alternative of this conditional is on a line missing from
// the listing (presumably K) -- confirm.
6010 const size_t kend( ( IsUpper<MT5>::value )
6011 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6017 for(
size_t k=kbegin; k<kend; ++k ) {
6018 value1 += A(i,k) * B(k,j );
6019 value2 += A(i,k) * B(k,j+1UL);
6022 (~C)(i,j ) += value1 * scalar;
6023 (~C)(i,j+1UL) += value2 * scalar;
// Single leftover column; k runs up to K.
6028 const size_t kbegin( ( IsLower<MT5>::value )
6029 ?( ( IsUpper<MT4>::value )
6030 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6031 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6032 :( IsUpper<MT4>::value ? i : 0UL ) );
6036 for(
size_t k=kbegin; k<K; ++k ) {
6037 value += A(i,k) * B(k,j);
6040 (~C)(i,j) += value * scalar;
// Large addition-assignment kernel, non-vectorized fallback.
// Enabled (DisableIf_) only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false;
// forwards all arguments unchanged to selectDefaultAddAssignKernel().
6060 template<
typename MT3
6064 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6065 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6067 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large addition-assignment kernel for targets of type DenseMatrix<MT3,false> when the
// vectorized default kernel is usable.
// No dedicated large kernel is implemented for this storage-order flag here: the call is
// delegated to selectSmallAddAssignKernel() on the unwrapped target (~C).
6086 template<
typename MT3
6090 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6091 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6093 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Blocked, vectorized large addition-assignment kernel for targets of type
// DenseMatrix<MT3,true> (enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is true).
// Computes C += (A*B)*scalar by tiling the iteration space into i/j/k blocks of
// TDMATTDMATMULT_IBLOCK_SIZE / _JBLOCK_SIZE / _KBLOCK_SIZE and adding the contribution of
// every k-block to C. SIMD vectors run along the row index i (A.load(i,k)), while the
// elements of B are broadcast with set(). C is never cleared here: all updates add to the
// values C already holds.
// NOTE(review): this listing omits short lines (braces, loop-index declarations, single
// accumulator declarations); the comments below describe only what is visible.
6112 template<
typename MT3
6116 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6117 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the contraction index k runs over K.
6119 const size_t M( A.rows() );
6120 const size_t N( B.columns() );
6121 const size_t K( A.columns() );
// A scalar clean-up over the trailing rows is needed unless both C and A are padded.
6123 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// The scalar factor broadcast into a SIMD vector; applied once per store.
6125 const SIMDType factor(
set( scalar ) );
// Outer loop over row blocks [ii,iend).
6127 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
6129 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// ipos: end of the vectorizable row range; iend rounded down to a multiple of SIMDSIZE
// when a scalar remainder has to be handled.
6131 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
6132 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Loop over column blocks [jj,jend).
6134 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
6136 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Loop over k blocks [kk,ktmp); each block adds its partial products to C.
6138 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
6140 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// --- Row strips of 4 SIMD vectors --------------------------------------------------
6144 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6146 const size_t i1( i+SIMDSIZE );
6147 const size_t i2( i+SIMDSIZE*2UL );
6148 const size_t i3( i+SIMDSIZE*3UL );
// 4 SIMD rows x 2 columns per iteration.
6152 for( ; (j+2UL) <= jend; j+=2UL )
// k range clipped to the current k block and narrowed by the structure traits:
// IsUpper<MT4> raises the start to i, IsLower<MT5> raises it to j; IsLower<MT4> caps
// the end at the last row of the strip, IsUpper<MT5> caps it at the last column + 1.
6154 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6155 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6156 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
6157 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// NOTE(review): the accumulators carry no explicit initializer; this relies on
// SIMDType's default construction (presumably zero) -- confirm.
6159 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6161 for(
size_t k=kbegin; k<kend; ++k ) {
6162 const SIMDType a1( A.load(i ,k) );
6163 const SIMDType a2( A.load(i1,k) );
6164 const SIMDType a3( A.load(i2,k) );
6165 const SIMDType a4( A.load(i3,k) );
6166 const SIMDType b1(
set( B(k,j ) ) );
6167 const SIMDType b2(
set( B(k,j+1UL) ) );
6168 xmm1 = xmm1 + a1 * b1;
6169 xmm2 = xmm2 + a2 * b1;
6170 xmm3 = xmm3 + a3 * b1;
6171 xmm4 = xmm4 + a4 * b1;
6172 xmm5 = xmm5 + a1 * b2;
6173 xmm6 = xmm6 + a2 * b2;
6174 xmm7 = xmm7 + a3 * b2;
6175 xmm8 = xmm8 + a4 * b2;
// Add the scaled partial sums of this k block to the values already stored in C.
6178 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6179 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6180 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
6181 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
6182 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6183 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
6184 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
6185 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Single leftover column of the tile: 4 SIMD rows x 1 column.
6190 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6191 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6192 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
6193 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6195 SIMDType xmm1, xmm2, xmm3, xmm4;
6197 for(
size_t k=kbegin; k<kend; ++k ) {
6198 const SIMDType b1(
set( B(k,j) ) );
6199 xmm1 = xmm1 + A.load(i ,k) * b1;
6200 xmm2 = xmm2 + A.load(i1,k) * b1;
6201 xmm3 = xmm3 + A.load(i2,k) * b1;
6202 xmm4 = xmm4 + A.load(i3,k) * b1;
6205 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6206 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6207 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
6208 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// --- Row strips of 2 SIMD vectors --------------------------------------------------
6212 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6214 const size_t i1( i+SIMDSIZE );
// 2 SIMD rows x 4 columns per iteration.
6218 for( ; (j+4UL) <= jend; j+=4UL )
6220 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6221 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6222 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6223 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
6225 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6227 for(
size_t k=kbegin; k<kend; ++k ) {
6228 const SIMDType a1( A.load(i ,k) );
6229 const SIMDType a2( A.load(i1,k) );
6230 const SIMDType b1(
set( B(k,j ) ) );
6231 const SIMDType b2(
set( B(k,j+1UL) ) );
6232 const SIMDType b3(
set( B(k,j+2UL) ) );
6233 const SIMDType b4(
set( B(k,j+3UL) ) );
6234 xmm1 = xmm1 + a1 * b1;
6235 xmm2 = xmm2 + a2 * b1;
6236 xmm3 = xmm3 + a1 * b2;
6237 xmm4 = xmm4 + a2 * b2;
6238 xmm5 = xmm5 + a1 * b3;
6239 xmm6 = xmm6 + a2 * b3;
6240 xmm7 = xmm7 + a1 * b4;
6241 xmm8 = xmm8 + a2 * b4;
6244 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6245 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6246 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6247 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6248 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6249 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
6250 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6251 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// 2 SIMD rows x 2 columns for the columns left over by the 4-column loop.
6254 for( ; (j+2UL) <= jend; j+=2UL )
6256 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6257 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6258 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6259 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6261 SIMDType xmm1, xmm2, xmm3, xmm4;
6263 for(
size_t k=kbegin; k<kend; ++k ) {
6264 const SIMDType a1( A.load(i ,k) );
6265 const SIMDType a2( A.load(i1,k) );
6266 const SIMDType b1(
set( B(k,j ) ) );
6267 const SIMDType b2(
set( B(k,j+1UL) ) );
6268 xmm1 = xmm1 + a1 * b1;
6269 xmm2 = xmm2 + a2 * b1;
6270 xmm3 = xmm3 + a1 * b2;
6271 xmm4 = xmm4 + a2 * b2;
6274 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6275 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6276 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6277 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Single leftover column: 2 SIMD rows x 1 column.
6282 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6283 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6284 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
6285 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6287 SIMDType xmm1, xmm2;
6289 for(
size_t k=kbegin; k<kend; ++k ) {
6290 const SIMDType b1(
set( B(k,j) ) );
6291 xmm1 = xmm1 + A.load(i ,k) * b1;
6292 xmm2 = xmm2 + A.load(i1,k) * b1;
6295 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6296 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// --- Row strips of a single SIMD vector, one column at a time ----------------------
6300 for( ; i<ipos; i+=SIMDSIZE )
6302 for(
size_t j=jj; j<jend; ++j )
6304 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6305 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6306 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
6307 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6311 for(
size_t k=kbegin; k<kend; ++k ) {
6312 const SIMDType b1(
set( B(k,j) ) );
6313 xmm1 = xmm1 + A.load(i,k) * b1;
6316 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// --- Scalar clean-up for the rows in [ipos,iend); only runs when 'remainder' is set -
6320 for( ; remainder && i<iend; ++i )
6322 for(
size_t j=jj; j<jend; ++j )
6324 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6325 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6326 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
6327 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6331 for(
size_t k=kbegin; k<kend; ++k ) {
6332 value += A(i,k) * B(k,j);
6335 (~C)(i,j) += value * scalar;
// BLAS addition-assignment kernel, fallback overload.
// Enabled (DisableIf_) only when UseBlasKernel<MT3,MT4,MT5,ST2> is false; in that case no
// BLAS routine is called and the work is handed to selectLargeAddAssignKernel() unchanged.
6359 template<
typename MT3
6363 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6364 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6366 selectLargeAddAssignKernel( C, A, B, scalar );
// The BLAS-backed overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true.
6371 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS addition-assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5,ST2> is true.
// Dispatches on the compile-time structure of the operands:
//  - A (MT4) triangular: a temporary is created from B, multiplied in place by A from the
//    left via trmm(), and then added to C;
//  - B (MT5) triangular: a temporary is created from A, multiplied in place by B from the
//    right via trmm(), and then added to C;
//  - otherwise         : gemm() with alpha = scalar and beta = 1, accumulating into C.
// The lower/upper flag passed to trmm() follows IsLower<> of the triangular operand.
// NOTE(review): the 'else' introducing the gemm() call is on a line missing from this
// listing.
6385 template<
typename MT3
6389 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
6390 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Element type of the target; the scalar is converted to it for the BLAS calls.
6392 typedef ElementType_<MT3> ET;
6394 if( IsTriangular<MT4>::value ) {
6395 ResultType_<MT3> tmp(
serial( B ) );
6396 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6397 addAssign( C, tmp );
6399 else if( IsTriangular<MT5>::value ) {
6400 ResultType_<MT3> tmp(
serial( A ) );
6401 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6402 addAssign( C, tmp );
6405 gemm( C, A, B, ET(scalar), ET(1) );
// Restructuring addition assignment to a Matrix<MT,false> target.
// Enabled when CanExploitSymmetry<MT,MT1,MT2> is true. Every operand that is symmetric is
// replaced by its transpose before the scaled product is added to the target:
//  - both operands symmetric -> trans(left) * trans(right) * scalar
//  - only the left symmetric -> trans(left) * right        * scalar
//  - remaining case          -> left        * trans(right) * scalar
6426 template<
typename MT >
6427 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6428 addAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix multiplication expression.
6437 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6438 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6440 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6441 addAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
6442 else if( IsSymmetric<MT1>::value )
6443 addAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
6445 addAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Subtraction assignment of the scaled matrix product to a dense matrix.
// Enabled (DisableIf_) only when CanExploitSymmetry<MT,MT1,MT2> is false.
// Fetches both operands of the wrapped multiplication, checks for an empty problem and
// then dispatches to selectSubAssignKernel() with the scalar of the expression.
6465 template<
typename MT
6467 friend inline DisableIf_< CanExploitSymmetry<MT,MT1,MT2> >
6468 subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6475 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6476 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Empty target or empty contraction dimension.
// NOTE(review): the body of this guard is not visible in the listing; presumably an early
// return -- confirm.
6478 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are declared on lines missing from this listing; presumably the
// evaluated left and right operands -- confirm.
6492 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= (A*B)*scalar.
// The small kernel is used when the left operand type MT4 is diagonal or when the target
// has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns); the BLAS kernel entry
// point is the alternative.
// NOTE(review): the 'else' between the two calls is on a line missing from this listing.
6507 template<
typename MT3
6511 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6513 if( ( IsDiagonal<MT4>::value ) ||
6514 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6515 selectSmallSubAssignKernel( C, A, B, scalar );
6517 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the case that neither A (MT4) nor
// B (MT5) is diagonal.
6535 template<
typename MT3
6539 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6540 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Evaluate the scaled product serially into a temporary, then subtract it from C.
6542 const ResultType tmp(
serial( A * B * scalar ) );
6543 subAssign( C, tmp );
// Default subtraction assignment kernel for a non-diagonal A (MT4) and a diagonal
// B (MT5): each touched element is updated as C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): this listing omits source lines (braces, the alternative branches of
// the ibegin/iend expressions and the guard before the final single-row update).
6561 template<
typename MT3
6565 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6566 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6570 const size_t M( A.rows() );
6571 const size_t N( B.columns() );
// Column-wise traversal of the target.
6573 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the lower/upper structure of A.
6575 const size_t ibegin( ( IsLower<MT4>::value )
6576 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6578 const size_t iend( ( IsUpper<MT4>::value )
6579 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number, so the loop
// below can handle two rows per iteration.
6583 const size_t inum( iend - ibegin );
6584 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6586 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6587 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6588 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update for the row at ipos; presumably guarded by a check on an omitted line
// (odd row count) -- confirm in the full source.
6591 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a diagonal A (MT4) and a non-diagonal
// B (MT5): each touched element is updated as C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): this listing omits source lines (braces, the alternative branches of
// the ibegin/iend expressions and the guard before the final single-row update).
6611 template<
typename MT3
6615 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6616 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6620 const size_t M( A.rows() );
6621 const size_t N( B.columns() );
// Column-wise traversal of the target.
6623 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the lower/upper structure of B.
6625 const size_t ibegin( ( IsLower<MT5>::value )
6626 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6628 const size_t iend( ( IsUpper<MT5>::value )
6629 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number, so the loop
// below can handle two rows per iteration.
6633 const size_t inum( iend - ibegin );
6634 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6636 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6637 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6638 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for the row at ipos; presumably guarded by a check on an omitted line
// (odd row count) -- confirm in the full source.
6641 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel for the case that both A (MT4) and B (MT5)
// are diagonal: only the diagonal elements of C are updated.
6661 template<
typename MT3
6665 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6666 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One update per diagonal index, bounded by the number of rows of A.
6670 for(
size_t i=0UL; i<A.rows(); ++i ) {
6671 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_):
// simply forwards to the default kernel.
6690 template<
typename MT3
6694 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6695 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6697 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix subtraction assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). One operand is first copied serially into a temporary
// of its OppositeType, and the scaled product with that temporary is then
// subtracted from C via subAssign.
// NOTE(review): lines between the signature and the first branch, the braces and the
// keyword of the last branch are omitted in this listing.
6716 template<
typename MT3
6720 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6721 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A resizable, B not: convert B.
6728 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6729 const OppositeType_<MT5> tmp(
serial( B ) );
6730 subAssign( ~C, A * tmp * scalar );
// B resizable, A not: convert A.
6732 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6733 const OppositeType_<MT4> tmp(
serial( A ) );
6734 subAssign( ~C, tmp * B * scalar );
// Otherwise convert the operand with fewer elements; B is converted on a tie.
6736 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6737 const OppositeType_<MT5> tmp(
serial( B ) );
6738 subAssign( ~C, A * tmp * scalar );
// Remaining case: A has fewer elements than B, so A is converted.
6741 const OppositeType_<MT4> tmp(
serial( A ) );
6742 subAssign( ~C, tmp * B * scalar );
// Vectorized small-matrix kernel for the subtraction assignment of the scaled product
// (A * B) * scalar to a column-major target (DenseMatrix<MT3,true>); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// Rows of C are processed in chunks of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar loop over leftover rows. Each chunk accumulates A(.,k) * B(k,j) over k and
// then stores C - accumulator * factor back into C.
// NOTE(review): this listing omits source lines (see the gaps in the embedded line
// numbers), e.g. braces, the declarations of i and j, some accumulator declarations
// and the guards before the single-column tails. The SIMD accumulators are declared
// without a visible initializer -- presumably default-constructed to zero; confirm.
6762 template<
typename MT3
6766 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6767 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Dimensions: M rows of A, N columns of B, K columns of A (inner dimension).
6769 const size_t M( A.rows() );
6770 const size_t N( B.columns() );
6771 const size_t K( A.columns() );
// A scalar remainder loop is required unless both C and A are padded.
6773 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// End of the SIMD-processed row range: M masked with -SIMDSIZE when a remainder is
// required (rounds down, assuming SIMDSIZE is a power of two), otherwise M.
6775 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// The scalar broadcast to a SIMD vector; applied once per accumulated result.
6778 const SIMDType factor(
set( scalar ) );
// Tier 1: eight SIMD vectors of rows per iteration, one column j at a time.
6782 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6783 for(
size_t j=0UL; j<N; ++j )
// k-range narrowed according to the lower/upper structure of A (MT4) and B (MT5).
6785 const size_t kbegin( ( IsLower<MT5>::value )
6786 ?( ( IsUpper<MT4>::value )
6787 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6788 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6789 :( IsUpper<MT4>::value ? i : 0UL ) );
6790 const size_t kend( ( IsUpper<MT5>::value )
6791 ?( ( IsLower<MT4>::value )
6792 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6793 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6794 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
6796 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6798 for(
size_t k=kbegin; k<kend; ++k ) {
6799 const SIMDType b1(
set( B(k,j) ) );
6800 xmm1 = xmm1 + A.load(i ,k) * b1;
6801 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
6802 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
6803 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
6804 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
6805 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
6806 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
6807 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
// Subtract the scaled accumulators from the current content of C.
6810 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6811 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6812 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6813 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
6814 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
6815 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
6816 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
6817 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
// Tier 2: four SIMD vectors of rows per iteration; columns are taken in pairs.
6821 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
6825 for( ; (j+2UL) <= N; j+=2UL )
6827 const size_t kbegin( ( IsLower<MT5>::value )
6828 ?( ( IsUpper<MT4>::value )
6829 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6830 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6831 :( IsUpper<MT4>::value ? i : 0UL ) );
6832 const size_t kend( ( IsUpper<MT5>::value )
6833 ?( ( IsLower<MT4>::value )
6834 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6835 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6836 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
6838 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6840 for(
size_t k=kbegin; k<kend; ++k ) {
6841 const SIMDType a1( A.load(i ,k) );
6842 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
6843 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
6844 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
6845 const SIMDType b1(
set( B(k,j ) ) );
6846 const SIMDType b2(
set( B(k,j+1UL) ) );
6847 xmm1 = xmm1 + a1 * b1;
6848 xmm2 = xmm2 + a2 * b1;
6849 xmm3 = xmm3 + a3 * b1;
6850 xmm4 = xmm4 + a4 * b1;
6851 xmm5 = xmm5 + a1 * b2;
6852 xmm6 = xmm6 + a2 * b2;
6853 xmm7 = xmm7 + a3 * b2;
6854 xmm8 = xmm8 + a4 * b2;
6857 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6858 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
6859 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
6860 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
6861 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
6862 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
6863 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
6864 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single-column tail of tier 2 (its guarding condition is on an omitted line).
6869 const size_t kbegin( ( IsLower<MT5>::value )
6870 ?( ( IsUpper<MT4>::value )
6871 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6872 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6873 :( IsUpper<MT4>::value ? i : 0UL ) );
6874 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
6876 SIMDType xmm1, xmm2, xmm3, xmm4;
6878 for(
size_t k=kbegin; k<kend; ++k ) {
6879 const SIMDType b1(
set( B(k,j) ) );
6880 xmm1 = xmm1 + A.load(i ,k) * b1;
6881 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
6882 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
6883 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
6886 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6887 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
6888 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
6889 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
// Tier 3: two SIMD vectors of rows per iteration; columns are taken in pairs.
6893 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
6897 for( ; (j+2UL) <= N; j+=2UL )
6899 const size_t kbegin( ( IsLower<MT5>::value )
6900 ?( ( IsUpper<MT4>::value )
6901 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6902 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6903 :( IsUpper<MT4>::value ? i : 0UL ) );
6904 const size_t kend( ( IsUpper<MT5>::value )
6905 ?( ( IsLower<MT4>::value )
6906 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6907 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6908 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
6910 SIMDType xmm1, xmm2, xmm3, xmm4;
6912 for(
size_t k=kbegin; k<kend; ++k ) {
6913 const SIMDType a1( A.load(i ,k) );
6914 const SIMDType a2( A.load(i+SIMDSIZE,k) );
6915 const SIMDType b1(
set( B(k,j ) ) );
6916 const SIMDType b2(
set( B(k,j+1UL) ) );
6917 xmm1 = xmm1 + a1 * b1;
6918 xmm2 = xmm2 + a2 * b1;
6919 xmm3 = xmm3 + a1 * b2;
6920 xmm4 = xmm4 + a2 * b2;
6923 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6924 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
6925 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
6926 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
// Single-column tail of tier 3 (its guarding condition is on an omitted line).
6931 const size_t kbegin( ( IsLower<MT5>::value )
6932 ?( ( IsUpper<MT4>::value )
6933 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6934 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6935 :( IsUpper<MT4>::value ? i : 0UL ) );
6936 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
6938 SIMDType xmm1, xmm2;
6940 for(
size_t k=kbegin; k<kend; ++k ) {
6941 const SIMDType b1(
set( B(k,j) ) );
6942 xmm1 = xmm1 + A.load(i ,k) * b1;
6943 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
6946 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6947 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
// Tier 4: one SIMD vector of rows per iteration; columns are taken in pairs.
6951 for( ; i<ipos; i+=SIMDSIZE )
6955 for( ; (j+2UL) <= N; j+=2UL )
6957 const size_t kbegin( ( IsLower<MT5>::value )
6958 ?( ( IsUpper<MT4>::value )
6959 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6960 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6961 :( IsUpper<MT4>::value ? i : 0UL ) );
6962 const size_t kend( ( IsUpper<MT5>::value )
6963 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6966 SIMDType xmm1, xmm2;
6968 for(
size_t k=kbegin; k<kend; ++k ) {
6969 const SIMDType a1( A.load(i,k) );
6970 xmm1 = xmm1 + a1 *
set( B(k,j ) );
6971 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
6974 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6975 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// Single-column tail of tier 4; here k runs up to K.
6980 const size_t kbegin( ( IsLower<MT5>::value )
6981 ?( ( IsUpper<MT4>::value )
6982 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6983 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6984 :( IsUpper<MT4>::value ? i : 0UL ) );
6988 for(
size_t k=kbegin; k<K; ++k ) {
6989 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
6992 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar loop over the rows beyond ipos; only entered when remainder is true.
6996 for( ; remainder && i<M; ++i )
7000 for( ; (j+2UL) <= N; j+=2UL )
7002 const size_t kbegin( ( IsLower<MT5>::value )
7003 ?( ( IsUpper<MT4>::value )
7004 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7005 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7006 :( IsUpper<MT4>::value ? i : 0UL ) );
7007 const size_t kend( ( IsUpper<MT5>::value )
7008 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7014 for(
size_t k=kbegin; k<kend; ++k ) {
7015 value1 += A(i,k) * B(k,j );
7016 value2 += A(i,k) * B(k,j+1UL);
7019 (~C)(i,j ) -= value1 * scalar;
7020 (~C)(i,j+1UL) -= value2 * scalar;
// Single-column tail of the scalar loop; here k runs up to K.
7025 const size_t kbegin( ( IsLower<MT5>::value )
7026 ?( ( IsUpper<MT4>::value )
7027 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7028 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7029 :( IsUpper<MT4>::value ? i : 0UL ) );
7033 for(
size_t k=kbegin; k<K; ++k ) {
7034 value += A(i,k) * B(k,j);
7037 (~C)(i,j) -= value * scalar;
// Large-matrix subtraction assignment kernel used when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_):
// simply forwards to the default kernel.
7057 template<
typename MT3
7061 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7062 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7064 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized large-matrix subtraction assignment kernel for a row-major target
// (DenseMatrix<MT3,false>): delegates to the small-matrix kernel with the same
// arguments.
7083 template<
typename MT3
7087 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7088 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7090 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Vectorized large-matrix kernel for the subtraction assignment of the scaled product
// (A * B) * scalar to a column-major target (DenseMatrix<MT3,true>); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The iteration space is tiled over rows (TDMATTDMATMULT_IBLOCK_SIZE), columns
// (TDMATTDMATMULT_JBLOCK_SIZE) and the inner dimension (TDMATTDMATMULT_KBLOCK_SIZE).
// Inside a tile, rows are processed in chunks of 4, 2 and 1 SIMD vectors, followed by
// a scalar loop; every chunk stores C - accumulator * factor back into C.
// NOTE(review): this listing omits source lines (see the gaps in the embedded line
// numbers), e.g. braces, the declarations of i and j, some accumulator declarations
// and the guards before the single-column tails. The SIMD accumulators are declared
// without a visible initializer -- presumably default-constructed to zero; confirm.
7109 template<
typename MT3
7113 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7114 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Dimensions: M rows of A, N columns of B, K columns of A (inner dimension).
7116 const size_t M( A.rows() );
7117 const size_t N( B.columns() );
7118 const size_t K( A.columns() );
// A scalar remainder loop is required unless both C and A are padded.
7120 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// The scalar broadcast to a SIMD vector; applied once per accumulated result.
7122 const SIMDType factor(
set( scalar ) );
// Row tiles.
7124 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
7126 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// End of the SIMD-processed rows of this tile: iend rounded down to a multiple of
// SIMDSIZE when a remainder is required (checked by the assertion), otherwise iend.
7128 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
7129 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Column tiles.
7131 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
7133 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Tiles of the inner dimension; ktmp is the exclusive end of the current k-tile.
7135 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
7137 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Tier 1: four SIMD vectors of rows per iteration; columns are taken in pairs.
7141 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7143 const size_t i1( i+SIMDSIZE );
7144 const size_t i2( i+SIMDSIZE*2UL );
7145 const size_t i3( i+SIMDSIZE*3UL );
7149 for( ; (j+2UL) <= jend; j+=2UL )
// k-range of the tile, narrowed by the upper/lower structure of A (MT4) and B (MT5).
7151 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7152 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7153 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7154 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7156 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7158 for(
size_t k=kbegin; k<kend; ++k ) {
7159 const SIMDType a1( A.load(i ,k) );
7160 const SIMDType a2( A.load(i1,k) );
7161 const SIMDType a3( A.load(i2,k) );
7162 const SIMDType a4( A.load(i3,k) );
7163 const SIMDType b1(
set( B(k,j ) ) );
7164 const SIMDType b2(
set( B(k,j+1UL) ) );
7165 xmm1 = xmm1 + a1 * b1;
7166 xmm2 = xmm2 + a2 * b1;
7167 xmm3 = xmm3 + a3 * b1;
7168 xmm4 = xmm4 + a4 * b1;
7169 xmm5 = xmm5 + a1 * b2;
7170 xmm6 = xmm6 + a2 * b2;
7171 xmm7 = xmm7 + a3 * b2;
7172 xmm8 = xmm8 + a4 * b2;
// Subtract the scaled accumulators from the current content of C.
7175 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7176 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7177 (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
7178 (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
7179 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7180 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
7181 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
7182 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
// Single-column tail of tier 1 (its guarding condition is on an omitted line).
7187 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7188 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7189 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7190 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7192 SIMDType xmm1, xmm2, xmm3, xmm4;
7194 for(
size_t k=kbegin; k<kend; ++k ) {
7195 const SIMDType b1(
set( B(k,j) ) );
7196 xmm1 = xmm1 + A.load(i ,k) * b1;
7197 xmm2 = xmm2 + A.load(i1,k) * b1;
7198 xmm3 = xmm3 + A.load(i2,k) * b1;
7199 xmm4 = xmm4 + A.load(i3,k) * b1;
7202 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7203 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7204 (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
7205 (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
// Tier 2: two SIMD vectors of rows per iteration; columns in groups of four,
// then in pairs, then a single-column tail.
7209 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7211 const size_t i1( i+SIMDSIZE );
7215 for( ; (j+4UL) <= jend; j+=4UL )
7217 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7218 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7219 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7220 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7222 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7224 for(
size_t k=kbegin; k<kend; ++k ) {
7225 const SIMDType a1( A.load(i ,k) );
7226 const SIMDType a2( A.load(i1,k) );
7227 const SIMDType b1(
set( B(k,j ) ) );
7228 const SIMDType b2(
set( B(k,j+1UL) ) );
7229 const SIMDType b3(
set( B(k,j+2UL) ) );
7230 const SIMDType b4(
set( B(k,j+3UL) ) );
7231 xmm1 = xmm1 + a1 * b1;
7232 xmm2 = xmm2 + a2 * b1;
7233 xmm3 = xmm3 + a1 * b2;
7234 xmm4 = xmm4 + a2 * b2;
7235 xmm5 = xmm5 + a1 * b3;
7236 xmm6 = xmm6 + a2 * b3;
7237 xmm7 = xmm7 + a1 * b4;
7238 xmm8 = xmm8 + a2 * b4;
7241 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7242 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7243 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7244 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7245 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7246 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
7247 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7248 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
// Tier 2, column pairs.
7251 for( ; (j+2UL) <= jend; j+=2UL )
7253 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7254 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7255 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7256 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7258 SIMDType xmm1, xmm2, xmm3, xmm4;
7260 for(
size_t k=kbegin; k<kend; ++k ) {
7261 const SIMDType a1( A.load(i ,k) );
7262 const SIMDType a2( A.load(i1,k) );
7263 const SIMDType b1(
set( B(k,j ) ) );
7264 const SIMDType b2(
set( B(k,j+1UL) ) );
7265 xmm1 = xmm1 + a1 * b1;
7266 xmm2 = xmm2 + a2 * b1;
7267 xmm3 = xmm3 + a1 * b2;
7268 xmm4 = xmm4 + a2 * b2;
7271 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7272 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7273 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7274 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
// Single-column tail of tier 2 (its guarding condition is on an omitted line).
7279 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7280 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7281 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7282 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7284 SIMDType xmm1, xmm2;
7286 for(
size_t k=kbegin; k<kend; ++k ) {
7287 const SIMDType b1(
set( B(k,j) ) );
7288 xmm1 = xmm1 + A.load(i ,k) * b1;
7289 xmm2 = xmm2 + A.load(i1,k) * b1;
7292 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7293 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
// Tier 3: one SIMD vector of rows per iteration, one column at a time.
7297 for( ; i<ipos; i+=SIMDSIZE )
7299 for(
size_t j=jj; j<jend; ++j )
7301 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7302 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7303 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
7304 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7308 for(
size_t k=kbegin; k<kend; ++k ) {
7309 const SIMDType b1(
set( B(k,j) ) );
7310 xmm1 = xmm1 + A.load(i,k) * b1;
7313 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar loop over the rows of the tile beyond ipos; only entered when remainder
// is true.
7317 for( ; remainder && i<iend; ++i )
7319 for(
size_t j=jj; j<jend; ++j )
7321 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7322 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7323 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
7324 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7328 for(
size_t k=kbegin; k<kend; ++k ) {
7329 value += A(i,k) * B(k,j);
7332 (~C)(i,j) -= value * scalar;
// Subtraction assignment kernel used when UseBlasKernel<MT3,MT4,MT5,ST2> does not
// hold (DisableIf_): forwards to the large-matrix kernel.
7356 template<
typename MT3
7360 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7361 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7363 selectLargeSubAssignKernel( C, A, B, scalar );
7368 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based kernel for the subtraction assignment of a scaled matrix product from C.
// This overload is selected when UseBlasKernel<MT3,MT4,MT5,ST2> holds (EnableIf_).
// NOTE(review): this listing omits source lines (the embedded original line numbers
// jump), including most of the template parameter list and the braces.
7382 template<
typename MT3
7386 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7387 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Element type of the target; the scalar is converted to it for the BLAS wrappers.
7389 typedef ElementType_<MT3> ET;
// Triangular A: serially copy B into a temporary, hand it to trmm together with A
// (left side, lower/upper chosen from IsLower<MT4>) and the scalar, then subtract
// the temporary from C. Presumably trmm multiplies in place -- confirm.
7391 if( IsTriangular<MT4>::value ) {
7392 ResultType_<MT3> tmp(
serial( B ) );
7393 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7394 subAssign( C, tmp );
// Triangular B: same scheme with a copy of A and B applied from the right side.
7396 else if( IsTriangular<MT5>::value ) {
7397 ResultType_<MT3> tmp(
serial( A ) );
7398 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7399 subAssign( C, tmp );
// Remaining case (the branch keyword is on a line not shown here): gemm is called
// with the negated scalar as first factor and 1 as second factor -- presumably
// C = -scalar*A*B + 1*C; confirm against the gemm wrapper.
7402 gemm( C, A, B, ET(-scalar), ET(1) );
// Subtraction assignment of the scaled product to a row-major target
// (Matrix<MT,false>). Enabled when CanExploitSymmetry<MT,MT1,MT2> holds; the work is
// forwarded to subAssign with the symmetric operand(s) wrapped in trans().
// NOTE(review): lines between the signature and the body are omitted in this listing.
7422 template<
typename MT >
7423 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7424 subAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7433 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7434 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
7436 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7437 subAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left operand.
7438 else if( IsSymmetric<MT1>::value )
7439 subAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case (branch keyword on an omitted line): transpose the right operand.
7441 subAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// SMP assignment of the scaled product to a dense matrix; enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): only the operand fetch and two size checks are visible in this
// listing; the bodies of both branches and the actual assignment are omitted.
7472 template<
typename MT
7474 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7475 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7482 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7483 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Case 1: the target has no rows or no columns.
7485 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Case 2: the inner dimension of the product is zero.
7488 else if( left.columns() == 0UL ) {
// SMP assignment of the scaled product to a sparse matrix; enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the statement that assigns the temporary to lhs is not visible in
// this listing.
7522 template<
typename MT
7524 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7525 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type chosen by the storage-order flag SO: ResultType when SO is true,
// OppositeType otherwise.
7529 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
// Evaluate the expression into a temporary of that type.
7541 const TmpType tmp( rhs );
// SMP assignment of the scaled product to a row-major target (Matrix<MT,false>);
// enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): only the branch conditions are visible in this listing; the statements
// executed in each branch are omitted.
7560 template<
typename MT >
7561 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7562 smpAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7571 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7572 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or the left one.
7574 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7576 else if( IsSymmetric<MT1>::value )
// SMP addition assignment of the scaled product to a dense matrix; enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): everything after the size check is omitted in this listing.
7598 template<
typename MT
7600 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7601 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7608 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7609 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: empty target or zero inner dimension (branch body not shown).
7611 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP addition assignment of the scaled product to a row-major target
// (Matrix<MT,false>); enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): only the branch conditions are visible in this listing; the statements
// executed in each branch are omitted.
7644 template<
typename MT >
7645 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7646 smpAddAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7655 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7656 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or the left one.
7658 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7660 else if( IsSymmetric<MT1>::value )
// SMP subtraction assignment of the scaled product to a dense matrix; enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): everything after the size check is omitted in this listing.
7686 template<
typename MT
7688 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
7689 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7696 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7697 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: empty target or zero inner dimension (branch body not shown).
7699 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of the scaled product to a row-major target
// (Matrix<MT,false>); enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): only the branch conditions are visible in this listing; the statements
// executed in each branch are omitted.
7732 template<
typename MT >
7733 friend inline EnableIf_< CanExploitSymmetry<MT,MT1,MT2> >
7734 smpSubAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
7743 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7744 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or the left one.
7746 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7748 else if( IsSymmetric<MT1>::value )
// Function template returning a const TDMatTDMatMultExpr<T1,T2>.
// NOTE(review): the function name, parameters and body are on lines this listing
// omits -- presumably the multiplication operator for two column-major dense
// matrices; confirm in the full source.
7816 template<
typename T1
7818 inline const TDMatTDMatMultExpr<T1,T2>
7842 template<
typename MT1,
typename MT2 >
7859 template<
typename MT1,
typename MT2 >
// Base-class clause of a two-parameter template: a BoolConstant that is true only if
// both IsAligned<MT1> and IsAligned<MT2> are true.
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsAligned specialization for TDMatTDMatMultExpr<MT1,MT2>; confirm.
7876 template<
typename MT1,
typename MT2 >
7878 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true only if
// both IsLower<MT1> and IsLower<MT2> are true.
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsLower specialization for TDMatTDMatMultExpr<MT1,MT2>; confirm.
7894 template<
typename MT1,
typename MT2 >
7896 :
public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true only if
// both IsUniLower<MT1> and IsUniLower<MT2> are true.
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsUniLower specialization for TDMatTDMatMultExpr<MT1,MT2>; confirm.
7912 template<
typename MT1,
typename MT2 >
7914 :
public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true if one
// operand type is strictly lower and the other one is lower (either order).
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsStrictlyLower specialization for TDMatTDMatMultExpr<MT1,MT2>.
7930 template<
typename MT1,
typename MT2 >
7932 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7933 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true only if
// both IsUpper<MT1> and IsUpper<MT2> are true.
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsUpper specialization for TDMatTDMatMultExpr<MT1,MT2>; confirm.
7949 template<
typename MT1,
typename MT2 >
7951 :
public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true only if
// both IsUniUpper<MT1> and IsUniUpper<MT2> are true.
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsUniUpper specialization for TDMatTDMatMultExpr<MT1,MT2>; confirm.
7967 template<
typename MT1,
typename MT2 >
7969 :
public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Base-class clause of a two-parameter template: a BoolConstant that is true if one
// operand type is strictly upper and the other one is upper (either order).
// NOTE(review): the declaration line naming the specialized class is omitted here --
// presumably the IsStrictlyUpper specialization for TDMatTDMatMultExpr<MT1,MT2>.
7985 template<
typename MT1,
typename MT2 >
7987 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7988 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Type alias of a three-parameter trait (MT1, MT2, VT). When MT1 and MT2 are
// column-major dense matrices and VT is a dense column vector, the selected type is
// the matrix-vector trait of MT1 applied to the matrix-vector trait of (MT2, VT),
// i.e. the right-hand product is formed first.
// NOTE(review): the declaration line naming the specialized trait and the alternative
// type of the If_ are on lines this listing omits.
8004 template<
typename MT1,
typename MT2,
typename VT >
8009 using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8010 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8011 , IsDenseVector<VT>, IsColumnVector<VT> >
8012 , TDMatDVecMultExprTrait_< MT1, TDMatDVecMultExprTrait_<MT2,VT> >
// Type alias of a three-parameter trait (MT1, MT2, VT). When MT1 and MT2 are
// column-major dense matrices and VT is a sparse column vector, the selected type is
// the dense matrix-vector trait of MT1 applied to the sparse matrix-vector trait of
// (MT2, VT), i.e. the right-hand product is formed first.
// NOTE(review): the declaration line naming the specialized trait and the alternative
// type of the If_ are on lines this listing omits.
8022 template<
typename MT1,
typename MT2,
typename VT >
8027 using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8028 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2>
8029 , IsSparseVector<VT>, IsColumnVector<VT> >
8030 , TDMatDVecMultExprTrait_< MT1, TDMatSVecMultExprTrait_<MT2,VT> >
// Type alias of a three-parameter trait (VT, MT1, MT2). When VT is a dense row vector
// and MT1 and MT2 are column-major dense matrices, the selected type is the
// vector-matrix trait of (VT, MT1) applied to MT2, i.e. the left-hand product is
// formed first.
// NOTE(review): the declaration line naming the specialized trait and the alternative
// type of the If_ are on lines this listing omits.
8040 template<
typename VT,
typename MT1,
typename MT2 >
8045 using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
8046 , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8047 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8048 , TDVecTDMatMultExprTrait_< TDVecTDMatMultExprTrait_<VT,MT1>, MT2 >
// Type alias of a three-parameter trait (VT, MT1, MT2). When VT is a sparse row vector
// and MT1 and MT2 are column-major dense matrices, the selected type is the dense
// vector-matrix trait applied to the sparse vector-matrix trait of (VT, MT1) and MT2,
// i.e. the left-hand product is formed first.
// NOTE(review): the declaration line naming the specialized trait and the alternative
// type of the If_ are on lines this listing omits.
8058 template<
typename VT,
typename MT1,
typename MT2 >
8063 using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
8064 , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
8065 , IsDenseMatrix<MT2>, IsColumnMajorMatrix<MT2> >
8066 , TDVecTDMatMultExprTrait_< TSVecTDMatMultExprTrait_<VT,MT1>, MT2 >
// Type alias of a trait parameterized on MT1, MT2 and the flag AF: the result is the
// multiplication trait of the submatrix traits of const MT1 and const MT2, both with
// the same AF.
// NOTE(review): the declaration line naming the specialized trait is omitted here --
// presumably SubmatrixExprTrait for TDMatTDMatMultExpr<MT1,MT2>; confirm.
8076 template<
typename MT1,
typename MT2,
bool AF >
8081 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
8082 , SubmatrixExprTrait_<const MT2,AF> >;
// Type alias of a two-parameter trait: the result is the multiplication trait of the
// row trait of const MT1 and MT2.
// NOTE(review): the declaration line naming the specialized trait is omitted here --
// presumably RowExprTrait for TDMatTDMatMultExpr<MT1,MT2>; confirm.
8091 template<
typename MT1,
typename MT2 >
8096 using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
// Type alias of a two-parameter trait: the result is the multiplication trait of MT1
// and the column trait of const MT2.
// NOTE(review): the declaration line naming the specialized trait is omitted here --
// presumably ColumnExprTrait for TDMatTDMatMultExpr<MT1,MT2>; confirm.
8105 template<
typename MT1,
typename MT2 >
8110 using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:304
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:245
Header file for the Rows type trait.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:251
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7800
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:398
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:432
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:368
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:352
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:249
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:410
Header file for the IsRowVector type trait.
typename DisableIf< Condition, T >::Type DisableIf_
Auxiliary type for the DisableIf class template. The DisableIf_ alias declaration provides a convenien...
Definition: DisableIf.h:223
Header file for the And class template.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:451
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:134
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:378
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
TDMatTDMatMultExpr< MT1, MT2 > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:244
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:263
Header file for the IsComplexDouble type trait.
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
Header file for the DisableIf class template.
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:257
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Header file for the TSVecTDMatMultExprTrait class template.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:250
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:246
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:388
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:442
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:154
Constraint on the data type.
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:254
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:247
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions. The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:248
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:452
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:289
Header file for the AreSIMDCombinable type trait.
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:950
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:422
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:260
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.